1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package org.apache.hadoop.hbase.master;
21
22 import java.io.IOException;
23 import java.net.InetAddress;
24 import java.util.ArrayList;
25 import java.util.Collections;
26 import java.util.HashMap;
27 import java.util.HashSet;
28 import java.util.Iterator;
29 import java.util.List;
30 import java.util.Map;
31 import java.util.Set;
32 import java.util.concurrent.ConcurrentHashMap;
33
34 import org.apache.commons.logging.Log;
35 import org.apache.commons.logging.LogFactory;
36 import org.apache.hadoop.conf.Configuration;
37 import org.apache.hadoop.hbase.ClockOutOfSyncException;
38 import org.apache.hadoop.hbase.HRegionInfo;
39 import org.apache.hadoop.hbase.HServerAddress;
40 import org.apache.hadoop.hbase.HServerLoad;
41 import org.apache.hadoop.hbase.PleaseHoldException;
42 import org.apache.hadoop.hbase.Server;
43 import org.apache.hadoop.hbase.ServerName;
44 import org.apache.hadoop.hbase.YouAreDeadException;
45 import org.apache.hadoop.hbase.ZooKeeperConnectionException;
46 import org.apache.hadoop.hbase.client.HConnection;
47 import org.apache.hadoop.hbase.client.HConnectionManager;
48 import org.apache.hadoop.hbase.client.RetriesExhaustedException;
49 import org.apache.hadoop.hbase.ipc.HRegionInterface;
50 import org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler;
51 import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler;
52 import org.apache.hadoop.hbase.monitoring.MonitoredTask;
53 import org.apache.hadoop.hbase.regionserver.RegionOpeningState;
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68 public class ServerManager {
69 public static final String WAIT_ON_REGIONSERVERS_MAXTOSTART =
70 "hbase.master.wait.on.regionservers.maxtostart";
71
72 public static final String WAIT_ON_REGIONSERVERS_MINTOSTART =
73 "hbase.master.wait.on.regionservers.mintostart";
74
75 public static final String WAIT_ON_REGIONSERVERS_TIMEOUT =
76 "hbase.master.wait.on.regionservers.timeout";
77
78 public static final String WAIT_ON_REGIONSERVERS_INTERVAL =
79 "hbase.master.wait.on.regionservers.interval";
80
81 private static final Log LOG = LogFactory.getLog(ServerManager.class);
82
83
84 private volatile boolean clusterShutdown = false;
85
86
87 private final Map<ServerName, HServerLoad> onlineServers =
88 new ConcurrentHashMap<ServerName, HServerLoad>();
89
90
91
92
93
94 private final Map<ServerName, HRegionInterface> serverConnections =
95 new HashMap<ServerName, HRegionInterface>();
96
97
98
99
100
101 private final ArrayList<ServerName> drainingServers =
102 new ArrayList<ServerName>();
103
104 private final Server master;
105 private final MasterServices services;
106 private final HConnection connection;
107
108 private final DeadServer deadservers;
109
110 private final long maxSkew;
111 private final long warningSkew;
112
113
114
115
116
117
118
119 private Set<ServerName> deadNotExpiredServers = new HashSet<ServerName>();
120
121
122
123
124
125 private boolean isSSHForRootEnabled = false;
126
127
128
129
130
131
132
/**
 * Creates a server manager that also obtains a cluster connection.
 * Equivalent to the three-argument constructor with {@code connect == true}.
 *
 * @param master the master server; supplies the configuration
 * @param services master services used when expiring servers
 * @throws ZooKeeperConnectionException declared by the delegated constructor
 */
public ServerManager(final Server master, final MasterServices services)
    throws ZooKeeperConnectionException {
  this(master, services, /* connect */ true);
}
137
138 ServerManager(final Server master, final MasterServices services,
139 final boolean connect) throws ZooKeeperConnectionException {
140 this.master = master;
141 this.services = services;
142 Configuration c = master.getConfiguration();
143 maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
144 warningSkew = c.getLong("hbase.master.warningclockskew", 10000);
145 this.deadservers = new DeadServer();
146 this.connection = connect ? HConnectionManager.getConnection(c) : null;
147 }
148
149
150
151
152
153
154
155
156
157
158 ServerName regionServerStartup(final InetAddress ia, final int port,
159 final long serverStartcode, long serverCurrentTime)
160 throws IOException {
161
162
163
164
165
166
167
168 ServerName sn = new ServerName(ia.getHostName(), port, serverStartcode);
169 checkClockSkew(sn, serverCurrentTime);
170 checkIsDead(sn, "STARTUP");
171 checkAlreadySameHostPort(sn);
172 recordNewServer(sn, HServerLoad.EMPTY_HSERVERLOAD);
173 return sn;
174 }
175
176 void regionServerReport(ServerName sn, HServerLoad hsl)
177 throws YouAreDeadException, PleaseHoldException {
178 checkIsDead(sn, "REPORT");
179 if (!this.onlineServers.containsKey(sn)) {
180
181 checkAlreadySameHostPort(sn);
182
183
184
185
186
187 recordNewServer(sn, hsl);
188 } else {
189 this.onlineServers.put(sn, hsl);
190 }
191 }
192
193
194
195
196
197
198 void checkAlreadySameHostPort(final ServerName serverName)
199 throws PleaseHoldException {
200 ServerName existingServer =
201 ServerName.findServerWithSameHostnamePort(getOnlineServersList(), serverName);
202 if (existingServer != null) {
203 String message = "Server serverName=" + serverName +
204 " rejected; we already have " + existingServer.toString() +
205 " registered with same hostname and port";
206 LOG.info(message);
207 if (existingServer.getStartcode() < serverName.getStartcode()) {
208 LOG.info("Triggering server recovery; existingServer " +
209 existingServer + " looks stale, new server:" + serverName);
210 expireServer(existingServer);
211 }
212 if (services.isServerShutdownHandlerEnabled()) {
213
214 throw new PleaseHoldException(message);
215 }
216 }
217 }
218
219
220
221
222
223
224
225
226
227 private void checkClockSkew(final ServerName serverName, final long serverCurrentTime)
228 throws ClockOutOfSyncException {
229 long skew = System.currentTimeMillis() - serverCurrentTime;
230 if (skew > maxSkew) {
231 String message = "Server " + serverName + " has been " +
232 "rejected; Reported time is too far out of sync with master. " +
233 "Time difference of " + skew + "ms > max allowed of " + maxSkew + "ms";
234 LOG.warn(message);
235 throw new ClockOutOfSyncException(message);
236 } else if (skew > warningSkew){
237 String message = "Reported time for server " + serverName + " is out of sync with master " +
238 "by " + skew + "ms. (Warning threshold is " + warningSkew + "ms; " +
239 "error threshold is " + maxSkew + "ms)";
240 LOG.warn(message);
241 }
242 }
243
244
245
246
247
248
249
250
251
252 private void checkIsDead(final ServerName serverName, final String what)
253 throws YouAreDeadException {
254 if (this.deadservers.isDeadServer(serverName)) {
255
256
257 String message = "Server " + what + " rejected; currently processing " +
258 serverName + " as dead server";
259 LOG.debug(message);
260 throw new YouAreDeadException(message);
261 }
262
263
264
265 if ((this.services == null || ((HMaster) this.services).isInitialized())
266 && this.deadservers.cleanPreviousInstance(serverName)) {
267
268
269 LOG.debug(what + ":" + " Server " + serverName + " came back up," +
270 " removed it from the dead servers list");
271 }
272 }
273
274
275
276
277
278
279 void recordNewServer(final ServerName serverName, final HServerLoad hsl) {
280 LOG.info("Registering server=" + serverName);
281 this.onlineServers.put(serverName, hsl);
282 this.serverConnections.remove(serverName);
283 }
284
285
286
287
288
289 public HServerLoad getLoad(final ServerName serverName) {
290 return this.onlineServers.get(serverName);
291 }
292
293
294
295
296
297
298 public HServerLoad getLoad(final HServerAddress address) {
299 ServerName sn = new ServerName(address.toString(), ServerName.NON_STARTCODE);
300 ServerName actual =
301 ServerName.findServerWithSameHostnamePort(this.getOnlineServersList(), sn);
302 return actual == null? null: getLoad(actual);
303 }
304
305
306
307
308
309
310
311 public double getAverageLoad() {
312 int totalLoad = 0;
313 int numServers = 0;
314 double averageLoad = 0.0;
315 for (HServerLoad hsl: this.onlineServers.values()) {
316 numServers++;
317 totalLoad += hsl.getNumberOfRegions();
318 }
319 averageLoad = (double)totalLoad / (double)numServers;
320 return averageLoad;
321 }
322
323
324 int countOfRegionServers() {
325
326 return this.onlineServers.size();
327 }
328
329
330
331
332 public Map<ServerName, HServerLoad> getOnlineServers() {
333
334 synchronized (this.onlineServers) {
335 return Collections.unmodifiableMap(this.onlineServers);
336 }
337 }
338
339 public Set<ServerName> getDeadServers() {
340 return this.deadservers.clone();
341 }
342
343
344
345
346
347 public boolean areDeadServersInProgress() {
348 return this.deadservers.areDeadServersInProgress();
349 }
350
351 void letRegionServersShutdown() {
352 long previousLogTime = 0;
353 while (!onlineServers.isEmpty()) {
354
355 if (System.currentTimeMillis() > (previousLogTime + 1000)) {
356 StringBuilder sb = new StringBuilder();
357 for (ServerName key : this.onlineServers.keySet()) {
358 if (sb.length() > 0) {
359 sb.append(", ");
360 }
361 sb.append(key);
362 }
363 LOG.info("Waiting on regionserver(s) to go down " + sb.toString());
364 previousLogTime = System.currentTimeMillis();
365 }
366
367 synchronized (onlineServers) {
368 try {
369 onlineServers.wait(100);
370 } catch (InterruptedException ignored) {
371
372 }
373 }
374 }
375 }
376
377
378
379
380
381 public synchronized void expireServer(final ServerName serverName) {
382 boolean carryingRoot = services.getAssignmentManager().isCarryingRoot(serverName);
383 if (!services.isServerShutdownHandlerEnabled() && (!carryingRoot || !this.isSSHForRootEnabled)) {
384 LOG.info("Master doesn't enable ServerShutdownHandler during initialization, "
385 + "delay expiring server " + serverName);
386 this.deadNotExpiredServers.add(serverName);
387 return;
388 }
389 if (!this.onlineServers.containsKey(serverName)) {
390 LOG.warn("Received expiration of " + serverName +
391 " but server is not currently online");
392 }
393 if (this.deadservers.contains(serverName)) {
394
395 LOG.warn("Received expiration of " + serverName +
396 " but server shutdown is already in progress");
397 return;
398 }
399
400
401
402 this.deadservers.add(serverName);
403 this.onlineServers.remove(serverName);
404 synchronized (onlineServers) {
405 onlineServers.notifyAll();
406 }
407 this.serverConnections.remove(serverName);
408
409
410 if (this.clusterShutdown) {
411 LOG.info("Cluster shutdown set; " + serverName +
412 " expired; onlineServers=" + this.onlineServers.size());
413 if (this.onlineServers.isEmpty()) {
414 master.stop("Cluster shutdown set; onlineServer=0");
415 }
416 return;
417 }
418
419 boolean carryingMeta = services.getAssignmentManager().isCarryingMeta(serverName);
420 if (carryingRoot || carryingMeta) {
421 this.services.getExecutorService().submit(new MetaServerShutdownHandler(this.master,
422 this.services, this.deadservers, serverName, carryingRoot, carryingMeta));
423 } else {
424 this.services.getExecutorService().submit(new ServerShutdownHandler(this.master,
425 this.services, this.deadservers, serverName, true));
426 }
427 LOG.debug("Added=" + serverName +
428 " to dead servers, submitted shutdown handler to be executed, root=" +
429 carryingRoot + ", meta=" + carryingMeta);
430 }
431
432
433
434
435
436
437 synchronized void expireDeadNotExpiredServers() throws IOException {
438 if (!services.isServerShutdownHandlerEnabled()) {
439 throw new IOException("Master hasn't enabled ServerShutdownHandler ");
440 }
441 Iterator<ServerName> serverIterator = deadNotExpiredServers.iterator();
442 while (serverIterator.hasNext()) {
443 expireServer(serverIterator.next());
444 serverIterator.remove();
445 }
446 }
447
448
449
450
451
452
453 void enableSSHForRoot() throws IOException {
454 if (this.isSSHForRootEnabled) {
455 return;
456 }
457 this.isSSHForRootEnabled = true;
458 Iterator<ServerName> serverIterator = deadNotExpiredServers.iterator();
459 while (serverIterator.hasNext()) {
460 ServerName curServerName = serverIterator.next();
461 if (services.getAssignmentManager().isCarryingRoot(curServerName)) {
462 expireServer(curServerName);
463 serverIterator.remove();
464 }
465 }
466 }
467
468
469
470
471 void disableSSHForRoot() {
472 this.isSSHForRootEnabled = false;
473 }
474
475
476
477
478 public boolean removeServerFromDrainList(final ServerName sn) {
479
480
481
482 if (!this.isServerOnline(sn)) {
483 LOG.warn("Server " + sn + " is not currently online. " +
484 "Removing from draining list anyway, as requested.");
485 }
486
487 return this.drainingServers.remove(sn);
488 }
489
490
491
492
493 public boolean addServerToDrainList(final ServerName sn) {
494
495
496
497 if (!this.isServerOnline(sn)) {
498 LOG.warn("Server " + sn + " is not currently online. " +
499 "Ignoring request to add it to draining list.");
500 return false;
501 }
502
503
504 if (this.drainingServers.contains(sn)) {
505 LOG.warn("Server " + sn + " is already in the draining server list." +
506 "Ignoring request to add it again.");
507 return false;
508 }
509 return this.drainingServers.add(sn);
510 }
511
512
513
514
515
516
517
518
519
520
521
522
523
524 public RegionOpeningState sendRegionOpen(final ServerName server,
525 HRegionInfo region, int versionOfOfflineNode)
526 throws IOException {
527 HRegionInterface hri = getServerConnection(server);
528 if (hri == null) {
529 LOG.warn("Attempting to send OPEN RPC to server " + server.toString() +
530 " failed because no RPC connection found to this server");
531 return RegionOpeningState.FAILED_OPENING;
532 }
533 return (versionOfOfflineNode == -1) ? hri.openRegion(region) : hri
534 .openRegion(region, versionOfOfflineNode);
535 }
536
537
538
539
540
541
542
543
544
545 public void sendRegionOpen(ServerName server, List<HRegionInfo> regions)
546 throws IOException {
547 HRegionInterface hri = getServerConnection(server);
548 if (hri == null) {
549 LOG.warn("Attempting to send OPEN RPC to server " + server.toString() +
550 " failed because no RPC connection found to this server");
551 return;
552 }
553 hri.openRegions(regions);
554 }
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569 public boolean sendRegionClose(ServerName server, HRegionInfo region,
570 int versionOfClosingNode) throws IOException {
571 if (server == null) throw new NullPointerException("Passed server is null");
572 HRegionInterface hri = getServerConnection(server);
573 if (hri == null) {
574 throw new IOException("Attempting to send CLOSE RPC to server " +
575 server.toString() + " for region " +
576 region.getRegionNameAsString() +
577 " failed because no RPC connection found to this server");
578 }
579 return hri.closeRegion(region, versionOfClosingNode);
580 }
581
582
583
584
585
586
587
588
589 private HRegionInterface getServerConnection(final ServerName sn)
590 throws IOException {
591 HRegionInterface hri = this.serverConnections.get(sn);
592 if (hri == null) {
593 LOG.debug("New connection to " + sn.toString());
594 hri = this.connection.getHRegionConnection(sn.getHostname(), sn.getPort());
595 this.serverConnections.put(sn, hri);
596 }
597 return hri;
598 }
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613 public void waitForRegionServers(MonitoredTask status)
614 throws InterruptedException {
615 final long interval = this.master.getConfiguration().
616 getLong(WAIT_ON_REGIONSERVERS_INTERVAL, 1500);
617 final long timeout = this.master.getConfiguration().
618 getLong(WAIT_ON_REGIONSERVERS_TIMEOUT, 4500);
619 int minToStart = this.master.getConfiguration().
620 getInt(WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
621 if (minToStart < 1) {
622 LOG.warn(String.format(
623 "The value of '%s' (%d) can not be less than 1, ignoring.",
624 WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart));
625 minToStart = 1;
626 }
627 int maxToStart = this.master.getConfiguration().
628 getInt(WAIT_ON_REGIONSERVERS_MAXTOSTART, Integer.MAX_VALUE);
629 if (maxToStart < minToStart) {
630 LOG.warn(String.format(
631 "The value of '%s' (%d) is set less than '%s' (%d), ignoring.",
632 WAIT_ON_REGIONSERVERS_MAXTOSTART, maxToStart,
633 WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart));
634 maxToStart = Integer.MAX_VALUE;
635 }
636
637 long now = System.currentTimeMillis();
638 final long startTime = now;
639 long slept = 0;
640 long lastLogTime = 0;
641 long lastCountChange = startTime;
642 int count = countOfRegionServers();
643 int oldCount = 0;
644 while (
645 !this.master.isStopped() &&
646 count < maxToStart &&
647 (lastCountChange+interval > now || timeout > slept || count < minToStart)
648 ){
649
650
651 if (oldCount != count || lastLogTime+interval < now){
652 lastLogTime = now;
653 String msg =
654 "Waiting for region servers count to settle; currently"+
655 " checked in " + count + ", slept for " + slept + " ms," +
656 " expecting minimum of " + minToStart + ", maximum of "+ maxToStart+
657 ", timeout of "+timeout+" ms, interval of "+interval+" ms.";
658 LOG.info(msg);
659 status.setStatus(msg);
660 }
661
662
663 final long sleepTime = 50;
664 Thread.sleep(sleepTime);
665 now = System.currentTimeMillis();
666 slept = now - startTime;
667
668 oldCount = count;
669 count = countOfRegionServers();
670 if (count != oldCount) {
671 lastCountChange = now;
672 }
673 }
674
675 LOG.info("Finished waiting for region servers count to settle;" +
676 " checked in " + count + ", slept for " + slept + " ms," +
677 " expecting minimum of " + minToStart + ", maximum of "+ maxToStart+","+
678 " master is "+ (this.master.isStopped() ? "stopped.": "running.")
679 );
680 }
681
682
683
684
685 public List<ServerName> getOnlineServersList() {
686
687
688 return new ArrayList<ServerName>(this.onlineServers.keySet());
689 }
690
691
692
693
694 public List<ServerName> getDrainingServersList() {
695 return new ArrayList<ServerName>(this.drainingServers);
696 }
697
698
699
700
701 Set<ServerName> getDeadNotExpiredServers() {
702 return new HashSet<ServerName>(this.deadNotExpiredServers);
703 }
704
705 public boolean isServerOnline(ServerName serverName) {
706 return onlineServers.containsKey(serverName);
707 }
708
709 public void shutdownCluster() {
710 this.clusterShutdown = true;
711 this.master.stop("Cluster shutdown requested");
712 }
713
714 public boolean isClusterShutdown() {
715 return this.clusterShutdown;
716 }
717
718
719
720
721 public void stop() {
722 if (connection != null) {
723 try {
724 connection.close();
725 } catch (IOException e) {
726 LOG.error("Attempt to close connection to master failed", e);
727 }
728 }
729 }
730
731
732
733
734 void clearDeadServersWithSameHostNameAndPortOfOnlineServer() {
735 ServerName sn = null;
736 for (ServerName serverName : getOnlineServersList()) {
737 while ((sn = ServerName.
738 findServerWithSameHostnamePort(this.deadservers, serverName)) != null) {
739 this.deadservers.remove(sn);
740 }
741 }
742 }
743
744 }