/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.  See the License for the specific language governing
 * permissions and limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import java.io.IOException;
import java.net.InetAddress;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.CopyOnWriteArrayList;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ClockOutOfSyncException;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.RegionLoad;
import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.ServerLoad;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.YouAreDeadException;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.client.ClusterConnection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer;
import org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler;
import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.RequestConverter;
import org.apache.hadoop.hbase.protobuf.ResponseConverter;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionRequest;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionResponse;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.ServerInfo;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos.SplitLogTask.RecoveryMode;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.regionserver.RegionOpeningState;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.RetryCounter;
import org.apache.hadoop.hbase.util.RetryCounterFactory;
import org.apache.hadoop.hbase.util.Triple;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.zookeeper.KeeperException;

import com.google.common.annotations.VisibleForTesting;
import com.google.protobuf.ServiceException;

/**
 * The ServerManager class manages info about region servers.
 * <p>
 * Maintains lists of online and dead servers.  Processes the startups,
 * shutdowns, and deaths of region servers.
 * <p>
 * Servers are distinguished in two different ways.  A given server has a
 * location, specified by hostname and port, of which there can only be one
 * online at any given time.  A server instance is specified by the location
 * (hostname and port) as well as the startcode (timestamp from when the
 * server was started).  This is used to differentiate a restarted instance of
 * a given server from the original instance.
 * <p>
 * If a server is known not to be running any more, it is called dead.  A dead
 * server needs to be handled by a ServerShutdownHandler.  If the handler is
 * not enabled yet, the server cannot be handled right away, so it is queued
 * up and submitted once the handler is enabled.
 */
@InterfaceAudience.Private
public class ServerManager {
  public static final String WAIT_ON_REGIONSERVERS_MAXTOSTART =
      "hbase.master.wait.on.regionservers.maxtostart";

  public static final String WAIT_ON_REGIONSERVERS_MINTOSTART =
      "hbase.master.wait.on.regionservers.mintostart";

  public static final String WAIT_ON_REGIONSERVERS_TIMEOUT =
      "hbase.master.wait.on.regionservers.timeout";

  public static final String WAIT_ON_REGIONSERVERS_INTERVAL =
      "hbase.master.wait.on.regionservers.interval";

  private static final Log LOG = LogFactory.getLog(ServerManager.class);

  /** Set if we are to shut down the cluster. */
  private volatile boolean clusterShutdown = false;

  /** Last flushed sequence id reported per region, keyed by encoded region name. */
  private final SortedMap<byte[], Long> flushedSequenceIdByRegion =
      new ConcurrentSkipListMap<byte[], Long>(Bytes.BYTES_COMPARATOR);

  /** Map of registered servers to their current load. */
  private final ConcurrentHashMap<ServerName, ServerLoad> onlineServers =
      new ConcurrentHashMap<ServerName, ServerLoad>();

  /**
   * Map of admin interfaces per registered regionserver; these interfaces are
   * used to control regionservers out on the cluster.
   */
  private final Map<ServerName, AdminService.BlockingInterface> rsAdmins =
      new HashMap<ServerName, AdminService.BlockingInterface>();

  /**
   * List of region servers that should not get any more new regions.
   */
  private final ArrayList<ServerName> drainingServers =
      new ArrayList<ServerName>();

  private final Server master;
  private final MasterServices services;
  private final ClusterConnection connection;

  private final DeadServer deadservers = new DeadServer();

  private final long maxSkew;
  private final long warningSkew;

  private final RetryCounterFactory pingRetryCounterFactory;

  /**
   * Set of region servers which are dead but not processed immediately. If a
   * server dies before the master enables the ServerShutdownHandler, the
   * server is added to this set and is processed later through
   * {@link #processQueuedDeadServers()}.
   */
  private Set<ServerName> queuedDeadServers = new HashSet<ServerName>();

  /**
   * Set of region servers which are dead but cannot be processed yet because
   * the assignment manager has not finished its failover cleanup. Each entry
   * maps the server to a Boolean indicating whether its WALs should be split
   * by the ServerShutdownHandler. Entries are drained by
   * {@link #processQueuedDeadServers()}.
   */
  private Map<ServerName, Boolean> requeuedDeadServers
    = new ConcurrentHashMap<ServerName, Boolean>();

  /** Listeners that are called on server events. */
  private List<ServerListener> listeners = new CopyOnWriteArrayList<ServerListener>();

  /**
   * Constructor.
   * @param master
   * @param services
   * @throws IOException
   */
  public ServerManager(final Server master, final MasterServices services)
      throws IOException {
    this(master, services, true);
  }

  ServerManager(final Server master, final MasterServices services,
      final boolean connect) throws IOException {
    this.master = master;
    this.services = services;
    Configuration c = master.getConfiguration();
    maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
    warningSkew = c.getLong("hbase.master.warningclockskew", 10000);
    this.connection = connect ? (ClusterConnection)ConnectionFactory.createConnection(c) : null;
    int pingMaxAttempts = Math.max(1, master.getConfiguration().getInt(
      "hbase.master.maximum.ping.server.attempts", 10));
    int pingSleepInterval = Math.max(1, master.getConfiguration().getInt(
      "hbase.master.ping.server.retry.sleep.interval", 100));
    this.pingRetryCounterFactory = new RetryCounterFactory(pingMaxAttempts, pingSleepInterval);
  }

  /**
   * Add the listener to the notification list.
   * @param listener the ServerListener to register
   */
  public void registerListener(final ServerListener listener) {
    this.listeners.add(listener);
  }

  /**
   * Remove the listener from the notification list.
   * @param listener the ServerListener to unregister
   * @return true if the listener was registered and has been removed
   */
  public boolean unregisterListener(final ServerListener listener) {
    return this.listeners.remove(listener);
  }

  /**
   * Let the server manager know a new regionserver has come online.
   * @param ia the remote address of the regionserver
   * @param port the remote port of the regionserver
   * @param serverStartcode the startcode reported by the regionserver
   * @param serverCurrentTime the current time of the regionserver in ms
   * @return the ServerName we know this server as
   * @throws IOException
   */
  ServerName regionServerStartup(final InetAddress ia, final int port,
      final long serverStartcode, long serverCurrentTime)
  throws IOException {
    // Handle the case where we get a startup message from a regionserver that
    // has been quickly restarted, or from a server whose death we are
    // currently processing.  If its host+port combo is already registered
    // with an older startcode, the stale instance is expired by
    // checkAndRecordNewServer below.
    ServerName sn = ServerName.valueOf(ia.getHostName(), port, serverStartcode);
    checkClockSkew(sn, serverCurrentTime);
    checkIsDead(sn, "STARTUP");
    if (!checkAndRecordNewServer(sn, ServerLoad.EMPTY_SERVERLOAD)) {
      LOG.warn("THIS SHOULD NOT HAPPEN, RegionServerStartup"
        + " could not record the server: " + sn);
    }
    return sn;
  }

  /**
   * Updates the last flushed sequence ids of the regions hosted on the given
   * server, using the load it reported.  Reported ids that move backwards are
   * ignored.
   */
  private void updateLastFlushedSequenceIds(ServerName sn, ServerLoad hsl) {
    Map<byte[], RegionLoad> regionsLoad = hsl.getRegionsLoad();
    for (Entry<byte[], RegionLoad> entry : regionsLoad.entrySet()) {
      byte[] encodedRegionName = Bytes.toBytes(HRegionInfo.encodeRegionName(entry.getKey()));
      Long existingValue = flushedSequenceIdByRegion.get(encodedRegionName);
      long l = entry.getValue().getCompleteSequenceId();
      if (existingValue != null) {
        if (l != -1 && l < existingValue) {
          LOG.warn("RegionServer " + sn +
              " indicates a last flushed sequence id (" + l +
              ") that is less than the previous last flushed sequence id (" +
              existingValue + ") for region " +
              Bytes.toString(entry.getKey()) + " Ignoring.");
          continue;
        }
      }
      flushedSequenceIdByRegion.put(encodedRegionName, l);
    }
  }

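  /**
   * Called periodically by each regionserver to report its load.
   * @param sn the reporting server
   * @param sl the reported load
   * @throws YouAreDeadException if the server is currently listed as dead
   */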
  void regionServerReport(ServerName sn,
      ServerLoad sl) throws YouAreDeadException {
    checkIsDead(sn, "REPORT");
    if (null == this.onlineServers.replace(sn, sl)) {
      // The report is from a server this master does not know about yet, for
      // example because the master failed over while the regionserver kept
      // running, or because the startup call raced with this report.  Record
      // it as a new server; if recording fails, ignore the report.
      if (!checkAndRecordNewServer(sn, sl)) {
        LOG.info("RegionServerReport ignored, could not record the server: " + sn);
        return;
      }
    }
    updateLastFlushedSequenceIds(sn, sl);
  }

  /**
   * Checks if a server with the same host and port already exists; if not, or
   * if the existing one has a smaller startcode, records the new server.
   * @param serverName the server to check and record
   * @param sl the load reported by the server
   * @return true if the server was recorded, otherwise false
   */
  boolean checkAndRecordNewServer(
      final ServerName serverName, final ServerLoad sl) {
    ServerName existingServer = null;
    synchronized (this.onlineServers) {
      existingServer = findServerWithSameHostnamePortWithLock(serverName);
      if (existingServer != null && (existingServer.getStartcode() > serverName.getStartcode())) {
        LOG.info("Server serverName=" + serverName + " rejected; we already have "
            + existingServer.toString() + " registered with same hostname and port");
        return false;
      }
      recordNewServerWithLock(serverName, sl);
    }

    // Tell our listeners that a server was added
    if (!this.listeners.isEmpty()) {
      for (ServerListener listener : this.listeners) {
        listener.serverAdded(serverName);
      }
    }

    // Note that we assume that the same timestamp means the same server, and
    // do not expire the existing instance in that case.
    if (existingServer != null && (existingServer.getStartcode() < serverName.getStartcode())) {
      LOG.info("Triggering server recovery; existingServer " +
          existingServer + " looks stale, new server:" + serverName);
      expireServer(existingServer);
    }
    return true;
  }

  /**
   * Checks the clock skew between the server and the master.  If the skew
   * exceeds the configured maximum, an exception is thrown; if it exceeds the
   * warning threshold, a warning is logged but the server is allowed in.
   * @param serverName the incoming server's name
   * @param serverCurrentTime the current time reported by the server in ms
   * @throws ClockOutOfSyncException if the skew exceeds the configured maximum
   */
  private void checkClockSkew(final ServerName serverName, final long serverCurrentTime)
      throws ClockOutOfSyncException {
    long skew = Math.abs(System.currentTimeMillis() - serverCurrentTime);
    if (skew > maxSkew) {
      String message = "Server " + serverName + " has been " +
        "rejected; Reported time is too far out of sync with master. " +
        "Time difference of " + skew + "ms > max allowed of " + maxSkew + "ms";
      LOG.warn(message);
      throw new ClockOutOfSyncException(message);
    } else if (skew > warningSkew){
      String message = "Reported time for server " + serverName + " is out of sync with master " +
        "by " + skew + "ms. (Warning threshold is " + warningSkew + "ms; " +
        "error threshold is " + maxSkew + "ms)";
      LOG.warn(message);
    }
  }

  /**
   * If the server is on the dead server list, rejects it with a
   * YouAreDeadException.  If it was dead but has come back with a new
   * startcode, the old entry is removed from the dead server list.
   * @param serverName the server to check
   * @param what STARTUP or REPORT, used only for logging
   * @throws YouAreDeadException if the server is currently listed as dead
   */
  private void checkIsDead(final ServerName serverName, final String what)
      throws YouAreDeadException {
    if (this.deadservers.isDeadServer(serverName)) {
      // Hostname, port and startcode all match an entry on the dead server
      // list, so this instance really is dead.
      String message = "Server " + what + " rejected; currently processing " +
          serverName + " as dead server";
      LOG.debug(message);
      throw new YouAreDeadException(message);
    }
    // Remove a dead server that shares the hostname and port of the
    // newly checking-in regionserver, but only after the master has
    // finished initializing.
    if ((this.services == null || ((HMaster) this.services).isInitialized())
        && this.deadservers.cleanPreviousInstance(serverName)) {
      // This server was marked as dead but has now come back up; its previous
      // entry has been removed from the dead server list to reflect that.
      LOG.debug(what + ":" + " Server " + serverName + " came back up," +
          " removed it from the dead servers list");
    }
  }

  /**
   * Assumes onlineServers is locked.
   * @return a registered ServerName with the same hostname and port, or null
   */
  private ServerName findServerWithSameHostnamePortWithLock(
      final ServerName serverName) {
    for (ServerName sn: this.onlineServers.keySet()) {
      if (ServerName.isSameHostnameAndPort(serverName, sn)) return sn;
    }
    return null;
  }

  /**
   * Adds the server to the onlineServers map.  onlineServers should be locked.
   * @param serverName the remote server's name
   * @param sl the load reported by the server
   */
  @VisibleForTesting
  void recordNewServerWithLock(final ServerName serverName, final ServerLoad sl) {
    LOG.info("Registering server=" + serverName);
    this.onlineServers.put(serverName, sl);
    this.rsAdmins.remove(serverName);
  }

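  /**
   * @param encodedRegionName the encoded region name
   * @return the last flushed sequence id recorded for the region, or -1 if
   *         none has been reported
   */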
  public long getLastFlushedSequenceId(byte[] encodedRegionName) {
    long seqId = -1L;
    if (flushedSequenceIdByRegion.containsKey(encodedRegionName)) {
      seqId = flushedSequenceIdByRegion.get(encodedRegionName);
    }
    return seqId;
  }

  /**
   * @param serverName
   * @return the ServerLoad if serverName is known, else null
   */
  public ServerLoad getLoad(final ServerName serverName) {
    return this.onlineServers.get(serverName);
  }

  /**
   * Compute the average load across all region servers.
   * Currently, this uses a very naive computation: it just counts the number
   * of regions being served, ignoring stats about numbers of requests.
   * @return the average load
   */
  public double getAverageLoad() {
    int totalLoad = 0;
    int numServers = 0;
    for (ServerLoad sl: this.onlineServers.values()) {
      numServers++;
      totalLoad += sl.getNumberOfRegions();
    }
    return numServers == 0 ? 0 :
      (double)totalLoad / (double)numServers;
  }

  /** @return the count of active regionservers */
  public int countOfRegionServers() {
    // Presumes onlineServers is a concurrent map
    return this.onlineServers.size();
  }

  /**
   * @return a read-only map of online servers to their load
   */
  public Map<ServerName, ServerLoad> getOnlineServers() {
    // Presumption is that iterating the returned Map is OK.
    synchronized (this.onlineServers) {
      return Collections.unmodifiableMap(this.onlineServers);
    }
  }

  public DeadServer getDeadServers() {
    return this.deadservers;
  }

  /**
   * Checks if any dead servers are currently in progress.
   * @return true if any regionservers are being processed as dead, false if not
   */
  public boolean areDeadServersInProgress() {
    return this.deadservers.areDeadServersInProgress();
  }

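  /**
   * Waits for regionservers to go down during cluster shutdown, returning when
   * only the master itself remains registered (checked both in the in-memory
   * map and in ZooKeeper), or when ZooKeeper cannot be consulted.
   */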
  void letRegionServersShutdown() {
    long previousLogTime = 0;
    ServerName sn = master.getServerName();
    ZooKeeperWatcher zkw = master.getZooKeeper();
    int onlineServersCt;
    while ((onlineServersCt = onlineServers.size()) > 0){
      // Log at most once per second.
      if (System.currentTimeMillis() > (previousLogTime + 1000)) {
        Set<ServerName> remainingServers = onlineServers.keySet();
        synchronized (onlineServers) {
          if (remainingServers.size() == 1 && remainingServers.contains(sn)) {
            // Only the master itself is left; it will go down on its own.
            return;
          }
        }
        StringBuilder sb = new StringBuilder();
        // It is OK not to sync on onlineServers here; we are merely logging.
        for (ServerName key : remainingServers) {
          if (sb.length() > 0) {
            sb.append(", ");
          }
          sb.append(key);
        }
        LOG.info("Waiting on regionserver(s) to go down " + sb.toString());
        previousLogTime = System.currentTimeMillis();
      }

      try {
        List<String> servers = ZKUtil.listChildrenNoWatch(zkw, zkw.rsZNode);
        if (servers == null || servers.size() == 0 || (servers.size() == 1
            && servers.contains(sn.toString()))) {
          LOG.info("ZK shows there is only the master self online, exiting now");
          // The master could have missed some ZK events; no need to wait more.
          break;
        }
      } catch (KeeperException ke) {
        LOG.warn("Failed to list regionservers", ke);
        // ZK is malfunctioning; don't hang here.
        break;
      }
      synchronized (onlineServers) {
        try {
          if (onlineServersCt == onlineServers.size()) onlineServers.wait(100);
        } catch (InterruptedException ignored) {
          // continue
        }
      }
    }
  }

  /*
   * Expire the passed server.  Add it to the list of dead servers and queue a
   * shutdown handler for it.
   */
  public synchronized void expireServer(final ServerName serverName) {
    if (serverName.equals(master.getServerName())) {
      if (!(master.isAborted() || master.isStopped())) {
        master.stop("We lost our znode?");
      }
      return;
    }
    if (!services.isServerShutdownHandlerEnabled()) {
      LOG.info("Master doesn't enable ServerShutdownHandler during initialization, "
          + "delay expiring server " + serverName);
      this.queuedDeadServers.add(serverName);
      return;
    }
    if (this.deadservers.isDeadServer(serverName)) {
      // Already being handled as a dead server; nothing more to do.
      LOG.warn("Expiration of " + serverName +
          " but server shutdown already in progress");
      return;
    }
    synchronized (onlineServers) {
      if (!this.onlineServers.containsKey(serverName)) {
        LOG.warn("Expiration of " + serverName + " but server not online");
      }
      // Remove the server from the known servers list, but add it to
      // deadservers first so it shows on the dead servers list even if it was
      // not on the online servers list.
      this.deadservers.add(serverName);
      this.onlineServers.remove(serverName);
      onlineServers.notifyAll();
    }
    this.rsAdmins.remove(serverName);

    // If the cluster is going down, servers are expected to expire; do not
    // process them as dead servers.
    if (this.clusterShutdown) {
      LOG.info("Cluster shutdown set; " + serverName +
        " expired; onlineServers=" + this.onlineServers.size());
      if (this.onlineServers.isEmpty()) {
        master.stop("Cluster shutdown set; onlineServer=0");
      }
      return;
    }

    boolean carryingMeta = services.getAssignmentManager().isCarryingMeta(serverName);
    if (carryingMeta) {
      this.services.getExecutorService().submit(new MetaServerShutdownHandler(this.master,
        this.services, this.deadservers, serverName));
    } else {
      this.services.getExecutorService().submit(new ServerShutdownHandler(this.master,
        this.services, this.deadservers, serverName, true));
    }
    LOG.debug("Added=" + serverName +
      " to dead servers, submitted shutdown handler to be executed meta=" + carryingMeta);

    // Tell our listeners that a server was removed
    if (!this.listeners.isEmpty()) {
      for (ServerListener listener : this.listeners) {
        listener.serverRemoved(serverName);
      }
    }
  }

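  /**
   * Process a server that is known to be dead.  Equivalent to calling
   * {@link #processDeadServer(ServerName, boolean)} with WAL splitting
   * disabled.
   */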
  public synchronized void processDeadServer(final ServerName serverName) {
    this.processDeadServer(serverName, false);
  }

  public synchronized void processDeadServer(final ServerName serverName, boolean shouldSplitWal) {
    // While the assignment manager is still performing failover cleanup
    // (rebuilding in-memory region states), it is better to wait before
    // handling a dead server.  Otherwise, shutdown handling could clobber the
    // region assignments being rebuilt for that server.  Since the handling
    // is not idempotent, queue such servers and defer their processing until
    // the failover cleanup has completed.
    if (!services.getAssignmentManager().isFailoverCleanupDone()) {
      requeuedDeadServers.put(serverName, shouldSplitWal);
      return;
    }

    this.deadservers.add(serverName);
    this.services.getExecutorService().submit(
      new ServerShutdownHandler(this.master, this.services, this.deadservers, serverName,
          shouldSplitWal));
  }

  /**
   * Process the servers which died during the master's initialization: expire
   * servers queued while the ServerShutdownHandler was disabled, then
   * re-process servers that were requeued while failover cleanup was still
   * running.
   */
  synchronized void processQueuedDeadServers() {
    if (!services.isServerShutdownHandlerEnabled()) {
      LOG.info("Master hasn't enabled ServerShutdownHandler");
    }
    Iterator<ServerName> serverIterator = queuedDeadServers.iterator();
    while (serverIterator.hasNext()) {
      ServerName tmpServerName = serverIterator.next();
      expireServer(tmpServerName);
      serverIterator.remove();
      requeuedDeadServers.remove(tmpServerName);
    }

    if (!services.getAssignmentManager().isFailoverCleanupDone()) {
      LOG.info("AssignmentManager hasn't finished failover cleanup; waiting");
    }

    for(ServerName tmpServerName : requeuedDeadServers.keySet()){
      processDeadServer(tmpServerName, requeuedDeadServers.get(tmpServerName));
    }
    requeuedDeadServers.clear();
  }

  /*
   * Remove the server from the drain list.
   */
  public boolean removeServerFromDrainList(final ServerName sn) {
    // Warn if the server (sn) is not online.
    if (!this.isServerOnline(sn)) {
      LOG.warn("Server " + sn + " is not currently online. " +
               "Removing from draining list anyway, as requested.");
    }
    // Remove the server from the draining servers list.
    return this.drainingServers.remove(sn);
  }

  /*
   * Add the server to the drain list.
   */
  public boolean addServerToDrainList(final ServerName sn) {
    // Warn and refuse if the server (sn) is not online.
    if (!this.isServerOnline(sn)) {
      LOG.warn("Server " + sn + " is not currently online. " +
               "Ignoring request to add it to draining list.");
      return false;
    }
    // Add the server to the draining servers list, if it is not already there.
    if (this.drainingServers.contains(sn)) {
      LOG.warn("Server " + sn + " is already in the draining server list. " +
               "Ignoring request to add it again.");
      return false;
    }
    return this.drainingServers.add(sn);
  }

  /**
   * Sends an OPEN RPC to the specified server to open the specified region.
   * <p>
   * Open should not fail but can if the server just crashed.
   * @param server server to open the region on
   * @param region region to open
   * @param versionOfOfflineNode the version of the offline znode that the
   *          regionserver must see when it transitions the znode from OFFLINE
   *          to another state
   * @param favoredNodes favored nodes for the region, may be null
   * @return the state reported by the regionserver, or FAILED_OPENING if no
   *         RPC connection could be found
   */
  public RegionOpeningState sendRegionOpen(final ServerName server,
      HRegionInfo region, int versionOfOfflineNode, List<ServerName> favoredNodes)
  throws IOException {
    AdminService.BlockingInterface admin = getRsAdmin(server);
    if (admin == null) {
      LOG.warn("Attempting to send OPEN RPC to server " + server.toString() +
        " failed because no RPC connection found to this server");
      return RegionOpeningState.FAILED_OPENING;
    }
    OpenRegionRequest request = RequestConverter.buildOpenRegionRequest(server,
      region, versionOfOfflineNode, favoredNodes,
      (RecoveryMode.LOG_REPLAY == this.services.getMasterFileSystem().getLogRecoveryMode()));
    try {
      OpenRegionResponse response = admin.openRegion(null, request);
      return ResponseConverter.getRegionOpeningState(response);
    } catch (ServiceException se) {
      throw ProtobufUtil.getRemoteException(se);
    }
  }

  /**
   * Sends an OPEN RPC to the specified server to open the specified regions.
   * <p>
   * Open should not fail but can if the server just crashed.
   * @param server server to open the regions on
   * @param regionOpenInfos info for the list of regions to open
   * @return a list of region opening states, or null if no RPC connection
   *         could be found
   */
  public List<RegionOpeningState> sendRegionOpen(ServerName server,
      List<Triple<HRegionInfo, Integer, List<ServerName>>> regionOpenInfos)
  throws IOException {
    AdminService.BlockingInterface admin = getRsAdmin(server);
    if (admin == null) {
      LOG.warn("Attempting to send OPEN RPC to server " + server.toString() +
        " failed because no RPC connection found to this server");
      return null;
    }

    OpenRegionRequest request = RequestConverter.buildOpenRegionRequest(server, regionOpenInfos,
      (RecoveryMode.LOG_REPLAY == this.services.getMasterFileSystem().getLogRecoveryMode()));
    try {
      OpenRegionResponse response = admin.openRegion(null, request);
      return ResponseConverter.getRegionOpeningStateList(response);
    } catch (ServiceException se) {
      throw ProtobufUtil.getRemoteException(se);
    }
  }

  /**
   * Sends a CLOSE RPC to the specified server to close the specified region.
   * <p>
   * A region server could reject the close request because it either does not
   * have the specified region or the region is being split.
   * @param server server to close the region on
   * @param region region to close
   * @param versionOfClosingNode the version of the znode to compare against
   *          when the regionserver transitions the znode from CLOSING state
   * @param dest the destination server if the region is being moved, else null
   * @param transitionInZK true if the znode should be transitioned in ZK
   * @return true if the server acknowledged the close, false if not
   * @throws IOException
   */
  public boolean sendRegionClose(ServerName server, HRegionInfo region,
    int versionOfClosingNode, ServerName dest, boolean transitionInZK) throws IOException {
    if (server == null) throw new NullPointerException("Passed server is null");
    AdminService.BlockingInterface admin = getRsAdmin(server);
    if (admin == null) {
      throw new IOException("Attempting to send CLOSE RPC to server " +
        server.toString() + " for region " +
        region.getRegionNameAsString() +
        " failed because no RPC connection found to this server");
    }
    return ProtobufUtil.closeRegion(admin, server, region.getRegionName(),
      versionOfClosingNode, dest, transitionInZK);
  }

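  /**
   * Convenience overload of
   * {@link #sendRegionClose(ServerName, HRegionInfo, int, ServerName, boolean)}
   * with no destination server and the ZK transition enabled.
   */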
  public boolean sendRegionClose(ServerName server,
      HRegionInfo region, int versionOfClosingNode) throws IOException {
    return sendRegionClose(server, region, versionOfClosingNode, null, true);
  }

  /**
   * Sends a MERGE REGIONS RPC to the specified server to merge the specified
   * regions.
   * <p>
   * A region server could reject the request, for example if it does not host
   * the specified regions.
   * @param server server to merge regions on
   * @param region_a first region to merge
   * @param region_b second region to merge
   * @param forcible true for a compulsory merge; otherwise only two adjacent
   *          regions will be merged
   * @throws IOException
   */
  public void sendRegionsMerge(ServerName server, HRegionInfo region_a,
      HRegionInfo region_b, boolean forcible) throws IOException {
    if (server == null)
      throw new NullPointerException("Passed server is null");
    if (region_a == null || region_b == null)
      throw new NullPointerException("Passed region is null");
    AdminService.BlockingInterface admin = getRsAdmin(server);
    if (admin == null) {
      throw new IOException("Attempting to send MERGE REGIONS RPC to server "
          + server.toString() + " for region "
          + region_a.getRegionNameAsString() + ","
          + region_b.getRegionNameAsString()
          + " failed because no RPC connection found to this server");
    }
    ProtobufUtil.mergeRegions(admin, region_a, region_b, forcible);
  }

  /**
   * Check if a region server is reachable and has the expected start code.
   */
  public boolean isServerReachable(ServerName server) {
    if (server == null) throw new NullPointerException("Passed server is null");

    RetryCounter retryCounter = pingRetryCounterFactory.create();
    while (retryCounter.shouldRetry()) {
      try {
        AdminService.BlockingInterface admin = getRsAdmin(server);
        if (admin != null) {
          ServerInfo info = ProtobufUtil.getServerInfo(admin);
          return info != null && info.hasServerName()
            && server.getStartcode() == info.getServerName().getStartCode();
        }
      } catch (IOException ioe) {
        LOG.debug("Couldn't reach " + server + ", try=" + retryCounter.getAttemptTimes()
          + " of " + retryCounter.getMaxAttempts(), ioe);
        try {
          retryCounter.sleepUntilNextRetry();
        } catch(InterruptedException ie) {
          Thread.currentThread().interrupt();
        }
      }
    }
    return false;
  }

  /**
   * @param sn the regionserver to get an admin interface for
   * @return the Admin interface for the remote regionserver named <code>sn</code>
   * @throws IOException
   */
  private AdminService.BlockingInterface getRsAdmin(final ServerName sn)
  throws IOException {
    AdminService.BlockingInterface admin = this.rsAdmins.get(sn);
    if (admin == null) {
      LOG.debug("New admin connection to " + sn.toString());
      if (sn.equals(master.getServerName()) && master instanceof HRegionServer) {
        // The master also runs a regionserver of its own; talk to it directly
        // rather than over an RPC connection.
        admin = ((HRegionServer)master).getRSRpcServices();
      } else {
        admin = this.connection.getAdmin(sn);
      }
      this.rsAdmins.put(sn, admin);
    }
    return admin;
  }

  /**
   * Wait for the region servers to report in.
   * We will wait until one of these conditions is met:
   *  - the master is stopped
   *  - the 'hbase.master.wait.on.regionservers.maxtostart' number of
   *    region servers is reached
   *  - the 'hbase.master.wait.on.regionservers.mintostart' is reached AND
   *    there has been no new region server for
   *    'hbase.master.wait.on.regionservers.interval' time AND
   *    the 'hbase.master.wait.on.regionservers.timeout' is reached
   *
   * @throws InterruptedException
   */
  public void waitForRegionServers(MonitoredTask status)
  throws InterruptedException {
    final long interval = this.master.getConfiguration().
      getLong(WAIT_ON_REGIONSERVERS_INTERVAL, 1500);
    final long timeout = this.master.getConfiguration().
      getLong(WAIT_ON_REGIONSERVERS_TIMEOUT, 4500);
    int defaultMinToStart = 1;
    if (BaseLoadBalancer.tablesOnMaster(master.getConfiguration())) {
      // If we assign regions to the master, wait for at least one more
      // region server by default so that we do not end up assigning all
      // regions to the master alone.
      defaultMinToStart = 2;
    }
    int minToStart = this.master.getConfiguration().
      getInt(WAIT_ON_REGIONSERVERS_MINTOSTART, defaultMinToStart);
    if (minToStart < 1) {
      LOG.warn(String.format(
        "The value of '%s' (%d) can not be less than 1, ignoring.",
        WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart));
      minToStart = 1;
    }
    int maxToStart = this.master.getConfiguration().
      getInt(WAIT_ON_REGIONSERVERS_MAXTOSTART, Integer.MAX_VALUE);
    if (maxToStart < minToStart) {
      LOG.warn(String.format(
        "The value of '%s' (%d) is set less than '%s' (%d), ignoring.",
        WAIT_ON_REGIONSERVERS_MAXTOSTART, maxToStart,
        WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart));
      maxToStart = Integer.MAX_VALUE;
    }

    long now = System.currentTimeMillis();
    final long startTime = now;
    long slept = 0;
    long lastLogTime = 0;
    long lastCountChange = startTime;
    int count = countOfRegionServers();
    int oldCount = 0;
    while (!this.master.isStopped() && count < maxToStart
        && (lastCountChange+interval > now || timeout > slept || count < minToStart)) {
      // Log some info at every interval time or if there is a change
      if (oldCount != count || lastLogTime+interval < now){
        lastLogTime = now;
        String msg =
          "Waiting for region servers count to settle; currently"+
            " checked in " + count + ", slept for " + slept + " ms," +
            " expecting minimum of " + minToStart + ", maximum of "+ maxToStart+
            ", timeout of "+timeout+" ms, interval of "+interval+" ms.";
        LOG.info(msg);
        status.setStatus(msg);
      }

      // We sleep for some time
      final long sleepTime = 50;
      Thread.sleep(sleepTime);
      now = System.currentTimeMillis();
      slept = now - startTime;

      oldCount = count;
      count = countOfRegionServers();
      if (count != oldCount) {
        lastCountChange = now;
      }
    }

    LOG.info("Finished waiting for region servers count to settle;" +
      " checked in " + count + ", slept for " + slept + " ms," +
      " expecting minimum of " + minToStart + ", maximum of "+ maxToStart+","+
      " master is "+ (this.master.isStopped() ? "stopped.": "running")
    );
  }

  /**
   * @return a copy of the internal list of online servers
   */
  public List<ServerName> getOnlineServersList() {
    // Returns a copy so callers can mutate the returned list without
    // affecting the internal map of online servers.
    return new ArrayList<ServerName>(this.onlineServers.keySet());
  }

  /**
   * @return a copy of the internal list of draining servers
   */
  public List<ServerName> getDrainingServersList() {
    return new ArrayList<ServerName>(this.drainingServers);
  }

  /**
   * @return a copy of the internal set of dead-but-not-yet-expired servers
   */
  Set<ServerName> getDeadNotExpiredServers() {
    return new HashSet<ServerName>(this.queuedDeadServers);
  }

  /**
   * Clears the servers that were requeued because failover cleanup had not
   * finished.  Use caution when calling this.
   */
  void removeRequeuedDeadServers() {
    requeuedDeadServers.clear();
  }

  /**
   * @return an unmodifiable view of the requeued dead servers and their
   *         corresponding WAL-split flags
   */
  Map<ServerName, Boolean> getRequeuedDeadServers() {
    return Collections.unmodifiableMap(this.requeuedDeadServers);
  }

  public boolean isServerOnline(ServerName serverName) {
    return serverName != null && onlineServers.containsKey(serverName);
  }

  /**
   * A server is considered dead if it is on the dead servers list, or is
   * queued or requeued for dead server processing.  Note that a server can be
   * neither online nor dead if it has never checked in with this master.
   */
  public synchronized boolean isServerDead(ServerName serverName) {
    return serverName == null || deadservers.isDeadServer(serverName)
      || queuedDeadServers.contains(serverName)
      || requeuedDeadServers.containsKey(serverName);
  }

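  /**
   * Flags the cluster as shutting down and stops the master.
   */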
  public void shutdownCluster() {
    this.clusterShutdown = true;
    this.master.stop("Cluster shutdown requested");
  }

  public boolean isClusterShutdown() {
    return this.clusterShutdown;
  }

  /**
   * Stop the ServerManager.  Currently closes the connection it created.
   */
  public void stop() {
    if (connection != null) {
      try {
        connection.close();
      } catch (IOException e) {
        LOG.error("Attempt to close connection to master failed", e);
      }
    }
  }

  /**
   * Creates a list of possible destinations for a region.  It contains the
   * online servers, but not the draining or dying servers.
   * @param serverToExclude can be null if there is no server to exclude
   */
  public List<ServerName> createDestinationServersList(final ServerName serverToExclude){
    final List<ServerName> destServers = getOnlineServersList();

    if (serverToExclude != null){
      destServers.remove(serverToExclude);
    }

    // Loop through the draining server list and remove them from the server list
    final List<ServerName> drainingServersCopy = getDrainingServersList();
    if (!drainingServersCopy.isEmpty()) {
      for (final ServerName server: drainingServersCopy) {
        destServers.remove(server);
      }
    }

    // Remove the dead-but-not-expired servers from the server list.
    removeDeadNotExpiredServers(destServers);
    return destServers;
  }

  /**
   * Calls {@link #createDestinationServersList(ServerName)} without a server to exclude.
   */
  public List<ServerName> createDestinationServersList(){
    return createDestinationServersList(null);
  }

  /**
   * Loops through the dead-but-not-expired server list and removes those
   * servers from the passed list.  This method should be used carefully
   * outside of this class; prefer a higher-level method such as
   * {@link #createDestinationServersList()} instead of managing your own list.
   */
  void removeDeadNotExpiredServers(List<ServerName> servers) {
    Set<ServerName> deadNotExpiredServersCopy = this.getDeadNotExpiredServers();
    if (!deadNotExpiredServersCopy.isEmpty()) {
      for (ServerName server : deadNotExpiredServersCopy) {
        LOG.debug("Removing dead but not expired server: " + server
          + " from eligible server pool.");
        servers.remove(server);
      }
    }
  }

  /**
   * Clears any dead server that shares a host name and port with an online server.
   */
  void clearDeadServersWithSameHostNameAndPortOfOnlineServer() {
    for (ServerName serverName : getOnlineServersList()) {
      deadservers.cleanAllPreviousInstances(serverName);
    }
  }
}