/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import java.io.IOException;
import java.net.InetAddress;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.CopyOnWriteArrayList;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ClockOutOfSyncException;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.RegionLoad;
import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.ServerLoad;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.YouAreDeadException;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.client.ClusterConnection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer;
import org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler;
import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.RequestConverter;
import org.apache.hadoop.hbase.protobuf.ResponseConverter;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionRequest;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionResponse;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.ServerInfo;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos.SplitLogTask.RecoveryMode;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.regionserver.RegionOpeningState;
import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.RetryCounter;
import org.apache.hadoop.hbase.util.RetryCounterFactory;
import org.apache.hadoop.hbase.util.Triple;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.zookeeper.KeeperException;

import com.google.common.annotations.VisibleForTesting;
import com.google.protobuf.ServiceException;

/**
 * The ServerManager class manages info about region servers.
 * <p>
 * Maintains lists of online and dead servers.  Processes the startups,
 * shutdowns, and deaths of region servers.
 * <p>
 * Servers are distinguished in two different ways.  A given server has a
 * location, specified by hostname and port, and of which there can only be one
 * online at any given time.  A server instance is specified by the location
 * (hostname and port) as well as the startcode (timestamp from when the server
 * was started).  This is used to differentiate a restarted instance of a given
 * server from the original instance.
 * <p>
 * If a server is known not to be running any more, it is called dead. The dead
 * server needs to be handled by a ServerShutdownHandler.  If the handler is not
 * enabled yet, the server can't be handled right away so it is queued up.
 * After the handler is enabled, the server will be submitted to a handler to
 * handle.  However, the handler may be only partially enabled.  If so, the
 * server cannot be fully processed, and is queued up for further processing.
 * A server is fully processed only after the handler is fully enabled and has
 * completed the handling.
 */
@InterfaceAudience.Private
public class ServerManager {
  public static final String WAIT_ON_REGIONSERVERS_MAXTOSTART =
      "hbase.master.wait.on.regionservers.maxtostart";

  public static final String WAIT_ON_REGIONSERVERS_MINTOSTART =
      "hbase.master.wait.on.regionservers.mintostart";

  public static final String WAIT_ON_REGIONSERVERS_TIMEOUT =
      "hbase.master.wait.on.regionservers.timeout";

  public static final String WAIT_ON_REGIONSERVERS_INTERVAL =
      "hbase.master.wait.on.regionservers.interval";

  private static final Log LOG = LogFactory.getLog(ServerManager.class);

  // Set if we are to shutdown the cluster.
  private volatile boolean clusterShutdown = false;

  /** The last flushed sequence id reported for each region, keyed by encoded region name. */
  private final SortedMap<byte[], Long> flushedSequenceIdByRegion =
    new ConcurrentSkipListMap<byte[], Long>(Bytes.BYTES_COMPARATOR);

  /** Map of registered servers to their current load. */
  private final ConcurrentHashMap<ServerName, ServerLoad> onlineServers =
    new ConcurrentHashMap<ServerName, ServerLoad>();

  /**
   * Map of admin interfaces per registered regionserver; these interfaces we use to control
   * regionservers out on the cluster.
   */
  private final Map<ServerName, AdminService.BlockingInterface> rsAdmins =
    new HashMap<ServerName, AdminService.BlockingInterface>();

  /**
   * List of region servers that should not get any more new regions.
   */
  private final ArrayList<ServerName> drainingServers =
    new ArrayList<ServerName>();

  private final Server master;
  private final MasterServices services;
  private final ClusterConnection connection;

  private final DeadServer deadservers = new DeadServer();

  private final long maxSkew;
  private final long warningSkew;

  private final RetryCounterFactory pingRetryCounterFactory;

  /**
   * Set of region servers which are dead but not processed immediately. If a
   * server died before the master enabled the ServerShutdownHandler, the server
   * is added to this set and is processed later through
   * {@link ServerManager#processQueuedDeadServers()}.
   * <p>
   * A dead server is a server instance known to be dead, not listed in the
   * /hbase/rs znode any more. It may not have been submitted to the
   * ServerShutdownHandler yet because the handler is not enabled.
   */
  private Set<ServerName> queuedDeadServers = new HashSet<ServerName>();

  /**
   * Set of region servers which are dead and submitted to the ServerShutdownHandler but not
   * fully processed immediately.
   * <p>
   * If a server died before the assignment manager finished the failover cleanup, the server
   * is added to this map and is processed later through
   * {@link ServerManager#processQueuedDeadServers()}.
   * <p>
   * The Boolean value indicates whether log splitting is needed inside the
   * ServerShutdownHandler.
   */
  private Map<ServerName, Boolean> requeuedDeadServers
    = new ConcurrentHashMap<ServerName, Boolean>();

  /** Listeners that are called on server events. */
  private List<ServerListener> listeners = new CopyOnWriteArrayList<ServerListener>();

  /**
   * Constructor.
   * @param master the master hosting this manager
   * @param services services provided by the master
   * @throws IOException if the cluster connection cannot be created
   */
  public ServerManager(final Server master, final MasterServices services)
      throws IOException {
    this(master, services, true);
  }

  ServerManager(final Server master, final MasterServices services,
      final boolean connect) throws IOException {
    this.master = master;
    this.services = services;
    Configuration c = master.getConfiguration();
    maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
    warningSkew = c.getLong("hbase.master.warningclockskew", 10000);
    this.connection = connect ? (ClusterConnection)ConnectionFactory.createConnection(c) : null;
    int pingMaxAttempts = Math.max(1, master.getConfiguration().getInt(
      "hbase.master.maximum.ping.server.attempts", 10));
    int pingSleepInterval = Math.max(1, master.getConfiguration().getInt(
      "hbase.master.ping.server.retry.sleep.interval", 100));
    this.pingRetryCounterFactory = new RetryCounterFactory(pingMaxAttempts, pingSleepInterval);
  }

  /**
   * Add the listener to the notification list.
   * @param listener The ServerListener to register
   */
  public void registerListener(final ServerListener listener) {
    this.listeners.add(listener);
  }

  /**
   * Remove the listener from the notification list.
   * @param listener The ServerListener to unregister
   */
  public boolean unregisterListener(final ServerListener listener) {
    return this.listeners.remove(listener);
  }

  /**
   * Let the server manager know a new regionserver has come online.
   * @param ia The remote address
   * @param port The remote port
   * @param serverStartcode the startcode reported by the region server
   * @param serverCurrentTime The current time of the region server in ms
   * @return The ServerName we know this server as.
   * @throws IOException
   */
  ServerName regionServerStartup(final InetAddress ia, final int port,
      final long serverStartcode, long serverCurrentTime)
      throws IOException {
    // A regionserver that was quickly restarted, or whose death we are still
    // processing, may check in again with the same host and port but a new
    // startcode.  Verify its clock, reject it if it is on the dead list, and
    // then record it, expiring any stale instance with the same host and port.
    ServerName sn = ServerName.valueOf(ia.getHostName(), port, serverStartcode);
    checkClockSkew(sn, serverCurrentTime);
    checkIsDead(sn, "STARTUP");
    if (!checkAndRecordNewServer(sn, ServerLoad.EMPTY_SERVERLOAD)) {
      LOG.warn("THIS SHOULD NOT HAPPEN, RegionServerStartup"
        + " could not record the server: " + sn);
    }
    return sn;
  }

  /**
   * Updates the last flushed sequence ids for the regions on server sn.
   * @param sn the reporting server
   * @param hsl the load the server reported
   */
  private void updateLastFlushedSequenceIds(ServerName sn, ServerLoad hsl) {
    Map<byte[], RegionLoad> regionsLoad = hsl.getRegionsLoad();
    for (Entry<byte[], RegionLoad> entry : regionsLoad.entrySet()) {
      byte[] encodedRegionName = Bytes.toBytes(HRegionInfo.encodeRegionName(entry.getKey()));
      Long existingValue = flushedSequenceIdByRegion.get(encodedRegionName);
      long l = entry.getValue().getCompleteSequenceId();
      if (existingValue != null) {
        if (l != -1 && l < existingValue) {
          LOG.warn("RegionServer " + sn +
              " indicates a last flushed sequence id (" + l +
              ") that is less than the previous last flushed sequence id (" +
              existingValue + ") for region " +
              Bytes.toString(entry.getKey()) + ". Ignoring.");
          continue;
        }
      }
      flushedSequenceIdByRegion.put(encodedRegionName, l);
    }
  }

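  /**
   * Handles a regionserver's periodic load report.  Refreshes the recorded load
   * for a known server, or registers the server if it is not yet known (e.g. a
   * master joining an already-running cluster), then records the last flushed
   * sequence ids it reports.
   * @throws YouAreDeadException if the server is on the dead-server list
   */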
  void regionServerReport(ServerName sn,
      ServerLoad sl) throws YouAreDeadException {
    checkIsDead(sn, "REPORT");
    if (null == this.onlineServers.replace(sn, sl)) {
      // The report is from a server we do not have registered under this exact
      // ServerName (host, port and startcode).  Presume a master joining an
      // already-running cluster and record it as a new server.
      if (!checkAndRecordNewServer(sn, sl)) {
        LOG.info("RegionServerReport ignored, could not record the server: " + sn);
        return;
      }
    }
    updateLastFlushedSequenceIds(sn, sl);
  }

  /**
   * Checks if a server with the same host and port already exists; if not, or if
   * the existing one has a smaller start code, records the new server.
   * @param serverName the server to check and record
   * @param sl the server load on the server
   * @return true if the server is recorded, otherwise false
   */
  boolean checkAndRecordNewServer(
      final ServerName serverName, final ServerLoad sl) {
    ServerName existingServer = null;
    synchronized (this.onlineServers) {
      existingServer = findServerWithSameHostnamePortWithLock(serverName);
      if (existingServer != null && (existingServer.getStartcode() > serverName.getStartcode())) {
        LOG.info("Server serverName=" + serverName + " rejected; we already have "
            + existingServer.toString() + " registered with same hostname and port");
        return false;
      }
      recordNewServerWithLock(serverName, sl);
    }

    // Tell our listeners that a server was added
    if (!this.listeners.isEmpty()) {
      for (ServerListener listener : this.listeners) {
        listener.serverAdded(serverName);
      }
    }

    // If the existing server has a smaller startcode, it is a stale instance of
    // the same host and port; trigger its expiration.  Equal startcodes are
    // assumed to be the same server instance, so nothing is done in that case.
    if (existingServer != null && (existingServer.getStartcode() < serverName.getStartcode())) {
      LOG.info("Triggering server recovery; existingServer " +
          existingServer + " looks stale, new server:" + serverName);
      expireServer(existingServer);
    }
    return true;
  }

  /**
   * Checks the clock skew between the server and the master. If the skew exceeds
   * the configured maximum, an exception is thrown; if it exceeds the configured
   * warning threshold, a warning is logged but the server is allowed to start.
   * @param serverName the incoming server's name
   * @param serverCurrentTime the current time of the region server, in ms
   * @throws ClockOutOfSyncException if the skew exceeds the configured max value
   */
  private void checkClockSkew(final ServerName serverName, final long serverCurrentTime)
      throws ClockOutOfSyncException {
    long skew = Math.abs(System.currentTimeMillis() - serverCurrentTime);
    if (skew > maxSkew) {
      String message = "Server " + serverName + " has been " +
        "rejected; Reported time is too far out of sync with master. " +
        "Time difference of " + skew + "ms > max allowed of " + maxSkew + "ms";
      LOG.warn(message);
      throw new ClockOutOfSyncException(message);
    } else if (skew > warningSkew){
      String message = "Reported time for server " + serverName + " is out of sync with master " +
        "by " + skew + "ms. (Warning threshold is " + warningSkew + "ms; " +
        "error threshold is " + maxSkew + "ms)";
      LOG.warn(message);
    }
  }

  /**
   * If this server is on the dead list, reject it with a YouAreDeadException.
   * If it was dead but came back with a new start code, remove the old entry
   * from the dead list.
   * @param serverName the server to check
   * @param what STARTUP or REPORT
   * @throws YouAreDeadException
   */
  private void checkIsDead(final ServerName serverName, final String what)
      throws YouAreDeadException {
    if (this.deadservers.isDeadServer(serverName)) {
      // Host name, port and start code all match an existing entry on the dead
      // servers list, so this server must be dead.
      String message = "Server " + what + " rejected; currently processing " +
          serverName + " as dead server";
      LOG.debug(message);
      throw new YouAreDeadException(message);
    }
    // Remove any dead server with the same hostname and port as the newly
    // checking-in regionserver, once the master is initialized.
    if ((this.services == null || ((HMaster) this.services).isInitialized())
        && this.deadservers.cleanPreviousInstance(serverName)) {
      // This server has now become alive after we marked it as dead.
      // We removed its previous entry from the dead list to reflect that.
      LOG.debug(what + ":" + " Server " + serverName + " came back up," +
          " removed it from the dead servers list");
    }
  }

  /**
   * Assumes onlineServers is locked.
   * @return the online ServerName with a matching hostname and port, or null
   */
  private ServerName findServerWithSameHostnamePortWithLock(
      final ServerName serverName) {
    for (ServerName sn: this.onlineServers.keySet()) {
      if (ServerName.isSameHostnameAndPort(serverName, sn)) return sn;
    }
    return null;
  }

  /**
   * Adds the server to the onlineServers map. onlineServers should be locked.
   * @param serverName The remote server's name.
   * @param sl the load reported by the server
   */
  @VisibleForTesting
  void recordNewServerWithLock(final ServerName serverName, final ServerLoad sl) {
    LOG.info("Registering server=" + serverName);
    this.onlineServers.put(serverName, sl);
    this.rsAdmins.remove(serverName);
  }

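  /**
   * @param encodedRegionName the encoded name of the region
   * @return the last flushed sequence id recorded for the region, or -1 if no
   *   server has reported one yet
   */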
  public long getLastFlushedSequenceId(byte[] encodedRegionName) {
    long seqId = -1L;
    if (flushedSequenceIdByRegion.containsKey(encodedRegionName)) {
      seqId = flushedSequenceIdByRegion.get(encodedRegionName);
    }
    return seqId;
  }

  /**
   * @param serverName the server to look up
   * @return ServerLoad if serverName is known, else null
   */
  public ServerLoad getLoad(final ServerName serverName) {
    return this.onlineServers.get(serverName);
  }

  /**
   * Compute the average load across all region servers.
   * Currently, this uses a very naive computation - just uses the number of
   * regions being served, ignoring stats about number of requests.
   * @return the average load
   */
  public double getAverageLoad() {
    int totalLoad = 0;
    int numServers = 0;
    for (ServerLoad sl: this.onlineServers.values()) {
      numServers++;
      totalLoad += sl.getNumberOfRegions();
    }
    return numServers == 0 ? 0 :
      (double)totalLoad / (double)numServers;
  }

  /** @return the count of active regionservers */
  public int countOfRegionServers() {
    // Presumes onlineServers is a concurrent map
    return this.onlineServers.size();
  }

  /**
   * @return Read-only map of servers to server load
   */
  public Map<ServerName, ServerLoad> getOnlineServers() {
    // Presumption is that iterating the returned Map is OK.
    synchronized (this.onlineServers) {
      return Collections.unmodifiableMap(this.onlineServers);
    }
  }

  /** @return the tracker of dead servers and servers still being processed as dead */
  public DeadServer getDeadServers() {
    return this.deadservers;
  }

  /**
   * Checks if any dead servers are currently in progress.
   * @return true if any RS are being processed as dead, false if not
   */
  public boolean areDeadServersInProgress() {
    return this.deadservers.areDeadServersInProgress();
  }

  void letRegionServersShutdown() {
    long previousLogTime = 0;
    ServerName sn = master.getServerName();
    ZooKeeperWatcher zkw = master.getZooKeeper();
    int onlineServersCt;
    while ((onlineServersCt = onlineServers.size()) > 0){
      // Log the remaining servers at most once a second.
      if (System.currentTimeMillis() > (previousLogTime + 1000)) {
        Set<ServerName> remainingServers = onlineServers.keySet();
        synchronized (onlineServers) {
          if (remainingServers.size() == 1 && remainingServers.contains(sn)) {
            // Only the master itself is left; it will delete itself later.
            return;
          }
        }
        StringBuilder sb = new StringBuilder();
        // It's ok here to not sync on onlineServers - merely logging
        for (ServerName key : remainingServers) {
          if (sb.length() > 0) {
            sb.append(", ");
          }
          sb.append(key);
        }
        LOG.info("Waiting on regionserver(s) to go down " + sb.toString());
        previousLogTime = System.currentTimeMillis();
      }

      try {
        List<String> servers = ZKUtil.listChildrenNoWatch(zkw, zkw.rsZNode);
        if (servers == null || servers.size() == 0 || (servers.size() == 1
            && servers.contains(sn.toString()))) {
          LOG.info("ZK shows there is only the master self online, exiting now");
          // The master could have missed some ZK events; no need to wait more.
          break;
        }
      } catch (KeeperException ke) {
        LOG.warn("Failed to list regionservers", ke);
        // ZK is malfunctioning, don't hang here
        break;
      }
      synchronized (onlineServers) {
        try {
          if (onlineServersCt == onlineServers.size()) onlineServers.wait(100);
        } catch (InterruptedException ignored) {
          // continue
        }
      }
    }
  }

  /**
   * Expire the passed server.  Add it to the list of dead servers and queue a
   * shutdown processing.
   */
  public synchronized void expireServer(final ServerName serverName) {
    if (serverName.equals(master.getServerName())) {
      if (!(master.isAborted() || master.isStopped())) {
        master.stop("We lost our znode?");
      }
      return;
    }
    if (!services.isServerShutdownHandlerEnabled()) {
      LOG.info("Master doesn't enable ServerShutdownHandler during initialization, "
        + "delay expiring server " + serverName);
      this.queuedDeadServers.add(serverName);
      return;
    }
    if (this.deadservers.isDeadServer(serverName)) {
      // TODO: Can this happen?  It shouldn't be online in this case?
      LOG.warn("Expiration of " + serverName +
          " but server shutdown already in progress");
      return;
    }
    synchronized (onlineServers) {
      if (!this.onlineServers.containsKey(serverName)) {
        LOG.warn("Expiration of " + serverName + " but server not online");
      }
      // Remove the server from the known servers list and update load info, BUT
      // add to deadservers first; do this so it shows in the dead servers list
      // even if it is not in the online servers list.
      this.deadservers.add(serverName);
      this.onlineServers.remove(serverName);
      onlineServers.notifyAll();
    }
    this.rsAdmins.remove(serverName);

    // If the cluster is going down, servers are expected to expire; don't
    // process them as dead servers.
    if (this.clusterShutdown) {
      LOG.info("Cluster shutdown set; " + serverName +
        " expired; onlineServers=" + this.onlineServers.size());
      if (this.onlineServers.isEmpty()) {
        master.stop("Cluster shutdown set; onlineServer=0");
      }
      return;
    }

    boolean carryingMeta = services.getAssignmentManager().isCarryingMeta(serverName);
    if (carryingMeta) {
      this.services.getExecutorService().submit(new MetaServerShutdownHandler(this.master,
        this.services, this.deadservers, serverName));
    } else {
      this.services.getExecutorService().submit(new ServerShutdownHandler(this.master,
        this.services, this.deadservers, serverName, true));
    }
    LOG.debug("Added=" + serverName +
      " to dead servers, submitted shutdown handler to be executed meta=" + carryingMeta);

    // Tell our listeners that a server was removed
    if (!this.listeners.isEmpty()) {
      for (ServerListener listener : this.listeners) {
        listener.serverRemoved(serverName);
      }
    }
  }

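  /**
   * Processes the given server as dead without requesting WAL splitting; see
   * {@link #processDeadServer(ServerName, boolean)}.
   */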
  public synchronized void processDeadServer(final ServerName serverName) {
    this.processDeadServer(serverName, false);
  }

  public synchronized void processDeadServer(final ServerName serverName, boolean shouldSplitWal) {
    // When the assignment manager is cleaning up the zookeeper nodes and rebuilding the
    // in-memory region states, region servers could be down. The meta table can and
    // should be re-assigned, and log splitting can be done too. However, it is better to
    // wait until the cleanup is done before re-assigning user regions.
    //
    // We should not wait in the server shutdown handler threads since that can clog
    // them, and the meta table could then not be re-assigned if its hosting server
    // is down. So we queue the servers up here instead.
    if (!services.getAssignmentManager().isFailoverCleanupDone()) {
      requeuedDeadServers.put(serverName, shouldSplitWal);
      return;
    }

    this.deadservers.add(serverName);
    this.services.getExecutorService().submit(
      new ServerShutdownHandler(this.master, this.services, this.deadservers, serverName,
          shouldSplitWal));
  }

  /**
   * Process the servers which died during the master's initialization: servers
   * queued while the ServerShutdownHandler was disabled are expired here, and
   * servers queued while failover cleanup was incomplete are resubmitted for
   * dead-server processing.
   */
  synchronized void processQueuedDeadServers() {
    if (!services.isServerShutdownHandlerEnabled()) {
      LOG.info("Master hasn't enabled ServerShutdownHandler");
    }
    Iterator<ServerName> serverIterator = queuedDeadServers.iterator();
    while (serverIterator.hasNext()) {
      ServerName tmpServerName = serverIterator.next();
      expireServer(tmpServerName);
      serverIterator.remove();
      requeuedDeadServers.remove(tmpServerName);
    }

    if (!services.getAssignmentManager().isFailoverCleanupDone()) {
      LOG.info("AssignmentManager hasn't finished failover cleanup; waiting");
    }

    for(ServerName tmpServerName : requeuedDeadServers.keySet()){
      processDeadServer(tmpServerName, requeuedDeadServers.get(tmpServerName));
    }
    requeuedDeadServers.clear();
  }

  /**
   * Removes the server from the draining-servers list.
   */
  public boolean removeServerFromDrainList(final ServerName sn) {
    // Warn if the server (sn) is not online.  ServerName is of the form:
    // <hostname> , <port> , <startcode>
    if (!this.isServerOnline(sn)) {
      LOG.warn("Server " + sn + " is not currently online. " +
               "Removing from draining list anyway, as requested.");
    }
    // Remove the server from the draining servers list.
    return this.drainingServers.remove(sn);
  }

  /**
   * Adds the server to the draining-servers list so that it gets no new regions.
   */
  public boolean addServerToDrainList(final ServerName sn) {
    // Warn and reject if the server (sn) is not online.  ServerName is of the
    // form: <hostname> , <port> , <startcode>
    if (!this.isServerOnline(sn)) {
      LOG.warn("Server " + sn + " is not currently online. " +
               "Ignoring request to add it to draining list.");
      return false;
    }
    // Add the server to the draining servers list, if it's not already in it.
    if (this.drainingServers.contains(sn)) {
      LOG.warn("Server " + sn + " is already in the draining server list. " +
               "Ignoring request to add it again.");
      return false;
    }
    return this.drainingServers.add(sn);
  }

  /**
   * Sends an OPEN RPC to the specified server to open the specified region.
   * <p>
   * Open should not fail but can if the server just crashed.
   * @param server server to open a region
   * @param region region to open
   * @param versionOfOfflineNode the version that needs to be present in the offline node
   *          when the RS tries to change the state from OFFLINE to other states
   * @param favoredNodes the favored nodes for the region, may be null
   * @return the region opening state reported by the server
   */
  public RegionOpeningState sendRegionOpen(final ServerName server,
      HRegionInfo region, int versionOfOfflineNode, List<ServerName> favoredNodes)
      throws IOException {
    AdminService.BlockingInterface admin = getRsAdmin(server);
    if (admin == null) {
      LOG.warn("Attempting to send OPEN RPC to server " + server.toString() +
        " failed because no RPC connection found to this server");
      return RegionOpeningState.FAILED_OPENING;
    }
    OpenRegionRequest request = RequestConverter.buildOpenRegionRequest(server,
      region, versionOfOfflineNode, favoredNodes,
      (RecoveryMode.LOG_REPLAY == this.services.getMasterFileSystem().getLogRecoveryMode()));
    try {
      OpenRegionResponse response = admin.openRegion(null, request);
      return ResponseConverter.getRegionOpeningState(response);
    } catch (ServiceException se) {
      throw ProtobufUtil.getRemoteException(se);
    }
  }

  /**
   * Sends an OPEN RPC to the specified server to open the specified regions.
   * <p>
   * Open should not fail but can if the server just crashed.
   * @param server server to open the regions
   * @param regionOpenInfos info of the list of regions to open
   * @return a list of region opening states
   */
  public List<RegionOpeningState> sendRegionOpen(ServerName server,
      List<Triple<HRegionInfo, Integer, List<ServerName>>> regionOpenInfos)
      throws IOException {
    AdminService.BlockingInterface admin = getRsAdmin(server);
    if (admin == null) {
      LOG.warn("Attempting to send OPEN RPC to server " + server.toString() +
        " failed because no RPC connection found to this server");
      return null;
    }

    OpenRegionRequest request = RequestConverter.buildOpenRegionRequest(server, regionOpenInfos,
      (RecoveryMode.LOG_REPLAY == this.services.getMasterFileSystem().getLogRecoveryMode()));
    try {
      OpenRegionResponse response = admin.openRegion(null, request);
      return ResponseConverter.getRegionOpeningStateList(response);
    } catch (ServiceException se) {
      throw ProtobufUtil.getRemoteException(se);
    }
  }

  /**
   * Sends a CLOSE RPC to the specified server to close the specified region.
   * <p>
   * A region server could reject the close request because it either does not
   * have the specified region or the region is being split.
   * @param server server to close the region
   * @param region region to close
   * @param versionOfClosingNode the version of the znode to compare when the RS
   *          transitions the znode from CLOSING state
   * @param dest if the region is moved to another server, the destination server; null otherwise
   * @param transitionInZK true if the region state should be transitioned in ZK
   * @return true if the server acknowledged the close, false if not
   * @throws IOException
   */
  public boolean sendRegionClose(ServerName server, HRegionInfo region,
      int versionOfClosingNode, ServerName dest, boolean transitionInZK) throws IOException {
    if (server == null) throw new NullPointerException("Passed server is null");
    AdminService.BlockingInterface admin = getRsAdmin(server);
    if (admin == null) {
      throw new IOException("Attempting to send CLOSE RPC to server " +
        server.toString() + " for region " +
        region.getRegionNameAsString() +
        " failed because no RPC connection found to this server");
    }
    return ProtobufUtil.closeRegion(admin, server, region.getRegionName(),
      versionOfClosingNode, dest, transitionInZK);
  }

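  /**
   * Sends a CLOSE RPC with no destination server, transitioning the znode state
   * in ZK; see {@link #sendRegionClose(ServerName, HRegionInfo, int, ServerName, boolean)}.
   */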
  public boolean sendRegionClose(ServerName server,
      HRegionInfo region, int versionOfClosingNode) throws IOException {
    return sendRegionClose(server, region, versionOfClosingNode, null, true);
  }

  /**
   * Sends a MERGE REGIONS RPC to the specified server to merge the specified
   * regions.
   * <p>
   * A region server could reject the request because it does not have the
   * specified regions.
   * @param server server to merge the regions
   * @param region_a region to merge
   * @param region_b region to merge
   * @param forcible true to do a compulsory merge; otherwise only two adjacent
   *          regions will be merged
   * @throws IOException
   */
  public void sendRegionsMerge(ServerName server, HRegionInfo region_a,
      HRegionInfo region_b, boolean forcible) throws IOException {
    if (server == null)
      throw new NullPointerException("Passed server is null");
    if (region_a == null || region_b == null)
      throw new NullPointerException("Passed region is null");
    AdminService.BlockingInterface admin = getRsAdmin(server);
    if (admin == null) {
      throw new IOException("Attempting to send MERGE REGIONS RPC to server "
          + server.toString() + " for region "
          + region_a.getRegionNameAsString() + ","
          + region_b.getRegionNameAsString()
          + " failed because no RPC connection found to this server");
    }
    ProtobufUtil.mergeRegions(admin, region_a, region_b, forcible);
  }

  /**
   * Check if a region server is reachable and has the expected start code.
   */
  public boolean isServerReachable(ServerName server) {
    if (server == null) throw new NullPointerException("Passed server is null");

    RetryCounter retryCounter = pingRetryCounterFactory.create();
    while (retryCounter.shouldRetry()) {
      synchronized (this.onlineServers) {
        if (this.deadservers.isDeadServer(server)) {
          return false;
        }
      }
      try {
        AdminService.BlockingInterface admin = getRsAdmin(server);
        if (admin != null) {
          ServerInfo info = ProtobufUtil.getServerInfo(admin);
          return info != null && info.hasServerName()
            && server.getStartcode() == info.getServerName().getStartCode();
        }
      } catch (RegionServerStoppedException | ServerNotRunningYetException e) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Couldn't reach " + server, e);
        }
        break;
      } catch (IOException ioe) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Couldn't reach " + server + ", try=" + retryCounter.getAttemptTimes() + " of "
            + retryCounter.getMaxAttempts(), ioe);
        }
        try {
          retryCounter.sleepUntilNextRetry();
        } catch(InterruptedException ie) {
          Thread.currentThread().interrupt();
          break;
        }
      }
    }
    return false;
  }

  /**
   * @param sn the regionserver to connect to
   * @return Admin interface for the remote regionserver named <code>sn</code>
   * @throws IOException
   */
  private AdminService.BlockingInterface getRsAdmin(final ServerName sn)
      throws IOException {
    AdminService.BlockingInterface admin = this.rsAdmins.get(sn);
    if (admin == null) {
      LOG.debug("New admin connection to " + sn.toString());
      if (sn.equals(master.getServerName()) && master instanceof HRegionServer) {
        // The master can also host regions; talk to its local RS services directly.
        admin = ((HRegionServer)master).getRSRpcServices();
      } else {
        admin = this.connection.getAdmin(sn);
      }
      this.rsAdmins.put(sn, admin);
    }
    return admin;
  }

  /**
   * Wait for the region servers to report in.
   * We will wait until one of these conditions is met:
   *  - the master is stopped
   *  - the 'hbase.master.wait.on.regionservers.maxtostart' number of
   *    region servers is reached
   *  - the 'hbase.master.wait.on.regionservers.mintostart' is reached AND
   *    there has been no new region server for
   *    'hbase.master.wait.on.regionservers.interval' time AND
   *    the 'hbase.master.wait.on.regionservers.timeout' is reached
   *
   * @throws InterruptedException
   */
  public void waitForRegionServers(MonitoredTask status)
      throws InterruptedException {
    final long interval = this.master.getConfiguration().
      getLong(WAIT_ON_REGIONSERVERS_INTERVAL, 1500);
    final long timeout = this.master.getConfiguration().
      getLong(WAIT_ON_REGIONSERVERS_TIMEOUT, 4500);
    int defaultMinToStart = 1;
    if (BaseLoadBalancer.tablesOnMaster(master.getConfiguration())) {
      // If we assign regions to the master, we'd like to start
      // at least one other region server so that we don't
      // assign all regions to the master if other region servers
      // don't come up in time.
      defaultMinToStart = 2;
    }
    int minToStart = this.master.getConfiguration().
      getInt(WAIT_ON_REGIONSERVERS_MINTOSTART, defaultMinToStart);
    if (minToStart < 1) {
      LOG.warn(String.format(
        "The value of '%s' (%d) can not be less than 1, ignoring.",
        WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart));
      minToStart = 1;
    }
    int maxToStart = this.master.getConfiguration().
      getInt(WAIT_ON_REGIONSERVERS_MAXTOSTART, Integer.MAX_VALUE);
    if (maxToStart < minToStart) {
      LOG.warn(String.format(
        "The value of '%s' (%d) is set less than '%s' (%d), ignoring.",
        WAIT_ON_REGIONSERVERS_MAXTOSTART, maxToStart,
        WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart));
      maxToStart = Integer.MAX_VALUE;
    }

    long now = System.currentTimeMillis();
    final long startTime = now;
    long slept = 0;
    long lastLogTime = 0;
    long lastCountChange = startTime;
    int count = countOfRegionServers();
    int oldCount = 0;
    while (!this.master.isStopped() && count < maxToStart
        && (lastCountChange+interval > now || timeout > slept || count < minToStart)) {
      // Log some info at every interval time or if there is a change
      if (oldCount != count || lastLogTime+interval < now){
        lastLogTime = now;
        String msg =
          "Waiting for region servers count to settle; currently"+
            " checked in " + count + ", slept for " + slept + " ms," +
            " expecting minimum of " + minToStart + ", maximum of "+ maxToStart+
            ", timeout of "+timeout+" ms, interval of "+interval+" ms.";
        LOG.info(msg);
        status.setStatus(msg);
      }

      // We sleep for some time
      final long sleepTime = 50;
      Thread.sleep(sleepTime);
      now = System.currentTimeMillis();
      slept = now - startTime;

      oldCount = count;
      count = countOfRegionServers();
      if (count != oldCount) {
        lastCountChange = now;
      }
    }

    LOG.info("Finished waiting for region servers count to settle;" +
      " checked in " + count + ", slept for " + slept + " ms," +
      " expecting minimum of " + minToStart + ", maximum of "+ maxToStart+","+
      " master is "+ (this.master.isStopped() ? "stopped.": "running")
    );
  }

  /**
   * @return A copy of the internal list of online servers.
   */
  public List<ServerName> getOnlineServersList() {
    return new ArrayList<ServerName>(this.onlineServers.keySet());
  }

  /**
   * @return A copy of the internal list of draining servers.
   */
  public List<ServerName> getDrainingServersList() {
    return new ArrayList<ServerName>(this.drainingServers);
  }

  /**
   * @return A copy of the internal set of dead-but-not-yet-expired servers.
   */
  Set<ServerName> getDeadNotExpiredServers() {
    return new HashSet<ServerName>(this.queuedDeadServers);
  }

  /**
   * During startup, if we determine it is not a failover, i.e. there are no
   * more WAL files to split, we won't try to recover these dead servers.
   * So we just remove them from the queue. Use caution in calling this.
   */
  void removeRequeuedDeadServers() {
    requeuedDeadServers.clear();
  }

  /**
   * @return A read-only map of dead servers queued for later processing, and
   *   whether their WALs should be split.
   */
  Map<ServerName, Boolean> getRequeuedDeadServers() {
    return Collections.unmodifiableMap(this.requeuedDeadServers);
  }

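  /**
   * @return true if the given server is registered in the online-servers map.
   */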
  public boolean isServerOnline(ServerName serverName) {
    return serverName != null && onlineServers.containsKey(serverName);
  }

  /**
   * Check if a server is known to be dead.  A server can be online,
   * or known to be dead, or unknown to this manager (i.e. not online and
   * not known to be dead either; it is simply not tracked by the
   * master any more, for example, a very old previous instance).
   */
  public synchronized boolean isServerDead(ServerName serverName) {
    return serverName == null || deadservers.isDeadServer(serverName)
      || queuedDeadServers.contains(serverName)
      || requeuedDeadServers.containsKey(serverName);
  }

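  /**
   * Flags that a cluster shutdown has been requested and asks the master to stop.
   */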
  public void shutdownCluster() {
    this.clusterShutdown = true;
    this.master.stop("Cluster shutdown requested");
  }

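  /**
   * @return true if a cluster shutdown has been requested.
   */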
  public boolean isClusterShutdown() {
    return this.clusterShutdown;
  }

  /**
   * Stop the ServerManager.  Currently closes the connection to the master.
   */
  public void stop() {
    if (connection != null) {
      try {
        connection.close();
      } catch (IOException e) {
        LOG.error("Attempt to close connection to master failed", e);
      }
    }
  }

  /**
   * Creates a list of possible destinations for a region. It contains the online
   * servers, but not the draining or dying servers.
   * @param serverToExclude can be null if there is no server to exclude
   */
  public List<ServerName> createDestinationServersList(final ServerName serverToExclude){
    final List<ServerName> destServers = getOnlineServersList();

    if (serverToExclude != null){
      destServers.remove(serverToExclude);
    }

    // Loop through the draining server list and remove them from the server list
    final List<ServerName> drainingServersCopy = getDrainingServersList();
    if (!drainingServersCopy.isEmpty()) {
      for (final ServerName server: drainingServersCopy) {
        destServers.remove(server);
      }
    }

    // Remove the deadNotExpired servers from the server list.
    removeDeadNotExpiredServers(destServers);
    return destServers;
  }

  /**
   * Calls {@link #createDestinationServersList} with no server to exclude.
   */
  public List<ServerName> createDestinationServersList(){
    return createDestinationServersList(null);
  }

  /**
   * Removes the dead-but-not-yet-expired servers from the passed list of servers.
   * This method should be used carefully outside of this class; prefer a
   * higher-level method such as {@link #createDestinationServersList()} over
   * managing your own list.
   */
  void removeDeadNotExpiredServers(List<ServerName> servers) {
    Set<ServerName> deadNotExpiredServersCopy = this.getDeadNotExpiredServers();
    if (!deadNotExpiredServersCopy.isEmpty()) {
      for (ServerName server : deadNotExpiredServersCopy) {
        LOG.debug("Removing dead but not expired server: " + server
          + " from eligible server pool.");
        servers.remove(server);
      }
    }
  }

  /**
   * Clears from the dead-server list any entry with the same host name and port
   * as an online server.
   */
  void clearDeadServersWithSameHostNameAndPortOfOnlineServer() {
    for (ServerName serverName : getOnlineServersList()) {
      deadservers.cleanAllPreviousInstances(serverName);
    }
  }
}