1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package org.apache.hadoop.hbase.master;
21
22 import java.io.IOException;
23 import java.net.InetAddress;
24 import java.util.ArrayList;
25 import java.util.Collections;
26 import java.util.HashMap;
27 import java.util.HashSet;
28 import java.util.Iterator;
29 import java.util.List;
30 import java.util.Map;
31 import java.util.Set;
32 import java.util.concurrent.ConcurrentHashMap;
33
34 import org.apache.commons.logging.Log;
35 import org.apache.commons.logging.LogFactory;
36 import org.apache.hadoop.conf.Configuration;
37 import org.apache.hadoop.hbase.ClockOutOfSyncException;
38 import org.apache.hadoop.hbase.HRegionInfo;
39 import org.apache.hadoop.hbase.HServerAddress;
40 import org.apache.hadoop.hbase.HServerLoad;
41 import org.apache.hadoop.hbase.PleaseHoldException;
42 import org.apache.hadoop.hbase.Server;
43 import org.apache.hadoop.hbase.ServerName;
44 import org.apache.hadoop.hbase.YouAreDeadException;
45 import org.apache.hadoop.hbase.ZooKeeperConnectionException;
46 import org.apache.hadoop.hbase.client.HConnection;
47 import org.apache.hadoop.hbase.client.HConnectionManager;
48 import org.apache.hadoop.hbase.client.RetriesExhaustedException;
49 import org.apache.hadoop.hbase.ipc.HRegionInterface;
50 import org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler;
51 import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler;
52 import org.apache.hadoop.hbase.monitoring.MonitoredTask;
53 import org.apache.hadoop.hbase.regionserver.RegionOpeningState;
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68 public class ServerManager {
69 public static final String WAIT_ON_REGIONSERVERS_MAXTOSTART =
70 "hbase.master.wait.on.regionservers.maxtostart";
71
72 public static final String WAIT_ON_REGIONSERVERS_MINTOSTART =
73 "hbase.master.wait.on.regionservers.mintostart";
74
75 public static final String WAIT_ON_REGIONSERVERS_TIMEOUT =
76 "hbase.master.wait.on.regionservers.timeout";
77
78 public static final String WAIT_ON_REGIONSERVERS_INTERVAL =
79 "hbase.master.wait.on.regionservers.interval";
80
81 private static final Log LOG = LogFactory.getLog(ServerManager.class);
82
83
84 private volatile boolean clusterShutdown = false;
85
86
87 private final Map<ServerName, HServerLoad> onlineServers =
88 new ConcurrentHashMap<ServerName, HServerLoad>();
89
90
91
92
93
94 private final Map<ServerName, HRegionInterface> serverConnections =
95 new HashMap<ServerName, HRegionInterface>();
96
97
98
99
100
101 private final ArrayList<ServerName> drainingServers =
102 new ArrayList<ServerName>();
103
104 private final Server master;
105 private final MasterServices services;
106 private final HConnection connection;
107
108 private final DeadServer deadservers;
109
110 private final long maxSkew;
111 private final long warningSkew;
112
113
114
115
116
117
118
119 private Set<ServerName> deadNotExpiredServers = new HashSet<ServerName>();
120
121
122
123
124
125
126
127 public ServerManager(final Server master, final MasterServices services)
128 throws ZooKeeperConnectionException {
129 this(master, services, true);
130 }
131
132 ServerManager(final Server master, final MasterServices services,
133 final boolean connect) throws ZooKeeperConnectionException {
134 this.master = master;
135 this.services = services;
136 Configuration c = master.getConfiguration();
137 maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
138 warningSkew = c.getLong("hbase.master.warningclockskew", 10000);
139 this.deadservers = new DeadServer();
140 this.connection = connect ? HConnectionManager.getConnection(c) : null;
141 }
142
143
144
145
146
147
148
149
150
151
152 ServerName regionServerStartup(final InetAddress ia, final int port,
153 final long serverStartcode, long serverCurrentTime)
154 throws IOException {
155
156
157
158
159
160
161
162 ServerName sn = new ServerName(ia.getHostName(), port, serverStartcode);
163 checkClockSkew(sn, serverCurrentTime);
164 checkIsDead(sn, "STARTUP");
165 checkAlreadySameHostPort(sn);
166 recordNewServer(sn, HServerLoad.EMPTY_HSERVERLOAD);
167 return sn;
168 }
169
170 void regionServerReport(ServerName sn, HServerLoad hsl)
171 throws YouAreDeadException, PleaseHoldException {
172 checkIsDead(sn, "REPORT");
173 if (!this.onlineServers.containsKey(sn)) {
174
175 checkAlreadySameHostPort(sn);
176
177
178
179
180
181 recordNewServer(sn, hsl);
182 } else {
183 this.onlineServers.put(sn, hsl);
184 }
185 }
186
187
188
189
190
191
192 void checkAlreadySameHostPort(final ServerName serverName)
193 throws PleaseHoldException {
194 ServerName existingServer =
195 ServerName.findServerWithSameHostnamePort(getOnlineServersList(), serverName);
196 if (existingServer != null) {
197 String message = "Server serverName=" + serverName +
198 " rejected; we already have " + existingServer.toString() +
199 " registered with same hostname and port";
200 LOG.info(message);
201 if (existingServer.getStartcode() < serverName.getStartcode()) {
202 LOG.info("Triggering server recovery; existingServer " +
203 existingServer + " looks stale, new server:" + serverName);
204 expireServer(existingServer);
205 }
206 if (services.isServerShutdownHandlerEnabled()) {
207
208 throw new PleaseHoldException(message);
209 }
210 }
211 }
212
213
214
215
216
217
218
219
220
221 private void checkClockSkew(final ServerName serverName, final long serverCurrentTime)
222 throws ClockOutOfSyncException {
223 long skew = System.currentTimeMillis() - serverCurrentTime;
224 if (skew > maxSkew) {
225 String message = "Server " + serverName + " has been " +
226 "rejected; Reported time is too far out of sync with master. " +
227 "Time difference of " + skew + "ms > max allowed of " + maxSkew + "ms";
228 LOG.warn(message);
229 throw new ClockOutOfSyncException(message);
230 } else if (skew > warningSkew){
231 String message = "Reported time for server " + serverName + " is out of sync with master " +
232 "by " + skew + "ms. (Warning threshold is " + warningSkew + "ms; " +
233 "error threshold is " + maxSkew + "ms)";
234 LOG.warn(message);
235 }
236 }
237
238
239
240
241
242
243
244
245
246 private void checkIsDead(final ServerName serverName, final String what)
247 throws YouAreDeadException {
248 if (this.deadservers.isDeadServer(serverName)) {
249
250
251 String message = "Server " + what + " rejected; currently processing " +
252 serverName + " as dead server";
253 LOG.debug(message);
254 throw new YouAreDeadException(message);
255 }
256
257
258
259 if ((this.services == null || ((HMaster) this.services).isInitialized())
260 && this.deadservers.cleanPreviousInstance(serverName)) {
261
262
263 LOG.debug(what + ":" + " Server " + serverName + " came back up," +
264 " removed it from the dead servers list");
265 }
266 }
267
268
269
270
271
272
273 void recordNewServer(final ServerName serverName, final HServerLoad hsl) {
274 LOG.info("Registering server=" + serverName);
275 this.onlineServers.put(serverName, hsl);
276 this.serverConnections.remove(serverName);
277 }
278
279
280
281
282
283 public HServerLoad getLoad(final ServerName serverName) {
284 return this.onlineServers.get(serverName);
285 }
286
287
288
289
290
291
292 public HServerLoad getLoad(final HServerAddress address) {
293 ServerName sn = new ServerName(address.toString(), ServerName.NON_STARTCODE);
294 ServerName actual =
295 ServerName.findServerWithSameHostnamePort(this.getOnlineServersList(), sn);
296 return actual == null? null: getLoad(actual);
297 }
298
299
300
301
302
303
304
305 public double getAverageLoad() {
306 int totalLoad = 0;
307 int numServers = 0;
308 double averageLoad = 0.0;
309 for (HServerLoad hsl: this.onlineServers.values()) {
310 numServers++;
311 totalLoad += hsl.getNumberOfRegions();
312 }
313 averageLoad = (double)totalLoad / (double)numServers;
314 return averageLoad;
315 }
316
317
318 int countOfRegionServers() {
319
320 return this.onlineServers.size();
321 }
322
323
324
325
326 public Map<ServerName, HServerLoad> getOnlineServers() {
327
328 synchronized (this.onlineServers) {
329 return Collections.unmodifiableMap(this.onlineServers);
330 }
331 }
332
333 public Set<ServerName> getDeadServers() {
334 return this.deadservers.clone();
335 }
336
337
338
339
340
341 public boolean areDeadServersInProgress() {
342 return this.deadservers.areDeadServersInProgress();
343 }
344
345 void letRegionServersShutdown() {
346 long previousLogTime = 0;
347 while (!onlineServers.isEmpty()) {
348
349 if (System.currentTimeMillis() > (previousLogTime + 1000)) {
350 StringBuilder sb = new StringBuilder();
351 for (ServerName key : this.onlineServers.keySet()) {
352 if (sb.length() > 0) {
353 sb.append(", ");
354 }
355 sb.append(key);
356 }
357 LOG.info("Waiting on regionserver(s) to go down " + sb.toString());
358 previousLogTime = System.currentTimeMillis();
359 }
360
361 synchronized (onlineServers) {
362 try {
363 onlineServers.wait(100);
364 } catch (InterruptedException ignored) {
365
366 }
367 }
368 }
369 }
370
371
372
373
374
375 public synchronized void expireServer(final ServerName serverName) {
376 if (!services.isServerShutdownHandlerEnabled()) {
377 LOG.info("Master doesn't enable ServerShutdownHandler during initialization, "
378 + "delay expiring server " + serverName);
379 this.deadNotExpiredServers.add(serverName);
380 return;
381 }
382 if (!this.onlineServers.containsKey(serverName)) {
383 LOG.warn("Received expiration of " + serverName +
384 " but server is not currently online");
385 return;
386 }
387 if (this.deadservers.contains(serverName)) {
388
389 LOG.warn("Received expiration of " + serverName +
390 " but server shutdown is already in progress");
391 return;
392 }
393
394
395
396 this.deadservers.add(serverName);
397 this.onlineServers.remove(serverName);
398 synchronized (onlineServers) {
399 onlineServers.notifyAll();
400 }
401 this.serverConnections.remove(serverName);
402
403
404 if (this.clusterShutdown) {
405 LOG.info("Cluster shutdown set; " + serverName +
406 " expired; onlineServers=" + this.onlineServers.size());
407 if (this.onlineServers.isEmpty()) {
408 master.stop("Cluster shutdown set; onlineServer=0");
409 }
410 return;
411 }
412
413 boolean carryingRoot = services.getAssignmentManager().isCarryingRoot(serverName);
414 boolean carryingMeta = services.getAssignmentManager().isCarryingMeta(serverName);
415 if (carryingRoot || carryingMeta) {
416 this.services.getExecutorService().submit(new MetaServerShutdownHandler(this.master,
417 this.services, this.deadservers, serverName, carryingRoot, carryingMeta));
418 } else {
419 this.services.getExecutorService().submit(new ServerShutdownHandler(this.master,
420 this.services, this.deadservers, serverName, true));
421 }
422 LOG.debug("Added=" + serverName +
423 " to dead servers, submitted shutdown handler to be executed, root=" +
424 carryingRoot + ", meta=" + carryingMeta);
425 }
426
427
428
429
430
431
432 synchronized void expireDeadNotExpiredServers() throws IOException {
433 if (!services.isServerShutdownHandlerEnabled()) {
434 throw new IOException("Master hasn't enabled ServerShutdownHandler ");
435 }
436 Iterator<ServerName> serverIterator = deadNotExpiredServers.iterator();
437 while (serverIterator.hasNext()) {
438 expireServer(serverIterator.next());
439 serverIterator.remove();
440 }
441 }
442
443
444
445
446 public boolean removeServerFromDrainList(final ServerName sn) {
447
448
449
450 if (!this.isServerOnline(sn)) {
451 LOG.warn("Server " + sn + " is not currently online. " +
452 "Removing from draining list anyway, as requested.");
453 }
454
455 return this.drainingServers.remove(sn);
456 }
457
458
459
460
461 public boolean addServerToDrainList(final ServerName sn) {
462
463
464
465 if (!this.isServerOnline(sn)) {
466 LOG.warn("Server " + sn + " is not currently online. " +
467 "Ignoring request to add it to draining list.");
468 return false;
469 }
470
471
472 if (this.drainingServers.contains(sn)) {
473 LOG.warn("Server " + sn + " is already in the draining server list." +
474 "Ignoring request to add it again.");
475 return false;
476 }
477 return this.drainingServers.add(sn);
478 }
479
480
481
482
483
484
485
486
487
488
489
490
491
492 public RegionOpeningState sendRegionOpen(final ServerName server,
493 HRegionInfo region, int versionOfOfflineNode)
494 throws IOException {
495 HRegionInterface hri = getServerConnection(server);
496 if (hri == null) {
497 LOG.warn("Attempting to send OPEN RPC to server " + server.toString() +
498 " failed because no RPC connection found to this server");
499 return RegionOpeningState.FAILED_OPENING;
500 }
501 return (versionOfOfflineNode == -1) ? hri.openRegion(region) : hri
502 .openRegion(region, versionOfOfflineNode);
503 }
504
505
506
507
508
509
510
511
512
513 public void sendRegionOpen(ServerName server, List<HRegionInfo> regions)
514 throws IOException {
515 HRegionInterface hri = getServerConnection(server);
516 if (hri == null) {
517 LOG.warn("Attempting to send OPEN RPC to server " + server.toString() +
518 " failed because no RPC connection found to this server");
519 return;
520 }
521 hri.openRegions(regions);
522 }
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537 public boolean sendRegionClose(ServerName server, HRegionInfo region,
538 int versionOfClosingNode) throws IOException {
539 if (server == null) throw new NullPointerException("Passed server is null");
540 HRegionInterface hri = getServerConnection(server);
541 if (hri == null) {
542 throw new IOException("Attempting to send CLOSE RPC to server " +
543 server.toString() + " for region " +
544 region.getRegionNameAsString() +
545 " failed because no RPC connection found to this server");
546 }
547 return hri.closeRegion(region, versionOfClosingNode);
548 }
549
550
551
552
553
554
555
556
557 private HRegionInterface getServerConnection(final ServerName sn)
558 throws IOException {
559 HRegionInterface hri = this.serverConnections.get(sn);
560 if (hri == null) {
561 LOG.debug("New connection to " + sn.toString());
562 hri = this.connection.getHRegionConnection(sn.getHostname(), sn.getPort());
563 this.serverConnections.put(sn, hri);
564 }
565 return hri;
566 }
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581 public void waitForRegionServers(MonitoredTask status)
582 throws InterruptedException {
583 final long interval = this.master.getConfiguration().
584 getLong(WAIT_ON_REGIONSERVERS_INTERVAL, 1500);
585 final long timeout = this.master.getConfiguration().
586 getLong(WAIT_ON_REGIONSERVERS_TIMEOUT, 4500);
587 int minToStart = this.master.getConfiguration().
588 getInt(WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
589 if (minToStart < 1) {
590 LOG.warn(String.format(
591 "The value of '%s' (%d) can not be less than 1, ignoring.",
592 WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart));
593 minToStart = 1;
594 }
595 int maxToStart = this.master.getConfiguration().
596 getInt(WAIT_ON_REGIONSERVERS_MAXTOSTART, Integer.MAX_VALUE);
597 if (maxToStart < minToStart) {
598 LOG.warn(String.format(
599 "The value of '%s' (%d) is set less than '%s' (%d), ignoring.",
600 WAIT_ON_REGIONSERVERS_MAXTOSTART, maxToStart,
601 WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart));
602 maxToStart = Integer.MAX_VALUE;
603 }
604
605 long now = System.currentTimeMillis();
606 final long startTime = now;
607 long slept = 0;
608 long lastLogTime = 0;
609 long lastCountChange = startTime;
610 int count = countOfRegionServers();
611 int oldCount = 0;
612 while (
613 !this.master.isStopped() &&
614 count < maxToStart &&
615 (lastCountChange+interval > now || timeout > slept || count < minToStart)
616 ){
617
618
619 if (oldCount != count || lastLogTime+interval < now){
620 lastLogTime = now;
621 String msg =
622 "Waiting for region servers count to settle; currently"+
623 " checked in " + count + ", slept for " + slept + " ms," +
624 " expecting minimum of " + minToStart + ", maximum of "+ maxToStart+
625 ", timeout of "+timeout+" ms, interval of "+interval+" ms.";
626 LOG.info(msg);
627 status.setStatus(msg);
628 }
629
630
631 final long sleepTime = 50;
632 Thread.sleep(sleepTime);
633 now = System.currentTimeMillis();
634 slept = now - startTime;
635
636 oldCount = count;
637 count = countOfRegionServers();
638 if (count != oldCount) {
639 lastCountChange = now;
640 }
641 }
642
643 LOG.info("Finished waiting for region servers count to settle;" +
644 " checked in " + count + ", slept for " + slept + " ms," +
645 " expecting minimum of " + minToStart + ", maximum of "+ maxToStart+","+
646 " master is "+ (this.master.isStopped() ? "stopped.": "running.")
647 );
648 }
649
650
651
652
653 public List<ServerName> getOnlineServersList() {
654
655
656 return new ArrayList<ServerName>(this.onlineServers.keySet());
657 }
658
659
660
661
662 public List<ServerName> getDrainingServersList() {
663 return new ArrayList<ServerName>(this.drainingServers);
664 }
665
666
667
668
669 Set<ServerName> getDeadNotExpiredServers() {
670 return new HashSet<ServerName>(this.deadNotExpiredServers);
671 }
672
673 public boolean isServerOnline(ServerName serverName) {
674 return onlineServers.containsKey(serverName);
675 }
676
677 public void shutdownCluster() {
678 this.clusterShutdown = true;
679 this.master.stop("Cluster shutdown requested");
680 }
681
682 public boolean isClusterShutdown() {
683 return this.clusterShutdown;
684 }
685
686
687
688
689 public void stop() {
690 if (connection != null) {
691 try {
692 connection.close();
693 } catch (IOException e) {
694 LOG.error("Attempt to close connection to master failed", e);
695 }
696 }
697 }
698
699
700
701
702 void clearDeadServersWithSameHostNameAndPortOfOnlineServer() {
703 ServerName sn = null;
704 for (ServerName serverName : getOnlineServersList()) {
705 while ((sn = ServerName.
706 findServerWithSameHostnamePort(this.deadservers, serverName)) != null) {
707 this.deadservers.remove(sn);
708 }
709 }
710 }
711
712 }