1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package org.apache.hadoop.hbase.master;
21
22 import java.io.IOException;
23 import java.util.ArrayList;
24 import java.util.Collections;
25 import java.util.HashMap;
26 import java.util.List;
27 import java.util.Map;
28 import java.util.Set;
29 import java.util.concurrent.ConcurrentHashMap;
30
31 import org.apache.commons.logging.Log;
32 import org.apache.commons.logging.LogFactory;
33 import org.apache.hadoop.conf.Configuration;
34 import org.apache.hadoop.hbase.ClockOutOfSyncException;
35 import org.apache.hadoop.hbase.HMsg;
36 import org.apache.hadoop.hbase.HRegionInfo;
37 import org.apache.hadoop.hbase.HServerAddress;
38 import org.apache.hadoop.hbase.HServerInfo;
39 import org.apache.hadoop.hbase.HServerLoad;
40 import org.apache.hadoop.hbase.PleaseHoldException;
41 import org.apache.hadoop.hbase.Server;
42 import org.apache.hadoop.hbase.YouAreDeadException;
43 import org.apache.hadoop.hbase.catalog.CatalogTracker;
44 import org.apache.hadoop.hbase.client.HConnection;
45 import org.apache.hadoop.hbase.client.HConnectionManager;
46 import org.apache.hadoop.hbase.client.RetriesExhaustedException;
47 import org.apache.hadoop.hbase.ipc.HRegionInterface;
48 import org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler;
49 import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler;
50 import org.apache.hadoop.hbase.master.metrics.MasterMetrics;
51 import org.apache.hadoop.hbase.regionserver.Leases.LeaseStillHeldException;
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67 public class ServerManager {
68 private static final Log LOG = LogFactory.getLog(ServerManager.class);
69
70
71 private volatile boolean clusterShutdown = false;
72
73
74 private final Map<String, HServerInfo> onlineServers =
75 new ConcurrentHashMap<String, HServerInfo>();
76
77
78
79
80
81 private final Map<String, HRegionInterface> serverConnections =
82 new HashMap<String, HRegionInterface>();
83
84 private final Server master;
85 private final MasterServices services;
86
87
88 private final MasterMetrics metrics;
89
90 private final DeadServer deadservers;
91
92 private final long maxSkew;
93
94
95
96
97
98
99
100 public ServerManager(final Server master, final MasterServices services,
101 MasterMetrics metrics) {
102 this.master = master;
103 this.services = services;
104 this.metrics = metrics;
105 Configuration c = master.getConfiguration();
106 maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
107 this.deadservers =
108 new DeadServer(c.getInt("hbase.master.maxdeadservers", 100));
109 }
110
111
112
113
114
115
116
117 void regionServerStartup(final HServerInfo serverInfo, long serverCurrentTime)
118 throws IOException {
119
120
121
122
123
124
125
126 HServerInfo info = new HServerInfo(serverInfo);
127 checkIsDead(info.getServerName(), "STARTUP");
128 checkAlreadySameHostPort(info);
129 checkClockSkew(info, serverCurrentTime);
130 recordNewServer(info, false, null);
131 }
132
133
134
135
136
137
138 void checkAlreadySameHostPort(final HServerInfo serverInfo)
139 throws PleaseHoldException {
140 String hostAndPort = serverInfo.getServerAddress().toString();
141 HServerInfo existingServer =
142 haveServerWithSameHostAndPortAlready(serverInfo.getHostnamePort());
143 if (existingServer != null) {
144 String message = "Server start rejected; we already have " + hostAndPort +
145 " registered; existingServer=" + existingServer + ", newServer=" + serverInfo;
146 LOG.info(message);
147 if (existingServer.getStartCode() < serverInfo.getStartCode()) {
148 LOG.info("Triggering server recovery; existingServer " +
149 existingServer.getServerName() + " looks stale");
150 expireServer(existingServer);
151 }
152 throw new PleaseHoldException(message);
153 }
154 }
155
156 private HServerInfo haveServerWithSameHostAndPortAlready(final String hostnamePort) {
157 synchronized (this.onlineServers) {
158 for (Map.Entry<String, HServerInfo> e: this.onlineServers.entrySet()) {
159 if (e.getValue().getHostnamePort().equals(hostnamePort)) {
160 return e.getValue();
161 }
162 }
163 }
164 return null;
165 }
166
167
168
169
170
171
172 private void checkClockSkew(final HServerInfo serverInfo,
173 final long serverCurrentTime)
174 throws ClockOutOfSyncException {
175 long skew = System.currentTimeMillis() - serverCurrentTime;
176 if (skew > maxSkew) {
177 String message = "Server " + serverInfo.getServerName() + " has been " +
178 "rejected; Reported time is too far out of sync with master. " +
179 "Time difference of " + skew + "ms > max allowed of " + maxSkew + "ms";
180 LOG.warn(message);
181 throw new ClockOutOfSyncException(message);
182 }
183 }
184
185
186
187
188
189
190
191
192
193 private void checkIsDead(final String serverName, final String what)
194 throws YouAreDeadException {
195 if (this.deadservers.isDeadServer(serverName)) {
196
197
198 String message = "Server " + what + " rejected; currently processing " +
199 serverName + " as dead server";
200 LOG.debug(message);
201 throw new YouAreDeadException(message);
202 }
203
204 if (this.deadservers.cleanPreviousInstance(serverName)) {
205
206
207 LOG.debug("Server " + serverName + " came back up, removed it from the" +
208 " dead servers list");
209 }
210 }
211
212
213
214
215
216
217
218
219 void recordNewServer(HServerInfo info, boolean useInfoLoad,
220 HRegionInterface hri) {
221 HServerLoad load = useInfoLoad? info.getLoad(): new HServerLoad();
222 String serverName = info.getServerName();
223 LOG.info("Registering server=" + serverName + ", regionCount=" +
224 load.getLoad() + ", userLoad=" + useInfoLoad);
225 info.setLoad(load);
226
227
228
229
230
231 this.onlineServers.put(serverName, info);
232 if (hri == null) {
233 serverConnections.remove(serverName);
234 } else {
235 serverConnections.put(serverName, hri);
236 }
237 }
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252 HMsg [] regionServerReport(final HServerInfo serverInfo,
253 final HMsg [] msgs, final HRegionInfo[] mostLoadedRegions)
254 throws IOException {
255
256 HServerInfo info = new HServerInfo(serverInfo);
257
258
259 checkIsDead(info.getServerName(), "REPORT");
260
261
262 HServerInfo storedInfo = this.onlineServers.get(info.getServerName());
263 if (storedInfo == null) {
264
265
266 checkAlreadySameHostPort(info);
267
268
269
270
271
272 recordNewServer(info, true, null);
273
274
275
276
277
278 if (msgs.length > 0)
279 throw new PleaseHoldException("FIX! Putting off " +
280 "message processing because not yet rwady but possible we won't be " +
281 "ready next on next report");
282 }
283
284
285 if (raceThatShouldNotHappenAnymore(storedInfo, info)) {
286 return HMsg.STOP_REGIONSERVER_ARRAY;
287 }
288
289 for (HMsg msg: msgs) {
290 LOG.info("Received " + msg + " from " + serverInfo.getServerName());
291 switch (msg.getType()) {
292 case REGION_SPLIT:
293 this.services.getAssignmentManager().handleSplitReport(serverInfo,
294 msg.getRegionInfo(), msg.getDaughterA(), msg.getDaughterB());
295 break;
296
297 default:
298 LOG.error("Unhandled msg type " + msg);
299 }
300 }
301
302 HMsg [] reply = null;
303 int numservers = countOfRegionServers();
304 if (this.clusterShutdown) {
305 if (numservers <= 2) {
306
307
308
309
310
311 reply = HMsg.STOP_REGIONSERVER_ARRAY;
312 }
313 }
314 return processRegionServerAllsWell(info, mostLoadedRegions, reply);
315 }
316
317 private boolean raceThatShouldNotHappenAnymore(final HServerInfo storedInfo,
318 final HServerInfo reportedInfo) {
319 if (storedInfo.getStartCode() != reportedInfo.getStartCode()) {
320
321
322
323
324
325
326
327 LOG.warn("Race condition detected: " + reportedInfo.getServerName());
328 synchronized (this.onlineServers) {
329 removeServerInfo(reportedInfo.getServerName());
330 notifyOnlineServers();
331 }
332 return true;
333 }
334 return false;
335 }
336
337
338
339
340
341
342
343
344
345 private HMsg[] processRegionServerAllsWell(HServerInfo serverInfo,
346 final HRegionInfo[] mostLoadedRegions, HMsg[] msgs)
347 throws IOException {
348
349 this.onlineServers.put(serverInfo.getServerName(), serverInfo);
350 HServerLoad load = serverInfo.getLoad();
351 if (load != null && this.metrics != null) {
352 this.metrics.incrementRequests(load.getNumberOfRequests());
353 }
354
355 return msgs;
356 }
357
358
359
360
361
362 private boolean removeServerInfo(final String serverName) {
363 HServerInfo info = this.onlineServers.remove(serverName);
364 if (info != null) {
365 return true;
366 }
367 return false;
368 }
369
370
371
372
373
374
375
376 public double getAverageLoad() {
377 int totalLoad = 0;
378 int numServers = 0;
379 double averageLoad = 0.0;
380 for (HServerInfo hsi : onlineServers.values()) {
381 numServers++;
382 totalLoad += hsi.getLoad().getNumberOfRegions();
383 }
384 averageLoad = (double)totalLoad / (double)numServers;
385 return averageLoad;
386 }
387
388
389 int countOfRegionServers() {
390
391 return this.onlineServers.size();
392 }
393
394
395
396
397
398 public HServerInfo getServerInfo(String name) {
399 return this.onlineServers.get(name);
400 }
401
402
403
404
405 public Map<String, HServerInfo> getOnlineServers() {
406
407 synchronized (this.onlineServers) {
408 return Collections.unmodifiableMap(this.onlineServers);
409 }
410 }
411
412 public Set<String> getDeadServers() {
413 return this.deadservers.clone();
414 }
415
416
417
418
419
420 public boolean areDeadServersInProgress() {
421 return this.deadservers.areDeadServersInProgress();
422 }
423
424
425
426
427
428
429 public HServerInfo getHServerInfo(final HServerAddress hsa) {
430 synchronized(this.onlineServers) {
431
432 for (Map.Entry<String, HServerInfo> e: this.onlineServers.entrySet()) {
433 if (e.getValue().getServerAddress().equals(hsa)) {
434 return e.getValue();
435 }
436 }
437 }
438 return null;
439 }
440
441 private void notifyOnlineServers() {
442 synchronized (this.onlineServers) {
443 this.onlineServers.notifyAll();
444 }
445 }
446
447
448
449
450
451
452
453 void letRegionServersShutdown() {
454 synchronized (onlineServers) {
455 while (onlineServers.size() > 0) {
456 StringBuilder sb = new StringBuilder();
457 for (String key: this.onlineServers.keySet()) {
458 if (sb.length() > 0) {
459 sb.append(", ");
460 }
461 sb.append(key);
462 }
463 LOG.info("Waiting on regionserver(s) to go down " + sb.toString());
464 try {
465 this.onlineServers.wait(1000);
466 } catch (InterruptedException e) {
467
468 }
469 }
470 }
471 }
472
473
474
475
476
477 public synchronized void expireServer(final HServerInfo hsi) {
478
479
480 String serverName = hsi.getServerName();
481 HServerInfo info = this.onlineServers.get(serverName);
482 if (info == null) {
483 LOG.warn("Received expiration of " + hsi.getServerName() +
484 " but server is not currently online");
485 return;
486 }
487 if (this.deadservers.contains(serverName)) {
488
489 LOG.warn("Received expiration of " + hsi.getServerName() +
490 " but server shutdown is already in progress");
491 return;
492 }
493
494
495
496 this.deadservers.add(serverName);
497 this.onlineServers.remove(serverName);
498 this.serverConnections.remove(serverName);
499
500
501 if (this.clusterShutdown) {
502 LOG.info("Cluster shutdown set; " + hsi.getServerName() +
503 " expired; onlineServers=" + this.onlineServers.size());
504 if (this.onlineServers.isEmpty()) {
505 master.stop("Cluster shutdown set; onlineServer=0");
506 }
507 return;
508 }
509 CatalogTracker ct = this.master.getCatalogTracker();
510
511 boolean carryingRoot;
512 try {
513 HServerAddress address = ct.getRootLocation();
514 carryingRoot = address != null &&
515 hsi.getServerAddress().equals(address);
516 } catch (InterruptedException e) {
517 Thread.currentThread().interrupt();
518 LOG.info("Interrupted");
519 return;
520 }
521
522
523
524
525
526 HServerAddress address = ct.getMetaLocation();
527 boolean carryingMeta =
528 address != null && hsi.getServerAddress().equals(address);
529 if (carryingRoot || carryingMeta) {
530 this.services.getExecutorService().submit(new MetaServerShutdownHandler(this.master,
531 this.services, this.deadservers, info, carryingRoot, carryingMeta));
532 } else {
533 this.services.getExecutorService().submit(new ServerShutdownHandler(this.master,
534 this.services, this.deadservers, info));
535 }
536 LOG.debug("Added=" + serverName +
537 " to dead servers, submitted shutdown handler to be executed, root=" +
538 carryingRoot + ", meta=" + carryingMeta);
539 }
540
541
542
543
544
545
546
547
548
549
550
551 public void sendRegionOpen(HServerInfo server, HRegionInfo region)
552 throws IOException {
553 HRegionInterface hri = getServerConnection(server);
554 if (hri == null) {
555 LOG.warn("Attempting to send OPEN RPC to server " + server.getServerName()
556 + " failed because no RPC connection found to this server");
557 return;
558 }
559 hri.openRegion(region);
560 }
561
562
563
564
565
566
567
568
569
570 public void sendRegionOpen(HServerInfo server, List<HRegionInfo> regions)
571 throws IOException {
572 HRegionInterface hri = getServerConnection(server);
573 if (hri == null) {
574 LOG.warn("Attempting to send OPEN RPC to server " + server.getServerName()
575 + " failed because no RPC connection found to this server");
576 return;
577 }
578 hri.openRegions(regions);
579 }
580
581
582
583
584
585
586
587
588
589
590
591 public boolean sendRegionClose(HServerInfo server, HRegionInfo region)
592 throws IOException {
593 if (server == null) throw new NullPointerException("Passed server is null");
594 HRegionInterface hri = getServerConnection(server);
595 if (hri == null) {
596 throw new IOException("Attempting to send CLOSE RPC to server " +
597 server.getServerName() + " for region " +
598 region.getRegionNameAsString() +
599 " failed because no RPC connection found to this server");
600 }
601 return hri.closeRegion(region);
602 }
603
604
605
606
607
608
609
610
611 private HRegionInterface getServerConnection(HServerInfo info)
612 throws IOException {
613 HConnection connection =
614 HConnectionManager.getConnection(this.master.getConfiguration());
615 HRegionInterface hri = serverConnections.get(info.getServerName());
616 if (hri == null) {
617 LOG.debug("New connection to " + info.getServerName());
618 hri = connection.getHRegionConnection(info.getServerAddress(), false);
619 this.serverConnections.put(info.getServerName(), hri);
620 }
621 return hri;
622 }
623
624
625
626
627
628
629 public int waitForRegionServers()
630 throws InterruptedException {
631 long interval = this.master.getConfiguration().
632 getLong("hbase.master.wait.on.regionservers.interval", 1500);
633 long timeout = this.master.getConfiguration().
634 getLong("hbase.master.wait.on.regionservers.timeout", 4500);
635 int minToStart = this.master.getConfiguration().
636 getInt("hbase.master.wait.on.regionservers.mintostart", 1);
637 int maxToStart = this.master.getConfiguration().
638 getInt("hbase.master.wait.on.regionservers.maxtostart", Integer.MAX_VALUE);
639
640
641 int count = 0;
642 long slept = 0;
643 for (int oldcount = countOfRegionServers(); !this.master.isStopped();) {
644 Thread.sleep(interval);
645 slept += interval;
646 count = countOfRegionServers();
647 if (count == oldcount && count >= minToStart && slept >= timeout) {
648 LOG.info("Finished waiting for regionserver count to settle; " +
649 "count=" + count + ", sleptFor=" + slept);
650 break;
651 }
652 if (count >= maxToStart) {
653 LOG.info("At least the max configured number of regionserver(s) have " +
654 "checked in: " + count);
655 break;
656 }
657 if (count == 0) {
658 LOG.info("Waiting on regionserver(s) to checkin");
659 } else {
660 LOG.info("Waiting on regionserver(s) count to settle; currently=" + count);
661 }
662 oldcount = count;
663 }
664
665
666
667
668
669 int regionCount = 0;
670 for (Map.Entry<String, HServerInfo> e: this.onlineServers.entrySet()) {
671 HServerLoad load = e.getValue().getLoad();
672 if (load != null) regionCount += load.getLoad();
673 }
674 LOG.info("Exiting wait on regionserver(s) to checkin; count=" + count +
675 ", stopped=" + this.master.isStopped() +
676 ", count of regions out on cluster=" + regionCount);
677 return regionCount;
678 }
679
680
681
682
683 public List<HServerInfo> getOnlineServersList() {
684
685 return new ArrayList<HServerInfo>(onlineServers.values());
686 }
687
688 public boolean isServerOnline(String serverName) {
689 return onlineServers.containsKey(serverName);
690 }
691
692 public void shutdownCluster() {
693 this.clusterShutdown = true;
694 this.master.stop("Cluster shutdown requested");
695 }
696
697 public boolean isClusterShutdown() {
698 return this.clusterShutdown;
699 }
700
701
702
703
704 public void stop() {
705
706 }
707 }