1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package org.apache.hadoop.hbase.master;
21
22 import java.io.IOException;
23 import java.util.ArrayList;
24 import java.util.Collections;
25 import java.util.HashMap;
26 import java.util.List;
27 import java.util.Map;
28 import java.util.Set;
29 import java.util.concurrent.ConcurrentHashMap;
30
31 import org.apache.commons.logging.Log;
32 import org.apache.commons.logging.LogFactory;
33 import org.apache.hadoop.conf.Configuration;
34 import org.apache.hadoop.hbase.ClockOutOfSyncException;
35 import org.apache.hadoop.hbase.HMsg;
36 import org.apache.hadoop.hbase.HRegionInfo;
37 import org.apache.hadoop.hbase.HServerAddress;
38 import org.apache.hadoop.hbase.HServerInfo;
39 import org.apache.hadoop.hbase.HServerLoad;
40 import org.apache.hadoop.hbase.PleaseHoldException;
41 import org.apache.hadoop.hbase.Server;
42 import org.apache.hadoop.hbase.YouAreDeadException;
43 import org.apache.hadoop.hbase.catalog.CatalogTracker;
44 import org.apache.hadoop.hbase.client.HConnection;
45 import org.apache.hadoop.hbase.client.HConnectionManager;
46 import org.apache.hadoop.hbase.client.RetriesExhaustedException;
47 import org.apache.hadoop.hbase.ipc.HRegionInterface;
48 import org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler;
49 import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler;
50 import org.apache.hadoop.hbase.master.metrics.MasterMetrics;
51 import org.apache.hadoop.hbase.regionserver.Leases.LeaseStillHeldException;
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67 public class ServerManager {
private static final Log LOG = LogFactory.getLog(ServerManager.class);

// Set to true by shutdownCluster(); consulted when answering reports and
// when expiring servers.
private volatile boolean clusterShutdown = false;

// Servers currently registered with this manager, keyed by server name.
// The map object also serves as the monitor for wait/notify in
// letRegionServersShutdown() and notifyOnlineServers().
private final Map<String, HServerInfo> onlineServers =
  new ConcurrentHashMap<String, HServerInfo>();

// Cached RPC handles to region servers, keyed by server name.
// NOTE(review): plain HashMap touched from several methods that do not share
// a common lock -- confirm callers serialize access.
private final Map<String, HRegionInterface> serverConnections =
  new HashMap<String, HRegionInterface>();

private final Server master;
private final MasterServices services;

// Null-checked before use in processRegionServerAllsWell().
private final MasterMetrics metrics;

private final DeadServer deadservers;

// Largest tolerated clock difference in milliseconds, read from
// "hbase.master.maxclockskew" in the constructor.
private final long maxSkew;
93
94
95
96
97
98
99
100 public ServerManager(final Server master, final MasterServices services,
101 MasterMetrics metrics) {
102 this.master = master;
103 this.services = services;
104 this.metrics = metrics;
105 Configuration c = master.getConfiguration();
106 maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
107 this.deadservers =
108 new DeadServer(c.getInt("hbase.master.maxdeadservers", 100));
109 }
110
111
112
113
114
115
116
117 void regionServerStartup(final HServerInfo serverInfo, long serverCurrentTime)
118 throws IOException {
119
120
121
122
123
124
125
126 HServerInfo info = new HServerInfo(serverInfo);
127 checkIsDead(info.getServerName(), "STARTUP");
128 checkAlreadySameHostPort(info);
129 checkClockSkew(info, serverCurrentTime);
130 recordNewServer(info, false, null);
131 }
132
133
134
135
136
137
138 void checkAlreadySameHostPort(final HServerInfo serverInfo)
139 throws PleaseHoldException {
140 String hostAndPort = serverInfo.getServerAddress().toString();
141 HServerInfo existingServer =
142 haveServerWithSameHostAndPortAlready(serverInfo.getHostnamePort());
143 if (existingServer != null) {
144 String message = "Server start rejected; we already have " + hostAndPort +
145 " registered; existingServer=" + existingServer + ", newServer=" + serverInfo;
146 LOG.info(message);
147 if (existingServer.getStartCode() < serverInfo.getStartCode()) {
148 LOG.info("Triggering server recovery; existingServer " +
149 existingServer.getServerName() + " looks stale");
150 expireServer(existingServer);
151 }
152 throw new PleaseHoldException(message);
153 }
154 }
155
156 private HServerInfo haveServerWithSameHostAndPortAlready(final String hostnamePort) {
157 synchronized (this.onlineServers) {
158 for (Map.Entry<String, HServerInfo> e: this.onlineServers.entrySet()) {
159 if (e.getValue().getHostnamePort().equals(hostnamePort)) {
160 return e.getValue();
161 }
162 }
163 }
164 return null;
165 }
166
167
168
169
170
171
172 private void checkClockSkew(final HServerInfo serverInfo,
173 final long serverCurrentTime)
174 throws ClockOutOfSyncException {
175 long skew = System.currentTimeMillis() - serverCurrentTime;
176 if (skew > maxSkew) {
177 String message = "Server " + serverInfo.getServerName() + " has been " +
178 "rejected; Reported time is too far out of sync with master. " +
179 "Time difference of " + skew + "ms > max allowed of " + maxSkew + "ms";
180 LOG.warn(message);
181 throw new ClockOutOfSyncException(message);
182 }
183 }
184
185
186
187
188
189
190
191 private void checkIsDead(final String serverName, final String what)
192 throws YouAreDeadException {
193 if (!this.deadservers.isDeadServer(serverName)) return;
194 String message = "Server " + what + " rejected; currently processing " +
195 serverName + " as dead server";
196 LOG.debug(message);
197 throw new YouAreDeadException(message);
198 }
199
200
201
202
203
204
205
206
207 void recordNewServer(HServerInfo info, boolean useInfoLoad,
208 HRegionInterface hri) {
209 HServerLoad load = useInfoLoad? info.getLoad(): new HServerLoad();
210 String serverName = info.getServerName();
211 LOG.info("Registering server=" + serverName + ", regionCount=" +
212 load.getLoad() + ", userLoad=" + useInfoLoad);
213 info.setLoad(load);
214
215
216
217
218
219 this.onlineServers.put(serverName, info);
220 if (hri == null) {
221 serverConnections.remove(serverName);
222 } else {
223 serverConnections.put(serverName, hri);
224 }
225 }
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240 HMsg [] regionServerReport(final HServerInfo serverInfo,
241 final HMsg [] msgs, final HRegionInfo[] mostLoadedRegions)
242 throws IOException {
243
244 HServerInfo info = new HServerInfo(serverInfo);
245
246
247 checkIsDead(info.getServerName(), "REPORT");
248
249
250 HServerInfo storedInfo = this.onlineServers.get(info.getServerName());
251 if (storedInfo == null) {
252
253
254 checkAlreadySameHostPort(info);
255
256
257
258
259
260 recordNewServer(info, true, null);
261
262
263
264
265
266 if (msgs.length > 0)
267 throw new PleaseHoldException("FIX! Putting off " +
268 "message processing because not yet rwady but possible we won't be " +
269 "ready next on next report");
270 }
271
272
273 if (raceThatShouldNotHappenAnymore(storedInfo, info)) {
274 return HMsg.STOP_REGIONSERVER_ARRAY;
275 }
276
277 for (HMsg msg: msgs) {
278 LOG.info("Received " + msg + " from " + serverInfo.getServerName());
279 switch (msg.getType()) {
280 case REGION_SPLIT:
281 this.services.getAssignmentManager().handleSplitReport(serverInfo,
282 msg.getRegionInfo(), msg.getDaughterA(), msg.getDaughterB());
283 break;
284
285 default:
286 LOG.error("Unhandled msg type " + msg);
287 }
288 }
289
290 HMsg [] reply = null;
291 int numservers = countOfRegionServers();
292 if (this.clusterShutdown) {
293 if (numservers <= 2) {
294
295
296
297
298
299 reply = HMsg.STOP_REGIONSERVER_ARRAY;
300 }
301 }
302 return processRegionServerAllsWell(info, mostLoadedRegions, reply);
303 }
304
305 private boolean raceThatShouldNotHappenAnymore(final HServerInfo storedInfo,
306 final HServerInfo reportedInfo) {
307 if (storedInfo.getStartCode() != reportedInfo.getStartCode()) {
308
309
310
311
312
313
314
315 LOG.warn("Race condition detected: " + reportedInfo.getServerName());
316 synchronized (this.onlineServers) {
317 removeServerInfo(reportedInfo.getServerName());
318 notifyOnlineServers();
319 }
320 return true;
321 }
322 return false;
323 }
324
325
326
327
328
329
330
331
332
333 private HMsg[] processRegionServerAllsWell(HServerInfo serverInfo,
334 final HRegionInfo[] mostLoadedRegions, HMsg[] msgs)
335 throws IOException {
336
337 this.onlineServers.put(serverInfo.getServerName(), serverInfo);
338 HServerLoad load = serverInfo.getLoad();
339 if (load != null && this.metrics != null) {
340 this.metrics.incrementRequests(load.getNumberOfRequests());
341 }
342
343 return msgs;
344 }
345
346
347
348
349
350 private boolean removeServerInfo(final String serverName) {
351 HServerInfo info = this.onlineServers.remove(serverName);
352 if (info != null) {
353 return true;
354 }
355 return false;
356 }
357
358
359
360
361
362
363
364 public double getAverageLoad() {
365 int totalLoad = 0;
366 int numServers = 0;
367 double averageLoad = 0.0;
368 for (HServerInfo hsi : onlineServers.values()) {
369 numServers++;
370 totalLoad += hsi.getLoad().getNumberOfRegions();
371 }
372 averageLoad = (double)totalLoad / (double)numServers;
373 return averageLoad;
374 }
375
376
377 int countOfRegionServers() {
378
379 return this.onlineServers.size();
380 }
381
382
383
384
385
386 public HServerInfo getServerInfo(String name) {
387 return this.onlineServers.get(name);
388 }
389
390
391
392
393 public Map<String, HServerInfo> getOnlineServers() {
394
395 synchronized (this.onlineServers) {
396 return Collections.unmodifiableMap(this.onlineServers);
397 }
398 }
399
400 public Set<String> getDeadServers() {
401 return this.deadservers.clone();
402 }
403
404
405
406
407
408 public boolean areDeadServersInProgress() {
409 return this.deadservers.areDeadServersInProgress();
410 }
411
412
413
414
415
416
417 public HServerInfo getHServerInfo(final HServerAddress hsa) {
418 synchronized(this.onlineServers) {
419
420 for (Map.Entry<String, HServerInfo> e: this.onlineServers.entrySet()) {
421 if (e.getValue().getServerAddress().equals(hsa)) {
422 return e.getValue();
423 }
424 }
425 }
426 return null;
427 }
428
429 private void notifyOnlineServers() {
430 synchronized (this.onlineServers) {
431 this.onlineServers.notifyAll();
432 }
433 }
434
435
436
437
438
439
440
441 void letRegionServersShutdown() {
442 synchronized (onlineServers) {
443 while (onlineServers.size() > 0) {
444 StringBuilder sb = new StringBuilder();
445 for (String key: this.onlineServers.keySet()) {
446 if (sb.length() > 0) {
447 sb.append(", ");
448 }
449 sb.append(key);
450 }
451 LOG.info("Waiting on regionserver(s) to go down " + sb.toString());
452 try {
453 this.onlineServers.wait(1000);
454 } catch (InterruptedException e) {
455
456 }
457 }
458 }
459 }
460
461
462
463
464
465 public synchronized void expireServer(final HServerInfo hsi) {
466
467
468 String serverName = hsi.getServerName();
469 HServerInfo info = this.onlineServers.get(serverName);
470 if (info == null) {
471 LOG.warn("Received expiration of " + hsi.getServerName() +
472 " but server is not currently online");
473 return;
474 }
475 if (this.deadservers.contains(serverName)) {
476
477 LOG.warn("Received expiration of " + hsi.getServerName() +
478 " but server shutdown is already in progress");
479 return;
480 }
481
482
483
484 this.deadservers.add(serverName);
485 this.onlineServers.remove(serverName);
486 this.serverConnections.remove(serverName);
487
488
489 if (this.clusterShutdown) {
490 LOG.info("Cluster shutdown set; " + hsi.getServerName() +
491 " expired; onlineServers=" + this.onlineServers.size());
492 if (this.onlineServers.isEmpty()) {
493 master.stop("Cluster shutdown set; onlineServer=0");
494 }
495 return;
496 }
497 CatalogTracker ct = this.master.getCatalogTracker();
498
499 boolean carryingRoot;
500 try {
501 HServerAddress address = ct.getRootLocation();
502 carryingRoot = address != null &&
503 hsi.getServerAddress().equals(address);
504 } catch (InterruptedException e) {
505 Thread.currentThread().interrupt();
506 LOG.info("Interrupted");
507 return;
508 }
509
510
511
512
513
514 HServerAddress address = ct.getMetaLocation();
515 boolean carryingMeta =
516 address != null && hsi.getServerAddress().equals(address);
517 if (carryingRoot || carryingMeta) {
518 this.services.getExecutorService().submit(new MetaServerShutdownHandler(this.master,
519 this.services, this.deadservers, info, carryingRoot, carryingMeta));
520 } else {
521 this.services.getExecutorService().submit(new ServerShutdownHandler(this.master,
522 this.services, this.deadservers, info));
523 }
524 LOG.debug("Added=" + serverName +
525 " to dead servers, submitted shutdown handler to be executed, root=" +
526 carryingRoot + ", meta=" + carryingMeta);
527 }
528
529
530
531
532
533
534
535
536
537
538
539 public void sendRegionOpen(HServerInfo server, HRegionInfo region)
540 throws IOException {
541 HRegionInterface hri = getServerConnection(server);
542 if (hri == null) {
543 LOG.warn("Attempting to send OPEN RPC to server " + server.getServerName()
544 + " failed because no RPC connection found to this server");
545 return;
546 }
547 hri.openRegion(region);
548 }
549
550
551
552
553
554
555
556
557
558 public void sendRegionOpen(HServerInfo server, List<HRegionInfo> regions)
559 throws IOException {
560 HRegionInterface hri = getServerConnection(server);
561 if (hri == null) {
562 LOG.warn("Attempting to send OPEN RPC to server " + server.getServerName()
563 + " failed because no RPC connection found to this server");
564 return;
565 }
566 hri.openRegions(regions);
567 }
568
569
570
571
572
573
574
575
576
577
578
579 public boolean sendRegionClose(HServerInfo server, HRegionInfo region)
580 throws IOException {
581 if (server == null) throw new NullPointerException("Passed server is null");
582 HRegionInterface hri = getServerConnection(server);
583 if (hri == null) {
584 throw new IOException("Attempting to send CLOSE RPC to server " +
585 server.getServerName() + " for region " +
586 region.getRegionNameAsString() +
587 " failed because no RPC connection found to this server");
588 }
589 return hri.closeRegion(region);
590 }
591
592
593
594
595
596
597
598
599 private HRegionInterface getServerConnection(HServerInfo info)
600 throws IOException {
601 HConnection connection =
602 HConnectionManager.getConnection(this.master.getConfiguration());
603 HRegionInterface hri = serverConnections.get(info.getServerName());
604 if (hri == null) {
605 LOG.debug("New connection to " + info.getServerName());
606 hri = connection.getHRegionConnection(info.getServerAddress(), false);
607 this.serverConnections.put(info.getServerName(), hri);
608 }
609 return hri;
610 }
611
612
613
614
615
616
617 public int waitForRegionServers()
618 throws InterruptedException {
619 long interval = this.master.getConfiguration().
620 getLong("hbase.master.wait.on.regionservers.interval", 1500);
621 long timeout = this.master.getConfiguration().
622 getLong("hbase.master.wait.on.regionservers.timeout", 4500);
623 int minToStart = this.master.getConfiguration().
624 getInt("hbase.master.wait.on.regionservers.mintostart", 1);
625 int maxToStart = this.master.getConfiguration().
626 getInt("hbase.master.wait.on.regionservers.maxtostart", Integer.MAX_VALUE);
627
628
629 int count = 0;
630 long slept = 0;
631 for (int oldcount = countOfRegionServers(); !this.master.isStopped();) {
632 Thread.sleep(interval);
633 slept += interval;
634 count = countOfRegionServers();
635 if (count == oldcount && count >= minToStart && slept >= timeout) {
636 LOG.info("Finished waiting for regionserver count to settle; " +
637 "count=" + count + ", sleptFor=" + slept);
638 break;
639 }
640 if (count >= maxToStart) {
641 LOG.info("At least the max configured number of regionserver(s) have " +
642 "checked in: " + count);
643 break;
644 }
645 if (count == 0) {
646 LOG.info("Waiting on regionserver(s) to checkin");
647 } else {
648 LOG.info("Waiting on regionserver(s) count to settle; currently=" + count);
649 }
650 oldcount = count;
651 }
652
653
654
655
656
657 int regionCount = 0;
658 for (Map.Entry<String, HServerInfo> e: this.onlineServers.entrySet()) {
659 HServerLoad load = e.getValue().getLoad();
660 if (load != null) regionCount += load.getLoad();
661 }
662 LOG.info("Exiting wait on regionserver(s) to checkin; count=" + count +
663 ", stopped=" + this.master.isStopped() +
664 ", count of regions out on cluster=" + regionCount);
665 return regionCount;
666 }
667
668
669
670
671 public List<HServerInfo> getOnlineServersList() {
672
673 return new ArrayList<HServerInfo>(onlineServers.values());
674 }
675
676 public boolean isServerOnline(String serverName) {
677 return onlineServers.containsKey(serverName);
678 }
679
680 public void shutdownCluster() {
681 this.clusterShutdown = true;
682 this.master.stop("Cluster shutdown requested");
683 }
684
685 public boolean isClusterShutdown() {
686 return this.clusterShutdown;
687 }
688
689
690
691
692 public void stop() {
693
694 }
695 }