View Javadoc

1   /**
2    * Copyright 2010 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  package org.apache.hadoop.hbase.master;
21  
22  import java.io.IOException;
23  import java.util.ArrayList;
24  import java.util.Collections;
25  import java.util.HashMap;
26  import java.util.List;
27  import java.util.Map;
28  import java.util.Set;
29  import java.util.concurrent.ConcurrentHashMap;
30  
31  import org.apache.commons.logging.Log;
32  import org.apache.commons.logging.LogFactory;
33  import org.apache.hadoop.conf.Configuration;
34  import org.apache.hadoop.hbase.ClockOutOfSyncException;
35  import org.apache.hadoop.hbase.HMsg;
36  import org.apache.hadoop.hbase.HRegionInfo;
37  import org.apache.hadoop.hbase.HServerAddress;
38  import org.apache.hadoop.hbase.HServerInfo;
39  import org.apache.hadoop.hbase.HServerLoad;
40  import org.apache.hadoop.hbase.PleaseHoldException;
41  import org.apache.hadoop.hbase.Server;
42  import org.apache.hadoop.hbase.YouAreDeadException;
43  import org.apache.hadoop.hbase.catalog.CatalogTracker;
44  import org.apache.hadoop.hbase.client.HConnection;
45  import org.apache.hadoop.hbase.client.HConnectionManager;
46  import org.apache.hadoop.hbase.client.RetriesExhaustedException;
47  import org.apache.hadoop.hbase.ipc.HRegionInterface;
48  import org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler;
49  import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler;
50  import org.apache.hadoop.hbase.master.metrics.MasterMetrics;
51  import org.apache.hadoop.hbase.regionserver.Leases.LeaseStillHeldException;
52  
53  /**
54   * The ServerManager class manages info about region servers - HServerInfo,
55   * load numbers, dying servers, etc.
56   * <p>
57   * Maintains lists of online and dead servers.  Processes the startups,
58   * shutdowns, and deaths of region servers.
59   * <p>
60   * Servers are distinguished in two different ways.  A given server has a
61   * location, specified by hostname and port, and of which there can only be one
62   * online at any given time.  A server instance is specified by the location
63   * (hostname and port) as well as the startcode (timestamp from when the server
64   * was started).  This is used to differentiate a restarted instance of a given
65   * server from the original instance.
66   */
67  public class ServerManager {
68    private static final Log LOG = LogFactory.getLog(ServerManager.class);
69  
70    // Set if we are to shutdown the cluster.
71    private volatile boolean clusterShutdown = false;
72  
73    /** The map of known server names to server info */
74    private final Map<String, HServerInfo> onlineServers =
75      new ConcurrentHashMap<String, HServerInfo>();
76  
77    // TODO: This is strange to have two maps but HSI above is used on both sides
78    /**
79     * Map from full server-instance name to the RPC connection for this server.
80     */
81    private final Map<String, HRegionInterface> serverConnections =
82      new HashMap<String, HRegionInterface>();
83  
84    private final Server master;
85    private final MasterServices services;
86  
87    // Reporting to track master metrics.
88    private final MasterMetrics metrics;
89  
90    private final DeadServer deadservers;
91  
92    private final long maxSkew;
93  
94    /**
95     * Constructor.
96     * @param master
97     * @param services
98     * @param metrics
99     */
100   public ServerManager(final Server master, final MasterServices services,
101       MasterMetrics metrics) {
102     this.master = master;
103     this.services = services;
104     this.metrics = metrics;
105     Configuration c = master.getConfiguration();
106     maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
107     this.deadservers =
108       new DeadServer(c.getInt("hbase.master.maxdeadservers", 100));
109   }
110 
111   /**
112    * Let the server manager know a new regionserver has come online
113    * @param serverInfo
114    * @param serverCurrentTime The current time of the region server in ms
115    * @throws IOException
116    */
117   void regionServerStartup(final HServerInfo serverInfo, long serverCurrentTime)
118   throws IOException {
119     // Test for case where we get a region startup message from a regionserver
120     // that has been quickly restarted but whose znode expiration handler has
121     // not yet run, or from a server whose fail we are currently processing.
122     // Test its host+port combo is present in serverAddresstoServerInfo.  If it
123     // is, reject the server and trigger its expiration. The next time it comes
124     // in, it should have been removed from serverAddressToServerInfo and queued
125     // for processing by ProcessServerShutdown.
126     HServerInfo info = new HServerInfo(serverInfo);
127     checkIsDead(info.getServerName(), "STARTUP");
128     checkAlreadySameHostPort(info);
129     checkClockSkew(info, serverCurrentTime);
130     recordNewServer(info, false, null);
131   }
132 
133   /**
134    * Test to see if we have a server of same host and port already.
135    * @param serverInfo
136    * @throws PleaseHoldException
137    */
138   void checkAlreadySameHostPort(final HServerInfo serverInfo)
139   throws PleaseHoldException {
140     String hostAndPort = serverInfo.getServerAddress().toString();
141     HServerInfo existingServer =
142       haveServerWithSameHostAndPortAlready(serverInfo.getHostnamePort());
143     if (existingServer != null) {
144       String message = "Server start rejected; we already have " + hostAndPort +
145         " registered; existingServer=" + existingServer + ", newServer=" + serverInfo;
146       LOG.info(message);
147       if (existingServer.getStartCode() < serverInfo.getStartCode()) {
148         LOG.info("Triggering server recovery; existingServer " +
149           existingServer.getServerName() + " looks stale");
150         expireServer(existingServer);
151       }
152       throw new PleaseHoldException(message);
153     }
154   }
155 
156   private HServerInfo haveServerWithSameHostAndPortAlready(final String hostnamePort) {
157     synchronized (this.onlineServers) {
158       for (Map.Entry<String, HServerInfo> e: this.onlineServers.entrySet()) {
159         if (e.getValue().getHostnamePort().equals(hostnamePort)) {
160           return e.getValue();
161         }
162       }
163     }
164     return null;
165   }
166 
167   /**
168    * Checks if the clock skew between the server and the master. If the clock
169    * skew is too much it will throw an Exception.
170    * @throws ClockOutOfSyncException
171    */
172   private void checkClockSkew(final HServerInfo serverInfo,
173       final long serverCurrentTime)
174   throws ClockOutOfSyncException {
175     long skew = System.currentTimeMillis() - serverCurrentTime;
176     if (skew > maxSkew) {
177       String message = "Server " + serverInfo.getServerName() + " has been " +
178         "rejected; Reported time is too far out of sync with master.  " +
179         "Time difference of " + skew + "ms > max allowed of " + maxSkew + "ms";
180       LOG.warn(message);
181       throw new ClockOutOfSyncException(message);
182     }
183   }
184 
185   /**
186    * If this server is on the dead list, reject it with a YouAreDeadException.
187    * If it was dead but came back with a new start code, remove the old entry
188    * from the dead list.
189    * @param serverName Server name formatted as host_port_startcode.
190    * @param what START or REPORT
191    * @throws YouAreDeadException
192    */
193   private void checkIsDead(final String serverName, final String what)
194       throws YouAreDeadException {
195     if (this.deadservers.isDeadServer(serverName)) {
196       // host name, port and start code all match with existing one of the
197       // dead servers. So, this server must be dead.
198       String message = "Server " + what + " rejected; currently processing " +
199           serverName + " as dead server";
200       LOG.debug(message);
201       throw new YouAreDeadException(message);
202     }
203 
204     if (this.deadservers.cleanPreviousInstance(serverName)) {
205       // This server has now become alive after we marked it as dead.
206       // We removed it's previous entry from the dead list to reflect it.
207       LOG.debug("Server " + serverName + " came back up, removed it from the" +
208           " dead servers list");
209     }
210   }
211 
212   /**
213    * Adds the HSI to the RS list
214    * @param info The region server informations
215    * @param useInfoLoad True if the load from the info should be used; e.g.
216    * under a master failover
217    * @param hri Region interface.  Can be null.
218    */
219   void recordNewServer(HServerInfo info, boolean useInfoLoad,
220       HRegionInterface hri) {
221     HServerLoad load = useInfoLoad? info.getLoad(): new HServerLoad();
222     String serverName = info.getServerName();
223     LOG.info("Registering server=" + serverName + ", regionCount=" +
224       load.getLoad() + ", userLoad=" + useInfoLoad);
225     info.setLoad(load);
226     // TODO: Why did we update the RS location ourself?  Shouldn't RS do this?
227     // masterStatus.getZooKeeper().updateRSLocationGetWatch(info, watcher);
228     // -- If I understand the question, the RS does not update the location
229     // because could be disagreement over locations because of DNS issues; only
230     // master does DNS now -- St.Ack 20100929.
231     this.onlineServers.put(serverName, info);
232     if (hri == null) {
233       serverConnections.remove(serverName);
234     } else {
235       serverConnections.put(serverName, hri);
236     }
237   }
238 
239   /**
240    * Called to process the messages sent from the region server to the master
241    * along with the heart beat.
242    *
243    * @param serverInfo
244    * @param msgs
245    * @param mostLoadedRegions Array of regions the region server is submitting
246    * as candidates to be rebalanced, should it be overloaded
247    * @return messages from master to region server indicating what region
248    * server should do.
249    *
250    * @throws IOException
251    */
252   HMsg [] regionServerReport(final HServerInfo serverInfo,
253     final HMsg [] msgs, final HRegionInfo[] mostLoadedRegions)
254   throws IOException {
255     // Be careful. This method does returns in the middle.
256     HServerInfo info = new HServerInfo(serverInfo);
257 
258     // Check if dead.  If it is, it'll get a 'You Are Dead!' exception.
259     checkIsDead(info.getServerName(), "REPORT");
260 
261     // If we don't know this server, tell it shutdown.
262     HServerInfo storedInfo = this.onlineServers.get(info.getServerName());
263     if (storedInfo == null) {
264       // Maybe we already have this host+port combo and its just different
265       // start code?
266       checkAlreadySameHostPort(info);
267       // Just let the server in. Presume master joining a running cluster.
268       // recordNewServer is what happens at the end of reportServerStartup.
269       // The only thing we are skipping is passing back to the regionserver
270       // the HServerInfo to use. Here we presume a master has already done
271       // that so we'll press on with whatever it gave us for HSI.
272       recordNewServer(info, true, null);
273       // If msgs, put off their processing but this is not enough because
274       // its possible that the next time the server reports in, we'll still
275       // not be up and serving. For example, if a split, we'll need the
276       // regions and servers setup in the master before the below
277       // handleSplitReport will work. TODO: FIx!!
278       if (msgs.length > 0)
279         throw new PleaseHoldException("FIX! Putting off " +
280           "message processing because not yet ready but possible we won't be " +
281           "ready next on next report");
282 
283       storedInfo = this.onlineServers.get(info.getServerName());
284     }
285 
286     // Check startcodes
287     if (raceThatShouldNotHappenAnymore(storedInfo, info)) {
288       return HMsg.STOP_REGIONSERVER_ARRAY;
289     }
290 
291     for (HMsg msg: msgs) {
292       LOG.info("Received " + msg + " from " + serverInfo.getServerName());
293       switch (msg.getType()) {
294       case REGION_SPLIT:
295         this.services.getAssignmentManager().handleSplitReport(serverInfo,
296             msg.getRegionInfo(), msg.getDaughterA(), msg.getDaughterB());
297         break;
298 
299         default:
300           LOG.error("Unhandled msg type " + msg);
301       }
302     }
303 
304     HMsg [] reply = null;
305     int numservers = countOfRegionServers();
306     if (this.clusterShutdown) {
307       if (numservers <= 2) {
308         // Shutdown needs to be staggered; the meta regions need to close last
309         // in case they need to be updated during the close melee.  If <= 2
310         // servers left, then these are the two that were carrying root and meta
311         // most likely (TODO: This presumes unsplittable meta -- FIX). Tell
312         // these servers can shutdown now too.
313         reply = HMsg.STOP_REGIONSERVER_ARRAY;
314       }
315     }
316     return processRegionServerAllsWell(info, mostLoadedRegions, reply);
317   }
318 
319   private boolean raceThatShouldNotHappenAnymore(final HServerInfo storedInfo,
320       final HServerInfo reportedInfo) {
321     if (storedInfo.getStartCode() != reportedInfo.getStartCode()) {
322       // TODO: I don't think this possible any more.  We check startcodes when
323       // server comes in on regionServerStartup -- St.Ack
324       // This state is reachable if:
325       // 1) RegionServer A started
326       // 2) RegionServer B started on the same machine, then clobbered A in regionServerStartup.
327       // 3) RegionServer A returns, expecting to work as usual.
328       // The answer is to ask A to shut down for good.
329       LOG.warn("Race condition detected: " + reportedInfo.getServerName());
330       synchronized (this.onlineServers) {
331         removeServerInfo(reportedInfo.getServerName());
332         notifyOnlineServers();
333       }
334       return true;
335     }
336     return false;
337   }
338 
339   /**
340    *  RegionServer is checking in, no exceptional circumstances
341    * @param serverInfo
342    * @param mostLoadedRegions
343    * @param msgs
344    * @return
345    * @throws IOException
346    */
347   private HMsg[] processRegionServerAllsWell(HServerInfo serverInfo,
348       final HRegionInfo[] mostLoadedRegions, HMsg[] msgs)
349   throws IOException {
350     // Refresh the info object and the load information
351     this.onlineServers.put(serverInfo.getServerName(), serverInfo);
352     HServerLoad load = serverInfo.getLoad();
353     if (load != null && this.metrics != null) {
354       this.metrics.incrementRequests(load.getNumberOfRequests());
355     }
356     // No more piggyback messages on heartbeats for other stuff
357     return msgs;
358   }
359 
360   /**
361    * @param serverName
362    * @return True if we removed server from the list.
363    */
364   private boolean removeServerInfo(final String serverName) {
365     HServerInfo info = this.onlineServers.remove(serverName);
366     if (info != null) {
367       return true;
368     }
369     return false;
370   }
371 
372   /**
373    * Compute the average load across all region servers.
374    * Currently, this uses a very naive computation - just uses the number of
375    * regions being served, ignoring stats about number of requests.
376    * @return the average load
377    */
378   public double getAverageLoad() {
379     int totalLoad = 0;
380     int numServers = 0;
381     double averageLoad = 0.0;
382     for (HServerInfo hsi : onlineServers.values()) {
383         numServers++;
384         totalLoad += hsi.getLoad().getNumberOfRegions();
385     }
386     averageLoad = (double)totalLoad / (double)numServers;
387     return averageLoad;
388   }
389 
390   /** @return the count of active regionservers */
391   int countOfRegionServers() {
392     // Presumes onlineServers is a concurrent map
393     return this.onlineServers.size();
394   }
395 
396   /**
397    * @param name server name
398    * @return HServerInfo for the given server address
399    */
400   public HServerInfo getServerInfo(String name) {
401     return this.onlineServers.get(name);
402   }
403 
404   /**
405    * @return Read-only map of servers to serverinfo
406    */
407   public Map<String, HServerInfo> getOnlineServers() {
408     // Presumption is that iterating the returned Map is OK.
409     synchronized (this.onlineServers) {
410       return Collections.unmodifiableMap(this.onlineServers);
411     }
412   }
413 
414   public Set<String> getDeadServers() {
415     return this.deadservers.clone();
416   }
417 
418   /**
419    * Checks if any dead servers are currently in progress.
420    * @return true if any RS are being processed as dead, false if not
421    */
422   public boolean areDeadServersInProgress() {
423     return this.deadservers.areDeadServersInProgress();
424   }
425 
426   /**
427    * @param hsa
428    * @return The HServerInfo whose HServerAddress is <code>hsa</code> or null
429    * if nothing found.
430    */
431   public HServerInfo getHServerInfo(final HServerAddress hsa) {
432     synchronized(this.onlineServers) {
433       // TODO: This is primitive.  Do a better search.
434       for (Map.Entry<String, HServerInfo> e: this.onlineServers.entrySet()) {
435         if (e.getValue().getServerAddress().equals(hsa)) {
436           return e.getValue();
437         }
438       }
439     }
440     return null;
441   }
442 
443   private void notifyOnlineServers() {
444     synchronized (this.onlineServers) {
445       this.onlineServers.notifyAll();
446     }
447   }
448 
449   /*
450    * Wait on regionservers to report in
451    * with {@link #regionServerReport(HServerInfo, HMsg[])} so they get notice
452    * the master is going down.  Waits until all region servers come back with
453    * a MSG_REGIONSERVER_STOP.
454    */
455   void letRegionServersShutdown() {
456     synchronized (onlineServers) {
457       while (onlineServers.size() > 0) {
458         StringBuilder sb = new StringBuilder();
459         for (String key: this.onlineServers.keySet()) {
460           if (sb.length() > 0) {
461             sb.append(", ");
462           }
463           sb.append(key);
464         }
465         LOG.info("Waiting on regionserver(s) to go down " + sb.toString());
466         try {
467           this.onlineServers.wait(1000);
468         } catch (InterruptedException e) {
469           // continue
470         }
471       }
472     }
473   }
474 
475   /*
476    * Expire the passed server.  Add it to list of deadservers and queue a
477    * shutdown processing.
478    */
479   public synchronized void expireServer(final HServerInfo hsi) {
480     // First check a server to expire.  ServerName is of the form:
481     // <hostname> , <port> , <startcode>
482     String serverName = hsi.getServerName();
483     HServerInfo info = this.onlineServers.get(serverName);
484     if (info == null) {
485       LOG.warn("Received expiration of " + hsi.getServerName() +
486         " but server is not currently online");
487       return;
488     }
489     if (this.deadservers.contains(serverName)) {
490       // TODO: Can this happen?  It shouldn't be online in this case?
491       LOG.warn("Received expiration of " + hsi.getServerName() +
492           " but server shutdown is already in progress");
493       return;
494     }
495     // Remove the server from the known servers lists and update load info BUT
496     // add to deadservers first; do this so it'll show in dead servers list if
497     // not in online servers list.
498     this.deadservers.add(serverName);
499     this.onlineServers.remove(serverName);
500     this.serverConnections.remove(serverName);
501     // If cluster is going down, yes, servers are going to be expiring; don't
502     // process as a dead server
503     if (this.clusterShutdown) {
504       LOG.info("Cluster shutdown set; " + hsi.getServerName() +
505         " expired; onlineServers=" + this.onlineServers.size());
506       if (this.onlineServers.isEmpty()) {
507         master.stop("Cluster shutdown set; onlineServer=0");
508       }
509       return;
510     }
511     CatalogTracker ct = this.master.getCatalogTracker();
512     // Was this server carrying root?
513     boolean carryingRoot;
514     try {
515       HServerAddress address = ct.getRootLocation();
516       carryingRoot = address != null &&
517         hsi.getServerAddress().equals(address);
518     } catch (InterruptedException e) {
519       Thread.currentThread().interrupt();
520       LOG.info("Interrupted");
521       return;
522     }
523     // Was this server carrying meta?  Can't ask CatalogTracker because it
524     // may have reset the meta location as null already (it may have already
525     // run into fact that meta is dead).  I can ask assignment manager. It
526     // has an inmemory list of who has what.  This list will be cleared as we
527     // process the dead server but should be  find asking it now.
528     HServerAddress address = ct.getMetaLocation();
529     boolean carryingMeta =
530       address != null && hsi.getServerAddress().equals(address);
531     if (carryingRoot || carryingMeta) {
532       this.services.getExecutorService().submit(new MetaServerShutdownHandler(this.master,
533         this.services, this.deadservers, info, carryingRoot, carryingMeta));
534     } else {
535       this.services.getExecutorService().submit(new ServerShutdownHandler(this.master,
536         this.services, this.deadservers, info));
537     }
538     LOG.debug("Added=" + serverName +
539       " to dead servers, submitted shutdown handler to be executed, root=" +
540         carryingRoot + ", meta=" + carryingMeta);
541   }
542 
543   // RPC methods to region servers
544 
545   /**
546    * Sends an OPEN RPC to the specified server to open the specified region.
547    * <p>
548    * Open should not fail but can if server just crashed.
549    * <p>
550    * @param server server to open a region
551    * @param region region to open
552    */
553   public void sendRegionOpen(HServerInfo server, HRegionInfo region)
554   throws IOException {
555     HRegionInterface hri = getServerConnection(server);
556     if (hri == null) {
557       LOG.warn("Attempting to send OPEN RPC to server " + server.getServerName()
558           + " failed because no RPC connection found to this server");
559       return;
560     }
561     hri.openRegion(region);
562   }
563 
564   /**
565    * Sends an OPEN RPC to the specified server to open the specified region.
566    * <p>
567    * Open should not fail but can if server just crashed.
568    * <p>
569    * @param server server to open a region
570    * @param regions regions to open
571    */
572   public void sendRegionOpen(HServerInfo server, List<HRegionInfo> regions)
573   throws IOException {
574     HRegionInterface hri = getServerConnection(server);
575     if (hri == null) {
576       LOG.warn("Attempting to send OPEN RPC to server " + server.getServerName()
577           + " failed because no RPC connection found to this server");
578       return;
579     }
580     hri.openRegions(regions);
581   }
582 
583   /**
584    * Sends an CLOSE RPC to the specified server to close the specified region.
585    * <p>
586    * A region server could reject the close request because it either does not
587    * have the specified region or the region is being split.
588    * @param server server to open a region
589    * @param region region to open
590    * @return true if server acknowledged close, false if not
591    * @throws IOException
592    */
593   public boolean sendRegionClose(HServerInfo server, HRegionInfo region)
594   throws IOException {
595     if (server == null) throw new NullPointerException("Passed server is null");
596     HRegionInterface hri = getServerConnection(server);
597     if (hri == null) {
598       throw new IOException("Attempting to send CLOSE RPC to server " +
599         server.getServerName() + " for region " +
600         region.getRegionNameAsString() +
601         " failed because no RPC connection found to this server");
602     }
603     return hri.closeRegion(region);
604   }
605 
606   /**
607    * @param info
608    * @return
609    * @throws IOException
610    * @throws RetriesExhaustedException wrapping a ConnectException if failed
611    * putting up proxy.
612    */
613   private HRegionInterface getServerConnection(HServerInfo info)
614   throws IOException {
615     HConnection connection =
616       HConnectionManager.getConnection(this.master.getConfiguration());
617     HRegionInterface hri = serverConnections.get(info.getServerName());
618     if (hri == null) {
619       LOG.debug("New connection to " + info.getServerName());
620       hri = connection.getHRegionConnection(info.getServerAddress(), false);
621       this.serverConnections.put(info.getServerName(), hri);
622     }
623     return hri;
624   }
625 
626   /**
627    * Waits for the regionservers to report in.
628    * @return Count of regions out on cluster
629    * @throws InterruptedException
630    */
631   public int waitForRegionServers()
632   throws InterruptedException {
633     long interval = this.master.getConfiguration().
634       getLong("hbase.master.wait.on.regionservers.interval", 1500);
635     long timeout = this.master.getConfiguration().
636       getLong("hbase.master.wait.on.regionservers.timeout", 4500);
637     int minToStart = this.master.getConfiguration().
638       getInt("hbase.master.wait.on.regionservers.mintostart", 1);
639     int maxToStart = this.master.getConfiguration().
640       getInt("hbase.master.wait.on.regionservers.maxtostart", Integer.MAX_VALUE);
641     // So, number of regionservers > 0 and its been n since last check in, break,
642     // else just stall here
643     int count = 0;
644     long slept = 0;
645     for (int oldcount = countOfRegionServers(); !this.master.isStopped();) {
646       Thread.sleep(interval);
647       slept += interval;
648       count = countOfRegionServers();
649       if (count == oldcount && count >= minToStart && slept >= timeout) {
650         LOG.info("Finished waiting for regionserver count to settle; " +
651             "count=" + count + ", sleptFor=" + slept);
652         break;
653       }
654       if (count >= maxToStart) {
655         LOG.info("At least the max configured number of regionserver(s) have " +
656             "checked in: " + count);
657         break;
658       }
659       if (count == 0) {
660         LOG.info("Waiting on regionserver(s) to checkin");
661       } else {
662         LOG.info("Waiting on regionserver(s) count to settle; currently=" + count);
663       }
664       oldcount = count;
665     }
666     // Count how many regions deployed out on cluster.  If fresh start, it'll
667     // be none but if not a fresh start, we'll have registered servers when
668     // they came in on the {@link #regionServerReport(HServerInfo)} as opposed to
669     // {@link #regionServerStartup(HServerInfo)} and it'll be carrying an
670     // actual server load.
671     int regionCount = 0;
672     for (Map.Entry<String, HServerInfo> e: this.onlineServers.entrySet()) {
673       HServerLoad load = e.getValue().getLoad();
674       if (load != null) regionCount += load.getLoad();
675     }
676     LOG.info("Exiting wait on regionserver(s) to checkin; count=" + count +
677       ", stopped=" + this.master.isStopped() +
678       ", count of regions out on cluster=" + regionCount);
679     return regionCount;
680   }
681 
682   /**
683    * @return A copy of the internal list of online servers.
684    */
685   public List<HServerInfo> getOnlineServersList() {
686     // TODO: optimize the load balancer call so we don't need to make a new list
687     return new ArrayList<HServerInfo>(onlineServers.values());
688   }
689 
690   public boolean isServerOnline(String serverName) {
691     return onlineServers.containsKey(serverName);
692   }
693 
694   public void shutdownCluster() {
695     this.clusterShutdown = true;
696     this.master.stop("Cluster shutdown requested");
697   }
698 
699   public boolean isClusterShutdown() {
700     return this.clusterShutdown;
701   }
702 
703   /**
704    * Stop the ServerManager.  Currently does nothing.
705    */
706   public void stop() {
707 
708   }
709 }