/**
 * Copyright 2010 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ClockOutOfSyncException;
import org.apache.hadoop.hbase.HMsg;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HServerAddress;
import org.apache.hadoop.hbase.HServerInfo;
import org.apache.hadoop.hbase.HServerLoad;
import org.apache.hadoop.hbase.PleaseHoldException;
import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.YouAreDeadException;
import org.apache.hadoop.hbase.catalog.CatalogTracker;
import org.apache.hadoop.hbase.client.HConnection;
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.client.RetriesExhaustedException;
import org.apache.hadoop.hbase.ipc.HRegionInterface;
import org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler;
import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler;
import org.apache.hadoop.hbase.master.metrics.MasterMetrics;
import org.apache.hadoop.hbase.regionserver.Leases.LeaseStillHeldException;

/**
 * The ServerManager class manages info about region servers - HServerInfo,
 * load numbers, dying servers, etc.
 * <p>
 * Maintains lists of online and dead servers.  Processes the startups,
 * shutdowns, and deaths of region servers.
 * <p>
 * Servers are distinguished in two different ways.  A given server has a
 * location, specified by hostname and port, of which there can be only one
 * online at any given time.  A server instance is specified by the location
 * (hostname and port) as well as the startcode (timestamp from when the server
 * was started).  This is used to differentiate a restarted instance of a given
 * server from the original instance.
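 * <p>
 * For example (illustrative values only, not taken from this source): a
 * region server at hostname <code>rs1.example.com</code> on port
 * <code>60020</code> that started at timestamp <code>1288043888000</code> has
 * the instance name <code>rs1.example.com,60020,1288043888000</code>; a
 * restart keeps the same location but yields a new startcode, and therefore
 * a new instance name.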
 */
public class ServerManager {
  private static final Log LOG = LogFactory.getLog(ServerManager.class);

  // Set if we are to shutdown the cluster.
  private volatile boolean clusterShutdown = false;

  /** The map of known server names to server info */
  private final Map<String, HServerInfo> onlineServers =
    new ConcurrentHashMap<String, HServerInfo>();

  // TODO: This is strange to have two maps but HSI above is used on both sides
  /**
   * Map from full server-instance name to the RPC connection for this server.
   */
  private final Map<String, HRegionInterface> serverConnections =
    new HashMap<String, HRegionInterface>();

  private final Server master;
  private final MasterServices services;

  // Reporting to track master metrics.
  private final MasterMetrics metrics;

  private final DeadServer deadservers;

  private final long maxSkew;

  /**
   * Constructor.
   * @param master
   * @param services
   * @param metrics
   */
  public ServerManager(final Server master, final MasterServices services,
      MasterMetrics metrics) {
    this.master = master;
    this.services = services;
    this.metrics = metrics;
    Configuration c = master.getConfiguration();
    maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
    this.deadservers =
      new DeadServer(c.getInt("hbase.master.maxdeadservers", 100));
  }

  /**
   * Let the server manager know a new regionserver has come online.
   * @param serverInfo
   * @param serverCurrentTime The current time of the region server in ms
   * @throws IOException
   */
  void regionServerStartup(final HServerInfo serverInfo, long serverCurrentTime)
  throws IOException {
    // Test for the case where we get a region startup message from a
    // regionserver that has been quickly restarted but whose znode expiration
    // handler has not yet run, or from a server whose failure we are currently
    // processing.  Test whether its host+port combo is already present in
    // onlineServers.  If it is, reject the server and trigger its expiration.
    // The next time it comes in, it should have been removed from
    // onlineServers and queued for processing by ProcessServerShutdown.
    HServerInfo info = new HServerInfo(serverInfo);
    checkIsDead(info.getServerName(), "STARTUP");
    checkAlreadySameHostPort(info);
    checkClockSkew(info, serverCurrentTime);
    recordNewServer(info, false, null);
  }

  /**
   * Test to see if we have a server of the same host and port already.
   * @param serverInfo
   * @throws PleaseHoldException
   */
  void checkAlreadySameHostPort(final HServerInfo serverInfo)
  throws PleaseHoldException {
    String hostAndPort = serverInfo.getServerAddress().toString();
    HServerInfo existingServer =
      haveServerWithSameHostAndPortAlready(serverInfo.getHostnamePort());
    if (existingServer != null) {
      String message = "Server start rejected; we already have " + hostAndPort +
        " registered; existingServer=" + existingServer + ", newServer=" + serverInfo;
      LOG.info(message);
      if (existingServer.getStartCode() < serverInfo.getStartCode()) {
        LOG.info("Triggering server recovery; existingServer " +
          existingServer.getServerName() + " looks stale");
        expireServer(existingServer);
      }
      throw new PleaseHoldException(message);
    }
  }
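
  // Illustrative walk-through (hypothetical names, not from this source): if
  // rs1.example.com:60020 is already registered with startcode 100 and the
  // same host/port reports for startup with startcode 200, the startcode-100
  // instance is expired as stale and the newcomer gets a PleaseHoldException
  // telling it to retry once the old instance has been cleaned up.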

  private HServerInfo haveServerWithSameHostAndPortAlready(final String hostnamePort) {
    synchronized (this.onlineServers) {
      for (Map.Entry<String, HServerInfo> e: this.onlineServers.entrySet()) {
        if (e.getValue().getHostnamePort().equals(hostnamePort)) {
          return e.getValue();
        }
      }
    }
    return null;
  }

  /**
   * Checks the clock skew between the server and the master.  If the skew is
   * too large, this will throw an exception.
   * @throws ClockOutOfSyncException
   */
  private void checkClockSkew(final HServerInfo serverInfo,
      final long serverCurrentTime)
  throws ClockOutOfSyncException {
    long skew = System.currentTimeMillis() - serverCurrentTime;
    if (skew > maxSkew) {
      String message = "Server " + serverInfo.getServerName() + " has been " +
        "rejected; Reported time is too far out of sync with master.  " +
        "Time difference of " + skew + "ms > max allowed of " + maxSkew + "ms";
      LOG.warn(message);
      throw new ClockOutOfSyncException(message);
    }
  }
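
  // Illustrative arithmetic (not from this source): with the default
  // "hbase.master.maxclockskew" of 30000 ms, a server whose reported time
  // trails the master's clock by 30001 ms is rejected, while a 30000 ms lag
  // still passes.  Skew is computed as master time minus server time, so only
  // a server clock running behind the master (plus report latency) can trip
  // this check; a fast server clock yields a negative skew and is accepted.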

  /**
   * If this server is on the dead list, reject it with a YouAreDeadException.
   * If it was dead but came back with a new start code, remove the old entry
   * from the dead list.
   * @param serverName Server name formatted as host_port_startcode.
   * @param what START or REPORT
   * @throws YouAreDeadException
   */
  private void checkIsDead(final String serverName, final String what)
      throws YouAreDeadException {
    if (this.deadservers.isDeadServer(serverName)) {
      // Host name, port and start code all match an existing entry in the
      // dead servers list.  So, this server must be dead.
      String message = "Server " + what + " rejected; currently processing " +
          serverName + " as dead server";
      LOG.debug(message);
      throw new YouAreDeadException(message);
    }

    if (this.deadservers.cleanPreviousInstance(serverName)) {
      // This server has now become alive after we marked it as dead.
      // We remove its previous entry from the dead list to reflect that.
      LOG.debug("Server " + serverName + " came back up, removed it from the" +
          " dead servers list");
    }
  }
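
  // Illustrative walk-through (hypothetical names, not from this source): if
  // rs1.example.com,60020,100 is on the dead list and the same host/port
  // checks in again as rs1.example.com,60020,200, the exact-match test fails,
  // cleanPreviousInstance() drops the stale startcode-100 entry, and the new
  // instance is allowed through; a report from the startcode-100 instance
  // itself would instead get a YouAreDeadException.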

  /**
   * Adds the HSI to the RS list.
   * @param info The region server information
   * @param useInfoLoad True if the load from the info should be used; e.g.
   * under a master failover
   * @param hri Region interface.  Can be null.
   */
  void recordNewServer(HServerInfo info, boolean useInfoLoad,
      HRegionInterface hri) {
    HServerLoad load = useInfoLoad? info.getLoad(): new HServerLoad();
    String serverName = info.getServerName();
    LOG.info("Registering server=" + serverName + ", regionCount=" +
      load.getLoad() + ", useInfoLoad=" + useInfoLoad);
    info.setLoad(load);
    // TODO: Why did we update the RS location ourself?  Shouldn't RS do this?
    // masterStatus.getZooKeeper().updateRSLocationGetWatch(info, watcher);
    // -- If I understand the question, the RS does not update the location
    // because there could be disagreement over locations due to DNS issues;
    // only the master does DNS now -- St.Ack 20100929.
    this.onlineServers.put(serverName, info);
    if (hri == null) {
      serverConnections.remove(serverName);
    } else {
      serverConnections.put(serverName, hri);
    }
  }
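
  // Cross-reference (descriptive note, not from this source):
  // regionServerStartup above calls this with useInfoLoad=false, so a freshly
  // started server is registered with an empty HServerLoad; regionServerReport
  // below calls it with useInfoLoad=true, so a server joining an
  // already-running master (e.g. after a master failover) keeps the load it
  // reported.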

  /**
   * Called to process the messages sent from the region server to the master
   * along with the heart beat.
   *
   * @param serverInfo
   * @param msgs
   * @param mostLoadedRegions Array of regions the region server is submitting
   * as candidates to be rebalanced, should it be overloaded
   * @return messages from master to region server indicating what region
   * server should do.
   *
   * @throws IOException
   */
  HMsg [] regionServerReport(final HServerInfo serverInfo,
    final HMsg [] msgs, final HRegionInfo[] mostLoadedRegions)
  throws IOException {
    // Be careful.  This method returns in the middle.
    HServerInfo info = new HServerInfo(serverInfo);

    // Check if dead.  If it is, it'll get a 'You Are Dead!' exception.
    checkIsDead(info.getServerName(), "REPORT");

    // Do we know this server?  If not, handle it below.
    HServerInfo storedInfo = this.onlineServers.get(info.getServerName());
    if (storedInfo == null) {
      // Maybe we already have this host+port combo and it's just a different
      // start code?
      checkAlreadySameHostPort(info);
      // Just let the server in. Presume master joining a running cluster.
      // recordNewServer is what happens at the end of regionServerStartup.
      // The only thing we are skipping is passing back to the regionserver
      // the HServerInfo to use. Here we presume a master has already done
      // that so we'll press on with whatever it gave us for HSI.
      recordNewServer(info, true, null);
      // If msgs, put off their processing but this is not enough because
      // it's possible that the next time the server reports in, we'll still
      // not be up and serving. For example, if a split, we'll need the
      // regions and servers setup in the master before the below
      // handleSplitReport will work. TODO: Fix!!
      if (msgs.length > 0)
        throw new PleaseHoldException("FIX! Putting off " +
          "message processing because not yet ready but possible we won't be " +
          "ready on next report either");
    }

    // Check startcodes (storedInfo is null only if we just registered the
    // server above, in which case there is no race to check).
    if (storedInfo != null && raceThatShouldNotHappenAnymore(storedInfo, info)) {
      return HMsg.STOP_REGIONSERVER_ARRAY;
    }

    for (HMsg msg: msgs) {
      LOG.info("Received " + msg + " from " + serverInfo.getServerName());
      switch (msg.getType()) {
      case REGION_SPLIT:
        this.services.getAssignmentManager().handleSplitReport(serverInfo,
            msg.getRegionInfo(), msg.getDaughterA(), msg.getDaughterB());
        break;

      default:
        LOG.error("Unhandled msg type " + msg);
      }
    }

    HMsg [] reply = null;
    int numservers = countOfRegionServers();
    if (this.clusterShutdown) {
      if (numservers <= 2) {
        // Shutdown needs to be staggered; the meta regions need to close last
        // in case they need to be updated during the close melee.  If <= 2
        // servers left, then these are the two that were carrying root and meta
        // most likely (TODO: This presumes unsplittable meta -- FIX).  Tell
        // these servers they can shut down now too.
        reply = HMsg.STOP_REGIONSERVER_ARRAY;
      }
    }
    return processRegionServerAllsWell(info, mostLoadedRegions, reply);
  }
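
  // Illustrative flow (descriptive note, not from this source): a known, live
  // region server reporting with no piggybacked messages passes checkIsDead,
  // is found in onlineServers, survives the startcode check, and simply has
  // its latest HServerInfo and load recorded via processRegionServerAllsWell;
  // the reply is null unless the cluster is shutting down with two or fewer
  // servers left.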

  private boolean raceThatShouldNotHappenAnymore(final HServerInfo storedInfo,
      final HServerInfo reportedInfo) {
    if (storedInfo.getStartCode() != reportedInfo.getStartCode()) {
      // TODO: I don't think this is possible any more.  We check startcodes
      // when the server comes in on regionServerStartup -- St.Ack
      // This state is reachable if:
      // 1) RegionServer A started
      // 2) RegionServer B started on the same machine, then clobbered A in regionServerStartup.
      // 3) RegionServer A returns, expecting to work as usual.
      // The answer is to ask A to shut down for good.
      LOG.warn("Race condition detected: " + reportedInfo.getServerName());
      synchronized (this.onlineServers) {
        removeServerInfo(reportedInfo.getServerName());
        notifyOnlineServers();
      }
      return true;
    }
    return false;
  }

  /**
   * RegionServer is checking in, no exceptional circumstances.
   * @param serverInfo
   * @param mostLoadedRegions
   * @param msgs
   * @return messages to pass back to the region server
   * @throws IOException
   */
  private HMsg[] processRegionServerAllsWell(HServerInfo serverInfo,
      final HRegionInfo[] mostLoadedRegions, HMsg[] msgs)
  throws IOException {
    // Refresh the info object and the load information
    this.onlineServers.put(serverInfo.getServerName(), serverInfo);
    HServerLoad load = serverInfo.getLoad();
    if (load != null && this.metrics != null) {
      this.metrics.incrementRequests(load.getNumberOfRequests());
    }
    // No more piggyback messages on heartbeats for other stuff
    return msgs;
  }

  /**
   * @param serverName
   * @return True if we removed server from the list.
   */
  private boolean removeServerInfo(final String serverName) {
    HServerInfo info = this.onlineServers.remove(serverName);
    return info != null;
  }

  /**
   * Compute the average load across all region servers.
   * Currently, this uses a very naive computation - just uses the number of
   * regions being served, ignoring stats about number of requests.
   * @return the average load
   */
  public double getAverageLoad() {
    int totalLoad = 0;
    int numServers = 0;
    double averageLoad = 0.0;
    for (HServerInfo hsi : onlineServers.values()) {
      numServers++;
      totalLoad += hsi.getLoad().getNumberOfRegions();
    }
    // Guard against division by zero when no servers are online.
    averageLoad = numServers == 0? 0.0: (double)totalLoad / (double)numServers;
    return averageLoad;
  }
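
  // Illustrative arithmetic (not from this source): with three online servers
  // carrying 10, 20 and 30 regions, totalLoad = 60 and numServers = 3, so
  // getAverageLoad() returns 20.0.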

  /** @return the count of active regionservers */
  int countOfRegionServers() {
    // Presumes onlineServers is a concurrent map
    return this.onlineServers.size();
  }

  /**
   * @param name server name
   * @return HServerInfo for the given server name
   */
  public HServerInfo getServerInfo(String name) {
    return this.onlineServers.get(name);
  }

  /**
   * @return Read-only map of servers to serverinfo
   */
  public Map<String, HServerInfo> getOnlineServers() {
    // Presumption is that iterating the returned Map is OK.
    synchronized (this.onlineServers) {
      return Collections.unmodifiableMap(this.onlineServers);
    }
  }

  public Set<String> getDeadServers() {
    return this.deadservers.clone();
  }

  /**
   * Checks if any dead servers are currently being processed.
   * @return true if any RS are being processed as dead, false if not
   */
  public boolean areDeadServersInProgress() {
    return this.deadservers.areDeadServersInProgress();
  }

  /**
   * @param hsa
   * @return The HServerInfo whose HServerAddress is <code>hsa</code> or null
   * if nothing found.
   */
  public HServerInfo getHServerInfo(final HServerAddress hsa) {
    synchronized(this.onlineServers) {
      // TODO: This is primitive.  Do a better search.
      for (Map.Entry<String, HServerInfo> e: this.onlineServers.entrySet()) {
        if (e.getValue().getServerAddress().equals(hsa)) {
          return e.getValue();
        }
      }
    }
    return null;
  }

  private void notifyOnlineServers() {
    synchronized (this.onlineServers) {
      this.onlineServers.notifyAll();
    }
  }

  /*
   * Wait on regionservers to report in
   * with {@link #regionServerReport(HServerInfo, HMsg[])} so they get notice
   * the master is going down.  Waits until all region servers come back with
   * a MSG_REGIONSERVER_STOP.
   */
  void letRegionServersShutdown() {
    synchronized (onlineServers) {
      while (onlineServers.size() > 0) {
        StringBuilder sb = new StringBuilder();
        for (String key: this.onlineServers.keySet()) {
          if (sb.length() > 0) {
            sb.append(", ");
          }
          sb.append(key);
        }
        LOG.info("Waiting on regionserver(s) to go down " + sb.toString());
        try {
          this.onlineServers.wait(1000);
        } catch (InterruptedException e) {
          // continue
        }
      }
    }
  }

  /*
   * Expire the passed server.  Add it to list of deadservers and queue a
   * shutdown processing.
   */
  public synchronized void expireServer(final HServerInfo hsi) {
    // First check the server to expire is actually online.  ServerName is of
    // the form: <hostname>,<port>,<startcode>
    String serverName = hsi.getServerName();
    HServerInfo info = this.onlineServers.get(serverName);
    if (info == null) {
      LOG.warn("Received expiration of " + hsi.getServerName() +
        " but server is not currently online");
      return;
    }
    if (this.deadservers.contains(serverName)) {
      // TODO: Can this happen?  It shouldn't be online in this case?
      LOG.warn("Received expiration of " + hsi.getServerName() +
          " but server shutdown is already in progress");
      return;
    }
    // Remove the server from the known servers lists and update load info BUT
    // add to deadservers first; do this so it'll show in dead servers list if
    // not in online servers list.
    this.deadservers.add(serverName);
    this.onlineServers.remove(serverName);
    this.serverConnections.remove(serverName);
    // If cluster is going down, yes, servers are going to be expiring; don't
    // process as a dead server
    if (this.clusterShutdown) {
      LOG.info("Cluster shutdown set; " + hsi.getServerName() +
        " expired; onlineServers=" + this.onlineServers.size());
      if (this.onlineServers.isEmpty()) {
        master.stop("Cluster shutdown set; onlineServer=0");
      }
      return;
    }
    CatalogTracker ct = this.master.getCatalogTracker();
    // Was this server carrying root?
    boolean carryingRoot;
    try {
      HServerAddress address = ct.getRootLocation();
      carryingRoot = address != null &&
        hsi.getServerAddress().equals(address);
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      LOG.info("Interrupted");
      return;
    }
    // Was this server carrying meta?  Can't ask CatalogTracker because it
    // may have reset the meta location as null already (it may have already
    // run into the fact that meta is dead).  I can ask the assignment manager;
    // it has an in-memory list of who has what.  This list will be cleared as
    // we process the dead server, but it should be fine to ask it now.
    HServerAddress address = ct.getMetaLocation();
    boolean carryingMeta =
      address != null && hsi.getServerAddress().equals(address);
    if (carryingRoot || carryingMeta) {
      this.services.getExecutorService().submit(new MetaServerShutdownHandler(this.master,
        this.services, this.deadservers, info, carryingRoot, carryingMeta));
    } else {
      this.services.getExecutorService().submit(new ServerShutdownHandler(this.master,
        this.services, this.deadservers, info));
    }
    LOG.debug("Added=" + serverName +
      " to dead servers, submitted shutdown handler to be executed, root=" +
        carryingRoot + ", meta=" + carryingMeta);
  }

  // RPC methods to region servers

  /**
   * Sends an OPEN RPC to the specified server to open the specified region.
   * <p>
   * Open should not fail but can if server just crashed.
   * <p>
   * @param server server to open a region
   * @param region region to open
   */
  public void sendRegionOpen(HServerInfo server, HRegionInfo region)
  throws IOException {
    HRegionInterface hri = getServerConnection(server);
    if (hri == null) {
      LOG.warn("Attempting to send OPEN RPC to server " + server.getServerName()
          + " failed because no RPC connection found to this server");
      return;
    }
    hri.openRegion(region);
  }

  /**
   * Sends an OPEN RPC to the specified server to open the specified regions.
   * <p>
   * Open should not fail but can if server just crashed.
   * <p>
   * @param server server to open regions
   * @param regions regions to open
   */
  public void sendRegionOpen(HServerInfo server, List<HRegionInfo> regions)
  throws IOException {
    HRegionInterface hri = getServerConnection(server);
    if (hri == null) {
      LOG.warn("Attempting to send OPEN RPC to server " + server.getServerName()
          + " failed because no RPC connection found to this server");
      return;
    }
    hri.openRegions(regions);
  }

  /**
   * Sends a CLOSE RPC to the specified server to close the specified region.
   * <p>
   * A region server could reject the close request because it either does not
   * have the specified region or the region is being split.
   * @param server server to close a region
   * @param region region to close
   * @return true if server acknowledged close, false if not
   * @throws IOException
   */
  public boolean sendRegionClose(HServerInfo server, HRegionInfo region)
  throws IOException {
    if (server == null) throw new NullPointerException("Passed server is null");
    HRegionInterface hri = getServerConnection(server);
    if (hri == null) {
      throw new IOException("Attempting to send CLOSE RPC to server " +
        server.getServerName() + " for region " +
        region.getRegionNameAsString() +
        " failed because no RPC connection found to this server");
    }
    return hri.closeRegion(region);
  }
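
  // Descriptive note (not from this source): unlike sendRegionOpen above,
  // which only logs a warning when no RPC connection is found, this method
  // throws an IOException, presumably because the caller needs to know whether
  // the close request was actually delivered.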

  /**
   * @param info
   * @return The cached HRegionInterface (RPC proxy) for the passed server,
   * creating and caching a new connection if none exists yet.
   * @throws IOException
   * @throws RetriesExhaustedException wrapping a ConnectException if failed
   * putting up proxy.
   */
  private HRegionInterface getServerConnection(HServerInfo info)
  throws IOException {
    HConnection connection =
      HConnectionManager.getConnection(this.master.getConfiguration());
    HRegionInterface hri = serverConnections.get(info.getServerName());
    if (hri == null) {
      LOG.debug("New connection to " + info.getServerName());
      hri = connection.getHRegionConnection(info.getServerAddress(), false);
      this.serverConnections.put(info.getServerName(), hri);
    }
    return hri;
  }

  /**
   * Waits for the regionservers to report in.
   * @return Count of regions out on cluster
   * @throws InterruptedException
   */
  public int waitForRegionServers()
  throws InterruptedException {
    long interval = this.master.getConfiguration().
      getLong("hbase.master.wait.on.regionservers.interval", 1500);
    long timeout = this.master.getConfiguration().
      getLong("hbase.master.wait.on.regionservers.timeout", 4500);
    int minToStart = this.master.getConfiguration().
      getInt("hbase.master.wait.on.regionservers.mintostart", 1);
    int maxToStart = this.master.getConfiguration().
      getInt("hbase.master.wait.on.regionservers.maxtostart", Integer.MAX_VALUE);
    // So, if the number of regionservers is > 0 and it's been a while since
    // the last check-in, break; else just stall here.
    int count = 0;
    long slept = 0;
    for (int oldcount = countOfRegionServers(); !this.master.isStopped();) {
      Thread.sleep(interval);
      slept += interval;
      count = countOfRegionServers();
      if (count == oldcount && count >= minToStart && slept >= timeout) {
        LOG.info("Finished waiting for regionserver count to settle; " +
            "count=" + count + ", sleptFor=" + slept);
        break;
      }
      if (count >= maxToStart) {
        LOG.info("At least the max configured number of regionserver(s) have " +
            "checked in: " + count);
        break;
      }
      if (count == 0) {
        LOG.info("Waiting on regionserver(s) to checkin");
      } else {
        LOG.info("Waiting on regionserver(s) count to settle; currently=" + count);
      }
      oldcount = count;
    }
    // Count how many regions are deployed out on the cluster.  If a fresh
    // start, it'll be none, but if not a fresh start, we'll have registered
    // servers when they came in on {@link #regionServerReport(HServerInfo)} as
    // opposed to {@link #regionServerStartup(HServerInfo)} and they'll be
    // carrying an actual server load.
    int regionCount = 0;
    for (Map.Entry<String, HServerInfo> e: this.onlineServers.entrySet()) {
      HServerLoad load = e.getValue().getLoad();
      if (load != null) regionCount += load.getLoad();
    }
    LOG.info("Exiting wait on regionserver(s) to checkin; count=" + count +
      ", stopped=" + this.master.isStopped() +
      ", count of regions out on cluster=" + regionCount);
    return regionCount;
  }
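
  // Illustrative reading of the loop above (default values are those shown in
  // the getLong/getInt calls): the wait normally ends once the region server
  // count has held steady across one 1500 ms interval, at least mintostart (1)
  // server has reported in, and at least timeout (4500 ms) has elapsed; it
  // ends earlier if maxtostart servers have checked in or the master is
  // stopped.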

  /**
   * @return A copy of the internal list of online servers.
   */
  public List<HServerInfo> getOnlineServersList() {
    // TODO: optimize the load balancer call so we don't need to make a new list
    return new ArrayList<HServerInfo>(onlineServers.values());
  }

  public boolean isServerOnline(String serverName) {
    return onlineServers.containsKey(serverName);
  }

  public void shutdownCluster() {
    this.clusterShutdown = true;
    this.master.stop("Cluster shutdown requested");
  }

  public boolean isClusterShutdown() {
    return this.clusterShutdown;
  }

  /**
   * Stop the ServerManager.  Currently does nothing.
   */
  public void stop() {

  }
}