
1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.master;
20  
21  import java.io.IOException;
22  import java.net.InetAddress;
23  import java.util.ArrayList;
24  import java.util.Collections;
25  import java.util.HashMap;
26  import java.util.HashSet;
27  import java.util.Iterator;
28  import java.util.List;
29  import java.util.Map;
30  import java.util.Map.Entry;
31  import java.util.Set;
32  import java.util.SortedMap;
33  import java.util.concurrent.ConcurrentHashMap;
34  import java.util.concurrent.ConcurrentSkipListMap;
35  import java.util.concurrent.CopyOnWriteArrayList;
36  
37  import org.apache.commons.logging.Log;
38  import org.apache.commons.logging.LogFactory;
39  import org.apache.hadoop.conf.Configuration;
40  import org.apache.hadoop.hbase.ClockOutOfSyncException;
41  import org.apache.hadoop.hbase.HRegionInfo;
42  import org.apache.hadoop.hbase.RegionLoad;
43  import org.apache.hadoop.hbase.Server;
44  import org.apache.hadoop.hbase.ServerLoad;
45  import org.apache.hadoop.hbase.ServerName;
46  import org.apache.hadoop.hbase.YouAreDeadException;
47  import org.apache.hadoop.hbase.classification.InterfaceAudience;
48  import org.apache.hadoop.hbase.client.ClusterConnection;
49  import org.apache.hadoop.hbase.client.ConnectionFactory;
50  import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
51  import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer;
52  import org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler;
53  import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler;
54  import org.apache.hadoop.hbase.monitoring.MonitoredTask;
55  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
56  import org.apache.hadoop.hbase.protobuf.RequestConverter;
57  import org.apache.hadoop.hbase.protobuf.ResponseConverter;
58  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService;
59  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionRequest;
60  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionResponse;
61  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.ServerInfo;
62  import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos.SplitLogTask.RecoveryMode;
63  import org.apache.hadoop.hbase.regionserver.HRegionServer;
64  import org.apache.hadoop.hbase.regionserver.RegionOpeningState;
65  import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
66  import org.apache.hadoop.hbase.util.Bytes;
67  import org.apache.hadoop.hbase.util.Triple;
68  import org.apache.hadoop.hbase.util.Pair;
69  import org.apache.hadoop.hbase.util.RetryCounter;
70  import org.apache.hadoop.hbase.util.RetryCounterFactory;
71  import org.apache.hadoop.hbase.zookeeper.ZKUtil;
72  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
73  import org.apache.zookeeper.KeeperException;
74  
75  import com.google.common.annotations.VisibleForTesting;
76  import com.google.protobuf.ServiceException;
77  
78  /**
79   * The ServerManager class manages info about region servers.
80   * <p>
81   * Maintains lists of online and dead servers.  Processes the startups,
82   * shutdowns, and deaths of region servers.
83   * <p>
84   * Servers are distinguished in two different ways.  A given server has a
85   * location, specified by hostname and port, and of which there can only be one
86   * online at any given time.  A server instance is specified by the location
87   * (hostname and port) as well as the startcode (timestamp from when the server
88   * was started).  This is used to differentiate a restarted instance of a given
89   * server from the original instance.
90   * <p>
91   * If a server is known not to be running any more, it is called dead. The dead
92   * server needs to be handled by a ServerShutdownHandler.  If the handler is not
93   * enabled yet, the server can't be handled right away, so it is queued up.
94   * After the handler is enabled, the server is submitted to it for handling.
95   * However, the handler may be only partially enabled.  If so,
96   * the server cannot be fully processed and is queued up for further processing.
97   * A server is fully processed only after the handler is fully enabled
98   * and has completed the handling.
99   */
100 @InterfaceAudience.Private
101 public class ServerManager {
102   public static final String WAIT_ON_REGIONSERVERS_MAXTOSTART =
103       "hbase.master.wait.on.regionservers.maxtostart";
104 
105   public static final String WAIT_ON_REGIONSERVERS_MINTOSTART =
106       "hbase.master.wait.on.regionservers.mintostart";
107 
108   public static final String WAIT_ON_REGIONSERVERS_TIMEOUT =
109       "hbase.master.wait.on.regionservers.timeout";
110 
111   public static final String WAIT_ON_REGIONSERVERS_INTERVAL =
112       "hbase.master.wait.on.regionservers.interval";
113 
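  // Illustrative note (not in the original source): these keys are ordinary HBase
  // configuration properties, so a deployment could tune the startup wait in
  // hbase-site.xml, for example (values are hypothetical):
  //
  //   <property>
  //     <name>hbase.master.wait.on.regionservers.mintostart</name>
  //     <value>3</value>
  //   </property>
  //   <property>
  //     <name>hbase.master.wait.on.regionservers.timeout</name>
  //     <value>10000</value>
  //   </property>
  //
  // See waitForRegionServers(MonitoredTask) below for how the defaults
  // (interval 1500 ms, timeout 4500 ms) are applied.
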
114   private static final Log LOG = LogFactory.getLog(ServerManager.class);
115 
116   // Set if we are to shutdown the cluster.
117   private volatile boolean clusterShutdown = false;
118 
119   private final SortedMap<byte[], Long> flushedSequenceIdByRegion =
120     new ConcurrentSkipListMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
121 
122   /** Map of registered servers to their current load */
123   private final ConcurrentHashMap<ServerName, ServerLoad> onlineServers =
124     new ConcurrentHashMap<ServerName, ServerLoad>();
125 
126   /**
127    * Map of admin interfaces per registered regionserver; these interfaces we use to control
128    * regionservers out on the cluster
129    */
130   private final Map<ServerName, AdminService.BlockingInterface> rsAdmins =
131     new HashMap<ServerName, AdminService.BlockingInterface>();
132 
133   /**
134    * List of region servers ({@link ServerName}) that should not get any more new
135    * regions.
136    */
137   private final ArrayList<ServerName> drainingServers =
138     new ArrayList<ServerName>();
139 
140   private final Server master;
141   private final MasterServices services;
142   private final ClusterConnection connection;
143 
144   private final DeadServer deadservers = new DeadServer();
145 
146   private final long maxSkew;
147   private final long warningSkew;
148 
149   private final RetryCounterFactory pingRetryCounterFactory;
150 
151   /**
152    * Set of region servers which are dead but not processed immediately. If a
153    * server dies before the master enables ServerShutdownHandler, the server will be
154    * added to this set and will be processed when the master calls
155    * {@link ServerManager#processQueuedDeadServers()}.
156    * <p>
157    * A dead server is a server instance known to be dead, not listed in the /hbase/rs
158    * znode any more. It may have not been submitted to ServerShutdownHandler yet
159    * because the handler is not enabled.
160    * <p>
161    * A dead server, which has been submitted to ServerShutdownHandler while the
162    * handler is not enabled, is queued up.
163    * <p>
164    * So this is a set of region servers known to be dead but not submitted to
165    * ServerShutdownHandler for processing yet.
166    */
167   private Set<ServerName> queuedDeadServers = new HashSet<ServerName>();
168 
169   /**
170    * Set of region servers which are dead and submitted to ServerShutdownHandler to process but not
171    * fully processed immediately.
172    * <p>
173    * If one server died before assignment manager finished the failover cleanup, the server will be
174    * added to this set and will be processed through calling
175    * {@link ServerManager#processQueuedDeadServers()} by assignment manager.
176    * <p>
177    * The Boolean value indicates whether log split is needed inside ServerShutdownHandler
178    * <p>
179    * ServerShutdownHandler processes a dead server submitted to the handler after the handler is
180    * enabled. It may not be able to complete the processing because meta is not yet online or master
181    * is currently in startup mode. In this case, the dead server will be parked in this set
182    * temporarily.
183    */
184   private Map<ServerName, Boolean> requeuedDeadServers
185     = new ConcurrentHashMap<ServerName, Boolean>();
186 
187   /** Listeners that are called on server events. */
188   private List<ServerListener> listeners = new CopyOnWriteArrayList<ServerListener>();
189 
190   /**
191    * Constructor.
192    * @param master the master server
193    * @param services the master services
194    * @throws IOException if the cluster connection cannot be created
195    */
196   public ServerManager(final Server master, final MasterServices services)
197       throws IOException {
198     this(master, services, true);
199   }
200 
201   ServerManager(final Server master, final MasterServices services,
202       final boolean connect) throws IOException {
203     this.master = master;
204     this.services = services;
205     Configuration c = master.getConfiguration();
206     maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
207     warningSkew = c.getLong("hbase.master.warningclockskew", 10000);
208     this.connection = connect ? (ClusterConnection)ConnectionFactory.createConnection(c) : null;
209     int pingMaxAttempts = Math.max(1, master.getConfiguration().getInt(
210       "hbase.master.maximum.ping.server.attempts", 10));
211     int pingSleepInterval = Math.max(1, master.getConfiguration().getInt(
212       "hbase.master.ping.server.retry.sleep.interval", 100));
213     this.pingRetryCounterFactory = new RetryCounterFactory(pingMaxAttempts, pingSleepInterval);
214   }
215 
216   /**
217    * Add the listener to the notification list.
218    * @param listener The ServerListener to register
219    */
220   public void registerListener(final ServerListener listener) {
221     this.listeners.add(listener);
222   }
223 
224   /**
225    * Remove the listener from the notification list.
226    * @param listener The ServerListener to unregister
227    */
228   public boolean unregisterListener(final ServerListener listener) {
229     return this.listeners.remove(listener);
230   }
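
  // Sketch (added for illustration, not part of the original class): a component that
  // wants to react to regionserver membership changes can register a ServerListener
  // with this manager. The sketch assumes ServerListener declares only the
  // serverAdded/serverRemoved callbacks this class invokes; any additional callbacks
  // in a given HBase version would also need to be implemented.
  //
  //   serverManager.registerListener(new ServerListener() {
  //     @Override
  //     public void serverAdded(ServerName serverName) {
  //       LOG.info("Regionserver joined: " + serverName);
  //     }
  //     @Override
  //     public void serverRemoved(ServerName serverName) {
  //       LOG.info("Regionserver left: " + serverName);
  //     }
  //   });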
231 
232   /**
233    * Let the server manager know a new regionserver has come online
234    * @param ia The remote address
235    * @param port The remote port
236    * @param serverStartcode The start code (startup timestamp) of the region server
237    * @param serverCurrentTime The current time of the region server in ms
238    * @return The ServerName we know this server as.
239    * @throws IOException
240    */
241   ServerName regionServerStartup(final InetAddress ia, final int port,
242     final long serverStartcode, long serverCurrentTime)
243   throws IOException {
244     // Test for case where we get a region startup message from a regionserver
245     // that has been quickly restarted but whose znode expiration handler has
246     // not yet run, or from a server whose fail we are currently processing.
247     // Test its host+port combo is present in serverAddresstoServerInfo.  If it
248     // is, reject the server and trigger its expiration. The next time it comes
249     // in, it should have been removed from serverAddressToServerInfo and queued
250     // for processing by ProcessServerShutdown.
251     ServerName sn = ServerName.valueOf(ia.getHostName(), port, serverStartcode);
252     checkClockSkew(sn, serverCurrentTime);
253     checkIsDead(sn, "STARTUP");
254     if (!checkAndRecordNewServer(sn, ServerLoad.EMPTY_SERVERLOAD)) {
255       LOG.warn("THIS SHOULD NOT HAPPEN, RegionServerStartup"
256         + " could not record the server: " + sn);
257     }
258     return sn;
259   }
260 
261   /**
262    * Updates last flushed sequence Ids for the regions on server sn
263    * @param sn
264    * @param hsl
265    */
266   private void updateLastFlushedSequenceIds(ServerName sn, ServerLoad hsl) {
267     Map<byte[], RegionLoad> regionsLoad = hsl.getRegionsLoad();
268     for (Entry<byte[], RegionLoad> entry : regionsLoad.entrySet()) {
269       byte[] encodedRegionName = Bytes.toBytes(HRegionInfo.encodeRegionName(entry.getKey()));
270       Long existingValue = flushedSequenceIdByRegion.get(encodedRegionName);
271       long l = entry.getValue().getCompleteSequenceId();
272       if (existingValue != null) {
273         if (l != -1 && l < existingValue) {
274           LOG.warn("RegionServer " + sn +
275               " indicates a last flushed sequence id (" + entry.getValue() +
276               ") that is less than the previous last flushed sequence id (" +
277               existingValue + ") for region " +
278               Bytes.toString(entry.getKey()) + " Ignoring.");
279 
280           continue; // Don't let smaller sequence ids override greater sequence ids.
281         }
282       }
283       flushedSequenceIdByRegion.put(encodedRegionName, l);
284     }
285   }
286 
287   void regionServerReport(ServerName sn,
288       ServerLoad sl) throws YouAreDeadException {
289     checkIsDead(sn, "REPORT");
290     if (null == this.onlineServers.replace(sn, sl)) {
291       // Already have this host+port combo and it's just a different start code?
292       // Just let the server in. Presume master joining a running cluster.
293       // recordNewServer is what happens at the end of reportServerStartup.
294       // The only thing we are skipping is passing back to the regionserver
295       // the ServerName to use. Here we presume a master has already done
296       // that so we'll press on with whatever it gave us for ServerName.
297       if (!checkAndRecordNewServer(sn, sl)) {
298         LOG.info("RegionServerReport ignored, could not record the server: " + sn);
299         return; // Not recorded, so no need to move on
300       }
301     }
302     updateLastFlushedSequenceIds(sn, sl);
303   }
304 
305   /**
306    * Checks if a server with the same host and port already exists;
307    * if not, or if the existing one has a smaller start code, records the new server.
308    *
309    * @param serverName the server to check and record
310    * @param sl the server load on the server
311    * @return true if the server is recorded, otherwise false
312    */
313   boolean checkAndRecordNewServer(
314       final ServerName serverName, final ServerLoad sl) {
315     ServerName existingServer = null;
316     synchronized (this.onlineServers) {
317       existingServer = findServerWithSameHostnamePortWithLock(serverName);
318       if (existingServer != null && (existingServer.getStartcode() > serverName.getStartcode())) {
319         LOG.info("Server serverName=" + serverName + " rejected; we already have "
320             + existingServer.toString() + " registered with same hostname and port");
321         return false;
322       }
323       recordNewServerWithLock(serverName, sl);
324     }
325 
326     // Tell our listeners that a server was added
327     if (!this.listeners.isEmpty()) {
328       for (ServerListener listener : this.listeners) {
329         listener.serverAdded(serverName);
330       }
331     }
332 
333     // Note that we assume that same ts means same server, and don't expire in that case.
334     //  TODO: ts can theoretically collide due to clock shifts, so this is a bit hacky.
335     if (existingServer != null && (existingServer.getStartcode() < serverName.getStartcode())) {
336       LOG.info("Triggering server recovery; existingServer " +
337           existingServer + " looks stale, new server:" + serverName);
338       expireServer(existingServer);
339     }
340     return true;
341   }
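
  // Worked example (illustrative only) of the start-code comparison above: suppose
  // "rs1.example.com,16020,100" is already registered and a report arrives for the
  // same hostname and port.
  //   - incoming startcode  50 (< 100): the report is rejected and false is returned;
  //   - incoming startcode 200 (> 100): the new instance is recorded and the stale
  //     instance with startcode 100 is expired via expireServer();
  //   - an equal startcode is assumed to be the same server, so nothing is expired.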
342 
343   /**
344    * Checks the clock skew between the server and the master. If the skew exceeds the
345    * configured max, it will throw an exception; if it exceeds the configured warning threshold,
346    * it will log a warning but start normally.
347    * @param serverName Incoming server's name
348    * @param serverCurrentTime The current time of the region server, in ms
349    * @throws ClockOutOfSyncException if the skew exceeds the configured max value
350    */
351   private void checkClockSkew(final ServerName serverName, final long serverCurrentTime)
352   throws ClockOutOfSyncException {
353     long skew = Math.abs(System.currentTimeMillis() - serverCurrentTime);
354     if (skew > maxSkew) {
355       String message = "Server " + serverName + " has been " +
356         "rejected; Reported time is too far out of sync with master.  " +
357         "Time difference of " + skew + "ms > max allowed of " + maxSkew + "ms";
358       LOG.warn(message);
359       throw new ClockOutOfSyncException(message);
360     } else if (skew > warningSkew){
361       String message = "Reported time for server " + serverName + " is out of sync with master " +
362         "by " + skew + "ms. (Warning threshold is " + warningSkew + "ms; " +
363         "error threshold is " + maxSkew + "ms)";
364       LOG.warn(message);
365     }
366   }
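
  // Worked example (illustrative only) using the defaults set in the constructor
  // (maxSkew = 30000 ms, warningSkew = 10000 ms): if the master clock reads
  // 12:00:40.000 and a starting regionserver reports 12:00:00.000, the absolute skew
  // is 40000 ms > 30000 ms and the server is rejected with ClockOutOfSyncException.
  // A report of 12:00:25.000 (skew 15000 ms) only logs a warning, and 12:00:35.000
  // (skew 5000 ms) passes silently.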
367 
368   /**
369    * If this server is on the dead list, reject it with a YouAreDeadException.
370    * If it was dead but came back with a new start code, remove the old entry
371    * from the dead list.
372    * @param serverName
373    * @param what START or REPORT
374    * @throws org.apache.hadoop.hbase.YouAreDeadException
375    */
376   private void checkIsDead(final ServerName serverName, final String what)
377       throws YouAreDeadException {
378     if (this.deadservers.isDeadServer(serverName)) {
379       // host name, port and start code all match with existing one of the
380       // dead servers. So, this server must be dead.
381       String message = "Server " + what + " rejected; currently processing " +
382           serverName + " as dead server";
383       LOG.debug(message);
384       throw new YouAreDeadException(message);
385     }
386     // Remove any dead server with the same hostname and port as the newly checking-in RS
387     // after master initialization. See HBASE-5916 for more information.
388     if ((this.services == null || ((HMaster) this.services).isInitialized())
389         && this.deadservers.cleanPreviousInstance(serverName)) {
390       // This server has now become alive after we marked it as dead.
391       // We removed its previous entry from the dead list to reflect it.
392       LOG.debug(what + ":" + " Server " + serverName + " came back up," +
393           " removed it from the dead servers list");
394     }
395   }
396 
397   /**
398    * Assumes onlineServers is locked.
399    * @return ServerName with matching hostname and port.
400    */
401   private ServerName findServerWithSameHostnamePortWithLock(
402       final ServerName serverName) {
403     for (ServerName sn: this.onlineServers.keySet()) {
404       if (ServerName.isSameHostnameAndPort(serverName, sn)) return sn;
405     }
406     return null;
407   }
408 
409   /**
410    * Adds a server to the onlineServers list. onlineServers should be locked.
411    * @param serverName The remote server's name.
412    * @param sl The current load on the server (may be
413    *           ServerLoad.EMPTY_SERVERLOAD at startup).
414    */
415   @VisibleForTesting
416   void recordNewServerWithLock(final ServerName serverName, final ServerLoad sl) {
417     LOG.info("Registering server=" + serverName);
418     this.onlineServers.put(serverName, sl);
419     this.rsAdmins.remove(serverName);
420   }
421 
422   public long getLastFlushedSequenceId(byte[] encodedRegionName) {
423     long seqId = -1L;
424     if (flushedSequenceIdByRegion.containsKey(encodedRegionName)) {
425       seqId = flushedSequenceIdByRegion.get(encodedRegionName);
426     }
427     return seqId;
428   }
429 
430   /**
431    * @param serverName
432    * @return ServerLoad if serverName is known else null
433    */
434   public ServerLoad getLoad(final ServerName serverName) {
435     return this.onlineServers.get(serverName);
436   }
437 
438   /**
439    * Compute the average load across all region servers.
440    * Currently, this uses a very naive computation - just uses the number of
441    * regions being served, ignoring stats about number of requests.
442    * @return the average load
443    */
444   public double getAverageLoad() {
445     int totalLoad = 0;
446     int numServers = 0;
447     for (ServerLoad sl: this.onlineServers.values()) {
448         numServers++;
449         totalLoad += sl.getNumberOfRegions();
450     }
451     return numServers == 0 ? 0 :
452       (double)totalLoad / (double)numServers;
453   }
454 
455   /** @return the count of active regionservers */
456   public int countOfRegionServers() {
457     // Presumes onlineServers is a concurrent map
458     return this.onlineServers.size();
459   }
460 
461   /**
462    * @return Read-only map of servers to serverinfo
463    */
464   public Map<ServerName, ServerLoad> getOnlineServers() {
465     // Presumption is that iterating the returned Map is OK.
466     synchronized (this.onlineServers) {
467       return Collections.unmodifiableMap(this.onlineServers);
468     }
469   }
470 
471 
472   public DeadServer getDeadServers() {
473     return this.deadservers;
474   }
475 
476   /**
477    * Checks if any dead servers are currently being processed.
478    * @return true if any RS are being processed as dead, false if not
479    */
480   public boolean areDeadServersInProgress() {
481     return this.deadservers.areDeadServersInProgress();
482   }
483 
484   void letRegionServersShutdown() {
485     long previousLogTime = 0;
486     ServerName sn = master.getServerName();
487     ZooKeeperWatcher zkw = master.getZooKeeper();
488     int onlineServersCt;
489     while ((onlineServersCt = onlineServers.size()) > 0){
490 
491       if (System.currentTimeMillis() > (previousLogTime + 1000)) {
492         Set<ServerName> remainingServers = onlineServers.keySet();
493         synchronized (onlineServers) {
494           if (remainingServers.size() == 1 && remainingServers.contains(sn)) {
495             // Master will delete itself later.
496             return;
497           }
498         }
499         StringBuilder sb = new StringBuilder();
500         // It's ok here to not sync on onlineServers - merely logging
501         for (ServerName key : remainingServers) {
502           if (sb.length() > 0) {
503             sb.append(", ");
504           }
505           sb.append(key);
506         }
507         LOG.info("Waiting on regionserver(s) to go down " + sb.toString());
508         previousLogTime = System.currentTimeMillis();
509       }
510 
511       try {
512         List<String> servers = ZKUtil.listChildrenNoWatch(zkw, zkw.rsZNode);
513         if (servers == null || servers.size() == 0 || (servers.size() == 1
514             && servers.contains(sn.toString()))) {
515           LOG.info("ZK shows there is only the master self online, exiting now");
516           // Master could have lost some ZK events, no need to wait more.
517           break;
518         }
519       } catch (KeeperException ke) {
520         LOG.warn("Failed to list regionservers", ke);
521         // ZK is malfunctioning, don't hang here
522         break;
523       }
524       synchronized (onlineServers) {
525         try {
526           if (onlineServersCt == onlineServers.size()) onlineServers.wait(100);
527         } catch (InterruptedException ignored) {
528           // continue
529         }
530       }
531     }
532   }
533 
534   /*
535    * Expire the passed server.  Add it to list of dead servers and queue a
536    * shutdown processing.
537    */
538   public synchronized void expireServer(final ServerName serverName) {
539     if (serverName.equals(master.getServerName())) {
540       if (!(master.isAborted() || master.isStopped())) {
541         master.stop("We lost our znode?");
542       }
543       return;
544     }
545     if (!services.isServerShutdownHandlerEnabled()) {
546       LOG.info("Master doesn't enable ServerShutdownHandler during initialization, "
547           + "delay expiring server " + serverName);
548       this.queuedDeadServers.add(serverName);
549       return;
550     }
551     if (this.deadservers.isDeadServer(serverName)) {
552       // TODO: Can this happen?  It shouldn't be online in this case?
553       LOG.warn("Expiration of " + serverName +
554           " but server shutdown already in progress");
555       return;
556     }
557     synchronized (onlineServers) {
558       if (!this.onlineServers.containsKey(serverName)) {
559         LOG.warn("Expiration of " + serverName + " but server not online");
560       }
561       // Remove the server from the known servers lists and update load info BUT
562       // add to deadservers first; do this so it'll show in dead servers list if
563       // not in online servers list.
564       this.deadservers.add(serverName);
565       this.onlineServers.remove(serverName);
566       onlineServers.notifyAll();
567     }
568     this.rsAdmins.remove(serverName);
569     // If cluster is going down, yes, servers are going to be expiring; don't
570     // process as a dead server
571     if (this.clusterShutdown) {
572       LOG.info("Cluster shutdown set; " + serverName +
573         " expired; onlineServers=" + this.onlineServers.size());
574       if (this.onlineServers.isEmpty()) {
575         master.stop("Cluster shutdown set; onlineServer=0");
576       }
577       return;
578     }
579 
580     boolean carryingMeta = services.getAssignmentManager().isCarryingMeta(serverName);
581     if (carryingMeta) {
582       this.services.getExecutorService().submit(new MetaServerShutdownHandler(this.master,
583         this.services, this.deadservers, serverName));
584     } else {
585       this.services.getExecutorService().submit(new ServerShutdownHandler(this.master,
586         this.services, this.deadservers, serverName, true));
587     }
588     LOG.debug("Added=" + serverName +
589       " to dead servers, submitted shutdown handler to be executed meta=" + carryingMeta);
590 
591     // Tell our listeners that a server was removed
592     if (!this.listeners.isEmpty()) {
593       for (ServerListener listener : this.listeners) {
594         listener.serverRemoved(serverName);
595       }
596     }
597   }
598 
599   public synchronized void processDeadServer(final ServerName serverName) {
600     this.processDeadServer(serverName, false);
601   }
602 
603   public synchronized void processDeadServer(final ServerName serverName, boolean shouldSplitWal) {
604     // When assignment manager is cleaning up the zookeeper nodes and rebuilding the
605     // in-memory region states, region servers could be down. Meta table can and
606     // should be re-assigned, log splitting can be done too. However, it is better to
607     // wait till the cleanup is done before re-assigning user regions.
608     //
609     // We should not wait in the server shutdown handler thread since it can clog
610     // the handler threads and meta table could not be re-assigned in case
611     // the corresponding server is down. So we queue them up here instead.
612     if (!services.getAssignmentManager().isFailoverCleanupDone()) {
613       requeuedDeadServers.put(serverName, shouldSplitWal);
614       return;
615     }
616 
617     this.deadservers.add(serverName);
618     this.services.getExecutorService().submit(
619       new ServerShutdownHandler(this.master, this.services, this.deadservers, serverName,
620           shouldSplitWal));
621   }
622 
623   /**
624    * Process the servers which died during master's initialization. It will be
625    * called after HMaster#assignMeta and AssignmentManager#joinCluster.
626    */
627   synchronized void processQueuedDeadServers() {
628     if (!services.isServerShutdownHandlerEnabled()) {
629       LOG.info("Master hasn't enabled ServerShutdownHandler");
630     }
631     Iterator<ServerName> serverIterator = queuedDeadServers.iterator();
632     while (serverIterator.hasNext()) {
633       ServerName tmpServerName = serverIterator.next();
634       expireServer(tmpServerName);
635       serverIterator.remove();
636       requeuedDeadServers.remove(tmpServerName);
637     }
638 
639     if (!services.getAssignmentManager().isFailoverCleanupDone()) {
640       LOG.info("AssignmentManager hasn't finished failover cleanup; waiting");
641     }
642 
643     for(ServerName tmpServerName : requeuedDeadServers.keySet()){
644       processDeadServer(tmpServerName, requeuedDeadServers.get(tmpServerName));
645     }
646     requeuedDeadServers.clear();
647   }
648 
649   /*
650    * Remove the server from the drain list.
651    */
652   public boolean removeServerFromDrainList(final ServerName sn) {
653     // Warn if the server (sn) is not online.  ServerName is of the form:
654     // <hostname> , <port> , <startcode>
655 
656     if (!this.isServerOnline(sn)) {
657       LOG.warn("Server " + sn + " is not currently online. " +
658                "Removing from draining list anyway, as requested.");
659     }
660     // Remove the server from the draining servers lists.
661     return this.drainingServers.remove(sn);
662   }
663 
664   /*
665    * Add the server to the drain list.
666    */
667   public boolean addServerToDrainList(final ServerName sn) {
668     // Warn if the server (sn) is not online.  ServerName is of the form:
669     // <hostname> , <port> , <startcode>
670 
671     if (!this.isServerOnline(sn)) {
672       LOG.warn("Server " + sn + " is not currently online. " +
673                "Ignoring request to add it to draining list.");
674       return false;
675     }
676     // Add the server to the draining servers lists, if it's not already in
677     // it.
678     if (this.drainingServers.contains(sn)) {
679       LOG.warn("Server " + sn + " is already in the draining server list. " +
680                "Ignoring request to add it again.");
681       return false;
682     }
683     return this.drainingServers.add(sn);
684   }
685 
686   // RPC methods to region servers
687 
688   /**
689    * Sends an OPEN RPC to the specified server to open the specified region.
690    * <p>
691    * Open should not fail but can if server just crashed.
692    * <p>
693    * @param server server to open a region
694    * @param region region to open
695    * @param versionOfOfflineNode that needs to be present in the offline node
696    * when RS tries to change the state from OFFLINE to other states.
697    * @param favoredNodes
698    */
699   public RegionOpeningState sendRegionOpen(final ServerName server,
700       HRegionInfo region, int versionOfOfflineNode, List<ServerName> favoredNodes)
701   throws IOException {
702     AdminService.BlockingInterface admin = getRsAdmin(server);
703     if (admin == null) {
704       LOG.warn("Attempting to send OPEN RPC to server " + server.toString() +
705         " failed because no RPC connection found to this server");
706       return RegionOpeningState.FAILED_OPENING;
707     }
708     OpenRegionRequest request = RequestConverter.buildOpenRegionRequest(server, 
709       region, versionOfOfflineNode, favoredNodes, 
710       (RecoveryMode.LOG_REPLAY == this.services.getMasterFileSystem().getLogRecoveryMode()));
711     try {
712       OpenRegionResponse response = admin.openRegion(null, request);
713       return ResponseConverter.getRegionOpeningState(response);
714     } catch (ServiceException se) {
715       throw ProtobufUtil.getRemoteException(se);
716     }
717   }
718 
719   /**
720    * Sends an OPEN RPC to the specified server to open the specified region.
721    * <p>
722    * Open should not fail but can if server just crashed.
723    * <p>
724    * @param server server to open a region
725    * @param regionOpenInfos info of a list of regions to open
726    * @return a list of region opening states
727    */
728   public List<RegionOpeningState> sendRegionOpen(ServerName server,
729       List<Triple<HRegionInfo, Integer, List<ServerName>>> regionOpenInfos)
730   throws IOException {
731     AdminService.BlockingInterface admin = getRsAdmin(server);
732     if (admin == null) {
733       LOG.warn("Attempting to send OPEN RPC to server " + server.toString() +
734         " failed because no RPC connection found to this server");
735       return null;
736     }
737 
738     OpenRegionRequest request = RequestConverter.buildOpenRegionRequest(server, regionOpenInfos,
739       (RecoveryMode.LOG_REPLAY == this.services.getMasterFileSystem().getLogRecoveryMode()));
740     try {
741       OpenRegionResponse response = admin.openRegion(null, request);
742       return ResponseConverter.getRegionOpeningStateList(response);
743     } catch (ServiceException se) {
744       throw ProtobufUtil.getRemoteException(se);
745     }
746   }
747 
748   /**
749    * Sends a CLOSE RPC to the specified server to close the specified region.
750    * <p>
751    * A region server could reject the close request because it either does not
752    * have the specified region or the region is being split.
753    * @param server server to close the region on
754    * @param region region to close
755    * @param versionOfClosingNode
756    *   the version of znode to compare when RS transitions the znode from
757    *   CLOSING state.
758    * @param dest - if the region is moved to another server, the destination server. null otherwise.
759    * @return true if server acknowledged close, false if not
760    * @throws IOException
761    */
762   public boolean sendRegionClose(ServerName server, HRegionInfo region,
763     int versionOfClosingNode, ServerName dest, boolean transitionInZK) throws IOException {
764     if (server == null) throw new NullPointerException("Passed server is null");
765     AdminService.BlockingInterface admin = getRsAdmin(server);
766     if (admin == null) {
767       throw new IOException("Attempting to send CLOSE RPC to server " +
768         server.toString() + " for region " +
769         region.getRegionNameAsString() +
770         " failed because no RPC connection found to this server");
771     }
772     return ProtobufUtil.closeRegion(admin, server, region.getRegionName(),
773       versionOfClosingNode, dest, transitionInZK);
774   }
775 
776   public boolean sendRegionClose(ServerName server,
777       HRegionInfo region, int versionOfClosingNode) throws IOException {
778     return sendRegionClose(server, region, versionOfClosingNode, null, true);
779   }
780 
781   /**
782    * Sends a MERGE REGIONS RPC to the specified server to merge the specified
783    * regions.
784    * <p>
785    * A region server could reject the merge request because it does not
786    * host the specified regions.
787    * @param server server to merge regions
788    * @param region_a region to merge
789    * @param region_b region to merge
790    * @param forcible true if do a compulsory merge, otherwise we will only merge
791    *          two adjacent regions
792    * @throws IOException
793    */
794   public void sendRegionsMerge(ServerName server, HRegionInfo region_a,
795       HRegionInfo region_b, boolean forcible) throws IOException {
796     if (server == null)
797       throw new NullPointerException("Passed server is null");
798     if (region_a == null || region_b == null)
799       throw new NullPointerException("Passed region is null");
800     AdminService.BlockingInterface admin = getRsAdmin(server);
801     if (admin == null) {
802       throw new IOException("Attempting to send MERGE REGIONS RPC to server "
803           + server.toString() + " for region "
804           + region_a.getRegionNameAsString() + ","
805           + region_b.getRegionNameAsString()
806           + " failed because no RPC connection found to this server");
807     }
808     ProtobufUtil.mergeRegions(admin, region_a, region_b, forcible);
809   }
810 
811   /**
812    * Check if a region server is reachable and has the expected start code
813    */
814   public boolean isServerReachable(ServerName server) {
815     if (server == null) throw new NullPointerException("Passed server is null");
816 
817     RetryCounter retryCounter = pingRetryCounterFactory.create();
818     while (retryCounter.shouldRetry()) {
819       synchronized (this.onlineServers) {
820         if (this.deadservers.isDeadServer(server)) {
821           return false;
822         }
823       }
824       try {
825         AdminService.BlockingInterface admin = getRsAdmin(server);
826         if (admin != null) {
827           ServerInfo info = ProtobufUtil.getServerInfo(admin);
828           return info != null && info.hasServerName()
829             && server.getStartcode() == info.getServerName().getStartCode();
830         }
831       } catch (RegionServerStoppedException | ServerNotRunningYetException e) {
832         if (LOG.isDebugEnabled()) {
833           LOG.debug("Couldn't reach " + server, e);
834         }
835         break;
836       } catch (IOException ioe) {
837         if (LOG.isDebugEnabled()) {
838           LOG.debug("Couldn't reach " + server + ", try=" + retryCounter.getAttemptTimes() + " of "
839               + retryCounter.getMaxAttempts(), ioe);
840         }
841         try {
842           retryCounter.sleepUntilNextRetry();
843         } catch(InterruptedException ie) {
844           Thread.currentThread().interrupt();
845           break;
846         }
847       }
848     }
849     return false;
850   }
851 
852   /**
853    * @param sn ServerName of the remote regionserver
854    * @return Admin interface for the remote regionserver named <code>sn</code>
855    * @throws IOException
856    * @throws RetriesExhaustedException wrapping a ConnectException if failed
857    */
858   private AdminService.BlockingInterface getRsAdmin(final ServerName sn)
859   throws IOException {
860     AdminService.BlockingInterface admin = this.rsAdmins.get(sn);
861     if (admin == null) {
862       LOG.debug("New admin connection to " + sn.toString());
863       if (sn.equals(master.getServerName()) && master instanceof HRegionServer) {
864         // A master is also a region server now, see HBASE-10569 for details
865         admin = ((HRegionServer)master).getRSRpcServices();
866       } else {
867         admin = this.connection.getAdmin(sn);
868       }
869       this.rsAdmins.put(sn, admin);
870     }
871     return admin;
872   }
873 
874   /**
875    * Wait for the region servers to report in.
876    * We will wait until one of this condition is met:
877    *  - the master is stopped
878    *  - the 'hbase.master.wait.on.regionservers.maxtostart' number of
879    *    region servers is reached
880    *  - the 'hbase.master.wait.on.regionservers.mintostart' is reached AND
881    *   no new region servers have checked in for
882    *      'hbase.master.wait.on.regionservers.interval' time AND
883    *   the 'hbase.master.wait.on.regionservers.timeout' is reached
884    *
885    * @throws InterruptedException
886    */
887   public void waitForRegionServers(MonitoredTask status)
888   throws InterruptedException {
889     final long interval = this.master.getConfiguration().
890       getLong(WAIT_ON_REGIONSERVERS_INTERVAL, 1500);
891     final long timeout = this.master.getConfiguration().
892       getLong(WAIT_ON_REGIONSERVERS_TIMEOUT, 4500);
893     int defaultMinToStart = 1;
894     if (BaseLoadBalancer.tablesOnMaster(master.getConfiguration())) {
895       // If we assign regions to master, we'd like to start
896       // at least another region server so that we don't
897       // assign all regions to master if other region servers
898       // don't come up in time.
899       defaultMinToStart = 2;
900     }
901     int minToStart = this.master.getConfiguration().
902       getInt(WAIT_ON_REGIONSERVERS_MINTOSTART, defaultMinToStart);
903     if (minToStart < 1) {
904       LOG.warn(String.format(
905         "The value of '%s' (%d) can not be less than 1, ignoring.",
906         WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart));
907       minToStart = 1;
908     }
909     int maxToStart = this.master.getConfiguration().
910       getInt(WAIT_ON_REGIONSERVERS_MAXTOSTART, Integer.MAX_VALUE);
911     if (maxToStart < minToStart) {
912         LOG.warn(String.format(
913             "The value of '%s' (%d) is set less than '%s' (%d), ignoring.",
914             WAIT_ON_REGIONSERVERS_MAXTOSTART, maxToStart,
915             WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart));
916         maxToStart = Integer.MAX_VALUE;
917     }
918 
919     long now =  System.currentTimeMillis();
920     final long startTime = now;
921     long slept = 0;
922     long lastLogTime = 0;
923     long lastCountChange = startTime;
924     int count = countOfRegionServers();
925     int oldCount = 0;
926     while (!this.master.isStopped() && count < maxToStart
927         && (lastCountChange+interval > now || timeout > slept || count < minToStart)) {
928       // Log some info at every interval time or if there is a change
929       if (oldCount != count || lastLogTime+interval < now){
930         lastLogTime = now;
931         String msg =
932           "Waiting for region servers count to settle; currently"+
933             " checked in " + count + ", slept for " + slept + " ms," +
934             " expecting minimum of " + minToStart + ", maximum of "+ maxToStart+
935             ", timeout of "+timeout+" ms, interval of "+interval+" ms.";
936         LOG.info(msg);
937         status.setStatus(msg);
938       }
939 
940       // We sleep for some time
941       final long sleepTime = 50;
942       Thread.sleep(sleepTime);
943       now =  System.currentTimeMillis();
944       slept = now - startTime;
945 
946       oldCount = count;
947       count = countOfRegionServers();
948       if (count != oldCount) {
949         lastCountChange = now;
950       }
951     }
952 
953     LOG.info("Finished waiting for region servers count to settle;" +
954       " checked in " + count + ", slept for " + slept + " ms," +
955       " expecting minimum of " + minToStart + ", maximum of "+ maxToStart+","+
956       " master is "+ (this.master.isStopped() ? "stopped.": "running")
957     );
958   }
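
  // Illustrative scenario (not part of the original code) for the wait loop above,
  // using the defaults read at the start of waitForRegionServers (interval = 1500 ms,
  // timeout = 4500 ms) and a hypothetical minToStart = 2: if two regionservers check
  // in after about 1000 ms and no more arrive, the loop exits at roughly the 4500 ms
  // mark, since by then the count has been stable for longer than the interval, the
  // timeout has elapsed, and the minimum server count has been met.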
959 
960   /**
961    * @return A copy of the internal list of online servers.
962    */
963   public List<ServerName> getOnlineServersList() {
964     // TODO: optimize the load balancer call so we don't need to make a new list
965     // TODO: FIX. THIS IS POPULAR CALL.
966     return new ArrayList<ServerName>(this.onlineServers.keySet());
967   }
968 
969   /**
970    * @return A copy of the internal list of draining servers.
971    */
972   public List<ServerName> getDrainingServersList() {
973     return new ArrayList<ServerName>(this.drainingServers);
974   }
975 
976   /**
977    * @return A copy of the internal set of deadNotExpired servers.
978    */
979   Set<ServerName> getDeadNotExpiredServers() {
980     return new HashSet<ServerName>(this.queuedDeadServers);
981   }
982 
983   /**
984    * During startup, if we figure it is not a failover, i.e. there are
985    * no more WAL files to split, we won't try to recover these dead servers.
986    * So we just remove them from the queue. Use caution in calling this.
987    */
988   void removeRequeuedDeadServers() {
989     requeuedDeadServers.clear();
990   }
991 
992   /**
993    * @return A read-only view of the internal requeuedDeadServers map of servers and their
994    *         corresponding split-log-needed flags.
995    */
996   Map<ServerName, Boolean> getRequeuedDeadServers() {
997     return Collections.unmodifiableMap(this.requeuedDeadServers);
998   }
999 
1000   public boolean isServerOnline(ServerName serverName) {
1001     return serverName != null && onlineServers.containsKey(serverName);
1002   }
1003 
1004   /**
1005    * Check if a server is known to be dead.  A server can be online,
1006    * or known to be dead, or unknown to this manager (i.e., not online,
1007    * and not known to be dead either; it is simply not tracked by the
1008    * master any more, for example, a very old previous instance).
1009    */
1010   public synchronized boolean isServerDead(ServerName serverName) {
1011     return serverName == null || deadservers.isDeadServer(serverName)
1012       || queuedDeadServers.contains(serverName)
1013       || requeuedDeadServers.containsKey(serverName);
1014   }
1015 
1016   public void shutdownCluster() {
1017     this.clusterShutdown = true;
1018     this.master.stop("Cluster shutdown requested");
1019   }
1020 
1021   public boolean isClusterShutdown() {
1022     return this.clusterShutdown;
1023   }
1024 
1025   /**
1026    * Stop the ServerManager.  Currently closes the cluster connection used to talk to regionservers.
1027    */
1028   public void stop() {
1029     if (connection != null) {
1030       try {
1031         connection.close();
1032       } catch (IOException e) {
1033         LOG.error("Attempt to close connection to master failed", e);
1034       }
1035     }
1036   }
1037 
1038   /**
1039    * Creates a list of possible destinations for a region. It contains the online servers, but not
1040    *  the draining or dying servers.
1041    *  @param serverToExclude can be null if there is no server to exclude
1042    */
1043   public List<ServerName> createDestinationServersList(final ServerName serverToExclude){
1044     final List<ServerName> destServers = getOnlineServersList();
1045 
1046     if (serverToExclude != null){
1047       destServers.remove(serverToExclude);
1048     }
1049 
1050     // Loop through the draining server list and remove them from the server list
1051     final List<ServerName> drainingServersCopy = getDrainingServersList();
1052     if (!drainingServersCopy.isEmpty()) {
1053       for (final ServerName server: drainingServersCopy) {
1054         destServers.remove(server);
1055       }
1056     }
1057 
1058     // Remove the deadNotExpired servers from the server list.
1059     removeDeadNotExpiredServers(destServers);
1060     return destServers;
1061   }
1062 
1063   /**
1064    * Calls {@link #createDestinationServersList} without server to exclude.
1065    */
1066   public List<ServerName> createDestinationServersList(){
1067     return createDestinationServersList(null);
1068   }
1069 
1070   /**
1071    * Loop through the deadNotExpired server list and remove those servers from the
1072    * given list of servers.
1073    * This function should be used carefully outside of this class. You should use a high-level
1074    * method such as {@link #createDestinationServersList()} instead of managing your own list.
1075    */
1076   void removeDeadNotExpiredServers(List<ServerName> servers) {
1077     Set<ServerName> deadNotExpiredServersCopy = this.getDeadNotExpiredServers();
1078     if (!deadNotExpiredServersCopy.isEmpty()) {
1079       for (ServerName server : deadNotExpiredServersCopy) {
1080         LOG.debug("Removing dead but not expired server: " + server
1081           + " from eligible server pool.");
1082         servers.remove(server);
1083       }
1084     }
1085   }
1086 
1087   /**
1088    * Clears any dead server that has the same host name and port as any online server.
1089    */
1090   void clearDeadServersWithSameHostNameAndPortOfOnlineServer() {
1091     for (ServerName serverName : getOnlineServersList()) {
1092       deadservers.cleanAllPreviousInstances(serverName);
1093     }
1094   }
1095 }