1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.master;
20  
21  import java.io.IOException;
22  import java.util.ArrayList;
23  import java.util.Arrays;
24  import java.util.Collections;
25  import java.util.HashMap;
26  import java.util.HashSet;
27  import java.util.Iterator;
28  import java.util.List;
29  import java.util.Map;
30  import java.util.NavigableMap;
31  import java.util.Set;
32  import java.util.TreeMap;
33  import java.util.concurrent.ConcurrentHashMap;
34  import java.util.concurrent.ConcurrentSkipListSet;
35  import java.util.concurrent.CopyOnWriteArrayList;
36  import java.util.concurrent.ThreadFactory;
37  import java.util.concurrent.TimeUnit;
38  import java.util.concurrent.atomic.AtomicBoolean;
39  import java.util.concurrent.atomic.AtomicInteger;
40  import java.util.concurrent.locks.Lock;
41  import java.util.concurrent.locks.ReentrantLock;
42  
43  import org.apache.commons.logging.Log;
44  import org.apache.commons.logging.LogFactory;
45  import org.apache.hadoop.hbase.classification.InterfaceAudience;
46  import org.apache.hadoop.conf.Configuration;
47  import org.apache.hadoop.fs.FileSystem;
48  import org.apache.hadoop.fs.Path;
49  import org.apache.hadoop.hbase.Chore;
50  import org.apache.hadoop.hbase.HBaseIOException;
51  import org.apache.hadoop.hbase.HConstants;
52  import org.apache.hadoop.hbase.HRegionInfo;
53  import org.apache.hadoop.hbase.NotServingRegionException;
54  import org.apache.hadoop.hbase.RegionTransition;
55  import org.apache.hadoop.hbase.Server;
56  import org.apache.hadoop.hbase.ServerName;
57  import org.apache.hadoop.hbase.Stoppable;
58  import org.apache.hadoop.hbase.TableName;
59  import org.apache.hadoop.hbase.TableNotFoundException;
60  import org.apache.hadoop.hbase.catalog.CatalogTracker;
61  import org.apache.hadoop.hbase.catalog.MetaReader;
62  import org.apache.hadoop.hbase.client.Result;
63  import org.apache.hadoop.hbase.exceptions.DeserializationException;
64  import org.apache.hadoop.hbase.executor.EventHandler;
65  import org.apache.hadoop.hbase.executor.EventType;
66  import org.apache.hadoop.hbase.executor.ExecutorService;
67  import org.apache.hadoop.hbase.ipc.RpcClient;
68  import org.apache.hadoop.hbase.ipc.RpcClient.FailedServerException;
70  import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
71  import org.apache.hadoop.hbase.master.RegionState.State;
72  import org.apache.hadoop.hbase.master.balancer.FavoredNodeAssignmentHelper;
73  import org.apache.hadoop.hbase.master.balancer.FavoredNodeLoadBalancer;
74  import org.apache.hadoop.hbase.master.handler.ClosedRegionHandler;
75  import org.apache.hadoop.hbase.master.handler.DisableTableHandler;
76  import org.apache.hadoop.hbase.master.handler.EnableTableHandler;
77  import org.apache.hadoop.hbase.master.handler.OpenedRegionHandler;
78  import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos;
79  import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionTransition.TransitionCode;
80  import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
81  import org.apache.hadoop.hbase.regionserver.RegionAlreadyInTransitionException;
82  import org.apache.hadoop.hbase.regionserver.RegionMergeTransaction;
83  import org.apache.hadoop.hbase.regionserver.RegionOpeningState;
84  import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
85  import org.apache.hadoop.hbase.regionserver.SplitTransaction;
86  import org.apache.hadoop.hbase.regionserver.wal.HLog;
87  import org.apache.hadoop.hbase.regionserver.wal.HLogUtil;
88  import org.apache.hadoop.hbase.util.ConfigUtil;
89  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
90  import org.apache.hadoop.hbase.util.FSUtils;
91  import org.apache.hadoop.hbase.util.KeyLocker;
92  import org.apache.hadoop.hbase.util.Pair;
93  import org.apache.hadoop.hbase.util.PairOfSameType;
94  import org.apache.hadoop.hbase.util.Threads;
95  import org.apache.hadoop.hbase.util.Triple;
96  import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
97  import org.apache.hadoop.hbase.zookeeper.ZKAssign;
98  import org.apache.hadoop.hbase.zookeeper.ZKTable;
99  import org.apache.hadoop.hbase.zookeeper.ZKUtil;
100 import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener;
101 import org.apache.hadoop.ipc.RemoteException;
102 import org.apache.zookeeper.AsyncCallback;
103 import org.apache.zookeeper.KeeperException;
104 import org.apache.zookeeper.KeeperException.NoNodeException;
105 import org.apache.zookeeper.KeeperException.NodeExistsException;
106 import org.apache.zookeeper.data.Stat;
107 
108 import com.google.common.annotations.VisibleForTesting;
109 import com.google.common.base.Preconditions;
110 import com.google.common.collect.LinkedHashMultimap;
111 
112 /**
113  * Manages and performs region assignment.
114  * <p>
115  * Monitors ZooKeeper for events related to regions in transition.
116  * <p>
117  * Handles existing regions in transition during master failover.
118  */
119 @InterfaceAudience.Private
120 public class AssignmentManager extends ZooKeeperListener {
121   private static final Log LOG = LogFactory.getLog(AssignmentManager.class);
122 
123   public static final ServerName HBCK_CODE_SERVERNAME = ServerName.valueOf(HConstants.HBCK_CODE_NAME,
124       -1, -1L);
125 
126   public static final String ASSIGNMENT_TIMEOUT = "hbase.master.assignment.timeoutmonitor.timeout";
127   public static final int DEFAULT_ASSIGNMENT_TIMEOUT_DEFAULT = 600000;
128   public static final String ASSIGNMENT_TIMEOUT_MANAGEMENT = "hbase.assignment.timeout.management";
129   public static final boolean DEFAULT_ASSIGNMENT_TIMEOUT_MANAGEMENT = false;
130 
131   public static final String ALREADY_IN_TRANSITION_WAITTIME
132     = "hbase.assignment.already.intransition.waittime";
133   public static final int DEFAULT_ALREADY_IN_TRANSITION_WAITTIME = 60000; // 1 minute
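      // Illustrative sketch (hypothetical usage, not part of the original file; values are
      // arbitrary examples): these keys are read from the cluster Configuration, so they can
      // be tuned there, e.g.
      //   Configuration conf = HBaseConfiguration.create();
      //   conf.setBoolean(ASSIGNMENT_TIMEOUT_MANAGEMENT, true);  // enable the timeout monitor
      //   conf.setInt(ASSIGNMENT_TIMEOUT, 300000);               // 5 minutes instead of the 10 minute default
      //   conf.setInt(ALREADY_IN_TRANSITION_WAITTIME, 30000);    // wait 30s on "already in transition"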
134 
135   protected final Server server;
136 
137   private ServerManager serverManager;
138 
139   private boolean shouldAssignRegionsWithFavoredNodes;
140 
141   private CatalogTracker catalogTracker;
142 
143   protected final TimeoutMonitor timeoutMonitor;
144 
145   private final TimerUpdater timerUpdater;
146 
147   private LoadBalancer balancer;
148 
149   private final MetricsAssignmentManager metricsAssignmentManager;
150 
151   private final TableLockManager tableLockManager;
152 
153   private AtomicInteger numRegionsOpened = new AtomicInteger(0);
154 
155   final private KeyLocker<String> locker = new KeyLocker<String>();
156 
157   /**
158    * Map of regions to reopen after the schema of a table is changed. Key -
159    * encoded region name, value - HRegionInfo
160    */
161   private final Map <String, HRegionInfo> regionsToReopen;
162 
163   /*
164    * Maximum times we recurse an assignment/unassignment.
165    * See below in {@link #assign()} and {@link #unassign()}.
166    */
167   private final int maximumAttempts;
168 
169   /**
170    * Map from the encoded name of the region to be created to the two regions being merged.
171    */
172   private final Map<String, PairOfSameType<HRegionInfo>> mergingRegions
173     = new HashMap<String, PairOfSameType<HRegionInfo>>();
174 
175   /**
176    * The sleep time before retrying an hbase:meta assignment that failed because no
177    * region plan was available.
178    */
179   private final long sleepTimeBeforeRetryingMetaAssignment;
180 
181   /** Plans for region movement. Key is the encoded version of a region name*/
182   // TODO: When do plans get cleaned out?  Ever? In server open and in server
183   // shutdown processing -- St.Ack
184   // All access to this Map must be synchronized.
185   final NavigableMap<String, RegionPlan> regionPlans =
186     new TreeMap<String, RegionPlan>();
187 
188   private final ZKTable zkTable;
189 
190   /**
191    * Contains the servers which need their timer updated; these servers will be
192    * handled by {@link TimerUpdater}.
193    */
194   private final ConcurrentSkipListSet<ServerName> serversInUpdatingTimer;
195 
196   private final ExecutorService executorService;
197 
198   // For unit tests, keep track of calls to ClosedRegionHandler
199   private Map<HRegionInfo, AtomicBoolean> closedRegionHandlerCalled = null;
200 
201   // For unit tests, keep track of calls to OpenedRegionHandler
202   private Map<HRegionInfo, AtomicBoolean> openedRegionHandlerCalled = null;
203 
204   //Thread pool executor service for timeout monitor
205   private java.util.concurrent.ExecutorService threadPoolExecutorService;
206 
207   // A bunch of ZK events workers. Each is a single thread executor service
208   private final java.util.concurrent.ExecutorService zkEventWorkers;
209 
210   private List<EventType> ignoreStatesRSOffline = Arrays.asList(
211       EventType.RS_ZK_REGION_FAILED_OPEN, EventType.RS_ZK_REGION_CLOSED);
212 
213   private final RegionStates regionStates;
214 
215   // The threshold to use bulk assigning. Using bulk assignment
216   // only if assigning at least this many regions to at least this
217   // many servers. If assigning fewer regions to fewer servers,
218   // bulk assigning may not be as efficient.
219   private final int bulkAssignThresholdRegions;
220   private final int bulkAssignThresholdServers;
221 
222   // Should bulk assignment wait till all regions are assigned,
223   // or until it times out?  This is useful to measure bulk assignment
224   // performance, but not needed in most use cases.
225   private final boolean bulkAssignWaitTillAllAssigned;
226 
227   /**
228    * Indicator that AssignmentManager has recovered the region states so
229    * that ServerShutdownHandler can be fully enabled and re-assign regions
230    * of dead servers. So that when re-assignment happens, AssignmentManager
231    * has proper region states.
232    *
233    * Protected to ease testing.
234    */
235   protected final AtomicBoolean failoverCleanupDone = new AtomicBoolean(false);
236 
237   /** Is the TimeOutManagement activated **/
238   private final boolean tomActivated;
239 
240   /**
241    * A map to track the number of times a region fails to open in a row,
242    * so that we don't try to open a region forever if the failure is
243    * unrecoverable.  We don't put this information in region states
244    * because we don't expect this to happen frequently; we don't
245    * want to copy this information over during each state transition either.
246    */
247   private final ConcurrentHashMap<String, AtomicInteger>
248     failedOpenTracker = new ConcurrentHashMap<String, AtomicInteger>();
249 
250   // A flag to indicate if we are using ZK for region assignment
251   private final boolean useZKForAssignment;
252 
253   // In case not using ZK for region assignment, region states
254   // are persisted in meta with a state store
255   private final RegionStateStore regionStateStore;
256 
257   /**
258    * For testing only!  Set to true to skip handling of split.
259    */
260   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="MS_SHOULD_BE_FINAL")
261   public static boolean TEST_SKIP_SPLIT_HANDLING = false;
262 
263   /** Listeners that are called on assignment events. */
264   private List<AssignmentListener> listeners = new CopyOnWriteArrayList<AssignmentListener>();
265 
266   /**
267    * Constructs a new assignment manager.
268    *
269    * @param server
270    * @param serverManager
271    * @param catalogTracker
272    * @param service
273    * @throws KeeperException
274    * @throws IOException
275    */
276   public AssignmentManager(Server server, ServerManager serverManager,
277       CatalogTracker catalogTracker, final LoadBalancer balancer,
278       final ExecutorService service, MetricsMaster metricsMaster,
279       final TableLockManager tableLockManager) throws KeeperException, IOException {
280     super(server.getZooKeeper());
281     this.server = server;
282     this.serverManager = serverManager;
283     this.catalogTracker = catalogTracker;
284     this.executorService = service;
285     this.regionStateStore = new RegionStateStore(server);
286     this.regionsToReopen = Collections.synchronizedMap
287                            (new HashMap<String, HRegionInfo> ());
288     Configuration conf = server.getConfiguration();
289     // Only read favored nodes if using the favored nodes load balancer.
290     this.shouldAssignRegionsWithFavoredNodes = conf.getClass(
291            HConstants.HBASE_MASTER_LOADBALANCER_CLASS, Object.class).equals(
292            FavoredNodeLoadBalancer.class);
293     this.tomActivated = conf.getBoolean(
294       ASSIGNMENT_TIMEOUT_MANAGEMENT, DEFAULT_ASSIGNMENT_TIMEOUT_MANAGEMENT);
295     if (tomActivated){
296       this.serversInUpdatingTimer =  new ConcurrentSkipListSet<ServerName>();
297       this.timeoutMonitor = new TimeoutMonitor(
298         conf.getInt("hbase.master.assignment.timeoutmonitor.period", 30000),
299         server, serverManager,
300         conf.getInt(ASSIGNMENT_TIMEOUT, DEFAULT_ASSIGNMENT_TIMEOUT_DEFAULT));
301       this.timerUpdater = new TimerUpdater(conf.getInt(
302         "hbase.master.assignment.timerupdater.period", 10000), server);
303       Threads.setDaemonThreadRunning(timerUpdater.getThread(),
304         server.getServerName() + ".timerUpdater");
305     } else {
306       this.serversInUpdatingTimer =  null;
307       this.timeoutMonitor = null;
308       this.timerUpdater = null;
309     }
310     this.zkTable = new ZKTable(this.watcher);
311     // This is the max attempts, not retries, so it should be at least 1.
312     this.maximumAttempts = Math.max(1,
313       this.server.getConfiguration().getInt("hbase.assignment.maximum.attempts", 10));
314     this.sleepTimeBeforeRetryingMetaAssignment = this.server.getConfiguration().getLong(
315         "hbase.meta.assignment.retry.sleeptime", 1000l);
316     this.balancer = balancer;
317     int maxThreads = conf.getInt("hbase.assignment.threads.max", 30);
318     this.threadPoolExecutorService = Threads.getBoundedCachedThreadPool(
319       maxThreads, 60L, TimeUnit.SECONDS, Threads.newDaemonThreadFactory("AM."));
320     this.regionStates = new RegionStates(server, serverManager, regionStateStore);
321 
322     this.bulkAssignWaitTillAllAssigned =
323       conf.getBoolean("hbase.bulk.assignment.waittillallassigned", false);
324     this.bulkAssignThresholdRegions = conf.getInt("hbase.bulk.assignment.threshold.regions", 7);
325     this.bulkAssignThresholdServers = conf.getInt("hbase.bulk.assignment.threshold.servers", 3);
326 
327     int workers = conf.getInt("hbase.assignment.zkevent.workers", 20);
328     ThreadFactory threadFactory = Threads.newDaemonThreadFactory("AM.ZK.Worker");
329     zkEventWorkers = Threads.getBoundedCachedThreadPool(workers, 60L,
330             TimeUnit.SECONDS, threadFactory);
331     this.tableLockManager = tableLockManager;
332 
333     this.metricsAssignmentManager = new MetricsAssignmentManager();
334     useZKForAssignment = ConfigUtil.useZKForAssignment(conf);
335   }
336 
337   void startTimeOutMonitor() {
338     if (tomActivated) {
339       Threads.setDaemonThreadRunning(timeoutMonitor.getThread(), server.getServerName()
340           + ".timeoutMonitor");
341     }
342   }
343 
344   /**
345    * Add the listener to the notification list.
346    * @param listener The AssignmentListener to register
347    */
348   public void registerListener(final AssignmentListener listener) {
349     this.listeners.add(listener);
350   }
351 
352   /**
353    * Remove the listener from the notification list.
354    * @param listener The AssignmentListener to unregister
355    */
356   public boolean unregisterListener(final AssignmentListener listener) {
357     return this.listeners.remove(listener);
358   }
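      // Usage sketch (hypothetical caller, not part of this file): a listener is notified of
      // assignment events and should be unregistered once it is no longer needed.
      //   AssignmentListener myListener = ...; // some AssignmentListener implementation
      //   assignmentManager.registerListener(myListener);
      //   ...
      //   assignmentManager.unregisterListener(myListener);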
359 
360   /**
361    * @return Instance of ZKTable.
362    */
363   public ZKTable getZKTable() {
364     // These are 'expensive' to make, involving a trip to the zk ensemble, so allow
365     // sharing.
366     return this.zkTable;
367   }
368 
369   /**
370    * This SHOULD not be public. It is public now
371    * because of some unit tests.
372    *
373    * TODO: make it package private and keep RegionStates in the master package
374    */
375   public RegionStates getRegionStates() {
376     return regionStates;
377   }
378 
379   /**
380    * Used in some tests to mock up region state in meta
381    */
382   @VisibleForTesting
383   RegionStateStore getRegionStateStore() {
384     return regionStateStore;
385   }
386 
387   public RegionPlan getRegionReopenPlan(HRegionInfo hri) {
388     return new RegionPlan(hri, null, regionStates.getRegionServerOfRegion(hri));
389   }
390 
391   /**
392    * Add a regionPlan for the specified region.
393    * @param encodedName
394    * @param plan
395    */
396   public void addPlan(String encodedName, RegionPlan plan) {
397     synchronized (regionPlans) {
398       regionPlans.put(encodedName, plan);
399     }
400   }
401 
402   /**
403    * Add a map of region plans.
404    */
405   public void addPlans(Map<String, RegionPlan> plans) {
406     synchronized (regionPlans) {
407       regionPlans.putAll(plans);
408     }
409   }
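      // Sketch (hypothetical caller, not part of this file): a RegionPlan names the region plus
      // its source and destination servers, as in getRegionReopenPlan() above.
      //   RegionPlan plan = new RegionPlan(hri, sourceServer, destinationServer);
      //   assignmentManager.addPlan(hri.getEncodedName(), plan);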
410 
411   /**
412    * Set the list of regions that will be reopened
413    * because of an update in table schema
414    *
415    * @param regions
416    *          list of regions that should be tracked for reopen
417    */
418   public void setRegionsToReopen(List <HRegionInfo> regions) {
419     for(HRegionInfo hri : regions) {
420       regionsToReopen.put(hri.getEncodedName(), hri);
421     }
422   }
423 
424   /**
425    * Used by the client to identify if all regions have had the schema updates applied
426    *
427    * @param tableName
428    * @return Pair indicating the status of the alter command
429    * @throws IOException
430    */
431   public Pair<Integer, Integer> getReopenStatus(TableName tableName)
432       throws IOException {
433     List <HRegionInfo> hris =
434       MetaReader.getTableRegions(this.server.getCatalogTracker(), tableName, true);
435     Integer pending = 0;
436     for (HRegionInfo hri : hris) {
437       String name = hri.getEncodedName();
438       // no lock concurrent access ok: sequential consistency respected.
439       if (regionsToReopen.containsKey(name)
440           || regionStates.isRegionInTransition(name)) {
441         pending++;
442       }
443     }
444     return new Pair<Integer, Integer>(pending, hris.size());
445   }
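      // Interpretation sketch (hypothetical caller, not part of this file): the first element is
      // the number of regions still pending reopen or in transition, the second is the total
      // number of regions of the table, so the alter has finished once nothing is pending.
      //   Pair<Integer, Integer> status = assignmentManager.getReopenStatus(tableName);
      //   boolean done = status.getFirst() == 0;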
446 
447   /**
448    * Used by ServerShutdownHandler to make sure AssignmentManager has completed
449    * the failover cleanup before re-assigning regions of dead servers. So that
450    * when re-assignment happens, AssignmentManager has proper region states.
451    */
452   public boolean isFailoverCleanupDone() {
453     return failoverCleanupDone.get();
454   }
455 
456   /**
457    * To avoid racing with AM, external entities may need to lock a region,
458    * for example, when SSH checks what regions to skip re-assigning.
459    */
460   public Lock acquireRegionLock(final String encodedName) {
461     return locker.acquireLock(encodedName);
462   }
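      // Usage sketch (hypothetical caller such as ServerShutdownHandler, not part of this file):
      // the returned lock must be released in a finally block.
      //   Lock lock = assignmentManager.acquireRegionLock(hri.getEncodedName());
      //   try {
      //     // inspect or update state for this region without racing the AssignmentManager
      //   } finally {
      //     lock.unlock();
      //   }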
463 
464   /**
465    * Now, failover cleanup is completed. Notify server manager to
466    * process queued up dead servers processing, if any.
467    */
468   void failoverCleanupDone() {
469     failoverCleanupDone.set(true);
470     serverManager.processQueuedDeadServers();
471   }
472 
473   /**
474    * Called on startup.
475    * Figures whether this is a fresh cluster start or we are joining an extant running cluster.
476    * @throws IOException
477    * @throws KeeperException
478    * @throws InterruptedException
479    */
480   void joinCluster() throws IOException,
481       KeeperException, InterruptedException {
482     long startTime = System.currentTimeMillis();
483     // Concurrency note: In the below the accesses on regionsInTransition are
484     // outside of a synchronization block where usually all accesses to RIT are
485     // synchronized.  The presumption is that in this case it is safe since this
486     // method is being played by a single thread on startup.
487 
488     // TODO: Regions that have a null location and are not in regionsInTransitions
489     // need to be handled.
490 
491     // Scan hbase:meta to build list of existing regions, servers, and assignment
492     // Returns servers who have not checked in (assumed dead) and their regions
493     Map<ServerName, List<HRegionInfo>> deadServers = rebuildUserRegions();
494 
495     // This method will assign all user regions if a clean server startup or
496     // it will reconstruct master state and cleanup any leftovers from
497     // previous master process.
498     boolean failover = processDeadServersAndRegionsInTransition(deadServers);
499 
500     if (!useZKForAssignment) {
501       // Not use ZK for assignment any more, remove the ZNode
502       ZKUtil.deleteNodeRecursively(watcher, watcher.assignmentZNode);
503     }
504     recoverTableInDisablingState();
505     recoverTableInEnablingState();
506     LOG.info("Joined the cluster in " + (System.currentTimeMillis()
507       - startTime) + "ms, failover=" + failover);
508   }
509 
510   /**
511    * Processes all regions that are in transition in zookeeper and also
512    * processes the list of dead servers by scanning the META.
513    * Used by a master joining a cluster.  If we figure this is a clean cluster
514    * startup, will assign all user regions.
515    * @param deadServers
516    *          Map of dead servers and their regions. Can be null.
517    * @throws KeeperException
518    * @throws IOException
519    * @throws InterruptedException
520    */
521   boolean processDeadServersAndRegionsInTransition(
522       final Map<ServerName, List<HRegionInfo>> deadServers)
523           throws KeeperException, IOException, InterruptedException {
524     List<String> nodes = ZKUtil.listChildrenNoWatch(watcher,
525       watcher.assignmentZNode);
526 
527     if (nodes == null && useZKForAssignment) {
528       String errorMessage = "Failed to get the children from ZK";
529       server.abort(errorMessage, new IOException(errorMessage));
530       return true; // Doesn't matter in this case
531     }
532 
533     boolean failover = !serverManager.getDeadServers().isEmpty();
534     if (failover) {
535       // This may not be a failover actually, especially if meta is on this master.
536       if (LOG.isDebugEnabled()) {
537         LOG.debug("Found dead servers out on cluster " + serverManager.getDeadServers());
538       }
539     } else {
540       // If any one region except meta is assigned, it's a failover.
541       Set<ServerName> onlineServers = serverManager.getOnlineServers().keySet();
542       for (Map.Entry<HRegionInfo, ServerName> en : regionStates.getRegionAssignments().entrySet()) {
543         HRegionInfo hri = en.getKey();
544         if (!hri.isMetaTable() && onlineServers.contains(en.getValue())) {
545           LOG.debug("Found " + hri + " out on cluster");
546           failover = true;
547           break;
548         }
549       }
550     }
551 
552     if (!failover && nodes != null) {
553       // If any one region except meta is in transition, it's a failover.
554       for (String encodedName : nodes) {
555         RegionState regionState = regionStates.getRegionState(encodedName);
556         if (regionState != null && !regionState.getRegion().isMetaRegion()) {
557           LOG.debug("Found " + regionState + " in RITs");
558           failover = true;
559           break;
560         }
561       }
562     }
563 
564     if (!failover && !useZKForAssignment) {
565       // If any region except meta is in transition on a live server, it's a failover.
566       Map<String, RegionState> regionsInTransition = regionStates.getRegionsInTransition();
567       if (!regionsInTransition.isEmpty()) {
568         Set<ServerName> onlineServers = serverManager.getOnlineServers().keySet();
569         for (RegionState regionState : regionsInTransition.values()) {
570           if (!regionState.getRegion().isMetaRegion()
571               && onlineServers.contains(regionState.getServerName())) {
572             LOG.debug("Found " + regionState + " in RITs");
573             failover = true;
574             break;
575           }
576         }
577       }
578     }
579 
580     if (!failover) {
581       // If we get here, we have a full cluster restart. It is a failover only
582       // if there are some HLogs that are not split yet. For meta HLogs, they should have
583       // been split already, if any. We can walk through those queued dead servers:
584       // if none of them have any HLogs, this restart should be considered a clean one
585       Set<ServerName> queuedDeadServers = serverManager.getRequeuedDeadServers().keySet();
586       if (!queuedDeadServers.isEmpty()) {
587         Configuration conf = server.getConfiguration();
588         Path rootdir = FSUtils.getRootDir(conf);
589         FileSystem fs = rootdir.getFileSystem(conf);
590         for (ServerName serverName : queuedDeadServers) {
591           Path logDir = new Path(rootdir, HLogUtil.getHLogDirectoryName(serverName.toString()));
592           Path splitDir = logDir.suffix(HLog.SPLITTING_EXT);
593           if (fs.exists(logDir) || fs.exists(splitDir)) {
594             LOG.debug("Found queued dead server " + serverName);
595             failover = true;
596             break;
597           }
598         }
599         if (!failover) {
600           // We figured that it's not a failover, so no need to
601           // work on these re-queued dead servers any more.
602           LOG.info("AM figured that it's not a failover and cleaned up " + queuedDeadServers.size()
603               + " queued dead servers");
604           serverManager.removeRequeuedDeadServers();
605         }
606       }
607     }
608 
609     Set<TableName> disabledOrDisablingOrEnabling = null;
610     if (!failover) {
611       disabledOrDisablingOrEnabling = ZKTable.getDisabledOrDisablingTables(watcher);
612       disabledOrDisablingOrEnabling.addAll(ZKTable.getEnablingTables(watcher));
613       // Clean re/start, mark all user regions closed before reassignment
614       // TODO -Hbase-11319
615       regionStates.closeAllUserRegions(disabledOrDisablingOrEnabling);
616     }
617 
618     // Now region states are restored
619     regionStateStore.start();
620 
621     // If we found user regions out on the cluster, it's a failover.
622     if (failover) {
623       LOG.info("Found regions out on cluster or in RIT; presuming failover");
624       // Process list of dead servers and regions in RIT.
625       // See HBASE-4580 for more information.
626       processDeadServersAndRecoverLostRegions(deadServers);
627     } 
628     if (!failover && useZKForAssignment) {
629       // Cleanup any existing ZK nodes and start watching
630       ZKAssign.deleteAllNodes(watcher);
631       ZKUtil.listChildrenAndWatchForNewChildren(this.watcher, this.watcher.assignmentZNode);
632     }
633     // Now we can safely claim failover cleanup completed and enable
634     // ServerShutdownHandler for further processing. The nodes (below)
635     // in transition, if any, are for regions not related to those
636     // dead servers at all, and can be done in parallel to SSH.
637     failoverCleanupDone();
638     if (!failover) {
639       // Fresh cluster startup.
640       LOG.info("Clean cluster startup. Assigning user regions");
641       assignAllUserRegions(disabledOrDisablingOrEnabling);
642     }
643     return failover;
644   }
645 
646   /**
647    * If region is up in zk in transition, then do fixup and block and wait until
648    * the region is assigned and out of transition.  Used on startup for
649    * catalog regions.
650    * @param hri Region to look for.
651    * @return True if we processed a region in transition else false if region
652    * was not up in zk in transition.
653    * @throws InterruptedException
654    * @throws KeeperException
655    * @throws IOException
656    */
657   boolean processRegionInTransitionAndBlockUntilAssigned(final HRegionInfo hri)
658       throws InterruptedException, KeeperException, IOException {
659     String encodedRegionName = hri.getEncodedName();
660     if (!processRegionInTransition(encodedRegionName, hri)) {
661       return false; // The region is not in transition
662     }
663     LOG.debug("Waiting on " + HRegionInfo.prettyPrint(encodedRegionName));
664     while (!this.server.isStopped() &&
665         this.regionStates.isRegionInTransition(encodedRegionName)) {
666       RegionState state = this.regionStates.getRegionTransitionState(encodedRegionName);
667       if (state == null || !serverManager.isServerOnline(state.getServerName())) {
668         // The region is not in transition, or not in transition on an online
669         // server. Doesn't help to block here any more. Caller needs to
670         // verify the region is actually assigned.
671         break;
672       }
673       this.regionStates.waitForUpdate(100);
674     }
675     return true;
676   }
677 
678   /**
679    * Process failover of new master for region <code>encodedRegionName</code>
680    * up in zookeeper.
681    * @param encodedRegionName Region to process failover for.
682    * @param regionInfo If null we'll go get it from meta table.
683    * @return True if we processed <code>regionInfo</code> as a RIT.
684    * @throws KeeperException
685    * @throws IOException
686    */
687   boolean processRegionInTransition(final String encodedRegionName,
688       final HRegionInfo regionInfo) throws KeeperException, IOException {
689     // We need a lock here to ensure that we will not put the same region twice
690     // It has no reason to be a lock shared with the other operations.
691     // We can do the lock on the region only, instead of a global lock: what we want to ensure
692     // is that we don't have two threads working on the same region.
693     Lock lock = locker.acquireLock(encodedRegionName);
694     try {
695       Stat stat = new Stat();
696       byte [] data = ZKAssign.getDataAndWatch(watcher, encodedRegionName, stat);
697       if (data == null) return false;
698       RegionTransition rt;
699       try {
700         rt = RegionTransition.parseFrom(data);
701       } catch (DeserializationException e) {
702         LOG.warn("Failed parse znode data", e);
703         return false;
704       }
705       HRegionInfo hri = regionInfo;
706       if (hri == null) {
707         // The region info is not passed in. We will try to find the region
708         // from region states map/meta based on the encoded region name. But we
709         // may not be able to find it. This is valid for online merge, since
710         // the region may not have been created yet if the merge is not completed.
711         // Therefore, it is not in meta at master recovery time.
712         hri = regionStates.getRegionInfo(rt.getRegionName());
713         EventType et = rt.getEventType();
714         if (hri == null && et != EventType.RS_ZK_REGION_MERGING
715             && et != EventType.RS_ZK_REQUEST_REGION_MERGE) {
716           LOG.warn("Couldn't find the region in recovering " + rt);
717           return false;
718         }
719       }
720       return processRegionsInTransition(
721         rt, hri, stat.getVersion());
722     } finally {
723       lock.unlock();
724     }
725   }
726 
727   /**
728    * This call is invoked only when (1) the master assigns meta, or
729    * (2) during failover-mode startup, while processing zk assignment nodes.
730    * The locker is set in the caller. It returns true if the region
731    * is in transition for sure, false otherwise.
732    *
733    * It should be private but it is used by some tests too.
734    */
735   boolean processRegionsInTransition(
736       final RegionTransition rt, final HRegionInfo regionInfo,
737       final int expectedVersion) throws KeeperException {
738     EventType et = rt.getEventType();
739     // Get ServerName.  Cannot be null.
740     final ServerName sn = rt.getServerName();
741     final byte[] regionName = rt.getRegionName();
742     final String encodedName = HRegionInfo.encodeRegionName(regionName);
743     final String prettyPrintedRegionName = HRegionInfo.prettyPrint(encodedName);
744     LOG.info("Processing " + prettyPrintedRegionName + " in state: " + et);
745 
746     if (regionStates.isRegionInTransition(encodedName)
747         && (regionInfo.isMetaRegion() || !useZKForAssignment)) {
748       LOG.info("Processed region " + prettyPrintedRegionName + " in state: "
749         + et + ", does nothing since the region is already in transition "
750         + regionStates.getRegionTransitionState(encodedName));
751       // Just return
752       return true;
753     }
754     if (!serverManager.isServerOnline(sn)) {
755       // It was transitioning on a dead server, so it's closed now.
756       // Force to OFFLINE and put it in transition, but not assign it
757       // since log splitting for the dead server is not done yet.
758       LOG.debug("RIT " + encodedName + " in state=" + rt.getEventType() +
759         " was on deadserver; forcing offline");
760       if (regionStates.isRegionOnline(regionInfo)) {
761         // Meta could still show the region is assigned to the previous
762         // server. If that server is online, when we reload the meta, the
763         // region is put back to online, we need to offline it.
764         regionStates.regionOffline(regionInfo);
765         sendRegionClosedNotification(regionInfo);
766       }
767       // Put it back in transition so that SSH can re-assign it
768       regionStates.updateRegionState(regionInfo, State.OFFLINE, sn);
769 
770       if (regionInfo.isMetaRegion()) {
771         // If it's meta region, reset the meta location.
772         // So that master knows the right meta region server.
773         MetaRegionTracker.setMetaLocation(watcher, sn);
774       } else {
775         // No matter the previous server is online or offline,
776         // we need to reset the last region server of the region.
777         regionStates.setLastRegionServerOfRegion(sn, encodedName);
778         // Make sure we know the server is dead.
779         if (!serverManager.isServerDead(sn)) {
780           serverManager.expireServer(sn);
781         }
782       }
783       return false;
784     }
785     switch (et) {
786       case M_ZK_REGION_CLOSING:
787         // Insert into RIT & resend the query to the region server: may be the previous master
788         // died before sending the query the first time.
789         final RegionState rsClosing = regionStates.updateRegionState(rt, State.CLOSING);
790         this.executorService.submit(
791           new EventHandler(server, EventType.M_MASTER_RECOVERY) {
792             @Override
793             public void process() throws IOException {
794               ReentrantLock lock = locker.acquireLock(regionInfo.getEncodedName());
795               try {
796                 unassign(regionInfo, rsClosing, expectedVersion, null, useZKForAssignment, null);
797                 if (regionStates.isRegionOffline(regionInfo)) {
798                   assign(regionInfo, true);
799                 }
800               } finally {
801                 lock.unlock();
802               }
803             }
804           });
805         break;
806 
807       case RS_ZK_REGION_CLOSED:
808       case RS_ZK_REGION_FAILED_OPEN:
809         // Region is closed, insert into RIT and handle it
810         regionStates.updateRegionState(regionInfo, State.CLOSED, sn);
811         invokeAssign(regionInfo);
812         break;
813 
814       case M_ZK_REGION_OFFLINE:
815         // Insert in RIT and resend to the regionserver
816         regionStates.updateRegionState(rt, State.PENDING_OPEN);
817         final RegionState rsOffline = regionStates.getRegionState(regionInfo);
818         this.executorService.submit(
819           new EventHandler(server, EventType.M_MASTER_RECOVERY) {
820             @Override
821             public void process() throws IOException {
822               ReentrantLock lock = locker.acquireLock(regionInfo.getEncodedName());
823               try {
824                 RegionPlan plan = new RegionPlan(regionInfo, null, sn);
825                 addPlan(encodedName, plan);
826                 assign(rsOffline, false, false);
827               } finally {
828                 lock.unlock();
829               }
830             }
831           });
832         break;
833 
834       case RS_ZK_REGION_OPENING:
835         regionStates.updateRegionState(rt, State.OPENING);
836         break;
837 
838       case RS_ZK_REGION_OPENED:
839         // Region is opened, insert into RIT and handle it
840         // This could be done asynchronously, we would need then to acquire the lock in the
841         //  handler.
842         regionStates.updateRegionState(rt, State.OPEN);
843         new OpenedRegionHandler(server, this, regionInfo, sn, expectedVersion).process();
844         break;
845       case RS_ZK_REQUEST_REGION_SPLIT:
846       case RS_ZK_REGION_SPLITTING:
847       case RS_ZK_REGION_SPLIT:
848         // Splitting region should be online. We could have skipped it during
849         // user region rebuilding since we may consider the split is completed.
850         // Put it in SPLITTING state to avoid complications.
851         regionStates.regionOnline(regionInfo, sn);
852         regionStates.updateRegionState(rt, State.SPLITTING);
853         if (!handleRegionSplitting(
854             rt, encodedName, prettyPrintedRegionName, sn)) {
855           deleteSplittingNode(encodedName, sn);
856         }
857         break;
858       case RS_ZK_REQUEST_REGION_MERGE:
859       case RS_ZK_REGION_MERGING:
860       case RS_ZK_REGION_MERGED:
861         if (!handleRegionMerging(
862             rt, encodedName, prettyPrintedRegionName, sn)) {
863           deleteMergingNode(encodedName, sn);
864         }
865         break;
866       default:
867         throw new IllegalStateException("Received region in state:" + et + " is not valid.");
868     }
869     LOG.info("Processed region " + prettyPrintedRegionName + " in state "
870       + et + ", on " + (serverManager.isServerOnline(sn) ? "" : "dead ")
871       + "server: " + sn);
872     return true;
873   }
874 
875   /**
876    * When a region is closed, it should be removed from the regionsToReopen
877    * @param hri HRegionInfo of the region which was closed
878    */
879   public void removeClosedRegion(HRegionInfo hri) {
880     if (regionsToReopen.remove(hri.getEncodedName()) != null) {
881       LOG.debug("Removed region from reopening regions because it was closed");
882     }
883   }
884 
885   /**
886    * Handles various states an unassigned node can be in.
887    * <p>
888    * Method is called when a state change is suspected for an unassigned node.
889    * <p>
890    * This deals with skipped transitions (we got a CLOSED but didn't see CLOSING
891    * yet).
892    * @param rt
893    * @param expectedVersion
894    */
895   void handleRegion(final RegionTransition rt, int expectedVersion) {
896     if (rt == null) {
897       LOG.warn("Unexpected NULL input for RegionTransition rt");
898       return;
899     }
900     final ServerName sn = rt.getServerName();
901     // Check if this is a special HBCK transition
902     if (sn.equals(HBCK_CODE_SERVERNAME)) {
903       handleHBCK(rt);
904       return;
905     }
906     final long createTime = rt.getCreateTime();
907     final byte[] regionName = rt.getRegionName();
908     String encodedName = HRegionInfo.encodeRegionName(regionName);
909     String prettyPrintedRegionName = HRegionInfo.prettyPrint(encodedName);
910     // Verify this is a known server
911     if (!serverManager.isServerOnline(sn)
912       && !ignoreStatesRSOffline.contains(rt.getEventType())) {
913       LOG.warn("Attempted to handle region transition for server but " +
914         "it is not online: " + prettyPrintedRegionName + ", " + rt);
915       return;
916     }
917 
918     RegionState regionState =
919       regionStates.getRegionState(encodedName);
920     long startTime = System.currentTimeMillis();
921     if (LOG.isDebugEnabled()) {
922       boolean lateEvent = createTime < (startTime - 15000);
923       LOG.debug("Handling " + rt.getEventType() +
924         ", server=" + sn + ", region=" +
925         (prettyPrintedRegionName == null ? "null" : prettyPrintedRegionName) +
926         (lateEvent ? ", which is more than 15 seconds late" : "") +
927         ", current_state=" + regionState);
928     }
929     // We don't do anything for this event,
930     // so separate it out, no need to lock/unlock anything
931     if (rt.getEventType() == EventType.M_ZK_REGION_OFFLINE) {
932       return;
933     }
934 
935     // We need a lock on the region as we could update it
936     Lock lock = locker.acquireLock(encodedName);
937     try {
938       RegionState latestState =
939         regionStates.getRegionState(encodedName);
940       if ((regionState == null && latestState != null)
941           || (regionState != null && latestState == null)
942           || (regionState != null && latestState != null
943             && latestState.getState() != regionState.getState())) {
944         LOG.warn("Region state changed from " + regionState + " to "
945           + latestState + ", while acquiring lock");
946       }
947       long waitedTime = System.currentTimeMillis() - startTime;
948       if (waitedTime > 5000) {
949         LOG.warn("Took " + waitedTime + "ms to acquire the lock");
950       }
951       regionState = latestState;
952       switch (rt.getEventType()) {
953       case RS_ZK_REQUEST_REGION_SPLIT:
954       case RS_ZK_REGION_SPLITTING:
955       case RS_ZK_REGION_SPLIT:
956         if (!handleRegionSplitting(
957             rt, encodedName, prettyPrintedRegionName, sn)) {
958           deleteSplittingNode(encodedName, sn);
959         }
960         break;
961 
962       case RS_ZK_REQUEST_REGION_MERGE:
963       case RS_ZK_REGION_MERGING:
964       case RS_ZK_REGION_MERGED:
965         // Merged region is a new region, we can't find it in the region states now.
966         // However, the two merging regions are not new. They should be in state for merging.
967         if (!handleRegionMerging(
968             rt, encodedName, prettyPrintedRegionName, sn)) {
969           deleteMergingNode(encodedName, sn);
970         }
971         break;
972 
973       case M_ZK_REGION_CLOSING:
974         // Should see CLOSING after we have asked it to CLOSE or additional
975         // times after already being in state of CLOSING
976         if (regionState == null
977             || !regionState.isPendingCloseOrClosingOnServer(sn)) {
978           LOG.warn("Received CLOSING for " + prettyPrintedRegionName
979             + " from " + sn + " but the region isn't PENDING_CLOSE/CLOSING here: "
980             + regionStates.getRegionState(encodedName));
981           return;
982         }
983         // Transition to CLOSING (or update stamp if already CLOSING)
984         regionStates.updateRegionState(rt, State.CLOSING);
985         break;
986 
987       case RS_ZK_REGION_CLOSED:
988         // Should see CLOSED after CLOSING but possible after PENDING_CLOSE
989         if (regionState == null
990             || !regionState.isPendingCloseOrClosingOnServer(sn)) {
991           LOG.warn("Received CLOSED for " + prettyPrintedRegionName
992             + " from " + sn + " but the region isn't PENDING_CLOSE/CLOSING here: "
993             + regionStates.getRegionState(encodedName));
994           return;
995         }
996         // Handle CLOSED by assigning elsewhere or stopping if a disable
997         // If we got here all is good.  Need to update RegionState -- else
998         // what follows will fail because not in expected state.
999         new ClosedRegionHandler(server, this, regionState.getRegion()).process();
1000         updateClosedRegionHandlerTracker(regionState.getRegion());
1001         break;
1002 
1003         case RS_ZK_REGION_FAILED_OPEN:
1004           if (regionState == null
1005               || !regionState.isPendingOpenOrOpeningOnServer(sn)) {
1006             LOG.warn("Received FAILED_OPEN for " + prettyPrintedRegionName
1007               + " from " + sn + " but the region isn't PENDING_OPEN/OPENING here: "
1008               + regionStates.getRegionState(encodedName));
1009             return;
1010           }
1011           AtomicInteger failedOpenCount = failedOpenTracker.get(encodedName);
1012           if (failedOpenCount == null) {
1013             failedOpenCount = new AtomicInteger();
1014             // No need to use putIfAbsent, or extra synchronization since
1015             // this whole handleRegion block is locked on the encoded region
1016             // name, and failedOpenTracker is updated only in this block
1017             failedOpenTracker.put(encodedName, failedOpenCount);
1018           }
1019           if (failedOpenCount.incrementAndGet() >= maximumAttempts) {
1020             regionStates.updateRegionState(rt, State.FAILED_OPEN);
1021             // remove the tracking info to save memory, also reset
1022             // the count for next open initiative
1023             failedOpenTracker.remove(encodedName);
1024           } else {
1025             // Handle this the same as if it were opened and then closed.
1026             regionState = regionStates.updateRegionState(rt, State.CLOSED);
1027             if (regionState != null) {
1028               // When there is more than one region server, a new RS is selected as the
1029               // destination and the same is updated in the regionplan. (HBASE-5546)
1030               try {
1031                 getRegionPlan(regionState.getRegion(), sn, true);
1032                 new ClosedRegionHandler(server, this, regionState.getRegion()).process();
1033               } catch (HBaseIOException e) {
1034                 LOG.warn("Failed to get region plan", e);
1035               }
1036             }
1037           }
1038           break;
1039 
1040         case RS_ZK_REGION_OPENING:
1041           // Should see OPENING after we have asked it to OPEN or additional
1042           // times after already being in state of OPENING
1043           if (regionState == null
1044               || !regionState.isPendingOpenOrOpeningOnServer(sn)) {
1045             LOG.warn("Received OPENING for " + prettyPrintedRegionName
1046               + " from " + sn + " but the region isn't PENDING_OPEN/OPENING here: "
1047               + regionStates.getRegionState(encodedName));
1048             return;
1049           }
1050           // Transition to OPENING (or update stamp if already OPENING)
1051           regionStates.updateRegionState(rt, State.OPENING);
1052           break;
1053 
1054         case RS_ZK_REGION_OPENED:
1055           // Should see OPENED after OPENING but possible after PENDING_OPEN.
1056           if (regionState == null
1057               || !regionState.isPendingOpenOrOpeningOnServer(sn)) {
1058             LOG.warn("Received OPENED for " + prettyPrintedRegionName
1059               + " from " + sn + " but the region isn't PENDING_OPEN/OPENING here: "
1060               + regionStates.getRegionState(encodedName));
1061 
1062             if (regionState != null) {
1063               // Close it without updating the internal region states,
1064               // so as not to create double assignments in unlucky scenarios
1065               // mentioned in OpenRegionHandler#process
1066               unassign(regionState.getRegion(), null, -1, null, false, sn);
1067             }
1068             return;
1069           }
1070           // Handle OPENED by removing from transition and deleted zk node
1071           regionState = regionStates.updateRegionState(rt, State.OPEN);
1072           if (regionState != null) {
1073             failedOpenTracker.remove(encodedName); // reset the count, if any
1074             new OpenedRegionHandler(
1075               server, this, regionState.getRegion(), sn, expectedVersion).process();
1076             updateOpenedRegionHandlerTracker(regionState.getRegion());
1077           }
1078           break;
1079 
1080         default:
1081           throw new IllegalStateException("Received event is not valid.");
1082       }
1083     } finally {
1084       lock.unlock();
1085     }
1086   }
1087 
1088   //For unit tests only
1089   boolean wasClosedHandlerCalled(HRegionInfo hri) {
1090     AtomicBoolean b = closedRegionHandlerCalled.get(hri);
1091     //compareAndSet to be sure that unit tests don't see stale values. Means,
1092    //we will return true exactly once unless the handler code resets this
1093    //value back to true.
1094     return b == null ? false : b.compareAndSet(true, false);
1095   }
1096 
1097   //For unit tests only
1098   boolean wasOpenedHandlerCalled(HRegionInfo hri) {
1099     AtomicBoolean b = openedRegionHandlerCalled.get(hri);
1100     //compareAndSet to be sure that unit tests don't see stale values. Means,
1101    //we will return true exactly once unless the handler code resets this
1102    //value back to true.
1103     return b == null ? false : b.compareAndSet(true, false);
1104   }
1105 
1106   //For unit tests only
1107   void initializeHandlerTrackers() {
1108     closedRegionHandlerCalled = new HashMap<HRegionInfo, AtomicBoolean>();
1109     openedRegionHandlerCalled = new HashMap<HRegionInfo, AtomicBoolean>();
1110   }
1111 
1112   void updateClosedRegionHandlerTracker(HRegionInfo hri) {
1113     if (closedRegionHandlerCalled != null) { //only for unit tests this is true
1114       closedRegionHandlerCalled.put(hri, new AtomicBoolean(true));
1115     }
1116   }
1117 
1118   void updateOpenedRegionHandlerTracker(HRegionInfo hri) {
1119     if (openedRegionHandlerCalled != null) { //only for unit tests this is true
1120       openedRegionHandlerCalled.put(hri, new AtomicBoolean(true));
1121     }
1122   }
1123 
1124   // TODO: processFavoredNodes might throw an exception, e.g., if the
1125   // meta could not be contacted/updated. We need to see how seriously to treat
1126   // this problem. Should we fail the current assignment? We should be able
1127   // to recover from this problem eventually (if the meta couldn't be updated
1128   // things should work normally and eventually get fixed up).
1129   void processFavoredNodes(List<HRegionInfo> regions) throws IOException {
1130     if (!shouldAssignRegionsWithFavoredNodes) return;
1131     // The AM gets the favored nodes info for each region and updates the meta
1132     // table with that info
1133     Map<HRegionInfo, List<ServerName>> regionToFavoredNodes =
1134         new HashMap<HRegionInfo, List<ServerName>>();
1135     for (HRegionInfo region : regions) {
1136       regionToFavoredNodes.put(region,
1137           ((FavoredNodeLoadBalancer)this.balancer).getFavoredNodes(region));
1138     }
1139     FavoredNodeAssignmentHelper.updateMetaWithFavoredNodesInfo(regionToFavoredNodes, catalogTracker);
1140   }
1141 
1142   /**
1143    * Handle a ZK unassigned node transition triggered by HBCK repair tool.
1144    * <p>
1145    * This is handled in a separate code path because it breaks the normal rules.
1146    * @param rt
1147    */
1148   private void handleHBCK(RegionTransition rt) {
1149     String encodedName = HRegionInfo.encodeRegionName(rt.getRegionName());
1150     LOG.info("Handling HBCK triggered transition=" + rt.getEventType() +
1151       ", server=" + rt.getServerName() + ", region=" +
1152       HRegionInfo.prettyPrint(encodedName));
1153     RegionState regionState = regionStates.getRegionTransitionState(encodedName);
1154     switch (rt.getEventType()) {
1155       case M_ZK_REGION_OFFLINE:
1156         HRegionInfo regionInfo;
1157         if (regionState != null) {
1158           regionInfo = regionState.getRegion();
1159         } else {
1160           try {
1161             byte [] name = rt.getRegionName();
1162             Pair<HRegionInfo, ServerName> p = MetaReader.getRegion(catalogTracker, name);
1163             regionInfo = p.getFirst();
1164           } catch (IOException e) {
1165             LOG.info("Exception reading hbase:meta doing HBCK repair operation", e);
1166             return;
1167           }
1168         }
1169         LOG.info("HBCK repair is triggering assignment of region=" +
1170             regionInfo.getRegionNameAsString());
1171         // trigger assign, node is already in OFFLINE so don't need to update ZK
1172         assign(regionInfo, false);
1173         break;
1174 
1175       default:
1176         LOG.warn("Received unexpected region state from HBCK: " + rt.toString());
1177         break;
1178     }
1179 
1180   }
1181 
1182   // ZooKeeper events
1183 
1184   /**
1185    * New unassigned node has been created.
1186    *
1187    * <p>This happens when an RS begins the OPENING or CLOSING of a region by
1188    * creating an unassigned node.
1189    *
1190    * <p>When this happens we must:
1191    * <ol>
1192    *   <li>Watch the node for further events</li>
1193    *   <li>Read and handle the state in the node</li>
1194    * </ol>
1195    */
1196   @Override
1197   public void nodeCreated(String path) {
1198     handleAssignmentEvent(path);
1199   }
1200 
1201   /**
1202    * Existing unassigned node has had data changed.
1203    *
1204    * <p>This happens when an RS transitions from OFFLINE to OPENING, or between
1205    * OPENING/OPENED and CLOSING/CLOSED.
1206    *
1207    * <p>When this happens we must:
1208    * <ol>
1209    *   <li>Watch the node for further events</li>
1210    *   <li>Read and handle the state in the node</li>
1211    * </ol>
1212    */
1213   @Override
1214   public void nodeDataChanged(String path) {
1215     handleAssignmentEvent(path);
1216   }
1217 
1218 
1219   // We don't want to have two events on the same region managed simultaneously.
1220   // For this reason, we need to wait if an event on the same region is currently in progress.
1221   // So we track the region names of the events in progress, and we keep a waiting list.
1222   private final Set<String> regionsInProgress = new HashSet<String>();
1223   // In a LinkedHashMultimap, the put order is kept when we retrieve the collection back. We need
1224   //  this as we want the events to be managed in the same order as we received them.
1225   private final LinkedHashMultimap <String, RegionRunnable>
1226       zkEventWorkerWaitingList = LinkedHashMultimap.create();
1227 
1228   /**
1229    * A specific runnable that works only on a region.
1230    */
1231   private interface RegionRunnable extends Runnable{
1232     /**
1233      * @return - the name of the region it works on.
1234      */
1235     String getRegionName();
1236   }
1237 
1238   /**
1239    * Submit a task, ensuring that there is only one task at a time working on a given region.
1240    * Order is respected.
1241    */
1242   protected void zkEventWorkersSubmit(final RegionRunnable regRunnable) {
1243 
1244     synchronized (regionsInProgress) {
1245       // If there is already a task for this region, we add it to the
1246       //  waiting list and return.
1247       if (regionsInProgress.contains(regRunnable.getRegionName())) {
1248         synchronized (zkEventWorkerWaitingList){
1249           zkEventWorkerWaitingList.put(regRunnable.getRegionName(), regRunnable);
1250         }
1251         return;
1252       }
1253 
1254       // No event in progress on this region => we can submit a new task immediately.
1255       regionsInProgress.add(regRunnable.getRegionName());
1256       zkEventWorkers.submit(new Runnable() {
1257         @Override
1258         public void run() {
1259           try {
1260             regRunnable.run();
1261           } finally {
1262             // now that we have finished, let's see if there is an event for the same region in the
1263             //  waiting list. If so, we can now submit it to the pool.
1264             synchronized (regionsInProgress) {
1265               regionsInProgress.remove(regRunnable.getRegionName());
1266               synchronized (zkEventWorkerWaitingList) {
1267                 java.util.Set<RegionRunnable> waiting = zkEventWorkerWaitingList.get(
1268                     regRunnable.getRegionName());
1269                 if (!waiting.isEmpty()) {
1270                   // We want the first object only. The only way to get it is through an iterator.
1271                   RegionRunnable toSubmit = waiting.iterator().next();
1272                   zkEventWorkerWaitingList.remove(toSubmit.getRegionName(), toSubmit);
1273                   zkEventWorkersSubmit(toSubmit);
1274                 }
1275               }
1276             }
1277           }
1278         }
1279       });
1280     }
1281   }
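       // Illustrative sketch (not part of the original source): how callers such as
       // nodeDeleted() and handleAssignmentEvent() below hand work to this method.
       // The run() body here is a hypothetical placeholder.
       //
       //   final String regionName = ZKAssign.getRegionName(watcher, path); // as in nodeDeleted()
       //   zkEventWorkersSubmit(new RegionRunnable() {
       //     @Override
       //     public String getRegionName() { return regionName; }
       //     @Override
       //     public void run() { /* process the ZK event for this region */ }
       //   });
       //
       // Because regionsInProgress tracks the active region names, at most one such
       // runnable per region is active in the zkEventWorkers pool at any time.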
1282 
1283   @Override
1284   public void nodeDeleted(final String path) {
1285     if (path.startsWith(watcher.assignmentZNode)) {
1286       final String regionName = ZKAssign.getRegionName(watcher, path);
1287       zkEventWorkersSubmit(new RegionRunnable() {
1288         @Override
1289         public String getRegionName() {
1290           return regionName;
1291         }
1292 
1293         @Override
1294         public void run() {
1295           Lock lock = locker.acquireLock(regionName);
1296           try {
1297             RegionState rs = regionStates.getRegionTransitionState(regionName);
1298             if (rs == null) {
1299               rs = regionStates.getRegionState(regionName);
1300               if (rs == null || !rs.isMergingNew()) {
1301                 // MergingNew is an offline state
1302                 return;
1303               }
1304             }
1305 
1306             HRegionInfo regionInfo = rs.getRegion();
1307             String regionNameStr = regionInfo.getRegionNameAsString();
1308             LOG.debug("Znode " + regionNameStr + " deleted, state: " + rs);
1309             boolean disabled = getZKTable().isDisablingOrDisabledTable(regionInfo.getTable());
1310             ServerName serverName = rs.getServerName();
1311             if (serverManager.isServerOnline(serverName)) {
1312               if (rs.isOnServer(serverName)
1313                   && (rs.isOpened() || rs.isSplitting())) {
1314                 regionOnline(regionInfo, serverName);
1315                 if (disabled) {
1316                   // if server is offline, no hurt to unassign again
1317                   LOG.info("Opened " + regionNameStr
1318                     + "but this table is disabled, triggering close of region");
1319                   unassign(regionInfo);
1320                 }
1321               } else if (rs.isMergingNew()) {
1322                 synchronized (regionStates) {
1323                   String p = regionInfo.getEncodedName();
1324                   PairOfSameType<HRegionInfo> regions = mergingRegions.get(p);
1325                   if (regions != null) {
1326                     onlineMergingRegion(disabled, regions.getFirst(), serverName);
1327                     onlineMergingRegion(disabled, regions.getSecond(), serverName);
1328                   }
1329                 }
1330               }
1331             }
1332           } finally {
1333             lock.unlock();
1334           }
1335         }
1336 
1337         private void onlineMergingRegion(boolean disabled,
1338             final HRegionInfo hri, final ServerName serverName) {
1339           RegionState regionState = regionStates.getRegionState(hri);
1340           if (regionState != null && regionState.isMerging()
1341               && regionState.isOnServer(serverName)) {
1342             regionOnline(regionState.getRegion(), serverName);
1343             if (disabled) {
1344               unassign(hri);
1345             }
1346           }
1347         }
1348       });
1349     }
1350   }
1351 
1352   /**
1353    * New unassigned node has been created.
1354    *
1355    * <p>This happens when an RS begins the OPENING, SPLITTING or CLOSING of a
1356    * region by creating a znode.
1357    *
1358    * <p>When this happens we must:
1359    * <ol>
1360    *   <li>Watch the node for further children changed events</li>
1361    *   <li>Watch all new children for changed events</li>
1362    * </ol>
1363    */
1364   @Override
1365   public void nodeChildrenChanged(String path) {
1366     if (path.equals(watcher.assignmentZNode)) {
1367       zkEventWorkers.submit(new Runnable() {
1368         @Override
1369         public void run() {
1370           try {
1371             // Just make sure we see the changes for the new znodes
1372             List<String> children =
1373               ZKUtil.listChildrenAndWatchForNewChildren(
1374                 watcher, watcher.assignmentZNode);
1375             if (children != null) {
1376               Stat stat = new Stat();
1377               for (String child : children) {
1378                 // if region is in transition, we already have a watch
1379                 // on it, so no need to watch it again. As far as we know,
1380                 // this is needed to watch splitting nodes only.
1381                 if (!regionStates.isRegionInTransition(child)) {
1382                   ZKAssign.getDataAndWatch(watcher, child, stat);
1383                 }
1384               }
1385             }
1386           } catch (KeeperException e) {
1387             server.abort("Unexpected ZK exception reading unassigned children", e);
1388           }
1389         }
1390       });
1391     }
1392   }
1393 
1394   
1395   /**
1396    * Marks the region as online.  Removes it from regions in transition and
1397    * updates the in-memory assignment information.
1398    * <p>
1399    * Used when a region has been successfully opened on a region server.
1400    * @param regionInfo region that has been opened
1401    * @param sn server the region was opened on
1402    */
1403   void regionOnline(HRegionInfo regionInfo, ServerName sn) {
1404     regionOnline(regionInfo, sn, HConstants.NO_SEQNUM);
1405   }
1406 
1407   void regionOnline(HRegionInfo regionInfo, ServerName sn, long openSeqNum) {
1408     numRegionsOpened.incrementAndGet();
1409     regionStates.regionOnline(regionInfo, sn, openSeqNum);
1410 
1411     // Remove plan if one.
1412     clearRegionPlan(regionInfo);
1413     // Add the server to serversInUpdatingTimer
1414     addToServersInUpdatingTimer(sn);
1415     balancer.regionOnline(regionInfo, sn);
1416 
1417     // Tell our listeners that a region was opened
1418     sendRegionOpenedNotification(regionInfo, sn);
1419   }
1420 
1421   /**
1422    * Pass the assignment event to a worker for processing.
1423    * Each worker is a single thread executor service.  The reason
1424    * for just one thread is to make sure all events for a given
1425    * region are processed in order.
1426    *
1427    * @param path znode path of the assignment event
1428    */
1429   private void handleAssignmentEvent(final String path) {
1430     if (path.startsWith(watcher.assignmentZNode)) {
1431       final String regionName = ZKAssign.getRegionName(watcher, path);
1432 
1433       zkEventWorkersSubmit(new RegionRunnable() {
1434         @Override
1435         public String getRegionName() {
1436           return regionName;
1437         }
1438 
1439         @Override
1440         public void run() {
1441           try {
1442             Stat stat = new Stat();
1443             byte [] data = ZKAssign.getDataAndWatch(watcher, path, stat);
1444             if (data == null) return;
1445 
1446             RegionTransition rt = RegionTransition.parseFrom(data);
1447             handleRegion(rt, stat.getVersion());
1448           } catch (KeeperException e) {
1449             server.abort("Unexpected ZK exception reading unassigned node data", e);
1450           } catch (DeserializationException e) {
1451             server.abort("Unexpected exception deserializing node data", e);
1452           }
1453         }
1454       });
1455     }
1456   }
1457 
1458   /**
1459    * Add the server to the set serversInUpdatingTimer, then {@link TimerUpdater}
1460    * will update timers for this server in the background.
1461    * @param sn server to add
1462    */
1463   private void addToServersInUpdatingTimer(final ServerName sn) {
1464     if (tomActivated){
1465       this.serversInUpdatingTimer.add(sn);
1466     }
1467   }
1468 
1469   /**
1470    * Touch timers for all regions in transition that have the passed
1471    * <code>sn</code> in common.
1472    * Call this method whenever a server checks in.  Doing so helps the case where
1473    * a new regionserver has joined the cluster and it has been given 1k regions to
1474    * open.  If this method is tickled every time a region reports in a
1475    * successful open then the 1k-th region won't be timed out just because it is
1476    * sitting behind the open of 999 other regions.  This method is NOT used
1477    * as part of bulk assign -- there we have a different mechanism for extending
1478    * the regions in transition timer (we turn it off temporarily -- because
1479    * there is no regionplan involved when bulk assigning).
1480    * @param sn server that just checked in
1481    */
1482   private void updateTimers(final ServerName sn) {
1483     Preconditions.checkState(tomActivated);
1484     if (sn == null) return;
1485 
1486     // This loop could be expensive.
1487     // First make a copy of current regionPlan rather than hold sync while
1488     // looping because holding sync can cause deadlock.  Its ok in this loop
1489     // if the Map we're going against is a little stale
1490     List<Map.Entry<String, RegionPlan>> rps;
1491     synchronized(this.regionPlans) {
1492       rps = new ArrayList<Map.Entry<String, RegionPlan>>(regionPlans.entrySet());
1493     }
1494 
1495     for (Map.Entry<String, RegionPlan> e : rps) {
1496       if (e.getValue() != null && e.getKey() != null && sn.equals(e.getValue().getDestination())) {
1497         RegionState regionState = regionStates.getRegionTransitionState(e.getKey());
1498         if (regionState != null) {
1499           regionState.updateTimestampToNow();
1500         }
1501       }
1502     }
1503   }
1504 
1505   /**
1506    * Marks the region as offline.  Removes it from regions in transition and
1507    * removes in-memory assignment information.
1508    * <p>
1509    * Used when a region has been closed and should remain closed.
1510    * @param regionInfo region to mark offline
1511    */
1512   public void regionOffline(final HRegionInfo regionInfo) {
1513     regionOffline(regionInfo, null);
1514   }
1515 
1516   public void offlineDisabledRegion(HRegionInfo regionInfo) {
1517     if (useZKForAssignment) {
1518       // Disabling so should not be reassigned, just delete the CLOSED node
1519       LOG.debug("Table being disabled so deleting ZK node and removing from " +
1520         "regions in transition, skipping assignment of region " +
1521           regionInfo.getRegionNameAsString());
1522       String encodedName = regionInfo.getEncodedName();
1523       deleteNodeInStates(encodedName, "closed", null,
1524         EventType.RS_ZK_REGION_CLOSED, EventType.M_ZK_REGION_OFFLINE);
1525     }
1526     regionOffline(regionInfo);
1527   }
1528 
1529   // Assignment methods
1530 
1531   /**
1532    * Assigns the specified region.
1533    * <p>
1534    * If a RegionPlan is available with a valid destination then it will be used
1535    * to determine what server region is assigned to.  If no RegionPlan is
1536    * available, region will be assigned to a random available server.
1537    * <p>
1538    * Updates the RegionState and sends the OPEN RPC.
1539    * <p>
1540    * This will only succeed if the region is in transition and in a CLOSED or
1541    * OFFLINE state or not in transition (in-memory not zk), and of course, the
1542    * chosen server is up and running (It may have just crashed!).  If the
1543    * in-memory checks pass, the zk node is forced to OFFLINE before assigning.
1544    *
1545    * @param region region to be assigned
1546    * @param setOfflineInZK whether ZK node should be created/transitioned to an
1547    *                       OFFLINE state before assigning the region
1548    */
1549   public void assign(HRegionInfo region, boolean setOfflineInZK) {
1550     assign(region, setOfflineInZK, false);
1551   }
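       // Illustrative sketch (not from the original source): the two common call shapes
       // seen in this class. "hri" is a hypothetical HRegionInfo.
       //
       //   assign(hri, true);   // force the znode to OFFLINE first, then send the OPEN RPC
       //   assign(hri, false);  // znode is already OFFLINE (e.g. the HBCK repair path above)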
1552 
1553   /**
1554    * Use care with forceNewPlan. It could cause double assignment.
1555    */
1556   public void assign(HRegionInfo region,
1557       boolean setOfflineInZK, boolean forceNewPlan) {
1558     if (isDisabledorDisablingRegionInRIT(region)) {
1559       return;
1560     }
1561     if (this.serverManager.isClusterShutdown()) {
1562       LOG.info("Cluster shutdown is set; skipping assign of " +
1563         region.getRegionNameAsString());
1564       return;
1565     }
1566     String encodedName = region.getEncodedName();
1567     Lock lock = locker.acquireLock(encodedName);
1568     try {
1569       RegionState state = forceRegionStateToOffline(region, forceNewPlan);
1570       if (state != null) {
1571         if (regionStates.wasRegionOnDeadServer(encodedName)) {
1572           LOG.info("Skip assigning " + region.getRegionNameAsString()
1573             + ", it's host " + regionStates.getLastRegionServerOfRegion(encodedName)
1574             + " is dead but not processed yet");
1575           return;
1576         }
1577         assign(state, setOfflineInZK && useZKForAssignment, forceNewPlan);
1578       }
1579     } finally {
1580       lock.unlock();
1581     }
1582   }
1583 
1584   /**
1585    * Bulk assign regions to <code>destination</code>.
1586    * @param destination
1587    * @param regions Regions to assign.
1588    * @return true if successful
1589    */
1590   boolean assign(final ServerName destination, final List<HRegionInfo> regions) {
1591     long startTime = EnvironmentEdgeManager.currentTimeMillis();
1592     try {
1593       int regionCount = regions.size();
1594       if (regionCount == 0) {
1595         return true;
1596       }
1597       LOG.debug("Assigning " + regionCount + " region(s) to " + destination.toString());
1598       Set<String> encodedNames = new HashSet<String>(regionCount);
1599       for (HRegionInfo region : regions) {
1600         encodedNames.add(region.getEncodedName());
1601       }
1602 
1603       List<HRegionInfo> failedToOpenRegions = new ArrayList<HRegionInfo>();
1604       Map<String, Lock> locks = locker.acquireLocks(encodedNames);
1605       try {
1606         AtomicInteger counter = new AtomicInteger(0);
1607         Map<String, Integer> offlineNodesVersions = new ConcurrentHashMap<String, Integer>();
1608         OfflineCallback cb = new OfflineCallback(
1609           watcher, destination, counter, offlineNodesVersions);
1610         Map<String, RegionPlan> plans = new HashMap<String, RegionPlan>(regions.size());
1611         List<RegionState> states = new ArrayList<RegionState>(regions.size());
1612         for (HRegionInfo region : regions) {
1613           String encodedName = region.getEncodedName();
1614           if (!isDisabledorDisablingRegionInRIT(region)) {
1615             RegionState state = forceRegionStateToOffline(region, false);
1616             boolean onDeadServer = false;
1617             if (state != null) {
1618               if (regionStates.wasRegionOnDeadServer(encodedName)) {
1619                 LOG.info("Skip assigning " + region.getRegionNameAsString()
1620                   + ", it's host " + regionStates.getLastRegionServerOfRegion(encodedName)
1621                   + " is dead but not processed yet");
1622                 onDeadServer = true;
1623               } else if (!useZKForAssignment
1624                   || asyncSetOfflineInZooKeeper(state, cb, destination)) {
1625                 RegionPlan plan = new RegionPlan(region, state.getServerName(), destination);
1626                 plans.put(encodedName, plan);
1627                 states.add(state);
1628                 continue;
1629               }
1630             }
1631             // Reassign if the region wasn't on a dead server
1632             if (!onDeadServer) {
1633               LOG.info("failed to force region state to offline or "
1634                 + "failed to set it offline in ZK, will reassign later: " + region);
1635               failedToOpenRegions.add(region); // assign individually later
1636             }
1637           }
1638           // Release the lock, this region is excluded from bulk assign because
1639           // we can't update its state, or set its znode to offline.
1640           Lock lock = locks.remove(encodedName);
1641           lock.unlock();
1642         }
1643         if (useZKForAssignment) {
1644           // Wait until all unassigned nodes have been put up and watchers set.
1645           int total = states.size();
1646           for (int oldCounter = 0; !server.isStopped();) {
1647             int count = counter.get();
1648             if (oldCounter != count) {
1649               LOG.info(destination.toString() + " unassigned znodes=" + count + " of total="
1650                   + total);
1651               oldCounter = count;
1652             }
1653             if (count >= total) break;
1654             Threads.sleep(5);
1655           }
1656         }
1657 
1658         if (server.isStopped()) {
1659           return false;
1660         }
1661 
1662         // Add region plans, so we can updateTimers when one region is opened so
1663         // that unnecessary timeout on RIT is reduced.
1664         this.addPlans(plans);
1665 
1666         List<Triple<HRegionInfo, Integer, List<ServerName>>> regionOpenInfos =
1667           new ArrayList<Triple<HRegionInfo, Integer, List<ServerName>>>(states.size());
1668         for (RegionState state: states) {
1669           HRegionInfo region = state.getRegion();
1670           String encodedRegionName = region.getEncodedName();
1671           Integer nodeVersion = offlineNodesVersions.get(encodedRegionName);
1672           if (useZKForAssignment && (nodeVersion == null || nodeVersion == -1)) {
1673             LOG.warn("failed to offline in zookeeper: " + region);
1674             failedToOpenRegions.add(region); // assign individually later
1675             Lock lock = locks.remove(encodedRegionName);
1676             lock.unlock();
1677           } else {
1678             regionStates.updateRegionState(
1679               region, State.PENDING_OPEN, destination);
1680             List<ServerName> favoredNodes = ServerName.EMPTY_SERVER_LIST;
1681             if (this.shouldAssignRegionsWithFavoredNodes) {
1682               favoredNodes = ((FavoredNodeLoadBalancer)this.balancer).getFavoredNodes(region);
1683             }
1684             regionOpenInfos.add(new Triple<HRegionInfo, Integer,  List<ServerName>>(
1685               region, nodeVersion, favoredNodes));
1686           }
1687         }
1688 
1689         // Move on to open regions.
1690         try {
1691           // Send OPEN RPC. If it fails on a IOE or RemoteException,
1692           // regions will be assigned individually.
1693           long maxWaitTime = System.currentTimeMillis() +
1694             this.server.getConfiguration().
1695               getLong("hbase.regionserver.rpc.startup.waittime", 60000);
1696           for (int i = 1; i <= maximumAttempts && !server.isStopped(); i++) {
1697             try {
1698               List<RegionOpeningState> regionOpeningStateList = serverManager
1699                 .sendRegionOpen(destination, regionOpenInfos);
1700               if (regionOpeningStateList == null) {
1701                 // Failed getting RPC connection to this server
1702                 return false;
1703               }
1704               for (int k = 0, n = regionOpeningStateList.size(); k < n; k++) {
1705                 RegionOpeningState openingState = regionOpeningStateList.get(k);
1706                 if (openingState != RegionOpeningState.OPENED) {
1707                   HRegionInfo region = regionOpenInfos.get(k).getFirst();
1708                   if (openingState == RegionOpeningState.ALREADY_OPENED) {
1709                     processAlreadyOpenedRegion(region, destination);
1710                   } else if (openingState == RegionOpeningState.FAILED_OPENING) {
1711                     // Failed opening this region, reassign it later
1712                     failedToOpenRegions.add(region);
1713                   } else {
1714                     LOG.warn("THIS SHOULD NOT HAPPEN: unknown opening state "
1715                       + openingState + " in assigning region " + region);
1716                   }
1717                 }
1718               }
1719               break;
1720             } catch (IOException e) {
1721               if (e instanceof RemoteException) {
1722                 e = ((RemoteException)e).unwrapRemoteException();
1723               }
1724               if (e instanceof RegionServerStoppedException) {
1725                 LOG.warn("The region server was shut down, ", e);
1726                 // No need to retry, the region server is a goner.
1727                 return false;
1728               } else if (e instanceof ServerNotRunningYetException) {
1729                 long now = System.currentTimeMillis();
1730                 if (now < maxWaitTime) {
1731                   LOG.debug("Server is not yet up; waiting up to " +
1732                     (maxWaitTime - now) + "ms", e);
1733                   Thread.sleep(100);
1734                   i--; // reset the try count
1735                   continue;
1736                 }
1737               } else if (e instanceof java.net.SocketTimeoutException
1738                   && this.serverManager.isServerOnline(destination)) {
1739                 // In case socket is timed out and the region server is still online,
1740                 // the openRegion RPC could have been accepted by the server and
1741                 // just the response didn't go through.  So we will retry to
1742                 // open the region on the same server.
1743                 if (LOG.isDebugEnabled()) {
1744                   LOG.debug("Bulk assigner openRegion() to " + destination
1745                     + " has timed out, but the regions might"
1746                     + " already be opened on it.", e);
1747                 }
1748                 // wait and reset the re-try count, server might be just busy.
1749                 Thread.sleep(100);
1750                 i--;
1751                 continue;
1752               }
1753               throw e;
1754             }
1755           }
1756         } catch (IOException e) {
1757           // Can be a socket timeout, EOF, NoRouteToHost, etc
1758           LOG.info("Unable to communicate with " + destination
1759             + " in order to assign regions, ", e);
1760           return false;
1761         } catch (InterruptedException e) {
1762           throw new RuntimeException(e);
1763         }
1764       } finally {
1765         for (Lock lock : locks.values()) {
1766           lock.unlock();
1767         }
1768       }
1769 
1770       if (!failedToOpenRegions.isEmpty()) {
1771         for (HRegionInfo region : failedToOpenRegions) {
1772           if (!regionStates.isRegionOnline(region)) {
1773             invokeAssign(region);
1774           }
1775         }
1776       }
1777       LOG.debug("Bulk assigning done for " + destination);
1778       return true;
1779     } finally {
1780       metricsAssignmentManager.updateBulkAssignTime(EnvironmentEdgeManager.currentTimeMillis() - startTime);
1781     }
1782   }
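       // Illustrative sketch (not from the original source): how a bulk assigner might use
       // this method. "destination" and "regionsForServer" are hypothetical names.
       //
       //   boolean done = assign(destination, regionsForServer);
       //   if (!done) {
       //     // The server was unreachable or the master is stopping; the caller decides
       //     // whether to retry. Regions that merely failed to open have already been
       //     // handed to invokeAssign() above for individual reassignment.
       //   }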
1783 
1784   /**
1785    * Send CLOSE RPC if the server is online, otherwise, offline the region.
1786    *
1787    * The RPC will be sent only to the region server found in the region state
1788    * if it is passed in, otherwise, to the src server specified. If the region
1789    * state is not specified, we don't update the region state at all; instead
1790    * we just send the RPC call. This is useful for some cleanup without
1791    * messing with the region states (see handleRegion and the scenario where a
1792    * region is opened on an unexpected server, for an example).
1793    */
1794   private void unassign(final HRegionInfo region,
1795       final RegionState state, final int versionOfClosingNode,
1796       final ServerName dest, final boolean transitionInZK,
1797       final ServerName src) {
1798     ServerName server = src;
1799     if (state != null) {
1800       server = state.getServerName();
1801     }
1802     long maxWaitTime = -1;
1803     for (int i = 1; i <= this.maximumAttempts; i++) {
1804       if (this.server.isStopped() || this.server.isAborted()) {
1805         LOG.debug("Server stopped/aborted; skipping unassign of " + region);
1806         return;
1807       }
1808       // ClosedRegionHandler can remove the server from this.regions
1809       if (!serverManager.isServerOnline(server)) {
1810         LOG.debug("Offline " + region.getRegionNameAsString()
1811           + ", no need to unassign since it's on a dead server: " + server);
1812         if (transitionInZK) {
1813           // delete the node. if no node exists need not bother.
1814           deleteClosingOrClosedNode(region, server);
1815         }
1816         if (state != null) {
1817           regionOffline(region);
1818         }
1819         return;
1820       }
1821       try {
1822         // Send CLOSE RPC
1823         if (serverManager.sendRegionClose(server, region,
1824           versionOfClosingNode, dest, transitionInZK)) {
1825           LOG.debug("Sent CLOSE to " + server + " for region " +
1826             region.getRegionNameAsString());
1827           if (useZKForAssignment && !transitionInZK && state != null) {
1828             // Retry to make sure the region is
1829             // closed so as to avoid double assignment.
1830             unassign(region, state, versionOfClosingNode,
1831               dest, transitionInZK, src);
1832           }
1833           return;
1834         }
1835         // This never happens. Currently regionserver close always returns true.
1836         // TODO: this can now happen (0.96) if there is an exception in a coprocessor
1837         LOG.warn("Server " + server + " region CLOSE RPC returned false for " +
1838           region.getRegionNameAsString());
1839       } catch (Throwable t) {
1840         if (t instanceof RemoteException) {
1841           t = ((RemoteException)t).unwrapRemoteException();
1842         }
1843         boolean logRetries = true;
1844         if (t instanceof NotServingRegionException
1845             || t instanceof RegionServerStoppedException
1846             || t instanceof ServerNotRunningYetException) {
1847           LOG.debug("Offline " + region.getRegionNameAsString()
1848             + ", it's not any more on " + server, t);
1849           if (transitionInZK) {
1850             deleteClosingOrClosedNode(region, server);
1851           }
1852           if (state != null) {
1853             regionOffline(region);
1854           }
1855           return;
1856         } else if ((t instanceof FailedServerException) || (state != null &&
1857             t instanceof RegionAlreadyInTransitionException)) {
1858           long sleepTime = 0;
1859           Configuration conf = this.server.getConfiguration();
1860           if(t instanceof FailedServerException) {
1861             sleepTime = 1 + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY,
1862                   RpcClient.FAILED_SERVER_EXPIRY_DEFAULT);
1863           } else {
1864             // RS is already processing this region, only need to update the timestamp
1865             LOG.debug("update " + state + " the timestamp.");
1866             state.updateTimestampToNow();
1867             if (maxWaitTime < 0) {
1868               maxWaitTime =
1869                   EnvironmentEdgeManager.currentTimeMillis()
1870                       + conf.getLong(ALREADY_IN_TRANSITION_WAITTIME,
1871                         DEFAULT_ALREADY_IN_TRANSITION_WAITTIME);
1872             }
1873             long now = EnvironmentEdgeManager.currentTimeMillis();
1874             if (now < maxWaitTime) {
1875               LOG.debug("Region is already in transition; "
1876                 + "waiting up to " + (maxWaitTime - now) + "ms", t);
1877               sleepTime = 100;
1878               i--; // reset the try count
1879               logRetries = false;
1880             }
1881           }
1882           try {
1883             if (sleepTime > 0) {
1884               Thread.sleep(sleepTime);
1885             }
1886           } catch (InterruptedException ie) {
1887             LOG.warn("Failed to unassign "
1888               + region.getRegionNameAsString() + " since interrupted", ie);
1889             Thread.currentThread().interrupt();
1890             if (!tomActivated && state != null) {
1891               regionStates.updateRegionState(region, State.FAILED_CLOSE);
1892             }
1893             return;
1894           }
1895         }
1896 
1897         if (logRetries) {
1898           LOG.info("Server " + server + " returned " + t + " for "
1899             + region.getRegionNameAsString() + ", try=" + i
1900             + " of " + this.maximumAttempts, t);
1901           // Presume retry or server will expire.
1902         }
1903       }
1904     }
1905     // Run out of attempts
1906     if (!tomActivated && state != null) {
1907       regionStates.updateRegionState(region, State.FAILED_CLOSE);
1908     }
1909   }
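       // Descriptive note (not in the original source) on the retry loop above: the CLOSE
       // RPC is attempted up to maximumAttempts times. A dead destination server just
       // offlines the region; FailedServerException sleeps out the failed-server expiry
       // before retrying; RegionAlreadyInTransitionException only refreshes the timestamp
       // and keeps waiting, up to ALREADY_IN_TRANSITION_WAITTIME, without consuming attempts.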
1910 
1911   /**
1912    * Set region to OFFLINE unless it is opening and forceNewPlan is false.
1913    */
1914   private RegionState forceRegionStateToOffline(
1915       final HRegionInfo region, final boolean forceNewPlan) {
1916     RegionState state = regionStates.getRegionState(region);
1917     if (state == null) {
1918       LOG.warn("Assigning a region not in region states: " + region);
1919       state = regionStates.createRegionState(region);
1920     }
1921 
1922     ServerName sn = state.getServerName();
1923     if (forceNewPlan && LOG.isDebugEnabled()) {
1924       LOG.debug("Force region state offline " + state);
1925     }
1926 
1927     switch (state.getState()) {
1928     case OPEN:
1929     case OPENING:
1930     case PENDING_OPEN:
1931     case CLOSING:
1932     case PENDING_CLOSE:
1933       if (!forceNewPlan) {
1934         LOG.debug("Skip assigning " +
1935           region + ", it is already " + state);
1936         return null;
1937       }
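           // Intentional fall-through when forceNewPlan is set: close the region first,
           // as for FAILED_CLOSE/FAILED_OPEN below.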
1938     case FAILED_CLOSE:
1939     case FAILED_OPEN:
1940       unassign(region, state, -1, null, false, null);
1941       state = regionStates.getRegionState(region);
1942       if (state.isFailedClose()) {
1943         // If we can't close the region, we can't re-assign
1944         // it so as to avoid possible double assignment/data loss.
1945         LOG.info("Skip assigning " +
1946           region + ", we couldn't close it: " + state);
1947         return null;
1948       }
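           // Intentional fall-through to OFFLINE: the region was closed or is already offline.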
1949     case OFFLINE:
1950       // This region could have been open on this server
1951       // for a while. If the server is dead and not processed
1952       // yet, we can move on only if the meta shows the
1953       // region is not on this server actually, or on a server
1954       // not dead, or dead and processed already.
1955       // In case not using ZK, we don't need this check because
1956       // we have the latest info in memory, and the caller
1957       // will do another round checking any way.
1958       if (useZKForAssignment
1959           && regionStates.isServerDeadAndNotProcessed(sn)
1960           && wasRegionOnDeadServerByMeta(region, sn)) {
1961         LOG.info("Skip assigning " + region.getRegionNameAsString()
1962           + ", it is on a dead but not processed yet server: " + sn);
1963         return null;
1964       }
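           // Intentional fall-through to CLOSED: both states are assignable from here.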
1965     case CLOSED:
1966       break;
1967     default:
1968       LOG.error("Trying to assign region " + region
1969         + ", which is " + state);
1970       return null;
1971     }
1972     return state;
1973   }
1974 
1975   private boolean wasRegionOnDeadServerByMeta(
1976       final HRegionInfo region, final ServerName sn) {
1977     try {
1978       if (region.isMetaRegion()) {
1979         ServerName server = catalogTracker.getMetaLocation();
1980         return regionStates.isServerDeadAndNotProcessed(server);
1981       }
1982       while (!server.isStopped()) {
1983         try {
1984           catalogTracker.waitForMeta();
1985           Result r = MetaReader.getRegionResult(catalogTracker, region.getRegionName());
1986           if (r == null || r.isEmpty()) return false;
1987           ServerName server = HRegionInfo.getServerName(r);
1988           return regionStates.isServerDeadAndNotProcessed(server);
1989         } catch (IOException ioe) {
1990           LOG.info("Received exception accessing hbase:meta during force assign "
1991             + region.getRegionNameAsString() + ", retrying", ioe);
1992         }
1993       }
1994     } catch (InterruptedException e) {
1995       Thread.currentThread().interrupt();
1996       LOG.info("Interrupted accessing hbase:meta", e);
1997     }
1998     // Call is interrupted or server is stopped.
1999     return regionStates.isServerDeadAndNotProcessed(sn);
2000   }
2001 
2002   /**
2003    * Caller must hold lock on the passed <code>state</code> object.
2004    * @param state
2005    * @param setOfflineInZK
2006    * @param forceNewPlan
2007    */
2008   private void assign(RegionState state,
2009       final boolean setOfflineInZK, final boolean forceNewPlan) {
2010     long startTime = EnvironmentEdgeManager.currentTimeMillis();
2011     try {
2012       Configuration conf = server.getConfiguration();
2013       RegionState currentState = state;
2014       int versionOfOfflineNode = -1;
2015       RegionPlan plan = null;
2016       long maxWaitTime = -1;
2017       HRegionInfo region = state.getRegion();
2018       RegionOpeningState regionOpenState;
2019       Throwable previousException = null;
2020       for (int i = 1; i <= maximumAttempts; i++) {
2021         if (server.isStopped() || server.isAborted()) {
2022           LOG.info("Skip assigning " + region.getRegionNameAsString()
2023             + ", the server is stopped/aborted");
2024           return;
2025         }
2026         if (plan == null) { // Get a server for the region at first
2027           try {
2028             plan = getRegionPlan(region, forceNewPlan);
2029           } catch (HBaseIOException e) {
2030             LOG.warn("Failed to get region plan", e);
2031           }
2032         }
2033         if (plan == null) {
2034           LOG.warn("Unable to determine a plan to assign " + region);
2035           if (tomActivated){
2036             this.timeoutMonitor.setAllRegionServersOffline(true);
2037           } else {
2038             if (region.isMetaRegion()) {
2039               try {
2040                 Thread.sleep(this.sleepTimeBeforeRetryingMetaAssignment);
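                     // hbase:meta must eventually be assigned; reset the attempt counter so we keep trying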
2041                 if (i == maximumAttempts) i = 1;
2042                 continue;
2043               } catch (InterruptedException e) {
2044                 LOG.error("Got exception while waiting for hbase:meta assignment");
2045                 Thread.currentThread().interrupt();
2046               }
2047             }
2048             regionStates.updateRegionState(region, State.FAILED_OPEN);
2049           }
2050           return;
2051         }
2052         if (setOfflineInZK && versionOfOfflineNode == -1) {
2053           // get the version of the znode after setting it to OFFLINE.
2054           // versionOfOfflineNode will be -1 if the znode was not set to OFFLINE
2055           versionOfOfflineNode = setOfflineInZooKeeper(currentState, plan.getDestination());
2056           if (versionOfOfflineNode != -1) {
2057             if (isDisabledorDisablingRegionInRIT(region)) {
2058               return;
2059             }
2060             // In case of assignment from EnableTableHandler the table state is ENABLING. Anyhow,
2061             // EnableTableHandler will set ENABLED after assigning all the table regions. If we
2062             // try to set to ENABLED directly then client API may think table is enabled.
2063             // When we have a case such as all the regions are added directly into hbase:meta and we call
2064             // assignRegion then we need to make the table ENABLED. Hence in such case the table
2065             // will not be in ENABLING or ENABLED state.
2066             TableName tableName = region.getTable();
2067             if (!zkTable.isEnablingTable(tableName) && !zkTable.isEnabledTable(tableName)) {
2068               LOG.debug("Setting table " + tableName + " to ENABLED state.");
2069               setEnabledTable(tableName);
2070             }
2071           }
2072         }
2073         if (setOfflineInZK && versionOfOfflineNode == -1) {
2074           LOG.info("Unable to set offline in ZooKeeper to assign " + region);
2075           // Setting offline in ZK must have failed due to ZK racing or some
2076           // exception which may make the server abort. If it is ZK racing,
2077           // we should retry since we already reset the region state,
2078           // existing (re)assignment will fail anyway.
2079           if (!server.isAborted()) {
2080             continue;
2081           }
2082         }
2083         LOG.info("Assigning " + region.getRegionNameAsString() +
2084             " to " + plan.getDestination().toString());
2085         // Transition RegionState to PENDING_OPEN
2086         currentState = regionStates.updateRegionState(region,
2087           State.PENDING_OPEN, plan.getDestination());
2088 
2089         boolean needNewPlan;
2090         final String assignMsg = "Failed assignment of " + region.getRegionNameAsString() +
2091             " to " + plan.getDestination();
2092         try {
2093           List<ServerName> favoredNodes = ServerName.EMPTY_SERVER_LIST;
2094           if (this.shouldAssignRegionsWithFavoredNodes) {
2095             favoredNodes = ((FavoredNodeLoadBalancer)this.balancer).getFavoredNodes(region);
2096           }
2097           regionOpenState = serverManager.sendRegionOpen(
2098               plan.getDestination(), region, versionOfOfflineNode, favoredNodes);
2099 
2100           if (regionOpenState == RegionOpeningState.FAILED_OPENING) {
2101             // Failed opening this region, looping again on a new server.
2102             needNewPlan = true;
2103             LOG.warn(assignMsg + ", regionserver says 'FAILED_OPENING', " +
2104                 " trying to assign elsewhere instead; " +
2105                 "try=" + i + " of " + this.maximumAttempts);
2106           } else {
2107             // we're done
2108             if (regionOpenState == RegionOpeningState.ALREADY_OPENED) {
2109               processAlreadyOpenedRegion(region, plan.getDestination());
2110             }
2111             return;
2112           }
2113 
2114         } catch (Throwable t) {
2115           if (t instanceof RemoteException) {
2116             t = ((RemoteException) t).unwrapRemoteException();
2117           }
2118           previousException = t;
2119 
2120           // Should we wait a little before retrying? If the server is starting, yes.
2121           // If the region is already in transition, also yes: we want to be sure that
2122           //  the region will get opened but we don't want a double assignment.
2123           boolean hold = (t instanceof RegionAlreadyInTransitionException ||
2124               t instanceof ServerNotRunningYetException);
2125 
2126           // In case socket is timed out and the region server is still online,
2127           // the openRegion RPC could have been accepted by the server and
2128           // just the response didn't go through.  So we will retry to
2129           // open the region on the same server to avoid possible
2130           // double assignment.
2131           boolean retry = !hold && (t instanceof java.net.SocketTimeoutException
2132               && this.serverManager.isServerOnline(plan.getDestination()));
2133 
2134 
2135           if (hold) {
2136             LOG.warn(assignMsg + ", waiting a little before trying on the same region server " +
2137               "try=" + i + " of " + this.maximumAttempts, t);
2138 
2139             if (maxWaitTime < 0) {
2140               if (t instanceof RegionAlreadyInTransitionException) {
2141                 maxWaitTime = EnvironmentEdgeManager.currentTimeMillis()
2142                   + this.server.getConfiguration().getLong(ALREADY_IN_TRANSITION_WAITTIME,
2143                     DEFAULT_ALREADY_IN_TRANSITION_WAITTIME);
2144               } else {
2145                 maxWaitTime = EnvironmentEdgeManager.currentTimeMillis()
2146                   + this.server.getConfiguration().getLong(
2147                     "hbase.regionserver.rpc.startup.waittime", 60000);
2148               }
2149             }
2150             try {
2151               needNewPlan = false;
2152               long now = EnvironmentEdgeManager.currentTimeMillis();
2153               if (now < maxWaitTime) {
2154                 LOG.debug("Server is not yet up or region is already in transition; "
2155                   + "waiting up to " + (maxWaitTime - now) + "ms", t);
2156                 Thread.sleep(100);
2157                 i--; // reset the try count
2158               } else if (!(t instanceof RegionAlreadyInTransitionException)) {
2159                 LOG.debug("Server is not up for a while; try a new one", t);
2160                 needNewPlan = true;
2161               }
2162             } catch (InterruptedException ie) {
2163               LOG.warn("Failed to assign "
2164                   + region.getRegionNameAsString() + " since interrupted", ie);
2165               Thread.currentThread().interrupt();
2166               if (!tomActivated) {
2167                 regionStates.updateRegionState(region, State.FAILED_OPEN);
2168               }
2169               return;
2170             }
2171           } else if (retry) {
2172             needNewPlan = false;
2173             i--; // we want to retry as many times as needed as long as the RS is not dead.
2174             LOG.warn(assignMsg + ", trying to assign to the same region server again", t);
2175           } else {
2176             needNewPlan = true;
2177             LOG.warn(assignMsg + ", trying to assign elsewhere instead;" +
2178                 " try=" + i + " of " + this.maximumAttempts, t);
2179           }
2180         }
2181 
2182         if (i == this.maximumAttempts) {
2183           // Don't reset the region state or get a new plan any more.
2184           // This is the last try.
2185           continue;
2186         }
2187 
2188         // If region opened on destination of present plan, reassigning to new
2189         // RS may cause double assignments. In case of RegionAlreadyInTransitionException
2190         // reassigning to same RS.
2191         if (needNewPlan) {
2192           // Force a new plan and reassign. Will return null if no servers.
2193           // The new plan could be the same as the existing plan since we don't
2194           // exclude the server of the original plan, which should not be
2195           // excluded since it could be the only server up now.
2196           RegionPlan newPlan = null;
2197           try {
2198             newPlan = getRegionPlan(region, true);
2199           } catch (HBaseIOException e) {
2200             LOG.warn("Failed to get region plan", e);
2201           }
2202           if (newPlan == null) {
2203             if (tomActivated) {
2204               this.timeoutMonitor.setAllRegionServersOffline(true);
2205             } else {
2206               regionStates.updateRegionState(region, State.FAILED_OPEN);
2207             }
2208             LOG.warn("Unable to find a viable location to assign region " +
2209                 region.getRegionNameAsString());
2210             return;
2211           }
2212 
2213           if (plan != newPlan && !plan.getDestination().equals(newPlan.getDestination())) {
2214             // Clean out the plan we failed to execute and one that doesn't look like it'll
2215             // succeed anyway; we need a new plan!
2216             // Transition back to OFFLINE
2217             currentState = regionStates.updateRegionState(region, State.OFFLINE);
2218             versionOfOfflineNode = -1;
2219             plan = newPlan;
2220           } else if(plan.getDestination().equals(newPlan.getDestination()) &&
2221               previousException instanceof FailedServerException) {
2222             try {
2223               LOG.info("Trying to re-assign " + region.getRegionNameAsString() +
2224                 " to the same failed server.");
2225               Thread.sleep(1 + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY,
2226                 RpcClient.FAILED_SERVER_EXPIRY_DEFAULT));
2227             } catch (InterruptedException ie) {
2228               LOG.warn("Failed to assign "
2229                   + region.getRegionNameAsString() + " since interrupted", ie);
2230               Thread.currentThread().interrupt();
2231               if (!tomActivated) {
2232                 regionStates.updateRegionState(region, State.FAILED_OPEN);
2233               }
2234               return;
2235             }
2236           }
2237         }
2238       }
2239       // Run out of attempts
2240       if (!tomActivated) {
2241         regionStates.updateRegionState(region, State.FAILED_OPEN);
2242       }
2243     } finally {
2244       metricsAssignmentManager.updateAssignmentTime(EnvironmentEdgeManager.currentTimeMillis() - startTime);
2245     }
2246   }
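       // Descriptive note (not in the original source) on the retry policy above:
       //  - "hold": the RS is starting up or the region is already in transition, so we wait
       //    on the same server (up to a configured deadline) rather than risk double assignment;
       //  - "retry": a socket timeout against a still-online server may mean the OPEN RPC was
       //    actually accepted, so we retry the same server without consuming an attempt;
       //  - otherwise a new plan is requested and the region is moved back to OFFLINE for it.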
2247 
2248   private void processAlreadyOpenedRegion(HRegionInfo region, ServerName sn) {
2249     // Remove region from in-memory transition and unassigned node from ZK
2250     // While trying to enable the table the regions of the table were
2251     // already enabled.
2252     LOG.debug("ALREADY_OPENED " + region.getRegionNameAsString()
2253       + " to " + sn);
2254     String encodedName = region.getEncodedName();
2255     deleteNodeInStates(encodedName, "offline", sn, EventType.M_ZK_REGION_OFFLINE);
2256     regionStates.regionOnline(region, sn);
2257   }
2258 
2259   private boolean isDisabledorDisablingRegionInRIT(final HRegionInfo region) {
2260     TableName tableName = region.getTable();
2261     boolean disabled = this.zkTable.isDisabledTable(tableName);
2262     if (disabled || this.zkTable.isDisablingTable(tableName)) {
2263       LOG.info("Table " + tableName + (disabled ? " disabled;" : " disabling;") +
2264         " skipping assign of " + region.getRegionNameAsString());
2265       offlineDisabledRegion(region);
2266       return true;
2267     }
2268     return false;
2269   }
2270 
2271   /**
2272    * Set region as OFFLINED up in zookeeper
2273    *
2274    * @param state
2275    * @return the version of the offline node if setting of the OFFLINE node was
2276    *         successful, -1 otherwise.
2277    */
2278   private int setOfflineInZooKeeper(final RegionState state, final ServerName destination) {
2279     if (!state.isClosed() && !state.isOffline()) {
2280       String msg = "Unexpected state : " + state + " .. Cannot transit it to OFFLINE.";
2281       this.server.abort(msg, new IllegalStateException(msg));
2282       return -1;
2283     }
2284     regionStates.updateRegionState(state.getRegion(), State.OFFLINE);
2285     int versionOfOfflineNode;
2286     try {
2287       // get the version after setting the znode to OFFLINE
2288       versionOfOfflineNode = ZKAssign.createOrForceNodeOffline(watcher,
2289         state.getRegion(), destination);
2290       if (versionOfOfflineNode == -1) {
2291         LOG.warn("Attempted to create/force node into OFFLINE state before "
2292             + "completing assignment but failed to do so for " + state);
2293         return -1;
2294       }
2295     } catch (KeeperException e) {
2296       server.abort("Unexpected ZK exception creating/setting node OFFLINE", e);
2297       return -1;
2298     }
2299     return versionOfOfflineNode;
2300   }
2301 
2302   /**
2303    * @param region the region to assign
2304    * @return Plan for the passed <code>region</code> (if none exists currently, one is created;
2305    * if there are no servers to assign to, returns null).
2306    */
2307   private RegionPlan getRegionPlan(final HRegionInfo region,
2308       final boolean forceNewPlan)  throws HBaseIOException {
2309     return getRegionPlan(region, null, forceNewPlan);
2310   }
2311 
2312   /**
2313    * @param region the region to assign
2314    * @param serverToExclude Server to exclude (we know its bad). Pass null if
2315    * all servers are thought to be assignable.
2316    * @param forceNewPlan If true, then if an existing plan exists, a new plan
2317    * will be generated.
2318    * @return Plan for the passed <code>region</code> (if none exists currently, one is created;
2319    * if there are no servers to assign to, returns null).
2320    */
2321   private RegionPlan getRegionPlan(final HRegionInfo region,
2322       final ServerName serverToExclude, final boolean forceNewPlan) throws HBaseIOException {
2323     // Pickup existing plan or make a new one
2324     final String encodedName = region.getEncodedName();
2325     final List<ServerName> destServers =
2326       serverManager.createDestinationServersList(serverToExclude);
2327 
2328     if (destServers.isEmpty()){
2329       LOG.warn("Can't move " + encodedName +
2330         ", there is no destination server available.");
2331       return null;
2332     }
2333 
2334     RegionPlan randomPlan = null;
2335     boolean newPlan = false;
2336     RegionPlan existingPlan;
2337 
2338     synchronized (this.regionPlans) {
2339       existingPlan = this.regionPlans.get(encodedName);
2340 
2341       if (existingPlan != null && existingPlan.getDestination() != null) {
2342         LOG.debug("Found an existing plan for " + region.getRegionNameAsString()
2343           + " destination server is " + existingPlan.getDestination() +
2344             " accepted as a dest server = " + destServers.contains(existingPlan.getDestination()));
2345       }
2346 
2347       if (forceNewPlan
2348           || existingPlan == null
2349           || existingPlan.getDestination() == null
2350           || !destServers.contains(existingPlan.getDestination())) {
2351         newPlan = true;
2352         randomPlan = new RegionPlan(region, null,
2353             balancer.randomAssignment(region, destServers));
2354         if (!region.isMetaTable() && shouldAssignRegionsWithFavoredNodes) {
2355           List<HRegionInfo> regions = new ArrayList<HRegionInfo>(1);
2356           regions.add(region);
2357           try {
2358             processFavoredNodes(regions);
2359           } catch (IOException ie) {
2360             LOG.warn("Ignoring exception in processFavoredNodes " + ie);
2361           }
2362         }
2363         this.regionPlans.put(encodedName, randomPlan);
2364       }
2365     }
2366 
2367     if (newPlan) {
2368       if (randomPlan.getDestination() == null) {
2369         LOG.warn("Can't find a destination for " + encodedName);
2370         return null;
2371       }
2372       LOG.debug("No previous transition plan found (or ignoring " +
2373         "an existing plan) for " + region.getRegionNameAsString() +
2374         "; generated random plan=" + randomPlan + "; " +
2375         serverManager.countOfRegionServers() +
2376                " (online=" + serverManager.getOnlineServers().size() +
2377                ", available=" + destServers.size() + ") available servers" +
2378                ", forceNewPlan=" + forceNewPlan);
2379         return randomPlan;
2380       }
2381     LOG.debug("Using pre-existing plan for " +
2382       region.getRegionNameAsString() + "; plan=" + existingPlan);
2383     return existingPlan;
2384   }
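       // Illustrative sketch (not from the original source): how assign() above consumes this
       // method. "hri" is a hypothetical HRegionInfo; both calls may throw HBaseIOException.
       //
       //   RegionPlan plan = getRegionPlan(hri, false);   // reuse an existing valid plan if any
       //   RegionPlan fresh = getRegionPlan(hri, true);   // force a new, randomly balanced plan
       //   // Either call returns null when no destination server is available.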
2385 
2386   /**
2387    * Unassigns the specified region.
2388    * <p>
2389    * Updates the RegionState and sends the CLOSE RPC unless region is being
2390    * split by regionserver; then the unassign fails (silently) because we
2391    * presume the region being unassigned no longer exists (it's been split out
2392    * of existence). TODO: What to do if split fails and is rolled back and
2393    * parent is revivified?
2394    * <p>
2395    * If a RegionPlan is already set, it will remain.
2396    *
2397    * @param region region to be unassigned
2398    */
2399   public void unassign(HRegionInfo region) {
2400     unassign(region, false);
2401   }
2402 
2403 
2404   /**
2405    * Unassigns the specified region.
2406    * <p>
2407    * Updates the RegionState and sends the CLOSE RPC unless region is being
2408    * split by regionserver; then the unassign fails (silently) because we
2409    * presume the region being unassigned no longer exists (it's been split out
2410    * of existence). TODO: What to do if split fails and is rolled back and
2411    * parent is revivified?
2412    * <p>
2413    * If a RegionPlan is already set, it will remain.
2414    *
2415    * @param region region to be unassigned
2416    * @param force if region should be closed even if already closing
2417    */
2418   public void unassign(HRegionInfo region, boolean force, ServerName dest) {
2419     // TODO: Method needs refactoring.  Ugly buried returns throughout.  Beware!
2420     LOG.debug("Starting unassign of " + region.getRegionNameAsString()
2421       + " (offlining), current state: " + regionStates.getRegionState(region));
2422 
2423     String encodedName = region.getEncodedName();
2424     // Grab the state of this region and synchronize on it
2425     int versionOfClosingNode = -1;
2426     // We need a lock here as we're going to do a put later and we don't want multiple states
2427     //  creation
2428     ReentrantLock lock = locker.acquireLock(encodedName);
2429     RegionState state = regionStates.getRegionTransitionState(encodedName);
2430     boolean reassign = true;
2431     try {
2432       if (state == null) {
2433         // Region is not in transition.
2434         // We can unassign it only if it's not SPLIT/MERGED.
2435         state = regionStates.getRegionState(encodedName);
2436         if (state != null && state.isUnassignable()) {
2437           LOG.info("Attempting to unassign " + state + ", ignored");
2438           // Offline region will be reassigned below
2439           return;
2440         }
2441         // Create the znode in CLOSING state
2442         try {
2443           if (state == null || state.getServerName() == null) {
2444             // We don't know where the region is, offline it.
2445             // No need to send CLOSE RPC
2446             LOG.warn("Attempting to unassign a region not in RegionStates"
2447               + region.getRegionNameAsString() + ", offlined");
2448             regionOffline(region);
2449             return;
2450           }
2451           if (useZKForAssignment) {
2452             versionOfClosingNode = ZKAssign.createNodeClosing(
2453               watcher, region, state.getServerName());
2454             if (versionOfClosingNode == -1) {
2455               LOG.info("Attempting to unassign " +
2456                 region.getRegionNameAsString() + " but ZK closing node "
2457                 + "can't be created.");
2458               reassign = false; // not unassigned at all
2459               return;
2460             }
2461           }
2462         } catch (KeeperException e) {
2463           if (e instanceof NodeExistsException) {
2464             // Handle race between master initiated close and regionserver
2465             // orchestrated splitting. See if existing node is in a
2466             // SPLITTING or SPLIT state.  If so, the regionserver started
2467             // an op on node before we could get our CLOSING in.  Deal.
2468             NodeExistsException nee = (NodeExistsException)e;
2469             String path = nee.getPath();
2470             try {
2471               if (isSplitOrSplittingOrMergedOrMerging(path)) {
2472                 LOG.debug(path + " is SPLIT or SPLITTING or MERGED or MERGING; " +
2473                   "skipping unassign because region no longer exists -- its split or merge");
2474                 reassign = false; // no need to reassign for split/merged region
2475                 return;
2476               }
2477             } catch (KeeperException.NoNodeException ke) {
2478               LOG.warn("Failed getData on SPLITTING/SPLIT at " + path +
2479                 "; presuming split and that the region to unassign, " +
2480                 encodedName + ", no longer exists -- confirm", ke);
2481               return;
2482             } catch (KeeperException ke) {
2483               LOG.error("Unexpected zk state", ke);
2484             } catch (DeserializationException de) {
2485               LOG.error("Failed parse", de);
2486             }
2487           }
2488           // If we get here, we don't understand what's going on -- abort.
2489           server.abort("Unexpected ZK exception creating node CLOSING", e);
2490           reassign = false; // heading out already
2491           return;
2492         }
2493         state = regionStates.updateRegionState(region, State.PENDING_CLOSE);
2494       } else if (state.isFailedOpen()) {
2495         // The region is not open yet
2496         regionOffline(region);
2497         return;
2498       } else if (force && state.isPendingCloseOrClosing()) {
2499         LOG.debug("Attempting to unassign " + region.getRegionNameAsString() +
2500           " which is already " + state.getState()  +
2501           " but forcing to send a CLOSE RPC again ");
2502         if (state.isFailedClose()) {
2503           state = regionStates.updateRegionState(region, State.PENDING_CLOSE);
2504         }
2505         state.updateTimestampToNow();
2506       } else {
2507         LOG.debug("Attempting to unassign " +
2508           region.getRegionNameAsString() + " but it is " +
2509           "already in transition (" + state.getState() + ", force=" + force + ")");
2510         return;
2511       }
2512 
2513       unassign(region, state, versionOfClosingNode, dest, useZKForAssignment, null);
2514     } finally {
2515       lock.unlock();
2516 
2517       // Region is expected to be reassigned afterwards
2518       if (reassign && regionStates.isRegionOffline(region)) {
2519         assign(region, true);
2520       }
2521     }
2522   }
2523 
2524   public void unassign(HRegionInfo region, boolean force){
2525      unassign(region, force, null);
2526   }
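
       // A minimal caller-side sketch of the unassign overloads above. The
       // AssignmentManager reference ("am"), the region ("hri") and the preferred
       // destination ("dest") are assumed to already be in hand:
       //
       //   am.unassign(hri, false);        // plain close; AM re-assigns it if it ends up offline
       //   am.unassign(hri, true);         // force another CLOSE RPC even if already closing
       //   am.unassign(hri, false, dest);  // close, then prefer "dest" for the re-open
       //   am.waitForAssignment(hri);      // block until the region is online again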
2527 
2528   /**
2529    * @param region regioninfo of znode to be deleted.
2530    */
2531   public void deleteClosingOrClosedNode(HRegionInfo region, ServerName sn) {
2532     String encodedName = region.getEncodedName();
2533     deleteNodeInStates(encodedName, "closing", sn, EventType.M_ZK_REGION_CLOSING,
2534       EventType.RS_ZK_REGION_CLOSED);
2535   }
2536 
2537   /**
2538    * @param path znode path to check
2539    * @return True if znode is in SPLIT or SPLITTING or MERGED or MERGING state.
2540    * @throws KeeperException Can happen if the znode went away in the meantime.
2541    * @throws DeserializationException if the znode data cannot be parsed
2542    */
2543   private boolean isSplitOrSplittingOrMergedOrMerging(final String path)
2544       throws KeeperException, DeserializationException {
2545     boolean result = false;
2546     // This may fail if the SPLIT or SPLITTING or MERGED or MERGING znode gets
2547     // cleaned up before we can get data from it.
2548     byte [] data = ZKAssign.getData(watcher, path);
2549     if (data == null) {
2550       LOG.info("Node " + path + " is gone");
2551       return false;
2552     }
2553     RegionTransition rt = RegionTransition.parseFrom(data);
2554     switch (rt.getEventType()) {
2555     case RS_ZK_REQUEST_REGION_SPLIT:
2556     case RS_ZK_REGION_SPLIT:
2557     case RS_ZK_REGION_SPLITTING:
2558     case RS_ZK_REQUEST_REGION_MERGE:
2559     case RS_ZK_REGION_MERGED:
2560     case RS_ZK_REGION_MERGING:
2561       result = true;
2562       break;
2563     default:
2564       LOG.info("Node " + path + " is in " + rt.getEventType());
2565       break;
2566     }
2567     return result;
2568   }
2569 
2570   /**
2571    * Used by unit tests. Return the number of regions opened so far in the life
2572    * of the master. Increases by one every time the master opens a region
2573    * @return the counter value of the number of regions opened so far
2574    */
2575   public int getNumRegionsOpened() {
2576     return numRegionsOpened.get();
2577   }
2578 
2579   /**
2580    * Waits until the specified region has completed assignment.
2581    * <p>
2582    * If the region is already assigned, returns immediately.  Otherwise, method
2583    * blocks until the region is assigned.
2584    * @param regionInfo region to wait on assignment for
2585    * @throws InterruptedException
2586    */
2587   public boolean waitForAssignment(HRegionInfo regionInfo)
2588       throws InterruptedException {
2589     while (!regionStates.isRegionOnline(regionInfo)) {
2590       if (regionStates.isRegionInState(regionInfo, State.FAILED_OPEN)
2591           || this.server.isStopped()) {
2592         return false;
2593       }
2594 
2595       // We should receive a notification, but it's
2596       //  better to have a timeout to recheck the condition here:
2597       //  it lowers the impact of a race condition if any
2598       regionStates.waitForUpdate(100);
2599     }
2600     return true;
2601   }
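
       // waitForAssignment() returns false instead of blocking forever when the region
       // enters FAILED_OPEN or the master is stopping, so callers should check the
       // return value. A small sketch (with "am" and "hri" assumed to be in scope):
       //
       //   if (!am.waitForAssignment(hri)) {
       //     LOG.warn("Region " + hri.getRegionNameAsString() + " was not assigned");
       //   }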
2602 
2603   /**
2604    * Assigns the hbase:meta region.
2605    * <p>
2606    * Assumes that hbase:meta is currently closed and is not being actively served by
2607    * any RegionServer.
2608    * <p>
2609    * Forcibly unsets the current meta region location in ZooKeeper and assigns
2610    * hbase:meta to a random RegionServer.
2611    * @throws KeeperException
2612    */
2613   public void assignMeta() throws KeeperException {
2614     MetaRegionTracker.deleteMetaLocation(this.watcher);
2615     assign(HRegionInfo.FIRST_META_REGIONINFO, true);
2616   }
2617 
2618   /**
2619    * Assigns specified regions retaining assignments, if any.
2620    * <p>
2621    * This is a synchronous call and will return once every region has been
2622    * assigned.  If anything fails, an exception is thrown
2623    * @throws InterruptedException
2624    * @throws IOException
2625    */
2626   public void assign(Map<HRegionInfo, ServerName> regions)
2627         throws IOException, InterruptedException {
2628     if (regions == null || regions.isEmpty()) {
2629       return;
2630     }
2631     List<ServerName> servers = serverManager.createDestinationServersList();
2632     if (servers == null || servers.isEmpty()) {
2633       throw new IOException("Found no destination server to assign region(s)");
2634     }
2635 
2636     // Reuse existing assignment info
2637     Map<ServerName, List<HRegionInfo>> bulkPlan =
2638       balancer.retainAssignment(regions, servers);
2639 
2640     assign(regions.size(), servers.size(),
2641       "retainAssignment=true", bulkPlan);
2642   }
2643 
2644   /**
2645    * Assigns specified regions round robin, if any.
2646    * <p>
2647    * This is a synchronous call and will return once every region has been
2648    * assigned.  If anything fails, an exception is thrown
2649    * @throws InterruptedException
2650    * @throws IOException
2651    */
2652   public void assign(List<HRegionInfo> regions)
2653         throws IOException, InterruptedException {
2654     if (regions == null || regions.isEmpty()) {
2655       return;
2656     }
2657 
2658     List<ServerName> servers = serverManager.createDestinationServersList();
2659     if (servers == null || servers.isEmpty()) {
2660       throw new IOException("Found no destination server to assign region(s)");
2661     }
2662 
2663     // Generate a round-robin bulk assignment plan
2664     Map<ServerName, List<HRegionInfo>> bulkPlan
2665       = balancer.roundRobinAssignment(regions, servers);
2666     processFavoredNodes(regions);
2667 
2668     assign(regions.size(), servers.size(),
2669       "round-robin=true", bulkPlan);
2670   }
2671 
2672   private void assign(int regions, int totalServers,
2673       String message, Map<ServerName, List<HRegionInfo>> bulkPlan)
2674           throws InterruptedException, IOException {
2675 
2676     int servers = bulkPlan.size();
2677     if (servers == 1 || (regions < bulkAssignThresholdRegions
2678         && servers < bulkAssignThresholdServers)) {
2679 
2680       // Don't use bulk assignment.  This can be more efficient in a small
2681       // cluster, especially a mini cluster used for testing, so that tests won't time out
2682       if (LOG.isTraceEnabled()) {
2683         LOG.trace("Not using bulk assignment since we are assigning only " + regions +
2684           " region(s) to " + servers + " server(s)");
2685       }
2686       for (Map.Entry<ServerName, List<HRegionInfo>> plan: bulkPlan.entrySet()) {
2687         if (!assign(plan.getKey(), plan.getValue())) {
2688           for (HRegionInfo region: plan.getValue()) {
2689             if (!regionStates.isRegionOnline(region)) {
2690               invokeAssign(region);
2691             }
2692           }
2693         }
2694       }
2695     } else {
2696       LOG.info("Bulk assigning " + regions + " region(s) across "
2697         + totalServers + " server(s), " + message);
2698 
2699       // Use a fixed-count thread pool for bulk assigning.
2700       BulkAssigner ba = new GeneralBulkAssigner(
2701         this.server, bulkPlan, this, bulkAssignWaitTillAllAssigned);
2702       ba.bulkAssign();
2703       LOG.info("Bulk assigning done");
2704     }
2705   }
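
       // The choice between the two branches above reduces to this predicate;
       // bulkAssignThresholdRegions and bulkAssignThresholdServers are fields read
       // from configuration elsewhere in this class (their key names are not shown
       // in this excerpt):
       //
       //   boolean useBulk = servers > 1
       //       && (regions >= bulkAssignThresholdRegions
       //           || servers >= bulkAssignThresholdServers);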
2706 
2707   /**
2708    * Assigns all user regions, if any exist.  Used during cluster startup.
2709    * <p>
2710    * This is a synchronous call and will return once every region has been
2711    * assigned.  If anything fails, an exception is thrown and the cluster
2712    * should be shutdown.
2713    * @throws InterruptedException
2714    * @throws IOException
2715    * @throws KeeperException
2716    */
2717   private void assignAllUserRegions(Set<TableName> disabledOrDisablingOrEnabling)
2718       throws IOException, InterruptedException, KeeperException {
2719     // Skip assignment for regions of tables in DISABLING state because, during a clean
2720     // cluster startup, no RS is alive and the regions map has no information about them either.
2721     // See HBASE-6281.
2722     // Scan hbase:meta for all user regions, skipping any disabled tables
2723     Map<HRegionInfo, ServerName> allRegions;
2724     SnapshotOfRegionAssignmentFromMeta snapshotOfRegionAssignment =
2725        new SnapshotOfRegionAssignmentFromMeta(catalogTracker, disabledOrDisablingOrEnabling, true);
2726     snapshotOfRegionAssignment.initialize();
2727     allRegions = snapshotOfRegionAssignment.getRegionToRegionServerMap();
2728     if (allRegions == null || allRegions.isEmpty()) {
2729       return;
2730     }
2731 
2732     // Determine what type of assignment to do on startup
2733     boolean retainAssignment = server.getConfiguration().
2734       getBoolean("hbase.master.startup.retainassign", true);
2735 
2736     if (retainAssignment) {
2737       assign(allRegions);
2738     } else {
2739       List<HRegionInfo> regions = new ArrayList<HRegionInfo>(allRegions.keySet());
2740       assign(regions);
2741     }
2742 
2743     for (HRegionInfo hri : allRegions.keySet()) {
2744       TableName tableName = hri.getTable();
2745       if (!zkTable.isEnabledTable(tableName)) {
2746         setEnabledTable(tableName);
2747       }
2748     }
2749   }
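
       // Whether startup keeps the previous region locations is controlled by the
       // "hbase.master.startup.retainassign" key read above (default true). A sketch
       // of flipping it, e.g. from test setup code, via the Hadoop Configuration API:
       //
       //   Configuration conf = server.getConfiguration();
       //   conf.setBoolean("hbase.master.startup.retainassign", false);  // round-robin on startup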
2750 
2751   /**
2752    * Wait until no regions in transition.
2753    * @param timeout How long to wait.
2754    * @return True if nothing in regions in transition.
2755    * @throws InterruptedException
2756    */
2757   boolean waitUntilNoRegionsInTransition(final long timeout)
2758       throws InterruptedException {
2759     // Blocks until there are no regions in transition. It is possible that
2760     // regions are in transition again immediately after this returns, but it
2761     // guarantees that, if it returns without an exception, there was a period
2762     // of time with no regions in transition from the point of view of the
2763     // in-memory state of the Master. The timeout bounds how long we wait for
2764     // such a quiet period.
2765     final long endTime = System.currentTimeMillis() + timeout;
2766 
2767     while (!this.server.isStopped() && regionStates.isRegionsInTransition()
2768         && endTime > System.currentTimeMillis()) {
2769       regionStates.waitForUpdate(100);
2770     }
2771 
2772     return !regionStates.isRegionsInTransition();
2773   }
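
       // A sketch of bounding the wait for a quiet assignment state ("am" assumed):
       //
       //   if (!am.waitUntilNoRegionsInTransition(60 * 1000)) {
       //     LOG.warn("Regions still in transition after waiting 60 seconds");
       //   }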
2774 
2775   /**
2776    * Rebuild the list of user regions and assignment information.
2777    * <p>
2778    * Returns a map of servers that are not found to be online and the regions
2779    * they were hosting.
2780    * @return map of servers not online to their assigned regions, as stored
2781    *         in META
2782    * @throws IOException
2783    */
2784   Map<ServerName, List<HRegionInfo>> rebuildUserRegions() throws IOException, KeeperException {
2785     Set<TableName> enablingTables = ZKTable.getEnablingTables(watcher);
2786     Set<TableName> disabledOrEnablingTables = ZKTable.getDisabledTables(watcher);
2787     disabledOrEnablingTables.addAll(enablingTables);
2788     Set<TableName> disabledOrDisablingOrEnabling = ZKTable.getDisablingTables(watcher);
2789     disabledOrDisablingOrEnabling.addAll(disabledOrEnablingTables);
2790 
2791     // Region assignment from META
2792     List<Result> results = MetaReader.fullScan(this.catalogTracker);
2793     // Get any new but slow-to-check-in region servers that joined the cluster
2794     Set<ServerName> onlineServers = serverManager.getOnlineServers().keySet();
2795     // Map of offline servers and their regions to be returned
2796     Map<ServerName, List<HRegionInfo>> offlineServers =
2797       new TreeMap<ServerName, List<HRegionInfo>>();
2798     // Iterate regions in META
2799     for (Result result : results) {
2800       HRegionInfo regionInfo = HRegionInfo.getHRegionInfo(result);
2801       if (regionInfo == null) continue;
2802       State state = RegionStateStore.getRegionState(result);
2803       ServerName regionLocation = RegionStateStore.getRegionServer(result);
2804       regionStates.createRegionState(regionInfo, state, regionLocation);
2805       if (!regionStates.isRegionInState(regionInfo, State.OPEN)) {
2806         // Region is not open (either offline or in transition), skip
2807         continue;
2808       }
2809       TableName tableName = regionInfo.getTable();
2810       if (!onlineServers.contains(regionLocation)) {
2811         // Region is located on a server that isn't online
2812         List<HRegionInfo> offlineRegions = offlineServers.get(regionLocation);
2813         if (offlineRegions == null) {
2814           offlineRegions = new ArrayList<HRegionInfo>(1);
2815           offlineServers.put(regionLocation, offlineRegions);
2816         }
2817         if (useZKForAssignment) {
2818           regionStates.regionOffline(regionInfo);
2819         }
2820         offlineRegions.add(regionInfo);
2821       } else if (!disabledOrEnablingTables.contains(tableName)) {
2822         // Region is being served and on an active server
2823         // add only if region not in disabled or enabling table
2824 
2825         regionStates.updateRegionState(regionInfo, State.OPEN, regionLocation);
2826         regionStates.regionOnline(regionInfo, regionLocation);
2827         balancer.regionOnline(regionInfo, regionLocation);
2828       } else if (useZKForAssignment) {
2829         regionStates.regionOffline(regionInfo);
2830       }
2831       // need to enable the table if not disabled or disabling or enabling
2832       // this will be used in rolling restarts
2833       if (!disabledOrDisablingOrEnabling.contains(tableName)
2834           && !getZKTable().isEnabledTable(tableName)) {
2835         setEnabledTable(tableName);
2836       }
2837 
2838     }
2839     return offlineServers;
2840   }
2841 
2842   /**
2843    * Recover the tables that were not fully moved to DISABLED state. These
2844    * tables are in DISABLING state when the master restarted/switched.
2845    *
2846    * @throws KeeperException
2847    * @throws TableNotFoundException
2848    * @throws IOException
2849    */
2850   private void recoverTableInDisablingState()
2851       throws KeeperException, TableNotFoundException, IOException {
2852     Set<TableName> disablingTables = ZKTable.getDisablingTables(watcher);
2853     if (disablingTables.size() != 0) {
2854       for (TableName tableName : disablingTables) {
2855         // Recover by calling DisableTableHandler
2856         LOG.info("The table " + tableName
2857             + " is in DISABLING state.  Hence recovering by moving the table"
2858             + " to DISABLED state.");
2859         new DisableTableHandler(this.server, tableName, catalogTracker,
2860             this, tableLockManager, true).prepare().process();
2861       }
2862     }
2863   }
2864 
2865   /**
2866    * Recover the tables that are not fully moved to ENABLED state. These tables
2867    * are in ENABLING state when the master restarted/switched
2868    *
2869    * @throws KeeperException
2870    * @throws org.apache.hadoop.hbase.TableNotFoundException
2871    * @throws IOException
2872    */
2873   private void recoverTableInEnablingState()
2874       throws KeeperException, TableNotFoundException, IOException {
2875     Set<TableName> enablingTables = ZKTable.getEnablingTables(watcher);
2876     if (enablingTables.size() != 0) {
2877       for (TableName tableName : enablingTables) {
2878         // Recover by calling EnableTableHandler
2879         LOG.info("The table " + tableName
2880             + " is in ENABLING state.  Hence recovering by moving the table"
2881             + " to ENABLED state.");
2882         // enableTable in sync way during master startup,
2883         // no need to invoke coprocessor
2884         EnableTableHandler eth = new EnableTableHandler(this.server, tableName,
2885           catalogTracker, this, tableLockManager, true);
2886         try {
2887           eth.prepare();
2888         } catch (TableNotFoundException e) {
2889           LOG.warn("Table " + tableName + " not found in hbase:meta to recover.");
2890           continue;
2891         }
2892         eth.process();
2893       }
2894     }
2895   }
2896 
2897   /**
2898    * Processes the list of dead servers from the result of the hbase:meta scan, and regions in RIT.
2899    * <p>
2900    * This is used for failover to recover the lost regions that belonged to
2901    * RegionServers which failed while there was no active master or regions
2902    * that were in RIT.
2903    * <p>
2904    *
2905    *
2906    * @param deadServers
2907    *          The list of dead servers which failed while there was no active
2908    *          master. Can be null.
2909    * @throws IOException
2910    * @throws KeeperException
2911    */
2912   private void processDeadServersAndRecoverLostRegions(
2913       Map<ServerName, List<HRegionInfo>> deadServers)
2914           throws IOException, KeeperException {
2915     if (deadServers != null) {
2916       for (Map.Entry<ServerName, List<HRegionInfo>> server: deadServers.entrySet()) {
2917         ServerName serverName = server.getKey();
2918         // We need to keep such info even if the server is known dead
2919         regionStates.setLastRegionServerOfRegions(serverName, server.getValue());
2920         if (!serverManager.isServerDead(serverName)) {
2921           serverManager.expireServer(serverName); // Let SSH do region re-assign
2922         }
2923       }
2924     }
2925 
2926     List<String> nodes = useZKForAssignment ?
2927       ZKUtil.listChildrenAndWatchForNewChildren(watcher, watcher.assignmentZNode)
2928       : ZKUtil.listChildrenNoWatch(watcher, watcher.assignmentZNode);
2929     if (nodes != null && !nodes.isEmpty()) {
2930       for (String encodedRegionName : nodes) {
2931         processRegionInTransition(encodedRegionName, null);
2932       }
2933     } else if (!useZKForAssignment) {
2934       // We need to send RPC call again for PENDING_OPEN/PENDING_CLOSE regions
2935       // in case the RPC call is not sent out yet before the master was shut down
2936       // since we update the state before we send the RPC call. We can't update
2937       // the state after the RPC call. Otherwise, we don't know what's happened
2938       // to the region if the master dies right after the RPC call is out.
2939       Map<String, RegionState> rits = regionStates.getRegionsInTransition();
2940       for (RegionState regionState: rits.values()) {
2941         if (!serverManager.isServerOnline(regionState.getServerName())) {
2942           continue; // SSH will handle it
2943         }
2944         State state = regionState.getState();
2945         LOG.info("Processing " + regionState);
2946         switch (state) {
2947         case CLOSED:
2948           invokeAssign(regionState.getRegion());
2949           break;
2950         case PENDING_OPEN:
2951           retrySendRegionOpen(regionState);
2952           break;
2953         case PENDING_CLOSE:
2954           retrySendRegionClose(regionState);
2955           break;
2956         default:
2957           // No process for other states
2958         }
2959       }
2960     }
2961   }
2962 
2963   /**
2964    * At master failover, for a pending_open region, make sure the
2965    * sendRegionOpen RPC call is sent to the target regionserver.
2966    */
2967   private void retrySendRegionOpen(final RegionState regionState) {
2968     this.executorService.submit(
2969       new EventHandler(server, EventType.M_MASTER_RECOVERY) {
2970         @Override
2971         public void process() throws IOException {
2972           HRegionInfo hri = regionState.getRegion();
2973           ServerName serverName = regionState.getServerName();
2974           ReentrantLock lock = locker.acquireLock(hri.getEncodedName());
2975           try {
2976             while (serverManager.isServerOnline(serverName)
2977                 && !server.isStopped() && !server.isAborted()) {
2978               try {
2979                 List<ServerName> favoredNodes = ServerName.EMPTY_SERVER_LIST;
2980                 if (shouldAssignRegionsWithFavoredNodes) {
2981                   favoredNodes = ((FavoredNodeLoadBalancer)balancer).getFavoredNodes(hri);
2982                 }
2983                 RegionOpeningState regionOpenState = serverManager.sendRegionOpen(
2984                   serverName, hri, -1, favoredNodes);
2985 
2986                 if (regionOpenState == RegionOpeningState.FAILED_OPENING) {
2987                   // Failed opening this region, this means the target server didn't get
2988                   // the original region open RPC, so re-assign it with a new plan
2989                   LOG.debug("Got failed_opening in retry sendRegionOpen for "
2990                     + regionState + ", re-assign it");
2991                   invokeAssign(hri, true);
2992                 }
2993                 return; // Done.
2994               } catch (Throwable t) {
2995                 if (t instanceof RemoteException) {
2996                   t = ((RemoteException) t).unwrapRemoteException();
2997                 }
2998                 // In case SocketTimeoutException/FailedServerException, we will retry
2999                 if (t instanceof java.net.SocketTimeoutException
3000                     || t instanceof FailedServerException) {
3001                   Threads.sleep(100);
3002                   continue;
3003                 }
3004                 // For other exceptions, re-assign it
3005                 LOG.debug("Got exception in retry sendRegionOpen for "
3006                   + regionState + ", re-assign it", t);
3007                 invokeAssign(hri);
3008                 return; // Done.
3009               }
3010             }
3011           } finally {
3012             lock.unlock();
3013           }
3014         }
3015       });
3016   }
3017 
3018   /**
3019    * At master failover, for a pending_close region, make sure the
3020    * sendRegionClose RPC call is sent to the target regionserver.
3021    */
3022   private void retrySendRegionClose(final RegionState regionState) {
3023     this.executorService.submit(
3024       new EventHandler(server, EventType.M_MASTER_RECOVERY) {
3025         @Override
3026         public void process() throws IOException {
3027           HRegionInfo hri = regionState.getRegion();
3028           ServerName serverName = regionState.getServerName();
3029           ReentrantLock lock = locker.acquireLock(hri.getEncodedName());
3030           try {
3031             while (serverManager.isServerOnline(serverName)
3032                 && !server.isStopped() && !server.isAborted()) {
3033               try {
3034                 if (!serverManager.sendRegionClose(serverName, hri, -1, null, false)) {
3035                   // This means the region is still on the target server
3036                   LOG.debug("Got false in retry sendRegionClose for "
3037                     + regionState + ", re-close it");
3038                   invokeUnAssign(hri);
3039                 }
3040                 return; // Done.
3041               } catch (Throwable t) {
3042                 if (t instanceof RemoteException) {
3043                   t = ((RemoteException) t).unwrapRemoteException();
3044                 }
3045                 // In case SocketTimeoutException/FailedServerException, we will retry
3046                 if (t instanceof java.net.SocketTimeoutException
3047                     || t instanceof FailedServerException) {
3048                   Threads.sleep(100);
3049                   continue;
3050                 }
3051                 if (!(t instanceof NotServingRegionException
3052                     || t instanceof RegionAlreadyInTransitionException)) {
3053                   // NotServingRegionException/RegionAlreadyInTransitionException
3054                   // means the target server got the original region close request.
3055                   // For other exceptions, re-close it
3056                   LOG.debug("Got exception in retry sendRegionClose for "
3057                     + regionState + ", re-close it", t);
3058                   invokeUnAssign(hri);
3059                 }
3060                 return; // Done.
3061               }
3062             }
3063           } finally {
3064             lock.unlock();
3065           }
3066         }
3067       });
3068   }
3069 
3070   /**
3071    * Set regions-in-transition metrics.
3072    * This takes an iterator on the RegionInTransition map (CLSM), and is not synchronized.
3073    * This iterator is not fail-fast, which may lead to stale reads; but that's better than
3074    * creating a copy of the map for metrics computation, as this method is invoked
3075    * at a frequent interval.
3076    */
3077   public void updateRegionsInTransitionMetrics() {
3078     long currentTime = System.currentTimeMillis();
3079     int totalRITs = 0;
3080     int totalRITsOverThreshold = 0;
3081     long oldestRITTime = 0;
3082     int ritThreshold = this.server.getConfiguration().
3083       getInt(HConstants.METRICS_RIT_STUCK_WARNING_THRESHOLD, 60000);
3084     for (RegionState state: regionStates.getRegionsInTransition().values()) {
3085       totalRITs++;
3086       long ritTime = currentTime - state.getStamp();
3087       if (ritTime > ritThreshold) { // more than the threshold
3088         totalRITsOverThreshold++;
3089       }
3090       if (oldestRITTime < ritTime) {
3091         oldestRITTime = ritTime;
3092       }
3093     }
3094     if (this.metricsAssignmentManager != null) {
3095       this.metricsAssignmentManager.updateRITOldestAge(oldestRITTime);
3096       this.metricsAssignmentManager.updateRITCount(totalRITs);
3097       this.metricsAssignmentManager.updateRITCountOverThreshold(totalRITsOverThreshold);
3098     }
3099   }
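
       // The "stuck" threshold above defaults to 60000 ms and is read through
       // HConstants.METRICS_RIT_STUCK_WARNING_THRESHOLD, so it can be tuned via the
       // same key; a sketch of the programmatic form using the Hadoop Configuration API:
       //
       //   Configuration conf = server.getConfiguration();
       //   conf.setInt(HConstants.METRICS_RIT_STUCK_WARNING_THRESHOLD, 120000);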
3100 
3101   /**
3102    * @param region Region whose plan we are to clear.
3103    */
3104   void clearRegionPlan(final HRegionInfo region) {
3105     synchronized (this.regionPlans) {
3106       this.regionPlans.remove(region.getEncodedName());
3107     }
3108   }
3109 
3110   /**
3111    * Wait on region to clear regions-in-transition.
3112    * @param hri Region to wait on.
3113    * @throws IOException
3114    */
3115   public void waitOnRegionToClearRegionsInTransition(final HRegionInfo hri)
3116       throws IOException, InterruptedException {
3117     waitOnRegionToClearRegionsInTransition(hri, -1L);
3118   }
3119 
3120   /**
3121    * Wait on region to clear regions-in-transition or time out
3122    * @param hri
3123    * @param timeOut Milliseconds to wait for current region to be out of transition state.
3124    * @return True when a region clears regions-in-transition before timeout otherwise false
3125    * @throws InterruptedException
3126    */
3127   public boolean waitOnRegionToClearRegionsInTransition(final HRegionInfo hri, long timeOut)
3128       throws InterruptedException {
3129     if (!regionStates.isRegionInTransition(hri)) return true;
3130     long end = (timeOut <= 0) ? Long.MAX_VALUE : EnvironmentEdgeManager.currentTimeMillis()
3131         + timeOut;
3132     // There is already a timeout monitor on regions in transition, so we
3133     // should not need another one here.
3134     LOG.info("Waiting for " + hri.getEncodedName() +
3135         " to leave regions-in-transition, timeOut=" + timeOut + " ms.");
3136     while (!this.server.isStopped() && regionStates.isRegionInTransition(hri)) {
3137       regionStates.waitForUpdate(100);
3138       if (EnvironmentEdgeManager.currentTimeMillis() > end) {
3139         LOG.info("Timed out on waiting for " + hri.getEncodedName() + " to be assigned.");
3140         return false;
3141       }
3142     }
3143     if (this.server.isStopped()) {
3144       LOG.info("Giving up wait on regions in transition because stoppable.isStopped is set");
3145       return false;
3146     }
3147     return true;
3148   }
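
       // A sketch of the timed variant above ("am" and "hri" assumed); a timeOut <= 0
       // means wait indefinitely, until the region clears or the master stops:
       //
       //   if (!am.waitOnRegionToClearRegionsInTransition(hri, 30 * 1000)) {
       //     throw new IOException(hri.getEncodedName() + " still in transition after 30s");
       //   }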
3149 
3150   /**
3151    * Update timers for all regions in transition hosted on a server listed in
3152    * serversInUpdatingTimer.
3153    */
3154   public class TimerUpdater extends Chore {
3155 
3156     public TimerUpdater(final int period, final Stoppable stopper) {
3157       super("AssignmentTimerUpdater", period, stopper);
3158     }
3159 
3160     @Override
3161     protected void chore() {
3162       Preconditions.checkState(tomActivated);
3163       ServerName serverToUpdateTimer = null;
3164       while (!serversInUpdatingTimer.isEmpty() && !stopper.isStopped()) {
3165         if (serverToUpdateTimer == null) {
3166           serverToUpdateTimer = serversInUpdatingTimer.first();
3167         } else {
3168           serverToUpdateTimer = serversInUpdatingTimer
3169               .higher(serverToUpdateTimer);
3170         }
3171         if (serverToUpdateTimer == null) {
3172           break;
3173         }
3174         updateTimers(serverToUpdateTimer);
3175         serversInUpdatingTimer.remove(serverToUpdateTimer);
3176       }
3177     }
3178   }
3179 
3180   /**
3181    * Monitor to check for time outs on region transition operations
3182    */
3183   public class TimeoutMonitor extends Chore {
3184     private boolean allRegionServersOffline = false;
3185     private ServerManager serverManager;
3186     private final int timeout;
3187 
3188     /**
3189      * Creates a periodic monitor to check for time outs on region transition
3190      * operations.  This will deal with retries if for some reason something
3191      * doesn't happen within the specified timeout.
3192      * @param period how often the monitor runs, in milliseconds
3193      * @param stopper when {@link Stoppable#isStopped()} is true, this thread will
3194      *          clean up and exit cleanly
3195      * @param timeout how long a region may remain in transition before the monitor acts on it
3196      */
3197     public TimeoutMonitor(final int period, final Stoppable stopper,
3198         ServerManager serverManager,
3199         final int timeout) {
3200       super("AssignmentTimeoutMonitor", period, stopper);
3201       this.timeout = timeout;
3202       this.serverManager = serverManager;
3203     }
3204 
3205     private synchronized void setAllRegionServersOffline(
3206       boolean allRegionServersOffline) {
3207       this.allRegionServersOffline = allRegionServersOffline;
3208     }
3209 
3210     @Override
3211     protected void chore() {
3212       Preconditions.checkState(tomActivated);
3213       boolean noRSAvailable = this.serverManager.createDestinationServersList().isEmpty();
3214 
3215       // Iterate all regions in transition checking for time outs
3216       long now = System.currentTimeMillis();
3217       // No lock; concurrent access is ok: we will be working on a copy, and it's
3218       //  valid in Java to take a copy while another thread is adding/removing items
3219       for (String regionName : regionStates.getRegionsInTransition().keySet()) {
3220         RegionState regionState = regionStates.getRegionTransitionState(regionName);
3221         if (regionState == null) continue;
3222 
3223         if (regionState.getStamp() + timeout <= now) {
3224           // decide on action upon timeout
3225           actOnTimeOut(regionState);
3226         } else if (this.allRegionServersOffline && !noRSAvailable) {
3227           RegionPlan existingPlan = regionPlans.get(regionName);
3228           if (existingPlan == null
3229               || !this.serverManager.isServerOnline(existingPlan
3230                   .getDestination())) {
3231             // if some RSs just came back online, we can start the assignment
3232             // right away
3233             actOnTimeOut(regionState);
3234           }
3235         }
3236       }
3237       setAllRegionServersOffline(noRSAvailable);
3238     }
3239 
3240     private void actOnTimeOut(RegionState regionState) {
3241       HRegionInfo regionInfo = regionState.getRegion();
3242       LOG.info("Regions in transition timed out:  " + regionState);
3243       // Expired! Do a retry.
3244       switch (regionState.getState()) {
3245       case CLOSED:
3246         LOG.info("Region " + regionInfo.getEncodedName()
3247             + " has been CLOSED for too long, waiting on queued "
3248             + "ClosedRegionHandler to run or server shutdown");
3249         // Update our timestamp.
3250         regionState.updateTimestampToNow();
3251         break;
3252       case OFFLINE:
3253         LOG.info("Region has been OFFLINE for too long, " + "reassigning "
3254             + regionInfo.getRegionNameAsString() + " to a random server");
3255         invokeAssign(regionInfo);
3256         break;
3257       case PENDING_OPEN:
3258         LOG.info("Region has been PENDING_OPEN for too "
3259             + "long, reassigning region=" + regionInfo.getRegionNameAsString());
3260         invokeAssign(regionInfo);
3261         break;
3262       case OPENING:
3263         processOpeningState(regionInfo);
3264         break;
3265       case OPEN:
3266         LOG.error("Region has been OPEN for too long, " +
3267             "we don't know where region was opened so can't do anything");
3268         regionState.updateTimestampToNow();
3269         break;
3270 
3271       case PENDING_CLOSE:
3272         LOG.info("Region has been PENDING_CLOSE for too "
3273             + "long, running forced unassign again on region="
3274             + regionInfo.getRegionNameAsString());
3275         invokeUnassign(regionInfo);
3276         break;
3277       case CLOSING:
3278         LOG.info("Region has been CLOSING for too " +
3279           "long, this should eventually complete or the server will " +
3280           "expire, send RPC again");
3281         invokeUnassign(regionInfo);
3282         break;
3283 
3284       case SPLIT:
3285       case SPLITTING:
3286       case FAILED_OPEN:
3287       case FAILED_CLOSE:
3288       case MERGING:
3289         break;
3290 
3291       default:
3292         throw new IllegalStateException("Received event is not valid.");
3293       }
3294     }
3295   }
3296 
3297   private void processOpeningState(HRegionInfo regionInfo) {
3298     LOG.info("Region has been OPENING for too long, reassigning region="
3299         + regionInfo.getRegionNameAsString());
3300     // Should have a ZK node in OPENING state
3301     try {
3302       String node = ZKAssign.getNodeName(watcher, regionInfo.getEncodedName());
3303       Stat stat = new Stat();
3304       byte [] data = ZKAssign.getDataNoWatch(watcher, node, stat);
3305       if (data == null) {
3306         LOG.warn("Data is null, node " + node + " no longer exists");
3307         return;
3308       }
3309       RegionTransition rt = RegionTransition.parseFrom(data);
3310       EventType et = rt.getEventType();
3311       if (et == EventType.RS_ZK_REGION_OPENED) {
3312         LOG.debug("Region has transitioned to OPENED, allowing "
3313             + "watched event handlers to process");
3314         return;
3315       } else if (et != EventType.RS_ZK_REGION_OPENING && et != EventType.RS_ZK_REGION_FAILED_OPEN ) {
3316         LOG.warn("While timing out a region, found ZK node in unexpected state: " + et);
3317         return;
3318       }
3319       invokeAssign(regionInfo);
3320     } catch (KeeperException ke) {
3321       LOG.error("Unexpected ZK exception timing out OPENING region", ke);
3322     } catch (DeserializationException e) {
3323       LOG.error("Unexpected exception parsing OPENING region", e);
3324     }
3325   }
3326 
3327   void invokeAssign(HRegionInfo regionInfo) {
3328     invokeAssign(regionInfo, true);
3329   }
3330 
3331   void invokeAssign(HRegionInfo regionInfo, boolean newPlan) {
3332     threadPoolExecutorService.submit(new AssignCallable(this, regionInfo, newPlan));
3333   }
3334 
3335   void invokeUnAssign(HRegionInfo regionInfo) {
3336     threadPoolExecutorService.submit(new UnAssignCallable(this, regionInfo));
3337   }
3338 
3339   private void invokeUnassign(HRegionInfo regionInfo) {
3340     threadPoolExecutorService.submit(new UnAssignCallable(this, regionInfo));
3341   }
3342 
3343   public boolean isCarryingMeta(ServerName serverName) {
3344     return isCarryingRegion(serverName, HRegionInfo.FIRST_META_REGIONINFO);
3345   }
3346 
3347   /**
3348    * Check if the shutdown server carries the specific region.
3349    * We have a bunch of places that store the region location, and those
3350    * values aren't always consistent because notification is delayed.
3351    * The location from the zookeeper unassigned node has the most recent data,
3352    * but the node could be deleted after the region is opened by the AM.
3353    * The AM's info could be stale if OpenedRegionHandler processing
3354    * hasn't finished yet when the server shutdown occurs.
3355    * @return whether the serverName currently hosts the region
3356    */
3357   private boolean isCarryingRegion(ServerName serverName, HRegionInfo hri) {
3358     RegionTransition rt = null;
3359     try {
3360       byte [] data = ZKAssign.getData(watcher, hri.getEncodedName());
3361       // This call can legitimately return null
3362       rt = data == null? null: RegionTransition.parseFrom(data);
3363     } catch (KeeperException e) {
3364       server.abort("Exception reading unassigned node for region=" + hri.getEncodedName(), e);
3365     } catch (DeserializationException e) {
3366       server.abort("Exception parsing unassigned node for region=" + hri.getEncodedName(), e);
3367     }
3368 
3369     ServerName addressFromZK = rt != null? rt.getServerName():  null;
3370     if (addressFromZK != null) {
3371       // if we get something from ZK, we will use the data
3372       boolean matchZK = addressFromZK.equals(serverName);
3373       LOG.debug("Checking region=" + hri.getRegionNameAsString() + ", zk server=" + addressFromZK +
3374         " current=" + serverName + ", matches=" + matchZK);
3375       return matchZK;
3376     }
3377 
3378     ServerName addressFromAM = regionStates.getRegionServerOfRegion(hri);
3379     boolean matchAM = (addressFromAM != null &&
3380       addressFromAM.equals(serverName));
3381     LOG.debug("based on AM, current region=" + hri.getRegionNameAsString() +
3382       " is on server=" + (addressFromAM != null ? addressFromAM : "null") +
3383       " server being checked: " + serverName);
3384 
3385     return matchAM;
3386   }
3387 
3388   /**
3389    * Process shutdown server removing any assignments.
3390    * @param sn Server that went down.
3391    * @return list of regions in transition on this server
3392    */
3393   public List<HRegionInfo> processServerShutdown(final ServerName sn) {
3394     // Clean out any existing assignment plans for this server
3395     synchronized (this.regionPlans) {
3396       for (Iterator <Map.Entry<String, RegionPlan>> i =
3397           this.regionPlans.entrySet().iterator(); i.hasNext();) {
3398         Map.Entry<String, RegionPlan> e = i.next();
3399         ServerName otherSn = e.getValue().getDestination();
3400         // The name will be null if the region is planned for a random assign.
3401         if (otherSn != null && otherSn.equals(sn)) {
3402           // Use iterator's remove else we'll get CME
3403           i.remove();
3404         }
3405       }
3406     }
3407     List<HRegionInfo> regions = regionStates.serverOffline(watcher, sn);
3408     for (Iterator<HRegionInfo> it = regions.iterator(); it.hasNext(); ) {
3409       HRegionInfo hri = it.next();
3410       String encodedName = hri.getEncodedName();
3411 
3412       // We need a lock on the region as we could update it
3413       Lock lock = locker.acquireLock(encodedName);
3414       try {
3415         RegionState regionState =
3416           regionStates.getRegionTransitionState(encodedName);
3417         if (regionState == null
3418             || (regionState.getServerName() != null && !regionState.isOnServer(sn))
3419             || !(regionState.isFailedClose() || regionState.isOffline()
3420               || regionState.isPendingOpenOrOpening())) {
3421           LOG.info("Skip " + regionState + " since it is not opening/failed_close"
3422             + " on the dead server any more: " + sn);
3423           it.remove();
3424         } else {
3425           try {
3426             // Delete the ZNode if exists
3427             ZKAssign.deleteNodeFailSilent(watcher, hri);
3428           } catch (KeeperException ke) {
3429             server.abort("Unexpected ZK exception deleting node " + hri, ke);
3430           }
3431           if (zkTable.isDisablingOrDisabledTable(hri.getTable())) {
3432             regionStates.regionOffline(hri);
3433             it.remove();
3434             continue;
3435           }
3436           // Mark the region offline and assign it again by SSH
3437           regionStates.updateRegionState(hri, State.OFFLINE);
3438         }
3439       } finally {
3440         lock.unlock();
3441       }
3442     }
3443     return regions;
3444   }
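
       // A sketch of how a shutdown handler might drive the methods above ("am" and
       // "deadServer" assumed to be in hand):
       //
       //   if (am.isCarryingMeta(deadServer)) {
       //     // re-assign hbase:meta before touching user regions
       //   }
       //   List<HRegionInfo> rits = am.processServerShutdown(deadServer);
       //   // "rits" holds the regions that were in transition on the dead server
       //   // and still need to be re-assigned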
3445 
3446   /**
3447    * @param plan Plan to execute.
3448    */
3449   public void balance(final RegionPlan plan) {
3450     HRegionInfo hri = plan.getRegionInfo();
3451     TableName tableName = hri.getTable();
3452     if (zkTable.isDisablingOrDisabledTable(tableName)) {
3453       LOG.info("Ignored moving region of disabling/disabled table "
3454         + tableName);
3455       return;
3456     }
3457 
3458     // Move the region only if it's assigned
3459     String encodedName = hri.getEncodedName();
3460     ReentrantLock lock = locker.acquireLock(encodedName);
3461     try {
3462       if (!regionStates.isRegionOnline(hri)) {
3463         RegionState state = regionStates.getRegionState(encodedName);
3464         LOG.info("Ignored moving region not assigned: " + hri + ", "
3465           + (state == null ? "not in region states" : state));
3466         return;
3467       }
3468       synchronized (this.regionPlans) {
3469         this.regionPlans.put(plan.getRegionName(), plan);
3470       }
3471       unassign(hri, false, plan.getDestination());
3472     } finally {
3473       lock.unlock();
3474     }
3475   }
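
       // A sketch of a caller-driven move through balance() ("am", "hri", "src" and
       // "dest" assumed; the (region, source, destination) RegionPlan constructor is
       // an assumption, not shown in this excerpt):
       //
       //   RegionPlan plan = new RegionPlan(hri, src, dest);
       //   am.balance(plan);
       //   am.waitOnRegionToClearRegionsInTransition(hri, 30 * 1000);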
3476 
3477   public void stop() {
3478     shutdown(); // Stop executor service, etc
3479     if (tomActivated){
3480       this.timeoutMonitor.interrupt();
3481       this.timerUpdater.interrupt();
3482     }
3483   }
3484 
3485   /**
3486    * Shutdown the threadpool executor service
3487    */
3488   public void shutdown() {
3489     // It's an immediate shutdown, so we're clearing the remaining tasks.
3490     synchronized (zkEventWorkerWaitingList){
3491       zkEventWorkerWaitingList.clear();
3492     }
3493     threadPoolExecutorService.shutdownNow();
3494     zkEventWorkers.shutdownNow();
3495     regionStateStore.stop();
3496   }
3497 
3498   protected void setEnabledTable(TableName tableName) {
3499     try {
3500       this.zkTable.setEnabledTable(tableName);
3501     } catch (KeeperException e) {
3502       // here we can abort as it is the start up flow
3503       String errorMsg = "Unable to ensure that the table " + tableName
3504           + " will be enabled because of a ZooKeeper issue";
3505       LOG.error(errorMsg);
3506       this.server.abort(errorMsg, e);
3507     }
3508   }
3509 
3510   /**
3511    * Set region as OFFLINED up in zookeeper asynchronously.
3512    * @param state
3513    * @return True if we succeeded, false otherwise (State was incorrect or failed
3514    * updating zk).
3515    */
3516   private boolean asyncSetOfflineInZooKeeper(final RegionState state,
3517       final AsyncCallback.StringCallback cb, final ServerName destination) {
3518     if (!state.isClosed() && !state.isOffline()) {
3519       this.server.abort("Unexpected state trying to OFFLINE; " + state,
3520         new IllegalStateException());
3521       return false;
3522     }
3523     regionStates.updateRegionState(state.getRegion(), State.OFFLINE);
3524     try {
3525       ZKAssign.asyncCreateNodeOffline(watcher, state.getRegion(),
3526         destination, cb, state);
3527     } catch (KeeperException e) {
3528       if (e instanceof NodeExistsException) {
3529         LOG.warn("Node for " + state.getRegion() + " already exists");
3530       } else {
3531         server.abort("Unexpected ZK exception creating/setting node OFFLINE", e);
3532       }
3533       return false;
3534     }
3535     return true;
3536   }
3537 
3538   private boolean deleteNodeInStates(String encodedName,
3539       String desc, ServerName sn, EventType... types) {
3540     try {
3541       for (EventType et: types) {
3542         if (ZKAssign.deleteNode(watcher, encodedName, et, sn)) {
3543           return true;
3544         }
3545       }
3546       LOG.info("Failed to delete the " + desc + " node for "
3547         + encodedName + ". The node type may not match");
3548     } catch (NoNodeException e) {
3549       if (LOG.isDebugEnabled()) {
3550         LOG.debug("The " + desc + " node for " + encodedName + " already deleted");
3551       }
3552     } catch (KeeperException ke) {
3553       server.abort("Unexpected ZK exception deleting " + desc
3554         + " node for the region " + encodedName, ke);
3555     }
3556     return false;
3557   }
3558 
3559   private void deleteMergingNode(String encodedName, ServerName sn) {
3560     deleteNodeInStates(encodedName, "merging", sn, EventType.RS_ZK_REGION_MERGING,
3561       EventType.RS_ZK_REQUEST_REGION_MERGE, EventType.RS_ZK_REGION_MERGED);
3562   }
3563 
3564   private void deleteSplittingNode(String encodedName, ServerName sn) {
3565     deleteNodeInStates(encodedName, "splitting", sn, EventType.RS_ZK_REGION_SPLITTING,
3566       EventType.RS_ZK_REQUEST_REGION_SPLIT, EventType.RS_ZK_REGION_SPLIT);
3567   }
3568 
3569   private void onRegionFailedOpen(
3570       final HRegionInfo hri, final ServerName sn) {
3571     String encodedName = hri.getEncodedName();
3572     AtomicInteger failedOpenCount = failedOpenTracker.get(encodedName);
3573     if (failedOpenCount == null) {
3574       failedOpenCount = new AtomicInteger();
3575       // No need to use putIfAbsent, or extra synchronization since
3576       // this whole handleRegion block is locked on the encoded region
3577       // name, and failedOpenTracker is updated only in this block
3578       failedOpenTracker.put(encodedName, failedOpenCount);
3579     }
3580     if (failedOpenCount.incrementAndGet() >= maximumAttempts) {
3581       regionStates.updateRegionState(hri, State.FAILED_OPEN);
3582       // remove the tracking info to save memory, also reset
3583       // the count for next open initiative
3584       failedOpenTracker.remove(encodedName);
3585     } else {
3586       // Handle this the same as if it were opened and then closed.
3587       RegionState regionState = regionStates.updateRegionState(hri, State.CLOSED);
3588       if (regionState != null) {
3589         // When there is more than one region server, a new RS is selected as the
3590         // destination and the region plan is updated accordingly. (HBASE-5546)
3591         Set<TableName> disablingOrDisabled = null;
3592         try {
3593           disablingOrDisabled = ZKTable.getDisablingTables(watcher);
3594           disablingOrDisabled.addAll(ZKTable.getDisabledTables(watcher));
3595         } catch (KeeperException e) {
3596           server.abort("Cannot retrieve info about disabling or disabled tables ", e);
3597         }
3598         if (disablingOrDisabled.contains(hri.getTable())) {
3599           offlineDisabledRegion(hri);
3600           return;
3601         }
3602         // ZK Node is in CLOSED state, assign it.
3603         regionStates.updateRegionState(hri, RegionState.State.CLOSED);
3604         // This below has to do w/ online enable/disable of a table
3605         removeClosedRegion(hri);
3606         try {
3607           getRegionPlan(hri, sn, true);
3608         } catch (HBaseIOException e) {
3609           LOG.warn("Failed to get region plan", e);
3610         }
3611         invokeAssign(hri, false);
3612       }
3613     }
3614   }
3615 
3616   private void onRegionOpen(
3617       final HRegionInfo hri, final ServerName sn, long openSeqNum) {
3618     regionOnline(hri, sn, openSeqNum);
3619     if (useZKForAssignment) {
3620       try {
3621         // Delete the ZNode if exists
3622         ZKAssign.deleteNodeFailSilent(watcher, hri);
3623       } catch (KeeperException ke) {
3624         server.abort("Unexpected ZK exception deleting node " + hri, ke);
3625       }
3626     }
3627 
3628     // reset the count, if any
3629     failedOpenTracker.remove(hri.getEncodedName());
3630     if (isTableDisabledOrDisabling(hri.getTable())) {
3631       invokeUnAssign(hri);
3632     }
3633   }
3634 
3635   private void onRegionClosed(final HRegionInfo hri) {
3636     if (isTableDisabledOrDisabling(hri.getTable())) {
3637       offlineDisabledRegion(hri);
3638       return;
3639     }
3640     regionStates.updateRegionState(hri, RegionState.State.CLOSED);
3641     // This below has to do w/ online enable/disable of a table
3642     removeClosedRegion(hri);
3643     invokeAssign(hri, false);
3644   }
3645 
3646   private String onRegionSplit(ServerName sn, TransitionCode code,
3647       HRegionInfo p, HRegionInfo a, HRegionInfo b) {
3648     RegionState rs_p = regionStates.getRegionState(p);
3649     RegionState rs_a = regionStates.getRegionState(a);
3650     RegionState rs_b = regionStates.getRegionState(b);
3651     if (!(rs_p.isOpenOrSplittingOnServer(sn)
3652         && (rs_a == null || rs_a.isOpenOrSplittingNewOnServer(sn))
3653         && (rs_b == null || rs_b.isOpenOrSplittingNewOnServer(sn)))) {
3654       return "Not in state good for split";
3655     }
3656 
3657     regionStates.updateRegionState(a, State.SPLITTING_NEW, sn);
3658     regionStates.updateRegionState(b, State.SPLITTING_NEW, sn);
3659     regionStates.updateRegionState(p, State.SPLITTING);
3660 
3661     if (code == TransitionCode.SPLIT) {
3662       if (TEST_SKIP_SPLIT_HANDLING) {
3663         return "Skipping split message, TEST_SKIP_SPLIT_HANDLING is set";
3664       }
3665       regionOffline(p, State.SPLIT);
3666       regionOnline(a, sn, 1);
3667       regionOnline(b, sn, 1);
3668 
3669       // User could disable the table before master knows the new region.
3670       if (isTableDisabledOrDisabling(p.getTable())) {
3671         invokeUnAssign(a);
3672         invokeUnAssign(b);
3673       }
3674     } else if (code == TransitionCode.SPLIT_PONR) {
3675       try {
3676         regionStateStore.splitRegion(p, a, b, sn);
3677       } catch (IOException ioe) {
3678         LOG.info("Failed to record split region " + p.getShortNameToLog(), ioe);
3679         return "Failed to record the splitting in meta";
3680       }
3681     } else if (code == TransitionCode.SPLIT_REVERTED) {
3682       regionOnline(p, sn);
3683       regionOffline(a);
3684       regionOffline(b);
3685 
3686       if (isTableDisabledOrDisabling(p.getTable())) {
3687         invokeUnAssign(p);
3688       }
3689     }
3690     return null;
3691   }
3692 
3693   private boolean isTableDisabledOrDisabling(TableName t) {
3694     Set<TableName> disablingOrDisabled = null;
3695     try {
3696       disablingOrDisabled = ZKTable.getDisablingTables(watcher);
3697       disablingOrDisabled.addAll(ZKTable.getDisabledTables(watcher));
3698     } catch (KeeperException e) {
3699       server.abort("Cannot retrieve info about disabling or disabled tables ", e);
3700     }
3701     return disablingOrDisabled != null && disablingOrDisabled.contains(t);
3702   }
3703 
3704   private String onRegionMerge(ServerName sn, TransitionCode code,
3705       HRegionInfo p, HRegionInfo a, HRegionInfo b) {
3706     RegionState rs_p = regionStates.getRegionState(p);
3707     RegionState rs_a = regionStates.getRegionState(a);
3708     RegionState rs_b = regionStates.getRegionState(b);
3709     if (!(rs_a.isOpenOrMergingOnServer(sn) && rs_b.isOpenOrMergingOnServer(sn)
3710         && (rs_p == null || rs_p.isOpenOrMergingNewOnServer(sn)))) {
3711       return "Not in state good for merge";
3712     }
3713 
3714     regionStates.updateRegionState(a, State.MERGING);
3715     regionStates.updateRegionState(b, State.MERGING);
3716     regionStates.updateRegionState(p, State.MERGING_NEW, sn);
3717 
3718     String encodedName = p.getEncodedName();
3719     if (code == TransitionCode.READY_TO_MERGE) {
3720       mergingRegions.put(encodedName,
3721         new PairOfSameType<HRegionInfo>(a, b));
3722     } else if (code == TransitionCode.MERGED) {
3723       mergingRegions.remove(encodedName);
3724       regionOffline(a, State.MERGED);
3725       regionOffline(b, State.MERGED);
3726       regionOnline(p, sn, 1);
3727 
3728       // User could disable the table before master knows the new region.
3729       if (isTableDisabledOrDisabling(p.getTable())) {
3730         invokeUnAssign(p);
3731       }
3732     } else if (code == TransitionCode.MERGE_PONR) {
3733       try {
3734         regionStateStore.mergeRegions(p, a, b, sn);
3735       } catch (IOException ioe) {
3736         LOG.info("Failed to record merged region " + p.getShortNameToLog(), ioe);
3737         return "Failed to record the merging in meta";
3738       }
3739     } else {
3740       mergingRegions.remove(encodedName);
3741       regionOnline(a, sn);
3742       regionOnline(b, sn);
3743       regionOffline(p);
3744 
3745       if (isTableDisabledOrDisabling(p.getTable())) {
3746         invokeUnAssign(a);
3747         invokeUnAssign(b);
3748       }
3749     }
3750     return null;
3751   }
3752 
3753   /**
3754    * A helper to handle region merging transition event.
3755    * It transitions merging regions to MERGING state.
3756    */
3757   private boolean handleRegionMerging(final RegionTransition rt, final String encodedName,
3758       final String prettyPrintedRegionName, final ServerName sn) {
3759     if (!serverManager.isServerOnline(sn)) {
3760       LOG.warn("Dropped merging! ServerName=" + sn + " unknown.");
3761       return false;
3762     }
3763     byte [] payloadOfMerging = rt.getPayload();
3764     List<HRegionInfo> mergingRegions;
3765     try {
3766       mergingRegions = HRegionInfo.parseDelimitedFrom(
3767         payloadOfMerging, 0, payloadOfMerging.length);
3768     } catch (IOException e) {
3769       LOG.error("Dropped merging! Failed reading " + rt.getEventType()
3770         + " payload for " + prettyPrintedRegionName, e);
3771       return false;
3772     }
3773     assert mergingRegions.size() == 3;
3774     HRegionInfo p = mergingRegions.get(0);
3775     HRegionInfo hri_a = mergingRegions.get(1);
3776     HRegionInfo hri_b = mergingRegions.get(2);
3777 
3778     RegionState rs_p = regionStates.getRegionState(p);
3779     RegionState rs_a = regionStates.getRegionState(hri_a);
3780     RegionState rs_b = regionStates.getRegionState(hri_b);
3781 
3782     if (!((rs_a == null || rs_a.isOpenOrMergingOnServer(sn))
3783         && (rs_b == null || rs_b.isOpenOrMergingOnServer(sn))
3784         && (rs_p == null || rs_p.isOpenOrMergingNewOnServer(sn)))) {
3785       LOG.warn("Dropped merging! Not in state good for MERGING; rs_p="
3786         + rs_p + ", rs_a=" + rs_a + ", rs_b=" + rs_b);
3787       return false;
3788     }
3789 
3790     EventType et = rt.getEventType();
3791     if (et == EventType.RS_ZK_REQUEST_REGION_MERGE) {
3792       try {
3793         if (RegionMergeTransaction.transitionMergingNode(watcher, p,
3794             hri_a, hri_b, sn, -1, EventType.RS_ZK_REQUEST_REGION_MERGE,
3795             EventType.RS_ZK_REGION_MERGING) == -1) {
3796           byte[] data = ZKAssign.getData(watcher, encodedName);
3797           EventType currentType = null;
3798           if (data != null) {
3799             RegionTransition newRt = RegionTransition.parseFrom(data);
3800             currentType = newRt.getEventType();
3801           }
3802           if (currentType == null || (currentType != EventType.RS_ZK_REGION_MERGED
3803               && currentType != EventType.RS_ZK_REGION_MERGING)) {
3804             LOG.warn("Failed to transition pending_merge node "
3805               + encodedName + " to merging, it's now " + currentType);
3806             return false;
3807           }
3808         }
3809       } catch (Exception e) {
3810         LOG.warn("Failed to transition pending_merge node "
3811           + encodedName + " to merging", e);
3812         return false;
3813       }
3814     }
3815 
3816     synchronized (regionStates) {
3817       regionStates.updateRegionState(hri_a, State.MERGING);
3818       regionStates.updateRegionState(hri_b, State.MERGING);
3819       regionStates.updateRegionState(p, State.MERGING_NEW, sn);
3820 
3821       if (et != EventType.RS_ZK_REGION_MERGED) {
3822         this.mergingRegions.put(encodedName,
3823           new PairOfSameType<HRegionInfo>(hri_a, hri_b));
3824       } else {
3825         this.mergingRegions.remove(encodedName);
3826         regionOffline(hri_a, State.MERGED);
3827         regionOffline(hri_b, State.MERGED);
3828         regionOnline(p, sn);
3829       }
3830     }
3831 
3832     if (et == EventType.RS_ZK_REGION_MERGED) {
3833       LOG.debug("Handling MERGED event for " + encodedName + "; deleting node");
3834       // Remove region from ZK
3835       try {
3836         boolean successful = false;
3837         while (!successful) {
3838           // It's possible that the RS updates the znode in between our read of the
3839           // znode and the delete, so it's safe to retry.
3840           successful = ZKAssign.deleteNode(watcher, encodedName,
3841             EventType.RS_ZK_REGION_MERGED, sn);
3842         }
3843       } catch (KeeperException e) {
3844         if (e instanceof NoNodeException) {
3845           String znodePath = ZKUtil.joinZNode(watcher.assignmentZNode, encodedName);
3846           LOG.debug("The znode " + znodePath + " does not exist. It may have been deleted already.");
3847         } else {
3848           server.abort("Error deleting MERGED node " + encodedName, e);
3849         }
3850       }
3851       LOG.info("Handled MERGED event; merged=" + p.getRegionNameAsString()
3852         + ", region_a=" + hri_a.getRegionNameAsString() + ", region_b="
3853         + hri_b.getRegionNameAsString() + ", on " + sn);
3854 
3855       // The user could disable the table before the master knows about the new region.
3856       if (zkTable.isDisablingOrDisabledTable(p.getTable())) {
3857         unassign(p);
3858       }
3859     }
3860     return true;
3861   }
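
  /*
   * In the ZK-based path above, a merge is expected to move through the events
   * RS_ZK_REQUEST_REGION_MERGE -> RS_ZK_REGION_MERGING -> RS_ZK_REGION_MERGED, and the znode
   * payload carries exactly three regions in the order [merged region p, region a, region b].
   * A minimal standalone sketch of that payload convention (plain strings stand in for
   * HRegionInfo; illustrative only):
   *
   *   import java.util.Arrays;
   *   import java.util.List;
   *
   *   public class MergePayloadSketch {
   *     public static void main(String[] args) {
   *       List<String> payload = Arrays.asList("p (merged)", "a (first)", "b (second)");
   *       if (payload.size() != 3) {
   *         throw new IllegalStateException("merge payload must carry exactly 3 regions");
   *       }
   *       System.out.println("merged=" + payload.get(0)
   *         + ", region_a=" + payload.get(1) + ", region_b=" + payload.get(2));
   *     }
   *   }
   */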
3862 
3863   /**
3864    * A helper to handle a region splitting transition event.
3865    */
3866   private boolean handleRegionSplitting(final RegionTransition rt, final String encodedName,
3867       final String prettyPrintedRegionName, final ServerName sn) {
3868     if (!serverManager.isServerOnline(sn)) {
3869       LOG.warn("Dropped splitting! ServerName=" + sn + " is not online.");
3870       return false;
3871     }
3872     byte [] payloadOfSplitting = rt.getPayload();
3873     List<HRegionInfo> splittingRegions;
3874     try {
3875       splittingRegions = HRegionInfo.parseDelimitedFrom(
3876         payloadOfSplitting, 0, payloadOfSplitting.length);
3877     } catch (IOException e) {
3878       LOG.error("Dropped splitting! Failed reading " + rt.getEventType()
3879         + " payload for " + prettyPrintedRegionName, e);
3880       return false;
3881     }
3882     assert splittingRegions.size() == 2;
3883     HRegionInfo hri_a = splittingRegions.get(0);
3884     HRegionInfo hri_b = splittingRegions.get(1);
3885 
3886     RegionState rs_p = regionStates.getRegionState(encodedName);
3887     RegionState rs_a = regionStates.getRegionState(hri_a);
3888     RegionState rs_b = regionStates.getRegionState(hri_b);
3889 
3890     if (!((rs_p == null || rs_p.isOpenOrSplittingOnServer(sn))
3891         && (rs_a == null || rs_a.isOpenOrSplittingNewOnServer(sn))
3892         && (rs_b == null || rs_b.isOpenOrSplittingNewOnServer(sn)))) {
3893       LOG.warn("Dropped splitting! Not in state good for SPLITTING; rs_p="
3894         + rs_p + ", rs_a=" + rs_a + ", rs_b=" + rs_b);
3895       return false;
3896     }
3897 
3898     if (rs_p == null) {
3899       // Splitting region should be online
3900       rs_p = regionStates.updateRegionState(rt, State.OPEN);
3901       if (rs_p == null) {
3902         LOG.warn("Received splitting for region " + prettyPrintedRegionName
3903           + " from server " + sn + " but it doesn't exist anymore,"
3904           + " probably already processed its split");
3905         return false;
3906       }
3907       regionStates.regionOnline(rs_p.getRegion(), sn);
3908     }
3909 
3910     HRegionInfo p = rs_p.getRegion();
3911     EventType et = rt.getEventType();
3912     if (et == EventType.RS_ZK_REQUEST_REGION_SPLIT) {
3913       try {
3914         if (SplitTransaction.transitionSplittingNode(watcher, p,
3915             hri_a, hri_b, sn, -1, EventType.RS_ZK_REQUEST_REGION_SPLIT,
3916             EventType.RS_ZK_REGION_SPLITTING) == -1) {
3917           byte[] data = ZKAssign.getData(watcher, encodedName);
3918           EventType currentType = null;
3919           if (data != null) {
3920             RegionTransition newRt = RegionTransition.parseFrom(data);
3921             currentType = newRt.getEventType();
3922           }
3923           if (currentType == null || (currentType != EventType.RS_ZK_REGION_SPLIT
3924               && currentType != EventType.RS_ZK_REGION_SPLITTING)) {
3925             LOG.warn("Failed to transition pending_split node "
3926               + encodedName + " to splitting, it's now " + currentType);
3927             return false;
3928           }
3929         }
3930       } catch (Exception e) {
3931         LOG.warn("Failed to transition pending_split node "
3932           + encodedName + " to splitting", e);
3933         return false;
3934       }
3935     }
3936 
3937     synchronized (regionStates) {
3938       regionStates.updateRegionState(hri_a, State.SPLITTING_NEW, sn);
3939       regionStates.updateRegionState(hri_b, State.SPLITTING_NEW, sn);
3940       regionStates.updateRegionState(rt, State.SPLITTING);
3941 
3942       // The below is for testing ONLY!  We can't do fault injection easily, so
3943       // we resort to this kind of ugliness -- St.Ack 02/25/2011.
3944       if (TEST_SKIP_SPLIT_HANDLING) {
3945         LOG.warn("Skipping split message, TEST_SKIP_SPLIT_HANDLING is set");
3946         return true; // return true so that the splitting node stays
3947       }
3948 
3949       if (et == EventType.RS_ZK_REGION_SPLIT) {
3950         regionOffline(p, State.SPLIT);
3951         regionOnline(hri_a, sn);
3952         regionOnline(hri_b, sn);
3953       }
3954     }
3955 
3956     if (et == EventType.RS_ZK_REGION_SPLIT) {
3957       LOG.debug("Handling SPLIT event for " + encodedName + "; deleting node");
3958       // Remove region from ZK
3959       try {
3960         boolean successful = false;
3961         while (!successful) {
3962           // It's possible that the RS updates the znode in between our read of the
3963           // znode and the delete, so it's safe to retry.
3964           successful = ZKAssign.deleteNode(watcher, encodedName,
3965             EventType.RS_ZK_REGION_SPLIT, sn);
3966         }
3967       } catch (KeeperException e) {
3968         if (e instanceof NoNodeException) {
3969           String znodePath = ZKUtil.joinZNode(watcher.assignmentZNode, encodedName);
3970           LOG.debug("The znode " + znodePath + " does not exist. It may have been deleted already.");
3971         } else {
3972           server.abort("Error deleting SPLIT node " + encodedName, e);
3973         }
3974       }
3975       LOG.info("Handled SPLIT event; parent=" + p.getRegionNameAsString()
3976         + ", daughter a=" + hri_a.getRegionNameAsString() + ", daughter b="
3977         + hri_b.getRegionNameAsString() + ", on " + sn);
3978 
3979       // The user could disable the table before the master knows about the new region.
3980       if (zkTable.isDisablingOrDisabledTable(p.getTable())) {
3981         unassign(hri_a);
3982         unassign(hri_b);
3983       }
3984     }
3985     return true;
3986   }
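
  /*
   * Note the asymmetry with the merge handler: the split payload carries only the two
   * daughters [a, b] (the parent is identified by the znode name), while the merge payload
   * carries [p, a, b]. A minimal standalone sketch of the expected ZK event order for a
   * split (illustrative names only, not HBase API calls):
   *
   *   import java.util.Arrays;
   *   import java.util.List;
   *
   *   public class SplitEventOrderSketch {
   *     public static void main(String[] args) {
   *       List<String> expected = Arrays.asList(
   *         "RS_ZK_REQUEST_REGION_SPLIT",   // region server asks the master to approve the split
   *         "RS_ZK_REGION_SPLITTING",       // master transitions the request node
   *         "RS_ZK_REGION_SPLIT");          // daughters online, parent offlined as SPLIT
   *       System.out.println(expected);
   *     }
   *   }
   */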
3987 
3988   /**
3989    * A region is offline.  The new state should be the specified one,
3990    * Marks a region as offline.  The new state is the specified one,
3991    * if not null; if the specified state is null, the new state is Offline.
3992    * The specified state can only be Split, Merged, Offline, or null.
3993   private void regionOffline(final HRegionInfo regionInfo, final State state) {
3994     regionStates.regionOffline(regionInfo, state);
3995     removeClosedRegion(regionInfo);
3996     // remove the region plan as well just in case.
3997     clearRegionPlan(regionInfo);
3998     balancer.regionOffline(regionInfo);
3999 
4000     // Tell our listeners that a region was closed
4001     sendRegionClosedNotification(regionInfo);
4002   }
4003 
4004   private void sendRegionOpenedNotification(final HRegionInfo regionInfo,
4005       final ServerName serverName) {
4006     if (!this.listeners.isEmpty()) {
4007       for (AssignmentListener listener : this.listeners) {
4008         listener.regionOpened(regionInfo, serverName);
4009       }
4010     }
4011   }
4012 
4013   private void sendRegionClosedNotification(final HRegionInfo regionInfo) {
4014     if (!this.listeners.isEmpty()) {
4015       for (AssignmentListener listener : this.listeners) {
4016         listener.regionClosed(regionInfo);
4017       }
4018     }
4019   }
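
  /*
   * Both notification helpers above fan a single event out to every registered
   * AssignmentListener. A minimal sketch of a listener implementation, assuming the HBase
   * classes visible to this package and that the listener is registered with this manager
   * elsewhere (registration is not shown in this section):
   *
   *   class LoggingAssignmentListener implements AssignmentListener {
   *     @Override
   *     public void regionOpened(HRegionInfo regionInfo, ServerName serverName) {
   *       System.out.println("opened " + regionInfo.getShortNameToLog() + " on " + serverName);
   *     }
   *
   *     @Override
   *     public void regionClosed(HRegionInfo regionInfo) {
   *       System.out.println("closed " + regionInfo.getShortNameToLog());
   *     }
   *   }
   */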
4020 
4021   /**
4022    * Try to update some region states. If the state machine prevents
4023    * such an update, an error message is returned to explain the reason.
4024    *
4025    * It's expected that each transition carries just one
4026    * region for opening/closing, and three regions for splitting/merging.
4027    * These regions should be on the server that requested the change (see the sketch after this method).
4028    *
4029    * Region state machine. Only these transitions
4030    * are expected to be triggered by a region server.
4031    *
4032    * On the state transition:
4033    *  (1) Open/Close should be initiated by the master
4034    *      (a) The master sets the region to pending_open/pending_close
4035    *        in memory and hbase:meta after sending the request
4036    *        to the region server
4037    *      (b) The region server reports back to the master
4038    *        after the open/close is done (either success or failure)
4039    *      (c) If the region server has a problem reporting the status
4040    *        to the master, it must be because the master is down or there is
4041    *        a temporary network issue. Otherwise, the region server should
4042    *        abort since it must be a bug. If the master is not accessible,
4043    *        the region server should keep trying until it is
4044    *        stopped or until the status is reported to the (new) master
4045    *      (d) If a region server dies in the middle of opening/closing
4046    *        a region, the server shutdown handler (SSH) picks it up and finishes it
4047    *      (e) If the master dies in the middle, the new master recovers
4048    *        the state during initialization from hbase:meta. A region server
4049    *        can then report any transition that has not yet been reported to
4050    *        the previous active master
4051    *  (2) Split/merge is initiated by region servers
4052    *      (a) To split a region, a region server sends a request
4053    *        to the master to try to set the region to splitting, together with
4054    *        the two daughters (to be created) to splitting_new. If approved
4055    *        by the master, the split can then move ahead
4056    *      (b) To merge two regions, a region server sends a request to
4057    *        the master to try to set the new merged region (to be created) to
4058    *        merging_new, together with the two regions (to be merged) to merging.
4059    *        If the master approves, the merge can then move ahead
4060    *      (c) Once the split/merge is done, the region server
4061    *        reports the status (success or failure) back to the master.
4062    *      (d) Other scenarios should be handled similarly to
4063    *        region open/close
4064    */
4065   protected String onRegionTransition(final ServerName serverName,
4066       final RegionServerStatusProtos.RegionTransition transition) {
4067     TransitionCode code = transition.getTransitionCode();
4068     HRegionInfo hri = HRegionInfo.convert(transition.getRegionInfo(0));
4069     RegionState current = regionStates.getRegionState(hri);
4070     if (LOG.isDebugEnabled()) {
4071       LOG.debug("Got transition " + code + " for "
4072         + (current != null ? current.toString() : hri.getShortNameToLog())
4073         + " from " + serverName);
4074     }
4075     String errorMsg = null;
4076     switch (code) {
4077     case OPENED:
4078       if (current != null && current.isOpened() && current.isOnServer(serverName)) {
4079         LOG.info("Region " + hri.getShortNameToLog() + " is already " + current.getState() + " on "
4080             + serverName);
4081         break;
4082       }
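      // Fall through: a non-duplicate OPENED report shares the validation below with FAILED_OPEN.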
4083     case FAILED_OPEN:
4084       if (current == null
4085           || !current.isPendingOpenOrOpeningOnServer(serverName)) {
4086         errorMsg = hri.getShortNameToLog()
4087           + " is not pending open on " + serverName;
4088       } else if (code == TransitionCode.FAILED_OPEN) {
4089         onRegionFailedOpen(hri, serverName);
4090       } else {
4091         long openSeqNum = HConstants.NO_SEQNUM;
4092         if (transition.hasOpenSeqNum()) {
4093           openSeqNum = transition.getOpenSeqNum();
4094         }
4095         if (openSeqNum < 0) {
4096           errorMsg = "Newly opened region has invalid open seq num " + openSeqNum;
4097         } else {
4098           onRegionOpen(hri, serverName, openSeqNum);
4099         }
4100       }
4101       break;
4102 
4103     case CLOSED:
4104       if (current == null
4105           || !current.isPendingCloseOrClosingOnServer(serverName)) {
4106         errorMsg = hri.getShortNameToLog()
4107           + " is not pending close on " + serverName;
4108       } else {
4109         onRegionClosed(hri);
4110       }
4111       break;
4112 
4113     case READY_TO_SPLIT:
4114     case SPLIT_PONR:
4115     case SPLIT:
4116     case SPLIT_REVERTED:
4117       errorMsg = onRegionSplit(serverName, code, hri,
4118         HRegionInfo.convert(transition.getRegionInfo(1)),
4119         HRegionInfo.convert(transition.getRegionInfo(2)));
4120       break;
4121 
4122     case READY_TO_MERGE:
4123     case MERGE_PONR:
4124     case MERGED:
4125     case MERGE_REVERTED:
4126       errorMsg = onRegionMerge(serverName, code, hri,
4127         HRegionInfo.convert(transition.getRegionInfo(1)),
4128         HRegionInfo.convert(transition.getRegionInfo(2)));
4129       break;
4130 
4131     default:
4132       errorMsg = "Unexpected transition code " + code;
4133     }
4134     if (errorMsg != null) {
4135       LOG.error("Failed to transition region from " + current + " to "
4136         + code + " by " + serverName + ": " + errorMsg);
4137     }
4138     return errorMsg;
4139   }
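
  /*
   * As the javadoc above notes, open/close transitions carry a single region while split/merge
   * transitions carry three (the parent/merged region plus two others), which is why the
   * split/merge cases read getRegionInfo(1) and getRegionInfo(2). A minimal standalone sketch
   * of that expectation (illustrative only):
   *
   *   import java.util.LinkedHashMap;
   *   import java.util.Map;
   *
   *   public class TransitionRegionCountSketch {
   *     public static void main(String[] args) {
   *       Map<String, Integer> regionCount = new LinkedHashMap<String, Integer>();
   *       regionCount.put("OPENED", 1);
   *       regionCount.put("FAILED_OPEN", 1);
   *       regionCount.put("CLOSED", 1);
   *       regionCount.put("READY_TO_SPLIT", 3);
   *       regionCount.put("SPLIT_PONR", 3);
   *       regionCount.put("SPLIT", 3);
   *       regionCount.put("SPLIT_REVERTED", 3);
   *       regionCount.put("READY_TO_MERGE", 3);
   *       regionCount.put("MERGE_PONR", 3);
   *       regionCount.put("MERGED", 3);
   *       regionCount.put("MERGE_REVERTED", 3);
   *       for (Map.Entry<String, Integer> e : regionCount.entrySet()) {
   *         System.out.println(e.getKey() + " carries " + e.getValue() + " region(s)");
   *       }
   *     }
   *   }
   */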
4140 
4141   /**
4142    * @return Instance of load balancer
4143    */
4144   public LoadBalancer getBalancer() {
4145     return this.balancer;
4146   }
4147 }