1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.master;
20  
21  import java.io.IOException;
22  import java.util.ArrayList;
23  import java.util.Arrays;
24  import java.util.Collections;
25  import java.util.HashMap;
26  import java.util.HashSet;
27  import java.util.Iterator;
28  import java.util.List;
29  import java.util.Map;
30  import java.util.NavigableMap;
31  import java.util.Set;
32  import java.util.TreeMap;
33  import java.util.concurrent.ConcurrentHashMap;
34  import java.util.concurrent.ConcurrentSkipListSet;
35  import java.util.concurrent.ThreadFactory;
36  import java.util.concurrent.TimeUnit;
37  import java.util.concurrent.atomic.AtomicBoolean;
38  import java.util.concurrent.atomic.AtomicInteger;
39  import java.util.concurrent.locks.Lock;
40  import java.util.concurrent.locks.ReentrantLock;
41  
42  import org.apache.commons.logging.Log;
43  import org.apache.commons.logging.LogFactory;
44  import org.apache.hadoop.classification.InterfaceAudience;
45  import org.apache.hadoop.conf.Configuration;
46  import org.apache.hadoop.hbase.Chore;
47  import org.apache.hadoop.hbase.HBaseIOException;
48  import org.apache.hadoop.hbase.HConstants;
49  import org.apache.hadoop.hbase.HRegionInfo;
50  import org.apache.hadoop.hbase.NotServingRegionException;
51  import org.apache.hadoop.hbase.RegionTransition;
52  import org.apache.hadoop.hbase.Server;
53  import org.apache.hadoop.hbase.ServerName;
54  import org.apache.hadoop.hbase.Stoppable;
55  import org.apache.hadoop.hbase.TableName;
56  import org.apache.hadoop.hbase.TableNotFoundException;
57  import org.apache.hadoop.hbase.catalog.CatalogTracker;
58  import org.apache.hadoop.hbase.catalog.MetaReader;
59  import org.apache.hadoop.hbase.client.Result;
60  import org.apache.hadoop.hbase.exceptions.DeserializationException;
61  import org.apache.hadoop.hbase.executor.EventHandler;
62  import org.apache.hadoop.hbase.executor.EventType;
63  import org.apache.hadoop.hbase.executor.ExecutorService;
64  import org.apache.hadoop.hbase.ipc.RpcClient.FailedServerException;
65  import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
66  import org.apache.hadoop.hbase.master.RegionState.State;
67  import org.apache.hadoop.hbase.master.balancer.FavoredNodeAssignmentHelper;
68  import org.apache.hadoop.hbase.master.balancer.FavoredNodeLoadBalancer;
69  import org.apache.hadoop.hbase.master.handler.ClosedRegionHandler;
70  import org.apache.hadoop.hbase.master.handler.DisableTableHandler;
71  import org.apache.hadoop.hbase.master.handler.EnableTableHandler;
72  import org.apache.hadoop.hbase.master.handler.OpenedRegionHandler;
73  import org.apache.hadoop.hbase.regionserver.RegionAlreadyInTransitionException;
74  import org.apache.hadoop.hbase.regionserver.RegionMergeTransaction;
75  import org.apache.hadoop.hbase.regionserver.RegionOpeningState;
76  import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
77  import org.apache.hadoop.hbase.regionserver.SplitTransaction;
78  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
79  import org.apache.hadoop.hbase.util.KeyLocker;
80  import org.apache.hadoop.hbase.util.Pair;
81  import org.apache.hadoop.hbase.util.PairOfSameType;
82  import org.apache.hadoop.hbase.util.Threads;
83  import org.apache.hadoop.hbase.util.Triple;
84  import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
85  import org.apache.hadoop.hbase.zookeeper.ZKAssign;
86  import org.apache.hadoop.hbase.zookeeper.ZKTable;
87  import org.apache.hadoop.hbase.zookeeper.ZKUtil;
88  import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener;
89  import org.apache.hadoop.ipc.RemoteException;
90  import org.apache.zookeeper.AsyncCallback;
91  import org.apache.zookeeper.KeeperException;
92  import org.apache.zookeeper.KeeperException.NoNodeException;
93  import org.apache.zookeeper.KeeperException.NodeExistsException;
94  import org.apache.zookeeper.data.Stat;
95  
96  import com.google.common.base.Preconditions;
97  import com.google.common.collect.LinkedHashMultimap;
98  
99  /**
100  * Manages and performs region assignment.
101  * <p>
102  * Monitors ZooKeeper for events related to regions in transition.
103  * <p>
104  * Handles existing regions in transition during master failover.
105  */
106 @InterfaceAudience.Private
107 public class AssignmentManager extends ZooKeeperListener {
108   private static final Log LOG = LogFactory.getLog(AssignmentManager.class);
109 
110   public static final ServerName HBCK_CODE_SERVERNAME = ServerName.valueOf(HConstants.HBCK_CODE_NAME,
111       -1, -1L);
112 
113   public static final String ASSIGNMENT_TIMEOUT = "hbase.master.assignment.timeoutmonitor.timeout";
114   public static final int DEFAULT_ASSIGNMENT_TIMEOUT_DEFAULT = 600000;
115   public static final String ASSIGNMENT_TIMEOUT_MANAGEMENT = "hbase.assignment.timeout.management";
116   public static final boolean DEFAULT_ASSIGNMENT_TIMEOUT_MANAGEMENT = false;
117 
118   public static final String ALREADY_IN_TRANSITION_WAITTIME
119     = "hbase.assignment.already.intransition.waittime";
120   public static final int DEFAULT_ALREADY_IN_TRANSITION_WAITTIME = 60000; // 1 minute
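  // Illustrative sketch, not part of the original source: the keys above are ordinary
  // Configuration properties, so a deployment or test could tune them roughly like this
  // (the chosen values are made up for the example):
  //   Configuration conf = HBaseConfiguration.create();
  //   conf.setBoolean(ASSIGNMENT_TIMEOUT_MANAGEMENT, true);
  //   conf.setInt(ASSIGNMENT_TIMEOUT, 300000);            // 5 minutes instead of 10
  //   conf.setInt(ALREADY_IN_TRANSITION_WAITTIME, 30000); // 30 seconds instead of 1 minute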
121 
122   protected final Server server;
123 
124   private ServerManager serverManager;
125 
126   private boolean shouldAssignRegionsWithFavoredNodes;
127 
128   private CatalogTracker catalogTracker;
129 
130   protected final TimeoutMonitor timeoutMonitor;
131 
132   private final TimerUpdater timerUpdater;
133 
134   private LoadBalancer balancer;
135 
136   private final MetricsAssignmentManager metricsAssignmentManager;
137 
138   private final TableLockManager tableLockManager;
139 
140   private AtomicInteger numRegionsOpened = new AtomicInteger(0);
141 
142   final private KeyLocker<String> locker = new KeyLocker<String>();
143 
144   /**
145    * Map of regions to reopen after the schema of a table is changed. Key -
146    * encoded region name, value - HRegionInfo
147    */
148   private final Map <String, HRegionInfo> regionsToReopen;
149 
150   /*
151    * Maximum times we recurse an assignment/unassignment.
152    * See below in {@link #assign()} and {@link #unassign()}.
153    */
154   private final int maximumAttempts;
155 
156   /**
157    * Map from the region to be created (encoded name) to the pair of regions being merged.
158    */
159   private final Map<String, PairOfSameType<HRegionInfo>> mergingRegions
160     = new HashMap<String, PairOfSameType<HRegionInfo>>();
161 
162   /**
163    * How long the assignment will sleep before retrying an hbase:meta assignment
164    * that failed because no region plan was available.
165    */
166   private final long sleepTimeBeforeRetryingMetaAssignment;
167 
168   /** Plans for region movement. Key is the encoded version of a region name */
169   // TODO: When do plans get cleaned out?  Ever? In server open and in server
170   // shutdown processing -- St.Ack
171   // All access to this Map must be synchronized.
172   final NavigableMap<String, RegionPlan> regionPlans =
173     new TreeMap<String, RegionPlan>();
174 
175   private final ZKTable zkTable;
176 
177   /**
178    * Contains the servers that need their timers updated; these servers will be
179    * handled by {@link TimerUpdater}
180    */
181   private final ConcurrentSkipListSet<ServerName> serversInUpdatingTimer;
182 
183   private final ExecutorService executorService;
184 
185   // For unit tests, keep track of calls to ClosedRegionHandler
186   private Map<HRegionInfo, AtomicBoolean> closedRegionHandlerCalled = null;
187 
188   // For unit tests, keep track of calls to OpenedRegionHandler
189   private Map<HRegionInfo, AtomicBoolean> openedRegionHandlerCalled = null;
190 
191   //Thread pool executor service for timeout monitor
192   private java.util.concurrent.ExecutorService threadPoolExecutorService;
193 
194   // A bunch of ZK events workers. Each is a single thread executor service
195   private final java.util.concurrent.ExecutorService zkEventWorkers;
196 
197   private List<EventType> ignoreStatesRSOffline = Arrays.asList(
198       EventType.RS_ZK_REGION_FAILED_OPEN, EventType.RS_ZK_REGION_CLOSED);
199 
200   private final RegionStates regionStates;
201 
202   // The thresholds for using bulk assignment. Bulk assignment is used
203   // only if assigning at least this many regions to at least this
204   // many servers. If assigning fewer regions to fewer servers,
205   // bulk assigning may not be as efficient.
206   private final int bulkAssignThresholdRegions;
207   private final int bulkAssignThresholdServers;
208 
209   // Should bulk assignment wait till all regions are assigned,
210   // or until it times out?  This is useful to measure bulk assignment
211   // performance, but not needed in most use cases.
212   private final boolean bulkAssignWaitTillAllAssigned;
213 
214   /**
215    * Indicator that AssignmentManager has recovered the region states so
216    * that ServerShutdownHandler can be fully enabled and re-assign regions
217    * of dead servers, so that when re-assignment happens, AssignmentManager
218    * has proper region states.
219    *
220    * Protected to ease testing.
221    */
222   protected final AtomicBoolean failoverCleanupDone = new AtomicBoolean(false);
223 
224   /** Is the TimeOutManagement activated **/
225   private final boolean tomActivated;
226 
227   /**
228    * A map tracking how many times in a row a region has failed to open,
229    * so that we don't try to open a region forever if the failure is
230    * unrecoverable.  We don't put this information in region states
231    * because we don't expect this to happen frequently; we don't
232    * want to copy this information over during each state transition either.
233    */
234   private final ConcurrentHashMap<String, AtomicInteger>
235     failedOpenTracker = new ConcurrentHashMap<String, AtomicInteger>();
236 
237   /**
238    * For testing only!  Set to true to skip handling of split.
239    */
240   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="MS_SHOULD_BE_FINAL")
241   public static boolean TEST_SKIP_SPLIT_HANDLING = false;
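  // Illustrative sketch, not part of the original source: a unit test that wants the
  // master to ignore split transitions might toggle the flag around the action under
  // test, e.g.:
  //   AssignmentManager.TEST_SKIP_SPLIT_HANDLING = true;
  //   try {
  //     // drive the split whose ZK transitions should be ignored
  //   } finally {
  //     AssignmentManager.TEST_SKIP_SPLIT_HANDLING = false;
  //   }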
242 
243   /**
244    * Constructs a new assignment manager.
245    *
246    * @param server
247    * @param serverManager
248    * @param catalogTracker
249    * @param service
250    * @throws KeeperException
251    * @throws IOException
252    */
253   public AssignmentManager(Server server, ServerManager serverManager,
254       CatalogTracker catalogTracker, final LoadBalancer balancer,
255       final ExecutorService service, MetricsMaster metricsMaster,
256       final TableLockManager tableLockManager) throws KeeperException, IOException {
257     super(server.getZooKeeper());
258     this.server = server;
259     this.serverManager = serverManager;
260     this.catalogTracker = catalogTracker;
261     this.executorService = service;
262     this.regionsToReopen = Collections.synchronizedMap
263                            (new HashMap<String, HRegionInfo> ());
264     Configuration conf = server.getConfiguration();
265     // Only read favored nodes if using the favored nodes load balancer.
266     this.shouldAssignRegionsWithFavoredNodes = conf.getClass(
267            HConstants.HBASE_MASTER_LOADBALANCER_CLASS, Object.class).equals(
268            FavoredNodeLoadBalancer.class);
269     this.tomActivated = conf.getBoolean(
270       ASSIGNMENT_TIMEOUT_MANAGEMENT, DEFAULT_ASSIGNMENT_TIMEOUT_MANAGEMENT);
271     if (tomActivated){
272       this.serversInUpdatingTimer =  new ConcurrentSkipListSet<ServerName>();
273       this.timeoutMonitor = new TimeoutMonitor(
274         conf.getInt("hbase.master.assignment.timeoutmonitor.period", 30000),
275         server, serverManager,
276         conf.getInt(ASSIGNMENT_TIMEOUT, DEFAULT_ASSIGNMENT_TIMEOUT_DEFAULT));
277       this.timerUpdater = new TimerUpdater(conf.getInt(
278         "hbase.master.assignment.timerupdater.period", 10000), server);
279       Threads.setDaemonThreadRunning(timerUpdater.getThread(),
280         server.getServerName() + ".timerUpdater");
281     } else {
282       this.serversInUpdatingTimer =  null;
283       this.timeoutMonitor = null;
284       this.timerUpdater = null;
285     }
286     this.zkTable = new ZKTable(this.watcher);
287     // This is the max attempts, not retries, so it should be at least 1.
288     this.maximumAttempts = Math.max(1,
289       this.server.getConfiguration().getInt("hbase.assignment.maximum.attempts", 10));
290     this.sleepTimeBeforeRetryingMetaAssignment = this.server.getConfiguration().getLong(
291         "hbase.meta.assignment.retry.sleeptime", 1000l);
292     this.balancer = balancer;
293     int maxThreads = conf.getInt("hbase.assignment.threads.max", 30);
294     this.threadPoolExecutorService = Threads.getBoundedCachedThreadPool(
295       maxThreads, 60L, TimeUnit.SECONDS, Threads.newDaemonThreadFactory("AM."));
296     this.regionStates = new RegionStates(server, serverManager);
297 
298     this.bulkAssignWaitTillAllAssigned =
299       conf.getBoolean("hbase.bulk.assignment.waittillallassigned", false);
300     this.bulkAssignThresholdRegions = conf.getInt("hbase.bulk.assignment.threshold.regions", 7);
301     this.bulkAssignThresholdServers = conf.getInt("hbase.bulk.assignment.threshold.servers", 3);
302 
303     int workers = conf.getInt("hbase.assignment.zkevent.workers", 20);
304     ThreadFactory threadFactory = Threads.newDaemonThreadFactory("AM.ZK.Worker");
305     zkEventWorkers = Threads.getBoundedCachedThreadPool(workers, 60L,
306             TimeUnit.SECONDS, threadFactory);
307     this.tableLockManager = tableLockManager;
308 
309     this.metricsAssignmentManager = new MetricsAssignmentManager();
310   }
311 
312   void startTimeOutMonitor() {
313     if (tomActivated) {
314       Threads.setDaemonThreadRunning(timeoutMonitor.getThread(), server.getServerName()
315           + ".timeoutMonitor");
316     }
317   }
318 
319   /**
320    * @return Instance of ZKTable.
321    */
322   public ZKTable getZKTable() {
323     // These are 'expensive' to make, involving a trip to the zk ensemble, so allow
324     // sharing.
325     return this.zkTable;
326   }
327 
328   /**
329    * This SHOULD not be public. It is public now
330    * because of some unit tests.
331    *
332    * TODO: make it package private and keep RegionStates in the master package
333    */
334   public RegionStates getRegionStates() {
335     return regionStates;
336   }
337 
338   public RegionPlan getRegionReopenPlan(HRegionInfo hri) {
339     return new RegionPlan(hri, null, regionStates.getRegionServerOfRegion(hri));
340   }
341 
342   /**
343    * Add a regionPlan for the specified region.
344    * @param encodedName
345    * @param plan
346    */
347   public void addPlan(String encodedName, RegionPlan plan) {
348     synchronized (regionPlans) {
349       regionPlans.put(encodedName, plan);
350     }
351   }
352 
353   /**
354    * Add a map of region plans.
355    */
356   public void addPlans(Map<String, RegionPlan> plans) {
357     synchronized (regionPlans) {
358       regionPlans.putAll(plans);
359     }
360   }
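  // Illustrative sketch, not part of the original source: a caller that wants a region
  // moved to a specific destination could register a plan before triggering assignment;
  // "hri", "source" and "dest" are hypothetical variables here:
  //   RegionPlan plan = new RegionPlan(hri, source, dest);
  //   addPlan(hri.getEncodedName(), plan);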
361 
362   /**
363    * Set the list of regions that will be reopened
364    * because of an update in table schema
365    *
366    * @param regions
367    *          list of regions that should be tracked for reopen
368    */
369   public void setRegionsToReopen(List <HRegionInfo> regions) {
370     for(HRegionInfo hri : regions) {
371       regionsToReopen.put(hri.getEncodedName(), hri);
372     }
373   }
374 
375   /**
376    * Used by the client to identify if all regions have the schema updates
377    *
378    * @param tableName
379    * @return Pair indicating the status of the alter command
380    * @throws IOException
381    */
382   public Pair<Integer, Integer> getReopenStatus(TableName tableName)
383       throws IOException {
384     List <HRegionInfo> hris =
385       MetaReader.getTableRegions(this.server.getCatalogTracker(), tableName, true);
386     Integer pending = 0;
387     for (HRegionInfo hri : hris) {
388       String name = hri.getEncodedName();
389       // No lock needed; concurrent access is ok: sequential consistency is respected.
390       if (regionsToReopen.containsKey(name)
391           || regionStates.isRegionInTransition(name)) {
392         pending++;
393       }
394     }
395     return new Pair<Integer, Integer>(pending, hris.size());
396   }
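  // Illustrative sketch, not part of the original source: the pair above is
  // (regions still pending reopen, total regions of the table), so a client-side
  // caller ("am" and "tableName" are hypothetical) could poll it like this:
  //   Pair<Integer, Integer> status = am.getReopenStatus(tableName);
  //   boolean alterDone = status.getFirst() == 0;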
397 
398   /**
399    * Used by ServerShutdownHandler to make sure AssignmentManager has completed
400    * the failover cleanup before re-assigning regions of dead servers. So that
401    * when re-assignment happens, AssignmentManager has proper region states.
402    */
403   public boolean isFailoverCleanupDone() {
404     return failoverCleanupDone.get();
405   }
406 
407   /**
408    * To avoid racing with AM, external entities may need to lock a region,
409    * for example, when SSH checks what regions to skip re-assigning.
410    */
411   public Lock acquireRegionLock(final String encodedName) {
412     return locker.acquireLock(encodedName);
413   }
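  // Illustrative sketch, not part of the original source: an external caller such as
  // SSH would hold the lock for the whole check; "am" and "hri" are hypothetical:
  //   Lock lock = am.acquireRegionLock(hri.getEncodedName());
  //   try {
  //     // decide whether to skip re-assigning this region
  //   } finally {
  //     lock.unlock();
  //   }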
414 
415   /**
416    * Now that failover cleanup is completed, notify the server manager to
417    * process queued-up dead servers, if any.
418    */
419   void failoverCleanupDone() {
420     failoverCleanupDone.set(true);
421     serverManager.processQueuedDeadServers();
422   }
423 
424   /**
425    * Called on startup.
426    * Figures whether this is a fresh cluster start or we are joining an extant running cluster.
427    * @throws IOException
428    * @throws KeeperException
429    * @throws InterruptedException
430    */
431   void joinCluster() throws IOException,
432       KeeperException, InterruptedException {
433     // Concurrency note: In the below the accesses on regionsInTransition are
434     // outside of a synchronization block where usually all accesses to RIT are
435     // synchronized.  The presumption is that in this case it is safe since this
436     // method is being played by a single thread on startup.
437 
438     // TODO: Regions that have a null location and are not in regionsInTransitions
439     // need to be handled.
440 
441     // Scan hbase:meta to build list of existing regions, servers, and assignment
442     // Returns servers who have not checked in (assumed dead) and their regions
443     Map<ServerName, List<HRegionInfo>> deadServers = rebuildUserRegions();
444 
445     // This method will assign all user regions on a clean cluster startup, or
446     // it will reconstruct master state and clean up any leftovers from the
447     // previous master process.
448     processDeadServersAndRegionsInTransition(deadServers);
449 
450     recoverTableInDisablingState();
451     recoverTableInEnablingState();
452   }
453 
454   /**
455    * Processes all regions that are in transition in zookeeper and also
456    * processes the list of dead servers by scanning hbase:meta.
457    * Used by a master joining a cluster.  If we figure this is a clean cluster
458    * startup, will assign all user regions.
459    * @param deadServers
460    *          Map of dead servers and their regions. Can be null.
461    * @throws KeeperException
462    * @throws IOException
463    * @throws InterruptedException
464    */
465   void processDeadServersAndRegionsInTransition(
466       final Map<ServerName, List<HRegionInfo>> deadServers)
467           throws KeeperException, IOException, InterruptedException {
468     List<String> nodes = ZKUtil.listChildrenNoWatch(watcher,
469       watcher.assignmentZNode);
470 
471     if (nodes == null) {
472       String errorMessage = "Failed to get the children from ZK";
473       server.abort(errorMessage, new IOException(errorMessage));
474       return;
475     }
476 
477     boolean failover = (!serverManager.getDeadServers().isEmpty() || !serverManager
478         .getRequeuedDeadServers().isEmpty());
479 
480     if (!failover) {
481       // If any one region except meta is assigned, it's a failover.
482       Map<HRegionInfo, ServerName> regions = regionStates.getRegionAssignments();
483       for (HRegionInfo hri: regions.keySet()) {
484         if (!hri.isMetaTable()) {
485           LOG.debug("Found " + hri + " out on cluster");
486           failover = true;
487           break;
488         }
489       }
490       if (!failover) {
491         // If any one region except meta is in transition, it's a failover.
492         for (String encodedName: nodes) {
493           RegionState state = regionStates.getRegionState(encodedName);
494           if (state != null && !state.getRegion().isMetaRegion()) {
495             LOG.debug("Found " + state.getRegion().getRegionNameAsString() + " in RITs");
496             failover = true;
497             break;
498           }
499         }
500       }
501     }
502 
503     // If we found user regions out on the cluster, it's a failover.
504     if (failover) {
505       LOG.info("Found regions out on cluster or in RIT; presuming failover");
506       // Process list of dead servers and regions in RIT.
507       // See HBASE-4580 for more information.
508       processDeadServersAndRecoverLostRegions(deadServers);
509     } else {
510       // Fresh cluster startup.
511       LOG.info("Clean cluster startup. Assigning userregions");
512       assignAllUserRegions();
513     }
514   }
515 
516   /**
517    * If region is up in zk in transition, then do fixup and block and wait until
518    * the region is assigned and out of transition.  Used on startup for
519    * catalog regions.
520    * @param hri Region to look for.
521    * @return True if we processed a region in transition else false if region
522    * was not up in zk in transition.
523    * @throws InterruptedException
524    * @throws KeeperException
525    * @throws IOException
526    */
527   boolean processRegionInTransitionAndBlockUntilAssigned(final HRegionInfo hri)
528       throws InterruptedException, KeeperException, IOException {
529     String encodedRegionName = hri.getEncodedName();
530     if (!processRegionInTransition(encodedRegionName, hri)) {
531       return false; // The region is not in transition
532     }
533     LOG.debug("Waiting on " + HRegionInfo.prettyPrint(encodedRegionName));
534     while (!this.server.isStopped() &&
535         this.regionStates.isRegionInTransition(encodedRegionName)) {
536       RegionState state = this.regionStates.getRegionTransitionState(encodedRegionName);
537       if (state == null || !serverManager.isServerOnline(state.getServerName())) {
538         // The region is not in transition, or not in transition on an online
539         // server. Doesn't help to block here any more. Caller needs to
540         // verify the region is actually assigned.
541         break;
542       }
543       this.regionStates.waitForUpdate(100);
544     }
545     return true;
546   }
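  // Illustrative sketch, not part of the original source: on startup the meta region
  // could be handled roughly like this ("am" is a hypothetical reference); whether
  // FIRST_META_REGIONINFO is the right constant here is an assumption:
  //   if (!am.processRegionInTransitionAndBlockUntilAssigned(
  //       HRegionInfo.FIRST_META_REGIONINFO)) {
  //     // meta was not in transition in zk; assign it through the normal path
  //   }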
547 
548   /**
549    * Process failover of new master for region <code>encodedRegionName</code>
550    * up in zookeeper.
551    * @param encodedRegionName Region to process failover for.
552    * @param regionInfo If null we'll go get it from meta table.
553    * @return True if we processed <code>regionInfo</code> as a RIT.
554    * @throws KeeperException
555    * @throws IOException
556    */
557   boolean processRegionInTransition(final String encodedRegionName,
558       final HRegionInfo regionInfo) throws KeeperException, IOException {
559     // We need a lock here to ensure that we will not put the same region twice
560     // It has no reason to be a lock shared with the other operations.
561     // We can do the lock on the region only, instead of a global lock: what we want to ensure
562     // is that we don't have two threads working on the same region.
563     Lock lock = locker.acquireLock(encodedRegionName);
564     try {
565       Stat stat = new Stat();
566       byte [] data = ZKAssign.getDataAndWatch(watcher, encodedRegionName, stat);
567       if (data == null) return false;
568       RegionTransition rt;
569       try {
570         rt = RegionTransition.parseFrom(data);
571       } catch (DeserializationException e) {
572         LOG.warn("Failed parse znode data", e);
573         return false;
574       }
575       HRegionInfo hri = regionInfo;
576       if (hri == null) {
577         // The region info is not passed in. We will try to find the region
578         // from region states map/meta based on the encoded region name. But we
579         // may not be able to find it. This can happen with an online merge:
580         // the region may not have been created yet if the merge is not completed.
581         // Therefore, it is not in meta at master recovery time.
582         hri = regionStates.getRegionInfo(rt.getRegionName());
583         EventType et = rt.getEventType();
584         if (hri == null && et != EventType.RS_ZK_REGION_MERGING
585             && et != EventType.RS_ZK_REQUEST_REGION_MERGE) {
586           LOG.warn("Couldn't find the region in recovering " + rt);
587           return false;
588         }
589       }
590       return processRegionsInTransition(
591         rt, hri, stat.getVersion());
592     } finally {
593       lock.unlock();
594     }
595   }
596 
597   /**
598    * This call is invoked only (1) when the master assigns meta, and
599    * (2) during failover-mode startup, while processing zk assignment nodes.
600    * The locker is set in the caller. It returns true if the region
601    * is in transition for sure, false otherwise.
602    *
603    * It should be private but it is used by some tests too.
604    */
605   boolean processRegionsInTransition(
606       final RegionTransition rt, final HRegionInfo regionInfo,
607       final int expectedVersion) throws KeeperException {
608     EventType et = rt.getEventType();
609     // Get ServerName.  Cannot be null.
610     final ServerName sn = rt.getServerName();
611     final byte[] regionName = rt.getRegionName();
612     final String encodedName = HRegionInfo.encodeRegionName(regionName);
613     final String prettyPrintedRegionName = HRegionInfo.prettyPrint(encodedName);
614     LOG.info("Processing " + prettyPrintedRegionName + " in state: " + et);
615 
616     if (regionStates.isRegionInTransition(encodedName)) {
617       LOG.info("Processed region " + prettyPrintedRegionName + " in state: "
618         + et + ", does nothing since the region is already in transition "
619         + regionStates.getRegionTransitionState(encodedName));
620       // Just return
621       return true;
622     }
623     if (!serverManager.isServerOnline(sn)) {
624       // It was transitioning on a dead server, so it's closed now.
625       // Force to OFFLINE and put it in transition, but not assign it
626       // since log splitting for the dead server is not done yet.
627       LOG.debug("RIT " + encodedName + " in state=" + rt.getEventType() +
628         " was on deadserver; forcing offline");
629       if (regionStates.isRegionOnline(regionInfo)) {
630         // Meta could still show the region is assigned to the previous
631         // server. If that server is online, when we reload the meta, the
632         // region is put back to online, we need to offline it.
633         regionStates.regionOffline(regionInfo);
634       }
635       // Put it back in transition so that SSH can re-assign it
636       regionStates.updateRegionState(regionInfo, State.OFFLINE, sn);
637 
638       if (regionInfo.isMetaRegion()) {
639         // If it's meta region, reset the meta location.
640         // So that master knows the right meta region server.
641         MetaRegionTracker.setMetaLocation(watcher, sn);
642       } else {
643         // Whether the previous server is online or offline,
644         // we need to reset the last region server of the region.
645         regionStates.setLastRegionServerOfRegion(sn, encodedName);
646         // Make sure we know the server is dead.
647         if (!serverManager.isServerDead(sn)) {
648           serverManager.expireServer(sn);
649         }
650       }
651       return false;
652     }
653     switch (et) {
654       case M_ZK_REGION_CLOSING:
655         // Insert into RIT & resend the query to the region server: maybe the previous master
656         // died before sending the query the first time.
657         final RegionState rsClosing = regionStates.updateRegionState(rt, State.CLOSING);
658         this.executorService.submit(
659           new EventHandler(server, EventType.M_MASTER_RECOVERY) {
660             @Override
661             public void process() throws IOException {
662               ReentrantLock lock = locker.acquireLock(regionInfo.getEncodedName());
663               try {
664                 unassign(regionInfo, rsClosing, expectedVersion, null, true, null);
665                 if (regionStates.isRegionOffline(regionInfo)) {
666                   assign(regionInfo, true);
667                 }
668               } finally {
669                 lock.unlock();
670               }
671             }
672           });
673         break;
674 
675       case RS_ZK_REGION_CLOSED:
676       case RS_ZK_REGION_FAILED_OPEN:
677         // Region is closed, insert into RIT and handle it
678         regionStates.updateRegionState(regionInfo, State.CLOSED, sn);
679         invokeAssign(regionInfo);
680         break;
681 
682       case M_ZK_REGION_OFFLINE:
683         // Insert in RIT and resend to the regionserver
684         regionStates.updateRegionState(rt, State.PENDING_OPEN);
685         final RegionState rsOffline = regionStates.getRegionState(regionInfo);
686         this.executorService.submit(
687           new EventHandler(server, EventType.M_MASTER_RECOVERY) {
688             @Override
689             public void process() throws IOException {
690               ReentrantLock lock = locker.acquireLock(regionInfo.getEncodedName());
691               try {
692                 RegionPlan plan = new RegionPlan(regionInfo, null, sn);
693                 addPlan(encodedName, plan);
694                 assign(rsOffline, false, false);
695               } finally {
696                 lock.unlock();
697               }
698             }
699           });
700         break;
701 
702       case RS_ZK_REGION_OPENING:
703         regionStates.updateRegionState(rt, State.OPENING);
704         break;
705 
706       case RS_ZK_REGION_OPENED:
707         // Region is opened, insert into RIT and handle it
708         // This could be done asynchronously, we would need then to acquire the lock in the
709         //  handler.
710         regionStates.updateRegionState(rt, State.OPEN);
711         new OpenedRegionHandler(server, this, regionInfo, sn, expectedVersion).process();
712         break;
713       case RS_ZK_REQUEST_REGION_SPLIT:
714       case RS_ZK_REGION_SPLITTING:
715       case RS_ZK_REGION_SPLIT:
716         // Splitting region should be online. We could have skipped it during
717         // user region rebuilding since we may consider the split is completed.
718         // Put it in SPLITTING state to avoid complications.
719         regionStates.regionOnline(regionInfo, sn);
720         regionStates.updateRegionState(rt, State.SPLITTING);
721         if (!handleRegionSplitting(
722             rt, encodedName, prettyPrintedRegionName, sn)) {
723           deleteSplittingNode(encodedName, sn);
724         }
725         break;
726       case RS_ZK_REQUEST_REGION_MERGE:
727       case RS_ZK_REGION_MERGING:
728       case RS_ZK_REGION_MERGED:
729         if (!handleRegionMerging(
730             rt, encodedName, prettyPrintedRegionName, sn)) {
731           deleteMergingNode(encodedName, sn);
732         }
733         break;
734       default:
735         throw new IllegalStateException("Received region in state:" + et + " is not valid.");
736     }
737     LOG.info("Processed region " + prettyPrintedRegionName + " in state "
738       + et + ", on " + (serverManager.isServerOnline(sn) ? "" : "dead ")
739       + "server: " + sn);
740     return true;
741   }
742 
743   /**
744    * When a region is closed, it should be removed from the regionsToReopen
745    * @param hri HRegionInfo of the region which was closed
746    */
747   public void removeClosedRegion(HRegionInfo hri) {
748     if (regionsToReopen.remove(hri.getEncodedName()) != null) {
749       LOG.debug("Removed region from reopening regions because it was closed");
750     }
751   }
752 
753   /**
754    * Handles various states an unassigned node can be in.
755    * <p>
756    * Method is called when a state change is suspected for an unassigned node.
757    * <p>
758    * This deals with skipped transitions (we got a CLOSED but didn't see CLOSING
759    * yet).
760    * @param rt
761    * @param expectedVersion
762    */
763   void handleRegion(final RegionTransition rt, int expectedVersion) {
764     if (rt == null) {
765       LOG.warn("Unexpected NULL input for RegionTransition rt");
766       return;
767     }
768     final ServerName sn = rt.getServerName();
769     // Check if this is a special HBCK transition
770     if (sn.equals(HBCK_CODE_SERVERNAME)) {
771       handleHBCK(rt);
772       return;
773     }
774     final long createTime = rt.getCreateTime();
775     final byte[] regionName = rt.getRegionName();
776     String encodedName = HRegionInfo.encodeRegionName(regionName);
777     String prettyPrintedRegionName = HRegionInfo.prettyPrint(encodedName);
778     // Verify this is a known server
779     if (!serverManager.isServerOnline(sn)
780       && !ignoreStatesRSOffline.contains(rt.getEventType())) {
781       LOG.warn("Attempted to handle region transition for server but " +
782         "it is not online: " + prettyPrintedRegionName + ", " + rt);
783       return;
784     }
785 
786     RegionState regionState =
787       regionStates.getRegionState(encodedName);
788     long startTime = System.currentTimeMillis();
789     if (LOG.isDebugEnabled()) {
790       boolean lateEvent = createTime < (startTime - 15000);
791       LOG.debug("Handling " + rt.getEventType() +
792         ", server=" + sn + ", region=" +
793         (prettyPrintedRegionName == null ? "null" : prettyPrintedRegionName) +
794         (lateEvent ? ", which is more than 15 seconds late" : "") +
795         ", current_state=" + regionState);
796     }
797     // We don't do anything for this event,
798     // so separate it out, no need to lock/unlock anything
799     if (rt.getEventType() == EventType.M_ZK_REGION_OFFLINE) {
800       return;
801     }
802 
803     // We need a lock on the region as we could update it
804     Lock lock = locker.acquireLock(encodedName);
805     try {
806       RegionState latestState =
807         regionStates.getRegionState(encodedName);
808       if ((regionState == null && latestState != null)
809           || (regionState != null && latestState == null)
810           || (regionState != null && latestState != null
811             && latestState.getState() != regionState.getState())) {
812         LOG.warn("Region state changed from " + regionState + " to "
813           + latestState + ", while acquiring lock");
814       }
815       long waitedTime = System.currentTimeMillis() - startTime;
816       if (waitedTime > 5000) {
817         LOG.warn("Took " + waitedTime + "ms to acquire the lock");
818       }
819       regionState = latestState;
820       switch (rt.getEventType()) {
821       case RS_ZK_REQUEST_REGION_SPLIT:
822       case RS_ZK_REGION_SPLITTING:
823       case RS_ZK_REGION_SPLIT:
824         if (!handleRegionSplitting(
825             rt, encodedName, prettyPrintedRegionName, sn)) {
826           deleteSplittingNode(encodedName, sn);
827         }
828         break;
829 
830       case RS_ZK_REQUEST_REGION_MERGE:
831       case RS_ZK_REGION_MERGING:
832       case RS_ZK_REGION_MERGED:
833         // Merged region is a new region, we can't find it in the region states now.
834         // However, the two merging regions are not new. They should be in state for merging.
835         if (!handleRegionMerging(
836             rt, encodedName, prettyPrintedRegionName, sn)) {
837           deleteMergingNode(encodedName, sn);
838         }
839         break;
840 
841       case M_ZK_REGION_CLOSING:
842         // Should see CLOSING after we have asked it to CLOSE or additional
843         // times after already being in state of CLOSING
844         if (regionState == null
845             || !regionState.isPendingCloseOrClosingOnServer(sn)) {
846           LOG.warn("Received CLOSING for " + prettyPrintedRegionName
847             + " from " + sn + " but the region isn't PENDING_CLOSE/CLOSING here: "
848             + regionStates.getRegionState(encodedName));
849           return;
850         }
851         // Transition to CLOSING (or update stamp if already CLOSING)
852         regionStates.updateRegionState(rt, State.CLOSING);
853         break;
854 
855       case RS_ZK_REGION_CLOSED:
856         // Should see CLOSED after CLOSING but possible after PENDING_CLOSE
857         if (regionState == null
858             || !regionState.isPendingCloseOrClosingOnServer(sn)) {
859           LOG.warn("Received CLOSED for " + prettyPrintedRegionName
860             + " from " + sn + " but the region isn't PENDING_CLOSE/CLOSING here: "
861             + regionStates.getRegionState(encodedName));
862           return;
863         }
864         // Handle CLOSED by assigning elsewhere or stopping if a disable
865         // If we got here all is good.  Need to update RegionState -- else
866         // what follows will fail because not in expected state.
867         new ClosedRegionHandler(server, this, regionState.getRegion()).process();
868         updateClosedRegionHandlerTracker(regionState.getRegion());
869         break;
870 
871         case RS_ZK_REGION_FAILED_OPEN:
872           if (regionState == null
873               || !regionState.isPendingOpenOrOpeningOnServer(sn)) {
874             LOG.warn("Received FAILED_OPEN for " + prettyPrintedRegionName
875               + " from " + sn + " but the region isn't PENDING_OPEN/OPENING here: "
876               + regionStates.getRegionState(encodedName));
877             return;
878           }
879           AtomicInteger failedOpenCount = failedOpenTracker.get(encodedName);
880           if (failedOpenCount == null) {
881             failedOpenCount = new AtomicInteger();
882             // No need to use putIfAbsent, or extra synchronization since
883             // this whole handleRegion block is locked on the encoded region
884             // name, and failedOpenTracker is updated only in this block
885             failedOpenTracker.put(encodedName, failedOpenCount);
886           }
887           if (failedOpenCount.incrementAndGet() >= maximumAttempts) {
888             regionStates.updateRegionState(rt, State.FAILED_OPEN);
889             // remove the tracking info to save memory, also reset
890             // the count for next open initiative
891             failedOpenTracker.remove(encodedName);
892           } else {
893             // Handle this the same as if it were opened and then closed.
894             regionState = regionStates.updateRegionState(rt, State.CLOSED);
895             if (regionState != null) {
896               // When there is more than one region server, a new RS is selected as the
897               // destination and the same is updated in the regionplan. (HBASE-5546)
898               try {
899                 getRegionPlan(regionState.getRegion(), sn, true);
900                 new ClosedRegionHandler(server, this, regionState.getRegion()).process();
901               } catch (HBaseIOException e) {
902                 LOG.warn("Failed to get region plan", e);
903               }
904             }
905           }
906           break;
907 
908         case RS_ZK_REGION_OPENING:
909           // Should see OPENING after we have asked it to OPEN or additional
910           // times after already being in state of OPENING
911           if (regionState == null
912               || !regionState.isPendingOpenOrOpeningOnServer(sn)) {
913             LOG.warn("Received OPENING for " + prettyPrintedRegionName
914               + " from " + sn + " but the region isn't PENDING_OPEN/OPENING here: "
915               + regionStates.getRegionState(encodedName));
916             return;
917           }
918           // Transition to OPENING (or update stamp if already OPENING)
919           regionStates.updateRegionState(rt, State.OPENING);
920           break;
921 
922         case RS_ZK_REGION_OPENED:
923           // Should see OPENED after OPENING but possible after PENDING_OPEN.
924           if (regionState == null
925               || !regionState.isPendingOpenOrOpeningOnServer(sn)) {
926             LOG.warn("Received OPENED for " + prettyPrintedRegionName
927               + " from " + sn + " but the region isn't PENDING_OPEN/OPENING here: "
928               + regionStates.getRegionState(encodedName));
929 
930             if (regionState != null) {
931               // Close it without updating the internal region states,
932               // so as not to create double assignments in unlucky scenarios
933               // mentioned in OpenRegionHandler#process
934               unassign(regionState.getRegion(), null, -1, null, false, sn);
935             }
936             return;
937           }
938           // Handle OPENED by removing from transition and deleting the zk node
939           regionState = regionStates.updateRegionState(rt, State.OPEN);
940           if (regionState != null) {
941             failedOpenTracker.remove(encodedName); // reset the count, if any
942             new OpenedRegionHandler(
943               server, this, regionState.getRegion(), sn, expectedVersion).process();
944             updateOpenedRegionHandlerTracker(regionState.getRegion());
945           }
946           break;
947 
948         default:
949           throw new IllegalStateException("Received event is not valid.");
950       }
951     } finally {
952       lock.unlock();
953     }
954   }
955 
956   //For unit tests only
957   boolean wasClosedHandlerCalled(HRegionInfo hri) {
958     AtomicBoolean b = closedRegionHandlerCalled.get(hri);
959     //compareAndSet to be sure that unit tests don't see stale values. Means,
960     //we will return true exactly once unless the handler code resets this
961     //value to true.
962     return b == null ? false : b.compareAndSet(true, false);
963   }
964 
965   //For unit tests only
966   boolean wasOpenedHandlerCalled(HRegionInfo hri) {
967     AtomicBoolean b = openedRegionHandlerCalled.get(hri);
968     //compareAndSet to be sure that unit tests don't see stale values. Means,
969     //we will return true exactly once unless the handler code resets this
970     //value to true.
971     return b == null ? false : b.compareAndSet(true, false);
972   }
973 
974   //For unit tests only
975   void initializeHandlerTrackers() {
976     closedRegionHandlerCalled = new HashMap<HRegionInfo, AtomicBoolean>();
977     openedRegionHandlerCalled = new HashMap<HRegionInfo, AtomicBoolean>();
978   }
979 
980   void updateClosedRegionHandlerTracker(HRegionInfo hri) {
981     if (closedRegionHandlerCalled != null) { //only for unit tests this is true
982       closedRegionHandlerCalled.put(hri, new AtomicBoolean(true));
983     }
984   }
985 
986   void updateOpenedRegionHandlerTracker(HRegionInfo hri) {
987     if (openedRegionHandlerCalled != null) { //only for unit tests this is true
988       openedRegionHandlerCalled.put(hri, new AtomicBoolean(true));
989     }
990   }
991 
992   // TODO: processFavoredNodes might throw an exception, e.g., if the
993   // meta could not be contacted/updated. We need to see how seriously to treat
994   // this problem. Should we fail the current assignment? We should be able
995   // to recover from this problem eventually (if the meta couldn't be updated
996   // things should work normally and eventually get fixed up).
997   void processFavoredNodes(List<HRegionInfo> regions) throws IOException {
998     if (!shouldAssignRegionsWithFavoredNodes) return;
999     // The AM gets the favored nodes info for each region and updates the meta
1000     // table with that info
1001     Map<HRegionInfo, List<ServerName>> regionToFavoredNodes =
1002         new HashMap<HRegionInfo, List<ServerName>>();
1003     for (HRegionInfo region : regions) {
1004       regionToFavoredNodes.put(region,
1005           ((FavoredNodeLoadBalancer)this.balancer).getFavoredNodes(region));
1006     }
1007     FavoredNodeAssignmentHelper.updateMetaWithFavoredNodesInfo(regionToFavoredNodes, catalogTracker);
1008   }
1009 
1010   /**
1011    * Handle a ZK unassigned node transition triggered by HBCK repair tool.
1012    * <p>
1013    * This is handled in a separate code path because it breaks the normal rules.
1014    * @param rt
1015    */
1016   private void handleHBCK(RegionTransition rt) {
1017     String encodedName = HRegionInfo.encodeRegionName(rt.getRegionName());
1018     LOG.info("Handling HBCK triggered transition=" + rt.getEventType() +
1019       ", server=" + rt.getServerName() + ", region=" +
1020       HRegionInfo.prettyPrint(encodedName));
1021     RegionState regionState = regionStates.getRegionTransitionState(encodedName);
1022     switch (rt.getEventType()) {
1023       case M_ZK_REGION_OFFLINE:
1024         HRegionInfo regionInfo;
1025         if (regionState != null) {
1026           regionInfo = regionState.getRegion();
1027         } else {
1028           try {
1029             byte [] name = rt.getRegionName();
1030             Pair<HRegionInfo, ServerName> p = MetaReader.getRegion(catalogTracker, name);
1031             regionInfo = p.getFirst();
1032           } catch (IOException e) {
1033             LOG.info("Exception reading hbase:meta doing HBCK repair operation", e);
1034             return;
1035           }
1036         }
1037         LOG.info("HBCK repair is triggering assignment of region=" +
1038             regionInfo.getRegionNameAsString());
1039         // trigger assign, node is already in OFFLINE so don't need to update ZK
1040         assign(regionInfo, false);
1041         break;
1042 
1043       default:
1044         LOG.warn("Received unexpected region state from HBCK: " + rt.toString());
1045         break;
1046     }
1047 
1048   }
1049 
1050   // ZooKeeper events
1051 
1052   /**
1053    * New unassigned node has been created.
1054    *
1055    * <p>This happens when an RS begins the OPENING or CLOSING of a region by
1056    * creating an unassigned node.
1057    *
1058    * <p>When this happens we must:
1059    * <ol>
1060    *   <li>Watch the node for further events</li>
1061    *   <li>Read and handle the state in the node</li>
1062    * </ol>
1063    */
1064   @Override
1065   public void nodeCreated(String path) {
1066     handleAssignmentEvent(path);
1067   }
1068 
1069   /**
1070    * Existing unassigned node has had data changed.
1071    *
1072    * <p>This happens when an RS transitions from OFFLINE to OPENING, or between
1073    * OPENING/OPENED and CLOSING/CLOSED.
1074    *
1075    * <p>When this happens we must:
1076    * <ol>
1077    *   <li>Watch the node for further events</li>
1078    *   <li>Read and handle the state in the node</li>
1079    * </ol>
1080    */
1081   @Override
1082   public void nodeDataChanged(String path) {
1083     handleAssignmentEvent(path);
1084   }
1085 
1086 
1087   // We don't want to have two events on the same region managed simultaneously.
1088   // For this reason, we need to wait if an event on the same region is currently in progress.
1089   // So we track the region names of the events in progress, and we keep a waiting list.
1090   private final Set<String> regionsInProgress = new HashSet<String>();
1091   // In a LinkedHashMultimap, the put order is kept when we retrieve the collection back. We need
1092   //  this as we want the events to be managed in the same order as we received them.
1093   private final LinkedHashMultimap <String, RegionRunnable>
1094       zkEventWorkerWaitingList = LinkedHashMultimap.create();
1095 
1096   /**
1097    * A specific runnable that works only on a region.
1098    */
1099   private interface RegionRunnable extends Runnable{
1100     /**
1101      * @return - the name of the region it works on.
1102      */
1103     String getRegionName();
1104   }
1105 
1106   /**
1107    * Submit a task, ensuring that there is only one task at a time working on a given region.
1108    * Order is respected.
1109    */
1110   protected void zkEventWorkersSubmit(final RegionRunnable regRunnable) {
1111 
1112     synchronized (regionsInProgress) {
1113       // If there is already a task for this region, we add it to the
1114       //  waiting list and return.
1115       if (regionsInProgress.contains(regRunnable.getRegionName())) {
1116         synchronized (zkEventWorkerWaitingList){
1117           zkEventWorkerWaitingList.put(regRunnable.getRegionName(), regRunnable);
1118         }
1119         return;
1120       }
1121 
1122       // No event in progress on this region => we can submit a new task immediately.
1123       regionsInProgress.add(regRunnable.getRegionName());
1124       zkEventWorkers.submit(new Runnable() {
1125         @Override
1126         public void run() {
1127           try {
1128             regRunnable.run();
1129           } finally {
1130             // now that we have finished, let's see if there is an event for the same region in the
1131             //  waiting list. If it's the case, we can now submit it to the pool.
1132             synchronized (regionsInProgress) {
1133               regionsInProgress.remove(regRunnable.getRegionName());
1134               synchronized (zkEventWorkerWaitingList) {
1135                 java.util.Set<RegionRunnable> waiting = zkEventWorkerWaitingList.get(
1136                     regRunnable.getRegionName());
1137                 if (!waiting.isEmpty()) {
1138                   // We want the first object only. The only way to get it is through an iterator.
1139                   RegionRunnable toSubmit = waiting.iterator().next();
1140                   zkEventWorkerWaitingList.remove(toSubmit.getRegionName(), toSubmit);
1141                   zkEventWorkersSubmit(toSubmit);
1142                 }
1143               }
1144             }
1145           }
1146         }
1147       });
1148     }
1149   }
1150 
1151   @Override
1152   public void nodeDeleted(final String path) {
1153     if (path.startsWith(watcher.assignmentZNode)) {
1154       final String regionName = ZKAssign.getRegionName(watcher, path);
1155       zkEventWorkersSubmit(new RegionRunnable() {
1156         @Override
1157         public String getRegionName() {
1158           return regionName;
1159         }
1160 
1161         @Override
1162         public void run() {
1163           Lock lock = locker.acquireLock(regionName);
1164           try {
1165             RegionState rs = regionStates.getRegionTransitionState(regionName);
1166             if (rs == null) {
1167               rs = regionStates.getRegionState(regionName);
1168               if (rs == null || !rs.isMergingNew()) {
1169                 // MergingNew is an offline state
1170                 return;
1171               }
1172             }
1173 
1174             HRegionInfo regionInfo = rs.getRegion();
1175             String regionNameStr = regionInfo.getRegionNameAsString();
1176             LOG.debug("Znode " + regionNameStr + " deleted, state: " + rs);
1177             boolean disabled = getZKTable().isDisablingOrDisabledTable(regionInfo.getTable());
1178             ServerName serverName = rs.getServerName();
1179             if (serverManager.isServerOnline(serverName)) {
1180               if (rs.isOnServer(serverName)
1181                   && (rs.isOpened() || rs.isSplitting())) {
1182                 regionOnline(regionInfo, serverName);
1183                 if (disabled) {
1184                   // if server is offline, no hurt to unassign again
1185                   LOG.info("Opened " + regionNameStr
1186                   + " but this table is disabled, triggering close of region");
1187                   unassign(regionInfo);
1188                 }
1189               } else if (rs.isMergingNew()) {
1190                 synchronized (regionStates) {
1191                   String p = regionInfo.getEncodedName();
1192                   PairOfSameType<HRegionInfo> regions = mergingRegions.get(p);
1193                   if (regions != null) {
1194                     onlineMergingRegion(disabled, regions.getFirst(), serverName);
1195                     onlineMergingRegion(disabled, regions.getSecond(), serverName);
1196                   }
1197                 }
1198               }
1199             }
1200           } finally {
1201             lock.unlock();
1202           }
1203         }
1204 
1205         private void onlineMergingRegion(boolean disabled,
1206             final HRegionInfo hri, final ServerName serverName) {
1207           RegionState regionState = regionStates.getRegionState(hri);
1208           if (regionState != null && regionState.isMerging()
1209               && regionState.isOnServer(serverName)) {
1210             regionOnline(regionState.getRegion(), serverName);
1211             if (disabled) {
1212               unassign(hri);
1213             }
1214           }
1215         }
1216       });
1217     }
1218   }
1219 
1220   /**
1221    * New unassigned node has been created.
1222    *
1223    * <p>This happens when an RS begins the OPENING, SPLITTING or CLOSING of a
1224    * region by creating a znode.
1225    *
1226    * <p>When this happens we must:
1227    * <ol>
1228    *   <li>Watch the node for further children changed events</li>
1229    *   <li>Watch all new children for changed events</li>
1230    * </ol>
1231    */
1232   @Override
1233   public void nodeChildrenChanged(String path) {
1234     if (path.equals(watcher.assignmentZNode)) {
1235       zkEventWorkers.submit(new Runnable() {
1236         @Override
1237         public void run() {
1238           try {
1239             // Just make sure we see the changes for the new znodes
1240             List<String> children =
1241               ZKUtil.listChildrenAndWatchForNewChildren(
1242                 watcher, watcher.assignmentZNode);
1243             if (children != null) {
1244               Stat stat = new Stat();
1245               for (String child : children) {
1246                 // if region is in transition, we already have a watch
1247                 // on it, so no need to watch it again. As far as I know for now,
1248                 // this is needed to watch splitting nodes only.
1249                 if (!regionStates.isRegionInTransition(child)) {
1250                   ZKAssign.getDataAndWatch(watcher, child, stat);
1251                 }
1252               }
1253             }
1254           } catch (KeeperException e) {
1255             server.abort("Unexpected ZK exception reading unassigned children", e);
1256           }
1257         }
1258       });
1259     }
1260   }
1261 
1262   /**
1263    * Marks the region as online.  Removes it from regions in transition and
1264    * updates the in-memory assignment information.
1265    * <p>
1266    * Used when a region has been successfully opened on a region server.
1267    * @param regionInfo
1268    * @param sn
1269    */
1270   void regionOnline(HRegionInfo regionInfo, ServerName sn) {
1271     numRegionsOpened.incrementAndGet();
1272     regionStates.regionOnline(regionInfo, sn);
1273 
1274     // Remove plan if one.
1275     clearRegionPlan(regionInfo);
1276     // Add the server to serversInUpdatingTimer
1277     addToServersInUpdatingTimer(sn);
1278   }
1279 
1280   /**
1281    * Pass the assignment event to a worker for processing.
1282    * Each worker is a single thread executor service.  The reason
1283    * for just one thread is to make sure all events for a given
1284    * region are processed in order.
1285    *
1286    * @param path
1287    */
1288   private void handleAssignmentEvent(final String path) {
1289     if (path.startsWith(watcher.assignmentZNode)) {
1290       final String regionName = ZKAssign.getRegionName(watcher, path);
1291 
1292       zkEventWorkersSubmit(new RegionRunnable() {
1293         @Override
1294         public String getRegionName() {
1295           return regionName;
1296         }
1297 
1298         @Override
1299         public void run() {
1300           try {
1301             Stat stat = new Stat();
1302             byte [] data = ZKAssign.getDataAndWatch(watcher, path, stat);
1303             if (data == null) return;
1304 
1305             RegionTransition rt = RegionTransition.parseFrom(data);
1306             handleRegion(rt, stat.getVersion());
1307           } catch (KeeperException e) {
1308             server.abort("Unexpected ZK exception reading unassigned node data", e);
1309           } catch (DeserializationException e) {
1310             server.abort("Unexpected exception deserializing node data", e);
1311           }
1312         }
1313       });
1314     }
1315   }
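       // The per-region ordering promised above comes from routing every event for a
       // given region to the same single-threaded worker, so a hash of the region name
       // picks the worker. A minimal sketch of that dispatch idea (names here are
       // illustrative only, not the actual zkEventWorkersSubmit implementation):
       //
       //   ExecutorService[] workers = ...; // each built via Executors.newSingleThreadExecutor()
       //   void submit(String regionName, Runnable task) {
       //     int idx = (regionName.hashCode() & Integer.MAX_VALUE) % workers.length;
       //     workers[idx].execute(task); // same region -> same worker -> events run in order
       //   }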
1316 
1317   /**
1318    * Add the server to the set serversInUpdatingTimer, then {@link TimerUpdater}
1319    * will update timers for this server in background
1320    * @param sn
1321    */
1322   private void addToServersInUpdatingTimer(final ServerName sn) {
1323     if (tomActivated){
1324       this.serversInUpdatingTimer.add(sn);
1325     }
1326   }
1327 
1328   /**
1329    * Touch timers for all regions in transition that have the passed
1330    * <code>sn</code> in common.
1331    * Call this method whenever a server checks in.  Doing so helps the case where
1332    * a new regionserver has joined the cluster and it's been given 1k regions to
1333    * open.  If this method is tickled every time a region reports a
1334    * successful open then the 1k-th region won't be timed out just because it's
1335    * sitting behind the open of 999 other regions.  This method is NOT used
1336    * as part of bulk assign -- there we have a different mechanism for extending
1337    * the regions-in-transition timer (we turn it off temporarily because
1338    * there is no region plan involved when bulk assigning).
1339    * @param sn the server that just checked in
1340    */
1341   private void updateTimers(final ServerName sn) {
1342     Preconditions.checkState(tomActivated);
1343     if (sn == null) return;
1344 
1345     // This loop could be expensive.
1346     // First make a copy of current regionPlan rather than hold sync while
1347     // looping because holding sync can cause deadlock.  It's OK in this loop
1348     // if the Map we're going against is a little stale.
1349     List<Map.Entry<String, RegionPlan>> rps;
1350     synchronized(this.regionPlans) {
1351       rps = new ArrayList<Map.Entry<String, RegionPlan>>(regionPlans.entrySet());
1352     }
1353 
1354     for (Map.Entry<String, RegionPlan> e : rps) {
1355       if (e.getValue() != null && e.getKey() != null && sn.equals(e.getValue().getDestination())) {
1356         RegionState regionState = regionStates.getRegionTransitionState(e.getKey());
1357         if (regionState != null) {
1358           regionState.updateTimestampToNow();
1359         }
1360       }
1361     }
1362   }
1363 
1364   /**
1365    * Marks the region as offline.  Removes it from regions in transition and
1366    * removes in-memory assignment information.
1367    * <p>
1368    * Used when a region has been closed and should remain closed.
1369    * @param regionInfo
1370    */
1371   public void regionOffline(final HRegionInfo regionInfo) {
1372     regionOffline(regionInfo, null);
1373   }
1374 
1375   public void offlineDisabledRegion(HRegionInfo regionInfo) {
1376     // Disabling so should not be reassigned, just delete the CLOSED node
1377     LOG.debug("Table being disabled so deleting ZK node and removing from " +
1378       "regions in transition, skipping assignment of region " +
1379         regionInfo.getRegionNameAsString());
1380     String encodedName = regionInfo.getEncodedName();
1381     deleteNodeInStates(encodedName, "closed", null,
1382       EventType.RS_ZK_REGION_CLOSED, EventType.M_ZK_REGION_OFFLINE);
1383     regionOffline(regionInfo);
1384   }
1385 
1386   // Assignment methods
1387 
1388   /**
1389    * Assigns the specified region.
1390    * <p>
1391    * If a RegionPlan is available with a valid destination then it will be used
1392    * to determine what server region is assigned to.  If no RegionPlan is
1393    * available, region will be assigned to a random available server.
1394    * <p>
1395    * Updates the RegionState and sends the OPEN RPC.
1396    * <p>
1397    * This will only succeed if the region is in transition and in a CLOSED or
1398    * OFFLINE state or not in transition (in-memory not zk), and of course, the
1399    * chosen server is up and running (It may have just crashed!).  If the
1400    * in-memory checks pass, the zk node is forced to OFFLINE before assigning.
1401    *
1402    * @param region region to be assigned
1403    * @param setOfflineInZK whether ZK node should be created/transitioned to an
1404    *                       OFFLINE state before assigning the region
1405    */
1406   public void assign(HRegionInfo region, boolean setOfflineInZK) {
1407     assign(region, setOfflineInZK, false);
1408   }
1409 
1410   /**
1411    * Use care with forceNewPlan. It could cause double assignment.
1412    */
1413   public void assign(HRegionInfo region,
1414       boolean setOfflineInZK, boolean forceNewPlan) {
1415     if (isDisabledorDisablingRegionInRIT(region)) {
1416       return;
1417     }
1418     if (this.serverManager.isClusterShutdown()) {
1419       LOG.info("Cluster shutdown is set; skipping assign of " +
1420         region.getRegionNameAsString());
1421       return;
1422     }
1423     String encodedName = region.getEncodedName();
1424     Lock lock = locker.acquireLock(encodedName);
1425     try {
1426       RegionState state = forceRegionStateToOffline(region, forceNewPlan);
1427       if (state != null) {
1428         if (regionStates.wasRegionOnDeadServer(encodedName)) {
1429           LOG.info("Skip assigning " + region.getRegionNameAsString()
1430             + ", its host " + regionStates.getLastRegionServerOfRegion(encodedName)
1431             + " is dead but not processed yet");
1432           return;
1433         }
1434         assign(state, setOfflineInZK, forceNewPlan);
1435       }
1436     } finally {
1437       lock.unlock();
1438     }
1439   }
1440 
1441   /**
1442    * Bulk assign regions to <code>destination</code>.
1443    * @param destination
1444    * @param regions Regions to assign.
1445    * @return true if successful
1446    */
1447   boolean assign(final ServerName destination, final List<HRegionInfo> regions) {
1448     long startTime = EnvironmentEdgeManager.currentTimeMillis();
1449     try {
1450       int regionCount = regions.size();
1451       if (regionCount == 0) {
1452         return true;
1453       }
1454       LOG.debug("Assigning " + regionCount + " region(s) to " + destination.toString());
1455       Set<String> encodedNames = new HashSet<String>(regionCount);
1456       for (HRegionInfo region : regions) {
1457         encodedNames.add(region.getEncodedName());
1458       }
1459 
1460       List<HRegionInfo> failedToOpenRegions = new ArrayList<HRegionInfo>();
1461       Map<String, Lock> locks = locker.acquireLocks(encodedNames);
1462       try {
1463         AtomicInteger counter = new AtomicInteger(0);
1464         Map<String, Integer> offlineNodesVersions = new ConcurrentHashMap<String, Integer>();
1465         OfflineCallback cb = new OfflineCallback(
1466           watcher, destination, counter, offlineNodesVersions);
1467         Map<String, RegionPlan> plans = new HashMap<String, RegionPlan>(regions.size());
1468         List<RegionState> states = new ArrayList<RegionState>(regions.size());
1469         for (HRegionInfo region : regions) {
1470           String encodedName = region.getEncodedName();
1471           if (!isDisabledorDisablingRegionInRIT(region)) {
1472             RegionState state = forceRegionStateToOffline(region, false);
1473             boolean onDeadServer = false;
1474             if (state != null) {
1475               if (regionStates.wasRegionOnDeadServer(encodedName)) {
1476                 LOG.info("Skip assigning " + region.getRegionNameAsString()
1477                   + ", its host " + regionStates.getLastRegionServerOfRegion(encodedName)
1478                   + " is dead but not processed yet");
1479                 onDeadServer = true;
1480               } else if (asyncSetOfflineInZooKeeper(state, cb, destination)) {
1481                 RegionPlan plan = new RegionPlan(region, state.getServerName(), destination);
1482                 plans.put(encodedName, plan);
1483                 states.add(state);
1484                 continue;
1485               }
1486             }
1487             // Reassign if the region wasn't on a dead server
1488             if (!onDeadServer) {
1489               LOG.info("failed to force region state to offline or "
1490                 + "failed to set it offline in ZK, will reassign later: " + region);
1491               failedToOpenRegions.add(region); // assign individually later
1492             }
1493           }
1494           // Release the lock, this region is excluded from bulk assign because
1495           // we can't update its state, or set its znode to offline.
1496           Lock lock = locks.remove(encodedName);
1497           lock.unlock();
1498         }
1499 
1500         // Wait until all unassigned nodes have been put up and watchers set.
1501         int total = states.size();
1502         for (int oldCounter = 0; !server.isStopped();) {
1503           int count = counter.get();
1504           if (oldCounter != count) {
1505             LOG.info(destination.toString() + " unassigned znodes=" + count +
1506               " of total=" + total);
1507             oldCounter = count;
1508           }
1509           if (count >= total) break;
1510           Threads.sleep(5);
1511         }
1512 
1513         if (server.isStopped()) {
1514           return false;
1515         }
1516 
1517         // Add region plans, so we can updateTimers when one region is opened so
1518         // that unnecessary timeout on RIT is reduced.
1519         this.addPlans(plans);
1520 
1521         List<Triple<HRegionInfo, Integer, List<ServerName>>> regionOpenInfos =
1522           new ArrayList<Triple<HRegionInfo, Integer, List<ServerName>>>(states.size());
1523         for (RegionState state: states) {
1524           HRegionInfo region = state.getRegion();
1525           String encodedRegionName = region.getEncodedName();
1526           Integer nodeVersion = offlineNodesVersions.get(encodedRegionName);
1527           if (nodeVersion == null || nodeVersion == -1) {
1528             LOG.warn("failed to offline in zookeeper: " + region);
1529             failedToOpenRegions.add(region); // assign individually later
1530             Lock lock = locks.remove(encodedRegionName);
1531             lock.unlock();
1532           } else {
1533             regionStates.updateRegionState(
1534               region, State.PENDING_OPEN, destination);
1535             List<ServerName> favoredNodes = ServerName.EMPTY_SERVER_LIST;
1536             if (this.shouldAssignRegionsWithFavoredNodes) {
1537               favoredNodes = ((FavoredNodeLoadBalancer)this.balancer).getFavoredNodes(region);
1538             }
1539             regionOpenInfos.add(new Triple<HRegionInfo, Integer,  List<ServerName>>(
1540               region, nodeVersion, favoredNodes));
1541           }
1542         }
1543 
1544         // Move on to open regions.
1545         try {
1546           // Send OPEN RPC. If it fails on an IOE or RemoteException,
1547           // regions will be assigned individually.
1548           long maxWaitTime = System.currentTimeMillis() +
1549             this.server.getConfiguration().
1550               getLong("hbase.regionserver.rpc.startup.waittime", 60000);
1551           for (int i = 1; i <= maximumAttempts && !server.isStopped(); i++) {
1552             try {
1553               List<RegionOpeningState> regionOpeningStateList = serverManager
1554                 .sendRegionOpen(destination, regionOpenInfos);
1555               if (regionOpeningStateList == null) {
1556                 // Failed getting RPC connection to this server
1557                 return false;
1558               }
1559               for (int k = 0, n = regionOpeningStateList.size(); k < n; k++) {
1560                 RegionOpeningState openingState = regionOpeningStateList.get(k);
1561                 if (openingState != RegionOpeningState.OPENED) {
1562                   HRegionInfo region = regionOpenInfos.get(k).getFirst();
1563                   if (openingState == RegionOpeningState.ALREADY_OPENED) {
1564                     processAlreadyOpenedRegion(region, destination);
1565                   } else if (openingState == RegionOpeningState.FAILED_OPENING) {
1566                     // Failed opening this region, reassign it later
1567                     failedToOpenRegions.add(region);
1568                   } else {
1569                     LOG.warn("THIS SHOULD NOT HAPPEN: unknown opening state "
1570                       + openingState + " in assigning region " + region);
1571                   }
1572                 }
1573               }
1574               break;
1575             } catch (IOException e) {
1576               if (e instanceof RemoteException) {
1577                 e = ((RemoteException)e).unwrapRemoteException();
1578               }
1579               if (e instanceof RegionServerStoppedException) {
1580                 LOG.warn("The region server was shut down, ", e);
1581                 // No need to retry, the region server is a goner.
1582                 return false;
1583               } else if (e instanceof ServerNotRunningYetException) {
1584                 long now = System.currentTimeMillis();
1585                 if (now < maxWaitTime) {
1586                   LOG.debug("Server is not yet up; waiting up to " +
1587                     (maxWaitTime - now) + "ms", e);
1588                   Thread.sleep(100);
1589                   i--; // reset the try count
1590                   continue;
1591                 }
1592               } else if (e instanceof java.net.SocketTimeoutException
1593                   && this.serverManager.isServerOnline(destination)) {
1594                 // In case socket is timed out and the region server is still online,
1595                 // the openRegion RPC could have been accepted by the server and
1596                 // just the response didn't go through.  So we will retry to
1597                 // open the region on the same server.
1598                 if (LOG.isDebugEnabled()) {
1599                   LOG.debug("Bulk assigner openRegion() to " + destination
1600                     + " has timed out, but the regions might"
1601                     + " already be opened on it.", e);
1602                 }
1603                 continue;
1604               }
1605               throw e;
1606             }
1607           }
1608         } catch (IOException e) {
1609           // Can be a socket timeout, EOF, NoRouteToHost, etc
1610           LOG.info("Unable to communicate with " + destination
1611             + " in order to assign regions, ", e);
1612           return false;
1613         } catch (InterruptedException e) {
1614           throw new RuntimeException(e);
1615         }
1616       } finally {
1617         for (Lock lock : locks.values()) {
1618           lock.unlock();
1619         }
1620       }
1621 
1622       if (!failedToOpenRegions.isEmpty()) {
1623         for (HRegionInfo region : failedToOpenRegions) {
1624           if (!regionStates.isRegionOnline(region)) {
1625             invokeAssign(region);
1626           }
1627         }
1628       }
1629       LOG.debug("Bulk assigning done for " + destination);
1630       return true;
1631     } finally {
1632       metricsAssignmentManager.updateBulkAssignTime(EnvironmentEdgeManager.currentTimeMillis() - startTime);
1633     }
1634   }
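       // The "wait until all unassigned nodes have been put up" step above is a simple
       // callback-counter barrier: every asynchronous OFFLINE znode creation bumps the
       // shared AtomicInteger through OfflineCallback, and the assigner polls until the
       // count reaches the number of regions it queued. A reduced sketch of the idea
       // (illustrative only, not the actual callback wiring):
       //
       //   AtomicInteger counter = new AtomicInteger(0);
       //   // each async create is passed a callback that ends with counter.incrementAndGet()
       //   while (!server.isStopped() && counter.get() < states.size()) {
       //     Threads.sleep(5);
       //   }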
1635 
1636   /**
1637    * Send CLOSE RPC if the server is online, otherwise, offline the region.
1638    *
1639    * The RPC will be sent only to the region server found in the region state
1640    * if one is passed in; otherwise, to the specified src server. If no region
1641    * state is specified, we don't update the region state at all; instead
1642    * we just send the RPC call. This is useful for some cleanup without
1643    * messing with the region states (see handleRegion, the "region opened
1644    * on an unexpected server" scenario, for an example).
1645    */
1646   private void unassign(final HRegionInfo region,
1647       final RegionState state, final int versionOfClosingNode,
1648       final ServerName dest, final boolean transitionInZK,
1649       final ServerName src) {
1650     ServerName server = src;
1651     if (state != null) {
1652       server = state.getServerName();
1653     }
1654     long maxWaitTime = -1;
1655     for (int i = 1; i <= this.maximumAttempts; i++) {
1656       if (this.server.isStopped() || this.server.isAborted()) {
1657         LOG.debug("Server stopped/aborted; skipping unassign of " + region);
1658         return;
1659       }
1660       // ClosedRegionHandler can remove the server from this.regions
1661       if (!serverManager.isServerOnline(server)) {
1662         LOG.debug("Offline " + region.getRegionNameAsString()
1663           + ", no need to unassign since it's on a dead server: " + server);
1664         if (transitionInZK) {
1665           // delete the node. if no node exists need not bother.
1666           deleteClosingOrClosedNode(region, server);
1667         }
1668         if (state != null) {
1669           regionOffline(region);
1670         }
1671         return;
1672       }
1673       try {
1674         // Send CLOSE RPC
1675         if (serverManager.sendRegionClose(server, region,
1676           versionOfClosingNode, dest, transitionInZK)) {
1677           LOG.debug("Sent CLOSE to " + server + " for region " +
1678             region.getRegionNameAsString());
1679           if (!transitionInZK && state != null) {
1680             // Retry to make sure the region is
1681             // closed so as to avoid double assignment.
1682             unassign(region, state, versionOfClosingNode,
1683               dest, transitionInZK,src);
1684           }
1685           return;
1686         }
1687         // This rarely happens; regionserver close used to always return true.
1688         // TODO: this can now happen (0.96) if there is an exception in a coprocessor
1689         LOG.warn("Server " + server + " region CLOSE RPC returned false for " +
1690           region.getRegionNameAsString());
1691       } catch (Throwable t) {
1692         if (t instanceof RemoteException) {
1693           t = ((RemoteException)t).unwrapRemoteException();
1694         }
1695         if (t instanceof NotServingRegionException
1696             || t instanceof RegionServerStoppedException
1697             || t instanceof ServerNotRunningYetException
1698             || t instanceof FailedServerException) {
1699           LOG.debug("Offline " + region.getRegionNameAsString()
1700             + ", it is no longer on " + server, t);
1701           if (transitionInZK) {
1702             deleteClosingOrClosedNode(region, server);
1703           }
1704           if (state != null) {
1705             regionOffline(region);
1706           }
1707           return;
1708         } else if (state != null
1709             && t instanceof RegionAlreadyInTransitionException) {
1710           // RS is already processing this region, only need to update the timestamp
1711           LOG.debug("Update the timestamp for " + state);
1712           state.updateTimestampToNow();
1713           if (maxWaitTime < 0) {
1714             maxWaitTime = EnvironmentEdgeManager.currentTimeMillis()
1715               + this.server.getConfiguration().getLong(ALREADY_IN_TRANSITION_WAITTIME,
1716                 DEFAULT_ALREADY_IN_TRANSITION_WAITTIME);
1717           }
1718           try {
1719             long now = EnvironmentEdgeManager.currentTimeMillis();
1720             if (now < maxWaitTime) {
1721               LOG.debug("Region is already in transition; "
1722                 + "waiting up to " + (maxWaitTime - now) + "ms", t);
1723               Thread.sleep(100);
1724               i--; // reset the try count
1725             }
1726           } catch (InterruptedException ie) {
1727             LOG.warn("Failed to unassign "
1728               + region.getRegionNameAsString() + " since interrupted", ie);
1729             Thread.currentThread().interrupt();
1730             if (!tomActivated) {
1731               regionStates.updateRegionState(region, State.FAILED_CLOSE);
1732             }
1733             return;
1734           }
1735         } else {
1736           LOG.info("Server " + server + " returned " + t + " for "
1737             + region.getRegionNameAsString() + ", try=" + i
1738             + " of " + this.maximumAttempts, t);
1739           // Presume retry or server will expire.
1740         }
1741       }
1742     }
1743     // Run out of attempts
1744     if (!tomActivated && state != null) {
1745       regionStates.updateRegionState(region, State.FAILED_CLOSE);
1746     }
1747   }
1748 
1749   /**
1750    * Set region to OFFLINE unless it is opening and forceNewPlan is false.
1751    */
1752   private RegionState forceRegionStateToOffline(
1753       final HRegionInfo region, final boolean forceNewPlan) {
1754     RegionState state = regionStates.getRegionState(region);
1755     if (state == null) {
1756       LOG.warn("Assigning a region not in region states: " + region);
1757       state = regionStates.createRegionState(region);
1758     }
1759 
1760     ServerName sn = state.getServerName();
1761     if (forceNewPlan && LOG.isDebugEnabled()) {
1762       LOG.debug("Force region state offline " + state);
1763     }
1764 
1765     switch (state.getState()) {
1766     case OPEN:
1767     case OPENING:
1768     case PENDING_OPEN:
1769     case CLOSING:
1770     case PENDING_CLOSE:
1771       if (!forceNewPlan) {
1772         LOG.debug("Skip assigning " +
1773           region + ", it is already " + state);
1774         return null;
1775       }
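           // fall through: with forceNewPlan, these in-transition states are force-closed below too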
1776     case FAILED_CLOSE:
1777     case FAILED_OPEN:
1778       unassign(region, state, -1, null, false, null);
1779       state = regionStates.getRegionState(region);
1780       if (state.isFailedClose()) {
1781         // If we can't close the region, we can't re-assign
1782         // it so as to avoid possible double assignment/data loss.
1783         LOG.info("Skip assigning " +
1784           region + ", we couldn't close it: " + state);
1785         return null;
1786       }
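           // fall through: the region should be closed now; next make sure its last host
           // is not a dead-but-unprocessed server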
1787     case OFFLINE:
1788       // This region could have been open on this server
1789       // for a while. If the server is dead and not processed
1790       // yet, we can move on only if meta shows the region is
1791       // actually not on this server, or is on a server that is
1792       // not dead, or is dead but already processed.
1793       if (regionStates.isServerDeadAndNotProcessed(sn)
1794           && wasRegionOnDeadServerByMeta(region, sn)) {
1795         LOG.info("Skip assigning " + region.getRegionNameAsString()
1796           + ", it is on a dead but not processed yet server");
1797         return null;
1798       }
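           // fall through to CLOSED: the region is assignable, so break out and return the state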
1799     case CLOSED:
1800       break;
1801     default:
1802       LOG.error("Trying to assign region " + region
1803         + ", which is " + state);
1804       return null;
1805     }
1806     return state;
1807   }
1808 
1809   private boolean wasRegionOnDeadServerByMeta(
1810       final HRegionInfo region, final ServerName sn) {
1811     try {
1812       if (region.isMetaRegion()) {
1813         ServerName server = catalogTracker.getMetaLocation();
1814         return regionStates.isServerDeadAndNotProcessed(server);
1815       }
1816       while (!server.isStopped()) {
1817         try {
1818           catalogTracker.waitForMeta();
1819           Pair<HRegionInfo, ServerName> r =
1820             MetaReader.getRegion(catalogTracker, region.getRegionName());
1821           ServerName server = r == null ? null : r.getSecond();
1822           return regionStates.isServerDeadAndNotProcessed(server);
1823         } catch (IOException ioe) {
1824           LOG.info("Received exception accessing hbase:meta during force assign "
1825             + region.getRegionNameAsString() + ", retrying", ioe);
1826         }
1827       }
1828     } catch (InterruptedException e) {
1829       Thread.currentThread().interrupt();
1830       LOG.info("Interrupted accessing hbase:meta", e);
1831     }
1832     // Call is interrupted or server is stopped.
1833     return regionStates.isServerDeadAndNotProcessed(sn);
1834   }
1835 
1836   /**
1837    * Caller must hold lock on the passed <code>state</code> object.
1838    * @param state
1839    * @param setOfflineInZK
1840    * @param forceNewPlan
1841    */
1842   private void assign(RegionState state,
1843       final boolean setOfflineInZK, final boolean forceNewPlan) {
1844     long startTime = EnvironmentEdgeManager.currentTimeMillis();
1845     try {
1846       RegionState currentState = state;
1847       int versionOfOfflineNode = -1;
1848       RegionPlan plan = null;
1849       long maxWaitTime = -1;
1850       HRegionInfo region = state.getRegion();
1851       RegionOpeningState regionOpenState;
1852       for (int i = 1; i <= maximumAttempts; i++) {
1853         if (server.isStopped() || server.isAborted()) {
1854           LOG.info("Skip assigning " + region.getRegionNameAsString()
1855             + ", the server is stopped/aborted");
1856           return;
1857         }
1858         if (plan == null) { // Get a server for the region at first
1859           try {
1860             plan = getRegionPlan(region, forceNewPlan);
1861           } catch (HBaseIOException e) {
1862             LOG.warn("Failed to get region plan", e);
1863           }
1864         }
1865         if (plan == null) {
1866           LOG.warn("Unable to determine a plan to assign " + region);
1867           if (tomActivated){
1868             this.timeoutMonitor.setAllRegionServersOffline(true);
1869           } else {
1870             if (region.isMetaRegion()) {
1871               try {
1872                 Thread.sleep(this.sleepTimeBeforeRetryingMetaAssignment);
1873                 if (i == maximumAttempts) i = 1;
1874                 continue;
1875               } catch (InterruptedException e) {
1876                 LOG.error("Got exception while waiting for hbase:meta assignment");
1877                 Thread.currentThread().interrupt();
1878               }
1879             }
1880             regionStates.updateRegionState(region, State.FAILED_OPEN);
1881           }
1882           return;
1883         }
1884         if (setOfflineInZK && versionOfOfflineNode == -1) {
1885           // get the version of the znode after setting it to OFFLINE.
1886           // versionOfOfflineNode will be -1 if the znode was not set to OFFLINE
1887           versionOfOfflineNode = setOfflineInZooKeeper(currentState, plan.getDestination());
1888           if (versionOfOfflineNode != -1) {
1889             if (isDisabledorDisablingRegionInRIT(region)) {
1890               return;
1891             }
1892             // If the assignment comes from EnableTableHandler, the table state is ENABLING.
1893             // EnableTableHandler will set it to ENABLED after assigning all the table regions.
1894             // If we set ENABLED directly here, the client API may think the table is already enabled.
1895             // However, when all the regions are added directly into hbase:meta and assignRegion is
1896             // called, we do need to make the table ENABLED; in that case the table will be in
1897             // neither ENABLING nor ENABLED state, so we set it below.
1898             TableName tableName = region.getTable();
1899             if (!zkTable.isEnablingTable(tableName) && !zkTable.isEnabledTable(tableName)) {
1900               LOG.debug("Setting table " + tableName + " to ENABLED state.");
1901               setEnabledTable(tableName);
1902             }
1903           }
1904         }
1905         if (setOfflineInZK && versionOfOfflineNode == -1) {
1906           LOG.info("Unable to set offline in ZooKeeper to assign " + region);
1907           // Setting offline in ZK must have failed due to ZK racing or some
1908           // exception which may cause the server to abort. If it is ZK racing,
1909           // we should retry: since we already reset the region state, any
1910           // existing (re)assignment will fail anyway.
1911           if (!server.isAborted()) {
1912             continue;
1913           }
1914         }
1915         LOG.info("Assigning " + region.getRegionNameAsString() +
1916             " to " + plan.getDestination().toString());
1917         // Transition RegionState to PENDING_OPEN
1918         currentState = regionStates.updateRegionState(region,
1919           State.PENDING_OPEN, plan.getDestination());
1920 
1921         boolean needNewPlan;
1922         final String assignMsg = "Failed assignment of " + region.getRegionNameAsString() +
1923             " to " + plan.getDestination();
1924         try {
1925           List<ServerName> favoredNodes = ServerName.EMPTY_SERVER_LIST;
1926           if (this.shouldAssignRegionsWithFavoredNodes) {
1927             favoredNodes = ((FavoredNodeLoadBalancer)this.balancer).getFavoredNodes(region);
1928           }
1929           regionOpenState = serverManager.sendRegionOpen(
1930               plan.getDestination(), region, versionOfOfflineNode, favoredNodes);
1931 
1932           if (regionOpenState == RegionOpeningState.FAILED_OPENING) {
1933             // Failed opening this region, looping again on a new server.
1934             needNewPlan = true;
1935             LOG.warn(assignMsg + ", regionserver says 'FAILED_OPENING', " +
1936                 " trying to assign elsewhere instead; " +
1937                 "try=" + i + " of " + this.maximumAttempts);
1938           } else {
1939             // we're done
1940             if (regionOpenState == RegionOpeningState.ALREADY_OPENED) {
1941               processAlreadyOpenedRegion(region, plan.getDestination());
1942             }
1943             return;
1944           }
1945 
1946         } catch (Throwable t) {
1947           if (t instanceof RemoteException) {
1948             t = ((RemoteException) t).unwrapRemoteException();
1949           }
1950 
1951           // Should we wait a little before retrying? If the server is starting, yes.
1952           // If the region is already in transition, yes as well: we want to be sure that
1953           //  the region will get opened but we don't want a double assignment.
1954           boolean hold = (t instanceof RegionAlreadyInTransitionException ||
1955               t instanceof ServerNotRunningYetException);
1956 
1957           // In case socket is timed out and the region server is still online,
1958           // the openRegion RPC could have been accepted by the server and
1959           // just the response didn't go through.  So we will retry to
1960           // open the region on the same server to avoid possible
1961           // double assignment.
1962           boolean retry = !hold && (t instanceof java.net.SocketTimeoutException
1963               && this.serverManager.isServerOnline(plan.getDestination()));
1964 
1965 
1966           if (hold) {
1967             LOG.warn(assignMsg + ", waiting a little before trying on the same region server " +
1968               "try=" + i + " of " + this.maximumAttempts, t);
1969 
1970             if (maxWaitTime < 0) {
1971               if (t instanceof RegionAlreadyInTransitionException) {
1972                 maxWaitTime = EnvironmentEdgeManager.currentTimeMillis()
1973                   + this.server.getConfiguration().getLong(ALREADY_IN_TRANSITION_WAITTIME,
1974                     DEFAULT_ALREADY_IN_TRANSITION_WAITTIME);
1975               } else {
1976                 maxWaitTime = EnvironmentEdgeManager.currentTimeMillis() + this.server
1977                   .getConfiguration().getLong("hbase.regionserver.rpc.startup.waittime", 60000);
1978               }
1979             }
1980             try {
1981               needNewPlan = false;
1982               long now = EnvironmentEdgeManager.currentTimeMillis();
1983               if (now < maxWaitTime) {
1984                 LOG.debug("Server is not yet up or region is already in transition; "
1985                   + "waiting up to " + (maxWaitTime - now) + "ms", t);
1986                 Thread.sleep(100);
1987                 i--; // reset the try count
1988               } else if (!(t instanceof RegionAlreadyInTransitionException)) {
1989                 LOG.debug("Server is not up for a while; try a new one", t);
1990                 needNewPlan = true;
1991               }
1992             } catch (InterruptedException ie) {
1993               LOG.warn("Failed to assign "
1994                   + region.getRegionNameAsString() + " since interrupted", ie);
1995               Thread.currentThread().interrupt();
1996               if (!tomActivated) {
1997                 regionStates.updateRegionState(region, State.FAILED_OPEN);
1998               }
1999               return;
2000             }
2001           } else if (retry) {
2002             needNewPlan = false;
2003             LOG.warn(assignMsg + ", trying to assign to the same region server " +
2004                 "try=" + i + " of " + this.maximumAttempts, t);
2005           } else {
2006             needNewPlan = true;
2007             LOG.warn(assignMsg + ", trying to assign elsewhere instead;" +
2008                 " try=" + i + " of " + this.maximumAttempts, t);
2009           }
2010         }
2011 
2012         if (i == this.maximumAttempts) {
2013           // Don't reset the region state or get a new plan any more.
2014           // This is the last try.
2015           continue;
2016         }
2017 
2018         // If region opened on destination of present plan, reassigning to new
2019         // RS may cause double assignments. In case of RegionAlreadyInTransitionException
2020         // reassigning to same RS.
2021         if (needNewPlan) {
2022           // Force a new plan and reassign. Will return null if no servers.
2023           // The new plan could be the same as the existing plan since we don't
2024           // exclude the server of the original plan, which should not be
2025           // excluded since it could be the only server up now.
2026           RegionPlan newPlan = null;
2027           try {
2028             newPlan = getRegionPlan(region, true);
2029           } catch (HBaseIOException e) {
2030             LOG.warn("Failed to get region plan", e);
2031           }
2032           if (newPlan == null) {
2033             if (tomActivated) {
2034               this.timeoutMonitor.setAllRegionServersOffline(true);
2035             } else {
2036               regionStates.updateRegionState(region, State.FAILED_OPEN);
2037             }
2038             LOG.warn("Unable to find a viable location to assign region " +
2039                 region.getRegionNameAsString());
2040             return;
2041           }
2042 
2043           if (plan != newPlan && !plan.getDestination().equals(newPlan.getDestination())) {
2044             // Clean out the plan we failed to execute and one that doesn't look like
2045             // it'll succeed anyway; we need a new plan!
2046             // Transition back to OFFLINE
2047             currentState = regionStates.updateRegionState(region, State.OFFLINE);
2048             versionOfOfflineNode = -1;
2049             plan = newPlan;
2050           }
2051         }
2052       }
2053       // Run out of attempts
2054       if (!tomActivated) {
2055         regionStates.updateRegionState(region, State.FAILED_OPEN);
2056       }
2057     } finally {
2058       metricsAssignmentManager.updateAssignmentTime(EnvironmentEdgeManager.currentTimeMillis() - startTime);
2059     }
2060   }
2061 
2062   private void processAlreadyOpenedRegion(HRegionInfo region, ServerName sn) {
2063     // Remove region from in-memory transition and unassigned node from ZK
2064     // While trying to enable the table the regions of the table were
2065     // already enabled.
2066     LOG.debug("ALREADY_OPENED " + region.getRegionNameAsString()
2067       + " to " + sn);
2068     String encodedName = region.getEncodedName();
2069     deleteNodeInStates(encodedName, "offline", sn, EventType.M_ZK_REGION_OFFLINE);
2070     regionStates.regionOnline(region, sn);
2071   }
2072 
2073   private boolean isDisabledorDisablingRegionInRIT(final HRegionInfo region) {
2074     TableName tableName = region.getTable();
2075     boolean disabled = this.zkTable.isDisabledTable(tableName);
2076     if (disabled || this.zkTable.isDisablingTable(tableName)) {
2077       LOG.info("Table " + tableName + (disabled ? " disabled;" : " disabling;") +
2078         " skipping assign of " + region.getRegionNameAsString());
2079       offlineDisabledRegion(region);
2080       return true;
2081     }
2082     return false;
2083   }
2084 
2085   /**
2086    * Set region as OFFLINED up in zookeeper
2087    *
2088    * @param state
2089    * @return the version of the offline node if setting of the OFFLINE node was
2090    *         successful, -1 otherwise.
2091    */
2092   private int setOfflineInZooKeeper(final RegionState state, final ServerName destination) {
2093     if (!state.isClosed() && !state.isOffline()) {
2094       String msg = "Unexpected state : " + state + " .. Cannot transit it to OFFLINE.";
2095       this.server.abort(msg, new IllegalStateException(msg));
2096       return -1;
2097     }
2098     regionStates.updateRegionState(state.getRegion(), State.OFFLINE);
2099     int versionOfOfflineNode;
2100     try {
2101       // get the version after setting the znode to OFFLINE
2102       versionOfOfflineNode = ZKAssign.createOrForceNodeOffline(watcher,
2103         state.getRegion(), destination);
2104       if (versionOfOfflineNode == -1) {
2105         LOG.warn("Attempted to create/force node into OFFLINE state before "
2106             + "completing assignment but failed to do so for " + state);
2107         return -1;
2108       }
2109     } catch (KeeperException e) {
2110       server.abort("Unexpected ZK exception creating/setting node OFFLINE", e);
2111       return -1;
2112     }
2113     return versionOfOfflineNode;
2114   }
2115 
2116   /**
2117    * @param region the region to assign
2118    * @return Plan for passed <code>region</code> (If none currently, it creates one or
2119    * if no servers to assign, it returns null).
2120    */
2121   private RegionPlan getRegionPlan(final HRegionInfo region,
2122       final boolean forceNewPlan)  throws HBaseIOException {
2123     return getRegionPlan(region, null, forceNewPlan);
2124   }
2125 
2126   /**
2127    * @param region the region to assign
2128    * @param serverToExclude Server to exclude (we know it's bad). Pass null if
2129    * all servers are thought to be assignable.
2130    * @param forceNewPlan If true, then if an existing plan exists, a new plan
2131    * will be generated.
2132    * @return Plan for passed <code>region</code> (If none currently, it creates one or
2133    * if no servers to assign, it returns null).
2134    */
2135   private RegionPlan getRegionPlan(final HRegionInfo region,
2136       final ServerName serverToExclude, final boolean forceNewPlan) throws HBaseIOException {
2137     // Pickup existing plan or make a new one
2138     final String encodedName = region.getEncodedName();
2139     final List<ServerName> destServers =
2140       serverManager.createDestinationServersList(serverToExclude);
2141 
2142     if (destServers.isEmpty()){
2143       LOG.warn("Can't move " + encodedName +
2144         ", there is no destination server available.");
2145       return null;
2146     }
2147 
2148     RegionPlan randomPlan = null;
2149     boolean newPlan = false;
2150     RegionPlan existingPlan;
2151 
2152     synchronized (this.regionPlans) {
2153       existingPlan = this.regionPlans.get(encodedName);
2154 
2155       if (existingPlan != null && existingPlan.getDestination() != null) {
2156         LOG.debug("Found an existing plan for " + region.getRegionNameAsString()
2157           + "; destination server is " + existingPlan.getDestination()
2158           + "; accepted as a dest server = " + destServers.contains(existingPlan.getDestination()));
2159       }
2160 
2161       if (forceNewPlan
2162           || existingPlan == null
2163           || existingPlan.getDestination() == null
2164           || !destServers.contains(existingPlan.getDestination())) {
2165         newPlan = true;
2166         randomPlan = new RegionPlan(region, null,
2167             balancer.randomAssignment(region, destServers));
2168         if (!region.isMetaTable() && shouldAssignRegionsWithFavoredNodes) {
2169           List<HRegionInfo> regions = new ArrayList<HRegionInfo>(1);
2170           regions.add(region);
2171           try {
2172             processFavoredNodes(regions);
2173           } catch (IOException ie) {
2174             LOG.warn("Ignoring exception in processFavoredNodes " + ie);
2175           }
2176         }
2177         this.regionPlans.put(encodedName, randomPlan);
2178       }
2179     }
2180 
2181     if (newPlan) {
2182       if (randomPlan.getDestination() == null) {
2183         LOG.warn("Can't find a destination for " + encodedName);
2184         return null;
2185       }
2186       LOG.debug("No previous transition plan found (or ignoring " +
2187         "an existing plan) for " + region.getRegionNameAsString() +
2188         "; generated random plan=" + randomPlan + "; " +
2189         serverManager.countOfRegionServers() +
2190         " (online=" + serverManager.getOnlineServers().size() +
2191         ", available=" + destServers.size() + ") available servers" +
2192         ", forceNewPlan=" + forceNewPlan);
2193       return randomPlan;
2194     }
2195     LOG.debug("Using pre-existing plan for " +
2196       region.getRegionNameAsString() + "; plan=" + existingPlan);
2197     return existingPlan;
2198   }
2199 
2200   /**
2201    * Unassigns the specified region.
2202    * <p>
2203    * Updates the RegionState and sends the CLOSE RPC unless region is being
2204    * split by regionserver; then the unassign fails (silently) because we
2205    * presume the region being unassigned no longer exists (it's been split out
2206    * of existence). TODO: What to do if split fails and is rolled back and
2207    * parent is revivified?
2208    * <p>
2209    * If a RegionPlan is already set, it will remain.
2210    *
2211    * @param region region to be unassigned
2212    */
2213   public void unassign(HRegionInfo region) {
2214     unassign(region, false);
2215   }
2216 
2217 
2218   /**
2219    * Unassigns the specified region.
2220    * <p>
2221    * Updates the RegionState and sends the CLOSE RPC unless region is being
2222    * split by regionserver; then the unassign fails (silently) because we
2223    * presume the region being unassigned no longer exists (it's been split out
2224    * of existence). TODO: What to do if split fails and is rolled back and
2225    * parent is revivified?
2226    * <p>
2227    * If a RegionPlan is already set, it will remain.
2228    *
2229    * @param region region to be unassigned
2230    * @param force if region should be closed even if already closing
2231    */
2232   public void unassign(HRegionInfo region, boolean force, ServerName dest) {
2233     // TODO: Method needs refactoring.  Ugly buried returns throughout.  Beware!
2234     LOG.debug("Starting unassign of " + region.getRegionNameAsString()
2235       + " (offlining), current state: " + regionStates.getRegionState(region));
2236 
2237     String encodedName = region.getEncodedName();
2238     // Grab the state of this region and synchronize on it
2239     int versionOfClosingNode = -1;
2240     // We need a lock here as we're going to do a put later and we don't want multiple states
2241     //  creation
2242     ReentrantLock lock = locker.acquireLock(encodedName);
2243     RegionState state = regionStates.getRegionTransitionState(encodedName);
2244     boolean reassign = true;
2245     try {
2246       if (state == null) {
2247         // Region is not in transition.
2248         // We can unassign it only if it's not SPLIT/MERGED.
2249         state = regionStates.getRegionState(encodedName);
2250         if (state != null && state.isUnassignable()) {
2251           LOG.info("Attempting to unassign " + state + ", ignored");
2252           // Offline region will be reassigned below
2253           return;
2254         }
2255         // Create the znode in CLOSING state
2256         try {
2257           if (state == null || state.getServerName() == null) {
2258             // We don't know where the region is, offline it.
2259             // No need to send CLOSE RPC
2260             LOG.warn("Attempting to unassign a region not in RegionStates: "
2261               + region.getRegionNameAsString() + ", offlined");
2262             regionOffline(region);
2263             return;
2264           }
2265           versionOfClosingNode = ZKAssign.createNodeClosing(
2266             watcher, region, state.getServerName());
2267           if (versionOfClosingNode == -1) {
2268             LOG.info("Attempting to unassign " +
2269               region.getRegionNameAsString() + " but ZK closing node "
2270               + "can't be created.");
2271             reassign = false; // not unassigned at all
2272             return;
2273           }
2274         } catch (KeeperException e) {
2275           if (e instanceof NodeExistsException) {
2276             // Handle race between master initiated close and regionserver
2277             // orchestrated splitting. See if existing node is in a
2278             // SPLITTING or SPLIT state.  If so, the regionserver started
2279             // an op on node before we could get our CLOSING in.  Deal.
2280             NodeExistsException nee = (NodeExistsException)e;
2281             String path = nee.getPath();
2282             try {
2283               if (isSplitOrSplittingOrMergedOrMerging(path)) {
2284                 LOG.debug(path + " is SPLIT or SPLITTING or MERGED or MERGING; " +
2285                   "skipping unassign because region no longer exists -- it's split or merged");
2286                 reassign = false; // no need to reassign for split/merged region
2287                 return;
2288               }
2289             } catch (KeeperException.NoNodeException ke) {
2290               LOG.warn("Failed getData on SPLITTING/SPLIT at " + path +
2291                 "; presuming split and that the region to unassign, " +
2292                 encodedName + ", no longer exists -- confirm", ke);
2293               return;
2294             } catch (KeeperException ke) {
2295               LOG.error("Unexpected zk state", ke);
2296             } catch (DeserializationException de) {
2297               LOG.error("Failed parse", de);
2298             }
2299           }
2300           // If we get here, we don't understand what's going on -- abort.
2301           server.abort("Unexpected ZK exception creating node CLOSING", e);
2302           reassign = false; // heading out already
2303           return;
2304         }
2305         state = regionStates.updateRegionState(region, State.PENDING_CLOSE);
2306       } else if (state.isFailedOpen()) {
2307         // The region is not open yet
2308         regionOffline(region);
2309         return;
2310       } else if (force && state.isPendingCloseOrClosing()) {
2311         LOG.debug("Attempting to unassign " + region.getRegionNameAsString() +
2312           " which is already " + state.getState()  +
2313           " but forcing to send a CLOSE RPC again ");
2314         if (state.isFailedClose()) {
2315           state = regionStates.updateRegionState(region, State.PENDING_CLOSE);
2316         }
2317         state.updateTimestampToNow();
2318       } else {
2319         LOG.debug("Attempting to unassign " +
2320           region.getRegionNameAsString() + " but it is " +
2321           "already in transition (" + state.getState() + ", force=" + force + ")");
2322         return;
2323       }
2324 
2325       unassign(region, state, versionOfClosingNode, dest, true, null);
2326     } finally {
2327       lock.unlock();
2328 
2329       // Region is expected to be reassigned afterwards
2330       if (reassign && regionStates.isRegionOffline(region)) {
2331         assign(region, true);
2332       }
2333     }
2334   }
2335 
2336   public void unassign(HRegionInfo region, boolean force){
2337      unassign(region, force, null);
2338   }
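       // Moving a region is, in essence, "record a plan, then unassign": a RegionPlan with
       // the desired destination is put into regionPlans (under its monitor), and the
       // re-assignment that follows the close picks that plan up in getRegionPlan(). A
       // reduced sketch of the flow (illustrative only):
       //
       //   synchronized (regionPlans) {
       //     regionPlans.put(hri.getEncodedName(), new RegionPlan(hri, srcServer, destServer));
       //   }
       //   unassign(hri); // CLOSE is sent; once offline, the next assign reuses the pending plan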
2339 
2340   /**
2341    * @param region regioninfo of znode to be deleted.
2342    */
2343   public void deleteClosingOrClosedNode(HRegionInfo region, ServerName sn) {
2344     String encodedName = region.getEncodedName();
2345     deleteNodeInStates(encodedName, "closing", sn, EventType.M_ZK_REGION_CLOSING,
2346       EventType.RS_ZK_REGION_CLOSED);
2347   }
2348 
2349   /**
2350    * @param path
2351    * @return True if znode is in SPLIT or SPLITTING or MERGED or MERGING state.
2352    * @throws KeeperException Can happen if the znode went away in the meantime.
2353    * @throws DeserializationException if the znode data cannot be parsed
2354    */
2355   private boolean isSplitOrSplittingOrMergedOrMerging(final String path)
2356       throws KeeperException, DeserializationException {
2357     boolean result = false;
2358     // This may fail if the SPLIT or SPLITTING or MERGED or MERGING znode gets
2359     // cleaned up before we can get data from it.
2360     byte [] data = ZKAssign.getData(watcher, path);
2361     if (data == null) {
2362       LOG.info("Node " + path + " is gone");
2363       return false;
2364     }
2365     RegionTransition rt = RegionTransition.parseFrom(data);
2366     switch (rt.getEventType()) {
2367     case RS_ZK_REQUEST_REGION_SPLIT:
2368     case RS_ZK_REGION_SPLIT:
2369     case RS_ZK_REGION_SPLITTING:
2370     case RS_ZK_REQUEST_REGION_MERGE:
2371     case RS_ZK_REGION_MERGED:
2372     case RS_ZK_REGION_MERGING:
2373       result = true;
2374       break;
2375     default:
2376       LOG.info("Node " + path + " is in " + rt.getEventType());
2377       break;
2378     }
2379     return result;
2380   }
2381 
2382   /**
2383    * Used by unit tests. Return the number of regions opened so far in the life
2384    * of the master. Increases by one every time the master opens a region
2385    * @return the counter value of the number of regions opened so far
2386    */
2387   public int getNumRegionsOpened() {
2388     return numRegionsOpened.get();
2389   }
2390 
2391   /**
2392    * Waits until the specified region has completed assignment.
2393    * <p>
2394    * If the region is already assigned, returns immediately.  Otherwise, method
2395    * blocks until the region is assigned.
2396    * @param regionInfo region to wait on assignment for
2397    * @throws InterruptedException
2398    */
2399   public boolean waitForAssignment(HRegionInfo regionInfo)
2400       throws InterruptedException {
2401     while (!regionStates.isRegionOnline(regionInfo)) {
2402       if (regionStates.isRegionInState(regionInfo, State.FAILED_OPEN)
2403           || this.server.isStopped()) {
2404         return false;
2405       }
2406 
2407       // We should receive a notification, but it's
2408       //  better to have a timeout to recheck the condition here:
2409       //  it lowers the impact of a race condition if any
2410       regionStates.waitForUpdate(100);
2411     }
2412     return true;
2413   }
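       // Typical caller pattern (a sketch only; error handling elided):
       //
       //   am.assign(hri, true);             // force OFFLINE in ZK, then send the OPEN RPC
       //   if (!am.waitForAssignment(hri)) {
       //     // region ended up FAILED_OPEN, or the master is stopping
       //   }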
2414 
2415   /**
2416    * Assigns the hbase:meta region.
2417    * <p>
2418    * Assumes that hbase:meta is currently closed and is not being actively served by
2419    * any RegionServer.
2420    * <p>
2421    * Forcibly unsets the current meta region location in ZooKeeper and assigns
2422    * hbase:meta to a random RegionServer.
2423    * @throws KeeperException
2424    */
2425   public void assignMeta() throws KeeperException {
2426     MetaRegionTracker.deleteMetaLocation(this.watcher);
2427     assign(HRegionInfo.FIRST_META_REGIONINFO, true);
2428   }
2429 
2430   /**
2431    * Assigns specified regions retaining assignments, if any.
2432    * <p>
2433    * This is a synchronous call and will return once every region has been
2434    * assigned.  If anything fails, an exception is thrown
2435    * @throws InterruptedException
2436    * @throws IOException
2437    */
2438   public void assign(Map<HRegionInfo, ServerName> regions)
2439         throws IOException, InterruptedException {
2440     if (regions == null || regions.isEmpty()) {
2441       return;
2442     }
2443     List<ServerName> servers = serverManager.createDestinationServersList();
2444     if (servers == null || servers.isEmpty()) {
2445       throw new IOException("Found no destination server to assign region(s)");
2446     }
2447 
2448     // Reuse existing assignment info
2449     Map<ServerName, List<HRegionInfo>> bulkPlan =
2450       balancer.retainAssignment(regions, servers);
2451 
2452     assign(regions.size(), servers.size(),
2453       "retainAssignment=true", bulkPlan);
2454   }
2455 
2456   /**
2457    * Assigns specified regions round robin, if any.
2458    * <p>
2459    * This is a synchronous call and will return once every region has been
2460    * assigned.  If anything fails, an exception is thrown
2461    * @throws InterruptedException
2462    * @throws IOException
2463    */
2464   public void assign(List<HRegionInfo> regions)
2465         throws IOException, InterruptedException {
2466     if (regions == null || regions.isEmpty()) {
2467       return;
2468     }
2469 
2470     List<ServerName> servers = serverManager.createDestinationServersList();
2471     if (servers == null || servers.isEmpty()) {
2472       throw new IOException("Found no destination server to assign region(s)");
2473     }
2474 
2475     // Generate a round-robin bulk assignment plan
2476     Map<ServerName, List<HRegionInfo>> bulkPlan
2477       = balancer.roundRobinAssignment(regions, servers);
2478     processFavoredNodes(regions);
2479 
2480     assign(regions.size(), servers.size(),
2481       "round-robin=true", bulkPlan);
2482   }
2483 
2484   private void assign(int regions, int totalServers,
2485       String message, Map<ServerName, List<HRegionInfo>> bulkPlan)
2486           throws InterruptedException, IOException {
2487 
2488     int servers = bulkPlan.size();
2489     if (servers == 1 || (regions < bulkAssignThresholdRegions
2490         && servers < bulkAssignThresholdServers)) {
2491 
2492       // Don't use bulk assignment.  This could be more efficient in a small
2493       // cluster, especially a mini cluster for testing, so that tests won't time out.
2494       if (LOG.isTraceEnabled()) {
2495         LOG.trace("Not using bulk assignment since we are assigning only " + regions +
2496           " region(s) to " + servers + " server(s)");
2497       }
2498       for (Map.Entry<ServerName, List<HRegionInfo>> plan: bulkPlan.entrySet()) {
2499         if (!assign(plan.getKey(), plan.getValue())) {
2500           for (HRegionInfo region: plan.getValue()) {
2501             if (!regionStates.isRegionOnline(region)) {
2502               invokeAssign(region);
2503             }
2504           }
2505         }
2506       }
2507     } else {
2508       LOG.info("Bulk assigning " + regions + " region(s) across "
2509         + totalServers + " server(s), " + message);
2510 
2511       // Use fixed count thread pool assigning.
2512       BulkAssigner ba = new GeneralBulkAssigner(
2513         this.server, bulkPlan, this, bulkAssignWaitTillAllAssigned);
2514       ba.bulkAssign();
2515       LOG.info("Bulk assigning done");
2516     }
2517   }
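       // For example, a plan that targets a single server, or only a handful of regions and
       // servers below the bulk-assign thresholds, is executed serially above; a large
       // startup plan spanning many servers goes through GeneralBulkAssigner's fixed-size
       // thread pool instead.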
2518 
2519   /**
2520    * Assigns all user regions, if any exist.  Used during cluster startup.
2521    * <p>
2522    * This is a synchronous call and will return once every region has been
2523    * assigned.  If anything fails, an exception is thrown and the cluster
2524    * should be shutdown.
2525    * @throws InterruptedException
2526    * @throws IOException
2527    * @throws KeeperException
2528    */
2529   private void assignAllUserRegions()
2530       throws IOException, InterruptedException, KeeperException {
2531     // Cleanup any existing ZK nodes and start watching
2532     ZKAssign.deleteAllNodes(watcher);
2533     ZKUtil.listChildrenAndWatchForNewChildren(this.watcher,
2534       this.watcher.assignmentZNode);
2535     failoverCleanupDone();
2536 
2537     // Skip assignment for regions of tables in DISABLING state because during a clean cluster startup
2538     // no RS is alive and the regions map doesn't have any information about those regions either.
2539     // See HBASE-6281.
2540     Set<TableName> disabledOrDisablingOrEnabling = ZKTable.getDisabledOrDisablingTables(watcher);
2541     disabledOrDisablingOrEnabling.addAll(ZKTable.getEnablingTables(watcher));
2542     // Scan hbase:meta for all user regions, skipping any disabled tables
2543     Map<HRegionInfo, ServerName> allRegions;
2544     SnapshotOfRegionAssignmentFromMeta snapshotOfRegionAssignment =
2545        new SnapshotOfRegionAssignmentFromMeta(catalogTracker, disabledOrDisablingOrEnabling, true);
2546     snapshotOfRegionAssignment.initialize();
2547     allRegions = snapshotOfRegionAssignment.getRegionToRegionServerMap();
2548     if (allRegions == null || allRegions.isEmpty()) return;
2549 
2550     // Determine what type of assignment to do on startup
2551     boolean retainAssignment = server.getConfiguration().
2552       getBoolean("hbase.master.startup.retainassign", true);
2553 
2554     if (retainAssignment) {
2555       assign(allRegions);
2556     } else {
2557       List<HRegionInfo> regions = new ArrayList<HRegionInfo>(allRegions.keySet());
2558       assign(regions);
2559     }
2560 
2561     for (HRegionInfo hri : allRegions.keySet()) {
2562       TableName tableName = hri.getTable();
2563       if (!zkTable.isEnabledTable(tableName)) {
2564         setEnabledTable(tableName);
2565       }
2566     }
2567   }
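  // Configuration sketch (assumed operator-side snippet, not part of this class):
  // whether startup reuses the assignments found in hbase:meta is controlled by
  // the "hbase.master.startup.retainassign" key read above, e.g.:
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   conf.setBoolean("hbase.master.startup.retainassign", false); // force round-robin on startup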
2568 
2569   /**
2570    * Wait until there are no regions in transition.
2571    * @param timeout How long to wait, in milliseconds.
2572    * @return True if no regions are in transition when this returns.
2573    * @throws InterruptedException
2574    */
2575   boolean waitUntilNoRegionsInTransition(final long timeout)
2576       throws InterruptedException {
2577     // Blocks until there are no regions in transition. It is possible that
2578     // there are regions in transition immediately after this returns, but it
2579     // guarantees that if it returns without an exception there was a period of time
2580     // with no regions in transition from the point-of-view of the in-memory
2581     // state of the Master.
2583     final long endTime = System.currentTimeMillis() + timeout;
2584 
2585     while (!this.server.isStopped() && regionStates.isRegionsInTransition()
2586         && endTime > System.currentTimeMillis()) {
2587       regionStates.waitForUpdate(100);
2588     }
2589 
2590     return !regionStates.isRegionsInTransition();
2591   }
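  // Usage sketch (hypothetical caller, for illustration only): block for up to
  // a minute and then check whether the cluster has settled:
  //
  //   if (!assignmentManager.waitUntilNoRegionsInTransition(60 * 1000)) {
  //     LOG.warn("Regions still in transition after 60s");
  //   }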
2592 
2593   /**
2594    * Rebuild the list of user regions and assignment information.
2595    * <p>
2596    * Returns a map of servers that are not found to be online and the regions
2597    * they were hosting.
2598    * @return map of servers not online to their assigned regions, as stored
2599    *         in META
2600    * @throws IOException
2601    */
2602   Map<ServerName, List<HRegionInfo>> rebuildUserRegions() throws IOException, KeeperException {
2603     Set<TableName> enablingTables = ZKTable.getEnablingTables(watcher);
2604     Set<TableName> disabledOrEnablingTables = ZKTable.getDisabledTables(watcher);
2605     disabledOrEnablingTables.addAll(enablingTables);
2606     Set<TableName> disabledOrDisablingOrEnabling = ZKTable.getDisablingTables(watcher);
2607     disabledOrDisablingOrEnabling.addAll(disabledOrEnablingTables);
2608 
2609     // Region assignment from META
2610     List<Result> results = MetaReader.fullScan(this.catalogTracker);
2611     // Get any new but slow-to-check-in region servers that joined the cluster
2612     Set<ServerName> onlineServers = serverManager.getOnlineServers().keySet();
2613     // Map of offline servers and their regions to be returned
2614     Map<ServerName, List<HRegionInfo>> offlineServers =
2615       new TreeMap<ServerName, List<HRegionInfo>>();
2616     // Iterate regions in META
2617     for (Result result : results) {
2618       Pair<HRegionInfo, ServerName> region = HRegionInfo.getHRegionInfoAndServerName(result);
2619       if (region == null) continue;
2620       HRegionInfo regionInfo = region.getFirst();
2621       ServerName regionLocation = region.getSecond();
2622       if (regionInfo == null) continue;
2623       regionStates.createRegionState(regionInfo);
2624       if (regionStates.isRegionInState(regionInfo, State.SPLIT)) {
2625         // Split is considered to be completed. If the split znode still
2626         // exists, the region will be put back to SPLITTING state later
2627         LOG.debug("Region " + regionInfo.getRegionNameAsString()
2628            + " split is completed. Hence need not add to regions list");
2629         continue;
2630       }
2631       TableName tableName = regionInfo.getTable();
2632       if (regionLocation == null) {
2633         // regionLocation could be null if createTable didn't finish properly,
2634         // e.g. the HMaster restarted while createTable was still in progress:
2635         // some regions have been added to hbase:meta, but have not been assigned.
2636         // When this happens, the region's table must be in ENABLING state.
2637         // It can't be in ENABLED state, as that is set only once all regions are
2638         // assigned.
2639         // It can't be in DISABLING state, because the DISABLING state transitions
2640         // from ENABLED state when the application calls disableTable.
2641         // It can't be in DISABLED state, because DISABLED transitions
2642         // from DISABLING state.
2643         if (!enablingTables.contains(tableName)) {
2644           LOG.warn("Region " + regionInfo.getEncodedName() +
2645             " has null regionLocation." + " But its table " + tableName +
2646             " isn't in ENABLING state.");
2647         }
2648       } else if (!onlineServers.contains(regionLocation)) {
2649         // Region is located on a server that isn't online
2650         List<HRegionInfo> offlineRegions = offlineServers.get(regionLocation);
2651         if (offlineRegions == null) {
2652           offlineRegions = new ArrayList<HRegionInfo>(1);
2653           offlineServers.put(regionLocation, offlineRegions);
2654         }
2655         offlineRegions.add(regionInfo);
2656         // need to enable the table if not disabled or disabling or enabling
2657         // this will be used in rolling restarts
2658         if (!disabledOrDisablingOrEnabling.contains(tableName)
2659             && !getZKTable().isEnabledTable(tableName)) {
2660           setEnabledTable(tableName);
2661         }
2662       } else {
2663         // Region is being served and on an active server
2664         // add only if region not in disabled or enabling table
2665         if (!disabledOrEnablingTables.contains(tableName)) {
2666           regionStates.updateRegionState(regionInfo, State.OPEN, regionLocation);
2667           regionStates.regionOnline(regionInfo, regionLocation);
2668         }
2669         // need to enable the table if not disabled or disabling or enabling
2670         // this will be used in rolling restarts
2671         if (!disabledOrDisablingOrEnabling.contains(tableName)
2672             && !getZKTable().isEnabledTable(tableName)) {
2673           setEnabledTable(tableName);
2674         }
2675       }
2676     }
2677     return offlineServers;
2678   }
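  // Failover sketch (assumed ordering, for illustration only): the map returned
  // here is what the master feeds into processDeadServersAndRecoverLostRegions()
  // below, so that SSH can re-assign regions of servers that died while no
  // master was active:
  //
  //   Map<ServerName, List<HRegionInfo>> deadServers = rebuildUserRegions();
  //   processDeadServersAndRecoverLostRegions(deadServers);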
2679 
2680   /**
2681    * Recover the tables that were not fully moved to DISABLED state. These
2682    * tables were in DISABLING state when the master restarted/switched over.
2683    *
2684    * @throws KeeperException
2685    * @throws TableNotFoundException
2686    * @throws IOException
2687    */
2688   private void recoverTableInDisablingState()
2689       throws KeeperException, TableNotFoundException, IOException {
2690     Set<TableName> disablingTables = ZKTable.getDisablingTables(watcher);
2691     if (disablingTables.size() != 0) {
2692       for (TableName tableName : disablingTables) {
2693         // Recover by calling DisableTableHandler
2694         LOG.info("The table " + tableName
2695             + " is in DISABLING state.  Hence recovering by moving the table"
2696             + " to DISABLED state.");
2697         new DisableTableHandler(this.server, tableName, catalogTracker,
2698             this, tableLockManager, true).prepare().process();
2699       }
2700     }
2701   }
2702 
2703   /**
2704    * Recover the tables that were not fully moved to ENABLED state. These tables
2705    * were in ENABLING state when the master restarted/switched over.
2706    *
2707    * @throws KeeperException
2708    * @throws org.apache.hadoop.hbase.TableNotFoundException
2709    * @throws IOException
2710    */
2711   private void recoverTableInEnablingState()
2712       throws KeeperException, TableNotFoundException, IOException {
2713     Set<TableName> enablingTables = ZKTable.getEnablingTables(watcher);
2714     if (enablingTables.size() != 0) {
2715       for (TableName tableName : enablingTables) {
2716         // Recover by calling EnableTableHandler
2717         LOG.info("The table " + tableName
2718             + " is in ENABLING state.  Hence recovering by moving the table"
2719             + " to ENABLED state.");
2720         // enableTable in sync way during master startup,
2721         // no need to invoke coprocessor
2722         EnableTableHandler eth = new EnableTableHandler(this.server, tableName,
2723           catalogTracker, this, tableLockManager, true);
2724         try {
2725           eth.prepare();
2726         } catch (TableNotFoundException e) {
2727           LOG.warn("Table " + tableName + " not found in hbase:meta to recover.");
2728           continue;
2729         }
2730         eth.process();
2731       }
2732     }
2733   }
2734 
2735   /**
2736    * Processes list of dead servers from result of hbase:meta scan and regions in RIT
2737    * <p>
2738    * This is used for failover to recover the lost regions that belonged to
2739    * RegionServers which failed while there was no active master or regions
2740    * that were in RIT.
2741    * <p>
2742    *
2744    * @param deadServers
2745    *          The list of dead servers which failed while there was no active
2746    *          master. Can be null.
2747    * @throws IOException
2748    * @throws KeeperException
2749    */
2750   private void processDeadServersAndRecoverLostRegions(
2751       Map<ServerName, List<HRegionInfo>> deadServers)
2752           throws IOException, KeeperException {
2753     if (deadServers != null) {
2754       for (Map.Entry<ServerName, List<HRegionInfo>> server: deadServers.entrySet()) {
2755         ServerName serverName = server.getKey();
2756         // We need to keep such info even if the server is known dead
2757         regionStates.setLastRegionServerOfRegions(serverName, server.getValue());
2758         if (!serverManager.isServerDead(serverName)) {
2759           serverManager.expireServer(serverName); // Let SSH do region re-assign
2760         }
2761       }
2762     }
2763     List<String> nodes = ZKUtil.listChildrenAndWatchForNewChildren(
2764       this.watcher, this.watcher.assignmentZNode);
2765     if (!nodes.isEmpty()) {
2766       for (String encodedRegionName : nodes) {
2767         processRegionInTransition(encodedRegionName, null);
2768       }
2769     }
2770 
2771     // Now we can safely claim failover cleanup completed and enable
2772     // ServerShutdownHandler for further processing. The nodes (below)
2773     // in transition, if any, are for regions not related to those
2774     // dead servers at all, and can be done in parallel to SSH.
2775     failoverCleanupDone();
2776   }
2777 
2778   /**
2779    * Set Regions in transitions metrics.
2780    * This takes an iterator on the RegionInTransition map (CLSM), and is not synchronized.
2781    * This iterator is not fail-fast, which may lead to stale reads; but that's better than
2782    * creating a copy of the map for metrics computation, as this method is invoked
2783    * at a frequent interval.
2784    */
2785   public void updateRegionsInTransitionMetrics() {
2786     long currentTime = System.currentTimeMillis();
2787     int totalRITs = 0;
2788     int totalRITsOverThreshold = 0;
2789     long oldestRITTime = 0;
2790     int ritThreshold = this.server.getConfiguration().
2791       getInt(HConstants.METRICS_RIT_STUCK_WARNING_THRESHOLD, 60000);
2792     for (RegionState state: regionStates.getRegionsInTransition().values()) {
2793       totalRITs++;
2794       long ritTime = currentTime - state.getStamp();
2795       if (ritTime > ritThreshold) { // more than the threshold
2796         totalRITsOverThreshold++;
2797       }
2798       if (oldestRITTime < ritTime) {
2799         oldestRITTime = ritTime;
2800       }
2801     }
2802     if (this.metricsAssignmentManager != null) {
2803       this.metricsAssignmentManager.updateRITOldestAge(oldestRITTime);
2804       this.metricsAssignmentManager.updateRITCount(totalRITs);
2805       this.metricsAssignmentManager.updateRITCountOverThreshold(totalRITsOverThreshold);
2806     }
2807   }
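  // Configuration sketch (assumed operator-side snippet, not part of this class):
  // the "stuck" threshold used above defaults to 60000 ms and is read via the
  // HConstants.METRICS_RIT_STUCK_WARNING_THRESHOLD key, e.g.:
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   conf.setInt(HConstants.METRICS_RIT_STUCK_WARNING_THRESHOLD, 2 * 60 * 1000);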
2808 
2809   /**
2810    * @param region Region whose plan we are to clear.
2811    */
2812   void clearRegionPlan(final HRegionInfo region) {
2813     synchronized (this.regionPlans) {
2814       this.regionPlans.remove(region.getEncodedName());
2815     }
2816   }
2817 
2818   /**
2819    * Wait on region to clear regions-in-transition.
2820    * @param hri Region to wait on.
2821    * @throws IOException
2822    */
2823   public void waitOnRegionToClearRegionsInTransition(final HRegionInfo hri)
2824       throws IOException, InterruptedException {
2825     waitOnRegionToClearRegionsInTransition(hri, -1L);
2826   }
2827 
2828   /**
2829    * Wait on region to clear regions-in-transition or time out.
2830    * @param hri Region to wait on.
2831    * @param timeOut Milliseconds to wait for the region to be out of the transition state.
2832    * @return True when the region clears regions-in-transition before the timeout, otherwise false.
2833    * @throws InterruptedException
2834    */
2835   public boolean waitOnRegionToClearRegionsInTransition(final HRegionInfo hri, long timeOut)
2836       throws InterruptedException {
2837     if (!regionStates.isRegionInTransition(hri)) return true;
2838     long end = (timeOut <= 0) ? Long.MAX_VALUE : EnvironmentEdgeManager.currentTimeMillis()
2839         + timeOut;
2840     // There is already a timeout monitor on regions in transition so I
2841     // should not have to have one here too?
2842     LOG.info("Waiting for " + hri.getEncodedName() +
2843         " to leave regions-in-transition, timeOut=" + timeOut + " ms.");
2844     while (!this.server.isStopped() && regionStates.isRegionInTransition(hri)) {
2845       regionStates.waitForUpdate(100);
2846       if (EnvironmentEdgeManager.currentTimeMillis() > end) {
2847         LOG.info("Timed out on waiting for " + hri.getEncodedName() + " to be assigned.");
2848         return false;
2849       }
2850     }
2851     if (this.server.isStopped()) {
2852       LOG.info("Giving up wait on regions in transition because stoppable.isStopped is set");
2853       return false;
2854     }
2855     return true;
2856   }
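  // Usage sketch (hypothetical caller, for illustration only): wait up to 30s
  // for a single region to settle before acting on it:
  //
  //   if (!assignmentManager.waitOnRegionToClearRegionsInTransition(hri, 30 * 1000)) {
  //     throw new IOException("Region " + hri.getEncodedName() + " still in transition");
  //   }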
2857 
2858   /**
2859    * Update timers for all regions in transition against any server listed in
2860    * serversInUpdatingTimer.
2861    */
2862   public class TimerUpdater extends Chore {
2863 
2864     public TimerUpdater(final int period, final Stoppable stopper) {
2865       super("AssignmentTimerUpdater", period, stopper);
2866     }
2867 
2868     @Override
2869     protected void chore() {
2870       Preconditions.checkState(tomActivated);
2871       ServerName serverToUpdateTimer = null;
2872       while (!serversInUpdatingTimer.isEmpty() && !stopper.isStopped()) {
2873         if (serverToUpdateTimer == null) {
2874           serverToUpdateTimer = serversInUpdatingTimer.first();
2875         } else {
2876           serverToUpdateTimer = serversInUpdatingTimer
2877               .higher(serverToUpdateTimer);
2878         }
2879         if (serverToUpdateTimer == null) {
2880           break;
2881         }
2882         updateTimers(serverToUpdateTimer);
2883         serversInUpdatingTimer.remove(serverToUpdateTimer);
2884       }
2885     }
2886   }
2887 
2888   /**
2889    * Monitor to check for time outs on region transition operations
2890    */
2891   public class TimeoutMonitor extends Chore {
2892     private boolean allRegionServersOffline = false;
2893     private ServerManager serverManager;
2894     private final int timeout;
2895 
2896     /**
2897      * Creates a periodic monitor to check for time outs on region transition
2898      * operations.  This will deal with retries if for some reason something
2899      * doesn't happen within the specified timeout.
2900      * @param period
2901      * @param stopper When {@link Stoppable#isStopped()} is true, this thread will
2902      *   clean up and exit cleanly.
2903      * @param timeout
2904      */
2905     public TimeoutMonitor(final int period, final Stoppable stopper,
2906         ServerManager serverManager,
2907         final int timeout) {
2908       super("AssignmentTimeoutMonitor", period, stopper);
2909       this.timeout = timeout;
2910       this.serverManager = serverManager;
2911     }
2912 
2913     private synchronized void setAllRegionServersOffline(
2914       boolean allRegionServersOffline) {
2915       this.allRegionServersOffline = allRegionServersOffline;
2916     }
2917 
2918     @Override
2919     protected void chore() {
2920       Preconditions.checkState(tomActivated);
2921       boolean noRSAvailable = this.serverManager.createDestinationServersList().isEmpty();
2922 
2923       // Iterate all regions in transition checking for time outs
2924       long now = System.currentTimeMillis();
2925       // No lock needed, concurrent access is ok: we will be working on a copy, and it is
2926       // valid in Java to take a copy while another thread is adding/removing items.
2927       for (String regionName : regionStates.getRegionsInTransition().keySet()) {
2928         RegionState regionState = regionStates.getRegionTransitionState(regionName);
2929         if (regionState == null) continue;
2930 
2931         if (regionState.getStamp() + timeout <= now) {
2932           // decide on action upon timeout
2933           actOnTimeOut(regionState);
2934         } else if (this.allRegionServersOffline && !noRSAvailable) {
2935           RegionPlan existingPlan = regionPlans.get(regionName);
2936           if (existingPlan == null
2937               || !this.serverManager.isServerOnline(existingPlan
2938                   .getDestination())) {
2939             // if some RSs just came back online, we can start the assignment
2940             // right away
2941             actOnTimeOut(regionState);
2942           }
2943         }
2944       }
2945       setAllRegionServersOffline(noRSAvailable);
2946     }
2947 
2948     private void actOnTimeOut(RegionState regionState) {
2949       HRegionInfo regionInfo = regionState.getRegion();
2950       LOG.info("Regions in transition timed out:  " + regionState);
2951       // Expired! Do a retry.
2952       switch (regionState.getState()) {
2953       case CLOSED:
2954         LOG.info("Region " + regionInfo.getEncodedName()
2955             + " has been CLOSED for too long, waiting on queued "
2956             + "ClosedRegionHandler to run or server shutdown");
2957         // Update our timestamp.
2958         regionState.updateTimestampToNow();
2959         break;
2960       case OFFLINE:
2961         LOG.info("Region has been OFFLINE for too long, " + "reassigning "
2962             + regionInfo.getRegionNameAsString() + " to a random server");
2963         invokeAssign(regionInfo);
2964         break;
2965       case PENDING_OPEN:
2966         LOG.info("Region has been PENDING_OPEN for too "
2967             + "long, reassigning region=" + regionInfo.getRegionNameAsString());
2968         invokeAssign(regionInfo);
2969         break;
2970       case OPENING:
2971         processOpeningState(regionInfo);
2972         break;
2973       case OPEN:
2974         LOG.error("Region has been OPEN for too long, " +
2975             "we don't know where region was opened so can't do anything");
2976         regionState.updateTimestampToNow();
2977         break;
2978 
2979       case PENDING_CLOSE:
2980         LOG.info("Region has been PENDING_CLOSE for too "
2981             + "long, running forced unassign again on region="
2982             + regionInfo.getRegionNameAsString());
2983         invokeUnassign(regionInfo);
2984         break;
2985       case CLOSING:
2986         LOG.info("Region has been CLOSING for too " +
2987           "long, this should eventually complete or the server will " +
2988           "expire, send RPC again");
2989         invokeUnassign(regionInfo);
2990         break;
2991 
2992       case SPLIT:
2993       case SPLITTING:
2994       case FAILED_OPEN:
2995       case FAILED_CLOSE:
2996       case MERGING:
2997         break;
2998 
2999       default:
3000         throw new IllegalStateException("Received event is not valid.");
3001       }
3002     }
3003   }
3004 
3005   private void processOpeningState(HRegionInfo regionInfo) {
3006     LOG.info("Region has been OPENING for too long, reassigning region="
3007         + regionInfo.getRegionNameAsString());
3008     // Should have a ZK node in OPENING state
3009     try {
3010       String node = ZKAssign.getNodeName(watcher, regionInfo.getEncodedName());
3011       Stat stat = new Stat();
3012       byte [] data = ZKAssign.getDataNoWatch(watcher, node, stat);
3013       if (data == null) {
3014         LOG.warn("Data is null, node " + node + " no longer exists");
3015         return;
3016       }
3017       RegionTransition rt = RegionTransition.parseFrom(data);
3018       EventType et = rt.getEventType();
3019       if (et == EventType.RS_ZK_REGION_OPENED) {
3020         LOG.debug("Region has transitioned to OPENED, allowing "
3021             + "watched event handlers to process");
3022         return;
3023       } else if (et != EventType.RS_ZK_REGION_OPENING && et != EventType.RS_ZK_REGION_FAILED_OPEN ) {
3024         LOG.warn("While timing out a region, found ZK node in unexpected state: " + et);
3025         return;
3026       }
3027       invokeAssign(regionInfo);
3028     } catch (KeeperException ke) {
3029       LOG.error("Unexpected ZK exception timing out OPENING region", ke);
3030     } catch (DeserializationException e) {
3031       LOG.error("Unexpected exception parsing OPENING region", e);
3032     }
3033   }
3034 
3035   void invokeAssign(HRegionInfo regionInfo) {
3036     threadPoolExecutorService.submit(new AssignCallable(this, regionInfo));
3037   }
3038 
3039   private void invokeUnassign(HRegionInfo regionInfo) {
3040     threadPoolExecutorService.submit(new UnAssignCallable(this, regionInfo));
3041   }
3042 
3043   public boolean isCarryingMeta(ServerName serverName) {
3044     return isCarryingRegion(serverName, HRegionInfo.FIRST_META_REGIONINFO);
3045   }
3046 
3047   /**
3048    * Check if the shutdown server carries the specified region.
3049    * We have a bunch of places that store the region location, and
3050    * those values aren't consistent because notification is delayed.
3051    * The location from the zookeeper unassigned node has the most recent data,
3052    * but the node could be deleted after the region is opened by the AM.
3053    * The AM's info could be stale if OpenedRegionHandler
3054    * processing hasn't finished yet when the server shutdown occurs.
3055    * @return whether the serverName currently hosts the region
3056    */
3057   private boolean isCarryingRegion(ServerName serverName, HRegionInfo hri) {
3058     RegionTransition rt = null;
3059     try {
3060       byte [] data = ZKAssign.getData(watcher, hri.getEncodedName());
3061       // This call can legitimately return null
3062       rt = data == null? null: RegionTransition.parseFrom(data);
3063     } catch (KeeperException e) {
3064       server.abort("Exception reading unassigned node for region=" + hri.getEncodedName(), e);
3065     } catch (DeserializationException e) {
3066       server.abort("Exception parsing unassigned node for region=" + hri.getEncodedName(), e);
3067     }
3068 
3069     ServerName addressFromZK = rt != null? rt.getServerName():  null;
3070     if (addressFromZK != null) {
3071       // if we get something from ZK, we will use the data
3072       boolean matchZK = addressFromZK.equals(serverName);
3073       LOG.debug("Checking region=" + hri.getRegionNameAsString() + ", zk server=" + addressFromZK +
3074         " current=" + serverName + ", matches=" + matchZK);
3075       return matchZK;
3076     }
3077 
3078     ServerName addressFromAM = regionStates.getRegionServerOfRegion(hri);
3079     boolean matchAM = (addressFromAM != null &&
3080       addressFromAM.equals(serverName));
3081     LOG.debug("based on AM, current region=" + hri.getRegionNameAsString() +
3082       " is on server=" + (addressFromAM != null ? addressFromAM : "null") +
3083       " server being checked: " + serverName);
3084 
3085     return matchAM;
3086   }
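  // Usage sketch (hypothetical caller, for illustration only): server-shutdown
  // handling can use isCarryingMeta() to decide whether hbase:meta must be
  // recovered before the dead server's user regions:
  //
  //   if (assignmentManager.isCarryingMeta(deadServer)) {
  //     // recover hbase:meta first, then handle the user regions
  //   }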
3087 
3088   /**
3089    * Process shutdown server removing any assignments.
3090    * @param sn Server that went down.
3091    * @return list of regions in transition on this server
3092    */
3093   public List<HRegionInfo> processServerShutdown(final ServerName sn) {
3094     // Clean out any existing assignment plans for this server
3095     synchronized (this.regionPlans) {
3096       for (Iterator <Map.Entry<String, RegionPlan>> i =
3097           this.regionPlans.entrySet().iterator(); i.hasNext();) {
3098         Map.Entry<String, RegionPlan> e = i.next();
3099         ServerName otherSn = e.getValue().getDestination();
3100         // The destination will be null if the region is planned for a random assign.
3101         if (otherSn != null && otherSn.equals(sn)) {
3102           // Use iterator's remove else we'll get CME
3103           i.remove();
3104         }
3105       }
3106     }
3107     List<HRegionInfo> regions = regionStates.serverOffline(watcher, sn);
3108     for (Iterator<HRegionInfo> it = regions.iterator(); it.hasNext(); ) {
3109       HRegionInfo hri = it.next();
3110       String encodedName = hri.getEncodedName();
3111 
3112       // We need a lock on the region as we could update it
3113       Lock lock = locker.acquireLock(encodedName);
3114       try {
3115         RegionState regionState =
3116           regionStates.getRegionTransitionState(encodedName);
3117         if (regionState == null
3118             || (regionState.getServerName() != null && !regionState.isOnServer(sn))
3119             || !(regionState.isFailedClose() || regionState.isOffline()
3120               || regionState.isPendingOpenOrOpening())) {
3121           LOG.info("Skip " + regionState + " since it is not opening/failed_close"
3122             + " on the dead server any more: " + sn);
3123           it.remove();
3124         } else {
3125           try {
3126             // Delete the ZNode if exists
3127             ZKAssign.deleteNodeFailSilent(watcher, hri);
3128           } catch (KeeperException ke) {
3129             server.abort("Unexpected ZK exception deleting node " + hri, ke);
3130           }
3131           if (zkTable.isDisablingOrDisabledTable(hri.getTable())) {
3132             regionStates.regionOffline(hri);
3133             it.remove();
3134             continue;
3135           }
3136           // Mark the region offline and assign it again by SSH
3137           regionStates.updateRegionState(hri, State.OFFLINE);
3138         }
3139       } finally {
3140         lock.unlock();
3141       }
3142     }
3143     return regions;
3144   }
3145 
3146   /**
3147    * @param plan Plan to execute.
3148    */
3149   public void balance(final RegionPlan plan) {
3150     HRegionInfo hri = plan.getRegionInfo();
3151     TableName tableName = hri.getTable();
3152     if (zkTable.isDisablingOrDisabledTable(tableName)) {
3153       LOG.info("Ignored moving region of disabling/disabled table "
3154         + tableName);
3155       return;
3156     }
3157 
3158     // Move the region only if it's assigned
3159     String encodedName = hri.getEncodedName();
3160     ReentrantLock lock = locker.acquireLock(encodedName);
3161     try {
3162       if (!regionStates.isRegionOnline(hri)) {
3163         RegionState state = regionStates.getRegionState(encodedName);
3164         LOG.info("Ignored moving region not assigned: " + hri + ", "
3165           + (state == null ? "not in region states" : state));
3166         return;
3167       }
3168       synchronized (this.regionPlans) {
3169         this.regionPlans.put(plan.getRegionName(), plan);
3170       }
3171       unassign(hri, false, plan.getDestination());
3172     } finally {
3173       lock.unlock();
3174     }
3175   }
3176 
3177   public void stop() {
3178     shutdown(); // Stop executor service, etc
3179     if (tomActivated){
3180       this.timeoutMonitor.interrupt();
3181       this.timerUpdater.interrupt();
3182     }
3183   }
3184 
3185   /**
3186    * Shutdown the threadpool executor service
3187    */
3188   public void shutdown() {
3189     // It's an immediate shutdown, so we're clearing the remaining tasks.
3190     synchronized (zkEventWorkerWaitingList){
3191       zkEventWorkerWaitingList.clear();
3192     }
3193     threadPoolExecutorService.shutdownNow();
3194     zkEventWorkers.shutdownNow();
3195   }
3196 
3197   protected void setEnabledTable(TableName tableName) {
3198     try {
3199       this.zkTable.setEnabledTable(tableName);
3200     } catch (KeeperException e) {
3201       // here we can abort as it is the start up flow
3202       String errorMsg = "Unable to ensure that the table " + tableName
3203           + " will be" + " enabled because of a ZooKeeper issue";
3204       LOG.error(errorMsg);
3205       this.server.abort(errorMsg, e);
3206     }
3207   }
3208 
3209   /**
3210    * Set region as OFFLINED up in zookeeper asynchronously.
3211    * @param state
3212    * @return True if we succeeded, false otherwise (State was incorrect or failed
3213    * updating zk).
3214    */
3215   private boolean asyncSetOfflineInZooKeeper(final RegionState state,
3216       final AsyncCallback.StringCallback cb, final ServerName destination) {
3217     if (!state.isClosed() && !state.isOffline()) {
3218       this.server.abort("Unexpected state trying to OFFLINE; " + state,
3219         new IllegalStateException());
3220       return false;
3221     }
3222     regionStates.updateRegionState(state.getRegion(), State.OFFLINE);
3223     try {
3224       ZKAssign.asyncCreateNodeOffline(watcher, state.getRegion(),
3225         destination, cb, state);
3226     } catch (KeeperException e) {
3227       if (e instanceof NodeExistsException) {
3228         LOG.warn("Node for " + state.getRegion() + " already exists");
3229       } else {
3230         server.abort("Unexpected ZK exception creating/setting node OFFLINE", e);
3231       }
3232       return false;
3233     }
3234     return true;
3235   }
3236 
3237   private boolean deleteNodeInStates(String encodedName,
3238       String desc, ServerName sn, EventType... types) {
3239     try {
3240       for (EventType et: types) {
3241         if (ZKAssign.deleteNode(watcher, encodedName, et, sn)) {
3242           return true;
3243         }
3244       }
3245       LOG.info("Failed to delete the " + desc + " node for "
3246         + encodedName + ". The node type may not match");
3247     } catch (NoNodeException e) {
3248       if (LOG.isDebugEnabled()) {
3249         LOG.debug("The " + desc + " node for " + encodedName + " already deleted");
3250       }
3251     } catch (KeeperException ke) {
3252       server.abort("Unexpected ZK exception deleting " + desc
3253         + " node for the region " + encodedName, ke);
3254     }
3255     return false;
3256   }
3257 
3258   private void deleteMergingNode(String encodedName, ServerName sn) {
3259     deleteNodeInStates(encodedName, "merging", sn, EventType.RS_ZK_REGION_MERGING,
3260       EventType.RS_ZK_REQUEST_REGION_MERGE, EventType.RS_ZK_REGION_MERGED);
3261   }
3262 
3263   private void deleteSplittingNode(String encodedName, ServerName sn) {
3264     deleteNodeInStates(encodedName, "splitting", sn, EventType.RS_ZK_REGION_SPLITTING,
3265       EventType.RS_ZK_REQUEST_REGION_SPLIT, EventType.RS_ZK_REGION_SPLIT);
3266   }
3267 
3268   /**
3269    * A helper to handle region merging transition event.
3270    * It transitions merging regions to MERGING state.
3271    */
3272   private boolean handleRegionMerging(final RegionTransition rt, final String encodedName,
3273       final String prettyPrintedRegionName, final ServerName sn) {
3274     if (!serverManager.isServerOnline(sn)) {
3275       LOG.warn("Dropped merging! ServerName=" + sn + " unknown.");
3276       return false;
3277     }
3278     byte [] payloadOfMerging = rt.getPayload();
3279     List<HRegionInfo> mergingRegions;
3280     try {
3281       mergingRegions = HRegionInfo.parseDelimitedFrom(
3282         payloadOfMerging, 0, payloadOfMerging.length);
3283     } catch (IOException e) {
3284       LOG.error("Dropped merging! Failed reading "  + rt.getEventType()
3285         + " payload for " + prettyPrintedRegionName);
3286       return false;
3287     }
3288     assert mergingRegions.size() == 3;
3289     HRegionInfo p = mergingRegions.get(0);
3290     HRegionInfo hri_a = mergingRegions.get(1);
3291     HRegionInfo hri_b = mergingRegions.get(2);
3292 
3293     RegionState rs_p = regionStates.getRegionState(p);
3294     RegionState rs_a = regionStates.getRegionState(hri_a);
3295     RegionState rs_b = regionStates.getRegionState(hri_b);
3296 
3297     if (!((rs_a == null || rs_a.isOpenOrMergingOnServer(sn))
3298         && (rs_b == null || rs_b.isOpenOrMergingOnServer(sn))
3299         && (rs_p == null || rs_p.isOpenOrMergingNewOnServer(sn)))) {
3300       LOG.warn("Dropped merging! Not in state good for MERGING; rs_p="
3301         + rs_p + ", rs_a=" + rs_a + ", rs_b=" + rs_b);
3302       return false;
3303     }
3304 
3305     EventType et = rt.getEventType();
3306     if (et == EventType.RS_ZK_REQUEST_REGION_MERGE) {
3307       try {
3308         if (RegionMergeTransaction.transitionMergingNode(watcher, p,
3309             hri_a, hri_b, sn, -1, EventType.RS_ZK_REQUEST_REGION_MERGE,
3310             EventType.RS_ZK_REGION_MERGING) == -1) {
3311           byte[] data = ZKAssign.getData(watcher, encodedName);
3312           EventType currentType = null;
3313           if (data != null) {
3314             RegionTransition newRt = RegionTransition.parseFrom(data);
3315             currentType = newRt.getEventType();
3316           }
3317           if (currentType == null || (currentType != EventType.RS_ZK_REGION_MERGED
3318               && currentType != EventType.RS_ZK_REGION_MERGING)) {
3319             LOG.warn("Failed to transition pending_merge node "
3320               + encodedName + " to merging, it's now " + currentType);
3321             return false;
3322           }
3323         }
3324       } catch (Exception e) {
3325         LOG.warn("Failed to transition pending_merge node "
3326           + encodedName + " to merging", e);
3327         return false;
3328       }
3329     }
3330 
3331     synchronized (regionStates) {
3332       regionStates.updateRegionState(hri_a, State.MERGING);
3333       regionStates.updateRegionState(hri_b, State.MERGING);
3334       regionStates.updateRegionState(p, State.MERGING_NEW, sn);
3335 
3336       if (et != EventType.RS_ZK_REGION_MERGED) {
3337         regionStates.regionOffline(p, State.MERGING_NEW);
3338         this.mergingRegions.put(encodedName,
3339           new PairOfSameType<HRegionInfo>(hri_a, hri_b));
3340       } else {
3341         this.mergingRegions.remove(encodedName);
3342         regionOffline(hri_a, State.MERGED);
3343         regionOffline(hri_b, State.MERGED);
3344         regionOnline(p, sn);
3345       }
3346     }
3347 
3348     if (et == EventType.RS_ZK_REGION_MERGED) {
3349       LOG.debug("Handling MERGED event for " + encodedName + "; deleting node");
3350       // Remove region from ZK
3351       try {
3352         boolean successful = false;
3353         while (!successful) {
3354           // It's possible that the RS updates the znode in between our read of the
3355           // znode and the delete, so it's safe to retry.
3356           successful = ZKAssign.deleteNode(watcher, encodedName,
3357             EventType.RS_ZK_REGION_MERGED, sn);
3358         }
3359       } catch (KeeperException e) {
3360         if (e instanceof NoNodeException) {
3361           String znodePath = ZKUtil.joinZNode(watcher.assignmentZNode, encodedName);
3362           LOG.debug("The znode " + znodePath + " does not exist.  May be deleted already.");
3363         } else {
3364           server.abort("Error deleting MERGED node " + encodedName, e);
3365         }
3366       }
3367       LOG.info("Handled MERGED event; merged=" + p.getRegionNameAsString()
3368         + ", region_a=" + hri_a.getRegionNameAsString() + ", region_b="
3369         + hri_b.getRegionNameAsString() + ", on " + sn);
3370 
3371       // User could disable the table before master knows the new region.
3372       if (zkTable.isDisablingOrDisabledTable(p.getTable())) {
3373         unassign(p);
3374       }
3375     }
3376     return true;
3377   }
3378 
3379   /**
3380    * A helper to handle region splitting transition event.
3381    */
3382   private boolean handleRegionSplitting(final RegionTransition rt, final String encodedName,
3383       final String prettyPrintedRegionName, final ServerName sn) {
3384     if (!serverManager.isServerOnline(sn)) {
3385       LOG.warn("Dropped splitting! ServerName=" + sn + " unknown.");
3386       return false;
3387     }
3388     byte [] payloadOfSplitting = rt.getPayload();
3389     List<HRegionInfo> splittingRegions;
3390     try {
3391       splittingRegions = HRegionInfo.parseDelimitedFrom(
3392         payloadOfSplitting, 0, payloadOfSplitting.length);
3393     } catch (IOException e) {
3394       LOG.error("Dropped splitting! Failed reading " + rt.getEventType()
3395         + " payload for " + prettyPrintedRegionName);
3396       return false;
3397     }
3398     assert splittingRegions.size() == 2;
3399     HRegionInfo hri_a = splittingRegions.get(0);
3400     HRegionInfo hri_b = splittingRegions.get(1);
3401 
3402     RegionState rs_p = regionStates.getRegionState(encodedName);
3403     RegionState rs_a = regionStates.getRegionState(hri_a);
3404     RegionState rs_b = regionStates.getRegionState(hri_b);
3405 
3406     if (!((rs_p == null || rs_p.isOpenOrSplittingOnServer(sn))
3407         && (rs_a == null || rs_a.isOpenOrSplittingNewOnServer(sn))
3408         && (rs_b == null || rs_b.isOpenOrSplittingNewOnServer(sn)))) {
3409       LOG.warn("Dropped splitting! Not in state good for SPLITTING; rs_p="
3410         + rs_p + ", rs_a=" + rs_a + ", rs_b=" + rs_b);
3411       return false;
3412     }
3413 
3414     if (rs_p == null) {
3415       // Splitting region should be online
3416       rs_p = regionStates.updateRegionState(rt, State.OPEN);
3417       if (rs_p == null) {
3418         LOG.warn("Received splitting for region " + prettyPrintedRegionName
3419           + " from server " + sn + " but it doesn't exist anymore,"
3420           + " probably already processed its split");
3421         return false;
3422       }
3423       regionStates.regionOnline(rs_p.getRegion(), sn);
3424     }
3425 
3426     HRegionInfo p = rs_p.getRegion();
3427     EventType et = rt.getEventType();
3428     if (et == EventType.RS_ZK_REQUEST_REGION_SPLIT) {
3429       try {
3430         if (SplitTransaction.transitionSplittingNode(watcher, p,
3431             hri_a, hri_b, sn, -1, EventType.RS_ZK_REQUEST_REGION_SPLIT,
3432             EventType.RS_ZK_REGION_SPLITTING) == -1) {
3433           byte[] data = ZKAssign.getData(watcher, encodedName);
3434           EventType currentType = null;
3435           if (data != null) {
3436             RegionTransition newRt = RegionTransition.parseFrom(data);
3437             currentType = newRt.getEventType();
3438           }
3439           if (currentType == null || (currentType != EventType.RS_ZK_REGION_SPLIT
3440               && currentType != EventType.RS_ZK_REGION_SPLITTING)) {
3441             LOG.warn("Failed to transition pending_split node "
3442               + encodedName + " to splitting, it's now " + currentType);
3443             return false;
3444           }
3445         }
3446       } catch (Exception e) {
3447         LOG.warn("Failed to transition pending_split node "
3448           + encodedName + " to splitting", e);
3449         return false;
3450       }
3451     }
3452 
3453     synchronized (regionStates) {
3454       regionStates.updateRegionState(hri_a, State.SPLITTING_NEW, sn);
3455       regionStates.updateRegionState(hri_b, State.SPLITTING_NEW, sn);
3456       regionStates.regionOffline(hri_a, State.SPLITTING_NEW);
3457       regionStates.regionOffline(hri_b, State.SPLITTING_NEW);
3458       regionStates.updateRegionState(rt, State.SPLITTING);
3459 
3460       // The below is for testing ONLY!  We can't do fault injection easily, so
3461       // resort to this kinda ugliness -- St.Ack 02/25/2011.
3462       if (TEST_SKIP_SPLIT_HANDLING) {
3463         LOG.warn("Skipping split message, TEST_SKIP_SPLIT_HANDLING is set");
3464         return true; // return true so that the splitting node stays
3465       }
3466 
3467       if (et == EventType.RS_ZK_REGION_SPLIT) {
3468         regionOffline(p, State.SPLIT);
3469         regionOnline(hri_a, sn);
3470         regionOnline(hri_b, sn);
3471       }
3472     }
3473 
3474     if (et == EventType.RS_ZK_REGION_SPLIT) {
3475       LOG.debug("Handling SPLIT event for " + encodedName + "; deleting node");
3476       // Remove region from ZK
3477       try {
3478         boolean successful = false;
3479         while (!successful) {
3480           // It's possible that the RS updates the znode in between our read of the
3481           // znode and the delete, so it's safe to retry.
3482           successful = ZKAssign.deleteNode(watcher, encodedName,
3483             EventType.RS_ZK_REGION_SPLIT, sn);
3484         }
3485       } catch (KeeperException e) {
3486         if (e instanceof NoNodeException) {
3487           String znodePath = ZKUtil.joinZNode(watcher.assignmentZNode, encodedName);
3488           LOG.debug("The znode " + znodePath + " does not exist.  May be deleted already.");
3489         } else {
3490           server.abort("Error deleting SPLIT node " + encodedName, e);
3491         }
3492       }
3493       LOG.info("Handled SPLIT event; parent=" + p.getRegionNameAsString()
3494         + ", daughter a=" + hri_a.getRegionNameAsString() + ", daughter b="
3495         + hri_b.getRegionNameAsString() + ", on " + sn);
3496 
3497       // User could disable the table before master knows the new region.
3498       if (zkTable.isDisablingOrDisabledTable(p.getTable())) {
3499         unassign(hri_a);
3500         unassign(hri_b);
3501       }
3502     }
3503     return true;
3504   }
3505 
3506   /**
3507    * A region is offline.  The new state should be the specified one,
3508    * if not null.  If the specified state is null, the new state is Offline.
3509    * The specified state can be Split/Merged/Offline/null only.
3510    */
3511   private void regionOffline(final HRegionInfo regionInfo, final State state) {
3512     regionStates.regionOffline(regionInfo, state);
3513     removeClosedRegion(regionInfo);
3514     // remove the region plan as well just in case.
3515     clearRegionPlan(regionInfo);
3516   }
3517 
3518   /**
3519    * @return Instance of load balancer
3520    */
3521   public LoadBalancer getBalancer() {
3522     return this.balancer;
3523   }
3524 }