1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.master;
20  
21  import java.io.IOException;
22  import java.util.ArrayList;
23  import java.util.Arrays;
24  import java.util.Collections;
25  import java.util.HashMap;
26  import java.util.HashSet;
27  import java.util.Iterator;
28  import java.util.List;
29  import java.util.Map;
30  import java.util.NavigableMap;
31  import java.util.Set;
32  import java.util.TreeMap;
33  import java.util.concurrent.ConcurrentHashMap;
34  import java.util.concurrent.ConcurrentSkipListSet;
35  import java.util.concurrent.ThreadFactory;
36  import java.util.concurrent.TimeUnit;
37  import java.util.concurrent.atomic.AtomicBoolean;
38  import java.util.concurrent.atomic.AtomicInteger;
39  import java.util.concurrent.locks.Lock;
40  import java.util.concurrent.locks.ReentrantLock;
41  
42  import org.apache.commons.logging.Log;
43  import org.apache.commons.logging.LogFactory;
44  import org.apache.hadoop.classification.InterfaceAudience;
45  import org.apache.hadoop.conf.Configuration;
46  import org.apache.hadoop.hbase.Chore;
47  import org.apache.hadoop.hbase.HBaseIOException;
48  import org.apache.hadoop.hbase.HConstants;
49  import org.apache.hadoop.hbase.HRegionInfo;
50  import org.apache.hadoop.hbase.NotServingRegionException;
51  import org.apache.hadoop.hbase.RegionTransition;
52  import org.apache.hadoop.hbase.Server;
53  import org.apache.hadoop.hbase.ServerName;
54  import org.apache.hadoop.hbase.Stoppable;
55  import org.apache.hadoop.hbase.TableName;
56  import org.apache.hadoop.hbase.TableNotFoundException;
57  import org.apache.hadoop.hbase.catalog.CatalogTracker;
58  import org.apache.hadoop.hbase.catalog.MetaReader;
59  import org.apache.hadoop.hbase.client.Result;
60  import org.apache.hadoop.hbase.exceptions.DeserializationException;
61  import org.apache.hadoop.hbase.executor.EventHandler;
62  import org.apache.hadoop.hbase.executor.EventType;
63  import org.apache.hadoop.hbase.executor.ExecutorService;
64  import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
65  import org.apache.hadoop.hbase.master.RegionState.State;
66  import org.apache.hadoop.hbase.master.balancer.FavoredNodeAssignmentHelper;
67  import org.apache.hadoop.hbase.master.balancer.FavoredNodeLoadBalancer;
68  import org.apache.hadoop.hbase.master.handler.ClosedRegionHandler;
69  import org.apache.hadoop.hbase.master.handler.DisableTableHandler;
70  import org.apache.hadoop.hbase.master.handler.EnableTableHandler;
71  import org.apache.hadoop.hbase.master.handler.OpenedRegionHandler;
72  import org.apache.hadoop.hbase.regionserver.RegionAlreadyInTransitionException;
73  import org.apache.hadoop.hbase.regionserver.RegionMergeTransaction;
74  import org.apache.hadoop.hbase.regionserver.RegionOpeningState;
75  import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
76  import org.apache.hadoop.hbase.regionserver.SplitTransaction;
77  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
78  import org.apache.hadoop.hbase.util.KeyLocker;
79  import org.apache.hadoop.hbase.util.Pair;
80  import org.apache.hadoop.hbase.util.PairOfSameType;
81  import org.apache.hadoop.hbase.util.Threads;
82  import org.apache.hadoop.hbase.util.Triple;
83  import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
84  import org.apache.hadoop.hbase.zookeeper.ZKAssign;
85  import org.apache.hadoop.hbase.zookeeper.ZKTable;
86  import org.apache.hadoop.hbase.zookeeper.ZKUtil;
87  import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener;
88  import org.apache.hadoop.ipc.RemoteException;
89  import org.apache.zookeeper.AsyncCallback;
90  import org.apache.zookeeper.KeeperException;
91  import org.apache.zookeeper.KeeperException.NoNodeException;
92  import org.apache.zookeeper.KeeperException.NodeExistsException;
93  import org.apache.zookeeper.data.Stat;
94  
95  import com.google.common.base.Preconditions;
96  import com.google.common.collect.LinkedHashMultimap;
97  
98  /**
99   * Manages and performs region assignment.
100  * <p>
101  * Monitors ZooKeeper for events related to regions in transition.
102  * <p>
103  * Handles existing regions in transition during master failover.
104  */
105 @InterfaceAudience.Private
106 public class AssignmentManager extends ZooKeeperListener {
107   private static final Log LOG = LogFactory.getLog(AssignmentManager.class);
108 
109   public static final ServerName HBCK_CODE_SERVERNAME = ServerName.valueOf(HConstants.HBCK_CODE_NAME,
110       -1, -1L);
111 
112   public static final String ASSIGNMENT_TIMEOUT = "hbase.master.assignment.timeoutmonitor.timeout";
113   public static final int DEFAULT_ASSIGNMENT_TIMEOUT_DEFAULT = 600000;
114   public static final String ASSIGNMENT_TIMEOUT_MANAGEMENT = "hbase.assignment.timeout.management";
115   public static final boolean DEFAULT_ASSIGNMENT_TIMEOUT_MANAGEMENT = false;
116 
117   public static final String ALREADY_IN_TRANSITION_WAITTIME
118     = "hbase.assignment.already.intransition.waittime";
119   public static final int DEFAULT_ALREADY_IN_TRANSITION_WAITTIME = 60000; // 1 minute
120 
121   protected final Server server;
122 
123   private ServerManager serverManager;
124 
125   private boolean shouldAssignRegionsWithFavoredNodes;
126 
127   private CatalogTracker catalogTracker;
128 
129   protected final TimeoutMonitor timeoutMonitor;
130 
131   private final TimerUpdater timerUpdater;
132 
133   private LoadBalancer balancer;
134 
135   private final MetricsAssignmentManager metricsAssignmentManager;
136 
137   private final TableLockManager tableLockManager;
138 
139   private AtomicInteger numRegionsOpened = new AtomicInteger(0);
140 
141   final private KeyLocker<String> locker = new KeyLocker<String>();
142 
143   /**
144    * Map of regions to reopen after the schema of a table is changed. Key -
145    * encoded region name, value - HRegionInfo
146    */
147   private final Map <String, HRegionInfo> regionsToReopen;
148 
149   /*
150    * Maximum times we recurse an assignment/unassignment.
151    * See below in {@link #assign()} and {@link #unassign()}.
152    */
153   private final int maximumAttempts;
154 
155   /**
156    * Map from the region to be created (its encoded name) to its two merging regions.
157    */
158   private final Map<String, PairOfSameType<HRegionInfo>> mergingRegions
159     = new HashMap<String, PairOfSameType<HRegionInfo>>();
160 
161   /**
162    * The sleep time that the assignment will wait before retrying an hbase:meta
163    * assignment that failed because no region plan was available
164    */
165   private final long sleepTimeBeforeRetryingMetaAssignment;
166 
167   /** Plans for region movement. Key is the encoded version of a region name*/
168   // TODO: When do plans get cleaned out?  Ever? In server open and in server
169   // shutdown processing -- St.Ack
170   // All access to this Map must be synchronized.
171   final NavigableMap<String, RegionPlan> regionPlans =
172     new TreeMap<String, RegionPlan>();
173 
174   private final ZKTable zkTable;
175 
176   /**
177    * Contains the servers which need timer updates; these servers will be
178    * handled by {@link TimerUpdater}
179    */
180   private final ConcurrentSkipListSet<ServerName> serversInUpdatingTimer;
181 
182   private final ExecutorService executorService;
183 
184   // For unit tests, keep track of calls to ClosedRegionHandler
185   private Map<HRegionInfo, AtomicBoolean> closedRegionHandlerCalled = null;
186 
187   // For unit tests, keep track of calls to OpenedRegionHandler
188   private Map<HRegionInfo, AtomicBoolean> openedRegionHandlerCalled = null;
189 
190   //Thread pool executor service for timeout monitor
191   private java.util.concurrent.ExecutorService threadPoolExecutorService;
192 
193   // A bunch of ZK events workers. Each is a single thread executor service
194   private final java.util.concurrent.ExecutorService zkEventWorkers;
195 
196   private List<EventType> ignoreStatesRSOffline = Arrays.asList(
197       EventType.RS_ZK_REGION_FAILED_OPEN, EventType.RS_ZK_REGION_CLOSED);
198 
199   private final RegionStates regionStates;
200 
201   // The threshold to use bulk assigning. Bulk assignment is used
202   // only if assigning at least this many regions to at least this
203   // many servers. If assigning fewer regions to fewer servers,
204   // bulk assigning may not be as efficient.
205   private final int bulkAssignThresholdRegions;
206   private final int bulkAssignThresholdServers;
207 
208   // Should bulk assignment wait till all regions are assigned,
209   // or until it times out?  This is useful to measure bulk assignment
210   // performance, but not needed in most use cases.
211   private final boolean bulkAssignWaitTillAllAssigned;
212 
213   /**
214    * Indicator that AssignmentManager has recovered the region states so
215    * that ServerShutdownHandler can be fully enabled and re-assign regions
216    * of dead servers. This ensures that when re-assignment happens,
217    * AssignmentManager has proper region states.
218    *
219    * Protected to ease testing.
220    */
221   protected final AtomicBoolean failoverCleanupDone = new AtomicBoolean(false);
222 
223   /** Is the TimeOutManagement activated **/
224   private final boolean tomActivated;
225 
226   /**
227    * A map to track the number of times in a row a region fails to open,
228    * so that we don't try to open a region forever if the failure is
229    * unrecoverable.  We don't put this information in region states
230    * because we don't expect this to happen frequently; we don't
231    * want to copy this information over during each state transition either.
232    */
233   private final ConcurrentHashMap<String, AtomicInteger>
234     failedOpenTracker = new ConcurrentHashMap<String, AtomicInteger>();
235 
236   /**
237    * For testing only!  Set to true to skip handling of split.
238    */
239   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="MS_SHOULD_BE_FINAL")
240   public static boolean TEST_SKIP_SPLIT_HANDLING = false;
241 
242   /**
243    * Constructs a new assignment manager.
244    *
245    * @param server
246    * @param serverManager
247    * @param catalogTracker
248    * @param service
249    * @throws KeeperException
250    * @throws IOException
251    */
252   public AssignmentManager(Server server, ServerManager serverManager,
253       CatalogTracker catalogTracker, final LoadBalancer balancer,
254       final ExecutorService service, MetricsMaster metricsMaster,
255       final TableLockManager tableLockManager) throws KeeperException, IOException {
256     super(server.getZooKeeper());
257     this.server = server;
258     this.serverManager = serverManager;
259     this.catalogTracker = catalogTracker;
260     this.executorService = service;
261     this.regionsToReopen = Collections.synchronizedMap
262                            (new HashMap<String, HRegionInfo> ());
263     Configuration conf = server.getConfiguration();
264     // Only read favored nodes if using the favored nodes load balancer.
265     this.shouldAssignRegionsWithFavoredNodes = conf.getClass(
266            HConstants.HBASE_MASTER_LOADBALANCER_CLASS, Object.class).equals(
267            FavoredNodeLoadBalancer.class);
268     this.tomActivated = conf.getBoolean(
269       ASSIGNMENT_TIMEOUT_MANAGEMENT, DEFAULT_ASSIGNMENT_TIMEOUT_MANAGEMENT);
270     if (tomActivated){
271       this.serversInUpdatingTimer =  new ConcurrentSkipListSet<ServerName>();
272       this.timeoutMonitor = new TimeoutMonitor(
273         conf.getInt("hbase.master.assignment.timeoutmonitor.period", 30000),
274         server, serverManager,
275         conf.getInt(ASSIGNMENT_TIMEOUT, DEFAULT_ASSIGNMENT_TIMEOUT_DEFAULT));
276       this.timerUpdater = new TimerUpdater(conf.getInt(
277         "hbase.master.assignment.timerupdater.period", 10000), server);
278       Threads.setDaemonThreadRunning(timerUpdater.getThread(),
279         server.getServerName() + ".timerUpdater");
280     } else {
281       this.serversInUpdatingTimer =  null;
282       this.timeoutMonitor = null;
283       this.timerUpdater = null;
284     }
285     this.zkTable = new ZKTable(this.watcher);
286     // This is the max attempts, not retries, so it should be at least 1.
287     this.maximumAttempts = Math.max(1,
288       this.server.getConfiguration().getInt("hbase.assignment.maximum.attempts", 10));
289     this.sleepTimeBeforeRetryingMetaAssignment = this.server.getConfiguration().getLong(
290         "hbase.meta.assignment.retry.sleeptime", 1000l);
291     this.balancer = balancer;
292     int maxThreads = conf.getInt("hbase.assignment.threads.max", 30);
293     this.threadPoolExecutorService = Threads.getBoundedCachedThreadPool(
294       maxThreads, 60L, TimeUnit.SECONDS, Threads.newDaemonThreadFactory("AM."));
295     this.regionStates = new RegionStates(server, serverManager);
296 
297     this.bulkAssignWaitTillAllAssigned =
298       conf.getBoolean("hbase.bulk.assignment.waittillallassigned", false);
299     this.bulkAssignThresholdRegions = conf.getInt("hbase.bulk.assignment.threshold.regions", 7);
300     this.bulkAssignThresholdServers = conf.getInt("hbase.bulk.assignment.threshold.servers", 3);
301 
302     int workers = conf.getInt("hbase.assignment.zkevent.workers", 20);
303     ThreadFactory threadFactory = Threads.newDaemonThreadFactory("AM.ZK.Worker");
304     zkEventWorkers = Threads.getBoundedCachedThreadPool(workers, 60L,
305             TimeUnit.SECONDS, threadFactory);
306     this.tableLockManager = tableLockManager;
307 
308     this.metricsAssignmentManager = new MetricsAssignmentManager();
309   }
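
  // For illustration only: the constructor above reads its tuning knobs from the cluster
  // Configuration. A deployment could, for example, override the retry and worker-pool
  // settings in hbase-site.xml (the values shown are the defaults used above):
  //
  //   <property>
  //     <name>hbase.assignment.maximum.attempts</name>
  //     <value>10</value>
  //   </property>
  //   <property>
  //     <name>hbase.assignment.zkevent.workers</name>
  //     <value>20</value>
  //   </property>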
310 
311   void startTimeOutMonitor() {
312     if (tomActivated) {
313       Threads.setDaemonThreadRunning(timeoutMonitor.getThread(), server.getServerName()
314           + ".timeoutMonitor");
315     }
316   }
317 
318   /**
319    * @return Instance of ZKTable.
320    */
321   public ZKTable getZKTable() {
322     // These are 'expensive' to make, involving a trip to the zk ensemble, so allow
323     // sharing.
324     return this.zkTable;
325   }
326 
327   /**
328    * This SHOULD not be public. It is public now
329    * because of some unit tests.
330    *
331    * TODO: make it package private and keep RegionStates in the master package
332    */
333   public RegionStates getRegionStates() {
334     return regionStates;
335   }
336 
337   public RegionPlan getRegionReopenPlan(HRegionInfo hri) {
338     return new RegionPlan(hri, null, regionStates.getRegionServerOfRegion(hri));
339   }
340 
341   /**
342    * Add a regionPlan for the specified region.
343    * @param encodedName
344    * @param plan
345    */
346   public void addPlan(String encodedName, RegionPlan plan) {
347     synchronized (regionPlans) {
348       regionPlans.put(encodedName, plan);
349     }
350   }
351 
352   /**
353    * Add a map of region plans.
354    */
355   public void addPlans(Map<String, RegionPlan> plans) {
356     synchronized (regionPlans) {
357       regionPlans.putAll(plans);
358     }
359   }
360 
361   /**
362    * Set the list of regions that will be reopened
363    * because of an update in table schema
364    *
365    * @param regions
366    *          list of regions that should be tracked for reopen
367    */
368   public void setRegionsToReopen(List <HRegionInfo> regions) {
369     for(HRegionInfo hri : regions) {
370       regionsToReopen.put(hri.getEncodedName(), hri);
371     }
372   }
373 
374   /**
375    * Used by the client to check whether all regions have received the schema updates
376    *
377    * @param tableName
378    * @return Pair indicating the status of the alter command
379    * @throws IOException
380    */
381   public Pair<Integer, Integer> getReopenStatus(TableName tableName)
382       throws IOException {
383     List <HRegionInfo> hris =
384       MetaReader.getTableRegions(this.server.getCatalogTracker(), tableName, true);
385     Integer pending = 0;
386     for (HRegionInfo hri : hris) {
387       String name = hri.getEncodedName();
388       // no lock concurrent access ok: sequential consistency respected.
389       if (regionsToReopen.containsKey(name)
390           || regionStates.isRegionInTransition(name)) {
391         pending++;
392       }
393     }
394     return new Pair<Integer, Integer>(pending, hris.size());
395   }
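
  // For illustration only (variable names here are hypothetical): a caller checking alter
  // progress could interpret the pair returned above as follows:
  //
  //   Pair<Integer, Integer> status = assignmentManager.getReopenStatus(tableName);
  //   int pending = status.getFirst();    // regions still to be reopened or in transition
  //   int total   = status.getSecond();   // total regions of the table
  //   boolean done = (pending == 0);      // the schema update has reached every region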
396 
397   /**
398    * Used by ServerShutdownHandler to make sure AssignmentManager has completed
399    * the failover cleanup before re-assigning regions of dead servers. This ensures
400    * that when re-assignment happens, AssignmentManager has proper region states.
401    */
402   public boolean isFailoverCleanupDone() {
403     return failoverCleanupDone.get();
404   }
405 
406   /**
407    * To avoid racing with AM, external entities may need to lock a region,
408    * for example, when SSH checks what regions to skip re-assigning.
409    */
410   public Lock acquireRegionLock(final String encodedName) {
411     return locker.acquireLock(encodedName);
412   }
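
  // For illustration only: an external caller (e.g. ServerShutdownHandler) would normally
  // wrap the lock above in try/finally so the region is always released:
  //
  //   Lock lock = assignmentManager.acquireRegionLock(hri.getEncodedName());
  //   try {
  //     // inspect or adjust this region's state without racing with the AssignmentManager
  //   } finally {
  //     lock.unlock();
  //   }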
413 
414   /**
415    * Marks failover cleanup as completed. Notifies the server manager to
416    * process any queued-up dead servers.
417    */
418   void failoverCleanupDone() {
419     failoverCleanupDone.set(true);
420     serverManager.processQueuedDeadServers();
421   }
422 
423   /**
424    * Called on startup.
425    * Figures out whether this is a fresh cluster start or we are joining an extant running cluster.
426    * @throws IOException
427    * @throws KeeperException
428    * @throws InterruptedException
429    */
430   void joinCluster() throws IOException,
431       KeeperException, InterruptedException {
432     // Concurrency note: In the below the accesses on regionsInTransition are
433     // outside of a synchronization block where usually all accesses to RIT are
434     // synchronized.  The presumption is that in this case it is safe since this
435     // method is being played by a single thread on startup.
436 
437     // TODO: Regions that have a null location and are not in regionsInTransitions
438     // need to be handled.
439 
440     // Scan hbase:meta to build list of existing regions, servers, and assignment
441     // Returns servers who have not checked in (assumed dead) and their regions
442     Map<ServerName, List<HRegionInfo>> deadServers = rebuildUserRegions();
443 
444     // This method will assign all user regions if a clean server startup or
445     // it will reconstruct master state and cleanup any leftovers from
446     // previous master process.
447     processDeadServersAndRegionsInTransition(deadServers);
448 
449     recoverTableInDisablingState();
450     recoverTableInEnablingState();
451   }
452 
453   /**
454    * Processes all regions that are in transition in zookeeper and also
455    * processes the list of dead servers by scanning the META.
456    * Used by a master joining a cluster.  If we figure this is a clean cluster
457    * startup, will assign all user regions.
458    * @param deadServers
459    *          Map of dead servers and their regions. Can be null.
460    * @throws KeeperException
461    * @throws IOException
462    * @throws InterruptedException
463    */
464   void processDeadServersAndRegionsInTransition(
465       final Map<ServerName, List<HRegionInfo>> deadServers)
466           throws KeeperException, IOException, InterruptedException {
467     List<String> nodes = ZKUtil.listChildrenNoWatch(watcher,
468       watcher.assignmentZNode);
469 
470     if (nodes == null) {
471       String errorMessage = "Failed to get the children from ZK";
472       server.abort(errorMessage, new IOException(errorMessage));
473       return;
474     }
475 
476     boolean failover = (!serverManager.getDeadServers().isEmpty() || !serverManager
477         .getRequeuedDeadServers().isEmpty());
478 
479     if (!failover) {
480       // If any one region except meta is assigned, it's a failover.
481       Map<HRegionInfo, ServerName> regions = regionStates.getRegionAssignments();
482       for (HRegionInfo hri: regions.keySet()) {
483         if (!hri.isMetaTable()) {
484           LOG.debug("Found " + hri + " out on cluster");
485           failover = true;
486           break;
487         }
488       }
489       if (!failover) {
490         // If any one region except meta is in transition, it's a failover.
491         for (String encodedName: nodes) {
492           RegionState state = regionStates.getRegionState(encodedName);
493           if (state != null && !state.getRegion().isMetaRegion()) {
494             LOG.debug("Found " + state.getRegion().getRegionNameAsString() + " in RITs");
495             failover = true;
496             break;
497           }
498         }
499       }
500     }
501 
502     // If we found user regions out on cluster, it's a failover.
503     if (failover) {
504       LOG.info("Found regions out on cluster or in RIT; presuming failover");
505       // Process list of dead servers and regions in RIT.
506       // See HBASE-4580 for more information.
507       processDeadServersAndRecoverLostRegions(deadServers);
508     } else {
509       // Fresh cluster startup.
510       LOG.info("Clean cluster startup. Assigning user regions");
511       assignAllUserRegions();
512     }
513   }
514 
515   /**
516    * If the region is up in zk in transition, then do fixup and block and wait until
517    * the region is assigned and out of transition.  Used on startup for
518    * catalog regions.
519    * @param hri Region to look for.
520    * @return True if we processed a region in transition else false if region
521    * was not up in zk in transition.
522    * @throws InterruptedException
523    * @throws KeeperException
524    * @throws IOException
525    */
526   boolean processRegionInTransitionAndBlockUntilAssigned(final HRegionInfo hri)
527       throws InterruptedException, KeeperException, IOException {
528     String encodedRegionName = hri.getEncodedName();
529     if (!processRegionInTransition(encodedRegionName, hri)) {
530       return false; // The region is not in transition
531     }
532     LOG.debug("Waiting on " + HRegionInfo.prettyPrint(encodedRegionName));
533     while (!this.server.isStopped() &&
534         this.regionStates.isRegionInTransition(encodedRegionName)) {
535       RegionState state = this.regionStates.getRegionTransitionState(encodedRegionName);
536       if (state == null || !serverManager.isServerOnline(state.getServerName())) {
537         // The region is not in transition, or not in transition on an online
538         // server. Doesn't help to block here any more. Caller needs to
539         // verify the region is actually assigned.
540         break;
541       }
542       this.regionStates.waitForUpdate(100);
543     }
544     return true;
545   }
546 
547   /**
548    * Process failover of new master for region <code>encodedRegionName</code>
549    * up in zookeeper.
550    * @param encodedRegionName Region to process failover for.
551    * @param regionInfo If null we'll go get it from meta table.
552    * @return True if we processed <code>regionInfo</code> as a RIT.
553    * @throws KeeperException
554    * @throws IOException
555    */
556   boolean processRegionInTransition(final String encodedRegionName,
557       final HRegionInfo regionInfo) throws KeeperException, IOException {
558     // We need a lock here to ensure that we will not put the same region twice
559     // It has no reason to be a lock shared with the other operations.
560     // We can do the lock on the region only, instead of a global lock: what we want to ensure
561     // is that we don't have two threads working on the same region.
562     Lock lock = locker.acquireLock(encodedRegionName);
563     try {
564       Stat stat = new Stat();
565       byte [] data = ZKAssign.getDataAndWatch(watcher, encodedRegionName, stat);
566       if (data == null) return false;
567       RegionTransition rt;
568       try {
569         rt = RegionTransition.parseFrom(data);
570       } catch (DeserializationException e) {
571         LOG.warn("Failed parse znode data", e);
572         return false;
573       }
574       HRegionInfo hri = regionInfo;
575       if (hri == null) {
576         // The region info is not passed in. We will try to find the region
577         // from region states map/meta based on the encoded region name. But we
578         // may not be able to find it. This is valid for online merge, where
579         // the region may not have been created yet if the merge is not completed.
580         // Therefore, it is not in meta at master recovery time.
581         hri = regionStates.getRegionInfo(rt.getRegionName());
582         EventType et = rt.getEventType();
583         if (hri == null && et != EventType.RS_ZK_REGION_MERGING
584             && et != EventType.RS_ZK_REQUEST_REGION_MERGE) {
585           LOG.warn("Couldn't find the region in recovering " + rt);
586           return false;
587         }
588       }
589       return processRegionsInTransition(
590         rt, hri, stat.getVersion());
591     } finally {
592       lock.unlock();
593     }
594   }
595 
596   /**
597    * This call is invoked only (1) when the master assigns meta, and
598    * (2) during failover mode startup, while processing zk assignment nodes.
599    * The locker is set in the caller. It returns true if the region
600    * is in transition for sure, false otherwise.
601    *
602    * It should be private but it is used by some tests too.
603    */
604   boolean processRegionsInTransition(
605       final RegionTransition rt, final HRegionInfo regionInfo,
606       final int expectedVersion) throws KeeperException {
607     EventType et = rt.getEventType();
608     // Get ServerName.  Cannot be null.
609     final ServerName sn = rt.getServerName();
610     final byte[] regionName = rt.getRegionName();
611     final String encodedName = HRegionInfo.encodeRegionName(regionName);
612     final String prettyPrintedRegionName = HRegionInfo.prettyPrint(encodedName);
613     LOG.info("Processing " + prettyPrintedRegionName + " in state: " + et);
614 
615     if (regionStates.isRegionInTransition(encodedName)) {
616       LOG.info("Processed region " + prettyPrintedRegionName + " in state: "
617         + et + ", does nothing since the region is already in transition "
618         + regionStates.getRegionTransitionState(encodedName));
619       // Just return
620       return true;
621     }
622     if (!serverManager.isServerOnline(sn)) {
623       // It was transitioning on a dead server, so it's closed now.
624       // Force to OFFLINE and put it in transition, but not assign it
625       // since log splitting for the dead server is not done yet.
626       LOG.debug("RIT " + encodedName + " in state=" + rt.getEventType() +
627         " was on deadserver; forcing offline");
628       if (regionStates.isRegionOnline(regionInfo)) {
629         // Meta could still show the region is assigned to the previous
630         // server. If that server is online, when we reload the meta, the
631         // region is put back online, so we need to offline it.
632         regionStates.regionOffline(regionInfo);
633       }
634       // Put it back in transition so that SSH can re-assign it
635       regionStates.updateRegionState(regionInfo, State.OFFLINE, sn);
636 
637       if (regionInfo.isMetaRegion()) {
638         // If it's meta region, reset the meta location.
639         // So that master knows the right meta region server.
640         MetaRegionTracker.setMetaLocation(watcher, sn);
641       } else {
642         // Whether the previous server is online or offline,
643         // we need to reset the last region server of the region.
644         regionStates.setLastRegionServerOfRegion(sn, encodedName);
645         // Make sure we know the server is dead.
646         if (!serverManager.isServerDead(sn)) {
647           serverManager.expireServer(sn);
648         }
649       }
650       return false;
651     }
652     switch (et) {
653       case M_ZK_REGION_CLOSING:
654         // Insert into RIT & resend the query to the region server: maybe the previous master
655         // died before sending the query the first time.
656         final RegionState rsClosing = regionStates.updateRegionState(rt, State.CLOSING);
657         this.executorService.submit(
658           new EventHandler(server, EventType.M_MASTER_RECOVERY) {
659             @Override
660             public void process() throws IOException {
661               ReentrantLock lock = locker.acquireLock(regionInfo.getEncodedName());
662               try {
663                 unassign(regionInfo, rsClosing, expectedVersion, null, true, null);
664                 if (regionStates.isRegionOffline(regionInfo)) {
665                   assign(regionInfo, true);
666                 }
667               } finally {
668                 lock.unlock();
669               }
670             }
671           });
672         break;
673 
674       case RS_ZK_REGION_CLOSED:
675       case RS_ZK_REGION_FAILED_OPEN:
676         // Region is closed, insert into RIT and handle it
677         regionStates.updateRegionState(regionInfo, State.CLOSED, sn);
678         invokeAssign(regionInfo);
679         break;
680 
681       case M_ZK_REGION_OFFLINE:
682         // Insert in RIT and resend to the regionserver
683         regionStates.updateRegionState(rt, State.PENDING_OPEN);
684         final RegionState rsOffline = regionStates.getRegionState(regionInfo);
685         this.executorService.submit(
686           new EventHandler(server, EventType.M_MASTER_RECOVERY) {
687             @Override
688             public void process() throws IOException {
689               ReentrantLock lock = locker.acquireLock(regionInfo.getEncodedName());
690               try {
691                 RegionPlan plan = new RegionPlan(regionInfo, null, sn);
692                 addPlan(encodedName, plan);
693                 assign(rsOffline, false, false);
694               } finally {
695                 lock.unlock();
696               }
697             }
698           });
699         break;
700 
701       case RS_ZK_REGION_OPENING:
702         regionStates.updateRegionState(rt, State.OPENING);
703         break;
704 
705       case RS_ZK_REGION_OPENED:
706         // Region is opened, insert into RIT and handle it
707         // This could be done asynchronously, but we would then need to acquire the lock in the
708         //  handler.
709         regionStates.updateRegionState(rt, State.OPEN);
710         new OpenedRegionHandler(server, this, regionInfo, sn, expectedVersion).process();
711         break;
712       case RS_ZK_REQUEST_REGION_SPLIT:
713       case RS_ZK_REGION_SPLITTING:
714       case RS_ZK_REGION_SPLIT:
715         // Splitting region should be online. We could have skipped it during
716         // user region rebuilding since we may consider the split completed.
717         // Put it in SPLITTING state to avoid complications.
718         regionStates.regionOnline(regionInfo, sn);
719         regionStates.updateRegionState(rt, State.SPLITTING);
720         if (!handleRegionSplitting(
721             rt, encodedName, prettyPrintedRegionName, sn)) {
722           deleteSplittingNode(encodedName, sn);
723         }
724         break;
725       case RS_ZK_REQUEST_REGION_MERGE:
726       case RS_ZK_REGION_MERGING:
727       case RS_ZK_REGION_MERGED:
728         if (!handleRegionMerging(
729             rt, encodedName, prettyPrintedRegionName, sn)) {
730           deleteMergingNode(encodedName, sn);
731         }
732         break;
733       default:
734         throw new IllegalStateException("Received region in state:" + et + " is not valid.");
735     }
736     LOG.info("Processed region " + prettyPrintedRegionName + " in state "
737       + et + ", on " + (serverManager.isServerOnline(sn) ? "" : "dead ")
738       + "server: " + sn);
739     return true;
740   }
741 
742   /**
743    * When a region is closed, it should be removed from the regionsToReopen
744    * @param hri HRegionInfo of the region which was closed
745    */
746   public void removeClosedRegion(HRegionInfo hri) {
747     if (regionsToReopen.remove(hri.getEncodedName()) != null) {
748       LOG.debug("Removed region from reopening regions because it was closed");
749     }
750   }
751 
752   /**
753    * Handles various states an unassigned node can be in.
754    * <p>
755    * Method is called when a state change is suspected for an unassigned node.
756    * <p>
757    * This deals with skipped transitions (we got a CLOSED but didn't see CLOSING
758    * yet).
759    * @param rt
760    * @param expectedVersion
761    */
762   void handleRegion(final RegionTransition rt, int expectedVersion) {
763     if (rt == null) {
764       LOG.warn("Unexpected NULL input for RegionTransition rt");
765       return;
766     }
767     final ServerName sn = rt.getServerName();
768     // Check if this is a special HBCK transition
769     if (sn.equals(HBCK_CODE_SERVERNAME)) {
770       handleHBCK(rt);
771       return;
772     }
773     final long createTime = rt.getCreateTime();
774     final byte[] regionName = rt.getRegionName();
775     String encodedName = HRegionInfo.encodeRegionName(regionName);
776     String prettyPrintedRegionName = HRegionInfo.prettyPrint(encodedName);
777     // Verify this is a known server
778     if (!serverManager.isServerOnline(sn)
779       && !ignoreStatesRSOffline.contains(rt.getEventType())) {
780       LOG.warn("Attempted to handle region transition for server but " +
781         "it is not online: " + prettyPrintedRegionName + ", " + rt);
782       return;
783     }
784 
785     RegionState regionState =
786       regionStates.getRegionState(encodedName);
787     long startTime = System.currentTimeMillis();
788     if (LOG.isDebugEnabled()) {
789       boolean lateEvent = createTime < (startTime - 15000);
790       LOG.debug("Handling " + rt.getEventType() +
791         ", server=" + sn + ", region=" +
792         (prettyPrintedRegionName == null ? "null" : prettyPrintedRegionName) +
793         (lateEvent ? ", which is more than 15 seconds late" : "") +
794         ", current_state=" + regionState);
795     }
796     // We don't do anything for this event,
797     // so separate it out, no need to lock/unlock anything
798     if (rt.getEventType() == EventType.M_ZK_REGION_OFFLINE) {
799       return;
800     }
801 
802     // We need a lock on the region as we could update it
803     Lock lock = locker.acquireLock(encodedName);
804     try {
805       RegionState latestState =
806         regionStates.getRegionState(encodedName);
807       if ((regionState == null && latestState != null)
808           || (regionState != null && latestState == null)
809           || (regionState != null && latestState != null
810             && latestState.getState() != regionState.getState())) {
811         LOG.warn("Region state changed from " + regionState + " to "
812           + latestState + ", while acquiring lock");
813       }
814       long waitedTime = System.currentTimeMillis() - startTime;
815       if (waitedTime > 5000) {
816         LOG.warn("Took " + waitedTime + "ms to acquire the lock");
817       }
818       regionState = latestState;
819       switch (rt.getEventType()) {
820       case RS_ZK_REQUEST_REGION_SPLIT:
821       case RS_ZK_REGION_SPLITTING:
822       case RS_ZK_REGION_SPLIT:
823         if (!handleRegionSplitting(
824             rt, encodedName, prettyPrintedRegionName, sn)) {
825           deleteSplittingNode(encodedName, sn);
826         }
827         break;
828 
829       case RS_ZK_REQUEST_REGION_MERGE:
830       case RS_ZK_REGION_MERGING:
831       case RS_ZK_REGION_MERGED:
832         // Merged region is a new region, we can't find it in the region states now.
833         // However, the two merging regions are not new. They should be in a merging state.
834         if (!handleRegionMerging(
835             rt, encodedName, prettyPrintedRegionName, sn)) {
836           deleteMergingNode(encodedName, sn);
837         }
838         break;
839 
840       case M_ZK_REGION_CLOSING:
841         // Should see CLOSING after we have asked it to CLOSE or additional
842         // times after already being in state of CLOSING
843         if (regionState == null
844             || !regionState.isPendingCloseOrClosingOnServer(sn)) {
845           LOG.warn("Received CLOSING for " + prettyPrintedRegionName
846             + " from " + sn + " but the region isn't PENDING_CLOSE/CLOSING here: "
847             + regionStates.getRegionState(encodedName));
848           return;
849         }
850         // Transition to CLOSING (or update stamp if already CLOSING)
851         regionStates.updateRegionState(rt, State.CLOSING);
852         break;
853 
854       case RS_ZK_REGION_CLOSED:
855         // Should see CLOSED after CLOSING but possible after PENDING_CLOSE
856         if (regionState == null
857             || !regionState.isPendingCloseOrClosingOnServer(sn)) {
858           LOG.warn("Received CLOSED for " + prettyPrintedRegionName
859             + " from " + sn + " but the region isn't PENDING_CLOSE/CLOSING here: "
860             + regionStates.getRegionState(encodedName));
861           return;
862         }
863         // Handle CLOSED by assigning elsewhere or stopping if a disable
864         // If we got here all is good.  Need to update RegionState -- else
865         // what follows will fail because not in expected state.
866         new ClosedRegionHandler(server, this, regionState.getRegion()).process();
867         updateClosedRegionHandlerTracker(regionState.getRegion());
868         break;
869 
870         case RS_ZK_REGION_FAILED_OPEN:
871           if (regionState == null
872               || !regionState.isPendingOpenOrOpeningOnServer(sn)) {
873             LOG.warn("Received FAILED_OPEN for " + prettyPrintedRegionName
874               + " from " + sn + " but the region isn't PENDING_OPEN/OPENING here: "
875               + regionStates.getRegionState(encodedName));
876             return;
877           }
878           AtomicInteger failedOpenCount = failedOpenTracker.get(encodedName);
879           if (failedOpenCount == null) {
880             failedOpenCount = new AtomicInteger();
881             // No need to use putIfAbsent, or extra synchronization since
882             // this whole handleRegion block is locked on the encoded region
883             // name, and failedOpenTracker is updated only in this block
884             failedOpenTracker.put(encodedName, failedOpenCount);
885           }
886           if (failedOpenCount.incrementAndGet() >= maximumAttempts) {
887             regionStates.updateRegionState(rt, State.FAILED_OPEN);
888             // remove the tracking info to save memory, also reset
889             // the count for next open initiative
890             failedOpenTracker.remove(encodedName);
891           } else {
892             // Handle this the same as if it were opened and then closed.
893             regionState = regionStates.updateRegionState(rt, State.CLOSED);
894             if (regionState != null) {
895               // When there is more than one region server, a new RS is selected as the
896               // destination and the region plan is updated accordingly. (HBASE-5546)
897               try {
898                 getRegionPlan(regionState.getRegion(), sn, true);
899                 new ClosedRegionHandler(server, this, regionState.getRegion()).process();
900               } catch (HBaseIOException e) {
901                 LOG.warn("Failed to get region plan", e);
902               }
903             }
904           }
905           break;
906 
907         case RS_ZK_REGION_OPENING:
908           // Should see OPENING after we have asked it to OPEN or additional
909           // times after already being in state of OPENING
910           if (regionState == null
911               || !regionState.isPendingOpenOrOpeningOnServer(sn)) {
912             LOG.warn("Received OPENING for " + prettyPrintedRegionName
913               + " from " + sn + " but the region isn't PENDING_OPEN/OPENING here: "
914               + regionStates.getRegionState(encodedName));
915             return;
916           }
917           // Transition to OPENING (or update stamp if already OPENING)
918           regionStates.updateRegionState(rt, State.OPENING);
919           break;
920 
921         case RS_ZK_REGION_OPENED:
922           // Should see OPENED after OPENING but possible after PENDING_OPEN.
923           if (regionState == null
924               || !regionState.isPendingOpenOrOpeningOnServer(sn)) {
925             LOG.warn("Received OPENED for " + prettyPrintedRegionName
926               + " from " + sn + " but the region isn't PENDING_OPEN/OPENING here: "
927               + regionStates.getRegionState(encodedName));
928 
929             if (regionState != null) {
930               // Close it without updating the internal region states,
931               // so as not to create double assignments in unlucky scenarios
932               // mentioned in OpenRegionHandler#process
933               unassign(regionState.getRegion(), null, -1, null, false, sn);
934             }
935             return;
936           }
937           // Handle OPENED by removing from transition and deleting the zk node
938           regionState = regionStates.updateRegionState(rt, State.OPEN);
939           if (regionState != null) {
940             failedOpenTracker.remove(encodedName); // reset the count, if any
941             new OpenedRegionHandler(
942               server, this, regionState.getRegion(), sn, expectedVersion).process();
943             updateOpenedRegionHandlerTracker(regionState.getRegion());
944           }
945           break;
946 
947         default:
948           throw new IllegalStateException("Received event is not valid.");
949       }
950     } finally {
951       lock.unlock();
952     }
953   }
954 
955   //For unit tests only
956   boolean wasClosedHandlerCalled(HRegionInfo hri) {
957     AtomicBoolean b = closedRegionHandlerCalled.get(hri);
958     //compareAndSet to be sure that unit tests don't see stale values. Means,
959     //we will return true exactly once unless the handler code resets this
960     //value to true.
961     return b == null ? false : b.compareAndSet(true, false);
962   }
963 
964   //For unit tests only
965   boolean wasOpenedHandlerCalled(HRegionInfo hri) {
966     AtomicBoolean b = openedRegionHandlerCalled.get(hri);
967     //compareAndSet to be sure that unit tests don't see stale values. Means,
968     //we will return true exactly once unless the handler code resets this
969     //value to true.
970     return b == null ? false : b.compareAndSet(true, false);
971   }
972 
973   //For unit tests only
974   void initializeHandlerTrackers() {
975     closedRegionHandlerCalled = new HashMap<HRegionInfo, AtomicBoolean>();
976     openedRegionHandlerCalled = new HashMap<HRegionInfo, AtomicBoolean>();
977   }
978 
979   void updateClosedRegionHandlerTracker(HRegionInfo hri) {
980     if (closedRegionHandlerCalled != null) { //only for unit tests this is true
981       closedRegionHandlerCalled.put(hri, new AtomicBoolean(true));
982     }
983   }
984 
985   void updateOpenedRegionHandlerTracker(HRegionInfo hri) {
986     if (openedRegionHandlerCalled != null) { //only for unit tests this is true
987       openedRegionHandlerCalled.put(hri, new AtomicBoolean(true));
988     }
989   }
990 
991   // TODO: processFavoredNodes might throw an exception, e.g., if the
992   // meta could not be contacted/updated. We need to decide how seriously to
993   // treat this problem. Should we fail the current assignment? We should be able
994   // to recover from this problem eventually (if the meta couldn't be updated,
995   // things should work normally and eventually get fixed up).
996   void processFavoredNodes(List<HRegionInfo> regions) throws IOException {
997     if (!shouldAssignRegionsWithFavoredNodes) return;
998     // The AM gets the favored nodes info for each region and updates the meta
999     // table with that info
1000     Map<HRegionInfo, List<ServerName>> regionToFavoredNodes =
1001         new HashMap<HRegionInfo, List<ServerName>>();
1002     for (HRegionInfo region : regions) {
1003       regionToFavoredNodes.put(region,
1004           ((FavoredNodeLoadBalancer)this.balancer).getFavoredNodes(region));
1005     }
1006     FavoredNodeAssignmentHelper.updateMetaWithFavoredNodesInfo(regionToFavoredNodes, catalogTracker);
1007   }
1008 
1009   /**
1010    * Handle a ZK unassigned node transition triggered by HBCK repair tool.
1011    * <p>
1012    * This is handled in a separate code path because it breaks the normal rules.
1013    * @param rt
1014    */
1015   private void handleHBCK(RegionTransition rt) {
1016     String encodedName = HRegionInfo.encodeRegionName(rt.getRegionName());
1017     LOG.info("Handling HBCK triggered transition=" + rt.getEventType() +
1018       ", server=" + rt.getServerName() + ", region=" +
1019       HRegionInfo.prettyPrint(encodedName));
1020     RegionState regionState = regionStates.getRegionTransitionState(encodedName);
1021     switch (rt.getEventType()) {
1022       case M_ZK_REGION_OFFLINE:
1023         HRegionInfo regionInfo;
1024         if (regionState != null) {
1025           regionInfo = regionState.getRegion();
1026         } else {
1027           try {
1028             byte [] name = rt.getRegionName();
1029             Pair<HRegionInfo, ServerName> p = MetaReader.getRegion(catalogTracker, name);
1030             regionInfo = p.getFirst();
1031           } catch (IOException e) {
1032             LOG.info("Exception reading hbase:meta doing HBCK repair operation", e);
1033             return;
1034           }
1035         }
1036         LOG.info("HBCK repair is triggering assignment of region=" +
1037             regionInfo.getRegionNameAsString());
1038         // trigger assign, node is already in OFFLINE so don't need to update ZK
1039         assign(regionInfo, false);
1040         break;
1041 
1042       default:
1043         LOG.warn("Received unexpected region state from HBCK: " + rt.toString());
1044         break;
1045     }
1046 
1047   }
1048 
1049   // ZooKeeper events
1050 
1051   /**
1052    * New unassigned node has been created.
1053    *
1054    * <p>This happens when an RS begins the OPENING or CLOSING of a region by
1055    * creating an unassigned node.
1056    *
1057    * <p>When this happens we must:
1058    * <ol>
1059    *   <li>Watch the node for further events</li>
1060    *   <li>Read and handle the state in the node</li>
1061    * </ol>
1062    */
1063   @Override
1064   public void nodeCreated(String path) {
1065     handleAssignmentEvent(path);
1066   }
1067 
1068   /**
1069    * Existing unassigned node has had data changed.
1070    *
1071    * <p>This happens when an RS transitions from OFFLINE to OPENING, or between
1072    * OPENING/OPENED and CLOSING/CLOSED.
1073    *
1074    * <p>When this happens we must:
1075    * <ol>
1076    *   <li>Watch the node for further events</li>
1077    *   <li>Read and handle the state in the node</li>
1078    * </ol>
1079    */
1080   @Override
1081   public void nodeDataChanged(String path) {
1082     handleAssignmentEvent(path);
1083   }
1084 
1085 
1086   // We don't want to have two events on the same region managed simultaneously.
1087   // For this reason, we need to wait if an event on the same region is currently in progress.
1088   // So we track the region names of the events in progress, and we keep a waiting list.
1089   private final Set<String> regionsInProgress = new HashSet<String>();
1090   // In a LinkedHashMultimap, the put order is kept when we retrieve the collection back. We need
1091   //  this as we want the events to be managed in the same order as we received them.
1092   private final LinkedHashMultimap <String, RegionRunnable>
1093       zkEventWorkerWaitingList = LinkedHashMultimap.create();
1094 
1095   /**
1096    * A specific runnable that works only on a region.
1097    */
1098   private interface RegionRunnable extends Runnable{
1099     /**
1100      * @return - the name of the region it works on.
1101      */
1102     String getRegionName();
1103   }
1104 
1105   /**
1106    * Submit a task, ensuring that only one task at a time is working on a given region.
1107    * Order is respected.
1108    */
1109   protected void zkEventWorkersSubmit(final RegionRunnable regRunnable) {
1110 
1111     synchronized (regionsInProgress) {
1112       // If there is already a task for this region, we add it to the
1113       //  waiting list and return.
1114       if (regionsInProgress.contains(regRunnable.getRegionName())) {
1115         synchronized (zkEventWorkerWaitingList){
1116           zkEventWorkerWaitingList.put(regRunnable.getRegionName(), regRunnable);
1117         }
1118         return;
1119       }
1120 
1121       // No event in progress on this region => we can submit a new task immediately.
1122       regionsInProgress.add(regRunnable.getRegionName());
1123       zkEventWorkers.submit(new Runnable() {
1124         @Override
1125         public void run() {
1126           try {
1127             regRunnable.run();
1128           } finally {
1129             // now that we have finished, let's see if there is an event for the same region in the
1130             //  waiting list. If so, we can now submit it to the pool.
1131             synchronized (regionsInProgress) {
1132               regionsInProgress.remove(regRunnable.getRegionName());
1133               synchronized (zkEventWorkerWaitingList) {
1134                 java.util.Set<RegionRunnable> waiting = zkEventWorkerWaitingList.get(
1135                     regRunnable.getRegionName());
1136                 if (!waiting.isEmpty()) {
1137                   // We want the first object only. The only way to get it is through an iterator.
1138                   RegionRunnable toSubmit = waiting.iterator().next();
1139                   zkEventWorkerWaitingList.remove(toSubmit.getRegionName(), toSubmit);
1140                   zkEventWorkersSubmit(toSubmit);
1141                 }
1142               }
1143             }
1144           }
1145         }
1146       });
1147     }
1148   }
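
  // For illustration: if events E1 and E2 arrive for a region while E0 for that region is
  // still running, E1 and E2 are parked in zkEventWorkerWaitingList in arrival order; when
  // E0 finishes, its finally block resubmits E1, and E2 follows once E1 completes. Events
  // for different regions are not serialized against each other and run concurrently on
  // the zkEventWorkers pool.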
1149 
1150   @Override
1151   public void nodeDeleted(final String path) {
1152     if (path.startsWith(watcher.assignmentZNode)) {
1153       final String regionName = ZKAssign.getRegionName(watcher, path);
1154       zkEventWorkersSubmit(new RegionRunnable() {
1155         @Override
1156         public String getRegionName() {
1157           return regionName;
1158         }
1159 
1160         @Override
1161         public void run() {
1162           Lock lock = locker.acquireLock(regionName);
1163           try {
1164             RegionState rs = regionStates.getRegionTransitionState(regionName);
1165             if (rs == null) {
1166               rs = regionStates.getRegionState(regionName);
1167               if (rs == null || !rs.isMergingNew()) {
1168                 // MergingNew is an offline state
1169                 return;
1170               }
1171             }
1172 
1173             HRegionInfo regionInfo = rs.getRegion();
1174             String regionNameStr = regionInfo.getRegionNameAsString();
1175             LOG.debug("Znode " + regionNameStr + " deleted, state: " + rs);
1176             boolean disabled = getZKTable().isDisablingOrDisabledTable(regionInfo.getTable());
1177             ServerName serverName = rs.getServerName();
1178             if (serverManager.isServerOnline(serverName)) {
1179               if (rs.isOnServer(serverName)
1180                   && (rs.isOpened() || rs.isSplitting())) {
1181                 regionOnline(regionInfo, serverName);
1182                 if (disabled) {
1183                   // if server is offline, no hurt to unassign again
1184                   LOG.info("Opened " + regionNameStr
1185                     + " but this table is disabled, triggering close of region");
1186                   unassign(regionInfo);
1187                 }
1188               } else if (rs.isMergingNew()) {
1189                 synchronized (regionStates) {
1190                   String p = regionInfo.getEncodedName();
1191                   PairOfSameType<HRegionInfo> regions = mergingRegions.get(p);
1192                   if (regions != null) {
1193                     onlineMergingRegion(disabled, regions.getFirst(), serverName);
1194                     onlineMergingRegion(disabled, regions.getSecond(), serverName);
1195                   }
1196                 }
1197               }
1198             }
1199           } finally {
1200             lock.unlock();
1201           }
1202         }
1203 
1204         private void onlineMergingRegion(boolean disabled,
1205             final HRegionInfo hri, final ServerName serverName) {
1206           RegionState regionState = regionStates.getRegionState(hri);
1207           if (regionState != null && regionState.isMerging()
1208               && regionState.isOnServer(serverName)) {
1209             regionOnline(regionState.getRegion(), serverName);
1210             if (disabled) {
1211               unassign(hri);
1212             }
1213           }
1214         }
1215       });
1216     }
1217   }
1218 
1219   /**
1220    * New unassigned node has been created.
1221    *
1222    * <p>This happens when an RS begins the OPENING, SPLITTING or CLOSING of a
1223    * region by creating a znode.
1224    *
1225    * <p>When this happens we must:
1226    * <ol>
1227    *   <li>Watch the node for further children changed events</li>
1228    *   <li>Watch all new children for changed events</li>
1229    * </ol>
1230    */
1231   @Override
1232   public void nodeChildrenChanged(String path) {
1233     if (path.equals(watcher.assignmentZNode)) {
1234       zkEventWorkers.submit(new Runnable() {
1235         @Override
1236         public void run() {
1237           try {
1238             // Just make sure we see the changes for the new znodes
1239             List<String> children =
1240               ZKUtil.listChildrenAndWatchForNewChildren(
1241                 watcher, watcher.assignmentZNode);
1242             if (children != null) {
1243               Stat stat = new Stat();
1244               for (String child : children) {
1245                 // if region is in transition, we already have a watch
1246                 // on it, so no need to watch it again. So, as far as I know for now,
1247                 // this is needed to watch splitting nodes only.
1248                 if (!regionStates.isRegionInTransition(child)) {
1249                   ZKAssign.getDataAndWatch(watcher, child, stat);
1250                 }
1251               }
1252             }
1253           } catch (KeeperException e) {
1254             server.abort("Unexpected ZK exception reading unassigned children", e);
1255           }
1256         }
1257       });
1258     }
1259   }
1260 
1261   /**
1262    * Marks the region as online.  Removes it from regions in transition and
1263    * updates the in-memory assignment information.
1264    * <p>
1265    * Used when a region has been successfully opened on a region server.
1266    * @param regionInfo
1267    * @param sn
1268    */
1269   void regionOnline(HRegionInfo regionInfo, ServerName sn) {
1270     numRegionsOpened.incrementAndGet();
1271     regionStates.regionOnline(regionInfo, sn);
1272 
1273     // Remove plan if one.
1274     clearRegionPlan(regionInfo);
1275     // Add the server to serversInUpdatingTimer
1276     addToServersInUpdatingTimer(sn);
1277   }
1278 
1279   /**
1280    * Pass the assignment event to a worker for processing.
1281    * Each worker is a single thread executor service.  The reason
1282    * for just one thread is to make sure all events for a given
1283    * region are processed in order.
1284    *
1285    * @param path
1286    */
1287   private void handleAssignmentEvent(final String path) {
1288     if (path.startsWith(watcher.assignmentZNode)) {
1289       final String regionName = ZKAssign.getRegionName(watcher, path);
1290 
1291       zkEventWorkersSubmit(new RegionRunnable() {
1292         @Override
1293         public String getRegionName() {
1294           return regionName;
1295         }
1296 
1297         @Override
1298         public void run() {
1299           try {
1300             Stat stat = new Stat();
1301             byte [] data = ZKAssign.getDataAndWatch(watcher, path, stat);
1302             if (data == null) return;
1303 
1304             RegionTransition rt = RegionTransition.parseFrom(data);
1305             handleRegion(rt, stat.getVersion());
1306           } catch (KeeperException e) {
1307             server.abort("Unexpected ZK exception reading unassigned node data", e);
1308           } catch (DeserializationException e) {
1309             server.abort("Unexpected exception deserializing node data", e);
1310           }
1311         }
1312       });
1313     }
1314   }
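
       // A minimal sketch (hypothetical helper, not part of the original class) of the
       // ordering guarantee described above: zkEventWorkersSubmit() hashes the region
       // name to a single-threaded executor, so all events for one region run strictly
       // in submission order.
       private void submitOrderedEventExample(final String regionName) {
         zkEventWorkersSubmit(new RegionRunnable() {
           @Override
           public String getRegionName() {
             return regionName;
           }

           @Override
           public void run() {
             // Runs only after any previously submitted event for the same region.
             LOG.debug("Processing ordered ZK event for region " + regionName);
           }
         });
       }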
1315 
1316   /**
1317    * Add the server to the set serversInUpdatingTimer, then {@link TimerUpdater}
1318    * will update timers for this server in the background.
1319    * @param sn
1320    */
1321   private void addToServersInUpdatingTimer(final ServerName sn) {
1322     if (tomActivated){
1323       this.serversInUpdatingTimer.add(sn);
1324     }
1325   }
1326 
1327   /**
1328    * Touch timers for all regions in transition that have the passed
1329    * <code>sn</code> in common.
1330    * Call this method whenever a server checks in.  Doing so helps the case where
1331    * a new regionserver has joined the cluster and its been given 1k regions to
1332    * a new regionserver has joined the cluster and it's been given 1k regions to
1333    * open.  If this method is tickled every time a region reports in a
1334    * successful open then the 1k-th region won't be timed out just because it's
1335    * sitting behind the open of 999 other regions.  This method is NOT used
1336    * as part of bulk assign -- there we have a different mechanism for extending
1337    * the regions in transition timer (we turn it off temporarily, because
1338    * there is no regionplan involved when bulk assigning).
1339    */
1340   private void updateTimers(final ServerName sn) {
1341     Preconditions.checkState(tomActivated);
1342     if (sn == null) return;
1343 
1344     // This loop could be expensive.
1345     // First make a copy of current regionPlan rather than hold sync while
1346     // looping because holding sync can cause deadlock.  Its ok in this loop
1347     // if the Map we're going against is a little stale
1348     List<Map.Entry<String, RegionPlan>> rps;
1349     synchronized(this.regionPlans) {
1350       rps = new ArrayList<Map.Entry<String, RegionPlan>>(regionPlans.entrySet());
1351     }
1352 
1353     for (Map.Entry<String, RegionPlan> e : rps) {
1354       if (e.getValue() != null && e.getKey() != null && sn.equals(e.getValue().getDestination())) {
1355         RegionState regionState = regionStates.getRegionTransitionState(e.getKey());
1356         if (regionState != null) {
1357           regionState.updateTimestampToNow();
1358         }
1359       }
1360     }
1361   }
1362 
1363   /**
1364    * Marks the region as offline.  Removes it from regions in transition and
1365    * removes in-memory assignment information.
1366    * <p>
1367    * Used when a region has been closed and should remain closed.
1368    * @param regionInfo
1369    */
1370   public void regionOffline(final HRegionInfo regionInfo) {
1371     regionOffline(regionInfo, null);
1372   }
1373 
1374   public void offlineDisabledRegion(HRegionInfo regionInfo) {
1375     // Disabling so should not be reassigned, just delete the CLOSED node
1376     LOG.debug("Table being disabled so deleting ZK node and removing from " +
1377       "regions in transition, skipping assignment of region " +
1378         regionInfo.getRegionNameAsString());
1379     String encodedName = regionInfo.getEncodedName();
1380     deleteNodeInStates(encodedName, "closed", null,
1381       EventType.RS_ZK_REGION_CLOSED, EventType.M_ZK_REGION_OFFLINE);
1382     regionOffline(regionInfo);
1383   }
1384 
1385   // Assignment methods
1386 
1387   /**
1388    * Assigns the specified region.
1389    * <p>
1390    * If a RegionPlan is available with a valid destination then it will be used
1391    * to determine what server region is assigned to.  If no RegionPlan is
1392    * available, region will be assigned to a random available server.
1393    * <p>
1394    * Updates the RegionState and sends the OPEN RPC.
1395    * <p>
1396    * This will only succeed if the region is in transition and in a CLOSED or
1397    * OFFLINE state or not in transition (in-memory not zk), and of course, the
1398    * chosen server is up and running (It may have just crashed!).  If the
1399    * in-memory checks pass, the zk node is forced to OFFLINE before assigning.
1400    *
1401    * @param region region to be assigned
1402    * @param setOfflineInZK whether ZK node should be created/transitioned to an
1403    *                       OFFLINE state before assigning the region
1404    */
1405   public void assign(HRegionInfo region, boolean setOfflineInZK) {
1406     assign(region, setOfflineInZK, false);
1407   }
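
       // A single-region caller sketch (hypothetical helper, for illustration only):
       // the common path forces the znode to OFFLINE and reuses any existing
       // RegionPlan; forceNewPlan is left false to avoid double assignment (see the
       // overload that follows).
       private void assignSingleRegionExample(final HRegionInfo hri) {
         // Create/force the OFFLINE znode, pick or reuse a plan, then send the OPEN RPC.
         assign(hri, true);
       }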
1408 
1409   /**
1410    * Use care with forceNewPlan. It could cause double assignment.
1411    */
1412   public void assign(HRegionInfo region,
1413       boolean setOfflineInZK, boolean forceNewPlan) {
1414     if (isDisabledorDisablingRegionInRIT(region)) {
1415       return;
1416     }
1417     if (this.serverManager.isClusterShutdown()) {
1418       LOG.info("Cluster shutdown is set; skipping assign of " +
1419         region.getRegionNameAsString());
1420       return;
1421     }
1422     String encodedName = region.getEncodedName();
1423     Lock lock = locker.acquireLock(encodedName);
1424     try {
1425       RegionState state = forceRegionStateToOffline(region, forceNewPlan);
1426       if (state != null) {
1427         if (regionStates.wasRegionOnDeadServer(encodedName)) {
1428           LOG.info("Skip assigning " + region.getRegionNameAsString()
1429             + ", its host " + regionStates.getLastRegionServerOfRegion(encodedName)
1430             + " is dead but not processed yet");
1431           return;
1432         }
1433         assign(state, setOfflineInZK, forceNewPlan);
1434       }
1435     } finally {
1436       lock.unlock();
1437     }
1438   }
1439 
1440   /**
1441    * Bulk assign regions to <code>destination</code>.
1442    * @param destination
1443    * @param regions Regions to assign.
1444    * @return true if successful
1445    */
1446   boolean assign(final ServerName destination, final List<HRegionInfo> regions) {
1447     long startTime = EnvironmentEdgeManager.currentTimeMillis();
1448     try {
1449       int regionCount = regions.size();
1450       if (regionCount == 0) {
1451         return true;
1452       }
1453       LOG.debug("Assigning " + regionCount + " region(s) to " + destination.toString());
1454       Set<String> encodedNames = new HashSet<String>(regionCount);
1455       for (HRegionInfo region : regions) {
1456         encodedNames.add(region.getEncodedName());
1457       }
1458 
1459       List<HRegionInfo> failedToOpenRegions = new ArrayList<HRegionInfo>();
1460       Map<String, Lock> locks = locker.acquireLocks(encodedNames);
1461       try {
1462         AtomicInteger counter = new AtomicInteger(0);
1463         Map<String, Integer> offlineNodesVersions = new ConcurrentHashMap<String, Integer>();
1464         OfflineCallback cb = new OfflineCallback(
1465           watcher, destination, counter, offlineNodesVersions);
1466         Map<String, RegionPlan> plans = new HashMap<String, RegionPlan>(regions.size());
1467         List<RegionState> states = new ArrayList<RegionState>(regions.size());
1468         for (HRegionInfo region : regions) {
1469           String encodedName = region.getEncodedName();
1470           if (!isDisabledorDisablingRegionInRIT(region)) {
1471             RegionState state = forceRegionStateToOffline(region, false);
1472             boolean onDeadServer = false;
1473             if (state != null) {
1474               if (regionStates.wasRegionOnDeadServer(encodedName)) {
1475                 LOG.info("Skip assigning " + region.getRegionNameAsString()
1476                   + ", its host " + regionStates.getLastRegionServerOfRegion(encodedName)
1477                   + " is dead but not processed yet");
1478                 onDeadServer = true;
1479               } else if (asyncSetOfflineInZooKeeper(state, cb, destination)) {
1480                 RegionPlan plan = new RegionPlan(region, state.getServerName(), destination);
1481                 plans.put(encodedName, plan);
1482                 states.add(state);
1483                 continue;
1484               }
1485             }
1486             // Reassign if the region wasn't on a dead server
1487             if (!onDeadServer) {
1488               LOG.info("failed to force region state to offline or "
1489                 + "failed to set it offline in ZK, will reassign later: " + region);
1490               failedToOpenRegions.add(region); // assign individually later
1491             }
1492           }
1493           // Release the lock, this region is excluded from bulk assign because
1494           // we can't update its state, or set its znode to offline.
1495           Lock lock = locks.remove(encodedName);
1496           lock.unlock();
1497         }
1498 
1499         // Wait until all unassigned nodes have been put up and watchers set.
1500         int total = states.size();
1501         for (int oldCounter = 0; !server.isStopped();) {
1502           int count = counter.get();
1503           if (oldCounter != count) {
1504             LOG.info(destination.toString() + " unassigned znodes=" + count +
1505               " of total=" + total);
1506             oldCounter = count;
1507           }
1508           if (count >= total) break;
1509           Threads.sleep(5);
1510         }
1511 
1512         if (server.isStopped()) {
1513           return false;
1514         }
1515 
1516         // Add region plans, so we can updateTimers when one region is opened so
1517         // that unnecessary timeout on RIT is reduced.
1518         this.addPlans(plans);
1519 
1520         List<Triple<HRegionInfo, Integer, List<ServerName>>> regionOpenInfos =
1521           new ArrayList<Triple<HRegionInfo, Integer, List<ServerName>>>(states.size());
1522         for (RegionState state: states) {
1523           HRegionInfo region = state.getRegion();
1524           String encodedRegionName = region.getEncodedName();
1525           Integer nodeVersion = offlineNodesVersions.get(encodedRegionName);
1526           if (nodeVersion == null || nodeVersion == -1) {
1527             LOG.warn("failed to offline in zookeeper: " + region);
1528             failedToOpenRegions.add(region); // assign individually later
1529             Lock lock = locks.remove(encodedRegionName);
1530             lock.unlock();
1531           } else {
1532             regionStates.updateRegionState(
1533               region, State.PENDING_OPEN, destination);
1534             List<ServerName> favoredNodes = ServerName.EMPTY_SERVER_LIST;
1535             if (this.shouldAssignRegionsWithFavoredNodes) {
1536               favoredNodes = ((FavoredNodeLoadBalancer)this.balancer).getFavoredNodes(region);
1537             }
1538             regionOpenInfos.add(new Triple<HRegionInfo, Integer,  List<ServerName>>(
1539               region, nodeVersion, favoredNodes));
1540           }
1541         }
1542 
1543         // Move on to open regions.
1544         try {
1545           // Send OPEN RPC. If it fails on an IOE or RemoteException,
1546           // regions will be assigned individually.
1547           long maxWaitTime = System.currentTimeMillis() +
1548             this.server.getConfiguration().
1549               getLong("hbase.regionserver.rpc.startup.waittime", 60000);
1550           for (int i = 1; i <= maximumAttempts && !server.isStopped(); i++) {
1551             try {
1552               List<RegionOpeningState> regionOpeningStateList = serverManager
1553                 .sendRegionOpen(destination, regionOpenInfos);
1554               if (regionOpeningStateList == null) {
1555                 // Failed getting RPC connection to this server
1556                 return false;
1557               }
1558               for (int k = 0, n = regionOpeningStateList.size(); k < n; k++) {
1559                 RegionOpeningState openingState = regionOpeningStateList.get(k);
1560                 if (openingState != RegionOpeningState.OPENED) {
1561                   HRegionInfo region = regionOpenInfos.get(k).getFirst();
1562                   if (openingState == RegionOpeningState.ALREADY_OPENED) {
1563                     processAlreadyOpenedRegion(region, destination);
1564                   } else if (openingState == RegionOpeningState.FAILED_OPENING) {
1565                     // Failed opening this region, reassign it later
1566                     failedToOpenRegions.add(region);
1567                   } else {
1568                     LOG.warn("THIS SHOULD NOT HAPPEN: unknown opening state "
1569                       + openingState + " in assigning region " + region);
1570                   }
1571                 }
1572               }
1573               break;
1574             } catch (IOException e) {
1575               if (e instanceof RemoteException) {
1576                 e = ((RemoteException)e).unwrapRemoteException();
1577               }
1578               if (e instanceof RegionServerStoppedException) {
1579                 LOG.warn("The region server was shut down, ", e);
1580                 // No need to retry, the region server is a goner.
1581                 return false;
1582               } else if (e instanceof ServerNotRunningYetException) {
1583                 long now = System.currentTimeMillis();
1584                 if (now < maxWaitTime) {
1585                   LOG.debug("Server is not yet up; waiting up to " +
1586                     (maxWaitTime - now) + "ms", e);
1587                   Thread.sleep(100);
1588                   i--; // reset the try count
1589                   continue;
1590                 }
1591               } else if (e instanceof java.net.SocketTimeoutException
1592                   && this.serverManager.isServerOnline(destination)) {
1593                 // In case socket is timed out and the region server is still online,
1594                 // the openRegion RPC could have been accepted by the server and
1595                 // just the response didn't go through.  So we will retry to
1596                 // open the region on the same server.
1597                 if (LOG.isDebugEnabled()) {
1598                   LOG.debug("Bulk assigner openRegion() to " + destination
1599                     + " has timed out, but the regions might"
1600                     + " already be opened on it.", e);
1601                 }
1602                 continue;
1603               }
1604               throw e;
1605             }
1606           }
1607         } catch (IOException e) {
1608           // Can be a socket timeout, EOF, NoRouteToHost, etc
1609           LOG.info("Unable to communicate with " + destination
1610             + " in order to assign regions, ", e);
1611           return false;
1612         } catch (InterruptedException e) {
1613           throw new RuntimeException(e);
1614         }
1615       } finally {
1616         for (Lock lock : locks.values()) {
1617           lock.unlock();
1618         }
1619       }
1620 
1621       if (!failedToOpenRegions.isEmpty()) {
1622         for (HRegionInfo region : failedToOpenRegions) {
1623           if (!regionStates.isRegionOnline(region)) {
1624             invokeAssign(region);
1625           }
1626         }
1627       }
1628       LOG.debug("Bulk assigning done for " + destination);
1629       return true;
1630     } finally {
1631       metricsAssignmentManager.updateBulkAssignTime(EnvironmentEdgeManager.currentTimeMillis() - startTime);
1632     }
1633   }
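
       // A bulk-assign caller sketch (hypothetical destination and region list, for
       // illustration only): send one batched OPEN to a single server and check the
       // result; the caller may choose to retry regions individually if it fails.
       private void bulkAssignExample(final ServerName destination,
           final List<HRegionInfo> regionsForServer) {
         if (!assign(destination, regionsForServer)) {
           LOG.info("Bulk assign to " + destination + " was not fully successful");
         }
       }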
1634 
1635   /**
1636    * Send CLOSE RPC if the server is online, otherwise, offline the region.
1637    *
1638    * The RPC will be sent only to the region server found in the region state
1639    * if it is passed in; otherwise, to the src server specified. If the region
1640    * state is not specified, we don't update the region state at all; instead
1641    * we just send the RPC call. This is useful for some cleanup without
1642    * messing with the region states (see handleRegion, on the region-opened-
1643    * on-an-unexpected-server scenario, for an example).
1644    */
1645   private void unassign(final HRegionInfo region,
1646       final RegionState state, final int versionOfClosingNode,
1647       final ServerName dest, final boolean transitionInZK,
1648       final ServerName src) {
1649     ServerName server = src;
1650     if (state != null) {
1651       server = state.getServerName();
1652     }
1653     long maxWaitTime = -1;
1654     for (int i = 1; i <= this.maximumAttempts; i++) {
1655       if (this.server.isStopped() || this.server.isAborted()) {
1656         LOG.debug("Server stopped/aborted; skipping unassign of " + region);
1657         return;
1658       }
1659       // ClosedRegionHandler can remove the server from this.regions
1660       if (!serverManager.isServerOnline(server)) {
1661         LOG.debug("Offline " + region.getRegionNameAsString()
1662           + ", no need to unassign since it's on a dead server: " + server);
1663         if (transitionInZK) {
1664           // delete the node. if no node exists need not bother.
1665           deleteClosingOrClosedNode(region, server);
1666         }
1667         if (state != null) {
1668           regionOffline(region);
1669         }
1670         return;
1671       }
1672       try {
1673         // Send CLOSE RPC
1674         if (serverManager.sendRegionClose(server, region,
1675           versionOfClosingNode, dest, transitionInZK)) {
1676           LOG.debug("Sent CLOSE to " + server + " for region " +
1677             region.getRegionNameAsString());
1678           if (!transitionInZK && state != null) {
1679             // Retry to make sure the region is
1680             // closed so as to avoid double assignment.
1681             unassign(region, state, versionOfClosingNode,
1682               dest, transitionInZK,src);
1683           }
1684           return;
1685         }
1686         // This never happens. Currently regionserver close always returns true.
1687         // TODO: this can now happen (0.96) if there is an exception in a coprocessor
1688         LOG.warn("Server " + server + " region CLOSE RPC returned false for " +
1689           region.getRegionNameAsString());
1690       } catch (Throwable t) {
1691         if (t instanceof RemoteException) {
1692           t = ((RemoteException)t).unwrapRemoteException();
1693         }
1694         if (t instanceof NotServingRegionException
1695             || t instanceof RegionServerStoppedException) {
1696           LOG.debug("Offline " + region.getRegionNameAsString()
1697             + ", it's not any more on " + server, t);
1698           if (transitionInZK) {
1699             deleteClosingOrClosedNode(region, server);
1700           }
1701           if (state != null) {
1702             regionOffline(region);
1703           }
1704           return;
1705         } else if (state != null
1706             && t instanceof RegionAlreadyInTransitionException) {
1707           // RS is already processing this region, only need to update the timestamp
1708           LOG.debug("update " + state + " the timestamp.");
1709           LOG.debug("Update the timestamp for " + state);
1710           if (maxWaitTime < 0) {
1711             maxWaitTime = EnvironmentEdgeManager.currentTimeMillis()
1712               + this.server.getConfiguration().getLong(ALREADY_IN_TRANSITION_WAITTIME,
1713                 DEFAULT_ALREADY_IN_TRANSITION_WAITTIME);
1714           }
1715           try {
1716             long now = EnvironmentEdgeManager.currentTimeMillis();
1717             if (now < maxWaitTime) {
1718               LOG.debug("Region is already in transition; "
1719                 + "waiting up to " + (maxWaitTime - now) + "ms", t);
1720               Thread.sleep(100);
1721               i--; // reset the try count
1722             }
1723           } catch (InterruptedException ie) {
1724             LOG.warn("Failed to unassign "
1725               + region.getRegionNameAsString() + " since interrupted", ie);
1726             Thread.currentThread().interrupt();
1727             if (!tomActivated) {
1728               regionStates.updateRegionState(region, State.FAILED_CLOSE);
1729             }
1730             return;
1731           }
1732         } else {
1733           LOG.info("Server " + server + " returned " + t + " for "
1734             + region.getRegionNameAsString() + ", try=" + i
1735             + " of " + this.maximumAttempts, t);
1736           // Presume retry or server will expire.
1737         }
1738       }
1739     }
1740     // Run out of attempts
1741     if (!tomActivated && state != null) {
1742       regionStates.updateRegionState(region, State.FAILED_CLOSE);
1743     }
1744   }
1745 
1746   /**
1747    * Set the region state to OFFLINE unless it is already open or in transition and forceNewPlan is false.
1748    */
1749   private RegionState forceRegionStateToOffline(
1750       final HRegionInfo region, final boolean forceNewPlan) {
1751     RegionState state = regionStates.getRegionState(region);
1752     if (state == null) {
1753       LOG.warn("Assigning a region not in region states: " + region);
1754       state = regionStates.createRegionState(region);
1755     }
1756 
1757     ServerName sn = state.getServerName();
1758     if (forceNewPlan && LOG.isDebugEnabled()) {
1759       LOG.debug("Force region state offline " + state);
1760     }
1761 
1762     switch (state.getState()) {
1763     case OPEN:
1764     case OPENING:
1765     case PENDING_OPEN:
1766     case CLOSING:
1767     case PENDING_CLOSE:
1768       if (!forceNewPlan) {
1769         LOG.debug("Skip assigning " +
1770           region + ", it is already " + state);
1771         return null;
1772       }
1773     case FAILED_CLOSE:
1774     case FAILED_OPEN:
1775       unassign(region, state, -1, null, false, null);
1776       state = regionStates.getRegionState(region);
1777       if (state.isFailedClose()) {
1778         // If we can't close the region, we can't re-assign
1779         // it so as to avoid possible double assignment/data loss.
1780         LOG.info("Skip assigning " +
1781           region + ", we couldn't close it: " + state);
1782         return null;
1783       }
1784     case OFFLINE:
1785       // This region could have been open on this server
1786       // for a while. If the server is dead and not processed
1787       // yet, we can move on only if meta shows that the region
1788       // is actually not on this server, or is on a server that is
1789       // not dead, or is dead but already processed.
1790       if (regionStates.isServerDeadAndNotProcessed(sn)
1791           && wasRegionOnDeadServerByMeta(region, sn)) {
1792         LOG.info("Skip assigning " + region.getRegionNameAsString()
1793           + ", it is on a dead but not processed yet server");
1794         return null;
1795       }
1796     case CLOSED:
1797       break;
1798     default:
1799       LOG.error("Trying to assign region " + region
1800         + ", which is " + state);
1801       return null;
1802     }
1803     return state;
1804   }
1805 
1806   private boolean wasRegionOnDeadServerByMeta(
1807       final HRegionInfo region, final ServerName sn) {
1808     try {
1809       if (region.isMetaRegion()) {
1810         ServerName server = catalogTracker.getMetaLocation();
1811         return regionStates.isServerDeadAndNotProcessed(server);
1812       }
1813       while (!server.isStopped()) {
1814         try {
1815           catalogTracker.waitForMeta();
1816           Pair<HRegionInfo, ServerName> r =
1817             MetaReader.getRegion(catalogTracker, region.getRegionName());
1818           ServerName server = r == null ? null : r.getSecond();
1819           return regionStates.isServerDeadAndNotProcessed(server);
1820         } catch (IOException ioe) {
1821           LOG.info("Received exception accessing hbase:meta during force assign "
1822             + region.getRegionNameAsString() + ", retrying", ioe);
1823         }
1824       }
1825     } catch (InterruptedException e) {
1826       Thread.currentThread().interrupt();
1827       LOG.info("Interrupted accessing hbase:meta", e);
1828     }
1829     // Call is interrupted or server is stopped.
1830     return regionStates.isServerDeadAndNotProcessed(sn);
1831   }
1832 
1833   /**
1834    * Caller must hold lock on the passed <code>state</code> object.
1835    * @param state
1836    * @param setOfflineInZK
1837    * @param forceNewPlan
1838    */
1839   private void assign(RegionState state,
1840       final boolean setOfflineInZK, final boolean forceNewPlan) {
1841     long startTime = EnvironmentEdgeManager.currentTimeMillis();
1842     try {
1843       RegionState currentState = state;
1844       int versionOfOfflineNode = -1;
1845       RegionPlan plan = null;
1846       long maxWaitTime = -1;
1847       HRegionInfo region = state.getRegion();
1848       RegionOpeningState regionOpenState;
1849       for (int i = 1; i <= maximumAttempts; i++) {
1850         if (server.isStopped() || server.isAborted()) {
1851           LOG.info("Skip assigning " + region.getRegionNameAsString()
1852             + ", the server is stopped/aborted");
1853           return;
1854         }
1855         if (plan == null) { // Get a server for the region at first
1856           try {
1857             plan = getRegionPlan(region, forceNewPlan);
1858           } catch (HBaseIOException e) {
1859             LOG.warn("Failed to get region plan", e);
1860           }
1861         }
1862         if (plan == null) {
1863           LOG.warn("Unable to determine a plan to assign " + region);
1864           if (tomActivated){
1865             this.timeoutMonitor.setAllRegionServersOffline(true);
1866           } else {
1867             if (region.isMetaRegion()) {
1868               try {
1869                 Thread.sleep(this.sleepTimeBeforeRetryingMetaAssignment);
1870                 if (i == maximumAttempts) i = 1;
1871                 continue;
1872               } catch (InterruptedException e) {
1873                 LOG.error("Got exception while waiting for hbase:meta assignment", e);
1874                 Thread.currentThread().interrupt();
1875               }
1876             }
1877             regionStates.updateRegionState(region, State.FAILED_OPEN);
1878           }
1879           return;
1880         }
1881         if (setOfflineInZK && versionOfOfflineNode == -1) {
1882           // get the version of the znode after setting it to OFFLINE.
1883           // versionOfOfflineNode will be -1 if the znode was not set to OFFLINE
1884           versionOfOfflineNode = setOfflineInZooKeeper(currentState, plan.getDestination());
1885           if (versionOfOfflineNode != -1) {
1886             if (isDisabledorDisablingRegionInRIT(region)) {
1887               return;
1888             }
1889             // In case of assignment from EnableTableHandler the table state is ENABLING. Anyhow
1890             // EnableTableHandler will set ENABLED after assigning all the table regions. If we
1891             // tried to set it to ENABLED directly then the client API might think the table is enabled.
1892             // In the case where all the regions are added directly into hbase:meta and we call
1893             // assignRegion, we need to make the table ENABLED. In such a case the table
1894             // will not be in the ENABLING or ENABLED state.
1895             TableName tableName = region.getTable();
1896             if (!zkTable.isEnablingTable(tableName) && !zkTable.isEnabledTable(tableName)) {
1897               LOG.debug("Setting table " + tableName + " to ENABLED state.");
1898               setEnabledTable(tableName);
1899             }
1900           }
1901         }
1902         if (setOfflineInZK && versionOfOfflineNode == -1) {
1903           LOG.info("Unable to set offline in ZooKeeper to assign " + region);
1904           // Setting offline in ZK must have failed due to ZK racing or some
1905           // exception which may make the server abort. If it is ZK racing,
1906           // we should retry since we already reset the region state;
1907           // any existing (re)assignment will fail anyway.
1908           if (!server.isAborted()) {
1909             continue;
1910           }
1911         }
1912         LOG.info("Assigning " + region.getRegionNameAsString() +
1913             " to " + plan.getDestination().toString());
1914         // Transition RegionState to PENDING_OPEN
1915         currentState = regionStates.updateRegionState(region,
1916           State.PENDING_OPEN, plan.getDestination());
1917 
1918         boolean needNewPlan;
1919         final String assignMsg = "Failed assignment of " + region.getRegionNameAsString() +
1920             " to " + plan.getDestination();
1921         try {
1922           List<ServerName> favoredNodes = ServerName.EMPTY_SERVER_LIST;
1923           if (this.shouldAssignRegionsWithFavoredNodes) {
1924             favoredNodes = ((FavoredNodeLoadBalancer)this.balancer).getFavoredNodes(region);
1925           }
1926           regionOpenState = serverManager.sendRegionOpen(
1927               plan.getDestination(), region, versionOfOfflineNode, favoredNodes);
1928 
1929           if (regionOpenState == RegionOpeningState.FAILED_OPENING) {
1930             // Failed opening this region, looping again on a new server.
1931             needNewPlan = true;
1932             LOG.warn(assignMsg + ", regionserver says 'FAILED_OPENING', " +
1933                 " trying to assign elsewhere instead; " +
1934                 "try=" + i + " of " + this.maximumAttempts);
1935           } else {
1936             // we're done
1937             if (regionOpenState == RegionOpeningState.ALREADY_OPENED) {
1938               processAlreadyOpenedRegion(region, plan.getDestination());
1939             }
1940             return;
1941           }
1942 
1943         } catch (Throwable t) {
1944           if (t instanceof RemoteException) {
1945             t = ((RemoteException) t).unwrapRemoteException();
1946           }
1947 
1948           // Should we wait a little before retrying? If the server is starting, the answer is yes.
1949           // If the region is already in transition, it's yes as well: we want to be sure that
1950           //  the region will get opened but we don't want a double assignment.
1951           boolean hold = (t instanceof RegionAlreadyInTransitionException ||
1952               t instanceof ServerNotRunningYetException);
1953 
1954           // In case socket is timed out and the region server is still online,
1955           // the openRegion RPC could have been accepted by the server and
1956           // just the response didn't go through.  So we will retry to
1957           // open the region on the same server to avoid possible
1958           // double assignment.
1959           boolean retry = !hold && (t instanceof java.net.SocketTimeoutException
1960               && this.serverManager.isServerOnline(plan.getDestination()));
1961 
1962 
1963           if (hold) {
1964             LOG.warn(assignMsg + ", waiting a little before trying on the same region server " +
1965               "try=" + i + " of " + this.maximumAttempts, t);
1966 
1967             if (maxWaitTime < 0) {
1968               if (t instanceof RegionAlreadyInTransitionException) {
1969                 maxWaitTime = EnvironmentEdgeManager.currentTimeMillis()
1970                   + this.server.getConfiguration().getLong(ALREADY_IN_TRANSITION_WAITTIME,
1971                     DEFAULT_ALREADY_IN_TRANSITION_WAITTIME);
1972               } else {
1973                 maxWaitTime = this.server.getConfiguration().
1974                   getLong("hbase.regionserver.rpc.startup.waittime", 60000);
1975               }
1976             }
1977             try {
1978               needNewPlan = false;
1979               long now = EnvironmentEdgeManager.currentTimeMillis();
1980               if (now < maxWaitTime) {
1981                 LOG.debug("Server is not yet up or region is already in transition; "
1982                   + "waiting up to " + (maxWaitTime - now) + "ms", t);
1983                 Thread.sleep(100);
1984                 i--; // reset the try count
1985               } else if (!(t instanceof RegionAlreadyInTransitionException)) {
1986                 LOG.debug("Server is not up for a while; try a new one", t);
1987                 needNewPlan = true;
1988               }
1989             } catch (InterruptedException ie) {
1990               LOG.warn("Failed to assign "
1991                   + region.getRegionNameAsString() + " since interrupted", ie);
1992               Thread.currentThread().interrupt();
1993               if (!tomActivated) {
1994                 regionStates.updateRegionState(region, State.FAILED_OPEN);
1995               }
1996               return;
1997             }
1998           } else if (retry) {
1999             needNewPlan = false;
2000             LOG.warn(assignMsg + ", trying to assign to the same region server " +
2001                 "try=" + i + " of " + this.maximumAttempts, t);
2002           } else {
2003             needNewPlan = true;
2004             LOG.warn(assignMsg + ", trying to assign elsewhere instead;" +
2005                 " try=" + i + " of " + this.maximumAttempts, t);
2006           }
2007         }
2008 
2009         if (i == this.maximumAttempts) {
2010           // Don't reset the region state or get a new plan any more.
2011           // This is the last try.
2012           continue;
2013         }
2014 
2015         // If region opened on destination of present plan, reassigning to new
2016         // RS may cause double assignments. In case of RegionAlreadyInTransitionException
2017         // reassigning to same RS.
2018         if (needNewPlan) {
2019           // Force a new plan and reassign. Will return null if no servers.
2020           // The new plan could be the same as the existing plan since we don't
2021           // exclude the server of the original plan, which should not be
2022           // excluded since it could be the only server up now.
2023           RegionPlan newPlan = null;
2024           try {
2025             newPlan = getRegionPlan(region, true);
2026           } catch (HBaseIOException e) {
2027             LOG.warn("Failed to get region plan", e);
2028           }
2029           if (newPlan == null) {
2030             if (tomActivated) {
2031               this.timeoutMonitor.setAllRegionServersOffline(true);
2032             } else {
2033               regionStates.updateRegionState(region, State.FAILED_OPEN);
2034             }
2035             LOG.warn("Unable to find a viable location to assign region " +
2036                 region.getRegionNameAsString());
2037             return;
2038           }
2039 
2040           if (plan != newPlan && !plan.getDestination().equals(newPlan.getDestination())) {
2041             // Clean out plan we failed execute and one that doesn't look like it'll
2042             // succeed anyways; we need a new plan!
2043             // Transition back to OFFLINE
2044             currentState = regionStates.updateRegionState(region, State.OFFLINE);
2045             versionOfOfflineNode = -1;
2046             plan = newPlan;
2047           }
2048         }
2049       }
2050       // Run out of attempts
2051       if (!tomActivated) {
2052         regionStates.updateRegionState(region, State.FAILED_OPEN);
2053       }
2054     } finally {
2055       metricsAssignmentManager.updateAssignmentTime(EnvironmentEdgeManager.currentTimeMillis() - startTime);
2056     }
2057   }
2058 
2059   private void processAlreadyOpenedRegion(HRegionInfo region, ServerName sn) {
2060     // Remove region from in-memory transition and unassigned node from ZK
2061     // While trying to enable the table the regions of the table were
2062     // already enabled.
2063     LOG.debug("ALREADY_OPENED " + region.getRegionNameAsString()
2064       + " to " + sn);
2065     String encodedName = region.getEncodedName();
2066     deleteNodeInStates(encodedName, "offline", sn, EventType.M_ZK_REGION_OFFLINE);
2067     regionStates.regionOnline(region, sn);
2068   }
2069 
2070   private boolean isDisabledorDisablingRegionInRIT(final HRegionInfo region) {
2071     TableName tableName = region.getTable();
2072     boolean disabled = this.zkTable.isDisabledTable(tableName);
2073     if (disabled || this.zkTable.isDisablingTable(tableName)) {
2074       LOG.info("Table " + tableName + (disabled ? " disabled;" : " disabling;") +
2075         " skipping assign of " + region.getRegionNameAsString());
2076       offlineDisabledRegion(region);
2077       return true;
2078     }
2079     return false;
2080   }
2081 
2082   /**
2083    * Set region as OFFLINED up in zookeeper
2084    *
2085    * @param state
2086    * @return the version of the offline node if setting of the OFFLINE node was
2087    *         successful, -1 otherwise.
2088    */
2089   private int setOfflineInZooKeeper(final RegionState state, final ServerName destination) {
2090     if (!state.isClosed() && !state.isOffline()) {
2091       String msg = "Unexpected state: " + state + ". Cannot transition it to OFFLINE.";
2092       this.server.abort(msg, new IllegalStateException(msg));
2093       return -1;
2094     }
2095     regionStates.updateRegionState(state.getRegion(), State.OFFLINE);
2096     int versionOfOfflineNode;
2097     try {
2098       // get the version after setting the znode to OFFLINE
2099       versionOfOfflineNode = ZKAssign.createOrForceNodeOffline(watcher,
2100         state.getRegion(), destination);
2101       if (versionOfOfflineNode == -1) {
2102         LOG.warn("Attempted to create/force node into OFFLINE state before "
2103             + "completing assignment but failed to do so for " + state);
2104         return -1;
2105       }
2106     } catch (KeeperException e) {
2107       server.abort("Unexpected ZK exception creating/setting node OFFLINE", e);
2108       return -1;
2109     }
2110     return versionOfOfflineNode;
2111   }
2112 
2113   /**
2114    * @param region the region to assign
2115    * @return Plan for passed <code>region</code> (If none currently, it creates one or
2116    * if no servers to assign, it returns null).
2117    */
2118   private RegionPlan getRegionPlan(final HRegionInfo region,
2119       final boolean forceNewPlan)  throws HBaseIOException  {
2120     return getRegionPlan(region, null, forceNewPlan);
2121   }
2122 
2123   /**
2124    * @param region the region to assign
2125    * @param serverToExclude Server to exclude (we know it's bad). Pass null if
2126    * all servers are thought to be assignable.
2127    * @param forceNewPlan If true, then if an existing plan exists, a new plan
2128    * will be generated.
2129    * @return Plan for passed <code>region</code> (If none currently, it creates one or
2130    * if no servers to assign, it returns null).
2131    */
2132   private RegionPlan getRegionPlan(final HRegionInfo region,
2133       final ServerName serverToExclude, final boolean forceNewPlan) throws HBaseIOException {
2134     // Pickup existing plan or make a new one
2135     final String encodedName = region.getEncodedName();
2136     final List<ServerName> destServers =
2137       serverManager.createDestinationServersList(serverToExclude);
2138 
2139     if (destServers.isEmpty()){
2140       LOG.warn("Can't move " + encodedName +
2141         ", there is no destination server available.");
2142       return null;
2143     }
2144 
2145     RegionPlan randomPlan = null;
2146     boolean newPlan = false;
2147     RegionPlan existingPlan;
2148 
2149     synchronized (this.regionPlans) {
2150       existingPlan = this.regionPlans.get(encodedName);
2151 
2152       if (existingPlan != null && existingPlan.getDestination() != null) {
2153         LOG.debug("Found an existing plan for " + region.getRegionNameAsString()
2154           + " destination server is " + existingPlan.getDestination() +
2155             " accepted as a dest server = " + destServers.contains(existingPlan.getDestination()));
2156       }
2157 
2158       if (forceNewPlan
2159           || existingPlan == null
2160           || existingPlan.getDestination() == null
2161           || !destServers.contains(existingPlan.getDestination())) {
2162         newPlan = true;
2163         randomPlan = new RegionPlan(region, null,
2164             balancer.randomAssignment(region, destServers));
2165         if (!region.isMetaTable() && shouldAssignRegionsWithFavoredNodes) {
2166           List<HRegionInfo> regions = new ArrayList<HRegionInfo>(1);
2167           regions.add(region);
2168           try {
2169             processFavoredNodes(regions);
2170           } catch (IOException ie) {
2171             LOG.warn("Ignoring exception in processFavoredNodes " + ie);
2172           }
2173         }
2174         this.regionPlans.put(encodedName, randomPlan);
2175       }
2176     }
2177 
2178     if (newPlan) {
2179       if (randomPlan.getDestination() == null) {
2180         LOG.warn("Can't find a destination for " + encodedName);
2181         return null;
2182       }
2183       LOG.debug("No previous transition plan found (or ignoring " +
2184         "an existing plan) for " + region.getRegionNameAsString() +
2185         "; generated random plan=" + randomPlan + "; " +
2186         serverManager.countOfRegionServers() +
2187                " (online=" + serverManager.getOnlineServers().size() +
2188                ", available=" + destServers.size() + ") available servers" +
2189                ", forceNewPlan=" + forceNewPlan);
2190       return randomPlan;
2191     }
2192     LOG.debug("Using pre-existing plan for " +
2193       region.getRegionNameAsString() + "; plan=" + existingPlan);
2194     return existingPlan;
2195   }
2196 
2197   /**
2198    * Unassigns the specified region.
2199    * <p>
2200    * Updates the RegionState and sends the CLOSE RPC unless region is being
2201    * split by regionserver; then the unassign fails (silently) because we
2202    * presume the region being unassigned no longer exists (it's been split out
2203    * of existence). TODO: What to do if the split fails and is rolled back and
2204    * parent is revivified?
2205    * <p>
2206    * If a RegionPlan is already set, it will remain.
2207    *
2208    * @param region region to be unassigned
2209    */
2210   public void unassign(HRegionInfo region) {
2211     unassign(region, false);
2212   }
2213 
2214 
2215   /**
2216    * Unassigns the specified region.
2217    * <p>
2218    * Updates the RegionState and sends the CLOSE RPC unless region is being
2219    * split by regionserver; then the unassign fails (silently) because we
2220    * presume the region being unassigned no longer exists (it's been split out
2221    * of existence). TODO: What to do if the split fails and is rolled back and
2222    * parent is revivified?
2223    * <p>
2224    * If a RegionPlan is already set, it will remain.
2225    *
2226    * @param region region to be unassigned
2227    * @param force if region should be closed even if already closing
2228    */
2229   public void unassign(HRegionInfo region, boolean force, ServerName dest) {
2230     // TODO: Method needs refactoring.  Ugly buried returns throughout.  Beware!
2231     LOG.debug("Starting unassign of " + region.getRegionNameAsString()
2232       + " (offlining), current state: " + regionStates.getRegionState(region));
2233 
2234     String encodedName = region.getEncodedName();
2235     // Grab the state of this region and synchronize on it
2236     int versionOfClosingNode = -1;
2237     // We need a lock here as we're going to do a put later and we don't want multiple states
2238     //  creation
2239     ReentrantLock lock = locker.acquireLock(encodedName);
2240     RegionState state = regionStates.getRegionTransitionState(encodedName);
2241     boolean reassign = true;
2242     try {
2243       if (state == null) {
2244         // Region is not in transition.
2245         // We can unassign it only if it's not SPLIT/MERGED.
2246         state = regionStates.getRegionState(encodedName);
2247         if (state != null && state.isUnassignable()) {
2248           LOG.info("Attempting to unassign " + state + ", ignored");
2249           // Offline region will be reassigned below
2250           return;
2251         }
2252         // Create the znode in CLOSING state
2253         try {
2254           if (state == null || state.getServerName() == null) {
2255             // We don't know where the region is, offline it.
2256             // No need to send CLOSE RPC
2257             LOG.warn("Attempting to unassign a region not in RegionStates: "
2258               + region.getRegionNameAsString() + ", offlined");
2259             regionOffline(region);
2260             return;
2261           }
2262           versionOfClosingNode = ZKAssign.createNodeClosing(
2263             watcher, region, state.getServerName());
2264           if (versionOfClosingNode == -1) {
2265             LOG.info("Attempting to unassign " +
2266               region.getRegionNameAsString() + " but ZK closing node "
2267               + "can't be created.");
2268             reassign = false; // not unassigned at all
2269             return;
2270           }
2271         } catch (KeeperException e) {
2272           if (e instanceof NodeExistsException) {
2273             // Handle race between master initiated close and regionserver
2274             // orchestrated splitting. See if existing node is in a
2275             // SPLITTING or SPLIT state.  If so, the regionserver started
2276             // an op on node before we could get our CLOSING in.  Deal.
2277             NodeExistsException nee = (NodeExistsException)e;
2278             String path = nee.getPath();
2279             try {
2280               if (isSplitOrSplittingOrMergedOrMerging(path)) {
2281                 LOG.debug(path + " is SPLIT or SPLITTING or MERGED or MERGING; " +
2282                   "skipping unassign because region no longer exists -- its split or merge");
2283                 reassign = false; // no need to reassign for split/merged region
2284                 return;
2285               }
2286             } catch (KeeperException.NoNodeException ke) {
2287               LOG.warn("Failed getData on SPLITTING/SPLIT at " + path +
2288                 "; presuming split and that the region to unassign, " +
2289                 encodedName + ", no longer exists -- confirm", ke);
2290               return;
2291             } catch (KeeperException ke) {
2292               LOG.error("Unexpected zk state", ke);
2293             } catch (DeserializationException de) {
2294               LOG.error("Failed parse", de);
2295             }
2296           }
2297           // If we get here, we don't understand what's going on -- abort.
2298           server.abort("Unexpected ZK exception creating node CLOSING", e);
2299           reassign = false; // heading out already
2300           return;
2301         }
2302         state = regionStates.updateRegionState(region, State.PENDING_CLOSE);
2303       } else if (state.isFailedOpen()) {
2304         // The region is not open yet
2305         regionOffline(region);
2306         return;
2307       } else if (force && state.isPendingCloseOrClosing()) {
2308         LOG.debug("Attempting to unassign " + region.getRegionNameAsString() +
2309           " which is already " + state.getState()  +
2310           " but forcing to send a CLOSE RPC again ");
2311         if (state.isFailedClose()) {
2312           state = regionStates.updateRegionState(region, State.PENDING_CLOSE);
2313         }
2314         state.updateTimestampToNow();
2315       } else {
2316         LOG.debug("Attempting to unassign " +
2317           region.getRegionNameAsString() + " but it is " +
2318           "already in transition (" + state.getState() + ", force=" + force + ")");
2319         return;
2320       }
2321 
2322       unassign(region, state, versionOfClosingNode, dest, true, null);
2323     } finally {
2324       lock.unlock();
2325 
2326       // Region is expected to be reassigned afterwards
2327       if (reassign && regionStates.isRegionOffline(region)) {
2328         assign(region, true);
2329       }
2330     }
2331   }
2332 
2333   public void unassign(HRegionInfo region, boolean force){
2334      unassign(region, force, null);
2335   }
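
       // An unassign caller sketch (hypothetical helper, for illustration only): a
       // plain unassign respects an in-progress close, while force=true resends the
       // CLOSE RPC even if the region is already PENDING_CLOSE or CLOSING.
       private void unassignExample(final HRegionInfo hri, final boolean stuckClosing) {
         if (stuckClosing) {
           unassign(hri, true);   // force another CLOSE RPC to the hosting server
         } else {
           unassign(hri);         // normal close; no-op if already in transition
         }
       }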
2336 
2337   /**
2338    * @param region regioninfo of znode to be deleted.
2339    */
2340   public void deleteClosingOrClosedNode(HRegionInfo region, ServerName sn) {
2341     String encodedName = region.getEncodedName();
2342     deleteNodeInStates(encodedName, "closing", sn, EventType.M_ZK_REGION_CLOSING,
2343       EventType.RS_ZK_REGION_CLOSED);
2344   }
2345 
2346   /**
2347    * @param path
2348    * @return True if znode is in SPLIT or SPLITTING or MERGED or MERGING state.
2349    * @throws KeeperException Can happen if the znode went away in meantime.
2350    * @throws DeserializationException
2351    */
2352   private boolean isSplitOrSplittingOrMergedOrMerging(final String path)
2353       throws KeeperException, DeserializationException {
2354     boolean result = false;
2355     // This may fail if the SPLIT or SPLITTING or MERGED or MERGING znode gets
2356     // cleaned up before we can get data from it.
2357     byte [] data = ZKAssign.getData(watcher, path);
2358     if (data == null) {
2359       LOG.info("Node " + path + " is gone");
2360       return false;
2361     }
2362     RegionTransition rt = RegionTransition.parseFrom(data);
2363     switch (rt.getEventType()) {
2364     case RS_ZK_REQUEST_REGION_SPLIT:
2365     case RS_ZK_REGION_SPLIT:
2366     case RS_ZK_REGION_SPLITTING:
2367     case RS_ZK_REQUEST_REGION_MERGE:
2368     case RS_ZK_REGION_MERGED:
2369     case RS_ZK_REGION_MERGING:
2370       result = true;
2371       break;
2372     default:
2373       LOG.info("Node " + path + " is in " + rt.getEventType());
2374       break;
2375     }
2376     return result;
2377   }
2378 
2379   /**
2380    * Used by unit tests. Return the number of regions opened so far in the life
2381    * of the master. Increases by one every time the master opens a region
2382    * @return the counter value of the number of regions opened so far
2383    */
2384   public int getNumRegionsOpened() {
2385     return numRegionsOpened.get();
2386   }
2387 
2388   /**
2389    * Waits until the specified region has completed assignment.
2390    * <p>
2391    * If the region is already assigned, returns immediately.  Otherwise, method
2392    * blocks until the region is assigned.
2393    * @param regionInfo region to wait on assignment for
2394    * @throws InterruptedException
2395    */
2396   public boolean waitForAssignment(HRegionInfo regionInfo)
2397       throws InterruptedException {
2398     while (!regionStates.isRegionOnline(regionInfo)) {
2399       if (regionStates.isRegionInState(regionInfo, State.FAILED_OPEN)
2400           || this.server.isStopped()) {
2401         return false;
2402       }
2403 
2404       // We should receive a notification, but it's
2405       //  better to have a timeout to recheck the condition here:
2406       //  it lowers the impact of a race condition if any
2407       regionStates.waitForUpdate(100);
2408     }
2409     return true;
2410   }
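
       // A minimal sketch (hypothetical helper) combining assign and waitForAssignment:
       // block until the region is online, returning false if it ends up FAILED_OPEN
       // or the master is stopping.
       private boolean assignAndWaitExample(final HRegionInfo hri)
           throws InterruptedException {
         assign(hri, true);
         return waitForAssignment(hri);
       }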
2411 
2412   /**
2413    * Assigns the hbase:meta region.
2414    * <p>
2415    * Assumes that hbase:meta is currently closed and is not being actively served by
2416    * any RegionServer.
2417    * <p>
2418    * Forcibly unsets the current meta region location in ZooKeeper and assigns
2419    * hbase:meta to a random RegionServer.
2420    * @throws KeeperException
2421    */
2422   public void assignMeta() throws KeeperException {
2423     MetaRegionTracker.deleteMetaLocation(this.watcher);
2424     assign(HRegionInfo.FIRST_META_REGIONINFO, true);
2425   }
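
       // A minimal caller sketch (for illustration only): assignMeta() wipes the
       // recorded meta location and re-assigns hbase:meta; ZooKeeper errors surface
       // as KeeperException and are left to the caller.
       private void assignMetaExample() {
         try {
           assignMeta();
         } catch (KeeperException e) {
           server.abort("Failed to re-assign hbase:meta", e);
         }
       }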
2426 
2427   /**
2428    * Assigns specified regions retaining assignments, if any.
2429    * <p>
2430    * This is a synchronous call and will return once every region has been
2431    * assigned.  If anything fails, an exception is thrown
2432    * @throws InterruptedException
2433    * @throws IOException
2434    */
2435   public void assign(Map<HRegionInfo, ServerName> regions)
2436         throws IOException, InterruptedException {
2437     if (regions == null || regions.isEmpty()) {
2438       return;
2439     }
2440     List<ServerName> servers = serverManager.createDestinationServersList();
2441     if (servers == null || servers.isEmpty()) {
2442       throw new IOException("Found no destination server to assign region(s)");
2443     }
2444 
2445     // Reuse existing assignment info
2446     Map<ServerName, List<HRegionInfo>> bulkPlan =
2447       balancer.retainAssignment(regions, servers);
2448 
2449     assign(regions.size(), servers.size(),
2450       "retainAssignment=true", bulkPlan);
2451   }
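
       // A retain-assignment caller sketch (hypothetical map, for illustration only):
       // keys are regions, values are their previous hosts; the balancer tries to
       // keep each region on its old server.
       private void retainAssignExample(final Map<HRegionInfo, ServerName> previousAssignments)
           throws IOException, InterruptedException {
         assign(previousAssignments);   // synchronous; throws if no destination servers
       }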
2452 
2453   /**
2454    * Assigns specified regions round robin, if any.
2455    * <p>
2456    * This is a synchronous call and will return once every region has been
2457    * assigned.  If anything fails, an exception is thrown
2458    * @throws InterruptedException
2459    * @throws IOException
2460    */
2461   public void assign(List<HRegionInfo> regions)
2462         throws IOException, InterruptedException {
2463     if (regions == null || regions.isEmpty()) {
2464       return;
2465     }
2466 
2467     List<ServerName> servers = serverManager.createDestinationServersList();
2468     if (servers == null || servers.isEmpty()) {
2469       throw new IOException("Found no destination server to assign region(s)");
2470     }
2471 
2472     // Generate a round-robin bulk assignment plan
2473     Map<ServerName, List<HRegionInfo>> bulkPlan
2474       = balancer.roundRobinAssignment(regions, servers);
2475     processFavoredNodes(regions);
2476 
2477     assign(regions.size(), servers.size(),
2478       "round-robin=true", bulkPlan);
2479   }
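
       // A round-robin caller sketch (hypothetical region list, for illustration only):
       // hand the whole list to the balancer-backed overload above and let it spread
       // the regions across the live servers.
       private void roundRobinAssignExample(final List<HRegionInfo> newRegions)
           throws IOException, InterruptedException {
         if (newRegions != null && !newRegions.isEmpty()) {
           assign(newRegions);   // throws IOException if no destination servers exist
         }
       }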
2480 
2481   private void assign(int regions, int totalServers,
2482       String message, Map<ServerName, List<HRegionInfo>> bulkPlan)
2483           throws InterruptedException, IOException {
2484 
2485     int servers = bulkPlan.size();
2486     if (servers == 1 || (regions < bulkAssignThresholdRegions
2487         && servers < bulkAssignThresholdServers)) {
2488 
2489       // Not using bulk assignment.  This could be more efficient in a small
2490       // cluster, especially a mini cluster for testing, so that tests won't time out
2491       if (LOG.isTraceEnabled()) {
2492         LOG.trace("Not using bulk assignment since we are assigning only " + regions +
2493           " region(s) to " + servers + " server(s)");
2494       }
2495       for (Map.Entry<ServerName, List<HRegionInfo>> plan: bulkPlan.entrySet()) {
2496         if (!assign(plan.getKey(), plan.getValue())) {
2497           for (HRegionInfo region: plan.getValue()) {
2498             if (!regionStates.isRegionOnline(region)) {
2499               invokeAssign(region);
2500             }
2501           }
2502         }
2503       }
2504     } else {
2505       LOG.info("Bulk assigning " + regions + " region(s) across "
2506         + totalServers + " server(s), " + message);
2507 
2508       // Use fixed count thread pool assigning.
2509       BulkAssigner ba = new GeneralBulkAssigner(
2510         this.server, bulkPlan, this, bulkAssignWaitTillAllAssigned);
2511       ba.bulkAssign();
2512       LOG.info("Bulk assigning done");
2513     }
2514   }
2515 
2516   /**
2517    * Assigns all user regions, if any exist.  Used during cluster startup.
2518    * <p>
2519    * This is a synchronous call and will return once every region has been
2520    * assigned.  If anything fails, an exception is thrown and the cluster
2521    * should be shutdown.
2522    * @throws InterruptedException
2523    * @throws IOException
2524    * @throws KeeperException
2525    */
2526   private void assignAllUserRegions()
2527       throws IOException, InterruptedException, KeeperException {
2528     // Cleanup any existing ZK nodes and start watching
2529     ZKAssign.deleteAllNodes(watcher);
2530     ZKUtil.listChildrenAndWatchForNewChildren(this.watcher,
2531       this.watcher.assignmentZNode);
2532     failoverCleanupDone();
2533 
2534     // Skip assignment for regions of tables in DISABLING state because, during a clean cluster
2535     // startup, no RS is alive and the regions map doesn't have any information about those regions.
2536     // See HBASE-6281.
2537     Set<TableName> disabledOrDisablingOrEnabling = ZKTable.getDisabledOrDisablingTables(watcher);
2538     disabledOrDisablingOrEnabling.addAll(ZKTable.getEnablingTables(watcher));
2539     // Scan hbase:meta for all user regions, skipping any disabled tables
2540     Map<HRegionInfo, ServerName> allRegions;
2541     SnapshotOfRegionAssignmentFromMeta snapshotOfRegionAssignment =
2542        new SnapshotOfRegionAssignmentFromMeta(catalogTracker, disabledOrDisablingOrEnabling, true);
2543     snapshotOfRegionAssignment.initialize();
2544     allRegions = snapshotOfRegionAssignment.getRegionToRegionServerMap();
2545     if (allRegions == null || allRegions.isEmpty()) return;
2546 
2547     // Determine what type of assignment to do on startup
2548     boolean retainAssignment = server.getConfiguration().
2549       getBoolean("hbase.master.startup.retainassign", true);
2550 
2551     if (retainAssignment) {
2552       assign(allRegions);
2553     } else {
2554       List<HRegionInfo> regions = new ArrayList<HRegionInfo>(allRegions.keySet());
2555       assign(regions);
2556     }
2557 
2558     for (HRegionInfo hri : allRegions.keySet()) {
2559       TableName tableName = hri.getTable();
2560       if (!zkTable.isEnabledTable(tableName)) {
2561         setEnabledTable(tableName);
2562       }
2563     }
2564   }
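       // To force round-robin assignment on startup (e.g. in tests), the retained
       // assignment used above can be switched off. A minimal sketch; the config
       // key is taken from this method, the rest is illustrative:
       //
       //   Configuration conf = HBaseConfiguration.create();
       //   conf.setBoolean("hbase.master.startup.retainassign", false);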
2565 
2566   /**
2567    * Wait until no regions in transition.
2568    * @param timeout How long to wait.
2569    * @return True if nothing in regions in transition.
2570    * @throws InterruptedException
2571    */
2572   boolean waitUntilNoRegionsInTransition(final long timeout)
2573       throws InterruptedException {
2574     // Blocks until there are no regions in transition. It is possible
2575     // that there are regions in transition immediately after this
2576     // returns, but it guarantees that if it returns without an
2577     // exception, there was a period of time with no regions in
2578     // transition from the point-of-view of the in-memory state of
2579     // the Master.
2580     final long endTime = System.currentTimeMillis() + timeout;
2581 
2582     while (!this.server.isStopped() && regionStates.isRegionsInTransition()
2583         && endTime > System.currentTimeMillis()) {
2584       regionStates.waitForUpdate(100);
2585     }
2586 
2587     return !regionStates.isRegionsInTransition();
2588   }
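       // A minimal caller-side sketch, typical of tests and startup code; the
       // timeout value is arbitrary and InterruptedException handling is omitted:
       //
       //   if (!assignmentManager.waitUntilNoRegionsInTransition(60000)) {
       //     LOG.warn("Gave up waiting; some regions are still in transition");
       //   }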
2589 
2590   /**
2591    * Rebuild the list of user regions and assignment information.
2592    * <p>
2593    * Returns a map of servers that are not found to be online and the regions
2594    * they were hosting.
2595    * @return map of servers not online to their assigned regions, as stored
2596    *         in META
2597    * @throws IOException
2598    */
2599   Map<ServerName, List<HRegionInfo>> rebuildUserRegions() throws IOException, KeeperException {
2600     Set<TableName> enablingTables = ZKTable.getEnablingTables(watcher);
2601     Set<TableName> disabledOrEnablingTables = ZKTable.getDisabledTables(watcher);
2602     disabledOrEnablingTables.addAll(enablingTables);
2603     Set<TableName> disabledOrDisablingOrEnabling = ZKTable.getDisablingTables(watcher);
2604     disabledOrDisablingOrEnabling.addAll(disabledOrEnablingTables);
2605 
2606     // Region assignment from META
2607     List<Result> results = MetaReader.fullScan(this.catalogTracker);
2608     // Get any new region servers that joined the cluster but were slow to check in
2609     Set<ServerName> onlineServers = serverManager.getOnlineServers().keySet();
2610     // Map of offline servers and their regions to be returned
2611     Map<ServerName, List<HRegionInfo>> offlineServers =
2612       new TreeMap<ServerName, List<HRegionInfo>>();
2613     // Iterate regions in META
2614     for (Result result : results) {
2615       Pair<HRegionInfo, ServerName> region = HRegionInfo.getHRegionInfoAndServerName(result);
2616       if (region == null) continue;
2617       HRegionInfo regionInfo = region.getFirst();
2618       ServerName regionLocation = region.getSecond();
2619       if (regionInfo == null) continue;
2620       regionStates.createRegionState(regionInfo);
2621       if (regionStates.isRegionInState(regionInfo, State.SPLIT)) {
2622         // Split is considered to be completed. If the split znode still
2623         // exists, the region will be put back to SPLITTING state later
2624         LOG.debug("Region " + regionInfo.getRegionNameAsString()
2625            + " split is completed, so it need not be added to the regions list");
2626         continue;
2627       }
2628       TableName tableName = regionInfo.getTable();
2629       if (regionLocation == null) {
2630         // regionLocation could be null if createTable didn't finish properly,
2631         // i.e. the HMaster restarted while createTable was still in progress.
2632         // Some regions have been added to hbase:meta, but have not been assigned.
2633         // When this happens, the region's table must be in ENABLING state.
2634         // It can't be in ENABLED state as that is set when all regions are
2635         // assigned.
2636         // It can't be in DISABLING state, because DISABLING state transitions
2637         // from ENABLED state when application calls disableTable.
2638         // It can't be in DISABLED state, because DISABLED state transitions
2639         // from DISABLING state.
2640         if (!enablingTables.contains(tableName)) {
2641           LOG.warn("Region " + regionInfo.getEncodedName() +
2642             " has null regionLocation." + " But its table " + tableName +
2643             " isn't in ENABLING state.");
2644         }
2645       } else if (!onlineServers.contains(regionLocation)) {
2646         // Region is located on a server that isn't online
2647         List<HRegionInfo> offlineRegions = offlineServers.get(regionLocation);
2648         if (offlineRegions == null) {
2649           offlineRegions = new ArrayList<HRegionInfo>(1);
2650           offlineServers.put(regionLocation, offlineRegions);
2651         }
2652         offlineRegions.add(regionInfo);
2653         // need to enable the table if not disabled or disabling or enabling
2654         // this will be used in rolling restarts
2655         if (!disabledOrDisablingOrEnabling.contains(tableName)
2656             && !getZKTable().isEnabledTable(tableName)) {
2657           setEnabledTable(tableName);
2658         }
2659       } else {
2660         // Region is being served and on an active server
2661         // add only if region not in disabled or enabling table
2662         if (!disabledOrEnablingTables.contains(tableName)) {
2663           regionStates.updateRegionState(regionInfo, State.OPEN, regionLocation);
2664           regionStates.regionOnline(regionInfo, regionLocation);
2665         }
2666         // need to enable the table if not disabled or disabling or enabling
2667         // this will be used in rolling restarts
2668         if (!disabledOrDisablingOrEnabling.contains(tableName)
2669             && !getZKTable().isEnabledTable(tableName)) {
2670           setEnabledTable(tableName);
2671         }
2672       }
2673     }
2674     return offlineServers;
2675   }
2676 
2677   /**
2678    * Recover the tables that were not fully moved to DISABLED state. These
2679    * tables were in DISABLING state when the master restarted/switched.
2680    *
2681    * @throws KeeperException
2682    * @throws TableNotFoundException
2683    * @throws IOException
2684    */
2685   private void recoverTableInDisablingState()
2686       throws KeeperException, TableNotFoundException, IOException {
2687     Set<TableName> disablingTables = ZKTable.getDisablingTables(watcher);
2688     if (disablingTables.size() != 0) {
2689       for (TableName tableName : disablingTables) {
2690         // Recover by calling DisableTableHandler
2691         LOG.info("The table " + tableName
2692             + " is in DISABLING state.  Hence recovering by moving the table"
2693             + " to DISABLED state.");
2694         new DisableTableHandler(this.server, tableName, catalogTracker,
2695             this, tableLockManager, true).prepare().process();
2696       }
2697     }
2698   }
2699 
2700   /**
2701    * Recover the tables that were not fully moved to ENABLED state. These tables
2702    * were in ENABLING state when the master restarted/switched.
2703    *
2704    * @throws KeeperException
2705    * @throws org.apache.hadoop.hbase.TableNotFoundException
2706    * @throws IOException
2707    */
2708   private void recoverTableInEnablingState()
2709       throws KeeperException, TableNotFoundException, IOException {
2710     Set<TableName> enablingTables = ZKTable.getEnablingTables(watcher);
2711     if (enablingTables.size() != 0) {
2712       for (TableName tableName : enablingTables) {
2713         // Recover by calling EnableTableHandler
2714         LOG.info("The table " + tableName
2715             + " is in ENABLING state.  Hence recovering by moving the table"
2716             + " to ENABLED state.");
2717         // enableTable in sync way during master startup,
2718         // no need to invoke coprocessor
2719         EnableTableHandler eth = new EnableTableHandler(this.server, tableName,
2720           catalogTracker, this, tableLockManager, true);
2721         try {
2722           eth.prepare();
2723         } catch (TableNotFoundException e) {
2724           LOG.warn("Table " + tableName + " not found in hbase:meta to recover.");
2725           continue;
2726         }
2727         eth.process();
2728       }
2729     }
2730   }
2731 
2732   /**
2733    * Processes list of dead servers from result of hbase:meta scan and regions in RIT
2734    * <p>
2735    * This is used for failover to recover the lost regions that belonged to
2736    * RegionServers which failed while there was no active master or regions
2737    * that were in RIT.
2738    * <p>
2739    *
2740    *
2741    * @param deadServers
2742    *          The list of dead servers which failed while there was no active
2743    *          master. Can be null.
2744    * @throws IOException
2745    * @throws KeeperException
2746    */
2747   private void processDeadServersAndRecoverLostRegions(
2748       Map<ServerName, List<HRegionInfo>> deadServers)
2749           throws IOException, KeeperException {
2750     if (deadServers != null) {
2751       for (Map.Entry<ServerName, List<HRegionInfo>> server: deadServers.entrySet()) {
2752         ServerName serverName = server.getKey();
2753         // We need to keep such info even if the server is known dead
2754         regionStates.setLastRegionServerOfRegions(serverName, server.getValue());
2755         if (!serverManager.isServerDead(serverName)) {
2756           serverManager.expireServer(serverName); // Let SSH do region re-assign
2757         }
2758       }
2759     }
2760     List<String> nodes = ZKUtil.listChildrenAndWatchForNewChildren(
2761       this.watcher, this.watcher.assignmentZNode);
2762     if (!nodes.isEmpty()) {
2763       for (String encodedRegionName : nodes) {
2764         processRegionInTransition(encodedRegionName, null);
2765       }
2766     }
2767 
2768     // Now we can safely claim failover cleanup completed and enable
2769     // ServerShutdownHandler for further processing. The nodes (below)
2770     // in transition, if any, are for regions not related to those
2771     // dead servers at all, and can be done in parallel to SSH.
2772     failoverCleanupDone();
2773   }
2774 
2775   /**
2776    * Set Regions in transitions metrics.
2777    * This takes an iterator on the RegionInTransition map (CLSM), and is not synchronized.
2778    * This iterator is not fail-fast, which may lead to stale reads; but that's better than
2779    * creating a copy of the map for metrics computation, as this method will be invoked
2780    * at a frequent interval.
2781    */
2782   public void updateRegionsInTransitionMetrics() {
2783     long currentTime = System.currentTimeMillis();
2784     int totalRITs = 0;
2785     int totalRITsOverThreshold = 0;
2786     long oldestRITTime = 0;
2787     int ritThreshold = this.server.getConfiguration().
2788       getInt(HConstants.METRICS_RIT_STUCK_WARNING_THRESHOLD, 60000);
2789     for (RegionState state: regionStates.getRegionsInTransition().values()) {
2790       totalRITs++;
2791       long ritTime = currentTime - state.getStamp();
2792       if (ritTime > ritThreshold) { // more than the threshold
2793         totalRITsOverThreshold++;
2794       }
2795       if (oldestRITTime < ritTime) {
2796         oldestRITTime = ritTime;
2797       }
2798     }
2799     if (this.metricsAssignmentManager != null) {
2800       this.metricsAssignmentManager.updateRITOldestAge(oldestRITTime);
2801       this.metricsAssignmentManager.updateRITCount(totalRITs);
2802       this.metricsAssignmentManager.updateRITCountOverThreshold(totalRITsOverThreshold);
2803     }
2804   }
2805 
2806   /**
2807    * @param region Region whose plan we are to clear.
2808    */
2809   void clearRegionPlan(final HRegionInfo region) {
2810     synchronized (this.regionPlans) {
2811       this.regionPlans.remove(region.getEncodedName());
2812     }
2813   }
2814 
2815   /**
2816    * Wait on region to clear regions-in-transition.
2817    * @param hri Region to wait on.
2818    * @throws IOException
2819    */
2820   public void waitOnRegionToClearRegionsInTransition(final HRegionInfo hri)
2821       throws IOException, InterruptedException {
2822     waitOnRegionToClearRegionsInTransition(hri, -1L);
2823   }
2824 
2825   /**
2826    * Wait on region to clear regions-in-transition or time out
2827    * @param hri Region to wait on.
2828    * @param timeOut Milliseconds to wait for current region to be out of transition state.
2829    * @return True when a region clears regions-in-transition before timeout otherwise false
2830    * @throws InterruptedException
2831    */
2832   public boolean waitOnRegionToClearRegionsInTransition(final HRegionInfo hri, long timeOut)
2833       throws InterruptedException {
2834     if (!regionStates.isRegionInTransition(hri)) return true;
2835     long end = (timeOut <= 0) ? Long.MAX_VALUE : EnvironmentEdgeManager.currentTimeMillis()
2836         + timeOut;
2837     // There is already a timeout monitor on regions in transition so I
2838     // should not have to have one here too?
2839     LOG.info("Waiting for " + hri.getEncodedName() +
2840         " to leave regions-in-transition, timeOut=" + timeOut + " ms.");
2841     while (!this.server.isStopped() && regionStates.isRegionInTransition(hri)) {
2842       regionStates.waitForUpdate(100);
2843       if (EnvironmentEdgeManager.currentTimeMillis() > end) {
2844         LOG.info("Timed out on waiting for " + hri.getEncodedName() + " to be assigned.");
2845         return false;
2846       }
2847     }
2848     if (this.server.isStopped()) {
2849       LOG.info("Giving up wait on regions in transition because stoppable.isStopped is set");
2850       return false;
2851     }
2852     return true;
2853   }
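       // A minimal caller-side sketch of the timed variant above; "hri" is a
       // hypothetical region and the timeout is arbitrary:
       //
       //   if (!assignmentManager.waitOnRegionToClearRegionsInTransition(hri, 30000)) {
       //     // still in transition (or the server is stopping); caller decides how to react
       //   }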
2854 
2855   /**
2856    * Update timers for all regions in transition that are hosted on a server listed
2857    * in serversInUpdatingTimer.
2858    */
2859   public class TimerUpdater extends Chore {
2860 
2861     public TimerUpdater(final int period, final Stoppable stopper) {
2862       super("AssignmentTimerUpdater", period, stopper);
2863     }
2864 
2865     @Override
2866     protected void chore() {
2867       Preconditions.checkState(tomActivated);
2868       ServerName serverToUpdateTimer = null;
2869       while (!serversInUpdatingTimer.isEmpty() && !stopper.isStopped()) {
2870         if (serverToUpdateTimer == null) {
2871           serverToUpdateTimer = serversInUpdatingTimer.first();
2872         } else {
2873           serverToUpdateTimer = serversInUpdatingTimer
2874               .higher(serverToUpdateTimer);
2875         }
2876         if (serverToUpdateTimer == null) {
2877           break;
2878         }
2879         updateTimers(serverToUpdateTimer);
2880         serversInUpdatingTimer.remove(serverToUpdateTimer);
2881       }
2882     }
2883   }
2884 
2885   /**
2886    * Monitor to check for time outs on region transition operations
2887    */
2888   public class TimeoutMonitor extends Chore {
2889     private boolean allRegionServersOffline = false;
2890     private ServerManager serverManager;
2891     private final int timeout;
2892 
2893     /**
2894      * Creates a periodic monitor to check for time outs on region transition
2895      * operations.  This will deal with retries if for some reason something
2896      * doesn't happen within the specified timeout.
2897      * @param period
2898      * @param stopper When {@link Stoppable#isStopped()} is true, this thread will
2899      * clean up and exit cleanly.
2900      * @param timeout
2901      */
2902     public TimeoutMonitor(final int period, final Stoppable stopper,
2903         ServerManager serverManager,
2904         final int timeout) {
2905       super("AssignmentTimeoutMonitor", period, stopper);
2906       this.timeout = timeout;
2907       this.serverManager = serverManager;
2908     }
2909 
2910     private synchronized void setAllRegionServersOffline(
2911       boolean allRegionServersOffline) {
2912       this.allRegionServersOffline = allRegionServersOffline;
2913     }
2914 
2915     @Override
2916     protected void chore() {
2917       Preconditions.checkState(tomActivated);
2918       boolean noRSAvailable = this.serverManager.createDestinationServersList().isEmpty();
2919 
2920       // Iterate all regions in transition checking for time outs
2921       long now = System.currentTimeMillis();
2922       // No lock needed, concurrent access is ok: we will be working on a copy, and it's
2923       // valid in Java to take a copy while another thread is adding/removing items.
2924       for (String regionName : regionStates.getRegionsInTransition().keySet()) {
2925         RegionState regionState = regionStates.getRegionTransitionState(regionName);
2926         if (regionState == null) continue;
2927 
2928         if (regionState.getStamp() + timeout <= now) {
2929           // decide on action upon timeout
2930           actOnTimeOut(regionState);
2931         } else if (this.allRegionServersOffline && !noRSAvailable) {
2932           RegionPlan existingPlan = regionPlans.get(regionName);
2933           if (existingPlan == null
2934               || !this.serverManager.isServerOnline(existingPlan
2935                   .getDestination())) {
2936             // if some RSs just came back online, we can start the assignment
2937             // right away
2938             actOnTimeOut(regionState);
2939           }
2940         }
2941       }
2942       setAllRegionServersOffline(noRSAvailable);
2943     }
2944 
2945     private void actOnTimeOut(RegionState regionState) {
2946       HRegionInfo regionInfo = regionState.getRegion();
2947       LOG.info("Regions in transition timed out:  " + regionState);
2948       // Expired! Do a retry.
2949       switch (regionState.getState()) {
2950       case CLOSED:
2951         LOG.info("Region " + regionInfo.getEncodedName()
2952             + " has been CLOSED for too long, waiting on queued "
2953             + "ClosedRegionHandler to run or server shutdown");
2954         // Update our timestamp.
2955         regionState.updateTimestampToNow();
2956         break;
2957       case OFFLINE:
2958         LOG.info("Region has been OFFLINE for too long, " + "reassigning "
2959             + regionInfo.getRegionNameAsString() + " to a random server");
2960         invokeAssign(regionInfo);
2961         break;
2962       case PENDING_OPEN:
2963         LOG.info("Region has been PENDING_OPEN for too "
2964             + "long, reassigning region=" + regionInfo.getRegionNameAsString());
2965         invokeAssign(regionInfo);
2966         break;
2967       case OPENING:
2968         processOpeningState(regionInfo);
2969         break;
2970       case OPEN:
2971         LOG.error("Region has been OPEN for too long, " +
2972             "we don't know where region was opened so can't do anything");
2973         regionState.updateTimestampToNow();
2974         break;
2975 
2976       case PENDING_CLOSE:
2977         LOG.info("Region has been PENDING_CLOSE for too "
2978             + "long, running forced unassign again on region="
2979             + regionInfo.getRegionNameAsString());
2980         invokeUnassign(regionInfo);
2981         break;
2982       case CLOSING:
2983         LOG.info("Region has been CLOSING for too " +
2984           "long, this should eventually complete or the server will " +
2985           "expire, send RPC again");
2986         invokeUnassign(regionInfo);
2987         break;
2988 
2989       case SPLIT:
2990       case SPLITTING:
2991       case FAILED_OPEN:
2992       case FAILED_CLOSE:
2993       case MERGING:
2994         break;
2995 
2996       default:
2997         throw new IllegalStateException("Received event is not valid.");
2998       }
2999     }
3000   }
3001 
3002   private void processOpeningState(HRegionInfo regionInfo) {
3003     LOG.info("Region has been OPENING for too long, reassigning region="
3004         + regionInfo.getRegionNameAsString());
3005     // Should have a ZK node in OPENING state
3006     try {
3007       String node = ZKAssign.getNodeName(watcher, regionInfo.getEncodedName());
3008       Stat stat = new Stat();
3009       byte [] data = ZKAssign.getDataNoWatch(watcher, node, stat);
3010       if (data == null) {
3011         LOG.warn("Data is null, node " + node + " no longer exists");
3012         return;
3013       }
3014       RegionTransition rt = RegionTransition.parseFrom(data);
3015       EventType et = rt.getEventType();
3016       if (et == EventType.RS_ZK_REGION_OPENED) {
3017         LOG.debug("Region has transitioned to OPENED, allowing "
3018             + "watched event handlers to process");
3019         return;
3020       } else if (et != EventType.RS_ZK_REGION_OPENING && et != EventType.RS_ZK_REGION_FAILED_OPEN) {
3021         LOG.warn("While timing out a region, found ZK node in unexpected state: " + et);
3022         return;
3023       }
3024       invokeAssign(regionInfo);
3025     } catch (KeeperException ke) {
3026       LOG.error("Unexpected ZK exception timing out OPENING region", ke);
3027     } catch (DeserializationException e) {
3028       LOG.error("Unexpected exception parsing OPENING region", e);
3029     }
3030   }
3031 
3032   void invokeAssign(HRegionInfo regionInfo) {
3033     threadPoolExecutorService.submit(new AssignCallable(this, regionInfo));
3034   }
3035 
3036   private void invokeUnassign(HRegionInfo regionInfo) {
3037     threadPoolExecutorService.submit(new UnAssignCallable(this, regionInfo));
3038   }
3039 
3040   public boolean isCarryingMeta(ServerName serverName) {
3041     return isCarryingRegion(serverName, HRegionInfo.FIRST_META_REGIONINFO);
3042   }
3043 
3044   /**
3045    * Check if the shutdown server carries the specific region.
3046    * We have a bunch of places that store the region location, and
3047    * those values aren't consistent since notification is delayed.
3048    * The location from the zookeeper unassigned node has the most recent data,
3049    * but the node could be deleted after the region is opened by the AM.
3050    * The AM's info could be stale if OpenedRegionHandler
3051    * processing hasn't finished yet when the server shutdown occurs.
3052    * @return whether the serverName currently hosts the region
3053    */
3054   private boolean isCarryingRegion(ServerName serverName, HRegionInfo hri) {
3055     RegionTransition rt = null;
3056     try {
3057       byte [] data = ZKAssign.getData(watcher, hri.getEncodedName());
3058       // This call can legitimately return null
3059       rt = data == null? null: RegionTransition.parseFrom(data);
3060     } catch (KeeperException e) {
3061       server.abort("Exception reading unassigned node for region=" + hri.getEncodedName(), e);
3062     } catch (DeserializationException e) {
3063       server.abort("Exception parsing unassigned node for region=" + hri.getEncodedName(), e);
3064     }
3065 
3066     ServerName addressFromZK = rt != null? rt.getServerName():  null;
3067     if (addressFromZK != null) {
3068       // if we get something from ZK, we will use the data
3069       boolean matchZK = addressFromZK.equals(serverName);
3070       LOG.debug("Checking region=" + hri.getRegionNameAsString() + ", zk server=" + addressFromZK +
3071         " current=" + serverName + ", matches=" + matchZK);
3072       return matchZK;
3073     }
3074 
3075     ServerName addressFromAM = regionStates.getRegionServerOfRegion(hri);
3076     boolean matchAM = (addressFromAM != null &&
3077       addressFromAM.equals(serverName));
3078     LOG.debug("based on AM, current region=" + hri.getRegionNameAsString() +
3079       " is on server=" + (addressFromAM != null ? addressFromAM : "null") +
3080       " server being checked: " + serverName);
3081 
3082     return matchAM;
3083   }
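       // A hedged usage sketch: shutdown handling typically asks whether the dead
       // server was carrying hbase:meta before deciding how to recover; "sn" is a
       // hypothetical ServerName:
       //
       //   if (assignmentManager.isCarryingMeta(sn)) {
       //     // recover hbase:meta first, then let SSH re-assign the user regions
       //   }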
3084 
3085   /**
3086    * Process shutdown server removing any assignments.
3087    * @param sn Server that went down.
3088    * @return list of regions in transition on this server
3089    */
3090   public List<HRegionInfo> processServerShutdown(final ServerName sn) {
3091     // Clean out any existing assignment plans for this server
3092     synchronized (this.regionPlans) {
3093       for (Iterator <Map.Entry<String, RegionPlan>> i =
3094           this.regionPlans.entrySet().iterator(); i.hasNext();) {
3095         Map.Entry<String, RegionPlan> e = i.next();
3096         ServerName otherSn = e.getValue().getDestination();
3097         // The name will be null if the region is planned for a random assign.
3098         if (otherSn != null && otherSn.equals(sn)) {
3099           // Use iterator's remove else we'll get CME
3100           i.remove();
3101         }
3102       }
3103     }
3104     List<HRegionInfo> regions = regionStates.serverOffline(watcher, sn);
3105     for (Iterator<HRegionInfo> it = regions.iterator(); it.hasNext(); ) {
3106       HRegionInfo hri = it.next();
3107       String encodedName = hri.getEncodedName();
3108 
3109       // We need a lock on the region as we could update it
3110       Lock lock = locker.acquireLock(encodedName);
3111       try {
3112         RegionState regionState =
3113           regionStates.getRegionTransitionState(encodedName);
3114         if (regionState == null
3115             || (regionState.getServerName() != null && !regionState.isOnServer(sn))
3116             || !(regionState.isFailedClose() || regionState.isOffline()
3117               || regionState.isPendingOpenOrOpening())) {
3118           LOG.info("Skip " + regionState + " since it is not opening/failed_close"
3119             + " on the dead server any more: " + sn);
3120           it.remove();
3121         } else {
3122           try {
3123             // Delete the ZNode if exists
3124             ZKAssign.deleteNodeFailSilent(watcher, hri);
3125           } catch (KeeperException ke) {
3126             server.abort("Unexpected ZK exception deleting node " + hri, ke);
3127           }
3128           if (zkTable.isDisablingOrDisabledTable(hri.getTable())) {
3129             regionStates.regionOffline(hri);
3130             it.remove();
3131             continue;
3132           }
3133           // Mark the region offline and assign it again by SSH
3134           regionStates.updateRegionState(hri, State.OFFLINE);
3135         }
3136       } finally {
3137         lock.unlock();
3138       }
3139     }
3140     return regions;
3141   }
3142 
3143   /**
3144    * @param plan Plan to execute.
3145    */
3146   public void balance(final RegionPlan plan) {
3147     HRegionInfo hri = plan.getRegionInfo();
3148     TableName tableName = hri.getTable();
3149     if (zkTable.isDisablingOrDisabledTable(tableName)) {
3150       LOG.info("Ignored moving region of disabling/disabled table "
3151         + tableName);
3152       return;
3153     }
3154 
3155     // Move the region only if it's assigned
3156     String encodedName = hri.getEncodedName();
3157     ReentrantLock lock = locker.acquireLock(encodedName);
3158     try {
3159       if (!regionStates.isRegionOnline(hri)) {
3160         RegionState state = regionStates.getRegionState(encodedName);
3161         LOG.info("Ignored moving region not assigned: " + hri + ", "
3162           + (state == null ? "not in region states" : state));
3163         return;
3164       }
3165       synchronized (this.regionPlans) {
3166         this.regionPlans.put(plan.getRegionName(), plan);
3167       }
3168       unassign(hri, false, plan.getDestination());
3169     } finally {
3170       lock.unlock();
3171     }
3172   }
3173 
3174   public void stop() {
3175     if (tomActivated){
3176       this.timeoutMonitor.interrupt();
3177       this.timerUpdater.interrupt();
3178     }
3179   }
3180 
3181   /**
3182    * Shutdown the threadpool executor service
3183    */
3184   public void shutdown() {
3185     // It's an immediate shutdown, so we're clearing the remaining tasks.
3186     synchronized (zkEventWorkerWaitingList){
3187       zkEventWorkerWaitingList.clear();
3188     }
3189     threadPoolExecutorService.shutdownNow();
3190     zkEventWorkers.shutdownNow();
3191   }
3192 
3193   protected void setEnabledTable(TableName tableName) {
3194     try {
3195       this.zkTable.setEnabledTable(tableName);
3196     } catch (KeeperException e) {
3197       // here we can abort as it is the start up flow
3198       String errorMsg = "Unable to ensure that the table " + tableName
3199           + " will be enabled because of a ZooKeeper issue";
3200       LOG.error(errorMsg);
3201       this.server.abort(errorMsg, e);
3202     }
3203   }
3204 
3205   /**
3206    * Set region as OFFLINED up in zookeeper asynchronously.
3207    * @param state
3208    * @return True if we succeeded, false otherwise (State was incorrect or failed
3209    * updating zk).
3210    */
3211   private boolean asyncSetOfflineInZooKeeper(final RegionState state,
3212       final AsyncCallback.StringCallback cb, final ServerName destination) {
3213     if (!state.isClosed() && !state.isOffline()) {
3214       this.server.abort("Unexpected state trying to OFFLINE; " + state,
3215         new IllegalStateException());
3216       return false;
3217     }
3218     regionStates.updateRegionState(state.getRegion(), State.OFFLINE);
3219     try {
3220       ZKAssign.asyncCreateNodeOffline(watcher, state.getRegion(),
3221         destination, cb, state);
3222     } catch (KeeperException e) {
3223       if (e instanceof NodeExistsException) {
3224         LOG.warn("Node for " + state.getRegion() + " already exists");
3225       } else {
3226         server.abort("Unexpected ZK exception creating/setting node OFFLINE", e);
3227       }
3228       return false;
3229     }
3230     return true;
3231   }
3232 
3233   private boolean deleteNodeInStates(String encodedName,
3234       String desc, ServerName sn, EventType... types) {
3235     try {
3236       for (EventType et: types) {
3237         if (ZKAssign.deleteNode(watcher, encodedName, et, sn)) {
3238           return true;
3239         }
3240       }
3241       LOG.info("Failed to delete the " + desc + " node for "
3242         + encodedName + ". The node type may not match");
3243     } catch (NoNodeException e) {
3244       if (LOG.isDebugEnabled()) {
3245         LOG.debug("The " + desc + " node for " + encodedName + " already deleted");
3246       }
3247     } catch (KeeperException ke) {
3248       server.abort("Unexpected ZK exception deleting " + desc
3249         + " node for the region " + encodedName, ke);
3250     }
3251     return false;
3252   }
3253 
3254   private void deleteMergingNode(String encodedName, ServerName sn) {
3255     deleteNodeInStates(encodedName, "merging", sn, EventType.RS_ZK_REGION_MERGING,
3256       EventType.RS_ZK_REQUEST_REGION_MERGE, EventType.RS_ZK_REGION_MERGED);
3257   }
3258 
3259   private void deleteSplittingNode(String encodedName, ServerName sn) {
3260     deleteNodeInStates(encodedName, "splitting", sn, EventType.RS_ZK_REGION_SPLITTING,
3261       EventType.RS_ZK_REQUEST_REGION_SPLIT, EventType.RS_ZK_REGION_SPLIT);
3262   }
3263 
3264   /**
3265    * A helper to handle region merging transition event.
3266    * It transitions merging regions to MERGING state.
3267    */
3268   private boolean handleRegionMerging(final RegionTransition rt, final String encodedName,
3269       final String prettyPrintedRegionName, final ServerName sn) {
3270     if (!serverManager.isServerOnline(sn)) {
3271       LOG.warn("Dropped merging! ServerName=" + sn + " unknown.");
3272       return false;
3273     }
3274     byte [] payloadOfMerging = rt.getPayload();
3275     List<HRegionInfo> mergingRegions;
3276     try {
3277       mergingRegions = HRegionInfo.parseDelimitedFrom(
3278         payloadOfMerging, 0, payloadOfMerging.length);
3279     } catch (IOException e) {
3280       LOG.error("Dropped merging! Failed reading "  + rt.getEventType()
3281         + " payload for " + prettyPrintedRegionName);
3282       return false;
3283     }
3284     assert mergingRegions.size() == 3;
3285     HRegionInfo p = mergingRegions.get(0);
3286     HRegionInfo hri_a = mergingRegions.get(1);
3287     HRegionInfo hri_b = mergingRegions.get(2);
3288 
3289     RegionState rs_p = regionStates.getRegionState(p);
3290     RegionState rs_a = regionStates.getRegionState(hri_a);
3291     RegionState rs_b = regionStates.getRegionState(hri_b);
3292 
3293     if (!((rs_a == null || rs_a.isOpenOrMergingOnServer(sn))
3294         && (rs_b == null || rs_b.isOpenOrMergingOnServer(sn))
3295         && (rs_p == null || rs_p.isOpenOrMergingNewOnServer(sn)))) {
3296       LOG.warn("Dropped merging! Not in state good for MERGING; rs_p="
3297         + rs_p + ", rs_a=" + rs_a + ", rs_b=" + rs_b);
3298       return false;
3299     }
3300 
3301     EventType et = rt.getEventType();
3302     if (et == EventType.RS_ZK_REQUEST_REGION_MERGE) {
3303       try {
3304         if (RegionMergeTransaction.transitionMergingNode(watcher, p,
3305             hri_a, hri_b, sn, -1, EventType.RS_ZK_REQUEST_REGION_MERGE,
3306             EventType.RS_ZK_REGION_MERGING) == -1) {
3307           byte[] data = ZKAssign.getData(watcher, encodedName);
3308           EventType currentType = null;
3309           if (data != null) {
3310             RegionTransition newRt = RegionTransition.parseFrom(data);
3311             currentType = newRt.getEventType();
3312           }
3313           if (currentType == null || (currentType != EventType.RS_ZK_REGION_MERGED
3314               && currentType != EventType.RS_ZK_REGION_MERGING)) {
3315             LOG.warn("Failed to transition pending_merge node "
3316               + encodedName + " to merging, it's now " + currentType);
3317             return false;
3318           }
3319         }
3320       } catch (Exception e) {
3321         LOG.warn("Failed to transition pending_merge node "
3322           + encodedName + " to merging", e);
3323         return false;
3324       }
3325     }
3326 
3327     synchronized (regionStates) {
3328       regionStates.updateRegionState(hri_a, State.MERGING);
3329       regionStates.updateRegionState(hri_b, State.MERGING);
3330       regionStates.updateRegionState(p, State.MERGING_NEW, sn);
3331 
3332       if (et != EventType.RS_ZK_REGION_MERGED) {
3333         regionStates.regionOffline(p, State.MERGING_NEW);
3334         this.mergingRegions.put(encodedName,
3335           new PairOfSameType<HRegionInfo>(hri_a, hri_b));
3336       } else {
3337         this.mergingRegions.remove(encodedName);
3338         regionOffline(hri_a, State.MERGED);
3339         regionOffline(hri_b, State.MERGED);
3340         regionOnline(p, sn);
3341       }
3342     }
3343 
3344     if (et == EventType.RS_ZK_REGION_MERGED) {
3345       LOG.debug("Handling MERGED event for " + encodedName + "; deleting node");
3346       // Remove region from ZK
3347       try {
3348         boolean successful = false;
3349         while (!successful) {
3350           // It's possible that the RS touches the znode in between the reading
3351           // of the znode and the deleting, so it's safe to retry.
3352           successful = ZKAssign.deleteNode(watcher, encodedName,
3353             EventType.RS_ZK_REGION_MERGED, sn);
3354         }
3355       } catch (KeeperException e) {
3356         if (e instanceof NoNodeException) {
3357           String znodePath = ZKUtil.joinZNode(watcher.splitLogZNode, encodedName);
3358           LOG.debug("The znode " + znodePath + " does not exist.  May be deleted already.");
3359         } else {
3360           server.abort("Error deleting MERGED node " + encodedName, e);
3361         }
3362       }
3363       LOG.info("Handled MERGED event; merged=" + p.getRegionNameAsString()
3364         + ", region_a=" + hri_a.getRegionNameAsString() + ", region_b="
3365         + hri_b.getRegionNameAsString() + ", on " + sn);
3366 
3367       // User could disable the table before master knows the new region.
3368       if (zkTable.isDisablingOrDisabledTable(p.getTable())) {
3369         unassign(p);
3370       }
3371     }
3372     return true;
3373   }
3374 
3375   /**
3376    * A helper to handle region splitting transition event.
3377    */
3378   private boolean handleRegionSplitting(final RegionTransition rt, final String encodedName,
3379       final String prettyPrintedRegionName, final ServerName sn) {
3380     if (!serverManager.isServerOnline(sn)) {
3381       LOG.warn("Dropped splitting! ServerName=" + sn + " unknown.");
3382       return false;
3383     }
3384     byte [] payloadOfSplitting = rt.getPayload();
3385     List<HRegionInfo> splittingRegions;
3386     try {
3387       splittingRegions = HRegionInfo.parseDelimitedFrom(
3388         payloadOfSplitting, 0, payloadOfSplitting.length);
3389     } catch (IOException e) {
3390       LOG.error("Dropped splitting! Failed reading " + rt.getEventType()
3391         + " payload for " + prettyPrintedRegionName);
3392       return false;
3393     }
3394     assert splittingRegions.size() == 2;
3395     HRegionInfo hri_a = splittingRegions.get(0);
3396     HRegionInfo hri_b = splittingRegions.get(1);
3397 
3398     RegionState rs_p = regionStates.getRegionState(encodedName);
3399     RegionState rs_a = regionStates.getRegionState(hri_a);
3400     RegionState rs_b = regionStates.getRegionState(hri_b);
3401 
3402     if (!((rs_p == null || rs_p.isOpenOrSplittingOnServer(sn))
3403         && (rs_a == null || rs_a.isOpenOrSplittingNewOnServer(sn))
3404         && (rs_b == null || rs_b.isOpenOrSplittingNewOnServer(sn)))) {
3405       LOG.warn("Dropped splitting! Not in state good for SPLITTING; rs_p="
3406         + rs_p + ", rs_a=" + rs_a + ", rs_b=" + rs_b);
3407       return false;
3408     }
3409 
3410     if (rs_p == null) {
3411       // Splitting region should be online
3412       rs_p = regionStates.updateRegionState(rt, State.OPEN);
3413       if (rs_p == null) {
3414         LOG.warn("Received splitting for region " + prettyPrintedRegionName
3415           + " from server " + sn + " but it doesn't exist anymore,"
3416           + " probably already processed its split");
3417         return false;
3418       }
3419       regionStates.regionOnline(rs_p.getRegion(), sn);
3420     }
3421 
3422     HRegionInfo p = rs_p.getRegion();
3423     EventType et = rt.getEventType();
3424     if (et == EventType.RS_ZK_REQUEST_REGION_SPLIT) {
3425       try {
3426         if (SplitTransaction.transitionSplittingNode(watcher, p,
3427             hri_a, hri_b, sn, -1, EventType.RS_ZK_REQUEST_REGION_SPLIT,
3428             EventType.RS_ZK_REGION_SPLITTING) == -1) {
3429           byte[] data = ZKAssign.getData(watcher, encodedName);
3430           EventType currentType = null;
3431           if (data != null) {
3432             RegionTransition newRt = RegionTransition.parseFrom(data);
3433             currentType = newRt.getEventType();
3434           }
3435           if (currentType == null || (currentType != EventType.RS_ZK_REGION_SPLIT
3436               && currentType != EventType.RS_ZK_REGION_SPLITTING)) {
3437             LOG.warn("Failed to transition pending_split node "
3438               + encodedName + " to splitting, it's now " + currentType);
3439             return false;
3440           }
3441         }
3442       } catch (Exception e) {
3443         LOG.warn("Failed to transition pending_split node "
3444           + encodedName + " to splitting", e);
3445         return false;
3446       }
3447     }
3448 
3449     synchronized (regionStates) {
3450       regionStates.updateRegionState(hri_a, State.SPLITTING_NEW, sn);
3451       regionStates.updateRegionState(hri_b, State.SPLITTING_NEW, sn);
3452       regionStates.regionOffline(hri_a, State.SPLITTING_NEW);
3453       regionStates.regionOffline(hri_b, State.SPLITTING_NEW);
3454       regionStates.updateRegionState(rt, State.SPLITTING);
3455 
3456       // The below is for testing ONLY!  We can't do fault injection easily, so
3457       // resort to this kinda ugliness -- St.Ack 02/25/2011.
3458       if (TEST_SKIP_SPLIT_HANDLING) {
3459         LOG.warn("Skipping split message, TEST_SKIP_SPLIT_HANDLING is set");
3460         return true; // return true so that the splitting node stays
3461       }
3462 
3463       if (et == EventType.RS_ZK_REGION_SPLIT) {
3464         regionOffline(p, State.SPLIT);
3465         regionOnline(hri_a, sn);
3466         regionOnline(hri_b, sn);
3467       }
3468     }
3469 
3470     if (et == EventType.RS_ZK_REGION_SPLIT) {
3471       LOG.debug("Handling SPLIT event for " + encodedName + "; deleting node");
3472       // Remove region from ZK
3473       try {
3474         boolean successful = false;
3475         while (!successful) {
3476           // It's possible that the RS touches the znode in between the reading
3477           // of the znode and the deleting, so it's safe to retry.
3478           successful = ZKAssign.deleteNode(watcher, encodedName,
3479             EventType.RS_ZK_REGION_SPLIT, sn);
3480         }
3481       } catch (KeeperException e) {
3482         if (e instanceof NoNodeException) {
3483           String znodePath = ZKUtil.joinZNode(watcher.splitLogZNode, encodedName);
3484           LOG.debug("The znode " + znodePath + " does not exist.  May be deleted already.");
3485         } else {
3486           server.abort("Error deleting SPLIT node " + encodedName, e);
3487         }
3488       }
3489       LOG.info("Handled SPLIT event; parent=" + p.getRegionNameAsString()
3490         + ", daughter a=" + hri_a.getRegionNameAsString() + ", daughter b="
3491         + hri_b.getRegionNameAsString() + ", on " + sn);
3492 
3493       // User could disable the table before master knows the new region.
3494       if (zkTable.isDisablingOrDisabledTable(p.getTable())) {
3495         unassign(hri_a);
3496         unassign(hri_b);
3497       }
3498     }
3499     return true;
3500   }
3501 
3502   /**
3503    * Marks a region as offline.  The new state is the specified one,
3504    * if not null; if the specified state is null, the new state is Offline.
3505    * The specified state can only be Split, Merged, Offline or null.
3506    */
3507   private void regionOffline(final HRegionInfo regionInfo, final State state) {
3508     regionStates.regionOffline(regionInfo, state);
3509     removeClosedRegion(regionInfo);
3510     // remove the region plan as well just in case.
3511     clearRegionPlan(regionInfo);
3512   }
3513 
3514   /**
3515    * @return Instance of load balancer
3516    */
3517   public LoadBalancer getBalancer() {
3518     return this.balancer;
3519   }
3520 }