1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.master;
20  
21  import java.io.IOException;
22  import java.util.ArrayList;
23  import java.util.Arrays;
24  import java.util.Collections;
25  import java.util.HashMap;
26  import java.util.HashSet;
27  import java.util.Iterator;
28  import java.util.List;
29  import java.util.Map;
30  import java.util.NavigableMap;
31  import java.util.Set;
32  import java.util.TreeMap;
33  import java.util.concurrent.ConcurrentHashMap;
34  import java.util.concurrent.ConcurrentSkipListSet;
35  import java.util.concurrent.ThreadFactory;
36  import java.util.concurrent.TimeUnit;
37  import java.util.concurrent.atomic.AtomicBoolean;
38  import java.util.concurrent.atomic.AtomicInteger;
39  import java.util.concurrent.locks.Lock;
40  import java.util.concurrent.locks.ReentrantLock;
41  
42  import org.apache.commons.logging.Log;
43  import org.apache.commons.logging.LogFactory;
44  import org.apache.hadoop.classification.InterfaceAudience;
45  import org.apache.hadoop.conf.Configuration;
46  import org.apache.hadoop.hbase.Chore;
47  import org.apache.hadoop.hbase.HBaseIOException;
48  import org.apache.hadoop.hbase.HConstants;
49  import org.apache.hadoop.hbase.HRegionInfo;
50  import org.apache.hadoop.hbase.NotServingRegionException;
51  import org.apache.hadoop.hbase.RegionTransition;
52  import org.apache.hadoop.hbase.Server;
53  import org.apache.hadoop.hbase.ServerName;
54  import org.apache.hadoop.hbase.Stoppable;
55  import org.apache.hadoop.hbase.TableName;
56  import org.apache.hadoop.hbase.TableNotFoundException;
57  import org.apache.hadoop.hbase.catalog.CatalogTracker;
58  import org.apache.hadoop.hbase.catalog.MetaReader;
59  import org.apache.hadoop.hbase.client.Result;
60  import org.apache.hadoop.hbase.exceptions.DeserializationException;
61  import org.apache.hadoop.hbase.executor.EventHandler;
62  import org.apache.hadoop.hbase.executor.EventType;
63  import org.apache.hadoop.hbase.executor.ExecutorService;
64  import org.apache.hadoop.hbase.ipc.RpcClient;
65  import org.apache.hadoop.hbase.ipc.RpcClient.FailedServerException;
67  import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
68  import org.apache.hadoop.hbase.master.RegionState.State;
69  import org.apache.hadoop.hbase.master.balancer.FavoredNodeAssignmentHelper;
70  import org.apache.hadoop.hbase.master.balancer.FavoredNodeLoadBalancer;
71  import org.apache.hadoop.hbase.master.handler.ClosedRegionHandler;
72  import org.apache.hadoop.hbase.master.handler.DisableTableHandler;
73  import org.apache.hadoop.hbase.master.handler.EnableTableHandler;
74  import org.apache.hadoop.hbase.master.handler.OpenedRegionHandler;
75  import org.apache.hadoop.hbase.regionserver.RegionAlreadyInTransitionException;
76  import org.apache.hadoop.hbase.regionserver.RegionMergeTransaction;
77  import org.apache.hadoop.hbase.regionserver.RegionOpeningState;
78  import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
79  import org.apache.hadoop.hbase.regionserver.SplitTransaction;
80  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
81  import org.apache.hadoop.hbase.util.KeyLocker;
82  import org.apache.hadoop.hbase.util.Pair;
83  import org.apache.hadoop.hbase.util.PairOfSameType;
84  import org.apache.hadoop.hbase.util.Threads;
85  import org.apache.hadoop.hbase.util.Triple;
86  import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
87  import org.apache.hadoop.hbase.zookeeper.ZKAssign;
88  import org.apache.hadoop.hbase.zookeeper.ZKTable;
89  import org.apache.hadoop.hbase.zookeeper.ZKUtil;
90  import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener;
91  import org.apache.hadoop.ipc.RemoteException;
92  import org.apache.zookeeper.AsyncCallback;
93  import org.apache.zookeeper.KeeperException;
94  import org.apache.zookeeper.KeeperException.NoNodeException;
95  import org.apache.zookeeper.KeeperException.NodeExistsException;
96  import org.apache.zookeeper.data.Stat;
97  
98  import com.google.common.base.Preconditions;
99  import com.google.common.collect.LinkedHashMultimap;
100 
101 /**
102  * Manages and performs region assignment.
103  * <p>
104  * Monitors ZooKeeper for events related to regions in transition.
105  * <p>
106  * Handles existing regions in transition during master failover.
107  */
108 @InterfaceAudience.Private
109 public class AssignmentManager extends ZooKeeperListener {
110   private static final Log LOG = LogFactory.getLog(AssignmentManager.class);
111 
112   public static final ServerName HBCK_CODE_SERVERNAME = ServerName.valueOf(HConstants.HBCK_CODE_NAME,
113       -1, -1L);
114 
115   public static final String ASSIGNMENT_TIMEOUT = "hbase.master.assignment.timeoutmonitor.timeout";
116   public static final int DEFAULT_ASSIGNMENT_TIMEOUT_DEFAULT = 600000;
117   public static final String ASSIGNMENT_TIMEOUT_MANAGEMENT = "hbase.assignment.timeout.management";
118   public static final boolean DEFAULT_ASSIGNMENT_TIMEOUT_MANAGEMENT = false;
119 
120   public static final String ALREADY_IN_TRANSITION_WAITTIME
121     = "hbase.assignment.already.intransition.waittime";
122   public static final int DEFAULT_ALREADY_IN_TRANSITION_WAITTIME = 60000; // 1 minute
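  // Illustrative only (not from the original source): these assignment settings can be
  // overridden in hbase-site.xml, for example:
  //   <property>
  //     <name>hbase.assignment.already.intransition.waittime</name>
  //     <value>30000</value>
  //   </property>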
123 
124   protected final Server server;
125 
126   private ServerManager serverManager;
127 
128   private boolean shouldAssignRegionsWithFavoredNodes;
129 
130   private CatalogTracker catalogTracker;
131 
132   protected final TimeoutMonitor timeoutMonitor;
133 
134   private final TimerUpdater timerUpdater;
135 
136   private LoadBalancer balancer;
137 
138   private final MetricsAssignmentManager metricsAssignmentManager;
139 
140   private final TableLockManager tableLockManager;
141 
142   private AtomicInteger numRegionsOpened = new AtomicInteger(0);
143 
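  // Per-region locks keyed by encoded region name; used throughout this class to
  // serialize assign/unassign and ZK event handling for the same region.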
144   final private KeyLocker<String> locker = new KeyLocker<String>();
145 
146   /**
147    * Map of regions to reopen after the schema of a table is changed. Key -
148    * encoded region name, value - HRegionInfo
149    */
150   private final Map <String, HRegionInfo> regionsToReopen;
151 
152   /*
153    * Maximum times we recurse an assignment/unassignment.
154    * See below in {@link #assign()} and {@link #unassign()}.
155    */
156   private final int maximumAttempts;
157 
158   /**
159    * Map from the encoded name of the region to be created by a merge to the pair of regions being merged.
160    */
161   private final Map<String, PairOfSameType<HRegionInfo>> mergingRegions
162     = new HashMap<String, PairOfSameType<HRegionInfo>>();
163 
164   /**
165    * The time for which the assignment will sleep before retrying an hbase:meta assignment
166    * that failed because no region plan was available.
167    */
168   private final long sleepTimeBeforeRetryingMetaAssignment;
169 
170   /** Plans for region movement. Key is the encoded version of a region name*/
171   // TODO: When do plans get cleaned out?  Ever? In server open and in server
172   // shutdown processing -- St.Ack
173   // All access to this Map must be synchronized.
174   final NavigableMap<String, RegionPlan> regionPlans =
175     new TreeMap<String, RegionPlan>();
176 
177   private final ZKTable zkTable;
178 
179   /**
180    * Contains the servers whose timers need updating; these servers will be
181    * handled by {@link TimerUpdater}
182    */
183   private final ConcurrentSkipListSet<ServerName> serversInUpdatingTimer;
184 
185   private final ExecutorService executorService;
186 
187   // For unit tests, keep track of calls to ClosedRegionHandler
188   private Map<HRegionInfo, AtomicBoolean> closedRegionHandlerCalled = null;
189 
190   // For unit tests, keep track of calls to OpenedRegionHandler
191   private Map<HRegionInfo, AtomicBoolean> openedRegionHandlerCalled = null;
192 
193   //Thread pool executor service for timeout monitor
194   private java.util.concurrent.ExecutorService threadPoolExecutorService;
195 
196   // A bunch of ZK events workers. Each is a single thread executor service
197   private final java.util.concurrent.ExecutorService zkEventWorkers;
198 
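  // Region transition event types that are still handled even if the reporting
  // region server is no longer online (see the server-online check in handleRegion()).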
199   private List<EventType> ignoreStatesRSOffline = Arrays.asList(
200       EventType.RS_ZK_REGION_FAILED_OPEN, EventType.RS_ZK_REGION_CLOSED);
201 
202   private final RegionStates regionStates;
203 
204   // The thresholds for using bulk assignment. Bulk assignment is used
205   // only if assigning at least this many regions to at least this
206   // many servers. If assigning fewer regions to fewer servers,
207   // bulk assigning may not be as efficient.
208   private final int bulkAssignThresholdRegions;
209   private final int bulkAssignThresholdServers;
210 
211   // Should bulk assignment wait till all regions are assigned,
212   // or until it times out?  This is useful for measuring bulk assignment
213   // performance, but not needed in most use cases.
214   private final boolean bulkAssignWaitTillAllAssigned;
215 
216   /**
217    * Indicator that AssignmentManager has recovered the region states so
218    * that ServerShutdownHandler can be fully enabled and re-assign regions
219    * of dead servers; this ensures that when re-assignment happens, AssignmentManager
220    * has proper region states.
221    *
222    * Protected to ease testing.
223    */
224   protected final AtomicBoolean failoverCleanupDone = new AtomicBoolean(false);
225 
226   /** Is timeout management (the TimeoutMonitor) activated? */
227   private final boolean tomActivated;
228 
229   /**
230    * A map to track how many times in a row a region has failed to open.
231    * So that we don't try to open a region forever if the failure is
232    * unrecoverable.  We don't put this information in region states
233    * because we don't expect this to happen frequently; we don't
234    * want to copy this information over during each state transition either.
235    */
236   private final ConcurrentHashMap<String, AtomicInteger>
237     failedOpenTracker = new ConcurrentHashMap<String, AtomicInteger>();
238 
239   /**
240    * For testing only!  Set to true to skip handling of split.
241    */
242   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="MS_SHOULD_BE_FINAL")
243   public static boolean TEST_SKIP_SPLIT_HANDLING = false;
244 
245   /**
246    * Constructs a new assignment manager.
247    *
248    * @param server
249    * @param serverManager
250    * @param catalogTracker
251    * @param service
252    * @throws KeeperException
253    * @throws IOException
254    */
255   public AssignmentManager(Server server, ServerManager serverManager,
256       CatalogTracker catalogTracker, final LoadBalancer balancer,
257       final ExecutorService service, MetricsMaster metricsMaster,
258       final TableLockManager tableLockManager) throws KeeperException, IOException {
259     super(server.getZooKeeper());
260     this.server = server;
261     this.serverManager = serverManager;
262     this.catalogTracker = catalogTracker;
263     this.executorService = service;
264     this.regionsToReopen = Collections.synchronizedMap
265                            (new HashMap<String, HRegionInfo> ());
266     Configuration conf = server.getConfiguration();
267     // Only read favored nodes if using the favored nodes load balancer.
268     this.shouldAssignRegionsWithFavoredNodes = conf.getClass(
269            HConstants.HBASE_MASTER_LOADBALANCER_CLASS, Object.class).equals(
270            FavoredNodeLoadBalancer.class);
271     this.tomActivated = conf.getBoolean(
272       ASSIGNMENT_TIMEOUT_MANAGEMENT, DEFAULT_ASSIGNMENT_TIMEOUT_MANAGEMENT);
273     if (tomActivated){
274       this.serversInUpdatingTimer =  new ConcurrentSkipListSet<ServerName>();
275       this.timeoutMonitor = new TimeoutMonitor(
276         conf.getInt("hbase.master.assignment.timeoutmonitor.period", 30000),
277         server, serverManager,
278         conf.getInt(ASSIGNMENT_TIMEOUT, DEFAULT_ASSIGNMENT_TIMEOUT_DEFAULT));
279       this.timerUpdater = new TimerUpdater(conf.getInt(
280         "hbase.master.assignment.timerupdater.period", 10000), server);
281       Threads.setDaemonThreadRunning(timerUpdater.getThread(),
282         server.getServerName() + ".timerUpdater");
283     } else {
284       this.serversInUpdatingTimer =  null;
285       this.timeoutMonitor = null;
286       this.timerUpdater = null;
287     }
288     this.zkTable = new ZKTable(this.watcher);
289     // This is the max attempts, not retries, so it should be at least 1.
290     this.maximumAttempts = Math.max(1,
291       this.server.getConfiguration().getInt("hbase.assignment.maximum.attempts", 10));
292     this.sleepTimeBeforeRetryingMetaAssignment = this.server.getConfiguration().getLong(
293         "hbase.meta.assignment.retry.sleeptime", 1000l);
294     this.balancer = balancer;
295     int maxThreads = conf.getInt("hbase.assignment.threads.max", 30);
296     this.threadPoolExecutorService = Threads.getBoundedCachedThreadPool(
297       maxThreads, 60L, TimeUnit.SECONDS, Threads.newDaemonThreadFactory("AM."));
298     this.regionStates = new RegionStates(server, serverManager);
299 
300     this.bulkAssignWaitTillAllAssigned =
301       conf.getBoolean("hbase.bulk.assignment.waittillallassigned", false);
302     this.bulkAssignThresholdRegions = conf.getInt("hbase.bulk.assignment.threshold.regions", 7);
303     this.bulkAssignThresholdServers = conf.getInt("hbase.bulk.assignment.threshold.servers", 3);
304 
305     int workers = conf.getInt("hbase.assignment.zkevent.workers", 20);
306     ThreadFactory threadFactory = Threads.newDaemonThreadFactory("AM.ZK.Worker");
307     zkEventWorkers = Threads.getBoundedCachedThreadPool(workers, 60L,
308             TimeUnit.SECONDS, threadFactory);
309     this.tableLockManager = tableLockManager;
310 
311     this.metricsAssignmentManager = new MetricsAssignmentManager();
312   }
313 
314   void startTimeOutMonitor() {
315     if (tomActivated) {
316       Threads.setDaemonThreadRunning(timeoutMonitor.getThread(), server.getServerName()
317           + ".timeoutMonitor");
318     }
319   }
320 
321   /**
322    * @return Instance of ZKTable.
323    */
324   public ZKTable getZKTable() {
325     // These are 'expensive' to make, involving a trip to the zk ensemble, so allow
326     // sharing.
327     return this.zkTable;
328   }
329 
330   /**
331    * This SHOULD not be public. It is public now
332    * because of some unit tests.
333    *
334    * TODO: make it package private and keep RegionStates in the master package
335    */
336   public RegionStates getRegionStates() {
337     return regionStates;
338   }
339 
340   public RegionPlan getRegionReopenPlan(HRegionInfo hri) {
341     return new RegionPlan(hri, null, regionStates.getRegionServerOfRegion(hri));
342   }
343 
344   /**
345    * Add a regionPlan for the specified region.
346    * @param encodedName
347    * @param plan
348    */
349   public void addPlan(String encodedName, RegionPlan plan) {
350     synchronized (regionPlans) {
351       regionPlans.put(encodedName, plan);
352     }
353   }
354 
355   /**
356    * Add a map of region plans.
357    */
358   public void addPlans(Map<String, RegionPlan> plans) {
359     synchronized (regionPlans) {
360       regionPlans.putAll(plans);
361     }
362   }
363 
364   /**
365    * Set the list of regions that will be reopened
366    * because of an update in table schema
367    *
368    * @param regions
369    *          list of regions that should be tracked for reopen
370    */
371   public void setRegionsToReopen(List <HRegionInfo> regions) {
372     for(HRegionInfo hri : regions) {
373       regionsToReopen.put(hri.getEncodedName(), hri);
374     }
375   }
376 
377   /**
378    * Used by the client to identify if all regions have the schema updates
379    *
380    * @param tableName
381    * @return Pair indicating the status of the alter command
382    * @throws IOException
383    */
384   public Pair<Integer, Integer> getReopenStatus(TableName tableName)
385       throws IOException {
386     List <HRegionInfo> hris =
387       MetaReader.getTableRegions(this.server.getCatalogTracker(), tableName, true);
388     Integer pending = 0;
389     for (HRegionInfo hri : hris) {
390       String name = hri.getEncodedName();
391       // no lock concurrent access ok: sequential consistency respected.
392       if (regionsToReopen.containsKey(name)
393           || regionStates.isRegionInTransition(name)) {
394         pending++;
395       }
396     }
397     return new Pair<Integer, Integer>(pending, hris.size());
398   }
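  // Illustrative reading of the result above: a Pair of (0, n) means none of the
  // table's n regions are still pending reopen, i.e. the schema change has been
  // fully applied.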
399 
400   /**
401    * Used by ServerShutdownHandler to make sure AssignmentManager has completed
402    * the failover cleanup before re-assigning regions of dead servers, so that
403    * when re-assignment happens, AssignmentManager has proper region states.
404    */
405   public boolean isFailoverCleanupDone() {
406     return failoverCleanupDone.get();
407   }
408 
409   /**
410    * To avoid racing with AM, external entities may need to lock a region,
411    * for example, when SSH checks what regions to skip re-assigning.
412    */
413   public Lock acquireRegionLock(final String encodedName) {
414     return locker.acquireLock(encodedName);
415   }
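  // Illustrative caller pattern (hypothetical variable names): release the lock in a
  // finally block, mirroring the internal usage elsewhere in this class:
  //   Lock lock = assignmentManager.acquireRegionLock(encodedName);
  //   try {
  //     // ... inspect or update state for this region ...
  //   } finally {
  //     lock.unlock();
  //   }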
416 
417   /**
418    * Called once failover cleanup is completed. Notifies the server manager to
419    * process queued-up dead servers, if any.
420    */
421   void failoverCleanupDone() {
422     failoverCleanupDone.set(true);
423     serverManager.processQueuedDeadServers();
424   }
425 
426   /**
427    * Called on startup.
428    * Figures whether this is a fresh cluster start or we are joining an extant running cluster.
429    * @throws IOException
430    * @throws KeeperException
431    * @throws InterruptedException
432    */
433   void joinCluster() throws IOException,
434       KeeperException, InterruptedException {
435     // Concurrency note: In the below the accesses on regionsInTransition are
436     // outside of a synchronization block where usually all accesses to RIT are
437     // synchronized.  The presumption is that in this case it is safe since this
438     // method is being played by a single thread on startup.
439 
440     // TODO: Regions that have a null location and are not in regionsInTransitions
441     // need to be handled.
442 
443     // Scan hbase:meta to build list of existing regions, servers, and assignment
444     // Returns servers who have not checked in (assumed dead) and their regions
445     Map<ServerName, List<HRegionInfo>> deadServers = rebuildUserRegions();
446 
447     // This method will assign all user regions if this is a clean server startup;
448     // otherwise it will reconstruct master state and clean up any leftovers from
449     // the previous master process.
450     processDeadServersAndRegionsInTransition(deadServers);
451 
452     recoverTableInDisablingState();
453     recoverTableInEnablingState();
454   }
455 
456   /**
457    * Processes all regions that are in transition in zookeeper and also
458    * processes the list of dead servers by scanning hbase:meta.
459    * Used by a master joining a cluster.  If we figure this is a clean cluster
460    * startup, will assign all user regions.
461    * @param deadServers
462    *          Map of dead servers and their regions. Can be null.
463    * @throws KeeperException
464    * @throws IOException
465    * @throws InterruptedException
466    */
467   void processDeadServersAndRegionsInTransition(
468       final Map<ServerName, List<HRegionInfo>> deadServers)
469           throws KeeperException, IOException, InterruptedException {
470     List<String> nodes = ZKUtil.listChildrenNoWatch(watcher,
471       watcher.assignmentZNode);
472 
473     if (nodes == null) {
474       String errorMessage = "Failed to get the children from ZK";
475       server.abort(errorMessage, new IOException(errorMessage));
476       return;
477     }
478 
479     boolean failover = (!serverManager.getDeadServers().isEmpty() || !serverManager
480         .getRequeuedDeadServers().isEmpty());
481 
482     if (!failover) {
483       // If any one region except meta is assigned, it's a failover.
484       Map<HRegionInfo, ServerName> regions = regionStates.getRegionAssignments();
485       for (HRegionInfo hri: regions.keySet()) {
486         if (!hri.isMetaTable()) {
487           LOG.debug("Found " + hri + " out on cluster");
488           failover = true;
489           break;
490         }
491       }
492       if (!failover) {
493         // If any one region except meta is in transition, it's a failover.
494         for (String encodedName: nodes) {
495           RegionState state = regionStates.getRegionState(encodedName);
496           if (state != null && !state.getRegion().isMetaRegion()) {
497             LOG.debug("Found " + state.getRegion().getRegionNameAsString() + " in RITs");
498             failover = true;
499             break;
500           }
501         }
502       }
503     }
504 
505     // If we found user regions out on the cluster, it's a failover.
506     if (failover) {
507       LOG.info("Found regions out on cluster or in RIT; presuming failover");
508       // Process list of dead servers and regions in RIT.
509       // See HBASE-4580 for more information.
510       processDeadServersAndRecoverLostRegions(deadServers);
511     } else {
512       // Fresh cluster startup.
513       LOG.info("Clean cluster startup. Assigning userregions");
514       assignAllUserRegions();
515     }
516   }
517 
518   /**
519    * If region is up in zk in transition, then do fixup and block and wait until
520    * the region is assigned and out of transition.  Used on startup for
521    * catalog regions.
522    * @param hri Region to look for.
523    * @return True if we processed a region in transition else false if region
524    * was not up in zk in transition.
525    * @throws InterruptedException
526    * @throws KeeperException
527    * @throws IOException
528    */
529   boolean processRegionInTransitionAndBlockUntilAssigned(final HRegionInfo hri)
530       throws InterruptedException, KeeperException, IOException {
531     String encodedRegionName = hri.getEncodedName();
532     if (!processRegionInTransition(encodedRegionName, hri)) {
533       return false; // The region is not in transition
534     }
535     LOG.debug("Waiting on " + HRegionInfo.prettyPrint(encodedRegionName));
536     while (!this.server.isStopped() &&
537         this.regionStates.isRegionInTransition(encodedRegionName)) {
538       RegionState state = this.regionStates.getRegionTransitionState(encodedRegionName);
539       if (state == null || !serverManager.isServerOnline(state.getServerName())) {
540         // The region is not in transition, or not in transition on an online
541         // server. Doesn't help to block here any more. Caller needs to
542         // verify the region is actually assigned.
543         break;
544       }
545       this.regionStates.waitForUpdate(100);
546     }
547     return true;
548   }
549 
550   /**
551    * Process failover of new master for region <code>encodedRegionName</code>
552    * up in zookeeper.
553    * @param encodedRegionName Region to process failover for.
554    * @param regionInfo If null we'll go get it from meta table.
555    * @return True if we processed <code>regionInfo</code> as a RIT.
556    * @throws KeeperException
557    * @throws IOException
558    */
559   boolean processRegionInTransition(final String encodedRegionName,
560       final HRegionInfo regionInfo) throws KeeperException, IOException {
561     // We need a lock here to ensure that we will not put the same region twice
562     // It has no reason to be a lock shared with the other operations.
563     // We can do the lock on the region only, instead of a global lock: what we want to ensure
564     // is that we don't have two threads working on the same region.
565     Lock lock = locker.acquireLock(encodedRegionName);
566     try {
567       Stat stat = new Stat();
568       byte [] data = ZKAssign.getDataAndWatch(watcher, encodedRegionName, stat);
569       if (data == null) return false;
570       RegionTransition rt;
571       try {
572         rt = RegionTransition.parseFrom(data);
573       } catch (DeserializationException e) {
574         LOG.warn("Failed parse znode data", e);
575         return false;
576       }
577       HRegionInfo hri = regionInfo;
578       if (hri == null) {
579         // The region info is not passed in. We will try to find the region
580         // from region states map/meta based on the encoded region name. But we
581         // may not be able to find it. This is valid for online merge: the
582         // region may not have been created yet if the merge is not completed,
583         // so it is not in meta at master recovery time.
584         hri = regionStates.getRegionInfo(rt.getRegionName());
585         EventType et = rt.getEventType();
586         if (hri == null && et != EventType.RS_ZK_REGION_MERGING
587             && et != EventType.RS_ZK_REQUEST_REGION_MERGE) {
588           LOG.warn("Couldn't find the region in recovering " + rt);
589           return false;
590         }
591       }
592       return processRegionsInTransition(
593         rt, hri, stat.getVersion());
594     } finally {
595       lock.unlock();
596     }
597   }
598 
599   /**
600    * This call is invoked only when (1) the master assigns meta,
601    * or (2) during failover-mode startup, when processing zk assignment nodes.
602    * The locker is set in the caller. It returns true if the region
603    * is in transition for sure, false otherwise.
604    *
605    * It should be private but it is used by some tests too.
606    */
607   boolean processRegionsInTransition(
608       final RegionTransition rt, final HRegionInfo regionInfo,
609       final int expectedVersion) throws KeeperException {
610     EventType et = rt.getEventType();
611     // Get ServerName.  Cannot be null.
612     final ServerName sn = rt.getServerName();
613     final byte[] regionName = rt.getRegionName();
614     final String encodedName = HRegionInfo.encodeRegionName(regionName);
615     final String prettyPrintedRegionName = HRegionInfo.prettyPrint(encodedName);
616     LOG.info("Processing " + prettyPrintedRegionName + " in state: " + et);
617 
618     if (regionStates.isRegionInTransition(encodedName)) {
619       LOG.info("Processed region " + prettyPrintedRegionName + " in state: "
620         + et + ", does nothing since the region is already in transition "
621         + regionStates.getRegionTransitionState(encodedName));
622       // Just return
623       return true;
624     }
625     if (!serverManager.isServerOnline(sn)) {
626       // It was transitioning on a dead server, so it's closed now.
627       // Force to OFFLINE and put it in transition, but not assign it
628       // since log splitting for the dead server is not done yet.
629       LOG.debug("RIT " + encodedName + " in state=" + rt.getEventType() +
630         " was on a dead server; forcing offline");
631       if (regionStates.isRegionOnline(regionInfo)) {
632         // Meta could still show the region is assigned to the previous
633         // server. If that server is online, when we reload the meta, the
634         // region is put back to online, we need to offline it.
635         regionStates.regionOffline(regionInfo);
636       }
637       // Put it back in transition so that SSH can re-assign it
638       regionStates.updateRegionState(regionInfo, State.OFFLINE, sn);
639 
640       if (regionInfo.isMetaRegion()) {
641         // If it's meta region, reset the meta location.
642         // So that master knows the right meta region server.
643         MetaRegionTracker.setMetaLocation(watcher, sn);
644       } else {
645         // No matter whether the previous server is online or offline,
646         // we need to reset the last region server of the region.
647         regionStates.setLastRegionServerOfRegion(sn, encodedName);
648         // Make sure we know the server is dead.
649         if (!serverManager.isServerDead(sn)) {
650           serverManager.expireServer(sn);
651         }
652       }
653       return false;
654     }
655     switch (et) {
656       case M_ZK_REGION_CLOSING:
657         // Insert into RIT & resend the query to the region server: maybe the previous master
658         // died before sending the query the first time.
659         final RegionState rsClosing = regionStates.updateRegionState(rt, State.CLOSING);
660         this.executorService.submit(
661           new EventHandler(server, EventType.M_MASTER_RECOVERY) {
662             @Override
663             public void process() throws IOException {
664               ReentrantLock lock = locker.acquireLock(regionInfo.getEncodedName());
665               try {
666                 unassign(regionInfo, rsClosing, expectedVersion, null, true, null);
667                 if (regionStates.isRegionOffline(regionInfo)) {
668                   assign(regionInfo, true);
669                 }
670               } finally {
671                 lock.unlock();
672               }
673             }
674           });
675         break;
676 
677       case RS_ZK_REGION_CLOSED:
678       case RS_ZK_REGION_FAILED_OPEN:
679         // Region is closed, insert into RIT and handle it
680         regionStates.updateRegionState(regionInfo, State.CLOSED, sn);
681         invokeAssign(regionInfo);
682         break;
683 
684       case M_ZK_REGION_OFFLINE:
685         // Insert in RIT and resend to the regionserver
686         regionStates.updateRegionState(rt, State.PENDING_OPEN);
687         final RegionState rsOffline = regionStates.getRegionState(regionInfo);
688         this.executorService.submit(
689           new EventHandler(server, EventType.M_MASTER_RECOVERY) {
690             @Override
691             public void process() throws IOException {
692               ReentrantLock lock = locker.acquireLock(regionInfo.getEncodedName());
693               try {
694                 RegionPlan plan = new RegionPlan(regionInfo, null, sn);
695                 addPlan(encodedName, plan);
696                 assign(rsOffline, false, false);
697               } finally {
698                 lock.unlock();
699               }
700             }
701           });
702         break;
703 
704       case RS_ZK_REGION_OPENING:
705         regionStates.updateRegionState(rt, State.OPENING);
706         break;
707 
708       case RS_ZK_REGION_OPENED:
709         // Region is opened, insert into RIT and handle it
710         // This could be done asynchronously; we would then need to acquire the lock in the
711         //  handler.
712         regionStates.updateRegionState(rt, State.OPEN);
713         new OpenedRegionHandler(server, this, regionInfo, sn, expectedVersion).process();
714         break;
715       case RS_ZK_REQUEST_REGION_SPLIT:
716       case RS_ZK_REGION_SPLITTING:
717       case RS_ZK_REGION_SPLIT:
718         // Splitting region should be online. We could have skipped it during
719         // user region rebuilding since we may consider the split completed.
720         // Put it in SPLITTING state to avoid complications.
721         regionStates.regionOnline(regionInfo, sn);
722         regionStates.updateRegionState(rt, State.SPLITTING);
723         if (!handleRegionSplitting(
724             rt, encodedName, prettyPrintedRegionName, sn)) {
725           deleteSplittingNode(encodedName, sn);
726         }
727         break;
728       case RS_ZK_REQUEST_REGION_MERGE:
729       case RS_ZK_REGION_MERGING:
730       case RS_ZK_REGION_MERGED:
731         if (!handleRegionMerging(
732             rt, encodedName, prettyPrintedRegionName, sn)) {
733           deleteMergingNode(encodedName, sn);
734         }
735         break;
736       default:
737         throw new IllegalStateException("Received region in state:" + et + " is not valid.");
738     }
739     LOG.info("Processed region " + prettyPrintedRegionName + " in state "
740       + et + ", on " + (serverManager.isServerOnline(sn) ? "" : "dead ")
741       + "server: " + sn);
742     return true;
743   }
744 
745   /**
746    * When a region is closed, it should be removed from the regionsToReopen
747    * @param hri HRegionInfo of the region which was closed
748    */
749   public void removeClosedRegion(HRegionInfo hri) {
750     if (regionsToReopen.remove(hri.getEncodedName()) != null) {
751       LOG.debug("Removed region from reopening regions because it was closed");
752     }
753   }
754 
755   /**
756    * Handles various states an unassigned node can be in.
757    * <p>
758    * Method is called when a state change is suspected for an unassigned node.
759    * <p>
760    * This deals with skipped transitions (we got a CLOSED but didn't see CLOSING
761    * yet).
762    * @param rt
763    * @param expectedVersion
764    */
765   void handleRegion(final RegionTransition rt, int expectedVersion) {
766     if (rt == null) {
767       LOG.warn("Unexpected NULL input for RegionTransition rt");
768       return;
769     }
770     final ServerName sn = rt.getServerName();
771     // Check if this is a special HBCK transition
772     if (sn.equals(HBCK_CODE_SERVERNAME)) {
773       handleHBCK(rt);
774       return;
775     }
776     final long createTime = rt.getCreateTime();
777     final byte[] regionName = rt.getRegionName();
778     String encodedName = HRegionInfo.encodeRegionName(regionName);
779     String prettyPrintedRegionName = HRegionInfo.prettyPrint(encodedName);
780     // Verify this is a known server
781     if (!serverManager.isServerOnline(sn)
782       && !ignoreStatesRSOffline.contains(rt.getEventType())) {
783       LOG.warn("Attempted to handle region transition for server but " +
784         "it is not online: " + prettyPrintedRegionName + ", " + rt);
785       return;
786     }
787 
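    // Snapshot the current region state and the current time so we can detect, after
    // taking the per-region lock below, whether the state changed meanwhile and
    // whether acquiring the lock was slow.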
788     RegionState regionState =
789       regionStates.getRegionState(encodedName);
790     long startTime = System.currentTimeMillis();
791     if (LOG.isDebugEnabled()) {
792       boolean lateEvent = createTime < (startTime - 15000);
793       LOG.debug("Handling " + rt.getEventType() +
794         ", server=" + sn + ", region=" +
795         (prettyPrintedRegionName == null ? "null" : prettyPrintedRegionName) +
796         (lateEvent ? ", which is more than 15 seconds late" : "") +
797         ", current_state=" + regionState);
798     }
799     // We don't do anything for this event,
800     // so separate it out, no need to lock/unlock anything
801     if (rt.getEventType() == EventType.M_ZK_REGION_OFFLINE) {
802       return;
803     }
804 
805     // We need a lock on the region as we could update it
806     Lock lock = locker.acquireLock(encodedName);
807     try {
808       RegionState latestState =
809         regionStates.getRegionState(encodedName);
810       if ((regionState == null && latestState != null)
811           || (regionState != null && latestState == null)
812           || (regionState != null && latestState != null
813             && latestState.getState() != regionState.getState())) {
814         LOG.warn("Region state changed from " + regionState + " to "
815           + latestState + ", while acquiring lock");
816       }
817       long waitedTime = System.currentTimeMillis() - startTime;
818       if (waitedTime > 5000) {
819         LOG.warn("Took " + waitedTime + "ms to acquire the lock");
820       }
821       regionState = latestState;
822       switch (rt.getEventType()) {
823       case RS_ZK_REQUEST_REGION_SPLIT:
824       case RS_ZK_REGION_SPLITTING:
825       case RS_ZK_REGION_SPLIT:
826         if (!handleRegionSplitting(
827             rt, encodedName, prettyPrintedRegionName, sn)) {
828           deleteSplittingNode(encodedName, sn);
829         }
830         break;
831 
832       case RS_ZK_REQUEST_REGION_MERGE:
833       case RS_ZK_REGION_MERGING:
834       case RS_ZK_REGION_MERGED:
835         // Merged region is a new region, we can't find it in the region states now.
836         // However, the two merging regions are not new. They should be in state for merging.
837         if (!handleRegionMerging(
838             rt, encodedName, prettyPrintedRegionName, sn)) {
839           deleteMergingNode(encodedName, sn);
840         }
841         break;
842 
843       case M_ZK_REGION_CLOSING:
844         // Should see CLOSING after we have asked it to CLOSE or additional
845         // times after already being in state of CLOSING
846         if (regionState == null
847             || !regionState.isPendingCloseOrClosingOnServer(sn)) {
848           LOG.warn("Received CLOSING for " + prettyPrintedRegionName
849             + " from " + sn + " but the region isn't PENDING_CLOSE/CLOSING here: "
850             + regionStates.getRegionState(encodedName));
851           return;
852         }
853         // Transition to CLOSING (or update stamp if already CLOSING)
854         regionStates.updateRegionState(rt, State.CLOSING);
855         break;
856 
857       case RS_ZK_REGION_CLOSED:
858         // Should see CLOSED after CLOSING but possible after PENDING_CLOSE
859         if (regionState == null
860             || !regionState.isPendingCloseOrClosingOnServer(sn)) {
861           LOG.warn("Received CLOSED for " + prettyPrintedRegionName
862             + " from " + sn + " but the region isn't PENDING_CLOSE/CLOSING here: "
863             + regionStates.getRegionState(encodedName));
864           return;
865         }
866         // Handle CLOSED by assigning elsewhere or stopping if this is a disable.
867         // If we got here all is good.  Need to update RegionState -- else
868         // what follows will fail because not in expected state.
869         new ClosedRegionHandler(server, this, regionState.getRegion()).process();
870         updateClosedRegionHandlerTracker(regionState.getRegion());
871         break;
872 
873         case RS_ZK_REGION_FAILED_OPEN:
874           if (regionState == null
875               || !regionState.isPendingOpenOrOpeningOnServer(sn)) {
876             LOG.warn("Received FAILED_OPEN for " + prettyPrintedRegionName
877               + " from " + sn + " but the region isn't PENDING_OPEN/OPENING here: "
878               + regionStates.getRegionState(encodedName));
879             return;
880           }
881           AtomicInteger failedOpenCount = failedOpenTracker.get(encodedName);
882           if (failedOpenCount == null) {
883             failedOpenCount = new AtomicInteger();
884             // No need to use putIfAbsent, or extra synchronization since
885             // this whole handleRegion block is locked on the encoded region
886             // name, and failedOpenTracker is updated only in this block
887             failedOpenTracker.put(encodedName, failedOpenCount);
888           }
889           if (failedOpenCount.incrementAndGet() >= maximumAttempts) {
890             regionStates.updateRegionState(rt, State.FAILED_OPEN);
891             // remove the tracking info to save memory, also reset
892             // the count for next open initiative
893             failedOpenTracker.remove(encodedName);
894           } else {
895             // Handle this the same as if it were opened and then closed.
896             regionState = regionStates.updateRegionState(rt, State.CLOSED);
897             if (regionState != null) {
898               // When there is more than one region server, a new RS is selected as the
899               // destination and the region plan is updated accordingly. (HBASE-5546)
900               try {
901                 getRegionPlan(regionState.getRegion(), sn, true);
902                 new ClosedRegionHandler(server, this, regionState.getRegion()).process();
903               } catch (HBaseIOException e) {
904                 LOG.warn("Failed to get region plan", e);
905               }
906             }
907           }
908           break;
909 
910         case RS_ZK_REGION_OPENING:
911           // Should see OPENING after we have asked it to OPEN or additional
912           // times after already being in state of OPENING
913           if (regionState == null
914               || !regionState.isPendingOpenOrOpeningOnServer(sn)) {
915             LOG.warn("Received OPENING for " + prettyPrintedRegionName
916               + " from " + sn + " but the region isn't PENDING_OPEN/OPENING here: "
917               + regionStates.getRegionState(encodedName));
918             return;
919           }
920           // Transition to OPENING (or update stamp if already OPENING)
921           regionStates.updateRegionState(rt, State.OPENING);
922           break;
923 
924         case RS_ZK_REGION_OPENED:
925           // Should see OPENED after OPENING but possible after PENDING_OPEN.
926           if (regionState == null
927               || !regionState.isPendingOpenOrOpeningOnServer(sn)) {
928             LOG.warn("Received OPENED for " + prettyPrintedRegionName
929               + " from " + sn + " but the region isn't PENDING_OPEN/OPENING here: "
930               + regionStates.getRegionState(encodedName));
931 
932             if (regionState != null) {
933               // Close it without updating the internal region states,
934               // so as not to create double assignments in unlucky scenarios
935               // mentioned in OpenRegionHandler#process
936               unassign(regionState.getRegion(), null, -1, null, false, sn);
937             }
938             return;
939           }
940           // Handle OPENED by removing from transition and deleting the zk node
941           regionState = regionStates.updateRegionState(rt, State.OPEN);
942           if (regionState != null) {
943             failedOpenTracker.remove(encodedName); // reset the count, if any
944             new OpenedRegionHandler(
945               server, this, regionState.getRegion(), sn, expectedVersion).process();
946             updateOpenedRegionHandlerTracker(regionState.getRegion());
947           }
948           break;
949 
950         default:
951           throw new IllegalStateException("Received event is not valid.");
952       }
953     } finally {
954       lock.unlock();
955     }
956   }
957 
958   //For unit tests only
959   boolean wasClosedHandlerCalled(HRegionInfo hri) {
960     AtomicBoolean b = closedRegionHandlerCalled.get(hri);
961     //compareAndSet to be sure that unit tests don't see stale values. This means
962     //we will return true exactly once unless the handler code resets this value
963     //to true.
964     return b == null ? false : b.compareAndSet(true, false);
965   }
966 
967   //For unit tests only
968   boolean wasOpenedHandlerCalled(HRegionInfo hri) {
969     AtomicBoolean b = openedRegionHandlerCalled.get(hri);
970     //compareAndSet to be sure that unit tests don't see stale values. This means
971     //we will return true exactly once unless the handler code resets this value
972     //to true.
973     return b == null ? false : b.compareAndSet(true, false);
974   }
975 
976   //For unit tests only
977   void initializeHandlerTrackers() {
978     closedRegionHandlerCalled = new HashMap<HRegionInfo, AtomicBoolean>();
979     openedRegionHandlerCalled = new HashMap<HRegionInfo, AtomicBoolean>();
980   }
981 
982   void updateClosedRegionHandlerTracker(HRegionInfo hri) {
983     if (closedRegionHandlerCalled != null) { //only for unit tests this is true
984       closedRegionHandlerCalled.put(hri, new AtomicBoolean(true));
985     }
986   }
987 
988   void updateOpenedRegionHandlerTracker(HRegionInfo hri) {
989     if (openedRegionHandlerCalled != null) { //only for unit tests this is true
990       openedRegionHandlerCalled.put(hri, new AtomicBoolean(true));
991     }
992   }
993 
994   // TODO: processFavoredNodes might throw an exception, e.g., if the
995   // meta could not be contacted/updated. We need to see how seriously to treat
996   // this problem. Should we fail the current assignment? We should be able
997   // to recover from this problem eventually (if the meta couldn't be updated
998   // things should work normally and eventually get fixed up).
999   void processFavoredNodes(List<HRegionInfo> regions) throws IOException {
1000     if (!shouldAssignRegionsWithFavoredNodes) return;
1001     // The AM gets the favored nodes info for each region and updates the meta
1002     // table with that info
1003     Map<HRegionInfo, List<ServerName>> regionToFavoredNodes =
1004         new HashMap<HRegionInfo, List<ServerName>>();
1005     for (HRegionInfo region : regions) {
1006       regionToFavoredNodes.put(region,
1007           ((FavoredNodeLoadBalancer)this.balancer).getFavoredNodes(region));
1008     }
1009     FavoredNodeAssignmentHelper.updateMetaWithFavoredNodesInfo(regionToFavoredNodes, catalogTracker);
1010   }
1011 
1012   /**
1013    * Handle a ZK unassigned node transition triggered by HBCK repair tool.
1014    * <p>
1015    * This is handled in a separate code path because it breaks the normal rules.
1016    * @param rt
1017    */
1018   private void handleHBCK(RegionTransition rt) {
1019     String encodedName = HRegionInfo.encodeRegionName(rt.getRegionName());
1020     LOG.info("Handling HBCK triggered transition=" + rt.getEventType() +
1021       ", server=" + rt.getServerName() + ", region=" +
1022       HRegionInfo.prettyPrint(encodedName));
1023     RegionState regionState = regionStates.getRegionTransitionState(encodedName);
1024     switch (rt.getEventType()) {
1025       case M_ZK_REGION_OFFLINE:
1026         HRegionInfo regionInfo;
1027         if (regionState != null) {
1028           regionInfo = regionState.getRegion();
1029         } else {
1030           try {
1031             byte [] name = rt.getRegionName();
1032             Pair<HRegionInfo, ServerName> p = MetaReader.getRegion(catalogTracker, name);
1033             regionInfo = p.getFirst();
1034           } catch (IOException e) {
1035             LOG.info("Exception reading hbase:meta doing HBCK repair operation", e);
1036             return;
1037           }
1038         }
1039         LOG.info("HBCK repair is triggering assignment of region=" +
1040             regionInfo.getRegionNameAsString());
1041         // trigger assign, node is already in OFFLINE so don't need to update ZK
1042         assign(regionInfo, false);
1043         break;
1044 
1045       default:
1046         LOG.warn("Received unexpected region state from HBCK: " + rt.toString());
1047         break;
1048     }
1049 
1050   }
1051 
1052   // ZooKeeper events
1053 
1054   /**
1055    * New unassigned node has been created.
1056    *
1057    * <p>This happens when an RS begins the OPENING or CLOSING of a region by
1058    * creating an unassigned node.
1059    *
1060    * <p>When this happens we must:
1061    * <ol>
1062    *   <li>Watch the node for further events</li>
1063    *   <li>Read and handle the state in the node</li>
1064    * </ol>
1065    */
1066   @Override
1067   public void nodeCreated(String path) {
1068     handleAssignmentEvent(path);
1069   }
1070 
1071   /**
1072    * Existing unassigned node has had data changed.
1073    *
1074    * <p>This happens when an RS transitions from OFFLINE to OPENING, or between
1075    * OPENING/OPENED and CLOSING/CLOSED.
1076    *
1077    * <p>When this happens we must:
1078    * <ol>
1079    *   <li>Watch the node for further events</li>
1080    *   <li>Read and handle the state in the node</li>
1081    * </ol>
1082    */
1083   @Override
1084   public void nodeDataChanged(String path) {
1085     handleAssignmentEvent(path);
1086   }
1087 
1088 
1089   // We  don't want to have two events on the same region managed simultaneously.
1090   // For this reason, we need to wait if an event on the same region is currently in progress.
1091   // So we track the region names of the events in progress, and we keep a waiting list.
1092   private final Set<String> regionsInProgress = new HashSet<String>();
1093   // In a LinkedHashMultimap, the put order is kept when we retrieve the collection back. We need
1094   //  this as we want the events to be managed in the same order as we received them.
1095   private final LinkedHashMultimap <String, RegionRunnable>
1096       zkEventWorkerWaitingList = LinkedHashMultimap.create();
1097 
1098   /**
1099    * A specific runnable that works only on a region.
1100    */
1101   private interface RegionRunnable extends Runnable{
1102     /**
1103      * @return - the name of the region it works on.
1104      */
1105     String getRegionName();
1106   }
1107 
1108   /**
1109    * Submit a task, ensuring that there is only one task at a time working on a given region.
1110    * Order is respected.
1111    */
1112   protected void zkEventWorkersSubmit(final RegionRunnable regRunnable) {
1113 
1114     synchronized (regionsInProgress) {
1115       // If there is already a task for this region, we add it to the
1116       //  waiting list and return.
1117       if (regionsInProgress.contains(regRunnable.getRegionName())) {
1118         synchronized (zkEventWorkerWaitingList){
1119           zkEventWorkerWaitingList.put(regRunnable.getRegionName(), regRunnable);
1120         }
1121         return;
1122       }
1123 
1124       // No event in progress on this region => we can submit a new task immediately.
1125       regionsInProgress.add(regRunnable.getRegionName());
1126       zkEventWorkers.submit(new Runnable() {
1127         @Override
1128         public void run() {
1129           try {
1130             regRunnable.run();
1131           } finally {
1132             // now that we have finished, let's see if there is an event for the same region in the
1133             //  waiting list. If it's the case, we can now submit it to the pool.
1134             synchronized (regionsInProgress) {
1135               regionsInProgress.remove(regRunnable.getRegionName());
1136               synchronized (zkEventWorkerWaitingList) {
1137                 java.util.Set<RegionRunnable> waiting = zkEventWorkerWaitingList.get(
1138                     regRunnable.getRegionName());
1139                 if (!waiting.isEmpty()) {
1140                   // We want the first object only. The only way to get it is through an iterator.
1141                   RegionRunnable toSubmit = waiting.iterator().next();
1142                   zkEventWorkerWaitingList.remove(toSubmit.getRegionName(), toSubmit);
1143                   zkEventWorkersSubmit(toSubmit);
1144                 }
1145               }
1146             }
1147           }
1148         }
1149       });
1150     }
1151   }
1152 
1153   @Override
1154   public void nodeDeleted(final String path) {
1155     if (path.startsWith(watcher.assignmentZNode)) {
1156       final String regionName = ZKAssign.getRegionName(watcher, path);
1157       zkEventWorkersSubmit(new RegionRunnable() {
1158         @Override
1159         public String getRegionName() {
1160           return regionName;
1161         }
1162 
1163         @Override
1164         public void run() {
1165           Lock lock = locker.acquireLock(regionName);
1166           try {
1167             RegionState rs = regionStates.getRegionTransitionState(regionName);
1168             if (rs == null) {
1169               rs = regionStates.getRegionState(regionName);
1170               if (rs == null || !rs.isMergingNew()) {
1171                 // MergingNew is an offline state
1172                 return;
1173               }
1174             }
1175 
1176             HRegionInfo regionInfo = rs.getRegion();
1177             String regionNameStr = regionInfo.getRegionNameAsString();
1178             LOG.debug("Znode " + regionNameStr + " deleted, state: " + rs);
1179             boolean disabled = getZKTable().isDisablingOrDisabledTable(regionInfo.getTable());
1180             ServerName serverName = rs.getServerName();
1181             if (serverManager.isServerOnline(serverName)) {
1182               if (rs.isOnServer(serverName)
1183                   && (rs.isOpened() || rs.isSplitting())) {
1184                 regionOnline(regionInfo, serverName);
1185                 if (disabled) {
1186                   // if server is offline, no hurt to unassign again
1187                   LOG.info("Opened " + regionNameStr
1188                     + " but this table is disabled, triggering close of region");
1189                   unassign(regionInfo);
1190                 }
1191               } else if (rs.isMergingNew()) {
1192                 synchronized (regionStates) {
1193                   String p = regionInfo.getEncodedName();
1194                   PairOfSameType<HRegionInfo> regions = mergingRegions.get(p);
1195                   if (regions != null) {
1196                     onlineMergingRegion(disabled, regions.getFirst(), serverName);
1197                     onlineMergingRegion(disabled, regions.getSecond(), serverName);
1198                   }
1199                 }
1200               }
1201             }
1202           } finally {
1203             lock.unlock();
1204           }
1205         }
1206 
1207         private void onlineMergingRegion(boolean disabled,
1208             final HRegionInfo hri, final ServerName serverName) {
1209           RegionState regionState = regionStates.getRegionState(hri);
1210           if (regionState != null && regionState.isMerging()
1211               && regionState.isOnServer(serverName)) {
1212             regionOnline(regionState.getRegion(), serverName);
1213             if (disabled) {
1214               unassign(hri);
1215             }
1216           }
1217         }
1218       });
1219     }
1220   }
1221 
1222   /**
1223    * New unassigned node has been created.
1224    *
1225    * <p>This happens when an RS begins the OPENING, SPLITTING or CLOSING of a
1226    * region by creating a znode.
1227    *
1228    * <p>When this happens we must:
1229    * <ol>
1230    *   <li>Watch the node for further children changed events</li>
1231    *   <li>Watch all new children for changed events</li>
1232    * </ol>
1233    */
1234   @Override
1235   public void nodeChildrenChanged(String path) {
1236     if (path.equals(watcher.assignmentZNode)) {
1237       zkEventWorkers.submit(new Runnable() {
1238         @Override
1239         public void run() {
1240           try {
1241             // Just make sure we see the changes for the new znodes
1242             List<String> children =
1243               ZKUtil.listChildrenAndWatchForNewChildren(
1244                 watcher, watcher.assignmentZNode);
1245             if (children != null) {
1246               Stat stat = new Stat();
1247               for (String child : children) {
1248                 // if region is in transition, we already have a watch
1249                 // on it, so no need to watch it again. So, as far as I know for now,
1250                 // this is needed to watch splitting nodes only.
1251                 if (!regionStates.isRegionInTransition(child)) {
1252                   ZKAssign.getDataAndWatch(watcher, child, stat);
1253                 }
1254               }
1255             }
1256           } catch (KeeperException e) {
1257             server.abort("Unexpected ZK exception reading unassigned children", e);
1258           }
1259         }
1260       });
1261     }
1262   }
1263 
1264   /**
1265    * Marks the region as online.  Removes it from regions in transition and
1266    * updates the in-memory assignment information.
1267    * <p>
1268    * Used when a region has been successfully opened on a region server.
1269    * @param regionInfo
1270    * @param sn
1271    */
1272   void regionOnline(HRegionInfo regionInfo, ServerName sn) {
1273     numRegionsOpened.incrementAndGet();
1274     regionStates.regionOnline(regionInfo, sn);
1275 
1276     // Remove plan if one.
1277     clearRegionPlan(regionInfo);
1278     // Add the server to serversInUpdatingTimer
1279     addToServersInUpdatingTimer(sn);
1280     balancer.regionOnline(regionInfo, sn);
1281   }
1282 
1283   /**
1284    * Pass the assignment event to a worker for processing.
1285    * Each worker is a single thread executor service.  The reason
1286    * for just one thread is to make sure all events for a given
1287    * region are processed in order.
1288    *
1289    * @param path
1290    */
1291   private void handleAssignmentEvent(final String path) {
1292     if (path.startsWith(watcher.assignmentZNode)) {
1293       final String regionName = ZKAssign.getRegionName(watcher, path);
1294 
1295       zkEventWorkersSubmit(new RegionRunnable() {
1296         @Override
1297         public String getRegionName() {
1298           return regionName;
1299         }
1300 
1301         @Override
1302         public void run() {
1303           try {
1304             Stat stat = new Stat();
1305             byte [] data = ZKAssign.getDataAndWatch(watcher, path, stat);
1306             if (data == null) return;
1307 
1308             RegionTransition rt = RegionTransition.parseFrom(data);
1309             handleRegion(rt, stat.getVersion());
1310           } catch (KeeperException e) {
1311             server.abort("Unexpected ZK exception reading unassigned node data", e);
1312           } catch (DeserializationException e) {
1313             server.abort("Unexpected exception deserializing node data", e);
1314           }
1315         }
1316       });
1317     }
1318   }
1319 
1320   /**
1321    * Add the server to the set serversInUpdatingTimer, then {@link TimerUpdater}
1322    * will update timers for this server in the background.
1323    * @param sn
1324    */
1325   private void addToServersInUpdatingTimer(final ServerName sn) {
1326     if (tomActivated){
1327       this.serversInUpdatingTimer.add(sn);
1328     }
1329   }
1330 
1331   /**
1332    * Touch timers for all regions in transition that have the passed
1333    * <code>sn</code> in common.
1334    * Call this method whenever a server checks in.  Doing so helps the case where
1335    * a new regionserver has joined the cluster and it has been given 1k regions to
1336    * open.  If this method is tickled every time a region reports a
1337    * successful open, then the 1k-th region won't be timed out just because it is
1338    * sitting behind the open of 999 other regions.  This method is NOT used
1339    * as part of bulk assign -- there we have a different mechanism for extending
1340    * the regions in transition timer (we turn it off temporarily, because
1341    * there is no regionplan involved when bulk assigning).
1342    * @param sn
1343    */
1344   private void updateTimers(final ServerName sn) {
1345     Preconditions.checkState(tomActivated);
1346     if (sn == null) return;
1347 
1348     // This loop could be expensive.
1349     // First make a copy of current regionPlan rather than hold sync while
1350     // looping because holding sync can cause deadlock.  It's OK in this loop
1351     // if the Map we're going against is a little stale.
1352     List<Map.Entry<String, RegionPlan>> rps;
1353     synchronized(this.regionPlans) {
1354       rps = new ArrayList<Map.Entry<String, RegionPlan>>(regionPlans.entrySet());
1355     }
1356 
1357     for (Map.Entry<String, RegionPlan> e : rps) {
1358       if (e.getValue() != null && e.getKey() != null && sn.equals(e.getValue().getDestination())) {
1359         RegionState regionState = regionStates.getRegionTransitionState(e.getKey());
1360         if (regionState != null) {
1361           regionState.updateTimestampToNow();
1362         }
1363       }
1364     }
1365   }
1366 
1367   /**
1368    * Marks the region as offline.  Removes it from regions in transition and
1369    * removes in-memory assignment information.
1370    * <p>
1371    * Used when a region has been closed and should remain closed.
1372    * @param regionInfo
1373    */
1374   public void regionOffline(final HRegionInfo regionInfo) {
1375     regionOffline(regionInfo, null);
1376   }
1377 
1378   public void offlineDisabledRegion(HRegionInfo regionInfo) {
1379     // Disabling so should not be reassigned, just delete the CLOSED node
1380     LOG.debug("Table being disabled so deleting ZK node and removing from " +
1381       "regions in transition, skipping assignment of region " +
1382         regionInfo.getRegionNameAsString());
1383     String encodedName = regionInfo.getEncodedName();
1384     deleteNodeInStates(encodedName, "closed", null,
1385       EventType.RS_ZK_REGION_CLOSED, EventType.M_ZK_REGION_OFFLINE);
1386     regionOffline(regionInfo);
1387   }
1388 
1389   // Assignment methods
1390 
1391   /**
1392    * Assigns the specified region.
1393    * <p>
1394    * If a RegionPlan is available with a valid destination then it will be used
1395    * to determine what server region is assigned to.  If no RegionPlan is
1396    * available, region will be assigned to a random available server.
1397    * <p>
1398    * Updates the RegionState and sends the OPEN RPC.
1399    * <p>
1400    * This will only succeed if the region is in transition and in a CLOSED or
1401    * OFFLINE state, or not in transition at all (in memory, not zk), and of course
1402    * the chosen server is up and running (it may have just crashed!).  If the
1403    * in-memory checks pass, the zk node is forced to OFFLINE before assigning.
1404    *
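   * <p>Illustrative call (this is how {@link #assignMeta()} invokes it):
   * <pre>
   *   assign(HRegionInfo.FIRST_META_REGIONINFO, true);
   * </pre>
   *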
1405    * @param region the region to be assigned
1406    * @param setOfflineInZK whether ZK node should be created/transitioned to an
1407    *                       OFFLINE state before assigning the region
1408    */
1409   public void assign(HRegionInfo region, boolean setOfflineInZK) {
1410     assign(region, setOfflineInZK, false);
1411   }
1412 
1413   /**
1414    * Use care with forceNewPlan. It could cause double assignment.
1415    */
1416   public void assign(HRegionInfo region,
1417       boolean setOfflineInZK, boolean forceNewPlan) {
1418     if (isDisabledorDisablingRegionInRIT(region)) {
1419       return;
1420     }
1421     if (this.serverManager.isClusterShutdown()) {
1422       LOG.info("Cluster shutdown is set; skipping assign of " +
1423         region.getRegionNameAsString());
1424       return;
1425     }
1426     String encodedName = region.getEncodedName();
1427     Lock lock = locker.acquireLock(encodedName);
1428     try {
1429       RegionState state = forceRegionStateToOffline(region, forceNewPlan);
1430       if (state != null) {
1431         if (regionStates.wasRegionOnDeadServer(encodedName)) {
1432           LOG.info("Skip assigning " + region.getRegionNameAsString()
1433             + ", its host " + regionStates.getLastRegionServerOfRegion(encodedName)
1434             + " is dead but not processed yet");
1435           return;
1436         }
1437         assign(state, setOfflineInZK, forceNewPlan);
1438       }
1439     } finally {
1440       lock.unlock();
1441     }
1442   }
1443 
1444   /**
1445    * Bulk assign regions to <code>destination</code>.
1446    * @param destination
1447    * @param regions Regions to assign.
1448    * @return true if successful
1449    */
1450   boolean assign(final ServerName destination, final List<HRegionInfo> regions) {
1451     long startTime = EnvironmentEdgeManager.currentTimeMillis();
1452     try {
1453       int regionCount = regions.size();
1454       if (regionCount == 0) {
1455         return true;
1456       }
1457       LOG.debug("Assigning " + regionCount + " region(s) to " + destination.toString());
1458       Set<String> encodedNames = new HashSet<String>(regionCount);
1459       for (HRegionInfo region : regions) {
1460         encodedNames.add(region.getEncodedName());
1461       }
1462 
1463       List<HRegionInfo> failedToOpenRegions = new ArrayList<HRegionInfo>();
1464       Map<String, Lock> locks = locker.acquireLocks(encodedNames);
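      // Note on locking: regions excluded from the bulk assign below unlock and
      // drop their per-region lock right away; all remaining locks are released
      // in the finally block after the OPEN RPC has been handled.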
1465       try {
1466         AtomicInteger counter = new AtomicInteger(0);
1467         Map<String, Integer> offlineNodesVersions = new ConcurrentHashMap<String, Integer>();
1468         OfflineCallback cb = new OfflineCallback(
1469           watcher, destination, counter, offlineNodesVersions);
1470         Map<String, RegionPlan> plans = new HashMap<String, RegionPlan>(regions.size());
1471         List<RegionState> states = new ArrayList<RegionState>(regions.size());
1472         for (HRegionInfo region : regions) {
1473           String encodedName = region.getEncodedName();
1474           if (!isDisabledorDisablingRegionInRIT(region)) {
1475             RegionState state = forceRegionStateToOffline(region, false);
1476             boolean onDeadServer = false;
1477             if (state != null) {
1478               if (regionStates.wasRegionOnDeadServer(encodedName)) {
1479                 LOG.info("Skip assigning " + region.getRegionNameAsString()
1480                   + ", its host " + regionStates.getLastRegionServerOfRegion(encodedName)
1481                   + " is dead but not processed yet");
1482                 onDeadServer = true;
1483               } else if (asyncSetOfflineInZooKeeper(state, cb, destination)) {
1484                 RegionPlan plan = new RegionPlan(region, state.getServerName(), destination);
1485                 plans.put(encodedName, plan);
1486                 states.add(state);
1487                 continue;
1488               }
1489             }
1490             // Reassign if the region wasn't on a dead server
1491             if (!onDeadServer) {
1492               LOG.info("failed to force region state to offline or "
1493                 + "failed to set it offline in ZK, will reassign later: " + region);
1494               failedToOpenRegions.add(region); // assign individually later
1495             }
1496           }
1497           // Release the lock, this region is excluded from bulk assign because
1498           // we can't update its state, or set its znode to offline.
1499           Lock lock = locks.remove(encodedName);
1500           lock.unlock();
1501         }
1502 
1503         // Wait until all unassigned nodes have been put up and watchers set.
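        // The OfflineCallback created above bumps 'counter' as each asynchronous
        // OFFLINE znode request is acknowledged by ZooKeeper.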
1504         int total = states.size();
1505         for (int oldCounter = 0; !server.isStopped();) {
1506           int count = counter.get();
1507           if (oldCounter != count) {
1508             LOG.info(destination.toString() + " unassigned znodes=" + count +
1509               " of total=" + total);
1510             oldCounter = count;
1511           }
1512           if (count >= total) break;
1513           Threads.sleep(5);
1514         }
1515 
1516         if (server.isStopped()) {
1517           return false;
1518         }
1519 
1520         // Add region plans, so we can updateTimers when one region is opened so
1521         // that unnecessary timeout on RIT is reduced.
1522         this.addPlans(plans);
1523 
1524         List<Triple<HRegionInfo, Integer, List<ServerName>>> regionOpenInfos =
1525           new ArrayList<Triple<HRegionInfo, Integer, List<ServerName>>>(states.size());
1526         for (RegionState state: states) {
1527           HRegionInfo region = state.getRegion();
1528           String encodedRegionName = region.getEncodedName();
1529           Integer nodeVersion = offlineNodesVersions.get(encodedRegionName);
1530           if (nodeVersion == null || nodeVersion == -1) {
1531             LOG.warn("failed to offline in zookeeper: " + region);
1532             failedToOpenRegions.add(region); // assign individually later
1533             Lock lock = locks.remove(encodedRegionName);
1534             lock.unlock();
1535           } else {
1536             regionStates.updateRegionState(
1537               region, State.PENDING_OPEN, destination);
1538             List<ServerName> favoredNodes = ServerName.EMPTY_SERVER_LIST;
1539             if (this.shouldAssignRegionsWithFavoredNodes) {
1540               favoredNodes = ((FavoredNodeLoadBalancer)this.balancer).getFavoredNodes(region);
1541             }
1542             regionOpenInfos.add(new Triple<HRegionInfo, Integer,  List<ServerName>>(
1543               region, nodeVersion, favoredNodes));
1544           }
1545         }
1546 
1547         // Move on to open regions.
1548         try {
1549           // Send OPEN RPC. If it fails on a IOE or RemoteException,
1550           // regions will be assigned individually.
1551           long maxWaitTime = System.currentTimeMillis() +
1552             this.server.getConfiguration().
1553               getLong("hbase.regionserver.rpc.startup.waittime", 60000);
1554           for (int i = 1; i <= maximumAttempts && !server.isStopped(); i++) {
1555             try {
1556               List<RegionOpeningState> regionOpeningStateList = serverManager
1557                 .sendRegionOpen(destination, regionOpenInfos);
1558               if (regionOpeningStateList == null) {
1559                 // Failed getting RPC connection to this server
1560                 return false;
1561               }
1562               for (int k = 0, n = regionOpeningStateList.size(); k < n; k++) {
1563                 RegionOpeningState openingState = regionOpeningStateList.get(k);
1564                 if (openingState != RegionOpeningState.OPENED) {
1565                   HRegionInfo region = regionOpenInfos.get(k).getFirst();
1566                   if (openingState == RegionOpeningState.ALREADY_OPENED) {
1567                     processAlreadyOpenedRegion(region, destination);
1568                   } else if (openingState == RegionOpeningState.FAILED_OPENING) {
1569                     // Failed opening this region, reassign it later
1570                     failedToOpenRegions.add(region);
1571                   } else {
1572                     LOG.warn("THIS SHOULD NOT HAPPEN: unknown opening state "
1573                       + openingState + " in assigning region " + region);
1574                   }
1575                 }
1576               }
1577               break;
1578             } catch (IOException e) {
1579               if (e instanceof RemoteException) {
1580                 e = ((RemoteException)e).unwrapRemoteException();
1581               }
1582               if (e instanceof RegionServerStoppedException) {
1583                 LOG.warn("The region server was shut down, ", e);
1584                 // No need to retry, the region server is a goner.
1585                 return false;
1586               } else if (e instanceof ServerNotRunningYetException) {
1587                 long now = System.currentTimeMillis();
1588                 if (now < maxWaitTime) {
1589                   LOG.debug("Server is not yet up; waiting up to " +
1590                     (maxWaitTime - now) + "ms", e);
1591                   Thread.sleep(100);
1592                   i--; // reset the try count
1593                   continue;
1594                 }
1595               } else if (e instanceof java.net.SocketTimeoutException
1596                   && this.serverManager.isServerOnline(destination)) {
1597                 // In case socket is timed out and the region server is still online,
1598                 // the openRegion RPC could have been accepted by the server and
1599                 // just the response didn't go through.  So we will retry to
1600                 // open the region on the same server.
1601                 if (LOG.isDebugEnabled()) {
1602                   LOG.debug("Bulk assigner openRegion() to " + destination
1603                     + " has timed out, but the regions might"
1604                     + " already be opened on it.", e);
1605                 }
1606                 continue;
1607               }
1608               throw e;
1609             }
1610           }
1611         } catch (IOException e) {
1612           // Can be a socket timeout, EOF, NoRouteToHost, etc
1613           LOG.info("Unable to communicate with " + destination
1614             + " in order to assign regions, ", e);
1615           return false;
1616         } catch (InterruptedException e) {
1617           throw new RuntimeException(e);
1618         }
1619       } finally {
1620         for (Lock lock : locks.values()) {
1621           lock.unlock();
1622         }
1623       }
1624 
1625       if (!failedToOpenRegions.isEmpty()) {
1626         for (HRegionInfo region : failedToOpenRegions) {
1627           if (!regionStates.isRegionOnline(region)) {
1628             invokeAssign(region);
1629           }
1630         }
1631       }
1632       LOG.debug("Bulk assigning done for " + destination);
1633       return true;
1634     } finally {
1635       metricsAssignmentManager.updateBulkAssignTime(EnvironmentEdgeManager.currentTimeMillis() - startTime);
1636     }
1637   }
1638 
1639   /**
1640    * Send CLOSE RPC if the server is online, otherwise, offline the region.
1641    *
1642    * The RPC will be sent only to the region server found in the region state
1643    * if it is passed in; otherwise, to the src server specified. If the region
1644    * state is not specified, we don't update the region state at all; instead
1645    * we just send the RPC call. This is useful for some cleanup without
1646    * disturbing the region states (see handleRegion, in the scenario where a
1647    * region is opened on an unexpected server, for an example).
1648    */
1649   private void unassign(final HRegionInfo region,
1650       final RegionState state, final int versionOfClosingNode,
1651       final ServerName dest, final boolean transitionInZK,
1652       final ServerName src) {
1653     ServerName server = src;
1654     if (state != null) {
1655       server = state.getServerName();
1656     }
1657     long maxWaitTime = -1;
1658     for (int i = 1; i <= this.maximumAttempts; i++) {
1659       if (this.server.isStopped() || this.server.isAborted()) {
1660         LOG.debug("Server stopped/aborted; skipping unassign of " + region);
1661         return;
1662       }
1663       // ClosedRegionHandler can remove the server from this.regions
1664       if (!serverManager.isServerOnline(server)) {
1665         LOG.debug("Offline " + region.getRegionNameAsString()
1666           + ", no need to unassign since it's on a dead server: " + server);
1667         if (transitionInZK) {
1668           // delete the node. if no node exists need not bother.
1669           deleteClosingOrClosedNode(region, server);
1670         }
1671         if (state != null) {
1672           regionOffline(region);
1673         }
1674         return;
1675       }
1676       try {
1677         // Send CLOSE RPC
1678         if (serverManager.sendRegionClose(server, region,
1679           versionOfClosingNode, dest, transitionInZK)) {
1680           LOG.debug("Sent CLOSE to " + server + " for region " +
1681             region.getRegionNameAsString());
1682           if (!transitionInZK && state != null) {
1683             // Retry to make sure the region is
1684             // closed so as to avoid double assignment.
1685             unassign(region, state, versionOfClosingNode,
1686               dest, transitionInZK, src);
1687           }
1688           return;
1689         }
1690         // This used to never happen: regionserver close always returned true.
1691         // TODO: this can now happen (0.96) if there is an exception in a coprocessor.
1692         LOG.warn("Server " + server + " region CLOSE RPC returned false for " +
1693           region.getRegionNameAsString());
1694       } catch (Throwable t) {
1695         if (t instanceof RemoteException) {
1696           t = ((RemoteException)t).unwrapRemoteException();
1697         }
1698         boolean logRetries = true;
1699         if (t instanceof NotServingRegionException
1700             || t instanceof RegionServerStoppedException
1701             || t instanceof ServerNotRunningYetException) {
1702           LOG.debug("Offline " + region.getRegionNameAsString()
1703             + ", it's not any more on " + server, t);
1704           if (transitionInZK) {
1705             deleteClosingOrClosedNode(region, server);
1706           }
1707           if (state != null) {
1708             regionOffline(region);
1709           }
1710           return;
1711         } else if ((t instanceof FailedServerException) || (state != null && 
1712             t instanceof RegionAlreadyInTransitionException)) {
1713           long sleepTime = 0;
1714           Configuration conf = this.server.getConfiguration();
1715           if(t instanceof FailedServerException) {
1716             sleepTime = 1 + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, 
1717                   RpcClient.FAILED_SERVER_EXPIRY_DEFAULT);
1718           } else {
1719             // RS is already processing this region, only need to update the timestamp
1720             LOG.debug("Update the timestamp for " + state);
1721             state.updateTimestampToNow();
1722             if (maxWaitTime < 0) {
1723               maxWaitTime =
1724                   EnvironmentEdgeManager.currentTimeMillis()
1725                       + conf.getLong(ALREADY_IN_TRANSITION_WAITTIME,
1726                         DEFAULT_ALREADY_IN_TRANSITION_WAITTIME);
1727             }
1728             long now = EnvironmentEdgeManager.currentTimeMillis();
1729             if (now < maxWaitTime) {
1730               LOG.debug("Region is already in transition; "
1731                 + "waiting up to " + (maxWaitTime - now) + "ms", t);
1732               sleepTime = 100;
1733               i--; // reset the try count
1734               logRetries = false;
1735             }
1736           }
1737           try {
1738             if (sleepTime > 0) {
1739               Thread.sleep(sleepTime);
1740             }
1741           } catch (InterruptedException ie) {
1742             LOG.warn("Failed to unassign "
1743               + region.getRegionNameAsString() + " since interrupted", ie);
1744             Thread.currentThread().interrupt();
1745             if (!tomActivated && state != null) {
1746               regionStates.updateRegionState(region, State.FAILED_CLOSE);
1747             }
1748             return;
1749           }
1750         }
1751 
1752         if (logRetries) {
1753           LOG.info("Server " + server + " returned " + t + " for "
1754             + region.getRegionNameAsString() + ", try=" + i
1755             + " of " + this.maximumAttempts, t);
1756           // Presume retry or server will expire.
1757         }
1758       }
1759     }
1760     // Run out of attempts
1761     if (!tomActivated && state != null) {
1762       regionStates.updateRegionState(region, State.FAILED_CLOSE);
1763     }
1764   }
1765 
1766   /**
1767    * Set the region to OFFLINE so it can be assigned, unless it is already open or in transition and forceNewPlan is false, in which case null is returned and the assign is skipped.
1768    */
1769   private RegionState forceRegionStateToOffline(
1770       final HRegionInfo region, final boolean forceNewPlan) {
1771     RegionState state = regionStates.getRegionState(region);
1772     if (state == null) {
1773       LOG.warn("Assigning a region not in region states: " + region);
1774       state = regionStates.createRegionState(region);
1775     }
1776 
1777     ServerName sn = state.getServerName();
1778     if (forceNewPlan && LOG.isDebugEnabled()) {
1779       LOG.debug("Force region state offline " + state);
1780     }
1781 
1782     switch (state.getState()) {
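    // Note: the cases below intentionally fall through (there is no break until
    // CLOSED); each case may instead bail out early by returning null.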
1783     case OPEN:
1784     case OPENING:
1785     case PENDING_OPEN:
1786     case CLOSING:
1787     case PENDING_CLOSE:
1788       if (!forceNewPlan) {
1789         LOG.debug("Skip assigning " +
1790           region + ", it is already " + state);
1791         return null;
1792       }
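      // fall through: forceNewPlan is set, so close the region below before assigning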
1793     case FAILED_CLOSE:
1794     case FAILED_OPEN:
1795       unassign(region, state, -1, null, false, null);
1796       state = regionStates.getRegionState(region);
1797       if (state.isFailedClose()) {
1798         // If we can't close the region, we can't re-assign
1799         // it so as to avoid possible double assignment/data loss.
1800         LOG.info("Skip assigning " +
1801           region + ", we couldn't close it: " + state);
1802         return null;
1803       }
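      // fall through to the OFFLINE checks below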
1804     case OFFLINE:
1805       // This region could have been open on this server
1806       // for a while. If the server is dead and not processed
1807       // yet, we can move on only if the meta shows the
1808       // region is not on this server actually, or on a server
1809       // not dead, or dead and processed already.
1810       if (regionStates.isServerDeadAndNotProcessed(sn)
1811           && wasRegionOnDeadServerByMeta(region, sn)) {
1812         LOG.info("Skip assigning " + region.getRegionNameAsString()
1813           + ", it is on a dead but not processed yet server");
1814         return null;
1815       }
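      // fall through: the region is assignable from here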
1816     case CLOSED:
1817       break;
1818     default:
1819       LOG.error("Trying to assign region " + region
1820         + ", which is " + state);
1821       return null;
1822     }
1823     return state;
1824   }
1825 
1826   private boolean wasRegionOnDeadServerByMeta(
1827       final HRegionInfo region, final ServerName sn) {
1828     try {
1829       if (region.isMetaRegion()) {
1830         ServerName server = catalogTracker.getMetaLocation();
1831         return regionStates.isServerDeadAndNotProcessed(server);
1832       }
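      // For a user region, keep consulting hbase:meta until we get an answer
      // or the master itself is stopping.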
1833       while (!server.isStopped()) {
1834         try {
1835           catalogTracker.waitForMeta();
1836           Pair<HRegionInfo, ServerName> r =
1837             MetaReader.getRegion(catalogTracker, region.getRegionName());
1838           ServerName server = r == null ? null : r.getSecond();
1839           return regionStates.isServerDeadAndNotProcessed(server);
1840         } catch (IOException ioe) {
1841           LOG.info("Received exception accessing hbase:meta during force assign "
1842             + region.getRegionNameAsString() + ", retrying", ioe);
1843         }
1844       }
1845     } catch (InterruptedException e) {
1846       Thread.currentThread().interrupt();
1847       LOG.info("Interrupted accessing hbase:meta", e);
1848     }
1849     // Call is interrupted or server is stopped.
1850     return regionStates.isServerDeadAndNotProcessed(sn);
1851   }
1852 
1853   /**
1854    * Caller must hold lock on the passed <code>state</code> object.
1855    * @param state
1856    * @param setOfflineInZK
1857    * @param forceNewPlan
1858    */
1859   private void assign(RegionState state,
1860       final boolean setOfflineInZK, final boolean forceNewPlan) {
1861     long startTime = EnvironmentEdgeManager.currentTimeMillis();
1862     try {
1863       Configuration conf = server.getConfiguration();
1864       RegionState currentState = state;
1865       int versionOfOfflineNode = -1;
1866       RegionPlan plan = null;
1867       long maxWaitTime = -1;
1868       HRegionInfo region = state.getRegion();
1869       RegionOpeningState regionOpenState;
1870       Throwable previousException = null;
1871       for (int i = 1; i <= maximumAttempts; i++) {
1872         if (server.isStopped() || server.isAborted()) {
1873           LOG.info("Skip assigning " + region.getRegionNameAsString()
1874             + ", the server is stopped/aborted");
1875           return;
1876         }
1877         if (plan == null) { // Get a server for the region at first
1878           try {
1879             plan = getRegionPlan(region, forceNewPlan);
1880           } catch (HBaseIOException e) {
1881             LOG.warn("Failed to get region plan", e);
1882           }
1883         }
1884         if (plan == null) {
1885           LOG.warn("Unable to determine a plan to assign " + region);
1886           if (tomActivated){
1887             this.timeoutMonitor.setAllRegionServersOffline(true);
1888           } else {
1889             if (region.isMetaRegion()) {
1890               try {
1891                 Thread.sleep(this.sleepTimeBeforeRetryingMetaAssignment);
1892                 if (i == maximumAttempts) i = 1;
1893                 continue;
1894               } catch (InterruptedException e) {
1895                 LOG.error("Got exception while waiting for hbase:meta assignment");
1896                 Thread.currentThread().interrupt();
1897               }
1898             }
1899             regionStates.updateRegionState(region, State.FAILED_OPEN);
1900           }
1901           return;
1902         }
1903         if (setOfflineInZK && versionOfOfflineNode == -1) {
1904           // get the version of the znode after setting it to OFFLINE.
1905           // versionOfOfflineNode will be -1 if the znode was not set to OFFLINE
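          // The OFFLINE znode is created at most once per plan; versionOfOfflineNode
          // is only reset to -1 further below when we switch to a plan with a
          // different destination.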
1906           versionOfOfflineNode = setOfflineInZooKeeper(currentState, plan.getDestination());
1907           if (versionOfOfflineNode != -1) {
1908             if (isDisabledorDisablingRegionInRIT(region)) {
1909               return;
1910             }
1911             // In case of assignment from EnableTableHandler, the table state is ENABLING; anyhow,
1912             // EnableTableHandler will set ENABLED after assigning all the table regions. If we
1913             // set ENABLED directly here, the client API may think the table is already enabled.
1914             // But when all the regions were added directly into hbase:meta and assignRegion is
1915             // called, we do need to make the table ENABLED. In that case the table will be in
1916             // neither ENABLING nor ENABLED state, which is what the check below tests.
1917             TableName tableName = region.getTable();
1918             if (!zkTable.isEnablingTable(tableName) && !zkTable.isEnabledTable(tableName)) {
1919               LOG.debug("Setting table " + tableName + " to ENABLED state.");
1920               setEnabledTable(tableName);
1921             }
1922           }
1923         }
1924         if (setOfflineInZK && versionOfOfflineNode == -1) {
1925           LOG.info("Unable to set offline in ZooKeeper to assign " + region);
1926           // Setting offline in ZK must have failed due to ZK racing or some
1927           // exception which may make the server abort. If it is ZK racing,
1928           // we should retry since we already reset the region state;
1929           // any existing (re)assignment will fail anyway.
1930           if (!server.isAborted()) {
1931             continue;
1932           }
1933         }
1934         LOG.info("Assigning " + region.getRegionNameAsString() +
1935             " to " + plan.getDestination().toString());
1936         // Transition RegionState to PENDING_OPEN
1937         currentState = regionStates.updateRegionState(region,
1938           State.PENDING_OPEN, plan.getDestination());
1939 
1940         boolean needNewPlan;
1941         final String assignMsg = "Failed assignment of " + region.getRegionNameAsString() +
1942             " to " + plan.getDestination();
1943         try {
1944           List<ServerName> favoredNodes = ServerName.EMPTY_SERVER_LIST;
1945           if (this.shouldAssignRegionsWithFavoredNodes) {
1946             favoredNodes = ((FavoredNodeLoadBalancer)this.balancer).getFavoredNodes(region);
1947           }
1948           regionOpenState = serverManager.sendRegionOpen(
1949               plan.getDestination(), region, versionOfOfflineNode, favoredNodes);
1950 
1951           if (regionOpenState == RegionOpeningState.FAILED_OPENING) {
1952             // Failed opening this region, looping again on a new server.
1953             needNewPlan = true;
1954             LOG.warn(assignMsg + ", regionserver says 'FAILED_OPENING', " +
1955                 " trying to assign elsewhere instead; " +
1956                 "try=" + i + " of " + this.maximumAttempts);
1957           } else {
1958             // we're done
1959             if (regionOpenState == RegionOpeningState.ALREADY_OPENED) {
1960               processAlreadyOpenedRegion(region, plan.getDestination());
1961             }
1962             return;
1963           }
1964 
1965         } catch (Throwable t) {
1966           if (t instanceof RemoteException) {
1967             t = ((RemoteException) t).unwrapRemoteException();
1968           }
1969           previousException = t;
1970 
1971           // Should we wait a little before retrying? If the server is starting, yes.
1972           // If the region is already in transition, yes as well: we want to be sure that
1973           // the region will get opened, but we don't want a double assignment.
1974           boolean hold = (t instanceof RegionAlreadyInTransitionException ||
1975               t instanceof ServerNotRunningYetException);
1976 
1977           // In case socket is timed out and the region server is still online,
1978           // the openRegion RPC could have been accepted by the server and
1979           // just the response didn't go through.  So we will retry to
1980           // open the region on the same server to avoid possible
1981           // double assignment.
1982           boolean retry = !hold && (t instanceof java.net.SocketTimeoutException
1983               && this.serverManager.isServerOnline(plan.getDestination()));
1984 
1985 
1986           if (hold) {
1987             LOG.warn(assignMsg + ", waiting a little before trying on the same region server " +
1988               "try=" + i + " of " + this.maximumAttempts, t);
1989 
1990             if (maxWaitTime < 0) {
1991               if (t instanceof RegionAlreadyInTransitionException) {
1992                 maxWaitTime = EnvironmentEdgeManager.currentTimeMillis()
1993                   + this.server.getConfiguration().getLong(ALREADY_IN_TRANSITION_WAITTIME,
1994                     DEFAULT_ALREADY_IN_TRANSITION_WAITTIME);
1995               } else {
1996                 maxWaitTime = this.server.getConfiguration().
1997                   getLong("hbase.regionserver.rpc.startup.waittime", 60000);
1998               }
1999             }
2000             try {
2001               needNewPlan = false;
2002               long now = EnvironmentEdgeManager.currentTimeMillis();
2003               if (now < maxWaitTime) {
2004                 LOG.debug("Server is not yet up or region is already in transition; "
2005                   + "waiting up to " + (maxWaitTime - now) + "ms", t);
2006                 Thread.sleep(100);
2007                 i--; // reset the try count
2008               } else if (!(t instanceof RegionAlreadyInTransitionException)) {
2009                 LOG.debug("Server is not up for a while; try a new one", t);
2010                 needNewPlan = true;
2011               }
2012             } catch (InterruptedException ie) {
2013               LOG.warn("Failed to assign "
2014                   + region.getRegionNameAsString() + " since interrupted", ie);
2015               Thread.currentThread().interrupt();
2016               if (!tomActivated) {
2017                 regionStates.updateRegionState(region, State.FAILED_OPEN);
2018               }
2019               return;
2020             }
2021           } else if (retry) {
2022             needNewPlan = false;
2023             LOG.warn(assignMsg + ", trying to assign to the same region server " +
2024                 "try=" + i + " of " + this.maximumAttempts, t);
2025           } else {
2026             needNewPlan = true;
2027             LOG.warn(assignMsg + ", trying to assign elsewhere instead;" +
2028                 " try=" + i + " of " + this.maximumAttempts, t);
2029           }
2030         }
2031 
2032         if (i == this.maximumAttempts) {
2033           // Don't reset the region state or get a new plan any more.
2034           // This is the last try.
2035           continue;
2036         }
2037 
2038         // If the region opened on the destination of the present plan, reassigning to a new
2039         // RS may cause double assignments. In case of RegionAlreadyInTransitionException,
2040         // we keep reassigning to the same RS.
2041         if (needNewPlan) {
2042           // Force a new plan and reassign. Will return null if no servers.
2043           // The new plan could be the same as the existing plan since we don't
2044           // exclude the server of the original plan, which should not be
2045           // excluded since it could be the only server up now.
2046           RegionPlan newPlan = null;
2047           try {
2048             newPlan = getRegionPlan(region, true);
2049           } catch (HBaseIOException e) {
2050             LOG.warn("Failed to get region plan", e);
2051           }
2052           if (newPlan == null) {
2053             if (tomActivated) {
2054               this.timeoutMonitor.setAllRegionServersOffline(true);
2055             } else {
2056               regionStates.updateRegionState(region, State.FAILED_OPEN);
2057             }
2058             LOG.warn("Unable to find a viable location to assign region " +
2059                 region.getRegionNameAsString());
2060             return;
2061           }
2062 
2063           if (plan != newPlan && !plan.getDestination().equals(newPlan.getDestination())) {
2064             // Clean out the plan we failed to execute and that doesn't look like it'll
2065             // succeed anyway; we need a new plan!
2066             // Transition back to OFFLINE
2067             currentState = regionStates.updateRegionState(region, State.OFFLINE);
2068             versionOfOfflineNode = -1;
2069             plan = newPlan;
2070           } else if(plan.getDestination().equals(newPlan.getDestination()) &&
2071               previousException instanceof FailedServerException) {
2072             try {
2073               LOG.info("Trying to re-assign " + region.getRegionNameAsString() + 
2074                 " to the same failed server.");
2075               Thread.sleep(1 + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, 
2076                 RpcClient.FAILED_SERVER_EXPIRY_DEFAULT));
2077             } catch (InterruptedException ie) {
2078               LOG.warn("Failed to assign "
2079                   + region.getRegionNameAsString() + " since interrupted", ie);
2080               Thread.currentThread().interrupt();
2081               if (!tomActivated) {
2082                 regionStates.updateRegionState(region, State.FAILED_OPEN);
2083               }
2084               return;
2085             }
2086           }
2087         }
2088       }
2089       // Run out of attempts
2090       if (!tomActivated) {
2091         regionStates.updateRegionState(region, State.FAILED_OPEN);
2092       }
2093     } finally {
2094       metricsAssignmentManager.updateAssignmentTime(EnvironmentEdgeManager.currentTimeMillis() - startTime);
2095     }
2096   }
2097 
2098   private void processAlreadyOpenedRegion(HRegionInfo region, ServerName sn) {
2099     // Remove region from in-memory transition and its unassigned node from ZK.
2100     // This happens, for example, while trying to enable a table whose regions
2101     // were already open.
2102     LOG.debug("ALREADY_OPENED " + region.getRegionNameAsString()
2103       + " to " + sn);
2104     String encodedName = region.getEncodedName();
2105     deleteNodeInStates(encodedName, "offline", sn, EventType.M_ZK_REGION_OFFLINE);
2106     regionStates.regionOnline(region, sn);
2107   }
2108 
2109   private boolean isDisabledorDisablingRegionInRIT(final HRegionInfo region) {
2110     TableName tableName = region.getTable();
2111     boolean disabled = this.zkTable.isDisabledTable(tableName);
2112     if (disabled || this.zkTable.isDisablingTable(tableName)) {
2113       LOG.info("Table " + tableName + (disabled ? " disabled;" : " disabling;") +
2114         " skipping assign of " + region.getRegionNameAsString());
2115       offlineDisabledRegion(region);
2116       return true;
2117     }
2118     return false;
2119   }
2120 
2121   /**
2122    * Set region as OFFLINED up in zookeeper
2123    *
2124    * @param state
2125    * @return the version of the offline node if setting of the OFFLINE node was
2126    *         successful, -1 otherwise.
2127    */
2128   private int setOfflineInZooKeeper(final RegionState state, final ServerName destination) {
2129     if (!state.isClosed() && !state.isOffline()) {
2130       String msg = "Unexpected state: " + state + ". Cannot transition it to OFFLINE.";
2131       this.server.abort(msg, new IllegalStateException(msg));
2132       return -1;
2133     }
2134     regionStates.updateRegionState(state.getRegion(), State.OFFLINE);
2135     int versionOfOfflineNode;
2136     try {
2137       // get the version after setting the znode to OFFLINE
2138       versionOfOfflineNode = ZKAssign.createOrForceNodeOffline(watcher,
2139         state.getRegion(), destination);
2140       if (versionOfOfflineNode == -1) {
2141         LOG.warn("Attempted to create/force node into OFFLINE state before "
2142             + "completing assignment but failed to do so for " + state);
2143         return -1;
2144       }
2145     } catch (KeeperException e) {
2146       server.abort("Unexpected ZK exception creating/setting node OFFLINE", e);
2147       return -1;
2148     }
2149     return versionOfOfflineNode;
2150   }
2151 
2152   /**
2153    * @param region the region to assign
2154    * @return Plan for passed <code>region</code> (if none exists currently, it creates one;
2155    * if there are no servers to assign to, it returns null).
2156    */
2157   private RegionPlan getRegionPlan(final HRegionInfo region,
2158       final boolean forceNewPlan)  throws HBaseIOException {
2159     return getRegionPlan(region, null, forceNewPlan);
2160   }
2161 
2162   /**
2163    * @param region the region to assign
2164    * @param serverToExclude Server to exclude (we know it's bad). Pass null if
2165    * all servers are thought to be assignable.
2166    * @param forceNewPlan If true, then if an existing plan exists, a new plan
2167    * will be generated.
2168    * @return Plan for passed <code>region</code> (if none exists currently, it creates one;
2169    * if there are no servers to assign to, it returns null).
2170    */
2171   private RegionPlan getRegionPlan(final HRegionInfo region,
2172       final ServerName serverToExclude, final boolean forceNewPlan) throws HBaseIOException {
2173     // Pickup existing plan or make a new one
2174     final String encodedName = region.getEncodedName();
2175     final List<ServerName> destServers =
2176       serverManager.createDestinationServersList(serverToExclude);
2177 
2178     if (destServers.isEmpty()){
2179       LOG.warn("Can't move " + encodedName +
2180         ", there is no destination server available.");
2181       return null;
2182     }
2183 
2184     RegionPlan randomPlan = null;
2185     boolean newPlan = false;
2186     RegionPlan existingPlan;
2187 
2188     synchronized (this.regionPlans) {
2189       existingPlan = this.regionPlans.get(encodedName);
2190 
2191       if (existingPlan != null && existingPlan.getDestination() != null) {
2192         LOG.debug("Found an existing plan for " + region.getRegionNameAsString()
2193           + " destination server is " + existingPlan.getDestination() +
2194             " accepted as a dest server = " + destServers.contains(existingPlan.getDestination()));
2195       }
2196 
2197       if (forceNewPlan
2198           || existingPlan == null
2199           || existingPlan.getDestination() == null
2200           || !destServers.contains(existingPlan.getDestination())) {
2201         newPlan = true;
2202         randomPlan = new RegionPlan(region, null,
2203             balancer.randomAssignment(region, destServers));
2204         if (!region.isMetaTable() && shouldAssignRegionsWithFavoredNodes) {
2205           List<HRegionInfo> regions = new ArrayList<HRegionInfo>(1);
2206           regions.add(region);
2207           try {
2208             processFavoredNodes(regions);
2209           } catch (IOException ie) {
2210             LOG.warn("Ignoring exception in processFavoredNodes " + ie);
2211           }
2212         }
2213         this.regionPlans.put(encodedName, randomPlan);
2214       }
2215     }
2216 
2217     if (newPlan) {
2218       if (randomPlan.getDestination() == null) {
2219         LOG.warn("Can't find a destination for " + encodedName);
2220         return null;
2221       }
2222       LOG.debug("No previous transition plan found (or ignoring " +
2223         "an existing plan) for " + region.getRegionNameAsString() +
2224         "; generated random plan=" + randomPlan + "; " +
2225         serverManager.countOfRegionServers() +
2226                " (online=" + serverManager.getOnlineServers().size() +
2227                ", available=" + destServers.size() + ") available servers" +
2228                ", forceNewPlan=" + forceNewPlan);
2229       return randomPlan;
2230     }
2231     LOG.debug("Using pre-existing plan for " +
2232       region.getRegionNameAsString() + "; plan=" + existingPlan);
2233     return existingPlan;
2234   }
2235 
2236   /**
2237    * Unassigns the specified region.
2238    * <p>
2239    * Updates the RegionState and sends the CLOSE RPC unless region is being
2240    * split by regionserver; then the unassign fails (silently) because we
2241    * presume the region being unassigned no longer exists (it's been split out
2242    * of existence). TODO: What to do if split fails and is rolled back and
2243    * parent is revivified?
2244    * <p>
2245    * If a RegionPlan is already set, it will remain.
2246    *
2247    * @param region the region to be unassigned
2248    */
2249   public void unassign(HRegionInfo region) {
2250     unassign(region, false);
2251   }
2252 
2253 
2254   /**
2255    * Unassigns the specified region.
2256    * <p>
2257    * Updates the RegionState and sends the CLOSE RPC unless region is being
2258    * split by regionserver; then the unassign fails (silently) because we
2259    * presume the region being unassigned no longer exists (it's been split out
2260    * of existence). TODO: What to do if split fails and is rolled back and
2261    * parent is revivified?
2262    * <p>
2263    * If a RegionPlan is already set, it will remain.
2264    *
2265    * @param region the region to be unassigned
2266    * @param force if region should be closed even if already closing
2267    */
2268   public void unassign(HRegionInfo region, boolean force, ServerName dest) {
2269     // TODO: Method needs refactoring.  Ugly buried returns throughout.  Beware!
2270     LOG.debug("Starting unassign of " + region.getRegionNameAsString()
2271       + " (offlining), current state: " + regionStates.getRegionState(region));
2272 
2273     String encodedName = region.getEncodedName();
2274     // Grab the state of this region and synchronize on it
2275     int versionOfClosingNode = -1;
2276     // We need a lock here as we're going to do a put later and we don't want
2277     // multiple state creations.
2278     ReentrantLock lock = locker.acquireLock(encodedName);
2279     RegionState state = regionStates.getRegionTransitionState(encodedName);
2280     boolean reassign = true;
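    // If 'reassign' is still true and the region ends up offline, the finally
    // block below re-assigns it; it is set to false when the unassign never
    // really started or when the region has been split/merged.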
2281     try {
2282       if (state == null) {
2283         // Region is not in transition.
2284         // We can unassign it only if it's not SPLIT/MERGED.
2285         state = regionStates.getRegionState(encodedName);
2286         if (state != null && state.isUnassignable()) {
2287           LOG.info("Attempting to unassign " + state + ", ignored");
2288           // Offline region will be reassigned below
2289           return;
2290         }
2291         // Create the znode in CLOSING state
2292         try {
2293           if (state == null || state.getServerName() == null) {
2294             // We don't know where the region is, offline it.
2295             // No need to send CLOSE RPC
2296             LOG.warn("Attempting to unassign a region not in RegionStates: "
2297               + region.getRegionNameAsString() + ", offlined");
2298             regionOffline(region);
2299             return;
2300           }
2301           versionOfClosingNode = ZKAssign.createNodeClosing(
2302             watcher, region, state.getServerName());
2303           if (versionOfClosingNode == -1) {
2304             LOG.info("Attempting to unassign " +
2305               region.getRegionNameAsString() + " but ZK closing node "
2306               + "can't be created.");
2307             reassign = false; // not unassigned at all
2308             return;
2309           }
2310         } catch (KeeperException e) {
2311           if (e instanceof NodeExistsException) {
2312             // Handle race between master initiated close and regionserver
2313             // orchestrated splitting. See if existing node is in a
2314             // SPLITTING or SPLIT state.  If so, the regionserver started
2315             // an op on node before we could get our CLOSING in.  Deal.
2316             NodeExistsException nee = (NodeExistsException)e;
2317             String path = nee.getPath();
2318             try {
2319               if (isSplitOrSplittingOrMergedOrMerging(path)) {
2320                 LOG.debug(path + " is SPLIT or SPLITTING or MERGED or MERGING; " +
2321                   "skipping unassign because region no longer exists -- it's split or merged");
2322                 reassign = false; // no need to reassign for split/merged region
2323                 return;
2324               }
2325             } catch (KeeperException.NoNodeException ke) {
2326               LOG.warn("Failed getData on SPLITTING/SPLIT at " + path +
2327                 "; presuming split and that the region to unassign, " +
2328                 encodedName + ", no longer exists -- confirm", ke);
2329               return;
2330             } catch (KeeperException ke) {
2331               LOG.error("Unexpected zk state", ke);
2332             } catch (DeserializationException de) {
2333               LOG.error("Failed parse", de);
2334             }
2335           }
2336           // If we get here, we don't understand what's going on -- abort.
2337           server.abort("Unexpected ZK exception creating node CLOSING", e);
2338           reassign = false; // heading out already
2339           return;
2340         }
2341         state = regionStates.updateRegionState(region, State.PENDING_CLOSE);
2342       } else if (state.isFailedOpen()) {
2343         // The region is not open yet
2344         regionOffline(region);
2345         return;
2346       } else if (force && state.isPendingCloseOrClosing()) {
2347         LOG.debug("Attempting to unassign " + region.getRegionNameAsString() +
2348           " which is already " + state.getState()  +
2349           " but forcing to send a CLOSE RPC again ");
2350         if (state.isFailedClose()) {
2351           state = regionStates.updateRegionState(region, State.PENDING_CLOSE);
2352         }
2353         state.updateTimestampToNow();
2354       } else {
2355         LOG.debug("Attempting to unassign " +
2356           region.getRegionNameAsString() + " but it is " +
2357           "already in transition (" + state.getState() + ", force=" + force + ")");
2358         return;
2359       }
2360 
2361       unassign(region, state, versionOfClosingNode, dest, true, null);
2362     } finally {
2363       lock.unlock();
2364 
2365       // Region is expected to be reassigned afterwards
2366       if (reassign && regionStates.isRegionOffline(region)) {
2367         assign(region, true);
2368       }
2369     }
2370   }
2371 
2372   public void unassign(HRegionInfo region, boolean force){
2373      unassign(region, force, null);
2374   }
2375 
2376   /**
2377    * @param region regioninfo of znode to be deleted.
2378    */
2379   public void deleteClosingOrClosedNode(HRegionInfo region, ServerName sn) {
2380     String encodedName = region.getEncodedName();
2381     deleteNodeInStates(encodedName, "closing", sn, EventType.M_ZK_REGION_CLOSING,
2382       EventType.RS_ZK_REGION_CLOSED);
2383   }
2384 
2385   /**
2386    * @param path
2387    * @return True if znode is in SPLIT or SPLITTING or MERGED or MERGING state.
2388    * @throws KeeperException Can happen if the znode went away in meantime.
2389    * @throws DeserializationException
2390    */
2391   private boolean isSplitOrSplittingOrMergedOrMerging(final String path)
2392       throws KeeperException, DeserializationException {
2393     boolean result = false;
2394     // This may fail if the SPLIT or SPLITTING or MERGED or MERGING znode gets
2395     // cleaned up before we can get data from it.
2396     byte [] data = ZKAssign.getData(watcher, path);
2397     if (data == null) {
2398       LOG.info("Node " + path + " is gone");
2399       return false;
2400     }
2401     RegionTransition rt = RegionTransition.parseFrom(data);
2402     switch (rt.getEventType()) {
2403     case RS_ZK_REQUEST_REGION_SPLIT:
2404     case RS_ZK_REGION_SPLIT:
2405     case RS_ZK_REGION_SPLITTING:
2406     case RS_ZK_REQUEST_REGION_MERGE:
2407     case RS_ZK_REGION_MERGED:
2408     case RS_ZK_REGION_MERGING:
2409       result = true;
2410       break;
2411     default:
2412       LOG.info("Node " + path + " is in " + rt.getEventType());
2413       break;
2414     }
2415     return result;
2416   }
2417 
2418   /**
2419    * Used by unit tests. Return the number of regions opened so far in the life
2420    * of the master. Increases by one every time the master opens a region.
2421    * @return the counter value of the number of regions opened so far
2422    */
2423   public int getNumRegionsOpened() {
2424     return numRegionsOpened.get();
2425   }
2426 
2427   /**
2428    * Waits until the specified region has completed assignment.
2429    * <p>
2430    * If the region is already assigned, returns immediately.  Otherwise, the method
2431    * blocks until the region is assigned.
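   * <p>Illustrative usage (hypothetical caller and variable names):
   * <pre>
   *   if (!assignmentManager.waitForAssignment(hri)) {
   *     // the open failed or the master is stopping
   *   }
   * </pre>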
2432    * @param regionInfo region to wait on assignment for
2433    * @throws InterruptedException
2434    */
2435   public boolean waitForAssignment(HRegionInfo regionInfo)
2436       throws InterruptedException {
2437     while (!regionStates.isRegionOnline(regionInfo)) {
2438       if (regionStates.isRegionInState(regionInfo, State.FAILED_OPEN)
2439           || this.server.isStopped()) {
2440         return false;
2441       }
2442 
2443       // We should receive a notification, but it's
2444       //  better to have a timeout to recheck the condition here:
2445       //  it lowers the impact of a race condition if any
2446       regionStates.waitForUpdate(100);
2447     }
2448     return true;
2449   }
2450 
2451   /**
2452    * Assigns the hbase:meta region.
2453    * <p>
2454    * Assumes that hbase:meta is currently closed and is not being actively served by
2455    * any RegionServer.
2456    * <p>
2457    * Forcibly unsets the current meta region location in ZooKeeper and assigns
2458    * hbase:meta to a random RegionServer.
2459    * @throws KeeperException
2460    */
2461   public void assignMeta() throws KeeperException {
2462     MetaRegionTracker.deleteMetaLocation(this.watcher);
2463     assign(HRegionInfo.FIRST_META_REGIONINFO, true);
2464   }
2465 
2466   /**
2467    * Assigns specified regions retaining assignments, if any.
2468    * <p>
2469    * This is a synchronous call and will return once every region has been
2470    * assigned.  If anything fails, an exception is thrown.
2471    * @throws InterruptedException
2472    * @throws IOException
2473    */
2474   public void assign(Map<HRegionInfo, ServerName> regions)
2475         throws IOException, InterruptedException {
2476     if (regions == null || regions.isEmpty()) {
2477       return;
2478     }
2479     List<ServerName> servers = serverManager.createDestinationServersList();
2480     if (servers == null || servers.isEmpty()) {
2481       throw new IOException("Found no destination server to assign region(s)");
2482     }
2483 
2484     // Reuse existing assignment info
2485     Map<ServerName, List<HRegionInfo>> bulkPlan =
2486       balancer.retainAssignment(regions, servers);
2487 
2488     assign(regions.size(), servers.size(),
2489       "retainAssignment=true", bulkPlan);
2490   }
2491 
2492   /**
2493    * Assigns specified regions round robin, if any.
2494    * <p>
2495    * This is a synchronous call and will return once every region has been
2496    * assigned.  If anything fails, an exception is thrown.
2497    * @throws InterruptedException
2498    * @throws IOException
2499    */
2500   public void assign(List<HRegionInfo> regions)
2501         throws IOException, InterruptedException {
2502     if (regions == null || regions.isEmpty()) {
2503       return;
2504     }
2505 
2506     List<ServerName> servers = serverManager.createDestinationServersList();
2507     if (servers == null || servers.isEmpty()) {
2508       throw new IOException("Found no destination server to assign region(s)");
2509     }
2510 
2511     // Generate a round-robin bulk assignment plan
2512     Map<ServerName, List<HRegionInfo>> bulkPlan
2513       = balancer.roundRobinAssignment(regions, servers);
2514     processFavoredNodes(regions);
2515 
2516     assign(regions.size(), servers.size(),
2517       "round-robin=true", bulkPlan);
2518   }
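       // A sketch contrasting the two bulk overloads above, assuming 'am' is this
       // AssignmentManager, 'regionToServer' is a Map<HRegionInfo, ServerName> recovered
       // from hbase:meta, and 'regionList' is a List<HRegionInfo> (illustrative names):
       //
       //   am.assign(regionToServer);   // try to keep the previous region-to-server mapping
       //   am.assign(regionList);       // spread the regions round-robin instead
       //
       // Both calls block until every region is assigned and throw IOException when no
       // destination server is available.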
2519 
2520   private void assign(int regions, int totalServers,
2521       String message, Map<ServerName, List<HRegionInfo>> bulkPlan)
2522           throws InterruptedException, IOException {
2523 
2524     int servers = bulkPlan.size();
2525     if (servers == 1 || (regions < bulkAssignThresholdRegions
2526         && servers < bulkAssignThresholdServers)) {
2527 
2528       // Not using bulk assignment.  This could be more efficient in a small
2529       // cluster, especially a mini cluster used for testing, so that tests won't time out
2530       if (LOG.isTraceEnabled()) {
2531         LOG.trace("Not using bulk assignment since we are assigning only " + regions +
2532           " region(s) to " + servers + " server(s)");
2533       }
2534       for (Map.Entry<ServerName, List<HRegionInfo>> plan: bulkPlan.entrySet()) {
2535         if (!assign(plan.getKey(), plan.getValue())) {
2536           for (HRegionInfo region: plan.getValue()) {
2537             if (!regionStates.isRegionOnline(region)) {
2538               invokeAssign(region);
2539             }
2540           }
2541         }
2542       }
2543     } else {
2544       LOG.info("Bulk assigning " + regions + " region(s) across "
2545         + totalServers + " server(s), " + message);
2546 
2547       // Use a fixed-count thread pool for assigning.
2548       BulkAssigner ba = new GeneralBulkAssigner(
2549         this.server, bulkPlan, this, bulkAssignWaitTillAllAssigned);
2550       ba.bulkAssign();
2551       LOG.info("Bulk assigning done");
2552     }
2553   }
2554 
2555   /**
2556    * Assigns all user regions, if any exist.  Used during cluster startup.
2557    * <p>
2558    * This is a synchronous call and will return once every region has been
2559    * assigned.  If anything fails, an exception is thrown and the cluster
2560    * should be shutdown.
2561    * @throws InterruptedException
2562    * @throws IOException
2563    * @throws KeeperException
2564    */
2565   private void assignAllUserRegions()
2566       throws IOException, InterruptedException, KeeperException {
2567     // Cleanup any existing ZK nodes and start watching
2568     ZKAssign.deleteAllNodes(watcher);
2569     ZKUtil.listChildrenAndWatchForNewChildren(this.watcher,
2570       this.watcher.assignmentZNode);
2571     failoverCleanupDone();
2572 
2573     // Skip assignment for regions of tables in DISABLING state because during a clean cluster startup
2574     // no RS is alive and the regions map has no information about them.
2575     // See HBASE-6281.
2576     Set<TableName> disabledOrDisablingOrEnabling = ZKTable.getDisabledOrDisablingTables(watcher);
2577     disabledOrDisablingOrEnabling.addAll(ZKTable.getEnablingTables(watcher));
2578     // Scan hbase:meta for all user regions, skipping any disabled tables
2579     Map<HRegionInfo, ServerName> allRegions;
2580     SnapshotOfRegionAssignmentFromMeta snapshotOfRegionAssignment =
2581        new SnapshotOfRegionAssignmentFromMeta(catalogTracker, disabledOrDisablingOrEnabling, true);
2582     snapshotOfRegionAssignment.initialize();
2583     allRegions = snapshotOfRegionAssignment.getRegionToRegionServerMap();
2584     if (allRegions == null || allRegions.isEmpty()) return;
2585 
2586     // Determine what type of assignment to do on startup
2587     boolean retainAssignment = server.getConfiguration().
2588       getBoolean("hbase.master.startup.retainassign", true);
2589 
2590     if (retainAssignment) {
2591       assign(allRegions);
2592     } else {
2593       List<HRegionInfo> regions = new ArrayList<HRegionInfo>(allRegions.keySet());
2594       assign(regions);
2595     }
2596 
2597     for (HRegionInfo hri : allRegions.keySet()) {
2598       TableName tableName = hri.getTable();
2599       if (!zkTable.isEnabledTable(tableName)) {
2600         setEnabledTable(tableName);
2601       }
2602     }
2603   }
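       // The startup behaviour above is controlled by a single flag; setting it to false
       // makes the master redistribute regions round-robin instead of retaining their old
       // locations (a configuration sketch, assuming a Configuration instance 'conf'):
       //
       //   conf.setBoolean("hbase.master.startup.retainassign", false);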
2604 
2605   /**
2606    * Wait until no regions in transition.
2607    * @param timeout How long to wait.
2608    * @return True if nothing in regions in transition.
2609    * @throws InterruptedException
2610    */
2611   boolean waitUntilNoRegionsInTransition(final long timeout)
2612       throws InterruptedException {
2613     // Blocks until there are no regions in transition. It is possible that
2614     // there are regions in transition immediately after this returns, but it
2615     // guarantees that if it returns without an exception there was a period
2616     // of time with no regions in transition from the point-of-view of the
2617     // in-memory state of the Master; the wait is bounded by the supplied
2618     // timeout.
2619     final long endTime = System.currentTimeMillis() + timeout;
2620 
2621     while (!this.server.isStopped() && regionStates.isRegionsInTransition()
2622         && endTime > System.currentTimeMillis()) {
2623       regionStates.waitForUpdate(100);
2624     }
2625 
2626     return !regionStates.isRegionsInTransition();
2627   }
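       // Illustrative call, assuming 'am' is this AssignmentManager: wait up to a minute
       // for the in-memory RIT map to drain before proceeding.
       //
       //   if (!am.waitUntilNoRegionsInTransition(60 * 1000)) {
       //     LOG.warn("Regions still in transition after 60 seconds");
       //   }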
2628 
2629   /**
2630    * Rebuild the list of user regions and assignment information.
2631    * <p>
2632    * Returns a map of servers that are not found to be online and the regions
2633    * they were hosting.
2634    * @return map of servers not online to their assigned regions, as stored
2635    *         in META
2636    * @throws IOException
2637    */
2638   Map<ServerName, List<HRegionInfo>> rebuildUserRegions() throws IOException, KeeperException {
2639     Set<TableName> enablingTables = ZKTable.getEnablingTables(watcher);
2640     Set<TableName> disabledOrEnablingTables = ZKTable.getDisabledTables(watcher);
2641     disabledOrEnablingTables.addAll(enablingTables);
2642     Set<TableName> disabledOrDisablingOrEnabling = ZKTable.getDisablingTables(watcher);
2643     disabledOrDisablingOrEnabling.addAll(disabledOrEnablingTables);
2644 
2645     // Region assignment from META
2646     List<Result> results = MetaReader.fullScan(this.catalogTracker);
2647     // Get any new but slow-to-check-in region servers that have joined the cluster
2648     Set<ServerName> onlineServers = serverManager.getOnlineServers().keySet();
2649     // Map of offline servers and their regions to be returned
2650     Map<ServerName, List<HRegionInfo>> offlineServers =
2651       new TreeMap<ServerName, List<HRegionInfo>>();
2652     // Iterate regions in META
2653     for (Result result : results) {
2654       Pair<HRegionInfo, ServerName> region = HRegionInfo.getHRegionInfoAndServerName(result);
2655       if (region == null) continue;
2656       HRegionInfo regionInfo = region.getFirst();
2657       ServerName regionLocation = region.getSecond();
2658       if (regionInfo == null) continue;
2659       regionStates.createRegionState(regionInfo);
2660       if (regionStates.isRegionInState(regionInfo, State.SPLIT)) {
2661         // Split is considered to be completed. If the split znode still
2662         // exists, the region will be put back to SPLITTING state later
2663         LOG.debug("Region " + regionInfo.getRegionNameAsString()
2664            + " split is completed. Hence need not add to regions list");
2665         continue;
2666       }
2667       TableName tableName = regionInfo.getTable();
2668       if (regionLocation == null) {
2669         // regionLocation could be null if createTable didn't finish properly,
2670         // e.g. the HMaster restarted while createTable was in progress.
2671         // Some regions have been added to hbase:meta, but have not been assigned.
2672         // When this happens, the region's table must be in ENABLING state.
2673         // It can't be in ENABLED state as that is set when all regions are
2674         // assigned.
2675         // It can't be in DISABLING state, because DISABLING state transitions
2676         // from ENABLED state when application calls disableTable.
2677         // It can't be in DISABLED state, because DISABLED state transitions
2678         // from DISABLING state.
2679         if (!enablingTables.contains(tableName)) {
2680           LOG.warn("Region " + regionInfo.getEncodedName() +
2681             " has null regionLocation." + " But its table " + tableName +
2682             " isn't in ENABLING state.");
2683         }
2684       } else if (!onlineServers.contains(regionLocation)) {
2685         // Region is located on a server that isn't online
2686         List<HRegionInfo> offlineRegions = offlineServers.get(regionLocation);
2687         if (offlineRegions == null) {
2688           offlineRegions = new ArrayList<HRegionInfo>(1);
2689           offlineServers.put(regionLocation, offlineRegions);
2690         }
2691         offlineRegions.add(regionInfo);
2692         // Enable the table if it is not disabled, disabling or enabling;
2693         // this is used during rolling restarts
2694         if (!disabledOrDisablingOrEnabling.contains(tableName)
2695             && !getZKTable().isEnabledTable(tableName)) {
2696           setEnabledTable(tableName);
2697         }
2698       } else {
2699         // Region is being served and on an active server
2700         // add it only if its table is not disabled or enabling
2701         if (!disabledOrEnablingTables.contains(tableName)) {
2702           regionStates.updateRegionState(regionInfo, State.OPEN, regionLocation);
2703           regionStates.regionOnline(regionInfo, regionLocation);
2704           balancer.regionOnline(regionInfo, regionLocation);
2705         }
2706         // Enable the table if it is not disabled, disabling or enabling;
2707         // this is used during rolling restarts
2708         if (!disabledOrDisablingOrEnabling.contains(tableName)
2709             && !getZKTable().isEnabledTable(tableName)) {
2710           setEnabledTable(tableName);
2711         }
2712       }
2713     }
2714     return offlineServers;
2715   }
2716 
2717   /**
2718    * Recover the tables that were not fully moved to DISABLED state. These
2719    * tables were in DISABLING state when the master restarted/switched.
2720    *
2721    * @throws KeeperException
2722    * @throws TableNotFoundException
2723    * @throws IOException
2724    */
2725   private void recoverTableInDisablingState()
2726       throws KeeperException, TableNotFoundException, IOException {
2727     Set<TableName> disablingTables = ZKTable.getDisablingTables(watcher);
2728     if (disablingTables.size() != 0) {
2729       for (TableName tableName : disablingTables) {
2730         // Recover by calling DisableTableHandler
2731         LOG.info("The table " + tableName
2732             + " is in DISABLING state.  Hence recovering by moving the table"
2733             + " to DISABLED state.");
2734         new DisableTableHandler(this.server, tableName, catalogTracker,
2735             this, tableLockManager, true).prepare().process();
2736       }
2737     }
2738   }
2739 
2740   /**
2741    * Recover the tables that were not fully moved to ENABLED state. These tables
2742    * were in ENABLING state when the master restarted/switched.
2743    *
2744    * @throws KeeperException
2745    * @throws org.apache.hadoop.hbase.TableNotFoundException
2746    * @throws IOException
2747    */
2748   private void recoverTableInEnablingState()
2749       throws KeeperException, TableNotFoundException, IOException {
2750     Set<TableName> enablingTables = ZKTable.getEnablingTables(watcher);
2751     if (enablingTables.size() != 0) {
2752       for (TableName tableName : enablingTables) {
2753         // Recover by calling EnableTableHandler
2754         LOG.info("The table " + tableName
2755             + " is in ENABLING state.  Hence recovering by moving the table"
2756             + " to ENABLED state.");
2757         // enableTable in sync way during master startup,
2758         // no need to invoke coprocessor
2759         EnableTableHandler eth = new EnableTableHandler(this.server, tableName,
2760           catalogTracker, this, tableLockManager, true);
2761         try {
2762           eth.prepare();
2763         } catch (TableNotFoundException e) {
2764           LOG.warn("Table " + tableName + " not found in hbase:meta to recover.");
2765           continue;
2766         }
2767         eth.process();
2768       }
2769     }
2770   }
2771 
2772   /**
2773    * Processes list of dead servers from result of hbase:meta scan and regions in RIT
2774    * <p>
2775    * This is used for failover to recover the lost regions that belonged to
2776    * RegionServers which failed while there was no active master or regions
2777    * that were in RIT.
2778    * <p>
2779    *
2780    *
2781    * @param deadServers
2782    *          The list of dead servers which failed while there was no active
2783    *          master. Can be null.
2784    * @throws IOException
2785    * @throws KeeperException
2786    */
2787   private void processDeadServersAndRecoverLostRegions(
2788       Map<ServerName, List<HRegionInfo>> deadServers)
2789           throws IOException, KeeperException {
2790     if (deadServers != null) {
2791       for (Map.Entry<ServerName, List<HRegionInfo>> server: deadServers.entrySet()) {
2792         ServerName serverName = server.getKey();
2793         // We need to keep such info even if the server is known dead
2794         regionStates.setLastRegionServerOfRegions(serverName, server.getValue());
2795         if (!serverManager.isServerDead(serverName)) {
2796           serverManager.expireServer(serverName); // Let SSH do region re-assign
2797         }
2798       }
2799     }
2800     List<String> nodes = ZKUtil.listChildrenAndWatchForNewChildren(
2801       this.watcher, this.watcher.assignmentZNode);
2802     if (!nodes.isEmpty()) {
2803       for (String encodedRegionName : nodes) {
2804         processRegionInTransition(encodedRegionName, null);
2805       }
2806     }
2807 
2808     // Now we can safely claim failover cleanup completed and enable
2809     // ServerShutdownHandler for further processing. The nodes (above)
2810     // in transition, if any, are for regions not related to those
2811     // dead servers at all, and can be done in parallel to SSH.
2812     failoverCleanupDone();
2813   }
2814 
2815   /**
2816    * Set Regions in transitions metrics.
2817    * This takes an iterator on the RegionInTransition map (CLSM), and is not synchronized.
2818    * This iterator is not fail-fast, which may lead to stale reads; but that's better than
2819    * creating a copy of the map for metrics computation, as this method will be invoked
2820    * at a frequent interval.
2821    */
2822   public void updateRegionsInTransitionMetrics() {
2823     long currentTime = System.currentTimeMillis();
2824     int totalRITs = 0;
2825     int totalRITsOverThreshold = 0;
2826     long oldestRITTime = 0;
2827     int ritThreshold = this.server.getConfiguration().
2828       getInt(HConstants.METRICS_RIT_STUCK_WARNING_THRESHOLD, 60000);
2829     for (RegionState state: regionStates.getRegionsInTransition().values()) {
2830       totalRITs++;
2831       long ritTime = currentTime - state.getStamp();
2832       if (ritTime > ritThreshold) { // more than the threshold
2833         totalRITsOverThreshold++;
2834       }
2835       if (oldestRITTime < ritTime) {
2836         oldestRITTime = ritTime;
2837       }
2838     }
2839     if (this.metricsAssignmentManager != null) {
2840       this.metricsAssignmentManager.updateRITOldestAge(oldestRITTime);
2841       this.metricsAssignmentManager.updateRITCount(totalRITs);
2842       this.metricsAssignmentManager.updateRITCountOverThreshold(totalRITsOverThreshold);
2843     }
2844   }
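       // The stuck-RIT threshold used above comes from the configuration; a deployment
       // could raise it as sketched below, assuming a Configuration instance 'conf'.
       // Regions in transition longer than this are counted by updateRITCountOverThreshold.
       //
       //   conf.setInt(HConstants.METRICS_RIT_STUCK_WARNING_THRESHOLD, 120000); // 2 minutes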
2845 
2846   /**
2847    * @param region Region whose plan we are to clear.
2848    */
2849   void clearRegionPlan(final HRegionInfo region) {
2850     synchronized (this.regionPlans) {
2851       this.regionPlans.remove(region.getEncodedName());
2852     }
2853   }
2854 
2855   /**
2856    * Wait on region to clear regions-in-transition.
2857    * @param hri Region to wait on.
2858    * @throws IOException
2859    */
2860   public void waitOnRegionToClearRegionsInTransition(final HRegionInfo hri)
2861       throws IOException, InterruptedException {
2862     waitOnRegionToClearRegionsInTransition(hri, -1L);
2863   }
2864 
2865   /**
2866    * Wait on region to clear regions-in-transition or time out
2867    * @param hri
2868    * @param timeOut Milliseconds to wait for current region to be out of transition state.
2869    * @return True when a region clears regions-in-transition before timeout otherwise false
2870    * @throws InterruptedException
2871    */
2872   public boolean waitOnRegionToClearRegionsInTransition(final HRegionInfo hri, long timeOut)
2873       throws InterruptedException {
2874     if (!regionStates.isRegionInTransition(hri)) return true;
2875     long end = (timeOut <= 0) ? Long.MAX_VALUE : EnvironmentEdgeManager.currentTimeMillis()
2876         + timeOut;
2877     // There is already a timeout monitor on regions in transition so I
2878     // should not have to have one here too?
2879     LOG.info("Waiting for " + hri.getEncodedName() +
2880         " to leave regions-in-transition, timeOut=" + timeOut + " ms.");
2881     while (!this.server.isStopped() && regionStates.isRegionInTransition(hri)) {
2882       regionStates.waitForUpdate(100);
2883       if (EnvironmentEdgeManager.currentTimeMillis() > end) {
2884         LOG.info("Timed out on waiting for " + hri.getEncodedName() + " to be assigned.");
2885         return false;
2886       }
2887     }
2888     if (this.server.isStopped()) {
2889       LOG.info("Giving up wait on regions in transition because stoppable.isStopped is set");
2890       return false;
2891     }
2892     return true;
2893   }
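       // Illustrative call, assuming 'am' and an HRegionInfo 'hri' are in scope: bound the
       // wait so a caller does not hang forever on a stuck region.
       //
       //   if (!am.waitOnRegionToClearRegionsInTransition(hri, 30000)) {
       //     throw new IOException("Region " + hri.getEncodedName() + " stuck in transition");
       //   }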
2894 
2895   /**
2896    * Update timers for all regions in transition that involve a server listed in
2897    * serversInUpdatingTimer.
2898    */
2899   public class TimerUpdater extends Chore {
2900 
2901     public TimerUpdater(final int period, final Stoppable stopper) {
2902       super("AssignmentTimerUpdater", period, stopper);
2903     }
2904 
2905     @Override
2906     protected void chore() {
2907       Preconditions.checkState(tomActivated);
2908       ServerName serverToUpdateTimer = null;
2909       while (!serversInUpdatingTimer.isEmpty() && !stopper.isStopped()) {
2910         if (serverToUpdateTimer == null) {
2911           serverToUpdateTimer = serversInUpdatingTimer.first();
2912         } else {
2913           serverToUpdateTimer = serversInUpdatingTimer
2914               .higher(serverToUpdateTimer);
2915         }
2916         if (serverToUpdateTimer == null) {
2917           break;
2918         }
2919         updateTimers(serverToUpdateTimer);
2920         serversInUpdatingTimer.remove(serverToUpdateTimer);
2921       }
2922     }
2923   }
2924 
2925   /**
2926    * Monitor to check for time outs on region transition operations
2927    */
2928   public class TimeoutMonitor extends Chore {
2929     private boolean allRegionServersOffline = false;
2930     private ServerManager serverManager;
2931     private final int timeout;
2932 
2933     /**
2934      * Creates a periodic monitor to check for time outs on region transition
2935      * operations.  This will deal with retries if for some reason something
2936      * doesn't happen within the specified timeout.
2937      * @param period How often this monitor runs, in milliseconds
2938      * @param stopper When {@link Stoppable#isStopped()} is true, this thread will
2939      * cleanup and exit cleanly.
2940      * @param timeout Milliseconds a region may stay in transition before the monitor acts on it
2941      */
2942     public TimeoutMonitor(final int period, final Stoppable stopper,
2943         ServerManager serverManager,
2944         final int timeout) {
2945       super("AssignmentTimeoutMonitor", period, stopper);
2946       this.timeout = timeout;
2947       this.serverManager = serverManager;
2948     }
2949 
2950     private synchronized void setAllRegionServersOffline(
2951       boolean allRegionServersOffline) {
2952       this.allRegionServersOffline = allRegionServersOffline;
2953     }
2954 
2955     @Override
2956     protected void chore() {
2957       Preconditions.checkState(tomActivated);
2958       boolean noRSAvailable = this.serverManager.createDestinationServersList().isEmpty();
2959 
2960       // Iterate all regions in transition checking for time outs
2961       long now = System.currentTimeMillis();
2962       // No lock needed, concurrent access is ok: we will be working on a copy, and it's
2963       // valid in Java to copy while another thread is adding/removing items
2964       for (String regionName : regionStates.getRegionsInTransition().keySet()) {
2965         RegionState regionState = regionStates.getRegionTransitionState(regionName);
2966         if (regionState == null) continue;
2967 
2968         if (regionState.getStamp() + timeout <= now) {
2969           // decide on action upon timeout
2970           actOnTimeOut(regionState);
2971         } else if (this.allRegionServersOffline && !noRSAvailable) {
2972           RegionPlan existingPlan = regionPlans.get(regionName);
2973           if (existingPlan == null
2974               || !this.serverManager.isServerOnline(existingPlan
2975                   .getDestination())) {
2976             // if some RSs just came back online, we can start the assignment
2977             // right away
2978             actOnTimeOut(regionState);
2979           }
2980         }
2981       }
2982       setAllRegionServersOffline(noRSAvailable);
2983     }
2984 
2985     private void actOnTimeOut(RegionState regionState) {
2986       HRegionInfo regionInfo = regionState.getRegion();
2987       LOG.info("Regions in transition timed out:  " + regionState);
2988       // Expired! Do a retry.
2989       switch (regionState.getState()) {
2990       case CLOSED:
2991         LOG.info("Region " + regionInfo.getEncodedName()
2992             + " has been CLOSED for too long, waiting on queued "
2993             + "ClosedRegionHandler to run or server shutdown");
2994         // Update our timestamp.
2995         regionState.updateTimestampToNow();
2996         break;
2997       case OFFLINE:
2998         LOG.info("Region has been OFFLINE for too long, " + "reassigning "
2999             + regionInfo.getRegionNameAsString() + " to a random server");
3000         invokeAssign(regionInfo);
3001         break;
3002       case PENDING_OPEN:
3003         LOG.info("Region has been PENDING_OPEN for too "
3004             + "long, reassigning region=" + regionInfo.getRegionNameAsString());
3005         invokeAssign(regionInfo);
3006         break;
3007       case OPENING:
3008         processOpeningState(regionInfo);
3009         break;
3010       case OPEN:
3011         LOG.error("Region has been OPEN for too long, " +
3012             "we don't know where region was opened so can't do anything");
3013         regionState.updateTimestampToNow();
3014         break;
3015 
3016       case PENDING_CLOSE:
3017         LOG.info("Region has been PENDING_CLOSE for too "
3018             + "long, running forced unassign again on region="
3019             + regionInfo.getRegionNameAsString());
3020         invokeUnassign(regionInfo);
3021         break;
3022       case CLOSING:
3023         LOG.info("Region has been CLOSING for too " +
3024           "long, this should eventually complete or the server will " +
3025           "expire, send RPC again");
3026         invokeUnassign(regionInfo);
3027         break;
3028 
3029       case SPLIT:
3030       case SPLITTING:
3031       case FAILED_OPEN:
3032       case FAILED_CLOSE:
3033       case MERGING:
3034         break;
3035 
3036       default:
3037         throw new IllegalStateException("Received event is not valid.");
3038       }
3039     }
3040   }
3041 
3042   private void processOpeningState(HRegionInfo regionInfo) {
3043     LOG.info("Region has been OPENING for too long, reassigning region="
3044         + regionInfo.getRegionNameAsString());
3045     // Should have a ZK node in OPENING state
3046     try {
3047       String node = ZKAssign.getNodeName(watcher, regionInfo.getEncodedName());
3048       Stat stat = new Stat();
3049       byte [] data = ZKAssign.getDataNoWatch(watcher, node, stat);
3050       if (data == null) {
3051         LOG.warn("Data is null, node " + node + " no longer exists");
3052         return;
3053       }
3054       RegionTransition rt = RegionTransition.parseFrom(data);
3055       EventType et = rt.getEventType();
3056       if (et == EventType.RS_ZK_REGION_OPENED) {
3057         LOG.debug("Region has transitioned to OPENED, allowing "
3058             + "watched event handlers to process");
3059         return;
3060       } else if (et != EventType.RS_ZK_REGION_OPENING && et != EventType.RS_ZK_REGION_FAILED_OPEN ) {
3061         LOG.warn("While timing out a region, found ZK node in unexpected state: " + et);
3062         return;
3063       }
3064       invokeAssign(regionInfo);
3065     } catch (KeeperException ke) {
3066       LOG.error("Unexpected ZK exception timing out CLOSING region", ke);
3067     } catch (DeserializationException e) {
3068       LOG.error("Unexpected exception parsing CLOSING region", e);
3069     }
3070   }
3071 
3072   void invokeAssign(HRegionInfo regionInfo) {
3073     threadPoolExecutorService.submit(new AssignCallable(this, regionInfo));
3074   }
3075 
3076   private void invokeUnassign(HRegionInfo regionInfo) {
3077     threadPoolExecutorService.submit(new UnAssignCallable(this, regionInfo));
3078   }
3079 
3080   public boolean isCarryingMeta(ServerName serverName) {
3081     return isCarryingRegion(serverName, HRegionInfo.FIRST_META_REGIONINFO);
3082   }
3083 
3084   /**
3085    * Check if the shutdown server carries the specific region.
3086    * We have a bunch of places that store the region location, and
3087    * those values aren't always consistent because notification is delayed.
3088    * The location from the zookeeper unassigned node has the most recent data,
3089    * but the node could be deleted after the region is opened by the AM.
3090    * The AM's info could be stale if OpenedRegionHandler processing
3091    * hasn't finished by the time the server shutdown occurs.
3092    * @return whether the serverName currently hosts the region
3093    */
3094   private boolean isCarryingRegion(ServerName serverName, HRegionInfo hri) {
3095     RegionTransition rt = null;
3096     try {
3097       byte [] data = ZKAssign.getData(watcher, hri.getEncodedName());
3098       // This call can legitimately return null
3099       rt = data == null? null: RegionTransition.parseFrom(data);
3100     } catch (KeeperException e) {
3101       server.abort("Exception reading unassigned node for region=" + hri.getEncodedName(), e);
3102     } catch (DeserializationException e) {
3103       server.abort("Exception parsing unassigned node for region=" + hri.getEncodedName(), e);
3104     }
3105 
3106     ServerName addressFromZK = rt != null? rt.getServerName():  null;
3107     if (addressFromZK != null) {
3108       // if we get something from ZK, we will use the data
3109       boolean matchZK = addressFromZK.equals(serverName);
3110       LOG.debug("Checking region=" + hri.getRegionNameAsString() + ", zk server=" + addressFromZK +
3111         " current=" + serverName + ", matches=" + matchZK);
3112       return matchZK;
3113     }
3114 
3115     ServerName addressFromAM = regionStates.getRegionServerOfRegion(hri);
3116     boolean matchAM = (addressFromAM != null &&
3117       addressFromAM.equals(serverName));
3118     LOG.debug("based on AM, current region=" + hri.getRegionNameAsString() +
3119       " is on server=" + (addressFromAM != null ? addressFromAM : "null") +
3120       " server being checked: " + serverName);
3121 
3122     return matchAM;
3123   }
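       // A sketch of how a shutdown flow might consult these checks, assuming 'am' is this
       // AssignmentManager and 'sn' the dead server (illustrative, not the actual handler):
       //
       //   if (am.isCarryingMeta(sn)) {
       //     am.assignMeta();  // hbase:meta has to come back before user regions
       //   }
       //   List<HRegionInfo> regionsInTransition = am.processServerShutdown(sn);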
3124 
3125   /**
3126    * Process shutdown server removing any assignments.
3127    * @param sn Server that went down.
3128    * @return list of regions in transition on this server
3129    */
3130   public List<HRegionInfo> processServerShutdown(final ServerName sn) {
3131     // Clean out any existing assignment plans for this server
3132     synchronized (this.regionPlans) {
3133       for (Iterator <Map.Entry<String, RegionPlan>> i =
3134           this.regionPlans.entrySet().iterator(); i.hasNext();) {
3135         Map.Entry<String, RegionPlan> e = i.next();
3136         ServerName otherSn = e.getValue().getDestination();
3137         // The name will be null if the region is planned for a random assign.
3138         if (otherSn != null && otherSn.equals(sn)) {
3139           // Use iterator's remove else we'll get CME
3140           i.remove();
3141         }
3142       }
3143     }
3144     List<HRegionInfo> regions = regionStates.serverOffline(watcher, sn);
3145     for (Iterator<HRegionInfo> it = regions.iterator(); it.hasNext(); ) {
3146       HRegionInfo hri = it.next();
3147       String encodedName = hri.getEncodedName();
3148 
3149       // We need a lock on the region as we could update it
3150       Lock lock = locker.acquireLock(encodedName);
3151       try {
3152         RegionState regionState =
3153           regionStates.getRegionTransitionState(encodedName);
3154         if (regionState == null
3155             || (regionState.getServerName() != null && !regionState.isOnServer(sn))
3156             || !(regionState.isFailedClose() || regionState.isOffline()
3157               || regionState.isPendingOpenOrOpening())) {
3158           LOG.info("Skip " + regionState + " since it is not opening/failed_close"
3159             + " on the dead server any more: " + sn);
3160           it.remove();
3161         } else {
3162           try {
3163             // Delete the ZNode if exists
3164             ZKAssign.deleteNodeFailSilent(watcher, hri);
3165           } catch (KeeperException ke) {
3166             server.abort("Unexpected ZK exception deleting node " + hri, ke);
3167           }
3168           if (zkTable.isDisablingOrDisabledTable(hri.getTable())) {
3169             regionStates.regionOffline(hri);
3170             it.remove();
3171             continue;
3172           }
3173           // Mark the region offline and assign it again by SSH
3174           regionStates.updateRegionState(hri, State.OFFLINE);
3175         }
3176       } finally {
3177         lock.unlock();
3178       }
3179     }
3180     return regions;
3181   }
3182 
3183   /**
3184    * @param plan Plan to execute.
3185    */
3186   public void balance(final RegionPlan plan) {
3187     HRegionInfo hri = plan.getRegionInfo();
3188     TableName tableName = hri.getTable();
3189     if (zkTable.isDisablingOrDisabledTable(tableName)) {
3190       LOG.info("Ignored moving region of disabling/disabled table "
3191         + tableName);
3192       return;
3193     }
3194 
3195     // Move the region only if it's assigned
3196     String encodedName = hri.getEncodedName();
3197     ReentrantLock lock = locker.acquireLock(encodedName);
3198     try {
3199       if (!regionStates.isRegionOnline(hri)) {
3200         RegionState state = regionStates.getRegionState(encodedName);
3201         LOG.info("Ignored moving region not assigned: " + hri + ", "
3202           + (state == null ? "not in region states" : state));
3203         return;
3204       }
3205       synchronized (this.regionPlans) {
3206         this.regionPlans.put(plan.getRegionName(), plan);
3207       }
3208       unassign(hri, false, plan.getDestination());
3209     } finally {
3210       lock.unlock();
3211     }
3212   }
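       // Illustrative move, assuming 'am' is this AssignmentManager, 'hri' a region that is
       // currently online on 'source', 'dest' the target server, and a RegionPlan
       // (region, source, destination) constructor:
       //
       //   am.balance(new RegionPlan(hri, source, dest));
       //
       // The call is a no-op if the table is disabled/disabling or the region is not online.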
3213 
3214   public void stop() {
3215     shutdown(); // Stop executor service, etc
3216     if (tomActivated){
3217       this.timeoutMonitor.interrupt();
3218       this.timerUpdater.interrupt();
3219     }
3220   }
3221 
3222   /**
3223    * Shutdown the threadpool executor service
3224    */
3225   public void shutdown() {
3226     // It's an immediate shutdown, so we're clearing the remaining tasks.
3227     synchronized (zkEventWorkerWaitingList){
3228       zkEventWorkerWaitingList.clear();
3229     }
3230     threadPoolExecutorService.shutdownNow();
3231     zkEventWorkers.shutdownNow();
3232   }
3233 
3234   protected void setEnabledTable(TableName tableName) {
3235     try {
3236       this.zkTable.setEnabledTable(tableName);
3237     } catch (KeeperException e) {
3238       // here we can abort as it is the start up flow
3239       String errorMsg = "Unable to ensure that the table " + tableName
3240           + " will be" + " enabled because of a ZooKeeper issue";
3241       LOG.error(errorMsg);
3242       this.server.abort(errorMsg, e);
3243     }
3244   }
3245 
3246   /**
3247    * Set region as OFFLINED up in zookeeper asynchronously.
3248    * @param state
3249    * @return True if we succeeded, false otherwise (State was incorrect or failed
3250    * updating zk).
3251    */
3252   private boolean asyncSetOfflineInZooKeeper(final RegionState state,
3253       final AsyncCallback.StringCallback cb, final ServerName destination) {
3254     if (!state.isClosed() && !state.isOffline()) {
3255       this.server.abort("Unexpected state trying to OFFLINE; " + state,
3256         new IllegalStateException());
3257       return false;
3258     }
3259     regionStates.updateRegionState(state.getRegion(), State.OFFLINE);
3260     try {
3261       ZKAssign.asyncCreateNodeOffline(watcher, state.getRegion(),
3262         destination, cb, state);
3263     } catch (KeeperException e) {
3264       if (e instanceof NodeExistsException) {
3265         LOG.warn("Node for " + state.getRegion() + " already exists");
3266       } else {
3267         server.abort("Unexpected ZK exception creating/setting node OFFLINE", e);
3268       }
3269       return false;
3270     }
3271     return true;
3272   }
3273 
3274   private boolean deleteNodeInStates(String encodedName,
3275       String desc, ServerName sn, EventType... types) {
3276     try {
3277       for (EventType et: types) {
3278         if (ZKAssign.deleteNode(watcher, encodedName, et, sn)) {
3279           return true;
3280         }
3281       }
3282       LOG.info("Failed to delete the " + desc + " node for "
3283         + encodedName + ". The node type may not match");
3284     } catch (NoNodeException e) {
3285       if (LOG.isDebugEnabled()) {
3286         LOG.debug("The " + desc + " node for " + encodedName + " already deleted");
3287       }
3288     } catch (KeeperException ke) {
3289       server.abort("Unexpected ZK exception deleting " + desc
3290         + " node for the region " + encodedName, ke);
3291     }
3292     return false;
3293   }
3294 
3295   private void deleteMergingNode(String encodedName, ServerName sn) {
3296     deleteNodeInStates(encodedName, "merging", sn, EventType.RS_ZK_REGION_MERGING,
3297       EventType.RS_ZK_REQUEST_REGION_MERGE, EventType.RS_ZK_REGION_MERGED);
3298   }
3299 
3300   private void deleteSplittingNode(String encodedName, ServerName sn) {
3301     deleteNodeInStates(encodedName, "splitting", sn, EventType.RS_ZK_REGION_SPLITTING,
3302       EventType.RS_ZK_REQUEST_REGION_SPLIT, EventType.RS_ZK_REGION_SPLIT);
3303   }
3304 
3305   /**
3306    * A helper to handle region merging transition event.
3307    * It transitions merging regions to MERGING state.
3308    */
3309   private boolean handleRegionMerging(final RegionTransition rt, final String encodedName,
3310       final String prettyPrintedRegionName, final ServerName sn) {
3311     if (!serverManager.isServerOnline(sn)) {
3312       LOG.warn("Dropped merging! ServerName=" + sn + " unknown.");
3313       return false;
3314     }
3315     byte [] payloadOfMerging = rt.getPayload();
3316     List<HRegionInfo> mergingRegions;
3317     try {
3318       mergingRegions = HRegionInfo.parseDelimitedFrom(
3319         payloadOfMerging, 0, payloadOfMerging.length);
3320     } catch (IOException e) {
3321       LOG.error("Dropped merging! Failed reading "  + rt.getEventType()
3322         + " payload for " + prettyPrintedRegionName);
3323       return false;
3324     }
3325     assert mergingRegions.size() == 3;
3326     HRegionInfo p = mergingRegions.get(0);
3327     HRegionInfo hri_a = mergingRegions.get(1);
3328     HRegionInfo hri_b = mergingRegions.get(2);
3329 
3330     RegionState rs_p = regionStates.getRegionState(p);
3331     RegionState rs_a = regionStates.getRegionState(hri_a);
3332     RegionState rs_b = regionStates.getRegionState(hri_b);
3333 
3334     if (!((rs_a == null || rs_a.isOpenOrMergingOnServer(sn))
3335         && (rs_b == null || rs_b.isOpenOrMergingOnServer(sn))
3336         && (rs_p == null || rs_p.isOpenOrMergingNewOnServer(sn)))) {
3337       LOG.warn("Dropped merging! Not in state good for MERGING; rs_p="
3338         + rs_p + ", rs_a=" + rs_a + ", rs_b=" + rs_b);
3339       return false;
3340     }
3341 
3342     EventType et = rt.getEventType();
3343     if (et == EventType.RS_ZK_REQUEST_REGION_MERGE) {
3344       try {
3345         if (RegionMergeTransaction.transitionMergingNode(watcher, p,
3346             hri_a, hri_b, sn, -1, EventType.RS_ZK_REQUEST_REGION_MERGE,
3347             EventType.RS_ZK_REGION_MERGING) == -1) {
3348           byte[] data = ZKAssign.getData(watcher, encodedName);
3349           EventType currentType = null;
3350           if (data != null) {
3351             RegionTransition newRt = RegionTransition.parseFrom(data);
3352             currentType = newRt.getEventType();
3353           }
3354           if (currentType == null || (currentType != EventType.RS_ZK_REGION_MERGED
3355               && currentType != EventType.RS_ZK_REGION_MERGING)) {
3356             LOG.warn("Failed to transition pending_merge node "
3357               + encodedName + " to merging, it's now " + currentType);
3358             return false;
3359           }
3360         }
3361       } catch (Exception e) {
3362         LOG.warn("Failed to transition pending_merge node "
3363           + encodedName + " to merging", e);
3364         return false;
3365       }
3366     }
3367 
3368     synchronized (regionStates) {
3369       regionStates.updateRegionState(hri_a, State.MERGING);
3370       regionStates.updateRegionState(hri_b, State.MERGING);
3371       regionStates.updateRegionState(p, State.MERGING_NEW, sn);
3372 
3373       if (et != EventType.RS_ZK_REGION_MERGED) {
3374         regionStates.regionOffline(p, State.MERGING_NEW);
3375         this.mergingRegions.put(encodedName,
3376           new PairOfSameType<HRegionInfo>(hri_a, hri_b));
3377       } else {
3378         this.mergingRegions.remove(encodedName);
3379         regionOffline(hri_a, State.MERGED);
3380         regionOffline(hri_b, State.MERGED);
3381         regionOnline(p, sn);
3382       }
3383     }
3384 
3385     if (et == EventType.RS_ZK_REGION_MERGED) {
3386       LOG.debug("Handling MERGED event for " + encodedName + "; deleting node");
3387       // Remove region from ZK
3388       try {
3389         boolean successful = false;
3390         while (!successful) {
3391           // It's possible that the RS touches the znode in between our read
3392           // and delete, so it's safe to retry.
3393           successful = ZKAssign.deleteNode(watcher, encodedName,
3394             EventType.RS_ZK_REGION_MERGED, sn);
3395         }
3396       } catch (KeeperException e) {
3397         if (e instanceof NoNodeException) {
3398           String znodePath = ZKUtil.joinZNode(watcher.splitLogZNode, encodedName);
3399           LOG.debug("The znode " + znodePath + " does not exist.  May be deleted already.");
3400         } else {
3401           server.abort("Error deleting MERGED node " + encodedName, e);
3402         }
3403       }
3404       LOG.info("Handled MERGED event; merged=" + p.getRegionNameAsString()
3405         + ", region_a=" + hri_a.getRegionNameAsString() + ", region_b="
3406         + hri_b.getRegionNameAsString() + ", on " + sn);
3407 
3408       // User could disable the table before master knows the new region.
3409       if (zkTable.isDisablingOrDisabledTable(p.getTable())) {
3410         unassign(p);
3411       }
3412     }
3413     return true;
3414   }
3415 
3416   /**
3417    * A helper to handle region splitting transition event.
3418    */
3419   private boolean handleRegionSplitting(final RegionTransition rt, final String encodedName,
3420       final String prettyPrintedRegionName, final ServerName sn) {
3421     if (!serverManager.isServerOnline(sn)) {
3422       LOG.warn("Dropped splitting! ServerName=" + sn + " unknown.");
3423       return false;
3424     }
3425     byte [] payloadOfSplitting = rt.getPayload();
3426     List<HRegionInfo> splittingRegions;
3427     try {
3428       splittingRegions = HRegionInfo.parseDelimitedFrom(
3429         payloadOfSplitting, 0, payloadOfSplitting.length);
3430     } catch (IOException e) {
3431       LOG.error("Dropped splitting! Failed reading " + rt.getEventType()
3432         + " payload for " + prettyPrintedRegionName);
3433       return false;
3434     }
3435     assert splittingRegions.size() == 2;
3436     HRegionInfo hri_a = splittingRegions.get(0);
3437     HRegionInfo hri_b = splittingRegions.get(1);
3438 
3439     RegionState rs_p = regionStates.getRegionState(encodedName);
3440     RegionState rs_a = regionStates.getRegionState(hri_a);
3441     RegionState rs_b = regionStates.getRegionState(hri_b);
3442 
3443     if (!((rs_p == null || rs_p.isOpenOrSplittingOnServer(sn))
3444         && (rs_a == null || rs_a.isOpenOrSplittingNewOnServer(sn))
3445         && (rs_b == null || rs_b.isOpenOrSplittingNewOnServer(sn)))) {
3446       LOG.warn("Dropped splitting! Not in state good for SPLITTING; rs_p="
3447         + rs_p + ", rs_a=" + rs_a + ", rs_b=" + rs_b);
3448       return false;
3449     }
3450 
3451     if (rs_p == null) {
3452       // Splitting region should be online
3453       rs_p = regionStates.updateRegionState(rt, State.OPEN);
3454       if (rs_p == null) {
3455         LOG.warn("Received splitting for region " + prettyPrintedRegionName
3456           + " from server " + sn + " but it doesn't exist anymore,"
3457           + " probably already processed its split");
3458         return false;
3459       }
3460       regionStates.regionOnline(rs_p.getRegion(), sn);
3461     }
3462 
3463     HRegionInfo p = rs_p.getRegion();
3464     EventType et = rt.getEventType();
3465     if (et == EventType.RS_ZK_REQUEST_REGION_SPLIT) {
3466       try {
3467         if (SplitTransaction.transitionSplittingNode(watcher, p,
3468             hri_a, hri_b, sn, -1, EventType.RS_ZK_REQUEST_REGION_SPLIT,
3469             EventType.RS_ZK_REGION_SPLITTING) == -1) {
3470           byte[] data = ZKAssign.getData(watcher, encodedName);
3471           EventType currentType = null;
3472           if (data != null) {
3473             RegionTransition newRt = RegionTransition.parseFrom(data);
3474             currentType = newRt.getEventType();
3475           }
3476           if (currentType == null || (currentType != EventType.RS_ZK_REGION_SPLIT
3477               && currentType != EventType.RS_ZK_REGION_SPLITTING)) {
3478             LOG.warn("Failed to transition pending_split node "
3479               + encodedName + " to splitting, it's now " + currentType);
3480             return false;
3481           }
3482         }
3483       } catch (Exception e) {
3484         LOG.warn("Failed to transition pending_split node "
3485           + encodedName + " to splitting", e);
3486         return false;
3487       }
3488     }
3489 
3490     synchronized (regionStates) {
3491       regionStates.updateRegionState(hri_a, State.SPLITTING_NEW, sn);
3492       regionStates.updateRegionState(hri_b, State.SPLITTING_NEW, sn);
3493       regionStates.regionOffline(hri_a, State.SPLITTING_NEW);
3494       regionStates.regionOffline(hri_b, State.SPLITTING_NEW);
3495       regionStates.updateRegionState(rt, State.SPLITTING);
3496 
3497       // The below is for testing ONLY!  We can't do fault injection easily, so
3498       // resort to this kinda ugliness -- St.Ack 02/25/2011.
3499       if (TEST_SKIP_SPLIT_HANDLING) {
3500         LOG.warn("Skipping split message, TEST_SKIP_SPLIT_HANDLING is set");
3501         return true; // return true so that the splitting node stays
3502       }
3503 
3504       if (et == EventType.RS_ZK_REGION_SPLIT) {
3505         regionOffline(p, State.SPLIT);
3506         regionOnline(hri_a, sn);
3507         regionOnline(hri_b, sn);
3508       }
3509     }
3510 
3511     if (et == EventType.RS_ZK_REGION_SPLIT) {
3512       LOG.debug("Handling SPLIT event for " + encodedName + "; deleting node");
3513       // Remove region from ZK
3514       try {
3515         boolean successful = false;
3516         while (!successful) {
3517           // It's possible that the RS touches the znode in between our read
3518           // and delete, so it's safe to retry.
3519           successful = ZKAssign.deleteNode(watcher, encodedName,
3520             EventType.RS_ZK_REGION_SPLIT, sn);
3521         }
3522       } catch (KeeperException e) {
3523         if (e instanceof NoNodeException) {
3524           String znodePath = ZKUtil.joinZNode(watcher.splitLogZNode, encodedName);
3525           LOG.debug("The znode " + znodePath + " does not exist.  May be deleted already.");
3526         } else {
3527           server.abort("Error deleting SPLIT node " + encodedName, e);
3528         }
3529       }
3530       LOG.info("Handled SPLIT event; parent=" + p.getRegionNameAsString()
3531         + ", daughter a=" + hri_a.getRegionNameAsString() + ", daughter b="
3532         + hri_b.getRegionNameAsString() + ", on " + sn);
3533 
3534       // User could disable the table before master knows the new region.
3535       if (zkTable.isDisablingOrDisabledTable(p.getTable())) {
3536         unassign(hri_a);
3537         unassign(hri_b);
3538       }
3539     }
3540     return true;
3541   }
3542 
3543   /**
3544    * A region is offline.  The new state should be the specified one,
3545    * if not null.  If the specified state is null, the new state is Offline.
3546    * The specified state can be Split/Merged/Offline/null only.
3547    */
3548   private void regionOffline(final HRegionInfo regionInfo, final State state) {
3549     regionStates.regionOffline(regionInfo, state);
3550     removeClosedRegion(regionInfo);
3551     // remove the region plan as well just in case.
3552     clearRegionPlan(regionInfo);
3553     balancer.regionOffline(regionInfo);
3554   }
3555 
3556   /**
3557    * @return Instance of load balancer
3558    */
3559   public LoadBalancer getBalancer() {
3560     return this.balancer;
3561   }
3562 }