
1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.master;
20  
21  import java.io.IOException;
22  import java.util.ArrayList;
23  import java.util.Arrays;
24  import java.util.Collections;
25  import java.util.HashMap;
26  import java.util.HashSet;
27  import java.util.Iterator;
28  import java.util.List;
29  import java.util.Map;
30  import java.util.NavigableMap;
31  import java.util.Set;
32  import java.util.TreeMap;
33  import java.util.concurrent.ConcurrentHashMap;
34  import java.util.concurrent.ConcurrentSkipListSet;
35  import java.util.concurrent.ThreadFactory;
36  import java.util.concurrent.TimeUnit;
37  import java.util.concurrent.atomic.AtomicBoolean;
38  import java.util.concurrent.atomic.AtomicInteger;
39  import java.util.concurrent.locks.Lock;
40  import java.util.concurrent.locks.ReentrantLock;
41  
42  import com.google.common.base.Preconditions;
43  import org.apache.commons.logging.Log;
44  import org.apache.commons.logging.LogFactory;
45  import org.apache.hadoop.classification.InterfaceAudience;
46  import org.apache.hadoop.conf.Configuration;
47  import org.apache.hadoop.hbase.Chore;
48  import org.apache.hadoop.hbase.HBaseIOException;
49  import org.apache.hadoop.hbase.TableName;
50  import org.apache.hadoop.hbase.HTableDescriptor;
51  import org.apache.hadoop.hbase.exceptions.DeserializationException;
52  import org.apache.hadoop.hbase.HConstants;
53  import org.apache.hadoop.hbase.HRegionInfo;
54  import org.apache.hadoop.hbase.RegionTransition;
55  import org.apache.hadoop.hbase.Server;
56  import org.apache.hadoop.hbase.ServerName;
57  import org.apache.hadoop.hbase.Stoppable;
58  import org.apache.hadoop.hbase.catalog.CatalogTracker;
59  import org.apache.hadoop.hbase.catalog.MetaReader;
60  import org.apache.hadoop.hbase.client.Result;
61  import org.apache.hadoop.hbase.NotServingRegionException;
62  import org.apache.hadoop.hbase.regionserver.RegionAlreadyInTransitionException;
63  import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
64  import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
65  import org.apache.hadoop.hbase.TableNotFoundException;
66  import org.apache.hadoop.hbase.executor.EventHandler;
67  import org.apache.hadoop.hbase.executor.EventType;
68  import org.apache.hadoop.hbase.executor.ExecutorService;
69  import org.apache.hadoop.hbase.master.RegionState.State;
70  import org.apache.hadoop.hbase.master.balancer.FavoredNodeAssignmentHelper;
71  import org.apache.hadoop.hbase.master.balancer.FavoredNodeLoadBalancer;
72  import org.apache.hadoop.hbase.master.handler.ClosedRegionHandler;
73  import org.apache.hadoop.hbase.master.handler.DisableTableHandler;
74  import org.apache.hadoop.hbase.master.handler.EnableTableHandler;
75  import org.apache.hadoop.hbase.master.handler.MergedRegionHandler;
76  import org.apache.hadoop.hbase.master.handler.OpenedRegionHandler;
77  import org.apache.hadoop.hbase.master.handler.SplitRegionHandler;
78  import org.apache.hadoop.hbase.regionserver.RegionOpeningState;
79  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
80  import org.apache.hadoop.hbase.util.KeyLocker;
81  import org.apache.hadoop.hbase.util.Pair;
82  import org.apache.hadoop.hbase.util.Threads;
83  import org.apache.hadoop.hbase.util.Triple;
84  import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
85  import org.apache.hadoop.hbase.zookeeper.ZKAssign;
86  import org.apache.hadoop.hbase.zookeeper.ZKTable;
87  import org.apache.hadoop.hbase.zookeeper.ZKUtil;
88  import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener;
89  import org.apache.hadoop.ipc.RemoteException;
90  import org.apache.zookeeper.AsyncCallback;
91  import org.apache.zookeeper.KeeperException;
92  import org.apache.zookeeper.KeeperException.NoNodeException;
93  import org.apache.zookeeper.KeeperException.NodeExistsException;
94  import org.apache.zookeeper.data.Stat;
95  
96  import com.google.common.collect.LinkedHashMultimap;
97  
98  /**
99   * Manages and performs region assignment.
100  * <p>
101  * Monitors ZooKeeper for events related to regions in transition.
102  * <p>
103  * Handles existing regions in transition during master failover.
104  */
105 @InterfaceAudience.Private
106 public class AssignmentManager extends ZooKeeperListener {
107   private static final Log LOG = LogFactory.getLog(AssignmentManager.class);
108 
109   public static final ServerName HBCK_CODE_SERVERNAME = new ServerName(HConstants.HBCK_CODE_NAME,
110       -1, -1L);
111 
112   protected final Server server;
113 
114   private ServerManager serverManager;
115 
116   private boolean shouldAssignRegionsWithFavoredNodes;
117 
118   private CatalogTracker catalogTracker;
119 
120   protected final TimeoutMonitor timeoutMonitor;
121 
122   private final TimerUpdater timerUpdater;
123 
124   private LoadBalancer balancer;
125 
126   private final TableLockManager tableLockManager;
127 
128   final private KeyLocker<String> locker = new KeyLocker<String>();
129 
130   /**
131    * Map of regions to reopen after the schema of a table is changed. Key -
132    * encoded region name, value - HRegionInfo
133    */
134   private final Map <String, HRegionInfo> regionsToReopen;
135 
136   /*
137    * Maximum number of times we recurse an assignment/unassignment.
138    * See {@link #assign()} and {@link #unassign()} below.
139    */
140   private final int maximumAttempts;
141   
142   /**
143    * The sleep time to wait before retrying a META assignment that failed
144    * because no region plan was available.
145    */
146   private final long sleepTimeBeforeRetryingMetaAssignment;
147 
148   /** Plans for region movement. Key is the encoded version of a region name*/
149   // TODO: When do plans get cleaned out?  Ever? In server open and in server
150   // shutdown processing -- St.Ack
151   // All access to this Map must be synchronized.
152   final NavigableMap<String, RegionPlan> regionPlans =
153     new TreeMap<String, RegionPlan>();
154 
155   private final ZKTable zkTable;
156 
157   /**
158    * Contains the servers which need to update their timers; these servers will be
159    * handled by {@link TimerUpdater}.
160    */
161   private final ConcurrentSkipListSet<ServerName> serversInUpdatingTimer;
162 
163   private final ExecutorService executorService;
164 
165   // For unit tests, keep track of calls to ClosedRegionHandler
166   private Map<HRegionInfo, AtomicBoolean> closedRegionHandlerCalled = 
167       new HashMap<HRegionInfo, AtomicBoolean>();
168 
169   // For unit tests, keep track of calls to OpenedRegionHandler
170   private Map<HRegionInfo, AtomicBoolean> openedRegionHandlerCalled = 
171       new HashMap<HRegionInfo, AtomicBoolean>();
172 
173   // For unit tests, keep track of calls to SplitRegionHandler
174   private AtomicBoolean splitRegionHandlerCalled = new AtomicBoolean(false);
175 
176   //Thread pool executor service for timeout monitor
177   private java.util.concurrent.ExecutorService threadPoolExecutorService;
178 
179   // A bunch of ZK events workers. Each is a single thread executor service
180   private final java.util.concurrent.ExecutorService zkEventWorkers;
181 
182   private List<EventType> ignoreStatesRSOffline = Arrays.asList(
183       EventType.RS_ZK_REGION_FAILED_OPEN, EventType.RS_ZK_REGION_CLOSED);
184 
185   // metrics instance to send metrics for RITs
186   MetricsMaster metricsMaster;
187 
188   private final RegionStates regionStates;
189 
190   // The threshold for using bulk assignment. Bulk assignment is used
191   // only if assigning at least this many regions to at least this
192   // many servers. If assigning fewer regions to fewer servers,
193   // bulk assignment may not be as efficient.
194   private final int bulkAssignThresholdRegions;
195   private final int bulkAssignThresholdServers;
196 
197   // Should bulk assignment wait till all regions are assigned,
198   // or until it times out?  This is useful for measuring bulk assignment
199   // performance, but not needed in most use cases.
200   private final boolean bulkAssignWaitTillAllAssigned;
201 
202   /**
203    * Indicator that AssignmentManager has recovered the region states so
204    * that ServerShutdownHandler can be fully enabled and re-assign regions
205    * of dead servers. This ensures that when re-assignment happens, AssignmentManager
206    * has the proper region states.
207    *
208    * Protected to ease testing.
209    */
210   protected final AtomicBoolean failoverCleanupDone = new AtomicBoolean(false);
211 
212   /** Is timeout management activated? */
213   private final boolean tomActivated;
214 
215   /**
216    * A map to track the number of times in a row a region fails to open,
217    * so that we don't try to open a region forever if the failure is
218    * unrecoverable.  We don't put this information in region states
219    * because we don't expect this to happen frequently; we don't
220    * want to copy this information over during each state transition either.
221    */
222   private final ConcurrentHashMap<String, AtomicInteger>
223     failedOpenTracker = new ConcurrentHashMap<String, AtomicInteger>();
224 
225   /**
226    * Constructs a new assignment manager.
227    *
228    * @param server
229    * @param serverManager
230    * @param catalogTracker
231    * @param service
232    * @throws KeeperException
233    * @throws IOException
234    */
235   public AssignmentManager(Server server, ServerManager serverManager,
236       CatalogTracker catalogTracker, final LoadBalancer balancer,
237       final ExecutorService service, MetricsMaster metricsMaster,
238       final TableLockManager tableLockManager) throws KeeperException, IOException {
239     super(server.getZooKeeper());
240     this.server = server;
241     this.serverManager = serverManager;
242     this.catalogTracker = catalogTracker;
243     this.executorService = service;
244     this.regionsToReopen = Collections.synchronizedMap
245                            (new HashMap<String, HRegionInfo> ());
246     Configuration conf = server.getConfiguration();
247     // Only read favored nodes if using the favored nodes load balancer.
248     this.shouldAssignRegionsWithFavoredNodes = conf.getClass(
249            HConstants.HBASE_MASTER_LOADBALANCER_CLASS, Object.class).equals(
250            FavoredNodeLoadBalancer.class);
251     this.tomActivated = conf.getBoolean("hbase.assignment.timeout.management", false);
252     if (tomActivated){
253       this.serversInUpdatingTimer =  new ConcurrentSkipListSet<ServerName>();
254       this.timeoutMonitor = new TimeoutMonitor(
255         conf.getInt("hbase.master.assignment.timeoutmonitor.period", 30000),
256         server, serverManager,
257         conf.getInt("hbase.master.assignment.timeoutmonitor.timeout", 600000));
258       this.timerUpdater = new TimerUpdater(conf.getInt(
259         "hbase.master.assignment.timerupdater.period", 10000), server);
260       Threads.setDaemonThreadRunning(timerUpdater.getThread(),
261         server.getServerName() + ".timerUpdater");
262     } else {
263       this.serversInUpdatingTimer =  null;
264       this.timeoutMonitor = null;
265       this.timerUpdater = null;
266     }
267     this.zkTable = new ZKTable(this.watcher);
268     this.maximumAttempts =
269       this.server.getConfiguration().getInt("hbase.assignment.maximum.attempts", 10);
270     this.sleepTimeBeforeRetryingMetaAssignment = this.server.getConfiguration().getLong(
271         "hbase.meta.assignment.retry.sleeptime", 1000l);
272     this.balancer = balancer;
273     int maxThreads = conf.getInt("hbase.assignment.threads.max", 30);
274     this.threadPoolExecutorService = Threads.getBoundedCachedThreadPool(
275       maxThreads, 60L, TimeUnit.SECONDS, Threads.newDaemonThreadFactory("AM."));
276     this.metricsMaster = metricsMaster;// can be null only with tests.
277     this.regionStates = new RegionStates(server, serverManager);
278 
279     this.bulkAssignWaitTillAllAssigned =
280       conf.getBoolean("hbase.bulk.assignment.waittillallassigned", false);
281     this.bulkAssignThresholdRegions = conf.getInt("hbase.bulk.assignment.threshold.regions", 7);
282     this.bulkAssignThresholdServers = conf.getInt("hbase.bulk.assignment.threshold.servers", 3);
283 
284     int workers = conf.getInt("hbase.assignment.zkevent.workers", 20);
285     ThreadFactory threadFactory = Threads.newDaemonThreadFactory("AM.ZK.Worker");
286     zkEventWorkers = Threads.getBoundedCachedThreadPool(workers, 60L,
287             TimeUnit.SECONDS, threadFactory);
288     this.tableLockManager = tableLockManager;
289   }
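  // A minimal, hedged sketch (not from this file) of how the assignment knobs read
  // above could be overridden before constructing the manager, assuming the standard
  // Hadoop Configuration setters; the key names come from the constructor itself:
  //   Configuration conf = HBaseConfiguration.create();
  //   conf.setInt("hbase.assignment.maximum.attempts", 3);
  //   conf.setBoolean("hbase.assignment.timeout.management", true);
  //   conf.setInt("hbase.assignment.zkevent.workers", 5);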
290 
291   void startTimeOutMonitor() {
292     if (tomActivated) {
293       Threads.setDaemonThreadRunning(timeoutMonitor.getThread(), server.getServerName()
294           + ".timeoutMonitor");
295     }
296   }
297 
298   /**
299    * @return Instance of ZKTable.
300    */
301   public ZKTable getZKTable() {
302     // These are 'expensive' to make, involving a trip to the zk ensemble, so allow
303     // sharing.
304     return this.zkTable;
305   }
306 
307   /**
308    * This SHOULD not be public. It is public now
309    * because of some unit tests.
310    *
311    * TODO: make it package private and keep RegionStates in the master package
312    */
313   public RegionStates getRegionStates() {
314     return regionStates;
315   }
316 
317   public RegionPlan getRegionReopenPlan(HRegionInfo hri) {
318     return new RegionPlan(hri, null, regionStates.getRegionServerOfRegion(hri));
319   }
320 
321   /**
322    * Add a regionPlan for the specified region.
323    * @param encodedName
324    * @param plan
325    */
326   public void addPlan(String encodedName, RegionPlan plan) {
327     synchronized (regionPlans) {
328       regionPlans.put(encodedName, plan);
329     }
330   }
331 
332   /**
333    * Add a map of region plans.
334    */
335   public void addPlans(Map<String, RegionPlan> plans) {
336     synchronized (regionPlans) {
337       regionPlans.putAll(plans);
338     }
339   }
340 
341   /**
342    * Set the list of regions that will be reopened
343    * because of an update in table schema
344    *
345    * @param regions
346    *          list of regions that should be tracked for reopen
347    */
348   public void setRegionsToReopen(List <HRegionInfo> regions) {
349     for(HRegionInfo hri : regions) {
350       regionsToReopen.put(hri.getEncodedName(), hri);
351     }
352   }
353 
354   /**
355    * Used by the client to check whether all regions have received the schema updates.
356    *
357    * @param tableName
358    * @return Pair indicating the status of the alter command
359    * @throws IOException
360    */
361   public Pair<Integer, Integer> getReopenStatus(TableName tableName)
362       throws IOException {
363     List <HRegionInfo> hris =
364       MetaReader.getTableRegions(this.server.getCatalogTracker(), tableName, true);
365     Integer pending = 0;
366     for (HRegionInfo hri : hris) {
367       String name = hri.getEncodedName();
368       // No lock; concurrent access is ok: sequential consistency is respected.
369       if (regionsToReopen.containsKey(name)
370           || regionStates.isRegionInTransition(name)) {
371         pending++;
372       }
373     }
374     return new Pair<Integer, Integer>(pending, hris.size());
375   }
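  // A hedged usage sketch (caller-side, not part of this class): the returned Pair is
  // (regions still pending reopen, total regions of the table), so a client polling
  // alter-table progress might check it roughly like this, where "am" is assumed to be
  // an AssignmentManager reference:
  //   Pair<Integer, Integer> status = am.getReopenStatus(tableName);
  //   boolean done = status.getFirst() == 0 && status.getSecond() > 0;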
376 
377   /**
378    * Used by ServerShutdownHandler to make sure AssignmentManager has completed
379    * the failover cleanup before re-assigning regions of dead servers. So that
380    * when re-assignment happens, AssignmentManager has proper region states.
381    */
382   public boolean isFailoverCleanupDone() {
383     return failoverCleanupDone.get();
384   }
385 
386   /**
387    * Now, failover cleanup is completed. Notify the server manager to
388    * process queued up dead servers, if any.
389    */
390   void failoverCleanupDone() {
391     failoverCleanupDone.set(true);
392     serverManager.processQueuedDeadServers();
393   }
394 
395   /**
396    * Called on startup.
397    * Figures out whether this is a fresh cluster start or we are joining an extant running cluster.
398    * @throws IOException
399    * @throws KeeperException
400    * @throws InterruptedException
401    */
402   void joinCluster() throws IOException,
403       KeeperException, InterruptedException {
404     // Concurrency note: In the below the accesses on regionsInTransition are
405     // outside of a synchronization block where usually all accesses to RIT are
406     // synchronized.  The presumption is that in this case it is safe since this
407     // method is run by a single thread on startup.
408 
409     // TODO: Regions that have a null location and are not in regionsInTransitions
410     // need to be handled.
411 
412     // Scan META to build a list of existing regions, servers, and assignments.
413     // Returns servers that have not checked in (assumed dead) and their regions.
414     Map<ServerName, List<HRegionInfo>> deadServers = rebuildUserRegions();
415 
416     // This method will assign all user regions on a clean cluster startup, or
417     // it will reconstruct master state and clean up any leftovers from the
418     // previous master process.
419     processDeadServersAndRegionsInTransition(deadServers);
420 
421     recoverTableInDisablingState();
422     recoverTableInEnablingState();
423   }
424 
425   /**
426    * Processes all regions that are in transition in zookeeper and also
427    * processes the list of dead servers by scanning META.
428    * Used by the master joining a cluster.  If we figure this is a clean cluster
429    * startup, all user regions will be assigned.
430    * @param deadServers
431    *          Map of dead servers and their regions. Can be null.
432    * @throws KeeperException
433    * @throws IOException
434    * @throws InterruptedException
435    */
436   void processDeadServersAndRegionsInTransition(
437       final Map<ServerName, List<HRegionInfo>> deadServers)
438           throws KeeperException, IOException, InterruptedException {
439     List<String> nodes = ZKUtil.listChildrenNoWatch(watcher,
440       watcher.assignmentZNode);
441 
442     if (nodes == null) {
443       String errorMessage = "Failed to get the children from ZK";
444       server.abort(errorMessage, new IOException(errorMessage));
445       return;
446     }
447 
448     boolean failover = (!serverManager.getDeadServers().isEmpty() || !serverManager
449         .getRequeuedDeadServers().isEmpty());
450 
451     if (!failover) {
452       // Run through all regions.  If they are not assigned and not in RIT, then
453       // it's a clean cluster startup, else it's a failover.
454       Map<HRegionInfo, ServerName> regions = regionStates.getRegionAssignments();
455       for (Map.Entry<HRegionInfo, ServerName> e: regions.entrySet()) {
456         if (!HTableDescriptor.isSystemTable(e.getKey().getTableName())
457             && e.getValue() != null) {
458           LOG.debug("Found " + e + " out on cluster");
459           failover = true;
460           break;
461         }
462         if (nodes.contains(e.getKey().getEncodedName())) {
463           LOG.debug("Found " + e.getKey().getRegionNameAsString() + " in RITs");
464           // Could be a meta region.
465           failover = true;
466           break;
467         }
468       }
469     }
470 
471     // If we found user regions out on the cluster, it's a failover.
472     if (failover) {
473       LOG.info("Found regions out on cluster or in RIT; failover");
474       // Process list of dead servers and regions in RIT.
475       // See HBASE-4580 for more information.
476       processDeadServersAndRecoverLostRegions(deadServers);
477     } else {
478       // Fresh cluster startup.
479       LOG.info("Clean cluster startup. Assigning user regions");
480       assignAllUserRegions();
481     }
482   }
483 
484   /**
485    * If the region is up in zk and in transition, then do fixup and block until
486    * the region is assigned and out of transition.  Used on startup for
487    * catalog regions.
488    * @param hri Region to look for.
489    * @return True if we processed a region in transition, else false if the region
490    * was not up in zk in transition.
491    * @throws InterruptedException
492    * @throws KeeperException
493    * @throws IOException
494    */
495   boolean processRegionInTransitionAndBlockUntilAssigned(final HRegionInfo hri)
496       throws InterruptedException, KeeperException, IOException {
497     boolean inTransition = processRegionInTransition(hri.getEncodedName(), hri);
498     if (!inTransition) return inTransition;
499     LOG.debug("Waiting on " + HRegionInfo.prettyPrint(hri.getEncodedName()));
500     while (!this.server.isStopped() &&
501       this.regionStates.isRegionInTransition(hri.getEncodedName())) {
502       // We use a timeout because the region may start transitioning just between the check
503       //  and the waitForUpdate call.
504       this.regionStates.waitForUpdate(100);
505     }
506     return inTransition;
507   }
508 
509   /**
510    * Process failover of new master for region <code>encodedRegionName</code>
511    * up in zookeeper.
512    * @param encodedRegionName Region to process failover for.
513    * @param regionInfo If null we'll go get it from meta table.
514    * @return True if we processed <code>regionInfo</code> as a RIT.
515    * @throws KeeperException
516    * @throws IOException
517    */
518   boolean processRegionInTransition(final String encodedRegionName,
519       final HRegionInfo regionInfo) throws KeeperException, IOException {
520     // We need a lock here to ensure that we will not put the same region in twice.
521     // There is no reason for it to be a lock shared with the other operations.
522     // We can take the lock on the region only, instead of a global lock: what we want to ensure
523     // is that we don't have two threads working on the same region.
524     Lock lock = locker.acquireLock(encodedRegionName);
525     try {
526       Stat stat = new Stat();
527       byte [] data = ZKAssign.getDataAndWatch(watcher, encodedRegionName, stat);
528       if (data == null) return false;
529       RegionTransition rt;
530       try {
531         rt = RegionTransition.parseFrom(data);
532       } catch (DeserializationException e) {
533         LOG.warn("Failed to parse znode data", e);
534         return false;
535       }
536       HRegionInfo hri = regionInfo;
537       if (hri == null) {
538         hri = regionStates.getRegionInfo(rt.getRegionName());
539         if (hri == null) return false;
540       }
541       processRegionsInTransition(rt, hri, stat.getVersion());
542       return true;
543     } finally {
544       lock.unlock();
545     }
546   }
547 
548   /**
549    * This call is invoked only when (1) the master assigns meta, or
550    * (2) during failover-mode startup, when processing zk assignment nodes.
551    * The lock is acquired by the caller.
552    *
553    * It should be private but it is used by some tests too.
554    */
555   void processRegionsInTransition(
556       final RegionTransition rt, final HRegionInfo regionInfo,
557       final int expectedVersion) throws KeeperException {
558     EventType et = rt.getEventType();
559     // Get ServerName.  Cannot be null.
560     final ServerName sn = rt.getServerName();
561     final String encodedRegionName = regionInfo.getEncodedName();
562     final String prettyPrintedRegionName = HRegionInfo.prettyPrint(encodedRegionName);
563     LOG.info("Processing " + regionInfo.getRegionNameAsString() + " in state " + et);
564 
565 
566     if (regionStates.isRegionInTransition(encodedRegionName)) {
567       // Just return
568       return;
569     }
570     switch (et) {
571       case M_ZK_REGION_CLOSING:
572         // If zk node of the region was updated by a live server skip this
573         // region and just add it into RIT.
574         if (!serverManager.isServerOnline(sn)) {
575           // If it was not online, it's closed now. Force to OFFLINE and this
576           // will get it reassigned if appropriate.
577           forceOffline(regionInfo, rt);
578         } else {
579           // Insert into RIT & resend the query to the region server: maybe the previous master
580           // died before sending the query the first time.
581           regionStates.updateRegionState(rt, RegionState.State.CLOSING);
582           final RegionState rs = regionStates.getRegionState(regionInfo);
583           this.executorService.submit(
584               new EventHandler(server, EventType.M_MASTER_RECOVERY) {
585                 @Override
586                 public void process() throws IOException {
587                   ReentrantLock lock = locker.acquireLock(regionInfo.getEncodedName());
588                   try {
589                     unassign(regionInfo, rs, expectedVersion, null, true, null);
590                   } finally {
591                     lock.unlock();
592                   }
593                 }
594               });
595         }
596         break;
597 
598       case RS_ZK_REGION_CLOSED:
599       case RS_ZK_REGION_FAILED_OPEN:
600         // Region is closed, insert into RIT and handle it
601         addToRITandCallClose(regionInfo, RegionState.State.CLOSED, rt);
602         break;
603 
604       case M_ZK_REGION_OFFLINE:
605         // If zk node of the region was updated by a live server skip this
606         // region and just add it into RIT.
607         if (!serverManager.isServerOnline(sn)) {
608           // Region is offline, insert into RIT and handle it like a closed
609           addToRITandCallClose(regionInfo, RegionState.State.OFFLINE, rt);
610         } else {
611           // Insert in RIT and resend to the regionserver
612           regionStates.updateRegionState(rt, RegionState.State.PENDING_OPEN);
613           final RegionState rs = regionStates.getRegionState(regionInfo);
614           this.executorService.submit(
615               new EventHandler(server, EventType.M_MASTER_RECOVERY) {
616                 @Override
617                 public void process() throws IOException {
618                   ReentrantLock lock = locker.acquireLock(regionInfo.getEncodedName());
619                   try {
620                     RegionPlan plan = new RegionPlan(regionInfo, null, sn);
621                     addPlan(encodedRegionName, plan);
622                     assign(rs, false, false);
623                   } finally {
624                     lock.unlock();
625                   }
626                 }
627               });
628         }
629         break;
630 
631       case RS_ZK_REGION_OPENING:
632         if (!serverManager.isServerOnline(sn)) {
633           forceOffline(regionInfo, rt);
634         } else {
635           regionStates.updateRegionState(rt, RegionState.State.OPENING);
636         }
637         break;
638 
639       case RS_ZK_REGION_OPENED:
640         if (!serverManager.isServerOnline(sn)) {
641           forceOffline(regionInfo, rt);
642         } else {
643           // Region is opened, insert into RIT and handle it
644           // This could be done asynchronously, we would need then to acquire the lock in the
645           //  handler.
646           regionStates.updateRegionState(rt, RegionState.State.OPEN);
647           new OpenedRegionHandler(server, this, regionInfo, sn, expectedVersion).process();
648         }
649         break;
650       case RS_ZK_REGION_SPLITTING:
651         if (!serverManager.isServerOnline(sn)) {
652           // The regionserver started the split, but died before updating the status.
653           // It means (hopefully) that the split was not finished
654           // TBD - to study. In the meantime, do nothing as in the past.
655           LOG.warn("Processed region " + prettyPrintedRegionName + " in state : " + et +
656               " on a dead regionserver: " + sn + " doing nothing");
657         } else {
658           regionStates.updateRegionState(rt, RegionState.State.SPLITTING);
659           LOG.info("Processed " + prettyPrintedRegionName + " in state : " + et);
660         }
661         break;
662       case RS_ZK_REGION_SPLIT:
663         if (!serverManager.isServerOnline(sn)) {
664           forceOffline(regionInfo, rt);
665         } else {
666           LOG.info("Processed " + prettyPrintedRegionName + " in state : " + et +
667             " nothing to do.");
668           // We don't do anything. The regionserver is supposed to update the znode
669           // multiple times so if it's still up we will receive an update soon.
670         }
671         break;
672       case RS_ZK_REGION_MERGING:
673         if (!serverManager.isServerOnline(sn)) {
674           // The regionserver started the merge, but died before updating the status.
675           // It means (hopefully) that the merge was not finished
676           // This node should be gone soon since it is ephemeral.
677           LOG.warn("Processed " + prettyPrintedRegionName + " in state : " + et +
678               " on a dead regionserver: " + sn + " doing nothing");
679         } else {
680           handleRegionMerging(rt, prettyPrintedRegionName, sn);
681           LOG.info("Processed region " + prettyPrintedRegionName
682             + " in state : " + et);
683         }
684         break;
685       case RS_ZK_REGION_MERGED:
686         if (!serverManager.isServerOnline(sn)) {
687           // ServerShutdownHandler would handle this region
688           LOG.warn("Processed " + prettyPrintedRegionName
689               + " in state : " + et + " on a dead regionserver: " + sn
690               + " doing nothing");
691         } else {
692           LOG.info("Processed " + prettyPrintedRegionName + " in state : " +
693               et + " nothing to do.");
694           // We don't do anything. The regionserver is supposed to update the znode
695           // multiple times so if it's still up we will receive an update soon.
696         }
697         break;
698       default:
699         throw new IllegalStateException("Received region in state " + et + ", which is not valid.");
700     }
701   }
702 
703   /**
704    * Put the region <code>hri</code> into an offline state up in zk.
705    *
706    * You need to have a lock on the region before calling this method.
707    *
708    * @param hri
709    * @param oldRt
710    * @throws KeeperException
711    */
712   private void forceOffline(final HRegionInfo hri, final RegionTransition oldRt)
713       throws KeeperException {
714     // If it was on a dead server, it's closed now.  Force to OFFLINE and then
715     // handle it like a close; this will get it reassigned if appropriate.
716     LOG.debug("RIT " + hri.getEncodedName() + " in state=" + oldRt.getEventType() +
717       " was on a dead server; forcing offline");
718     ZKAssign.createOrForceNodeOffline(this.watcher, hri, oldRt.getServerName());
719     addToRITandCallClose(hri, RegionState.State.OFFLINE, oldRt);
720   }
721 
722   /**
723    * Add to the in-memory copy of regions in transition and then call close
724    * handler on passed region <code>hri</code>
725    * @param hri
726    * @param state
727    * @param oldData
728    */
729   private void addToRITandCallClose(final HRegionInfo hri,
730       final RegionState.State state, final RegionTransition oldData) {
731     regionStates.updateRegionState(oldData, state);
732     new ClosedRegionHandler(this.server, this, hri).process();
733   }
734 
735   /**
736    * When a region is closed, it should be removed from the regionsToReopen
737    * @param hri HRegionInfo of the region which was closed
738    */
739   public void removeClosedRegion(HRegionInfo hri) {
740     if (regionsToReopen.remove(hri.getEncodedName()) != null) {
741       LOG.debug("Removed region from reopening regions because it was closed");
742     }
743   }
744 
745   /**
746    * Handles various states an unassigned node can be in.
747    * <p>
748    * Method is called when a state change is suspected for an unassigned node.
749    * <p>
750    * This deals with skipped transitions (we got a CLOSED but didn't see CLOSING
751    * yet).
752    * @param rt
753    * @param expectedVersion
754    */
755   void handleRegion(final RegionTransition rt, int expectedVersion) {
756     if (rt == null) {
757       LOG.warn("Unexpected NULL input for RegionTransition rt");
758       return;
759     }
760     final ServerName sn = rt.getServerName();
761     // Check if this is a special HBCK transition
762     if (sn.equals(HBCK_CODE_SERVERNAME)) {
763       handleHBCK(rt);
764       return;
765     }
766     final long createTime = rt.getCreateTime();
767     final byte[] regionName = rt.getRegionName();
768     String encodedName = HRegionInfo.encodeRegionName(regionName);
769     String prettyPrintedRegionName = HRegionInfo.prettyPrint(encodedName);
770     // Verify this is a known server
771     if (!serverManager.isServerOnline(sn)
772       && !ignoreStatesRSOffline.contains(rt.getEventType())) {
773       LOG.warn("Attempted to handle transition for region " + prettyPrintedRegionName +
774         " but server " + sn + " is not online");
775       return;
776     }
777 
778     RegionState regionState =
779       regionStates.getRegionTransitionState(encodedName);
780     long startTime = System.currentTimeMillis();
781     if (LOG.isDebugEnabled()) {
782       boolean lateEvent = createTime < (startTime - 15000);
783       LOG.debug("Handling transition=" + rt.getEventType() +
784         ", server=" + sn + ", region=" +
785         (prettyPrintedRegionName == null ? "null" : prettyPrintedRegionName) +
786         (lateEvent ? ", which is more than 15 seconds late" : "") +
787         ", current state from region state map =" + regionState);
788     }
789     // We don't do anything for this event,
790     // so separate it out, no need to lock/unlock anything
791     if (rt.getEventType() == EventType.M_ZK_REGION_OFFLINE) {
792       return;
793     }
794 
795     // We need a lock on the region as we could update it
796     Lock lock = locker.acquireLock(encodedName);
797     try {
798       RegionState latestState =
799         regionStates.getRegionTransitionState(encodedName);
800       if ((regionState == null && latestState != null)
801           || (regionState != null && latestState == null)
802           || (regionState != null && latestState != null
803             && latestState.getState() != regionState.getState())) {
804         LOG.warn("Region state changed from " + regionState + " to "
805           + latestState + ", while acquiring lock");
806       }
807       long waitedTime = System.currentTimeMillis() - startTime;
808       if (waitedTime > 5000) {
809         LOG.warn("Took " + waitedTime + "ms to acquire the lock");
810       }
811       regionState = latestState;
812       switch (rt.getEventType()) {
813         case RS_ZK_REGION_SPLITTING:
814           if (!isInStateForSplitting(regionState)) break;
815           regionStates.updateRegionState(rt, RegionState.State.SPLITTING);
816           break;
817 
818         case RS_ZK_REGION_SPLIT:
819           // RegionState must be null, or SPLITTING or PENDING_CLOSE.
820           if (!isInStateForSplitting(regionState)) break;
821           // If null, add SPLITTING state before going to SPLIT
822           if (regionState == null) {
823             regionState = regionStates.updateRegionState(rt,
824               RegionState.State.SPLITTING);
825 
826             String message = "Received SPLIT for region " + prettyPrintedRegionName +
827               " from server " + sn;
828             // If still null, it means we cannot find it and it was already processed
829             if (regionState == null) {
830               LOG.warn(message + " but it doesn't exist anymore," +
831                   " probably already processed its split");
832               break;
833             }
834             LOG.info(message +
835                 " but region was not first in SPLITTING state; continuing");
836           }
837           // Check it has daughters.
838           byte [] payload = rt.getPayload();
839           List<HRegionInfo> daughters;
840           try {
841             daughters = HRegionInfo.parseDelimitedFrom(payload, 0, payload.length);
842           } catch (IOException e) {
843             LOG.error("Dropped split! Failed reading split payload for " +
844               prettyPrintedRegionName);
845             break;
846           }
847           assert daughters.size() == 2;
848           // Assert that we can get a serverinfo for this server.
849           if (!this.serverManager.isServerOnline(sn)) {
850             LOG.error("Dropped split! ServerName=" + sn + " unknown.");
851             break;
852           }
853           // Run handler to do the rest of the SPLIT handling.
854           new SplitRegionHandler(server, this, regionState.getRegion(), sn, daughters).process();
855           splitRegionHandlerCalled.set(true);
856           break;
857 
858         case RS_ZK_REGION_MERGING:
859           // Merged region is a new region, we can't find it in the region states now.
860           // However, the two merging regions are not new. They should be in state for merging.
861           handleRegionMerging(rt, prettyPrintedRegionName, sn);
862           break;
863 
864         case RS_ZK_REGION_MERGED:
865           // Assert that we can get a serverinfo for this server.
866           if (!this.serverManager.isServerOnline(sn)) {
867             LOG.error("Dropped merge! ServerName=" + sn + " unknown.");
868             break;
869           }
870           // Get merged and merging regions.
871           byte[] payloadOfMerge = rt.getPayload();
872           List<HRegionInfo> mergeRegions;
873           try {
874             mergeRegions = HRegionInfo.parseDelimitedFrom(payloadOfMerge, 0,
875                 payloadOfMerge.length);
876           } catch (IOException e) {
877             LOG.error("Dropped merge! Failed reading merge payload for " +
878               prettyPrintedRegionName);
879             break;
880           }
881           assert mergeRegions.size() == 3;
882           HRegionInfo merge_a = mergeRegions.get(1);
883           HRegionInfo merge_b = mergeRegions.get(2);
884           if (!isInStateForMerging(sn, merge_a, merge_b)) {
885             // Move on. Merge already happened (passed PONR), no point to stop now
886             LOG.warn("Got merge event, but not in state good for MERGED; rs_a="
887               + merge_a + ", rs_b=" + merge_b);
888           }
889           // Run handler to do the rest of the MERGED handling.
890           new MergedRegionHandler(server, this, sn, mergeRegions).process();
891           break;
892 
893         case M_ZK_REGION_CLOSING:
894           // Should see CLOSING after we have asked it to CLOSE or additional
895           // times after already being in state of CLOSING
896           if (regionState == null
897               || !regionState.isPendingCloseOrClosingOnServer(sn)) {
898             LOG.warn("Received CLOSING for " + prettyPrintedRegionName
899               + " from " + sn + " but the region isn't PENDING_CLOSE/CLOSING here: "
900               + regionStates.getRegionState(encodedName));
901             return;
902           }
903           // Transition to CLOSING (or update stamp if already CLOSING)
904           regionStates.updateRegionState(rt, RegionState.State.CLOSING);
905           break;
906 
907         case RS_ZK_REGION_CLOSED:
908           // Should see CLOSED after CLOSING but possible after PENDING_CLOSE
909           if (regionState == null
910               || !regionState.isPendingCloseOrClosingOnServer(sn)) {
911             LOG.warn("Received CLOSED for " + prettyPrintedRegionName
912               + " from " + sn + " but the region isn't PENDING_CLOSE/CLOSING here: "
913               + regionStates.getRegionState(encodedName));
914             return;
915           }
916           // Handle CLOSED by assigning elsewhere or stopping if a disable
917           // If we got here all is good.  Need to update RegionState -- else
918           // what follows will fail because not in expected state.
919           regionState = regionStates.updateRegionState(rt, RegionState.State.CLOSED);
920           if (regionState != null) {
921             removeClosedRegion(regionState.getRegion());
922             new ClosedRegionHandler(server, this, regionState.getRegion()).process();
923             closedRegionHandlerCalled.put(regionState.getRegion(), new AtomicBoolean(true));
924           }
925           break;
926 
927         case RS_ZK_REGION_FAILED_OPEN:
928           if (regionState == null
929               || !regionState.isPendingOpenOrOpeningOnServer(sn)) {
930             LOG.warn("Received FAILED_OPEN for " + prettyPrintedRegionName
931               + " from " + sn + " but the region isn't PENDING_OPEN/OPENING here: "
932               + regionStates.getRegionState(encodedName));
933             return;
934           }
935           AtomicInteger failedOpenCount = failedOpenTracker.get(encodedName);
936           if (failedOpenCount == null) {
937             failedOpenCount = new AtomicInteger();
938             // No need to use putIfAbsent, or extra synchronization since
939             // this whole handleRegion block is locked on the encoded region
940             // name, and failedOpenTracker is updated only in this block
941             failedOpenTracker.put(encodedName, failedOpenCount);
942           }
943           if (failedOpenCount.incrementAndGet() >= maximumAttempts) {
944             regionStates.updateRegionState(rt, RegionState.State.FAILED_OPEN);
945             // remove the tracking info to save memory, also reset
946             // the count for next open initiative
947             failedOpenTracker.remove(encodedName);
948           } else {
949             // Handle this the same as if it were opened and then closed.
950             regionState = regionStates.updateRegionState(rt, RegionState.State.CLOSED);
951             if (regionState != null) {
952               // When there is more than one region server, a new RS is selected as the
953               // destination and the region plan is updated accordingly. (HBASE-5546)
954               try {
955                 getRegionPlan(regionState.getRegion(), sn, true);
956                 new ClosedRegionHandler(server, this, regionState.getRegion()).process();
957               } catch (HBaseIOException e) {
958                 LOG.warn("Failed to get region plan", e);
959               }
960             }
961           }
962           break;
963 
964         case RS_ZK_REGION_OPENING:
965           // Should see OPENING after we have asked it to OPEN or additional
966           // times after already being in state of OPENING
967           if (regionState == null
968               || !regionState.isPendingOpenOrOpeningOnServer(sn)) {
969             LOG.warn("Received OPENING for " + prettyPrintedRegionName
970               + " from " + sn + " but the region isn't PENDING_OPEN/OPENING here: "
971               + regionStates.getRegionState(encodedName));
972             return;
973           }
974           // Transition to OPENING (or update stamp if already OPENING)
975           regionStates.updateRegionState(rt, RegionState.State.OPENING);
976           break;
977 
978         case RS_ZK_REGION_OPENED:
979           // Should see OPENED after OPENING but possible after PENDING_OPEN.
980           if (regionState == null
981               || !regionState.isPendingOpenOrOpeningOnServer(sn)) {
982             LOG.warn("Received OPENED for " + prettyPrintedRegionName
983               + " from " + sn + " but the region isn't PENDING_OPEN/OPENING here: "
984               + regionStates.getRegionState(encodedName));
985 
986             // Close it without updating the internal region states,
987             // so as not to create double assignments in unlucky scenarios
988             // mentioned in OpenRegionHandler#process (guard against a null regionState)
989             if (regionState != null) unassign(regionState.getRegion(), null, -1, null, false, sn);
990             return;
991           }
992           // Handle OPENED by removing from transition and deleted zk node
993           regionState = regionStates.updateRegionState(rt, RegionState.State.OPEN);
994           if (regionState != null) {
995             failedOpenTracker.remove(encodedName); // reset the count, if any
996             new OpenedRegionHandler(
997               server, this, regionState.getRegion(), sn, expectedVersion).process();
998             openedRegionHandlerCalled.put(regionState.getRegion(), new AtomicBoolean(true));
999           }
1000           break;
1001 
1002         default:
1003           throw new IllegalStateException("Received event is not valid.");
1004       }
1005     } finally {
1006       lock.unlock();
1007     }
1008   }
1009 
1010   //For unit tests only
1011   boolean wasClosedHandlerCalled(HRegionInfo hri) {
1012     AtomicBoolean b = closedRegionHandlerCalled.get(hri);
1013     // compareAndSet to be sure that unit tests don't see stale values. This means
1014     // we will return true exactly once unless the handler code resets this value
1015     // to true again.
1016     return b == null ? false : b.compareAndSet(true, false);
1017   }
1018 
1019   //For unit tests only
1020   boolean wasOpenedHandlerCalled(HRegionInfo hri) {
1021     AtomicBoolean b = openedRegionHandlerCalled.get(hri);
1022     // compareAndSet to be sure that unit tests don't see stale values. This means
1023     // we will return true exactly once unless the handler code resets this value
1024     // to true again.
1025     return b == null ? false : b.compareAndSet(true, false);
1026   }
1027 
1028   //For unit tests only
1029   boolean wasSplitHandlerCalled() {
1030     // compareAndSet to be sure that unit tests don't see stale values. This means
1031     // we will return true exactly once unless the handler code resets this value
1032     // to true again.
1033     return splitRegionHandlerCalled.compareAndSet(true, false);
1034   }
1035 
1036   /**
1037    * @return Returns true if this RegionState is splittable; i.e. the
1038    * RegionState is currently in splitting state or pending_close or
1039    * null. (Anything else will return false.)
1040    */
1041   private boolean isInStateForSplitting(final RegionState rs) {
1042     if (rs == null) return true;
1043     if (rs.isSplitting()) return true;
1044     if (convertPendingCloseToSplitting(rs)) return true;
1045     LOG.warn("Dropped region split! Not in state good for SPLITTING; rs=" + rs);
1046     return false;
1047   }
1048 
1049   /**
1050    * @return Returns true if both regions are merging/open on specified server
1051    */
1052   private boolean isInStateForMerging(final ServerName sn,
1053       final HRegionInfo a, final HRegionInfo b) {
1054     RegionState rs_a = regionStates.getRegionState(a);
1055     RegionState rs_b = regionStates.getRegionState(b);
1056     return ((rs_a == null || rs_a.isOpenOrMergingOnServer(sn))
1057       && (rs_b == null || rs_b.isOpenOrMergingOnServer(sn)));
1058   }
1059 
1060   // TODO: processFavoredNodes might throw an exception, e.g., if the
1061   // meta could not be contacted/updated. We need to decide how seriously to treat
1062   // this problem. Should we fail the current assignment? We should be able
1063   // to recover from this problem eventually (if the meta couldn't be updated,
1064   // things should work normally and eventually get fixed up).
1065   void processFavoredNodes(List<HRegionInfo> regions) throws IOException {
1066     if (!shouldAssignRegionsWithFavoredNodes) return;
1067     // The AM gets the favored nodes info for each region and updates the meta
1068     // table with that info
1069     Map<HRegionInfo, List<ServerName>> regionToFavoredNodes =
1070         new HashMap<HRegionInfo, List<ServerName>>();
1071     for (HRegionInfo region : regions) {
1072       regionToFavoredNodes.put(region,
1073           ((FavoredNodeLoadBalancer)this.balancer).getFavoredNodes(region));
1074     }
1075     FavoredNodeAssignmentHelper.updateMetaWithFavoredNodesInfo(regionToFavoredNodes, catalogTracker);
1076   }
1077 
1078   /**
1079    * If the passed regionState is in PENDING_CLOSE, clean up PENDING_CLOSE
1080    * state and convert it to SPLITTING instead.
1081    * This can happen in the case where the master wants to close a region at the same time
1082    * a regionserver starts a split.  The split won.  Clean out the old PENDING_CLOSE
1083    * state.
1084    * @param rs
1085    * @return True if we converted from PENDING_CLOSE to SPLITTING
1086    */
1087   private boolean convertPendingCloseToSplitting(final RegionState rs) {
1088     if (!rs.isPendingClose()) return false;
1089     LOG.debug("Converting PENDING_CLOSE to SPLITTING; rs=" + rs);
1090     regionStates.updateRegionState(
1091       rs.getRegion(), RegionState.State.SPLITTING);
1092     // Clean up existing state.  Clear from region plans seems all we
1093     // have to do here by way of clean up of PENDING_CLOSE.
1094     clearRegionPlan(rs.getRegion());
1095     return true;
1096   }
1097 
1098   /**
1099    * Handle a ZK unassigned node transition triggered by HBCK repair tool.
1100    * <p>
1101    * This is handled in a separate code path because it breaks the normal rules.
1102    * @param rt
1103    */
1104   private void handleHBCK(RegionTransition rt) {
1105     String encodedName = HRegionInfo.encodeRegionName(rt.getRegionName());
1106     LOG.info("Handling HBCK triggered transition=" + rt.getEventType() +
1107       ", server=" + rt.getServerName() + ", region=" +
1108       HRegionInfo.prettyPrint(encodedName));
1109     RegionState regionState = regionStates.getRegionTransitionState(encodedName);
1110     switch (rt.getEventType()) {
1111       case M_ZK_REGION_OFFLINE:
1112         HRegionInfo regionInfo;
1113         if (regionState != null) {
1114           regionInfo = regionState.getRegion();
1115         } else {
1116           try {
1117             byte [] name = rt.getRegionName();
1118             Pair<HRegionInfo, ServerName> p = MetaReader.getRegion(catalogTracker, name);
1119             regionInfo = p.getFirst();
1120           } catch (IOException e) {
1121             LOG.info("Exception reading META doing HBCK repair operation", e);
1122             return;
1123           }
1124         }
1125         LOG.info("HBCK repair is triggering assignment of region=" +
1126             regionInfo.getRegionNameAsString());
1127         // trigger assign, node is already in OFFLINE so don't need to update ZK
1128         assign(regionInfo, false);
1129         break;
1130 
1131       default:
1132         LOG.warn("Received unexpected region state from HBCK: " + rt.toString());
1133         break;
1134     }
1135 
1136   }
1137 
1138   // ZooKeeper events
1139 
1140   /**
1141    * New unassigned node has been created.
1142    *
1143    * <p>This happens when an RS begins the OPENING or CLOSING of a region by
1144    * creating an unassigned node.
1145    *
1146    * <p>When this happens we must:
1147    * <ol>
1148    *   <li>Watch the node for further events</li>
1149    *   <li>Read and handle the state in the node</li>
1150    * </ol>
1151    */
1152   @Override
1153   public void nodeCreated(String path) {
1154     handleAssignmentEvent(path);
1155   }
1156 
1157   /**
1158    * Existing unassigned node has had data changed.
1159    *
1160    * <p>This happens when an RS transitions from OFFLINE to OPENING, or between
1161    * OPENING/OPENED and CLOSING/CLOSED.
1162    *
1163    * <p>When this happens we must:
1164    * <ol>
1165    *   <li>Watch the node for further events</li>
1166    *   <li>Read and handle the state in the node</li>
1167    * </ol>
1168    */
1169   @Override
1170   public void nodeDataChanged(String path) {
1171     handleAssignmentEvent(path);
1172   }
1173 
1174 
1175   // We don't want to have two events on the same region managed simultaneously.
1176   // For this reason, we need to wait if an event on the same region is currently in progress.
1177   // So we track the region names of the events in progress, and we keep a waiting list.
1178   private final Set<String> regionsInProgress = new HashSet<String>();
1179   // In a LinkedHashMultimap, the put order is preserved when we retrieve the collection back. We need
1180   //  this because we want the events to be managed in the same order as we received them.
1181   private final LinkedHashMultimap <String, RegionRunnable>
1182       zkEventWorkerWaitingList = LinkedHashMultimap.create();
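  // A small, hedged illustration (not from this file) of the ordering property relied on
  // above: LinkedHashMultimap returns the values of a key in insertion order, so the
  // oldest queued event for a region is picked up first.
  //   LinkedHashMultimap<String, String> m = LinkedHashMultimap.create();
  //   m.put("region-1", "eventA");
  //   m.put("region-1", "eventB");
  //   m.get("region-1").iterator().next();   // "eventA", the event queued first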
1183 
1184   /**
1185    * A specific runnable that works only on a region.
1186    */
1187   private interface RegionRunnable extends Runnable{
1188     /**
1189      * @return - the name of the region it works on.
1190      */
1191     String getRegionName();
1192   }
1193 
1194   /**
1195    * Submit a task, ensuring that there is only one task at a time working on a given region.
1196    * Order is respected.
1197    */
1198   protected void zkEventWorkersSubmit(final RegionRunnable regRunnable) {
1199 
1200     synchronized (regionsInProgress) {
1201       // If there is already a task for this region, we add it to the
1202       //  waiting list and return.
1203       if (regionsInProgress.contains(regRunnable.getRegionName())) {
1204         synchronized (zkEventWorkerWaitingList){
1205           zkEventWorkerWaitingList.put(regRunnable.getRegionName(), regRunnable);
1206         }
1207         return;
1208       }
1209 
1210       // No event in progress on this region => we can submit a new task immediately.
1211       regionsInProgress.add(regRunnable.getRegionName());
1212       zkEventWorkers.submit(new Runnable() {
1213         @Override
1214         public void run() {
1215           try {
1216             regRunnable.run();
1217           } finally {
1218             // Now that we have finished, let's see if there is an event for the same region in the
1219             //  waiting list. If so, we can now submit it to the pool.
1220             synchronized (regionsInProgress) {
1221               regionsInProgress.remove(regRunnable.getRegionName());
1222               synchronized (zkEventWorkerWaitingList) {
1223                 java.util.Set<RegionRunnable> waiting = zkEventWorkerWaitingList.get(
1224                     regRunnable.getRegionName());
1225                 if (!waiting.isEmpty()) {
1226                   // We want the first object only. The only way to get it is through an iterator.
1227                   RegionRunnable toSubmit = waiting.iterator().next();
1228                   zkEventWorkerWaitingList.remove(toSubmit.getRegionName(), toSubmit);
1229                   zkEventWorkersSubmit(toSubmit);
1230                 }
1231               }
1232             }
1233           }
1234         }
1235       });
1236     }
1237   }
1238 
1239   @Override
1240   public void nodeDeleted(final String path) {
1241     if (path.startsWith(watcher.assignmentZNode)) {
1242       final String regionName = ZKAssign.getRegionName(watcher, path);
1243       zkEventWorkersSubmit(new RegionRunnable() {
1244         @Override
1245         public String getRegionName() {
1246           return regionName;
1247         }
1248 
1249         @Override
1250         public void run() {
1251           Lock lock = locker.acquireLock(regionName);
1252           try {
1253             RegionState rs = regionStates.getRegionTransitionState(regionName);
1254             if (rs == null) return;
1255 
1256             HRegionInfo regionInfo = rs.getRegion();
1257             String regionNameStr = regionInfo.getRegionNameAsString();
1258             LOG.debug("The znode of " + regionNameStr
1259               + " has been deleted, region state: " + rs);
1260             if (rs.isOpened()) {
1261               ServerName serverName = rs.getServerName();
1262               regionOnline(regionInfo, serverName);
1263               LOG.info("The master has opened "
1264                 + regionNameStr + " that was online on " + serverName);
1265               boolean disabled = getZKTable().isDisablingOrDisabledTable(
1266                 regionInfo.getTableName());
1267               if (!serverManager.isServerOnline(serverName) && !disabled) {
1268                 LOG.info("Opened " + regionNameStr
1269                   + " but the region server is offline, reassign the region");
1270                 assign(regionInfo, true);
1271               } else if (disabled) {
1272                 // if server is offline, no hurt to unassign again
1273                 LOG.info("Opened " + regionNameStr
1274                   + " but this table is disabled, triggering close of region");
1275                 unassign(regionInfo);
1276               }
1277             }
1278           } finally {
1279             lock.unlock();
1280           }
1281         }
1282       });
1283     }
1284   }
1285 
1286   /**
1287    * New unassigned node has been created.
1288    *
1289    * <p>This happens when an RS begins the OPENING, SPLITTING or CLOSING of a
1290    * region by creating a znode.
1291    *
1292    * <p>When this happens we must:
1293    * <ol>
1294    *   <li>Watch the node for further children changed events</li>
1295    *   <li>Watch all new children for changed events</li>
1296    * </ol>
1297    */
1298   @Override
1299   public void nodeChildrenChanged(String path) {
1300     if (path.equals(watcher.assignmentZNode)) {
1301       zkEventWorkers.submit(new Runnable() {
1302         @Override
1303         public void run() {
1304           try {
1305             // Just make sure we see the changes for the new znodes
1306             List<String> children =
1307               ZKUtil.listChildrenAndWatchForNewChildren(
1308                 watcher, watcher.assignmentZNode);
1309             if (children != null) {
1310               Stat stat = new Stat();
1311               for (String child : children) {
1312                 // If the region is in transition, we already have a watch
1313                 // on it, so there is no need to watch it again. As far as we know for now,
1314                 // this is needed to watch splitting nodes only.
1315                 if (!regionStates.isRegionInTransition(child)) {
1316                   stat.setVersion(0);
1317                   byte[] data = ZKAssign.getDataAndWatch(watcher,
1318                     ZKUtil.joinZNode(watcher.assignmentZNode, child), stat);
1319                   if (data != null && stat.getVersion() > 0) {
1320                     try {
1321                       RegionTransition rt = RegionTransition.parseFrom(data);
1322 
1323                       //See HBASE-7551, handle splitting too, in case we miss the node change event
1324                       EventType type = rt.getEventType();
1325                       if (type == EventType.RS_ZK_REGION_SPLITTING
1326                           || type == EventType.RS_ZK_REGION_MERGING) {
1327                         handleRegion(rt, stat.getVersion());
1328                       }
1329                     } catch (DeserializationException de) {
1330                       LOG.error("error getting data for " + child, de);
1331                     }
1332                   }
1333                 }
1334               }
1335             }
1336           } catch (KeeperException e) {
1337             server.abort("Unexpected ZK exception reading unassigned children", e);
1338           }
1339         }
1340       });
1341     }
1342   }
1343 
1344   /**
1345    * Marks the region as online.  Removes it from regions in transition and
1346    * updates the in-memory assignment information.
1347    * <p>
1348    * Used when a region has been successfully opened on a region server.
1349    * @param regionInfo
1350    * @param sn
1351    */
1352   void regionOnline(HRegionInfo regionInfo, ServerName sn) {
1353     if (!serverManager.isServerOnline(sn)) {
1354       LOG.warn("A region was opened on a dead server, ServerName=" +
1355         sn + ", region=" + regionInfo.getEncodedName());
1356     }
1357 
1358     regionStates.regionOnline(regionInfo, sn);
1359 
1360     // Remove plan if one.
1361     clearRegionPlan(regionInfo);
1362     // Add the server to serversInUpdatingTimer
1363     addToServersInUpdatingTimer(sn);
1364   }
1365 
1366   /**
1367    * Pass the assignment event to a worker for processing.
1368    * Each worker is a single thread executor service.  The reason
1369    * for just one thread is to make sure all events for a given
1370    * region are processed in order.
1371    *
1372    * @param path
1373    */
1374   private void handleAssignmentEvent(final String path) {
1375     if (path.startsWith(watcher.assignmentZNode)) {
1376       final String regionName = ZKAssign.getRegionName(watcher, path);
1377 
1378       zkEventWorkersSubmit(new RegionRunnable() {
1379         @Override
1380         public String getRegionName() {
1381           return regionName;
1382         }
1383 
1384         @Override
1385         public void run() {
1386           try {
1387             Stat stat = new Stat();
1388             byte [] data = ZKAssign.getDataAndWatch(watcher, path, stat);
1389             if (data == null) return;
1390 
1391             RegionTransition rt = RegionTransition.parseFrom(data);
1392             handleRegion(rt, stat.getVersion());
1393           } catch (KeeperException e) {
1394             server.abort("Unexpected ZK exception reading unassigned node data", e);
1395           } catch (DeserializationException e) {
1396             server.abort("Unexpected exception deserializing node data", e);
1397           }
1398         }
1399       });
1400     }
1401   }
1402 
1403   /**
1404    * Add the server to the set serversInUpdatingTimer; {@link TimerUpdater}
1405    * will then update timers for this server in the background.
1406    * @param sn
1407    */
1408   private void addToServersInUpdatingTimer(final ServerName sn) {
1409     if (tomActivated){
1410       this.serversInUpdatingTimer.add(sn);
1411     }
1412   }
1413 
1414   /**
1415    * Touch timers for all regions in transition that have the passed
1416    * <code>sn</code> in common.
1417    * Call this method whenever a server checks in.  Doing so helps the case where
1418    * a new regionserver has joined the cluster and it has been given 1k regions to
1419    * open.  If this method is tickled every time a region reports a successful
1420    * open, then the 1k-th region won't be timed out just because it is sitting
1421    * behind the open of 999 other regions.  This method is NOT used as part of
1422    * bulk assign -- there we have a different mechanism for extending the regions
1423    * in transition timer (we turn it off temporarily, because there is no
1424    * regionplan involved when bulk assigning).
1425    * @param sn
1426    */
1427   private void updateTimers(final ServerName sn) {
1428     Preconditions.checkState(tomActivated);
1429     if (sn == null) return;
1430 
1431     // This loop could be expensive.
1432     // First make a copy of the current regionPlans rather than holding the sync while
1433     // looping, because holding the sync can cause deadlock.  It's ok in this loop
1434     // if the Map we're going against is a little stale.
1435     List<Map.Entry<String, RegionPlan>> rps;
1436     synchronized(this.regionPlans) {
1437       rps = new ArrayList<Map.Entry<String, RegionPlan>>(regionPlans.entrySet());
1438     }
1439 
1440     for (Map.Entry<String, RegionPlan> e : rps) {
1441       if (e.getValue() != null && e.getKey() != null && sn.equals(e.getValue().getDestination())) {
1442         RegionState regionState = regionStates.getRegionTransitionState(e.getKey());
1443         if (regionState != null) {
1444           regionState.updateTimestampToNow();
1445         }
1446       }
1447     }
1448   }
1449 
1450   /**
1451    * Marks the region as offline.  Removes it from regions in transition and
1452    * removes in-memory assignment information.
1453    * <p>
1454    * Used when a region has been closed and should remain closed.
1455    * @param regionInfo
1456    */
1457   public void regionOffline(final HRegionInfo regionInfo) {
1458     regionOffline(regionInfo, null);
1459   }
1460 
1461   public void offlineDisabledRegion(HRegionInfo regionInfo) {
1462     // Disabling so should not be reassigned, just delete the CLOSED node
1463     LOG.debug("Table being disabled so deleting ZK node and removing from " +
1464         "regions in transition, skipping assignment of region " +
1465           regionInfo.getRegionNameAsString());
1466     try {
1467       if (!ZKAssign.deleteClosedNode(watcher, regionInfo.getEncodedName())) {
1468         // Could also be in OFFLINE mode
1469         ZKAssign.deleteOfflineNode(watcher, regionInfo.getEncodedName());
1470       }
1471     } catch (KeeperException.NoNodeException nne) {
1472       LOG.debug("Tried to delete closed node for " + regionInfo + " but it " +
1473           "does not exist so just offlining");
1474     } catch (KeeperException e) {
1475       this.server.abort("Error deleting CLOSED node in ZK", e);
1476     }
1477     regionOffline(regionInfo);
1478   }
1479 
1480   // Assignment methods
1481 
1482   /**
1483    * Assigns the specified region.
1484    * <p>
1485    * If a RegionPlan is available with a valid destination then it will be used
1486    * to determine what server region is assigned to.  If no RegionPlan is
1487    * available, region will be assigned to a random available server.
1488    * <p>
1489    * Updates the RegionState and sends the OPEN RPC.
1490    * <p>
1491    * This will only succeed if the region is either in transition and in a CLOSED or
1492    * OFFLINE state, or not in transition at all (checked in-memory, not in zk), and of
1493    * course, if the chosen server is up and running (it may have just crashed!).  If
1494    * the in-memory checks pass, the zk node is forced to OFFLINE before assigning.
1495    *
1496    * @param region region to be assigned
1497    * @param setOfflineInZK whether ZK node should be created/transitioned to an
1498    *                       OFFLINE state before assigning the region
1499    */
1500   public void assign(HRegionInfo region, boolean setOfflineInZK) {
1501     assign(region, setOfflineInZK, false);
1502   }
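
  // Hedged usage sketch (hypothetical caller; "am" is an assumed AssignmentManager reference):
  //  a typical caller forces the znode to OFFLINE before the OPEN RPC is sent.
  //
  //    HRegionInfo hri = ...;      // region to (re)assign
  //    am.assign(hri, true);       // setOfflineInZK = true
  //    am.assign(hri, true, true); // additionally discard any existing RegionPlan (use with care)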
1503 
1504   /**
1505    * Use care with forceNewPlan. It could cause double assignment.
1506    */
1507   public void assign(HRegionInfo region,
1508       boolean setOfflineInZK, boolean forceNewPlan) {
1509     if (!setOfflineInZK && isDisabledorDisablingRegionInRIT(region)) {
1510       return;
1511     }
1512     if (this.serverManager.isClusterShutdown()) {
1513       LOG.info("Cluster shutdown is set; skipping assign of " +
1514         region.getRegionNameAsString());
1515       return;
1516     }
1517     String encodedName = region.getEncodedName();
1518     Lock lock = locker.acquireLock(encodedName);
1519     try {
1520       RegionState state = forceRegionStateToOffline(region, forceNewPlan);
1521       if (state != null) {
1522         assign(state, setOfflineInZK, forceNewPlan);
1523       }
1524     } finally {
1525       lock.unlock();
1526     }
1527   }
1528 
1529   /**
1530    * Bulk assign regions to <code>destination</code>.
1531    * @param destination
1532    * @param regions Regions to assign.
1533    * @return true if successful
1534    */
1535   boolean assign(final ServerName destination, final List<HRegionInfo> regions) {
1536     int regionCount = regions.size();
1537     if (regionCount == 0) {
1538       return true;
1539     }
1540     LOG.debug("Assigning " + regionCount + " region(s) to " + destination.toString());
1541     Set<String> encodedNames = new HashSet<String>(regionCount);
1542     for (HRegionInfo region : regions) {
1543       encodedNames.add(region.getEncodedName());
1544     }
1545 
1546     List<HRegionInfo> failedToOpenRegions = new ArrayList<HRegionInfo>();
1547     Map<String, Lock> locks = locker.acquireLocks(encodedNames);
1548     try {
1549       AtomicInteger counter = new AtomicInteger(0);
1550       Map<String, Integer> offlineNodesVersions = new ConcurrentHashMap<String, Integer>();
1551       OfflineCallback cb = new OfflineCallback(
1552         watcher, destination, counter, offlineNodesVersions);
1553       Map<String, RegionPlan> plans = new HashMap<String, RegionPlan>(regions.size());
1554       List<RegionState> states = new ArrayList<RegionState>(regions.size());
1555       for (HRegionInfo region : regions) {
1556         String encodedRegionName = region.getEncodedName();
1557         RegionState state = forceRegionStateToOffline(region, true);
1558         if (state != null && asyncSetOfflineInZooKeeper(state, cb, destination)) {
1559           RegionPlan plan = new RegionPlan(region, state.getServerName(), destination);
1560           plans.put(encodedRegionName, plan);
1561           states.add(state);
1562         } else {
1563           LOG.warn("failed to force region state to offline or "
1564             + "failed to set it offline in ZK, will reassign later: " + region);
1565           failedToOpenRegions.add(region); // assign individually later
1566           Lock lock = locks.remove(encodedRegionName);
1567           lock.unlock();
1568         }
1569       }
1570 
1571       // Wait until all unassigned nodes have been put up and watchers set.
1572       int total = states.size();
1573       for (int oldCounter = 0; !server.isStopped();) {
1574         int count = counter.get();
1575         if (oldCounter != count) {
1576           LOG.info(destination.toString() + " unassigned znodes=" + count +
1577             " of total=" + total);
1578           oldCounter = count;
1579         }
1580         if (count >= total) break;
1581         Threads.sleep(5);
1582       }
1583 
1584       if (server.isStopped()) {
1585         return false;
1586       }
1587 
1588       // Add region plans, so we can updateTimers when one region is opened so
1589       // that unnecessary timeout on RIT is reduced.
1590       this.addPlans(plans);
1591 
1592       List<Triple<HRegionInfo, Integer, List<ServerName>>> regionOpenInfos =
1593         new ArrayList<Triple<HRegionInfo, Integer, List<ServerName>>>(states.size());
1594       for (RegionState state: states) {
1595         HRegionInfo region = state.getRegion();
1596         String encodedRegionName = region.getEncodedName();
1597         Integer nodeVersion = offlineNodesVersions.get(encodedRegionName);
1598         if (nodeVersion == null || nodeVersion == -1) {
1599           LOG.warn("failed to offline in zookeeper: " + region);
1600           failedToOpenRegions.add(region); // assign individually later
1601           Lock lock = locks.remove(encodedRegionName);
1602           lock.unlock();
1603         } else {
1604           regionStates.updateRegionState(region,
1605             RegionState.State.PENDING_OPEN, destination);
1606           List<ServerName> favoredNodes = ServerName.EMPTY_SERVER_LIST;
1607           if (this.shouldAssignRegionsWithFavoredNodes) {
1608             favoredNodes = ((FavoredNodeLoadBalancer)this.balancer).getFavoredNodes(region);
1609           }
1610           regionOpenInfos.add(new Triple<HRegionInfo, Integer,  List<ServerName>>(
1611             region, nodeVersion, favoredNodes));
1612         }
1613       }
1614 
1615       // Move on to open regions.
1616       try {
1617         // Send OPEN RPC. If it fails on an IOE or RemoteException, the
1618         // TimeoutMonitor will pick up the pieces.
1619         long maxWaitTime = System.currentTimeMillis() +
1620           this.server.getConfiguration().
1621             getLong("hbase.regionserver.rpc.startup.waittime", 60000);
1622         for (int i = 1; i <= maximumAttempts && !server.isStopped(); i++) {
1623           try {
1624             List<RegionOpeningState> regionOpeningStateList = serverManager
1625               .sendRegionOpen(destination, regionOpenInfos);
1626             if (regionOpeningStateList == null) {
1627               // Failed getting RPC connection to this server
1628               return false;
1629             }
1630             for (int k = 0, n = regionOpeningStateList.size(); k < n; k++) {
1631               RegionOpeningState openingState = regionOpeningStateList.get(k);
1632               if (openingState != RegionOpeningState.OPENED) {
1633                 HRegionInfo region = regionOpenInfos.get(k).getFirst();
1634                 if (openingState == RegionOpeningState.ALREADY_OPENED) {
1635                   processAlreadyOpenedRegion(region, destination);
1636                 } else if (openingState == RegionOpeningState.FAILED_OPENING) {
1637                   // Failed opening this region, reassign it later
1638                   failedToOpenRegions.add(region);
1639                 } else {
1640                   LOG.warn("THIS SHOULD NOT HAPPEN: unknown opening state "
1641                     + openingState + " in assigning region " + region);
1642                 }
1643               }
1644             }
1645             break;
1646           } catch (IOException e) {
1647             if (e instanceof RemoteException) {
1648               e = ((RemoteException)e).unwrapRemoteException();
1649             }
1650             if (e instanceof RegionServerStoppedException) {
1651               LOG.warn("The region server was shut down, ", e);
1652               // No need to retry, the region server is a goner.
1653               return false;
1654             } else if (e instanceof ServerNotRunningYetException) {
1655               long now = System.currentTimeMillis();
1656               if (now < maxWaitTime) {
1657                 LOG.debug("Server is not yet up; waiting up to " +
1658                   (maxWaitTime - now) + "ms", e);
1659                 Thread.sleep(100);
1660                 i--; // reset the try count
1661                 continue;
1662               }
1663             } else if (e instanceof java.net.SocketTimeoutException
1664                 && this.serverManager.isServerOnline(destination)) {
1665               // In case the socket timed out and the region server is still online,
1666               // the openRegion RPC could have been accepted by the server and
1667               // just the response didn't go through.  So we will retry to
1668               // open the region on the same server.
1669               if (LOG.isDebugEnabled()) {
1670                 LOG.debug("Bulk assigner openRegion() to " + destination
1671                   + " has timed out, but the regions might"
1672                   + " already be opened on it.", e);
1673               }
1674               continue;
1675             }
1676             throw e;
1677           }
1678         }
1679       } catch (IOException e) {
1680         // Can be a socket timeout, EOF, NoRouteToHost, etc
1681         LOG.info("Unable to communicate with the region server in order" +
1682           " to assign regions", e);
1683         return false;
1684       } catch (InterruptedException e) {
1685         throw new RuntimeException(e);
1686       }
1687     } finally {
1688       for (Lock lock : locks.values()) {
1689         lock.unlock();
1690       }
1691     }
1692 
1693     if (!failedToOpenRegions.isEmpty()) {
1694       for (HRegionInfo region : failedToOpenRegions) {
1695         invokeAssign(region);
1696       }
1697     }
1698     LOG.debug("Bulk assigning done for " + destination.toString());
1699     return true;
1700   }
1701 
1702   /**
1703    * Send CLOSE RPC if the server is online, otherwise, offline the region.
1704    *
1705    * The RPC will be sent only to the region server found in the region state
1706    * if it is passed in; otherwise, to the src server specified. If the region
1707    * state is not specified, we don't update the region state at all, instead
1708    * we just send the RPC call. This is useful for some cleanup without
1709    * messing around with the region states (see handleRegion, the region opened
1710    * on an unexpected server scenario, for an example).
1711    */
1712   private void unassign(final HRegionInfo region,
1713       final RegionState state, final int versionOfClosingNode,
1714       final ServerName dest, final boolean transitionInZK,
1715       final ServerName src) {
1716     ServerName server = src;
1717     if (state != null) {
1718       server = state.getServerName();
1719     }
1720     for (int i = 1; i <= this.maximumAttempts; i++) {
1721       // ClosedRegionHandler can remove the server from this.regions
1722       if (!serverManager.isServerOnline(server)) {
1723         if (transitionInZK) {
1724           // Delete the node. If no node exists, we need not bother.
1725           deleteClosingOrClosedNode(region);
1726         }
1727         if (state != null) {
1728           regionOffline(region);
1729         }
1730         return;
1731       }
1732       try {
1733         // Send CLOSE RPC
1734         if (serverManager.sendRegionClose(server, region,
1735           versionOfClosingNode, dest, transitionInZK)) {
1736           LOG.debug("Sent CLOSE to " + server + " for region " +
1737             region.getRegionNameAsString());
1738           return;
1739         }
1740         // This never happens. Currently regionserver close always returns true.
1741         // TODO: this can now happen (0.96) if there is an exception in a coprocessor
1742         LOG.warn("Server " + server + " region CLOSE RPC returned false for " +
1743           region.getRegionNameAsString());
1744       } catch (Throwable t) {
1745         if (t instanceof RemoteException) {
1746           t = ((RemoteException)t).unwrapRemoteException();
1747         }
1748         if (t instanceof NotServingRegionException
1749             || t instanceof RegionServerStoppedException) {
1750           if (transitionInZK) {
1751             deleteClosingOrClosedNode(region);
1752           }
1753           if (state != null) {
1754             regionOffline(region);
1755           }
1756           return;
1757         } else if (state != null
1758             && t instanceof RegionAlreadyInTransitionException) {
1759           // RS is already processing this region, only need to update the timestamp
1760           LOG.debug("Updating the timestamp for " + state);
1761           state.updateTimestampToNow();
1762         }
1763         LOG.info("Server " + server + " returned " + t + " for "
1764           + region.getRegionNameAsString() + ", try=" + i
1765           + " of " + this.maximumAttempts, t);
1766         // Presume retry or server will expire.
1767       }
1768     }
1769     // Run out of attempts
1770     if (!tomActivated && state != null) {
1771       regionStates.updateRegionState(region, RegionState.State.FAILED_CLOSE);
1772     }
1773   }
1774 
1775   /**
1776    * Set region to OFFLINE unless it is opening and forceNewPlan is false.
1777    */
1778   private RegionState forceRegionStateToOffline(
1779       final HRegionInfo region, final boolean forceNewPlan) {
1780     RegionState state = regionStates.getRegionState(region);
1781     if (state == null) {
1782       LOG.warn("Assigning a region not in region states: " + region);
1783       state = regionStates.createRegionState(region);
1784     } else {
1785       switch (state.getState()) {
1786       case OPEN:
1787       case OPENING:
1788       case PENDING_OPEN:
1789         if (!forceNewPlan) {
1790           LOG.debug("Attempting to assign region " +
1791             region + " but it is already in transition: " + state);
1792           return null;
1793         }
1794       case CLOSING:
1795       case PENDING_CLOSE:
1796       case FAILED_CLOSE:
1797       case FAILED_OPEN:
1798         unassign(region, state, -1, null, false, null);
1799         state = regionStates.getRegionState(region);
1800         if (state.isOffline()) break;
1801       case CLOSED:
1802         LOG.debug("Forcing OFFLINE; was=" + state);
1803         state = regionStates.updateRegionState(
1804           region, RegionState.State.OFFLINE);
1805       case OFFLINE:
1806         break;
1807       default:
1808         LOG.error("Trying to assign region " + region
1809           + ", which is in state " + state);
1810         return null;
1811       }
1812     }
1813     return state;
1814   }
1815 
1816   /**
1817    * Caller must hold lock on the passed <code>state</code> object.
1818    * @param state
1819    * @param setOfflineInZK
1820    * @param forceNewPlan
1821    */
1822   private void assign(RegionState state,
1823       final boolean setOfflineInZK, final boolean forceNewPlan) {
1824     RegionState currentState = state;
1825     int versionOfOfflineNode = -1;
1826     RegionPlan plan = null;
1827     long maxRegionServerStartupWaitTime = -1;
1828     HRegionInfo region = state.getRegion();
1829     RegionOpeningState regionOpenState;
1830     for (int i = 1; i <= maximumAttempts && !server.isStopped(); i++) {
1831       if (plan == null) { // Get a server for the region at first
1832         try {
1833           plan = getRegionPlan(region, forceNewPlan);
1834         } catch (HBaseIOException e) {
1835           LOG.warn("Failed to get region plan", e);
1836         }
1837       }
1838       if (plan == null) {
1839         LOG.warn("Unable to determine a plan to assign " + region);
1840         if (tomActivated){
1841           this.timeoutMonitor.setAllRegionServersOffline(true);
1842         } else {
1843           if (region.isMetaRegion()) {
1844             try {
1845               if (i != maximumAttempts) {
1846                 Thread.sleep(this.sleepTimeBeforeRetryingMetaAssignment);
1847                 continue;
1848               }
1849               // TODO : Ensure HBCK fixes this
1850               LOG.error("Unable to determine a plan to assign META even after repeated attempts. Run HBCK to fix this");
1851             } catch (InterruptedException e) {
1852               LOG.error("Got exception while waiting for META assignment");
1853               Thread.currentThread().interrupt();
1854             }
1855           }
1856           regionStates.updateRegionState(region, RegionState.State.FAILED_OPEN);
1857         }
1858         return;
1859       }
1860       if (setOfflineInZK && versionOfOfflineNode == -1) {
1861         // get the version of the znode after setting it to OFFLINE.
1862         // versionOfOfflineNode will be -1 if the znode was not set to OFFLINE
1863         versionOfOfflineNode = setOfflineInZooKeeper(currentState, plan.getDestination());
1864         if (versionOfOfflineNode != -1) {
1865           if (isDisabledorDisablingRegionInRIT(region)) {
1866             return;
1867           }
1868           // In case of assignment from EnableTableHandler, the table state is ENABLING. Anyhow,
1869           // EnableTableHandler will set it to ENABLED after assigning all the table regions. If we
1870           // tried to set it to ENABLED directly, the client API might think the table is enabled.
1871           // When all the regions are added directly into .META. and we call assignRegion, then we
1872           // need to make the table ENABLED. Hence in such a case the table will not be in
1873           // ENABLING or ENABLED state.
1874           TableName tableName = region.getTableName();
1875           if (!zkTable.isEnablingTable(tableName) && !zkTable.isEnabledTable(tableName)) {
1876             LOG.debug("Setting table " + tableName + " to ENABLED state.");
1877             setEnabledTable(tableName);
1878           }
1879         }
1880       }
1881       if (setOfflineInZK && versionOfOfflineNode == -1) {
1882         LOG.info("Unable to set offline in ZooKeeper to assign " + region);
1883         // Setting offline in ZK must have failed due to ZK racing or some
1884         // exception which may make the server abort. If it is ZK racing,
1885         // we should retry since we already reset the region state,
1886         // existing (re)assignment will fail anyway.
1887         if (!server.isAborted()) {
1888           continue;
1889         }
1890       }
1891       if (this.server.isStopped() || this.server.isAborted()) {
1892         LOG.debug("Server stopped/aborted; skipping assign of " + region);
1893         return;
1894       }
1895       LOG.info("Assigning " + region.getRegionNameAsString() +
1896           " to " + plan.getDestination().toString());
1897       // Transition RegionState to PENDING_OPEN
1898       currentState = regionStates.updateRegionState(region,
1899           RegionState.State.PENDING_OPEN, plan.getDestination());
1900 
1901       boolean needNewPlan;
1902       final String assignMsg = "Failed assignment of " + region.getRegionNameAsString() +
1903           " to " + plan.getDestination();
1904       try {
1905         List<ServerName> favoredNodes = ServerName.EMPTY_SERVER_LIST;
1906         if (this.shouldAssignRegionsWithFavoredNodes) {
1907           favoredNodes = ((FavoredNodeLoadBalancer)this.balancer).getFavoredNodes(region);
1908         }
1909         regionOpenState = serverManager.sendRegionOpen(
1910             plan.getDestination(), region, versionOfOfflineNode, favoredNodes);
1911 
1912         if (regionOpenState == RegionOpeningState.FAILED_OPENING) {
1913           // Failed opening this region, looping again on a new server.
1914           needNewPlan = true;
1915           LOG.warn(assignMsg + ", regionserver says 'FAILED_OPENING', " +
1916               " trying to assign elsewhere instead; " +
1917               "try=" + i + " of " + this.maximumAttempts);
1918         } else {
1919           // we're done
1920           if (regionOpenState == RegionOpeningState.ALREADY_OPENED) {
1921             processAlreadyOpenedRegion(region, plan.getDestination());
1922           }
1923           return;
1924         }
1925 
1926       } catch (Throwable t) {
1927         if (t instanceof RemoteException) {
1928           t = ((RemoteException) t).unwrapRemoteException();
1929         }
1930 
1931         // Should we wait a little before retrying? If the server is starting, the answer is yes.
1932         // If the region is already in transition, it's yes as well: we want to be sure that
1933         //  the region will get opened but we don't want a double assignment.
1934         boolean hold = (t instanceof RegionAlreadyInTransitionException ||
1935             t instanceof ServerNotRunningYetException);
1936 
1937         // In case the socket timed out and the region server is still online,
1938         // the openRegion RPC could have been accepted by the server and
1939         // just the response didn't go through.  So we will retry to
1940         // open the region on the same server to avoid possible
1941         // double assignment.
1942         boolean retry = !hold && (t instanceof java.net.SocketTimeoutException
1943             && this.serverManager.isServerOnline(plan.getDestination()));
1944 
1945 
1946         if (hold) {
1947           LOG.warn(assignMsg + ", waiting a little before trying on the same region server " +
1948               "try=" + i + " of " + this.maximumAttempts, t);
1949 
1950           if (maxRegionServerStartupWaitTime < 0) {
1951             maxRegionServerStartupWaitTime = EnvironmentEdgeManager.currentTimeMillis() +
1952                 this.server.getConfiguration().
1953                     getLong("hbase.regionserver.rpc.startup.waittime", 60000);
1954           }
1955           try {
1956             long now = EnvironmentEdgeManager.currentTimeMillis();
1957             if (now < maxRegionServerStartupWaitTime) {
1958               LOG.debug("Server is not yet up; waiting up to " +
1959                   (maxRegionServerStartupWaitTime - now) + "ms", t);
1960               Thread.sleep(100);
1961               i--; // reset the try count
1962               needNewPlan = false;
1963             } else {
1964               LOG.debug("Server is not up for a while; try a new one", t);
1965               needNewPlan = true;
1966             }
1967           } catch (InterruptedException ie) {
1968             LOG.warn("Failed to assign "
1969                 + region.getRegionNameAsString() + " since interrupted", ie);
1970             Thread.currentThread().interrupt();
1971             if (!tomActivated) {
1972               regionStates.updateRegionState(region, RegionState.State.FAILED_OPEN);
1973             }
1974             return;
1975           }
1976         } else if (retry) {
1977           needNewPlan = false;
1978           LOG.warn(assignMsg + ", trying to assign to the same region server " +
1979               "try=" + i + " of " + this.maximumAttempts, t);
1980         } else {
1981           needNewPlan = true;
1982           LOG.warn(assignMsg + ", trying to assign elsewhere instead;" +
1983               " try=" + i + " of " + this.maximumAttempts, t);
1984         }
1985       }
1986 
1987       if (i == this.maximumAttempts) {
1988         // Don't reset the region state or get a new plan any more.
1989         // This is the last try.
1990         continue;
1991       }
1992 
1993       // If region opened on destination of present plan, reassigning to new
1994       // RS may cause double assignments. In case of RegionAlreadyInTransitionException
1995       // reassigning to same RS.
1996       if (needNewPlan) {
1997         // Force a new plan and reassign. Will return null if no servers.
1998         // The new plan could be the same as the existing plan since we don't
1999         // exclude the server of the original plan, which should not be
2000         // excluded since it could be the only server up now.
2001         RegionPlan newPlan = null;
2002         try {
2003           newPlan = getRegionPlan(region, true);
2004         } catch (HBaseIOException e) {
2005           LOG.warn("Failed to get region plan", e);
2006         }
2007         if (newPlan == null) {
2008           if (tomActivated) {
2009             this.timeoutMonitor.setAllRegionServersOffline(true);
2010           } else {
2011             regionStates.updateRegionState(region, RegionState.State.FAILED_OPEN);
2012           }
2013           LOG.warn("Unable to find a viable location to assign region " +
2014               region.getRegionNameAsString());
2015           return;
2016         }
2017 
2018         if (plan != newPlan && !plan.getDestination().equals(newPlan.getDestination())) {
2019           // Clean out the plan we failed to execute and any that doesn't look like it'll
2020           // succeed anyway; we need a new plan!
2021           // Transition back to OFFLINE
2022           currentState = regionStates.updateRegionState(region, RegionState.State.OFFLINE);
2023           versionOfOfflineNode = -1;
2024           plan = newPlan;
2025         }
2026       }
2027     }
2028     // Run out of attempts
2029     if (!tomActivated) {
2030       regionStates.updateRegionState(region, RegionState.State.FAILED_OPEN);
2031     }
2032   }
2033 
2034   private void processAlreadyOpenedRegion(HRegionInfo region, ServerName sn) {
2035     // Remove region from in-memory transition and unassigned node from ZK
2036     // While trying to enable the table the regions of the table were
2037     // already enabled.
2038     LOG.debug("ALREADY_OPENED " + region.getRegionNameAsString()
2039         + " to " + sn);
2040     String encodedRegionName = region.getEncodedName();
2041     try {
2042       ZKAssign.deleteOfflineNode(watcher, encodedRegionName);
2043     } catch (KeeperException.NoNodeException e) {
2044       if (LOG.isDebugEnabled()) {
2045         LOG.debug("The unassigned node " + encodedRegionName
2046             + " does not exist.");
2047       }
2048     } catch (KeeperException e) {
2049       server.abort(
2050           "Error deleting OFFLINED node in ZK for transition ZK node ("
2051               + encodedRegionName + ")", e);
2052     }
2053 
2054     regionStates.regionOnline(region, sn);
2055   }
2056 
2057   private boolean isDisabledorDisablingRegionInRIT(final HRegionInfo region) {
2058     TableName tableName = region.getTableName();
2059     boolean disabled = this.zkTable.isDisabledTable(tableName);
2060     if (disabled || this.zkTable.isDisablingTable(tableName)) {
2061       LOG.info("Table " + tableName + (disabled ? " disabled;" : " disabling;") +
2062         " skipping assign of " + region.getRegionNameAsString());
2063       offlineDisabledRegion(region);
2064       return true;
2065     }
2066     return false;
2067   }
2068 
2069   /**
2070    * Set region as OFFLINED up in zookeeper
2071    *
2072    * @param state
2073    * @return the version of the offline node if setting of the OFFLINE node was
2074    *         successful, -1 otherwise.
2075    */
2076   private int setOfflineInZooKeeper(final RegionState state, final ServerName destination) {
2077     if (!state.isClosed() && !state.isOffline()) {
2078       String msg = "Unexpected state: " + state + ". Cannot transition it to OFFLINE.";
2079       this.server.abort(msg, new IllegalStateException(msg));
2080       return -1;
2081     }
2082     regionStates.updateRegionState(state.getRegion(),
2083       RegionState.State.OFFLINE);
2084     int versionOfOfflineNode;
2085     try {
2086       // get the version after setting the znode to OFFLINE
2087       versionOfOfflineNode = ZKAssign.createOrForceNodeOffline(watcher,
2088         state.getRegion(), destination);
2089       if (versionOfOfflineNode == -1) {
2090         LOG.warn("Attempted to create/force node into OFFLINE state before "
2091             + "completing assignment but failed to do so for " + state);
2092         return -1;
2093       }
2094     } catch (KeeperException e) {
2095       server.abort("Unexpected ZK exception creating/setting node OFFLINE", e);
2096       return -1;
2097     }
2098     return versionOfOfflineNode;
2099   }
2100 
2101   /**
2102    * @param region the region to assign
2103    * @return Plan for passed <code>region</code> (if none currently exists, it creates one;
2104    * if there are no servers to assign to, it returns null).
2105    */
2106   private RegionPlan getRegionPlan(final HRegionInfo region,
2107       final boolean forceNewPlan)  throws HBaseIOException  {
2108     return getRegionPlan(region, null, forceNewPlan);
2109   }
2110 
2111   /**
2112    * @param region the region to assign
2113    * @param serverToExclude Server to exclude (we know its bad). Pass null if
2114    * all servers are thought to be assignable.
2115    * @param forceNewPlan If true, then if an existing plan exists, a new plan
2116    * will be generated.
2117    * @return Plan for passed <code>region</code> (if none currently exists, it creates one;
2118    * if there are no servers to assign to, it returns null).
2119    */
2120   private RegionPlan getRegionPlan(final HRegionInfo region,
2121       final ServerName serverToExclude, final boolean forceNewPlan) throws HBaseIOException {
2122     // Pickup existing plan or make a new one
2123     final String encodedName = region.getEncodedName();
2124     final List<ServerName> destServers =
2125       serverManager.createDestinationServersList(serverToExclude);
2126 
2127     if (destServers.isEmpty()){
2128       LOG.warn("Can't move " + encodedName +
2129         ", there is no destination server available.");
2130       return null;
2131     }
2132 
2133     RegionPlan randomPlan = null;
2134     boolean newPlan = false;
2135     RegionPlan existingPlan;
2136 
2137     synchronized (this.regionPlans) {
2138       existingPlan = this.regionPlans.get(encodedName);
2139 
2140       if (existingPlan != null && existingPlan.getDestination() != null) {
2141         LOG.debug("Found an existing plan for " + region.getRegionNameAsString()
2142           + " destination server is " + existingPlan.getDestination() +
2143             " accepted as a dest server = " + destServers.contains(existingPlan.getDestination()));
2144       }
2145 
2146       if (forceNewPlan
2147           || existingPlan == null
2148           || existingPlan.getDestination() == null
2149           || !destServers.contains(existingPlan.getDestination())) {
2150         newPlan = true;
2151         randomPlan = new RegionPlan(region, null,
2152             balancer.randomAssignment(region, destServers));
2153         if (!region.isMetaTable() && shouldAssignRegionsWithFavoredNodes) {
2154           List<HRegionInfo> regions = new ArrayList<HRegionInfo>(1);
2155           regions.add(region);
2156           try {
2157             processFavoredNodes(regions);
2158           } catch (IOException ie) {
2159             LOG.warn("Ignoring exception in processFavoredNodes " + ie);
2160           }
2161         }
2162         this.regionPlans.put(encodedName, randomPlan);
2163       }
2164     }
2165 
2166     if (newPlan) {
2167       if (randomPlan.getDestination() == null) {
2168         LOG.warn("Can't find a destination for " + encodedName);
2169         return null;
2170       }
2171       LOG.debug("No previous transition plan was found (or we are ignoring " +
2172         "an existing plan) for " + region.getRegionNameAsString() +
2173         " so generated a random one; " + randomPlan + "; " +
2174         serverManager.countOfRegionServers() +
2175         " (online=" + serverManager.getOnlineServers().size() +
2176         ", available=" + destServers.size() + ") available servers" +
2177         ", forceNewPlan=" + forceNewPlan);
2178       return randomPlan;
2179     }
2180     LOG.debug("Using pre-existing plan for " +
2181       region.getRegionNameAsString() + "; plan=" + existingPlan);
2182     return existingPlan;
2183   }
2184 
2185   /**
2186    * Unassign the list of regions. Configuration knobs:
2187    * hbase.bulk.waitbetween.reopen indicates the number of milliseconds to
2188    * wait before unassigning another region from this region server
2189    *
2190    * @param regions
2191    * @throws InterruptedException
2192    */
2193   public void unassign(List<HRegionInfo> regions) {
2194     int waitTime = this.server.getConfiguration().getInt(
2195         "hbase.bulk.waitbetween.reopen", 0);
2196     for (HRegionInfo region : regions) {
2197       if (regionStates.isRegionInTransition(region))
2198         continue;
2199       unassign(region, false);
2200       while (regionStates.isRegionInTransition(region)) {
2201         try {
2202           Thread.sleep(10);
2203         } catch (InterruptedException e) {
2204           // Do nothing, continue
2205         }
2206       }
2207       if (waitTime > 0)
2208         try {
2209           Thread.sleep(waitTime);
2210         } catch (InterruptedException e) {
2211           // Do nothing, continue
2212         }
2213     }
2214   }
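
  // Configuration sketch (illustrative value, not a recommendation): the loop above can be
  //  throttled via hbase.bulk.waitbetween.reopen, e.g. when building a Configuration
  //  programmatically:
  //
  //    Configuration conf = HBaseConfiguration.create();
  //    conf.setInt("hbase.bulk.waitbetween.reopen", 1000); // wait 1s between unassigns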
2215 
2216   /**
2217    * Unassigns the specified region.
2218    * <p>
2219    * Updates the RegionState and sends the CLOSE RPC unless region is being
2220    * split by regionserver; then the unassign fails (silently) because we
2221    * presume the region being unassigned no longer exists (it has been split out
2222    * of existence). TODO: What to do if split fails and is rolled back and
2223    * parent is revivified?
2224    * <p>
2225    * If a RegionPlan is already set, it will remain.
2226    *
2227    * @param region region to be unassigned
2228    */
2229   public void unassign(HRegionInfo region) {
2230     unassign(region, false);
2231   }
2232 
2233 
2234   /**
2235    * Unassigns the specified region.
2236    * <p>
2237    * Updates the RegionState and sends the CLOSE RPC unless region is being
2238    * split by regionserver; then the unassign fails (silently) because we
2239    * presume the region being unassigned no longer exists (it has been split out
2240    * of existence). TODO: What to do if split fails and is rolled back and
2241    * parent is revivified?
2242    * <p>
2243    * If a RegionPlan is already set, it will remain.
2244    *
2245    * @param region region to be unassigned
2246    * @param force if region should be closed even if already closing
2247    */
2248   public void unassign(HRegionInfo region, boolean force, ServerName dest) {
2249     // TODO: Method needs refactoring.  Ugly buried returns throughout.  Beware!
2250     LOG.debug("Starting unassign of " + region.getRegionNameAsString() + " (offlining)");
2251 
2252     String encodedName = region.getEncodedName();
2253     // Grab the state of this region and synchronize on it
2254     int versionOfClosingNode = -1;
2255     // We need a lock here as we're going to do a put later and we don't want multiple
2256     //  states to be created
2257     ReentrantLock lock = locker.acquireLock(encodedName);
2258     RegionState state = regionStates.getRegionTransitionState(encodedName);
2259     try {
2260       if (state == null) {
2261         // Create the znode in CLOSING state
2262         try {
2263           state = regionStates.getRegionState(region);
2264           if (state == null || state.getServerName() == null) {
2265             // We don't know where the region is, offline it.
2266             // No need to send CLOSE RPC
2267             regionOffline(region);
2268             return;
2269           }
2270           versionOfClosingNode = ZKAssign.createNodeClosing(
2271             watcher, region, state.getServerName());
2272           if (versionOfClosingNode == -1) {
2273             LOG.debug("Attempting to unassign " +
2274                 region.getRegionNameAsString() + " but ZK closing node "
2275                 + "can't be created.");
2276             return;
2277           }
2278         } catch (KeeperException e) {
2279           if (e instanceof NodeExistsException) {
2280             // Handle race between master initiated close and regionserver
2281             // orchestrated splitting. See if existing node is in a
2282             // SPLITTING or SPLIT state.  If so, the regionserver started
2283             // an op on node before we could get our CLOSING in.  Deal.
2284             NodeExistsException nee = (NodeExistsException)e;
2285             String path = nee.getPath();
2286             try {
2287               if (isSplitOrSplittingOrMergedOrMerging(path)) {
2288                 LOG.debug(path + " is SPLIT or SPLITTING or MERGED or MERGING; " +
2289                   "skipping unassign because region no longer exists -- its split or merge");
2290                 return;
2291               }
2292             } catch (KeeperException.NoNodeException ke) {
2293               LOG.warn("Failed getData on SPLITTING/SPLIT at " + path +
2294                 "; presuming split and that the region to unassign, " +
2295                 encodedName + ", no longer exists -- confirm", ke);
2296               return;
2297             } catch (KeeperException ke) {
2298               LOG.error("Unexpected zk state", ke);
2299             } catch (DeserializationException de) {
2300               LOG.error("Failed parse", de);
2301             }
2302           }
2303           // If we get here, we don't understand what's going on -- abort.
2304           server.abort("Unexpected ZK exception creating node CLOSING", e);
2305           return;
2306         }
2307         state = regionStates.updateRegionState(region, RegionState.State.PENDING_CLOSE);
2308       } else if (state.isFailedOpen()) {
2309         // The region is not open yet
2310         regionOffline(region);
2311         return;
2312       } else if (force && (state.isPendingClose()
2313           || state.isClosing() || state.isFailedClose())) {
2314         LOG.debug("Attempting to unassign " + region.getRegionNameAsString() +
2315           " which is already " + state.getState()  +
2316           " but forcing to send a CLOSE RPC again ");
2317         if (state.isFailedClose()) {
2318           state = regionStates.updateRegionState(region, RegionState.State.PENDING_CLOSE);
2319         }
2320         state.updateTimestampToNow();
2321       } else {
2322         LOG.debug("Attempting to unassign " +
2323           region.getRegionNameAsString() + " but it is " +
2324           "already in transition (" + state.getState() + ", force=" + force + ")");
2325         return;
2326       }
2327 
2328       unassign(region, state, versionOfClosingNode, dest, true, null);
2329     } finally {
2330       lock.unlock();
2331     }
2332   }
2333 
2334   public void unassign(HRegionInfo region, boolean force){
2335      unassign(region, force, null);
2336   }
2337 
2338   /**
2339    * @param region regioninfo of znode to be deleted.
2340    */
2341   public void deleteClosingOrClosedNode(HRegionInfo region) {
2342     String encodedName = region.getEncodedName();
2343     try {
2344       if (!ZKAssign.deleteNode(watcher, encodedName,
2345           EventType.M_ZK_REGION_CLOSING)) {
2346         boolean deleteNode = ZKAssign.deleteNode(watcher,
2347           encodedName, EventType.RS_ZK_REGION_CLOSED);
2348         // TODO : We don't abort if the delete node returns false. Is there any
2349         // such corner case?
2350         if (!deleteNode) {
2351           LOG.error("The deletion of the CLOSED node for "
2352             + encodedName + " returned " + deleteNode);
2353         }
2354       }
2355     } catch (NoNodeException e) {
2356       LOG.debug("CLOSING/CLOSED node for " + encodedName
2357         + " already deleted");
2358     } catch (KeeperException ke) {
2359       server.abort(
2360         "Unexpected ZK exception deleting node CLOSING/CLOSED for the region "
2361           + encodedName, ke);
2362     }
2363   }
2364 
2365   /**
2366    * @param path
2367    * @return True if znode is in SPLIT or SPLITTING or MERGED or MERGING state.
2368    * @throws KeeperException Can happen if the znode went away in the meantime.
2369    * @throws DeserializationException
2370    */
2371   private boolean isSplitOrSplittingOrMergedOrMerging(final String path)
2372       throws KeeperException, DeserializationException {
2373     boolean result = false;
2374     // This may fail if the SPLIT or SPLITTING or MERGED or MERGING znode gets
2375     // cleaned up before we can get data from it.
2376     byte [] data = ZKAssign.getData(watcher, path);
2377     if (data == null) return false;
2378     RegionTransition rt = RegionTransition.parseFrom(data);
2379     switch (rt.getEventType()) {
2380     case RS_ZK_REGION_SPLIT:
2381     case RS_ZK_REGION_SPLITTING:
2382     case RS_ZK_REGION_MERGED:
2383     case RS_ZK_REGION_MERGING:
2384       result = true;
2385       break;
2386     default:
2387       break;
2388     }
2389     return result;
2390   }
2391 
2392   /**
2393    * Waits until the specified region has completed assignment.
2394    * <p>
2395    * If the region is already assigned, returns immediately.  Otherwise, method
2396    * blocks until the region is assigned.
2397    * @param regionInfo region to wait on assignment for
2398    * @throws InterruptedException
2399    */
2400   public boolean waitForAssignment(HRegionInfo regionInfo)
2401       throws InterruptedException {
2402     while (!regionStates.isRegionAssigned(regionInfo)) {
2403       if (regionStates.isRegionInState(regionInfo, State.FAILED_OPEN)
2404           || this.server.isStopped()) {
2405         return false;
2406       }
2407 
2408       // We should receive a notification, but it's
2409       //  better to have a timeout to recheck the condition here:
2410       //  it lowers the impact of a race condition if any
2411       regionStates.waitForUpdate(100);
2412     }
2413     return true;
2414   }
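
  // Usage sketch (hypothetical caller; "am" is an assumed AssignmentManager reference): assign
  //  a region and then block until it is online, bailing out if assignment ultimately failed or
  //  the master is stopping. Note waitForAssignment may throw InterruptedException.
  //
  //    am.assign(hri, true);
  //    if (!am.waitForAssignment(hri)) {
  //      LOG.warn("Assignment of " + hri.getRegionNameAsString() + " did not complete");
  //    }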
2415 
2416   /**
2417    * Assigns the META region.
2418    * <p>
2419    * Assumes that META is currently closed and is not being actively served by
2420    * any RegionServer.
2421    * <p>
2422    * Forcibly unsets the current meta region location in ZooKeeper and assigns
2423    * META to a random RegionServer.
2424    * @throws KeeperException
2425    */
2426   public void assignMeta() throws KeeperException {
2427     MetaRegionTracker.deleteMetaLocation(this.watcher);
2428     assign(HRegionInfo.FIRST_META_REGIONINFO, true);
2429   }
2430 
2431   /**
2432    * Assigns specified regions retaining assignments, if any.
2433    * <p>
2434    * This is a synchronous call and will return once every region has been
2435    * assigned.  If anything fails, an exception is thrown
2436    * @throws InterruptedException
2437    * @throws IOException
2438    */
2439   public void assign(Map<HRegionInfo, ServerName> regions)
2440         throws IOException, InterruptedException {
2441     if (regions == null || regions.isEmpty()) {
2442       return;
2443     }
2444     List<ServerName> servers = serverManager.createDestinationServersList();
2445     if (servers == null || servers.isEmpty()) {
2446       throw new IOException("Found no destination server to assign region(s)");
2447     }
2448 
2449     // Reuse existing assignment info
2450     Map<ServerName, List<HRegionInfo>> bulkPlan =
2451       balancer.retainAssignment(regions, servers);
2452 
2453     assign(regions.size(), servers.size(),
2454       "retainAssignment=true", bulkPlan);
2455   }
2456 
2457   /**
2458    * Assigns specified regions round robin, if any.
2459    * <p>
2460    * This is a synchronous call and will return once every region has been
2461    * assigned.  If anything fails, an exception is thrown
2462    * @throws InterruptedException
2463    * @throws IOException
2464    */
2465   public void assign(List<HRegionInfo> regions)
2466         throws IOException, InterruptedException {
2467     if (regions == null || regions.isEmpty()) {
2468       return;
2469     }
2470 
2471     List<ServerName> servers = serverManager.createDestinationServersList();
2472     if (servers == null || servers.isEmpty()) {
2473       throw new IOException("Found no destination server to assign region(s)");
2474     }
2475 
2476     // Generate a round-robin bulk assignment plan
2477     Map<ServerName, List<HRegionInfo>> bulkPlan
2478       = balancer.roundRobinAssignment(regions, servers);
2479     processFavoredNodes(regions);
2480 
2481     assign(regions.size(), servers.size(),
2482       "round-robin=true", bulkPlan);
2483   }
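
  // Usage sketch (hypothetical caller): round-robin assignment of a batch of regions; the call
  //  blocks until every region in the list has been assigned.
  //
  //    List<HRegionInfo> newRegions = ...; // e.g. regions of a freshly created table
  //    am.assign(newRegions);              // "am" is an assumed AssignmentManager reference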
2484 
2485   private void assign(int regions, int totalServers,
2486       String message, Map<ServerName, List<HRegionInfo>> bulkPlan)
2487           throws InterruptedException, IOException {
2488 
2489     int servers = bulkPlan.size();
2490     if (servers == 1 || (regions < bulkAssignThresholdRegions
2491         && servers < bulkAssignThresholdServers)) {
2492 
2493       // Not using bulk assignment.  This could be more efficient in a small
2494       // cluster, especially a mini cluster for testing, so that tests won't time out
2495       if (LOG.isTraceEnabled()) {
2496         LOG.trace("Not using bulk assignment since we are assigning only " + regions +
2497           " region(s) to " + servers + " server(s)");
2498       }
2499       for (Map.Entry<ServerName, List<HRegionInfo>> plan: bulkPlan.entrySet()) {
2500         assign(plan.getKey(), plan.getValue());
2501       }
2502     } else {
2503       LOG.info("Bulk assigning " + regions + " region(s) across "
2504         + totalServers + " server(s), " + message);
2505 
2506       // Use fixed count thread pool assigning.
2507       BulkAssigner ba = new GeneralBulkAssigner(
2508         this.server, bulkPlan, this, bulkAssignWaitTillAllAssigned);
2509       ba.bulkAssign();
2510       LOG.info("Bulk assigning done");
2511     }
2512   }
2513 
2514   /**
2515    * Assigns all user regions, if any exist.  Used during cluster startup.
2516    * <p>
2517    * This is a synchronous call and will return once every region has been
2518    * assigned.  If anything fails, an exception is thrown and the cluster
2519    * should be shutdown.
2520    * @throws InterruptedException
2521    * @throws IOException
2522    * @throws KeeperException
2523    */
2524   private void assignAllUserRegions()
2525       throws IOException, InterruptedException, KeeperException {
2526     // Cleanup any existing ZK nodes and start watching
2527     ZKAssign.deleteAllNodes(watcher);
2528     ZKUtil.listChildrenAndWatchForNewChildren(this.watcher,
2529       this.watcher.assignmentZNode);
2530     failoverCleanupDone();
2531 
2532     // Skip assignment for regions of tables in DISABLING state because during clean cluster startup
2533     // no RS is alive and the regions map also doesn't have any information about the regions.
2534     // See HBASE-6281.
2535     Set<TableName> disabledOrDisablingOrEnabling = ZKTable.getDisabledOrDisablingTables(watcher);
2536     disabledOrDisablingOrEnabling.addAll(ZKTable.getEnablingTables(watcher));
2537     // Scan META for all user regions, skipping any disabled tables
2538     Map<HRegionInfo, ServerName> allRegions;
2539     if (this.shouldAssignRegionsWithFavoredNodes) {
2540       allRegions = FavoredNodeAssignmentHelper.fullScan(
2541         catalogTracker, disabledOrDisablingOrEnabling, true, (FavoredNodeLoadBalancer)balancer);
2542     } else {
2543       allRegions = MetaReader.fullScan(
2544         catalogTracker, disabledOrDisablingOrEnabling, true);
2545     }
2546 
2547     if (allRegions == null) return;
2548 
2549     // Remove system tables because they would have been assigned earlier
2550     for(Iterator<HRegionInfo> iter = allRegions.keySet().iterator();
2551         iter.hasNext();) {
2552       if (HTableDescriptor.isSystemTable(iter.next().getTableName())) {
2553         iter.remove();
2554       }
2555     }
2556 
2557     if (allRegions.isEmpty()) return;
2558 
2559     // Determine what type of assignment to do on startup
2560     boolean retainAssignment = server.getConfiguration().
2561       getBoolean("hbase.master.startup.retainassign", true);
2562 
2563     if (retainAssignment) {
2564       assign(allRegions);
2565     } else {
2566       List<HRegionInfo> regions = new ArrayList<HRegionInfo>(allRegions.keySet());
2567       assign(regions);
2568     }
2569 
2570     for (HRegionInfo hri : allRegions.keySet()) {
2571       TableName tableName = hri.getTableName();
2572       if (!zkTable.isEnabledTable(tableName)) {
2573         setEnabledTable(tableName);
2574       }
2575     }
2576   }
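
  // --- Illustrative sketch (editor's addition, not part of the original source) ---
  // The startup behaviour above hinges on one boolean knob.  This hypothetical
  // helper shows how it could be flipped so that assignAllUserRegions() ignores
  // the old locations in META and uses the round-robin assign(List) path instead.
  private static void exampleDisableRetainAssign(final Configuration conf) {
    conf.setBoolean("hbase.master.startup.retainassign", false);
  }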
2577 
2578   /**
2579    * Wait until no regions in transition.
2580    * @param timeout How long to wait.
2581    * @return True if nothing in regions in transition.
2582    * @throws InterruptedException
2583    */
2584   boolean waitUntilNoRegionsInTransition(final long timeout)
2585       throws InterruptedException {
2586     // Blocks until there are no regions in transition.  It is possible
2587     // that there are regions in transition immediately after this returns,
2588     // but this method guarantees that if it returns without an exception,
2589     // there was a period of time with no regions in transition from the
2590     // point-of-view of the in-memory state of the Master.  The wait itself
2591     // is bounded by the given timeout.
2592     final long endTime = System.currentTimeMillis() + timeout;
2593 
2594     while (!this.server.isStopped() && regionStates.isRegionsInTransition()
2595         && endTime > System.currentTimeMillis()) {
2596       regionStates.waitForUpdate(100);
2597     }
2598 
2599     return !regionStates.isRegionsInTransition();
2600   }
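
  // --- Illustrative sketch (editor's addition, not part of the original source) ---
  // Typical use of the wait above: after kicking off startup assignment, wait a
  // bounded time for the in-memory RIT map to drain before declaring the master
  // initialized.  The helper name and the timeout value are hypothetical.
  private void exampleWaitForStartupAssignment() throws InterruptedException {
    final long timeoutMs = 5 * 60 * 1000L;  // hypothetical five-minute bound
    if (!waitUntilNoRegionsInTransition(timeoutMs)) {
      LOG.warn("Regions still in transition after " + timeoutMs + " ms");
    }
  }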
2601 
2602   /**
2603    * Rebuild the list of user regions and assignment information.
2604    * <p>
2605    * Returns a map of servers that are not found to be online and the regions
2606    * they were hosting.
2607    * @return map of servers not online to their assigned regions, as stored
2608    *         in META
2609    * @throws IOException
2610    */
2611   Map<ServerName, List<HRegionInfo>> rebuildUserRegions() throws IOException, KeeperException {
2612     Set<TableName> enablingTables = ZKTable.getEnablingTables(watcher);
2613     Set<TableName> disabledOrEnablingTables = ZKTable.getDisabledTables(watcher);
2614     disabledOrEnablingTables.addAll(enablingTables);
2615     Set<TableName> disabledOrDisablingOrEnabling = ZKTable.getDisablingTables(watcher);
2616     disabledOrDisablingOrEnabling.addAll(disabledOrEnablingTables);
2617 
2618     // Region assignment from META
2619     List<Result> results = MetaReader.fullScan(this.catalogTracker);
2620     // Get any new but slow-to-check-in region servers that joined the cluster
2621     Set<ServerName> onlineServers = serverManager.getOnlineServers().keySet();
2622     // Map of offline servers and their regions to be returned
2623     Map<ServerName, List<HRegionInfo>> offlineServers =
2624       new TreeMap<ServerName, List<HRegionInfo>>();
2625     // Iterate regions in META
2626     for (Result result : results) {
2627       Pair<HRegionInfo, ServerName> region = HRegionInfo.getHRegionInfoAndServerName(result);
2628       if (region == null) continue;
2629       HRegionInfo regionInfo = region.getFirst();
2630       ServerName regionLocation = region.getSecond();
2631       if (regionInfo == null) continue;
2632       regionStates.createRegionState(regionInfo);
2633       TableName tableName = regionInfo.getTableName();
2634       if (regionLocation == null) {
2635         // regionLocation could be null if createTable didn't finish properly.
2636         // This can happen if HMaster restarts while createTable is in progress:
2637         // some regions have been added to .META., but have not been assigned.
2638         // When this happens, the region's table must be in ENABLING state.
2639         // It can't be in ENABLED state as that is set when all regions are
2640         // assigned.
2641         // It can't be in DISABLING state, because DISABLING state transitions
2642         // from ENABLED state when application calls disableTable.
2643         // It can't be in DISABLED state, because DISABLED state transitions
2644         // from DISABLING state.
2645         if (!enablingTables.contains(tableName)) {
2646           LOG.warn("Region " + regionInfo.getEncodedName() +
2647             " has null regionLocation." + " But its table " + tableName +
2648             " isn't in ENABLING state.");
2649         }
2650       } else if (!onlineServers.contains(regionLocation)) {
2651         // Region is located on a server that isn't online
2652         List<HRegionInfo> offlineRegions = offlineServers.get(regionLocation);
2653         if (offlineRegions == null) {
2654           offlineRegions = new ArrayList<HRegionInfo>(1);
2655           offlineServers.put(regionLocation, offlineRegions);
2656         }
2657         offlineRegions.add(regionInfo);
2658         // need to enable the table if not disabled or disabling or enabling
2659         // this will be used in rolling restarts
2660         if (!disabledOrDisablingOrEnabling.contains(tableName)
2661             && !getZKTable().isEnabledTable(tableName)) {
2662           setEnabledTable(tableName);
2663         }
2664       } else {
2665         // If the region is in offline and split state, check the ZK node
2666         if (regionInfo.isOffline() && regionInfo.isSplit()) {
2667           String node = ZKAssign.getNodeName(this.watcher, regionInfo
2668               .getEncodedName());
2669           Stat stat = new Stat();
2670           byte[] data = ZKUtil.getDataNoWatch(this.watcher, node, stat);
2671           // If znode does not exist, don't consider this region
2672           if (data == null) {
2673             LOG.debug("Region "	+  regionInfo.getRegionNameAsString()
2674                + " split is completed. Hence need not add to regions list");
2675             continue;
2676           }
2677         }
2678         // Region is being served and on an active server
2679         // add only if region not in disabled or enabling table
2680         if (!disabledOrEnablingTables.contains(tableName)) {
2681           regionStates.regionOnline(regionInfo, regionLocation);
2682         }
2683         // need to enable the table if not disabled or disabling or enabling
2684         // this will be used in rolling restarts
2685         if (!disabledOrDisablingOrEnabling.contains(tableName)
2686             && !getZKTable().isEnabledTable(tableName)) {
2687           setEnabledTable(tableName);
2688         }
2689       }
2690     }
2691     return offlineServers;
2692   }
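
  // --- Illustrative sketch (editor's addition, not part of the original source) ---
  // The map returned above pairs each server that is no longer online with the
  // regions META still attributes to it.  During failover it would be fed to
  // processDeadServersAndRecoverLostRegions(...) so the dead servers get expired
  // and their regions re-assigned.  The helper name is hypothetical.
  private void exampleFailoverRecovery() throws IOException, KeeperException {
    Map<ServerName, List<HRegionInfo>> deadServers = rebuildUserRegions();
    processDeadServersAndRecoverLostRegions(deadServers);
  }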
2693 
2694   /**
2695    * Recover the tables that were not fully moved to DISABLED state. These
2696    * tables are in DISABLING state when the master restarted/switched.
2697    *
2698    * @throws KeeperException
2699    * @throws TableNotFoundException
2700    * @throws IOException
2701    */
2702   private void recoverTableInDisablingState()
2703       throws KeeperException, TableNotFoundException, IOException {
2704     Set<TableName> disablingTables = ZKTable.getDisablingTables(watcher);
2705     if (disablingTables.size() != 0) {
2706       for (TableName tableName : disablingTables) {
2707         // Recover by calling DisableTableHandler
2708         LOG.info("The table " + tableName
2709             + " is in DISABLING state.  Hence recovering by moving the table"
2710             + " to DISABLED state.");
2711         new DisableTableHandler(this.server, tableName, catalogTracker,
2712             this, tableLockManager, true).prepare().process();
2713       }
2714     }
2715   }
2716 
2717   /**
2718    * Recover the tables that are not fully moved to ENABLED state. These tables
2719    * are in ENABLING state when the master restarted/switched
2720    *
2721    * @throws KeeperException
2722    * @throws org.apache.hadoop.hbase.TableNotFoundException
2723    * @throws IOException
2724    */
2725   private void recoverTableInEnablingState()
2726       throws KeeperException, TableNotFoundException, IOException {
2727     Set<TableName> enablingTables = ZKTable.getEnablingTables(watcher);
2728     if (enablingTables.size() != 0) {
2729       for (TableName tableName : enablingTables) {
2730         // Recover by calling EnableTableHandler
2731         LOG.info("The table " + tableName
2732             + " is in ENABLING state.  Hence recovering by moving the table"
2733             + " to ENABLED state.");
2734         // enableTable in sync way during master startup,
2735         // no need to invoke coprocessor
2736         new EnableTableHandler(this.server, tableName,
2737             catalogTracker, this, tableLockManager, true).prepare().process();
2738       }
2739     }
2740   }
2741 
2742   /**
2743    * Processes the list of dead servers from the result of a META scan, and the regions in RIT
2744    * <p>
2745    * This is used during failover to recover the lost regions that belonged to
2746    * RegionServers which failed while there was no active master, or regions
2747    * that were in RIT.
2748    * <p>
2749    *
2750    *
2751    * @param deadServers
2752    *          The list of dead servers which failed while there was no active
2753    *          master. Can be null.
2754    * @throws IOException
2755    * @throws KeeperException
2756    */
2757   private void processDeadServersAndRecoverLostRegions(
2758       Map<ServerName, List<HRegionInfo>> deadServers)
2759           throws IOException, KeeperException {
2760     if (deadServers != null) {
2761       for (Map.Entry<ServerName, List<HRegionInfo>> server: deadServers.entrySet()) {
2762         ServerName serverName = server.getKey();
2763         if (!serverManager.isServerDead(serverName)) {
2764           serverManager.expireServer(serverName); // Let SSH do region re-assign
2765         }
2766       }
2767     }
2768     List<String> nodes = ZKUtil.listChildrenAndWatchForNewChildren(
2769       this.watcher, this.watcher.assignmentZNode);
2770     if (!nodes.isEmpty()) {
2771       for (String encodedRegionName : nodes) {
2772         processRegionInTransition(encodedRegionName, null);
2773       }
2774     }
2775 
2776     // Now we can safely claim failover cleanup completed and enable
2777     // ServerShutdownHandler for further processing. The nodes (below)
2778     // in transition, if any, are for regions not related to those
2779     // dead servers at all, and can be done in parallel to SSH.
2780     failoverCleanupDone();
2781   }
2782 
2783   /**
2784    * Set Regions in transitions metrics.
2785    * This takes an iterator on the RegionInTransition map (CLSM), and is not synchronized.
2786    * This iterator is not fail fast, which may lead to stale read; but that's better than
2787    * creating a copy of the map for metrics computation, as this method will be invoked
2788    * on a frequent interval.
2789    */
2790   public void updateRegionsInTransitionMetrics() {
2791     long currentTime = System.currentTimeMillis();
2792     int totalRITs = 0;
2793     int totalRITsOverThreshold = 0;
2794     long oldestRITTime = 0;
2795     int ritThreshold = this.server.getConfiguration().
2796       getInt(HConstants.METRICS_RIT_STUCK_WARNING_THRESHOLD, 60000);
2797     for (RegionState state: regionStates.getRegionsInTransition().values()) {
2798       totalRITs++;
2799       long ritTime = currentTime - state.getStamp();
2800       if (ritTime > ritThreshold) { // more than the threshold
2801         totalRITsOverThreshold++;
2802       }
2803       if (oldestRITTime < ritTime) {
2804         oldestRITTime = ritTime;
2805       }
2806     }
2807     if (this.metricsMaster != null) {
2808       this.metricsMaster.updateRITOldestAge(oldestRITTime);
2809       this.metricsMaster.updateRITCount(totalRITs);
2810       this.metricsMaster.updateRITCountOverThreshold(totalRITsOverThreshold);
2811     }
2812   }
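
  // --- Illustrative sketch (editor's addition, not part of the original source) ---
  // The metrics update above is intended to be driven on a fixed period; a
  // Chore-style thread like this hypothetical one would do it.
  public class ExampleRITMetricsChore extends Chore {
    public ExampleRITMetricsChore(final int period, final Stoppable stopper) {
      super("ExampleRITMetricsChore", period, stopper);
    }

    @Override
    protected void chore() {
      updateRegionsInTransitionMetrics();
    }
  }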
2813 
2814   /**
2815    * @param region Region whose plan we are to clear.
2816    */
2817   void clearRegionPlan(final HRegionInfo region) {
2818     synchronized (this.regionPlans) {
2819       this.regionPlans.remove(region.getEncodedName());
2820     }
2821   }
2822 
2823   /**
2824    * Wait on region to clear regions-in-transition.
2825    * @param hri Region to wait on.
2826    * @throws IOException
2827    */
2828   public void waitOnRegionToClearRegionsInTransition(final HRegionInfo hri)
2829       throws IOException, InterruptedException {
2830     waitOnRegionToClearRegionsInTransition(hri, -1L);
2831   }
2832 
2833   /**
2834    * Wait on region to clear regions-in-transition or time out
2835    * @param hri
2836    * @param timeOut Milliseconds to wait for current region to be out of transition state.
2837    * @return True when a region clears regions-in-transition before timeout otherwise false
2838    * @throws InterruptedException
2839    */
2840   public boolean waitOnRegionToClearRegionsInTransition(final HRegionInfo hri, long timeOut)
2841       throws InterruptedException {
2842     if (!regionStates.isRegionInTransition(hri)) return true;
2843     long end = (timeOut <= 0) ? Long.MAX_VALUE : EnvironmentEdgeManager.currentTimeMillis()
2844         + timeOut;
2845     // There is already a timeout monitor on regions in transition, so we
2846     // should not need another timeout check here.
2847     LOG.info("Waiting on " + hri.getEncodedName()
2848       + " to clear regions-in-transition");
2849     while (!this.server.isStopped() && regionStates.isRegionInTransition(hri)) {
2850       regionStates.waitForUpdate(100);
2851       if (EnvironmentEdgeManager.currentTimeMillis() > end) {
2852         LOG.info("Timed out on waiting for " + hri.getEncodedName() + " to be assigned.");
2853         return false;
2854       }
2855     }
2856     if (this.server.isStopped()) {
2857       LOG.info("Giving up wait on regions in transition because stoppable.isStopped is set");
2858       return false;
2859     }
2860     return true;
2861   }
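
  // --- Illustrative sketch (editor's addition, not part of the original source) ---
  // Example of the bounded variant above: wait up to a caller-chosen number of
  // milliseconds for a single region to leave regions-in-transition before
  // operating on it.  The helper name and the timeout are hypothetical.
  private boolean exampleWaitForRegion(final HRegionInfo hri)
      throws InterruptedException {
    return waitOnRegionToClearRegionsInTransition(hri, 30 * 1000L);
  }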
2862 
2863   /**
2864    * Update timers for all regions in transition that are going against a
2865    * server in the serversInUpdatingTimer set.
2866    */
2867   public class TimerUpdater extends Chore {
2868 
2869     public TimerUpdater(final int period, final Stoppable stopper) {
2870       super("AssignmentTimerUpdater", period, stopper);
2871     }
2872 
2873     @Override
2874     protected void chore() {
2875       Preconditions.checkState(tomActivated);
2876       ServerName serverToUpdateTimer = null;
2877       while (!serversInUpdatingTimer.isEmpty() && !stopper.isStopped()) {
2878         if (serverToUpdateTimer == null) {
2879           serverToUpdateTimer = serversInUpdatingTimer.first();
2880         } else {
2881           serverToUpdateTimer = serversInUpdatingTimer
2882               .higher(serverToUpdateTimer);
2883         }
2884         if (serverToUpdateTimer == null) {
2885           break;
2886         }
2887         updateTimers(serverToUpdateTimer);
2888         serversInUpdatingTimer.remove(serverToUpdateTimer);
2889       }
2890     }
2891   }
2892 
2893   /**
2894    * Monitor to check for time outs on region transition operations
2895    */
2896   public class TimeoutMonitor extends Chore {
2897     private boolean allRegionServersOffline = false;
2898     private ServerManager serverManager;
2899     private final int timeout;
2900 
2901     /**
2902      * Creates a periodic monitor to check for time outs on region transition
2903      * operations.  This will deal with retries if for some reason something
2904      * doesn't happen within the specified timeout.
2905      * @param period
2906      * @param stopper When {@link Stoppable#isStopped()} is true, this thread will
2907      * clean up and exit cleanly.
2908      * @param timeout
2909      */
2910     public TimeoutMonitor(final int period, final Stoppable stopper,
2911         ServerManager serverManager,
2912         final int timeout) {
2913       super("AssignmentTimeoutMonitor", period, stopper);
2914       this.timeout = timeout;
2915       this.serverManager = serverManager;
2916     }
2917 
2918     private synchronized void setAllRegionServersOffline(
2919       boolean allRegionServersOffline) {
2920       this.allRegionServersOffline = allRegionServersOffline;
2921     }
2922 
2923     @Override
2924     protected void chore() {
2925       Preconditions.checkState(tomActivated);
2926       boolean noRSAvailable = this.serverManager.createDestinationServersList().isEmpty();
2927 
2928       // Iterate all regions in transition checking for time outs
2929       long now = System.currentTimeMillis();
2930       // No lock needed, concurrent access is ok: we work on a copy, and it is valid in Java
2931       // to take that copy while another thread is adding/removing items
2932       for (String regionName : regionStates.getRegionsInTransition().keySet()) {
2933         RegionState regionState = regionStates.getRegionTransitionState(regionName);
2934         if (regionState == null) continue;
2935 
2936         if (regionState.getStamp() + timeout <= now) {
2937           // decide on action upon timeout
2938           actOnTimeOut(regionState);
2939         } else if (this.allRegionServersOffline && !noRSAvailable) {
2940           RegionPlan existingPlan = regionPlans.get(regionName);
2941           if (existingPlan == null
2942               || !this.serverManager.isServerOnline(existingPlan
2943                   .getDestination())) {
2944             // if some RSs just came back online, we can start the assignment
2945             // right away
2946             actOnTimeOut(regionState);
2947           }
2948         }
2949       }
2950       setAllRegionServersOffline(noRSAvailable);
2951     }
2952 
2953     private void actOnTimeOut(RegionState regionState) {
2954       HRegionInfo regionInfo = regionState.getRegion();
2955       LOG.info("Regions in transition timed out:  " + regionState);
2956       // Expired! Do a retry.
2957       switch (regionState.getState()) {
2958       case CLOSED:
2959         LOG.info("Region " + regionInfo.getEncodedName()
2960             + " has been CLOSED for too long, waiting on queued "
2961             + "ClosedRegionHandler to run or server shutdown");
2962         // Update our timestamp.
2963         regionState.updateTimestampToNow();
2964         break;
2965       case OFFLINE:
2966         LOG.info("Region has been OFFLINE for too long, " + "reassigning "
2967             + regionInfo.getRegionNameAsString() + " to a random server");
2968         invokeAssign(regionInfo);
2969         break;
2970       case PENDING_OPEN:
2971         LOG.info("Region has been PENDING_OPEN for too "
2972             + "long, reassigning region=" + regionInfo.getRegionNameAsString());
2973         invokeAssign(regionInfo);
2974         break;
2975       case OPENING:
2976         processOpeningState(regionInfo);
2977         break;
2978       case OPEN:
2979         LOG.error("Region has been OPEN for too long, " +
2980             "we don't know where region was opened so can't do anything");
2981         regionState.updateTimestampToNow();
2982         break;
2983 
2984       case PENDING_CLOSE:
2985         LOG.info("Region has been PENDING_CLOSE for too "
2986             + "long, running forced unassign again on region="
2987             + regionInfo.getRegionNameAsString());
2988         invokeUnassign(regionInfo);
2989         break;
2990       case CLOSING:
2991         LOG.info("Region has been CLOSING for too " +
2992           "long, this should eventually complete or the server will " +
2993           "expire, send RPC again");
2994         invokeUnassign(regionInfo);
2995         break;
2996 
2997       case SPLIT:
2998       case SPLITTING:
2999       case FAILED_OPEN:
3000       case FAILED_CLOSE:
3001       case MERGING:
3002         break;
3003 
3004       default:
3005         throw new IllegalStateException("Received event is not valid.");
3006       }
3007     }
3008   }
3009 
3010   private void processOpeningState(HRegionInfo regionInfo) {
3011     LOG.info("Region has been OPENING for too long, reassigning region="
3012         + regionInfo.getRegionNameAsString());
3013     // Should have a ZK node in OPENING state
3014     try {
3015       String node = ZKAssign.getNodeName(watcher, regionInfo.getEncodedName());
3016       Stat stat = new Stat();
3017       byte [] data = ZKAssign.getDataNoWatch(watcher, node, stat);
3018       if (data == null) {
3019         LOG.warn("Data is null, node " + node + " no longer exists");
3020         return;
3021       }
3022       RegionTransition rt = RegionTransition.parseFrom(data);
3023       EventType et = rt.getEventType();
3024       if (et == EventType.RS_ZK_REGION_OPENED) {
3025         LOG.debug("Region has transitioned to OPENED, allowing "
3026             + "watched event handlers to process");
3027         return;
3028       } else if (et != EventType.RS_ZK_REGION_OPENING && et != EventType.RS_ZK_REGION_FAILED_OPEN ) {
3029         LOG.warn("While timing out a region, found ZK node in unexpected state: " + et);
3030         return;
3031       }
3032       invokeAssign(regionInfo);
3033     } catch (KeeperException ke) {
3034       LOG.error("Unexpected ZK exception timing out CLOSING region", ke);
3035     } catch (DeserializationException e) {
3036       LOG.error("Unexpected exception parsing CLOSING region", e);
3037     }
3038   }
3039 
3040   void invokeAssign(HRegionInfo regionInfo) {
3041     threadPoolExecutorService.submit(new AssignCallable(this, regionInfo));
3042   }
3043 
3044   private void invokeUnassign(HRegionInfo regionInfo) {
3045     threadPoolExecutorService.submit(new UnAssignCallable(this, regionInfo));
3046   }
3047 
3048   public boolean isCarryingMeta(ServerName serverName) {
3049     return isCarryingRegion(serverName, HRegionInfo.FIRST_META_REGIONINFO);
3050   }
3051 
3052   /**
3053    * Check if the shutdown server carries the specific region.
3054    * We have a bunch of places that store the region location, and those
3055    * values aren't always consistent since notification is delayed.
3056    * The location from the zookeeper unassigned node has the most recent data,
3057    * but the node could be deleted after the region is opened by the AM.
3058    * The AM's info could be old if the OpenedRegionHandler
3059    * processing hasn't finished yet when the server shutdown occurs.
3060    * @return whether the serverName currently hosts the region
3061    */
3062   private boolean isCarryingRegion(ServerName serverName, HRegionInfo hri) {
3063     RegionTransition rt = null;
3064     try {
3065       byte [] data = ZKAssign.getData(watcher, hri.getEncodedName());
3066       // This call can legitimately return null
3067       rt = data == null? null: RegionTransition.parseFrom(data);
3068     } catch (KeeperException e) {
3069       server.abort("Exception reading unassigned node for region=" + hri.getEncodedName(), e);
3070     } catch (DeserializationException e) {
3071       server.abort("Exception parsing unassigned node for region=" + hri.getEncodedName(), e);
3072     }
3073 
3074     ServerName addressFromZK = rt != null? rt.getServerName():  null;
3075     if (addressFromZK != null) {
3076       // if we get something from ZK, we will use the data
3077       boolean matchZK = addressFromZK.equals(serverName);
3078       LOG.debug("based on ZK, current region=" + hri.getRegionNameAsString() +
3079           " is on server=" + addressFromZK +
3080           " server being checked=: " + serverName);
3081       return matchZK;
3082     }
3083 
3084     ServerName addressFromAM = regionStates.getRegionServerOfRegion(hri);
3085     boolean matchAM = (addressFromAM != null &&
3086       addressFromAM.equals(serverName));
3087     LOG.debug("based on AM, current region=" + hri.getRegionNameAsString() +
3088       " is on server=" + (addressFromAM != null ? addressFromAM : "null") +
3089       " server being checked: " + serverName);
3090 
3091     return matchAM;
3092   }
3093 
3094   /**
3095    * Process shutdown server removing any assignments.
3096    * @param sn Server that went down.
3097    * @return list of regions in transition on this server
3098    */
3099   public List<HRegionInfo> processServerShutdown(final ServerName sn) {
3100     // Clean out any existing assignment plans for this server
3101     synchronized (this.regionPlans) {
3102       for (Iterator <Map.Entry<String, RegionPlan>> i =
3103           this.regionPlans.entrySet().iterator(); i.hasNext();) {
3104         Map.Entry<String, RegionPlan> e = i.next();
3105         ServerName otherSn = e.getValue().getDestination();
3106         // The name will be null if the region is planned for a random assign.
3107         if (otherSn != null && otherSn.equals(sn)) {
3108           // Use iterator's remove else we'll get CME
3109           i.remove();
3110         }
3111       }
3112     }
3113     List<HRegionInfo> regions = regionStates.serverOffline(watcher, sn);
3114     for (Iterator<HRegionInfo> it = regions.iterator(); it.hasNext(); ) {
3115       HRegionInfo hri = it.next();
3116       String encodedName = hri.getEncodedName();
3117 
3118       // We need a lock on the region as we could update it
3119       Lock lock = locker.acquireLock(encodedName);
3120       try {
3121         RegionState regionState =
3122           regionStates.getRegionTransitionState(encodedName);
3123         if (regionState == null
3124             || !regionState.isPendingOpenOrOpeningOnServer(sn)) {
3125           LOG.info("Skip " + hri
3126             + " since it is not opening on the dead server any more: " + sn);
3127           it.remove();
3128         } else {
3129           try {
3130             // Delete the ZNode if exists
3131             ZKAssign.deleteNodeFailSilent(watcher, hri);
3132           } catch (KeeperException ke) {
3133             server.abort("Unexpected ZK exception deleting node " + hri, ke);
3134           }
3135           if (zkTable.isDisablingOrDisabledTable(hri.getTableName())) {
3136             it.remove();
3137             regionStates.regionOffline(hri);
3138             continue;
3139           }
3140           // Mark the region closed and assign it again by SSH
3141           regionStates.updateRegionState(hri, RegionState.State.CLOSED);
3142         }
3143       } finally {
3144         lock.unlock();
3145       }
3146     }
3147     return regions;
3148   }
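
  // --- Illustrative sketch (editor's addition, not part of the original source) ---
  // How a server-shutdown handler might combine the checks above: first ask
  // whether the dead server was carrying the meta region, then call
  // processServerShutdown(...) to clear its plans and collect the regions that
  // were in transition on it.  The helper name is hypothetical.
  private List<HRegionInfo> exampleHandleDeadServer(final ServerName sn) {
    if (isCarryingMeta(sn)) {
      LOG.info("Dead server " + sn + " was carrying the meta region");
    }
    return processServerShutdown(sn);
  }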
3149 
3150   /**
3151    * Update in-memory structures.
3152    * @param sn Server that reported the split
3153    * @param parent Parent region that was split
3154    * @param a Daughter region A
3155    * @param b Daughter region B
3156    */
3157   public void handleSplitReport(final ServerName sn, final HRegionInfo parent,
3158       final HRegionInfo a, final HRegionInfo b) {
3159     regionOffline(parent, State.SPLIT);
3160     regionOnline(a, sn);
3161     regionOnline(b, sn);
3162 
3163     // There's a possibility that the region was splitting while a user asked
3164     // the master to disable, we need to make sure we close those regions in
3165     // that case. This is not racing with the region server itself since RS
3166     // report is done after the split transaction completed.
3167     if (this.zkTable.isDisablingOrDisabledTable(
3168         parent.getTableName())) {
3169       unassign(a);
3170       unassign(b);
3171     }
3172   }
3173 
3174   /**
3175    * Update in-memory structures.
3176    * @param sn Server that reported the merge
3177    * @param merged regioninfo of merged
3178    * @param a region a
3179    * @param b region b
3180    */
3181   public void handleRegionsMergeReport(final ServerName sn,
3182       final HRegionInfo merged, final HRegionInfo a, final HRegionInfo b) {
3183     regionOffline(a, State.MERGED);
3184     regionOffline(b, State.MERGED);
3185     regionOnline(merged, sn);
3186 
3187     // There's a possibility that the region was merging while a user asked
3188     // the master to disable, we need to make sure we close those regions in
3189     // that case. This is not racing with the region server itself since RS
3190     // report is done after the regions merge transaction completed.
3191     if (this.zkTable.isDisablingOrDisabledTable(merged.getTableName())) {
3192       unassign(merged);
3193     }
3194   }
3195 
3196   /**
3197    * @param plan Plan to execute.
3198    */
3199   public void balance(final RegionPlan plan) {
3200     synchronized (this.regionPlans) {
3201       this.regionPlans.put(plan.getRegionName(), plan);
3202     }
3203     unassign(plan.getRegionInfo(), false, plan.getDestination());
3204   }
3205 
3206   public void stop() {
3207     if (tomActivated){
3208       this.timeoutMonitor.interrupt();
3209       this.timerUpdater.interrupt();
3210     }
3211   }
3212 
3213   /**
3214    * Shutdown the threadpool executor service
3215    */
3216   public void shutdown() {
3217     // It's an immediate shutdown, so we're clearing the remaining tasks.
3218     synchronized (zkEventWorkerWaitingList){
3219       zkEventWorkerWaitingList.clear();
3220     }
3221     threadPoolExecutorService.shutdownNow();
3222     zkEventWorkers.shutdownNow();
3223   }
3224 
3225   protected void setEnabledTable(TableName tableName) {
3226     try {
3227       this.zkTable.setEnabledTable(tableName);
3228     } catch (KeeperException e) {
3229       // here we can abort as it is the start up flow
3230       String errorMsg = "Unable to ensure that the table " + tableName
3231           + " will be" + " enabled because of a ZooKeeper issue";
3232       LOG.error(errorMsg);
3233       this.server.abort(errorMsg, e);
3234     }
3235   }
3236 
3237   /**
3238    * Set region as OFFLINED up in zookeeper asynchronously.
3239    * @param state
3240    * @return True if we succeeded, false otherwise (State was incorrect or failed
3241    * updating zk).
3242    */
3243   private boolean asyncSetOfflineInZooKeeper(final RegionState state,
3244       final AsyncCallback.StringCallback cb, final ServerName destination) {
3245     if (!state.isClosed() && !state.isOffline()) {
3246       this.server.abort("Unexpected state trying to OFFLINE; " + state,
3247         new IllegalStateException());
3248       return false;
3249     }
3250     regionStates.updateRegionState(
3251       state.getRegion(), RegionState.State.OFFLINE);
3252     try {
3253       ZKAssign.asyncCreateNodeOffline(watcher, state.getRegion(),
3254         destination, cb, state);
3255     } catch (KeeperException e) {
3256       if (e instanceof NodeExistsException) {
3257         LOG.warn("Node for " + state.getRegion() + " already exists");
3258       } else {
3259         server.abort("Unexpected ZK exception creating/setting node OFFLINE", e);
3260       }
3261       return false;
3262     }
3263     return true;
3264   }
3265 
3266   /**
3267    * A helper to handle region merging transition event.
3268    * It transitions merging regions to MERGING state.
3269    */
3270   private boolean handleRegionMerging(final RegionTransition rt,
3271       final String prettyPrintedRegionName, final ServerName sn) {
3272     byte [] payloadOfMerging = rt.getPayload();
3273     List<HRegionInfo> mergingRegions;
3274     try {
3275       mergingRegions = HRegionInfo.parseDelimitedFrom(
3276         payloadOfMerging, 0, payloadOfMerging.length);
3277     } catch (IOException e) {
3278       LOG.error("Dropped merging! Failed reading merging payload for "
3279         + prettyPrintedRegionName);
3280       return false;
3281     }
3282     assert mergingRegions.size() == 2;
3283     HRegionInfo merging_a = mergingRegions.get(0);
3284     HRegionInfo merging_b = mergingRegions.get(1);
3285 
3286     if (!isInStateForMerging(sn, merging_a, merging_b)) {
3287       LOG.warn("Dropped merging! Not in state good for MERGING; rs_a="
3288         + merging_a + ", rs_b=" + merging_b);
3289       return false;
3290     }
3291     regionStates.updateRegionState(merging_a, RegionState.State.MERGING);
3292     regionStates.updateRegionState(merging_b, RegionState.State.MERGING);
3293     return true;
3294   }
3295 
3296   /**
3297    * A region is offline.  The new state should be the specified one,
3298    * if not null.  If the specified state is null, the new state is Offline.
3299    * The specified state can only be Split, Merged, Offline, or null.
3300    */
3301   private void regionOffline(final HRegionInfo regionInfo, final State state) {
3302     regionStates.regionOffline(regionInfo, state);
3303     removeClosedRegion(regionInfo);
3304     // remove the region plan as well just in case.
3305     clearRegionPlan(regionInfo);
3306   }
3307 }