1   /**
2    * Copyright 2010 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  package org.apache.hadoop.hbase.master;
21  
22  import static org.junit.Assert.assertEquals;
23  import static org.junit.Assert.assertFalse;
24  import static org.junit.Assert.assertNotNull;
25  import static org.junit.Assert.assertTrue;
26  
27  import java.io.IOException;
28  import java.util.ArrayList;
29  import java.util.List;
30  import java.util.Set;
31  import java.util.TreeSet;
32  
33  import org.apache.commons.logging.Log;
34  import org.apache.commons.logging.LogFactory;
35  import org.apache.hadoop.conf.Configuration;
36  import org.apache.hadoop.fs.FileSystem;
37  import org.apache.hadoop.fs.Path;
38  import org.apache.hadoop.hbase.Abortable;
39  import org.apache.hadoop.hbase.ClusterStatus;
40  import org.apache.hadoop.hbase.HBaseConfiguration;
41  import org.apache.hadoop.hbase.HBaseTestingUtility;
42  import org.apache.hadoop.hbase.HColumnDescriptor;
43  import org.apache.hadoop.hbase.HConstants;
44  import org.apache.hadoop.hbase.HRegionInfo;
45  import org.apache.hadoop.hbase.HTableDescriptor;
46  import org.apache.hadoop.hbase.LargeTests;
47  import org.apache.hadoop.hbase.MasterNotRunningException;
48  import org.apache.hadoop.hbase.MiniHBaseCluster;
49  import org.apache.hadoop.hbase.ServerName;
50  import org.apache.hadoop.hbase.executor.EventHandler.EventType;
51  import org.apache.hadoop.hbase.executor.RegionTransitionData;
52  import org.apache.hadoop.hbase.master.AssignmentManager.RegionState;
53  import org.apache.hadoop.hbase.regionserver.HRegion;
54  import org.apache.hadoop.hbase.regionserver.HRegionServer;
55  import org.apache.hadoop.hbase.util.Bytes;
56  import org.apache.hadoop.hbase.util.FSTableDescriptors;
57  import org.apache.hadoop.hbase.util.JVMClusterUtil;
58  import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
59  import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
60  import org.apache.hadoop.hbase.util.Threads;
61  import org.apache.hadoop.hbase.zookeeper.ZKAssign;
62  import org.apache.hadoop.hbase.zookeeper.ZKTable;
63  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
64  import org.junit.Test;
65  import org.junit.experimental.categories.Category;
66  
67  @Category(LargeTests.class)
68  public class TestMasterFailover {
69    private static final Log LOG = LogFactory.getLog(TestMasterFailover.class);
70  
71    @Test (timeout=180000)
72    public void testShouldCheckMasterFailOverWhenMETAIsInOpenedState()
73        throws Exception {
74      LOG.info("Starting testShouldCheckMasterFailOverWhenMETAIsInOpenedState");
75      final int NUM_MASTERS = 1;
76      final int NUM_RS = 2;
77  
78      Configuration conf = HBaseConfiguration.create();
79      conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000);
80      conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 8000);
81      // Start the cluster
82      HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
83  
84      TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
85      MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
86  
87      // Find regionserver carrying meta.
88      List<RegionServerThread> regionServerThreads =
89        cluster.getRegionServerThreads();
90      int count = -1;
91      HRegion metaRegion = null;
92      for (RegionServerThread regionServerThread : regionServerThreads) {
93        HRegionServer regionServer = regionServerThread.getRegionServer();
94        metaRegion = regionServer.getOnlineRegion(HRegionInfo.FIRST_META_REGIONINFO.getRegionName());
95        count++;
96        regionServer.abort("");
97        if (null != metaRegion) break;
98      }
99      HRegionServer regionServer = cluster.getRegionServer(count);
100 
101     TEST_UTIL.shutdownMiniHBaseCluster();
102 
103     // Create a ZKW to use in the test
104     ZooKeeperWatcher zkw = 
105       HBaseTestingUtility.createAndForceNodeToOpenedState(TEST_UTIL, 
106           metaRegion, regionServer.getServerName());
107 
108     LOG.info("Staring cluster for second time");
109     TEST_UTIL.startMiniHBaseCluster(NUM_MASTERS, NUM_RS);
110 
111     // Failover should be completed, now wait for no RIT
112     log("Waiting for no more RIT");
113     ZKAssign.blockUntilNoRIT(zkw);
114 
115     zkw.close();
116     // Stop the cluster
117     TEST_UTIL.shutdownMiniCluster();
118   }
119 
120   /**
121    * Simple test of master failover.
122    * <p>
123    * Starts with three masters.  Kills a backup master.  Then kills the active
124    * master.  Ensures the final master becomes active and we can still contact
125    * the cluster.
126    * @throws Exception
127    */
128   @Test (timeout=240000)
129   public void testSimpleMasterFailover() throws Exception {
130 
131     final int NUM_MASTERS = 3;
132     final int NUM_RS = 3;
133 
134     // Create config to use for this cluster
135     Configuration conf = HBaseConfiguration.create();
136 
137     // Start the cluster
138     HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
139     TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
140     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
141 
142     // get all the master threads
143     List<MasterThread> masterThreads = cluster.getMasterThreads();
144 
145     // wait for each to come online
146     for (MasterThread mt : masterThreads) {
147       assertTrue(mt.isAlive());
148     }
149 
150     // verify only one is the active master and we have right number
151     int numActive = 0;
152     int activeIndex = -1;
153     ServerName activeName = null;
154     HMaster active = null;
155     for (int i = 0; i < masterThreads.size(); i++) {
156       if (masterThreads.get(i).getMaster().isActiveMaster()) {
157         numActive++;
158         activeIndex = i;
159         active = masterThreads.get(activeIndex).getMaster();
160         activeName = active.getServerName();
161       }
162     }
163     assertEquals(1, numActive);
164     assertEquals(NUM_MASTERS, masterThreads.size());
165     LOG.info("Active master " + activeName);
166 
167     // Check that ClusterStatus reports the correct active and backup masters
168     assertNotNull(active);
169     ClusterStatus status = active.getClusterStatus();
170     assertTrue(status.getMaster().equals(activeName));
171     assertEquals(2, status.getBackupMastersSize());
172     assertEquals(2, status.getBackupMasters().size());
173 
174     // attempt to stop one of the inactive masters
175     int backupIndex = (activeIndex == 0 ? 1 : activeIndex - 1);
176     HMaster master = cluster.getMaster(backupIndex);
177     LOG.debug("\n\nStopping a backup master: " + master.getServerName() + "\n");
178     cluster.stopMaster(backupIndex, false);
179     cluster.waitOnMaster(backupIndex);
180 
181     // Verify still one active master and it's the same
182     for (int i = 0; i < masterThreads.size(); i++) {
183       if (masterThreads.get(i).getMaster().isActiveMaster()) {
184         assertTrue(activeName.equals(masterThreads.get(i).getMaster().getServerName()));
185         activeIndex = i;
186         active = masterThreads.get(activeIndex).getMaster();
187       }
188     }
189     assertEquals(1, numActive);
190     assertEquals(2, masterThreads.size());
191     int rsCount = masterThreads.get(activeIndex).getMaster().getClusterStatus().getServersSize();
192     LOG.info("Active master " + active.getServerName() + " managing " + rsCount +  " regions servers");
193     assertEquals(3, rsCount);
194 
195     // Check that ClusterStatus reports the correct active and backup masters
196     assertNotNull(active);
197     status = active.getClusterStatus();
198     assertTrue(status.getMaster().equals(activeName));
199     assertEquals(1, status.getBackupMastersSize());
200     assertEquals(1, status.getBackupMasters().size());
201 
202     // kill the active master
203     LOG.debug("\n\nStopping the active master " + active.getServerName() + "\n");
204     cluster.stopMaster(activeIndex, false);
205     cluster.waitOnMaster(activeIndex);
206 
207     // wait for an active master to show up and be ready
208     assertTrue(cluster.waitForActiveAndReadyMaster());
209 
210     LOG.debug("\n\nVerifying backup master is now active\n");
211     // should only have one master now
212     assertEquals(1, masterThreads.size());
213 
214     // and he should be active
215     active = masterThreads.get(0).getMaster();
216     assertNotNull(active);
217     status = active.getClusterStatus();
218     ServerName mastername = status.getMaster();
219     assertTrue(mastername.equals(active.getServerName()));
220     assertTrue(active.isActiveMaster());
221     assertEquals(0, status.getBackupMastersSize());
222     assertEquals(0, status.getBackupMasters().size());
223     int rss = status.getServersSize();
224     LOG.info("Active master " + mastername.getServerName() + " managing " +
225       rss +  " region servers");
226     assertEquals(3, rss);
227 
228     // Stop the cluster
229     TEST_UTIL.shutdownMiniCluster();
230   }
231 
232   /**
233    * Complex test of master failover that tests as many permutations of the
234    * different possible states that regions in transition could be in within ZK.
235    * <p>
236    * This tests the proper handling of these states by the failed-over master
237    * and includes a thorough testing of the timeout code as well.
238    * <p>
239    * Starts with a single master and three regionservers.
240    * <p>
241    * Creates two tables, enabledTable and disabledTable, each containing 5
242    * regions.  The disabledTable is then disabled.
243    * <p>
244    * After reaching steady-state, the master is killed.  We then mock several
245    * states in ZK.
246    * <p>
247    * After mocking them, we will startup a new master which should become the
248    * active master and also detect that it is a failover.  The primary test
249    * passing condition will be that all regions of the enabled table are
250    * assigned and all the regions of the disabled table are not assigned.
251    * <p>
252    * The different scenarios to be tested are below:
253    * <p>
254    * <b>ZK State:  OFFLINE</b>
255    * <p>A node can get into OFFLINE state if</p>
256    * <ul>
257    * <li>An RS fails to open a region, so it reverts the state back to OFFLINE
258    * <li>The Master is assigning the region to a RS before it sends RPC
259    * </ul>
260    * <p>We will mock the scenarios</p>
261    * <ul>
262    * <li>Master has assigned an enabled region but RS failed so a region is
263    *     not assigned anywhere and is sitting in ZK as OFFLINE</li>
264    * <li>This seems to cover both cases?</li>
265    * </ul>
266    * <p>
267    * <b>ZK State:  CLOSING</b>
268    * <p>A node can get into CLOSING state if</p>
269    * <ul>
270    * <li>An RS has begun to close a region
271    * </ul>
272    * <p>We will mock the scenarios</p>
273    * <ul>
274    * <li>Region of enabled table was being closed but did not complete
275    * <li>Region of disabled table was being closed but did not complete
276    * </ul>
277    * <p>
278    * <b>ZK State:  CLOSED</b>
279    * <p>A node can get into CLOSED state if</p>
280    * <ul>
281    * <li>An RS has completed closing a region but not acknowledged by master yet
282    * </ul>
283    * <p>We will mock the scenarios</p>
284    * <ul>
285    * <li>Region of a table that should be enabled was closed on an RS
286    * <li>Region of a table that should be disabled was closed on an RS
287    * </ul>
288    * <p>
289    * <b>ZK State:  OPENING</b>
290    * <p>A node can get into OPENING state if</p>
291    * <ul>
292    * <li>An RS has begun to open a region
293    * </ul>
294    * <p>We will mock the scenarios</p>
295    * <ul>
296    * <li>RS was opening a region of enabled table but never finishes
297    * </ul>
298    * <p>
299    * <b>ZK State:  OPENED</b>
300    * <p>A node can get into OPENED state if</p>
301    * <ul>
302    * <li>An RS has finished opening a region but not acknowledged by master yet
303    * </ul>
304    * <p>We will mock the scenarios</p>
305    * <ul>
306    * <li>Region of a table that should be enabled was opened on an RS
307    * <li>Region of a table that should be disabled was opened on an RS
308    * </ul>
309    * @throws Exception
310    */
  @Test (timeout=180000)
  public void testMasterFailoverWithMockedRIT() throws Exception {

    final int NUM_MASTERS = 1;
    final int NUM_RS = 3;

    // Create config to use for this cluster
    Configuration conf = HBaseConfiguration.create();
    // Need to drop the timeout much lower
    conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000);
    conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 4000);
    // Require all three RSs to have checked in before master startup proceeds.
    conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 3);
    conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 3);

    // Start the cluster
    HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
    TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    log("Cluster started");

    // Create a ZKW to use in the test
    ZooKeeperWatcher zkw = HBaseTestingUtility.getZooKeeperWatcher(TEST_UTIL);

    // get all the master threads
    List<MasterThread> masterThreads = cluster.getMasterThreads();
    assertEquals(1, masterThreads.size());

    // only one master thread, let's wait for it to be initialized
    assertTrue(cluster.waitForActiveAndReadyMaster());
    HMaster master = masterThreads.get(0).getMaster();
    assertTrue(master.isActiveMaster());
    assertTrue(master.isInitialized());

    // disable load balancing on this master so the balancer cannot move the
    // regions we place deliberately below
    master.balanceSwitch(false);

    // create two tables in META, each with 10 regions
    byte [] FAMILY = Bytes.toBytes("family");
    byte [][] SPLIT_KEYS = new byte [][] {
        new byte[0], Bytes.toBytes("aaa"), Bytes.toBytes("bbb"),
        Bytes.toBytes("ccc"), Bytes.toBytes("ddd"), Bytes.toBytes("eee"),
        Bytes.toBytes("fff"), Bytes.toBytes("ggg"), Bytes.toBytes("hhh"),
        Bytes.toBytes("iii"), Bytes.toBytes("jjj")
    };

    byte [] enabledTable = Bytes.toBytes("enabledTable");
    HTableDescriptor htdEnabled = new HTableDescriptor(enabledTable);
    htdEnabled.addFamily(new HColumnDescriptor(FAMILY));

    FileSystem filesystem = FileSystem.get(conf);
    Path rootdir = filesystem.makeQualified(
        new Path(conf.get(HConstants.HBASE_DIR)));
    // Write the .tableinfo
    FSTableDescriptors.createTableDescriptor(filesystem, rootdir, htdEnabled);

    HRegionInfo hriEnabled = new HRegionInfo(htdEnabled.getName(), null, null);
    createRegion(hriEnabled, rootdir, conf, htdEnabled);

    List<HRegionInfo> enabledRegions = TEST_UTIL.createMultiRegionsInMeta(
        TEST_UTIL.getConfiguration(), htdEnabled, SPLIT_KEYS);

    byte [] disabledTable = Bytes.toBytes("disabledTable");
    HTableDescriptor htdDisabled = new HTableDescriptor(disabledTable);
    htdDisabled.addFamily(new HColumnDescriptor(FAMILY));
    // Write the .tableinfo
    FSTableDescriptors.createTableDescriptor(filesystem, rootdir, htdDisabled);
    HRegionInfo hriDisabled = new HRegionInfo(htdDisabled.getName(), null, null);
    createRegion(hriDisabled, rootdir, conf, htdDisabled);
    List<HRegionInfo> disabledRegions = TEST_UTIL.createMultiRegionsInMeta(
        TEST_UTIL.getConfiguration(), htdDisabled, SPLIT_KEYS);

    log("Regions in META have been created");

    // at this point we only expect 2 regions to be assigned out (catalogs)
    assertEquals(2, cluster.countServedRegions());

    // Let's just assign everything to first RS
    HRegionServer hrs = cluster.getRegionServer(0);
    ServerName serverName = hrs.getServerName();
    // closingRegion will later be mocked as stuck in CLOSING, so it must
    // first be assigned and opened on the live RS
    HRegionInfo closingRegion = enabledRegions.remove(0);
    // we'll need some regions to already be assigned out properly on live RS
    List<HRegionInfo> enabledAndAssignedRegions = new ArrayList<HRegionInfo>();
    enabledAndAssignedRegions.add(enabledRegions.remove(0));
    enabledAndAssignedRegions.add(enabledRegions.remove(0));
    enabledAndAssignedRegions.add(closingRegion);

    List<HRegionInfo> disabledAndAssignedRegions = new ArrayList<HRegionInfo>();
    disabledAndAssignedRegions.add(disabledRegions.remove(0));
    disabledAndAssignedRegions.add(disabledRegions.remove(0));

    // now actually assign them
    // Pre-seeding regionPlans makes assignRegion target our chosen RS
    // instead of letting the assignment manager pick one.
    for (HRegionInfo hri : enabledAndAssignedRegions) {
      master.assignmentManager.regionPlans.put(hri.getEncodedName(),
          new RegionPlan(hri, null, serverName));
      master.assignRegion(hri);
    }
    for (HRegionInfo hri : disabledAndAssignedRegions) {
      master.assignmentManager.regionPlans.put(hri.getEncodedName(),
          new RegionPlan(hri, null, serverName));
      master.assignRegion(hri);
    }

    // wait for no more RIT
    log("Waiting for assignment to finish");
    ZKAssign.blockUntilNoRIT(zkw);
    log("Assignment completed");

    // Stop the master so the mocked ZK states below are seen only by the
    // failed-over master we start later
    log("Aborting master");
    cluster.abortMaster(0);
    cluster.waitOnMaster(0);
    log("Master has aborted");

    /*
     * Now, let's start mocking up some weird states as described in the method
     * javadoc.
     */

    List<HRegionInfo> regionsThatShouldBeOnline = new ArrayList<HRegionInfo>();
    List<HRegionInfo> regionsThatShouldBeOffline = new ArrayList<HRegionInfo>();

    log("Beginning to mock scenarios");

    // Disable the disabledTable in ZK
    ZKTable zktable = new ZKTable(zkw);
    zktable.setDisabledTable(Bytes.toString(disabledTable));

    /*
     *  ZK = OFFLINE
     */

    // Region that should be assigned but is not and is in ZK as OFFLINE
    HRegionInfo region = enabledRegions.remove(0);
    regionsThatShouldBeOnline.add(region);
    ZKAssign.createNodeOffline(zkw, region, serverName);

    /*
     * ZK = CLOSING
     */
    // closingRegion is actually open on the RS; the CLOSING znode mimics a
    // close that started but never completed
    regionsThatShouldBeOnline.add(closingRegion);
    ZKAssign.createNodeClosing(zkw, closingRegion, serverName);

    /*
     * ZK = CLOSED
     */

    // Region of enabled table closed but not ack
    region = enabledRegions.remove(0);
    regionsThatShouldBeOnline.add(region);
    int version = ZKAssign.createNodeClosing(zkw, region, serverName);
    ZKAssign.transitionNodeClosed(zkw, region, serverName, version);

    // Region of disabled table closed but not ack
    region = disabledRegions.remove(0);
    regionsThatShouldBeOffline.add(region);
    version = ZKAssign.createNodeClosing(zkw, region, serverName);
    ZKAssign.transitionNodeClosed(zkw, region, serverName, version);

    /*
     * ZK = OPENING
     */

    // RS was opening a region of enabled table but never finishes
    region = enabledRegions.remove(0);
    regionsThatShouldBeOnline.add(region);
    ZKAssign.createNodeOffline(zkw, region, serverName);
    ZKAssign.transitionNodeOpening(zkw, region, serverName);

    /*
     * ZK = OPENED
     */

    // Region of enabled table was opened on RS
    region = enabledRegions.remove(0);
    regionsThatShouldBeOnline.add(region);
    ZKAssign.createNodeOffline(zkw, region, serverName);
    hrs.openRegion(region);
    // Poll ZK until the RS transitions the znode to OPENED
    while (true) {
      RegionTransitionData rtd = ZKAssign.getData(zkw, region.getEncodedName());
      if (rtd != null && rtd.getEventType() == EventType.RS_ZK_REGION_OPENED) {
        break;
      }
      Thread.sleep(100);
    }

    // Region of disable table was opened on RS
    region = disabledRegions.remove(0);
    regionsThatShouldBeOffline.add(region);
    ZKAssign.createNodeOffline(zkw, region, serverName);
    hrs.openRegion(region);
    // Poll ZK until the RS transitions the znode to OPENED
    while (true) {
      RegionTransitionData rtd = ZKAssign.getData(zkw, region.getEncodedName());
      if (rtd != null && rtd.getEventType() == EventType.RS_ZK_REGION_OPENED) {
        break;
      }
      Thread.sleep(100);
    }

    /*
     * ZK = NONE
     */

    /*
     * DONE MOCKING
     */

    log("Done mocking data up in ZK");

    // Start up a new master; it should detect the failover and process the
    // mocked znodes created above
    log("Starting up a new master");
    master = cluster.startMaster().getMaster();
    log("Waiting for master to be ready");
    cluster.waitForActiveAndReadyMaster();
    log("Master is ready");

    // Failover should be completed, now wait for no RIT
    log("Waiting for no more RIT");
    ZKAssign.blockUntilNoRIT(zkw);
    log("No more RIT in ZK, now doing final test verification");

    // Grab all the regions that are online across RSs
    Set<HRegionInfo> onlineRegions = new TreeSet<HRegionInfo>();
    for (JVMClusterUtil.RegionServerThread rst :
      cluster.getRegionServerThreads()) {
      onlineRegions.addAll(rst.getRegionServer().getOnlineRegions());
    }

    // Now, everything that should be online should be online
    for (HRegionInfo hri : regionsThatShouldBeOnline) {
      assertTrue(onlineRegions.contains(hri));
    }

    // Everything that should be offline should not be online
    for (HRegionInfo hri : regionsThatShouldBeOffline) {
      assertFalse(onlineRegions.contains(hri));
    }

    log("Done with verification, all passed, shutting down cluster");

    // Done, shutdown the cluster
    TEST_UTIL.shutdownMiniCluster();
  }
553 
554 
555   /**
556    * Complex test of master failover that tests as many permutations of the
557    * different possible states that regions in transition could be in within ZK
558    * pointing to an RS that has died while no master is around to process it.
559    * <p>
560    * This tests the proper handling of these states by the failed-over master
561    * and includes a thorough testing of the timeout code as well.
562    * <p>
563    * Starts with a single master and two regionservers.
564    * <p>
565    * Creates two tables, enabledTable and disabledTable, each containing 5
566    * regions.  The disabledTable is then disabled.
567    * <p>
568    * After reaching steady-state, the master is killed.  We then mock several
569    * states in ZK.  And one of the RS will be killed.
570    * <p>
571    * After mocking them and killing an RS, we will startup a new master which
572    * should become the active master and also detect that it is a failover.  The
573    * primary test passing condition will be that all regions of the enabled
574    * table are assigned and all the regions of the disabled table are not
575    * assigned.
576    * <p>
577    * The different scenarios to be tested are below:
578    * <p>
579    * <b>ZK State:  CLOSING</b>
580    * <p>A node can get into CLOSING state if</p>
581    * <ul>
582    * <li>An RS has begun to close a region
583    * </ul>
584    * <p>We will mock the scenarios</p>
585    * <ul>
586    * <li>Region was being closed but the RS died before finishing the close
587    * </ul>
588    * <b>ZK State:  OPENED</b>
589    * <p>A node can get into OPENED state if</p>
590    * <ul>
591    * <li>An RS has finished opening a region but not acknowledged by master yet
592    * </ul>
593    * <p>We will mock the scenarios</p>
594    * <ul>
595    * <li>Region of a table that should be enabled was opened by a now-dead RS
596    * <li>Region of a table that should be disabled was opened by a now-dead RS
597    * </ul>
598    * <p>
599    * <b>ZK State:  NONE</b>
600    * <p>A region could not have a transition node if</p>
601    * <ul>
602    * <li>The server hosting the region died and no master processed it
603    * </ul>
604    * <p>We will mock the scenarios</p>
605    * <ul>
606    * <li>Region of enabled table was on a dead RS that was not yet processed
607    * <li>Region of disabled table was on a dead RS that was not yet processed
608    * </ul>
609    * @throws Exception
610    */
611   @Test (timeout=180000)
612   public void testMasterFailoverWithMockedRITOnDeadRS() throws Exception {
613 
614     final int NUM_MASTERS = 1;
615     final int NUM_RS = 2;
616 
617     // Create config to use for this cluster
618     Configuration conf = HBaseConfiguration.create();
619     // Need to drop the timeout much lower
620     conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000);
621     conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 4000);
622     conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
623     conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 2);
624 
625     // Create and start the cluster
626     HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
627     TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
628     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
629     log("Cluster started");
630 
631     // Create a ZKW to use in the test
632     ZooKeeperWatcher zkw = new ZooKeeperWatcher(TEST_UTIL.getConfiguration(),
633         "unittest", new Abortable() {
634 
635           @Override
636           public void abort(String why, Throwable e) {
637             LOG.error("Fatal ZK Error: " + why, e);
638             org.junit.Assert.assertFalse("Fatal ZK error", true);
639           }
640 
641           @Override
642           public boolean isAborted() {
643             return false;
644           }
645 
646     });
647 
648     // get all the master threads
649     List<MasterThread> masterThreads = cluster.getMasterThreads();
650     assertEquals(1, masterThreads.size());
651 
652     // only one master thread, let's wait for it to be initialized
653     assertTrue(cluster.waitForActiveAndReadyMaster());
654     HMaster master = masterThreads.get(0).getMaster();
655     assertTrue(master.isActiveMaster());
656     assertTrue(master.isInitialized());
657 
658     // disable load balancing on this master
659     master.balanceSwitch(false);
660 
661     // create two tables in META, each with 10 regions
662     byte [] FAMILY = Bytes.toBytes("family");
663     byte [][] SPLIT_KEYS = new byte [][] {
664         new byte[0], Bytes.toBytes("aaa"), Bytes.toBytes("bbb"),
665         Bytes.toBytes("ccc"), Bytes.toBytes("ddd"), Bytes.toBytes("eee"),
666         Bytes.toBytes("fff"), Bytes.toBytes("ggg"), Bytes.toBytes("hhh"),
667         Bytes.toBytes("iii"), Bytes.toBytes("jjj")
668     };
669 
670     byte [] enabledTable = Bytes.toBytes("enabledTable");
671     HTableDescriptor htdEnabled = new HTableDescriptor(enabledTable);
672     htdEnabled.addFamily(new HColumnDescriptor(FAMILY));
673     FileSystem filesystem = FileSystem.get(conf);
674     Path rootdir = filesystem.makeQualified(
675            new Path(conf.get(HConstants.HBASE_DIR)));
676     // Write the .tableinfo
677     FSTableDescriptors.createTableDescriptor(filesystem, rootdir, htdEnabled);
678     HRegionInfo hriEnabled = new HRegionInfo(htdEnabled.getName(),
679         null, null);
680     createRegion(hriEnabled, rootdir, conf, htdEnabled);
681 
682     List<HRegionInfo> enabledRegions = TEST_UTIL.createMultiRegionsInMeta(
683         TEST_UTIL.getConfiguration(), htdEnabled, SPLIT_KEYS);
684 
685     byte [] disabledTable = Bytes.toBytes("disabledTable");
686     HTableDescriptor htdDisabled = new HTableDescriptor(disabledTable);
687     htdDisabled.addFamily(new HColumnDescriptor(FAMILY));
688     // Write the .tableinfo
689     FSTableDescriptors.createTableDescriptor(filesystem, rootdir, htdDisabled);
690     HRegionInfo hriDisabled = new HRegionInfo(htdDisabled.getName(), null, null);
691     createRegion(hriDisabled, rootdir, conf, htdDisabled);
692 
693     List<HRegionInfo> disabledRegions = TEST_UTIL.createMultiRegionsInMeta(
694         TEST_UTIL.getConfiguration(), htdDisabled, SPLIT_KEYS);
695 
696     log("Regions in META have been created");
697 
698     // at this point we only expect 2 regions to be assigned out (catalogs)
699     assertEquals(2, cluster.countServedRegions());
700 
701     // The first RS will stay online
702     List<RegionServerThread> regionservers =
703       cluster.getRegionServerThreads();
704     HRegionServer hrs = regionservers.get(0).getRegionServer();
705 
706     // The second RS is going to be hard-killed
707     RegionServerThread hrsDeadThread = regionservers.get(1);
708     HRegionServer hrsDead = hrsDeadThread.getRegionServer();
709     ServerName deadServerName = hrsDead.getServerName();
710 
711     // we'll need some regions to already be assigned out properly on live RS
712     List<HRegionInfo> enabledAndAssignedRegions = new ArrayList<HRegionInfo>();
713     enabledAndAssignedRegions.add(enabledRegions.remove(0));
714     enabledAndAssignedRegions.add(enabledRegions.remove(0));
715     List<HRegionInfo> disabledAndAssignedRegions = new ArrayList<HRegionInfo>();
716     disabledAndAssignedRegions.add(disabledRegions.remove(0));
717     disabledAndAssignedRegions.add(disabledRegions.remove(0));
718 
719     // now actually assign them
720     for (HRegionInfo hri : enabledAndAssignedRegions) {
721       master.assignmentManager.regionPlans.put(hri.getEncodedName(),
722           new RegionPlan(hri, null, hrs.getServerName()));
723       master.assignRegion(hri);
724     }
725     for (HRegionInfo hri : disabledAndAssignedRegions) {
726       master.assignmentManager.regionPlans.put(hri.getEncodedName(),
727           new RegionPlan(hri, null, hrs.getServerName()));
728       master.assignRegion(hri);
729     }
730 
731     assertTrue(" Table must be enabled.", master.getAssignmentManager()
732         .getZKTable().isEnabledTable("enabledTable"));
733     // we also need regions assigned out on the dead server
734     List<HRegionInfo> enabledAndOnDeadRegions = new ArrayList<HRegionInfo>();
735     enabledAndOnDeadRegions.add(enabledRegions.remove(0));
736     enabledAndOnDeadRegions.add(enabledRegions.remove(0));
737     List<HRegionInfo> disabledAndOnDeadRegions = new ArrayList<HRegionInfo>();
738     disabledAndOnDeadRegions.add(disabledRegions.remove(0));
739     disabledAndOnDeadRegions.add(disabledRegions.remove(0));
740 
741     // set region plan to server to be killed and trigger assign
742     for (HRegionInfo hri : enabledAndOnDeadRegions) {
743       master.assignmentManager.regionPlans.put(hri.getEncodedName(),
744           new RegionPlan(hri, null, deadServerName));
745       master.assignRegion(hri);
746     }
747     for (HRegionInfo hri : disabledAndOnDeadRegions) {
748       master.assignmentManager.regionPlans.put(hri.getEncodedName(),
749           new RegionPlan(hri, null, deadServerName));
750       master.assignRegion(hri);
751     }
752 
753     // wait for no more RIT
754     log("Waiting for assignment to finish");
755     ZKAssign.blockUntilNoRIT(zkw);
756     log("Assignment completed");
757 
758     // Stop the master
759     log("Aborting master");
760     cluster.abortMaster(0);
761     cluster.waitOnMaster(0);
762     log("Master has aborted");
763 
764     /*
765      * Now, let's start mocking up some weird states as described in the method
766      * javadoc.
767      */
768 
769     List<HRegionInfo> regionsThatShouldBeOnline = new ArrayList<HRegionInfo>();
770     List<HRegionInfo> regionsThatShouldBeOffline = new ArrayList<HRegionInfo>();
771 
772     log("Beginning to mock scenarios");
773 
774     // Disable the disabledTable in ZK
775     ZKTable zktable = new ZKTable(zkw);
776     zktable.setDisabledTable(Bytes.toString(disabledTable));
777 
778     assertTrue(" The enabled table should be identified on master fail over.",
779         zktable.isEnabledTable("enabledTable"));
780 
781     /*
782      * ZK = CLOSING
783      */
784 
785     // Region of enabled table being closed on dead RS but not finished
786     HRegionInfo region = enabledAndOnDeadRegions.remove(0);
787     regionsThatShouldBeOnline.add(region);
788     ZKAssign.createNodeClosing(zkw, region, deadServerName);
789     LOG.debug("\n\nRegion of enabled table was CLOSING on dead RS\n" +
790         region + "\n\n");
791 
792     // Region of disabled table being closed on dead RS but not finished
793     region = disabledAndOnDeadRegions.remove(0);
794     regionsThatShouldBeOffline.add(region);
795     ZKAssign.createNodeClosing(zkw, region, deadServerName);
796     LOG.debug("\n\nRegion of disabled table was CLOSING on dead RS\n" +
797         region + "\n\n");
798 
799     /*
800      * ZK = CLOSED
801      */
802 
803     // Region of enabled on dead server gets closed but not ack'd by master
804     region = enabledAndOnDeadRegions.remove(0);
805     regionsThatShouldBeOnline.add(region);
806     int version = ZKAssign.createNodeClosing(zkw, region, deadServerName);
807     ZKAssign.transitionNodeClosed(zkw, region, deadServerName, version);
808     LOG.debug("\n\nRegion of enabled table was CLOSED on dead RS\n" +
809         region + "\n\n");
810 
811     // Region of disabled on dead server gets closed but not ack'd by master
812     region = disabledAndOnDeadRegions.remove(0);
813     regionsThatShouldBeOffline.add(region);
814     version = ZKAssign.createNodeClosing(zkw, region, deadServerName);
815     ZKAssign.transitionNodeClosed(zkw, region, deadServerName, version);
816     LOG.debug("\n\nRegion of disabled table was CLOSED on dead RS\n" +
817         region + "\n\n");
818 
819     /*
820      * ZK = OPENING
821      */
822 
823     // RS was opening a region of enabled table then died
824     region = enabledRegions.remove(0);
825     regionsThatShouldBeOnline.add(region);
826     ZKAssign.createNodeOffline(zkw, region, deadServerName);
827     ZKAssign.transitionNodeOpening(zkw, region, deadServerName);
828     LOG.debug("\n\nRegion of enabled table was OPENING on dead RS\n" +
829         region + "\n\n");
830 
831     // RS was opening a region of disabled table then died
832     region = disabledRegions.remove(0);
833     regionsThatShouldBeOffline.add(region);
834     ZKAssign.createNodeOffline(zkw, region, deadServerName);
835     ZKAssign.transitionNodeOpening(zkw, region, deadServerName);
836     LOG.debug("\n\nRegion of disabled table was OPENING on dead RS\n" +
837         region + "\n\n");
838 
839     /*
840      * ZK = OPENED
841      */
842 
843     // Region of enabled table was opened on dead RS
844     region = enabledRegions.remove(0);
845     regionsThatShouldBeOnline.add(region);
846     ZKAssign.createNodeOffline(zkw, region, deadServerName);
847     hrsDead.openRegion(region);
848     while (true) {
849       RegionTransitionData rtd = ZKAssign.getData(zkw, region.getEncodedName());
850       if (rtd != null && rtd.getEventType() == EventType.RS_ZK_REGION_OPENED) {
851         break;
852       }
853       Thread.sleep(100);
854     }
855     LOG.debug("\n\nRegion of enabled table was OPENED on dead RS\n" +
856         region + "\n\n");
857 
858     // Region of disabled table was opened on dead RS
859     region = disabledRegions.remove(0);
860     regionsThatShouldBeOffline.add(region);
861     ZKAssign.createNodeOffline(zkw, region, deadServerName);
862     hrsDead.openRegion(region);
863     while (true) {
864       RegionTransitionData rtd = ZKAssign.getData(zkw, region.getEncodedName());
865       if (rtd != null && rtd.getEventType() == EventType.RS_ZK_REGION_OPENED) {
866         break;
867       }
868       Thread.sleep(100);
869     }
870     LOG.debug("\n\nRegion of disabled table was OPENED on dead RS\n" +
871         region + "\n\n");
872 
873     /*
874      * ZK = NONE
875      */
876 
877     // Region of enabled table was open at steady-state on dead RS
878     region = enabledRegions.remove(0);
879     regionsThatShouldBeOnline.add(region);
880     ZKAssign.createNodeOffline(zkw, region, deadServerName);
881     hrsDead.openRegion(region);
882     while (true) {
883       RegionTransitionData rtd = ZKAssign.getData(zkw, region.getEncodedName());
884       if (rtd != null && rtd.getEventType() == EventType.RS_ZK_REGION_OPENED) {
885         ZKAssign.deleteOpenedNode(zkw, region.getEncodedName());
886         break;
887       }
888       Thread.sleep(100);
889     }
890     LOG.debug("\n\nRegion of enabled table was open at steady-state on dead RS"
891         + "\n" + region + "\n\n");
892 
893     // Region of disabled table was open at steady-state on dead RS
894     region = disabledRegions.remove(0);
895     regionsThatShouldBeOffline.add(region);
896     ZKAssign.createNodeOffline(zkw, region, deadServerName);
897     hrsDead.openRegion(region);
898     while (true) {
899       RegionTransitionData rtd = ZKAssign.getData(zkw, region.getEncodedName());
900       if (rtd != null && rtd.getEventType() == EventType.RS_ZK_REGION_OPENED) {
901         ZKAssign.deleteOpenedNode(zkw, region.getEncodedName());
902         break;
903       }
904       Thread.sleep(100);
905     }
906     LOG.debug("\n\nRegion of disabled table was open at steady-state on dead RS"
907       + "\n" + region + "\n\n");
908 
909     /*
910      * DONE MOCKING
911      */
912 
913     log("Done mocking data up in ZK");
914 
915     // Kill the RS that had a hard death
916     log("Killing RS " + deadServerName);
917     hrsDead.abort("Killing for unit test");
918     log("RS " + deadServerName + " killed");
919 
920     // Start up a new master.  Wait until regionserver is completely down
921     // before starting new master because of hbase-4511.
922     while (hrsDeadThread.isAlive()) {
923       Threads.sleep(10);
924     }
925     log("Starting up a new master");
926     master = cluster.startMaster().getMaster();
927     log("Waiting for master to be ready");
928     assertTrue(cluster.waitForActiveAndReadyMaster());
929     log("Master is ready");
930 
931     // Let's add some weird states to master in-memory state
932 
933     // After HBASE-3181, we need to have some ZK state if we're PENDING_OPEN
934     // b/c it is impossible for us to get into this state w/o a zk node
935     // this is not true of PENDING_CLOSE
936 
937     // PENDING_OPEN and enabled
938     region = enabledRegions.remove(0);
939     regionsThatShouldBeOnline.add(region);
940     master.assignmentManager.regionsInTransition.put(region.getEncodedName(),
941         new RegionState(region, RegionState.State.PENDING_OPEN, 0, null));
942     ZKAssign.createNodeOffline(zkw, region, master.getServerName());
943     // PENDING_OPEN and disabled
944     region = disabledRegions.remove(0);
945     regionsThatShouldBeOffline.add(region);
946     master.assignmentManager.regionsInTransition.put(region.getEncodedName(),
947         new RegionState(region, RegionState.State.PENDING_OPEN, 0, null));
948     ZKAssign.createNodeOffline(zkw, region, master.getServerName());
949     // This test is bad.  It puts up a PENDING_CLOSE but doesn't say what
950     // server we were PENDING_CLOSE against -- i.e. an entry in
951     // AssignmentManager#regions.  W/o a server, we NPE trying to resend close.
952     // In past, there was wonky logic that had us reassign region if no server
953     // at tail of the unassign.  This was removed.  Commenting out for now.
954     // TODO: Remove completely.
955     /*
956     // PENDING_CLOSE and enabled
957     region = enabledRegions.remove(0);
958     LOG.info("Setting PENDING_CLOSE enabled " + region.getEncodedName());
959     regionsThatShouldBeOnline.add(region);
960     master.assignmentManager.regionsInTransition.put(region.getEncodedName(),
961       new RegionState(region, RegionState.State.PENDING_CLOSE, 0));
962     // PENDING_CLOSE and disabled
963     region = disabledRegions.remove(0);
964     LOG.info("Setting PENDING_CLOSE disabled " + region.getEncodedName());
965     regionsThatShouldBeOffline.add(region);
966     master.assignmentManager.regionsInTransition.put(region.getEncodedName(),
967       new RegionState(region, RegionState.State.PENDING_CLOSE, 0));
968       */
969 
970     // Failover should be completed, now wait for no RIT
971     log("Waiting for no more RIT");
972     ZKAssign.blockUntilNoRIT(zkw);
973     log("No more RIT in ZK");
974     long now = System.currentTimeMillis();
975     final long maxTime = 120000;
976     boolean done = master.assignmentManager.waitUntilNoRegionsInTransition(maxTime);
977     if (!done) {
978       LOG.info("rit=" + master.assignmentManager.getRegionsInTransition());
979     }
980     long elapsed = System.currentTimeMillis() - now;
981     assertTrue("Elapsed=" + elapsed + ", maxTime=" + maxTime + ", done=" + done,
982       elapsed < maxTime);
983     log("No more RIT in RIT map, doing final test verification");
984 
985     // Grab all the regions that are online across RSs
986     Set<HRegionInfo> onlineRegions = new TreeSet<HRegionInfo>();
987     for (JVMClusterUtil.RegionServerThread rst :
988         cluster.getRegionServerThreads()) {
989       try {
990         onlineRegions.addAll(rst.getRegionServer().getOnlineRegions());
991       } catch (org.apache.hadoop.hbase.regionserver.RegionServerStoppedException e) {
992         LOG.info("Got RegionServerStoppedException", e);
993       }
994     }
995 
996     // Now, everything that should be online should be online
997     for (HRegionInfo hri : regionsThatShouldBeOnline) {
998       assertTrue("region=" + hri.getRegionNameAsString(), onlineRegions.contains(hri));
999     }
1000 
1001     // Everything that should be offline should not be online
1002     for (HRegionInfo hri : regionsThatShouldBeOffline) {
1003       assertFalse(onlineRegions.contains(hri));
1004     }
1005 
1006     log("Done with verification, all passed, shutting down cluster");
1007 
1008     // Done, shutdown the cluster
1009     TEST_UTIL.shutdownMiniCluster();
1010   }
1011 
1012   HRegion createRegion(final HRegionInfo  hri, final Path rootdir, final Configuration c,
1013       final HTableDescriptor htd)
1014   throws IOException {
1015     HRegion r = HRegion.createHRegion(hri, rootdir, c, htd);
1016     // The above call to create a region will create an hlog file.  Each
1017     // log file create will also create a running thread to do syncing.  We need
1018     // to close out this log else we will have a running thread trying to sync
1019     // the file system continuously which is ugly when dfs is taken away at the
1020     // end of the test.
1021     HRegion.closeHRegion(r);
1022     return r;
1023   }
1024 
  // TODO: The next test to add should cover the permutations where the regions
  //       in transition, or the RS that is killed, are hosting the ROOT and
  //       META regions.
1027 
1028   private void log(String string) {
1029     LOG.info("\n\n" + string + " \n\n");
1030   }
1031 
  // JUnit rule run around every test method; presumably checks that the test
  // does not leak process-level resources (threads, file descriptors) — see
  // ResourceCheckerJUnitRule for the exact checks performed.
  @org.junit.Rule
  public org.apache.hadoop.hbase.ResourceCheckerJUnitRule cu =
    new org.apache.hadoop.hbase.ResourceCheckerJUnitRule();
1035 }
1036