View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.master;
20  
21  import static org.junit.Assert.assertEquals;
22  import static org.junit.Assert.assertFalse;
23  import static org.junit.Assert.assertNotNull;
24  import static org.junit.Assert.assertTrue;
25  
26  import java.io.IOException;
27  import java.util.ArrayList;
28  import java.util.Iterator;
29  import java.util.List;
30  import java.util.Set;
31  import java.util.TreeSet;
32  
33  import org.apache.commons.logging.Log;
34  import org.apache.commons.logging.LogFactory;
35  import org.apache.hadoop.conf.Configuration;
36  import org.apache.hadoop.fs.FileSystem;
37  import org.apache.hadoop.fs.Path;
38  import org.apache.hadoop.hbase.Abortable;
39  import org.apache.hadoop.hbase.ClusterStatus;
40  import org.apache.hadoop.hbase.TableName;
41  import org.apache.hadoop.hbase.HBaseConfiguration;
42  import org.apache.hadoop.hbase.HBaseTestingUtility;
43  import org.apache.hadoop.hbase.HColumnDescriptor;
44  import org.apache.hadoop.hbase.HRegionInfo;
45  import org.apache.hadoop.hbase.HTableDescriptor;
46  import org.apache.hadoop.hbase.LargeTests;
47  import org.apache.hadoop.hbase.MiniHBaseCluster;
48  import org.apache.hadoop.hbase.RegionTransition;
49  import org.apache.hadoop.hbase.ServerName;
50  import org.apache.hadoop.hbase.executor.EventType;
51  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
52  import org.apache.hadoop.hbase.regionserver.HRegion;
53  import org.apache.hadoop.hbase.regionserver.HRegionServer;
54  import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
55  import org.apache.hadoop.hbase.util.Bytes;
56  import org.apache.hadoop.hbase.util.FSUtils;
57  import org.apache.hadoop.hbase.util.FSTableDescriptors;
58  import org.apache.hadoop.hbase.util.JVMClusterUtil;
59  import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
60  import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
61  import org.apache.hadoop.hbase.util.Threads;
62  import org.apache.hadoop.hbase.zookeeper.ZKAssign;
63  import org.apache.hadoop.hbase.zookeeper.ZKTable;
64  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
65  import org.junit.Test;
66  import org.junit.experimental.categories.Category;
67  
68  @Category(LargeTests.class)
69  public class TestMasterFailover {
70    private static final Log LOG = LogFactory.getLog(TestMasterFailover.class);
71  
72    /**
73     * Complex test of master failover that tests as many permutations of the
74     * different possible states that regions in transition could be in within ZK.
75     * <p>
76     * This tests the proper handling of these states by the failed-over master
77     * and includes a thorough testing of the timeout code as well.
78     * <p>
79     * Starts with a single master and three regionservers.
80     * <p>
81     * Creates two tables, enabledTable and disabledTable, each containing 5
82     * regions.  The disabledTable is then disabled.
83     * <p>
84     * After reaching steady-state, the master is killed.  We then mock several
85     * states in ZK.
86     * <p>
87     * After mocking them, we will startup a new master which should become the
88     * active master and also detect that it is a failover.  The primary test
89     * passing condition will be that all regions of the enabled table are
90     * assigned and all the regions of the disabled table are not assigned.
91     * <p>
92     * The different scenarios to be tested are below:
93     * <p>
94     * <b>ZK State:  OFFLINE</b>
95     * <p>A node can get into OFFLINE state if</p>
96     * <ul>
97     * <li>An RS fails to open a region, so it reverts the state back to OFFLINE
98     * <li>The Master is assigning the region to a RS before it sends RPC
99     * </ul>
100    * <p>We will mock the scenarios</p>
101    * <ul>
102    * <li>Master has assigned an enabled region but RS failed so a region is
103    *     not assigned anywhere and is sitting in ZK as OFFLINE</li>
104    * <li>This seems to cover both cases?</li>
105    * </ul>
106    * <p>
107    * <b>ZK State:  CLOSING</b>
108    * <p>A node can get into CLOSING state if</p>
109    * <ul>
110    * <li>An RS has begun to close a region
111    * </ul>
112    * <p>We will mock the scenarios</p>
113    * <ul>
114    * <li>Region of enabled table was being closed but did not complete
115    * <li>Region of disabled table was being closed but did not complete
116    * </ul>
117    * <p>
118    * <b>ZK State:  CLOSED</b>
119    * <p>A node can get into CLOSED state if</p>
120    * <ul>
121    * <li>An RS has completed closing a region but not acknowledged by master yet
122    * </ul>
123    * <p>We will mock the scenarios</p>
124    * <ul>
125    * <li>Region of a table that should be enabled was closed on an RS
126    * <li>Region of a table that should be disabled was closed on an RS
127    * </ul>
128    * <p>
129    * <b>ZK State:  OPENING</b>
130    * <p>A node can get into OPENING state if</p>
131    * <ul>
132    * <li>An RS has begun to open a region
133    * </ul>
134    * <p>We will mock the scenarios</p>
135    * <ul>
136    * <li>RS was opening a region of enabled table but never finishes
137    * </ul>
138    * <p>
139    * <b>ZK State:  OPENED</b>
140    * <p>A node can get into OPENED state if</p>
141    * <ul>
142    * <li>An RS has finished opening a region but not acknowledged by master yet
143    * </ul>
144    * <p>We will mock the scenarios</p>
145    * <ul>
146    * <li>Region of a table that should be enabled was opened on an RS
147    * <li>Region of a table that should be disabled was opened on an RS
148    * </ul>
149    * @throws Exception
150    */
151   @Test (timeout=180000)
152   public void testMasterFailoverWithMockedRIT() throws Exception {
153 
154     final int NUM_MASTERS = 1;
155     final int NUM_RS = 3;
156 
157     // Create config to use for this cluster
158     Configuration conf = HBaseConfiguration.create();
159 
160     // Start the cluster
161     HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
162     TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
163     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
164     log("Cluster started");
165 
166     // Create a ZKW to use in the test
167     ZooKeeperWatcher zkw = HBaseTestingUtility.getZooKeeperWatcher(TEST_UTIL);
168 
169     // get all the master threads
170     List<MasterThread> masterThreads = cluster.getMasterThreads();
171     assertEquals(1, masterThreads.size());
172 
173     // only one master thread, let's wait for it to be initialized
174     assertTrue(cluster.waitForActiveAndReadyMaster());
175     HMaster master = masterThreads.get(0).getMaster();
176     assertTrue(master.isActiveMaster());
177     assertTrue(master.isInitialized());
178 
179     // disable load balancing on this master
180     master.balanceSwitch(false);
181 
182     // create two tables in META, each with 10 regions
183     byte [] FAMILY = Bytes.toBytes("family");
184     byte [][] SPLIT_KEYS = new byte [][] {
185         new byte[0], Bytes.toBytes("aaa"), Bytes.toBytes("bbb"),
186         Bytes.toBytes("ccc"), Bytes.toBytes("ddd"), Bytes.toBytes("eee"),
187         Bytes.toBytes("fff"), Bytes.toBytes("ggg"), Bytes.toBytes("hhh"),
188         Bytes.toBytes("iii"), Bytes.toBytes("jjj")
189     };
190 
191     byte [] enabledTable = Bytes.toBytes("enabledTable");
192     HTableDescriptor htdEnabled = new HTableDescriptor(TableName.valueOf(enabledTable));
193     htdEnabled.addFamily(new HColumnDescriptor(FAMILY));
194 
195     FileSystem filesystem = FileSystem.get(conf);
196     Path rootdir = FSUtils.getRootDir(conf);
197     FSTableDescriptors fstd = new FSTableDescriptors(filesystem, rootdir);
198     // Write the .tableinfo
199     fstd.createTableDescriptor(htdEnabled);
200 
201     HRegionInfo hriEnabled = new HRegionInfo(htdEnabled.getTableName(), null, null);
202     createRegion(hriEnabled, rootdir, conf, htdEnabled);
203 
204     List<HRegionInfo> enabledRegions = TEST_UTIL.createMultiRegionsInMeta(
205         TEST_UTIL.getConfiguration(), htdEnabled, SPLIT_KEYS);
206 
207     TableName disabledTable = TableName.valueOf("disabledTable");
208     HTableDescriptor htdDisabled = new HTableDescriptor(disabledTable);
209     htdDisabled.addFamily(new HColumnDescriptor(FAMILY));
210     // Write the .tableinfo
211     fstd.createTableDescriptor(htdDisabled);
212     HRegionInfo hriDisabled = new HRegionInfo(htdDisabled.getTableName(), null, null);
213     createRegion(hriDisabled, rootdir, conf, htdDisabled);
214     List<HRegionInfo> disabledRegions = TEST_UTIL.createMultiRegionsInMeta(
215         TEST_UTIL.getConfiguration(), htdDisabled, SPLIT_KEYS);
216 
217     log("Regions in META and namespace have been created");
218 
219     // at this point we only expect 3 regions to be assigned out (catalogs and namespace)
220     assertEquals(2, cluster.countServedRegions());
221 
222     // Let's just assign everything to first RS
223     HRegionServer hrs = cluster.getRegionServer(0);
224     ServerName serverName = hrs.getServerName();
225     HRegionInfo closingRegion = enabledRegions.remove(0);
226     // we'll need some regions to already be assigned out properly on live RS
227     List<HRegionInfo> enabledAndAssignedRegions = new ArrayList<HRegionInfo>();
228     enabledAndAssignedRegions.add(enabledRegions.remove(0));
229     enabledAndAssignedRegions.add(enabledRegions.remove(0));
230     enabledAndAssignedRegions.add(closingRegion);
231 
232     List<HRegionInfo> disabledAndAssignedRegions = new ArrayList<HRegionInfo>();
233     disabledAndAssignedRegions.add(disabledRegions.remove(0));
234     disabledAndAssignedRegions.add(disabledRegions.remove(0));
235 
236     // now actually assign them
237     for (HRegionInfo hri : enabledAndAssignedRegions) {
238       master.assignmentManager.regionPlans.put(hri.getEncodedName(),
239           new RegionPlan(hri, null, serverName));
240       master.assignRegion(hri);
241     }
242     for (HRegionInfo hri : disabledAndAssignedRegions) {
243       master.assignmentManager.regionPlans.put(hri.getEncodedName(),
244           new RegionPlan(hri, null, serverName));
245       master.assignRegion(hri);
246     }
247 
248     // wait for no more RIT
249     log("Waiting for assignment to finish");
250     ZKAssign.blockUntilNoRIT(zkw);
251     log("Assignment completed");
252 
253     // Stop the master
254     log("Aborting master");
255     cluster.abortMaster(0);
256     cluster.waitOnMaster(0);
257     log("Master has aborted");
258 
259     /*
260      * Now, let's start mocking up some weird states as described in the method
261      * javadoc.
262      */
263 
264     List<HRegionInfo> regionsThatShouldBeOnline = new ArrayList<HRegionInfo>();
265     List<HRegionInfo> regionsThatShouldBeOffline = new ArrayList<HRegionInfo>();
266 
267     log("Beginning to mock scenarios");
268 
269     // Disable the disabledTable in ZK
270     ZKTable zktable = new ZKTable(zkw);
271     zktable.setDisabledTable(disabledTable);
272 
273     /*
274      *  ZK = OFFLINE
275      */
276 
277     // Region that should be assigned but is not and is in ZK as OFFLINE
278     // Cause: This can happen if the master crashed after creating the znode but before sending the
279     //  request to the region server
280     HRegionInfo region = enabledRegions.remove(0);
281     regionsThatShouldBeOnline.add(region);
282     ZKAssign.createNodeOffline(zkw, region, serverName);
283 
284     /*
285      * ZK = CLOSING
286      */
287     // Cause: Same as offline.
288     regionsThatShouldBeOnline.add(closingRegion);
289     ZKAssign.createNodeClosing(zkw, closingRegion, serverName);
290 
291     /*
292      * ZK = CLOSED
293      */
294 
295     // Region of enabled table closed but not ack
296     //Cause: Master was down while the region server updated the ZK status.
297     region = enabledRegions.remove(0);
298     regionsThatShouldBeOnline.add(region);
299     int version = ZKAssign.createNodeClosing(zkw, region, serverName);
300     ZKAssign.transitionNodeClosed(zkw, region, serverName, version);
301 
302     // Region of disabled table closed but not ack
303     region = disabledRegions.remove(0);
304     regionsThatShouldBeOffline.add(region);
305     version = ZKAssign.createNodeClosing(zkw, region, serverName);
306     ZKAssign.transitionNodeClosed(zkw, region, serverName, version);
307 
308     /*
309      * ZK = OPENED
310      */
311 
312     // Region of enabled table was opened on RS
313     // Cause: as offline
314     region = enabledRegions.remove(0);
315     regionsThatShouldBeOnline.add(region);
316     ZKAssign.createNodeOffline(zkw, region, serverName);
317     ProtobufUtil.openRegion(hrs, region);
318     while (true) {
319       byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
320       RegionTransition rt = RegionTransition.parseFrom(bytes);
321       if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
322         break;
323       }
324       Thread.sleep(100);
325     }
326 
327     // Region of disable table was opened on RS
328     // Cause: Master failed while updating the status for this region server.
329     region = disabledRegions.remove(0);
330     regionsThatShouldBeOffline.add(region);
331     ZKAssign.createNodeOffline(zkw, region, serverName);
332     ProtobufUtil.openRegion(hrs, region);
333     while (true) {
334       byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
335       RegionTransition rt = RegionTransition.parseFrom(bytes);
336       if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
337         break;
338       }
339       Thread.sleep(100);
340     }
341 
342     /*
343      * ZK = NONE
344      */
345 
346     /*
347      * DONE MOCKING
348      */
349 
350     log("Done mocking data up in ZK");
351 
352     // Start up a new master
353     log("Starting up a new master");
354     master = cluster.startMaster().getMaster();
355     log("Waiting for master to be ready");
356     cluster.waitForActiveAndReadyMaster();
357     log("Master is ready");
358 
359     // Failover should be completed, now wait for no RIT
360     log("Waiting for no more RIT");
361     ZKAssign.blockUntilNoRIT(zkw);
362     log("No more RIT in ZK, now doing final test verification");
363 
364     // Grab all the regions that are online across RSs
365     Set<HRegionInfo> onlineRegions = new TreeSet<HRegionInfo>();
366     for (JVMClusterUtil.RegionServerThread rst :
367       cluster.getRegionServerThreads()) {
368       onlineRegions.addAll(ProtobufUtil.getOnlineRegions(rst.getRegionServer()));
369     }
370 
371     // Now, everything that should be online should be online
372     for (HRegionInfo hri : regionsThatShouldBeOnline) {
373       assertTrue(onlineRegions.contains(hri));
374     }
375 
376     // Everything that should be offline should not be online
377     for (HRegionInfo hri : regionsThatShouldBeOffline) {
378       assertFalse(onlineRegions.contains(hri));
379     }
380 
381     log("Done with verification, all passed, shutting down cluster");
382 
383     // Done, shutdown the cluster
384     TEST_UTIL.shutdownMiniCluster();
385   }
386 
387 
388   /**
389    * Complex test of master failover that tests as many permutations of the
390    * different possible states that regions in transition could be in within ZK
391    * pointing to an RS that has died while no master is around to process it.
392    * <p>
393    * This tests the proper handling of these states by the failed-over master
394    * and includes a thorough testing of the timeout code as well.
395    * <p>
396    * Starts with a single master and two regionservers.
397    * <p>
398    * Creates two tables, enabledTable and disabledTable, each containing 5
399    * regions.  The disabledTable is then disabled.
400    * <p>
401    * After reaching steady-state, the master is killed.  We then mock several
402    * states in ZK.  And one of the RS will be killed.
403    * <p>
404    * After mocking them and killing an RS, we will startup a new master which
405    * should become the active master and also detect that it is a failover.  The
406    * primary test passing condition will be that all regions of the enabled
407    * table are assigned and all the regions of the disabled table are not
408    * assigned.
409    * <p>
410    * The different scenarios to be tested are below:
411    * <p>
412    * <b>ZK State:  CLOSING</b>
413    * <p>A node can get into CLOSING state if</p>
414    * <ul>
415    * <li>An RS has begun to close a region
416    * </ul>
417    * <p>We will mock the scenarios</p>
418    * <ul>
419    * <li>Region was being closed but the RS died before finishing the close
420    * </ul>
421    * <b>ZK State:  OPENED</b>
422    * <p>A node can get into OPENED state if</p>
423    * <ul>
424    * <li>An RS has finished opening a region but not acknowledged by master yet
425    * </ul>
426    * <p>We will mock the scenarios</p>
427    * <ul>
428    * <li>Region of a table that should be enabled was opened by a now-dead RS
429    * <li>Region of a table that should be disabled was opened by a now-dead RS
430    * </ul>
431    * <p>
432    * <b>ZK State:  NONE</b>
433    * <p>A region could not have a transition node if</p>
434    * <ul>
435    * <li>The server hosting the region died and no master processed it
436    * </ul>
437    * <p>We will mock the scenarios</p>
438    * <ul>
439    * <li>Region of enabled table was on a dead RS that was not yet processed
440    * <li>Region of disabled table was on a dead RS that was not yet processed
441    * </ul>
442    * @throws Exception
443    */
444   @Test (timeout=180000)
445   public void testMasterFailoverWithMockedRITOnDeadRS() throws Exception {
446 
447     final int NUM_MASTERS = 1;
448     final int NUM_RS = 2;
449 
450     // Create and start the cluster
451     HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
452     Configuration conf = TEST_UTIL.getConfiguration();
453 
454     conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
455     conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 2);
456     TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
457     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
458     log("Cluster started");
459 
460     // Create a ZKW to use in the test
461     ZooKeeperWatcher zkw = new ZooKeeperWatcher(TEST_UTIL.getConfiguration(),
462         "unittest", new Abortable() {
463 
464           @Override
465           public void abort(String why, Throwable e) {
466             LOG.error("Fatal ZK Error: " + why, e);
467             org.junit.Assert.assertFalse("Fatal ZK error", true);
468           }
469 
470           @Override
471           public boolean isAborted() {
472             return false;
473           }
474 
475     });
476 
477     // get all the master threads
478     List<MasterThread> masterThreads = cluster.getMasterThreads();
479     assertEquals(1, masterThreads.size());
480 
481     // only one master thread, let's wait for it to be initialized
482     assertTrue(cluster.waitForActiveAndReadyMaster());
483     HMaster master = masterThreads.get(0).getMaster();
484     assertTrue(master.isActiveMaster());
485     assertTrue(master.isInitialized());
486 
487     // disable load balancing on this master
488     master.balanceSwitch(false);
489 
490     // create two tables in META, each with 30 regions
491     byte [] FAMILY = Bytes.toBytes("family");
492     byte[][] SPLIT_KEYS =
493         TEST_UTIL.getRegionSplitStartKeys(Bytes.toBytes("aaa"), Bytes.toBytes("zzz"), 30);
494 
495     byte [] enabledTable = Bytes.toBytes("enabledTable");
496     HTableDescriptor htdEnabled = new HTableDescriptor(TableName.valueOf(enabledTable));
497     htdEnabled.addFamily(new HColumnDescriptor(FAMILY));
498     FileSystem filesystem = FileSystem.get(conf);
499     Path rootdir = FSUtils.getRootDir(conf);
500     FSTableDescriptors fstd = new FSTableDescriptors(filesystem, rootdir);
501     // Write the .tableinfo
502     fstd.createTableDescriptor(htdEnabled);
503     HRegionInfo hriEnabled = new HRegionInfo(htdEnabled.getTableName(),
504         null, null);
505     createRegion(hriEnabled, rootdir, conf, htdEnabled);
506 
507     List<HRegionInfo> enabledRegions = TEST_UTIL.createMultiRegionsInMeta(
508         TEST_UTIL.getConfiguration(), htdEnabled, SPLIT_KEYS);
509 
510     TableName disabledTable =
511         TableName.valueOf("disabledTable");
512     HTableDescriptor htdDisabled = new HTableDescriptor(disabledTable);
513     htdDisabled.addFamily(new HColumnDescriptor(FAMILY));
514     // Write the .tableinfo
515     fstd.createTableDescriptor(htdDisabled);
516     HRegionInfo hriDisabled = new HRegionInfo(htdDisabled.getTableName(), null, null);
517     createRegion(hriDisabled, rootdir, conf, htdDisabled);
518 
519     List<HRegionInfo> disabledRegions = TEST_UTIL.createMultiRegionsInMeta(
520         TEST_UTIL.getConfiguration(), htdDisabled, SPLIT_KEYS);
521 
522     log("Regions in META and Namespace have been created");
523 
524     // at this point we only expect 2 regions to be assigned out (catalogs and namespace  )
525     assertEquals(2, cluster.countServedRegions());
526 
527     // The first RS will stay online
528     List<RegionServerThread> regionservers =
529       cluster.getRegionServerThreads();
530     HRegionServer hrs = regionservers.get(0).getRegionServer();
531 
532     // The second RS is going to be hard-killed
533     RegionServerThread hrsDeadThread = regionservers.get(1);
534     HRegionServer hrsDead = hrsDeadThread.getRegionServer();
535     ServerName deadServerName = hrsDead.getServerName();
536 
537     // we'll need some regions to already be assigned out properly on live RS
538     List<HRegionInfo> enabledAndAssignedRegions = new ArrayList<HRegionInfo>();
539     enabledAndAssignedRegions.addAll(enabledRegions.subList(0, 6));
540     enabledRegions.removeAll(enabledAndAssignedRegions);
541     List<HRegionInfo> disabledAndAssignedRegions = new ArrayList<HRegionInfo>();
542     disabledAndAssignedRegions.addAll(disabledRegions.subList(0, 6));
543     disabledRegions.removeAll(disabledAndAssignedRegions);
544 
545     // now actually assign them
546     for (HRegionInfo hri : enabledAndAssignedRegions) {
547       master.assignmentManager.regionPlans.put(hri.getEncodedName(),
548           new RegionPlan(hri, null, hrs.getServerName()));
549       master.assignRegion(hri);
550     }
551     for (HRegionInfo hri : disabledAndAssignedRegions) {
552       master.assignmentManager.regionPlans.put(hri.getEncodedName(),
553           new RegionPlan(hri, null, hrs.getServerName()));
554       master.assignRegion(hri);
555     }
556 
557     log("Waiting for assignment to finish");
558     ZKAssign.blockUntilNoRIT(zkw);
559     master.assignmentManager.waitUntilNoRegionsInTransition(60000);
560     log("Assignment completed");
561 
562     assertTrue(" Table must be enabled.", master.getAssignmentManager()
563         .getZKTable().isEnabledTable(TableName.valueOf("enabledTable")));
564     // we also need regions assigned out on the dead server
565     List<HRegionInfo> enabledAndOnDeadRegions = new ArrayList<HRegionInfo>();
566     enabledAndOnDeadRegions.addAll(enabledRegions.subList(0, 6));
567     enabledRegions.removeAll(enabledAndOnDeadRegions);
568     List<HRegionInfo> disabledAndOnDeadRegions = new ArrayList<HRegionInfo>();
569     disabledAndOnDeadRegions.addAll(disabledRegions.subList(0, 6));
570     disabledRegions.removeAll(disabledAndOnDeadRegions);
571 
572     // set region plan to server to be killed and trigger assign
573     for (HRegionInfo hri : enabledAndOnDeadRegions) {
574       master.assignmentManager.regionPlans.put(hri.getEncodedName(),
575           new RegionPlan(hri, null, deadServerName));
576       master.assignRegion(hri);
577     }
578     for (HRegionInfo hri : disabledAndOnDeadRegions) {
579       master.assignmentManager.regionPlans.put(hri.getEncodedName(),
580           new RegionPlan(hri, null, deadServerName));
581       master.assignRegion(hri);
582     }
583 
584     // wait for no more RIT
585     log("Waiting for assignment to finish");
586     ZKAssign.blockUntilNoRIT(zkw);
587     master.assignmentManager.waitUntilNoRegionsInTransition(60000);
588     log("Assignment completed");
589 
590     // Due to master.assignRegion(hri) could fail to assign a region to a specified RS
591     // therefore, we need make sure that regions are in the expected RS
592     verifyRegionLocation(hrs, enabledAndAssignedRegions);
593     verifyRegionLocation(hrs, disabledAndAssignedRegions);
594     verifyRegionLocation(hrsDead, enabledAndOnDeadRegions);
595     verifyRegionLocation(hrsDead, disabledAndOnDeadRegions);
596 
597     assertTrue(" Didn't get enough regions of enabledTalbe on live rs.",
598       enabledAndAssignedRegions.size() >= 2);
599     assertTrue(" Didn't get enough regions of disalbedTable on live rs.",
600       disabledAndAssignedRegions.size() >= 2);
601     assertTrue(" Didn't get enough regions of enabledTalbe on dead rs.",
602       enabledAndOnDeadRegions.size() >= 2);
603     assertTrue(" Didn't get enough regions of disalbedTable on dead rs.",
604       disabledAndOnDeadRegions.size() >= 2);
605 
606     // Stop the master
607     log("Aborting master");
608     cluster.abortMaster(0);
609     cluster.waitOnMaster(0);
610     log("Master has aborted");
611 
612     /*
613      * Now, let's start mocking up some weird states as described in the method
614      * javadoc.
615      */
616 
617     List<HRegionInfo> regionsThatShouldBeOnline = new ArrayList<HRegionInfo>();
618     List<HRegionInfo> regionsThatShouldBeOffline = new ArrayList<HRegionInfo>();
619 
620     log("Beginning to mock scenarios");
621 
622     // Disable the disabledTable in ZK
623     ZKTable zktable = new ZKTable(zkw);
624     zktable.setDisabledTable(disabledTable);
625 
626     assertTrue(" The enabled table should be identified on master fail over.",
627         zktable.isEnabledTable(TableName.valueOf("enabledTable")));
628 
629     /*
630      * ZK = CLOSING
631      */
632 
633     // Region of enabled table being closed on dead RS but not finished
634     HRegionInfo region = enabledAndOnDeadRegions.remove(0);
635     regionsThatShouldBeOnline.add(region);
636     ZKAssign.createNodeClosing(zkw, region, deadServerName);
637     LOG.debug("\n\nRegion of enabled table was CLOSING on dead RS\n" +
638         region + "\n\n");
639 
640     // Region of disabled table being closed on dead RS but not finished
641     region = disabledAndOnDeadRegions.remove(0);
642     regionsThatShouldBeOffline.add(region);
643     ZKAssign.createNodeClosing(zkw, region, deadServerName);
644     LOG.debug("\n\nRegion of disabled table was CLOSING on dead RS\n" +
645         region + "\n\n");
646 
647     /*
648      * ZK = CLOSED
649      */
650 
651     // Region of enabled on dead server gets closed but not ack'd by master
652     region = enabledAndOnDeadRegions.remove(0);
653     regionsThatShouldBeOnline.add(region);
654     int version = ZKAssign.createNodeClosing(zkw, region, deadServerName);
655     ZKAssign.transitionNodeClosed(zkw, region, deadServerName, version);
656     LOG.debug("\n\nRegion of enabled table was CLOSED on dead RS\n" +
657         region + "\n\n");
658 
659     // Region of disabled on dead server gets closed but not ack'd by master
660     region = disabledAndOnDeadRegions.remove(0);
661     regionsThatShouldBeOffline.add(region);
662     version = ZKAssign.createNodeClosing(zkw, region, deadServerName);
663     ZKAssign.transitionNodeClosed(zkw, region, deadServerName, version);
664     LOG.debug("\n\nRegion of disabled table was CLOSED on dead RS\n" +
665         region + "\n\n");
666 
667     /*
668      * ZK = OPENING
669      */
670 
671     // RS was opening a region of enabled table then died
672     region = enabledRegions.remove(0);
673     regionsThatShouldBeOnline.add(region);
674     ZKAssign.createNodeOffline(zkw, region, deadServerName);
675     ZKAssign.transitionNodeOpening(zkw, region, deadServerName);
676     LOG.debug("\n\nRegion of enabled table was OPENING on dead RS\n" +
677         region + "\n\n");
678 
679     // RS was opening a region of disabled table then died
680     region = disabledRegions.remove(0);
681     regionsThatShouldBeOffline.add(region);
682     ZKAssign.createNodeOffline(zkw, region, deadServerName);
683     ZKAssign.transitionNodeOpening(zkw, region, deadServerName);
684     LOG.debug("\n\nRegion of disabled table was OPENING on dead RS\n" +
685         region + "\n\n");
686 
687     /*
688      * ZK = OPENED
689      */
690 
691     // Region of enabled table was opened on dead RS
692     region = enabledRegions.remove(0);
693     regionsThatShouldBeOnline.add(region);
694     ZKAssign.createNodeOffline(zkw, region, deadServerName);
695     ProtobufUtil.openRegion(hrsDead, region);
696     while (true) {
697       byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
698       RegionTransition rt = RegionTransition.parseFrom(bytes);
699       if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
700         break;
701       }
702       Thread.sleep(100);
703     }
704     LOG.debug("\n\nRegion of enabled table was OPENED on dead RS\n" +
705         region + "\n\n");
706 
707     // Region of disabled table was opened on dead RS
708     region = disabledRegions.remove(0);
709     regionsThatShouldBeOffline.add(region);
710     ZKAssign.createNodeOffline(zkw, region, deadServerName);
711     ProtobufUtil.openRegion(hrsDead, region);
712     while (true) {
713       byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
714       RegionTransition rt = RegionTransition.parseFrom(bytes);
715       if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
716         break;
717       }
718       Thread.sleep(100);
719     }
720     LOG.debug("\n\nRegion of disabled table was OPENED on dead RS\n" +
721         region + "\n\n");
722 
723     /*
724      * ZK = NONE
725      */
726 
727     // Region of enabled table was open at steady-state on dead RS
728     region = enabledRegions.remove(0);
729     regionsThatShouldBeOnline.add(region);
730     ZKAssign.createNodeOffline(zkw, region, deadServerName);
731     ProtobufUtil.openRegion(hrsDead, region);
732     while (true) {
733       byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
734       RegionTransition rt = RegionTransition.parseFrom(bytes);
735       if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
736         ZKAssign.deleteOpenedNode(zkw, region.getEncodedName());
737         LOG.debug("DELETED " + rt);
738         break;
739       }
740       Thread.sleep(100);
741     }
742     LOG.debug("\n\nRegion of enabled table was open at steady-state on dead RS"
743         + "\n" + region + "\n\n");
744 
745     // Region of disabled table was open at steady-state on dead RS
746     region = disabledRegions.remove(0);
747     regionsThatShouldBeOffline.add(region);
748     ZKAssign.createNodeOffline(zkw, region, deadServerName);
749     ProtobufUtil.openRegion(hrsDead, region);
750     while (true) {
751       byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
752       RegionTransition rt = RegionTransition.parseFrom(bytes);
753       if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
754         ZKAssign.deleteOpenedNode(zkw, region.getEncodedName());
755         break;
756       }
757       Thread.sleep(100);
758     }
759     LOG.debug("\n\nRegion of disabled table was open at steady-state on dead RS"
760       + "\n" + region + "\n\n");
761 
762     /*
763      * DONE MOCKING
764      */
765 
766     log("Done mocking data up in ZK");
767 
768     // Kill the RS that had a hard death
769     log("Killing RS " + deadServerName);
770     hrsDead.abort("Killing for unit test");
771     log("RS " + deadServerName + " killed");
772 
773     // Start up a new master.  Wait until regionserver is completely down
774     // before starting new master because of hbase-4511.
775     while (hrsDeadThread.isAlive()) {
776       Threads.sleep(10);
777     }
778     log("Starting up a new master");
779     master = cluster.startMaster().getMaster();
780     log("Waiting for master to be ready");
781     assertTrue(cluster.waitForActiveAndReadyMaster());
782     log("Master is ready");
783     
784     // Wait until SSH processing completed for dead server.
785     while (master.getServerManager().areDeadServersInProgress()) {
786       Thread.sleep(10);
787     }
788     
789     // Failover should be completed, now wait for no RIT
790     log("Waiting for no more RIT");
791     ZKAssign.blockUntilNoRIT(zkw);
792     log("No more RIT in ZK");
793     long now = System.currentTimeMillis();
794     long maxTime = 120000;
795     boolean done = master.assignmentManager.waitUntilNoRegionsInTransition(maxTime);
796     if (!done) {
797       LOG.info("rit=" + master.getAssignmentManager().getRegionStates().getRegionsInTransition());
798     }
799     long elapsed = System.currentTimeMillis() - now;
800     assertTrue("Elapsed=" + elapsed + ", maxTime=" + maxTime + ", done=" + done,
801       elapsed < maxTime);
802     log("No more RIT in RIT map, doing final test verification");
803 
804     // Grab all the regions that are online across RSs
805     Set<HRegionInfo> onlineRegions = new TreeSet<HRegionInfo>();
806     now = System.currentTimeMillis();
807     maxTime = 30000;
808     for (JVMClusterUtil.RegionServerThread rst :
809         cluster.getRegionServerThreads()) {
810       try {
811         HRegionServer rs = rst.getRegionServer();
812         while (!rs.getRegionsInTransitionInRS().isEmpty()) {
813           elapsed = System.currentTimeMillis() - now;
814           assertTrue("Test timed out in getting online regions", elapsed < maxTime);
815           if (rs.isAborted() || rs.isStopped()) {
816             // This region server is stopped, skip it.
817             break;
818           }
819           Thread.sleep(100);
820         }
821         onlineRegions.addAll(ProtobufUtil.getOnlineRegions(rs));
822       } catch (RegionServerStoppedException e) {
823         LOG.info("Got RegionServerStoppedException", e);
824       }
825     }
826 
827     // Now, everything that should be online should be online
828     for (HRegionInfo hri : regionsThatShouldBeOnline) {
829       assertTrue("region=" + hri.getRegionNameAsString() + ", " + onlineRegions.toString(),
830         onlineRegions.contains(hri));
831     }
832 
833     // Everything that should be offline should not be online
834     for (HRegionInfo hri : regionsThatShouldBeOffline) {
835       assertFalse(onlineRegions.contains(hri));
836     }
837 
838     log("Done with verification, all passed, shutting down cluster");
839 
840     // Done, shutdown the cluster
841     TEST_UTIL.shutdownMiniCluster();
842   }
843 
844   /**
845    * Verify regions are on the expected region server
846    */
847   private void verifyRegionLocation(HRegionServer hrs, List<HRegionInfo> regions)
848       throws IOException {
849     List<HRegionInfo> tmpOnlineRegions = ProtobufUtil.getOnlineRegions(hrs);
850     Iterator<HRegionInfo> itr = regions.iterator();
851     while (itr.hasNext()) {
852       HRegionInfo tmp = itr.next();
853       if (!tmpOnlineRegions.contains(tmp)) {
854         itr.remove();
855       }
856     }
857   }
858 
859   HRegion createRegion(final HRegionInfo  hri, final Path rootdir, final Configuration c,
860       final HTableDescriptor htd)
861   throws IOException {
862     HRegion r = HRegion.createHRegion(hri, rootdir, c, htd);
863     // The above call to create a region will create an hlog file.  Each
864     // log file create will also create a running thread to do syncing.  We need
865     // to close out this log else we will have a running thread trying to sync
866     // the file system continuously which is ugly when dfs is taken away at the
867     // end of the test.
868     HRegion.closeHRegion(r);
869     return r;
870   }
871 
  // TODO: Next test to add should cover permutations where the regions in
  //       transition, or the regions hosted by the killed RS, are ROOT and META.
874 
875   private void log(String string) {
876     LOG.info("\n\n" + string + " \n\n");
877   }
878 
879   @Test (timeout=180000)
880   public void testShouldCheckMasterFailOverWhenMETAIsInOpenedState()
881       throws Exception {
882     LOG.info("Starting testShouldCheckMasterFailOverWhenMETAIsInOpenedState");
883     final int NUM_MASTERS = 1;
884     final int NUM_RS = 2;
885 
886     // Start the cluster
887     HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
888     Configuration conf = TEST_UTIL.getConfiguration();
889     conf.setInt("hbase.master.info.port", -1);
890 
891     TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
892     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
893 
894     // Find regionserver carrying meta.
895     List<RegionServerThread> regionServerThreads =
896       cluster.getRegionServerThreads();
897     int count = -1;
898     HRegion metaRegion = null;
899     for (RegionServerThread regionServerThread : regionServerThreads) {
900       HRegionServer regionServer = regionServerThread.getRegionServer();
901       metaRegion = regionServer.getOnlineRegion(HRegionInfo.FIRST_META_REGIONINFO.getRegionName());
902       count++;
903       regionServer.abort("");
904       if (null != metaRegion) break;
905     }
906     HRegionServer regionServer = cluster.getRegionServer(count);
907 
908     TEST_UTIL.shutdownMiniHBaseCluster();
909 
910     // Create a ZKW to use in the test
911     ZooKeeperWatcher zkw =
912       HBaseTestingUtility.createAndForceNodeToOpenedState(TEST_UTIL,
913           metaRegion, regionServer.getServerName());
914 
915     LOG.info("Staring cluster for second time");
916     TEST_UTIL.startMiniHBaseCluster(NUM_MASTERS, NUM_RS);
917 
918     HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
919     while (!master.isInitialized()) {
920       Thread.sleep(100);
921     }
922     // Failover should be completed, now wait for no RIT
923     log("Waiting for no more RIT");
924     ZKAssign.blockUntilNoRIT(zkw);
925 
926     zkw.close();
927     // Stop the cluster
928     TEST_UTIL.shutdownMiniCluster();
929   }
930 
931   /**
932    * Simple test of master failover.
933    * <p>
934    * Starts with three masters.  Kills a backup master.  Then kills the active
935    * master.  Ensures the final master becomes active and we can still contact
936    * the cluster.
937    * @throws Exception
938    */
939   @Test (timeout=240000)
940   public void testSimpleMasterFailover() throws Exception {
941 
942     final int NUM_MASTERS = 3;
943     final int NUM_RS = 3;
944 
945     // Start the cluster
946     HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
947 
948     TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
949     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
950 
951     // get all the master threads
952     List<MasterThread> masterThreads = cluster.getMasterThreads();
953 
954     // wait for each to come online
955     for (MasterThread mt : masterThreads) {
956       assertTrue(mt.isAlive());
957     }
958 
959     // verify only one is the active master and we have right number
960     int numActive = 0;
961     int activeIndex = -1;
962     ServerName activeName = null;
963     HMaster active = null;
964     for (int i = 0; i < masterThreads.size(); i++) {
965       if (masterThreads.get(i).getMaster().isActiveMaster()) {
966         numActive++;
967         activeIndex = i;
968         active = masterThreads.get(activeIndex).getMaster();
969         activeName = active.getServerName();
970       }
971     }
972     assertEquals(1, numActive);
973     assertEquals(NUM_MASTERS, masterThreads.size());
974     LOG.info("Active master " + activeName);
975 
976     // Check that ClusterStatus reports the correct active and backup masters
977     assertNotNull(active);
978     ClusterStatus status = active.getClusterStatus();
979     assertTrue(status.getMaster().equals(activeName));
980     assertEquals(2, status.getBackupMastersSize());
981     assertEquals(2, status.getBackupMasters().size());
982 
983     // attempt to stop one of the inactive masters
984     int backupIndex = (activeIndex == 0 ? 1 : activeIndex - 1);
985     HMaster master = cluster.getMaster(backupIndex);
986     LOG.debug("\n\nStopping a backup master: " + master.getServerName() + "\n");
987     cluster.stopMaster(backupIndex, false);
988     cluster.waitOnMaster(backupIndex);
989 
990     // Verify still one active master and it's the same
991     for (int i = 0; i < masterThreads.size(); i++) {
992       if (masterThreads.get(i).getMaster().isActiveMaster()) {
993         assertTrue(activeName.equals(masterThreads.get(i).getMaster().getServerName()));
994         activeIndex = i;
995         active = masterThreads.get(activeIndex).getMaster();
996       }
997     }
998     assertEquals(1, numActive);
999     assertEquals(2, masterThreads.size());
1000     int rsCount = masterThreads.get(activeIndex).getMaster().getClusterStatus().getServersSize();
1001     LOG.info("Active master " + active.getServerName() + " managing " + rsCount +  " regions servers");
1002     assertEquals(3, rsCount);
1003 
1004     // Check that ClusterStatus reports the correct active and backup masters
1005     assertNotNull(active);
1006     status = active.getClusterStatus();
1007     assertTrue(status.getMaster().equals(activeName));
1008     assertEquals(1, status.getBackupMastersSize());
1009     assertEquals(1, status.getBackupMasters().size());
1010 
1011     // kill the active master
1012     LOG.debug("\n\nStopping the active master " + active.getServerName() + "\n");
1013     cluster.stopMaster(activeIndex, false);
1014     cluster.waitOnMaster(activeIndex);
1015 
1016     // wait for an active master to show up and be ready
1017     assertTrue(cluster.waitForActiveAndReadyMaster());
1018 
1019     LOG.debug("\n\nVerifying backup master is now active\n");
1020     // should only have one master now
1021     assertEquals(1, masterThreads.size());
1022 
1023     // and he should be active
1024     active = masterThreads.get(0).getMaster();
1025     assertNotNull(active);
1026     status = active.getClusterStatus();
1027     ServerName mastername = status.getMaster();
1028     assertTrue(mastername.equals(active.getServerName()));
1029     assertTrue(active.isActiveMaster());
1030     assertEquals(0, status.getBackupMastersSize());
1031     assertEquals(0, status.getBackupMasters().size());
1032     int rss = status.getServersSize();
1033     LOG.info("Active master " + mastername.getServerName() + " managing " +
1034       rss +  " region servers");
1035     assertEquals(3, rss);
1036 
1037     // Stop the cluster
1038     TEST_UTIL.shutdownMiniCluster();
1039   }
1040 }
1041