1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.master;
20
21 import static org.junit.Assert.assertEquals;
22 import static org.junit.Assert.assertFalse;
23 import static org.junit.Assert.assertNotNull;
24 import static org.junit.Assert.assertTrue;
25
26 import java.io.IOException;
27 import java.util.ArrayList;
28 import java.util.Iterator;
29 import java.util.List;
30 import java.util.Set;
31 import java.util.TreeSet;
32
33 import org.apache.commons.logging.Log;
34 import org.apache.commons.logging.LogFactory;
35 import org.apache.hadoop.conf.Configuration;
36 import org.apache.hadoop.fs.FileSystem;
37 import org.apache.hadoop.fs.Path;
38 import org.apache.hadoop.hbase.Abortable;
39 import org.apache.hadoop.hbase.ClusterStatus;
40 import org.apache.hadoop.hbase.TableName;
41 import org.apache.hadoop.hbase.HBaseConfiguration;
42 import org.apache.hadoop.hbase.HBaseTestingUtility;
43 import org.apache.hadoop.hbase.HColumnDescriptor;
44 import org.apache.hadoop.hbase.HRegionInfo;
45 import org.apache.hadoop.hbase.HTableDescriptor;
46 import org.apache.hadoop.hbase.LargeTests;
47 import org.apache.hadoop.hbase.MiniHBaseCluster;
48 import org.apache.hadoop.hbase.RegionTransition;
49 import org.apache.hadoop.hbase.ServerName;
50 import org.apache.hadoop.hbase.executor.EventType;
51 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
52 import org.apache.hadoop.hbase.regionserver.HRegion;
53 import org.apache.hadoop.hbase.regionserver.HRegionServer;
54 import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
55 import org.apache.hadoop.hbase.util.Bytes;
56 import org.apache.hadoop.hbase.util.FSUtils;
57 import org.apache.hadoop.hbase.util.FSTableDescriptors;
58 import org.apache.hadoop.hbase.util.JVMClusterUtil;
59 import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
60 import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
61 import org.apache.hadoop.hbase.util.Threads;
62 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
63 import org.apache.hadoop.hbase.zookeeper.ZKTable;
64 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
65 import org.junit.Test;
66 import org.junit.experimental.categories.Category;
67
68 @Category(LargeTests.class)
69 public class TestMasterFailover {
70 private static final Log LOG = LogFactory.getLog(TestMasterFailover.class);
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151 @Test (timeout=180000)
152 public void testMasterFailoverWithMockedRIT() throws Exception {
153
154 final int NUM_MASTERS = 1;
155 final int NUM_RS = 3;
156
157
158 Configuration conf = HBaseConfiguration.create();
159
160
161 HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
162 TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
163 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
164 log("Cluster started");
165
166
167 ZooKeeperWatcher zkw = HBaseTestingUtility.getZooKeeperWatcher(TEST_UTIL);
168
169
170 List<MasterThread> masterThreads = cluster.getMasterThreads();
171 assertEquals(1, masterThreads.size());
172
173
174 assertTrue(cluster.waitForActiveAndReadyMaster());
175 HMaster master = masterThreads.get(0).getMaster();
176 assertTrue(master.isActiveMaster());
177 assertTrue(master.isInitialized());
178
179
180 master.balanceSwitch(false);
181
182
183 byte [] FAMILY = Bytes.toBytes("family");
184 byte [][] SPLIT_KEYS = new byte [][] {
185 new byte[0], Bytes.toBytes("aaa"), Bytes.toBytes("bbb"),
186 Bytes.toBytes("ccc"), Bytes.toBytes("ddd"), Bytes.toBytes("eee"),
187 Bytes.toBytes("fff"), Bytes.toBytes("ggg"), Bytes.toBytes("hhh"),
188 Bytes.toBytes("iii"), Bytes.toBytes("jjj")
189 };
190
191 byte [] enabledTable = Bytes.toBytes("enabledTable");
192 HTableDescriptor htdEnabled = new HTableDescriptor(TableName.valueOf(enabledTable));
193 htdEnabled.addFamily(new HColumnDescriptor(FAMILY));
194
195 FileSystem filesystem = FileSystem.get(conf);
196 Path rootdir = FSUtils.getRootDir(conf);
197 FSTableDescriptors fstd = new FSTableDescriptors(filesystem, rootdir);
198
199 fstd.createTableDescriptor(htdEnabled);
200
201 HRegionInfo hriEnabled = new HRegionInfo(htdEnabled.getTableName(), null, null);
202 createRegion(hriEnabled, rootdir, conf, htdEnabled);
203
204 List<HRegionInfo> enabledRegions = TEST_UTIL.createMultiRegionsInMeta(
205 TEST_UTIL.getConfiguration(), htdEnabled, SPLIT_KEYS);
206
207 TableName disabledTable = TableName.valueOf("disabledTable");
208 HTableDescriptor htdDisabled = new HTableDescriptor(disabledTable);
209 htdDisabled.addFamily(new HColumnDescriptor(FAMILY));
210
211 fstd.createTableDescriptor(htdDisabled);
212 HRegionInfo hriDisabled = new HRegionInfo(htdDisabled.getTableName(), null, null);
213 createRegion(hriDisabled, rootdir, conf, htdDisabled);
214 List<HRegionInfo> disabledRegions = TEST_UTIL.createMultiRegionsInMeta(
215 TEST_UTIL.getConfiguration(), htdDisabled, SPLIT_KEYS);
216
217 log("Regions in META and namespace have been created");
218
219
220 assertEquals(2, cluster.countServedRegions());
221
222
223 HRegionServer hrs = cluster.getRegionServer(0);
224 ServerName serverName = hrs.getServerName();
225 HRegionInfo closingRegion = enabledRegions.remove(0);
226
227 List<HRegionInfo> enabledAndAssignedRegions = new ArrayList<HRegionInfo>();
228 enabledAndAssignedRegions.add(enabledRegions.remove(0));
229 enabledAndAssignedRegions.add(enabledRegions.remove(0));
230 enabledAndAssignedRegions.add(closingRegion);
231
232 List<HRegionInfo> disabledAndAssignedRegions = new ArrayList<HRegionInfo>();
233 disabledAndAssignedRegions.add(disabledRegions.remove(0));
234 disabledAndAssignedRegions.add(disabledRegions.remove(0));
235
236
237 for (HRegionInfo hri : enabledAndAssignedRegions) {
238 master.assignmentManager.regionPlans.put(hri.getEncodedName(),
239 new RegionPlan(hri, null, serverName));
240 master.assignRegion(hri);
241 }
242 for (HRegionInfo hri : disabledAndAssignedRegions) {
243 master.assignmentManager.regionPlans.put(hri.getEncodedName(),
244 new RegionPlan(hri, null, serverName));
245 master.assignRegion(hri);
246 }
247
248
249 log("Waiting for assignment to finish");
250 ZKAssign.blockUntilNoRIT(zkw);
251 log("Assignment completed");
252
253
254 log("Aborting master");
255 cluster.abortMaster(0);
256 cluster.waitOnMaster(0);
257 log("Master has aborted");
258
259
260
261
262
263
264 List<HRegionInfo> regionsThatShouldBeOnline = new ArrayList<HRegionInfo>();
265 List<HRegionInfo> regionsThatShouldBeOffline = new ArrayList<HRegionInfo>();
266
267 log("Beginning to mock scenarios");
268
269
270 ZKTable zktable = new ZKTable(zkw);
271 zktable.setDisabledTable(disabledTable);
272
273
274
275
276
277
278
279
280 HRegionInfo region = enabledRegions.remove(0);
281 regionsThatShouldBeOnline.add(region);
282 ZKAssign.createNodeOffline(zkw, region, serverName);
283
284
285
286
287
288 regionsThatShouldBeOnline.add(closingRegion);
289 ZKAssign.createNodeClosing(zkw, closingRegion, serverName);
290
291
292
293
294
295
296
297 region = enabledRegions.remove(0);
298 regionsThatShouldBeOnline.add(region);
299 int version = ZKAssign.createNodeClosing(zkw, region, serverName);
300 ZKAssign.transitionNodeClosed(zkw, region, serverName, version);
301
302
303 region = disabledRegions.remove(0);
304 regionsThatShouldBeOffline.add(region);
305 version = ZKAssign.createNodeClosing(zkw, region, serverName);
306 ZKAssign.transitionNodeClosed(zkw, region, serverName, version);
307
308
309
310
311
312
313
314 region = enabledRegions.remove(0);
315 regionsThatShouldBeOnline.add(region);
316 ZKAssign.createNodeOffline(zkw, region, serverName);
317 ProtobufUtil.openRegion(hrs, region);
318 while (true) {
319 byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
320 RegionTransition rt = RegionTransition.parseFrom(bytes);
321 if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
322 break;
323 }
324 Thread.sleep(100);
325 }
326
327
328
329 region = disabledRegions.remove(0);
330 regionsThatShouldBeOffline.add(region);
331 ZKAssign.createNodeOffline(zkw, region, serverName);
332 ProtobufUtil.openRegion(hrs, region);
333 while (true) {
334 byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
335 RegionTransition rt = RegionTransition.parseFrom(bytes);
336 if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
337 break;
338 }
339 Thread.sleep(100);
340 }
341
342
343
344
345
346
347
348
349
350 log("Done mocking data up in ZK");
351
352
353 log("Starting up a new master");
354 master = cluster.startMaster().getMaster();
355 log("Waiting for master to be ready");
356 cluster.waitForActiveAndReadyMaster();
357 log("Master is ready");
358
359
360 log("Waiting for no more RIT");
361 ZKAssign.blockUntilNoRIT(zkw);
362 log("No more RIT in ZK, now doing final test verification");
363
364
365 Set<HRegionInfo> onlineRegions = new TreeSet<HRegionInfo>();
366 for (JVMClusterUtil.RegionServerThread rst :
367 cluster.getRegionServerThreads()) {
368 onlineRegions.addAll(ProtobufUtil.getOnlineRegions(rst.getRegionServer()));
369 }
370
371
372 for (HRegionInfo hri : regionsThatShouldBeOnline) {
373 assertTrue(onlineRegions.contains(hri));
374 }
375
376
377 for (HRegionInfo hri : regionsThatShouldBeOffline) {
378 assertFalse(onlineRegions.contains(hri));
379 }
380
381 log("Done with verification, all passed, shutting down cluster");
382
383
384 TEST_UTIL.shutdownMiniCluster();
385 }
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444 @Test (timeout=180000)
445 public void testMasterFailoverWithMockedRITOnDeadRS() throws Exception {
446
447 final int NUM_MASTERS = 1;
448 final int NUM_RS = 2;
449
450
451 HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
452 Configuration conf = TEST_UTIL.getConfiguration();
453
454 conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
455 conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 2);
456 TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
457 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
458 log("Cluster started");
459
460
461 ZooKeeperWatcher zkw = new ZooKeeperWatcher(TEST_UTIL.getConfiguration(),
462 "unittest", new Abortable() {
463
464 @Override
465 public void abort(String why, Throwable e) {
466 LOG.error("Fatal ZK Error: " + why, e);
467 org.junit.Assert.assertFalse("Fatal ZK error", true);
468 }
469
470 @Override
471 public boolean isAborted() {
472 return false;
473 }
474
475 });
476
477
478 List<MasterThread> masterThreads = cluster.getMasterThreads();
479 assertEquals(1, masterThreads.size());
480
481
482 assertTrue(cluster.waitForActiveAndReadyMaster());
483 HMaster master = masterThreads.get(0).getMaster();
484 assertTrue(master.isActiveMaster());
485 assertTrue(master.isInitialized());
486
487
488 master.balanceSwitch(false);
489
490
491 byte [] FAMILY = Bytes.toBytes("family");
492 byte[][] SPLIT_KEYS =
493 TEST_UTIL.getRegionSplitStartKeys(Bytes.toBytes("aaa"), Bytes.toBytes("zzz"), 30);
494
495 byte [] enabledTable = Bytes.toBytes("enabledTable");
496 HTableDescriptor htdEnabled = new HTableDescriptor(TableName.valueOf(enabledTable));
497 htdEnabled.addFamily(new HColumnDescriptor(FAMILY));
498 FileSystem filesystem = FileSystem.get(conf);
499 Path rootdir = FSUtils.getRootDir(conf);
500 FSTableDescriptors fstd = new FSTableDescriptors(filesystem, rootdir);
501
502 fstd.createTableDescriptor(htdEnabled);
503 HRegionInfo hriEnabled = new HRegionInfo(htdEnabled.getTableName(),
504 null, null);
505 createRegion(hriEnabled, rootdir, conf, htdEnabled);
506
507 List<HRegionInfo> enabledRegions = TEST_UTIL.createMultiRegionsInMeta(
508 TEST_UTIL.getConfiguration(), htdEnabled, SPLIT_KEYS);
509
510 TableName disabledTable =
511 TableName.valueOf("disabledTable");
512 HTableDescriptor htdDisabled = new HTableDescriptor(disabledTable);
513 htdDisabled.addFamily(new HColumnDescriptor(FAMILY));
514
515 fstd.createTableDescriptor(htdDisabled);
516 HRegionInfo hriDisabled = new HRegionInfo(htdDisabled.getTableName(), null, null);
517 createRegion(hriDisabled, rootdir, conf, htdDisabled);
518
519 List<HRegionInfo> disabledRegions = TEST_UTIL.createMultiRegionsInMeta(
520 TEST_UTIL.getConfiguration(), htdDisabled, SPLIT_KEYS);
521
522 log("Regions in META and Namespace have been created");
523
524
525 assertEquals(2, cluster.countServedRegions());
526
527
528 List<RegionServerThread> regionservers =
529 cluster.getRegionServerThreads();
530 HRegionServer hrs = regionservers.get(0).getRegionServer();
531
532
533 RegionServerThread hrsDeadThread = regionservers.get(1);
534 HRegionServer hrsDead = hrsDeadThread.getRegionServer();
535 ServerName deadServerName = hrsDead.getServerName();
536
537
538 List<HRegionInfo> enabledAndAssignedRegions = new ArrayList<HRegionInfo>();
539 enabledAndAssignedRegions.addAll(enabledRegions.subList(0, 6));
540 enabledRegions.removeAll(enabledAndAssignedRegions);
541 List<HRegionInfo> disabledAndAssignedRegions = new ArrayList<HRegionInfo>();
542 disabledAndAssignedRegions.addAll(disabledRegions.subList(0, 6));
543 disabledRegions.removeAll(disabledAndAssignedRegions);
544
545
546 for (HRegionInfo hri : enabledAndAssignedRegions) {
547 master.assignmentManager.regionPlans.put(hri.getEncodedName(),
548 new RegionPlan(hri, null, hrs.getServerName()));
549 master.assignRegion(hri);
550 }
551 for (HRegionInfo hri : disabledAndAssignedRegions) {
552 master.assignmentManager.regionPlans.put(hri.getEncodedName(),
553 new RegionPlan(hri, null, hrs.getServerName()));
554 master.assignRegion(hri);
555 }
556
557 log("Waiting for assignment to finish");
558 ZKAssign.blockUntilNoRIT(zkw);
559 master.assignmentManager.waitUntilNoRegionsInTransition(60000);
560 log("Assignment completed");
561
562 assertTrue(" Table must be enabled.", master.getAssignmentManager()
563 .getZKTable().isEnabledTable(TableName.valueOf("enabledTable")));
564
565 List<HRegionInfo> enabledAndOnDeadRegions = new ArrayList<HRegionInfo>();
566 enabledAndOnDeadRegions.addAll(enabledRegions.subList(0, 6));
567 enabledRegions.removeAll(enabledAndOnDeadRegions);
568 List<HRegionInfo> disabledAndOnDeadRegions = new ArrayList<HRegionInfo>();
569 disabledAndOnDeadRegions.addAll(disabledRegions.subList(0, 6));
570 disabledRegions.removeAll(disabledAndOnDeadRegions);
571
572
573 for (HRegionInfo hri : enabledAndOnDeadRegions) {
574 master.assignmentManager.regionPlans.put(hri.getEncodedName(),
575 new RegionPlan(hri, null, deadServerName));
576 master.assignRegion(hri);
577 }
578 for (HRegionInfo hri : disabledAndOnDeadRegions) {
579 master.assignmentManager.regionPlans.put(hri.getEncodedName(),
580 new RegionPlan(hri, null, deadServerName));
581 master.assignRegion(hri);
582 }
583
584
585 log("Waiting for assignment to finish");
586 ZKAssign.blockUntilNoRIT(zkw);
587 master.assignmentManager.waitUntilNoRegionsInTransition(60000);
588 log("Assignment completed");
589
590
591
592 verifyRegionLocation(hrs, enabledAndAssignedRegions);
593 verifyRegionLocation(hrs, disabledAndAssignedRegions);
594 verifyRegionLocation(hrsDead, enabledAndOnDeadRegions);
595 verifyRegionLocation(hrsDead, disabledAndOnDeadRegions);
596
597 assertTrue(" Didn't get enough regions of enabledTalbe on live rs.",
598 enabledAndAssignedRegions.size() >= 2);
599 assertTrue(" Didn't get enough regions of disalbedTable on live rs.",
600 disabledAndAssignedRegions.size() >= 2);
601 assertTrue(" Didn't get enough regions of enabledTalbe on dead rs.",
602 enabledAndOnDeadRegions.size() >= 2);
603 assertTrue(" Didn't get enough regions of disalbedTable on dead rs.",
604 disabledAndOnDeadRegions.size() >= 2);
605
606
607 log("Aborting master");
608 cluster.abortMaster(0);
609 cluster.waitOnMaster(0);
610 log("Master has aborted");
611
612
613
614
615
616
617 List<HRegionInfo> regionsThatShouldBeOnline = new ArrayList<HRegionInfo>();
618 List<HRegionInfo> regionsThatShouldBeOffline = new ArrayList<HRegionInfo>();
619
620 log("Beginning to mock scenarios");
621
622
623 ZKTable zktable = new ZKTable(zkw);
624 zktable.setDisabledTable(disabledTable);
625
626 assertTrue(" The enabled table should be identified on master fail over.",
627 zktable.isEnabledTable(TableName.valueOf("enabledTable")));
628
629
630
631
632
633
634 HRegionInfo region = enabledAndOnDeadRegions.remove(0);
635 regionsThatShouldBeOnline.add(region);
636 ZKAssign.createNodeClosing(zkw, region, deadServerName);
637 LOG.debug("\n\nRegion of enabled table was CLOSING on dead RS\n" +
638 region + "\n\n");
639
640
641 region = disabledAndOnDeadRegions.remove(0);
642 regionsThatShouldBeOffline.add(region);
643 ZKAssign.createNodeClosing(zkw, region, deadServerName);
644 LOG.debug("\n\nRegion of disabled table was CLOSING on dead RS\n" +
645 region + "\n\n");
646
647
648
649
650
651
652 region = enabledAndOnDeadRegions.remove(0);
653 regionsThatShouldBeOnline.add(region);
654 int version = ZKAssign.createNodeClosing(zkw, region, deadServerName);
655 ZKAssign.transitionNodeClosed(zkw, region, deadServerName, version);
656 LOG.debug("\n\nRegion of enabled table was CLOSED on dead RS\n" +
657 region + "\n\n");
658
659
660 region = disabledAndOnDeadRegions.remove(0);
661 regionsThatShouldBeOffline.add(region);
662 version = ZKAssign.createNodeClosing(zkw, region, deadServerName);
663 ZKAssign.transitionNodeClosed(zkw, region, deadServerName, version);
664 LOG.debug("\n\nRegion of disabled table was CLOSED on dead RS\n" +
665 region + "\n\n");
666
667
668
669
670
671
672 region = enabledRegions.remove(0);
673 regionsThatShouldBeOnline.add(region);
674 ZKAssign.createNodeOffline(zkw, region, deadServerName);
675 ZKAssign.transitionNodeOpening(zkw, region, deadServerName);
676 LOG.debug("\n\nRegion of enabled table was OPENING on dead RS\n" +
677 region + "\n\n");
678
679
680 region = disabledRegions.remove(0);
681 regionsThatShouldBeOffline.add(region);
682 ZKAssign.createNodeOffline(zkw, region, deadServerName);
683 ZKAssign.transitionNodeOpening(zkw, region, deadServerName);
684 LOG.debug("\n\nRegion of disabled table was OPENING on dead RS\n" +
685 region + "\n\n");
686
687
688
689
690
691
692 region = enabledRegions.remove(0);
693 regionsThatShouldBeOnline.add(region);
694 ZKAssign.createNodeOffline(zkw, region, deadServerName);
695 ProtobufUtil.openRegion(hrsDead, region);
696 while (true) {
697 byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
698 RegionTransition rt = RegionTransition.parseFrom(bytes);
699 if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
700 break;
701 }
702 Thread.sleep(100);
703 }
704 LOG.debug("\n\nRegion of enabled table was OPENED on dead RS\n" +
705 region + "\n\n");
706
707
708 region = disabledRegions.remove(0);
709 regionsThatShouldBeOffline.add(region);
710 ZKAssign.createNodeOffline(zkw, region, deadServerName);
711 ProtobufUtil.openRegion(hrsDead, region);
712 while (true) {
713 byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
714 RegionTransition rt = RegionTransition.parseFrom(bytes);
715 if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
716 break;
717 }
718 Thread.sleep(100);
719 }
720 LOG.debug("\n\nRegion of disabled table was OPENED on dead RS\n" +
721 region + "\n\n");
722
723
724
725
726
727
728 region = enabledRegions.remove(0);
729 regionsThatShouldBeOnline.add(region);
730 ZKAssign.createNodeOffline(zkw, region, deadServerName);
731 ProtobufUtil.openRegion(hrsDead, region);
732 while (true) {
733 byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
734 RegionTransition rt = RegionTransition.parseFrom(bytes);
735 if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
736 ZKAssign.deleteOpenedNode(zkw, region.getEncodedName());
737 LOG.debug("DELETED " + rt);
738 break;
739 }
740 Thread.sleep(100);
741 }
742 LOG.debug("\n\nRegion of enabled table was open at steady-state on dead RS"
743 + "\n" + region + "\n\n");
744
745
746 region = disabledRegions.remove(0);
747 regionsThatShouldBeOffline.add(region);
748 ZKAssign.createNodeOffline(zkw, region, deadServerName);
749 ProtobufUtil.openRegion(hrsDead, region);
750 while (true) {
751 byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
752 RegionTransition rt = RegionTransition.parseFrom(bytes);
753 if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
754 ZKAssign.deleteOpenedNode(zkw, region.getEncodedName());
755 break;
756 }
757 Thread.sleep(100);
758 }
759 LOG.debug("\n\nRegion of disabled table was open at steady-state on dead RS"
760 + "\n" + region + "\n\n");
761
762
763
764
765
766 log("Done mocking data up in ZK");
767
768
769 log("Killing RS " + deadServerName);
770 hrsDead.abort("Killing for unit test");
771 log("RS " + deadServerName + " killed");
772
773
774
775 while (hrsDeadThread.isAlive()) {
776 Threads.sleep(10);
777 }
778 log("Starting up a new master");
779 master = cluster.startMaster().getMaster();
780 log("Waiting for master to be ready");
781 assertTrue(cluster.waitForActiveAndReadyMaster());
782 log("Master is ready");
783
784
785 while (master.getServerManager().areDeadServersInProgress()) {
786 Thread.sleep(10);
787 }
788
789
790 log("Waiting for no more RIT");
791 ZKAssign.blockUntilNoRIT(zkw);
792 log("No more RIT in ZK");
793 long now = System.currentTimeMillis();
794 long maxTime = 120000;
795 boolean done = master.assignmentManager.waitUntilNoRegionsInTransition(maxTime);
796 if (!done) {
797 LOG.info("rit=" + master.getAssignmentManager().getRegionStates().getRegionsInTransition());
798 }
799 long elapsed = System.currentTimeMillis() - now;
800 assertTrue("Elapsed=" + elapsed + ", maxTime=" + maxTime + ", done=" + done,
801 elapsed < maxTime);
802 log("No more RIT in RIT map, doing final test verification");
803
804
805 Set<HRegionInfo> onlineRegions = new TreeSet<HRegionInfo>();
806 now = System.currentTimeMillis();
807 maxTime = 30000;
808 for (JVMClusterUtil.RegionServerThread rst :
809 cluster.getRegionServerThreads()) {
810 try {
811 HRegionServer rs = rst.getRegionServer();
812 while (!rs.getRegionsInTransitionInRS().isEmpty()) {
813 elapsed = System.currentTimeMillis() - now;
814 assertTrue("Test timed out in getting online regions", elapsed < maxTime);
815 if (rs.isAborted() || rs.isStopped()) {
816
817 break;
818 }
819 Thread.sleep(100);
820 }
821 onlineRegions.addAll(ProtobufUtil.getOnlineRegions(rs));
822 } catch (RegionServerStoppedException e) {
823 LOG.info("Got RegionServerStoppedException", e);
824 }
825 }
826
827
828 for (HRegionInfo hri : regionsThatShouldBeOnline) {
829 assertTrue("region=" + hri.getRegionNameAsString() + ", " + onlineRegions.toString(),
830 onlineRegions.contains(hri));
831 }
832
833
834 for (HRegionInfo hri : regionsThatShouldBeOffline) {
835 assertFalse(onlineRegions.contains(hri));
836 }
837
838 log("Done with verification, all passed, shutting down cluster");
839
840
841 TEST_UTIL.shutdownMiniCluster();
842 }
843
844
845
846
847 private void verifyRegionLocation(HRegionServer hrs, List<HRegionInfo> regions)
848 throws IOException {
849 List<HRegionInfo> tmpOnlineRegions = ProtobufUtil.getOnlineRegions(hrs);
850 Iterator<HRegionInfo> itr = regions.iterator();
851 while (itr.hasNext()) {
852 HRegionInfo tmp = itr.next();
853 if (!tmpOnlineRegions.contains(tmp)) {
854 itr.remove();
855 }
856 }
857 }
858
859 HRegion createRegion(final HRegionInfo hri, final Path rootdir, final Configuration c,
860 final HTableDescriptor htd)
861 throws IOException {
862 HRegion r = HRegion.createHRegion(hri, rootdir, c, htd);
863
864
865
866
867
868 HRegion.closeHRegion(r);
869 return r;
870 }
871
872
873
874
875 private void log(String string) {
876 LOG.info("\n\n" + string + " \n\n");
877 }
878
879 @Test (timeout=180000)
880 public void testShouldCheckMasterFailOverWhenMETAIsInOpenedState()
881 throws Exception {
882 LOG.info("Starting testShouldCheckMasterFailOverWhenMETAIsInOpenedState");
883 final int NUM_MASTERS = 1;
884 final int NUM_RS = 2;
885
886
887 HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
888 Configuration conf = TEST_UTIL.getConfiguration();
889 conf.setInt("hbase.master.info.port", -1);
890
891 TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
892 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
893
894
895 List<RegionServerThread> regionServerThreads =
896 cluster.getRegionServerThreads();
897 int count = -1;
898 HRegion metaRegion = null;
899 for (RegionServerThread regionServerThread : regionServerThreads) {
900 HRegionServer regionServer = regionServerThread.getRegionServer();
901 metaRegion = regionServer.getOnlineRegion(HRegionInfo.FIRST_META_REGIONINFO.getRegionName());
902 count++;
903 regionServer.abort("");
904 if (null != metaRegion) break;
905 }
906 HRegionServer regionServer = cluster.getRegionServer(count);
907
908 TEST_UTIL.shutdownMiniHBaseCluster();
909
910
911 ZooKeeperWatcher zkw =
912 HBaseTestingUtility.createAndForceNodeToOpenedState(TEST_UTIL,
913 metaRegion, regionServer.getServerName());
914
915 LOG.info("Staring cluster for second time");
916 TEST_UTIL.startMiniHBaseCluster(NUM_MASTERS, NUM_RS);
917
918 HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
919 while (!master.isInitialized()) {
920 Thread.sleep(100);
921 }
922
923 log("Waiting for no more RIT");
924 ZKAssign.blockUntilNoRIT(zkw);
925
926 zkw.close();
927
928 TEST_UTIL.shutdownMiniCluster();
929 }
930
931
932
933
934
935
936
937
938
939 @Test (timeout=240000)
940 public void testSimpleMasterFailover() throws Exception {
941
942 final int NUM_MASTERS = 3;
943 final int NUM_RS = 3;
944
945
946 HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
947
948 TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
949 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
950
951
952 List<MasterThread> masterThreads = cluster.getMasterThreads();
953
954
955 for (MasterThread mt : masterThreads) {
956 assertTrue(mt.isAlive());
957 }
958
959
960 int numActive = 0;
961 int activeIndex = -1;
962 ServerName activeName = null;
963 HMaster active = null;
964 for (int i = 0; i < masterThreads.size(); i++) {
965 if (masterThreads.get(i).getMaster().isActiveMaster()) {
966 numActive++;
967 activeIndex = i;
968 active = masterThreads.get(activeIndex).getMaster();
969 activeName = active.getServerName();
970 }
971 }
972 assertEquals(1, numActive);
973 assertEquals(NUM_MASTERS, masterThreads.size());
974 LOG.info("Active master " + activeName);
975
976
977 assertNotNull(active);
978 ClusterStatus status = active.getClusterStatus();
979 assertTrue(status.getMaster().equals(activeName));
980 assertEquals(2, status.getBackupMastersSize());
981 assertEquals(2, status.getBackupMasters().size());
982
983
984 int backupIndex = (activeIndex == 0 ? 1 : activeIndex - 1);
985 HMaster master = cluster.getMaster(backupIndex);
986 LOG.debug("\n\nStopping a backup master: " + master.getServerName() + "\n");
987 cluster.stopMaster(backupIndex, false);
988 cluster.waitOnMaster(backupIndex);
989
990
991 for (int i = 0; i < masterThreads.size(); i++) {
992 if (masterThreads.get(i).getMaster().isActiveMaster()) {
993 assertTrue(activeName.equals(masterThreads.get(i).getMaster().getServerName()));
994 activeIndex = i;
995 active = masterThreads.get(activeIndex).getMaster();
996 }
997 }
998 assertEquals(1, numActive);
999 assertEquals(2, masterThreads.size());
1000 int rsCount = masterThreads.get(activeIndex).getMaster().getClusterStatus().getServersSize();
1001 LOG.info("Active master " + active.getServerName() + " managing " + rsCount + " regions servers");
1002 assertEquals(3, rsCount);
1003
1004
1005 assertNotNull(active);
1006 status = active.getClusterStatus();
1007 assertTrue(status.getMaster().equals(activeName));
1008 assertEquals(1, status.getBackupMastersSize());
1009 assertEquals(1, status.getBackupMasters().size());
1010
1011
1012 LOG.debug("\n\nStopping the active master " + active.getServerName() + "\n");
1013 cluster.stopMaster(activeIndex, false);
1014 cluster.waitOnMaster(activeIndex);
1015
1016
1017 assertTrue(cluster.waitForActiveAndReadyMaster());
1018
1019 LOG.debug("\n\nVerifying backup master is now active\n");
1020
1021 assertEquals(1, masterThreads.size());
1022
1023
1024 active = masterThreads.get(0).getMaster();
1025 assertNotNull(active);
1026 status = active.getClusterStatus();
1027 ServerName mastername = status.getMaster();
1028 assertTrue(mastername.equals(active.getServerName()));
1029 assertTrue(active.isActiveMaster());
1030 assertEquals(0, status.getBackupMastersSize());
1031 assertEquals(0, status.getBackupMasters().size());
1032 int rss = status.getServersSize();
1033 LOG.info("Active master " + mastername.getServerName() + " managing " +
1034 rss + " region servers");
1035 assertEquals(3, rss);
1036
1037
1038 TEST_UTIL.shutdownMiniCluster();
1039 }
1040 }
1041