1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.master;
20
21 import static org.junit.Assert.assertEquals;
22
23 import java.io.IOException;
24 import java.util.List;
25 import java.util.NavigableSet;
26 import java.util.Set;
27 import java.util.TreeSet;
28
29 import org.apache.commons.logging.Log;
30 import org.apache.commons.logging.LogFactory;
31 import org.apache.hadoop.conf.Configuration;
32 import org.apache.hadoop.hbase.HBaseConfiguration;
33 import org.apache.hadoop.hbase.HBaseTestingUtility;
34 import org.apache.hadoop.hbase.HRegionInfo;
35 import org.apache.hadoop.hbase.testclassification.LargeTests;
36 import org.apache.hadoop.hbase.MiniHBaseCluster;
37 import org.apache.hadoop.hbase.ServerName;
38 import org.apache.hadoop.hbase.TableName;
39 import org.apache.hadoop.hbase.client.HTable;
40 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
41 import org.apache.hadoop.hbase.util.Bytes;
42 import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
43 import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
44 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
45 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
46 import org.apache.zookeeper.KeeperException;
47 import org.junit.Test;
48 import org.junit.experimental.categories.Category;
49
50
51
52
53 @Category(LargeTests.class)
54 public class TestRollingRestart {
55 private static final Log LOG = LogFactory.getLog(TestRollingRestart.class);
56
57 @Test (timeout=500000)
58 public void testBasicRollingRestart() throws Exception {
59
60
61 final int NUM_MASTERS = 2;
62 final int NUM_RS = 3;
63 final int NUM_REGIONS_TO_CREATE = 20;
64
65 int expectedNumRS = 3;
66
67
68 log("Starting cluster");
69 Configuration conf = HBaseConfiguration.create();
70 HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
71 TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
72 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
73 log("Waiting for active/ready master");
74 cluster.waitForActiveAndReadyMaster();
75 ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "testRollingRestart",
76 null);
77 HMaster master = cluster.getMaster();
78
79
80 TableName table = TableName.valueOf("tableRestart");
81 byte [] family = Bytes.toBytes("family");
82 log("Creating table with " + NUM_REGIONS_TO_CREATE + " regions");
83 HTable ht = TEST_UTIL.createTable(table, family);
84 int numRegions = TEST_UTIL.createMultiRegions(conf, ht, family,
85 NUM_REGIONS_TO_CREATE);
86 numRegions += 1;
87 log("Waiting for no more RIT\n");
88 blockUntilNoRIT(zkw, master);
89 log("Disabling table\n");
90 TEST_UTIL.getHBaseAdmin().disableTable(table);
91 log("Waiting for no more RIT\n");
92 blockUntilNoRIT(zkw, master);
93 NavigableSet<String> regions = HBaseTestingUtility.getAllOnlineRegions(cluster);
94 log("Verifying only catalog and namespace regions are assigned\n");
95 if (regions.size() != 2) {
96 for (String oregion : regions) log("Region still online: " + oregion);
97 }
98 assertEquals(2, regions.size());
99 log("Enabling table\n");
100 TEST_UTIL.getHBaseAdmin().enableTable(table);
101 log("Waiting for no more RIT\n");
102 blockUntilNoRIT(zkw, master);
103 log("Verifying there are " + numRegions + " assigned on cluster\n");
104 regions = HBaseTestingUtility.getAllOnlineRegions(cluster);
105 assertRegionsAssigned(cluster, regions);
106 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
107
108
109 log("Adding a fourth RS");
110 RegionServerThread restarted = cluster.startRegionServer();
111 expectedNumRS++;
112 restarted.waitForServerOnline();
113 log("Additional RS is online");
114 log("Waiting for no more RIT");
115 blockUntilNoRIT(zkw, master);
116 log("Verifying there are " + numRegions + " assigned on cluster");
117 assertRegionsAssigned(cluster, regions);
118 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
119
120
121 List<MasterThread> masterThreads = cluster.getMasterThreads();
122 MasterThread activeMaster = null;
123 MasterThread backupMaster = null;
124 assertEquals(2, masterThreads.size());
125 if (masterThreads.get(0).getMaster().isActiveMaster()) {
126 activeMaster = masterThreads.get(0);
127 backupMaster = masterThreads.get(1);
128 } else {
129 activeMaster = masterThreads.get(1);
130 backupMaster = masterThreads.get(0);
131 }
132
133
134 log("Stopping backup master\n\n");
135 backupMaster.getMaster().stop("Stop of backup during rolling restart");
136 cluster.hbaseCluster.waitOnMaster(backupMaster);
137
138
139 log("Stopping primary master\n\n");
140 activeMaster.getMaster().stop("Stop of active during rolling restart");
141 cluster.hbaseCluster.waitOnMaster(activeMaster);
142
143
144 log("Restarting primary master\n\n");
145 activeMaster = cluster.startMaster();
146 cluster.waitForActiveAndReadyMaster();
147 master = activeMaster.getMaster();
148
149
150 log("Restarting backup master\n\n");
151 backupMaster = cluster.startMaster();
152
153 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
154
155
156
157
158 List<RegionServerThread> regionServers =
159 cluster.getLiveRegionServerThreads();
160 int num = 1;
161 int total = regionServers.size();
162 for (RegionServerThread rst : regionServers) {
163 ServerName serverName = rst.getRegionServer().getServerName();
164 log("Stopping region server " + num + " of " + total + " [ " +
165 serverName + "]");
166 rst.getRegionServer().stop("Stopping RS during rolling restart");
167 cluster.hbaseCluster.waitOnRegionServer(rst);
168 log("Waiting for RS shutdown to be handled by master");
169 waitForRSShutdownToStartAndFinish(activeMaster, serverName);
170 log("RS shutdown done, waiting for no more RIT");
171 blockUntilNoRIT(zkw, master);
172 log("Verifying there are " + numRegions + " assigned on cluster");
173 assertRegionsAssigned(cluster, regions);
174 expectedNumRS--;
175 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
176 log("Restarting region server " + num + " of " + total);
177 restarted = cluster.startRegionServer();
178 restarted.waitForServerOnline();
179 expectedNumRS++;
180 log("Region server " + num + " is back online");
181 log("Waiting for no more RIT");
182 blockUntilNoRIT(zkw, master);
183 log("Verifying there are " + numRegions + " assigned on cluster");
184 assertRegionsAssigned(cluster, regions);
185 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
186 num++;
187 }
188 Thread.sleep(1000);
189 assertRegionsAssigned(cluster, regions);
190
191
192
193 ht.close();
194
195 TEST_UTIL.shutdownMiniCluster();
196 }
197
198 private void blockUntilNoRIT(ZooKeeperWatcher zkw, HMaster master)
199 throws KeeperException, InterruptedException {
200 ZKAssign.blockUntilNoRIT(zkw);
201 master.assignmentManager.waitUntilNoRegionsInTransition(60000);
202 }
203
204 private void waitForRSShutdownToStartAndFinish(MasterThread activeMaster,
205 ServerName serverName) throws InterruptedException {
206 ServerManager sm = activeMaster.getMaster().getServerManager();
207
208 while (!sm.getDeadServers().isDeadServer(serverName)) {
209 log("Waiting for [" + serverName + "] to be listed as dead in master");
210 Thread.sleep(1);
211 }
212 log("Server [" + serverName + "] marked as dead, waiting for it to " +
213 "finish dead processing");
214 while (sm.areDeadServersInProgress()) {
215 log("Server [" + serverName + "] still being processed, waiting");
216 Thread.sleep(100);
217 }
218 log("Server [" + serverName + "] done with server shutdown processing");
219 }
220
221 private void log(String msg) {
222 LOG.debug("\n\nTRR: " + msg + "\n");
223 }
224
225 private int getNumberOfOnlineRegions(MiniHBaseCluster cluster) {
226 int numFound = 0;
227 for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
228 numFound += rst.getRegionServer().getNumberOfOnlineRegions();
229 }
230 for (MasterThread mt : cluster.getMasterThreads()) {
231 numFound += mt.getMaster().getNumberOfOnlineRegions();
232 }
233 return numFound;
234 }
235
236 private void assertRegionsAssigned(MiniHBaseCluster cluster,
237 Set<String> expectedRegions) throws IOException {
238 int numFound = getNumberOfOnlineRegions(cluster);
239 if (expectedRegions.size() > numFound) {
240 log("Expected to find " + expectedRegions.size() + " but only found"
241 + " " + numFound);
242 NavigableSet<String> foundRegions =
243 HBaseTestingUtility.getAllOnlineRegions(cluster);
244 for (String region : expectedRegions) {
245 if (!foundRegions.contains(region)) {
246 log("Missing region: " + region);
247 }
248 }
249 assertEquals(expectedRegions.size(), numFound);
250 } else if (expectedRegions.size() < numFound) {
251 int doubled = numFound - expectedRegions.size();
252 log("Expected to find " + expectedRegions.size() + " but found"
253 + " " + numFound + " (" + doubled + " double assignments?)");
254 NavigableSet<String> doubleRegions = getDoubleAssignedRegions(cluster);
255 for (String region : doubleRegions) {
256 log("Region is double assigned: " + region);
257 }
258 assertEquals(expectedRegions.size(), numFound);
259 } else {
260 log("Success! Found expected number of " + numFound + " regions");
261 }
262 }
263
264 private NavigableSet<String> getDoubleAssignedRegions(
265 MiniHBaseCluster cluster) throws IOException {
266 NavigableSet<String> online = new TreeSet<String>();
267 NavigableSet<String> doubled = new TreeSet<String>();
268 for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
269 for (HRegionInfo region : ProtobufUtil.getOnlineRegions(
270 rst.getRegionServer().getRSRpcServices())) {
271 if(!online.add(region.getRegionNameAsString())) {
272 doubled.add(region.getRegionNameAsString());
273 }
274 }
275 }
276 return doubled;
277 }
278
279
280 }
281