1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package org.apache.hadoop.hbase.master;
21
22 import static org.junit.Assert.*;
23
24 import java.io.IOException;
25 import java.util.List;
26 import java.util.NavigableSet;
27 import java.util.Set;
28 import java.util.TreeSet;
29
30 import org.apache.commons.logging.Log;
31 import org.apache.commons.logging.LogFactory;
32 import org.apache.hadoop.conf.Configuration;
33 import org.apache.hadoop.hbase.*;
34 import org.apache.hadoop.hbase.client.HTable;
35 import org.apache.hadoop.hbase.util.Bytes;
36 import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
37 import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
38 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
39 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
40 import org.apache.zookeeper.KeeperException;
41 import org.junit.Test;
42 import org.junit.experimental.categories.Category;
43
44
45
46
47 @Category(LargeTests.class)
48 public class TestRollingRestart {
49 private static final Log LOG = LogFactory.getLog(TestRollingRestart.class);
50
51 @Test (timeout=300000)
52 public void testBasicRollingRestart() throws Exception {
53
54
55 final int NUM_MASTERS = 2;
56 final int NUM_RS = 3;
57 final int NUM_REGIONS_TO_CREATE = 20;
58
59 int expectedNumRS = 3;
60
61
62 log("Starting cluster");
63 Configuration conf = HBaseConfiguration.create();
64 conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000);
65 conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 5000);
66 HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
67 TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
68 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
69 log("Waiting for active/ready master");
70 cluster.waitForActiveAndReadyMaster();
71 ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "testRollingRestart",
72 null);
73 HMaster master = cluster.getMaster();
74
75
76 byte [] table = Bytes.toBytes("tableRestart");
77 byte [] family = Bytes.toBytes("family");
78 log("Creating table with " + NUM_REGIONS_TO_CREATE + " regions");
79 HTable ht = TEST_UTIL.createTable(table, family);
80 int numRegions = TEST_UTIL.createMultiRegions(conf, ht, family,
81 NUM_REGIONS_TO_CREATE);
82 numRegions += 2;
83 log("Waiting for no more RIT\n");
84 blockUntilNoRIT(zkw, master);
85 log("Disabling table\n");
86 TEST_UTIL.getHBaseAdmin().disableTable(table);
87 log("Waiting for no more RIT\n");
88 blockUntilNoRIT(zkw, master);
89 NavigableSet<String> regions = getAllOnlineRegions(cluster);
90 log("Verifying only catalog regions are assigned\n");
91 if (regions.size() != 2) {
92 for (String oregion : regions) log("Region still online: " + oregion);
93 }
94 assertEquals(2, regions.size());
95 log("Enabling table\n");
96 TEST_UTIL.getHBaseAdmin().enableTable(table);
97 log("Waiting for no more RIT\n");
98 blockUntilNoRIT(zkw, master);
99 log("Verifying there are " + numRegions + " assigned on cluster\n");
100 regions = getAllOnlineRegions(cluster);
101 assertRegionsAssigned(cluster, regions);
102 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
103
104
105 log("Adding a fourth RS");
106 RegionServerThread restarted = cluster.startRegionServer();
107 expectedNumRS++;
108 restarted.waitForServerOnline();
109 log("Additional RS is online");
110 log("Waiting for no more RIT");
111 blockUntilNoRIT(zkw, master);
112 log("Verifying there are " + numRegions + " assigned on cluster");
113 assertRegionsAssigned(cluster, regions);
114 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
115
116
117 List<MasterThread> masterThreads = cluster.getMasterThreads();
118 MasterThread activeMaster = null;
119 MasterThread backupMaster = null;
120 assertEquals(2, masterThreads.size());
121 if (masterThreads.get(0).getMaster().isActiveMaster()) {
122 activeMaster = masterThreads.get(0);
123 backupMaster = masterThreads.get(1);
124 } else {
125 activeMaster = masterThreads.get(1);
126 backupMaster = masterThreads.get(0);
127 }
128
129
130 log("Stopping backup master\n\n");
131 backupMaster.getMaster().stop("Stop of backup during rolling restart");
132 cluster.hbaseCluster.waitOnMaster(backupMaster);
133
134
135 log("Stopping primary master\n\n");
136 activeMaster.getMaster().stop("Stop of active during rolling restart");
137 cluster.hbaseCluster.waitOnMaster(activeMaster);
138
139
140 log("Restarting primary master\n\n");
141 activeMaster = cluster.startMaster();
142 cluster.waitForActiveAndReadyMaster();
143 master = activeMaster.getMaster();
144
145
146 log("Restarting backup master\n\n");
147 backupMaster = cluster.startMaster();
148
149 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
150
151
152
153
154 List<RegionServerThread> regionServers =
155 cluster.getLiveRegionServerThreads();
156 int num = 1;
157 int total = regionServers.size();
158 for (RegionServerThread rst : regionServers) {
159 ServerName serverName = rst.getRegionServer().getServerName();
160 log("Stopping region server " + num + " of " + total + " [ " +
161 serverName + "]");
162 rst.getRegionServer().stop("Stopping RS during rolling restart");
163 cluster.hbaseCluster.waitOnRegionServer(rst);
164 log("Waiting for RS shutdown to be handled by master");
165 waitForRSShutdownToStartAndFinish(activeMaster, serverName);
166 log("RS shutdown done, waiting for no more RIT");
167 blockUntilNoRIT(zkw, master);
168 log("Verifying there are " + numRegions + " assigned on cluster");
169 assertRegionsAssigned(cluster, regions);
170 expectedNumRS--;
171 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
172 log("Restarting region server " + num + " of " + total);
173 restarted = cluster.startRegionServer();
174 restarted.waitForServerOnline();
175 expectedNumRS++;
176 log("Region server " + num + " is back online");
177 log("Waiting for no more RIT");
178 blockUntilNoRIT(zkw, master);
179 log("Verifying there are " + numRegions + " assigned on cluster");
180 assertRegionsAssigned(cluster, regions);
181 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
182 num++;
183 }
184 Thread.sleep(2000);
185 assertRegionsAssigned(cluster, regions);
186
187
188 RegionServerThread rootServer = getServerHostingRoot(cluster);
189 RegionServerThread metaServer = getServerHostingMeta(cluster);
190 if (rootServer == metaServer) {
191 log("ROOT and META on the same server so killing another random server");
192 int i=0;
193 while (rootServer == metaServer) {
194 metaServer = cluster.getRegionServerThreads().get(i);
195 i++;
196 }
197 }
198 log("Stopping server hosting ROOT");
199 rootServer.getRegionServer().stop("Stopping ROOT server");
200 log("Stopping server hosting META #1");
201 metaServer.getRegionServer().stop("Stopping META server");
202 cluster.hbaseCluster.waitOnRegionServer(rootServer);
203 log("Root server down");
204 cluster.hbaseCluster.waitOnRegionServer(metaServer);
205 log("Meta server down #1");
206 expectedNumRS -= 2;
207 log("Waiting for meta server #1 RS shutdown to be handled by master");
208 waitForRSShutdownToStartAndFinish(activeMaster,
209 metaServer.getRegionServer().getServerName());
210 log("Waiting for no more RIT");
211 long start = System.currentTimeMillis();
212 do {
213 blockUntilNoRIT(zkw, master);
214 } while (getNumberOfOnlineRegions(cluster) < numRegions
215 && System.currentTimeMillis()-start < 60000);
216 log("Verifying there are " + numRegions + " assigned on cluster");
217 assertRegionsAssigned(cluster, regions);
218 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
219
220
221 metaServer = getServerHostingMeta(cluster);
222 log("Stopping server hosting META #2");
223 metaServer.getRegionServer().stop("Stopping META server");
224 cluster.hbaseCluster.waitOnRegionServer(metaServer);
225 log("Meta server down");
226 expectedNumRS--;
227 log("Waiting for RS shutdown to be handled by master");
228 waitForRSShutdownToStartAndFinish(activeMaster,
229 metaServer.getRegionServer().getServerName());
230 log("RS shutdown done, waiting for no more RIT");
231 blockUntilNoRIT(zkw, master);
232 log("Verifying there are " + numRegions + " assigned on cluster");
233 assertRegionsAssigned(cluster, regions);
234 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
235
236
237 cluster.startRegionServer().waitForServerOnline();
238 cluster.startRegionServer().waitForServerOnline();
239 cluster.startRegionServer().waitForServerOnline();
240 Thread.sleep(1000);
241 log("Waiting for no more RIT");
242 blockUntilNoRIT(zkw, master);
243 log("Verifying there are " + numRegions + " assigned on cluster");
244 assertRegionsAssigned(cluster, regions);
245
246 metaServer = getServerHostingMeta(cluster);
247 log("Stopping server hosting META (1 of 3)");
248 metaServer.getRegionServer().stop("Stopping META server");
249 cluster.hbaseCluster.waitOnRegionServer(metaServer);
250 log("Meta server down (1 of 3)");
251 log("Waiting for RS shutdown to be handled by master");
252 waitForRSShutdownToStartAndFinish(activeMaster,
253 metaServer.getRegionServer().getServerName());
254 log("RS shutdown done, waiting for no more RIT");
255 blockUntilNoRIT(zkw, master);
256 log("Verifying there are " + numRegions + " assigned on cluster");
257 assertRegionsAssigned(cluster, regions);
258
259
260 metaServer = getServerHostingMeta(cluster);
261 log("Stopping server hosting META (2 of 3)");
262 metaServer.getRegionServer().stop("Stopping META server");
263 cluster.hbaseCluster.waitOnRegionServer(metaServer);
264 log("Meta server down (2 of 3)");
265 log("Waiting for RS shutdown to be handled by master");
266 waitForRSShutdownToStartAndFinish(activeMaster,
267 metaServer.getRegionServer().getServerName());
268 log("RS shutdown done, waiting for no more RIT");
269 blockUntilNoRIT(zkw, master);
270 log("Verifying there are " + numRegions + " assigned on cluster");
271 assertRegionsAssigned(cluster, regions);
272
273
274 metaServer = getServerHostingMeta(cluster);
275 log("Stopping server hosting META (3 of 3)");
276 metaServer.getRegionServer().stop("Stopping META server");
277 cluster.hbaseCluster.waitOnRegionServer(metaServer);
278 log("Meta server down (3 of 3)");
279 log("Waiting for RS shutdown to be handled by master");
280 waitForRSShutdownToStartAndFinish(activeMaster,
281 metaServer.getRegionServer().getServerName());
282 log("RS shutdown done, waiting for no more RIT");
283 blockUntilNoRIT(zkw, master);
284 log("Verifying there are " + numRegions + " assigned on cluster");
285 assertRegionsAssigned(cluster, regions);
286
287 if (cluster.getRegionServerThreads().size() != 1) {
288 log("Online regionservers:");
289 for (RegionServerThread rst : cluster.getRegionServerThreads()) {
290 log("RS: " + rst.getRegionServer().getServerName());
291 }
292 }
293 assertEquals(1, cluster.getRegionServerThreads().size());
294
295
296
297
298 ht.close();
299
300 TEST_UTIL.shutdownMiniCluster();
301 }
302
303 private void blockUntilNoRIT(ZooKeeperWatcher zkw, HMaster master)
304 throws KeeperException, InterruptedException {
305 ZKAssign.blockUntilNoRIT(zkw);
306 master.assignmentManager.waitUntilNoRegionsInTransition(60000);
307 }
308
309 private void waitForRSShutdownToStartAndFinish(MasterThread activeMaster,
310 ServerName serverName) throws InterruptedException {
311 ServerManager sm = activeMaster.getMaster().getServerManager();
312
313 while (!sm.getDeadServers().contains(serverName)) {
314 log("Waiting for [" + serverName + "] to be listed as dead in master");
315 Thread.sleep(1);
316 }
317 log("Server [" + serverName + "] marked as dead, waiting for it to " +
318 "finish dead processing");
319 while (sm.areDeadServersInProgress()) {
320 log("Server [" + serverName + "] still being processed, waiting");
321 Thread.sleep(100);
322 }
323 log("Server [" + serverName + "] done with server shutdown processing");
324 }
325
326 private void log(String msg) {
327 LOG.debug("\n\nTRR: " + msg + "\n");
328 }
329
330 private RegionServerThread getServerHostingMeta(MiniHBaseCluster cluster)
331 throws IOException {
332 return getServerHosting(cluster, HRegionInfo.FIRST_META_REGIONINFO);
333 }
334
335 private RegionServerThread getServerHostingRoot(MiniHBaseCluster cluster)
336 throws IOException {
337 return getServerHosting(cluster, HRegionInfo.ROOT_REGIONINFO);
338 }
339
340 private RegionServerThread getServerHosting(MiniHBaseCluster cluster,
341 HRegionInfo region) throws IOException {
342 for (RegionServerThread rst : cluster.getRegionServerThreads()) {
343 if (rst.getRegionServer().getOnlineRegions().contains(region)) {
344 return rst;
345 }
346 }
347 return null;
348 }
349
350 private int getNumberOfOnlineRegions(MiniHBaseCluster cluster) {
351 int numFound = 0;
352 for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
353 numFound += rst.getRegionServer().getNumberOfOnlineRegions();
354 }
355 return numFound;
356 }
357
358 private void assertRegionsAssigned(MiniHBaseCluster cluster,
359 Set<String> expectedRegions) throws IOException {
360 int numFound = getNumberOfOnlineRegions(cluster);
361 if (expectedRegions.size() > numFound) {
362 log("Expected to find " + expectedRegions.size() + " but only found"
363 + " " + numFound);
364 NavigableSet<String> foundRegions = getAllOnlineRegions(cluster);
365 for (String region : expectedRegions) {
366 if (!foundRegions.contains(region)) {
367 log("Missing region: " + region);
368 }
369 }
370 assertEquals(expectedRegions.size(), numFound);
371 } else if (expectedRegions.size() < numFound) {
372 int doubled = numFound - expectedRegions.size();
373 log("Expected to find " + expectedRegions.size() + " but found"
374 + " " + numFound + " (" + doubled + " double assignments?)");
375 NavigableSet<String> doubleRegions = getDoubleAssignedRegions(cluster);
376 for (String region : doubleRegions) {
377 log("Region is double assigned: " + region);
378 }
379 assertEquals(expectedRegions.size(), numFound);
380 } else {
381 log("Success! Found expected number of " + numFound + " regions");
382 }
383 }
384
385 private NavigableSet<String> getAllOnlineRegions(MiniHBaseCluster cluster)
386 throws IOException {
387 NavigableSet<String> online = new TreeSet<String>();
388 for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
389 for (HRegionInfo region : rst.getRegionServer().getOnlineRegions()) {
390 online.add(region.getRegionNameAsString());
391 }
392 }
393 return online;
394 }
395
396 private NavigableSet<String> getDoubleAssignedRegions(
397 MiniHBaseCluster cluster) throws IOException {
398 NavigableSet<String> online = new TreeSet<String>();
399 NavigableSet<String> doubled = new TreeSet<String>();
400 for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
401 for (HRegionInfo region : rst.getRegionServer().getOnlineRegions()) {
402 if(!online.add(region.getRegionNameAsString())) {
403 doubled.add(region.getRegionNameAsString());
404 }
405 }
406 }
407 return doubled;
408 }
409
410
411 @org.junit.Rule
412 public org.apache.hadoop.hbase.ResourceCheckerJUnitRule cu =
413 new org.apache.hadoop.hbase.ResourceCheckerJUnitRule();
414 }
415