1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.master;
20
21 import static org.junit.Assert.*;
22
23 import java.io.IOException;
24 import java.util.List;
25 import java.util.NavigableSet;
26 import java.util.Set;
27 import java.util.TreeSet;
28
29 import org.apache.commons.logging.Log;
30 import org.apache.commons.logging.LogFactory;
31 import org.apache.hadoop.conf.Configuration;
32 import org.apache.hadoop.hbase.*;
33 import org.apache.hadoop.hbase.client.HTable;
34 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
35 import org.apache.hadoop.hbase.util.Bytes;
36 import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
37 import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
38 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
39 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
40 import org.apache.zookeeper.KeeperException;
41 import org.junit.Test;
42 import org.junit.experimental.categories.Category;
43
44
45
46
47 @Category(LargeTests.class)
48 public class TestRollingRestart {
49 private static final Log LOG = LogFactory.getLog(TestRollingRestart.class);
50
51 @Test (timeout=500000)
52 public void testBasicRollingRestart() throws Exception {
53
54
55 final int NUM_MASTERS = 2;
56 final int NUM_RS = 3;
57 final int NUM_REGIONS_TO_CREATE = 20;
58
59 int expectedNumRS = 3;
60
61
62 log("Starting cluster");
63 Configuration conf = HBaseConfiguration.create();
64 conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000);
65 conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 5000);
66 HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
67 TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
68 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
69 log("Waiting for active/ready master");
70 cluster.waitForActiveAndReadyMaster();
71 ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "testRollingRestart",
72 null);
73 HMaster master = cluster.getMaster();
74
75
76 byte [] table = Bytes.toBytes("tableRestart");
77 byte [] family = Bytes.toBytes("family");
78 log("Creating table with " + NUM_REGIONS_TO_CREATE + " regions");
79 HTable ht = TEST_UTIL.createTable(table, family);
80 int numRegions = TEST_UTIL.createMultiRegions(conf, ht, family,
81 NUM_REGIONS_TO_CREATE);
82 numRegions += 1;
83 log("Waiting for no more RIT\n");
84 blockUntilNoRIT(zkw, master);
85 log("Disabling table\n");
86 TEST_UTIL.getHBaseAdmin().disableTable(table);
87 log("Waiting for no more RIT\n");
88 blockUntilNoRIT(zkw, master);
89 NavigableSet<String> regions = getAllOnlineRegions(cluster);
90 log("Verifying only catalog and namespace regions are assigned\n");
91 if (regions.size() != 2) {
92 for (String oregion : regions) log("Region still online: " + oregion);
93 }
94 assertEquals(2, regions.size());
95 log("Enabling table\n");
96 TEST_UTIL.getHBaseAdmin().enableTable(table);
97 log("Waiting for no more RIT\n");
98 blockUntilNoRIT(zkw, master);
99 log("Verifying there are " + numRegions + " assigned on cluster\n");
100 regions = getAllOnlineRegions(cluster);
101 assertRegionsAssigned(cluster, regions);
102 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
103
104
105 log("Adding a fourth RS");
106 RegionServerThread restarted = cluster.startRegionServer();
107 expectedNumRS++;
108 restarted.waitForServerOnline();
109 log("Additional RS is online");
110 log("Waiting for no more RIT");
111 blockUntilNoRIT(zkw, master);
112 log("Verifying there are " + numRegions + " assigned on cluster");
113 assertRegionsAssigned(cluster, regions);
114 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
115
116
117 List<MasterThread> masterThreads = cluster.getMasterThreads();
118 MasterThread activeMaster = null;
119 MasterThread backupMaster = null;
120 assertEquals(2, masterThreads.size());
121 if (masterThreads.get(0).getMaster().isActiveMaster()) {
122 activeMaster = masterThreads.get(0);
123 backupMaster = masterThreads.get(1);
124 } else {
125 activeMaster = masterThreads.get(1);
126 backupMaster = masterThreads.get(0);
127 }
128
129
130 log("Stopping backup master\n\n");
131 backupMaster.getMaster().stop("Stop of backup during rolling restart");
132 cluster.hbaseCluster.waitOnMaster(backupMaster);
133
134
135 log("Stopping primary master\n\n");
136 activeMaster.getMaster().stop("Stop of active during rolling restart");
137 cluster.hbaseCluster.waitOnMaster(activeMaster);
138
139
140 log("Restarting primary master\n\n");
141 activeMaster = cluster.startMaster();
142 cluster.waitForActiveAndReadyMaster();
143 master = activeMaster.getMaster();
144
145
146 log("Restarting backup master\n\n");
147 backupMaster = cluster.startMaster();
148
149 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
150
151
152
153
154 List<RegionServerThread> regionServers =
155 cluster.getLiveRegionServerThreads();
156 int num = 1;
157 int total = regionServers.size();
158 for (RegionServerThread rst : regionServers) {
159 ServerName serverName = rst.getRegionServer().getServerName();
160 log("Stopping region server " + num + " of " + total + " [ " +
161 serverName + "]");
162 rst.getRegionServer().stop("Stopping RS during rolling restart");
163 cluster.hbaseCluster.waitOnRegionServer(rst);
164 log("Waiting for RS shutdown to be handled by master");
165 waitForRSShutdownToStartAndFinish(activeMaster, serverName);
166 log("RS shutdown done, waiting for no more RIT");
167 blockUntilNoRIT(zkw, master);
168 log("Verifying there are " + numRegions + " assigned on cluster");
169 assertRegionsAssigned(cluster, regions);
170 expectedNumRS--;
171 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
172 log("Restarting region server " + num + " of " + total);
173 restarted = cluster.startRegionServer();
174 restarted.waitForServerOnline();
175 expectedNumRS++;
176 log("Region server " + num + " is back online");
177 log("Waiting for no more RIT");
178 blockUntilNoRIT(zkw, master);
179 log("Verifying there are " + numRegions + " assigned on cluster");
180 assertRegionsAssigned(cluster, regions);
181 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
182 num++;
183 }
184 Thread.sleep(1000);
185 assertRegionsAssigned(cluster, regions);
186
187
188 RegionServerThread rootServer = getServerHostingRoot(cluster);
189 RegionServerThread metaServer = getServerHostingMeta(cluster);
190 if (rootServer == metaServer) {
191 log("ROOT and META on the same server so killing another random server");
192 int i=0;
193 while (rootServer == metaServer) {
194 metaServer = cluster.getRegionServerThreads().get(i);
195 i++;
196 }
197 }
198 log("Stopping server hosting META #1");
199 metaServer.getRegionServer().stop("Stopping META server");
200 cluster.hbaseCluster.waitOnRegionServer(metaServer);
201 log("Meta server down #1");
202 expectedNumRS--;
203 log("Waiting for meta server #1 RS shutdown to be handled by master");
204 waitForRSShutdownToStartAndFinish(activeMaster,
205 metaServer.getRegionServer().getServerName());
206 log("Waiting for no more RIT");
207 long start = System.currentTimeMillis();
208 do {
209 blockUntilNoRIT(zkw, master);
210 } while (getNumberOfOnlineRegions(cluster) < numRegions
211 && System.currentTimeMillis()-start < 60000);
212 log("Verifying there are " + numRegions + " assigned on cluster");
213 assertRegionsAssigned(cluster, regions);
214 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
215
216
217 metaServer = getServerHostingMeta(cluster);
218 log("Stopping server hosting META #2");
219 metaServer.getRegionServer().stop("Stopping META server");
220 cluster.hbaseCluster.waitOnRegionServer(metaServer);
221 log("Meta server down");
222 expectedNumRS--;
223 log("Waiting for RS shutdown to be handled by master");
224 waitForRSShutdownToStartAndFinish(activeMaster,
225 metaServer.getRegionServer().getServerName());
226 log("RS shutdown done, waiting for no more RIT");
227 blockUntilNoRIT(zkw, master);
228 log("Verifying there are " + numRegions + " assigned on cluster");
229 assertRegionsAssigned(cluster, regions);
230 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
231
232
233 cluster.startRegionServer().waitForServerOnline();
234 cluster.startRegionServer().waitForServerOnline();
235 cluster.startRegionServer().waitForServerOnline();
236 Thread.sleep(1000);
237 log("Waiting for no more RIT");
238 blockUntilNoRIT(zkw, master);
239 log("Verifying there are " + numRegions + " assigned on cluster");
240 assertRegionsAssigned(cluster, regions);
241
242 metaServer = getServerHostingMeta(cluster);
243 log("Stopping server hosting META (1 of 3)");
244 metaServer.getRegionServer().stop("Stopping META server");
245 cluster.hbaseCluster.waitOnRegionServer(metaServer);
246 log("Meta server down (1 of 3)");
247 log("Waiting for RS shutdown to be handled by master");
248 waitForRSShutdownToStartAndFinish(activeMaster,
249 metaServer.getRegionServer().getServerName());
250 log("RS shutdown done, waiting for no more RIT");
251 blockUntilNoRIT(zkw, master);
252 log("Verifying there are " + numRegions + " assigned on cluster");
253 assertRegionsAssigned(cluster, regions);
254
255
256 metaServer = getServerHostingMeta(cluster);
257 log("Stopping server hosting META (2 of 3)");
258 metaServer.getRegionServer().stop("Stopping META server");
259 cluster.hbaseCluster.waitOnRegionServer(metaServer);
260 log("Meta server down (2 of 3)");
261 log("Waiting for RS shutdown to be handled by master");
262 waitForRSShutdownToStartAndFinish(activeMaster,
263 metaServer.getRegionServer().getServerName());
264 log("RS shutdown done, waiting for no more RIT");
265 blockUntilNoRIT(zkw, master);
266 log("Verifying there are " + numRegions + " assigned on cluster");
267 assertRegionsAssigned(cluster, regions);
268
269
270 metaServer = getServerHostingMeta(cluster);
271 log("Stopping server hosting META (3 of 3)");
272 metaServer.getRegionServer().stop("Stopping META server");
273 cluster.hbaseCluster.waitOnRegionServer(metaServer);
274 log("Meta server down (3 of 3)");
275 log("Waiting for RS shutdown to be handled by master");
276 waitForRSShutdownToStartAndFinish(activeMaster,
277 metaServer.getRegionServer().getServerName());
278 log("RS shutdown done, waiting for no more RIT");
279 blockUntilNoRIT(zkw, master);
280 log("Verifying there are " + numRegions + " assigned on cluster");
281 assertRegionsAssigned(cluster, regions);
282
283 if (cluster.getRegionServerThreads().size() != 1) {
284 log("Online regionservers:");
285 for (RegionServerThread rst : cluster.getRegionServerThreads()) {
286 log("RS: " + rst.getRegionServer().getServerName());
287 }
288 }
289 assertEquals(2, cluster.getRegionServerThreads().size());
290
291
292
293
294 ht.close();
295
296 TEST_UTIL.shutdownMiniCluster();
297 }
298
299 private void blockUntilNoRIT(ZooKeeperWatcher zkw, HMaster master)
300 throws KeeperException, InterruptedException {
301 ZKAssign.blockUntilNoRIT(zkw);
302 master.assignmentManager.waitUntilNoRegionsInTransition(60000);
303 }
304
305 private void waitForRSShutdownToStartAndFinish(MasterThread activeMaster,
306 ServerName serverName) throws InterruptedException {
307 ServerManager sm = activeMaster.getMaster().getServerManager();
308
309 while (!sm.getDeadServers().isDeadServer(serverName)) {
310 log("Waiting for [" + serverName + "] to be listed as dead in master");
311 Thread.sleep(1);
312 }
313 log("Server [" + serverName + "] marked as dead, waiting for it to " +
314 "finish dead processing");
315 while (sm.areDeadServersInProgress()) {
316 log("Server [" + serverName + "] still being processed, waiting");
317 Thread.sleep(100);
318 }
319 log("Server [" + serverName + "] done with server shutdown processing");
320 }
321
322 private void log(String msg) {
323 LOG.debug("\n\nTRR: " + msg + "\n");
324 }
325
326 private RegionServerThread getServerHostingMeta(MiniHBaseCluster cluster)
327 throws IOException {
328 return getServerHosting(cluster, HRegionInfo.FIRST_META_REGIONINFO);
329 }
330
331 private RegionServerThread getServerHostingRoot(MiniHBaseCluster cluster)
332 throws IOException {
333 return getServerHosting(cluster, HRegionInfo.ROOT_REGIONINFO);
334 }
335
336 private RegionServerThread getServerHosting(MiniHBaseCluster cluster,
337 HRegionInfo region) throws IOException {
338 for (RegionServerThread rst : cluster.getRegionServerThreads()) {
339 if (ProtobufUtil.getOnlineRegions(rst.getRegionServer()).contains(region)) {
340 return rst;
341 }
342 }
343 return null;
344 }
345
346 private int getNumberOfOnlineRegions(MiniHBaseCluster cluster) {
347 int numFound = 0;
348 for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
349 numFound += rst.getRegionServer().getNumberOfOnlineRegions();
350 }
351 return numFound;
352 }
353
354 private void assertRegionsAssigned(MiniHBaseCluster cluster,
355 Set<String> expectedRegions) throws IOException {
356 int numFound = getNumberOfOnlineRegions(cluster);
357 if (expectedRegions.size() > numFound) {
358 log("Expected to find " + expectedRegions.size() + " but only found"
359 + " " + numFound);
360 NavigableSet<String> foundRegions = getAllOnlineRegions(cluster);
361 for (String region : expectedRegions) {
362 if (!foundRegions.contains(region)) {
363 log("Missing region: " + region);
364 }
365 }
366 assertEquals(expectedRegions.size(), numFound);
367 } else if (expectedRegions.size() < numFound) {
368 int doubled = numFound - expectedRegions.size();
369 log("Expected to find " + expectedRegions.size() + " but found"
370 + " " + numFound + " (" + doubled + " double assignments?)");
371 NavigableSet<String> doubleRegions = getDoubleAssignedRegions(cluster);
372 for (String region : doubleRegions) {
373 log("Region is double assigned: " + region);
374 }
375 assertEquals(expectedRegions.size(), numFound);
376 } else {
377 log("Success! Found expected number of " + numFound + " regions");
378 }
379 }
380
381 private NavigableSet<String> getAllOnlineRegions(MiniHBaseCluster cluster)
382 throws IOException {
383 NavigableSet<String> online = new TreeSet<String>();
384 for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
385 for (HRegionInfo region : ProtobufUtil.getOnlineRegions(rst.getRegionServer())) {
386 online.add(region.getRegionNameAsString());
387 }
388 }
389 return online;
390 }
391
392 private NavigableSet<String> getDoubleAssignedRegions(
393 MiniHBaseCluster cluster) throws IOException {
394 NavigableSet<String> online = new TreeSet<String>();
395 NavigableSet<String> doubled = new TreeSet<String>();
396 for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
397 for (HRegionInfo region : ProtobufUtil.getOnlineRegions(rst.getRegionServer())) {
398 if(!online.add(region.getRegionNameAsString())) {
399 doubled.add(region.getRegionNameAsString());
400 }
401 }
402 }
403 return doubled;
404 }
405
406
407 }
408