1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package org.apache.hadoop.hbase.master;
21
22 import static org.junit.Assert.*;
23 import java.util.List;
24 import java.util.NavigableSet;
25 import java.util.Set;
26 import java.util.TreeSet;
27
28 import org.apache.commons.logging.Log;
29 import org.apache.commons.logging.LogFactory;
30 import org.apache.hadoop.conf.Configuration;
31 import org.apache.hadoop.hbase.HBaseConfiguration;
32 import org.apache.hadoop.hbase.HBaseTestingUtility;
33 import org.apache.hadoop.hbase.HRegionInfo;
34 import org.apache.hadoop.hbase.MiniHBaseCluster;
35 import org.apache.hadoop.hbase.client.HTable;
36 import org.apache.hadoop.hbase.util.Bytes;
37 import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
38 import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
39 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
40 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
41 import org.apache.zookeeper.KeeperException;
42 import org.junit.Test;
43
44
45
46
47 public class TestRollingRestart {
48 private static final Log LOG = LogFactory.getLog(TestRollingRestart.class);
49
50 @Test
51 public void testBasicRollingRestart() throws Exception {
52
53
54 final int NUM_MASTERS = 2;
55 final int NUM_RS = 3;
56 final int NUM_REGIONS_TO_CREATE = 20;
57
58 int expectedNumRS = 3;
59
60
61 log("Starting cluster");
62 Configuration conf = HBaseConfiguration.create();
63 conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000);
64 conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 5000);
65 HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
66 TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
67 MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
68 log("Waiting for active/ready master");
69 cluster.waitForActiveAndReadyMaster();
70 ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "testRollingRestart",
71 null);
72 HMaster master = cluster.getMaster();
73
74
75 byte [] table = Bytes.toBytes("tableRestart");
76 byte [] family = Bytes.toBytes("family");
77 log("Creating table with " + NUM_REGIONS_TO_CREATE + " regions");
78 HTable ht = TEST_UTIL.createTable(table, family);
79 int numRegions = TEST_UTIL.createMultiRegions(conf, ht, family,
80 NUM_REGIONS_TO_CREATE);
81 numRegions += 2;
82 log("Waiting for no more RIT\n");
83 blockUntilNoRIT(zkw, master);
84 log("Disabling table\n");
85 TEST_UTIL.getHBaseAdmin().disableTable(table);
86 log("Waiting for no more RIT\n");
87 blockUntilNoRIT(zkw, master);
88 NavigableSet<String> regions = getAllOnlineRegions(cluster);
89 log("Verifying only catalog regions are assigned\n");
90 if (regions.size() != 2) {
91 for (String oregion : regions) log("Region still online: " + oregion);
92 }
93 assertEquals(2, regions.size());
94 log("Enabling table\n");
95 TEST_UTIL.getHBaseAdmin().enableTable(table);
96 log("Waiting for no more RIT\n");
97 blockUntilNoRIT(zkw, master);
98 log("Verifying there are " + numRegions + " assigned on cluster\n");
99 regions = getAllOnlineRegions(cluster);
100 assertRegionsAssigned(cluster, regions);
101 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
102
103
104 log("Adding a fourth RS");
105 RegionServerThread restarted = cluster.startRegionServer();
106 expectedNumRS++;
107 restarted.waitForServerOnline();
108 log("Additional RS is online");
109 log("Waiting for no more RIT");
110 blockUntilNoRIT(zkw, master);
111 log("Verifying there are " + numRegions + " assigned on cluster");
112 assertRegionsAssigned(cluster, regions);
113 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
114
115
116 List<MasterThread> masterThreads = cluster.getMasterThreads();
117 MasterThread activeMaster = null;
118 MasterThread backupMaster = null;
119 assertEquals(2, masterThreads.size());
120 if (masterThreads.get(0).getMaster().isActiveMaster()) {
121 activeMaster = masterThreads.get(0);
122 backupMaster = masterThreads.get(1);
123 } else {
124 activeMaster = masterThreads.get(1);
125 backupMaster = masterThreads.get(0);
126 }
127
128
129 log("Stopping backup master\n\n");
130 backupMaster.getMaster().stop("Stop of backup during rolling restart");
131 cluster.hbaseCluster.waitOnMaster(backupMaster);
132
133
134 log("Stopping primary master\n\n");
135 activeMaster.getMaster().stop("Stop of active during rolling restart");
136 cluster.hbaseCluster.waitOnMaster(activeMaster);
137
138
139 log("Restarting primary master\n\n");
140 activeMaster = cluster.startMaster();
141 cluster.waitForActiveAndReadyMaster();
142 master = activeMaster.getMaster();
143
144
145 log("Restarting backup master\n\n");
146 backupMaster = cluster.startMaster();
147
148 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
149
150
151
152
153 List<RegionServerThread> regionServers =
154 cluster.getLiveRegionServerThreads();
155 int num = 1;
156 int total = regionServers.size();
157 for (RegionServerThread rst : regionServers) {
158 String serverName = rst.getRegionServer().getServerName();
159 log("Stopping region server " + num + " of " + total + " [ " +
160 serverName + "]");
161 rst.getRegionServer().stop("Stopping RS during rolling restart");
162 cluster.hbaseCluster.waitOnRegionServer(rst);
163 log("Waiting for RS shutdown to be handled by master");
164 waitForRSShutdownToStartAndFinish(activeMaster, serverName);
165 log("RS shutdown done, waiting for no more RIT");
166 blockUntilNoRIT(zkw, master);
167 log("Verifying there are " + numRegions + " assigned on cluster");
168 assertRegionsAssigned(cluster, regions);
169 expectedNumRS--;
170 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
171 log("Restarting region server " + num + " of " + total);
172 restarted = cluster.startRegionServer();
173 restarted.waitForServerOnline();
174 expectedNumRS++;
175 log("Region server " + num + " is back online");
176 log("Waiting for no more RIT");
177 blockUntilNoRIT(zkw, master);
178 log("Verifying there are " + numRegions + " assigned on cluster");
179 assertRegionsAssigned(cluster, regions);
180 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
181 num++;
182 }
183 Thread.sleep(2000);
184 assertRegionsAssigned(cluster, regions);
185
186
187 RegionServerThread rootServer = getServerHostingRoot(cluster);
188 RegionServerThread metaServer = getServerHostingMeta(cluster);
189 if (rootServer == metaServer) {
190 log("ROOT and META on the same server so killing another random server");
191 int i=0;
192 while (rootServer == metaServer) {
193 metaServer = cluster.getRegionServerThreads().get(i);
194 i++;
195 }
196 }
197 log("Stopping server hosting ROOT");
198 rootServer.getRegionServer().stop("Stopping ROOT server");
199 log("Stopping server hosting META #1");
200 metaServer.getRegionServer().stop("Stopping META server");
201 cluster.hbaseCluster.waitOnRegionServer(rootServer);
202 log("Root server down");
203 cluster.hbaseCluster.waitOnRegionServer(metaServer);
204 log("Meta server down #1");
205 expectedNumRS -= 2;
206 log("Waiting for meta server #1 RS shutdown to be handled by master");
207 waitForRSShutdownToStartAndFinish(activeMaster,
208 metaServer.getRegionServer().getServerName());
209 log("Waiting for no more RIT");
210 blockUntilNoRIT(zkw, master);
211 log("Verifying there are " + numRegions + " assigned on cluster");
212 assertRegionsAssigned(cluster, regions);
213 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
214
215
216 metaServer = getServerHostingMeta(cluster);
217 log("Stopping server hosting META #2");
218 metaServer.getRegionServer().stop("Stopping META server");
219 cluster.hbaseCluster.waitOnRegionServer(metaServer);
220 log("Meta server down");
221 expectedNumRS--;
222 log("Waiting for RS shutdown to be handled by master");
223 waitForRSShutdownToStartAndFinish(activeMaster,
224 metaServer.getRegionServer().getServerName());
225 log("RS shutdown done, waiting for no more RIT");
226 blockUntilNoRIT(zkw, master);
227 log("Verifying there are " + numRegions + " assigned on cluster");
228 assertRegionsAssigned(cluster, regions);
229 assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
230
231
232 cluster.startRegionServer().waitForServerOnline();
233 cluster.startRegionServer().waitForServerOnline();
234 cluster.startRegionServer().waitForServerOnline();
235 Thread.sleep(1000);
236 log("Waiting for no more RIT");
237 blockUntilNoRIT(zkw, master);
238 log("Verifying there are " + numRegions + " assigned on cluster");
239 assertRegionsAssigned(cluster, regions);
240
241 metaServer = getServerHostingMeta(cluster);
242 log("Stopping server hosting META (1 of 3)");
243 metaServer.getRegionServer().stop("Stopping META server");
244 cluster.hbaseCluster.waitOnRegionServer(metaServer);
245 log("Meta server down (1 of 3)");
246 log("Waiting for RS shutdown to be handled by master");
247 waitForRSShutdownToStartAndFinish(activeMaster,
248 metaServer.getRegionServer().getServerName());
249 log("RS shutdown done, waiting for no more RIT");
250 blockUntilNoRIT(zkw, master);
251 log("Verifying there are " + numRegions + " assigned on cluster");
252 assertRegionsAssigned(cluster, regions);
253
254
255 metaServer = getServerHostingMeta(cluster);
256 log("Stopping server hosting META (2 of 3)");
257 metaServer.getRegionServer().stop("Stopping META server");
258 cluster.hbaseCluster.waitOnRegionServer(metaServer);
259 log("Meta server down (2 of 3)");
260 log("Waiting for RS shutdown to be handled by master");
261 waitForRSShutdownToStartAndFinish(activeMaster,
262 metaServer.getRegionServer().getServerName());
263 log("RS shutdown done, waiting for no more RIT");
264 blockUntilNoRIT(zkw, master);
265 log("Verifying there are " + numRegions + " assigned on cluster");
266 assertRegionsAssigned(cluster, regions);
267
268
269 metaServer = getServerHostingMeta(cluster);
270 log("Stopping server hosting META (3 of 3)");
271 metaServer.getRegionServer().stop("Stopping META server");
272 cluster.hbaseCluster.waitOnRegionServer(metaServer);
273 log("Meta server down (3 of 3)");
274 log("Waiting for RS shutdown to be handled by master");
275 waitForRSShutdownToStartAndFinish(activeMaster,
276 metaServer.getRegionServer().getServerName());
277 log("RS shutdown done, waiting for no more RIT");
278 blockUntilNoRIT(zkw, master);
279 log("Verifying there are " + numRegions + " assigned on cluster");
280 assertRegionsAssigned(cluster, regions);
281
282 if (cluster.getRegionServerThreads().size() != 1) {
283 log("Online regionservers:");
284 for (RegionServerThread rst : cluster.getRegionServerThreads()) {
285 log("RS: " + rst.getRegionServer().getServerName());
286 }
287 }
288 assertEquals(1, cluster.getRegionServerThreads().size());
289
290
291
292
293
294
295 TEST_UTIL.shutdownMiniCluster();
296 }
297
298 private void blockUntilNoRIT(ZooKeeperWatcher zkw, HMaster master)
299 throws KeeperException, InterruptedException {
300 ZKAssign.blockUntilNoRIT(zkw);
301 master.assignmentManager.waitUntilNoRegionsInTransition(60000);
302 }
303
304 private void waitForRSShutdownToStartAndFinish(MasterThread activeMaster,
305 String serverName) throws InterruptedException {
306 ServerManager sm = activeMaster.getMaster().getServerManager();
307
308 while (!sm.getDeadServers().contains(serverName)) {
309 log("Waiting for [" + serverName + "] to be listed as dead in master");
310 Thread.sleep(1);
311 }
312 log("Server [" + serverName + "] marked as dead, waiting for it to " +
313 "finish dead processing");
314 while (sm.areDeadServersInProgress()) {
315 log("Server [" + serverName + "] still being processed, waiting");
316 Thread.sleep(100);
317 }
318 log("Server [" + serverName + "] done with server shutdown processing");
319 }
320
321 private void log(String msg) {
322 LOG.debug("\n\nTRR: " + msg + "\n");
323 }
324
325 private RegionServerThread getServerHostingMeta(MiniHBaseCluster cluster) {
326 return getServerHosting(cluster, HRegionInfo.FIRST_META_REGIONINFO);
327 }
328
329 private RegionServerThread getServerHostingRoot(MiniHBaseCluster cluster) {
330 return getServerHosting(cluster, HRegionInfo.ROOT_REGIONINFO);
331 }
332
333 private RegionServerThread getServerHosting(MiniHBaseCluster cluster,
334 HRegionInfo region) {
335 for (RegionServerThread rst : cluster.getRegionServerThreads()) {
336 if (rst.getRegionServer().getOnlineRegions().contains(region)) {
337 return rst;
338 }
339 }
340 return null;
341 }
342
343 private void assertRegionsAssigned(MiniHBaseCluster cluster,
344 Set<String> expectedRegions) {
345 int numFound = 0;
346 for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
347 numFound += rst.getRegionServer().getNumberOfOnlineRegions();
348 }
349 if (expectedRegions.size() > numFound) {
350 log("Expected to find " + expectedRegions.size() + " but only found"
351 + " " + numFound);
352 NavigableSet<String> foundRegions = getAllOnlineRegions(cluster);
353 for (String region : expectedRegions) {
354 if (!foundRegions.contains(region)) {
355 log("Missing region: " + region);
356 }
357 }
358 assertEquals(expectedRegions.size(), numFound);
359 } else if (expectedRegions.size() < numFound) {
360 int doubled = numFound - expectedRegions.size();
361 log("Expected to find " + expectedRegions.size() + " but found"
362 + " " + numFound + " (" + doubled + " double assignments?)");
363 NavigableSet<String> doubleRegions = getDoubleAssignedRegions(cluster);
364 for (String region : doubleRegions) {
365 log("Region is double assigned: " + region);
366 }
367 assertEquals(expectedRegions.size(), numFound);
368 } else {
369 log("Success! Found expected number of " + numFound + " regions");
370 }
371 }
372
373 private NavigableSet<String> getAllOnlineRegions(MiniHBaseCluster cluster) {
374 NavigableSet<String> online = new TreeSet<String>();
375 for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
376 for (HRegionInfo region : rst.getRegionServer().getOnlineRegions()) {
377 online.add(region.getRegionNameAsString());
378 }
379 }
380 return online;
381 }
382
383 private NavigableSet<String> getDoubleAssignedRegions(
384 MiniHBaseCluster cluster) {
385 NavigableSet<String> online = new TreeSet<String>();
386 NavigableSet<String> doubled = new TreeSet<String>();
387 for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
388 for (HRegionInfo region : rst.getRegionServer().getOnlineRegions()) {
389 if(!online.add(region.getRegionNameAsString())) {
390 doubled.add(region.getRegionNameAsString());
391 }
392 }
393 }
394 return doubled;
395 }
396
397 }