View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.master;
20  
21  import static org.junit.Assert.assertEquals;
22  
23  import java.io.IOException;
24  import java.util.List;
25  import java.util.NavigableSet;
26  import java.util.Set;
27  import java.util.TreeSet;
28  
29  import org.apache.commons.logging.Log;
30  import org.apache.commons.logging.LogFactory;
31  import org.apache.hadoop.conf.Configuration;
32  import org.apache.hadoop.hbase.HBaseConfiguration;
33  import org.apache.hadoop.hbase.HBaseTestingUtility;
34  import org.apache.hadoop.hbase.HRegionInfo;
35  import org.apache.hadoop.hbase.testclassification.LargeTests;
36  import org.apache.hadoop.hbase.MiniHBaseCluster;
37  import org.apache.hadoop.hbase.ServerName;
38  import org.apache.hadoop.hbase.TableName;
39  import org.apache.hadoop.hbase.client.HTable;
40  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
41  import org.apache.hadoop.hbase.util.Bytes;
42  import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
43  import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
44  import org.apache.hadoop.hbase.zookeeper.ZKAssign;
45  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
46  import org.apache.zookeeper.KeeperException;
47  import org.junit.Test;
48  import org.junit.experimental.categories.Category;
49  
50  /**
51   * Tests the restarting of everything as done during rolling restarts.
52   */
53  @Category(LargeTests.class)
54  public class  TestRollingRestart {
55    private static final Log LOG = LogFactory.getLog(TestRollingRestart.class);
56  
57    @Test (timeout=500000)
58    public void testBasicRollingRestart() throws Exception {
59  
60      // Start a cluster with 2 masters and 4 regionservers
61      final int NUM_MASTERS = 2;
62      final int NUM_RS = 3;
63      final int NUM_REGIONS_TO_CREATE = 20;
64  
65      int expectedNumRS = 3;
66  
67      // Start the cluster
68      log("Starting cluster");
69      Configuration conf = HBaseConfiguration.create();
70      HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
71      TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
72      MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
73      log("Waiting for active/ready master");
74      cluster.waitForActiveAndReadyMaster();
75      ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "testRollingRestart",
76          null);
77      HMaster master = cluster.getMaster();
78  
79      // Create a table with regions
80      TableName table = TableName.valueOf("tableRestart");
81      byte [] family = Bytes.toBytes("family");
82      log("Creating table with " + NUM_REGIONS_TO_CREATE + " regions");
83      HTable ht = TEST_UTIL.createTable(table, family);
84      int numRegions = TEST_UTIL.createMultiRegions(conf, ht, family,
85          NUM_REGIONS_TO_CREATE);
86      numRegions += 1; // catalogs
87      log("Waiting for no more RIT\n");
88      blockUntilNoRIT(zkw, master);
89      log("Disabling table\n");
90      TEST_UTIL.getHBaseAdmin().disableTable(table);
91      log("Waiting for no more RIT\n");
92      blockUntilNoRIT(zkw, master);
93      NavigableSet<String> regions = HBaseTestingUtility.getAllOnlineRegions(cluster);
94      log("Verifying only catalog and namespace regions are assigned\n");
95      if (regions.size() != 2) {
96        for (String oregion : regions) log("Region still online: " + oregion);
97      }
98      assertEquals(2, regions.size());
99      log("Enabling table\n");
100     TEST_UTIL.getHBaseAdmin().enableTable(table);
101     log("Waiting for no more RIT\n");
102     blockUntilNoRIT(zkw, master);
103     log("Verifying there are " + numRegions + " assigned on cluster\n");
104     regions = HBaseTestingUtility.getAllOnlineRegions(cluster);
105     assertRegionsAssigned(cluster, regions);
106     assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
107 
108     // Add a new regionserver
109     log("Adding a fourth RS");
110     RegionServerThread restarted = cluster.startRegionServer();
111     expectedNumRS++;
112     restarted.waitForServerOnline();
113     log("Additional RS is online");
114     log("Waiting for no more RIT");
115     blockUntilNoRIT(zkw, master);
116     log("Verifying there are " + numRegions + " assigned on cluster");
117     assertRegionsAssigned(cluster, regions);
118     assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
119 
120     // Master Restarts
121     List<MasterThread> masterThreads = cluster.getMasterThreads();
122     MasterThread activeMaster = null;
123     MasterThread backupMaster = null;
124     assertEquals(2, masterThreads.size());
125     if (masterThreads.get(0).getMaster().isActiveMaster()) {
126       activeMaster = masterThreads.get(0);
127       backupMaster = masterThreads.get(1);
128     } else {
129       activeMaster = masterThreads.get(1);
130       backupMaster = masterThreads.get(0);
131     }
132 
133     // Bring down the backup master
134     log("Stopping backup master\n\n");
135     backupMaster.getMaster().stop("Stop of backup during rolling restart");
136     cluster.hbaseCluster.waitOnMaster(backupMaster);
137 
138     // Bring down the primary master
139     log("Stopping primary master\n\n");
140     activeMaster.getMaster().stop("Stop of active during rolling restart");
141     cluster.hbaseCluster.waitOnMaster(activeMaster);
142 
143     // Start primary master
144     log("Restarting primary master\n\n");
145     activeMaster = cluster.startMaster();
146     cluster.waitForActiveAndReadyMaster();
147     master = activeMaster.getMaster();
148 
149     // Start backup master
150     log("Restarting backup master\n\n");
151     backupMaster = cluster.startMaster();
152 
153     assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
154 
155     // RegionServer Restarts
156 
157     // Bring them down, one at a time, waiting between each to complete
158     List<RegionServerThread> regionServers =
159       cluster.getLiveRegionServerThreads();
160     int num = 1;
161     int total = regionServers.size();
162     for (RegionServerThread rst : regionServers) {
163       ServerName serverName = rst.getRegionServer().getServerName();
164       log("Stopping region server " + num + " of " + total + " [ " +
165           serverName + "]");
166       rst.getRegionServer().stop("Stopping RS during rolling restart");
167       cluster.hbaseCluster.waitOnRegionServer(rst);
168       log("Waiting for RS shutdown to be handled by master");
169       waitForRSShutdownToStartAndFinish(activeMaster, serverName);
170       log("RS shutdown done, waiting for no more RIT");
171       blockUntilNoRIT(zkw, master);
172       log("Verifying there are " + numRegions + " assigned on cluster");
173       assertRegionsAssigned(cluster, regions);
174       expectedNumRS--;
175       assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
176       log("Restarting region server " + num + " of " + total);
177       restarted = cluster.startRegionServer();
178       restarted.waitForServerOnline();
179       expectedNumRS++;
180       log("Region server " + num + " is back online");
181       log("Waiting for no more RIT");
182       blockUntilNoRIT(zkw, master);
183       log("Verifying there are " + numRegions + " assigned on cluster");
184       assertRegionsAssigned(cluster, regions);
185       assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
186       num++;
187     }
188     Thread.sleep(1000);
189     assertRegionsAssigned(cluster, regions);
190 
191     // TODO: Bring random 3 of 4 RS down at the same time
192 
193     ht.close();
194     // Stop the cluster
195     TEST_UTIL.shutdownMiniCluster();
196   }
197 
198   private void blockUntilNoRIT(ZooKeeperWatcher zkw, HMaster master)
199   throws KeeperException, InterruptedException {
200     ZKAssign.blockUntilNoRIT(zkw);
201     master.assignmentManager.waitUntilNoRegionsInTransition(60000);
202   }
203 
204   private void waitForRSShutdownToStartAndFinish(MasterThread activeMaster,
205       ServerName serverName) throws InterruptedException {
206     ServerManager sm = activeMaster.getMaster().getServerManager();
207     // First wait for it to be in dead list
208     while (!sm.getDeadServers().isDeadServer(serverName)) {
209       log("Waiting for [" + serverName + "] to be listed as dead in master");
210       Thread.sleep(1);
211     }
212     log("Server [" + serverName + "] marked as dead, waiting for it to " +
213         "finish dead processing");
214     while (sm.areDeadServersInProgress()) {
215       log("Server [" + serverName + "] still being processed, waiting");
216       Thread.sleep(100);
217     }
218     log("Server [" + serverName + "] done with server shutdown processing");
219   }
220 
221   private void log(String msg) {
222     LOG.debug("\n\nTRR: " + msg + "\n");
223   }
224 
225   private int getNumberOfOnlineRegions(MiniHBaseCluster cluster) {
226     int numFound = 0;
227     for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
228       numFound += rst.getRegionServer().getNumberOfOnlineRegions();
229     }
230     for (MasterThread mt : cluster.getMasterThreads()) {
231       numFound += mt.getMaster().getNumberOfOnlineRegions();
232     }
233     return numFound;
234   }
235   
236   private void assertRegionsAssigned(MiniHBaseCluster cluster,
237       Set<String> expectedRegions) throws IOException {
238     int numFound = getNumberOfOnlineRegions(cluster);
239     if (expectedRegions.size() > numFound) {
240       log("Expected to find " + expectedRegions.size() + " but only found"
241           + " " + numFound);
242       NavigableSet<String> foundRegions =
243         HBaseTestingUtility.getAllOnlineRegions(cluster);
244       for (String region : expectedRegions) {
245         if (!foundRegions.contains(region)) {
246           log("Missing region: " + region);
247         }
248       }
249       assertEquals(expectedRegions.size(), numFound);
250     } else if (expectedRegions.size() < numFound) {
251       int doubled = numFound - expectedRegions.size();
252       log("Expected to find " + expectedRegions.size() + " but found"
253           + " " + numFound + " (" + doubled + " double assignments?)");
254       NavigableSet<String> doubleRegions = getDoubleAssignedRegions(cluster);
255       for (String region : doubleRegions) {
256         log("Region is double assigned: " + region);
257       }
258       assertEquals(expectedRegions.size(), numFound);
259     } else {
260       log("Success!  Found expected number of " + numFound + " regions");
261     }
262   }
263 
264   private NavigableSet<String> getDoubleAssignedRegions(
265       MiniHBaseCluster cluster) throws IOException {
266     NavigableSet<String> online = new TreeSet<String>();
267     NavigableSet<String> doubled = new TreeSet<String>();
268     for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
269       for (HRegionInfo region : ProtobufUtil.getOnlineRegions(
270           rst.getRegionServer().getRSRpcServices())) {
271         if(!online.add(region.getRegionNameAsString())) {
272           doubled.add(region.getRegionNameAsString());
273         }
274       }
275     }
276     return doubled;
277   }
278 
279 
280 }
281