View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase;
19  
20  
21  import java.io.IOException;
22  import java.util.ArrayList;
23  import java.util.Collection;
24  import java.util.List;
25  
26  import junit.framework.Assert;
27  
28  import org.apache.commons.logging.Log;
29  import org.apache.commons.logging.LogFactory;
30  import org.apache.hadoop.hbase.master.HMaster;
31  import org.apache.hadoop.hbase.protobuf.RequestConverter;
32  import org.apache.hadoop.hbase.master.ServerManager;
33  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
34  import org.apache.hadoop.hbase.regionserver.HRegion;
35  import org.apache.hadoop.hbase.regionserver.HRegionServer;
36  import org.apache.hadoop.hbase.util.Bytes;
37  import org.apache.hadoop.hbase.util.JVMClusterUtil;
38  import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
39  import org.apache.hadoop.hbase.util.Threads;
40  import org.apache.hadoop.hbase.zookeeper.ZKAssign;
41  import org.apache.hadoop.hbase.zookeeper.ZKUtil;
42  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
43  import org.apache.zookeeper.KeeperException;
44  import org.junit.AfterClass;
45  import org.junit.BeforeClass;
46  import org.junit.Test;
47  import org.junit.experimental.categories.Category;
48  
49  
50  /**
51   * Test the draining servers feature.
52   *
53   * This is typically an integration test: a unit test would be to check that the
54   * master does no assign regions to a regionserver marked as drained.
55   *
56   * @see <a href="https://issues.apache.org/jira/browse/HBASE-4298">HBASE-4298</a>
57   */
58  @Category(MediumTests.class)
59  public class TestDrainingServer {
60    private static final Log LOG = LogFactory.getLog(TestDrainingServer.class);
61    private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
62    private static final int NB_SLAVES = 5;
63    private static final int COUNT_OF_REGIONS = NB_SLAVES * 2;
64  
65    /**
66     * Spin up a cluster with a bunch of regions on it.
67     */
68    @BeforeClass
69    public static void setUpBeforeClass() throws Exception {
70      TEST_UTIL.startMiniCluster(NB_SLAVES);
71      TEST_UTIL.getHBaseCluster().waitForActiveAndReadyMaster();
72      TEST_UTIL.getConfiguration().setBoolean("hbase.master.enabletable.roundrobin", true);
73  
74      final List<String> families = new ArrayList<String>(1);
75      families.add("family");
76      TEST_UTIL.createRandomTable("table", families, 1, 0, 0, COUNT_OF_REGIONS, 0);
77      // Ensure a stable env
78      TEST_UTIL.getHBaseAdmin().setBalancerRunning(false, false);
79  
80      boolean ready = false;
81      while (!ready){
82        waitForAllRegionsOnline();
83  
84        // Assert that every regionserver has some regions on it.
85        int i = 0;
86        ready = true;
87        while (i < NB_SLAVES && ready){
88          HRegionServer hrs = TEST_UTIL.getMiniHBaseCluster().getRegionServer(i);
89          if (ProtobufUtil.getOnlineRegions(hrs).isEmpty()){
90            ready = false;
91          }
92          i++;
93        }
94  
95        if (!ready){
96          TEST_UTIL.getHBaseAdmin().setBalancerRunning(true, true);
97          Assert.assertTrue("Can't start a balance!", TEST_UTIL.getHBaseAdmin().balancer());
98          TEST_UTIL.getHBaseAdmin().setBalancerRunning(false, false);
99          Thread.sleep(100);
100       }
101     }
102   }
103 
104   private static HRegionServer setDrainingServer(final HRegionServer hrs)
105   throws KeeperException {
106     LOG.info("Making " + hrs.getServerName() + " the draining server; " +
107       "it has " + hrs.getNumberOfOnlineRegions() + " online regions");
108     ZooKeeperWatcher zkw = hrs.getZooKeeper();
109     String hrsDrainingZnode =
110       ZKUtil.joinZNode(zkw.drainingZNode, hrs.getServerName().toString());
111     ZKUtil.createWithParents(zkw, hrsDrainingZnode);
112     return hrs;
113   }
114 
115   private static HRegionServer unsetDrainingServer(final HRegionServer hrs)
116   throws KeeperException {
117     ZooKeeperWatcher zkw = hrs.getZooKeeper();
118     String hrsDrainingZnode =
119       ZKUtil.joinZNode(zkw.drainingZNode, hrs.getServerName().toString());
120     ZKUtil.deleteNode(zkw, hrsDrainingZnode);
121     return hrs;
122   }
123 
124   @AfterClass
125   public static void tearDownAfterClass() throws Exception {
126     TEST_UTIL.shutdownMiniCluster();
127   }
128 
129   /**
130    * Test adding server to draining servers and then move regions off it.
131    * Make sure that no regions are moved back to the draining server.
132    * @throws IOException
133    * @throws KeeperException
134    */
135   @Test  // (timeout=30000)
136   public void testDrainingServerOffloading()
137   throws Exception {
138     // I need master in the below.
139     HMaster master = TEST_UTIL.getMiniHBaseCluster().getMaster();
140     HRegionInfo hriToMoveBack = null;
141     // Set first server as draining server.
142     HRegionServer drainingServer =
143       setDrainingServer(TEST_UTIL.getMiniHBaseCluster().getRegionServer(0));
144     try {
145       final int regionsOnDrainingServer =
146         drainingServer.getNumberOfOnlineRegions();
147       Assert.assertTrue(regionsOnDrainingServer > 0);
148       List<HRegionInfo> hris = ProtobufUtil.getOnlineRegions(drainingServer);
149       for (HRegionInfo hri : hris) {
150         // Pass null and AssignmentManager will chose a random server BUT it
151         // should exclude draining servers.
152         master.moveRegion(null,
153           RequestConverter.buildMoveRegionRequest(hri.getEncodedNameAsBytes(), null));
154         // Save off region to move back.
155         hriToMoveBack = hri;
156       }
157       // Wait for regions to come back on line again.
158       waitForAllRegionsOnline();
159       Assert.assertEquals(0, drainingServer.getNumberOfOnlineRegions());
160     } finally {
161       unsetDrainingServer(drainingServer);
162     }
163     // Now we've unset the draining server, we should be able to move a region
164     // to what was the draining server.
165     master.moveRegion(null,
166       RequestConverter.buildMoveRegionRequest(hriToMoveBack.getEncodedNameAsBytes(),
167       Bytes.toBytes(drainingServer.getServerName().toString())));
168     // Wait for regions to come back on line again.
169     waitForAllRegionsOnline();
170     Assert.assertEquals(1, drainingServer.getNumberOfOnlineRegions());
171   }
172 
173   /**
174    * Test that draining servers are ignored even after killing regionserver(s).
175    * Verify that the draining server is not given any of the dead servers regions.
176    * @throws KeeperException
177    * @throws IOException
178    */
179   @Test  (timeout=30000)
180   public void testDrainingServerWithAbort() throws KeeperException, Exception {
181     HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
182 
183     waitForAllRegionsOnline();
184 
185     final long regionCount = TEST_UTIL.getMiniHBaseCluster().countServedRegions();
186 
187     // Let's get a copy of the regions today.
188     Collection<HRegion> regions = getRegions();
189     LOG.info("All regions: " + regions);
190 
191       // Choose the draining server
192     HRegionServer drainingServer = TEST_UTIL.getMiniHBaseCluster().getRegionServer(0);
193     final int regionsOnDrainingServer = drainingServer.getNumberOfOnlineRegions();
194     Assert.assertTrue(regionsOnDrainingServer > 0);
195 
196     ServerManager sm = master.getServerManager();
197 
198     Collection<HRegion> regionsBefore = drainingServer.getOnlineRegionsLocalContext();
199     LOG.info("Regions of drained server are: "+ regionsBefore );
200 
201     try {
202       // Add first server to draining servers up in zk.
203       setDrainingServer(drainingServer);
204 
205       //wait for the master to receive and manage the event
206       while  (sm.createDestinationServersList().contains(drainingServer.getServerName())) {
207         Thread.sleep(1);
208       }
209 
210       LOG.info("The available servers are: "+ sm.createDestinationServersList());
211 
212       Assert.assertEquals("Nothing should have happened here.", regionsOnDrainingServer,
213         drainingServer.getNumberOfOnlineRegions());
214       Assert.assertFalse("We should not have regions in transition here. List is: " +
215           master.getAssignmentManager().getRegionStates().getRegionsInTransition(),
216           master.getAssignmentManager().getRegionStates().isRegionsInTransition());
217 
218       // Kill a few regionservers.
219       for (int aborted = 0; aborted <= 2; aborted++) {
220         HRegionServer hrs = TEST_UTIL.getMiniHBaseCluster().getRegionServer(aborted + 1);
221         hrs.abort("Aborting");
222       }
223 
224       // Wait for regions to come back online again.  waitForAllRegionsOnline can come back before
225       // we've assigned out regions on the cluster so retry if we are shy the wanted number
226       Collection<HRegion> regionsAfter = null;
227       for (int i = 0; i < 1000; i++) {
228         waitForAllRegionsOnline();
229         regionsAfter = getRegions();
230         if (regionsAfter.size() >= regionCount) break;
231         LOG.info("Expecting " + regionCount + " but only " + regionsAfter);
232         Threads.sleep(10);
233       }
234       LOG.info("Regions of drained server: " + regionsAfter + ", all regions: " + getRegions());
235       Assert.assertEquals("Test conditions are not met: regions were" +
236         " created/deleted during the test. ",
237         regionCount, TEST_UTIL.getMiniHBaseCluster().countServedRegions());
238 
239       // Assert the draining server still has the same regions.
240       regionsAfter = drainingServer.getOnlineRegionsLocalContext();
241       StringBuilder result = new StringBuilder();
242       for (HRegion r: regionsAfter){
243         if (!regionsBefore.contains(r)){
244           result.append(r).append(" was added after the drain");
245           if (regions.contains(r)){
246             result.append("(existing region");
247           } else {
248             result.append("(new region)");
249           }
250           result.append("; ");
251         }
252       }
253       for (HRegion r: regionsBefore){
254         if (!regionsAfter.contains(r)){
255           result.append(r).append(" was removed after the drain; ");
256         }
257       }
258       Assert.assertTrue("Errors are: "+ result.toString(), result.length()==0);
259 
260     } finally {
261       unsetDrainingServer(drainingServer);
262     }
263   }
264 
265   private Collection<HRegion> getRegions() {
266     Collection<HRegion> regions = new ArrayList<HRegion>();
267     List<RegionServerThread> rsthreads =
268       TEST_UTIL.getMiniHBaseCluster().getLiveRegionServerThreads();
269     for (RegionServerThread t: rsthreads) {
270       HRegionServer rs = t.getRegionServer();
271       Collection<HRegion> lr = rs.getOnlineRegionsLocalContext();
272       LOG.info("Found " + lr + " on " + rs);
273       regions.addAll(lr);
274     }
275     return regions;
276   }
277 
278   private static void waitForAllRegionsOnline() throws Exception {
279     // Wait for regions to come back on line again.
280     boolean done = false;
281     while (!done) {
282       Thread.sleep(1);
283 
284       // Nothing in ZK RIT for a start
285       ZKAssign.blockUntilNoRIT(TEST_UTIL.getZooKeeperWatcher());
286 
287       // Then we want all the regions to be marked as available...
288       if (!isAllRegionsOnline()) continue;
289 
290       // And without any work in progress on the master side
291       if (TEST_UTIL.getMiniHBaseCluster().getMaster().
292           getAssignmentManager().getRegionStates().isRegionsInTransition()) continue;
293 
294       // nor on the region server side
295       done = true;
296       for (JVMClusterUtil.RegionServerThread rs :
297           TEST_UTIL.getMiniHBaseCluster().getLiveRegionServerThreads()) {
298         if (!rs.getRegionServer().getRegionsInTransitionInRS().isEmpty()) {
299           done = false;
300         }
301         // Sleep some else we spam the log w/ notice that servers are not yet alive.
302         Threads.sleep(10);
303       }
304     }
305   }
306 
307   private static boolean isAllRegionsOnline() {
308     return TEST_UTIL.getMiniHBaseCluster().countServedRegions() >=
309         (COUNT_OF_REGIONS + 2 /*catalog and namespace regions*/);
310   }
311 }