1   /**
2    * Copyright 2010 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  package org.apache.hadoop.hbase.regionserver;
21  
22  import static org.junit.Assert.assertEquals;
23  import static org.junit.Assert.assertNotSame;
24  import static org.junit.Assert.assertTrue;
25  
26  import java.io.IOException;
27  import java.util.List;
28  
29  import org.apache.commons.logging.Log;
30  import org.apache.commons.logging.LogFactory;
31  import org.apache.hadoop.hbase.HBaseTestingUtility;
32  import org.apache.hadoop.hbase.HConstants;
33  import org.apache.hadoop.hbase.HRegionInfo;
34  import org.apache.hadoop.hbase.MasterNotRunningException;
35  import org.apache.hadoop.hbase.MiniHBaseCluster;
36  import org.apache.hadoop.hbase.UnknownRegionException;
37  import org.apache.hadoop.hbase.ZooKeeperConnectionException;
38  import org.apache.hadoop.hbase.client.Delete;
39  import org.apache.hadoop.hbase.client.HBaseAdmin;
40  import org.apache.hadoop.hbase.client.HTable;
41  import org.apache.hadoop.hbase.util.Bytes;
42  import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
43  import org.junit.AfterClass;
44  import org.junit.Before;
45  import org.junit.BeforeClass;
46  import org.junit.Test;
47  
48  /**
49   * Like {@link TestSplitTransaction} in that we're testing {@link SplitTransaction},
50   * only the tests below run against a running cluster whereas {@link TestSplitTransaction}
51   * tests against a bare {@link HRegion}.
52   */
53  public class TestSplitTransactionOnCluster {
54    private static final Log LOG =
55      LogFactory.getLog(TestSplitTransactionOnCluster.class);
56    private HBaseAdmin admin = null;
57    private MiniHBaseCluster cluster = null;
58  
59    private static final HBaseTestingUtility TESTING_UTIL =
60      new HBaseTestingUtility();
61  
62    @BeforeClass public static void before() throws Exception {
63      TESTING_UTIL.getConfiguration().setInt("hbase.balancer.period", 60000);
64      TESTING_UTIL.startMiniCluster(2);
65    }
66  
67    @AfterClass public static void after() throws Exception {
68      TESTING_UTIL.shutdownMiniCluster();
69    }
70  
71    @Before public void setup() throws IOException {
72      TESTING_UTIL.ensureSomeRegionServersAvailable(2);
73      this.admin = new HBaseAdmin(TESTING_UTIL.getConfiguration());
74      this.cluster = TESTING_UTIL.getMiniHBaseCluster();
75    }
76  
77    /**
78     * Messy test that simulates case where SplitTransactions fails to add one
79     * of the daughters up into the .META. table before crash.  We're testing
80     * fact that the shutdown handler will fixup the missing daughter region
81     * adding it back into .META.
82     * @throws IOException
83     * @throws InterruptedException
84     */
85    @Test (timeout = 600000) public void testShutdownSimpleFixup()
86    throws IOException, InterruptedException {
87      final byte [] tableName = Bytes.toBytes("testShutdownSimpleFixup");
88  
89      // Create table then get the single region for our new table.
90      HTable t = TESTING_UTIL.createTable(tableName, HConstants.CATALOG_FAMILY);
91  
92      List<HRegion> regions = cluster.getRegions(tableName);
93      assertEquals(1, regions.size());
94      HRegionInfo hri = regions.get(0).getRegionInfo();
95  
96      int tableRegionIndex = ensureTableRegionNotOnSameServerAsMeta(admin, hri);
97  
98      // Turn off balancer so it doesn't cut in and mess up our placements.
99      this.admin.balanceSwitch(false);
100     // Turn off the meta scanner so it don't remove parent on us.
101     cluster.getMaster().setCatalogJanitorEnabled(false);
102     try {
103       // Add a bit of load up into the table so splittable.
104       TESTING_UTIL.loadTable(t, HConstants.CATALOG_FAMILY);
105       // Get region pre-split.
106       HRegionServer server = cluster.getRegionServer(tableRegionIndex);
107       printOutRegions(server, "Initial regions: ");
108       int regionCount = server.getOnlineRegions().size();
109       // Now split.
110       split(hri, server, regionCount);
111       // Get daughters
112       List<HRegion> daughters = cluster.getRegions(tableName);
113       assertTrue(daughters.size() >= 2);
114       // Remove one of the daughters from .META. to simulate failed insert of
115       // daughter region up into .META.
116       removeDaughterFromMeta(daughters.get(0).getRegionName());
117       // Now crash the server
118       cluster.abortRegionServer(tableRegionIndex);
119       while(server.getOnlineRegions().size() > 0) {
120         LOG.info("Waiting on server to go down");
121         Thread.sleep(100);
122       }
123       // Wait till regions are back on line again.
124       while(cluster.getRegions(tableName).size() < daughters.size()) {
125         LOG.info("Waiting for repair to happen");
126         Thread.sleep(1000);
127       }
128       // Assert daughters are online.
129       regions = cluster.getRegions(tableName);
130       for (HRegion r: regions) {
131         assertTrue(daughters.contains(r));
132       }
133     } finally {
134       admin.balanceSwitch(true);
135       cluster.getMaster().setCatalogJanitorEnabled(true);
136     }
137   }
138 
139   /**
140    * Test that if daughter split on us, we won't do the shutdown handler fixup
141    * just because we can't find the immediate daughter of an offlined parent.
142    * @throws IOException
143    * @throws InterruptedException
144    */
145   @Test public void testShutdownFixupWhenDaughterHasSplit()
146   throws IOException, InterruptedException {
147     final byte [] tableName =
148       Bytes.toBytes("testShutdownFixupWhenDaughterHasSplit");
149 
150     // Create table then get the single region for our new table.
151     HTable t = TESTING_UTIL.createTable(tableName, HConstants.CATALOG_FAMILY);
152 
153     List<HRegion> regions = cluster.getRegions(tableName);
154     assertEquals(1, regions.size());
155     HRegionInfo hri = regions.get(0).getRegionInfo();
156 
157     int tableRegionIndex = ensureTableRegionNotOnSameServerAsMeta(admin, hri);
158 
159     // Turn off balancer so it doesn't cut in and mess up our placements.
160     this.admin.balanceSwitch(false);
161     // Turn off the meta scanner so it don't remove parent on us.
162     cluster.getMaster().setCatalogJanitorEnabled(false);
163     try {
164       // Add a bit of load up into the table so splittable.
165       TESTING_UTIL.loadTable(t, HConstants.CATALOG_FAMILY);
166       // Get region pre-split.
167       HRegionServer server = cluster.getRegionServer(tableRegionIndex);
168       printOutRegions(server, "Initial regions: ");
169       int regionCount = server.getOnlineRegions().size();
170       // Now split.
171       split(hri, server, regionCount);
172       // Get daughters
173       List<HRegion> daughters = cluster.getRegions(tableName);
174       assertTrue(daughters.size() >= 2);
175       // Now split one of the daughters.
176       regionCount = server.getOnlineRegions().size();
177       split(daughters.get(0).getRegionInfo(), server, regionCount);
178       // Get list of daughters
179       daughters = cluster.getRegions(tableName);
180       // Now crash the server
181       cluster.abortRegionServer(tableRegionIndex);
182       while(server.getOnlineRegions().size() > 0) {
183         LOG.info("Waiting on server to go down");
184         Thread.sleep(100);
185       }
186       // Wait till regions are back on line again.
187       while(cluster.getRegions(tableName).size() < daughters.size()) {
188         LOG.info("Waiting for repair to happen");
189         Thread.sleep(1000);
190       }
191       // Assert daughters are online and ONLY the original daughters -- that
192       // fixup didn't insert one during server shutdown recover.
193       regions = cluster.getRegions(tableName);
194       assertEquals(daughters.size(), regions.size());
195       for (HRegion r: regions) {
196         assertTrue(daughters.contains(r));
197       }
198     } finally {
199       admin.balanceSwitch(true);
200       cluster.getMaster().setCatalogJanitorEnabled(true);
201     }
202   }
203 
204   private void split(final HRegionInfo hri, final HRegionServer server,
205       final int regionCount)
206   throws IOException, InterruptedException {
207     this.admin.split(hri.getRegionNameAsString());
208     while(server.getOnlineRegions().size() <= regionCount) {
209       LOG.debug("Waiting on region to split");
210       Thread.sleep(100);
211     }
212   }
213 
214   private void removeDaughterFromMeta(final byte [] regionName) throws IOException {
215     HTable metaTable =
216       new HTable(TESTING_UTIL.getConfiguration(), HConstants.META_TABLE_NAME);
217     Delete d = new Delete(regionName);
218     LOG.info("Deleted " + Bytes.toString(regionName));
219     metaTable.delete(d);
220   }
221 
222   /**
223    * Ensure single table region is not on same server as the single .META. table
224    * region.
225    * @param admin
226    * @param hri
227    * @return Index of the server hosting the single table region
228    * @throws UnknownRegionException
229    * @throws MasterNotRunningException
230    * @throws ZooKeeperConnectionException
231    * @throws InterruptedException
232    */
233   private int ensureTableRegionNotOnSameServerAsMeta(final HBaseAdmin admin,
234       final HRegionInfo hri)
235   throws UnknownRegionException, MasterNotRunningException,
236   ZooKeeperConnectionException, InterruptedException {
237     MiniHBaseCluster cluster = TESTING_UTIL.getMiniHBaseCluster();
238     // Now make sure that the table region is not on same server as that hosting
239     // .META.  We don't want .META. replay polluting our test when we later crash
240     // the table region serving server.
241     int metaServerIndex = cluster.getServerWithMeta();
242     assertTrue(metaServerIndex != -1);
243     HRegionServer metaRegionServer = cluster.getRegionServer(metaServerIndex);
244     int tableRegionIndex = cluster.getServerWith(hri.getRegionName());
245     assertTrue(tableRegionIndex != -1);
246     HRegionServer tableRegionServer = cluster.getRegionServer(tableRegionIndex);
247     if (metaRegionServer.getServerName().equals(tableRegionServer.getServerName())) {
248       HRegionServer hrs = getOtherRegionServer(cluster, metaRegionServer);
249       admin.move(hri.getEncodedNameAsBytes(), Bytes.toBytes(hrs.getServerName()));
250     }
251     // Wait till table region is up on the server that is NOT carrying .META..
252     while (true) {
253       tableRegionIndex = cluster.getServerWith(hri.getRegionName());
254       if (tableRegionIndex != -1 && tableRegionIndex != metaServerIndex) break;
255       LOG.debug("Waiting on region move off the .META. server; current index " +
256         tableRegionIndex);
257       Thread.sleep(100);
258     }
259     // Verify for sure table region is not on same server as .META.
260     tableRegionIndex = cluster.getServerWith(hri.getRegionName());
261     assertTrue(tableRegionIndex != -1);
262     assertNotSame(metaServerIndex, tableRegionIndex);
263     return tableRegionIndex;
264   }
265 
266   /**
267    * Find regionserver other than the one passed.
268    * Can't rely on indexes into list of regionservers since crashed servers
269    * occupy an index.
270    * @param cluster
271    * @param notThisOne
272    * @return A regionserver that is not <code>notThisOne</code> or null if none
273    * found
274    */
275   private HRegionServer getOtherRegionServer(final MiniHBaseCluster cluster,
276       final HRegionServer notThisOne) {
277     for (RegionServerThread rst: cluster.getRegionServerThreads()) {
278       HRegionServer hrs = rst.getRegionServer();
279       if (hrs.getServerName().equals(notThisOne.getServerName())) continue;
280       if (hrs.isStopping() || hrs.isStopped()) continue;
281       return hrs;
282     }
283     return null;
284   }
285 
286   private void printOutRegions(final HRegionServer hrs, final String prefix) {
287     List<HRegionInfo> regions = hrs.getOnlineRegions();
288     for (HRegionInfo region: regions) {
289       LOG.info(prefix + region.getRegionNameAsString());
290     }
291   }
292 }