1   /**
2    * Copyright 2010 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  package org.apache.hadoop.hbase.regionserver;
21  
22  import static org.junit.Assert.assertEquals;
23  import static org.junit.Assert.assertNotSame;
24  import static org.junit.Assert.assertTrue;
25  
26  import java.io.IOException;
27  import java.util.List;
28  
29  import org.apache.commons.logging.Log;
30  import org.apache.commons.logging.LogFactory;
31  import org.apache.hadoop.hbase.HBaseTestingUtility;
32  import org.apache.hadoop.hbase.HConstants;
33  import org.apache.hadoop.hbase.HRegionInfo;
34  import org.apache.hadoop.hbase.MasterNotRunningException;
35  import org.apache.hadoop.hbase.MiniHBaseCluster;
36  import org.apache.hadoop.hbase.UnknownRegionException;
37  import org.apache.hadoop.hbase.ZooKeeperConnectionException;
38  import org.apache.hadoop.hbase.client.Delete;
39  import org.apache.hadoop.hbase.client.HBaseAdmin;
40  import org.apache.hadoop.hbase.client.HTable;
41  import org.apache.hadoop.hbase.util.Bytes;
42  import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
43  import org.junit.AfterClass;
44  import org.junit.Before;
45  import org.junit.BeforeClass;
46  import org.junit.Test;
47  
48  /**
49   * Like {@link TestSplitTransaction} in that we're testing {@link SplitTransaction}
50   * only the below tests are against a running cluster where {@link TestSplitTransaction}
51   * is tests against a bare {@link HRegion}.
52   */
53  public class TestSplitTransactionOnCluster {
54    private static final Log LOG =
55      LogFactory.getLog(TestSplitTransactionOnCluster.class);
56    private HBaseAdmin admin = null;
57    private MiniHBaseCluster cluster = null;
58  
59    private static final HBaseTestingUtility TESTING_UTIL =
60      new HBaseTestingUtility();
61  
62    @BeforeClass public static void before() throws Exception {
63      TESTING_UTIL.getConfiguration().setInt("hbase.balancer.period", 60000);
64      TESTING_UTIL.startMiniCluster(2);
65    }
66  
67    @AfterClass public static void after() throws Exception {
68      TESTING_UTIL.shutdownMiniCluster();
69    }
70  
71    @Before public void setup() throws IOException {
72      TESTING_UTIL.ensureSomeRegionServersAvailable(2);
73      this.admin = new HBaseAdmin(TESTING_UTIL.getConfiguration());
74      this.cluster = TESTING_UTIL.getMiniHBaseCluster();
75    }
76  
77    /**
78     * Messy test that simulates case where SplitTransactions fails to add one
79     * of the daughters up into the .META. table before crash.  We're testing
80     * fact that the shutdown handler will fixup the missing daughter region
81     * adding it back into .META.
82     * @throws IOException
83     * @throws InterruptedException
84     */
85    @Test (timeout = 600000) public void testShutdownSimpleFixup()
86    throws IOException, InterruptedException {
87      final byte [] tableName = Bytes.toBytes("testShutdownSimpleFixup");
88  
89      // Create table then get the single region for our new table.
90      HTable t = TESTING_UTIL.createTable(tableName, HConstants.CATALOG_FAMILY);
91  
92      List<HRegion> regions = cluster.getRegions(tableName);
93      assertEquals(1, regions.size());
94      HRegionInfo hri = regions.get(0).getRegionInfo();
95  
96      int tableRegionIndex = ensureTableRegionNotOnSameServerAsMeta(admin, hri);
97  
98      // Turn off balancer so it doesn't cut in and mess up our placements.
99      this.admin.balanceSwitch(false);
100     // Turn off the meta scanner so it don't remove parent on us.
101     cluster.getMaster().setCatalogJanitorEnabled(false);
102     try {
103       // Add a bit of load up into the table so splittable.
104       TESTING_UTIL.loadTable(t, HConstants.CATALOG_FAMILY);
105       // Get region pre-split.
106       HRegionServer server = cluster.getRegionServer(tableRegionIndex);
107       printOutRegions(server, "Initial regions: ");
108       int regionCount = server.getOnlineRegions().size();
109       // Now split.
110       split(hri, server, regionCount);
111       // Get daughters
112       List<HRegion> daughters = cluster.getRegions(tableName);
113       assertTrue(daughters.size() >= 2);
114       // Remove one of the daughters from .META. to simulate failed insert of
115       // daughter region up into .META.
116       removeDaughterFromMeta(daughters.get(0).getRegionName());
117       // Now crash the server
118       cluster.abortRegionServer(tableRegionIndex);
119       while(server.getOnlineRegions().size() > 0) {
120         LOG.info("Waiting on server to go down");
121         Thread.sleep(100);
122       }
123       // Wait till regions are back on line again.
124       while(cluster.getRegions(tableName).size() < daughters.size()) {
125         LOG.info("Waiting for repair to happen");
126         Thread.sleep(1000);
127       }
128       // Assert daughters are online.
129       regions = cluster.getRegions(tableName);
130       for (HRegion r: regions) {
131         assertTrue(daughters.contains(r));
132       }
133     } finally {
134       admin.balanceSwitch(true);
135       cluster.getMaster().setCatalogJanitorEnabled(true);
136     }
137   }
138 
139   /**
140    * Test that if daughter split on us, we won't do the shutdown handler fixup
141    * just because we can't find the immediate daughter of an offlined parent.
142    * @throws IOException
143    * @throws InterruptedException
144    */
145   @Test public void testShutdownFixupWhenDaughterHasSplit()
146   throws IOException, InterruptedException {
147     final byte [] tableName =
148       Bytes.toBytes("testShutdownFixupWhenDaughterHasSplit");
149 
150     // Create table then get the single region for our new table.
151     HTable t = TESTING_UTIL.createTable(tableName, HConstants.CATALOG_FAMILY);
152 
153     List<HRegion> regions = cluster.getRegions(tableName);
154     assertEquals(1, regions.size());
155     HRegionInfo hri = regions.get(0).getRegionInfo();
156 
157     int tableRegionIndex = ensureTableRegionNotOnSameServerAsMeta(admin, hri);
158 
159     // Turn off balancer so it doesn't cut in and mess up our placements.
160     this.admin.balanceSwitch(false);
161     // Turn off the meta scanner so it don't remove parent on us.
162     cluster.getMaster().setCatalogJanitorEnabled(false);
163     try {
164       // Add a bit of load up into the table so splittable.
165       TESTING_UTIL.loadTable(t, HConstants.CATALOG_FAMILY);
166       // Get region pre-split.
167       HRegionServer server = cluster.getRegionServer(tableRegionIndex);
168       printOutRegions(server, "Initial regions: ");
169       int regionCount = server.getOnlineRegions().size();
170       // Now split.
171       split(hri, server, regionCount);
172       // Get daughters
173       List<HRegion> daughters = cluster.getRegions(tableName);
174       assertTrue(daughters.size() >= 2);
175       //Test repeating split message. HBASE-3892
176       server.reportSplit(hri, daughters.get(0).getRegionInfo(), daughters.get(1).getRegionInfo());
177       LOG.info("Repeating split message. HBASE-3892");      
178       // Now split one of the daughters.
179       regionCount = server.getOnlineRegions().size();
180       split(daughters.get(0).getRegionInfo(), server, regionCount);
181       // Get list of daughters
182       daughters = cluster.getRegions(tableName);
183       // Now crash the server
184       cluster.abortRegionServer(tableRegionIndex);
185       while(server.getOnlineRegions().size() > 0) {
186         LOG.info("Waiting on server to go down");
187         Thread.sleep(100);
188       }
189       // Wait till regions are back on line again.
190       while(cluster.getRegions(tableName).size() < daughters.size()) {
191         LOG.info("Waiting for repair to happen");
192         Thread.sleep(1000);
193       }
194       // Assert daughters are online and ONLY the original daughters -- that
195       // fixup didn't insert one during server shutdown recover.
196       regions = cluster.getRegions(tableName);
197       assertEquals(daughters.size(), regions.size());
198       for (HRegion r: regions) {
199         assertTrue(daughters.contains(r));
200       }
201     } finally {
202       admin.balanceSwitch(true);
203       cluster.getMaster().setCatalogJanitorEnabled(true);
204     }
205   }
206 
207   private void split(final HRegionInfo hri, final HRegionServer server,
208       final int regionCount)
209   throws IOException, InterruptedException {
210     this.admin.split(hri.getRegionNameAsString());
211     while(server.getOnlineRegions().size() <= regionCount) {
212       LOG.debug("Waiting on region to split");
213       Thread.sleep(100);
214     }
215   }
216 
217   private void removeDaughterFromMeta(final byte [] regionName) throws IOException {
218     HTable metaTable =
219       new HTable(TESTING_UTIL.getConfiguration(), HConstants.META_TABLE_NAME);
220     Delete d = new Delete(regionName);
221     LOG.info("Deleted " + Bytes.toString(regionName));
222     metaTable.delete(d);
223   }
224 
225   /**
226    * Ensure single table region is not on same server as the single .META. table
227    * region.
228    * @param admin
229    * @param hri
230    * @return Index of the server hosting the single table region
231    * @throws UnknownRegionException
232    * @throws MasterNotRunningException
233    * @throws ZooKeeperConnectionException
234    * @throws InterruptedException
235    */
236   private int ensureTableRegionNotOnSameServerAsMeta(final HBaseAdmin admin,
237       final HRegionInfo hri)
238   throws UnknownRegionException, MasterNotRunningException,
239   ZooKeeperConnectionException, InterruptedException {
240     MiniHBaseCluster cluster = TESTING_UTIL.getMiniHBaseCluster();
241     // Now make sure that the table region is not on same server as that hosting
242     // .META.  We don't want .META. replay polluting our test when we later crash
243     // the table region serving server.
244     int metaServerIndex = cluster.getServerWithMeta();
245     assertTrue(metaServerIndex != -1);
246     HRegionServer metaRegionServer = cluster.getRegionServer(metaServerIndex);
247     int tableRegionIndex = cluster.getServerWith(hri.getRegionName());
248     assertTrue(tableRegionIndex != -1);
249     HRegionServer tableRegionServer = cluster.getRegionServer(tableRegionIndex);
250     if (metaRegionServer.getServerName().equals(tableRegionServer.getServerName())) {
251       HRegionServer hrs = getOtherRegionServer(cluster, metaRegionServer);
252       admin.move(hri.getEncodedNameAsBytes(), Bytes.toBytes(hrs.getServerName()));
253     }
254     // Wait till table region is up on the server that is NOT carrying .META..
255     while (true) {
256       tableRegionIndex = cluster.getServerWith(hri.getRegionName());
257       if (tableRegionIndex != -1 && tableRegionIndex != metaServerIndex) break;
258       LOG.debug("Waiting on region move off the .META. server; current index " +
259         tableRegionIndex);
260       Thread.sleep(100);
261     }
262     // Verify for sure table region is not on same server as .META.
263     tableRegionIndex = cluster.getServerWith(hri.getRegionName());
264     assertTrue(tableRegionIndex != -1);
265     assertNotSame(metaServerIndex, tableRegionIndex);
266     return tableRegionIndex;
267   }
268 
269   /**
270    * Find regionserver other than the one passed.
271    * Can't rely on indexes into list of regionservers since crashed servers
272    * occupy an index.
273    * @param cluster
274    * @param notThisOne
275    * @return A regionserver that is not <code>notThisOne</code> or null if none
276    * found
277    */
278   private HRegionServer getOtherRegionServer(final MiniHBaseCluster cluster,
279       final HRegionServer notThisOne) {
280     for (RegionServerThread rst: cluster.getRegionServerThreads()) {
281       HRegionServer hrs = rst.getRegionServer();
282       if (hrs.getServerName().equals(notThisOne.getServerName())) continue;
283       if (hrs.isStopping() || hrs.isStopped()) continue;
284       return hrs;
285     }
286     return null;
287   }
288 
289   private void printOutRegions(final HRegionServer hrs, final String prefix) {
290     List<HRegionInfo> regions = hrs.getOnlineRegions();
291     for (HRegionInfo region: regions) {
292       LOG.info(prefix + region.getRegionNameAsString());
293     }
294   }
295 }