View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.util;
20  
21  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertErrors;
22  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertNoErrors;
23  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.doFsck;
24  import static org.junit.Assert.assertEquals;
25  import static org.junit.Assert.assertFalse;
26  import static org.junit.Assert.assertNotNull;
27  import static org.junit.Assert.assertTrue;
28  import static org.junit.Assert.fail;
29  
30  import java.io.IOException;
31  import java.util.ArrayList;
32  import java.util.Collection;
33  import java.util.HashMap;
34  import java.util.LinkedList;
35  import java.util.List;
36  import java.util.Map;
37  import java.util.Map.Entry;
38  import java.util.concurrent.CountDownLatch;
39  import java.util.concurrent.ExecutorService;
40  import java.util.concurrent.ScheduledThreadPoolExecutor;
41  import java.util.concurrent.SynchronousQueue;
42  import java.util.concurrent.ThreadPoolExecutor;
43  import java.util.concurrent.TimeUnit;
44  
45  import org.apache.commons.io.IOUtils;
46  import org.apache.commons.logging.Log;
47  import org.apache.commons.logging.LogFactory;
48  import org.apache.hadoop.conf.Configuration;
49  import org.apache.hadoop.fs.FileStatus;
50  import org.apache.hadoop.fs.FileSystem;
51  import org.apache.hadoop.fs.Path;
52  import org.apache.hadoop.hbase.ClusterStatus;
53  import org.apache.hadoop.hbase.HBaseTestingUtility;
54  import org.apache.hadoop.hbase.HColumnDescriptor;
55  import org.apache.hadoop.hbase.HConstants;
56  import org.apache.hadoop.hbase.HRegionInfo;
57  import org.apache.hadoop.hbase.HRegionLocation;
58  import org.apache.hadoop.hbase.HTableDescriptor;
59  import org.apache.hadoop.hbase.LargeTests;
60  import org.apache.hadoop.hbase.MiniHBaseCluster;
61  import org.apache.hadoop.hbase.ServerName;
62  import org.apache.hadoop.hbase.TableName;
63  import org.apache.hadoop.hbase.catalog.MetaEditor;
64  import org.apache.hadoop.hbase.client.Delete;
65  import org.apache.hadoop.hbase.client.Durability;
66  import org.apache.hadoop.hbase.client.Get;
67  import org.apache.hadoop.hbase.client.HBaseAdmin;
68  import org.apache.hadoop.hbase.client.HConnection;
69  import org.apache.hadoop.hbase.client.HConnectionManager;
70  import org.apache.hadoop.hbase.client.HTable;
71  import org.apache.hadoop.hbase.client.MetaScanner;
72  import org.apache.hadoop.hbase.client.Put;
73  import org.apache.hadoop.hbase.client.Result;
74  import org.apache.hadoop.hbase.client.ResultScanner;
75  import org.apache.hadoop.hbase.client.Scan;
76  import org.apache.hadoop.hbase.io.hfile.TestHFile;
77  import org.apache.hadoop.hbase.master.AssignmentManager;
78  import org.apache.hadoop.hbase.master.HMaster;
79  import org.apache.hadoop.hbase.master.RegionStates;
80  import org.apache.hadoop.hbase.master.TableLockManager;
81  import org.apache.hadoop.hbase.master.TableLockManager.TableLock;
82  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
83  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
84  import org.apache.hadoop.hbase.regionserver.HRegion;
85  import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
86  import org.apache.hadoop.hbase.regionserver.HRegionServer;
87  import org.apache.hadoop.hbase.regionserver.TestEndToEndSplitTransaction;
88  import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter;
89  import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
90  import org.apache.hadoop.hbase.util.HBaseFsck.HbckInfo;
91  import org.apache.hadoop.hbase.util.HBaseFsck.PrintingErrorReporter;
92  import org.apache.hadoop.hbase.util.HBaseFsck.TableInfo;
93  import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
94  import org.apache.hadoop.hbase.util.hbck.HbckTestingUtil;
95  import org.apache.zookeeper.KeeperException;
96  import org.junit.AfterClass;
97  import org.junit.BeforeClass;
98  import org.junit.Ignore;
99  import org.junit.Test;
100 import org.junit.experimental.categories.Category;
101 import org.junit.rules.TestName;
102 
103 import com.google.common.collect.Multimap;
104 
105 /**
106  * This tests HBaseFsck's ability to detect reasons for inconsistent tables.
107  */
108 @Category(LargeTests.class)
109 public class TestHBaseFsck {
110   final static Log LOG = LogFactory.getLog(TestHBaseFsck.class);
111   private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
112   private final static Configuration conf = TEST_UTIL.getConfiguration();
113   private final static String FAM_STR = "fam";
114   private final static byte[] FAM = Bytes.toBytes(FAM_STR);
115   private final static int REGION_ONLINE_TIMEOUT = 800;
116   private static RegionStates regionStates;
117   private static ExecutorService executorService;
118 
119   // for the instance, reset every test run
120   private HTable tbl;
121   private final static byte[][] SPLITS = new byte[][] { Bytes.toBytes("A"),
122     Bytes.toBytes("B"), Bytes.toBytes("C") };
123   // one row per region.
124   private final static byte[][] ROWKEYS= new byte[][] {
125     Bytes.toBytes("00"), Bytes.toBytes("50"), Bytes.toBytes("A0"), Bytes.toBytes("A5"),
126     Bytes.toBytes("B0"), Bytes.toBytes("B5"), Bytes.toBytes("C0"), Bytes.toBytes("C5") };
127 
128   @BeforeClass
129   public static void setUpBeforeClass() throws Exception {
130     TEST_UTIL.getConfiguration().setInt("hbase.regionserver.handler.count", 2);
131     TEST_UTIL.getConfiguration().setInt("hbase.regionserver.metahandler.count", 2);
132     TEST_UTIL.startMiniCluster(3);
133 
134     executorService = new ThreadPoolExecutor(1, Integer.MAX_VALUE, 60, TimeUnit.SECONDS,
135         new SynchronousQueue<Runnable>(), Threads.newDaemonThreadFactory("testhbck"));
136 
137     AssignmentManager assignmentManager =
138       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager();
139     regionStates = assignmentManager.getRegionStates();
140   }
141 
142   @AfterClass
143   public static void tearDownAfterClass() throws Exception {
144     TEST_UTIL.shutdownMiniCluster();
145   }
146 
147   @Test
148   public void testHBaseFsck() throws Exception {
149     assertNoErrors(doFsck(conf, false));
150     String table = "tableBadMetaAssign";
151     TEST_UTIL.createTable(Bytes.toBytes(table), FAM);
152 
153     // We created 1 table, should be fine
154     assertNoErrors(doFsck(conf, false));
155 
156     // Now let's mess it up and change the assignment in hbase:meta to
157     // point to a different region server
158     HTable meta = new HTable(conf, HTableDescriptor.META_TABLEDESC.getTableName(),
159         executorService);
160     Scan scan = new Scan();
161     scan.setStartRow(Bytes.toBytes(table+",,"));
162     ResultScanner scanner = meta.getScanner(scan);
163     HRegionInfo hri = null;
164 
165     Result res = scanner.next();
166     ServerName currServer =
167       ServerName.parseFrom(res.getValue(HConstants.CATALOG_FAMILY,
168           HConstants.SERVER_QUALIFIER));
169     long startCode = Bytes.toLong(res.getValue(HConstants.CATALOG_FAMILY,
170         HConstants.STARTCODE_QUALIFIER));
171 
172     for (JVMClusterUtil.RegionServerThread rs :
173         TEST_UTIL.getHBaseCluster().getRegionServerThreads()) {
174 
175       ServerName sn = rs.getRegionServer().getServerName();
176 
177       // When we find a diff RS, change the assignment and break
178       if (!currServer.getHostAndPort().equals(sn.getHostAndPort()) ||
179           startCode != sn.getStartcode()) {
180         Put put = new Put(res.getRow());
181         put.setDurability(Durability.SKIP_WAL);
182         put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
183           Bytes.toBytes(sn.getHostAndPort()));
184         put.add(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER,
185           Bytes.toBytes(sn.getStartcode()));
186         meta.put(put);
187         hri = HRegionInfo.getHRegionInfo(res);
188         break;
189       }
190     }
191 
192     // Try to fix the data
193     assertErrors(doFsck(conf, true), new ERROR_CODE[]{
194         ERROR_CODE.SERVER_DOES_NOT_MATCH_META});
195 
196     TEST_UTIL.getHBaseCluster().getMaster()
197       .getAssignmentManager().waitForAssignment(hri);
198 
199     // Should be fixed now
200     assertNoErrors(doFsck(conf, false));
201 
202     // comment needed - what is the purpose of this line
203     HTable t = new HTable(conf, Bytes.toBytes(table), executorService);
204     ResultScanner s = t.getScanner(new Scan());
205     s.close();
206     t.close();
207 
208     scanner.close();
209     meta.close();
210   }
211 
212   /**
213    * Create a new region in META.
214    */
215   private HRegionInfo createRegion(Configuration conf, final HTableDescriptor
216       htd, byte[] startKey, byte[] endKey)
217       throws IOException {
218     HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
219     HRegionInfo hri = new HRegionInfo(htd.getTableName(), startKey, endKey);
220     MetaEditor.addRegionToMeta(meta, hri);
221     meta.close();
222     return hri;
223   }
224 
225   /**
226    * Debugging method to dump the contents of meta.
227    */
228   private void dumpMeta(TableName tableName) throws IOException {
229     List<byte[]> metaRows = TEST_UTIL.getMetaTableRows(tableName);
230     for (byte[] row : metaRows) {
231       LOG.info(Bytes.toString(row));
232     }
233   }
234 
235   /**
236    * This method is used to undeploy a region -- close it and attempt to
237    * remove its state from the Master.
238    */
239   private void undeployRegion(HBaseAdmin admin, ServerName sn,
240       HRegionInfo hri) throws IOException, InterruptedException {
241     try {
242       HBaseFsckRepair.closeRegionSilentlyAndWait(admin, sn, hri);
243       if (!hri.isMetaTable()) {
244         admin.offline(hri.getRegionName());
245       }
246     } catch (IOException ioe) {
247       LOG.warn("Got exception when attempting to offline region "
248           + Bytes.toString(hri.getRegionName()), ioe);
249     }
250   }
251   /**
252    * Delete a region from assignments, meta, or completely from hdfs.
253    * @param unassign if true unassign region if assigned
254    * @param metaRow  if true remove region's row from META
255    * @param hdfs if true remove region's dir in HDFS
256    */
257   private void deleteRegion(Configuration conf, final HTableDescriptor htd,
258       byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
259       boolean hdfs) throws IOException, InterruptedException {
260     deleteRegion(conf, htd, startKey, endKey, unassign, metaRow, hdfs, false);
261   }
262 
263   /**
264    * Delete a region from assignments, meta, or completely from hdfs.
265    * @param unassign if true unassign region if assigned
266    * @param metaRow  if true remove region's row from META
267    * @param hdfs if true remove region's dir in HDFS
268    * @param regionInfoOnly if true remove a region dir's .regioninfo file
269    */
270   private void deleteRegion(Configuration conf, final HTableDescriptor htd,
271       byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
272       boolean hdfs, boolean regionInfoOnly) throws IOException, InterruptedException {
273     LOG.info("** Before delete:");
274     dumpMeta(htd.getTableName());
275 
276     Map<HRegionInfo, ServerName> hris = tbl.getRegionLocations();
277     for (Entry<HRegionInfo, ServerName> e: hris.entrySet()) {
278       HRegionInfo hri = e.getKey();
279       ServerName hsa = e.getValue();
280       if (Bytes.compareTo(hri.getStartKey(), startKey) == 0
281           && Bytes.compareTo(hri.getEndKey(), endKey) == 0) {
282 
283         LOG.info("RegionName: " +hri.getRegionNameAsString());
284         byte[] deleteRow = hri.getRegionName();
285 
286         if (unassign) {
287           LOG.info("Undeploying region " + hri + " from server " + hsa);
288           undeployRegion(new HBaseAdmin(conf), hsa, hri);
289         }
290 
291         if (regionInfoOnly) {
292           LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
293           Path rootDir = FSUtils.getRootDir(conf);
294           FileSystem fs = rootDir.getFileSystem(conf);
295           Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
296               hri.getEncodedName());
297           Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
298           fs.delete(hriPath, true);
299         }
300 
301         if (hdfs) {
302           LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
303           Path rootDir = FSUtils.getRootDir(conf);
304           FileSystem fs = rootDir.getFileSystem(conf);
305           Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
306               hri.getEncodedName());
307           HBaseFsck.debugLsr(conf, p);
308           boolean success = fs.delete(p, true);
309           LOG.info("Deleted " + p + " sucessfully? " + success);
310           HBaseFsck.debugLsr(conf, p);
311         }
312 
313         if (metaRow) {
314           HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
315           Delete delete = new Delete(deleteRow);
316           meta.delete(delete);
317         }
318       }
319       LOG.info(hri.toString() + hsa.toString());
320     }
321 
322     TEST_UTIL.getMetaTableRows(htd.getTableName());
323     LOG.info("*** After delete:");
324     dumpMeta(htd.getTableName());
325   }
326 
327   /**
328    * Setup a clean table before we start mucking with it.
329    *
330    * @throws IOException
331    * @throws InterruptedException
332    * @throws KeeperException
333    */
334   HTable setupTable(TableName tablename) throws Exception {
335     HTableDescriptor desc = new HTableDescriptor(tablename);
336     HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
337     desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
338     TEST_UTIL.getHBaseAdmin().createTable(desc, SPLITS);
339     tbl = new HTable(TEST_UTIL.getConfiguration(), tablename, executorService);
340 
341     List<Put> puts = new ArrayList<Put>();
342     for (byte[] row : ROWKEYS) {
343       Put p = new Put(row);
344       p.add(FAM, Bytes.toBytes("val"), row);
345       puts.add(p);
346     }
347     tbl.put(puts);
348     tbl.flushCommits();
349     return tbl;
350   }
351 
352   /**
353    * Counts the number of row to verify data loss or non-dataloss.
354    */
355   int countRows() throws IOException {
356      Scan s = new Scan();
357      ResultScanner rs = tbl.getScanner(s);
358      int i = 0;
359      while(rs.next() !=null) {
360        i++;
361      }
362      return i;
363   }
364 
365   /**
366    * delete table in preparation for next test
367    *
368    * @param tablename
369    * @throws IOException
370    */
371   void deleteTable(TableName tablename) throws IOException {
372     HBaseAdmin admin = new HBaseAdmin(conf);
373     admin.getConnection().clearRegionCache();
374     if (admin.isTableEnabled(tablename)) {
375       admin.disableTableAsync(tablename);
376     }
377     long totalWait = 0;
378     long maxWait = 30*1000;
379     long sleepTime = 250;
380     while (!admin.isTableDisabled(tablename)) {
381       try {
382         Thread.sleep(sleepTime);
383         totalWait += sleepTime;
384         if (totalWait >= maxWait) {
385           fail("Waited too long for table to be disabled + " + tablename);
386         }
387       } catch (InterruptedException e) {
388         e.printStackTrace();
389         fail("Interrupted when trying to disable table " + tablename);
390       }
391     }
392     admin.deleteTable(tablename);
393   }
394 
395   /**
396    * This creates a clean table and confirms that the table is clean.
397    */
398   @Test
399   public void testHBaseFsckClean() throws Exception {
400     assertNoErrors(doFsck(conf, false));
401     TableName table = TableName.valueOf("tableClean");
402     try {
403       HBaseFsck hbck = doFsck(conf, false);
404       assertNoErrors(hbck);
405 
406       setupTable(table);
407       assertEquals(ROWKEYS.length, countRows());
408 
409       // We created 1 table, should be fine
410       hbck = doFsck(conf, false);
411       assertNoErrors(hbck);
412       assertEquals(0, hbck.getOverlapGroups(table).size());
413       assertEquals(ROWKEYS.length, countRows());
414     } finally {
415       deleteTable(table);
416     }
417   }
418 
419   /**
420    * Test thread pooling in the case where there are more regions than threads
421    */
422   @Test
423   public void testHbckThreadpooling() throws Exception {
424     TableName table =
425         TableName.valueOf("tableDupeStartKey");
426     try {
427       // Create table with 4 regions
428       setupTable(table);
429 
430       // limit number of threads to 1.
431       Configuration newconf = new Configuration(conf);
432       newconf.setInt("hbasefsck.numthreads", 1);
433       assertNoErrors(doFsck(newconf, false));
434 
435       // We should pass without triggering a RejectedExecutionException
436     } finally {
437       deleteTable(table);
438     }
439   }
440 
441   @Test
442   public void testHbckFixOrphanTable() throws Exception {
443     TableName table = TableName.valueOf("tableInfo");
444     FileSystem fs = null;
445     Path tableinfo = null;
446     try {
447       setupTable(table);
448       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
449 
450       Path hbaseTableDir = FSUtils.getTableDir(
451           FSUtils.getRootDir(conf), table);
452       fs = hbaseTableDir.getFileSystem(conf);
453       FileStatus status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
454       tableinfo = status.getPath();
455       fs.rename(tableinfo, new Path("/.tableinfo"));
456 
457       //to report error if .tableinfo is missing.
458       HBaseFsck hbck = doFsck(conf, false);
459       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_TABLEINFO_FILE });
460 
461       // fix OrphanTable with default .tableinfo (htd not yet cached on master)
462       hbck = doFsck(conf, true);
463       assertNoErrors(hbck);
464       status = null;
465       status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
466       assertNotNull(status);
467 
468       HTableDescriptor htd = admin.getTableDescriptor(table);
469       htd.setValue("NOT_DEFAULT", "true");
470       admin.disableTable(table);
471       admin.modifyTable(table, htd);
472       admin.enableTable(table);
473       fs.delete(status.getPath(), true);
474 
475       // fix OrphanTable with cache
476       htd = admin.getTableDescriptor(table); // warms up cached htd on master
477       hbck = doFsck(conf, true);
478       assertNoErrors(hbck);
479       status = null;
480       status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
481       assertNotNull(status);
482       htd = admin.getTableDescriptor(table);
483       assertEquals(htd.getValue("NOT_DEFAULT"), "true");
484     } finally {
485       fs.rename(new Path("/.tableinfo"), tableinfo);
486       deleteTable(table);
487     }
488   }
489 
490   /**
491    * This create and fixes a bad table with regions that have a duplicate
492    * start key
493    */
494   @Test
495   public void testDupeStartKey() throws Exception {
496     TableName table =
497         TableName.valueOf("tableDupeStartKey");
498     try {
499       setupTable(table);
500       assertNoErrors(doFsck(conf, false));
501       assertEquals(ROWKEYS.length, countRows());
502 
503       // Now let's mess it up, by adding a region with a duplicate startkey
504       HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
505           Bytes.toBytes("A"), Bytes.toBytes("A2"));
506       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
507       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
508           .waitForAssignment(hriDupe);
509       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
510       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
511 
512       HBaseFsck hbck = doFsck(conf, false);
513       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
514             ERROR_CODE.DUPE_STARTKEYS});
515       assertEquals(2, hbck.getOverlapGroups(table).size());
516       assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.
517 
518       // fix the degenerate region.
519       doFsck(conf,true);
520 
521       // check that the degenerate region is gone and no data loss
522       HBaseFsck hbck2 = doFsck(conf,false);
523       assertNoErrors(hbck2);
524       assertEquals(0, hbck2.getOverlapGroups(table).size());
525       assertEquals(ROWKEYS.length, countRows());
526     } finally {
527       deleteTable(table);
528     }
529   }
530 
531   /**
532    * Get region info from local cluster.
533    */
534   Map<ServerName, List<String>> getDeployedHRIs(
535       final HBaseAdmin admin) throws IOException {
536     ClusterStatus status = admin.getClusterStatus();
537     Collection<ServerName> regionServers = status.getServers();
538     Map<ServerName, List<String>> mm =
539         new HashMap<ServerName, List<String>>();
540     HConnection connection = admin.getConnection();
541     for (ServerName hsi : regionServers) {
542       AdminProtos.AdminService.BlockingInterface server = connection.getAdmin(hsi);
543 
544       // list all online regions from this region server
545       List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(server);
546       List<String> regionNames = new ArrayList<String>();
547       for (HRegionInfo hri : regions) {
548         regionNames.add(hri.getRegionNameAsString());
549       }
550       mm.put(hsi, regionNames);
551     }
552     return mm;
553   }
554 
555   /**
556    * Returns the HSI a region info is on.
557    */
558   ServerName findDeployedHSI(Map<ServerName, List<String>> mm, HRegionInfo hri) {
559     for (Map.Entry<ServerName,List <String>> e : mm.entrySet()) {
560       if (e.getValue().contains(hri.getRegionNameAsString())) {
561         return e.getKey();
562       }
563     }
564     return null;
565   }
566 
567   /**
568    * This create and fixes a bad table with regions that have a duplicate
569    * start key
570    */
571   @Test
572   public void testDupeRegion() throws Exception {
573     TableName table =
574         TableName.valueOf("tableDupeRegion");
575     try {
576       setupTable(table);
577       assertNoErrors(doFsck(conf, false));
578       assertEquals(ROWKEYS.length, countRows());
579 
580       // Now let's mess it up, by adding a region with a duplicate startkey
581       HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
582           Bytes.toBytes("A"), Bytes.toBytes("B"));
583 
584       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
585       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
586           .waitForAssignment(hriDupe);
587       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
588       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
589 
590       // Yikes! The assignment manager can't tell between diff between two
591       // different regions with the same start/endkeys since it doesn't
592       // differentiate on ts/regionId!  We actually need to recheck
593       // deployments!
594       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
595       while (findDeployedHSI(getDeployedHRIs(admin), hriDupe) == null) {
596         Thread.sleep(250);
597       }
598 
599       LOG.debug("Finished assignment of dupe region");
600 
601       // TODO why is dupe region different from dupe start keys?
602       HBaseFsck hbck = doFsck(conf, false);
603       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
604             ERROR_CODE.DUPE_STARTKEYS});
605       assertEquals(2, hbck.getOverlapGroups(table).size());
606       assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.
607 
608       // fix the degenerate region.
609       doFsck(conf,true);
610 
611       // check that the degenerate region is gone and no data loss
612       HBaseFsck hbck2 = doFsck(conf,false);
613       assertNoErrors(hbck2);
614       assertEquals(0, hbck2.getOverlapGroups(table).size());
615       assertEquals(ROWKEYS.length, countRows());
616     } finally {
617       deleteTable(table);
618     }
619   }
620 
621   /**
622    * This creates and fixes a bad table with regions that has startkey == endkey
623    */
624   @Test
625   public void testDegenerateRegions() throws Exception {
626     TableName table =
627         TableName.valueOf("tableDegenerateRegions");
628     try {
629       setupTable(table);
630       assertNoErrors(doFsck(conf,false));
631       assertEquals(ROWKEYS.length, countRows());
632 
633       // Now let's mess it up, by adding a region with a duplicate startkey
634       HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
635           Bytes.toBytes("B"), Bytes.toBytes("B"));
636       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
637       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
638           .waitForAssignment(hriDupe);
639       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
640       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
641 
642       HBaseFsck hbck = doFsck(conf,false);
643       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DEGENERATE_REGION,
644           ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.DUPE_STARTKEYS});
645       assertEquals(2, hbck.getOverlapGroups(table).size());
646       assertEquals(ROWKEYS.length, countRows());
647 
648       // fix the degenerate region.
649       doFsck(conf,true);
650 
651       // check that the degenerate region is gone and no data loss
652       HBaseFsck hbck2 = doFsck(conf,false);
653       assertNoErrors(hbck2);
654       assertEquals(0, hbck2.getOverlapGroups(table).size());
655       assertEquals(ROWKEYS.length, countRows());
656     } finally {
657       deleteTable(table);
658     }
659   }
660 
661   /**
662    * This creates and fixes a bad table where a region is completely contained
663    * by another region.
664    */
665   @Test
666   public void testContainedRegionOverlap() throws Exception {
667     TableName table =
668         TableName.valueOf("tableContainedRegionOverlap");
669     try {
670       setupTable(table);
671       assertEquals(ROWKEYS.length, countRows());
672 
673       // Mess it up by creating an overlap in the metadata
674       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
675           Bytes.toBytes("A2"), Bytes.toBytes("B"));
676       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
677       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
678           .waitForAssignment(hriOverlap);
679       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
680       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
681 
682       HBaseFsck hbck = doFsck(conf, false);
683       assertErrors(hbck, new ERROR_CODE[] {
684           ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
685       assertEquals(2, hbck.getOverlapGroups(table).size());
686       assertEquals(ROWKEYS.length, countRows());
687 
688       // fix the problem.
689       doFsck(conf, true);
690 
691       // verify that overlaps are fixed
692       HBaseFsck hbck2 = doFsck(conf,false);
693       assertNoErrors(hbck2);
694       assertEquals(0, hbck2.getOverlapGroups(table).size());
695       assertEquals(ROWKEYS.length, countRows());
696     } finally {
697        deleteTable(table);
698     }
699   }
700 
701   /**
702    * This creates and fixes a bad table where an overlap group of
703    * 3 regions. Set HBaseFsck.maxMerge to 2 to trigger sideline overlapped
704    * region. Mess around the meta data so that closeRegion/offlineRegion
705    * throws exceptions.
706    */
707   @Test
708   public void testSidelineOverlapRegion() throws Exception {
709     TableName table =
710         TableName.valueOf("testSidelineOverlapRegion");
711     try {
712       setupTable(table);
713       assertEquals(ROWKEYS.length, countRows());
714 
715       // Mess it up by creating an overlap
716       MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
717       HMaster master = cluster.getMaster();
718       HRegionInfo hriOverlap1 = createRegion(conf, tbl.getTableDescriptor(),
719         Bytes.toBytes("A"), Bytes.toBytes("AB"));
720       master.assignRegion(hriOverlap1);
721       master.getAssignmentManager().waitForAssignment(hriOverlap1);
722       HRegionInfo hriOverlap2 = createRegion(conf, tbl.getTableDescriptor(),
723         Bytes.toBytes("AB"), Bytes.toBytes("B"));
724       master.assignRegion(hriOverlap2);
725       master.getAssignmentManager().waitForAssignment(hriOverlap2);
726 
727       HBaseFsck hbck = doFsck(conf, false);
728       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.DUPE_STARTKEYS,
729         ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.OVERLAP_IN_REGION_CHAIN});
730       assertEquals(3, hbck.getOverlapGroups(table).size());
731       assertEquals(ROWKEYS.length, countRows());
732 
733       // mess around the overlapped regions, to trigger NotServingRegionException
734       Multimap<byte[], HbckInfo> overlapGroups = hbck.getOverlapGroups(table);
735       ServerName serverName = null;
736       byte[] regionName = null;
737       for (HbckInfo hbi: overlapGroups.values()) {
738         if ("A".equals(Bytes.toString(hbi.getStartKey()))
739             && "B".equals(Bytes.toString(hbi.getEndKey()))) {
740           regionName = hbi.getRegionName();
741 
742           // get an RS not serving the region to force bad assignment info in to META.
743           int k = cluster.getServerWith(regionName);
744           for (int i = 0; i < 3; i++) {
745             if (i != k) {
746               HRegionServer rs = cluster.getRegionServer(i);
747               serverName = rs.getServerName();
748               break;
749             }
750           }
751 
752           HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
753           HBaseFsckRepair.closeRegionSilentlyAndWait(admin,
754             cluster.getRegionServer(k).getServerName(), hbi.getHdfsHRI());
755           admin.offline(regionName);
756           break;
757         }
758       }
759 
760       assertNotNull(regionName);
761       assertNotNull(serverName);
762       HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
763       Put put = new Put(regionName);
764       put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
765         Bytes.toBytes(serverName.getHostAndPort()));
766       meta.put(put);
767 
768       // fix the problem.
769       HBaseFsck fsck = new HBaseFsck(conf);
770       fsck.connect();
771       fsck.setDisplayFullReport(); // i.e. -details
772       fsck.setTimeLag(0);
773       fsck.setFixAssignments(true);
774       fsck.setFixMeta(true);
775       fsck.setFixHdfsHoles(true);
776       fsck.setFixHdfsOverlaps(true);
777       fsck.setFixHdfsOrphans(true);
778       fsck.setFixVersionFile(true);
779       fsck.setSidelineBigOverlaps(true);
780       fsck.setMaxMerge(2);
781       fsck.onlineHbck();
782 
783       // verify that overlaps are fixed, and there are less rows
784       // since one region is sidelined.
785       HBaseFsck hbck2 = doFsck(conf,false);
786       assertNoErrors(hbck2);
787       assertEquals(0, hbck2.getOverlapGroups(table).size());
788       assertTrue(ROWKEYS.length > countRows());
789     } finally {
790        deleteTable(table);
791     }
792   }
793 
794   /**
795    * This creates and fixes a bad table where a region is completely contained
796    * by another region, and there is a hole (sort of like a bad split)
797    */
798   @Test
799   public void testOverlapAndOrphan() throws Exception {
800     TableName table =
801         TableName.valueOf("tableOverlapAndOrphan");
802     try {
803       setupTable(table);
804       assertEquals(ROWKEYS.length, countRows());
805 
806       // Mess it up by creating an overlap in the metadata
807       TEST_UTIL.getHBaseAdmin().disableTable(table);
808       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
809           Bytes.toBytes("B"), true, true, false, true);
810       TEST_UTIL.getHBaseAdmin().enableTable(table);
811 
812       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
813           Bytes.toBytes("A2"), Bytes.toBytes("B"));
814       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
815       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
816           .waitForAssignment(hriOverlap);
817       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
818       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
819 
820       HBaseFsck hbck = doFsck(conf, false);
821       assertErrors(hbck, new ERROR_CODE[] {
822           ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
823           ERROR_CODE.HOLE_IN_REGION_CHAIN});
824 
825       // fix the problem.
826       doFsck(conf, true);
827 
828       // verify that overlaps are fixed
829       HBaseFsck hbck2 = doFsck(conf,false);
830       assertNoErrors(hbck2);
831       assertEquals(0, hbck2.getOverlapGroups(table).size());
832       assertEquals(ROWKEYS.length, countRows());
833     } finally {
834        deleteTable(table);
835     }
836   }
837 
838   /**
839    * This creates and fixes a bad table where a region overlaps two regions --
840    * a start key contained in another region and its end key is contained in
841    * yet another region.
842    */
843   @Test
844   public void testCoveredStartKey() throws Exception {
845     TableName table =
846         TableName.valueOf("tableCoveredStartKey");
847     try {
848       setupTable(table);
849       assertEquals(ROWKEYS.length, countRows());
850 
851       // Mess it up by creating an overlap in the metadata
852       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
853           Bytes.toBytes("A2"), Bytes.toBytes("B2"));
854       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
855       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
856           .waitForAssignment(hriOverlap);
857       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
858       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
859 
860       HBaseFsck hbck = doFsck(conf, false);
861       assertErrors(hbck, new ERROR_CODE[] {
862           ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
863           ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
864       assertEquals(3, hbck.getOverlapGroups(table).size());
865       assertEquals(ROWKEYS.length, countRows());
866 
867       // fix the problem.
868       doFsck(conf, true);
869 
870       // verify that overlaps are fixed
871       HBaseFsck hbck2 = doFsck(conf, false);
872       assertErrors(hbck2, new ERROR_CODE[0]);
873       assertEquals(0, hbck2.getOverlapGroups(table).size());
874       assertEquals(ROWKEYS.length, countRows());
875     } finally {
876       deleteTable(table);
877     }
878   }
879 
880   /**
881    * This creates and fixes a bad table with a missing region -- hole in meta
882    * and data missing in the fs.
883    */
884   @Test
885   public void testRegionHole() throws Exception {
886     TableName table =
887         TableName.valueOf("tableRegionHole");
888     try {
889       setupTable(table);
890       assertEquals(ROWKEYS.length, countRows());
891 
892       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
893       TEST_UTIL.getHBaseAdmin().disableTable(table);
894       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
895           Bytes.toBytes("C"), true, true, true);
896       TEST_UTIL.getHBaseAdmin().enableTable(table);
897 
898       HBaseFsck hbck = doFsck(conf, false);
899       assertErrors(hbck, new ERROR_CODE[] {
900           ERROR_CODE.HOLE_IN_REGION_CHAIN});
901       // holes are separate from overlap groups
902       assertEquals(0, hbck.getOverlapGroups(table).size());
903 
904       // fix hole
905       doFsck(conf, true);
906 
907       // check that hole fixed
908       assertNoErrors(doFsck(conf,false));
909       assertEquals(ROWKEYS.length - 2 , countRows()); // lost a region so lost a row
910     } finally {
911       deleteTable(table);
912     }
913   }
914 
915   /**
916    * This creates and fixes a bad table with a missing region -- hole in meta
917    * and data present but .regioinfino missing (an orphan hdfs region)in the fs.
918    */
919   @Test
920   public void testHDFSRegioninfoMissing() throws Exception {
921     TableName table =
922         TableName.valueOf("tableHDFSRegioininfoMissing");
923     try {
924       setupTable(table);
925       assertEquals(ROWKEYS.length, countRows());
926 
927       // Mess it up by leaving a hole in the meta data
928       TEST_UTIL.getHBaseAdmin().disableTable(table);
929       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
930           Bytes.toBytes("C"), true, true, false, true);
931       TEST_UTIL.getHBaseAdmin().enableTable(table);
932 
933       HBaseFsck hbck = doFsck(conf, false);
934       assertErrors(hbck, new ERROR_CODE[] {
935           ERROR_CODE.ORPHAN_HDFS_REGION,
936           ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
937           ERROR_CODE.HOLE_IN_REGION_CHAIN});
938       // holes are separate from overlap groups
939       assertEquals(0, hbck.getOverlapGroups(table).size());
940 
941       // fix hole
942       doFsck(conf, true);
943 
944       // check that hole fixed
945       assertNoErrors(doFsck(conf, false));
946       assertEquals(ROWKEYS.length, countRows());
947     } finally {
948       deleteTable(table);
949     }
950   }
951 
952   /**
953    * This creates and fixes a bad table with a region that is missing meta and
954    * not assigned to a region server.
955    */
956   @Test
957   public void testNotInMetaOrDeployedHole() throws Exception {
958     TableName table =
959         TableName.valueOf("tableNotInMetaOrDeployedHole");
960     try {
961       setupTable(table);
962       assertEquals(ROWKEYS.length, countRows());
963 
964       // Mess it up by leaving a hole in the meta data
965       TEST_UTIL.getHBaseAdmin().disableTable(table);
966       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
967           Bytes.toBytes("C"), true, true, false); // don't rm from fs
968       TEST_UTIL.getHBaseAdmin().enableTable(table);
969 
970       HBaseFsck hbck = doFsck(conf, false);
971       assertErrors(hbck, new ERROR_CODE[] {
972           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
973       // holes are separate from overlap groups
974       assertEquals(0, hbck.getOverlapGroups(table).size());
975 
976       // fix hole
977       assertErrors(doFsck(conf, true) , new ERROR_CODE[] {
978           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
979 
980       // check that hole fixed
981       assertNoErrors(doFsck(conf,false));
982       assertEquals(ROWKEYS.length, countRows());
983     } finally {
984       deleteTable(table);
985     }
986   }
987 
988   /**
989    * This creates fixes a bad table with a hole in meta.
990    */
991   @Test
992   public void testNotInMetaHole() throws Exception {
993     TableName table =
994         TableName.valueOf("tableNotInMetaHole");
995     try {
996       setupTable(table);
997       assertEquals(ROWKEYS.length, countRows());
998 
999       // Mess it up by leaving a hole in the meta data
1000       TEST_UTIL.getHBaseAdmin().disableTable(table);
1001       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1002           Bytes.toBytes("C"), false, true, false); // don't rm from fs
1003       TEST_UTIL.getHBaseAdmin().enableTable(table);
1004 
1005       HBaseFsck hbck = doFsck(conf, false);
1006       assertErrors(hbck, new ERROR_CODE[] {
1007           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1008       // holes are separate from overlap groups
1009       assertEquals(0, hbck.getOverlapGroups(table).size());
1010 
1011       // fix hole
1012       assertErrors(doFsck(conf, true) , new ERROR_CODE[] {
1013           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1014 
1015       // check that hole fixed
1016       assertNoErrors(doFsck(conf,false));
1017       assertEquals(ROWKEYS.length, countRows());
1018     } finally {
1019       deleteTable(table);
1020     }
1021   }
1022 
1023   /**
1024    * This creates and fixes a bad table with a region that is in meta but has
1025    * no deployment or data hdfs
1026    */
1027   @Test
1028   public void testNotInHdfs() throws Exception {
1029     TableName table =
1030         TableName.valueOf("tableNotInHdfs");
1031     try {
1032       setupTable(table);
1033       assertEquals(ROWKEYS.length, countRows());
1034 
1035       // make sure data in regions, if in hlog only there is no data loss
1036       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1037 
1038       // Mess it up by leaving a hole in the hdfs data
1039       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1040           Bytes.toBytes("C"), false, false, true); // don't rm meta
1041 
1042       HBaseFsck hbck = doFsck(conf, false);
1043       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1044       // holes are separate from overlap groups
1045       assertEquals(0, hbck.getOverlapGroups(table).size());
1046 
1047       // fix hole
1048       doFsck(conf, true);
1049 
1050       // check that hole fixed
1051       assertNoErrors(doFsck(conf,false));
1052       assertEquals(ROWKEYS.length - 2, countRows());
1053     } finally {
1054       deleteTable(table);
1055     }
1056   }
1057 
1058   /**
1059    * This creates entries in hbase:meta with no hdfs data.  This should cleanly
1060    * remove the table.
1061    */
1062   @Test
1063   public void testNoHdfsTable() throws Exception {
1064     TableName table = TableName.valueOf("NoHdfsTable");
1065     setupTable(table);
1066     assertEquals(ROWKEYS.length, countRows());
1067 
1068     // make sure data in regions, if in hlog only there is no data loss
1069     TEST_UTIL.getHBaseAdmin().flush(table.getName());
1070 
1071     // Mess it up by deleting hdfs dirs
1072     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""),
1073         Bytes.toBytes("A"), false, false, true); // don't rm meta
1074     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1075         Bytes.toBytes("B"), false, false, true); // don't rm meta
1076     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1077         Bytes.toBytes("C"), false, false, true); // don't rm meta
1078     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"),
1079         Bytes.toBytes(""), false, false, true); // don't rm meta
1080 
1081     // also remove the table directory in hdfs
1082     deleteTableDir(table);
1083 
1084     HBaseFsck hbck = doFsck(conf, false);
1085     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS,
1086         ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS,
1087         ERROR_CODE.NOT_IN_HDFS,});
1088     // holes are separate from overlap groups
1089     assertEquals(0, hbck.getOverlapGroups(table).size());
1090 
1091     // fix hole
1092     doFsck(conf, true); // detect dangling regions and remove those
1093 
1094     // check that hole fixed
1095     assertNoErrors(doFsck(conf,false));
1096     assertFalse("Table "+ table + " should have been deleted",
1097         TEST_UTIL.getHBaseAdmin().tableExists(table));
1098   }
1099 
1100   public void deleteTableDir(TableName table) throws IOException {
1101     Path rootDir = FSUtils.getRootDir(conf);
1102     FileSystem fs = rootDir.getFileSystem(conf);
1103     Path p = FSUtils.getTableDir(rootDir, table);
1104     HBaseFsck.debugLsr(conf, p);
1105     boolean success = fs.delete(p, true);
1106     LOG.info("Deleted " + p + " sucessfully? " + success);
1107   }
1108 
1109   /**
1110    * when the hbase.version file missing, It is fix the fault.
1111    */
1112   @Test
1113   public void testNoVersionFile() throws Exception {
1114     // delete the hbase.version file
1115     Path rootDir = FSUtils.getRootDir(conf);
1116     FileSystem fs = rootDir.getFileSystem(conf);
1117     Path versionFile = new Path(rootDir, HConstants.VERSION_FILE_NAME);
1118     fs.delete(versionFile, true);
1119 
1120     // test
1121     HBaseFsck hbck = doFsck(conf, false);
1122     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_VERSION_FILE });
1123     // fix hbase.version missing
1124     doFsck(conf, true);
1125 
1126     // no version file fixed
1127     assertNoErrors(doFsck(conf, false));
1128   }
1129 
1130   /**
1131    * The region is not deployed when the table is disabled.
1132    */
1133   @Test
1134   public void testRegionShouldNotBeDeployed() throws Exception {
1135     TableName table =
1136         TableName.valueOf("tableRegionShouldNotBeDeployed");
1137     try {
1138       LOG.info("Starting testRegionShouldNotBeDeployed.");
1139       MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
1140       assertTrue(cluster.waitForActiveAndReadyMaster());
1141 
1142 
1143       byte[][] SPLIT_KEYS = new byte[][] { new byte[0], Bytes.toBytes("aaa"),
1144           Bytes.toBytes("bbb"), Bytes.toBytes("ccc"), Bytes.toBytes("ddd") };
1145       HTableDescriptor htdDisabled = new HTableDescriptor(table);
1146       htdDisabled.addFamily(new HColumnDescriptor(FAM));
1147 
1148       // Write the .tableinfo
1149       FSTableDescriptors fstd = new FSTableDescriptors(conf);
1150       fstd.createTableDescriptor(htdDisabled);
1151       List<HRegionInfo> disabledRegions = TEST_UTIL.createMultiRegionsInMeta(
1152           TEST_UTIL.getConfiguration(), htdDisabled, SPLIT_KEYS);
1153 
1154       // Let's just assign everything to first RS
1155       HRegionServer hrs = cluster.getRegionServer(0);
1156 
1157       // Create region files.
1158       TEST_UTIL.getHBaseAdmin().disableTable(table);
1159       TEST_UTIL.getHBaseAdmin().enableTable(table);
1160 
1161       // Disable the table and close its regions
1162       TEST_UTIL.getHBaseAdmin().disableTable(table);
1163       HRegionInfo region = disabledRegions.remove(0);
1164       byte[] regionName = region.getRegionName();
1165 
1166       // The region should not be assigned currently
1167       assertTrue(cluster.getServerWith(regionName) == -1);
1168 
1169       // Directly open a region on a region server.
1170       // If going through AM/ZK, the region won't be open.
1171       // Even it is opened, AM will close it which causes
1172       // flakiness of this test.
1173       HRegion r = HRegion.openHRegion(
1174         region, htdDisabled, hrs.getWAL(region), conf);
1175       hrs.addToOnlineRegions(r);
1176 
1177       HBaseFsck hbck = doFsck(conf, false);
1178       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.SHOULD_NOT_BE_DEPLOYED });
1179 
1180       // fix this fault
1181       doFsck(conf, true);
1182 
1183       // check result
1184       assertNoErrors(doFsck(conf, false));
1185     } finally {
1186       TEST_UTIL.getHBaseAdmin().enableTable(table);
1187       deleteTable(table);
1188     }
1189   }
1190 
1191   /**
1192    * This creates two tables and mess both of them and fix them one by one
1193    */
1194   @Test
1195   public void testFixByTable() throws Exception {
1196     TableName table1 =
1197         TableName.valueOf("testFixByTable1");
1198     TableName table2 =
1199         TableName.valueOf("testFixByTable2");
1200     try {
1201       setupTable(table1);
1202       // make sure data in regions, if in hlog only there is no data loss
1203       TEST_UTIL.getHBaseAdmin().flush(table1.getName());
1204       // Mess them up by leaving a hole in the hdfs data
1205       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1206         Bytes.toBytes("C"), false, false, true); // don't rm meta
1207 
1208       setupTable(table2);
1209       // make sure data in regions, if in hlog only there is no data loss
1210       TEST_UTIL.getHBaseAdmin().flush(table2.getName());
1211       // Mess them up by leaving a hole in the hdfs data
1212       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1213         Bytes.toBytes("C"), false, false, true); // don't rm meta
1214 
1215       HBaseFsck hbck = doFsck(conf, false);
1216       assertErrors(hbck, new ERROR_CODE[] {
1217         ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS});
1218 
1219       // fix hole in table 1
1220       doFsck(conf, true, table1);
1221       // check that hole in table 1 fixed
1222       assertNoErrors(doFsck(conf, false, table1));
1223       // check that hole in table 2 still there
1224       assertErrors(doFsck(conf, false, table2),
1225         new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1226 
1227       // fix hole in table 2
1228       doFsck(conf, true, table2);
1229       // check that hole in both tables fixed
1230       assertNoErrors(doFsck(conf, false));
1231       assertEquals(ROWKEYS.length - 2, countRows());
1232     } finally {
1233       deleteTable(table1);
1234       deleteTable(table2);
1235     }
1236   }
1237   /**
1238    * A split parent in meta, in hdfs, and not deployed
1239    */
1240   @Test
1241   public void testLingeringSplitParent() throws Exception {
1242     TableName table =
1243         TableName.valueOf("testLingeringSplitParent");
1244     HTable meta = null;
1245     try {
1246       setupTable(table);
1247       assertEquals(ROWKEYS.length, countRows());
1248 
1249       // make sure data in regions, if in hlog only there is no data loss
1250       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1251       HRegionLocation location = tbl.getRegionLocation("B");
1252 
1253       // Delete one region from meta, but not hdfs, unassign it.
1254       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1255         Bytes.toBytes("C"), true, true, false);
1256 
1257       // Create a new meta entry to fake it as a split parent.
1258       meta = new HTable(conf, HTableDescriptor.META_TABLEDESC.getTableName(),
1259           executorService);
1260       HRegionInfo hri = location.getRegionInfo();
1261 
1262       HRegionInfo a = new HRegionInfo(tbl.getName(),
1263         Bytes.toBytes("B"), Bytes.toBytes("BM"));
1264       HRegionInfo b = new HRegionInfo(tbl.getName(),
1265         Bytes.toBytes("BM"), Bytes.toBytes("C"));
1266 
1267       hri.setOffline(true);
1268       hri.setSplit(true);
1269 
1270       MetaEditor.addRegionToMeta(meta, hri, a, b);
1271       meta.flushCommits();
1272       TEST_UTIL.getHBaseAdmin().flush(TableName.META_TABLE_NAME.getName());
1273 
1274       HBaseFsck hbck = doFsck(conf, false);
1275       assertErrors(hbck, new ERROR_CODE[] {
1276         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1277 
1278       // regular repair cannot fix lingering split parent
1279       hbck = doFsck(conf, true);
1280       assertErrors(hbck, new ERROR_CODE[] {
1281         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1282       assertFalse(hbck.shouldRerun());
1283       hbck = doFsck(conf, false);
1284       assertErrors(hbck, new ERROR_CODE[] {
1285         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1286 
1287       // fix lingering split parent
1288       hbck = new HBaseFsck(conf);
1289       hbck.connect();
1290       hbck.setDisplayFullReport(); // i.e. -details
1291       hbck.setTimeLag(0);
1292       hbck.setFixSplitParents(true);
1293       hbck.onlineHbck();
1294       assertTrue(hbck.shouldRerun());
1295 
1296       Get get = new Get(hri.getRegionName());
1297       Result result = meta.get(get);
1298       assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
1299         HConstants.SPLITA_QUALIFIER).isEmpty());
1300       assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
1301         HConstants.SPLITB_QUALIFIER).isEmpty());
1302       TEST_UTIL.getHBaseAdmin().flush(TableName.META_TABLE_NAME.getName());
1303 
1304       // fix other issues
1305       doFsck(conf, true);
1306 
1307       // check that all are fixed
1308       assertNoErrors(doFsck(conf, false));
1309       assertEquals(ROWKEYS.length, countRows());
1310     } finally {
1311       deleteTable(table);
1312       IOUtils.closeQuietly(meta);
1313     }
1314   }
1315 
1316   /**
1317    * Tests that LINGERING_SPLIT_PARENT is not erroneously reported for
1318    * valid cases where the daughters are there.
1319    */
1320   @Test
1321   public void testValidLingeringSplitParent() throws Exception {
1322     TableName table =
1323         TableName.valueOf("testLingeringSplitParent");
1324     HTable meta = null;
1325     try {
1326       setupTable(table);
1327       assertEquals(ROWKEYS.length, countRows());
1328 
1329       // make sure data in regions, if in hlog only there is no data loss
1330       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1331       HRegionLocation location = tbl.getRegionLocation("B");
1332 
1333       meta = new HTable(conf, HTableDescriptor.META_TABLEDESC.getTableName());
1334       HRegionInfo hri = location.getRegionInfo();
1335 
1336       // do a regular split
1337       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
1338       byte[] regionName = location.getRegionInfo().getRegionName();
1339       admin.split(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
1340       TestEndToEndSplitTransaction.blockUntilRegionSplit(
1341           TEST_UTIL.getConfiguration(), 60000, regionName, true);
1342 
1343       // TODO: fixHdfsHoles does not work against splits, since the parent dir lingers on
1344       // for some time until children references are deleted. HBCK erroneously sees this as
1345       // overlapping regions
1346       HBaseFsck hbck = doFsck(conf, true, true, false, false, false, true, true, true, false, false, null);
1347       assertErrors(hbck, new ERROR_CODE[] {}); //no LINGERING_SPLIT_PARENT reported
1348 
1349       // assert that the split hbase:meta entry is still there.
1350       Get get = new Get(hri.getRegionName());
1351       Result result = meta.get(get);
1352       assertNotNull(result);
1353       assertNotNull(HRegionInfo.getHRegionInfo(result));
1354 
1355       assertEquals(ROWKEYS.length, countRows());
1356 
1357       // assert that we still have the split regions
1358       assertEquals(tbl.getStartKeys().length, SPLITS.length + 1 + 1); //SPLITS + 1 is # regions pre-split.
1359       assertNoErrors(doFsck(conf, false));
1360     } finally {
1361       deleteTable(table);
1362       IOUtils.closeQuietly(meta);
1363     }
1364   }
1365 
1366   /**
1367    * Split crashed after write to hbase:meta finished for the parent region, but
1368    * failed to write daughters (pre HBASE-7721 codebase)
1369    */
1370   @Test
1371   public void testSplitDaughtersNotInMeta() throws Exception {
1372     TableName table =
1373         TableName.valueOf("testSplitdaughtersNotInMeta");
1374     HTable meta = null;
1375     try {
1376       setupTable(table);
1377       assertEquals(ROWKEYS.length, countRows());
1378 
1379       // make sure data in regions, if in hlog only there is no data loss
1380       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1381       HRegionLocation location = tbl.getRegionLocation("B");
1382 
1383       meta = new HTable(conf, HTableDescriptor.META_TABLEDESC.getTableName());
1384       HRegionInfo hri = location.getRegionInfo();
1385 
1386       // do a regular split
1387       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
1388       byte[] regionName = location.getRegionInfo().getRegionName();
1389       admin.split(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
1390       TestEndToEndSplitTransaction.blockUntilRegionSplit(
1391           TEST_UTIL.getConfiguration(), 60000, regionName, true);
1392 
1393       PairOfSameType<HRegionInfo> daughters = HRegionInfo.getDaughterRegions(meta.get(new Get(regionName)));
1394 
1395       // Delete daughter regions from meta, but not hdfs, unassign it.
1396       Map<HRegionInfo, ServerName> hris = tbl.getRegionLocations();
1397       undeployRegion(admin, hris.get(daughters.getFirst()), daughters.getFirst());
1398       undeployRegion(admin, hris.get(daughters.getSecond()), daughters.getSecond());
1399 
1400       meta.delete(new Delete(daughters.getFirst().getRegionName()));
1401       meta.delete(new Delete(daughters.getSecond().getRegionName()));
1402       meta.flushCommits();
1403 
1404       HBaseFsck hbck = doFsck(conf, false);
1405       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1406           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN}); //no LINGERING_SPLIT_PARENT
1407 
1408       // now fix it. The fix should not revert the region split, but add daughters to META
1409       hbck = doFsck(conf, true, true, false, false, false, false, false, false, false, false, null);
1410       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1411           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1412 
1413       // assert that the split hbase:meta entry is still there.
1414       Get get = new Get(hri.getRegionName());
1415       Result result = meta.get(get);
1416       assertNotNull(result);
1417       assertNotNull(HRegionInfo.getHRegionInfo(result));
1418 
1419       assertEquals(ROWKEYS.length, countRows());
1420 
1421       // assert that we still have the split regions
1422       assertEquals(tbl.getStartKeys().length, SPLITS.length + 1 + 1); //SPLITS + 1 is # regions pre-split.
1423       assertNoErrors(doFsck(conf, false)); //should be fixed by now
1424     } finally {
1425       deleteTable(table);
1426       IOUtils.closeQuietly(meta);
1427     }
1428   }
1429 
1430   /**
1431    * This creates and fixes a bad table with a missing region which is the 1st region -- hole in
1432    * meta and data missing in the fs.
1433    */
1434   @Test(timeout=120000)
1435   public void testMissingFirstRegion() throws Exception {
1436     TableName table =
1437         TableName.valueOf("testMissingFirstRegion");
1438     try {
1439       setupTable(table);
1440       assertEquals(ROWKEYS.length, countRows());
1441 
1442       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1443       TEST_UTIL.getHBaseAdmin().disableTable(table);
1444       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""), Bytes.toBytes("A"), true,
1445           true, true);
1446       TEST_UTIL.getHBaseAdmin().enableTable(table);
1447 
1448       HBaseFsck hbck = doFsck(conf, false);
1449       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY });
1450       // fix hole
1451       doFsck(conf, true);
1452       // check that hole fixed
1453       assertNoErrors(doFsck(conf, false));
1454     } finally {
1455       deleteTable(table);
1456     }
1457   }
1458 
1459   /**
1460    * This creates and fixes a bad table with missing last region -- hole in meta and data missing in
1461    * the fs.
1462    */
1463   @Test(timeout=120000)
1464   public void testMissingLastRegion() throws Exception {
1465     TableName table =
1466         TableName.valueOf("testMissingLastRegion");
1467     try {
1468       setupTable(table);
1469       assertEquals(ROWKEYS.length, countRows());
1470 
1471       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1472       TEST_UTIL.getHBaseAdmin().disableTable(table);
1473       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"), Bytes.toBytes(""), true,
1474           true, true);
1475       TEST_UTIL.getHBaseAdmin().enableTable(table);
1476 
1477       HBaseFsck hbck = doFsck(conf, false);
1478       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY });
1479       // fix hole
1480       doFsck(conf, true);
1481       // check that hole fixed
1482       assertNoErrors(doFsck(conf, false));
1483     } finally {
1484       deleteTable(table);
1485     }
1486   }
1487 
1488   /**
1489    * Test -noHdfsChecking option can detect and fix assignments issue.
1490    */
1491   @Test
1492   public void testFixAssignmentsAndNoHdfsChecking() throws Exception {
1493     TableName table =
1494         TableName.valueOf("testFixAssignmentsAndNoHdfsChecking");
1495     try {
1496       setupTable(table);
1497       assertEquals(ROWKEYS.length, countRows());
1498 
1499       // Mess it up by closing a region
1500       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1501         Bytes.toBytes("B"), true, false, false, false);
1502 
1503       // verify there is no other errors
1504       HBaseFsck hbck = doFsck(conf, false);
1505       assertErrors(hbck, new ERROR_CODE[] {
1506         ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1507 
1508       // verify that noHdfsChecking report the same errors
1509       HBaseFsck fsck = new HBaseFsck(conf);
1510       fsck.connect();
1511       fsck.setDisplayFullReport(); // i.e. -details
1512       fsck.setTimeLag(0);
1513       fsck.setCheckHdfs(false);
1514       fsck.onlineHbck();
1515       assertErrors(fsck, new ERROR_CODE[] {
1516         ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1517 
1518       // verify that fixAssignments works fine with noHdfsChecking
1519       fsck = new HBaseFsck(conf);
1520       fsck.connect();
1521       fsck.setDisplayFullReport(); // i.e. -details
1522       fsck.setTimeLag(0);
1523       fsck.setCheckHdfs(false);
1524       fsck.setFixAssignments(true);
1525       fsck.onlineHbck();
1526       assertTrue(fsck.shouldRerun());
1527       fsck.onlineHbck();
1528       assertNoErrors(fsck);
1529 
1530       assertEquals(ROWKEYS.length, countRows());
1531     } finally {
1532       deleteTable(table);
1533     }
1534   }
1535 
1536   /**
1537    * Test -noHdfsChecking option can detect region is not in meta but deployed.
1538    * However, it can not fix it without checking Hdfs because we need to get
1539    * the region info from Hdfs in this case, then to patch the meta.
1540    */
1541   @Test
1542   public void testFixMetaNotWorkingWithNoHdfsChecking() throws Exception {
1543     TableName table =
1544         TableName.valueOf("testFixMetaNotWorkingWithNoHdfsChecking");
1545     try {
1546       setupTable(table);
1547       assertEquals(ROWKEYS.length, countRows());
1548 
1549       // Mess it up by deleting a region from the metadata
1550       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1551         Bytes.toBytes("B"), false, true, false, false);
1552 
1553       // verify there is no other errors
1554       HBaseFsck hbck = doFsck(conf, false);
1555       assertErrors(hbck, new ERROR_CODE[] {
1556         ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1557 
1558       // verify that noHdfsChecking report the same errors
1559       HBaseFsck fsck = new HBaseFsck(conf);
1560       fsck.connect();
1561       fsck.setDisplayFullReport(); // i.e. -details
1562       fsck.setTimeLag(0);
1563       fsck.setCheckHdfs(false);
1564       fsck.onlineHbck();
1565       assertErrors(fsck, new ERROR_CODE[] {
1566         ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1567 
1568       // verify that fixMeta doesn't work with noHdfsChecking
1569       fsck = new HBaseFsck(conf);
1570       fsck.connect();
1571       fsck.setDisplayFullReport(); // i.e. -details
1572       fsck.setTimeLag(0);
1573       fsck.setCheckHdfs(false);
1574       fsck.setFixAssignments(true);
1575       fsck.setFixMeta(true);
1576       fsck.onlineHbck();
1577       assertFalse(fsck.shouldRerun());
1578       assertErrors(fsck, new ERROR_CODE[] {
1579         ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1580     } finally {
1581       deleteTable(table);
1582     }
1583   }
1584 
1585   /**
1586    * Test -fixHdfsHoles doesn't work with -noHdfsChecking option,
1587    * and -noHdfsChecking can't detect orphan Hdfs region.
1588    */
1589   @Test
1590   public void testFixHdfsHolesNotWorkingWithNoHdfsChecking() throws Exception {
1591     TableName table =
1592         TableName.valueOf("testFixHdfsHolesNotWorkingWithNoHdfsChecking");
1593     try {
1594       setupTable(table);
1595       assertEquals(ROWKEYS.length, countRows());
1596 
1597       // Mess it up by creating an overlap in the metadata
1598       TEST_UTIL.getHBaseAdmin().disableTable(table);
1599       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1600         Bytes.toBytes("B"), true, true, false, true);
1601       TEST_UTIL.getHBaseAdmin().enableTable(table);
1602 
1603       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
1604         Bytes.toBytes("A2"), Bytes.toBytes("B"));
1605       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
1606       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
1607         .waitForAssignment(hriOverlap);
1608       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
1609       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
1610 
1611       HBaseFsck hbck = doFsck(conf, false);
1612       assertErrors(hbck, new ERROR_CODE[] {
1613         ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1614         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1615 
1616       // verify that noHdfsChecking can't detect ORPHAN_HDFS_REGION
1617       HBaseFsck fsck = new HBaseFsck(conf);
1618       fsck.connect();
1619       fsck.setDisplayFullReport(); // i.e. -details
1620       fsck.setTimeLag(0);
1621       fsck.setCheckHdfs(false);
1622       fsck.onlineHbck();
1623       assertErrors(fsck, new ERROR_CODE[] {
1624         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1625 
1626       // verify that fixHdfsHoles doesn't work with noHdfsChecking
1627       fsck = new HBaseFsck(conf);
1628       fsck.connect();
1629       fsck.setDisplayFullReport(); // i.e. -details
1630       fsck.setTimeLag(0);
1631       fsck.setCheckHdfs(false);
1632       fsck.setFixHdfsHoles(true);
1633       fsck.setFixHdfsOverlaps(true);
1634       fsck.setFixHdfsOrphans(true);
1635       fsck.onlineHbck();
1636       assertFalse(fsck.shouldRerun());
1637       assertErrors(fsck, new ERROR_CODE[] {
1638         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1639     } finally {
1640       if (TEST_UTIL.getHBaseAdmin().isTableDisabled(table)) {
1641         TEST_UTIL.getHBaseAdmin().enableTable(table);
1642       }
1643       deleteTable(table);
1644     }
1645   }
1646 
1647   /**
1648    * We don't have an easy way to verify that a flush completed, so we loop until we find a
1649    * legitimate hfile and return it.
1650    * @param fs
1651    * @param table
1652    * @return Path of a flushed hfile.
1653    * @throws IOException
1654    */
1655   Path getFlushedHFile(FileSystem fs, TableName table) throws IOException {
1656     Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
1657     Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
1658     Path famDir = new Path(regionDir, FAM_STR);
1659 
1660     // keep doing this until we get a legit hfile
1661     while (true) {
1662       FileStatus[] hfFss = fs.listStatus(famDir);
1663       if (hfFss.length == 0) {
1664         continue;
1665       }
1666       for (FileStatus hfs : hfFss) {
1667         if (!hfs.isDir()) {
1668           return hfs.getPath();
1669         }
1670       }
1671     }
1672   }
1673 
1674   /**
1675    * This creates a table and then corrupts an hfile.  Hbck should quarantine the file.
1676    */
1677   @Test(timeout=180000)
1678   public void testQuarantineCorruptHFile() throws Exception {
1679     TableName table = TableName.valueOf(name.getMethodName());
1680     try {
1681       setupTable(table);
1682       assertEquals(ROWKEYS.length, countRows());
1683       TEST_UTIL.getHBaseAdmin().flush(table.getName()); // flush is async.
1684 
1685       FileSystem fs = FileSystem.get(conf);
1686       Path hfile = getFlushedHFile(fs, table);
1687 
1688       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1689       TEST_UTIL.getHBaseAdmin().disableTable(table);
1690 
1691       // create new corrupt file called deadbeef (valid hfile name)
1692       Path corrupt = new Path(hfile.getParent(), "deadbeef");
1693       TestHFile.truncateFile(fs, hfile, corrupt);
1694       LOG.info("Created corrupted file " + corrupt);
1695       HBaseFsck.debugLsr(conf, FSUtils.getRootDir(conf));
1696 
1697       // we cannot enable here because enable never finished due to the corrupt region.
1698       HBaseFsck res = HbckTestingUtil.doHFileQuarantine(conf, table);
1699       assertEquals(res.getRetCode(), 0);
1700       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
1701       assertEquals(hfcc.getHFilesChecked(), 5);
1702       assertEquals(hfcc.getCorrupted().size(), 1);
1703       assertEquals(hfcc.getFailures().size(), 0);
1704       assertEquals(hfcc.getQuarantined().size(), 1);
1705       assertEquals(hfcc.getMissing().size(), 0);
1706 
1707       // Its been fixed, verify that we can enable.
1708       TEST_UTIL.getHBaseAdmin().enableTable(table);
1709     } finally {
1710       deleteTable(table);
1711     }
1712   }
1713 
1714   /**
1715   * Test that use this should have a timeout, because this method could potentially wait forever.
1716   */
1717   private void doQuarantineTest(TableName table, HBaseFsck hbck, int check,
1718                                 int corrupt, int fail, int quar, int missing) throws Exception {
1719     try {
1720       setupTable(table);
1721       assertEquals(ROWKEYS.length, countRows());
1722       TEST_UTIL.getHBaseAdmin().flush(table.getName()); // flush is async.
1723 
1724       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1725       TEST_UTIL.getHBaseAdmin().disableTable(table);
1726 
1727       String[] args = {"-sidelineCorruptHFiles", "-repairHoles", "-ignorePreCheckPermission",
1728           table.getNameAsString()};
1729       ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1730       HBaseFsck res = hbck.exec(exec, args);
1731 
1732       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
1733       assertEquals(hfcc.getHFilesChecked(), check);
1734       assertEquals(hfcc.getCorrupted().size(), corrupt);
1735       assertEquals(hfcc.getFailures().size(), fail);
1736       assertEquals(hfcc.getQuarantined().size(), quar);
1737       assertEquals(hfcc.getMissing().size(), missing);
1738 
1739       // its been fixed, verify that we can enable
1740       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
1741       admin.enableTableAsync(table);
1742       while (!admin.isTableEnabled(table)) {
1743         try {
1744           Thread.sleep(250);
1745         } catch (InterruptedException e) {
1746           e.printStackTrace();
1747           fail("Interrupted when trying to enable table " + table);
1748         }
1749       }
1750     } finally {
1751       deleteTable(table);
1752     }
1753   }
1754 
1755   /**
1756    * This creates a table and simulates the race situation where a concurrent compaction or split
1757    * has removed an hfile after the corruption checker learned about it.
1758    */
1759   @Test(timeout=180000)
1760   public void testQuarantineMissingHFile() throws Exception {
1761     TableName table = TableName.valueOf(name.getMethodName());
1762     ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1763     // inject a fault in the hfcc created.
1764     final FileSystem fs = FileSystem.get(conf);
1765     HBaseFsck hbck = new HBaseFsck(conf, exec) {
1766       @Override
1767       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
1768         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1769           boolean attemptedFirstHFile = false;
1770           @Override
1771           protected void checkHFile(Path p) throws IOException {
1772             if (!attemptedFirstHFile) {
1773               attemptedFirstHFile = true;
1774               assertTrue(fs.delete(p, true)); // make sure delete happened.
1775             }
1776             super.checkHFile(p);
1777           }
1778         };
1779       }
1780     };
1781     doQuarantineTest(table, hbck, 4, 0, 0, 0, 1); // 4 attempted, but 1 missing.
1782   }
1783 
1784   /**
1785    * This creates a table and simulates the race situation where a concurrent compaction or split
1786    * has removed an colfam dir before the corruption checker got to it.
1787    */
1788   // Disabled because fails sporadically.  Is this test right?  Timing-wise, there could be no
1789   // files in a column family on initial creation -- as suggested by Matteo.
1790   @Ignore @Test(timeout=180000)
1791   public void testQuarantineMissingFamdir() throws Exception {
1792     TableName table = TableName.valueOf(name.getMethodName());
1793     ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1794     // inject a fault in the hfcc created.
1795     final FileSystem fs = FileSystem.get(conf);
1796     HBaseFsck hbck = new HBaseFsck(conf, exec) {
1797       @Override
1798       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
1799         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1800           boolean attemptedFirstFamDir = false;
1801           @Override
1802           protected void checkColFamDir(Path p) throws IOException {
1803             if (!attemptedFirstFamDir) {
1804               attemptedFirstFamDir = true;
1805               assertTrue(fs.delete(p, true)); // make sure delete happened.
1806             }
1807             super.checkColFamDir(p);
1808           }
1809         };
1810       }
1811     };
1812     doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
1813   }
1814 
1815   /**
1816    * This creates a table and simulates the race situation where a concurrent compaction or split
1817    * has removed a region dir before the corruption checker got to it.
1818    */
1819   @Test(timeout=180000)
1820   public void testQuarantineMissingRegionDir() throws Exception {
1821     TableName table = TableName.valueOf(name.getMethodName());
1822     ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1823     // inject a fault in the hfcc created.
1824     final FileSystem fs = FileSystem.get(conf);
1825     HBaseFsck hbck = new HBaseFsck(conf, exec) {
1826       @Override
1827       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
1828         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1829           boolean attemptedFirstRegionDir = false;
1830           @Override
1831           protected void checkRegionDir(Path p) throws IOException {
1832             if (!attemptedFirstRegionDir) {
1833               attemptedFirstRegionDir = true;
1834               assertTrue(fs.delete(p, true)); // make sure delete happened.
1835             }
1836             super.checkRegionDir(p);
1837           }
1838         };
1839       }
1840     };
1841     doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
1842   }
1843 
1844   /**
1845    * Test fixing lingering reference file.
1846    */
1847   @Test
1848   public void testLingeringReferenceFile() throws Exception {
1849     TableName table =
1850         TableName.valueOf("testLingeringReferenceFile");
1851     try {
1852       setupTable(table);
1853       assertEquals(ROWKEYS.length, countRows());
1854 
1855       // Mess it up by creating a fake reference file
1856       FileSystem fs = FileSystem.get(conf);
1857       Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
1858       Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
1859       Path famDir = new Path(regionDir, FAM_STR);
1860       Path fakeReferenceFile = new Path(famDir, "fbce357483ceea.12144538");
1861       fs.create(fakeReferenceFile);
1862 
1863       HBaseFsck hbck = doFsck(conf, false);
1864       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LINGERING_REFERENCE_HFILE });
1865       // fix reference file
1866       doFsck(conf, true);
1867       // check that reference file fixed
1868       assertNoErrors(doFsck(conf, false));
1869     } finally {
1870       deleteTable(table);
1871     }
1872   }
1873 
1874   /**
1875    * Test mission REGIONINFO_QUALIFIER in hbase:meta
1876    */
1877   @Test
1878   public void testMissingRegionInfoQualifier() throws Exception {
1879     TableName table =
1880         TableName.valueOf("testMissingRegionInfoQualifier");
1881     try {
1882       setupTable(table);
1883 
1884       // Mess it up by removing the RegionInfo for one region.
1885       final List<Delete> deletes = new LinkedList<Delete>();
1886       HTable meta = new HTable(conf, HTableDescriptor.META_TABLEDESC.getTableName());
1887       MetaScanner.metaScan(conf, new MetaScanner.MetaScannerVisitor() {
1888 
1889         @Override
1890         public boolean processRow(Result rowResult) throws IOException {
1891           if(!MetaScanner.getHRegionInfo(rowResult).getTable().isSystemTable()) {
1892             Delete delete = new Delete(rowResult.getRow());
1893             delete.deleteColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
1894             deletes.add(delete);
1895           }
1896           return true;
1897         }
1898 
1899         @Override
1900         public void close() throws IOException {
1901         }
1902       });
1903       meta.delete(deletes);
1904 
1905       // Mess it up by creating a fake hbase:meta entry with no associated RegionInfo
1906       meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
1907         HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER, Bytes.toBytes("node1:60020")));
1908       meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
1909         HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER, Bytes.toBytes(1362150791183L)));
1910       meta.close();
1911 
1912       HBaseFsck hbck = doFsck(conf, false);
1913       assertTrue(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
1914 
1915       // fix reference file
1916       hbck = doFsck(conf, true);
1917 
1918       // check that reference file fixed
1919       assertFalse(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
1920     } finally {
1921       deleteTable(table);
1922     }
1923   }
1924 
1925 
1926   /**
1927    * Test pluggable error reporter. It can be plugged in
1928    * from system property or configuration.
1929    */
1930   @Test
1931   public void testErrorReporter() throws Exception {
1932     try {
1933       MockErrorReporter.calledCount = 0;
1934       doFsck(conf, false);
1935       assertEquals(MockErrorReporter.calledCount, 0);
1936 
1937       conf.set("hbasefsck.errorreporter", MockErrorReporter.class.getName());
1938       doFsck(conf, false);
1939       assertTrue(MockErrorReporter.calledCount > 20);
1940     } finally {
1941       conf.set("hbasefsck.errorreporter",
1942         PrintingErrorReporter.class.getName());
1943       MockErrorReporter.calledCount = 0;
1944     }
1945   }
1946 
1947   static class MockErrorReporter implements ErrorReporter {
1948     static int calledCount = 0;
1949 
1950     @Override
1951     public void clear() {
1952       calledCount++;
1953     }
1954 
1955     @Override
1956     public void report(String message) {
1957       calledCount++;
1958     }
1959 
1960     @Override
1961     public void reportError(String message) {
1962       calledCount++;
1963     }
1964 
1965     @Override
1966     public void reportError(ERROR_CODE errorCode, String message) {
1967       calledCount++;
1968     }
1969 
1970     @Override
1971     public void reportError(ERROR_CODE errorCode, String message, TableInfo table) {
1972       calledCount++;
1973     }
1974 
1975     @Override
1976     public void reportError(ERROR_CODE errorCode,
1977         String message, TableInfo table, HbckInfo info) {
1978       calledCount++;
1979     }
1980 
1981     @Override
1982     public void reportError(ERROR_CODE errorCode, String message,
1983         TableInfo table, HbckInfo info1, HbckInfo info2) {
1984       calledCount++;
1985     }
1986 
1987     @Override
1988     public int summarize() {
1989       return ++calledCount;
1990     }
1991 
1992     @Override
1993     public void detail(String details) {
1994       calledCount++;
1995     }
1996 
1997     @Override
1998     public ArrayList<ERROR_CODE> getErrorList() {
1999       calledCount++;
2000       return new ArrayList<ERROR_CODE>();
2001     }
2002 
2003     @Override
2004     public void progress() {
2005       calledCount++;
2006     }
2007 
2008     @Override
2009     public void print(String message) {
2010       calledCount++;
2011     }
2012 
2013     @Override
2014     public void resetErrors() {
2015       calledCount++;
2016     }
2017 
2018     @Override
2019     public boolean tableHasErrors(TableInfo table) {
2020       calledCount++;
2021       return false;
2022     }
2023   }
2024 
2025   @Test(timeout=60000)
2026   public void testCheckTableLocks() throws Exception {
2027     IncrementingEnvironmentEdge edge = new IncrementingEnvironmentEdge(0);
2028     EnvironmentEdgeManager.injectEdge(edge);
2029     // check no errors
2030     HBaseFsck hbck = doFsck(conf, false);
2031     assertNoErrors(hbck);
2032 
2033     ServerName mockName = ServerName.valueOf("localhost", 60000, 1);
2034 
2035     // obtain one lock
2036     final TableLockManager tableLockManager = TableLockManager.createTableLockManager(conf, TEST_UTIL.getZooKeeperWatcher(), mockName);
2037     TableLock writeLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2038         "testCheckTableLocks");
2039     writeLock.acquire();
2040     hbck = doFsck(conf, false);
2041     assertNoErrors(hbck); // should not have expired, no problems
2042 
2043     edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2044         TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2045 
2046     hbck = doFsck(conf, false);
2047     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK});
2048 
2049     final CountDownLatch latch = new CountDownLatch(1);
2050     new Thread() {
2051       @Override
2052       public void run() {
2053         TableLock readLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2054             "testCheckTableLocks");
2055         try {
2056           latch.countDown();
2057           readLock.acquire();
2058         } catch (IOException ex) {
2059           fail();
2060         } catch (IllegalStateException ex) {
2061           return; // expected, since this will be reaped under us.
2062         }
2063         fail("should not have come here");
2064       };
2065     }.start();
2066 
2067     latch.await(); // wait until thread starts
2068     Threads.sleep(300); // wait some more to ensure writeLock.acquire() is called
2069 
2070     hbck = doFsck(conf, false);
2071     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK}); // still one expired, one not-expired
2072 
2073     edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2074         TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2075 
2076     hbck = doFsck(conf, false);
2077     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK, ERROR_CODE.EXPIRED_TABLE_LOCK}); // both are expired
2078 
2079     conf.setLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT, 1); // reaping from ZKInterProcessWriteLock uses znode cTime,
2080                                                                  // which is not injectable through EnvironmentEdge
2081     Threads.sleep(10);
2082     hbck = doFsck(conf, true); // now fix both cases
2083 
2084     hbck = doFsck(conf, false);
2085     assertNoErrors(hbck);
2086 
2087     // ensure that locks are deleted
2088     writeLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2089         "should acquire without blocking");
2090     writeLock.acquire(); // this should not block.
2091     writeLock.release(); // release for clean state
2092   }
2093 
2094   @Test
2095   public void testMetaOffline() throws Exception {
2096     // check no errors
2097     HBaseFsck hbck = doFsck(conf, false);
2098     assertNoErrors(hbck);
2099     deleteMetaRegion(conf, true, false, false);
2100     hbck = doFsck(conf, false);
2101     // ERROR_CODE.UNKNOWN is coming because we reportError with a message for the hbase:meta
2102     // inconsistency and whether we will be fixing it or not.
2103     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
2104     hbck = doFsck(conf, true);
2105     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
2106     hbck = doFsck(conf, false);
2107     assertNoErrors(hbck);
2108   }
2109 
2110   private void deleteMetaRegion(Configuration conf, boolean unassign, boolean hdfs,
2111       boolean regionInfoOnly) throws IOException, InterruptedException {
2112     HConnection connection = HConnectionManager.getConnection(conf);
2113     HRegionLocation metaLocation = connection.locateRegion(TableName.META_TABLE_NAME,
2114         HConstants.EMPTY_START_ROW);
2115     ServerName hsa = ServerName.valueOf(metaLocation.getHostnamePort(), 0L);
2116     HRegionInfo hri = metaLocation.getRegionInfo();
2117     if (unassign) {
2118       LOG.info("Undeploying meta region " + hri + " from server " + hsa);
2119       undeployRegion(new HBaseAdmin(conf), hsa, hri);
2120     }
2121 
2122     if (regionInfoOnly) {
2123       LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
2124       Path rootDir = FSUtils.getRootDir(conf);
2125       FileSystem fs = rootDir.getFileSystem(conf);
2126       Path p = new Path(rootDir + "/" + HTableDescriptor.META_TABLEDESC.getNameAsString(),
2127           hri.getEncodedName());
2128       Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
2129       fs.delete(hriPath, true);
2130     }
2131 
2132     if (hdfs) {
2133       LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
2134       Path rootDir = FSUtils.getRootDir(conf);
2135       FileSystem fs = rootDir.getFileSystem(conf);
2136       Path p = new Path(rootDir + "/" + HTableDescriptor.META_TABLEDESC.getNameAsString(),
2137           hri.getEncodedName());
2138       HBaseFsck.debugLsr(conf, p);
2139       boolean success = fs.delete(p, true);
2140       LOG.info("Deleted " + p + " sucessfully? " + success);
2141       HBaseFsck.debugLsr(conf, p);
2142     }
2143   }
2144 
2145   @Test
2146   public void testTableWithNoRegions() throws Exception {
2147     // We might end up with empty regions in a table
2148     // see also testNoHdfsTable()
2149     TableName table =
2150         TableName.valueOf(name.getMethodName());
2151     try {
2152       // create table with one region
2153       HTableDescriptor desc = new HTableDescriptor(table);
2154       HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
2155       desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
2156       TEST_UTIL.getHBaseAdmin().createTable(desc);
2157       tbl = new HTable(TEST_UTIL.getConfiguration(), table, executorService);
2158 
2159       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
2160       deleteRegion(conf, tbl.getTableDescriptor(), HConstants.EMPTY_START_ROW, HConstants.EMPTY_END_ROW, false,
2161           false, true);
2162 
2163       HBaseFsck hbck = doFsck(conf, false);
2164       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_HDFS });
2165 
2166       doFsck(conf, true);
2167 
2168       // fix hole
2169       doFsck(conf, true);
2170 
2171       // check that hole fixed
2172       assertNoErrors(doFsck(conf, false));
2173     } finally {
2174       deleteTable(table);
2175     }
2176 
2177   }
2178 
  /** JUnit rule used by tests in this class to obtain the current test method's name. */
  @org.junit.Rule
  public TestName name = new TestName();
2181 }