View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.util;
20  
21  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertErrors;
22  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertNoErrors;
23  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.doFsck;
24  import static org.junit.Assert.assertEquals;
25  import static org.junit.Assert.assertFalse;
26  import static org.junit.Assert.assertNotEquals;
27  import static org.junit.Assert.assertNotNull;
28  import static org.junit.Assert.assertTrue;
29  import static org.junit.Assert.fail;
30  
31  import java.io.IOException;
32  import java.util.ArrayList;
33  import java.util.Collection;
34  import java.util.HashMap;
35  import java.util.LinkedList;
36  import java.util.List;
37  import java.util.Map;
38  import java.util.Map.Entry;
39  import java.util.concurrent.Callable;
40  import java.util.concurrent.CountDownLatch;
41  import java.util.concurrent.ExecutorService;
42  import java.util.concurrent.Executors;
43  import java.util.concurrent.Future;
44  import java.util.concurrent.ScheduledThreadPoolExecutor;
45  import java.util.concurrent.SynchronousQueue;
46  import java.util.concurrent.ThreadPoolExecutor;
47  import java.util.concurrent.TimeUnit;
48  import java.util.concurrent.atomic.AtomicBoolean;
49  
50  import org.apache.commons.io.IOUtils;
51  import org.apache.commons.logging.Log;
52  import org.apache.commons.logging.LogFactory;
53  import org.apache.hadoop.conf.Configuration;
54  import org.apache.hadoop.fs.FileStatus;
55  import org.apache.hadoop.fs.FileSystem;
56  import org.apache.hadoop.fs.Path;
57  import org.apache.hadoop.hbase.ClusterStatus;
58  import org.apache.hadoop.hbase.HBaseTestingUtility;
59  import org.apache.hadoop.hbase.HColumnDescriptor;
60  import org.apache.hadoop.hbase.HConstants;
61  import org.apache.hadoop.hbase.HRegionInfo;
62  import org.apache.hadoop.hbase.HRegionLocation;
63  import org.apache.hadoop.hbase.HTableDescriptor;
64  import org.apache.hadoop.hbase.TableExistsException;
65  import org.apache.hadoop.hbase.testclassification.LargeTests;
66  import org.apache.hadoop.hbase.MiniHBaseCluster;
67  import org.apache.hadoop.hbase.ServerName;
68  import org.apache.hadoop.hbase.TableName;
69  import org.apache.hadoop.hbase.catalog.MetaEditor;
70  import org.apache.hadoop.hbase.client.Delete;
71  import org.apache.hadoop.hbase.client.Durability;
72  import org.apache.hadoop.hbase.client.Get;
73  import org.apache.hadoop.hbase.client.HBaseAdmin;
74  import org.apache.hadoop.hbase.client.HConnection;
75  import org.apache.hadoop.hbase.client.HConnectionManager;
76  import org.apache.hadoop.hbase.client.HTable;
77  import org.apache.hadoop.hbase.client.MetaScanner;
78  import org.apache.hadoop.hbase.client.Put;
79  import org.apache.hadoop.hbase.client.Result;
80  import org.apache.hadoop.hbase.client.ResultScanner;
81  import org.apache.hadoop.hbase.client.Scan;
82  import org.apache.hadoop.hbase.io.hfile.TestHFile;
83  import org.apache.hadoop.hbase.master.AssignmentManager;
84  import org.apache.hadoop.hbase.master.HMaster;
85  import org.apache.hadoop.hbase.master.RegionState;
86  import org.apache.hadoop.hbase.master.RegionStates;
87  import org.apache.hadoop.hbase.master.TableLockManager;
88  import org.apache.hadoop.hbase.master.TableLockManager.TableLock;
89  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
90  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
91  import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
92  import org.apache.hadoop.hbase.regionserver.HRegion;
93  import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
94  import org.apache.hadoop.hbase.regionserver.HRegionServer;
95  import org.apache.hadoop.hbase.regionserver.SplitTransaction;
96  import org.apache.hadoop.hbase.regionserver.TestEndToEndSplitTransaction;
97  import org.apache.hadoop.hbase.testclassification.LargeTests;
98  import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter;
99  import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
100 import org.apache.hadoop.hbase.util.HBaseFsck.HbckInfo;
101 import org.apache.hadoop.hbase.util.HBaseFsck.PrintingErrorReporter;
102 import org.apache.hadoop.hbase.util.HBaseFsck.TableInfo;
103 import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
104 import org.apache.hadoop.hbase.util.hbck.HbckTestingUtil;
105 import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
106 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
107 import org.apache.zookeeper.KeeperException;
108 import org.junit.AfterClass;
109 import org.junit.Assert;
110 import org.junit.Before;
111 import org.junit.BeforeClass;
112 import org.junit.Ignore;
113 import org.junit.Test;
114 import org.junit.experimental.categories.Category;
115 import org.junit.rules.TestName;
116 
117 import com.google.common.collect.Multimap;
118 
119 /**
120  * This tests HBaseFsck's ability to detect reasons for inconsistent tables.
121  */
122 @Category(LargeTests.class)
123 public class TestHBaseFsck {
124   final static Log LOG = LogFactory.getLog(TestHBaseFsck.class);
125   private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
126   private final static Configuration conf = TEST_UTIL.getConfiguration();
127   private final static String FAM_STR = "fam";
128   private final static byte[] FAM = Bytes.toBytes(FAM_STR);
129   private final static int REGION_ONLINE_TIMEOUT = 800;
130   private static RegionStates regionStates;
131   private static ExecutorService executorService;
132 
133   // for the instance, reset every test run
134   private HTable tbl;
135   private final static byte[][] SPLITS = new byte[][] { Bytes.toBytes("A"),
136     Bytes.toBytes("B"), Bytes.toBytes("C") };
137   // one row per region.
138   private final static byte[][] ROWKEYS= new byte[][] {
139     Bytes.toBytes("00"), Bytes.toBytes("50"), Bytes.toBytes("A0"), Bytes.toBytes("A5"),
140     Bytes.toBytes("B0"), Bytes.toBytes("B5"), Bytes.toBytes("C0"), Bytes.toBytes("C5") };
141 
142   @SuppressWarnings("deprecation")
143   @BeforeClass
144   public static void setUpBeforeClass() throws Exception {
145     TEST_UTIL.getConfiguration().setInt("hbase.regionserver.handler.count", 2);
146     TEST_UTIL.getConfiguration().setInt("hbase.regionserver.metahandler.count", 2);
147     TEST_UTIL.getConfiguration().setInt("hbase.hbck.close.timeout", 2 * REGION_ONLINE_TIMEOUT);
148     TEST_UTIL.startMiniCluster(3);
149     TEST_UTIL.setHDFSClientRetry(0);
150 
151     executorService = new ThreadPoolExecutor(1, Integer.MAX_VALUE, 60, TimeUnit.SECONDS,
152         new SynchronousQueue<Runnable>(), Threads.newDaemonThreadFactory("testhbck"));
153 
154     AssignmentManager assignmentManager =
155       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager();
156     regionStates = assignmentManager.getRegionStates();
157     TEST_UTIL.getHBaseAdmin().setBalancerRunning(false, true);
158   }
159 
160   @AfterClass
161   public static void tearDownAfterClass() throws Exception {
162     TEST_UTIL.shutdownMiniCluster();
163   }
164 
165   @Test
166   public void testHBaseFsck() throws Exception {
167     assertNoErrors(doFsck(conf, false));
168     String table = "tableBadMetaAssign";
169     TEST_UTIL.createTable(Bytes.toBytes(table), FAM);
170 
171     // We created 1 table, should be fine
172     assertNoErrors(doFsck(conf, false));
173 
174     // Now let's mess it up and change the assignment in hbase:meta to
175     // point to a different region server
176     HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
177     Scan scan = new Scan();
178     scan.setStartRow(Bytes.toBytes(table+",,"));
179     ResultScanner scanner = meta.getScanner(scan);
180     HRegionInfo hri = null;
181 
182     Result res = scanner.next();
183     ServerName currServer =
184       ServerName.parseFrom(res.getValue(HConstants.CATALOG_FAMILY,
185           HConstants.SERVER_QUALIFIER));
186     long startCode = Bytes.toLong(res.getValue(HConstants.CATALOG_FAMILY,
187         HConstants.STARTCODE_QUALIFIER));
188 
189     for (JVMClusterUtil.RegionServerThread rs :
190         TEST_UTIL.getHBaseCluster().getRegionServerThreads()) {
191 
192       ServerName sn = rs.getRegionServer().getServerName();
193 
194       // When we find a diff RS, change the assignment and break
195       if (!currServer.getHostAndPort().equals(sn.getHostAndPort()) ||
196           startCode != sn.getStartcode()) {
197         Put put = new Put(res.getRow());
198         put.setDurability(Durability.SKIP_WAL);
199         put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
200           Bytes.toBytes(sn.getHostAndPort()));
201         put.add(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER,
202           Bytes.toBytes(sn.getStartcode()));
203         meta.put(put);
204         hri = HRegionInfo.getHRegionInfo(res);
205         break;
206       }
207     }
208 
209     // Try to fix the data
210     assertErrors(doFsck(conf, true), new ERROR_CODE[]{
211         ERROR_CODE.SERVER_DOES_NOT_MATCH_META});
212 
213     TEST_UTIL.getHBaseCluster().getMaster()
214       .getAssignmentManager().waitForAssignment(hri);
215 
216     // Should be fixed now
217     assertNoErrors(doFsck(conf, false));
218 
219     // comment needed - what is the purpose of this line
220     HTable t = new HTable(conf, Bytes.toBytes(table), executorService);
221     ResultScanner s = t.getScanner(new Scan());
222     s.close();
223     t.close();
224 
225     scanner.close();
226     meta.close();
227   }
228 
229   @Test(timeout=180000)
230   public void testFixAssignmentsWhenMETAinTransition() throws Exception {
231     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
232     HBaseAdmin admin = null;
233     try {
234       admin = new HBaseAdmin(TEST_UTIL.getConfiguration());
235       admin.closeRegion(cluster.getServerHoldingMeta(),
236           HRegionInfo.FIRST_META_REGIONINFO);
237     } finally {
238       if (admin != null) {
239         admin.close();
240       }
241     }
242     regionStates.regionOffline(HRegionInfo.FIRST_META_REGIONINFO);
243     MetaRegionTracker.deleteMetaLocation(cluster.getMaster().getZooKeeper());
244     assertFalse(regionStates.isRegionOnline(HRegionInfo.FIRST_META_REGIONINFO));
245     HBaseFsck hbck = doFsck(conf, true);
246     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.UNKNOWN, ERROR_CODE.NO_META_REGION,
247         ERROR_CODE.NULL_META_REGION });
248     assertNoErrors(doFsck(conf, false));
249   }
250 
251   /**
252    * Create a new region in META.
253    */
254   private HRegionInfo createRegion(Configuration conf, final HTableDescriptor
255       htd, byte[] startKey, byte[] endKey)
256       throws IOException {
257     HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
258     HRegionInfo hri = new HRegionInfo(htd.getTableName(), startKey, endKey);
259     MetaEditor.addRegionToMeta(meta, hri);
260     meta.close();
261     return hri;
262   }
263 
264   /**
265    * Debugging method to dump the contents of meta.
266    */
267   private void dumpMeta(TableName tableName) throws IOException {
268     List<byte[]> metaRows = TEST_UTIL.getMetaTableRows(tableName);
269     for (byte[] row : metaRows) {
270       LOG.info(Bytes.toString(row));
271     }
272   }
273 
274   /**
275    * This method is used to undeploy a region -- close it and attempt to
276    * remove its state from the Master.
277    */
278   private void undeployRegion(HBaseAdmin admin, ServerName sn,
279       HRegionInfo hri) throws IOException, InterruptedException {
280     try {
281       HBaseFsckRepair.closeRegionSilentlyAndWait(admin, sn, hri);
282       if (!hri.isMetaTable()) {
283         admin.offline(hri.getRegionName());
284       }
285     } catch (IOException ioe) {
286       LOG.warn("Got exception when attempting to offline region "
287           + Bytes.toString(hri.getRegionName()), ioe);
288     }
289   }
290   /**
291    * Delete a region from assignments, meta, or completely from hdfs.
292    * @param unassign if true unassign region if assigned
293    * @param metaRow  if true remove region's row from META
294    * @param hdfs if true remove region's dir in HDFS
295    */
296   private void deleteRegion(Configuration conf, final HTableDescriptor htd,
297       byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
298       boolean hdfs) throws IOException, InterruptedException {
299     deleteRegion(conf, htd, startKey, endKey, unassign, metaRow, hdfs, false);
300   }
301 
302   /**
303    * Delete a region from assignments, meta, or completely from hdfs.
304    * @param unassign if true unassign region if assigned
305    * @param metaRow  if true remove region's row from META
306    * @param hdfs if true remove region's dir in HDFS
307    * @param regionInfoOnly if true remove a region dir's .regioninfo file
308    */
309   private void deleteRegion(Configuration conf, final HTableDescriptor htd,
310       byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
311       boolean hdfs, boolean regionInfoOnly) throws IOException, InterruptedException {
312     LOG.info("** Before delete:");
313     dumpMeta(htd.getTableName());
314 
315     Map<HRegionInfo, ServerName> hris = tbl.getRegionLocations();
316     for (Entry<HRegionInfo, ServerName> e: hris.entrySet()) {
317       HRegionInfo hri = e.getKey();
318       ServerName hsa = e.getValue();
319       if (Bytes.compareTo(hri.getStartKey(), startKey) == 0
320           && Bytes.compareTo(hri.getEndKey(), endKey) == 0) {
321 
322         LOG.info("RegionName: " +hri.getRegionNameAsString());
323         byte[] deleteRow = hri.getRegionName();
324 
325         if (unassign) {
326           LOG.info("Undeploying region " + hri + " from server " + hsa);
327           undeployRegion(new HBaseAdmin(conf), hsa, hri);
328         }
329 
330         if (regionInfoOnly) {
331           LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
332           Path rootDir = FSUtils.getRootDir(conf);
333           FileSystem fs = rootDir.getFileSystem(conf);
334           Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
335               hri.getEncodedName());
336           Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
337           fs.delete(hriPath, true);
338         }
339 
340         if (hdfs) {
341           LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
342           Path rootDir = FSUtils.getRootDir(conf);
343           FileSystem fs = rootDir.getFileSystem(conf);
344           Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
345               hri.getEncodedName());
346           HBaseFsck.debugLsr(conf, p);
347           boolean success = fs.delete(p, true);
348           LOG.info("Deleted " + p + " sucessfully? " + success);
349           HBaseFsck.debugLsr(conf, p);
350         }
351 
352         if (metaRow) {
353           HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
354           Delete delete = new Delete(deleteRow);
355           meta.delete(delete);
356         }
357       }
358       LOG.info(hri.toString() + hsa.toString());
359     }
360 
361     TEST_UTIL.getMetaTableRows(htd.getTableName());
362     LOG.info("*** After delete:");
363     dumpMeta(htd.getTableName());
364   }
365 
366   /**
367    * Setup a clean table before we start mucking with it.
368    *
369    * @throws IOException
370    * @throws InterruptedException
371    * @throws KeeperException
372    */
373   HTable setupTable(TableName tablename) throws Exception {
374     HTableDescriptor desc = new HTableDescriptor(tablename);
375     HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
376     desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
377     TEST_UTIL.getHBaseAdmin().createTable(desc, SPLITS);
378     tbl = new HTable(TEST_UTIL.getConfiguration(), tablename, executorService);
379 
380     List<Put> puts = new ArrayList<Put>();
381     for (byte[] row : ROWKEYS) {
382       Put p = new Put(row);
383       p.add(FAM, Bytes.toBytes("val"), row);
384       puts.add(p);
385     }
386     tbl.put(puts);
387     tbl.flushCommits();
388     return tbl;
389   }
390 
391   /**
392    * Counts the number of row to verify data loss or non-dataloss.
393    */
394   int countRows() throws IOException {
395      Scan s = new Scan();
396      ResultScanner rs = tbl.getScanner(s);
397      int i = 0;
398      while(rs.next() !=null) {
399        i++;
400      }
401      return i;
402   }
403 
404   /**
405    * delete table in preparation for next test
406    *
407    * @param tablename
408    * @throws IOException
409    */
410   void deleteTable(TableName tablename) throws IOException {
411     HBaseAdmin admin = new HBaseAdmin(conf);
412     admin.getConnection().clearRegionCache();
413     if (admin.isTableEnabled(tablename)) {
414       admin.disableTableAsync(tablename);
415     }
416     long totalWait = 0;
417     long maxWait = 30*1000;
418     long sleepTime = 250;
419     while (!admin.isTableDisabled(tablename)) {
420       try {
421         Thread.sleep(sleepTime);
422         totalWait += sleepTime;
423         if (totalWait >= maxWait) {
424           fail("Waited too long for table to be disabled + " + tablename);
425         }
426       } catch (InterruptedException e) {
427         e.printStackTrace();
428         fail("Interrupted when trying to disable table " + tablename);
429       }
430     }
431     admin.deleteTable(tablename);
432   }
433 
434   /**
435    * This creates a clean table and confirms that the table is clean.
436    */
437   @Test
438   public void testHBaseFsckClean() throws Exception {
439     assertNoErrors(doFsck(conf, false));
440     TableName table = TableName.valueOf("tableClean");
441     try {
442       HBaseFsck hbck = doFsck(conf, false);
443       assertNoErrors(hbck);
444 
445       setupTable(table);
446       assertEquals(ROWKEYS.length, countRows());
447 
448       // We created 1 table, should be fine
449       hbck = doFsck(conf, false);
450       assertNoErrors(hbck);
451       assertEquals(0, hbck.getOverlapGroups(table).size());
452       assertEquals(ROWKEYS.length, countRows());
453     } finally {
454       deleteTable(table);
455     }
456   }
457 
458   /**
459    * Test thread pooling in the case where there are more regions than threads
460    */
461   @Test
462   public void testHbckThreadpooling() throws Exception {
463     TableName table =
464         TableName.valueOf("tableDupeStartKey");
465     try {
466       // Create table with 4 regions
467       setupTable(table);
468 
469       // limit number of threads to 1.
470       Configuration newconf = new Configuration(conf);
471       newconf.setInt("hbasefsck.numthreads", 1);
472       assertNoErrors(doFsck(newconf, false));
473 
474       // We should pass without triggering a RejectedExecutionException
475     } finally {
476       deleteTable(table);
477     }
478   }
479 
480   @Test
481   public void testHbckFixOrphanTable() throws Exception {
482     TableName table = TableName.valueOf("tableInfo");
483     FileSystem fs = null;
484     Path tableinfo = null;
485     try {
486       setupTable(table);
487       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
488 
489       Path hbaseTableDir = FSUtils.getTableDir(
490           FSUtils.getRootDir(conf), table);
491       fs = hbaseTableDir.getFileSystem(conf);
492       FileStatus status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
493       tableinfo = status.getPath();
494       fs.rename(tableinfo, new Path("/.tableinfo"));
495 
496       //to report error if .tableinfo is missing.
497       HBaseFsck hbck = doFsck(conf, false);
498       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_TABLEINFO_FILE });
499 
500       // fix OrphanTable with default .tableinfo (htd not yet cached on master)
501       hbck = doFsck(conf, true);
502       assertNoErrors(hbck);
503       status = null;
504       status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
505       assertNotNull(status);
506 
507       HTableDescriptor htd = admin.getTableDescriptor(table);
508       htd.setValue("NOT_DEFAULT", "true");
509       admin.disableTable(table);
510       admin.modifyTable(table, htd);
511       admin.enableTable(table);
512       fs.delete(status.getPath(), true);
513 
514       // fix OrphanTable with cache
515       htd = admin.getTableDescriptor(table); // warms up cached htd on master
516       hbck = doFsck(conf, true);
517       assertNoErrors(hbck);
518       status = null;
519       status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
520       assertNotNull(status);
521       htd = admin.getTableDescriptor(table);
522       assertEquals(htd.getValue("NOT_DEFAULT"), "true");
523     } finally {
524       fs.rename(new Path("/.tableinfo"), tableinfo);
525       deleteTable(table);
526     }
527   }
528 
529   /**
530    * This test makes sure that parallel instances of Hbck is disabled.
531    *
532    * @throws Exception
533    */
534   @Test
535   public void testParallelHbck() throws Exception {
536     final ExecutorService service;
537     final Future<HBaseFsck> hbck1,hbck2;
538 
539     class RunHbck implements Callable<HBaseFsck>{
540       boolean fail = true;
541       @Override
542       public HBaseFsck call(){
543         try{
544           return doFsck(conf, false);
545         } catch(Exception e){
546           if (e.getMessage().contains("Duplicate hbck")) {
547             fail = false;
548           } else {
549             LOG.fatal("hbck failed.", e);
550           }
551         }
552         // If we reach here, then an exception was caught
553         if (fail) fail();
554         return null;
555       }
556     }
557     service = Executors.newFixedThreadPool(2);
558     hbck1 = service.submit(new RunHbck());
559     hbck2 = service.submit(new RunHbck());
560     service.shutdown();
561     //wait for 15 seconds, for both hbck calls finish
562     service.awaitTermination(15, TimeUnit.SECONDS);
563     HBaseFsck h1 = hbck1.get();
564     HBaseFsck h2 = hbck2.get();
565     // Make sure only one of the calls was successful
566     assert(h1 == null || h2 == null);
567     if (h1 != null) {
568       assert(h1.getRetCode() >= 0);
569     }
570     if (h2 != null) {
571       assert(h2.getRetCode() >= 0);
572     }
573   }
574 
575   /**
576    * This create and fixes a bad table with regions that have a duplicate
577    * start key
578    */
579   @Test
580   public void testDupeStartKey() throws Exception {
581     TableName table =
582         TableName.valueOf("tableDupeStartKey");
583     try {
584       setupTable(table);
585       assertNoErrors(doFsck(conf, false));
586       assertEquals(ROWKEYS.length, countRows());
587 
588       // Now let's mess it up, by adding a region with a duplicate startkey
589       HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
590           Bytes.toBytes("A"), Bytes.toBytes("A2"));
591       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
592       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
593           .waitForAssignment(hriDupe);
594       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
595       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
596 
597       HBaseFsck hbck = doFsck(conf, false);
598       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
599             ERROR_CODE.DUPE_STARTKEYS});
600       assertEquals(2, hbck.getOverlapGroups(table).size());
601       assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.
602 
603       // fix the degenerate region.
604       doFsck(conf,true);
605 
606       // check that the degenerate region is gone and no data loss
607       HBaseFsck hbck2 = doFsck(conf,false);
608       assertNoErrors(hbck2);
609       assertEquals(0, hbck2.getOverlapGroups(table).size());
610       assertEquals(ROWKEYS.length, countRows());
611     } finally {
612       deleteTable(table);
613     }
614   }
615 
616   /**
617    * Get region info from local cluster.
618    */
619   Map<ServerName, List<String>> getDeployedHRIs(
620       final HBaseAdmin admin) throws IOException {
621     ClusterStatus status = admin.getClusterStatus();
622     Collection<ServerName> regionServers = status.getServers();
623     Map<ServerName, List<String>> mm =
624         new HashMap<ServerName, List<String>>();
625     HConnection connection = admin.getConnection();
626     for (ServerName hsi : regionServers) {
627       AdminProtos.AdminService.BlockingInterface server = connection.getAdmin(hsi);
628 
629       // list all online regions from this region server
630       List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(server);
631       List<String> regionNames = new ArrayList<String>();
632       for (HRegionInfo hri : regions) {
633         regionNames.add(hri.getRegionNameAsString());
634       }
635       mm.put(hsi, regionNames);
636     }
637     return mm;
638   }
639 
640   /**
641    * Returns the HSI a region info is on.
642    */
643   ServerName findDeployedHSI(Map<ServerName, List<String>> mm, HRegionInfo hri) {
644     for (Map.Entry<ServerName,List <String>> e : mm.entrySet()) {
645       if (e.getValue().contains(hri.getRegionNameAsString())) {
646         return e.getKey();
647       }
648     }
649     return null;
650   }
651 
652   /**
653    * This create and fixes a bad table with regions that have a duplicate
654    * start key
655    */
656   @Test
657   public void testDupeRegion() throws Exception {
658     TableName table =
659         TableName.valueOf("tableDupeRegion");
660     try {
661       setupTable(table);
662       assertNoErrors(doFsck(conf, false));
663       assertEquals(ROWKEYS.length, countRows());
664 
665       // Now let's mess it up, by adding a region with a duplicate startkey
666       HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
667           Bytes.toBytes("A"), Bytes.toBytes("B"));
668 
669       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
670       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
671           .waitForAssignment(hriDupe);
672       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
673       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
674 
675       // Yikes! The assignment manager can't tell between diff between two
676       // different regions with the same start/endkeys since it doesn't
677       // differentiate on ts/regionId!  We actually need to recheck
678       // deployments!
679       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
680       while (findDeployedHSI(getDeployedHRIs(admin), hriDupe) == null) {
681         Thread.sleep(250);
682       }
683 
684       LOG.debug("Finished assignment of dupe region");
685 
686       // TODO why is dupe region different from dupe start keys?
687       HBaseFsck hbck = doFsck(conf, false);
688       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
689             ERROR_CODE.DUPE_STARTKEYS});
690       assertEquals(2, hbck.getOverlapGroups(table).size());
691       assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.
692 
693       // fix the degenerate region.
694       doFsck(conf,true);
695 
696       // check that the degenerate region is gone and no data loss
697       HBaseFsck hbck2 = doFsck(conf,false);
698       assertNoErrors(hbck2);
699       assertEquals(0, hbck2.getOverlapGroups(table).size());
700       assertEquals(ROWKEYS.length, countRows());
701     } finally {
702       deleteTable(table);
703     }
704   }
705 
706   /**
707    * This creates and fixes a bad table with regions that has startkey == endkey
708    */
709   @Test
710   public void testDegenerateRegions() throws Exception {
711     TableName table =
712         TableName.valueOf("tableDegenerateRegions");
713     try {
714       setupTable(table);
715       assertNoErrors(doFsck(conf,false));
716       assertEquals(ROWKEYS.length, countRows());
717 
718       // Now let's mess it up, by adding a region with a duplicate startkey
719       HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
720           Bytes.toBytes("B"), Bytes.toBytes("B"));
721       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
722       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
723           .waitForAssignment(hriDupe);
724       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
725       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
726 
727       HBaseFsck hbck = doFsck(conf,false);
728       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DEGENERATE_REGION,
729           ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.DUPE_STARTKEYS});
730       assertEquals(2, hbck.getOverlapGroups(table).size());
731       assertEquals(ROWKEYS.length, countRows());
732 
733       // fix the degenerate region.
734       doFsck(conf,true);
735 
736       // check that the degenerate region is gone and no data loss
737       HBaseFsck hbck2 = doFsck(conf,false);
738       assertNoErrors(hbck2);
739       assertEquals(0, hbck2.getOverlapGroups(table).size());
740       assertEquals(ROWKEYS.length, countRows());
741     } finally {
742       deleteTable(table);
743     }
744   }
745 
746   /**
747    * This creates and fixes a bad table where a region is completely contained
748    * by another region.
       * <p>
       * An extra region [A2, B) is created and assigned on top of the table built
       * by setupTable. The first hbck run must report OVERLAP_IN_REGION_CHAIN; after
       * a fixing run no errors and no overlap groups may remain, and the row count
       * must be unchanged.
749    */
750   @Test
751   public void testContainedRegionOverlap() throws Exception {
752     TableName table =
753         TableName.valueOf("tableContainedRegionOverlap");
754     try {
755       setupTable(table);
756       assertEquals(ROWKEYS.length, countRows());
757 
758       // Mess it up by creating an overlap in the metadata
759       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
760           Bytes.toBytes("A2"), Bytes.toBytes("B"));
          // assign the extra region and wait until it is online before running hbck
761       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
762       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
763           .waitForAssignment(hriOverlap);
764       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
765       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
766 
          // check-only run: the overlap must be detected and no rows lost so far
767       HBaseFsck hbck = doFsck(conf, false);
768       assertErrors(hbck, new ERROR_CODE[] {
769           ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
770       assertEquals(2, hbck.getOverlapGroups(table).size());
771       assertEquals(ROWKEYS.length, countRows());
772 
773       // fix the problem.
774       doFsck(conf, true);
775 
776       // verify that overlaps are fixed
777       HBaseFsck hbck2 = doFsck(conf,false);
778       assertNoErrors(hbck2);
779       assertEquals(0, hbck2.getOverlapGroups(table).size());
780       assertEquals(ROWKEYS.length, countRows());
781     } finally {
782        deleteTable(table);
783     }
784   }
785 
786   /**
787    * This creates and fixes a bad table with an overlap group of
788    * 3 regions. Set HBaseFsck.maxMerge to 2 to trigger sideline overlapped
789    * region. Mess around the meta data so that closeRegion/offlineRegion
790    * throws exceptions.
       * <p>
       * Because one region gets sidelined instead of merged, the table is expected
       * to hold fewer rows after the repair than before.
791    */
792   @Test
793   public void testSidelineOverlapRegion() throws Exception {
794     TableName table =
795         TableName.valueOf("testSidelineOverlapRegion");
        // declared outside the try so the finally block can always release it
        HTable meta = null;
796     try {
797       setupTable(table);
798       assertEquals(ROWKEYS.length, countRows());
799 
800       // Mess it up by creating an overlap
801       MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
802       HMaster master = cluster.getMaster();
803       HRegionInfo hriOverlap1 = createRegion(conf, tbl.getTableDescriptor(),
804         Bytes.toBytes("A"), Bytes.toBytes("AB"));
805       master.assignRegion(hriOverlap1);
806       master.getAssignmentManager().waitForAssignment(hriOverlap1);
807       HRegionInfo hriOverlap2 = createRegion(conf, tbl.getTableDescriptor(),
808         Bytes.toBytes("AB"), Bytes.toBytes("B"));
809       master.assignRegion(hriOverlap2);
810       master.getAssignmentManager().waitForAssignment(hriOverlap2);
811 
812       HBaseFsck hbck = doFsck(conf, false);
813       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.DUPE_STARTKEYS,
814         ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.OVERLAP_IN_REGION_CHAIN});
815       assertEquals(3, hbck.getOverlapGroups(table).size());
816       assertEquals(ROWKEYS.length, countRows());
817 
818       // mess around the overlapped regions, to trigger NotServingRegionException
819       Multimap<byte[], HbckInfo> overlapGroups = hbck.getOverlapGroups(table);
820       ServerName serverName = null;
821       byte[] regionName = null;
822       for (HbckInfo hbi: overlapGroups.values()) {
            // pick the region that spans exactly [A, B)
823         if ("A".equals(Bytes.toString(hbi.getStartKey()))
824             && "B".equals(Bytes.toString(hbi.getEndKey()))) {
825           regionName = hbi.getRegionName();
826 
827           // get an RS not serving the region to force bad assignment info in to META.
828           int k = cluster.getServerWith(regionName);
              // NOTE(review): the bound 3 presumably matches the number of region
              // servers in the mini cluster -- confirm against the cluster setup
829           for (int i = 0; i < 3; i++) {
830             if (i != k) {
831               HRegionServer rs = cluster.getRegionServer(i);
832               serverName = rs.getServerName();
833               break;
834             }
835           }
836 
              // close the region on its real server and mark it offline
837           HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
838           HBaseFsckRepair.closeRegionSilentlyAndWait(admin,
839             cluster.getRegionServer(k).getServerName(), hbi.getHdfsHRI());
840           admin.offline(regionName);
841           break;
842         }
843       }
844 
845       assertNotNull(regionName);
846       assertNotNull(serverName);
          // point the region's server column in meta at a server that does not host it
847       meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
848       Put put = new Put(regionName);
849       put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
850         Bytes.toBytes(serverName.getHostAndPort()));
851       meta.put(put);
852 
853       // fix the problem.
854       HBaseFsck fsck = new HBaseFsck(conf);
855       fsck.connect();
856       fsck.setDisplayFullReport(); // i.e. -details
857       fsck.setTimeLag(0);
858       fsck.setFixAssignments(true);
859       fsck.setFixMeta(true);
860       fsck.setFixHdfsHoles(true);
861       fsck.setFixHdfsOverlaps(true);
862       fsck.setFixHdfsOrphans(true);
863       fsck.setFixVersionFile(true);
864       fsck.setSidelineBigOverlaps(true);
865       fsck.setMaxMerge(2);
866       fsck.onlineHbck();
867 
868       // verify that overlaps are fixed, and there are fewer rows
869       // since one region is sidelined.
870       HBaseFsck hbck2 = doFsck(conf,false);
871       assertNoErrors(hbck2);
872       assertEquals(0, hbck2.getOverlapGroups(table).size());
873       assertTrue(ROWKEYS.length > countRows());
874     } finally {
          // release the meta handle first so it is closed even if deleteTable throws
          IOUtils.closeQuietly(meta);
875        deleteTable(table);
876     }
877   }
878 
879   /**
880    * This creates and fixes a bad table where a region is completely contained
881    * by another region, and there is a hole (sort of like a bad split).
       * <p>
       * The first hbck run is expected to report ORPHAN_HDFS_REGION,
       * NOT_IN_META_OR_DEPLOYED and HOLE_IN_REGION_CHAIN; after a fixing run no
       * errors may remain and all rows must still be readable.
882    */
883   @Test
884   public void testOverlapAndOrphan() throws Exception {
885     TableName table =
886         TableName.valueOf("tableOverlapAndOrphan");
887     try {
888       setupTable(table);
889       assertEquals(ROWKEYS.length, countRows());
890 
891       // Mess it up: remove region [A, B) while the table is disabled ...
          // NOTE(review): the boolean flags presumably mean (unassign, remove meta
          // row, remove hdfs data, remove only .regioninfo) -- confirm in deleteRegion
892       TEST_UTIL.getHBaseAdmin().disableTable(table);
893       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
894           Bytes.toBytes("B"), true, true, false, true);
895       TEST_UTIL.getHBaseAdmin().enableTable(table);
896 
          // ... then create and assign a region [A2, B) that covers only part of the gap
897       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
898           Bytes.toBytes("A2"), Bytes.toBytes("B"));
899       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
900       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
901           .waitForAssignment(hriOverlap);
902       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
903       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
904 
905       HBaseFsck hbck = doFsck(conf, false);
906       assertErrors(hbck, new ERROR_CODE[] {
907           ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
908           ERROR_CODE.HOLE_IN_REGION_CHAIN});
909 
910       // fix the problem.
911       doFsck(conf, true);
912 
913       // verify that overlaps are fixed
914       HBaseFsck hbck2 = doFsck(conf,false);
915       assertNoErrors(hbck2);
916       assertEquals(0, hbck2.getOverlapGroups(table).size());
917       assertEquals(ROWKEYS.length, countRows());
918     } finally {
919        deleteTable(table);
920     }
921   }
922 
923   /**
924    * This creates and fixes a bad table where a region overlaps two regions --
925    * its start key is contained in another region and its end key is contained in
926    * yet another region.
       * <p>
       * The extra region is [A2, B2); the first hbck run must report
       * OVERLAP_IN_REGION_CHAIN twice, and a fixing run must clear both without
       * changing the row count.
927    */
928   @Test
929   public void testCoveredStartKey() throws Exception {
930     TableName table =
931         TableName.valueOf("tableCoveredStartKey");
932     try {
933       setupTable(table);
934       assertEquals(ROWKEYS.length, countRows());
935 
936       // Mess it up by creating an overlap in the metadata
937       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
938           Bytes.toBytes("A2"), Bytes.toBytes("B2"));
          // assign the extra region and wait until it is online before running hbck
939       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
940       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
941           .waitForAssignment(hriOverlap);
942       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
943       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
944 
945       HBaseFsck hbck = doFsck(conf, false);
946       assertErrors(hbck, new ERROR_CODE[] {
947           ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
948           ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
949       assertEquals(3, hbck.getOverlapGroups(table).size());
950       assertEquals(ROWKEYS.length, countRows());
951 
952       // fix the problem.
953       doFsck(conf, true);
954 
955       // verify that overlaps are fixed
956       HBaseFsck hbck2 = doFsck(conf, false);
          // an empty expected-error array: no errors may be reported any more
957       assertErrors(hbck2, new ERROR_CODE[0]);
958       assertEquals(0, hbck2.getOverlapGroups(table).size());
959       assertEquals(ROWKEYS.length, countRows());
960     } finally {
961       deleteTable(table);
962     }
963   }
964 
965   /**
966    * This creates and fixes a bad table with a missing region -- hole in meta
967    * and data missing in the fs.
       * <p>
       * Region [B, C) is removed while the table is disabled. hbck must report
       * HOLE_IN_REGION_CHAIN; after the fix the table is error free but is expected
       * to hold two rows fewer than before.
968    */
969   @Test
970   public void testRegionHole() throws Exception {
971     TableName table =
972         TableName.valueOf("tableRegionHole");
973     try {
974       setupTable(table);
975       assertEquals(ROWKEYS.length, countRows());
976 
977       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
978       TEST_UTIL.getHBaseAdmin().disableTable(table);
979       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
980           Bytes.toBytes("C"), true, true, true);
981       TEST_UTIL.getHBaseAdmin().enableTable(table);
982 
983       HBaseFsck hbck = doFsck(conf, false);
984       assertErrors(hbck, new ERROR_CODE[] {
985           ERROR_CODE.HOLE_IN_REGION_CHAIN});
986       // holes are separate from overlap groups
987       assertEquals(0, hbck.getOverlapGroups(table).size());
988 
989       // fix hole
990       doFsck(conf, true);
991 
992       // check that hole fixed
993       assertNoErrors(doFsck(conf,false));
994       assertEquals(ROWKEYS.length - 2 , countRows()); // lost a region, so the 2 rows it held are gone
995     } finally {
996       deleteTable(table);
997     }
998   }
999 
1000   /**
1001    * This creates and fixes a bad table with a missing region -- hole in meta
1002    * and data present but .regioninfo missing (an orphan hdfs region) in the fs.
        * <p>
        * hbck must report ORPHAN_HDFS_REGION, NOT_IN_META_OR_DEPLOYED and
        * HOLE_IN_REGION_CHAIN; after the fix no errors remain and no rows are lost.
1003    */
1004   @Test
1005   public void testHDFSRegioninfoMissing() throws Exception {
1006     TableName table =
1007         TableName.valueOf("tableHDFSRegioininfoMissing");
1008     try {
1009       setupTable(table);
1010       assertEquals(ROWKEYS.length, countRows());
1011 
1012       // Mess it up by leaving a hole in the meta data
           // NOTE(review): the fourth flag presumably restricts the hdfs removal to
           // the .regioninfo file -- confirm in deleteRegion
1013       TEST_UTIL.getHBaseAdmin().disableTable(table);
1014       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1015           Bytes.toBytes("C"), true, true, false, true);
1016       TEST_UTIL.getHBaseAdmin().enableTable(table);
1017 
1018       HBaseFsck hbck = doFsck(conf, false);
1019       assertErrors(hbck, new ERROR_CODE[] {
1020           ERROR_CODE.ORPHAN_HDFS_REGION,
1021           ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1022           ERROR_CODE.HOLE_IN_REGION_CHAIN});
1023       // holes are separate from overlap groups
1024       assertEquals(0, hbck.getOverlapGroups(table).size());
1025 
1026       // fix hole
1027       doFsck(conf, true);
1028 
1029       // check that hole fixed
1030       assertNoErrors(doFsck(conf, false));
1031       assertEquals(ROWKEYS.length, countRows());
1032     } finally {
1033       deleteTable(table);
1034     }
1035   }
1036 
1037   /**
1038    * This creates and fixes a bad table with a region that is missing from meta
1039    * and not assigned to a region server.
        * <p>
        * The region's data is left in the fs, so after the fix all rows must be
        * readable again.
1040    */
1041   @Test
1042   public void testNotInMetaOrDeployedHole() throws Exception {
1043     TableName table =
1044         TableName.valueOf("tableNotInMetaOrDeployedHole");
1045     try {
1046       setupTable(table);
1047       assertEquals(ROWKEYS.length, countRows());
1048 
1049       // Mess it up by leaving a hole in the meta data
1050       TEST_UTIL.getHBaseAdmin().disableTable(table);
1051       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1052           Bytes.toBytes("C"), true, true, false); // don't rm from fs
1053       TEST_UTIL.getHBaseAdmin().enableTable(table);
1054 
1055       HBaseFsck hbck = doFsck(conf, false);
1056       assertErrors(hbck, new ERROR_CODE[] {
1057           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1058       // holes are separate from overlap groups
1059       assertEquals(0, hbck.getOverlapGroups(table).size());
1060 
1061       // fix hole; the fixing run itself still reports the errors it found
1062       assertErrors(doFsck(conf, true) , new ERROR_CODE[] {
1063           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1064 
1065       // check that hole fixed
1066       assertNoErrors(doFsck(conf,false));
1067       assertEquals(ROWKEYS.length, countRows());
1068     } finally {
1069       deleteTable(table);
1070     }
1071   }
1072 
1073   /**
1074    * This creates and fixes a bad table with a hole in meta.
        * <p>
        * The region's data is left in the fs, so after the fix all rows must be
        * readable again.
1075    */
1076   @Test
1077   public void testNotInMetaHole() throws Exception {
1078     TableName table =
1079         TableName.valueOf("tableNotInMetaHole");
1080     try {
1081       setupTable(table);
1082       assertEquals(ROWKEYS.length, countRows());
1083 
1084       // Mess it up by leaving a hole in the meta data
           // NOTE(review): unlike testNotInMetaOrDeployedHole the first flag is false
           // here, presumably skipping the explicit unassign -- confirm in deleteRegion
1085       TEST_UTIL.getHBaseAdmin().disableTable(table);
1086       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1087           Bytes.toBytes("C"), false, true, false); // don't rm from fs
1088       TEST_UTIL.getHBaseAdmin().enableTable(table);
1089 
1090       HBaseFsck hbck = doFsck(conf, false);
1091       assertErrors(hbck, new ERROR_CODE[] {
1092           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1093       // holes are separate from overlap groups
1094       assertEquals(0, hbck.getOverlapGroups(table).size());
1095 
1096       // fix hole; the fixing run itself still reports the errors it found
1097       assertErrors(doFsck(conf, true) , new ERROR_CODE[] {
1098           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1099 
1100       // check that hole fixed
1101       assertNoErrors(doFsck(conf,false));
1102       assertEquals(ROWKEYS.length, countRows());
1103     } finally {
1104       deleteTable(table);
1105     }
1106   }
1107 
1108   /**
1109    * This creates and fixes a bad table with a region that is still in meta but
1110    * whose data has been removed from hdfs.
        * <p>
        * hbck must report NOT_IN_HDFS; after the fix the table is error free but is
        * expected to hold two rows fewer than before.
1111    */
1112   @Test
1113   public void testNotInHdfs() throws Exception {
1114     TableName table =
1115         TableName.valueOf("tableNotInHdfs");
1116     try {
1117       setupTable(table);
1118       assertEquals(ROWKEYS.length, countRows());
1119 
1120       // make sure data in regions, if in hlog only there is no data loss
1121       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1122 
1123       // Mess it up by leaving a hole in the hdfs data
1124       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1125           Bytes.toBytes("C"), false, false, true); // don't rm meta
1126 
1127       HBaseFsck hbck = doFsck(conf, false);
1128       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1129       // holes are separate from overlap groups
1130       assertEquals(0, hbck.getOverlapGroups(table).size());
1131 
1132       // fix hole
1133       doFsck(conf, true);
1134 
1135       // check that hole fixed
1136       assertNoErrors(doFsck(conf,false));
           // the rows of the removed region (2 expected) are gone
1137       assertEquals(ROWKEYS.length - 2, countRows());
1138     } finally {
1139       deleteTable(table);
1140     }
1141   }
1142 
1143   /**
1144    * This creates entries in hbase:meta with no hdfs data.  This should cleanly
1145    * remove the table.
        * <p>
        * There is no cleanup in a finally block here: the test itself asserts that
        * the table no longer exists once hbck has repaired the cluster.
1146    */
1147   @Test
1148   public void testNoHdfsTable() throws Exception {
1149     TableName table = TableName.valueOf("NoHdfsTable");
1150     setupTable(table);
1151     assertEquals(ROWKEYS.length, countRows());
1152 
1153     // make sure data in regions, if in hlog only there is no data loss
1154     TEST_UTIL.getHBaseAdmin().flush(table.getName());
1155 
1156     // Mess it up by deleting hdfs dirs
         // all four regions of the table are removed from hdfs: [,A) [A,B) [B,C) [C,)
1157     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""),
1158         Bytes.toBytes("A"), false, false, true); // don't rm meta
1159     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1160         Bytes.toBytes("B"), false, false, true); // don't rm meta
1161     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1162         Bytes.toBytes("C"), false, false, true); // don't rm meta
1163     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"),
1164         Bytes.toBytes(""), false, false, true); // don't rm meta
1165 
1166     // also remove the table directory in hdfs
1167     deleteTableDir(table);
1168 
         // one NOT_IN_HDFS error is expected per removed region
1169     HBaseFsck hbck = doFsck(conf, false);
1170     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS,
1171         ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS,
1172         ERROR_CODE.NOT_IN_HDFS,});
1173     // holes are separate from overlap groups
1174     assertEquals(0, hbck.getOverlapGroups(table).size());
1175 
1176     // fix hole
1177     doFsck(conf, true); // detect dangling regions and remove those
1178 
1179     // check that hole fixed
1180     assertNoErrors(doFsck(conf,false));
1181     assertFalse("Table "+ table + " should have been deleted",
1182         TEST_UTIL.getHBaseAdmin().tableExists(table));
1183   }
1184 
1185   public void deleteTableDir(TableName table) throws IOException {
1186     Path rootDir = FSUtils.getRootDir(conf);
1187     FileSystem fs = rootDir.getFileSystem(conf);
1188     Path p = FSUtils.getTableDir(rootDir, table);
1189     HBaseFsck.debugLsr(conf, p);
1190     boolean success = fs.delete(p, true);
1191     LOG.info("Deleted " + p + " sucessfully? " + success);
1192   }
1193 
1194   /**
1195    * When the hbase.version file is missing, hbck must report NO_VERSION_FILE
        * and a fixing run must clear the error.
1196    */
1197   @Test
1198   public void testNoVersionFile() throws Exception {
1199     // delete the hbase.version file
1200     Path rootDir = FSUtils.getRootDir(conf);
1201     FileSystem fs = rootDir.getFileSystem(conf);
1202     Path versionFile = new Path(rootDir, HConstants.VERSION_FILE_NAME);
1203     fs.delete(versionFile, true);
1204 
1205     // test
1206     HBaseFsck hbck = doFsck(conf, false);
1207     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_VERSION_FILE });
1208     // fix hbase.version missing
1209     doFsck(conf, true);
1210 
1211     // no version file fixed
1212     assertNoErrors(doFsck(conf, false));
1213   }
1214 
1215   /**
1216    * A region must not be deployed when its table is disabled.
        * <p>
        * The test opens a region of a disabled table directly on a region server,
        * expects hbck to report SHOULD_NOT_BE_DEPLOYED, and expects a fixing run to
        * clear the error.
1217    */
1218   @Test
1219   public void testRegionShouldNotBeDeployed() throws Exception {
1220     TableName table =
1221         TableName.valueOf("tableRegionShouldNotBeDeployed");
1222     try {
1223       LOG.info("Starting testRegionShouldNotBeDeployed.");
1224       MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
1225       assertTrue(cluster.waitForActiveAndReadyMaster());
1226 
1227 
1228       byte[][] SPLIT_KEYS = new byte[][] { new byte[0], Bytes.toBytes("aaa"),
1229           Bytes.toBytes("bbb"), Bytes.toBytes("ccc"), Bytes.toBytes("ddd") };
1230       HTableDescriptor htdDisabled = new HTableDescriptor(table);
1231       htdDisabled.addFamily(new HColumnDescriptor(FAM));
1232 
1233       // Write the .tableinfo
1234       FSTableDescriptors fstd = new FSTableDescriptors(conf);
1235       fstd.createTableDescriptor(htdDisabled);
1236       List<HRegionInfo> disabledRegions = TEST_UTIL.createMultiRegionsInMeta(
1237           TEST_UTIL.getConfiguration(), htdDisabled, SPLIT_KEYS);
1238 
1239       // Let's just assign everything to first RS
1240       HRegionServer hrs = cluster.getRegionServer(0);
1241 
1242       // Create region files.
1243       TEST_UTIL.getHBaseAdmin().disableTable(table);
1244       TEST_UTIL.getHBaseAdmin().enableTable(table);
1245 
1246       // Disable the table and close its regions
1247       TEST_UTIL.getHBaseAdmin().disableTable(table);
1248       HRegionInfo region = disabledRegions.remove(0);
1249       byte[] regionName = region.getRegionName();
1250 
1251       // The region should not be assigned currently
           // (assertEquals so that a failure reports which server index hosts it)
1252       assertEquals(-1, cluster.getServerWith(regionName));
1253 
1254       // Directly open a region on a region server.
1255       // If going through AM/ZK, the region won't be open.
1256       // Even it is opened, AM will close it which causes
1257       // flakiness of this test.
1258       HRegion r = HRegion.openHRegion(
1259         region, htdDisabled, hrs.getWAL(region), conf);
1260       hrs.addToOnlineRegions(r);
1261 
1262       HBaseFsck hbck = doFsck(conf, false);
1263       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.SHOULD_NOT_BE_DEPLOYED });
1264 
1265       // fix this fault
1266       doFsck(conf, true);
1267 
1268       // check result
1269       assertNoErrors(doFsck(conf, false));
1270     } finally {
           // the table is left disabled above; enable it again before cleanup
1271       TEST_UTIL.getHBaseAdmin().enableTable(table);
1272       deleteTable(table);
1273     }
1274   }
1275 
1276   /**
1277    * This creates two tables, messes up both of them, and fixes them one by one.
        * <p>
        * Each table loses the hdfs data of region [B, C). A fixing run restricted to
        * the first table must leave the second table's NOT_IN_HDFS error in place
        * until the second table is fixed too.
1278    */
1279   @Test
1280   public void testFixByTable() throws Exception {
1281     TableName table1 =
1282         TableName.valueOf("testFixByTable1");
1283     TableName table2 =
1284         TableName.valueOf("testFixByTable2");
1285     try {
1286       setupTable(table1);
1287       // make sure data in regions, if in hlog only there is no data loss
1288       TEST_UTIL.getHBaseAdmin().flush(table1.getName());
1289       // Mess them up by leaving a hole in the hdfs data
1290       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1291         Bytes.toBytes("C"), false, false, true); // don't rm meta
1292 
1293       setupTable(table2);
1294       // make sure data in regions, if in hlog only there is no data loss
1295       TEST_UTIL.getHBaseAdmin().flush(table2.getName());
1296       // Mess them up by leaving a hole in the hdfs data
1297       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1298         Bytes.toBytes("C"), false, false, true); // don't rm meta
1299 
           // an unrestricted check sees one NOT_IN_HDFS error per table
1300       HBaseFsck hbck = doFsck(conf, false);
1301       assertErrors(hbck, new ERROR_CODE[] {
1302         ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS});
1303 
1304       // fix hole in table 1
1305       doFsck(conf, true, table1);
1306       // check that hole in table 1 fixed
1307       assertNoErrors(doFsck(conf, false, table1));
1308       // check that hole in table 2 still there
1309       assertErrors(doFsck(conf, false, table2),
1310         new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1311 
1312       // fix hole in table 2
1313       doFsck(conf, true, table2);
1314       // check that hole in both tables fixed
1315       assertNoErrors(doFsck(conf, false));
           // NOTE(review): countRows() presumably reads the table set up last
           // (table2) -- confirm against setupTable/countRows
1316       assertEquals(ROWKEYS.length - 2, countRows());
1317     } finally {
1318       deleteTable(table1);
1319       deleteTable(table2);
1320     }
1321   }
1322   /**
1323    * A split parent in meta, in hdfs, and not deployed.
        * <p>
        * Region [B, C) is removed from meta and re-added as an offline split parent
        * with daughters [B, BM) and [BM, C). A regular repair must not clear the
        * resulting LINGERING_SPLIT_PARENT error; only a run with fixSplitParents
        * enabled does, after which a regular repair fixes the remaining issues.
1324    */
1325   @Test
1326   public void testLingeringSplitParent() throws Exception {
1327     TableName table =
1328         TableName.valueOf("testLingeringSplitParent");
1329     HTable meta = null;
1330     try {
1331       setupTable(table);
1332       assertEquals(ROWKEYS.length, countRows());
1333 
1334       // make sure data in regions, if in hlog only there is no data loss
1335       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1336       HRegionLocation location = tbl.getRegionLocation("B");
1337 
1338       // Delete one region from meta, but not hdfs, unassign it.
1339       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1340         Bytes.toBytes("C"), true, true, false);
1341 
1342       // Create a new meta entry to fake it as a split parent.
1343       meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
1344       HRegionInfo hri = location.getRegionInfo();
1345 
1346       HRegionInfo a = new HRegionInfo(tbl.getName(),
1347         Bytes.toBytes("B"), Bytes.toBytes("BM"));
1348       HRegionInfo b = new HRegionInfo(tbl.getName(),
1349         Bytes.toBytes("BM"), Bytes.toBytes("C"));
1350 
1351       hri.setOffline(true);
1352       hri.setSplit(true);
1353 
1354       MetaEditor.addRegionToMeta(meta, hri, a, b);
1355       meta.flushCommits();
1356       TEST_UTIL.getHBaseAdmin().flush(TableName.META_TABLE_NAME.getName());
1357 
1358       HBaseFsck hbck = doFsck(conf, false);
1359       assertErrors(hbck, new ERROR_CODE[] {
1360         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1361 
1362       // regular repair cannot fix lingering split parent
1363       hbck = doFsck(conf, true);
1364       assertErrors(hbck, new ERROR_CODE[] {
1365         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1366       assertFalse(hbck.shouldRerun());
           // a second check confirms that the regular repair changed nothing
1367       hbck = doFsck(conf, false);
1368       assertErrors(hbck, new ERROR_CODE[] {
1369         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1370 
1371       // fix lingering split parent
1372       hbck = new HBaseFsck(conf);
1373       hbck.connect();
1374       hbck.setDisplayFullReport(); // i.e. -details
1375       hbck.setTimeLag(0);
1376       hbck.setFixSplitParents(true);
1377       hbck.onlineHbck();
1378       assertTrue(hbck.shouldRerun());
1379 
           // the daughter (splitA/splitB) columns must be gone from the parent's meta row
1380       Get get = new Get(hri.getRegionName());
1381       Result result = meta.get(get);
1382       assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
1383         HConstants.SPLITA_QUALIFIER).isEmpty());
1384       assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
1385         HConstants.SPLITB_QUALIFIER).isEmpty());
1386       TEST_UTIL.getHBaseAdmin().flush(TableName.META_TABLE_NAME.getName());
1387 
1388       // fix other issues
1389       doFsck(conf, true);
1390 
1391       // check that all are fixed
1392       assertNoErrors(doFsck(conf, false));
1393       assertEquals(ROWKEYS.length, countRows());
1394     } finally {
1395       deleteTable(table);
1396       IOUtils.closeQuietly(meta);
1397     }
1398   }
1399 
1400   /**
1401    * Tests that LINGERING_SPLIT_PARENT is not erroneously reported for
1402    * valid cases where the daughters are there.
        * <p>
        * A region is split through the admin API; the following fixing run must
        * report no errors, the parent's hbase:meta entry must still be present, and
        * the table must have exactly one more region than before the split.
1403    */
1404   @Test
1405   public void testValidLingeringSplitParent() throws Exception {
         // use a table name of its own instead of sharing testLingeringSplitParent's
1406     TableName table =
1407         TableName.valueOf("testValidLingeringSplitParent");
1408     HTable meta = null;
1409     try {
1410       setupTable(table);
1411       assertEquals(ROWKEYS.length, countRows());
1412 
1413       // make sure data in regions, if in hlog only there is no data loss
1414       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1415       HRegionLocation location = tbl.getRegionLocation("B");
1416 
1417       meta = new HTable(conf, TableName.META_TABLE_NAME);
1418       HRegionInfo hri = location.getRegionInfo();
1419 
1420       // do a regular split
1421       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
1422       byte[] regionName = location.getRegionInfo().getRegionName();
1423       admin.split(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
1424       TestEndToEndSplitTransaction.blockUntilRegionSplit(
1425           TEST_UTIL.getConfiguration(), 60000, regionName, true);
1426 
1427       // TODO: fixHdfsHoles does not work against splits, since the parent dir lingers on
1428       // for some time until children references are deleted. HBCK erroneously sees this as
1429       // overlapping regions
1430       HBaseFsck hbck = doFsck(
1431         conf, true, true, false, false, false, true, true, true, false, false, false, null);
1432       assertErrors(hbck, new ERROR_CODE[] {}); //no LINGERING_SPLIT_PARENT reported
1433 
1434       // assert that the split hbase:meta entry is still there.
1435       Get get = new Get(hri.getRegionName());
1436       Result result = meta.get(get);
1437       assertNotNull(result);
1438       assertNotNull(HRegionInfo.getHRegionInfo(result));
1439 
1440       assertEquals(ROWKEYS.length, countRows());
1441 
1442       // assert that we still have the split regions
           // (expected value first, so a failure message reads correctly)
1443       assertEquals(SPLITS.length + 1 + 1, tbl.getStartKeys().length); //SPLITS + 1 is # regions pre-split.
1444       assertNoErrors(doFsck(conf, false));
1445     } finally {
1446       deleteTable(table);
1447       IOUtils.closeQuietly(meta);
1448     }
1449   }
1450 
1451   /**
1452    * Split crashed after write to hbase:meta finished for the parent region, but
1453    * failed to write daughters (pre HBASE-7721 codebase)
        * <p>
        * The state is simulated by splitting a region normally and then undeploying
        * both daughters and deleting their rows from hbase:meta. The repair must add
        * the daughters back rather than revert the split.
1454    */
1455   @Test(timeout=75000)
1456   public void testSplitDaughtersNotInMeta() throws Exception {
1457     TableName table =
1458         TableName.valueOf("testSplitdaughtersNotInMeta");
1459     HTable meta = null;
1460     try {
1461       setupTable(table);
1462       assertEquals(ROWKEYS.length, countRows());
1463 
1464       // make sure data in regions, if in hlog only there is no data loss
1465       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1466       HRegionLocation location = tbl.getRegionLocation("B");
1467 
1468       meta = new HTable(conf, TableName.META_TABLE_NAME);
1469       HRegionInfo hri = location.getRegionInfo();
1470 
1471       // do a regular split
1472       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
1473       byte[] regionName = location.getRegionInfo().getRegionName();
1474       admin.split(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
1475       TestEndToEndSplitTransaction.blockUntilRegionSplit(
1476           TEST_UTIL.getConfiguration(), 60000, regionName, true);
1477 
           // read the two daughters recorded in the parent's meta row
1478       PairOfSameType<HRegionInfo> daughters = HRegionInfo.getDaughterRegions(meta.get(new Get(regionName)));
1479 
1480       // Delete daughter regions from meta, but not hdfs, unassign it.
1481       Map<HRegionInfo, ServerName> hris = tbl.getRegionLocations();
1482       undeployRegion(admin, hris.get(daughters.getFirst()), daughters.getFirst());
1483       undeployRegion(admin, hris.get(daughters.getSecond()), daughters.getSecond());
1484 
1485       meta.delete(new Delete(daughters.getFirst().getRegionName()));
1486       meta.delete(new Delete(daughters.getSecond().getRegionName()));
1487       meta.flushCommits();
1488 
1489       HBaseFsck hbck = doFsck(conf, false);
1490       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1491           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN}); //no LINGERING_SPLIT_PARENT
1492 
1493       // now fix it. The fix should not revert the region split, but add daughters to META
1494       hbck = doFsck(
1495         conf, true, true, false, false, false, false, false, false, false, false, false, null);
1496       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1497           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1498 
1499       // assert that the split hbase:meta entry is still there.
1500       Get get = new Get(hri.getRegionName());
1501       Result result = meta.get(get);
1502       assertNotNull(result);
1503       assertNotNull(HRegionInfo.getHRegionInfo(result));
1504 
1505       assertEquals(ROWKEYS.length, countRows());
1506 
1507       // assert that we still have the split regions
           // (expected value first, so a failure message reads correctly)
1508       assertEquals(SPLITS.length + 1 + 1, tbl.getStartKeys().length); //SPLITS + 1 is # regions pre-split.
1509       assertNoErrors(doFsck(conf, false)); //should be fixed by now
1510     } finally {
1511       deleteTable(table);
1512       IOUtils.closeQuietly(meta);
1513     }
1514   }
1515 
1516   /**
1517    * This creates and fixes a bad table with a missing region which is the 1st region -- hole in
1518    * meta and data missing in the fs.
        * <p>
        * hbck must report FIRST_REGION_STARTKEY_NOT_EMPTY, and a fixing run must
        * clear it.
1519    */
1520   @Test(timeout=120000)
1521   public void testMissingFirstRegion() throws Exception {
1522     TableName table =
1523         TableName.valueOf("testMissingFirstRegion");
1524     try {
1525       setupTable(table);
1526       assertEquals(ROWKEYS.length, countRows());
1527 
1528       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
           // the removed region is the one starting at the empty key: ["", A)
1529       TEST_UTIL.getHBaseAdmin().disableTable(table);
1530       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""), Bytes.toBytes("A"), true,
1531           true, true);
1532       TEST_UTIL.getHBaseAdmin().enableTable(table);
1533 
1534       HBaseFsck hbck = doFsck(conf, false);
1535       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY });
1536       // fix hole
1537       doFsck(conf, true);
1538       // check that hole fixed
1539       assertNoErrors(doFsck(conf, false));
1540     } finally {
1541       deleteTable(table);
1542     }
1543   }
1544 
1545   /**
1546    * This creates and fixes a bad table whose first region has had its directory
1547    * deleted from the fs while the table stays enabled and meta is left untouched.
        * <p>
        * hbck must report NOT_IN_HDFS, and a fixing run must clear it.
1548    */
1549   @Test(timeout=120000)
1550   public void testRegionDeployedNotInHdfs() throws Exception {
1551     TableName table =
1552         TableName.valueOf("testSingleRegionDeployedNotInHdfs");
1553     try {
1554       setupTable(table);
1555       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1556 
1557       // Mess it up by deleting region dir
           // (same flag pattern as the other "don't rm meta" deletions in this class)
1558       deleteRegion(conf, tbl.getTableDescriptor(),
1559         HConstants.EMPTY_START_ROW, Bytes.toBytes("A"), false,
1560         false, true);
1561 
1562       HBaseFsck hbck = doFsck(conf, false);
1563       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_HDFS });
1564       // fix hole
1565       doFsck(conf, true);
1566       // check that hole fixed
1567       assertNoErrors(doFsck(conf, false));
1568     } finally {
1569       deleteTable(table);
1570     }
1571   }
1572 
1573   /**
1574    * This creates and fixes a bad table with missing last region -- hole in meta and data missing in
1575    * the fs.
1576    */
1577   @Test(timeout=120000)
1578   public void testMissingLastRegion() throws Exception {
1579     TableName table =
1580         TableName.valueOf("testMissingLastRegion");
1581     try {
1582       setupTable(table);
1583       assertEquals(ROWKEYS.length, countRows());
1584 
1585       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1586       TEST_UTIL.getHBaseAdmin().disableTable(table);
1587       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"), Bytes.toBytes(""), true,
1588           true, true);
1589       TEST_UTIL.getHBaseAdmin().enableTable(table);
1590 
1591       HBaseFsck hbck = doFsck(conf, false);
1592       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY });
1593       // fix hole
1594       doFsck(conf, true);
1595       // check that hole fixed
1596       assertNoErrors(doFsck(conf, false));
1597     } finally {
1598       deleteTable(table);
1599     }
1600   }
1601 
1602   /**
1603    * Test -noHdfsChecking option can detect and fix assignments issue.
1604    */
1605   @Test
1606   public void testFixAssignmentsAndNoHdfsChecking() throws Exception {
1607     TableName table =
1608         TableName.valueOf("testFixAssignmentsAndNoHdfsChecking");
1609     try {
1610       setupTable(table);
1611       assertEquals(ROWKEYS.length, countRows());
1612 
1613       // Mess it up by closing a region
1614       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1615         Bytes.toBytes("B"), true, false, false, false);
1616 
1617       // verify there is no other errors
1618       HBaseFsck hbck = doFsck(conf, false);
1619       assertErrors(hbck, new ERROR_CODE[] {
1620         ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1621 
1622       // verify that noHdfsChecking report the same errors
1623       HBaseFsck fsck = new HBaseFsck(conf);
1624       fsck.connect();
1625       fsck.setDisplayFullReport(); // i.e. -details
1626       fsck.setTimeLag(0);
1627       fsck.setCheckHdfs(false);
1628       fsck.onlineHbck();
1629       assertErrors(fsck, new ERROR_CODE[] {
1630         ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1631 
1632       // verify that fixAssignments works fine with noHdfsChecking
1633       fsck = new HBaseFsck(conf);
1634       fsck.connect();
1635       fsck.setDisplayFullReport(); // i.e. -details
1636       fsck.setTimeLag(0);
1637       fsck.setCheckHdfs(false);
1638       fsck.setFixAssignments(true);
1639       fsck.onlineHbck();
1640       assertTrue(fsck.shouldRerun());
1641       fsck.onlineHbck();
1642       assertNoErrors(fsck);
1643 
1644       assertEquals(ROWKEYS.length, countRows());
1645     } finally {
1646       deleteTable(table);
1647     }
1648   }
1649 
1650   /**
1651    * Test -noHdfsChecking option can detect region is not in meta but deployed.
1652    * However, it can not fix it without checking Hdfs because we need to get
1653    * the region info from Hdfs in this case, then to patch the meta.
1654    */
1655   @Test
1656   public void testFixMetaNotWorkingWithNoHdfsChecking() throws Exception {
1657     TableName table =
1658         TableName.valueOf("testFixMetaNotWorkingWithNoHdfsChecking");
1659     try {
1660       setupTable(table);
1661       assertEquals(ROWKEYS.length, countRows());
1662 
1663       // Mess it up by deleting a region from the metadata
1664       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1665         Bytes.toBytes("B"), false, true, false, false);
1666 
1667       // verify there is no other errors
1668       HBaseFsck hbck = doFsck(conf, false);
1669       assertErrors(hbck, new ERROR_CODE[] {
1670         ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1671 
1672       // verify that noHdfsChecking report the same errors
1673       HBaseFsck fsck = new HBaseFsck(conf);
1674       fsck.connect();
1675       fsck.setDisplayFullReport(); // i.e. -details
1676       fsck.setTimeLag(0);
1677       fsck.setCheckHdfs(false);
1678       fsck.onlineHbck();
1679       assertErrors(fsck, new ERROR_CODE[] {
1680         ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1681 
1682       // verify that fixMeta doesn't work with noHdfsChecking
1683       fsck = new HBaseFsck(conf);
1684       fsck.connect();
1685       fsck.setDisplayFullReport(); // i.e. -details
1686       fsck.setTimeLag(0);
1687       fsck.setCheckHdfs(false);
1688       fsck.setFixAssignments(true);
1689       fsck.setFixMeta(true);
1690       fsck.onlineHbck();
1691       assertFalse(fsck.shouldRerun());
1692       assertErrors(fsck, new ERROR_CODE[] {
1693         ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1694 
1695       // fix the cluster so other tests won't be impacted
1696       fsck = doFsck(conf, true);
1697       assertTrue(fsck.shouldRerun());
1698       fsck = doFsck(conf, true);
1699       assertNoErrors(fsck);
1700     } finally {
1701       deleteTable(table);
1702     }
1703   }
1704 
1705   /**
1706    * Test -fixHdfsHoles doesn't work with -noHdfsChecking option,
1707    * and -noHdfsChecking can't detect orphan Hdfs region.
1708    */
1709   @Test
1710   public void testFixHdfsHolesNotWorkingWithNoHdfsChecking() throws Exception {
1711     TableName table =
1712         TableName.valueOf("testFixHdfsHolesNotWorkingWithNoHdfsChecking");
1713     try {
1714       setupTable(table);
1715       assertEquals(ROWKEYS.length, countRows());
1716 
1717       // Mess it up by creating an overlap in the metadata
1718       TEST_UTIL.getHBaseAdmin().disableTable(table);
1719       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1720         Bytes.toBytes("B"), true, true, false, true);
1721       TEST_UTIL.getHBaseAdmin().enableTable(table);
1722 
1723       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
1724         Bytes.toBytes("A2"), Bytes.toBytes("B"));
1725       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
1726       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
1727         .waitForAssignment(hriOverlap);
1728       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
1729       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
1730 
1731       HBaseFsck hbck = doFsck(conf, false);
1732       assertErrors(hbck, new ERROR_CODE[] {
1733         ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1734         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1735 
1736       // verify that noHdfsChecking can't detect ORPHAN_HDFS_REGION
1737       HBaseFsck fsck = new HBaseFsck(conf);
1738       fsck.connect();
1739       fsck.setDisplayFullReport(); // i.e. -details
1740       fsck.setTimeLag(0);
1741       fsck.setCheckHdfs(false);
1742       fsck.onlineHbck();
1743       assertErrors(fsck, new ERROR_CODE[] {
1744         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1745 
1746       // verify that fixHdfsHoles doesn't work with noHdfsChecking
1747       fsck = new HBaseFsck(conf);
1748       fsck.connect();
1749       fsck.setDisplayFullReport(); // i.e. -details
1750       fsck.setTimeLag(0);
1751       fsck.setCheckHdfs(false);
1752       fsck.setFixHdfsHoles(true);
1753       fsck.setFixHdfsOverlaps(true);
1754       fsck.setFixHdfsOrphans(true);
1755       fsck.onlineHbck();
1756       assertFalse(fsck.shouldRerun());
1757       assertErrors(fsck, new ERROR_CODE[] {
1758         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1759     } finally {
1760       if (TEST_UTIL.getHBaseAdmin().isTableDisabled(table)) {
1761         TEST_UTIL.getHBaseAdmin().enableTable(table);
1762       }
1763       deleteTable(table);
1764     }
1765   }
1766 
1767   /**
1768    * We don't have an easy way to verify that a flush completed, so we loop until we find a
1769    * legitimate hfile and return it.
1770    * @param fs
1771    * @param table
1772    * @return Path of a flushed hfile.
1773    * @throws IOException
1774    */
1775   Path getFlushedHFile(FileSystem fs, TableName table) throws IOException {
1776     Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
1777     Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
1778     Path famDir = new Path(regionDir, FAM_STR);
1779 
1780     // keep doing this until we get a legit hfile
1781     while (true) {
1782       FileStatus[] hfFss = fs.listStatus(famDir);
1783       if (hfFss.length == 0) {
1784         continue;
1785       }
1786       for (FileStatus hfs : hfFss) {
1787         if (!hfs.isDir()) {
1788           return hfs.getPath();
1789         }
1790       }
1791     }
1792   }
1793 
1794   /**
1795    * This creates a table and then corrupts an hfile.  Hbck should quarantine the file.
1796    */
1797   @Test(timeout=180000)
1798   public void testQuarantineCorruptHFile() throws Exception {
1799     TableName table = TableName.valueOf(name.getMethodName());
1800     try {
1801       setupTable(table);
1802       assertEquals(ROWKEYS.length, countRows());
1803       TEST_UTIL.getHBaseAdmin().flush(table.getName()); // flush is async.
1804 
1805       FileSystem fs = FileSystem.get(conf);
1806       Path hfile = getFlushedHFile(fs, table);
1807 
1808       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1809       TEST_UTIL.getHBaseAdmin().disableTable(table);
1810 
1811       // create new corrupt file called deadbeef (valid hfile name)
1812       Path corrupt = new Path(hfile.getParent(), "deadbeef");
1813       TestHFile.truncateFile(fs, hfile, corrupt);
1814       LOG.info("Created corrupted file " + corrupt);
1815       HBaseFsck.debugLsr(conf, FSUtils.getRootDir(conf));
1816 
1817       // we cannot enable here because enable never finished due to the corrupt region.
1818       HBaseFsck res = HbckTestingUtil.doHFileQuarantine(conf, table);
1819       assertEquals(res.getRetCode(), 0);
1820       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
1821       assertEquals(hfcc.getHFilesChecked(), 5);
1822       assertEquals(hfcc.getCorrupted().size(), 1);
1823       assertEquals(hfcc.getFailures().size(), 0);
1824       assertEquals(hfcc.getQuarantined().size(), 1);
1825       assertEquals(hfcc.getMissing().size(), 0);
1826 
1827       // Its been fixed, verify that we can enable.
1828       TEST_UTIL.getHBaseAdmin().enableTable(table);
1829     } finally {
1830       deleteTable(table);
1831     }
1832   }
1833 
1834   /**
1835   * Test that use this should have a timeout, because this method could potentially wait forever.
1836   */
1837   private void doQuarantineTest(TableName table, HBaseFsck hbck, int check,
1838                                 int corrupt, int fail, int quar, int missing) throws Exception {
1839     try {
1840       setupTable(table);
1841       assertEquals(ROWKEYS.length, countRows());
1842       TEST_UTIL.getHBaseAdmin().flush(table.getName()); // flush is async.
1843 
1844       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1845       TEST_UTIL.getHBaseAdmin().disableTable(table);
1846 
1847       String[] args = {"-sidelineCorruptHFiles", "-repairHoles", "-ignorePreCheckPermission",
1848           table.getNameAsString()};
1849       ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1850       HBaseFsck res = hbck.exec(exec, args);
1851 
1852       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
1853       assertEquals(hfcc.getHFilesChecked(), check);
1854       assertEquals(hfcc.getCorrupted().size(), corrupt);
1855       assertEquals(hfcc.getFailures().size(), fail);
1856       assertEquals(hfcc.getQuarantined().size(), quar);
1857       assertEquals(hfcc.getMissing().size(), missing);
1858 
1859       // its been fixed, verify that we can enable
1860       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
1861       admin.enableTableAsync(table);
1862       while (!admin.isTableEnabled(table)) {
1863         try {
1864           Thread.sleep(250);
1865         } catch (InterruptedException e) {
1866           e.printStackTrace();
1867           fail("Interrupted when trying to enable table " + table);
1868         }
1869       }
1870     } finally {
1871       deleteTable(table);
1872     }
1873   }
1874 
1875   /**
1876    * This creates a table and simulates the race situation where a concurrent compaction or split
1877    * has removed an hfile after the corruption checker learned about it.
1878    */
1879   @Test(timeout=180000)
1880   public void testQuarantineMissingHFile() throws Exception {
1881     TableName table = TableName.valueOf(name.getMethodName());
1882     ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1883     // inject a fault in the hfcc created.
1884     final FileSystem fs = FileSystem.get(conf);
1885     HBaseFsck hbck = new HBaseFsck(conf, exec) {
1886       @Override
1887       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
1888         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1889           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
1890           @Override
1891           protected void checkHFile(Path p) throws IOException {
1892             if (attemptedFirstHFile.compareAndSet(false, true)) {
1893               assertTrue(fs.delete(p, true)); // make sure delete happened.
1894             }
1895             super.checkHFile(p);
1896           }
1897         };
1898       }
1899     };
1900     doQuarantineTest(table, hbck, 4, 0, 0, 0, 1); // 4 attempted, but 1 missing.
1901   }
1902 
1903   /**
1904    * This creates a table and simulates the race situation where a concurrent compaction or split
1905    * has removed an colfam dir before the corruption checker got to it.
1906    */
1907   // Disabled because fails sporadically.  Is this test right?  Timing-wise, there could be no
1908   // files in a column family on initial creation -- as suggested by Matteo.
1909   @Ignore @Test(timeout=180000)
1910   public void testQuarantineMissingFamdir() throws Exception {
1911     TableName table = TableName.valueOf(name.getMethodName());
1912     ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1913     // inject a fault in the hfcc created.
1914     final FileSystem fs = FileSystem.get(conf);
1915     HBaseFsck hbck = new HBaseFsck(conf, exec) {
1916       @Override
1917       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
1918         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1919           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
1920           @Override
1921           protected void checkColFamDir(Path p) throws IOException {
1922             if (attemptedFirstHFile.compareAndSet(false, true)) {
1923               assertTrue(fs.delete(p, true)); // make sure delete happened.
1924             }
1925             super.checkColFamDir(p);
1926           }
1927         };
1928       }
1929     };
1930     doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
1931   }
1932 
1933   /**
1934    * This creates a table and simulates the race situation where a concurrent compaction or split
1935    * has removed a region dir before the corruption checker got to it.
1936    */
1937   @Test(timeout=180000)
1938   public void testQuarantineMissingRegionDir() throws Exception {
1939     TableName table = TableName.valueOf(name.getMethodName());
1940     ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1941     // inject a fault in the hfcc created.
1942     final FileSystem fs = FileSystem.get(conf);
1943     HBaseFsck hbck = new HBaseFsck(conf, exec) {
1944       @Override
1945       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
1946         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1947           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
1948           @Override
1949           protected void checkRegionDir(Path p) throws IOException {
1950             if (attemptedFirstHFile.compareAndSet(false, true)) {
1951               assertTrue(fs.delete(p, true)); // make sure delete happened.
1952             }
1953             super.checkRegionDir(p);
1954           }
1955         };
1956       }
1957     };
1958     doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
1959   }
1960 
1961   /**
1962    * Test fixing lingering reference file.
1963    */
1964   @Test
1965   public void testLingeringReferenceFile() throws Exception {
1966     TableName table =
1967         TableName.valueOf("testLingeringReferenceFile");
1968     try {
1969       setupTable(table);
1970       assertEquals(ROWKEYS.length, countRows());
1971 
1972       // Mess it up by creating a fake reference file
1973       FileSystem fs = FileSystem.get(conf);
1974       Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
1975       Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
1976       Path famDir = new Path(regionDir, FAM_STR);
1977       Path fakeReferenceFile = new Path(famDir, "fbce357483ceea.12144538");
1978       fs.create(fakeReferenceFile);
1979 
1980       HBaseFsck hbck = doFsck(conf, false);
1981       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LINGERING_REFERENCE_HFILE });
1982       // fix reference file
1983       doFsck(conf, true);
1984       // check that reference file fixed
1985       assertNoErrors(doFsck(conf, false));
1986     } finally {
1987       deleteTable(table);
1988     }
1989   }
1990 
1991   /**
1992    * Test mission REGIONINFO_QUALIFIER in hbase:meta
1993    */
1994   @Test
1995   public void testMissingRegionInfoQualifier() throws Exception {
1996     TableName table =
1997         TableName.valueOf("testMissingRegionInfoQualifier");
1998     try {
1999       setupTable(table);
2000 
2001       // Mess it up by removing the RegionInfo for one region.
2002       final List<Delete> deletes = new LinkedList<Delete>();
2003       HTable meta = new HTable(conf, TableName.META_TABLE_NAME);
2004       MetaScanner.metaScan(conf, new MetaScanner.MetaScannerVisitor() {
2005 
2006         @Override
2007         public boolean processRow(Result rowResult) throws IOException {
2008           HRegionInfo hri = MetaScanner.getHRegionInfo(rowResult);
2009           if (hri != null && !hri.getTable().isSystemTable()) {
2010             Delete delete = new Delete(rowResult.getRow());
2011             delete.deleteColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
2012             deletes.add(delete);
2013           }
2014           return true;
2015         }
2016 
2017         @Override
2018         public void close() throws IOException {
2019         }
2020       });
2021       meta.delete(deletes);
2022 
2023       // Mess it up by creating a fake hbase:meta entry with no associated RegionInfo
2024       meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
2025         HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER, Bytes.toBytes("node1:60020")));
2026       meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
2027         HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER, Bytes.toBytes(1362150791183L)));
2028       meta.close();
2029 
2030       HBaseFsck hbck = doFsck(conf, false);
2031       assertTrue(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
2032 
2033       // fix reference file
2034       hbck = doFsck(conf, true);
2035 
2036       // check that reference file fixed
2037       assertFalse(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
2038     } finally {
2039       deleteTable(table);
2040     }
2041   }
2042 
2043 
2044   /**
2045    * Test pluggable error reporter. It can be plugged in
2046    * from system property or configuration.
2047    */
2048   @Test
2049   public void testErrorReporter() throws Exception {
2050     try {
2051       MockErrorReporter.calledCount = 0;
2052       doFsck(conf, false);
2053       assertEquals(MockErrorReporter.calledCount, 0);
2054 
2055       conf.set("hbasefsck.errorreporter", MockErrorReporter.class.getName());
2056       doFsck(conf, false);
2057       assertTrue(MockErrorReporter.calledCount > 20);
2058     } finally {
2059       conf.set("hbasefsck.errorreporter",
2060         PrintingErrorReporter.class.getName());
2061       MockErrorReporter.calledCount = 0;
2062     }
2063   }
2064 
2065   static class MockErrorReporter implements ErrorReporter {
2066     static int calledCount = 0;
2067 
2068     @Override
2069     public void clear() {
2070       calledCount++;
2071     }
2072 
2073     @Override
2074     public void report(String message) {
2075       calledCount++;
2076     }
2077 
2078     @Override
2079     public void reportError(String message) {
2080       calledCount++;
2081     }
2082 
2083     @Override
2084     public void reportError(ERROR_CODE errorCode, String message) {
2085       calledCount++;
2086     }
2087 
2088     @Override
2089     public void reportError(ERROR_CODE errorCode, String message, TableInfo table) {
2090       calledCount++;
2091     }
2092 
2093     @Override
2094     public void reportError(ERROR_CODE errorCode,
2095         String message, TableInfo table, HbckInfo info) {
2096       calledCount++;
2097     }
2098 
2099     @Override
2100     public void reportError(ERROR_CODE errorCode, String message,
2101         TableInfo table, HbckInfo info1, HbckInfo info2) {
2102       calledCount++;
2103     }
2104 
2105     @Override
2106     public int summarize() {
2107       return ++calledCount;
2108     }
2109 
2110     @Override
2111     public void detail(String details) {
2112       calledCount++;
2113     }
2114 
2115     @Override
2116     public ArrayList<ERROR_CODE> getErrorList() {
2117       calledCount++;
2118       return new ArrayList<ERROR_CODE>();
2119     }
2120 
2121     @Override
2122     public void progress() {
2123       calledCount++;
2124     }
2125 
2126     @Override
2127     public void print(String message) {
2128       calledCount++;
2129     }
2130 
2131     @Override
2132     public void resetErrors() {
2133       calledCount++;
2134     }
2135 
2136     @Override
2137     public boolean tableHasErrors(TableInfo table) {
2138       calledCount++;
2139       return false;
2140     }
2141   }
2142 
2143   @Test(timeout=180000)
2144   public void testCheckTableLocks() throws Exception {
2145     IncrementingEnvironmentEdge edge = new IncrementingEnvironmentEdge(0);
2146     EnvironmentEdgeManager.injectEdge(edge);
2147     // check no errors
2148     HBaseFsck hbck = doFsck(conf, false);
2149     assertNoErrors(hbck);
2150 
2151     ServerName mockName = ServerName.valueOf("localhost", 60000, 1);
2152 
2153     // obtain one lock
2154     final TableLockManager tableLockManager = TableLockManager.createTableLockManager(conf, TEST_UTIL.getZooKeeperWatcher(), mockName);
2155     TableLock writeLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2156         "testCheckTableLocks");
2157     writeLock.acquire();
2158     hbck = doFsck(conf, false);
2159     assertNoErrors(hbck); // should not have expired, no problems
2160 
2161     edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2162         TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2163 
2164     hbck = doFsck(conf, false);
2165     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK});
2166 
2167     final CountDownLatch latch = new CountDownLatch(1);
2168     new Thread() {
2169       @Override
2170       public void run() {
2171         TableLock readLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2172             "testCheckTableLocks");
2173         try {
2174           latch.countDown();
2175           readLock.acquire();
2176         } catch (IOException ex) {
2177           fail();
2178         } catch (IllegalStateException ex) {
2179           return; // expected, since this will be reaped under us.
2180         }
2181         fail("should not have come here");
2182       };
2183     }.start();
2184 
2185     latch.await(); // wait until thread starts
2186     Threads.sleep(300); // wait some more to ensure writeLock.acquire() is called
2187 
2188     hbck = doFsck(conf, false);
2189     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK}); // still one expired, one not-expired
2190 
2191     edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2192         TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2193 
2194     hbck = doFsck(conf, false);
2195     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK, ERROR_CODE.EXPIRED_TABLE_LOCK}); // both are expired
2196 
2197     conf.setLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT, 1); // reaping from ZKInterProcessWriteLock uses znode cTime,
2198                                                                  // which is not injectable through EnvironmentEdge
2199     Threads.sleep(10);
2200     hbck = doFsck(conf, true); // now fix both cases
2201 
2202     hbck = doFsck(conf, false);
2203     assertNoErrors(hbck);
2204 
2205     // ensure that locks are deleted
2206     writeLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2207         "should acquire without blocking");
2208     writeLock.acquire(); // this should not block.
2209     writeLock.release(); // release for clean state
2210   }
2211 
2212   /**
2213    * Test orphaned table ZNode (for table states)
2214    */
2215   @Test
2216   public void testOrphanedTableZNode() throws Exception {
2217     TableName table = TableName.valueOf("testOrphanedZKTableEntry");
2218 
2219     try {
2220       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager().getZKTable().
2221       setEnablingTable(table);
2222 
2223       try {
2224         setupTable(table);
2225         Assert.fail(
2226           "Create table should fail when its ZNode has already existed with ENABLING state.");
2227       } catch(TableExistsException t) {
2228         //Expected exception
2229       }
2230       // The setup table was interrupted in some state that needs to some cleanup.
2231       try {
2232         deleteTable(table);
2233       } catch (IOException e) {
2234         // Because create table failed, it is expected that the cleanup table would
2235         // throw some exception.  Ignore and continue.
2236       }
2237 
2238       HBaseFsck hbck = doFsck(conf, false);
2239       assertTrue(hbck.getErrors().getErrorList().contains(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY));
2240 
2241       // fix the orphaned ZK entry
2242       hbck = doFsck(conf, true);
2243 
2244       // check that orpahned ZK table entry is gone.
2245       hbck = doFsck(conf, false);
2246       assertFalse(hbck.getErrors().getErrorList().contains(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY));
2247       // Now create table should succeed.
2248       setupTable(table);
2249     } finally {
2250       // This code could be called that either a table was created successfully or set up
2251       // table failed in some unknown state.  Therefore, clean up can either succeed or fail.
2252       try {
2253         deleteTable(table);
2254       } catch (IOException e) {
2255         // The cleanup table would throw some exception if create table failed in some state.
2256         // Ignore this exception
2257       }
2258     }
2259   }
2260 
2261   @Test
2262   public void testMetaOffline() throws Exception {
2263     // check no errors
2264     HBaseFsck hbck = doFsck(conf, false);
2265     assertNoErrors(hbck);
2266     deleteMetaRegion(conf, true, false, false);
2267     hbck = doFsck(conf, false);
2268     // ERROR_CODE.UNKNOWN is coming because we reportError with a message for the hbase:meta
2269     // inconsistency and whether we will be fixing it or not.
2270     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
2271     hbck = doFsck(conf, true);
2272     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
2273     hbck = doFsck(conf, false);
2274     assertNoErrors(hbck);
2275   }
2276 
2277   private void deleteMetaRegion(Configuration conf, boolean unassign, boolean hdfs,
2278       boolean regionInfoOnly) throws IOException, InterruptedException {
2279     HConnection connection = HConnectionManager.getConnection(conf);
2280     HRegionLocation metaLocation = connection.locateRegion(TableName.META_TABLE_NAME,
2281         HConstants.EMPTY_START_ROW);
2282     ServerName hsa = metaLocation.getServerName();
2283     HRegionInfo hri = metaLocation.getRegionInfo();
2284     if (unassign) {
2285       LOG.info("Undeploying meta region " + hri + " from server " + hsa);
2286       undeployRegion(new HBaseAdmin(conf), hsa, hri);
2287     }
2288 
2289     if (regionInfoOnly) {
2290       LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
2291       Path rootDir = FSUtils.getRootDir(conf);
2292       FileSystem fs = rootDir.getFileSystem(conf);
2293       Path p = new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(),
2294           hri.getEncodedName());
2295       Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
2296       fs.delete(hriPath, true);
2297     }
2298 
2299     if (hdfs) {
2300       LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
2301       Path rootDir = FSUtils.getRootDir(conf);
2302       FileSystem fs = rootDir.getFileSystem(conf);
2303       Path p = new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(),
2304           hri.getEncodedName());
2305       HBaseFsck.debugLsr(conf, p);
2306       boolean success = fs.delete(p, true);
2307       LOG.info("Deleted " + p + " sucessfully? " + success);
2308       HBaseFsck.debugLsr(conf, p);
2309     }
2310   }
2311 
2312   @Test
2313   public void testTableWithNoRegions() throws Exception {
2314     // We might end up with empty regions in a table
2315     // see also testNoHdfsTable()
2316     TableName table =
2317         TableName.valueOf(name.getMethodName());
2318     try {
2319       // create table with one region
2320       HTableDescriptor desc = new HTableDescriptor(table);
2321       HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
2322       desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
2323       TEST_UTIL.getHBaseAdmin().createTable(desc);
2324       tbl = new HTable(TEST_UTIL.getConfiguration(), table, executorService);
2325 
2326       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
2327       deleteRegion(conf, tbl.getTableDescriptor(), HConstants.EMPTY_START_ROW, HConstants.EMPTY_END_ROW, false,
2328           false, true);
2329 
2330       HBaseFsck hbck = doFsck(conf, false);
2331       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_HDFS });
2332 
2333       doFsck(conf, true);
2334 
2335       // fix hole
2336       doFsck(conf, true);
2337 
2338       // check that hole fixed
2339       assertNoErrors(doFsck(conf, false));
2340     } finally {
2341       deleteTable(table);
2342     }
2343 
2344   }
2345 
2346   @Test
2347   public void testHbckAfterRegionMerge() throws Exception {
2348     TableName table = TableName.valueOf("testMergeRegionFilesInHdfs");
2349     HTable meta = null;
2350     try {
2351       // disable CatalogJanitor
2352       TEST_UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(false);
2353       setupTable(table);
2354       assertEquals(ROWKEYS.length, countRows());
2355 
2356       // make sure data in regions, if in hlog only there is no data loss
2357       TEST_UTIL.getHBaseAdmin().flush(table.getName());
2358       HRegionInfo region1 = tbl.getRegionLocation("A").getRegionInfo();
2359       HRegionInfo region2 = tbl.getRegionLocation("B").getRegionInfo();
2360 
2361       int regionCountBeforeMerge = tbl.getRegionLocations().size();
2362 
2363       assertNotEquals(region1, region2);
2364 
2365       // do a region merge
2366       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
2367       admin.mergeRegions(region1.getEncodedNameAsBytes(),
2368           region2.getEncodedNameAsBytes(), false);
2369 
2370       // wait until region merged
2371       long timeout = System.currentTimeMillis() + 30 * 1000;
2372       while (true) {
2373         if (tbl.getRegionLocations().size() < regionCountBeforeMerge) {
2374           break;
2375         } else if (System.currentTimeMillis() > timeout) {
2376           fail("Time out waiting on region " + region1.getEncodedName()
2377               + " and " + region2.getEncodedName() + " be merged");
2378         }
2379         Thread.sleep(10);
2380       }
2381 
2382       assertEquals(ROWKEYS.length, countRows());
2383 
2384       HBaseFsck hbck = doFsck(conf, false);
2385       assertNoErrors(hbck); // no errors
2386 
2387     } finally {
2388       TEST_UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(true);
2389       deleteTable(table);
2390       IOUtils.closeQuietly(meta);
2391     }
2392   }
2393 
2394   @Test
2395   public void testRegionBoundariesCheck() throws Exception {
2396     HBaseFsck hbck = doFsck(conf, false);
2397     assertNoErrors(hbck); // no errors
2398     try {
2399       hbck.checkRegionBoundaries();
2400     } catch (IllegalArgumentException e) {
2401       if (e.getMessage().endsWith("not a valid DFS filename.")) {
2402         fail("Table directory path is not valid." + e.getMessage());
2403       }
2404     }
2405   }
2406 
  /**
   * JUnit rule giving access to the currently running test; its
   * {@code getMethodName()} is used by testTableWithNoRegions to derive the
   * table name.
   */
  @org.junit.Rule
  public TestName name = new TestName();
2409 
2410   @Test
2411   public void testReadOnlyProperty() throws Exception {
2412     HBaseFsck hbck = doFsck(conf, false);
2413     Assert.assertEquals("shouldIgnorePreCheckPermission", true,
2414       hbck.shouldIgnorePreCheckPermission());
2415 
2416     hbck = doFsck(conf, true);
2417     Assert.assertEquals("shouldIgnorePreCheckPermission", false,
2418       hbck.shouldIgnorePreCheckPermission());
2419 
2420     hbck = doFsck(conf, true);
2421     hbck.setIgnorePreCheckPermission(true);
2422     Assert.assertEquals("shouldIgnorePreCheckPermission", true,
2423       hbck.shouldIgnorePreCheckPermission());
2424   }
2425 
2426   @Before
2427   public void setUp() {
2428     EnvironmentEdgeManager.reset();
2429   }
2430 
2431   @Test (timeout=180000)
2432   public void testCleanUpDaughtersNotInMetaAfterFailedSplit() throws Exception {
2433     TableName table = TableName.valueOf("testCleanUpDaughtersNotInMetaAfterFailedSplit");
2434     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
2435     try {
2436       HTableDescriptor desc = new HTableDescriptor(table);
2437       desc.addFamily(new HColumnDescriptor(Bytes.toBytes("f")));
2438       TEST_UTIL.getHBaseAdmin().createTable(desc);
2439       tbl = new HTable(cluster.getConfiguration(), desc.getTableName());
2440       for (int i = 0; i < 5; i++) {
2441         Put p1 = new Put(("r" + i).getBytes());
2442         p1.add(Bytes.toBytes("f"), "q1".getBytes(), "v".getBytes());
2443         tbl.put(p1);
2444       }
2445       TEST_UTIL.getHBaseAdmin().flush(desc.getTableName().toString());
2446       List<HRegion> regions = cluster.getRegions(desc.getTableName());
2447       int serverWith = cluster.getServerWith(regions.get(0).getRegionName());
2448       HRegionServer regionServer = cluster.getRegionServer(serverWith);
2449       cluster.getServerWith(regions.get(0).getRegionName());
2450       SplitTransaction st = new SplitTransaction(regions.get(0), Bytes.toBytes("r3"));
2451       st.prepare();
2452       st.stepsBeforePONR(regionServer, regionServer, false);
2453       AssignmentManager am = cluster.getMaster().getAssignmentManager();
2454       Map<String, RegionState> regionsInTransition = am.getRegionStates().getRegionsInTransition();
2455       for (RegionState state : regionsInTransition.values()) {
2456         am.regionOffline(state.getRegion());
2457       }
2458       ZKAssign.deleteNodeFailSilent(regionServer.getZooKeeper(), regions.get(0).getRegionInfo());
2459       Map<HRegionInfo, ServerName> regionsMap = new HashMap<HRegionInfo, ServerName>();
2460       regionsMap.put(regions.get(0).getRegionInfo(), regionServer.getServerName());
2461       am.assign(regionsMap);
2462       am.waitForAssignment(regions.get(0).getRegionInfo());
2463       HBaseFsck hbck = doFsck(conf, false);
2464       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
2465           ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
2466       // holes are separate from overlap groups
2467       assertEquals(0, hbck.getOverlapGroups(table).size());
2468 
2469       // fix hole
2470       assertErrors(
2471         doFsck(
2472           conf, false, true, false, false, false, false, false, false, false, false, false, null),
2473         new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
2474           ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
2475 
2476       // check that hole fixed
2477       assertNoErrors(doFsck(conf, false));
2478       assertEquals(5, countRows());
2479     } finally {
2480       if (tbl != null) {
2481         tbl.close();
2482         tbl = null;
2483       }
2484       deleteTable(table);
2485     }
2486   }
2487 }