View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.util;
20  
21  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertErrors;
22  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertNoErrors;
23  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.doFsck;
24  import static org.junit.Assert.assertEquals;
25  import static org.junit.Assert.assertFalse;
26  import static org.junit.Assert.assertNotEquals;
27  import static org.junit.Assert.assertNotNull;
28  import static org.junit.Assert.assertTrue;
29  import static org.junit.Assert.fail;
30  
31  import java.io.IOException;
32  import java.util.ArrayList;
33  import java.util.Collection;
34  import java.util.HashMap;
35  import java.util.LinkedList;
36  import java.util.List;
37  import java.util.Map;
38  import java.util.Map.Entry;
39  import java.util.concurrent.Callable;
40  import java.util.concurrent.CountDownLatch;
41  import java.util.concurrent.ExecutorService;
42  import java.util.concurrent.Executors;
43  import java.util.concurrent.Future;
44  import java.util.concurrent.ScheduledThreadPoolExecutor;
45  import java.util.concurrent.SynchronousQueue;
46  import java.util.concurrent.ThreadPoolExecutor;
47  import java.util.concurrent.TimeUnit;
48  import java.util.concurrent.atomic.AtomicBoolean;
49  
50  import org.apache.commons.io.IOUtils;
51  import org.apache.commons.logging.Log;
52  import org.apache.commons.logging.LogFactory;
53  import org.apache.hadoop.conf.Configuration;
54  import org.apache.hadoop.fs.FileStatus;
55  import org.apache.hadoop.fs.FileSystem;
56  import org.apache.hadoop.fs.Path;
57  import org.apache.hadoop.hbase.ClusterStatus;
58  import org.apache.hadoop.hbase.HBaseTestingUtility;
59  import org.apache.hadoop.hbase.HColumnDescriptor;
60  import org.apache.hadoop.hbase.HConstants;
61  import org.apache.hadoop.hbase.HRegionInfo;
62  import org.apache.hadoop.hbase.HRegionLocation;
63  import org.apache.hadoop.hbase.HTableDescriptor;
64  import org.apache.hadoop.hbase.testclassification.LargeTests;
65  import org.apache.hadoop.hbase.MiniHBaseCluster;
66  import org.apache.hadoop.hbase.ServerName;
67  import org.apache.hadoop.hbase.TableName;
68  import org.apache.hadoop.hbase.catalog.MetaEditor;
69  import org.apache.hadoop.hbase.client.Delete;
70  import org.apache.hadoop.hbase.client.Durability;
71  import org.apache.hadoop.hbase.client.Get;
72  import org.apache.hadoop.hbase.client.HBaseAdmin;
73  import org.apache.hadoop.hbase.client.HConnection;
74  import org.apache.hadoop.hbase.client.HConnectionManager;
75  import org.apache.hadoop.hbase.client.HTable;
76  import org.apache.hadoop.hbase.client.MetaScanner;
77  import org.apache.hadoop.hbase.client.Put;
78  import org.apache.hadoop.hbase.client.Result;
79  import org.apache.hadoop.hbase.client.ResultScanner;
80  import org.apache.hadoop.hbase.client.Scan;
81  import org.apache.hadoop.hbase.io.hfile.TestHFile;
82  import org.apache.hadoop.hbase.master.AssignmentManager;
83  import org.apache.hadoop.hbase.master.HMaster;
84  import org.apache.hadoop.hbase.master.RegionState;
85  import org.apache.hadoop.hbase.master.RegionStates;
86  import org.apache.hadoop.hbase.master.TableLockManager;
87  import org.apache.hadoop.hbase.master.TableLockManager.TableLock;
88  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
89  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
90  import org.apache.hadoop.hbase.regionserver.HRegion;
91  import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
92  import org.apache.hadoop.hbase.regionserver.HRegionServer;
93  import org.apache.hadoop.hbase.regionserver.SplitTransaction;
94  import org.apache.hadoop.hbase.regionserver.TestEndToEndSplitTransaction;
95  import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter;
96  import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
97  import org.apache.hadoop.hbase.util.HBaseFsck.HbckInfo;
98  import org.apache.hadoop.hbase.util.HBaseFsck.PrintingErrorReporter;
99  import org.apache.hadoop.hbase.util.HBaseFsck.TableInfo;
100 import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
101 import org.apache.hadoop.hbase.util.hbck.HbckTestingUtil;
102 import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
103 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
104 import org.apache.zookeeper.KeeperException;
105 import org.junit.AfterClass;
106 import org.junit.Assert;
107 import org.junit.BeforeClass;
108 import org.junit.Ignore;
109 import org.junit.Test;
110 import org.junit.experimental.categories.Category;
111 import org.junit.rules.TestName;
112 
113 import com.google.common.collect.Multimap;
114 
115 /**
116  * This tests HBaseFsck's ability to detect reasons for inconsistent tables.
117  */
118 @Category(LargeTests.class)
119 public class TestHBaseFsck {
120   final static Log LOG = LogFactory.getLog(TestHBaseFsck.class);
121   private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
122   private final static Configuration conf = TEST_UTIL.getConfiguration();
123   private final static String FAM_STR = "fam";
124   private final static byte[] FAM = Bytes.toBytes(FAM_STR);
125   private final static int REGION_ONLINE_TIMEOUT = 800;
126   private static RegionStates regionStates;
127   private static ExecutorService executorService;
128 
129   // for the instance, reset every test run
130   private HTable tbl;
131   private final static byte[][] SPLITS = new byte[][] { Bytes.toBytes("A"),
132     Bytes.toBytes("B"), Bytes.toBytes("C") };
133   // one row per region.
134   private final static byte[][] ROWKEYS= new byte[][] {
135     Bytes.toBytes("00"), Bytes.toBytes("50"), Bytes.toBytes("A0"), Bytes.toBytes("A5"),
136     Bytes.toBytes("B0"), Bytes.toBytes("B5"), Bytes.toBytes("C0"), Bytes.toBytes("C5") };
137 
138   @SuppressWarnings("deprecation")
139   @BeforeClass
140   public static void setUpBeforeClass() throws Exception {
141     TEST_UTIL.getConfiguration().setInt("hbase.regionserver.handler.count", 2);
142     TEST_UTIL.getConfiguration().setInt("hbase.regionserver.metahandler.count", 2);
143     TEST_UTIL.getConfiguration().setInt("hbase.hbck.close.timeout", 2 * REGION_ONLINE_TIMEOUT);
144     TEST_UTIL.startMiniCluster(3);
145     TEST_UTIL.setHDFSClientRetry(0);
146 
147     executorService = new ThreadPoolExecutor(1, Integer.MAX_VALUE, 60, TimeUnit.SECONDS,
148         new SynchronousQueue<Runnable>(), Threads.newDaemonThreadFactory("testhbck"));
149 
150     AssignmentManager assignmentManager =
151       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager();
152     regionStates = assignmentManager.getRegionStates();
153     TEST_UTIL.getHBaseAdmin().setBalancerRunning(false, true);
154   }
155 
156   @AfterClass
157   public static void tearDownAfterClass() throws Exception {
158     TEST_UTIL.shutdownMiniCluster();
159   }
160 
161   @Test
162   public void testHBaseFsck() throws Exception {
163     assertNoErrors(doFsck(conf, false));
164     String table = "tableBadMetaAssign";
165     TEST_UTIL.createTable(Bytes.toBytes(table), FAM);
166 
167     // We created 1 table, should be fine
168     assertNoErrors(doFsck(conf, false));
169 
170     // Now let's mess it up and change the assignment in hbase:meta to
171     // point to a different region server
172     HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
173     Scan scan = new Scan();
174     scan.setStartRow(Bytes.toBytes(table+",,"));
175     ResultScanner scanner = meta.getScanner(scan);
176     HRegionInfo hri = null;
177 
178     Result res = scanner.next();
179     ServerName currServer =
180       ServerName.parseFrom(res.getValue(HConstants.CATALOG_FAMILY,
181           HConstants.SERVER_QUALIFIER));
182     long startCode = Bytes.toLong(res.getValue(HConstants.CATALOG_FAMILY,
183         HConstants.STARTCODE_QUALIFIER));
184 
185     for (JVMClusterUtil.RegionServerThread rs :
186         TEST_UTIL.getHBaseCluster().getRegionServerThreads()) {
187 
188       ServerName sn = rs.getRegionServer().getServerName();
189 
190       // When we find a diff RS, change the assignment and break
191       if (!currServer.getHostAndPort().equals(sn.getHostAndPort()) ||
192           startCode != sn.getStartcode()) {
193         Put put = new Put(res.getRow());
194         put.setDurability(Durability.SKIP_WAL);
195         put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
196           Bytes.toBytes(sn.getHostAndPort()));
197         put.add(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER,
198           Bytes.toBytes(sn.getStartcode()));
199         meta.put(put);
200         hri = HRegionInfo.getHRegionInfo(res);
201         break;
202       }
203     }
204 
205     // Try to fix the data
206     assertErrors(doFsck(conf, true), new ERROR_CODE[]{
207         ERROR_CODE.SERVER_DOES_NOT_MATCH_META});
208 
209     TEST_UTIL.getHBaseCluster().getMaster()
210       .getAssignmentManager().waitForAssignment(hri);
211 
212     // Should be fixed now
213     assertNoErrors(doFsck(conf, false));
214 
215     // comment needed - what is the purpose of this line
216     HTable t = new HTable(conf, Bytes.toBytes(table), executorService);
217     ResultScanner s = t.getScanner(new Scan());
218     s.close();
219     t.close();
220 
221     scanner.close();
222     meta.close();
223   }
224 
225   @Test(timeout=180000)
226   public void testFixAssignmentsWhenMETAinTransition() throws Exception {
227     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
228     HBaseAdmin admin = null;
229     try {
230       admin = new HBaseAdmin(TEST_UTIL.getConfiguration());
231       admin.closeRegion(cluster.getServerHoldingMeta(),
232           HRegionInfo.FIRST_META_REGIONINFO);
233     } finally {
234       if (admin != null) {
235         admin.close();
236       }
237     }
238     regionStates.regionOffline(HRegionInfo.FIRST_META_REGIONINFO);
239     MetaRegionTracker.deleteMetaLocation(cluster.getMaster().getZooKeeper());
240     assertFalse(regionStates.isRegionOnline(HRegionInfo.FIRST_META_REGIONINFO));
241     HBaseFsck hbck = doFsck(conf, true);
242     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.UNKNOWN, ERROR_CODE.NO_META_REGION,
243         ERROR_CODE.NULL_META_REGION });
244     assertNoErrors(doFsck(conf, false));
245   }
246 
247   /**
248    * Create a new region in META.
249    */
250   private HRegionInfo createRegion(Configuration conf, final HTableDescriptor
251       htd, byte[] startKey, byte[] endKey)
252       throws IOException {
253     HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
254     HRegionInfo hri = new HRegionInfo(htd.getTableName(), startKey, endKey);
255     MetaEditor.addRegionToMeta(meta, hri);
256     meta.close();
257     return hri;
258   }
259 
260   /**
261    * Debugging method to dump the contents of meta.
262    */
263   private void dumpMeta(TableName tableName) throws IOException {
264     List<byte[]> metaRows = TEST_UTIL.getMetaTableRows(tableName);
265     for (byte[] row : metaRows) {
266       LOG.info(Bytes.toString(row));
267     }
268   }
269 
270   /**
271    * This method is used to undeploy a region -- close it and attempt to
272    * remove its state from the Master.
273    */
274   private void undeployRegion(HBaseAdmin admin, ServerName sn,
275       HRegionInfo hri) throws IOException, InterruptedException {
276     try {
277       HBaseFsckRepair.closeRegionSilentlyAndWait(admin, sn, hri);
278       if (!hri.isMetaTable()) {
279         admin.offline(hri.getRegionName());
280       }
281     } catch (IOException ioe) {
282       LOG.warn("Got exception when attempting to offline region "
283           + Bytes.toString(hri.getRegionName()), ioe);
284     }
285   }
286   /**
287    * Delete a region from assignments, meta, or completely from hdfs.
288    * @param unassign if true unassign region if assigned
289    * @param metaRow  if true remove region's row from META
290    * @param hdfs if true remove region's dir in HDFS
291    */
292   private void deleteRegion(Configuration conf, final HTableDescriptor htd,
293       byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
294       boolean hdfs) throws IOException, InterruptedException {
295     deleteRegion(conf, htd, startKey, endKey, unassign, metaRow, hdfs, false);
296   }
297 
298   /**
299    * Delete a region from assignments, meta, or completely from hdfs.
300    * @param unassign if true unassign region if assigned
301    * @param metaRow  if true remove region's row from META
302    * @param hdfs if true remove region's dir in HDFS
303    * @param regionInfoOnly if true remove a region dir's .regioninfo file
304    */
305   private void deleteRegion(Configuration conf, final HTableDescriptor htd,
306       byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
307       boolean hdfs, boolean regionInfoOnly) throws IOException, InterruptedException {
308     LOG.info("** Before delete:");
309     dumpMeta(htd.getTableName());
310 
311     Map<HRegionInfo, ServerName> hris = tbl.getRegionLocations();
312     for (Entry<HRegionInfo, ServerName> e: hris.entrySet()) {
313       HRegionInfo hri = e.getKey();
314       ServerName hsa = e.getValue();
315       if (Bytes.compareTo(hri.getStartKey(), startKey) == 0
316           && Bytes.compareTo(hri.getEndKey(), endKey) == 0) {
317 
318         LOG.info("RegionName: " +hri.getRegionNameAsString());
319         byte[] deleteRow = hri.getRegionName();
320 
321         if (unassign) {
322           LOG.info("Undeploying region " + hri + " from server " + hsa);
323           undeployRegion(new HBaseAdmin(conf), hsa, hri);
324         }
325 
326         if (regionInfoOnly) {
327           LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
328           Path rootDir = FSUtils.getRootDir(conf);
329           FileSystem fs = rootDir.getFileSystem(conf);
330           Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
331               hri.getEncodedName());
332           Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
333           fs.delete(hriPath, true);
334         }
335 
336         if (hdfs) {
337           LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
338           Path rootDir = FSUtils.getRootDir(conf);
339           FileSystem fs = rootDir.getFileSystem(conf);
340           Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
341               hri.getEncodedName());
342           HBaseFsck.debugLsr(conf, p);
343           boolean success = fs.delete(p, true);
344           LOG.info("Deleted " + p + " sucessfully? " + success);
345           HBaseFsck.debugLsr(conf, p);
346         }
347 
348         if (metaRow) {
349           HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
350           Delete delete = new Delete(deleteRow);
351           meta.delete(delete);
352         }
353       }
354       LOG.info(hri.toString() + hsa.toString());
355     }
356 
357     TEST_UTIL.getMetaTableRows(htd.getTableName());
358     LOG.info("*** After delete:");
359     dumpMeta(htd.getTableName());
360   }
361 
362   /**
363    * Setup a clean table before we start mucking with it.
364    *
365    * @throws IOException
366    * @throws InterruptedException
367    * @throws KeeperException
368    */
369   HTable setupTable(TableName tablename) throws Exception {
370     HTableDescriptor desc = new HTableDescriptor(tablename);
371     HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
372     desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
373     TEST_UTIL.getHBaseAdmin().createTable(desc, SPLITS);
374     tbl = new HTable(TEST_UTIL.getConfiguration(), tablename, executorService);
375 
376     List<Put> puts = new ArrayList<Put>();
377     for (byte[] row : ROWKEYS) {
378       Put p = new Put(row);
379       p.add(FAM, Bytes.toBytes("val"), row);
380       puts.add(p);
381     }
382     tbl.put(puts);
383     tbl.flushCommits();
384     return tbl;
385   }
386 
387   /**
388    * Counts the number of row to verify data loss or non-dataloss.
389    */
390   int countRows() throws IOException {
391      Scan s = new Scan();
392      ResultScanner rs = tbl.getScanner(s);
393      int i = 0;
394      while(rs.next() !=null) {
395        i++;
396      }
397      return i;
398   }
399 
400   /**
401    * delete table in preparation for next test
402    *
403    * @param tablename
404    * @throws IOException
405    */
406   void deleteTable(TableName tablename) throws IOException {
407     HBaseAdmin admin = new HBaseAdmin(conf);
408     admin.getConnection().clearRegionCache();
409     if (admin.isTableEnabled(tablename)) {
410       admin.disableTableAsync(tablename);
411     }
412     long totalWait = 0;
413     long maxWait = 30*1000;
414     long sleepTime = 250;
415     while (!admin.isTableDisabled(tablename)) {
416       try {
417         Thread.sleep(sleepTime);
418         totalWait += sleepTime;
419         if (totalWait >= maxWait) {
420           fail("Waited too long for table to be disabled + " + tablename);
421         }
422       } catch (InterruptedException e) {
423         e.printStackTrace();
424         fail("Interrupted when trying to disable table " + tablename);
425       }
426     }
427     admin.deleteTable(tablename);
428   }
429 
430   /**
431    * This creates a clean table and confirms that the table is clean.
432    */
433   @Test
434   public void testHBaseFsckClean() throws Exception {
435     assertNoErrors(doFsck(conf, false));
436     TableName table = TableName.valueOf("tableClean");
437     try {
438       HBaseFsck hbck = doFsck(conf, false);
439       assertNoErrors(hbck);
440 
441       setupTable(table);
442       assertEquals(ROWKEYS.length, countRows());
443 
444       // We created 1 table, should be fine
445       hbck = doFsck(conf, false);
446       assertNoErrors(hbck);
447       assertEquals(0, hbck.getOverlapGroups(table).size());
448       assertEquals(ROWKEYS.length, countRows());
449     } finally {
450       deleteTable(table);
451     }
452   }
453 
454   /**
455    * Test thread pooling in the case where there are more regions than threads
456    */
457   @Test
458   public void testHbckThreadpooling() throws Exception {
459     TableName table =
460         TableName.valueOf("tableDupeStartKey");
461     try {
462       // Create table with 4 regions
463       setupTable(table);
464 
465       // limit number of threads to 1.
466       Configuration newconf = new Configuration(conf);
467       newconf.setInt("hbasefsck.numthreads", 1);
468       assertNoErrors(doFsck(newconf, false));
469 
470       // We should pass without triggering a RejectedExecutionException
471     } finally {
472       deleteTable(table);
473     }
474   }
475 
476   @Test
477   public void testHbckFixOrphanTable() throws Exception {
478     TableName table = TableName.valueOf("tableInfo");
479     FileSystem fs = null;
480     Path tableinfo = null;
481     try {
482       setupTable(table);
483       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
484 
485       Path hbaseTableDir = FSUtils.getTableDir(
486           FSUtils.getRootDir(conf), table);
487       fs = hbaseTableDir.getFileSystem(conf);
488       FileStatus status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
489       tableinfo = status.getPath();
490       fs.rename(tableinfo, new Path("/.tableinfo"));
491 
492       //to report error if .tableinfo is missing.
493       HBaseFsck hbck = doFsck(conf, false);
494       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_TABLEINFO_FILE });
495 
496       // fix OrphanTable with default .tableinfo (htd not yet cached on master)
497       hbck = doFsck(conf, true);
498       assertNoErrors(hbck);
499       status = null;
500       status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
501       assertNotNull(status);
502 
503       HTableDescriptor htd = admin.getTableDescriptor(table);
504       htd.setValue("NOT_DEFAULT", "true");
505       admin.disableTable(table);
506       admin.modifyTable(table, htd);
507       admin.enableTable(table);
508       fs.delete(status.getPath(), true);
509 
510       // fix OrphanTable with cache
511       htd = admin.getTableDescriptor(table); // warms up cached htd on master
512       hbck = doFsck(conf, true);
513       assertNoErrors(hbck);
514       status = null;
515       status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
516       assertNotNull(status);
517       htd = admin.getTableDescriptor(table);
518       assertEquals(htd.getValue("NOT_DEFAULT"), "true");
519     } finally {
520       fs.rename(new Path("/.tableinfo"), tableinfo);
521       deleteTable(table);
522     }
523   }
524 
525   /**
526    * This test makes sure that parallel instances of Hbck is disabled.
527    *
528    * @throws Exception
529    */
530   @Test
531   public void testParallelHbck() throws Exception {
532     final ExecutorService service;
533     final Future<HBaseFsck> hbck1,hbck2;
534 
535     class RunHbck implements Callable<HBaseFsck>{
536       boolean fail = true;
537       @Override
538       public HBaseFsck call(){
539         try{
540           return doFsck(conf, false);
541         } catch(Exception e){
542           if (e.getMessage().contains("Duplicate hbck")) {
543             fail = false;
544           } else {
545             LOG.fatal("hbck failed.", e);
546           }
547         }
548         // If we reach here, then an exception was caught
549         if (fail) fail();
550         return null;
551       }
552     }
553     service = Executors.newFixedThreadPool(2);
554     hbck1 = service.submit(new RunHbck());
555     hbck2 = service.submit(new RunHbck());
556     service.shutdown();
557     //wait for 15 seconds, for both hbck calls finish
558     service.awaitTermination(15, TimeUnit.SECONDS);
559     HBaseFsck h1 = hbck1.get();
560     HBaseFsck h2 = hbck2.get();
561     // Make sure only one of the calls was successful
562     assert(h1 == null || h2 == null);
563     if (h1 != null) {
564       assert(h1.getRetCode() >= 0);
565     }
566     if (h2 != null) {
567       assert(h2.getRetCode() >= 0);
568     }
569   }
570 
571   /**
572    * This create and fixes a bad table with regions that have a duplicate
573    * start key
574    */
575   @Test
576   public void testDupeStartKey() throws Exception {
577     TableName table =
578         TableName.valueOf("tableDupeStartKey");
579     try {
580       setupTable(table);
581       assertNoErrors(doFsck(conf, false));
582       assertEquals(ROWKEYS.length, countRows());
583 
584       // Now let's mess it up, by adding a region with a duplicate startkey
585       HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
586           Bytes.toBytes("A"), Bytes.toBytes("A2"));
587       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
588       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
589           .waitForAssignment(hriDupe);
590       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
591       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
592 
593       HBaseFsck hbck = doFsck(conf, false);
594       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
595             ERROR_CODE.DUPE_STARTKEYS});
596       assertEquals(2, hbck.getOverlapGroups(table).size());
597       assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.
598 
599       // fix the degenerate region.
600       doFsck(conf,true);
601 
602       // check that the degenerate region is gone and no data loss
603       HBaseFsck hbck2 = doFsck(conf,false);
604       assertNoErrors(hbck2);
605       assertEquals(0, hbck2.getOverlapGroups(table).size());
606       assertEquals(ROWKEYS.length, countRows());
607     } finally {
608       deleteTable(table);
609     }
610   }
611 
612   /**
613    * Get region info from local cluster.
614    */
615   Map<ServerName, List<String>> getDeployedHRIs(
616       final HBaseAdmin admin) throws IOException {
617     ClusterStatus status = admin.getClusterStatus();
618     Collection<ServerName> regionServers = status.getServers();
619     Map<ServerName, List<String>> mm =
620         new HashMap<ServerName, List<String>>();
621     HConnection connection = admin.getConnection();
622     for (ServerName hsi : regionServers) {
623       AdminProtos.AdminService.BlockingInterface server = connection.getAdmin(hsi);
624 
625       // list all online regions from this region server
626       List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(server);
627       List<String> regionNames = new ArrayList<String>();
628       for (HRegionInfo hri : regions) {
629         regionNames.add(hri.getRegionNameAsString());
630       }
631       mm.put(hsi, regionNames);
632     }
633     return mm;
634   }
635 
636   /**
637    * Returns the HSI a region info is on.
638    */
639   ServerName findDeployedHSI(Map<ServerName, List<String>> mm, HRegionInfo hri) {
640     for (Map.Entry<ServerName,List <String>> e : mm.entrySet()) {
641       if (e.getValue().contains(hri.getRegionNameAsString())) {
642         return e.getKey();
643       }
644     }
645     return null;
646   }
647 
648   /**
649    * This create and fixes a bad table with regions that have a duplicate
650    * start key
651    */
652   @Test
653   public void testDupeRegion() throws Exception {
654     TableName table =
655         TableName.valueOf("tableDupeRegion");
656     try {
657       setupTable(table);
658       assertNoErrors(doFsck(conf, false));
659       assertEquals(ROWKEYS.length, countRows());
660 
661       // Now let's mess it up, by adding a region with a duplicate startkey
662       HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
663           Bytes.toBytes("A"), Bytes.toBytes("B"));
664 
665       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
666       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
667           .waitForAssignment(hriDupe);
668       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
669       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
670 
671       // Yikes! The assignment manager can't tell between diff between two
672       // different regions with the same start/endkeys since it doesn't
673       // differentiate on ts/regionId!  We actually need to recheck
674       // deployments!
675       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
676       while (findDeployedHSI(getDeployedHRIs(admin), hriDupe) == null) {
677         Thread.sleep(250);
678       }
679 
680       LOG.debug("Finished assignment of dupe region");
681 
682       // TODO why is dupe region different from dupe start keys?
683       HBaseFsck hbck = doFsck(conf, false);
684       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
685             ERROR_CODE.DUPE_STARTKEYS});
686       assertEquals(2, hbck.getOverlapGroups(table).size());
687       assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.
688 
689       // fix the degenerate region.
690       doFsck(conf,true);
691 
692       // check that the degenerate region is gone and no data loss
693       HBaseFsck hbck2 = doFsck(conf,false);
694       assertNoErrors(hbck2);
695       assertEquals(0, hbck2.getOverlapGroups(table).size());
696       assertEquals(ROWKEYS.length, countRows());
697     } finally {
698       deleteTable(table);
699     }
700   }
701 
702   /**
703    * This creates and fixes a bad table with regions that has startkey == endkey
704    */
705   @Test
706   public void testDegenerateRegions() throws Exception {
707     TableName table =
708         TableName.valueOf("tableDegenerateRegions");
709     try {
710       setupTable(table);
711       assertNoErrors(doFsck(conf,false));
712       assertEquals(ROWKEYS.length, countRows());
713 
714       // Now let's mess it up, by adding a region with a duplicate startkey
715       HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
716           Bytes.toBytes("B"), Bytes.toBytes("B"));
717       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
718       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
719           .waitForAssignment(hriDupe);
720       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
721       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
722 
723       HBaseFsck hbck = doFsck(conf,false);
724       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DEGENERATE_REGION,
725           ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.DUPE_STARTKEYS});
726       assertEquals(2, hbck.getOverlapGroups(table).size());
727       assertEquals(ROWKEYS.length, countRows());
728 
729       // fix the degenerate region.
730       doFsck(conf,true);
731 
732       // check that the degenerate region is gone and no data loss
733       HBaseFsck hbck2 = doFsck(conf,false);
734       assertNoErrors(hbck2);
735       assertEquals(0, hbck2.getOverlapGroups(table).size());
736       assertEquals(ROWKEYS.length, countRows());
737     } finally {
738       deleteTable(table);
739     }
740   }
741 
742   /**
743    * This creates and fixes a bad table where a region is completely contained
744    * by another region.
745    */
746   @Test
747   public void testContainedRegionOverlap() throws Exception {
748     TableName table =
749         TableName.valueOf("tableContainedRegionOverlap");
750     try {
751       setupTable(table);
752       assertEquals(ROWKEYS.length, countRows());
753 
754       // Mess it up by creating an overlap in the metadata
755       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
756           Bytes.toBytes("A2"), Bytes.toBytes("B"));
757       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
758       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
759           .waitForAssignment(hriOverlap);
760       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
761       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
762 
763       HBaseFsck hbck = doFsck(conf, false);
764       assertErrors(hbck, new ERROR_CODE[] {
765           ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
766       assertEquals(2, hbck.getOverlapGroups(table).size());
767       assertEquals(ROWKEYS.length, countRows());
768 
769       // fix the problem.
770       doFsck(conf, true);
771 
772       // verify that overlaps are fixed
773       HBaseFsck hbck2 = doFsck(conf,false);
774       assertNoErrors(hbck2);
775       assertEquals(0, hbck2.getOverlapGroups(table).size());
776       assertEquals(ROWKEYS.length, countRows());
777     } finally {
778        deleteTable(table);
779     }
780   }
781 
782   /**
783    * This creates and fixes a bad table that has an overlap group of
784    * 3 regions. HBaseFsck.maxMerge is set to 2 to trigger sidelining of an
785    * overlapped region. The meta data is messed up so that
786    * closeRegion/offlineRegion throws exceptions.
787    */
788   @Test
789   public void testSidelineOverlapRegion() throws Exception {
790     TableName table =
791         TableName.valueOf("testSidelineOverlapRegion");
792     try {
793       setupTable(table);
794       assertEquals(ROWKEYS.length, countRows());
795 
796       // Mess it up by creating an overlap: two extra regions [A, AB) and [AB, B)
797       MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
798       HMaster master = cluster.getMaster();
799       HRegionInfo hriOverlap1 = createRegion(conf, tbl.getTableDescriptor(),
800         Bytes.toBytes("A"), Bytes.toBytes("AB"));
801       master.assignRegion(hriOverlap1);
802       master.getAssignmentManager().waitForAssignment(hriOverlap1);
803       HRegionInfo hriOverlap2 = createRegion(conf, tbl.getTableDescriptor(),
804         Bytes.toBytes("AB"), Bytes.toBytes("B"));
805       master.assignRegion(hriOverlap2);
806       master.getAssignmentManager().waitForAssignment(hriOverlap2);
807 
808       HBaseFsck hbck = doFsck(conf, false);
809       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.DUPE_STARTKEYS,
810         ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.OVERLAP_IN_REGION_CHAIN});
811       assertEquals(3, hbck.getOverlapGroups(table).size());
812       assertEquals(ROWKEYS.length, countRows());
813 
814       // mess around the overlapped regions, to trigger NotServingRegionException
815       Multimap<byte[], HbckInfo> overlapGroups = hbck.getOverlapGroups(table);
816       ServerName serverName = null;
817       byte[] regionName = null;
818       for (HbckInfo hbi: overlapGroups.values()) {
819         if ("A".equals(Bytes.toString(hbi.getStartKey()))
820             && "B".equals(Bytes.toString(hbi.getEndKey()))) {
821           regionName = hbi.getRegionName();
822 
823           // get an RS not serving the region to force bad assignment info in to META.
824           int k = cluster.getServerWith(regionName);
825           for (int i = 0; i < 3; i++) { // assumes the mini cluster runs 3 region servers -- TODO confirm
826             if (i != k) {
827               HRegionServer rs = cluster.getRegionServer(i);
828               serverName = rs.getServerName();
829               break;
830             }
831           }
832 
833           HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
834           HBaseFsckRepair.closeRegionSilentlyAndWait(admin,
835             cluster.getRegionServer(k).getServerName(), hbi.getHdfsHRI());
836           admin.offline(regionName);
837           break;
838         }
839       }
840 
841       assertNotNull(regionName);
842       assertNotNull(serverName);
843       HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
844       Put put = new Put(regionName);
845       put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
846         Bytes.toBytes(serverName.getHostAndPort())); // points the region's server column at an RS that does not host it
847       meta.put(put);
848       meta.close(); // release the meta table handle; it is not needed past this point
849       // fix the problem.
850       HBaseFsck fsck = new HBaseFsck(conf);
851       fsck.connect();
852       fsck.setDisplayFullReport(); // i.e. -details
853       fsck.setTimeLag(0);
854       fsck.setFixAssignments(true);
855       fsck.setFixMeta(true);
856       fsck.setFixHdfsHoles(true);
857       fsck.setFixHdfsOverlaps(true);
858       fsck.setFixHdfsOrphans(true);
859       fsck.setFixVersionFile(true);
860       fsck.setSidelineBigOverlaps(true);
861       fsck.setMaxMerge(2); // smaller than the 3-region overlap group, so sidelining kicks in
862       fsck.onlineHbck();
863 
864       // verify that overlaps are fixed, and there are less rows
865       // since one region is sidelined.
866       HBaseFsck hbck2 = doFsck(conf,false);
867       assertNoErrors(hbck2);
868       assertEquals(0, hbck2.getOverlapGroups(table).size());
869       assertTrue(ROWKEYS.length > countRows());
870     } finally {
871        deleteTable(table);
872     }
873   }
874 
875   /**
876    * This creates and fixes a bad table where a region is completely contained
877    * by another region, and there is a hole (sort of like a bad split): [A, B) is removed via deleteRegion and [A2, B) is added.
878    */
879   @Test
880   public void testOverlapAndOrphan() throws Exception {
881     TableName table =
882         TableName.valueOf("tableOverlapAndOrphan");
883     try {
884       setupTable(table);
885       assertEquals(ROWKEYS.length, countRows());
886 
887       // Mess it up by creating an overlap in the metadata
888       TEST_UTIL.getHBaseAdmin().disableTable(table);
889       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
890           Bytes.toBytes("B"), true, true, false, true); // NOTE(review): flag meanings live in deleteRegion -- confirm there
891       TEST_UTIL.getHBaseAdmin().enableTable(table);
892 
893       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
894           Bytes.toBytes("A2"), Bytes.toBytes("B"));
895       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
896       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
897           .waitForAssignment(hriOverlap);
898       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
899       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
900 
901       HBaseFsck hbck = doFsck(conf, false); // check-only pass: expect orphan, missing-meta and hole errors
902       assertErrors(hbck, new ERROR_CODE[] {
903           ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
904           ERROR_CODE.HOLE_IN_REGION_CHAIN});
905 
906       // fix the problem.
907       doFsck(conf, true);
908 
909       // verify that overlaps are fixed and no rows were lost
910       HBaseFsck hbck2 = doFsck(conf,false);
911       assertNoErrors(hbck2);
912       assertEquals(0, hbck2.getOverlapGroups(table).size());
913       assertEquals(ROWKEYS.length, countRows());
914     } finally {
915        deleteTable(table);
916     }
917   }
918 
919   /**
920    * This creates and fixes a bad table where a region overlaps two regions --
921    * its start key is contained in another region and its end key is contained in
922    * yet another region. The extra region used here is [A2, B2).
923    */
924   @Test
925   public void testCoveredStartKey() throws Exception {
926     TableName table =
927         TableName.valueOf("tableCoveredStartKey");
928     try {
929       setupTable(table);
930       assertEquals(ROWKEYS.length, countRows());
931 
932       // Mess it up by creating and assigning an extra region that straddles a region boundary
933       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
934           Bytes.toBytes("A2"), Bytes.toBytes("B2"));
935       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
936       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
937           .waitForAssignment(hriOverlap);
938       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
939       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
940 
941       HBaseFsck hbck = doFsck(conf, false);
942       assertErrors(hbck, new ERROR_CODE[] {
943           ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
944           ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
945       assertEquals(3, hbck.getOverlapGroups(table).size());
946       assertEquals(ROWKEYS.length, countRows());
947 
948       // fix the problem.
949       doFsck(conf, true);
950 
951       // verify that overlaps are fixed and no rows were lost
952       HBaseFsck hbck2 = doFsck(conf, false);
953       assertNoErrors(hbck2); // same check the sibling tests use for a clean report
954       assertEquals(0, hbck2.getOverlapGroups(table).size());
955       assertEquals(ROWKEYS.length, countRows());
956     } finally {
957       deleteTable(table);
958     }
959   }
960 
961   /**
962    * This creates and fixes a bad table with a missing region -- hole in meta
963    * and data missing in the fs. Region [B, C) is removed, so the repaired table ends up two rows short.
964    */
965   @Test
966   public void testRegionHole() throws Exception {
967     TableName table =
968         TableName.valueOf("tableRegionHole");
969     try {
970       setupTable(table);
971       assertEquals(ROWKEYS.length, countRows());
972 
973       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
974       TEST_UTIL.getHBaseAdmin().disableTable(table);
975       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
976           Bytes.toBytes("C"), true, true, true);
977       TEST_UTIL.getHBaseAdmin().enableTable(table);
978 
979       HBaseFsck hbck = doFsck(conf, false);
980       assertErrors(hbck, new ERROR_CODE[] {
981           ERROR_CODE.HOLE_IN_REGION_CHAIN});
982       // holes are separate from overlap groups
983       assertEquals(0, hbck.getOverlapGroups(table).size());
984 
985       // fix hole
986       doFsck(conf, true);
987 
988       // check that hole fixed
989       assertNoErrors(doFsck(conf,false));
990       assertEquals(ROWKEYS.length - 2 , countRows()); // lost a region, and with it two rows
991     } finally {
992       deleteTable(table);
993     }
994   }
995 
996   /**
997    * This creates and fixes a bad table with a missing region -- hole in meta
998    * and data present but .regioninfo missing (an orphan hdfs region) in the fs.
999    */
1000   @Test
1001   public void testHDFSRegioninfoMissing() throws Exception {
1002     TableName table =
1003         TableName.valueOf("tableHDFSRegioininfoMissing");
1004     try {
1005       setupTable(table);
1006       assertEquals(ROWKEYS.length, countRows());
1007 
1008       // Mess it up by leaving a hole in the meta data
1009       TEST_UTIL.getHBaseAdmin().disableTable(table);
1010       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1011           Bytes.toBytes("C"), true, true, false, true); // NOTE(review): flag meanings live in deleteRegion -- confirm there
1012       TEST_UTIL.getHBaseAdmin().enableTable(table);
1013 
1014       HBaseFsck hbck = doFsck(conf, false);
1015       assertErrors(hbck, new ERROR_CODE[] {
1016           ERROR_CODE.ORPHAN_HDFS_REGION,
1017           ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1018           ERROR_CODE.HOLE_IN_REGION_CHAIN});
1019       // holes are separate from overlap groups
1020       assertEquals(0, hbck.getOverlapGroups(table).size());
1021 
1022       // fix hole
1023       doFsck(conf, true);
1024 
1025       // check that hole fixed and that no rows were lost
1026       assertNoErrors(doFsck(conf, false));
1027       assertEquals(ROWKEYS.length, countRows());
1028     } finally {
1029       deleteTable(table);
1030     }
1031   }
1032 
1033   /**
1034    * This creates and fixes a bad table with a region that is missing from meta and
1035    * not assigned to a region server; its files stay in the fs, so no rows are lost after the fix.
1036    */
1037   @Test
1038   public void testNotInMetaOrDeployedHole() throws Exception {
1039     TableName table =
1040         TableName.valueOf("tableNotInMetaOrDeployedHole");
1041     try {
1042       setupTable(table);
1043       assertEquals(ROWKEYS.length, countRows());
1044 
1045       // Mess it up by leaving a hole in the meta data
1046       TEST_UTIL.getHBaseAdmin().disableTable(table);
1047       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1048           Bytes.toBytes("C"), true, true, false); // don't rm from fs
1049       TEST_UTIL.getHBaseAdmin().enableTable(table);
1050 
1051       HBaseFsck hbck = doFsck(conf, false);
1052       assertErrors(hbck, new ERROR_CODE[] {
1053           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1054       // holes are separate from overlap groups
1055       assertEquals(0, hbck.getOverlapGroups(table).size());
1056 
1057       // fix hole; the fixing run itself is expected to report the same errors it repairs
1058       assertErrors(doFsck(conf, true) , new ERROR_CODE[] {
1059           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1060 
1061       // check that hole fixed
1062       assertNoErrors(doFsck(conf,false));
1063       assertEquals(ROWKEYS.length, countRows());
1064     } finally {
1065       deleteTable(table);
1066     }
1067   }
1068 
1069   /**
1070    * This creates and fixes a bad table with a hole in meta; the region's files stay in the fs.
1071    */
1072   @Test
1073   public void testNotInMetaHole() throws Exception {
1074     TableName table =
1075         TableName.valueOf("tableNotInMetaHole");
1076     try {
1077       setupTable(table);
1078       assertEquals(ROWKEYS.length, countRows());
1079 
1080       // Mess it up by leaving a hole in the meta data
1081       TEST_UTIL.getHBaseAdmin().disableTable(table);
1082       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1083           Bytes.toBytes("C"), false, true, false); // don't rm from fs
1084       TEST_UTIL.getHBaseAdmin().enableTable(table);
1085 
1086       HBaseFsck hbck = doFsck(conf, false);
1087       assertErrors(hbck, new ERROR_CODE[] {
1088           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1089       // holes are separate from overlap groups
1090       assertEquals(0, hbck.getOverlapGroups(table).size());
1091 
1092       // fix hole; the fixing run itself is expected to report the same errors it repairs
1093       assertErrors(doFsck(conf, true) , new ERROR_CODE[] {
1094           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1095 
1096       // check that hole fixed
1097       assertNoErrors(doFsck(conf,false));
1098       assertEquals(ROWKEYS.length, countRows());
1099     } finally {
1100       deleteTable(table);
1101     }
1102   }
1103 
1104   /**
1105    * This creates and fixes a bad table with a region that is in meta but has
1106    * no data in hdfs; after the fix the table is expected to be two rows short.
1107    */
1108   @Test
1109   public void testNotInHdfs() throws Exception {
1110     TableName table =
1111         TableName.valueOf("tableNotInHdfs");
1112     try {
1113       setupTable(table);
1114       assertEquals(ROWKEYS.length, countRows());
1115 
1116       // make sure data in regions, if in hlog only there is no data loss
1117       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1118 
1119       // Mess it up by leaving a hole in the hdfs data
1120       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1121           Bytes.toBytes("C"), false, false, true); // don't rm meta
1122 
1123       HBaseFsck hbck = doFsck(conf, false);
1124       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1125       // holes are separate from overlap groups
1126       assertEquals(0, hbck.getOverlapGroups(table).size());
1127 
1128       // fix hole
1129       doFsck(conf, true);
1130 
1131       // check that hole fixed; the rows of the deleted region are gone
1132       assertNoErrors(doFsck(conf,false));
1133       assertEquals(ROWKEYS.length - 2, countRows());
1134     } finally {
1135       deleteTable(table);
1136     }
1137   }
1138 
1139   /**
1140    * This creates entries in hbase:meta with no hdfs data.  Running the fix should cleanly
1141    * remove the table, which is why this test has no try/finally deleteTable cleanup.
1142    */
1143   @Test
1144   public void testNoHdfsTable() throws Exception {
1145     TableName table = TableName.valueOf("NoHdfsTable");
1146     setupTable(table);
1147     assertEquals(ROWKEYS.length, countRows());
1148 
1149     // make sure data in regions, if in hlog only there is no data loss
1150     TEST_UTIL.getHBaseAdmin().flush(table.getName());
1151 
1152     // Mess it up by deleting the hdfs dirs of all four regions
1153     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""),
1154         Bytes.toBytes("A"), false, false, true); // don't rm meta
1155     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1156         Bytes.toBytes("B"), false, false, true); // don't rm meta
1157     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1158         Bytes.toBytes("C"), false, false, true); // don't rm meta
1159     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"),
1160         Bytes.toBytes(""), false, false, true); // don't rm meta
1161 
1162     // also remove the table directory in hdfs
1163     deleteTableDir(table);
1164 
1165     HBaseFsck hbck = doFsck(conf, false);
1166     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS,
1167         ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS,
1168         ERROR_CODE.NOT_IN_HDFS,}); // one NOT_IN_HDFS per deleted region
1169     // holes are separate from overlap groups
1170     assertEquals(0, hbck.getOverlapGroups(table).size());
1171 
1172     // fix hole
1173     doFsck(conf, true); // detect dangling regions and remove those
1174 
1175     // check that hole fixed
1176     assertNoErrors(doFsck(conf,false));
1177     assertFalse("Table "+ table + " should have been deleted",
1178         TEST_UTIL.getHBaseAdmin().tableExists(table));
1179   }
1180 
1181   public void deleteTableDir(TableName table) throws IOException { // deletes the given table's directory under the HBase root dir
1182     Path rootDir = FSUtils.getRootDir(conf);
1183     FileSystem fs = rootDir.getFileSystem(conf);
1184     Path p = FSUtils.getTableDir(rootDir, table);
1185     HBaseFsck.debugLsr(conf, p); // presumably logs a listing of the directory before it is removed -- confirm
1186     boolean success = fs.delete(p, true); // second argument requests a recursive delete
1187     LOG.info("Deleted " + p + " successfully? " + success);
1188   }
1189 
1190   /**
1191    * When the hbase.version file is missing, hbck should report NO_VERSION_FILE and the fix run should restore it.
1192    */
1193   @Test
1194   public void testNoVersionFile() throws Exception {
1195     // delete the hbase.version file
1196     Path rootDir = FSUtils.getRootDir(conf);
1197     FileSystem fs = rootDir.getFileSystem(conf);
1198     Path versionFile = new Path(rootDir, HConstants.VERSION_FILE_NAME);
1199     fs.delete(versionFile, true);
1200 
1201     // test
1202     HBaseFsck hbck = doFsck(conf, false);
1203     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_VERSION_FILE });
1204     // fix hbase.version missing
1205     doFsck(conf, true);
1206 
1207     // no version file fixed
1208     assertNoErrors(doFsck(conf, false));
1209   }
1210 
1211   /**
1212    * A region of a disabled table is opened directly on a region server; hbck must report SHOULD_NOT_BE_DEPLOYED and fix it.
1213    */
1214   @Test
1215   public void testRegionShouldNotBeDeployed() throws Exception {
1216     TableName table =
1217         TableName.valueOf("tableRegionShouldNotBeDeployed");
1218     try {
1219       LOG.info("Starting testRegionShouldNotBeDeployed.");
1220       MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
1221       assertTrue(cluster.waitForActiveAndReadyMaster());
1222 
1223 
1224       byte[][] SPLIT_KEYS = new byte[][] { new byte[0], Bytes.toBytes("aaa"),
1225           Bytes.toBytes("bbb"), Bytes.toBytes("ccc"), Bytes.toBytes("ddd") };
1226       HTableDescriptor htdDisabled = new HTableDescriptor(table);
1227       htdDisabled.addFamily(new HColumnDescriptor(FAM));
1228 
1229       // Write the .tableinfo
1230       FSTableDescriptors fstd = new FSTableDescriptors(conf);
1231       fstd.createTableDescriptor(htdDisabled);
1232       List<HRegionInfo> disabledRegions = TEST_UTIL.createMultiRegionsInMeta(
1233           TEST_UTIL.getConfiguration(), htdDisabled, SPLIT_KEYS);
1234 
1235       // Let's just assign everything to first RS
1236       HRegionServer hrs = cluster.getRegionServer(0);
1237 
1238       // Create region files.
1239       TEST_UTIL.getHBaseAdmin().disableTable(table);
1240       TEST_UTIL.getHBaseAdmin().enableTable(table);
1241 
1242       // Disable the table and close its regions
1243       TEST_UTIL.getHBaseAdmin().disableTable(table);
1244       HRegionInfo region = disabledRegions.remove(0);
1245       byte[] regionName = region.getRegionName();
1246 
1247       // The region should not be assigned currently
1248       assertEquals(-1, cluster.getServerWith(regionName)); // on failure, reports which server index holds it
1249 
1250       // Directly open a region on a region server.
1251       // If going through AM/ZK, the region won't be open.
1252       // Even it is opened, AM will close it which causes
1253       // flakiness of this test.
1254       HRegion r = HRegion.openHRegion(
1255         region, htdDisabled, hrs.getWAL(region), conf);
1256       hrs.addToOnlineRegions(r);
1257 
1258       HBaseFsck hbck = doFsck(conf, false);
1259       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.SHOULD_NOT_BE_DEPLOYED });
1260 
1261       // fix this fault
1262       doFsck(conf, true);
1263 
1264       // check result
1265       assertNoErrors(doFsck(conf, false));
1266     } finally {
1267       TEST_UTIL.getHBaseAdmin().enableTable(table); // table is left disabled above; re-enable before cleanup
1268       deleteTable(table);
1269     }
1270   }
1271 
1272   /**
1273    * This creates two tables, messes up both of them, and then fixes them one at a time using the table-scoped doFsck.
1274    */
1275   @Test
1276   public void testFixByTable() throws Exception {
1277     TableName table1 =
1278         TableName.valueOf("testFixByTable1");
1279     TableName table2 =
1280         TableName.valueOf("testFixByTable2");
1281     try {
1282       setupTable(table1);
1283       // make sure data in regions, if in hlog only there is no data loss
1284       TEST_UTIL.getHBaseAdmin().flush(table1.getName());
1285       // Mess them up by leaving a hole in the hdfs data
1286       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1287         Bytes.toBytes("C"), false, false, true); // don't rm meta
1288 
1289       setupTable(table2);
1290       // make sure data in regions, if in hlog only there is no data loss
1291       TEST_UTIL.getHBaseAdmin().flush(table2.getName());
1292       // Mess them up by leaving a hole in the hdfs data
1293       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1294         Bytes.toBytes("C"), false, false, true); // don't rm meta
1295 
1296       HBaseFsck hbck = doFsck(conf, false);
1297       assertErrors(hbck, new ERROR_CODE[] {
1298         ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS}); // one error per damaged table
1299 
1300       // fix hole in table 1
1301       doFsck(conf, true, table1);
1302       // check that hole in table 1 fixed
1303       assertNoErrors(doFsck(conf, false, table1));
1304       // check that hole in table 2 still there
1305       assertErrors(doFsck(conf, false, table2),
1306         new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1307 
1308       // fix hole in table 2
1309       doFsck(conf, true, table2);
1310       // check that hole in both tables fixed
1311       assertNoErrors(doFsck(conf, false));
1312       assertEquals(ROWKEYS.length - 2, countRows()); // presumably counts the table set up last (table2) -- confirm
1313     } finally {
1314       deleteTable(table1);
1315       deleteTable(table2);
1316     }
1317   }
1318   /**
1319    * A split parent in meta, in hdfs, and not deployed: regular repair must leave it alone, while setFixSplitParents(true) clears it.
1320    */
1321   @Test
1322   public void testLingeringSplitParent() throws Exception {
1323     TableName table =
1324         TableName.valueOf("testLingeringSplitParent");
1325     HTable meta = null;
1326     try {
1327       setupTable(table);
1328       assertEquals(ROWKEYS.length, countRows());
1329 
1330       // make sure data in regions, if in hlog only there is no data loss
1331       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1332       HRegionLocation location = tbl.getRegionLocation("B");
1333 
1334       // Delete one region from meta, but not hdfs, unassign it.
1335       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1336         Bytes.toBytes("C"), true, true, false);
1337 
1338       // Create a new meta entry to fake it as a split parent.
1339       meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
1340       HRegionInfo hri = location.getRegionInfo();
1341 
1342       HRegionInfo a = new HRegionInfo(tbl.getName(),
1343         Bytes.toBytes("B"), Bytes.toBytes("BM"));
1344       HRegionInfo b = new HRegionInfo(tbl.getName(),
1345         Bytes.toBytes("BM"), Bytes.toBytes("C"));
1346 
1347       hri.setOffline(true);
1348       hri.setSplit(true);
1349 
1350       MetaEditor.addRegionToMeta(meta, hri, a, b); // parent row written with daughters a and b
1351       meta.flushCommits();
1352       TEST_UTIL.getHBaseAdmin().flush(TableName.META_TABLE_NAME.getName());
1353 
1354       HBaseFsck hbck = doFsck(conf, false);
1355       assertErrors(hbck, new ERROR_CODE[] {
1356         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1357 
1358       // regular repair cannot fix lingering split parent
1359       hbck = doFsck(conf, true);
1360       assertErrors(hbck, new ERROR_CODE[] {
1361         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1362       assertFalse(hbck.shouldRerun());
1363       hbck = doFsck(conf, false);
1364       assertErrors(hbck, new ERROR_CODE[] {
1365         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1366 
1367       // fix lingering split parent
1368       hbck = new HBaseFsck(conf);
1369       hbck.connect();
1370       hbck.setDisplayFullReport(); // i.e. -details
1371       hbck.setTimeLag(0);
1372       hbck.setFixSplitParents(true);
1373       hbck.onlineHbck();
1374       assertTrue(hbck.shouldRerun());
1375 
1376       Get get = new Get(hri.getRegionName());
1377       Result result = meta.get(get);
1378       assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
1379         HConstants.SPLITA_QUALIFIER).isEmpty()); // daughter A column must be gone from the parent row
1380       assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
1381         HConstants.SPLITB_QUALIFIER).isEmpty()); // daughter B column must be gone from the parent row
1382       TEST_UTIL.getHBaseAdmin().flush(TableName.META_TABLE_NAME.getName());
1383 
1384       // fix other issues
1385       doFsck(conf, true);
1386 
1387       // check that all are fixed
1388       assertNoErrors(doFsck(conf, false));
1389       assertEquals(ROWKEYS.length, countRows());
1390     } finally {
1391       deleteTable(table);
1392       IOUtils.closeQuietly(meta);
1393     }
1394   }
1395 
1396   /**
1397    * Tests that LINGERING_SPLIT_PARENT is not erroneously reported for
1398    * valid cases where the daughters are there. Performs a real split of the region holding "B" at "BM" and then runs hbck.
1399    */
1400   @Test
1401   public void testValidLingeringSplitParent() throws Exception {
1402     TableName table =
1403         TableName.valueOf("testValidLingeringSplitParent");
1404     HTable meta = null;
1405     try {
1406       setupTable(table);
1407       assertEquals(ROWKEYS.length, countRows());
1408 
1409       // make sure data in regions, if in hlog only there is no data loss
1410       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1411       HRegionLocation location = tbl.getRegionLocation("B");
1412 
1413       meta = new HTable(conf, TableName.META_TABLE_NAME);
1414       HRegionInfo hri = location.getRegionInfo();
1415 
1416       // do a regular split
1417       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
1418       byte[] regionName = location.getRegionInfo().getRegionName();
1419       admin.split(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
1420       TestEndToEndSplitTransaction.blockUntilRegionSplit(
1421           TEST_UTIL.getConfiguration(), 60000, regionName, true);
1422 
1423       // TODO: fixHdfsHoles does not work against splits, since the parent dir lingers on
1424       // for some time until children references are deleted. HBCK erroneously sees this as
1425       // overlapping regions
1426       HBaseFsck hbck = doFsck(conf, true, true, false, false, false, true, true, true, false, false, null);
1427       assertErrors(hbck, new ERROR_CODE[] {}); //no LINGERING_SPLIT_PARENT reported
1428 
1429       // assert that the split hbase:meta entry is still there.
1430       Get get = new Get(hri.getRegionName());
1431       Result result = meta.get(get);
1432       assertNotNull(result);
1433       assertNotNull(HRegionInfo.getHRegionInfo(result));
1434 
1435       assertEquals(ROWKEYS.length, countRows());
1436 
1437       // assert that we still have the split regions
1438       assertEquals(SPLITS.length + 1 + 1, tbl.getStartKeys().length); //SPLITS + 1 is # regions pre-split.
1439       assertNoErrors(doFsck(conf, false));
1440     } finally {
1441       deleteTable(table);
1442       IOUtils.closeQuietly(meta);
1443     }
1444   }
1445 
1446   /**
1447    * Split crashed after write to hbase:meta finished for the parent region, but
1448    * failed to write daughters (pre HBASE-7721 codebase). Simulated by splitting and then deleting both daughter rows from meta.
1449    */
1450   @Test(timeout=75000)
1451   public void testSplitDaughtersNotInMeta() throws Exception {
1452     TableName table =
1453         TableName.valueOf("testSplitdaughtersNotInMeta");
1454     HTable meta = null;
1455     try {
1456       setupTable(table);
1457       assertEquals(ROWKEYS.length, countRows());
1458 
1459       // make sure data in regions, if in hlog only there is no data loss
1460       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1461       HRegionLocation location = tbl.getRegionLocation("B");
1462 
1463       meta = new HTable(conf, TableName.META_TABLE_NAME);
1464       HRegionInfo hri = location.getRegionInfo();
1465 
1466       // do a regular split
1467       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
1468       byte[] regionName = location.getRegionInfo().getRegionName();
1469       admin.split(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
1470       TestEndToEndSplitTransaction.blockUntilRegionSplit(
1471           TEST_UTIL.getConfiguration(), 60000, regionName, true);
1472 
1473       PairOfSameType<HRegionInfo> daughters = HRegionInfo.getDaughterRegions(meta.get(new Get(regionName)));
1474 
1475       // Delete daughter regions from meta, but not hdfs, unassign it.
1476       Map<HRegionInfo, ServerName> hris = tbl.getRegionLocations();
1477       undeployRegion(admin, hris.get(daughters.getFirst()), daughters.getFirst());
1478       undeployRegion(admin, hris.get(daughters.getSecond()), daughters.getSecond());
1479 
1480       meta.delete(new Delete(daughters.getFirst().getRegionName()));
1481       meta.delete(new Delete(daughters.getSecond().getRegionName()));
1482       meta.flushCommits();
1483 
1484       HBaseFsck hbck = doFsck(conf, false);
1485       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1486           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN}); //no LINGERING_SPLIT_PARENT
1487 
1488       // now fix it. The fix should not revert the region split, but add daughters to META
1489       hbck = doFsck(conf, true, true, false, false, false, false, false, false, false, false, null);
1490       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1491           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1492 
1493       // assert that the split hbase:meta entry is still there.
1494       Get get = new Get(hri.getRegionName());
1495       Result result = meta.get(get);
1496       assertNotNull(result);
1497       assertNotNull(HRegionInfo.getHRegionInfo(result));
1498 
1499       assertEquals(ROWKEYS.length, countRows());
1500 
1501       // assert that we still have the split regions
1502       assertEquals(SPLITS.length + 1 + 1, tbl.getStartKeys().length); //SPLITS + 1 is # regions pre-split.
1503       assertNoErrors(doFsck(conf, false)); //should be fixed by now
1504     } finally {
1505       deleteTable(table);
1506       IOUtils.closeQuietly(meta);
1507     }
1508   }
1509 
1510   /**
1511    * This creates and fixes a bad table with a missing region which is the 1st region -- hole in
1512    * meta and data missing in the fs. hbck is expected to report FIRST_REGION_STARTKEY_NOT_EMPTY.
1513    */
1514   @Test(timeout=120000)
1515   public void testMissingFirstRegion() throws Exception {
1516     TableName table =
1517         TableName.valueOf("testMissingFirstRegion");
1518     try {
1519       setupTable(table);
1520       assertEquals(ROWKEYS.length, countRows());
1521 
1522       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1523       TEST_UTIL.getHBaseAdmin().disableTable(table);
1524       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""), Bytes.toBytes("A"), true,
1525           true, true);
1526       TEST_UTIL.getHBaseAdmin().enableTable(table);
1527 
1528       HBaseFsck hbck = doFsck(conf, false);
1529       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY });
1530       // fix hole
1531       doFsck(conf, true);
1532       // check that hole fixed
1533       assertNoErrors(doFsck(conf, false));
1534     } finally {
1535       deleteTable(table);
1536     }
1537   }
1538 
1539   /**
1540    * This creates and fixes a bad table whose first region has lost its directory in the fs while the
1541    * table stays enabled and the region's meta entry is kept. hbck is expected to report NOT_IN_HDFS.
1542    */
1543   @Test(timeout=120000)
1544   public void testRegionDeployedNotInHdfs() throws Exception {
1545     TableName table =
1546         TableName.valueOf("testSingleRegionDeployedNotInHdfs");
1547     try {
1548       setupTable(table);
1549       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1550 
1551       // Mess it up by deleting region dir
1552       deleteRegion(conf, tbl.getTableDescriptor(),
1553         HConstants.EMPTY_START_ROW, Bytes.toBytes("A"), false,
1554         false, true); // same flags the sibling tests annotate as "don't rm meta"
1555 
1556       HBaseFsck hbck = doFsck(conf, false);
1557       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_HDFS });
1558       // fix hole
1559       doFsck(conf, true);
1560       // check that hole fixed
1561       assertNoErrors(doFsck(conf, false));
1562     } finally {
1563       deleteTable(table);
1564     }
1565   }
1566 
1567   /**
1568    * This creates and fixes a bad table with missing last region -- hole in meta and data missing in
1569    * the fs.
1570    */
1571   @Test(timeout=120000)
1572   public void testMissingLastRegion() throws Exception {
1573     TableName table =
1574         TableName.valueOf("testMissingLastRegion");
1575     try {
1576       setupTable(table);
1577       assertEquals(ROWKEYS.length, countRows());
1578 
1579       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1580       TEST_UTIL.getHBaseAdmin().disableTable(table);
1581       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"), Bytes.toBytes(""), true,
1582           true, true);
1583       TEST_UTIL.getHBaseAdmin().enableTable(table);
1584 
1585       HBaseFsck hbck = doFsck(conf, false);
1586       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY });
1587       // fix hole
1588       doFsck(conf, true);
1589       // check that hole fixed
1590       assertNoErrors(doFsck(conf, false));
1591     } finally {
1592       deleteTable(table);
1593     }
1594   }
1595 
1596   /**
1597    * Test -noHdfsChecking option can detect and fix assignments issue.
1598    */
1599   @Test
1600   public void testFixAssignmentsAndNoHdfsChecking() throws Exception {
1601     TableName table =
1602         TableName.valueOf("testFixAssignmentsAndNoHdfsChecking");
1603     try {
1604       setupTable(table);
1605       assertEquals(ROWKEYS.length, countRows());
1606 
1607       // Mess it up by closing a region
1608       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1609         Bytes.toBytes("B"), true, false, false, false);
1610 
1611       // verify there is no other errors
1612       HBaseFsck hbck = doFsck(conf, false);
1613       assertErrors(hbck, new ERROR_CODE[] {
1614         ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1615 
1616       // verify that noHdfsChecking report the same errors
1617       HBaseFsck fsck = new HBaseFsck(conf);
1618       fsck.connect();
1619       fsck.setDisplayFullReport(); // i.e. -details
1620       fsck.setTimeLag(0);
1621       fsck.setCheckHdfs(false);
1622       fsck.onlineHbck();
1623       assertErrors(fsck, new ERROR_CODE[] {
1624         ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1625 
1626       // verify that fixAssignments works fine with noHdfsChecking
1627       fsck = new HBaseFsck(conf);
1628       fsck.connect();
1629       fsck.setDisplayFullReport(); // i.e. -details
1630       fsck.setTimeLag(0);
1631       fsck.setCheckHdfs(false);
1632       fsck.setFixAssignments(true);
1633       fsck.onlineHbck();
1634       assertTrue(fsck.shouldRerun());
1635       fsck.onlineHbck();
1636       assertNoErrors(fsck);
1637 
1638       assertEquals(ROWKEYS.length, countRows());
1639     } finally {
1640       deleteTable(table);
1641     }
1642   }
1643 
1644   /**
1645    * Test -noHdfsChecking option can detect region is not in meta but deployed.
1646    * However, it can not fix it without checking Hdfs because we need to get
1647    * the region info from Hdfs in this case, then to patch the meta.
1648    */
1649   @Test
1650   public void testFixMetaNotWorkingWithNoHdfsChecking() throws Exception {
1651     TableName table =
1652         TableName.valueOf("testFixMetaNotWorkingWithNoHdfsChecking");
1653     try {
1654       setupTable(table);
1655       assertEquals(ROWKEYS.length, countRows());
1656 
1657       // Mess it up by deleting a region from the metadata
1658       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1659         Bytes.toBytes("B"), false, true, false, false);
1660 
1661       // verify there is no other errors
1662       HBaseFsck hbck = doFsck(conf, false);
1663       assertErrors(hbck, new ERROR_CODE[] {
1664         ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1665 
1666       // verify that noHdfsChecking report the same errors
1667       HBaseFsck fsck = new HBaseFsck(conf);
1668       fsck.connect();
1669       fsck.setDisplayFullReport(); // i.e. -details
1670       fsck.setTimeLag(0);
1671       fsck.setCheckHdfs(false);
1672       fsck.onlineHbck();
1673       assertErrors(fsck, new ERROR_CODE[] {
1674         ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1675 
1676       // verify that fixMeta doesn't work with noHdfsChecking
1677       fsck = new HBaseFsck(conf);
1678       fsck.connect();
1679       fsck.setDisplayFullReport(); // i.e. -details
1680       fsck.setTimeLag(0);
1681       fsck.setCheckHdfs(false);
1682       fsck.setFixAssignments(true);
1683       fsck.setFixMeta(true);
1684       fsck.onlineHbck();
1685       assertFalse(fsck.shouldRerun());
1686       assertErrors(fsck, new ERROR_CODE[] {
1687         ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1688 
1689       // fix the cluster so other tests won't be impacted
1690       fsck = doFsck(conf, true);
1691       assertTrue(fsck.shouldRerun());
1692       fsck = doFsck(conf, true);
1693       assertNoErrors(fsck);
1694     } finally {
1695       deleteTable(table);
1696     }
1697   }
1698 
1699   /**
1700    * Test -fixHdfsHoles doesn't work with -noHdfsChecking option,
1701    * and -noHdfsChecking can't detect orphan Hdfs region.
1702    */
1703   @Test
1704   public void testFixHdfsHolesNotWorkingWithNoHdfsChecking() throws Exception {
1705     TableName table =
1706         TableName.valueOf("testFixHdfsHolesNotWorkingWithNoHdfsChecking");
1707     try {
1708       setupTable(table);
1709       assertEquals(ROWKEYS.length, countRows());
1710 
1711       // Mess it up by creating an overlap in the metadata
1712       TEST_UTIL.getHBaseAdmin().disableTable(table);
1713       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1714         Bytes.toBytes("B"), true, true, false, true);
1715       TEST_UTIL.getHBaseAdmin().enableTable(table);
1716 
1717       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
1718         Bytes.toBytes("A2"), Bytes.toBytes("B"));
1719       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
1720       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
1721         .waitForAssignment(hriOverlap);
1722       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
1723       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
1724 
1725       HBaseFsck hbck = doFsck(conf, false);
1726       assertErrors(hbck, new ERROR_CODE[] {
1727         ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1728         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1729 
1730       // verify that noHdfsChecking can't detect ORPHAN_HDFS_REGION
1731       HBaseFsck fsck = new HBaseFsck(conf);
1732       fsck.connect();
1733       fsck.setDisplayFullReport(); // i.e. -details
1734       fsck.setTimeLag(0);
1735       fsck.setCheckHdfs(false);
1736       fsck.onlineHbck();
1737       assertErrors(fsck, new ERROR_CODE[] {
1738         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1739 
1740       // verify that fixHdfsHoles doesn't work with noHdfsChecking
1741       fsck = new HBaseFsck(conf);
1742       fsck.connect();
1743       fsck.setDisplayFullReport(); // i.e. -details
1744       fsck.setTimeLag(0);
1745       fsck.setCheckHdfs(false);
1746       fsck.setFixHdfsHoles(true);
1747       fsck.setFixHdfsOverlaps(true);
1748       fsck.setFixHdfsOrphans(true);
1749       fsck.onlineHbck();
1750       assertFalse(fsck.shouldRerun());
1751       assertErrors(fsck, new ERROR_CODE[] {
1752         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1753     } finally {
1754       if (TEST_UTIL.getHBaseAdmin().isTableDisabled(table)) {
1755         TEST_UTIL.getHBaseAdmin().enableTable(table);
1756       }
1757       deleteTable(table);
1758     }
1759   }
1760 
1761   /**
1762    * We don't have an easy way to verify that a flush completed, so we loop until we find a
1763    * legitimate hfile and return it.
1764    * @param fs
1765    * @param table
1766    * @return Path of a flushed hfile.
1767    * @throws IOException
1768    */
1769   Path getFlushedHFile(FileSystem fs, TableName table) throws IOException {
1770     Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
1771     Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
1772     Path famDir = new Path(regionDir, FAM_STR);
1773 
1774     // keep doing this until we get a legit hfile
1775     while (true) {
1776       FileStatus[] hfFss = fs.listStatus(famDir);
1777       if (hfFss.length == 0) {
1778         continue;
1779       }
1780       for (FileStatus hfs : hfFss) {
1781         if (!hfs.isDir()) {
1782           return hfs.getPath();
1783         }
1784       }
1785     }
1786   }
1787 
1788   /**
1789    * This creates a table and then corrupts an hfile.  Hbck should quarantine the file.
1790    */
1791   @Test(timeout=180000)
1792   public void testQuarantineCorruptHFile() throws Exception {
1793     TableName table = TableName.valueOf(name.getMethodName());
1794     try {
1795       setupTable(table);
1796       assertEquals(ROWKEYS.length, countRows());
1797       TEST_UTIL.getHBaseAdmin().flush(table.getName()); // flush is async.
1798 
1799       FileSystem fs = FileSystem.get(conf);
1800       Path hfile = getFlushedHFile(fs, table);
1801 
1802       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1803       TEST_UTIL.getHBaseAdmin().disableTable(table);
1804 
1805       // create new corrupt file called deadbeef (valid hfile name)
1806       Path corrupt = new Path(hfile.getParent(), "deadbeef");
1807       TestHFile.truncateFile(fs, hfile, corrupt);
1808       LOG.info("Created corrupted file " + corrupt);
1809       HBaseFsck.debugLsr(conf, FSUtils.getRootDir(conf));
1810 
1811       // we cannot enable here because enable never finished due to the corrupt region.
1812       HBaseFsck res = HbckTestingUtil.doHFileQuarantine(conf, table);
1813       assertEquals(res.getRetCode(), 0);
1814       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
1815       assertEquals(hfcc.getHFilesChecked(), 5);
1816       assertEquals(hfcc.getCorrupted().size(), 1);
1817       assertEquals(hfcc.getFailures().size(), 0);
1818       assertEquals(hfcc.getQuarantined().size(), 1);
1819       assertEquals(hfcc.getMissing().size(), 0);
1820 
1821       // Its been fixed, verify that we can enable.
1822       TEST_UTIL.getHBaseAdmin().enableTable(table);
1823     } finally {
1824       deleteTable(table);
1825     }
1826   }
1827 
1828   /**
1829   * Test that use this should have a timeout, because this method could potentially wait forever.
1830   */
1831   private void doQuarantineTest(TableName table, HBaseFsck hbck, int check,
1832                                 int corrupt, int fail, int quar, int missing) throws Exception {
1833     try {
1834       setupTable(table);
1835       assertEquals(ROWKEYS.length, countRows());
1836       TEST_UTIL.getHBaseAdmin().flush(table.getName()); // flush is async.
1837 
1838       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1839       TEST_UTIL.getHBaseAdmin().disableTable(table);
1840 
1841       String[] args = {"-sidelineCorruptHFiles", "-repairHoles", "-ignorePreCheckPermission",
1842           table.getNameAsString()};
1843       ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1844       HBaseFsck res = hbck.exec(exec, args);
1845 
1846       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
1847       assertEquals(hfcc.getHFilesChecked(), check);
1848       assertEquals(hfcc.getCorrupted().size(), corrupt);
1849       assertEquals(hfcc.getFailures().size(), fail);
1850       assertEquals(hfcc.getQuarantined().size(), quar);
1851       assertEquals(hfcc.getMissing().size(), missing);
1852 
1853       // its been fixed, verify that we can enable
1854       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
1855       admin.enableTableAsync(table);
1856       while (!admin.isTableEnabled(table)) {
1857         try {
1858           Thread.sleep(250);
1859         } catch (InterruptedException e) {
1860           e.printStackTrace();
1861           fail("Interrupted when trying to enable table " + table);
1862         }
1863       }
1864     } finally {
1865       deleteTable(table);
1866     }
1867   }
1868 
1869   /**
1870    * This creates a table and simulates the race situation where a concurrent compaction or split
1871    * has removed an hfile after the corruption checker learned about it.
1872    */
1873   @Test(timeout=180000)
1874   public void testQuarantineMissingHFile() throws Exception {
1875     TableName table = TableName.valueOf(name.getMethodName());
1876     ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1877     // inject a fault in the hfcc created.
1878     final FileSystem fs = FileSystem.get(conf);
1879     HBaseFsck hbck = new HBaseFsck(conf, exec) {
1880       @Override
1881       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
1882         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1883           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
1884           @Override
1885           protected void checkHFile(Path p) throws IOException {
1886             if (attemptedFirstHFile.compareAndSet(false, true)) {
1887               assertTrue(fs.delete(p, true)); // make sure delete happened.
1888             }
1889             super.checkHFile(p);
1890           }
1891         };
1892       }
1893     };
1894     doQuarantineTest(table, hbck, 4, 0, 0, 0, 1); // 4 attempted, but 1 missing.
1895   }
1896 
1897   /**
1898    * This creates a table and simulates the race situation where a concurrent compaction or split
1899    * has removed an colfam dir before the corruption checker got to it.
1900    */
1901   // Disabled because fails sporadically.  Is this test right?  Timing-wise, there could be no
1902   // files in a column family on initial creation -- as suggested by Matteo.
1903   @Ignore @Test(timeout=180000)
1904   public void testQuarantineMissingFamdir() throws Exception {
1905     TableName table = TableName.valueOf(name.getMethodName());
1906     ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1907     // inject a fault in the hfcc created.
1908     final FileSystem fs = FileSystem.get(conf);
1909     HBaseFsck hbck = new HBaseFsck(conf, exec) {
1910       @Override
1911       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
1912         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1913           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
1914           @Override
1915           protected void checkColFamDir(Path p) throws IOException {
1916             if (attemptedFirstHFile.compareAndSet(false, true)) {
1917               assertTrue(fs.delete(p, true)); // make sure delete happened.
1918             }
1919             super.checkColFamDir(p);
1920           }
1921         };
1922       }
1923     };
1924     doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
1925   }
1926 
1927   /**
1928    * This creates a table and simulates the race situation where a concurrent compaction or split
1929    * has removed a region dir before the corruption checker got to it.
1930    */
1931   @Test(timeout=180000)
1932   public void testQuarantineMissingRegionDir() throws Exception {
1933     TableName table = TableName.valueOf(name.getMethodName());
1934     ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1935     // inject a fault in the hfcc created.
1936     final FileSystem fs = FileSystem.get(conf);
1937     HBaseFsck hbck = new HBaseFsck(conf, exec) {
1938       @Override
1939       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
1940         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1941           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
1942           @Override
1943           protected void checkRegionDir(Path p) throws IOException {
1944             if (attemptedFirstHFile.compareAndSet(false, true)) {
1945               assertTrue(fs.delete(p, true)); // make sure delete happened.
1946             }
1947             super.checkRegionDir(p);
1948           }
1949         };
1950       }
1951     };
1952     doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
1953   }
1954 
1955   /**
1956    * Test fixing lingering reference file.
1957    */
1958   @Test
1959   public void testLingeringReferenceFile() throws Exception {
1960     TableName table =
1961         TableName.valueOf("testLingeringReferenceFile");
1962     try {
1963       setupTable(table);
1964       assertEquals(ROWKEYS.length, countRows());
1965 
1966       // Mess it up by creating a fake reference file
1967       FileSystem fs = FileSystem.get(conf);
1968       Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
1969       Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
1970       Path famDir = new Path(regionDir, FAM_STR);
1971       Path fakeReferenceFile = new Path(famDir, "fbce357483ceea.12144538");
1972       fs.create(fakeReferenceFile);
1973 
1974       HBaseFsck hbck = doFsck(conf, false);
1975       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LINGERING_REFERENCE_HFILE });
1976       // fix reference file
1977       doFsck(conf, true);
1978       // check that reference file fixed
1979       assertNoErrors(doFsck(conf, false));
1980     } finally {
1981       deleteTable(table);
1982     }
1983   }
1984 
1985   /**
1986    * Test mission REGIONINFO_QUALIFIER in hbase:meta
1987    */
1988   @Test
1989   public void testMissingRegionInfoQualifier() throws Exception {
1990     TableName table =
1991         TableName.valueOf("testMissingRegionInfoQualifier");
1992     try {
1993       setupTable(table);
1994 
1995       // Mess it up by removing the RegionInfo for one region.
1996       final List<Delete> deletes = new LinkedList<Delete>();
1997       HTable meta = new HTable(conf, TableName.META_TABLE_NAME);
1998       MetaScanner.metaScan(conf, new MetaScanner.MetaScannerVisitor() {
1999 
2000         @Override
2001         public boolean processRow(Result rowResult) throws IOException {
2002           HRegionInfo hri = MetaScanner.getHRegionInfo(rowResult);
2003           if (hri != null && !hri.getTable().isSystemTable()) {
2004             Delete delete = new Delete(rowResult.getRow());
2005             delete.deleteColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
2006             deletes.add(delete);
2007           }
2008           return true;
2009         }
2010 
2011         @Override
2012         public void close() throws IOException {
2013         }
2014       });
2015       meta.delete(deletes);
2016 
2017       // Mess it up by creating a fake hbase:meta entry with no associated RegionInfo
2018       meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
2019         HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER, Bytes.toBytes("node1:60020")));
2020       meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
2021         HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER, Bytes.toBytes(1362150791183L)));
2022       meta.close();
2023 
2024       HBaseFsck hbck = doFsck(conf, false);
2025       assertTrue(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
2026 
2027       // fix reference file
2028       hbck = doFsck(conf, true);
2029 
2030       // check that reference file fixed
2031       assertFalse(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
2032     } finally {
2033       deleteTable(table);
2034     }
2035   }
2036 
2037 
2038   /**
2039    * Test pluggable error reporter. It can be plugged in
2040    * from system property or configuration.
2041    */
2042   @Test
2043   public void testErrorReporter() throws Exception {
2044     try {
2045       MockErrorReporter.calledCount = 0;
2046       doFsck(conf, false);
2047       assertEquals(MockErrorReporter.calledCount, 0);
2048 
2049       conf.set("hbasefsck.errorreporter", MockErrorReporter.class.getName());
2050       doFsck(conf, false);
2051       assertTrue(MockErrorReporter.calledCount > 20);
2052     } finally {
2053       conf.set("hbasefsck.errorreporter",
2054         PrintingErrorReporter.class.getName());
2055       MockErrorReporter.calledCount = 0;
2056     }
2057   }
2058 
2059   static class MockErrorReporter implements ErrorReporter {
2060     static int calledCount = 0;
2061 
2062     @Override
2063     public void clear() {
2064       calledCount++;
2065     }
2066 
2067     @Override
2068     public void report(String message) {
2069       calledCount++;
2070     }
2071 
2072     @Override
2073     public void reportError(String message) {
2074       calledCount++;
2075     }
2076 
2077     @Override
2078     public void reportError(ERROR_CODE errorCode, String message) {
2079       calledCount++;
2080     }
2081 
2082     @Override
2083     public void reportError(ERROR_CODE errorCode, String message, TableInfo table) {
2084       calledCount++;
2085     }
2086 
2087     @Override
2088     public void reportError(ERROR_CODE errorCode,
2089         String message, TableInfo table, HbckInfo info) {
2090       calledCount++;
2091     }
2092 
2093     @Override
2094     public void reportError(ERROR_CODE errorCode, String message,
2095         TableInfo table, HbckInfo info1, HbckInfo info2) {
2096       calledCount++;
2097     }
2098 
2099     @Override
2100     public int summarize() {
2101       return ++calledCount;
2102     }
2103 
2104     @Override
2105     public void detail(String details) {
2106       calledCount++;
2107     }
2108 
2109     @Override
2110     public ArrayList<ERROR_CODE> getErrorList() {
2111       calledCount++;
2112       return new ArrayList<ERROR_CODE>();
2113     }
2114 
2115     @Override
2116     public void progress() {
2117       calledCount++;
2118     }
2119 
2120     @Override
2121     public void print(String message) {
2122       calledCount++;
2123     }
2124 
2125     @Override
2126     public void resetErrors() {
2127       calledCount++;
2128     }
2129 
2130     @Override
2131     public boolean tableHasErrors(TableInfo table) {
2132       calledCount++;
2133       return false;
2134     }
2135   }
2136 
2137   @Test(timeout=60000)
2138   public void testCheckTableLocks() throws Exception {
2139     IncrementingEnvironmentEdge edge = new IncrementingEnvironmentEdge(0);
2140     EnvironmentEdgeManager.injectEdge(edge);
2141     // check no errors
2142     HBaseFsck hbck = doFsck(conf, false);
2143     assertNoErrors(hbck);
2144 
2145     ServerName mockName = ServerName.valueOf("localhost", 60000, 1);
2146 
2147     // obtain one lock
2148     final TableLockManager tableLockManager = TableLockManager.createTableLockManager(conf, TEST_UTIL.getZooKeeperWatcher(), mockName);
2149     TableLock writeLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2150         "testCheckTableLocks");
2151     writeLock.acquire();
2152     hbck = doFsck(conf, false);
2153     assertNoErrors(hbck); // should not have expired, no problems
2154 
2155     edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2156         TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2157 
2158     hbck = doFsck(conf, false);
2159     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK});
2160 
2161     final CountDownLatch latch = new CountDownLatch(1);
2162     new Thread() {
2163       @Override
2164       public void run() {
2165         TableLock readLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2166             "testCheckTableLocks");
2167         try {
2168           latch.countDown();
2169           readLock.acquire();
2170         } catch (IOException ex) {
2171           fail();
2172         } catch (IllegalStateException ex) {
2173           return; // expected, since this will be reaped under us.
2174         }
2175         fail("should not have come here");
2176       };
2177     }.start();
2178 
2179     latch.await(); // wait until thread starts
2180     Threads.sleep(300); // wait some more to ensure writeLock.acquire() is called
2181 
2182     hbck = doFsck(conf, false);
2183     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK}); // still one expired, one not-expired
2184 
2185     edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2186         TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2187 
2188     hbck = doFsck(conf, false);
2189     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK, ERROR_CODE.EXPIRED_TABLE_LOCK}); // both are expired
2190 
2191     conf.setLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT, 1); // reaping from ZKInterProcessWriteLock uses znode cTime,
2192                                                                  // which is not injectable through EnvironmentEdge
2193     Threads.sleep(10);
2194     hbck = doFsck(conf, true); // now fix both cases
2195 
2196     hbck = doFsck(conf, false);
2197     assertNoErrors(hbck);
2198 
2199     // ensure that locks are deleted
2200     writeLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2201         "should acquire without blocking");
2202     writeLock.acquire(); // this should not block.
2203     writeLock.release(); // release for clean state
2204   }
2205 
2206   @Test
2207   public void testMetaOffline() throws Exception {
2208     // check no errors
2209     HBaseFsck hbck = doFsck(conf, false);
2210     assertNoErrors(hbck);
2211     deleteMetaRegion(conf, true, false, false);
2212     hbck = doFsck(conf, false);
2213     // ERROR_CODE.UNKNOWN is coming because we reportError with a message for the hbase:meta
2214     // inconsistency and whether we will be fixing it or not.
2215     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
2216     hbck = doFsck(conf, true);
2217     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
2218     hbck = doFsck(conf, false);
2219     assertNoErrors(hbck);
2220   }
2221 
2222   private void deleteMetaRegion(Configuration conf, boolean unassign, boolean hdfs,
2223       boolean regionInfoOnly) throws IOException, InterruptedException {
2224     HConnection connection = HConnectionManager.getConnection(conf);
2225     HRegionLocation metaLocation = connection.locateRegion(TableName.META_TABLE_NAME,
2226         HConstants.EMPTY_START_ROW);
2227     ServerName hsa = metaLocation.getServerName();
2228     HRegionInfo hri = metaLocation.getRegionInfo();
2229     if (unassign) {
2230       LOG.info("Undeploying meta region " + hri + " from server " + hsa);
2231       undeployRegion(new HBaseAdmin(conf), hsa, hri);
2232     }
2233 
2234     if (regionInfoOnly) {
2235       LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
2236       Path rootDir = FSUtils.getRootDir(conf);
2237       FileSystem fs = rootDir.getFileSystem(conf);
2238       Path p = new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(),
2239           hri.getEncodedName());
2240       Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
2241       fs.delete(hriPath, true);
2242     }
2243 
2244     if (hdfs) {
2245       LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
2246       Path rootDir = FSUtils.getRootDir(conf);
2247       FileSystem fs = rootDir.getFileSystem(conf);
2248       Path p = new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(),
2249           hri.getEncodedName());
2250       HBaseFsck.debugLsr(conf, p);
2251       boolean success = fs.delete(p, true);
2252       LOG.info("Deleted " + p + " sucessfully? " + success);
2253       HBaseFsck.debugLsr(conf, p);
2254     }
2255   }
2256 
2257   @Test
2258   public void testTableWithNoRegions() throws Exception {
2259     // We might end up with empty regions in a table
2260     // see also testNoHdfsTable()
2261     TableName table =
2262         TableName.valueOf(name.getMethodName());
2263     try {
2264       // create table with one region
2265       HTableDescriptor desc = new HTableDescriptor(table);
2266       HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
2267       desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
2268       TEST_UTIL.getHBaseAdmin().createTable(desc);
2269       tbl = new HTable(TEST_UTIL.getConfiguration(), table, executorService);
2270 
2271       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
2272       deleteRegion(conf, tbl.getTableDescriptor(), HConstants.EMPTY_START_ROW, HConstants.EMPTY_END_ROW, false,
2273           false, true);
2274 
2275       HBaseFsck hbck = doFsck(conf, false);
2276       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_HDFS });
2277 
2278       doFsck(conf, true);
2279 
2280       // fix hole
2281       doFsck(conf, true);
2282 
2283       // check that hole fixed
2284       assertNoErrors(doFsck(conf, false));
2285     } finally {
2286       deleteTable(table);
2287     }
2288 
2289   }
2290 
2291   @Test
2292   public void testHbckAfterRegionMerge() throws Exception {
2293     TableName table = TableName.valueOf("testMergeRegionFilesInHdfs");
2294     HTable meta = null;
2295     try {
2296       // disable CatalogJanitor
2297       TEST_UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(false);
2298       setupTable(table);
2299       assertEquals(ROWKEYS.length, countRows());
2300 
2301       // make sure data in regions, if in hlog only there is no data loss
2302       TEST_UTIL.getHBaseAdmin().flush(table.getName());
2303       HRegionInfo region1 = tbl.getRegionLocation("A").getRegionInfo();
2304       HRegionInfo region2 = tbl.getRegionLocation("B").getRegionInfo();
2305 
2306       int regionCountBeforeMerge = tbl.getRegionLocations().size();
2307 
2308       assertNotEquals(region1, region2);
2309 
2310       // do a region merge
2311       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
2312       admin.mergeRegions(region1.getEncodedNameAsBytes(),
2313           region2.getEncodedNameAsBytes(), false);
2314 
2315       // wait until region merged
2316       long timeout = System.currentTimeMillis() + 30 * 1000;
2317       while (true) {
2318         if (tbl.getRegionLocations().size() < regionCountBeforeMerge) {
2319           break;
2320         } else if (System.currentTimeMillis() > timeout) {
2321           fail("Time out waiting on region " + region1.getEncodedName()
2322               + " and " + region2.getEncodedName() + " be merged");
2323         }
2324         Thread.sleep(10);
2325       }
2326 
2327       assertEquals(ROWKEYS.length, countRows());
2328 
2329       HBaseFsck hbck = doFsck(conf, false);
2330       assertNoErrors(hbck); // no errors
2331 
2332     } finally {
2333       TEST_UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(true);
2334       deleteTable(table);
2335       IOUtils.closeQuietly(meta);
2336     }
2337   }
2338 
2339   @Test
2340   public void testRegionBoundariesCheck() throws Exception {
2341     HBaseFsck hbck = doFsck(conf, false);
2342     assertNoErrors(hbck); // no errors
2343     try {
2344       hbck.checkRegionBoundaries();
2345     } catch (IllegalArgumentException e) {
2346       if (e.getMessage().endsWith("not a valid DFS filename.")) {
2347         fail("Table directory path is not valid." + e.getMessage());
2348       }
2349     }
2350   }
2351 
  /**
   * JUnit rule that makes the name of the currently executing test method
   * available to the test through {@code name.getMethodName()}.
   */
  @org.junit.Rule
  public TestName name = new TestName();
2354 
2355   @Test
2356   public void testReadOnlyProperty() throws Exception {
2357     HBaseFsck hbck = doFsck(conf, false);
2358     Assert.assertEquals("shouldIgnorePreCheckPermission", true,
2359       hbck.shouldIgnorePreCheckPermission());
2360 
2361     hbck = doFsck(conf, true);
2362     Assert.assertEquals("shouldIgnorePreCheckPermission", false,
2363       hbck.shouldIgnorePreCheckPermission());
2364 
2365     hbck = doFsck(conf, true);
2366     hbck.setIgnorePreCheckPermission(true);
2367     Assert.assertEquals("shouldIgnorePreCheckPermission", true,
2368       hbck.shouldIgnorePreCheckPermission());
2369   }
2370 
2371   @Test (timeout=180000)
2372   public void testCleanUpDaughtersNotInMetaAfterFailedSplit() throws Exception {
2373     TableName table = TableName.valueOf("testCleanUpDaughtersNotInMetaAfterFailedSplit");
2374     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
2375     try {
2376       HTableDescriptor desc = new HTableDescriptor(table);
2377       desc.addFamily(new HColumnDescriptor(Bytes.toBytes("f")));
2378       TEST_UTIL.getHBaseAdmin().createTable(desc);
2379       tbl = new HTable(cluster.getConfiguration(), desc.getTableName());
2380       for (int i = 0; i < 5; i++) {
2381         Put p1 = new Put(("r" + i).getBytes());
2382         p1.add(Bytes.toBytes("f"), "q1".getBytes(), "v".getBytes());
2383         tbl.put(p1);
2384       }
2385       TEST_UTIL.getHBaseAdmin().flush(desc.getTableName().toString());
2386       List<HRegion> regions = cluster.getRegions(desc.getTableName());
2387       int serverWith = cluster.getServerWith(regions.get(0).getRegionName());
2388       HRegionServer regionServer = cluster.getRegionServer(serverWith);
2389       cluster.getServerWith(regions.get(0).getRegionName());
2390       SplitTransaction st = new SplitTransaction(regions.get(0), Bytes.toBytes("r3"));
2391       st.prepare();
2392       st.stepsBeforePONR(regionServer, regionServer, false);
2393       AssignmentManager am = cluster.getMaster().getAssignmentManager();
2394       Map<String, RegionState> regionsInTransition = am.getRegionStates().getRegionsInTransition();
2395       for (RegionState state : regionsInTransition.values()) {
2396         am.regionOffline(state.getRegion());
2397       }
2398       ZKAssign.deleteNodeFailSilent(regionServer.getZooKeeper(), regions.get(0).getRegionInfo());
2399       Map<HRegionInfo, ServerName> regionsMap = new HashMap<HRegionInfo, ServerName>();
2400       regionsMap.put(regions.get(0).getRegionInfo(), regionServer.getServerName());
2401       am.assign(regionsMap);
2402       am.waitForAssignment(regions.get(0).getRegionInfo());
2403       HBaseFsck hbck = doFsck(conf, false);
2404       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
2405           ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
2406       // holes are separate from overlap groups
2407       assertEquals(0, hbck.getOverlapGroups(table).size());
2408 
2409       // fix hole
2410       assertErrors(
2411         doFsck(conf, false, true, false, false, false, false, false, false, false, false, null),
2412         new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
2413           ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
2414 
2415       // check that hole fixed
2416       assertNoErrors(doFsck(conf, false));
2417       assertEquals(5, countRows());
2418     } finally {
2419       if (tbl != null) {
2420         tbl.close();
2421         tbl = null;
2422       }
2423       deleteTable(table);
2424     }
2425   }
2426 }