View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.util;
20  
21  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertErrors;
22  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertNoErrors;
23  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.doFsck;
24  import static org.junit.Assert.assertEquals;
25  import static org.junit.Assert.assertFalse;
26  import static org.junit.Assert.assertNotEquals;
27  import static org.junit.Assert.assertNotNull;
28  import static org.junit.Assert.assertTrue;
29  import static org.junit.Assert.fail;
30  
31  import java.io.IOException;
32  import java.util.ArrayList;
33  import java.util.Collection;
34  import java.util.HashMap;
35  import java.util.LinkedList;
36  import java.util.List;
37  import java.util.Map;
38  import java.util.concurrent.Callable;
39  import java.util.concurrent.CountDownLatch;
40  import java.util.concurrent.ExecutorService;
41  import java.util.concurrent.Executors;
42  import java.util.concurrent.Future;
43  import java.util.concurrent.ScheduledThreadPoolExecutor;
44  import java.util.concurrent.SynchronousQueue;
45  import java.util.concurrent.ThreadPoolExecutor;
46  import java.util.concurrent.TimeUnit;
47  import java.util.concurrent.atomic.AtomicBoolean;
48  
49  import org.apache.commons.io.IOUtils;
50  import org.apache.commons.logging.Log;
51  import org.apache.commons.logging.LogFactory;
52  import org.apache.hadoop.conf.Configuration;
53  import org.apache.hadoop.fs.FileStatus;
54  import org.apache.hadoop.fs.FileSystem;
55  import org.apache.hadoop.fs.Path;
56  import org.apache.hadoop.hbase.ClusterStatus;
57  import org.apache.hadoop.hbase.HBaseTestingUtility;
58  import org.apache.hadoop.hbase.HColumnDescriptor;
59  import org.apache.hadoop.hbase.HConstants;
60  import org.apache.hadoop.hbase.HRegionInfo;
61  import org.apache.hadoop.hbase.HRegionLocation;
62  import org.apache.hadoop.hbase.HTableDescriptor;
63  import org.apache.hadoop.hbase.TableExistsException;
64  import org.apache.hadoop.hbase.testclassification.LargeTests;
65  import org.apache.hadoop.hbase.MiniHBaseCluster;
66  import org.apache.hadoop.hbase.ServerName;
67  import org.apache.hadoop.hbase.TableName;
68  import org.apache.hadoop.hbase.MetaTableAccessor;
69  import org.apache.hadoop.hbase.client.Admin;
70  import org.apache.hadoop.hbase.client.ClusterConnection;
71  import org.apache.hadoop.hbase.client.Connection;
72  import org.apache.hadoop.hbase.client.ConnectionFactory;
73  import org.apache.hadoop.hbase.client.Delete;
74  import org.apache.hadoop.hbase.client.Durability;
75  import org.apache.hadoop.hbase.client.Get;
76  import org.apache.hadoop.hbase.client.HBaseAdmin;
77  import org.apache.hadoop.hbase.client.HConnection;
78  import org.apache.hadoop.hbase.client.HTable;
79  import org.apache.hadoop.hbase.client.MetaScanner;
80  import org.apache.hadoop.hbase.client.Put;
81  import org.apache.hadoop.hbase.client.Result;
82  import org.apache.hadoop.hbase.client.ResultScanner;
83  import org.apache.hadoop.hbase.client.Scan;
84  import org.apache.hadoop.hbase.client.Table;
85  import org.apache.hadoop.hbase.io.hfile.TestHFile;
86  import org.apache.hadoop.hbase.master.AssignmentManager;
87  import org.apache.hadoop.hbase.master.HMaster;
88  import org.apache.hadoop.hbase.master.RegionState;
89  import org.apache.hadoop.hbase.master.RegionStates;
90  import org.apache.hadoop.hbase.master.TableLockManager;
91  import org.apache.hadoop.hbase.master.TableLockManager.TableLock;
92  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
93  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
94  import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
95  import org.apache.hadoop.hbase.regionserver.HRegion;
96  import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
97  import org.apache.hadoop.hbase.regionserver.HRegionServer;
98  import org.apache.hadoop.hbase.regionserver.SplitTransaction;
99  import org.apache.hadoop.hbase.regionserver.TestEndToEndSplitTransaction;
100 import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter;
101 import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
102 import org.apache.hadoop.hbase.util.HBaseFsck.HbckInfo;
103 import org.apache.hadoop.hbase.util.HBaseFsck.PrintingErrorReporter;
104 import org.apache.hadoop.hbase.util.HBaseFsck.TableInfo;
105 import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
106 import org.apache.hadoop.hbase.util.hbck.HbckTestingUtil;
107 import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
108 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
109 import org.apache.zookeeper.KeeperException;
110 import org.junit.AfterClass;
111 import org.junit.Assert;
112 import org.junit.BeforeClass;
113 import org.junit.Ignore;
114 import org.junit.Test;
115 import org.junit.experimental.categories.Category;
116 import org.junit.rules.TestName;
117 
118 import com.google.common.collect.Multimap;
119 
120 /**
121  * This tests HBaseFsck's ability to detect reasons for inconsistent tables.
122  */
123 @Category(LargeTests.class)
124 public class TestHBaseFsck {
125   static final int POOL_SIZE = 7;
126 
127   final static Log LOG = LogFactory.getLog(TestHBaseFsck.class);
128   private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
129   private final static Configuration conf = TEST_UTIL.getConfiguration();
130   private final static String FAM_STR = "fam";
131   private final static byte[] FAM = Bytes.toBytes(FAM_STR);
132   private final static int REGION_ONLINE_TIMEOUT = 800;
133   private static RegionStates regionStates;
134   private static ExecutorService tableExecutorService;
135   private static ScheduledThreadPoolExecutor hbfsckExecutorService;
136   private static ClusterConnection connection;
137   private static Admin admin;
138 
139   // for the instance, reset every test run
140   private HTable tbl;
141   private final static byte[][] SPLITS = new byte[][] { Bytes.toBytes("A"),
142     Bytes.toBytes("B"), Bytes.toBytes("C") };
143   // one row per region.
144   private final static byte[][] ROWKEYS= new byte[][] {
145     Bytes.toBytes("00"), Bytes.toBytes("50"), Bytes.toBytes("A0"), Bytes.toBytes("A5"),
146     Bytes.toBytes("B0"), Bytes.toBytes("B5"), Bytes.toBytes("C0"), Bytes.toBytes("C5") };
147 
148   @BeforeClass
149   public static void setUpBeforeClass() throws Exception {
150     conf.setInt("hbase.regionserver.handler.count", 2);
151     conf.setInt("hbase.regionserver.metahandler.count", 2);
152 
153     conf.setInt("hbase.htable.threads.max", POOL_SIZE);
154     conf.setInt("hbase.hconnection.threads.max", 2 * POOL_SIZE);
155     conf.setInt("hbase.hconnection.threads.core", POOL_SIZE);
156     conf.setInt("hbase.hbck.close.timeout", 2 * REGION_ONLINE_TIMEOUT);
157     TEST_UTIL.startMiniCluster(3);
158 
159     tableExecutorService = new ThreadPoolExecutor(1, POOL_SIZE, 60, TimeUnit.SECONDS,
160         new SynchronousQueue<Runnable>(), Threads.newDaemonThreadFactory("testhbck"));
161 
162     hbfsckExecutorService = new ScheduledThreadPoolExecutor(POOL_SIZE);
163 
164     AssignmentManager assignmentManager =
165       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager();
166     regionStates = assignmentManager.getRegionStates();
167 
168     connection = (ClusterConnection) TEST_UTIL.getConnection();
169 
170     admin = connection.getAdmin();
171     admin.setBalancerRunning(false, true);
172   }
173 
174   @AfterClass
175   public static void tearDownAfterClass() throws Exception {
176     tableExecutorService.shutdown();
177     hbfsckExecutorService.shutdown();
178     admin.close();
179     TEST_UTIL.shutdownMiniCluster();
180   }
181 
182   @Test (timeout=180000)
183   public void testHBaseFsck() throws Exception {
184     assertNoErrors(doFsck(conf, false));
185     TableName table = TableName.valueOf("tableBadMetaAssign");
186     TEST_UTIL.createTable(table, FAM);
187 
188     // We created 1 table, should be fine
189     assertNoErrors(doFsck(conf, false));
190 
191     // Now let's mess it up and change the assignment in hbase:meta to
192     // point to a different region server
193     Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
194     Scan scan = new Scan();
195     scan.setStartRow(Bytes.toBytes(table+",,"));
196     ResultScanner scanner = meta.getScanner(scan);
197     HRegionInfo hri = null;
198 
199     Result res = scanner.next();
200     ServerName currServer =
201       ServerName.parseFrom(res.getValue(HConstants.CATALOG_FAMILY,
202           HConstants.SERVER_QUALIFIER));
203     long startCode = Bytes.toLong(res.getValue(HConstants.CATALOG_FAMILY,
204         HConstants.STARTCODE_QUALIFIER));
205 
206     for (JVMClusterUtil.RegionServerThread rs :
207         TEST_UTIL.getHBaseCluster().getRegionServerThreads()) {
208 
209       ServerName sn = rs.getRegionServer().getServerName();
210 
211       // When we find a diff RS, change the assignment and break
212       if (!currServer.getHostAndPort().equals(sn.getHostAndPort()) ||
213           startCode != sn.getStartcode()) {
214         Put put = new Put(res.getRow());
215         put.setDurability(Durability.SKIP_WAL);
216         put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
217           Bytes.toBytes(sn.getHostAndPort()));
218         put.add(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER,
219           Bytes.toBytes(sn.getStartcode()));
220         meta.put(put);
221         hri = MetaTableAccessor.getHRegionInfo(res);
222         break;
223       }
224     }
225 
226     // Try to fix the data
227     assertErrors(doFsck(conf, true), new ERROR_CODE[]{
228         ERROR_CODE.SERVER_DOES_NOT_MATCH_META});
229 
230     TEST_UTIL.getHBaseCluster().getMaster()
231       .getAssignmentManager().waitForAssignment(hri);
232 
233     // Should be fixed now
234     assertNoErrors(doFsck(conf, false));
235 
236     // comment needed - what is the purpose of this line
237     Table t = connection.getTable(table, tableExecutorService);
238     ResultScanner s = t.getScanner(new Scan());
239     s.close();
240     t.close();
241 
242     scanner.close();
243     meta.close();
244   }
245 
246   @Test(timeout=180000)
247   public void testFixAssignmentsWhenMETAinTransition() throws Exception {
248     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
249     admin.closeRegion(cluster.getServerHoldingMeta(), HRegionInfo.FIRST_META_REGIONINFO);
250     regionStates.regionOffline(HRegionInfo.FIRST_META_REGIONINFO);
251     new MetaTableLocator().deleteMetaLocation(cluster.getMaster().getZooKeeper());
252     assertFalse(regionStates.isRegionOnline(HRegionInfo.FIRST_META_REGIONINFO));
253     HBaseFsck hbck = doFsck(conf, true);
254     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.UNKNOWN, ERROR_CODE.NO_META_REGION,
255         ERROR_CODE.NULL_META_REGION });
256     assertNoErrors(doFsck(conf, false));
257   }
258 
259   /**
260    * Create a new region in META.
261    */
262   private HRegionInfo createRegion(final HTableDescriptor
263       htd, byte[] startKey, byte[] endKey)
264       throws IOException {
265     Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
266     HRegionInfo hri = new HRegionInfo(htd.getTableName(), startKey, endKey);
267     MetaTableAccessor.addRegionToMeta(meta, hri);
268     meta.close();
269     return hri;
270   }
271 
272   /**
273    * Debugging method to dump the contents of meta.
274    */
275   private void dumpMeta(TableName tableName) throws IOException {
276     List<byte[]> metaRows = TEST_UTIL.getMetaTableRows(tableName);
277     for (byte[] row : metaRows) {
278       LOG.info(Bytes.toString(row));
279     }
280   }
281 
282   /**
283    * This method is used to undeploy a region -- close it and attempt to
284    * remove its state from the Master.
285    */
286   private void undeployRegion(Connection conn, ServerName sn,
287       HRegionInfo hri) throws IOException, InterruptedException {
288     try {
289       HBaseFsckRepair.closeRegionSilentlyAndWait((HConnection) conn, sn, hri);
290       if (!hri.isMetaTable()) {
291         admin.offline(hri.getRegionName());
292       }
293     } catch (IOException ioe) {
294       LOG.warn("Got exception when attempting to offline region "
295           + Bytes.toString(hri.getRegionName()), ioe);
296     }
297   }
298   /**
299    * Delete a region from assignments, meta, or completely from hdfs.
300    * @param unassign if true unassign region if assigned
301    * @param metaRow  if true remove region's row from META
302    * @param hdfs if true remove region's dir in HDFS
303    */
304   private void deleteRegion(Configuration conf, final HTableDescriptor htd,
305       byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
306       boolean hdfs) throws IOException, InterruptedException {
307     deleteRegion(conf, htd, startKey, endKey, unassign, metaRow, hdfs, false);
308   }
309 
310   /**
311    * Delete a region from assignments, meta, or completely from hdfs.
312    * @param unassign if true unassign region if assigned
313    * @param metaRow  if true remove region's row from META
314    * @param hdfs if true remove region's dir in HDFS
315    * @param regionInfoOnly if true remove a region dir's .regioninfo file
316    */
317   private void deleteRegion(Configuration conf, final HTableDescriptor htd,
318       byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
319       boolean hdfs, boolean regionInfoOnly) throws IOException, InterruptedException {
320     LOG.info("** Before delete:");
321     dumpMeta(htd.getTableName());
322 
323     List<HRegionLocation> locations = tbl.getAllRegionLocations();
324     for (HRegionLocation location : locations) {
325       HRegionInfo hri = location.getRegionInfo();
326       ServerName hsa = location.getServerName();
327       if (Bytes.compareTo(hri.getStartKey(), startKey) == 0
328           && Bytes.compareTo(hri.getEndKey(), endKey) == 0) {
329 
330         LOG.info("RegionName: " +hri.getRegionNameAsString());
331         byte[] deleteRow = hri.getRegionName();
332 
333         if (unassign) {
334           LOG.info("Undeploying region " + hri + " from server " + hsa);
335           undeployRegion(connection, hsa, hri);
336         }
337 
338         if (regionInfoOnly) {
339           LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
340           Path rootDir = FSUtils.getRootDir(conf);
341           FileSystem fs = rootDir.getFileSystem(conf);
342           Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
343               hri.getEncodedName());
344           Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
345           fs.delete(hriPath, true);
346         }
347 
348         if (hdfs) {
349           LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
350           Path rootDir = FSUtils.getRootDir(conf);
351           FileSystem fs = rootDir.getFileSystem(conf);
352           Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
353               hri.getEncodedName());
354           HBaseFsck.debugLsr(conf, p);
355           boolean success = fs.delete(p, true);
356           LOG.info("Deleted " + p + " sucessfully? " + success);
357           HBaseFsck.debugLsr(conf, p);
358         }
359 
360         if (metaRow) {
361           try (Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService)) {
362             Delete delete = new Delete(deleteRow);
363             meta.delete(delete);
364           }
365         }
366       }
367       LOG.info(hri.toString() + hsa.toString());
368     }
369 
370     TEST_UTIL.getMetaTableRows(htd.getTableName());
371     LOG.info("*** After delete:");
372     dumpMeta(htd.getTableName());
373   }
374 
375   /**
376    * Setup a clean table before we start mucking with it.
377    *
378    * It will set tbl which needs to be closed after test
379    *
380    * @throws IOException
381    * @throws InterruptedException
382    * @throws KeeperException
383    */
384   void setupTable(TableName tablename) throws Exception {
385     setupTableWithRegionReplica(tablename, 1);
386   }
387 
388   /**
389    * Setup a clean table with a certain region_replica count
390    *
391    * It will set tbl which needs to be closed after test
392    *
393    * @param tableName
394    * @param replicaCount
395    * @throws Exception
396    */
397   void setupTableWithRegionReplica(TableName tablename, int replicaCount) throws Exception {
398     HTableDescriptor desc = new HTableDescriptor(tablename);
399     desc.setRegionReplication(replicaCount);
400     HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
401     desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
402     admin.createTable(desc, SPLITS);
403     tbl = (HTable) connection.getTable(tablename, tableExecutorService);
404     List<Put> puts = new ArrayList<Put>();
405     for (byte[] row : ROWKEYS) {
406       Put p = new Put(row);
407       p.add(FAM, Bytes.toBytes("val"), row);
408       puts.add(p);
409     }
410     tbl.put(puts);
411     tbl.flushCommits();
412   }
413 
414   /**
415    * Counts the number of row to verify data loss or non-dataloss.
416    */
417   int countRows() throws IOException {
418      Scan s = new Scan();
419      ResultScanner rs = tbl.getScanner(s);
420      int i = 0;
421      while(rs.next() !=null) {
422        i++;
423      }
424      return i;
425   }
426 
427   /**
428    * delete table in preparation for next test
429    *
430    * @param tablename
431    * @throws IOException
432    */
433   void cleanupTable(TableName tablename) throws IOException {
434     if (tbl != null) {
435       tbl.close();
436       tbl = null;
437     }
438 
439     ((ClusterConnection) connection).clearRegionCache();
440     TEST_UTIL.deleteTable(tablename);
441 
442   }
443 
444   /**
445    * This creates a clean table and confirms that the table is clean.
446    */
447   @Test (timeout=180000)
448   public void testHBaseFsckClean() throws Exception {
449     assertNoErrors(doFsck(conf, false));
450     TableName table = TableName.valueOf("tableClean");
451     try {
452       HBaseFsck hbck = doFsck(conf, false);
453       assertNoErrors(hbck);
454 
455       setupTable(table);
456       assertEquals(ROWKEYS.length, countRows());
457 
458       // We created 1 table, should be fine
459       hbck = doFsck(conf, false);
460       assertNoErrors(hbck);
461       assertEquals(0, hbck.getOverlapGroups(table).size());
462       assertEquals(ROWKEYS.length, countRows());
463     } finally {
464       cleanupTable(table);
465     }
466   }
467 
468   /**
469    * Test thread pooling in the case where there are more regions than threads
470    */
471   @Test (timeout=180000)
472   public void testHbckThreadpooling() throws Exception {
473     TableName table =
474         TableName.valueOf("tableDupeStartKey");
475     try {
476       // Create table with 4 regions
477       setupTable(table);
478 
479       // limit number of threads to 1.
480       Configuration newconf = new Configuration(conf);
481       newconf.setInt("hbasefsck.numthreads", 1);
482       assertNoErrors(doFsck(newconf, false));
483 
484       // We should pass without triggering a RejectedExecutionException
485     } finally {
486       cleanupTable(table);
487     }
488   }
489 
490   @Test (timeout=180000)
491   public void testHbckFixOrphanTable() throws Exception {
492     TableName table = TableName.valueOf("tableInfo");
493     FileSystem fs = null;
494     Path tableinfo = null;
495     try {
496       setupTable(table);
497 
498       Path hbaseTableDir = FSUtils.getTableDir(
499           FSUtils.getRootDir(conf), table);
500       fs = hbaseTableDir.getFileSystem(conf);
501       FileStatus status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
502       tableinfo = status.getPath();
503       fs.rename(tableinfo, new Path("/.tableinfo"));
504 
505       //to report error if .tableinfo is missing.
506       HBaseFsck hbck = doFsck(conf, false);
507       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_TABLEINFO_FILE });
508 
509       // fix OrphanTable with default .tableinfo (htd not yet cached on master)
510       hbck = doFsck(conf, true);
511       assertNoErrors(hbck);
512       status = null;
513       status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
514       assertNotNull(status);
515 
516       HTableDescriptor htd = admin.getTableDescriptor(table);
517       htd.setValue("NOT_DEFAULT", "true");
518       admin.disableTable(table);
519       admin.modifyTable(table, htd);
520       admin.enableTable(table);
521       fs.delete(status.getPath(), true);
522 
523       // fix OrphanTable with cache
524       htd = admin.getTableDescriptor(table); // warms up cached htd on master
525       hbck = doFsck(conf, true);
526       assertNoErrors(hbck);
527       status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
528       assertNotNull(status);
529       htd = admin.getTableDescriptor(table);
530       assertEquals(htd.getValue("NOT_DEFAULT"), "true");
531     } finally {
532       fs.rename(new Path("/.tableinfo"), tableinfo);
533       cleanupTable(table);
534     }
535   }
536 
537   /**
538    * This test makes sure that parallel instances of Hbck is disabled.
539    *
540    * @throws Exception
541    */
542   @Test (timeout=180000)
543   public void testParallelHbck() throws Exception {
544     final ExecutorService service;
545     final Future<HBaseFsck> hbck1,hbck2;
546 
547     class RunHbck implements Callable<HBaseFsck>{
548       boolean fail = true;
549       @Override
550       public HBaseFsck call(){
551         try{
552           return doFsck(conf, false);
553         } catch(Exception e){
554           if (e.getMessage().contains("Duplicate hbck")) {
555             fail = false;
556           }
557         }
558         // If we reach here, then an exception was caught
559         if (fail) fail();
560         return null;
561       }
562     }
563     service = Executors.newFixedThreadPool(2);
564     hbck1 = service.submit(new RunHbck());
565     hbck2 = service.submit(new RunHbck());
566     service.shutdown();
567     //wait for 15 seconds, for both hbck calls finish
568     service.awaitTermination(15, TimeUnit.SECONDS);
569     HBaseFsck h1 = hbck1.get();
570     HBaseFsck h2 = hbck2.get();
571     // Make sure only one of the calls was successful
572     assert(h1 == null || h2 == null);
573     if (h1 != null) {
574       assert(h1.getRetCode() >= 0);
575     }
576     if (h2 != null) {
577       assert(h2.getRetCode() >= 0);
578     }
579   }
580 
581   /**
582    * This create and fixes a bad table with regions that have a duplicate
583    * start key
584    */
585   @Test (timeout=180000)
586   public void testDupeStartKey() throws Exception {
587     TableName table =
588         TableName.valueOf("tableDupeStartKey");
589     try {
590       setupTable(table);
591       assertNoErrors(doFsck(conf, false));
592       assertEquals(ROWKEYS.length, countRows());
593 
594       // Now let's mess it up, by adding a region with a duplicate startkey
595       HRegionInfo hriDupe =
596           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A"), Bytes.toBytes("A2"));
597       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
598       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
599           .waitForAssignment(hriDupe);
600       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
601       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
602 
603       HBaseFsck hbck = doFsck(conf, false);
604       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
605             ERROR_CODE.DUPE_STARTKEYS});
606       assertEquals(2, hbck.getOverlapGroups(table).size());
607       assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.
608 
609       // fix the degenerate region.
610       doFsck(conf,true);
611 
612       // check that the degenerate region is gone and no data loss
613       HBaseFsck hbck2 = doFsck(conf,false);
614       assertNoErrors(hbck2);
615       assertEquals(0, hbck2.getOverlapGroups(table).size());
616       assertEquals(ROWKEYS.length, countRows());
617     } finally {
618       cleanupTable(table);
619     }
620   }
621 
622   /*
623    * This creates a table with region_replica > 1 and verifies hbck runs
624    * successfully
625    */
626   @Test (timeout=180000)
627   public void testHbckWithRegionReplica() throws Exception {
628     TableName table =
629         TableName.valueOf("tableWithReplica");
630     try {
631       setupTableWithRegionReplica(table, 2);
632       assertNoErrors(doFsck(conf, false));
633       assertEquals(ROWKEYS.length, countRows());
634     } finally {
635       cleanupTable(table);
636     }
637   }
638 
639   /**
640    * Get region info from local cluster.
641    */
642   Map<ServerName, List<String>> getDeployedHRIs(final HBaseAdmin admin) throws IOException {
643     ClusterStatus status = admin.getClusterStatus();
644     Collection<ServerName> regionServers = status.getServers();
645     Map<ServerName, List<String>> mm =
646         new HashMap<ServerName, List<String>>();
647     for (ServerName hsi : regionServers) {
648       AdminProtos.AdminService.BlockingInterface server = ((HConnection) connection).getAdmin(hsi);
649 
650       // list all online regions from this region server
651       List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(server);
652       List<String> regionNames = new ArrayList<String>();
653       for (HRegionInfo hri : regions) {
654         regionNames.add(hri.getRegionNameAsString());
655       }
656       mm.put(hsi, regionNames);
657     }
658     return mm;
659   }
660 
661   /**
662    * Returns the HSI a region info is on.
663    */
664   ServerName findDeployedHSI(Map<ServerName, List<String>> mm, HRegionInfo hri) {
665     for (Map.Entry<ServerName,List <String>> e : mm.entrySet()) {
666       if (e.getValue().contains(hri.getRegionNameAsString())) {
667         return e.getKey();
668       }
669     }
670     return null;
671   }
672 
673   /**
674    * This create and fixes a bad table with regions that have a duplicate
675    * start key
676    */
677   @Test (timeout=180000)
678   public void testDupeRegion() throws Exception {
679     TableName table =
680         TableName.valueOf("tableDupeRegion");
681     try {
682       setupTable(table);
683       assertNoErrors(doFsck(conf, false));
684       assertEquals(ROWKEYS.length, countRows());
685 
686       // Now let's mess it up, by adding a region with a duplicate startkey
687       HRegionInfo hriDupe =
688           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A"), Bytes.toBytes("B"));
689 
690       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
691       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
692           .waitForAssignment(hriDupe);
693       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
694       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
695 
696       // Yikes! The assignment manager can't tell between diff between two
697       // different regions with the same start/endkeys since it doesn't
698       // differentiate on ts/regionId!  We actually need to recheck
699       // deployments!
700       while (findDeployedHSI(getDeployedHRIs((HBaseAdmin) admin), hriDupe) == null) {
701         Thread.sleep(250);
702       }
703 
704       LOG.debug("Finished assignment of dupe region");
705 
706       // TODO why is dupe region different from dupe start keys?
707       HBaseFsck hbck = doFsck(conf, false);
708       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
709             ERROR_CODE.DUPE_STARTKEYS});
710       assertEquals(2, hbck.getOverlapGroups(table).size());
711       assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.
712 
713       // fix the degenerate region.
714       doFsck(conf,true);
715 
716       // check that the degenerate region is gone and no data loss
717       HBaseFsck hbck2 = doFsck(conf,false);
718       assertNoErrors(hbck2);
719       assertEquals(0, hbck2.getOverlapGroups(table).size());
720       assertEquals(ROWKEYS.length, countRows());
721     } finally {
722       cleanupTable(table);
723     }
724   }
725 
726   /**
727    * This creates and fixes a bad table with regions that has startkey == endkey
728    */
729   @Test (timeout=180000)
730   public void testDegenerateRegions() throws Exception {
731     TableName table = TableName.valueOf("tableDegenerateRegions");
732     try {
733       setupTable(table);
734       assertNoErrors(doFsck(conf,false));
735       assertEquals(ROWKEYS.length, countRows());
736 
737       // Now let's mess it up, by adding a region with a duplicate startkey
738       HRegionInfo hriDupe =
739           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("B"), Bytes.toBytes("B"));
740       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
741       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
742           .waitForAssignment(hriDupe);
743       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
744       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
745 
746       HBaseFsck hbck = doFsck(conf,false);
747       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DEGENERATE_REGION, ERROR_CODE.DUPE_STARTKEYS,
748           ERROR_CODE.DUPE_STARTKEYS });
749       assertEquals(2, hbck.getOverlapGroups(table).size());
750       assertEquals(ROWKEYS.length, countRows());
751 
752       // fix the degenerate region.
753       doFsck(conf,true);
754 
755       // check that the degenerate region is gone and no data loss
756       HBaseFsck hbck2 = doFsck(conf,false);
757       assertNoErrors(hbck2);
758       assertEquals(0, hbck2.getOverlapGroups(table).size());
759       assertEquals(ROWKEYS.length, countRows());
760     } finally {
761       cleanupTable(table);
762     }
763   }
764 
765   /**
766    * This creates and fixes a bad table where a region is completely contained
767    * by another region.
768    */
769   @Test (timeout=180000)
770   public void testContainedRegionOverlap() throws Exception {
771     TableName table =
772         TableName.valueOf("tableContainedRegionOverlap");
773     try {
774       setupTable(table);
775       assertEquals(ROWKEYS.length, countRows());
776 
777       // Mess it up by creating an overlap in the metadata
778       HRegionInfo hriOverlap =
779           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B"));
780       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
781       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
782           .waitForAssignment(hriOverlap);
783       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
784       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
785 
786       HBaseFsck hbck = doFsck(conf, false);
787       assertErrors(hbck, new ERROR_CODE[] {
788           ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
789       assertEquals(2, hbck.getOverlapGroups(table).size());
790       assertEquals(ROWKEYS.length, countRows());
791 
792       // fix the problem.
793       doFsck(conf, true);
794 
795       // verify that overlaps are fixed
796       HBaseFsck hbck2 = doFsck(conf,false);
797       assertNoErrors(hbck2);
798       assertEquals(0, hbck2.getOverlapGroups(table).size());
799       assertEquals(ROWKEYS.length, countRows());
800     } finally {
801       cleanupTable(table);
802     }
803   }
804 
805   /**
806    * This creates and fixes a bad table where an overlap group of
807    * 3 regions. Set HBaseFsck.maxMerge to 2 to trigger sideline overlapped
808    * region. Mess around the meta data so that closeRegion/offlineRegion
809    * throws exceptions.
810    */
811   @Test (timeout=180000)
812   public void testSidelineOverlapRegion() throws Exception {
813     TableName table =
814         TableName.valueOf("testSidelineOverlapRegion");
815     try {
816       setupTable(table);
817       assertEquals(ROWKEYS.length, countRows());
818 
819       // Mess it up by creating an overlap
820       MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
821       HMaster master = cluster.getMaster();
822       HRegionInfo hriOverlap1 =
823           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A"), Bytes.toBytes("AB"));
824       master.assignRegion(hriOverlap1);
825       master.getAssignmentManager().waitForAssignment(hriOverlap1);
826       HRegionInfo hriOverlap2 =
827           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("AB"), Bytes.toBytes("B"));
828       master.assignRegion(hriOverlap2);
829       master.getAssignmentManager().waitForAssignment(hriOverlap2);
830 
831       HBaseFsck hbck = doFsck(conf, false);
832       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.DUPE_STARTKEYS,
833         ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.OVERLAP_IN_REGION_CHAIN});
834       assertEquals(3, hbck.getOverlapGroups(table).size());
835       assertEquals(ROWKEYS.length, countRows());
836 
837       // mess around the overlapped regions, to trigger NotServingRegionException
838       Multimap<byte[], HbckInfo> overlapGroups = hbck.getOverlapGroups(table);
839       ServerName serverName = null;
840       byte[] regionName = null;
841       for (HbckInfo hbi: overlapGroups.values()) {
842         if ("A".equals(Bytes.toString(hbi.getStartKey()))
843             && "B".equals(Bytes.toString(hbi.getEndKey()))) {
844           regionName = hbi.getRegionName();
845 
846           // get an RS not serving the region to force bad assignment info in to META.
847           int k = cluster.getServerWith(regionName);
848           for (int i = 0; i < 3; i++) {
849             if (i != k) {
850               HRegionServer rs = cluster.getRegionServer(i);
851               serverName = rs.getServerName();
852               break;
853             }
854           }
855 
856           HBaseFsckRepair.closeRegionSilentlyAndWait((HConnection) connection,
857               cluster.getRegionServer(k).getServerName(), hbi.getHdfsHRI());
858           admin.offline(regionName);
859           break;
860         }
861       }
862 
863       assertNotNull(regionName);
864       assertNotNull(serverName);
865       try (Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService)) {
866         Put put = new Put(regionName);
867         put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
868             Bytes.toBytes(serverName.getHostAndPort()));
869         meta.put(put);
870       }
871 
872       // fix the problem.
873       HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
874       fsck.connect();
875       fsck.setDisplayFullReport(); // i.e. -details
876       fsck.setTimeLag(0);
877       fsck.setFixAssignments(true);
878       fsck.setFixMeta(true);
879       fsck.setFixHdfsHoles(true);
880       fsck.setFixHdfsOverlaps(true);
881       fsck.setFixHdfsOrphans(true);
882       fsck.setFixVersionFile(true);
883       fsck.setSidelineBigOverlaps(true);
884       fsck.setMaxMerge(2);
885       fsck.onlineHbck();
886       fsck.close();
887 
888       // verify that overlaps are fixed, and there are less rows
889       // since one region is sidelined.
890       HBaseFsck hbck2 = doFsck(conf,false);
891       assertNoErrors(hbck2);
892       assertEquals(0, hbck2.getOverlapGroups(table).size());
893       assertTrue(ROWKEYS.length > countRows());
894     } finally {
895       cleanupTable(table);
896     }
897   }
898 
899   /**
900    * This creates and fixes a bad table where a region is completely contained
901    * by another region, and there is a hole (sort of like a bad split)
902    */
903   @Test (timeout=180000)
904   public void testOverlapAndOrphan() throws Exception {
905     TableName table =
906         TableName.valueOf("tableOverlapAndOrphan");
907     try {
908       setupTable(table);
909       assertEquals(ROWKEYS.length, countRows());
910 
911       // Mess it up by creating an overlap in the metadata
912       admin.disableTable(table);
913       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
914           Bytes.toBytes("B"), true, true, false, true);
915       admin.enableTable(table);
916 
917       HRegionInfo hriOverlap =
918           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B"));
919       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
920       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
921           .waitForAssignment(hriOverlap);
922       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
923       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
924 
925       HBaseFsck hbck = doFsck(conf, false);
926       assertErrors(hbck, new ERROR_CODE[] {
927           ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
928           ERROR_CODE.HOLE_IN_REGION_CHAIN});
929 
930       // fix the problem.
931       doFsck(conf, true);
932 
933       // verify that overlaps are fixed
934       HBaseFsck hbck2 = doFsck(conf,false);
935       assertNoErrors(hbck2);
936       assertEquals(0, hbck2.getOverlapGroups(table).size());
937       assertEquals(ROWKEYS.length, countRows());
938     } finally {
939       cleanupTable(table);
940     }
941   }
942 
943   /**
944    * This creates and fixes a bad table where a region overlaps two regions --
945    * a start key contained in another region and its end key is contained in
946    * yet another region.
947    */
948   @Test (timeout=180000)
949   public void testCoveredStartKey() throws Exception {
950     TableName table =
951         TableName.valueOf("tableCoveredStartKey");
952     try {
953       setupTable(table);
954       assertEquals(ROWKEYS.length, countRows());
955 
956       // Mess it up by creating an overlap in the metadata
957       HRegionInfo hriOverlap =
958           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B2"));
959       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
960       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
961           .waitForAssignment(hriOverlap);
962       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
963       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
964 
965       HBaseFsck hbck = doFsck(conf, false);
966       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
967           ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
968       assertEquals(3, hbck.getOverlapGroups(table).size());
969       assertEquals(ROWKEYS.length, countRows());
970 
971       // fix the problem.
972       doFsck(conf, true);
973 
974       // verify that overlaps are fixed
975       HBaseFsck hbck2 = doFsck(conf, false);
976       assertErrors(hbck2, new ERROR_CODE[0]);
977       assertEquals(0, hbck2.getOverlapGroups(table).size());
978       assertEquals(ROWKEYS.length, countRows());
979     } finally {
980       cleanupTable(table);
981     }
982   }
983 
984   /**
985    * This creates and fixes a bad table with a missing region -- hole in meta
986    * and data missing in the fs.
987    */
988   @Test (timeout=180000)
989   public void testRegionHole() throws Exception {
990     TableName table =
991         TableName.valueOf("tableRegionHole");
992     try {
993       setupTable(table);
994       assertEquals(ROWKEYS.length, countRows());
995 
996       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
997       admin.disableTable(table);
998       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
999           Bytes.toBytes("C"), true, true, true);
1000       admin.enableTable(table);
1001 
1002       HBaseFsck hbck = doFsck(conf, false);
1003       assertErrors(hbck, new ERROR_CODE[] {
1004           ERROR_CODE.HOLE_IN_REGION_CHAIN});
1005       // holes are separate from overlap groups
1006       assertEquals(0, hbck.getOverlapGroups(table).size());
1007 
1008       // fix hole
1009       doFsck(conf, true);
1010 
1011       // check that hole fixed
1012       assertNoErrors(doFsck(conf,false));
1013       assertEquals(ROWKEYS.length - 2 , countRows()); // lost a region so lost a row
1014     } finally {
1015       cleanupTable(table);
1016     }
1017   }
1018 
1019   /**
1020    * This creates and fixes a bad table with a missing region -- hole in meta
1021    * and data present but .regioinfino missing (an orphan hdfs region)in the fs.
1022    */
1023   @Test (timeout=180000)
1024   public void testHDFSRegioninfoMissing() throws Exception {
1025     TableName table = TableName.valueOf("tableHDFSRegioninfoMissing");
1026     try {
1027       setupTable(table);
1028       assertEquals(ROWKEYS.length, countRows());
1029 
1030       // Mess it up by leaving a hole in the meta data
1031       admin.disableTable(table);
1032       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1033           Bytes.toBytes("C"), true, true, false, true);
1034       admin.enableTable(table);
1035 
1036       HBaseFsck hbck = doFsck(conf, false);
1037       assertErrors(hbck, new ERROR_CODE[] {
1038           ERROR_CODE.ORPHAN_HDFS_REGION,
1039           ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1040           ERROR_CODE.HOLE_IN_REGION_CHAIN});
1041       // holes are separate from overlap groups
1042       assertEquals(0, hbck.getOverlapGroups(table).size());
1043 
1044       // fix hole
1045       doFsck(conf, true);
1046 
1047       // check that hole fixed
1048       assertNoErrors(doFsck(conf, false));
1049       assertEquals(ROWKEYS.length, countRows());
1050     } finally {
1051       cleanupTable(table);
1052     }
1053   }
1054 
1055   /**
1056    * This creates and fixes a bad table with a region that is missing meta and
1057    * not assigned to a region server.
1058    */
1059   @Test (timeout=180000)
1060   public void testNotInMetaOrDeployedHole() throws Exception {
1061     TableName table =
1062         TableName.valueOf("tableNotInMetaOrDeployedHole");
1063     try {
1064       setupTable(table);
1065       assertEquals(ROWKEYS.length, countRows());
1066 
1067       // Mess it up by leaving a hole in the meta data
1068       admin.disableTable(table);
1069       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1070           Bytes.toBytes("C"), true, true, false); // don't rm from fs
1071       admin.enableTable(table);
1072 
1073       HBaseFsck hbck = doFsck(conf, false);
1074       assertErrors(hbck, new ERROR_CODE[] {
1075           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1076       // holes are separate from overlap groups
1077       assertEquals(0, hbck.getOverlapGroups(table).size());
1078 
1079       // fix hole
1080       assertErrors(doFsck(conf, true) , new ERROR_CODE[] {
1081           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1082 
1083       // check that hole fixed
1084       assertNoErrors(doFsck(conf,false));
1085       assertEquals(ROWKEYS.length, countRows());
1086     } finally {
1087       cleanupTable(table);
1088     }
1089   }
1090 
1091   /**
1092    * This creates fixes a bad table with a hole in meta.
1093    */
1094   @Test (timeout=180000)
1095   public void testNotInMetaHole() throws Exception {
1096     TableName table =
1097         TableName.valueOf("tableNotInMetaHole");
1098     try {
1099       setupTable(table);
1100       assertEquals(ROWKEYS.length, countRows());
1101 
1102       // Mess it up by leaving a hole in the meta data
1103       admin.disableTable(table);
1104       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1105           Bytes.toBytes("C"), false, true, false); // don't rm from fs
1106       admin.enableTable(table);
1107 
1108       HBaseFsck hbck = doFsck(conf, false);
1109       assertErrors(hbck, new ERROR_CODE[] {
1110           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1111       // holes are separate from overlap groups
1112       assertEquals(0, hbck.getOverlapGroups(table).size());
1113 
1114       // fix hole
1115       assertErrors(doFsck(conf, true) , new ERROR_CODE[] {
1116           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1117 
1118       // check that hole fixed
1119       assertNoErrors(doFsck(conf,false));
1120       assertEquals(ROWKEYS.length, countRows());
1121     } finally {
1122       cleanupTable(table);
1123     }
1124   }
1125 
1126   /**
1127    * This creates and fixes a bad table with a region that is in meta but has
1128    * no deployment or data hdfs
1129    */
1130   @Test (timeout=180000)
1131   public void testNotInHdfs() throws Exception {
1132     TableName table =
1133         TableName.valueOf("tableNotInHdfs");
1134     try {
1135       setupTable(table);
1136       assertEquals(ROWKEYS.length, countRows());
1137 
1138       // make sure data in regions, if in wal only there is no data loss
1139       admin.flush(table);
1140 
1141       // Mess it up by leaving a hole in the hdfs data
1142       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1143           Bytes.toBytes("C"), false, false, true); // don't rm meta
1144 
1145       HBaseFsck hbck = doFsck(conf, false);
1146       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1147       // holes are separate from overlap groups
1148       assertEquals(0, hbck.getOverlapGroups(table).size());
1149 
1150       // fix hole
1151       doFsck(conf, true);
1152 
1153       // check that hole fixed
1154       assertNoErrors(doFsck(conf,false));
1155       assertEquals(ROWKEYS.length - 2, countRows());
1156     } finally {
1157       cleanupTable(table);
1158     }
1159   }
1160 
1161   /**
1162    * This creates entries in hbase:meta with no hdfs data.  This should cleanly
1163    * remove the table.
1164    */
1165   @Test (timeout=180000)
1166   public void testNoHdfsTable() throws Exception {
1167     TableName table = TableName.valueOf("NoHdfsTable");
1168     setupTable(table);
1169     assertEquals(ROWKEYS.length, countRows());
1170 
1171     // make sure data in regions, if in wal only there is no data loss
1172     admin.flush(table);
1173 
1174     // Mess it up by deleting hdfs dirs
1175     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""),
1176         Bytes.toBytes("A"), false, false, true); // don't rm meta
1177     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1178         Bytes.toBytes("B"), false, false, true); // don't rm meta
1179     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1180         Bytes.toBytes("C"), false, false, true); // don't rm meta
1181     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"),
1182         Bytes.toBytes(""), false, false, true); // don't rm meta
1183 
1184     // also remove the table directory in hdfs
1185     deleteTableDir(table);
1186 
1187     HBaseFsck hbck = doFsck(conf, false);
1188     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS,
1189         ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS,
1190         ERROR_CODE.NOT_IN_HDFS,});
1191     // holes are separate from overlap groups
1192     assertEquals(0, hbck.getOverlapGroups(table).size());
1193 
1194     // fix hole
1195     doFsck(conf, true); // detect dangling regions and remove those
1196 
1197     // check that hole fixed
1198     assertNoErrors(doFsck(conf,false));
1199     assertFalse("Table " + table + " should have been deleted", admin.tableExists(table));
1200   }
1201 
1202   public void deleteTableDir(TableName table) throws IOException {
1203     Path rootDir = FSUtils.getRootDir(conf);
1204     FileSystem fs = rootDir.getFileSystem(conf);
1205     Path p = FSUtils.getTableDir(rootDir, table);
1206     HBaseFsck.debugLsr(conf, p);
1207     boolean success = fs.delete(p, true);
1208     LOG.info("Deleted " + p + " sucessfully? " + success);
1209   }
1210 
1211   /**
1212    * when the hbase.version file missing, It is fix the fault.
1213    */
1214   @Test (timeout=180000)
1215   public void testNoVersionFile() throws Exception {
1216     // delete the hbase.version file
1217     Path rootDir = FSUtils.getRootDir(conf);
1218     FileSystem fs = rootDir.getFileSystem(conf);
1219     Path versionFile = new Path(rootDir, HConstants.VERSION_FILE_NAME);
1220     fs.delete(versionFile, true);
1221 
1222     // test
1223     HBaseFsck hbck = doFsck(conf, false);
1224     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_VERSION_FILE });
1225     // fix hbase.version missing
1226     doFsck(conf, true);
1227 
1228     // no version file fixed
1229     assertNoErrors(doFsck(conf, false));
1230   }
1231 
1232   /**
1233    * The region is not deployed when the table is disabled.
1234    */
1235   @Test (timeout=180000)
1236   public void testRegionShouldNotBeDeployed() throws Exception {
1237     TableName table =
1238         TableName.valueOf("tableRegionShouldNotBeDeployed");
1239     try {
1240       LOG.info("Starting testRegionShouldNotBeDeployed.");
1241       MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
1242       assertTrue(cluster.waitForActiveAndReadyMaster());
1243 
1244 
1245       byte[][] SPLIT_KEYS = new byte[][] { new byte[0], Bytes.toBytes("aaa"),
1246           Bytes.toBytes("bbb"), Bytes.toBytes("ccc"), Bytes.toBytes("ddd") };
1247       HTableDescriptor htdDisabled = new HTableDescriptor(table);
1248       htdDisabled.addFamily(new HColumnDescriptor(FAM));
1249 
1250       // Write the .tableinfo
1251       FSTableDescriptors fstd = new FSTableDescriptors(conf);
1252       fstd.createTableDescriptor(htdDisabled);
1253       List<HRegionInfo> disabledRegions =
1254           TEST_UTIL.createMultiRegionsInMeta(conf, htdDisabled, SPLIT_KEYS);
1255 
1256       // Let's just assign everything to first RS
1257       HRegionServer hrs = cluster.getRegionServer(0);
1258 
1259       // Create region files.
1260       admin.disableTable(table);
1261       admin.enableTable(table);
1262 
1263       // Disable the table and close its regions
1264       admin.disableTable(table);
1265       HRegionInfo region = disabledRegions.remove(0);
1266       byte[] regionName = region.getRegionName();
1267 
1268       // The region should not be assigned currently
1269       assertTrue(cluster.getServerWith(regionName) == -1);
1270 
1271       // Directly open a region on a region server.
1272       // If going through AM/ZK, the region won't be open.
1273       // Even it is opened, AM will close it which causes
1274       // flakiness of this test.
1275       HRegion r = HRegion.openHRegion(
1276         region, htdDisabled, hrs.getWAL(region), conf);
1277       hrs.addToOnlineRegions(r);
1278 
1279       HBaseFsck hbck = doFsck(conf, false);
1280       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.SHOULD_NOT_BE_DEPLOYED });
1281 
1282       // fix this fault
1283       doFsck(conf, true);
1284 
1285       // check result
1286       assertNoErrors(doFsck(conf, false));
1287     } finally {
1288       admin.enableTable(table);
1289       cleanupTable(table);
1290     }
1291   }
1292 
1293   /**
1294    * This creates two tables and mess both of them and fix them one by one
1295    */
1296   @Test (timeout=180000)
1297   public void testFixByTable() throws Exception {
1298     TableName table1 =
1299         TableName.valueOf("testFixByTable1");
1300     TableName table2 =
1301         TableName.valueOf("testFixByTable2");
1302     try {
1303       setupTable(table1);
1304       // make sure data in regions, if in wal only there is no data loss
1305       admin.flush(table1);
1306       // Mess them up by leaving a hole in the hdfs data
1307       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1308         Bytes.toBytes("C"), false, false, true); // don't rm meta
1309 
1310       setupTable(table2);
1311       // make sure data in regions, if in wal only there is no data loss
1312       admin.flush(table2);
1313       // Mess them up by leaving a hole in the hdfs data
1314       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1315         Bytes.toBytes("C"), false, false, true); // don't rm meta
1316 
1317       HBaseFsck hbck = doFsck(conf, false);
1318       assertErrors(hbck, new ERROR_CODE[] {
1319         ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS});
1320 
1321       // fix hole in table 1
1322       doFsck(conf, true, table1);
1323       // check that hole in table 1 fixed
1324       assertNoErrors(doFsck(conf, false, table1));
1325       // check that hole in table 2 still there
1326       assertErrors(doFsck(conf, false, table2),
1327         new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1328 
1329       // fix hole in table 2
1330       doFsck(conf, true, table2);
1331       // check that hole in both tables fixed
1332       assertNoErrors(doFsck(conf, false));
1333       assertEquals(ROWKEYS.length - 2, countRows());
1334     } finally {
1335       cleanupTable(table1);
1336       cleanupTable(table2);
1337     }
1338   }
1339   /**
1340    * A split parent in meta, in hdfs, and not deployed
1341    */
1342   @Test (timeout=180000)
1343   public void testLingeringSplitParent() throws Exception {
1344     TableName table =
1345         TableName.valueOf("testLingeringSplitParent");
1346     Table meta = null;
1347     try {
1348       setupTable(table);
1349       assertEquals(ROWKEYS.length, countRows());
1350 
1351       // make sure data in regions, if in wal only there is no data loss
1352       admin.flush(table);
1353       HRegionLocation location = tbl.getRegionLocation("B");
1354 
1355       // Delete one region from meta, but not hdfs, unassign it.
1356       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1357         Bytes.toBytes("C"), true, true, false);
1358 
1359       // Create a new meta entry to fake it as a split parent.
1360       meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
1361       HRegionInfo hri = location.getRegionInfo();
1362 
1363       HRegionInfo a = new HRegionInfo(tbl.getName(),
1364         Bytes.toBytes("B"), Bytes.toBytes("BM"));
1365       HRegionInfo b = new HRegionInfo(tbl.getName(),
1366         Bytes.toBytes("BM"), Bytes.toBytes("C"));
1367 
1368       hri.setOffline(true);
1369       hri.setSplit(true);
1370 
1371       MetaTableAccessor.addRegionToMeta(meta, hri, a, b);
1372       meta.close();
1373       admin.flush(TableName.META_TABLE_NAME);
1374 
1375       HBaseFsck hbck = doFsck(conf, false);
1376       assertErrors(hbck, new ERROR_CODE[] {
1377         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1378 
1379       // regular repair cannot fix lingering split parent
1380       hbck = doFsck(conf, true);
1381       assertErrors(hbck, new ERROR_CODE[] {
1382         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN });
1383       assertFalse(hbck.shouldRerun());
1384       hbck = doFsck(conf, false);
1385       assertErrors(hbck, new ERROR_CODE[] {
1386         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1387 
1388       // fix lingering split parent
1389       hbck = new HBaseFsck(conf, hbfsckExecutorService);
1390       hbck.connect();
1391       hbck.setDisplayFullReport(); // i.e. -details
1392       hbck.setTimeLag(0);
1393       hbck.setFixSplitParents(true);
1394       hbck.onlineHbck();
1395       assertTrue(hbck.shouldRerun());
1396       hbck.close();
1397 
1398       Get get = new Get(hri.getRegionName());
1399       Result result = meta.get(get);
1400       assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
1401         HConstants.SPLITA_QUALIFIER).isEmpty());
1402       assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
1403         HConstants.SPLITB_QUALIFIER).isEmpty());
1404       admin.flush(TableName.META_TABLE_NAME);
1405 
1406       // fix other issues
1407       doFsck(conf, true);
1408 
1409       // check that all are fixed
1410       assertNoErrors(doFsck(conf, false));
1411       assertEquals(ROWKEYS.length, countRows());
1412     } finally {
1413       cleanupTable(table);
1414       IOUtils.closeQuietly(meta);
1415     }
1416   }
1417 
1418   /**
1419    * Tests that LINGERING_SPLIT_PARENT is not erroneously reported for
1420    * valid cases where the daughters are there.
1421    */
1422   @Test (timeout=180000)
1423   public void testValidLingeringSplitParent() throws Exception {
1424     TableName table =
1425         TableName.valueOf("testLingeringSplitParent");
1426     Table meta = null;
1427     try {
1428       setupTable(table);
1429       assertEquals(ROWKEYS.length, countRows());
1430 
1431       // make sure data in regions, if in wal only there is no data loss
1432       admin.flush(table);
1433       HRegionLocation location = tbl.getRegionLocation(Bytes.toBytes("B"));
1434 
1435       meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
1436       HRegionInfo hri = location.getRegionInfo();
1437 
1438       // do a regular split
1439       byte[] regionName = location.getRegionInfo().getRegionName();
1440       admin.splitRegion(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
1441       TestEndToEndSplitTransaction.blockUntilRegionSplit(conf, 60000, regionName, true);
1442 
1443       // TODO: fixHdfsHoles does not work against splits, since the parent dir lingers on
1444       // for some time until children references are deleted. HBCK erroneously sees this as
1445       // overlapping regions
1446       HBaseFsck hbck = doFsck(
1447         conf, true, true, false, false, false, true, true, true, false, false, false, null);
1448       assertErrors(hbck, new ERROR_CODE[] {}); //no LINGERING_SPLIT_PARENT reported
1449 
1450       // assert that the split hbase:meta entry is still there.
1451       Get get = new Get(hri.getRegionName());
1452       Result result = meta.get(get);
1453       assertNotNull(result);
1454       assertNotNull(MetaTableAccessor.getHRegionInfo(result));
1455 
1456       assertEquals(ROWKEYS.length, countRows());
1457 
1458       // assert that we still have the split regions
1459       assertEquals(tbl.getStartKeys().length, SPLITS.length + 1 + 1); //SPLITS + 1 is # regions pre-split.
1460       assertNoErrors(doFsck(conf, false));
1461     } finally {
1462       cleanupTable(table);
1463       IOUtils.closeQuietly(meta);
1464     }
1465   }
1466 
1467   /**
1468    * Split crashed after write to hbase:meta finished for the parent region, but
1469    * failed to write daughters (pre HBASE-7721 codebase)
1470    */
1471   @Test(timeout=75000)
1472   public void testSplitDaughtersNotInMeta() throws Exception {
1473     TableName table = TableName.valueOf("testSplitdaughtersNotInMeta");
1474     Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
1475     try {
1476       setupTable(table);
1477       assertEquals(ROWKEYS.length, countRows());
1478 
1479       // make sure data in regions, if in wal only there is no data loss
1480       admin.flush(table);
1481       HRegionLocation location = tbl.getRegionLocation(Bytes.toBytes("B"));
1482 
1483       HRegionInfo hri = location.getRegionInfo();
1484 
1485       // do a regular split
1486       byte[] regionName = location.getRegionInfo().getRegionName();
1487       admin.splitRegion(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
1488       TestEndToEndSplitTransaction.blockUntilRegionSplit(conf, 60000, regionName, true);
1489 
1490       PairOfSameType<HRegionInfo> daughters =
1491           MetaTableAccessor.getDaughterRegions(meta.get(new Get(regionName)));
1492 
1493       // Delete daughter regions from meta, but not hdfs, unassign it.
1494       Map<HRegionInfo, ServerName> hris = tbl.getRegionLocations();
1495       undeployRegion(connection, hris.get(daughters.getFirst()), daughters.getFirst());
1496       undeployRegion(connection, hris.get(daughters.getSecond()), daughters.getSecond());
1497 
1498       List<Delete> deletes = new ArrayList<>();
1499       deletes.add(new Delete(daughters.getFirst().getRegionName()));
1500       deletes.add(new Delete(daughters.getSecond().getRegionName()));
1501       meta.delete(deletes);
1502 
1503       // Remove daughters from regionStates
1504       RegionStates regionStates = TEST_UTIL.getMiniHBaseCluster().getMaster().
1505           getAssignmentManager().getRegionStates();
1506       regionStates.deleteRegion(daughters.getFirst());
1507       regionStates.deleteRegion(daughters.getSecond());
1508 
1509       HBaseFsck hbck = doFsck(conf, false);
1510       assertErrors(hbck,
1511           new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1512               ERROR_CODE.HOLE_IN_REGION_CHAIN }); //no LINGERING_SPLIT_PARENT
1513 
1514       // now fix it. The fix should not revert the region split, but add daughters to META
1515       hbck = doFsck(
1516         conf, true, true, false, false, false, false, false, false, false, false, false, null);
1517       assertErrors(hbck,
1518           new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1519               ERROR_CODE.HOLE_IN_REGION_CHAIN });
1520 
1521       // assert that the split hbase:meta entry is still there.
1522       Get get = new Get(hri.getRegionName());
1523       Result result = meta.get(get);
1524       assertNotNull(result);
1525       assertNotNull(MetaTableAccessor.getHRegionInfo(result));
1526 
1527       assertEquals(ROWKEYS.length, countRows());
1528 
1529       // assert that we still have the split regions
1530       assertEquals(tbl.getStartKeys().length, SPLITS.length + 1 + 1); //SPLITS + 1 is # regions pre-split.
1531       assertNoErrors(doFsck(conf, false)); //should be fixed by now
1532     } finally {
1533       meta.close();
1534       cleanupTable(table);
1535     }
1536   }
1537 
1538   /**
1539    * This creates and fixes a bad table with a missing region which is the 1st region -- hole in
1540    * meta and data missing in the fs.
1541    */
1542   @Test(timeout=120000)
1543   public void testMissingFirstRegion() throws Exception {
1544     TableName table = TableName.valueOf("testMissingFirstRegion");
1545     try {
1546       setupTable(table);
1547       assertEquals(ROWKEYS.length, countRows());
1548 
1549       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1550       admin.disableTable(table);
1551       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""), Bytes.toBytes("A"), true,
1552           true, true);
1553       admin.enableTable(table);
1554 
1555       HBaseFsck hbck = doFsck(conf, false);
1556       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY });
1557       // fix hole
1558       doFsck(conf, true);
1559       // check that hole fixed
1560       assertNoErrors(doFsck(conf, false));
1561     } finally {
1562       cleanupTable(table);
1563     }
1564   }
1565 
1566   /**
1567    * This creates and fixes a bad table with a missing region which is the 1st region -- hole in
1568    * meta and data missing in the fs.
1569    */
1570   @Test(timeout=120000)
1571   public void testRegionDeployedNotInHdfs() throws Exception {
1572     TableName table =
1573         TableName.valueOf("testSingleRegionDeployedNotInHdfs");
1574     try {
1575       setupTable(table);
1576       admin.flush(table);
1577 
1578       // Mess it up by deleting region dir
1579       deleteRegion(conf, tbl.getTableDescriptor(),
1580         HConstants.EMPTY_START_ROW, Bytes.toBytes("A"), false,
1581         false, true);
1582 
1583       HBaseFsck hbck = doFsck(conf, false);
1584       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_HDFS });
1585       // fix hole
1586       doFsck(conf, true);
1587       // check that hole fixed
1588       assertNoErrors(doFsck(conf, false));
1589     } finally {
1590       cleanupTable(table);
1591     }
1592   }
1593 
1594   /**
1595    * This creates and fixes a bad table with missing last region -- hole in meta and data missing in
1596    * the fs.
1597    */
1598   @Test(timeout=120000)
1599   public void testMissingLastRegion() throws Exception {
1600     TableName table =
1601         TableName.valueOf("testMissingLastRegion");
1602     try {
1603       setupTable(table);
1604       assertEquals(ROWKEYS.length, countRows());
1605 
1606       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1607       admin.disableTable(table);
1608       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"), Bytes.toBytes(""), true,
1609           true, true);
1610       admin.enableTable(table);
1611 
1612       HBaseFsck hbck = doFsck(conf, false);
1613       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY });
1614       // fix hole
1615       doFsck(conf, true);
1616       // check that hole fixed
1617       assertNoErrors(doFsck(conf, false));
1618     } finally {
1619       cleanupTable(table);
1620     }
1621   }
1622 
1623   /**
1624    * Test -noHdfsChecking option can detect and fix assignments issue.
1625    */
1626   @Test (timeout=180000)
1627   public void testFixAssignmentsAndNoHdfsChecking() throws Exception {
1628     TableName table =
1629         TableName.valueOf("testFixAssignmentsAndNoHdfsChecking");
1630     try {
1631       setupTable(table);
1632       assertEquals(ROWKEYS.length, countRows());
1633 
1634       // Mess it up by closing a region
1635       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1636         Bytes.toBytes("B"), true, false, false, false);
1637 
1638       // verify there is no other errors
1639       HBaseFsck hbck = doFsck(conf, false);
1640       assertErrors(hbck, new ERROR_CODE[] {
1641         ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1642 
1643       // verify that noHdfsChecking report the same errors
1644       HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
1645       fsck.connect();
1646       fsck.setDisplayFullReport(); // i.e. -details
1647       fsck.setTimeLag(0);
1648       fsck.setCheckHdfs(false);
1649       fsck.onlineHbck();
1650       assertErrors(fsck, new ERROR_CODE[] {
1651         ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1652       fsck.close();
1653 
1654       // verify that fixAssignments works fine with noHdfsChecking
1655       fsck = new HBaseFsck(conf, hbfsckExecutorService);
1656       fsck.connect();
1657       fsck.setDisplayFullReport(); // i.e. -details
1658       fsck.setTimeLag(0);
1659       fsck.setCheckHdfs(false);
1660       fsck.setFixAssignments(true);
1661       fsck.onlineHbck();
1662       assertTrue(fsck.shouldRerun());
1663       fsck.onlineHbck();
1664       assertNoErrors(fsck);
1665 
1666       assertEquals(ROWKEYS.length, countRows());
1667 
1668       fsck.close();
1669     } finally {
1670       cleanupTable(table);
1671     }
1672   }
1673 
1674   /**
1675    * Test -noHdfsChecking option can detect region is not in meta but deployed.
1676    * However, it can not fix it without checking Hdfs because we need to get
1677    * the region info from Hdfs in this case, then to patch the meta.
1678    */
1679   @Test (timeout=180000)
1680   public void testFixMetaNotWorkingWithNoHdfsChecking() throws Exception {
1681     TableName table =
1682         TableName.valueOf("testFixMetaNotWorkingWithNoHdfsChecking");
1683     try {
1684       setupTable(table);
1685       assertEquals(ROWKEYS.length, countRows());
1686 
1687       // Mess it up by deleting a region from the metadata
1688       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1689         Bytes.toBytes("B"), false, true, false, false);
1690 
1691       // verify there is no other errors
1692       HBaseFsck hbck = doFsck(conf, false);
1693       assertErrors(hbck,
1694           new ERROR_CODE[] { ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN });
1695 
1696       // verify that noHdfsChecking report the same errors
1697       HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
1698       fsck.connect();
1699       fsck.setDisplayFullReport(); // i.e. -details
1700       fsck.setTimeLag(0);
1701       fsck.setCheckHdfs(false);
1702       fsck.onlineHbck();
1703       assertErrors(fsck,
1704           new ERROR_CODE[] { ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN });
1705       fsck.close();
1706 
1707       // verify that fixMeta doesn't work with noHdfsChecking
1708       fsck = new HBaseFsck(conf, hbfsckExecutorService);
1709       fsck.connect();
1710       fsck.setDisplayFullReport(); // i.e. -details
1711       fsck.setTimeLag(0);
1712       fsck.setCheckHdfs(false);
1713       fsck.setFixAssignments(true);
1714       fsck.setFixMeta(true);
1715       fsck.onlineHbck();
1716       assertFalse(fsck.shouldRerun());
1717       assertErrors(fsck,
1718           new ERROR_CODE[] { ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN });
1719       fsck.close();
1720 
1721       // fix the cluster so other tests won't be impacted
1722       fsck = doFsck(conf, true);
1723       assertTrue(fsck.shouldRerun());
1724       fsck = doFsck(conf, true);
1725       assertNoErrors(fsck);
1726     } finally {
1727       cleanupTable(table);
1728     }
1729   }
1730 
1731   /**
1732    * Test -fixHdfsHoles doesn't work with -noHdfsChecking option,
1733    * and -noHdfsChecking can't detect orphan Hdfs region.
1734    */
1735   @Test (timeout=180000)
1736   public void testFixHdfsHolesNotWorkingWithNoHdfsChecking() throws Exception {
1737     TableName table =
1738         TableName.valueOf("testFixHdfsHolesNotWorkingWithNoHdfsChecking");
1739     try {
1740       setupTable(table);
1741       assertEquals(ROWKEYS.length, countRows());
1742 
1743       // Mess it up by creating an overlap in the metadata
1744       admin.disableTable(table);
1745       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1746         Bytes.toBytes("B"), true, true, false, true);
1747       admin.enableTable(table);
1748 
1749       HRegionInfo hriOverlap =
1750           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B"));
1751       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
1752       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
1753         .waitForAssignment(hriOverlap);
1754       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
1755       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
1756 
1757       HBaseFsck hbck = doFsck(conf, false);
1758       assertErrors(hbck, new ERROR_CODE[] {
1759         ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1760         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1761 
1762       // verify that noHdfsChecking can't detect ORPHAN_HDFS_REGION
1763       HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
1764       fsck.connect();
1765       fsck.setDisplayFullReport(); // i.e. -details
1766       fsck.setTimeLag(0);
1767       fsck.setCheckHdfs(false);
1768       fsck.onlineHbck();
1769       assertErrors(fsck, new ERROR_CODE[] {
1770         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1771       fsck.close();
1772 
1773       // verify that fixHdfsHoles doesn't work with noHdfsChecking
1774       fsck = new HBaseFsck(conf, hbfsckExecutorService);
1775       fsck.connect();
1776       fsck.setDisplayFullReport(); // i.e. -details
1777       fsck.setTimeLag(0);
1778       fsck.setCheckHdfs(false);
1779       fsck.setFixHdfsHoles(true);
1780       fsck.setFixHdfsOverlaps(true);
1781       fsck.setFixHdfsOrphans(true);
1782       fsck.onlineHbck();
1783       assertFalse(fsck.shouldRerun());
1784       assertErrors(fsck, new ERROR_CODE[] { ERROR_CODE.HOLE_IN_REGION_CHAIN});
1785       fsck.close();
1786     } finally {
1787       if (admin.isTableDisabled(table)) {
1788         admin.enableTable(table);
1789       }
1790       cleanupTable(table);
1791     }
1792   }
1793 
1794   /**
1795    * We don't have an easy way to verify that a flush completed, so we loop until we find a
1796    * legitimate hfile and return it.
1797    * @param fs
1798    * @param table
1799    * @return Path of a flushed hfile.
1800    * @throws IOException
1801    */
1802   Path getFlushedHFile(FileSystem fs, TableName table) throws IOException {
1803     Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
1804     Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
1805     Path famDir = new Path(regionDir, FAM_STR);
1806 
1807     // keep doing this until we get a legit hfile
1808     while (true) {
1809       FileStatus[] hfFss = fs.listStatus(famDir);
1810       if (hfFss.length == 0) {
1811         continue;
1812       }
1813       for (FileStatus hfs : hfFss) {
1814         if (!hfs.isDirectory()) {
1815           return hfs.getPath();
1816         }
1817       }
1818     }
1819   }
1820 
1821   /**
1822    * This creates a table and then corrupts an hfile.  Hbck should quarantine the file.
1823    */
1824   @Test(timeout=180000)
1825   public void testQuarantineCorruptHFile() throws Exception {
1826     TableName table = TableName.valueOf(name.getMethodName());
1827     try {
1828       setupTable(table);
1829       assertEquals(ROWKEYS.length, countRows());
1830       admin.flush(table); // flush is async.
1831 
1832       FileSystem fs = FileSystem.get(conf);
1833       Path hfile = getFlushedHFile(fs, table);
1834 
1835       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1836       admin.disableTable(table);
1837 
1838       // create new corrupt file called deadbeef (valid hfile name)
1839       Path corrupt = new Path(hfile.getParent(), "deadbeef");
1840       TestHFile.truncateFile(fs, hfile, corrupt);
1841       LOG.info("Created corrupted file " + corrupt);
1842       HBaseFsck.debugLsr(conf, FSUtils.getRootDir(conf));
1843 
1844       // we cannot enable here because enable never finished due to the corrupt region.
1845       HBaseFsck res = HbckTestingUtil.doHFileQuarantine(conf, table);
1846       assertEquals(res.getRetCode(), 0);
1847       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
1848       assertEquals(hfcc.getHFilesChecked(), 5);
1849       assertEquals(hfcc.getCorrupted().size(), 1);
1850       assertEquals(hfcc.getFailures().size(), 0);
1851       assertEquals(hfcc.getQuarantined().size(), 1);
1852       assertEquals(hfcc.getMissing().size(), 0);
1853 
1854       // Its been fixed, verify that we can enable.
1855       admin.enableTable(table);
1856     } finally {
1857       cleanupTable(table);
1858     }
1859   }
1860 
1861   /**
1862    * Test that use this should have a timeout, because this method could potentially wait forever.
1863   */
1864   private void doQuarantineTest(TableName table, HBaseFsck hbck, int check,
1865                                 int corrupt, int fail, int quar, int missing) throws Exception {
1866     try {
1867       setupTable(table);
1868       assertEquals(ROWKEYS.length, countRows());
1869       admin.flush(table); // flush is async.
1870 
1871       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1872       admin.disableTable(table);
1873 
1874       String[] args = {"-sidelineCorruptHFiles", "-repairHoles", "-ignorePreCheckPermission",
1875           table.getNameAsString()};
1876       HBaseFsck res = hbck.exec(hbfsckExecutorService, args);
1877 
1878       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
1879       assertEquals(hfcc.getHFilesChecked(), check);
1880       assertEquals(hfcc.getCorrupted().size(), corrupt);
1881       assertEquals(hfcc.getFailures().size(), fail);
1882       assertEquals(hfcc.getQuarantined().size(), quar);
1883       assertEquals(hfcc.getMissing().size(), missing);
1884 
1885       // its been fixed, verify that we can enable
1886       admin.enableTableAsync(table);
1887       while (!admin.isTableEnabled(table)) {
1888         try {
1889           Thread.sleep(250);
1890         } catch (InterruptedException e) {
1891           e.printStackTrace();
1892           fail("Interrupted when trying to enable table " + table);
1893         }
1894       }
1895     } finally {
1896       cleanupTable(table);
1897     }
1898   }
1899 
1900   /**
1901    * This creates a table and simulates the race situation where a concurrent compaction or split
1902    * has removed an hfile after the corruption checker learned about it.
1903    */
1904   @Test(timeout=180000)
1905   public void testQuarantineMissingHFile() throws Exception {
1906     TableName table = TableName.valueOf(name.getMethodName());
1907 
1908     // inject a fault in the hfcc created.
1909     final FileSystem fs = FileSystem.get(conf);
1910     HBaseFsck hbck = new HBaseFsck(conf, hbfsckExecutorService) {
1911       @Override
1912       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
1913         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1914           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
1915           @Override
1916           protected void checkHFile(Path p) throws IOException {
1917             if (attemptedFirstHFile.compareAndSet(false, true)) {
1918               assertTrue(fs.delete(p, true)); // make sure delete happened.
1919             }
1920             super.checkHFile(p);
1921           }
1922         };
1923       }
1924     };
1925     doQuarantineTest(table, hbck, 4, 0, 0, 0, 1); // 4 attempted, but 1 missing.
1926     hbck.close();
1927   }
1928 
1929   /**
1930    * This creates a table and simulates the race situation where a concurrent compaction or split
1931    * has removed an colfam dir before the corruption checker got to it.
1932    */
1933   // Disabled because fails sporadically.  Is this test right?  Timing-wise, there could be no
1934   // files in a column family on initial creation -- as suggested by Matteo.
1935   @Ignore @Test(timeout=180000)
1936   public void testQuarantineMissingFamdir() throws Exception {
1937     TableName table = TableName.valueOf(name.getMethodName());
1938     // inject a fault in the hfcc created.
1939     final FileSystem fs = FileSystem.get(conf);
1940     HBaseFsck hbck = new HBaseFsck(conf, hbfsckExecutorService) {
1941       @Override
1942       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
1943         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1944           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
1945           @Override
1946           protected void checkColFamDir(Path p) throws IOException {
1947             if (attemptedFirstHFile.compareAndSet(false, true)) {
1948               assertTrue(fs.delete(p, true)); // make sure delete happened.
1949             }
1950             super.checkColFamDir(p);
1951           }
1952         };
1953       }
1954     };
1955     doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
1956     hbck.close();
1957   }
1958 
1959   /**
1960    * This creates a table and simulates the race situation where a concurrent compaction or split
1961    * has removed a region dir before the corruption checker got to it.
1962    */
1963   @Test(timeout=180000)
1964   public void testQuarantineMissingRegionDir() throws Exception {
1965     TableName table = TableName.valueOf(name.getMethodName());
1966     // inject a fault in the hfcc created.
1967     final FileSystem fs = FileSystem.get(conf);
1968     HBaseFsck hbck = new HBaseFsck(conf, hbfsckExecutorService) {
1969       @Override
1970       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles)
1971       throws IOException {
1972         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1973           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
1974           @Override
1975           protected void checkRegionDir(Path p) throws IOException {
1976             if (attemptedFirstHFile.compareAndSet(false, true)) {
1977               assertTrue(fs.delete(p, true)); // make sure delete happened.
1978             }
1979             super.checkRegionDir(p);
1980           }
1981         };
1982       }
1983     };
1984     doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
1985     hbck.close();
1986   }
1987 
1988   /**
1989    * Test fixing lingering reference file.
1990    */
1991   @Test (timeout=180000)
1992   public void testLingeringReferenceFile() throws Exception {
1993     TableName table =
1994         TableName.valueOf("testLingeringReferenceFile");
1995     try {
1996       setupTable(table);
1997       assertEquals(ROWKEYS.length, countRows());
1998 
1999       // Mess it up by creating a fake reference file
2000       FileSystem fs = FileSystem.get(conf);
2001       Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
2002       Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
2003       Path famDir = new Path(regionDir, FAM_STR);
2004       Path fakeReferenceFile = new Path(famDir, "fbce357483ceea.12144538");
2005       fs.create(fakeReferenceFile);
2006 
2007       HBaseFsck hbck = doFsck(conf, false);
2008       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LINGERING_REFERENCE_HFILE });
2009       // fix reference file
2010       doFsck(conf, true);
2011       // check that reference file fixed
2012       assertNoErrors(doFsck(conf, false));
2013     } finally {
2014       cleanupTable(table);
2015     }
2016   }
2017 
2018   /**
2019    * Test mission REGIONINFO_QUALIFIER in hbase:meta
2020    */
2021   @Test (timeout=180000)
2022   public void testMissingRegionInfoQualifier() throws Exception {
2023     Connection connection = ConnectionFactory.createConnection(conf);
2024     TableName table = TableName.valueOf("testMissingRegionInfoQualifier");
2025     try {
2026       setupTable(table);
2027 
2028       // Mess it up by removing the RegionInfo for one region.
2029       final List<Delete> deletes = new LinkedList<Delete>();
2030       Table meta = connection.getTable(TableName.META_TABLE_NAME, hbfsckExecutorService);
2031       MetaScanner.metaScan(connection, new MetaScanner.MetaScannerVisitor() {
2032 
2033         @Override
2034         public boolean processRow(Result rowResult) throws IOException {
2035           HRegionInfo hri = MetaTableAccessor.getHRegionInfo(rowResult);
2036           if (hri != null && !hri.getTable().isSystemTable()) {
2037             Delete delete = new Delete(rowResult.getRow());
2038             delete.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
2039             deletes.add(delete);
2040           }
2041           return true;
2042         }
2043 
2044         @Override
2045         public void close() throws IOException {
2046         }
2047       });
2048       meta.delete(deletes);
2049 
2050       // Mess it up by creating a fake hbase:meta entry with no associated RegionInfo
2051       meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
2052         HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER, Bytes.toBytes("node1:60020")));
2053       meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
2054         HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER, Bytes.toBytes(1362150791183L)));
2055       meta.close();
2056 
2057       HBaseFsck hbck = doFsck(conf, false);
2058       assertTrue(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
2059 
2060       // fix reference file
2061       hbck = doFsck(conf, true);
2062 
2063       // check that reference file fixed
2064       assertFalse(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
2065     } finally {
2066       cleanupTable(table);
2067     }
2068     connection.close();
2069   }
2070 
2071   /**
2072    * Test pluggable error reporter. It can be plugged in
2073    * from system property or configuration.
2074    */
2075   @Test (timeout=180000)
2076   public void testErrorReporter() throws Exception {
2077     try {
2078       MockErrorReporter.calledCount = 0;
2079       doFsck(conf, false);
2080       assertEquals(MockErrorReporter.calledCount, 0);
2081 
2082       conf.set("hbasefsck.errorreporter", MockErrorReporter.class.getName());
2083       doFsck(conf, false);
2084       assertTrue(MockErrorReporter.calledCount > 20);
2085     } finally {
2086       conf.set("hbasefsck.errorreporter",
2087         PrintingErrorReporter.class.getName());
2088       MockErrorReporter.calledCount = 0;
2089     }
2090   }
2091 
2092   static class MockErrorReporter implements ErrorReporter {
2093     static int calledCount = 0;
2094 
2095     @Override
2096     public void clear() {
2097       calledCount++;
2098     }
2099 
2100     @Override
2101     public void report(String message) {
2102       calledCount++;
2103     }
2104 
2105     @Override
2106     public void reportError(String message) {
2107       calledCount++;
2108     }
2109 
2110     @Override
2111     public void reportError(ERROR_CODE errorCode, String message) {
2112       calledCount++;
2113     }
2114 
2115     @Override
2116     public void reportError(ERROR_CODE errorCode, String message, TableInfo table) {
2117       calledCount++;
2118     }
2119 
2120     @Override
2121     public void reportError(ERROR_CODE errorCode,
2122         String message, TableInfo table, HbckInfo info) {
2123       calledCount++;
2124     }
2125 
2126     @Override
2127     public void reportError(ERROR_CODE errorCode, String message,
2128         TableInfo table, HbckInfo info1, HbckInfo info2) {
2129       calledCount++;
2130     }
2131 
2132     @Override
2133     public int summarize() {
2134       return ++calledCount;
2135     }
2136 
2137     @Override
2138     public void detail(String details) {
2139       calledCount++;
2140     }
2141 
2142     @Override
2143     public ArrayList<ERROR_CODE> getErrorList() {
2144       calledCount++;
2145       return new ArrayList<ERROR_CODE>();
2146     }
2147 
2148     @Override
2149     public void progress() {
2150       calledCount++;
2151     }
2152 
2153     @Override
2154     public void print(String message) {
2155       calledCount++;
2156     }
2157 
2158     @Override
2159     public void resetErrors() {
2160       calledCount++;
2161     }
2162 
2163     @Override
2164     public boolean tableHasErrors(TableInfo table) {
2165       calledCount++;
2166       return false;
2167     }
2168   }
2169 
2170   @Test(timeout=180000)
2171   public void testCheckTableLocks() throws Exception {
2172     IncrementingEnvironmentEdge edge = new IncrementingEnvironmentEdge(0);
2173     EnvironmentEdgeManager.injectEdge(edge);
2174     // check no errors
2175     HBaseFsck hbck = doFsck(conf, false);
2176     assertNoErrors(hbck);
2177 
2178     ServerName mockName = ServerName.valueOf("localhost", 60000, 1);
2179 
2180     // obtain one lock
2181     final TableLockManager tableLockManager = TableLockManager.createTableLockManager(conf, TEST_UTIL.getZooKeeperWatcher(), mockName);
2182     TableLock writeLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2183         "testCheckTableLocks");
2184     writeLock.acquire();
2185     hbck = doFsck(conf, false);
2186     assertNoErrors(hbck); // should not have expired, no problems
2187 
2188     edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2189         TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2190 
2191     hbck = doFsck(conf, false);
2192     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK});
2193 
2194     final CountDownLatch latch = new CountDownLatch(1);
2195     new Thread() {
2196       @Override
2197       public void run() {
2198         TableLock readLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2199             "testCheckTableLocks");
2200         try {
2201           latch.countDown();
2202           readLock.acquire();
2203         } catch (IOException ex) {
2204           fail();
2205         } catch (IllegalStateException ex) {
2206           return; // expected, since this will be reaped under us.
2207         }
2208         fail("should not have come here");
2209       };
2210     }.start();
2211 
2212     latch.await(); // wait until thread starts
2213     Threads.sleep(300); // wait some more to ensure writeLock.acquire() is called
2214 
2215     hbck = doFsck(conf, false);
2216     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK}); // still one expired, one not-expired
2217 
2218     edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2219         TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2220 
2221     hbck = doFsck(conf, false);
2222     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK, ERROR_CODE.EXPIRED_TABLE_LOCK}); // both are expired
2223 
2224     conf.setLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT, 1); // reaping from ZKInterProcessWriteLock uses znode cTime,
2225                                                                  // which is not injectable through EnvironmentEdge
2226     Threads.sleep(10);
2227     hbck = doFsck(conf, true); // now fix both cases
2228 
2229     hbck = doFsck(conf, false);
2230     assertNoErrors(hbck);
2231 
2232     // ensure that locks are deleted
2233     writeLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2234         "should acquire without blocking");
2235     writeLock.acquire(); // this should not block.
2236     writeLock.release(); // release for clean state
2237   }
2238 
2239   /**
2240    * Test orphaned table ZNode (for table states)
2241    */
2242   @Test
2243   public void testOrphanedTableZNode() throws Exception {
2244     TableName table = TableName.valueOf("testOrphanedZKTableEntry");
2245 
2246     try {
2247       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager().getTableStateManager()
2248       .setTableState(table, ZooKeeperProtos.Table.State.ENABLING);
2249 
2250       try {
2251         setupTable(table);
2252         Assert.fail(
2253           "Create table should fail when its ZNode has already existed with ENABLING state.");
2254       } catch(TableExistsException t) {
2255         //Expected exception
2256       }
2257       // The setup table was interrupted in some state that needs to some cleanup.
2258       try {
2259         cleanupTable(table);
2260       } catch (IOException e) {
2261         // Because create table failed, it is expected that the cleanup table would
2262         // throw some exception.  Ignore and continue.
2263       }
2264 
2265       HBaseFsck hbck = doFsck(conf, false);
2266       assertTrue(hbck.getErrors().getErrorList().contains(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY));
2267 
2268       // fix the orphaned ZK entry
2269       hbck = doFsck(conf, true);
2270 
2271       // check that orpahned ZK table entry is gone.
2272       hbck = doFsck(conf, false);
2273       assertFalse(hbck.getErrors().getErrorList().contains(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY));
2274       // Now create table should succeed.
2275       setupTable(table);
2276     } finally {
2277       // This code could be called that either a table was created successfully or set up
2278       // table failed in some unknown state.  Therefore, clean up can either succeed or fail.
2279       try {
2280         cleanupTable(table);
2281       } catch (IOException e) {
2282         // The cleanup table would throw some exception if create table failed in some state.
2283         // Ignore this exception
2284       }
2285     }
2286   }
2287 
2288   @Test (timeout=180000)
2289   public void testMetaOffline() throws Exception {
2290     // check no errors
2291     HBaseFsck hbck = doFsck(conf, false);
2292     assertNoErrors(hbck);
2293     deleteMetaRegion(conf, true, false, false);
2294     hbck = doFsck(conf, false);
2295     // ERROR_CODE.UNKNOWN is coming because we reportError with a message for the hbase:meta
2296     // inconsistency and whether we will be fixing it or not.
2297     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
2298     hbck = doFsck(conf, true);
2299     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
2300     hbck = doFsck(conf, false);
2301     assertNoErrors(hbck);
2302   }
2303 
2304   private void deleteMetaRegion(Configuration conf, boolean unassign, boolean hdfs,
2305       boolean regionInfoOnly) throws IOException, InterruptedException {
2306     HRegionLocation metaLocation = connection.getRegionLocator(TableName.META_TABLE_NAME)
2307         .getRegionLocation(HConstants.EMPTY_START_ROW);
2308     ServerName hsa = metaLocation.getServerName();
2309     HRegionInfo hri = metaLocation.getRegionInfo();
2310     if (unassign) {
2311       LOG.info("Undeploying meta region " + hri + " from server " + hsa);
2312       try (Connection unmanagedConnection = ConnectionFactory.createConnection(conf)) {
2313         undeployRegion(unmanagedConnection, hsa, hri);
2314       }
2315     }
2316 
2317     if (regionInfoOnly) {
2318       LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
2319       Path rootDir = FSUtils.getRootDir(conf);
2320       FileSystem fs = rootDir.getFileSystem(conf);
2321       Path p = new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(),
2322           hri.getEncodedName());
2323       Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
2324       fs.delete(hriPath, true);
2325     }
2326 
2327     if (hdfs) {
2328       LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
2329       Path rootDir = FSUtils.getRootDir(conf);
2330       FileSystem fs = rootDir.getFileSystem(conf);
2331       Path p = new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(),
2332           hri.getEncodedName());
2333       HBaseFsck.debugLsr(conf, p);
2334       boolean success = fs.delete(p, true);
2335       LOG.info("Deleted " + p + " sucessfully? " + success);
2336       HBaseFsck.debugLsr(conf, p);
2337     }
2338   }
2339 
2340   @Test (timeout=180000)
2341   public void testTableWithNoRegions() throws Exception {
2342     // We might end up with empty regions in a table
2343     // see also testNoHdfsTable()
2344     TableName table =
2345         TableName.valueOf(name.getMethodName());
2346     try {
2347       // create table with one region
2348       HTableDescriptor desc = new HTableDescriptor(table);
2349       HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
2350       desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
2351       admin.createTable(desc);
2352       tbl = (HTable) connection.getTable(table, tableExecutorService);
2353 
2354       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
2355       deleteRegion(conf, tbl.getTableDescriptor(), HConstants.EMPTY_START_ROW,
2356           HConstants.EMPTY_END_ROW, false, false, true);
2357 
2358       HBaseFsck hbck = doFsck(conf, false);
2359       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_HDFS });
2360 
2361       doFsck(conf, true);
2362 
2363       // fix hole
2364       doFsck(conf, true);
2365 
2366       // check that hole fixed
2367       assertNoErrors(doFsck(conf, false));
2368     } finally {
2369       cleanupTable(table);
2370     }
2371 
2372   }
2373 
2374   @Test (timeout=180000)
2375   public void testHbckAfterRegionMerge() throws Exception {
2376     TableName table = TableName.valueOf("testMergeRegionFilesInHdfs");
2377     Table meta = null;
2378     try {
2379       // disable CatalogJanitor
2380       TEST_UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(false);
2381       setupTable(table);
2382       assertEquals(ROWKEYS.length, countRows());
2383 
2384       // make sure data in regions, if in wal only there is no data loss
2385       admin.flush(table);
2386       HRegionInfo region1 = tbl.getRegionLocation(Bytes.toBytes("A")).getRegionInfo();
2387       HRegionInfo region2 = tbl.getRegionLocation(Bytes.toBytes("B")).getRegionInfo();
2388 
2389       int regionCountBeforeMerge = tbl.getRegionLocations().size();
2390 
2391       assertNotEquals(region1, region2);
2392 
2393       // do a region merge
2394       admin.mergeRegions(region1.getEncodedNameAsBytes(),
2395           region2.getEncodedNameAsBytes(), false);
2396 
2397       // wait until region merged
2398       long timeout = System.currentTimeMillis() + 30 * 1000;
2399       while (true) {
2400         if (tbl.getRegionLocations().size() < regionCountBeforeMerge) {
2401           break;
2402         } else if (System.currentTimeMillis() > timeout) {
2403           fail("Time out waiting on region " + region1.getEncodedName()
2404               + " and " + region2.getEncodedName() + " be merged");
2405         }
2406         Thread.sleep(10);
2407       }
2408 
2409       assertEquals(ROWKEYS.length, countRows());
2410 
2411       HBaseFsck hbck = doFsck(conf, false);
2412       assertNoErrors(hbck); // no errors
2413 
2414     } finally {
2415       TEST_UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(true);
2416       cleanupTable(table);
2417       IOUtils.closeQuietly(meta);
2418     }
2419   }
2420 
2421   @Test (timeout = 180000)
2422   public void testRegionBoundariesCheck() throws Exception {
2423     HBaseFsck hbck = doFsck(conf, false);
2424     assertNoErrors(hbck); // no errors
2425     try {
2426       hbck.checkRegionBoundaries();
2427     } catch (IllegalArgumentException e) {
2428       if (e.getMessage().endsWith("not a valid DFS filename.")) {
2429         fail("Table directory path is not valid." + e.getMessage());
2430       }
2431     }
2432   }
2433 
  /**
   * JUnit rule holding the name of the currently running test method; tests in this
   * class read it via {@code name.getMethodName()} to derive a table name.
   */
  @org.junit.Rule
  public TestName name = new TestName();
2436 
2437   @Test (timeout=180000)
2438   public void testReadOnlyProperty() throws Exception {
2439     HBaseFsck hbck = doFsck(conf, false);
2440     Assert.assertEquals("shouldIgnorePreCheckPermission", true,
2441       hbck.shouldIgnorePreCheckPermission());
2442 
2443     hbck = doFsck(conf, true);
2444     Assert.assertEquals("shouldIgnorePreCheckPermission", false,
2445       hbck.shouldIgnorePreCheckPermission());
2446 
2447     hbck = doFsck(conf, true);
2448     hbck.setIgnorePreCheckPermission(true);
2449     Assert.assertEquals("shouldIgnorePreCheckPermission", true,
2450       hbck.shouldIgnorePreCheckPermission());
2451   }
2452 
2453   @Test (timeout=180000)
2454   public void testCleanUpDaughtersNotInMetaAfterFailedSplit() throws Exception {
2455     TableName table = TableName.valueOf("testCleanUpDaughtersNotInMetaAfterFailedSplit");
2456     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
2457     try {
2458       HTableDescriptor desc = new HTableDescriptor(table);
2459       desc.addFamily(new HColumnDescriptor(Bytes.toBytes("f")));
2460       admin.createTable(desc);
2461       tbl = new HTable(cluster.getConfiguration(), desc.getTableName());
2462       for (int i = 0; i < 5; i++) {
2463         Put p1 = new Put(("r" + i).getBytes());
2464         p1.add(Bytes.toBytes("f"), "q1".getBytes(), "v".getBytes());
2465         tbl.put(p1);
2466       }
2467       admin.flush(desc.getTableName());
2468       List<HRegion> regions = cluster.getRegions(desc.getTableName());
2469       int serverWith = cluster.getServerWith(regions.get(0).getRegionName());
2470       HRegionServer regionServer = cluster.getRegionServer(serverWith);
2471       cluster.getServerWith(regions.get(0).getRegionName());
2472       SplitTransaction st = new SplitTransaction(regions.get(0), Bytes.toBytes("r3"));
2473       st.prepare();
2474       st.stepsBeforePONR(regionServer, regionServer, false);
2475       AssignmentManager am = cluster.getMaster().getAssignmentManager();
2476       Map<String, RegionState> regionsInTransition = am.getRegionStates().getRegionsInTransition();
2477       for (RegionState state : regionsInTransition.values()) {
2478         am.regionOffline(state.getRegion());
2479       }
2480       ZKAssign.deleteNodeFailSilent(regionServer.getZooKeeper(), regions.get(0).getRegionInfo());
2481       Map<HRegionInfo, ServerName> regionsMap = new HashMap<HRegionInfo, ServerName>();
2482       regionsMap.put(regions.get(0).getRegionInfo(), regionServer.getServerName());
2483       am.assign(regionsMap);
2484       am.waitForAssignment(regions.get(0).getRegionInfo());
2485       HBaseFsck hbck = doFsck(conf, false);
2486       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
2487           ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
2488       // holes are separate from overlap groups
2489       assertEquals(0, hbck.getOverlapGroups(table).size());
2490 
2491       // fix hole
2492       assertErrors(
2493         doFsck(
2494           conf, false, true, false, false, false, false, false, false, false, false, false, null),
2495         new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
2496           ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
2497 
2498       // check that hole fixed
2499       assertNoErrors(doFsck(conf, false));
2500       assertEquals(5, countRows());
2501     } finally {
2502       if (tbl != null) {
2503         tbl.close();
2504         tbl = null;
2505       }
2506       cleanupTable(table);
2507     }
2508   }
2509 
2510 }