View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.util;
20  
21  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertErrors;
22  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertNoErrors;
23  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.doFsck;
24  import static org.junit.Assert.assertEquals;
25  import static org.junit.Assert.assertFalse;
26  import static org.junit.Assert.assertNotEquals;
27  import static org.junit.Assert.assertNotNull;
28  import static org.junit.Assert.assertTrue;
29  import static org.junit.Assert.fail;
30  
31  import java.io.IOException;
32  import java.util.ArrayList;
33  import java.util.Collection;
34  import java.util.HashMap;
35  import java.util.LinkedList;
36  import java.util.List;
37  import java.util.Map;
38  import java.util.concurrent.Callable;
39  import java.util.concurrent.CountDownLatch;
40  import java.util.concurrent.ExecutorService;
41  import java.util.concurrent.Executors;
42  import java.util.concurrent.Future;
43  import java.util.concurrent.ScheduledThreadPoolExecutor;
44  import java.util.concurrent.SynchronousQueue;
45  import java.util.concurrent.ThreadPoolExecutor;
46  import java.util.concurrent.TimeUnit;
47  import java.util.concurrent.atomic.AtomicBoolean;
48  
49  import org.apache.commons.io.IOUtils;
50  import org.apache.commons.logging.Log;
51  import org.apache.commons.logging.LogFactory;
52  import org.apache.hadoop.conf.Configuration;
53  import org.apache.hadoop.fs.FileStatus;
54  import org.apache.hadoop.fs.FileSystem;
55  import org.apache.hadoop.fs.Path;
56  import org.apache.hadoop.hbase.ClusterStatus;
57  import org.apache.hadoop.hbase.HBaseTestingUtility;
58  import org.apache.hadoop.hbase.HColumnDescriptor;
59  import org.apache.hadoop.hbase.HConstants;
60  import org.apache.hadoop.hbase.HRegionInfo;
61  import org.apache.hadoop.hbase.HRegionLocation;
62  import org.apache.hadoop.hbase.HTableDescriptor;
63  import org.apache.hadoop.hbase.TableExistsException;
64  import org.apache.hadoop.hbase.testclassification.LargeTests;
65  import org.apache.hadoop.hbase.MiniHBaseCluster;
66  import org.apache.hadoop.hbase.ServerName;
67  import org.apache.hadoop.hbase.TableName;
68  import org.apache.hadoop.hbase.MetaTableAccessor;
69  import org.apache.hadoop.hbase.client.Admin;
70  import org.apache.hadoop.hbase.client.ClusterConnection;
71  import org.apache.hadoop.hbase.client.Connection;
72  import org.apache.hadoop.hbase.client.ConnectionFactory;
73  import org.apache.hadoop.hbase.client.Delete;
74  import org.apache.hadoop.hbase.client.Durability;
75  import org.apache.hadoop.hbase.client.Get;
76  import org.apache.hadoop.hbase.client.HBaseAdmin;
77  import org.apache.hadoop.hbase.client.HConnection;
78  import org.apache.hadoop.hbase.client.HTable;
79  import org.apache.hadoop.hbase.client.MetaScanner;
80  import org.apache.hadoop.hbase.client.Put;
81  import org.apache.hadoop.hbase.client.Result;
82  import org.apache.hadoop.hbase.client.ResultScanner;
83  import org.apache.hadoop.hbase.client.Scan;
84  import org.apache.hadoop.hbase.client.Table;
85  import org.apache.hadoop.hbase.io.hfile.TestHFile;
86  import org.apache.hadoop.hbase.master.AssignmentManager;
87  import org.apache.hadoop.hbase.master.HMaster;
88  import org.apache.hadoop.hbase.master.RegionState;
89  import org.apache.hadoop.hbase.master.RegionStates;
90  import org.apache.hadoop.hbase.master.TableLockManager;
91  import org.apache.hadoop.hbase.master.TableLockManager.TableLock;
92  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
93  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
94  import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
95  import org.apache.hadoop.hbase.regionserver.HRegion;
96  import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
97  import org.apache.hadoop.hbase.regionserver.HRegionServer;
98  import org.apache.hadoop.hbase.regionserver.SplitTransaction;
99  import org.apache.hadoop.hbase.regionserver.TestEndToEndSplitTransaction;
100 import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter;
101 import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
102 import org.apache.hadoop.hbase.util.HBaseFsck.HbckInfo;
103 import org.apache.hadoop.hbase.util.HBaseFsck.PrintingErrorReporter;
104 import org.apache.hadoop.hbase.util.HBaseFsck.TableInfo;
105 import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
106 import org.apache.hadoop.hbase.util.hbck.HbckTestingUtil;
107 import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
108 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
109 import org.apache.zookeeper.KeeperException;
110 import org.junit.AfterClass;
111 import org.junit.Assert;
112 import org.junit.Before;
113 import org.junit.BeforeClass;
114 import org.junit.Ignore;
115 import org.junit.Test;
116 import org.junit.experimental.categories.Category;
117 import org.junit.rules.TestName;
118 
119 import com.google.common.collect.Multimap;
120 
121 /**
122  * This tests HBaseFsck's ability to detect reasons for inconsistent tables.
123  */
124 @Category(LargeTests.class)
125 public class TestHBaseFsck {
126   static final int POOL_SIZE = 7;
127 
128   final static Log LOG = LogFactory.getLog(TestHBaseFsck.class);
129   private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
130   private final static Configuration conf = TEST_UTIL.getConfiguration();
131   private final static String FAM_STR = "fam";
132   private final static byte[] FAM = Bytes.toBytes(FAM_STR);
133   private final static int REGION_ONLINE_TIMEOUT = 800;
134   private static RegionStates regionStates;
135   private static ExecutorService tableExecutorService;
136   private static ScheduledThreadPoolExecutor hbfsckExecutorService;
137   private static ClusterConnection connection;
138   private static Admin admin;
139 
140   // for the instance, reset every test run
141   private HTable tbl;
142   private final static byte[][] SPLITS = new byte[][] { Bytes.toBytes("A"),
143     Bytes.toBytes("B"), Bytes.toBytes("C") };
144   // one row per region.
145   private final static byte[][] ROWKEYS= new byte[][] {
146     Bytes.toBytes("00"), Bytes.toBytes("50"), Bytes.toBytes("A0"), Bytes.toBytes("A5"),
147     Bytes.toBytes("B0"), Bytes.toBytes("B5"), Bytes.toBytes("C0"), Bytes.toBytes("C5") };
148 
149   @BeforeClass
150   public static void setUpBeforeClass() throws Exception {
151     conf.setInt("hbase.regionserver.handler.count", 2);
152     conf.setInt("hbase.regionserver.metahandler.count", 2);
153 
154     conf.setInt("hbase.htable.threads.max", POOL_SIZE);
155     conf.setInt("hbase.hconnection.threads.max", 2 * POOL_SIZE);
156     conf.setInt("hbase.hconnection.threads.core", POOL_SIZE);
157     conf.setInt("hbase.hbck.close.timeout", 2 * REGION_ONLINE_TIMEOUT);
158     TEST_UTIL.startMiniCluster(3);
159 
160     tableExecutorService = new ThreadPoolExecutor(1, POOL_SIZE, 60, TimeUnit.SECONDS,
161         new SynchronousQueue<Runnable>(), Threads.newDaemonThreadFactory("testhbck"));
162 
163     hbfsckExecutorService = new ScheduledThreadPoolExecutor(POOL_SIZE);
164 
165     AssignmentManager assignmentManager =
166       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager();
167     regionStates = assignmentManager.getRegionStates();
168 
169     connection = (ClusterConnection) TEST_UTIL.getConnection();
170 
171     admin = connection.getAdmin();
172     admin.setBalancerRunning(false, true);
173   }
174 
175   @AfterClass
176   public static void tearDownAfterClass() throws Exception {
177     tableExecutorService.shutdown();
178     hbfsckExecutorService.shutdown();
179     admin.close();
180     TEST_UTIL.shutdownMiniCluster();
181   }
182 
183   @Before
184   public void setUp() {
185     EnvironmentEdgeManager.reset();
186   }
187 
188   @Test (timeout=180000)
189   public void testHBaseFsck() throws Exception {
190     assertNoErrors(doFsck(conf, false));
191     TableName table = TableName.valueOf("tableBadMetaAssign");
192     TEST_UTIL.createTable(table, FAM);
193 
194     // We created 1 table, should be fine
195     assertNoErrors(doFsck(conf, false));
196 
197     // Now let's mess it up and change the assignment in hbase:meta to
198     // point to a different region server
199     Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
200     Scan scan = new Scan();
201     scan.setStartRow(Bytes.toBytes(table+",,"));
202     ResultScanner scanner = meta.getScanner(scan);
203     HRegionInfo hri = null;
204 
205     Result res = scanner.next();
206     ServerName currServer =
207       ServerName.parseFrom(res.getValue(HConstants.CATALOG_FAMILY,
208           HConstants.SERVER_QUALIFIER));
209     long startCode = Bytes.toLong(res.getValue(HConstants.CATALOG_FAMILY,
210         HConstants.STARTCODE_QUALIFIER));
211 
212     for (JVMClusterUtil.RegionServerThread rs :
213         TEST_UTIL.getHBaseCluster().getRegionServerThreads()) {
214 
215       ServerName sn = rs.getRegionServer().getServerName();
216 
217       // When we find a diff RS, change the assignment and break
218       if (!currServer.getHostAndPort().equals(sn.getHostAndPort()) ||
219           startCode != sn.getStartcode()) {
220         Put put = new Put(res.getRow());
221         put.setDurability(Durability.SKIP_WAL);
222         put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
223           Bytes.toBytes(sn.getHostAndPort()));
224         put.add(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER,
225           Bytes.toBytes(sn.getStartcode()));
226         meta.put(put);
227         hri = MetaTableAccessor.getHRegionInfo(res);
228         break;
229       }
230     }
231 
232     // Try to fix the data
233     assertErrors(doFsck(conf, true), new ERROR_CODE[]{
234         ERROR_CODE.SERVER_DOES_NOT_MATCH_META});
235 
236     TEST_UTIL.getHBaseCluster().getMaster()
237       .getAssignmentManager().waitForAssignment(hri);
238 
239     // Should be fixed now
240     assertNoErrors(doFsck(conf, false));
241 
242     // comment needed - what is the purpose of this line
243     Table t = connection.getTable(table, tableExecutorService);
244     ResultScanner s = t.getScanner(new Scan());
245     s.close();
246     t.close();
247 
248     scanner.close();
249     meta.close();
250   }
251 
252   @Test(timeout=180000)
253   public void testFixAssignmentsWhenMETAinTransition() throws Exception {
254     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
255     admin.closeRegion(cluster.getServerHoldingMeta(), HRegionInfo.FIRST_META_REGIONINFO);
256     regionStates.regionOffline(HRegionInfo.FIRST_META_REGIONINFO);
257     new MetaTableLocator().deleteMetaLocation(cluster.getMaster().getZooKeeper());
258     assertFalse(regionStates.isRegionOnline(HRegionInfo.FIRST_META_REGIONINFO));
259     HBaseFsck hbck = doFsck(conf, true);
260     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.UNKNOWN, ERROR_CODE.NO_META_REGION,
261         ERROR_CODE.NULL_META_REGION });
262     assertNoErrors(doFsck(conf, false));
263   }
264 
265   /**
266    * Create a new region in META.
267    */
268   private HRegionInfo createRegion(final HTableDescriptor
269       htd, byte[] startKey, byte[] endKey)
270       throws IOException {
271     Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
272     HRegionInfo hri = new HRegionInfo(htd.getTableName(), startKey, endKey);
273     MetaTableAccessor.addRegionToMeta(meta, hri);
274     meta.close();
275     return hri;
276   }
277 
278   /**
279    * Debugging method to dump the contents of meta.
280    */
281   private void dumpMeta(TableName tableName) throws IOException {
282     List<byte[]> metaRows = TEST_UTIL.getMetaTableRows(tableName);
283     for (byte[] row : metaRows) {
284       LOG.info(Bytes.toString(row));
285     }
286   }
287 
288   /**
289    * This method is used to undeploy a region -- close it and attempt to
290    * remove its state from the Master.
291    */
292   private void undeployRegion(Connection conn, ServerName sn,
293       HRegionInfo hri) throws IOException, InterruptedException {
294     try {
295       HBaseFsckRepair.closeRegionSilentlyAndWait((HConnection) conn, sn, hri);
296       if (!hri.isMetaTable()) {
297         admin.offline(hri.getRegionName());
298       }
299     } catch (IOException ioe) {
300       LOG.warn("Got exception when attempting to offline region "
301           + Bytes.toString(hri.getRegionName()), ioe);
302     }
303   }
304   /**
305    * Delete a region from assignments, meta, or completely from hdfs.
306    * @param unassign if true unassign region if assigned
307    * @param metaRow  if true remove region's row from META
308    * @param hdfs if true remove region's dir in HDFS
309    */
310   private void deleteRegion(Configuration conf, final HTableDescriptor htd,
311       byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
312       boolean hdfs) throws IOException, InterruptedException {
313     deleteRegion(conf, htd, startKey, endKey, unassign, metaRow, hdfs, false);
314   }
315 
316   /**
317    * Delete a region from assignments, meta, or completely from hdfs.
318    * @param unassign if true unassign region if assigned
319    * @param metaRow  if true remove region's row from META
320    * @param hdfs if true remove region's dir in HDFS
321    * @param regionInfoOnly if true remove a region dir's .regioninfo file
322    */
323   private void deleteRegion(Configuration conf, final HTableDescriptor htd,
324       byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
325       boolean hdfs, boolean regionInfoOnly) throws IOException, InterruptedException {
326     LOG.info("** Before delete:");
327     dumpMeta(htd.getTableName());
328 
329     List<HRegionLocation> locations = tbl.getAllRegionLocations();
330     for (HRegionLocation location : locations) {
331       HRegionInfo hri = location.getRegionInfo();
332       ServerName hsa = location.getServerName();
333       if (Bytes.compareTo(hri.getStartKey(), startKey) == 0
334           && Bytes.compareTo(hri.getEndKey(), endKey) == 0) {
335 
336         LOG.info("RegionName: " +hri.getRegionNameAsString());
337         byte[] deleteRow = hri.getRegionName();
338 
339         if (unassign) {
340           LOG.info("Undeploying region " + hri + " from server " + hsa);
341           undeployRegion(connection, hsa, hri);
342         }
343 
344         if (regionInfoOnly) {
345           LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
346           Path rootDir = FSUtils.getRootDir(conf);
347           FileSystem fs = rootDir.getFileSystem(conf);
348           Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
349               hri.getEncodedName());
350           Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
351           fs.delete(hriPath, true);
352         }
353 
354         if (hdfs) {
355           LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
356           Path rootDir = FSUtils.getRootDir(conf);
357           FileSystem fs = rootDir.getFileSystem(conf);
358           Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
359               hri.getEncodedName());
360           HBaseFsck.debugLsr(conf, p);
361           boolean success = fs.delete(p, true);
362           LOG.info("Deleted " + p + " sucessfully? " + success);
363           HBaseFsck.debugLsr(conf, p);
364         }
365 
366         if (metaRow) {
367           try (Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService)) {
368             Delete delete = new Delete(deleteRow);
369             meta.delete(delete);
370           }
371         }
372       }
373       LOG.info(hri.toString() + hsa.toString());
374     }
375 
376     TEST_UTIL.getMetaTableRows(htd.getTableName());
377     LOG.info("*** After delete:");
378     dumpMeta(htd.getTableName());
379   }
380 
381   /**
382    * Setup a clean table before we start mucking with it.
383    *
384    * It will set tbl which needs to be closed after test
385    *
386    * @throws IOException
387    * @throws InterruptedException
388    * @throws KeeperException
389    */
390   void setupTable(TableName tablename) throws Exception {
391     setupTableWithRegionReplica(tablename, 1);
392   }
393 
394   /**
395    * Setup a clean table with a certain region_replica count
396    *
397    * It will set tbl which needs to be closed after test
398    *
399    * @param tableName
400    * @param replicaCount
401    * @throws Exception
402    */
403   void setupTableWithRegionReplica(TableName tablename, int replicaCount) throws Exception {
404     HTableDescriptor desc = new HTableDescriptor(tablename);
405     desc.setRegionReplication(replicaCount);
406     HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
407     desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
408     admin.createTable(desc, SPLITS);
409     tbl = (HTable) connection.getTable(tablename, tableExecutorService);
410     List<Put> puts = new ArrayList<Put>();
411     for (byte[] row : ROWKEYS) {
412       Put p = new Put(row);
413       p.add(FAM, Bytes.toBytes("val"), row);
414       puts.add(p);
415     }
416     tbl.put(puts);
417     tbl.flushCommits();
418   }
419 
420   /**
421    * Counts the number of row to verify data loss or non-dataloss.
422    */
423   int countRows() throws IOException {
424      Scan s = new Scan();
425      ResultScanner rs = tbl.getScanner(s);
426      int i = 0;
427      while(rs.next() !=null) {
428        i++;
429      }
430      return i;
431   }
432 
433   /**
434    * delete table in preparation for next test
435    *
436    * @param tablename
437    * @throws IOException
438    */
439   void cleanupTable(TableName tablename) throws IOException {
440     if (tbl != null) {
441       tbl.close();
442       tbl = null;
443     }
444 
445     ((ClusterConnection) connection).clearRegionCache();
446     TEST_UTIL.deleteTable(tablename);
447 
448   }
449 
450   /**
451    * This creates a clean table and confirms that the table is clean.
452    */
453   @Test (timeout=180000)
454   public void testHBaseFsckClean() throws Exception {
455     assertNoErrors(doFsck(conf, false));
456     TableName table = TableName.valueOf("tableClean");
457     try {
458       HBaseFsck hbck = doFsck(conf, false);
459       assertNoErrors(hbck);
460 
461       setupTable(table);
462       assertEquals(ROWKEYS.length, countRows());
463 
464       // We created 1 table, should be fine
465       hbck = doFsck(conf, false);
466       assertNoErrors(hbck);
467       assertEquals(0, hbck.getOverlapGroups(table).size());
468       assertEquals(ROWKEYS.length, countRows());
469     } finally {
470       cleanupTable(table);
471     }
472   }
473 
474   /**
475    * Test thread pooling in the case where there are more regions than threads
476    */
477   @Test (timeout=180000)
478   public void testHbckThreadpooling() throws Exception {
479     TableName table =
480         TableName.valueOf("tableDupeStartKey");
481     try {
482       // Create table with 4 regions
483       setupTable(table);
484 
485       // limit number of threads to 1.
486       Configuration newconf = new Configuration(conf);
487       newconf.setInt("hbasefsck.numthreads", 1);
488       assertNoErrors(doFsck(newconf, false));
489 
490       // We should pass without triggering a RejectedExecutionException
491     } finally {
492       cleanupTable(table);
493     }
494   }
495 
496   @Test (timeout=180000)
497   public void testHbckFixOrphanTable() throws Exception {
498     TableName table = TableName.valueOf("tableInfo");
499     FileSystem fs = null;
500     Path tableinfo = null;
501     try {
502       setupTable(table);
503 
504       Path hbaseTableDir = FSUtils.getTableDir(
505           FSUtils.getRootDir(conf), table);
506       fs = hbaseTableDir.getFileSystem(conf);
507       FileStatus status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
508       tableinfo = status.getPath();
509       fs.rename(tableinfo, new Path("/.tableinfo"));
510 
511       //to report error if .tableinfo is missing.
512       HBaseFsck hbck = doFsck(conf, false);
513       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_TABLEINFO_FILE });
514 
515       // fix OrphanTable with default .tableinfo (htd not yet cached on master)
516       hbck = doFsck(conf, true);
517       assertNoErrors(hbck);
518       status = null;
519       status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
520       assertNotNull(status);
521 
522       HTableDescriptor htd = admin.getTableDescriptor(table);
523       htd.setValue("NOT_DEFAULT", "true");
524       admin.disableTable(table);
525       admin.modifyTable(table, htd);
526       admin.enableTable(table);
527       fs.delete(status.getPath(), true);
528 
529       // fix OrphanTable with cache
530       htd = admin.getTableDescriptor(table); // warms up cached htd on master
531       hbck = doFsck(conf, true);
532       assertNoErrors(hbck);
533       status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
534       assertNotNull(status);
535       htd = admin.getTableDescriptor(table);
536       assertEquals(htd.getValue("NOT_DEFAULT"), "true");
537     } finally {
538       fs.rename(new Path("/.tableinfo"), tableinfo);
539       cleanupTable(table);
540     }
541   }
542 
543   /**
544    * This test makes sure that parallel instances of Hbck is disabled.
545    *
546    * @throws Exception
547    */
548   @Test (timeout=180000)
549   public void testParallelHbck() throws Exception {
550     final ExecutorService service;
551     final Future<HBaseFsck> hbck1,hbck2;
552 
553     class RunHbck implements Callable<HBaseFsck>{
554       boolean fail = true;
555       @Override
556       public HBaseFsck call(){
557         try{
558           return doFsck(conf, false);
559         } catch(Exception e){
560           if (e.getMessage().contains("Duplicate hbck")) {
561             fail = false;
562           }
563         }
564         // If we reach here, then an exception was caught
565         if (fail) fail();
566         return null;
567       }
568     }
569     service = Executors.newFixedThreadPool(2);
570     hbck1 = service.submit(new RunHbck());
571     hbck2 = service.submit(new RunHbck());
572     service.shutdown();
573     //wait for 15 seconds, for both hbck calls finish
574     service.awaitTermination(15, TimeUnit.SECONDS);
575     HBaseFsck h1 = hbck1.get();
576     HBaseFsck h2 = hbck2.get();
577     // Make sure only one of the calls was successful
578     assert(h1 == null || h2 == null);
579     if (h1 != null) {
580       assert(h1.getRetCode() >= 0);
581     }
582     if (h2 != null) {
583       assert(h2.getRetCode() >= 0);
584     }
585   }
586 
587   /**
588    * This create and fixes a bad table with regions that have a duplicate
589    * start key
590    */
591   @Test (timeout=180000)
592   public void testDupeStartKey() throws Exception {
593     TableName table =
594         TableName.valueOf("tableDupeStartKey");
595     try {
596       setupTable(table);
597       assertNoErrors(doFsck(conf, false));
598       assertEquals(ROWKEYS.length, countRows());
599 
600       // Now let's mess it up, by adding a region with a duplicate startkey
601       HRegionInfo hriDupe =
602           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A"), Bytes.toBytes("A2"));
603       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
604       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
605           .waitForAssignment(hriDupe);
606       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
607       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
608 
609       HBaseFsck hbck = doFsck(conf, false);
610       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
611             ERROR_CODE.DUPE_STARTKEYS});
612       assertEquals(2, hbck.getOverlapGroups(table).size());
613       assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.
614 
615       // fix the degenerate region.
616       doFsck(conf,true);
617 
618       // check that the degenerate region is gone and no data loss
619       HBaseFsck hbck2 = doFsck(conf,false);
620       assertNoErrors(hbck2);
621       assertEquals(0, hbck2.getOverlapGroups(table).size());
622       assertEquals(ROWKEYS.length, countRows());
623     } finally {
624       cleanupTable(table);
625     }
626   }
627 
628   /*
629    * This creates a table with region_replica > 1 and verifies hbck runs
630    * successfully
631    */
632   @Test (timeout=180000)
633   public void testHbckWithRegionReplica() throws Exception {
634     TableName table =
635         TableName.valueOf("tableWithReplica");
636     try {
637       setupTableWithRegionReplica(table, 2);
638       assertNoErrors(doFsck(conf, false));
639       assertEquals(ROWKEYS.length, countRows());
640     } finally {
641       cleanupTable(table);
642     }
643   }
644 
645   /**
646    * Get region info from local cluster.
647    */
648   Map<ServerName, List<String>> getDeployedHRIs(final HBaseAdmin admin) throws IOException {
649     ClusterStatus status = admin.getClusterStatus();
650     Collection<ServerName> regionServers = status.getServers();
651     Map<ServerName, List<String>> mm =
652         new HashMap<ServerName, List<String>>();
653     for (ServerName hsi : regionServers) {
654       AdminProtos.AdminService.BlockingInterface server = ((HConnection) connection).getAdmin(hsi);
655 
656       // list all online regions from this region server
657       List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(server);
658       List<String> regionNames = new ArrayList<String>();
659       for (HRegionInfo hri : regions) {
660         regionNames.add(hri.getRegionNameAsString());
661       }
662       mm.put(hsi, regionNames);
663     }
664     return mm;
665   }
666 
667   /**
668    * Returns the HSI a region info is on.
669    */
670   ServerName findDeployedHSI(Map<ServerName, List<String>> mm, HRegionInfo hri) {
671     for (Map.Entry<ServerName,List <String>> e : mm.entrySet()) {
672       if (e.getValue().contains(hri.getRegionNameAsString())) {
673         return e.getKey();
674       }
675     }
676     return null;
677   }
678 
679   /**
680    * This create and fixes a bad table with regions that have a duplicate
681    * start key
682    */
683   @Test (timeout=180000)
684   public void testDupeRegion() throws Exception {
685     TableName table =
686         TableName.valueOf("tableDupeRegion");
687     try {
688       setupTable(table);
689       assertNoErrors(doFsck(conf, false));
690       assertEquals(ROWKEYS.length, countRows());
691 
692       // Now let's mess it up, by adding a region with a duplicate startkey
693       HRegionInfo hriDupe =
694           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A"), Bytes.toBytes("B"));
695 
696       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
697       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
698           .waitForAssignment(hriDupe);
699       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
700       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
701 
702       // Yikes! The assignment manager can't tell between diff between two
703       // different regions with the same start/endkeys since it doesn't
704       // differentiate on ts/regionId!  We actually need to recheck
705       // deployments!
706       while (findDeployedHSI(getDeployedHRIs((HBaseAdmin) admin), hriDupe) == null) {
707         Thread.sleep(250);
708       }
709 
710       LOG.debug("Finished assignment of dupe region");
711 
712       // TODO why is dupe region different from dupe start keys?
713       HBaseFsck hbck = doFsck(conf, false);
714       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
715             ERROR_CODE.DUPE_STARTKEYS});
716       assertEquals(2, hbck.getOverlapGroups(table).size());
717       assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.
718 
719       // fix the degenerate region.
720       doFsck(conf,true);
721 
722       // check that the degenerate region is gone and no data loss
723       HBaseFsck hbck2 = doFsck(conf,false);
724       assertNoErrors(hbck2);
725       assertEquals(0, hbck2.getOverlapGroups(table).size());
726       assertEquals(ROWKEYS.length, countRows());
727     } finally {
728       cleanupTable(table);
729     }
730   }
731 
732   /**
733    * This creates and fixes a bad table with regions that has startkey == endkey
734    */
735   @Test (timeout=180000)
736   public void testDegenerateRegions() throws Exception {
737     TableName table = TableName.valueOf("tableDegenerateRegions");
738     try {
739       setupTable(table);
740       assertNoErrors(doFsck(conf,false));
741       assertEquals(ROWKEYS.length, countRows());
742 
743       // Now let's mess it up, by adding a region with a duplicate startkey
744       HRegionInfo hriDupe =
745           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("B"), Bytes.toBytes("B"));
746       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
747       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
748           .waitForAssignment(hriDupe);
749       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
750       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
751 
752       HBaseFsck hbck = doFsck(conf,false);
753       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DEGENERATE_REGION, ERROR_CODE.DUPE_STARTKEYS,
754           ERROR_CODE.DUPE_STARTKEYS });
755       assertEquals(2, hbck.getOverlapGroups(table).size());
756       assertEquals(ROWKEYS.length, countRows());
757 
758       // fix the degenerate region.
759       doFsck(conf,true);
760 
761       // check that the degenerate region is gone and no data loss
762       HBaseFsck hbck2 = doFsck(conf,false);
763       assertNoErrors(hbck2);
764       assertEquals(0, hbck2.getOverlapGroups(table).size());
765       assertEquals(ROWKEYS.length, countRows());
766     } finally {
767       cleanupTable(table);
768     }
769   }
770 
771   /**
772    * This creates and fixes a bad table where a region is completely contained
773    * by another region: an extra region [A2, B) is created and assigned, hbck must report OVERLAP_IN_REGION_CHAIN, and a repair run must clear it without changing the row count.
774    */
775   @Test (timeout=180000)
776   public void testContainedRegionOverlap() throws Exception {
777     TableName table =
778         TableName.valueOf("tableContainedRegionOverlap");
779     try {
780       setupTable(table);
781       assertEquals(ROWKEYS.length, countRows());
782 
783       // Mess it up by creating and assigning an extra region [A2, B) on top of the healthy table
784       HRegionInfo hriOverlap =
785           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B"));
786       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
787       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
788           .waitForAssignment(hriOverlap);
789       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
790       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
791       // a run without fixing must flag the overlap and leave the rows untouched
792       HBaseFsck hbck = doFsck(conf, false);
793       assertErrors(hbck, new ERROR_CODE[] {
794           ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
795       assertEquals(2, hbck.getOverlapGroups(table).size());
796       assertEquals(ROWKEYS.length, countRows());
797 
798       // fix the problem by running hbck again with fixing enabled.
799       doFsck(conf, true);
800 
801       // verify that overlaps are fixed and that no rows were lost
802       HBaseFsck hbck2 = doFsck(conf,false);
803       assertNoErrors(hbck2);
804       assertEquals(0, hbck2.getOverlapGroups(table).size());
805       assertEquals(ROWKEYS.length, countRows());
806     } finally {
807       cleanupTable(table);
808     }
809   }
810 
811   /**
812    * This creates and fixes a bad table with an overlap group of
813    * 3 regions. Sets HBaseFsck.maxMerge to 2 to trigger sidelining of an overlapped
814    * region. Messes around with the meta data so that closeRegion/offlineRegion
815    * throws exceptions.
816    */
817   @Test (timeout=180000)
818   public void testSidelineOverlapRegion() throws Exception {
819     TableName table =
820         TableName.valueOf("testSidelineOverlapRegion");
821     try {
822       setupTable(table);
823       assertEquals(ROWKEYS.length, countRows());
824 
825       // Mess it up by adding and assigning regions [A, AB) and [AB, B), which overlap the table's [A, B) region
826       MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
827       HMaster master = cluster.getMaster();
828       HRegionInfo hriOverlap1 =
829           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A"), Bytes.toBytes("AB"));
830       master.assignRegion(hriOverlap1);
831       master.getAssignmentManager().waitForAssignment(hriOverlap1);
832       HRegionInfo hriOverlap2 =
833           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("AB"), Bytes.toBytes("B"));
834       master.assignRegion(hriOverlap2);
835       master.getAssignmentManager().waitForAssignment(hriOverlap2);
836 
837       HBaseFsck hbck = doFsck(conf, false);
838       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.DUPE_STARTKEYS,
839         ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.OVERLAP_IN_REGION_CHAIN});
840       assertEquals(3, hbck.getOverlapGroups(table).size());
841       assertEquals(ROWKEYS.length, countRows());
842 
843       // mess around with the overlapped regions, to trigger NotServingRegionException
844       Multimap<byte[], HbckInfo> overlapGroups = hbck.getOverlapGroups(table);
845       ServerName serverName = null;
846       byte[] regionName = null;
847       for (HbckInfo hbi: overlapGroups.values()) {
848         if ("A".equals(Bytes.toString(hbi.getStartKey()))
849             && "B".equals(Bytes.toString(hbi.getEndKey()))) {
850           regionName = hbi.getRegionName();
851 
852           // get an RS not serving the region to force bad assignment info in to META.
853           int k = cluster.getServerWith(regionName);
854           for (int i = 0; i < 3; i++) { // NOTE(review): assumes the mini cluster runs at least 3 region servers -- confirm against the cluster setup
855             if (i != k) {
856               HRegionServer rs = cluster.getRegionServer(i);
857               serverName = rs.getServerName();
858               break;
859             }
860           }
861           // close the [A, B) region on the server that hosts it, then offline it
862           HBaseFsckRepair.closeRegionSilentlyAndWait((HConnection) connection,
863               cluster.getRegionServer(k).getServerName(), hbi.getHdfsHRI());
864           admin.offline(regionName);
865           break;
866         }
867       }
868       // both must have been found, otherwise the meta row cannot be corrupted below
869       assertNotNull(regionName);
870       assertNotNull(serverName);
871       try (Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService)) {
872         Put put = new Put(regionName); // point the region's meta row at a server that was not hosting it
873         put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
874             Bytes.toBytes(serverName.getHostAndPort()));
875         meta.put(put);
876       }
877 
878       // fix the problem with an explicitly configured HBaseFsck: big overlaps are sidelined and maxMerge is 2.
879       HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
880       fsck.connect();
881       fsck.setDisplayFullReport(); // i.e. -details
882       fsck.setTimeLag(0);
883       fsck.setFixAssignments(true);
884       fsck.setFixMeta(true);
885       fsck.setFixHdfsHoles(true);
886       fsck.setFixHdfsOverlaps(true);
887       fsck.setFixHdfsOrphans(true);
888       fsck.setFixVersionFile(true);
889       fsck.setSidelineBigOverlaps(true);
890       fsck.setMaxMerge(2);
891       fsck.onlineHbck();
892       fsck.close(); // NOTE(review): not reached if onlineHbck() throws; consider try/finally
893 
894       // verify that overlaps are fixed, and there are fewer rows
895       // since one region is sidelined.
896       HBaseFsck hbck2 = doFsck(conf,false);
897       assertNoErrors(hbck2);
898       assertEquals(0, hbck2.getOverlapGroups(table).size());
899       assertTrue(ROWKEYS.length > countRows());
900     } finally {
901       cleanupTable(table);
902     }
903   }
904 
905   /**
906    * This creates and fixes a bad table where a region is completely contained
907    * by another region, and there is a hole (sort of like a bad split). Region [A, B) is removed while the table is disabled and a new region [A2, B) is then assigned.
908    */
909   @Test (timeout=180000)
910   public void testOverlapAndOrphan() throws Exception {
911     TableName table =
912         TableName.valueOf("tableOverlapAndOrphan");
913     try {
914       setupTable(table);
915       assertEquals(ROWKEYS.length, countRows());
916 
917       // Mess it up: remove region [A, B) while the table is disabled (the trailing 'true' presumably also drops its .regioninfo -- TODO confirm against deleteRegion)
918       admin.disableTable(table);
919       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
920           Bytes.toBytes("B"), true, true, false, true);
921       admin.enableTable(table);
922       // then create and assign a new region [A2, B) in its place
923       HRegionInfo hriOverlap =
924           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B"));
925       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
926       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
927           .waitForAssignment(hriOverlap);
928       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
929       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
930 
931       HBaseFsck hbck = doFsck(conf, false);
932       assertErrors(hbck, new ERROR_CODE[] {
933           ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
934           ERROR_CODE.HOLE_IN_REGION_CHAIN});
935 
936       // fix the problem by running hbck again with fixing enabled.
937       doFsck(conf, true);
938 
939       // verify that everything is fixed and that no rows were lost
940       HBaseFsck hbck2 = doFsck(conf,false);
941       assertNoErrors(hbck2);
942       assertEquals(0, hbck2.getOverlapGroups(table).size());
943       assertEquals(ROWKEYS.length, countRows());
944     } finally {
945       cleanupTable(table);
946     }
947   }
948 
949   /**
950    * This creates and fixes a bad table where a region overlaps two regions --
951    * a start key contained in another region and its end key is contained in
952    * yet another region. The extra region used here is [A2, B2).
953    */
954   @Test (timeout=180000)
955   public void testCoveredStartKey() throws Exception {
956     TableName table =
957         TableName.valueOf("tableCoveredStartKey");
958     try {
959       setupTable(table);
960       assertEquals(ROWKEYS.length, countRows());
961 
962       // Mess it up by creating and assigning an extra region [A2, B2) that straddles the "B" boundary
963       HRegionInfo hriOverlap =
964           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B2"));
965       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
966       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
967           .waitForAssignment(hriOverlap);
968       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
969       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
970       // a run without fixing must report two overlaps and leave the rows untouched
971       HBaseFsck hbck = doFsck(conf, false);
972       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
973           ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
974       assertEquals(3, hbck.getOverlapGroups(table).size());
975       assertEquals(ROWKEYS.length, countRows());
976 
977       // fix the problem by running hbck again with fixing enabled.
978       doFsck(conf, true);
979 
980       // verify that overlaps are fixed (empty expected-error list) and that no rows were lost
981       HBaseFsck hbck2 = doFsck(conf, false);
982       assertErrors(hbck2, new ERROR_CODE[0]);
983       assertEquals(0, hbck2.getOverlapGroups(table).size());
984       assertEquals(ROWKEYS.length, countRows());
985     } finally {
986       cleanupTable(table);
987     }
988   }
989 
990   /**
991    * This creates and fixes a bad table with a missing region -- hole in meta
992    * and data missing in the fs. The removed region is [B, C); its rows are not recovered by the fix.
993    */
994   @Test (timeout=180000)
995   public void testRegionHole() throws Exception {
996     TableName table =
997         TableName.valueOf("tableRegionHole");
998     try {
999       setupTable(table);
1000       assertEquals(ROWKEYS.length, countRows());
1001 
1002       // Mess it up by leaving a hole in the assignment, meta, and hdfs data (all three flags set)
1003       admin.disableTable(table);
1004       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1005           Bytes.toBytes("C"), true, true, true);
1006       admin.enableTable(table);
1007 
1008       HBaseFsck hbck = doFsck(conf, false);
1009       assertErrors(hbck, new ERROR_CODE[] {
1010           ERROR_CODE.HOLE_IN_REGION_CHAIN});
1011       // holes are separate from overlap groups
1012       assertEquals(0, hbck.getOverlapGroups(table).size());
1013 
1014       // fix hole
1015       doFsck(conf, true);
1016 
1017       // check that hole fixed
1018       assertNoErrors(doFsck(conf,false));
1019       assertEquals(ROWKEYS.length - 2 , countRows()); // lost region [B, C), so the 2 rows it held are gone
1020     } finally {
1021       cleanupTable(table);
1022     }
1023   }
1024 
1025   /**
1026    * This creates and fixes a bad table with a missing region -- hole in meta
1027    * and data present but .regioninfo missing (an orphan hdfs region) in the fs.
1028    */
1029   @Test (timeout=180000)
1030   public void testHDFSRegioninfoMissing() throws Exception {
1031     TableName table = TableName.valueOf("tableHDFSRegioninfoMissing");
1032     try {
1033       setupTable(table);
1034       assertEquals(ROWKEYS.length, countRows());
1035 
1036       // Mess it up by leaving a hole in the meta data (the trailing 'true' presumably removes .regioninfo -- TODO confirm against deleteRegion)
1037       admin.disableTable(table);
1038       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1039           Bytes.toBytes("C"), true, true, false, true);
1040       admin.enableTable(table);
1041 
1042       HBaseFsck hbck = doFsck(conf, false);
1043       assertErrors(hbck, new ERROR_CODE[] {
1044           ERROR_CODE.ORPHAN_HDFS_REGION,
1045           ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1046           ERROR_CODE.HOLE_IN_REGION_CHAIN});
1047       // holes are separate from overlap groups
1048       assertEquals(0, hbck.getOverlapGroups(table).size());
1049 
1050       // fix hole
1051       doFsck(conf, true);
1052 
1053       // check that hole fixed and that no rows were lost
1054       assertNoErrors(doFsck(conf, false));
1055       assertEquals(ROWKEYS.length, countRows());
1056     } finally {
1057       cleanupTable(table);
1058     }
1059   }
1060 
1061   /**
1062    * This creates and fixes a bad table with a region that is missing from meta and
1063    * not assigned to a region server, while its data is left in the fs.
1064    */
1065   @Test (timeout=180000)
1066   public void testNotInMetaOrDeployedHole() throws Exception {
1067     TableName table =
1068         TableName.valueOf("tableNotInMetaOrDeployedHole");
1069     try {
1070       setupTable(table);
1071       assertEquals(ROWKEYS.length, countRows());
1072 
1073       // Mess it up by leaving a hole in the meta data for region [B, C)
1074       admin.disableTable(table);
1075       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1076           Bytes.toBytes("C"), true, true, false); // don't rm from fs
1077       admin.enableTable(table);
1078 
1079       HBaseFsck hbck = doFsck(conf, false);
1080       assertErrors(hbck, new ERROR_CODE[] {
1081           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1082       // holes are separate from overlap groups
1083       assertEquals(0, hbck.getOverlapGroups(table).size());
1084 
1085       // fix hole; the fixing run itself still reports the errors it found
1086       assertErrors(doFsck(conf, true) , new ERROR_CODE[] {
1087           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1088 
1089       // check that hole fixed and that no rows were lost
1090       assertNoErrors(doFsck(conf,false));
1091       assertEquals(ROWKEYS.length, countRows());
1092     } finally {
1093       cleanupTable(table);
1094     }
1095   }
1096 
1097   /**
1098    * This creates and fixes a bad table with a hole in meta. Unlike testNotInMetaOrDeployedHole, the first deleteRegion flag is false here (presumably leaving the region assigned -- TODO confirm against deleteRegion).
1099    */
1100   @Test (timeout=180000)
1101   public void testNotInMetaHole() throws Exception {
1102     TableName table =
1103         TableName.valueOf("tableNotInMetaHole");
1104     try {
1105       setupTable(table);
1106       assertEquals(ROWKEYS.length, countRows());
1107 
1108       // Mess it up by leaving a hole in the meta data for region [B, C)
1109       admin.disableTable(table);
1110       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1111           Bytes.toBytes("C"), false, true, false); // don't rm from fs
1112       admin.enableTable(table);
1113 
1114       HBaseFsck hbck = doFsck(conf, false);
1115       assertErrors(hbck, new ERROR_CODE[] {
1116           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1117       // holes are separate from overlap groups
1118       assertEquals(0, hbck.getOverlapGroups(table).size());
1119 
1120       // fix hole; the fixing run itself still reports the errors it found
1121       assertErrors(doFsck(conf, true) , new ERROR_CODE[] {
1122           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1123 
1124       // check that hole fixed and that no rows were lost
1125       assertNoErrors(doFsck(conf,false));
1126       assertEquals(ROWKEYS.length, countRows());
1127     } finally {
1128       cleanupTable(table);
1129     }
1130   }
1131 
1132   /**
1133    * This creates and fixes a bad table with a region that is in meta but has
1134    * no deployment or data in hdfs. The rows of the removed region [B, C) are not recovered by the fix.
1135    */
1136   @Test (timeout=180000)
1137   public void testNotInHdfs() throws Exception {
1138     TableName table =
1139         TableName.valueOf("tableNotInHdfs");
1140     try {
1141       setupTable(table);
1142       assertEquals(ROWKEYS.length, countRows());
1143 
1144       // make sure the data is in the region files; if it were only in the wal there would be no data loss
1145       admin.flush(table);
1146 
1147       // Mess it up by leaving a hole in the hdfs data
1148       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1149           Bytes.toBytes("C"), false, false, true); // don't rm meta
1150 
1151       HBaseFsck hbck = doFsck(conf, false);
1152       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1153       // holes are separate from overlap groups
1154       assertEquals(0, hbck.getOverlapGroups(table).size());
1155 
1156       // fix hole
1157       doFsck(conf, true);
1158 
1159       // check that hole fixed; the 2 rows of the removed region are gone
1160       assertNoErrors(doFsck(conf,false));
1161       assertEquals(ROWKEYS.length - 2, countRows());
1162     } finally {
1163       cleanupTable(table);
1164     }
1165   }
1166 
1167   /**
1168    * This creates entries in hbase:meta with no hdfs data.  This should cleanly
1169    * remove the table. There is no cleanupTable call here: the test asserts that the repair leaves no table behind.
1170    */
1171   @Test (timeout=180000)
1172   public void testNoHdfsTable() throws Exception {
1173     TableName table = TableName.valueOf("NoHdfsTable");
1174     setupTable(table);
1175     assertEquals(ROWKEYS.length, countRows());
1176 
1177     // make sure the data is in the region files; if it were only in the wal there would be no data loss
1178     admin.flush(table);
1179 
1180     // Mess it up by deleting the hdfs dirs of all four regions
1181     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""),
1182         Bytes.toBytes("A"), false, false, true); // don't rm meta
1183     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1184         Bytes.toBytes("B"), false, false, true); // don't rm meta
1185     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1186         Bytes.toBytes("C"), false, false, true); // don't rm meta
1187     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"),
1188         Bytes.toBytes(""), false, false, true); // don't rm meta
1189 
1190     // also remove the table directory in hdfs
1191     deleteTableDir(table);
1192     // one NOT_IN_HDFS error is expected per region
1193     HBaseFsck hbck = doFsck(conf, false);
1194     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS,
1195         ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS,
1196         ERROR_CODE.NOT_IN_HDFS,});
1197     // holes are separate from overlap groups
1198     assertEquals(0, hbck.getOverlapGroups(table).size());
1199 
1200     // fix hole
1201     doFsck(conf, true); // detect dangling regions and remove those
1202 
1203     // check that hole fixed and that the table is gone
1204     assertNoErrors(doFsck(conf,false));
1205     assertFalse("Table " + table + " should have been deleted", admin.tableExists(table));
1206   }
1207   /** Lists the given table's directory under the HBase root dir via HBaseFsck.debugLsr, deletes it recursively, and logs whether the delete succeeded. */
1208   public void deleteTableDir(TableName table) throws IOException {
1209     Path rootDir = FSUtils.getRootDir(conf);
1210     FileSystem fs = rootDir.getFileSystem(conf);
1211     Path p = FSUtils.getTableDir(rootDir, table);
1212     HBaseFsck.debugLsr(conf, p);
1213     boolean success = fs.delete(p, true); // true = recursive delete
1214     LOG.info("Deleted " + p + " sucessfully? " + success);
1215   }
1216 
1217   /**
1218    * When the hbase.version file is missing, hbck must report NO_VERSION_FILE and a fixing run must clear the error.
1219    */
1220   @Test (timeout=180000)
1221   public void testNoVersionFile() throws Exception {
1222     // delete the hbase.version file
1223     Path rootDir = FSUtils.getRootDir(conf);
1224     FileSystem fs = rootDir.getFileSystem(conf);
1225     Path versionFile = new Path(rootDir, HConstants.VERSION_FILE_NAME);
1226     fs.delete(versionFile, true);
1227 
1228     // a run without fixing must report the missing file
1229     HBaseFsck hbck = doFsck(conf, false);
1230     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_VERSION_FILE });
1231     // fix hbase.version missing
1232     doFsck(conf, true);
1233 
1234     // no version file fixed
1235     assertNoErrors(doFsck(conf, false));
1236   }
1237 
1238   /**
1239    * A region must not be deployed while its table is disabled: the test opens one region of a disabled table directly on a region server, expects SHOULD_NOT_BE_DEPLOYED, and expects the fixing run to clear it.
1240    */
1241   @Test (timeout=180000)
1242   public void testRegionShouldNotBeDeployed() throws Exception {
1243     TableName table =
1244         TableName.valueOf("tableRegionShouldNotBeDeployed");
1245     try {
1246       LOG.info("Starting testRegionShouldNotBeDeployed.");
1247       MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
1248       assertTrue(cluster.waitForActiveAndReadyMaster());
1249 
1250       // Build the table by hand: descriptor on the fs plus region rows in meta
1251       byte[][] SPLIT_KEYS = new byte[][] { new byte[0], Bytes.toBytes("aaa"),
1252           Bytes.toBytes("bbb"), Bytes.toBytes("ccc"), Bytes.toBytes("ddd") };
1253       HTableDescriptor htdDisabled = new HTableDescriptor(table);
1254       htdDisabled.addFamily(new HColumnDescriptor(FAM));
1255 
1256       // Write the .tableinfo
1257       FSTableDescriptors fstd = new FSTableDescriptors(conf);
1258       fstd.createTableDescriptor(htdDisabled);
1259       List<HRegionInfo> disabledRegions =
1260           TEST_UTIL.createMultiRegionsInMeta(conf, htdDisabled, SPLIT_KEYS);
1261 
1262       // The region opened below is put on the first RS
1263       HRegionServer hrs = cluster.getRegionServer(0);
1264 
1265       // Create region files (done here with a disable/enable cycle).
1266       admin.disableTable(table);
1267       admin.enableTable(table);
1268 
1269       // Disable the table and close its regions
1270       admin.disableTable(table);
1271       HRegionInfo region = disabledRegions.remove(0);
1272       byte[] regionName = region.getRegionName();
1273 
1274       // The region should not be assigned currently
1275       assertTrue(cluster.getServerWith(regionName) == -1);
1276 
1277       // Directly open a region on a region server.
1278       // If going through AM/ZK, the region won't be opened.
1279       // Even if it is opened, AM will close it, which makes
1280       // this test flaky.
1281       HRegion r = HRegion.openHRegion(
1282         region, htdDisabled, hrs.getWAL(region), conf);
1283       hrs.addToOnlineRegions(r);
1284 
1285       HBaseFsck hbck = doFsck(conf, false);
1286       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.SHOULD_NOT_BE_DEPLOYED });
1287 
1288       // fix this fault
1289       doFsck(conf, true);
1290 
1291       // check result
1292       assertNoErrors(doFsck(conf, false));
1293     } finally {
1294       admin.enableTable(table); // NOTE(review): if this throws (e.g. setup failed early) cleanupTable is skipped and the original failure is masked
1295       cleanupTable(table);
1296     }
1297   }
1298 
1299   /**
1300    * This creates two tables, messes up both of them, and fixes them one by one, checking that fixing one table leaves the other table's error in place.
1301    */
1302   @Test (timeout=180000)
1303   public void testFixByTable() throws Exception {
1304     TableName table1 =
1305         TableName.valueOf("testFixByTable1");
1306     TableName table2 =
1307         TableName.valueOf("testFixByTable2");
1308     try {
1309       setupTable(table1);
1310       // make sure the data is in the region files; if it were only in the wal there would be no data loss
1311       admin.flush(table1);
1312       // Mess table 1 up by leaving a hole in the hdfs data
1313       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1314         Bytes.toBytes("C"), false, false, true); // don't rm meta
1315 
1316       setupTable(table2);
1317       // make sure the data is in the region files; if it were only in the wal there would be no data loss
1318       admin.flush(table2);
1319       // Mess table 2 up by leaving a hole in the hdfs data
1320       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1321         Bytes.toBytes("C"), false, false, true); // don't rm meta
1322       // an unrestricted run sees one NOT_IN_HDFS error per table
1323       HBaseFsck hbck = doFsck(conf, false);
1324       assertErrors(hbck, new ERROR_CODE[] {
1325         ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS});
1326 
1327       // fix hole in table 1
1328       doFsck(conf, true, table1);
1329       // check that hole in table 1 fixed
1330       assertNoErrors(doFsck(conf, false, table1));
1331       // check that hole in table 2 still there
1332       assertErrors(doFsck(conf, false, table2),
1333         new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1334 
1335       // fix hole in table 2
1336       doFsck(conf, true, table2);
1337       // check that hole in both tables fixed
1338       assertNoErrors(doFsck(conf, false));
1339       assertEquals(ROWKEYS.length - 2, countRows()); // presumably counts table 2 only, the table set up last -- TODO confirm against countRows
1340     } finally {
1341       cleanupTable(table1);
1342       cleanupTable(table2);
1343     }
1344   }
1345   /**
1346    * A split parent in meta, in hdfs, and not deployed
1347    */
1348   @Test (timeout=180000)
1349   public void testLingeringSplitParent() throws Exception {
1350     TableName table =
1351         TableName.valueOf("testLingeringSplitParent");
1352     Table meta = null;
1353     try {
1354       setupTable(table);
1355       assertEquals(ROWKEYS.length, countRows());
1356 
1357       // make sure data in regions, if in wal only there is no data loss
1358       admin.flush(table);
1359       HRegionLocation location = tbl.getRegionLocation("B");
1360 
1361       // Delete one region from meta, but not hdfs, unassign it.
1362       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1363         Bytes.toBytes("C"), true, true, false);
1364 
1365       // Create a new meta entry to fake it as a split parent.
1366       meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
1367       HRegionInfo hri = location.getRegionInfo();
1368 
1369       HRegionInfo a = new HRegionInfo(tbl.getName(),
1370         Bytes.toBytes("B"), Bytes.toBytes("BM"));
1371       HRegionInfo b = new HRegionInfo(tbl.getName(),
1372         Bytes.toBytes("BM"), Bytes.toBytes("C"));
1373 
1374       hri.setOffline(true);
1375       hri.setSplit(true);
1376 
1377       MetaTableAccessor.addRegionToMeta(meta, hri, a, b);
1378       meta.close();
1379       admin.flush(TableName.META_TABLE_NAME);
1380 
1381       HBaseFsck hbck = doFsck(conf, false);
1382       assertErrors(hbck, new ERROR_CODE[] {
1383         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1384 
1385       // regular repair cannot fix lingering split parent
1386       hbck = doFsck(conf, true);
1387       assertErrors(hbck, new ERROR_CODE[] {
1388         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN });
1389       assertFalse(hbck.shouldRerun());
1390       hbck = doFsck(conf, false);
1391       assertErrors(hbck, new ERROR_CODE[] {
1392         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1393 
1394       // fix lingering split parent
1395       hbck = new HBaseFsck(conf, hbfsckExecutorService);
1396       hbck.connect();
1397       hbck.setDisplayFullReport(); // i.e. -details
1398       hbck.setTimeLag(0);
1399       hbck.setFixSplitParents(true);
1400       hbck.onlineHbck();
1401       assertTrue(hbck.shouldRerun());
1402       hbck.close();
1403 
1404       Get get = new Get(hri.getRegionName());
1405       Result result = meta.get(get);
1406       assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
1407         HConstants.SPLITA_QUALIFIER).isEmpty());
1408       assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
1409         HConstants.SPLITB_QUALIFIER).isEmpty());
1410       admin.flush(TableName.META_TABLE_NAME);
1411 
1412       // fix other issues
1413       doFsck(conf, true);
1414 
1415       // check that all are fixed
1416       assertNoErrors(doFsck(conf, false));
1417       assertEquals(ROWKEYS.length, countRows());
1418     } finally {
1419       cleanupTable(table);
1420       IOUtils.closeQuietly(meta);
1421     }
1422   }
1423 
1424   /**
1425    * Tests that LINGERING_SPLIT_PARENT is not erroneously reported for
1426    * valid cases where the daughters are there. The region containing "B" is split at "BM" through the admin API before hbck runs.
1427    */
1428   @Test (timeout=180000)
1429   public void testValidLingeringSplitParent() throws Exception {
1430     TableName table =
1431         TableName.valueOf("testValidLingeringSplitParent");
1432     Table meta = null;
1433     try {
1434       setupTable(table);
1435       assertEquals(ROWKEYS.length, countRows());
1436 
1437       // make sure the data is in the region files; if it were only in the wal there would be no data loss
1438       admin.flush(table);
1439       HRegionLocation location = tbl.getRegionLocation(Bytes.toBytes("B"));
1440 
1441       meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
1442       HRegionInfo hri = location.getRegionInfo();
1443 
1444       // do a regular split
1445       byte[] regionName = location.getRegionInfo().getRegionName();
1446       admin.splitRegion(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
1447       TestEndToEndSplitTransaction.blockUntilRegionSplit(conf, 60000, regionName, true);
1448 
1449       // TODO: fixHdfsHoles does not work against splits, since the parent dir lingers on
1450       // for some time until children references are deleted. HBCK erroneously sees this as
1451       // overlapping regions
1452       HBaseFsck hbck = doFsck(
1453         conf, true, true, false, false, false, true, true, true, false, false, false, null);
1454       assertErrors(hbck, new ERROR_CODE[] {}); //no LINGERING_SPLIT_PARENT reported
1455 
1456       // assert that the split hbase:meta entry is still there.
1457       Get get = new Get(hri.getRegionName());
1458       Result result = meta.get(get);
1459       assertNotNull(result);
1460       assertNotNull(MetaTableAccessor.getHRegionInfo(result));
1461 
1462       assertEquals(ROWKEYS.length, countRows());
1463 
1464       // assert that we still have the split regions
1465       assertEquals(SPLITS.length + 1 + 1, tbl.getStartKeys().length); //SPLITS + 1 is # regions pre-split; expected value goes first.
1466       assertNoErrors(doFsck(conf, false));
1467     } finally {
1468       cleanupTable(table);
1469       IOUtils.closeQuietly(meta);
1470     }
1471   }
1472 
1473   /**
1474    * Split crashed after write to hbase:meta finished for the parent region, but
1475    * failed to write daughters (pre HBASE-7721 codebase)
1476    */
1477   @Test(timeout=75000)
1478   public void testSplitDaughtersNotInMeta() throws Exception {
1479     TableName table = TableName.valueOf("testSplitdaughtersNotInMeta");
1480     Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
1481     try {
1482       setupTable(table);
1483       assertEquals(ROWKEYS.length, countRows());
1484 
1485       // make sure data in regions, if in wal only there is no data loss
1486       admin.flush(table);
1487       HRegionLocation location = tbl.getRegionLocation(Bytes.toBytes("B"));
1488 
1489       HRegionInfo hri = location.getRegionInfo();
1490 
1491       // do a regular split
1492       byte[] regionName = location.getRegionInfo().getRegionName();
1493       admin.splitRegion(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
1494       TestEndToEndSplitTransaction.blockUntilRegionSplit(conf, 60000, regionName, true);
1495 
1496       PairOfSameType<HRegionInfo> daughters =
1497           MetaTableAccessor.getDaughterRegions(meta.get(new Get(regionName)));
1498 
1499       // Delete daughter regions from meta, but not hdfs, unassign it.
1500       Map<HRegionInfo, ServerName> hris = tbl.getRegionLocations();
1501       undeployRegion(connection, hris.get(daughters.getFirst()), daughters.getFirst());
1502       undeployRegion(connection, hris.get(daughters.getSecond()), daughters.getSecond());
1503 
1504       List<Delete> deletes = new ArrayList<>();
1505       deletes.add(new Delete(daughters.getFirst().getRegionName()));
1506       deletes.add(new Delete(daughters.getSecond().getRegionName()));
1507       meta.delete(deletes);
1508 
1509       // Remove daughters from regionStates
1510       RegionStates regionStates = TEST_UTIL.getMiniHBaseCluster().getMaster().
1511           getAssignmentManager().getRegionStates();
1512       regionStates.deleteRegion(daughters.getFirst());
1513       regionStates.deleteRegion(daughters.getSecond());
1514 
1515       HBaseFsck hbck = doFsck(conf, false);
1516       assertErrors(hbck,
1517           new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1518               ERROR_CODE.HOLE_IN_REGION_CHAIN }); //no LINGERING_SPLIT_PARENT
1519 
1520       // now fix it. The fix should not revert the region split, but add daughters to META
1521       hbck = doFsck(
1522         conf, true, true, false, false, false, false, false, false, false, false, false, null);
1523       assertErrors(hbck,
1524           new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1525               ERROR_CODE.HOLE_IN_REGION_CHAIN });
1526 
1527       // assert that the split hbase:meta entry is still there.
1528       Get get = new Get(hri.getRegionName());
1529       Result result = meta.get(get);
1530       assertNotNull(result);
1531       assertNotNull(MetaTableAccessor.getHRegionInfo(result));
1532 
1533       assertEquals(ROWKEYS.length, countRows());
1534 
1535       // assert that we still have the split regions
1536       assertEquals(tbl.getStartKeys().length, SPLITS.length + 1 + 1); //SPLITS + 1 is # regions pre-split.
1537       assertNoErrors(doFsck(conf, false)); //should be fixed by now
1538     } finally {
1539       meta.close();
1540       cleanupTable(table);
1541     }
1542   }
1543 
1544   /**
1545    * This creates and fixes a bad table with a missing region which is the 1st region -- hole in
1546    * meta and data missing in the fs. hbck must report FIRST_REGION_STARTKEY_NOT_EMPTY before the fix.
1547    */
1548   @Test(timeout=120000)
1549   public void testMissingFirstRegion() throws Exception {
1550     TableName table = TableName.valueOf("testMissingFirstRegion");
1551     try {
1552       setupTable(table);
1553       assertEquals(ROWKEYS.length, countRows());
1554 
1555       // Mess it up by removing the first region ["", A) from the assignment, meta, and hdfs data
1556       admin.disableTable(table);
1557       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""), Bytes.toBytes("A"), true,
1558           true, true);
1559       admin.enableTable(table);
1560 
1561       HBaseFsck hbck = doFsck(conf, false);
1562       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY });
1563       // fix hole
1564       doFsck(conf, true);
1565       // check that hole fixed
1566       assertNoErrors(doFsck(conf, false));
1567     } finally {
1568       cleanupTable(table);
1569     }
1570   }
1571 
1572   /**
1573    * This creates and fixes a bad table with a missing region which is the 1st region -- hole in
1574    * meta and data missing in the fs.
1575    */
1576   @Test(timeout=120000)
1577   public void testRegionDeployedNotInHdfs() throws Exception {
1578     TableName table =
1579         TableName.valueOf("testSingleRegionDeployedNotInHdfs");
1580     try {
1581       setupTable(table);
1582       admin.flush(table);
1583 
1584       // Mess it up by deleting region dir
1585       deleteRegion(conf, tbl.getTableDescriptor(),
1586         HConstants.EMPTY_START_ROW, Bytes.toBytes("A"), false,
1587         false, true);
1588 
1589       HBaseFsck hbck = doFsck(conf, false);
1590       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_HDFS });
1591       // fix hole
1592       doFsck(conf, true);
1593       // check that hole fixed
1594       assertNoErrors(doFsck(conf, false));
1595     } finally {
1596       cleanupTable(table);
1597     }
1598   }
1599 
1600   /**
1601    * This creates and fixes a bad table with missing last region -- hole in meta and data missing in
1602    * the fs.
1603    */
1604   @Test(timeout=120000)
1605   public void testMissingLastRegion() throws Exception {
1606     TableName table =
1607         TableName.valueOf("testMissingLastRegion");
1608     try {
1609       setupTable(table);
1610       assertEquals(ROWKEYS.length, countRows());
1611 
1612       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1613       admin.disableTable(table);
1614       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"), Bytes.toBytes(""), true,
1615           true, true);
1616       admin.enableTable(table);
1617 
1618       HBaseFsck hbck = doFsck(conf, false);
1619       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY });
1620       // fix hole
1621       doFsck(conf, true);
1622       // check that hole fixed
1623       assertNoErrors(doFsck(conf, false));
1624     } finally {
1625       cleanupTable(table);
1626     }
1627   }
1628 
1629   /**
1630    * Test -noHdfsChecking option can detect and fix assignments issue.
1631    */
1632   @Test (timeout=180000)
1633   public void testFixAssignmentsAndNoHdfsChecking() throws Exception {
1634     TableName table =
1635         TableName.valueOf("testFixAssignmentsAndNoHdfsChecking");
1636     try {
1637       setupTable(table);
1638       assertEquals(ROWKEYS.length, countRows());
1639 
1640       // Mess it up by closing a region
1641       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1642         Bytes.toBytes("B"), true, false, false, false);
1643 
1644       // verify there is no other errors
1645       HBaseFsck hbck = doFsck(conf, false);
1646       assertErrors(hbck, new ERROR_CODE[] {
1647         ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1648 
1649       // verify that noHdfsChecking report the same errors
1650       HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
1651       fsck.connect();
1652       fsck.setDisplayFullReport(); // i.e. -details
1653       fsck.setTimeLag(0);
1654       fsck.setCheckHdfs(false);
1655       fsck.onlineHbck();
1656       assertErrors(fsck, new ERROR_CODE[] {
1657         ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1658       fsck.close();
1659 
1660       // verify that fixAssignments works fine with noHdfsChecking
1661       fsck = new HBaseFsck(conf, hbfsckExecutorService);
1662       fsck.connect();
1663       fsck.setDisplayFullReport(); // i.e. -details
1664       fsck.setTimeLag(0);
1665       fsck.setCheckHdfs(false);
1666       fsck.setFixAssignments(true);
1667       fsck.onlineHbck();
1668       assertTrue(fsck.shouldRerun());
1669       fsck.onlineHbck();
1670       assertNoErrors(fsck);
1671 
1672       assertEquals(ROWKEYS.length, countRows());
1673 
1674       fsck.close();
1675     } finally {
1676       cleanupTable(table);
1677     }
1678   }
1679 
1680   /**
1681    * Test -noHdfsChecking option can detect region is not in meta but deployed.
1682    * However, it can not fix it without checking Hdfs because we need to get
1683    * the region info from Hdfs in this case, then to patch the meta.
1684    */
1685   @Test (timeout=180000)
1686   public void testFixMetaNotWorkingWithNoHdfsChecking() throws Exception {
1687     TableName table =
1688         TableName.valueOf("testFixMetaNotWorkingWithNoHdfsChecking");
1689     try {
1690       setupTable(table);
1691       assertEquals(ROWKEYS.length, countRows());
1692 
1693       // Mess it up by deleting a region from the metadata
1694       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1695         Bytes.toBytes("B"), false, true, false, false);
1696 
1697       // verify there is no other errors
1698       HBaseFsck hbck = doFsck(conf, false);
1699       assertErrors(hbck,
1700           new ERROR_CODE[] { ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN });
1701 
1702       // verify that noHdfsChecking report the same errors
1703       HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
1704       fsck.connect();
1705       fsck.setDisplayFullReport(); // i.e. -details
1706       fsck.setTimeLag(0);
1707       fsck.setCheckHdfs(false);
1708       fsck.onlineHbck();
1709       assertErrors(fsck,
1710           new ERROR_CODE[] { ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN });
1711       fsck.close();
1712 
1713       // verify that fixMeta doesn't work with noHdfsChecking
1714       fsck = new HBaseFsck(conf, hbfsckExecutorService);
1715       fsck.connect();
1716       fsck.setDisplayFullReport(); // i.e. -details
1717       fsck.setTimeLag(0);
1718       fsck.setCheckHdfs(false);
1719       fsck.setFixAssignments(true);
1720       fsck.setFixMeta(true);
1721       fsck.onlineHbck();
1722       assertFalse(fsck.shouldRerun());
1723       assertErrors(fsck,
1724           new ERROR_CODE[] { ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN });
1725       fsck.close();
1726 
1727       // fix the cluster so other tests won't be impacted
1728       fsck = doFsck(conf, true);
1729       assertTrue(fsck.shouldRerun());
1730       fsck = doFsck(conf, true);
1731       assertNoErrors(fsck);
1732     } finally {
1733       cleanupTable(table);
1734     }
1735   }
1736 
1737   /**
1738    * Test -fixHdfsHoles doesn't work with -noHdfsChecking option,
1739    * and -noHdfsChecking can't detect orphan Hdfs region.
1740    */
1741   @Test (timeout=180000)
1742   public void testFixHdfsHolesNotWorkingWithNoHdfsChecking() throws Exception {
1743     TableName table =
1744         TableName.valueOf("testFixHdfsHolesNotWorkingWithNoHdfsChecking");
1745     try {
1746       setupTable(table);
1747       assertEquals(ROWKEYS.length, countRows());
1748 
1749       // Mess it up by creating an overlap in the metadata
1750       admin.disableTable(table);
1751       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1752         Bytes.toBytes("B"), true, true, false, true);
1753       admin.enableTable(table);
1754 
1755       HRegionInfo hriOverlap =
1756           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B"));
1757       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
1758       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
1759         .waitForAssignment(hriOverlap);
1760       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
1761       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
1762 
1763       HBaseFsck hbck = doFsck(conf, false);
1764       assertErrors(hbck, new ERROR_CODE[] {
1765         ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1766         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1767 
1768       // verify that noHdfsChecking can't detect ORPHAN_HDFS_REGION
1769       HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
1770       fsck.connect();
1771       fsck.setDisplayFullReport(); // i.e. -details
1772       fsck.setTimeLag(0);
1773       fsck.setCheckHdfs(false);
1774       fsck.onlineHbck();
1775       assertErrors(fsck, new ERROR_CODE[] {
1776         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1777       fsck.close();
1778 
1779       // verify that fixHdfsHoles doesn't work with noHdfsChecking
1780       fsck = new HBaseFsck(conf, hbfsckExecutorService);
1781       fsck.connect();
1782       fsck.setDisplayFullReport(); // i.e. -details
1783       fsck.setTimeLag(0);
1784       fsck.setCheckHdfs(false);
1785       fsck.setFixHdfsHoles(true);
1786       fsck.setFixHdfsOverlaps(true);
1787       fsck.setFixHdfsOrphans(true);
1788       fsck.onlineHbck();
1789       assertFalse(fsck.shouldRerun());
1790       assertErrors(fsck, new ERROR_CODE[] { ERROR_CODE.HOLE_IN_REGION_CHAIN});
1791       fsck.close();
1792     } finally {
1793       if (admin.isTableDisabled(table)) {
1794         admin.enableTable(table);
1795       }
1796       cleanupTable(table);
1797     }
1798   }
1799 
1800   /**
1801    * We don't have an easy way to verify that a flush completed, so we loop until we find a
1802    * legitimate hfile and return it.
1803    * @param fs
1804    * @param table
1805    * @return Path of a flushed hfile.
1806    * @throws IOException
1807    */
1808   Path getFlushedHFile(FileSystem fs, TableName table) throws IOException {
1809     Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
1810     Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
1811     Path famDir = new Path(regionDir, FAM_STR);
1812 
1813     // keep doing this until we get a legit hfile
1814     while (true) {
1815       FileStatus[] hfFss = fs.listStatus(famDir);
1816       if (hfFss.length == 0) {
1817         continue;
1818       }
1819       for (FileStatus hfs : hfFss) {
1820         if (!hfs.isDirectory()) {
1821           return hfs.getPath();
1822         }
1823       }
1824     }
1825   }
1826 
1827   /**
1828    * This creates a table and then corrupts an hfile.  Hbck should quarantine the file.
1829    */
1830   @Test(timeout=180000)
1831   public void testQuarantineCorruptHFile() throws Exception {
1832     TableName table = TableName.valueOf(name.getMethodName());
1833     try {
1834       setupTable(table);
1835       assertEquals(ROWKEYS.length, countRows());
1836       admin.flush(table); // flush is async.
1837 
1838       FileSystem fs = FileSystem.get(conf);
1839       Path hfile = getFlushedHFile(fs, table);
1840 
1841       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1842       admin.disableTable(table);
1843 
1844       // create new corrupt file called deadbeef (valid hfile name)
1845       Path corrupt = new Path(hfile.getParent(), "deadbeef");
1846       TestHFile.truncateFile(fs, hfile, corrupt);
1847       LOG.info("Created corrupted file " + corrupt);
1848       HBaseFsck.debugLsr(conf, FSUtils.getRootDir(conf));
1849 
1850       // we cannot enable here because enable never finished due to the corrupt region.
1851       HBaseFsck res = HbckTestingUtil.doHFileQuarantine(conf, table);
1852       assertEquals(res.getRetCode(), 0);
1853       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
1854       assertEquals(hfcc.getHFilesChecked(), 5);
1855       assertEquals(hfcc.getCorrupted().size(), 1);
1856       assertEquals(hfcc.getFailures().size(), 0);
1857       assertEquals(hfcc.getQuarantined().size(), 1);
1858       assertEquals(hfcc.getMissing().size(), 0);
1859 
1860       // Its been fixed, verify that we can enable.
1861       admin.enableTable(table);
1862     } finally {
1863       cleanupTable(table);
1864     }
1865   }
1866 
1867   /**
1868    * Test that use this should have a timeout, because this method could potentially wait forever.
1869   */
1870   private void doQuarantineTest(TableName table, HBaseFsck hbck, int check,
1871                                 int corrupt, int fail, int quar, int missing) throws Exception {
1872     try {
1873       setupTable(table);
1874       assertEquals(ROWKEYS.length, countRows());
1875       admin.flush(table); // flush is async.
1876 
1877       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1878       admin.disableTable(table);
1879 
1880       String[] args = {"-sidelineCorruptHFiles", "-repairHoles", "-ignorePreCheckPermission",
1881           table.getNameAsString()};
1882       HBaseFsck res = hbck.exec(hbfsckExecutorService, args);
1883 
1884       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
1885       assertEquals(hfcc.getHFilesChecked(), check);
1886       assertEquals(hfcc.getCorrupted().size(), corrupt);
1887       assertEquals(hfcc.getFailures().size(), fail);
1888       assertEquals(hfcc.getQuarantined().size(), quar);
1889       assertEquals(hfcc.getMissing().size(), missing);
1890 
1891       // its been fixed, verify that we can enable
1892       admin.enableTableAsync(table);
1893       while (!admin.isTableEnabled(table)) {
1894         try {
1895           Thread.sleep(250);
1896         } catch (InterruptedException e) {
1897           e.printStackTrace();
1898           fail("Interrupted when trying to enable table " + table);
1899         }
1900       }
1901     } finally {
1902       cleanupTable(table);
1903     }
1904   }
1905 
1906   /**
1907    * This creates a table and simulates the race situation where a concurrent compaction or split
1908    * has removed an hfile after the corruption checker learned about it.
1909    */
1910   @Test(timeout=180000)
1911   public void testQuarantineMissingHFile() throws Exception {
1912     TableName table = TableName.valueOf(name.getMethodName());
1913 
1914     // inject a fault in the hfcc created.
1915     final FileSystem fs = FileSystem.get(conf);
1916     HBaseFsck hbck = new HBaseFsck(conf, hbfsckExecutorService) {
1917       @Override
1918       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
1919         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1920           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
1921           @Override
1922           protected void checkHFile(Path p) throws IOException {
1923             if (attemptedFirstHFile.compareAndSet(false, true)) {
1924               assertTrue(fs.delete(p, true)); // make sure delete happened.
1925             }
1926             super.checkHFile(p);
1927           }
1928         };
1929       }
1930     };
1931     doQuarantineTest(table, hbck, 4, 0, 0, 0, 1); // 4 attempted, but 1 missing.
1932     hbck.close();
1933   }
1934 
1935   /**
1936    * This creates a table and simulates the race situation where a concurrent compaction or split
1937    * has removed an colfam dir before the corruption checker got to it.
1938    */
1939   // Disabled because fails sporadically.  Is this test right?  Timing-wise, there could be no
1940   // files in a column family on initial creation -- as suggested by Matteo.
1941   @Ignore @Test(timeout=180000)
1942   public void testQuarantineMissingFamdir() throws Exception {
1943     TableName table = TableName.valueOf(name.getMethodName());
1944     // inject a fault in the hfcc created.
1945     final FileSystem fs = FileSystem.get(conf);
1946     HBaseFsck hbck = new HBaseFsck(conf, hbfsckExecutorService) {
1947       @Override
1948       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
1949         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1950           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
1951           @Override
1952           protected void checkColFamDir(Path p) throws IOException {
1953             if (attemptedFirstHFile.compareAndSet(false, true)) {
1954               assertTrue(fs.delete(p, true)); // make sure delete happened.
1955             }
1956             super.checkColFamDir(p);
1957           }
1958         };
1959       }
1960     };
1961     doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
1962     hbck.close();
1963   }
1964 
1965   /**
1966    * This creates a table and simulates the race situation where a concurrent compaction or split
1967    * has removed a region dir before the corruption checker got to it.
1968    */
1969   @Test(timeout=180000)
1970   public void testQuarantineMissingRegionDir() throws Exception {
1971     TableName table = TableName.valueOf(name.getMethodName());
1972     // inject a fault in the hfcc created.
1973     final FileSystem fs = FileSystem.get(conf);
1974     HBaseFsck hbck = new HBaseFsck(conf, hbfsckExecutorService) {
1975       @Override
1976       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles)
1977       throws IOException {
1978         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1979           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
1980           @Override
1981           protected void checkRegionDir(Path p) throws IOException {
1982             if (attemptedFirstHFile.compareAndSet(false, true)) {
1983               assertTrue(fs.delete(p, true)); // make sure delete happened.
1984             }
1985             super.checkRegionDir(p);
1986           }
1987         };
1988       }
1989     };
1990     doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
1991     hbck.close();
1992   }
1993 
1994   /**
1995    * Test fixing lingering reference file.
1996    */
1997   @Test (timeout=180000)
1998   public void testLingeringReferenceFile() throws Exception {
1999     TableName table =
2000         TableName.valueOf("testLingeringReferenceFile");
2001     try {
2002       setupTable(table);
2003       assertEquals(ROWKEYS.length, countRows());
2004 
2005       // Mess it up by creating a fake reference file
2006       FileSystem fs = FileSystem.get(conf);
2007       Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
2008       Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
2009       Path famDir = new Path(regionDir, FAM_STR);
2010       Path fakeReferenceFile = new Path(famDir, "fbce357483ceea.12144538");
2011       fs.create(fakeReferenceFile);
2012 
2013       HBaseFsck hbck = doFsck(conf, false);
2014       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LINGERING_REFERENCE_HFILE });
2015       // fix reference file
2016       doFsck(conf, true);
2017       // check that reference file fixed
2018       assertNoErrors(doFsck(conf, false));
2019     } finally {
2020       cleanupTable(table);
2021     }
2022   }
2023 
2024   /**
2025    * Test mission REGIONINFO_QUALIFIER in hbase:meta
2026    */
2027   @Test (timeout=180000)
2028   public void testMissingRegionInfoQualifier() throws Exception {
2029     Connection connection = ConnectionFactory.createConnection(conf);
2030     TableName table = TableName.valueOf("testMissingRegionInfoQualifier");
2031     try {
2032       setupTable(table);
2033 
2034       // Mess it up by removing the RegionInfo for one region.
2035       final List<Delete> deletes = new LinkedList<Delete>();
2036       Table meta = connection.getTable(TableName.META_TABLE_NAME, hbfsckExecutorService);
2037       MetaScanner.metaScan(connection, new MetaScanner.MetaScannerVisitor() {
2038 
2039         @Override
2040         public boolean processRow(Result rowResult) throws IOException {
2041           HRegionInfo hri = MetaTableAccessor.getHRegionInfo(rowResult);
2042           if (hri != null && !hri.getTable().isSystemTable()) {
2043             Delete delete = new Delete(rowResult.getRow());
2044             delete.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
2045             deletes.add(delete);
2046           }
2047           return true;
2048         }
2049 
2050         @Override
2051         public void close() throws IOException {
2052         }
2053       });
2054       meta.delete(deletes);
2055 
2056       // Mess it up by creating a fake hbase:meta entry with no associated RegionInfo
2057       meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
2058         HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER, Bytes.toBytes("node1:60020")));
2059       meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
2060         HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER, Bytes.toBytes(1362150791183L)));
2061       meta.close();
2062 
2063       HBaseFsck hbck = doFsck(conf, false);
2064       assertTrue(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
2065 
2066       // fix reference file
2067       hbck = doFsck(conf, true);
2068 
2069       // check that reference file fixed
2070       assertFalse(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
2071     } finally {
2072       cleanupTable(table);
2073     }
2074     connection.close();
2075   }
2076 
2077   /**
2078    * Test pluggable error reporter. It can be plugged in
2079    * from system property or configuration.
2080    */
2081   @Test (timeout=180000)
2082   public void testErrorReporter() throws Exception {
2083     try {
2084       MockErrorReporter.calledCount = 0;
2085       doFsck(conf, false);
2086       assertEquals(MockErrorReporter.calledCount, 0);
2087 
2088       conf.set("hbasefsck.errorreporter", MockErrorReporter.class.getName());
2089       doFsck(conf, false);
2090       assertTrue(MockErrorReporter.calledCount > 20);
2091     } finally {
2092       conf.set("hbasefsck.errorreporter",
2093         PrintingErrorReporter.class.getName());
2094       MockErrorReporter.calledCount = 0;
2095     }
2096   }
2097 
2098   static class MockErrorReporter implements ErrorReporter {
2099     static int calledCount = 0;
2100 
2101     @Override
2102     public void clear() {
2103       calledCount++;
2104     }
2105 
2106     @Override
2107     public void report(String message) {
2108       calledCount++;
2109     }
2110 
2111     @Override
2112     public void reportError(String message) {
2113       calledCount++;
2114     }
2115 
2116     @Override
2117     public void reportError(ERROR_CODE errorCode, String message) {
2118       calledCount++;
2119     }
2120 
2121     @Override
2122     public void reportError(ERROR_CODE errorCode, String message, TableInfo table) {
2123       calledCount++;
2124     }
2125 
2126     @Override
2127     public void reportError(ERROR_CODE errorCode,
2128         String message, TableInfo table, HbckInfo info) {
2129       calledCount++;
2130     }
2131 
2132     @Override
2133     public void reportError(ERROR_CODE errorCode, String message,
2134         TableInfo table, HbckInfo info1, HbckInfo info2) {
2135       calledCount++;
2136     }
2137 
2138     @Override
2139     public int summarize() {
2140       return ++calledCount;
2141     }
2142 
2143     @Override
2144     public void detail(String details) {
2145       calledCount++;
2146     }
2147 
2148     @Override
2149     public ArrayList<ERROR_CODE> getErrorList() {
2150       calledCount++;
2151       return new ArrayList<ERROR_CODE>();
2152     }
2153 
2154     @Override
2155     public void progress() {
2156       calledCount++;
2157     }
2158 
2159     @Override
2160     public void print(String message) {
2161       calledCount++;
2162     }
2163 
2164     @Override
2165     public void resetErrors() {
2166       calledCount++;
2167     }
2168 
2169     @Override
2170     public boolean tableHasErrors(TableInfo table) {
2171       calledCount++;
2172       return false;
2173     }
2174   }
2175 
2176   @Test(timeout=180000)
2177   public void testCheckTableLocks() throws Exception {
2178     IncrementingEnvironmentEdge edge = new IncrementingEnvironmentEdge(0);
2179     EnvironmentEdgeManager.injectEdge(edge);
2180     // check no errors
2181     HBaseFsck hbck = doFsck(conf, false);
2182     assertNoErrors(hbck);
2183 
2184     ServerName mockName = ServerName.valueOf("localhost", 60000, 1);
2185 
2186     // obtain one lock
2187     final TableLockManager tableLockManager = TableLockManager.createTableLockManager(conf, TEST_UTIL.getZooKeeperWatcher(), mockName);
2188     TableLock writeLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2189         "testCheckTableLocks");
2190     writeLock.acquire();
2191     hbck = doFsck(conf, false);
2192     assertNoErrors(hbck); // should not have expired, no problems
2193 
2194     edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2195         TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2196 
2197     hbck = doFsck(conf, false);
2198     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK});
2199 
2200     final CountDownLatch latch = new CountDownLatch(1);
2201     new Thread() {
2202       @Override
2203       public void run() {
2204         TableLock readLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2205             "testCheckTableLocks");
2206         try {
2207           latch.countDown();
2208           readLock.acquire();
2209         } catch (IOException ex) {
2210           fail();
2211         } catch (IllegalStateException ex) {
2212           return; // expected, since this will be reaped under us.
2213         }
2214         fail("should not have come here");
2215       };
2216     }.start();
2217 
2218     latch.await(); // wait until thread starts
2219     Threads.sleep(300); // wait some more to ensure writeLock.acquire() is called
2220 
2221     hbck = doFsck(conf, false);
2222     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK}); // still one expired, one not-expired
2223 
2224     edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2225         TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2226 
2227     hbck = doFsck(conf, false);
2228     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK, ERROR_CODE.EXPIRED_TABLE_LOCK}); // both are expired
2229 
2230     conf.setLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT, 1); // reaping from ZKInterProcessWriteLock uses znode cTime,
2231                                                                  // which is not injectable through EnvironmentEdge
2232     Threads.sleep(10);
2233     hbck = doFsck(conf, true); // now fix both cases
2234 
2235     hbck = doFsck(conf, false);
2236     assertNoErrors(hbck);
2237 
2238     // ensure that locks are deleted
2239     writeLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2240         "should acquire without blocking");
2241     writeLock.acquire(); // this should not block.
2242     writeLock.release(); // release for clean state
2243   }
2244 
2245   /**
2246    * Test orphaned table ZNode (for table states)
2247    */
2248   @Test
2249   public void testOrphanedTableZNode() throws Exception {
2250     TableName table = TableName.valueOf("testOrphanedZKTableEntry");
2251 
2252     try {
2253       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager().getTableStateManager()
2254       .setTableState(table, ZooKeeperProtos.Table.State.ENABLING);
2255 
2256       try {
2257         setupTable(table);
2258         Assert.fail(
2259           "Create table should fail when its ZNode has already existed with ENABLING state.");
2260       } catch(TableExistsException t) {
2261         //Expected exception
2262       }
2263       // The setup table was interrupted in some state that needs to some cleanup.
2264       try {
2265         cleanupTable(table);
2266       } catch (IOException e) {
2267         // Because create table failed, it is expected that the cleanup table would
2268         // throw some exception.  Ignore and continue.
2269       }
2270 
2271       HBaseFsck hbck = doFsck(conf, false);
2272       assertTrue(hbck.getErrors().getErrorList().contains(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY));
2273 
2274       // fix the orphaned ZK entry
2275       hbck = doFsck(conf, true);
2276 
2277       // check that orpahned ZK table entry is gone.
2278       hbck = doFsck(conf, false);
2279       assertFalse(hbck.getErrors().getErrorList().contains(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY));
2280       // Now create table should succeed.
2281       setupTable(table);
2282     } finally {
2283       // This code could be called that either a table was created successfully or set up
2284       // table failed in some unknown state.  Therefore, clean up can either succeed or fail.
2285       try {
2286         cleanupTable(table);
2287       } catch (IOException e) {
2288         // The cleanup table would throw some exception if create table failed in some state.
2289         // Ignore this exception
2290       }
2291     }
2292   }
2293 
2294   @Test (timeout=180000)
2295   public void testMetaOffline() throws Exception {
2296     // check no errors
2297     HBaseFsck hbck = doFsck(conf, false);
2298     assertNoErrors(hbck);
2299     deleteMetaRegion(conf, true, false, false);
2300     hbck = doFsck(conf, false);
2301     // ERROR_CODE.UNKNOWN is coming because we reportError with a message for the hbase:meta
2302     // inconsistency and whether we will be fixing it or not.
2303     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
2304     hbck = doFsck(conf, true);
2305     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
2306     hbck = doFsck(conf, false);
2307     assertNoErrors(hbck);
2308   }
2309 
2310   private void deleteMetaRegion(Configuration conf, boolean unassign, boolean hdfs,
2311       boolean regionInfoOnly) throws IOException, InterruptedException {
2312     HRegionLocation metaLocation = connection.getRegionLocator(TableName.META_TABLE_NAME)
2313         .getRegionLocation(HConstants.EMPTY_START_ROW);
2314     ServerName hsa = metaLocation.getServerName();
2315     HRegionInfo hri = metaLocation.getRegionInfo();
2316     if (unassign) {
2317       LOG.info("Undeploying meta region " + hri + " from server " + hsa);
2318       try (Connection unmanagedConnection = ConnectionFactory.createConnection(conf)) {
2319         undeployRegion(unmanagedConnection, hsa, hri);
2320       }
2321     }
2322 
2323     if (regionInfoOnly) {
2324       LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
2325       Path rootDir = FSUtils.getRootDir(conf);
2326       FileSystem fs = rootDir.getFileSystem(conf);
2327       Path p = new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(),
2328           hri.getEncodedName());
2329       Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
2330       fs.delete(hriPath, true);
2331     }
2332 
2333     if (hdfs) {
2334       LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
2335       Path rootDir = FSUtils.getRootDir(conf);
2336       FileSystem fs = rootDir.getFileSystem(conf);
2337       Path p = new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(),
2338           hri.getEncodedName());
2339       HBaseFsck.debugLsr(conf, p);
2340       boolean success = fs.delete(p, true);
2341       LOG.info("Deleted " + p + " sucessfully? " + success);
2342       HBaseFsck.debugLsr(conf, p);
2343     }
2344   }
2345 
2346   @Test (timeout=180000)
2347   public void testTableWithNoRegions() throws Exception {
2348     // We might end up with empty regions in a table
2349     // see also testNoHdfsTable()
2350     TableName table =
2351         TableName.valueOf(name.getMethodName());
2352     try {
2353       // create table with one region
2354       HTableDescriptor desc = new HTableDescriptor(table);
2355       HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
2356       desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
2357       admin.createTable(desc);
2358       tbl = (HTable) connection.getTable(table, tableExecutorService);
2359 
2360       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
2361       deleteRegion(conf, tbl.getTableDescriptor(), HConstants.EMPTY_START_ROW,
2362           HConstants.EMPTY_END_ROW, false, false, true);
2363 
2364       HBaseFsck hbck = doFsck(conf, false);
2365       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_HDFS });
2366 
2367       doFsck(conf, true);
2368 
2369       // fix hole
2370       doFsck(conf, true);
2371 
2372       // check that hole fixed
2373       assertNoErrors(doFsck(conf, false));
2374     } finally {
2375       cleanupTable(table);
2376     }
2377 
2378   }
2379 
2380   @Test (timeout=180000)
2381   public void testHbckAfterRegionMerge() throws Exception {
2382     TableName table = TableName.valueOf("testMergeRegionFilesInHdfs");
2383     Table meta = null;
2384     try {
2385       // disable CatalogJanitor
2386       TEST_UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(false);
2387       setupTable(table);
2388       assertEquals(ROWKEYS.length, countRows());
2389 
2390       // make sure data in regions, if in wal only there is no data loss
2391       admin.flush(table);
2392       HRegionInfo region1 = tbl.getRegionLocation(Bytes.toBytes("A")).getRegionInfo();
2393       HRegionInfo region2 = tbl.getRegionLocation(Bytes.toBytes("B")).getRegionInfo();
2394 
2395       int regionCountBeforeMerge = tbl.getRegionLocations().size();
2396 
2397       assertNotEquals(region1, region2);
2398 
2399       // do a region merge
2400       admin.mergeRegions(region1.getEncodedNameAsBytes(),
2401           region2.getEncodedNameAsBytes(), false);
2402 
2403       // wait until region merged
2404       long timeout = System.currentTimeMillis() + 30 * 1000;
2405       while (true) {
2406         if (tbl.getRegionLocations().size() < regionCountBeforeMerge) {
2407           break;
2408         } else if (System.currentTimeMillis() > timeout) {
2409           fail("Time out waiting on region " + region1.getEncodedName()
2410               + " and " + region2.getEncodedName() + " be merged");
2411         }
2412         Thread.sleep(10);
2413       }
2414 
2415       assertEquals(ROWKEYS.length, countRows());
2416 
2417       HBaseFsck hbck = doFsck(conf, false);
2418       assertNoErrors(hbck); // no errors
2419 
2420     } finally {
2421       TEST_UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(true);
2422       cleanupTable(table);
2423       IOUtils.closeQuietly(meta);
2424     }
2425   }
2426 
2427   @Test (timeout = 180000)
2428   public void testRegionBoundariesCheck() throws Exception {
2429     HBaseFsck hbck = doFsck(conf, false);
2430     assertNoErrors(hbck); // no errors
2431     try {
2432       hbck.checkRegionBoundaries();
2433     } catch (IllegalArgumentException e) {
2434       if (e.getMessage().endsWith("not a valid DFS filename.")) {
2435         fail("Table directory path is not valid." + e.getMessage());
2436       }
2437     }
2438   }
2439 
2440   @org.junit.Rule
2441   public TestName name = new TestName();
2442 
2443   @Test (timeout=180000)
2444   public void testReadOnlyProperty() throws Exception {
2445     HBaseFsck hbck = doFsck(conf, false);
2446     Assert.assertEquals("shouldIgnorePreCheckPermission", true,
2447       hbck.shouldIgnorePreCheckPermission());
2448 
2449     hbck = doFsck(conf, true);
2450     Assert.assertEquals("shouldIgnorePreCheckPermission", false,
2451       hbck.shouldIgnorePreCheckPermission());
2452 
2453     hbck = doFsck(conf, true);
2454     hbck.setIgnorePreCheckPermission(true);
2455     Assert.assertEquals("shouldIgnorePreCheckPermission", true,
2456       hbck.shouldIgnorePreCheckPermission());
2457   }
2458 
2459   @Test (timeout=180000)
2460   public void testCleanUpDaughtersNotInMetaAfterFailedSplit() throws Exception {
2461     TableName table = TableName.valueOf("testCleanUpDaughtersNotInMetaAfterFailedSplit");
2462     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
2463     try {
2464       HTableDescriptor desc = new HTableDescriptor(table);
2465       desc.addFamily(new HColumnDescriptor(Bytes.toBytes("f")));
2466       admin.createTable(desc);
2467       tbl = new HTable(cluster.getConfiguration(), desc.getTableName());
2468       for (int i = 0; i < 5; i++) {
2469         Put p1 = new Put(("r" + i).getBytes());
2470         p1.add(Bytes.toBytes("f"), "q1".getBytes(), "v".getBytes());
2471         tbl.put(p1);
2472       }
2473       admin.flush(desc.getTableName());
2474       List<HRegion> regions = cluster.getRegions(desc.getTableName());
2475       int serverWith = cluster.getServerWith(regions.get(0).getRegionName());
2476       HRegionServer regionServer = cluster.getRegionServer(serverWith);
2477       cluster.getServerWith(regions.get(0).getRegionName());
2478       SplitTransaction st = new SplitTransaction(regions.get(0), Bytes.toBytes("r3"));
2479       st.prepare();
2480       st.stepsBeforePONR(regionServer, regionServer, false);
2481       AssignmentManager am = cluster.getMaster().getAssignmentManager();
2482       Map<String, RegionState> regionsInTransition = am.getRegionStates().getRegionsInTransition();
2483       for (RegionState state : regionsInTransition.values()) {
2484         am.regionOffline(state.getRegion());
2485       }
2486       ZKAssign.deleteNodeFailSilent(regionServer.getZooKeeper(), regions.get(0).getRegionInfo());
2487       Map<HRegionInfo, ServerName> regionsMap = new HashMap<HRegionInfo, ServerName>();
2488       regionsMap.put(regions.get(0).getRegionInfo(), regionServer.getServerName());
2489       am.assign(regionsMap);
2490       am.waitForAssignment(regions.get(0).getRegionInfo());
2491       HBaseFsck hbck = doFsck(conf, false);
2492       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
2493           ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
2494       // holes are separate from overlap groups
2495       assertEquals(0, hbck.getOverlapGroups(table).size());
2496 
2497       // fix hole
2498       assertErrors(
2499         doFsck(
2500           conf, false, true, false, false, false, false, false, false, false, false, false, null),
2501         new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
2502           ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
2503 
2504       // check that hole fixed
2505       assertNoErrors(doFsck(conf, false));
2506       assertEquals(5, countRows());
2507     } finally {
2508       if (tbl != null) {
2509         tbl.close();
2510         tbl = null;
2511       }
2512       cleanupTable(table);
2513     }
2514   }
2515 
2516 }