View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.util;
20  
21  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertErrors;
22  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertNoErrors;
23  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.doFsck;
24  import static org.junit.Assert.assertEquals;
25  import static org.junit.Assert.assertFalse;
26  import static org.junit.Assert.assertNotNull;
27  import static org.junit.Assert.assertTrue;
28  import static org.junit.Assert.fail;
29  
30  import java.io.IOException;
31  import java.util.ArrayList;
32  import java.util.Collection;
33  import java.util.HashMap;
34  import java.util.LinkedList;
35  import java.util.List;
36  import java.util.Map;
37  import java.util.Map.Entry;
38  import java.util.concurrent.CountDownLatch;
39  import java.util.concurrent.ExecutorService;
40  import java.util.concurrent.ScheduledThreadPoolExecutor;
41  import java.util.concurrent.SynchronousQueue;
42  import java.util.concurrent.ThreadPoolExecutor;
43  import java.util.concurrent.TimeUnit;
44  
45  import org.apache.commons.io.IOUtils;
46  import org.apache.commons.logging.Log;
47  import org.apache.commons.logging.LogFactory;
48  import org.apache.hadoop.conf.Configuration;
49  import org.apache.hadoop.fs.FileStatus;
50  import org.apache.hadoop.fs.FileSystem;
51  import org.apache.hadoop.fs.Path;
52  import org.apache.hadoop.hbase.ClusterStatus;
53  import org.apache.hadoop.hbase.TableName;
54  import org.apache.hadoop.hbase.HBaseTestingUtility;
55  import org.apache.hadoop.hbase.HColumnDescriptor;
56  import org.apache.hadoop.hbase.HConstants;
57  import org.apache.hadoop.hbase.HRegionInfo;
58  import org.apache.hadoop.hbase.HRegionLocation;
59  import org.apache.hadoop.hbase.HTableDescriptor;
60  import org.apache.hadoop.hbase.LargeTests;
61  import org.apache.hadoop.hbase.MiniHBaseCluster;
62  import org.apache.hadoop.hbase.ServerName;
63  import org.apache.hadoop.hbase.catalog.MetaEditor;
64  import org.apache.hadoop.hbase.client.Delete;
65  import org.apache.hadoop.hbase.client.Durability;
66  import org.apache.hadoop.hbase.client.Get;
67  import org.apache.hadoop.hbase.client.HBaseAdmin;
68  import org.apache.hadoop.hbase.client.HConnection;
69  import org.apache.hadoop.hbase.client.HConnectionManager;
70  import org.apache.hadoop.hbase.client.HTable;
71  import org.apache.hadoop.hbase.client.MetaScanner;
72  import org.apache.hadoop.hbase.client.Put;
73  import org.apache.hadoop.hbase.client.Result;
74  import org.apache.hadoop.hbase.client.ResultScanner;
75  import org.apache.hadoop.hbase.client.Scan;
76  import org.apache.hadoop.hbase.io.hfile.TestHFile;
77  import org.apache.hadoop.hbase.master.AssignmentManager;
78  import org.apache.hadoop.hbase.master.HMaster;
79  import org.apache.hadoop.hbase.master.RegionStates;
80  import org.apache.hadoop.hbase.master.TableLockManager;
81  import org.apache.hadoop.hbase.master.TableLockManager.TableLock;
82  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
83  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
84  import org.apache.hadoop.hbase.regionserver.HRegion;
85  import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
86  import org.apache.hadoop.hbase.regionserver.HRegionServer;
87  import org.apache.hadoop.hbase.regionserver.TestEndToEndSplitTransaction;
88  import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter;
89  import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
90  import org.apache.hadoop.hbase.util.HBaseFsck.HbckInfo;
91  import org.apache.hadoop.hbase.util.HBaseFsck.PrintingErrorReporter;
92  import org.apache.hadoop.hbase.util.HBaseFsck.TableInfo;
93  import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
94  import org.apache.hadoop.hbase.util.hbck.HbckTestingUtil;
95  import org.apache.zookeeper.KeeperException;
96  import org.junit.AfterClass;
97  import org.junit.BeforeClass;
98  import org.junit.Ignore;
99  import org.junit.Test;
100 import org.junit.experimental.categories.Category;
101 import org.junit.rules.TestName;
102 
103 import com.google.common.collect.Multimap;
104 
105 /**
106  * This tests HBaseFsck's ability to detect reasons for inconsistent tables.
107  */
108 @Category(LargeTests.class)
109 public class TestHBaseFsck {
110   final static Log LOG = LogFactory.getLog(TestHBaseFsck.class);
111   private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
112   private final static Configuration conf = TEST_UTIL.getConfiguration();
113   private final static String FAM_STR = "fam";
114   private final static byte[] FAM = Bytes.toBytes(FAM_STR);
115   private final static int REGION_ONLINE_TIMEOUT = 800;
116   private static RegionStates regionStates;
117   private static ExecutorService executorService;
118 
119   // for the instance, reset every test run
120   private HTable tbl;
121   private final static byte[][] SPLITS = new byte[][] { Bytes.toBytes("A"),
122     Bytes.toBytes("B"), Bytes.toBytes("C") };
123   // one row per region.
124   private final static byte[][] ROWKEYS= new byte[][] {
125     Bytes.toBytes("00"), Bytes.toBytes("50"), Bytes.toBytes("A0"), Bytes.toBytes("A5"),
126     Bytes.toBytes("B0"), Bytes.toBytes("B5"), Bytes.toBytes("C0"), Bytes.toBytes("C5") };
127 
128   @BeforeClass
129   public static void setUpBeforeClass() throws Exception {
130     TEST_UTIL.getConfiguration().setInt("hbase.regionserver.handler.count", 2);
131     TEST_UTIL.getConfiguration().setInt("hbase.regionserver.metahandler.count", 2);
132     TEST_UTIL.startMiniCluster(3);
133 
134     executorService = new ThreadPoolExecutor(1, Integer.MAX_VALUE, 60, TimeUnit.SECONDS,
135         new SynchronousQueue<Runnable>(), Threads.newDaemonThreadFactory("testhbck"));
136 
137     AssignmentManager assignmentManager =
138       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager();
139     regionStates = assignmentManager.getRegionStates();
140   }
141 
142   @AfterClass
143   public static void tearDownAfterClass() throws Exception {
144     TEST_UTIL.shutdownMiniCluster();
145   }
146 
147   @Test
148   public void testHBaseFsck() throws Exception {
149     assertNoErrors(doFsck(conf, false));
150     String table = "tableBadMetaAssign";
151     TEST_UTIL.createTable(Bytes.toBytes(table), FAM);
152 
153     // We created 1 table, should be fine
154     assertNoErrors(doFsck(conf, false));
155 
156     // Now let's mess it up and change the assignment in .META. to
157     // point to a different region server
158     HTable meta = new HTable(conf, HTableDescriptor.META_TABLEDESC.getTableName(),
159         executorService);
160     Scan scan = new Scan();
161     scan.setStartRow(Bytes.toBytes(table+",,"));
162     ResultScanner scanner = meta.getScanner(scan);
163     HRegionInfo hri = null;
164 
165     Result res = scanner.next();
166     ServerName currServer =
167       ServerName.parseFrom(res.getValue(HConstants.CATALOG_FAMILY,
168           HConstants.SERVER_QUALIFIER));
169     long startCode = Bytes.toLong(res.getValue(HConstants.CATALOG_FAMILY,
170         HConstants.STARTCODE_QUALIFIER));
171 
172     for (JVMClusterUtil.RegionServerThread rs :
173         TEST_UTIL.getHBaseCluster().getRegionServerThreads()) {
174 
175       ServerName sn = rs.getRegionServer().getServerName();
176 
177       // When we find a diff RS, change the assignment and break
178       if (!currServer.getHostAndPort().equals(sn.getHostAndPort()) ||
179           startCode != sn.getStartcode()) {
180         Put put = new Put(res.getRow());
181         put.setDurability(Durability.SKIP_WAL);
182         put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
183           Bytes.toBytes(sn.getHostAndPort()));
184         put.add(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER,
185           Bytes.toBytes(sn.getStartcode()));
186         meta.put(put);
187         hri = HRegionInfo.getHRegionInfo(res);
188         break;
189       }
190     }
191 
192     // Try to fix the data
193     assertErrors(doFsck(conf, true), new ERROR_CODE[]{
194         ERROR_CODE.SERVER_DOES_NOT_MATCH_META});
195 
196     TEST_UTIL.getHBaseCluster().getMaster()
197       .getAssignmentManager().waitForAssignment(hri);
198 
199     // Should be fixed now
200     assertNoErrors(doFsck(conf, false));
201 
202     // comment needed - what is the purpose of this line
203     HTable t = new HTable(conf, Bytes.toBytes(table), executorService);
204     ResultScanner s = t.getScanner(new Scan());
205     s.close();
206     t.close();
207 
208     scanner.close();
209     meta.close();
210   }
211 
212   /**
213    * Create a new region in META.
214    */
215   private HRegionInfo createRegion(Configuration conf, final HTableDescriptor
216       htd, byte[] startKey, byte[] endKey)
217       throws IOException {
218     HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
219     HRegionInfo hri = new HRegionInfo(htd.getTableName(), startKey, endKey);
220     MetaEditor.addRegionToMeta(meta, hri);
221     meta.close();
222     return hri;
223   }
224 
225   /**
226    * Debugging method to dump the contents of meta.
227    */
228   private void dumpMeta(TableName tableName) throws IOException {
229     List<byte[]> metaRows = TEST_UTIL.getMetaTableRows(tableName);
230     for (byte[] row : metaRows) {
231       LOG.info(Bytes.toString(row));
232     }
233   }
234 
235   /**
236    * This method is used to undeploy a region -- close it and attempt to
237    * remove its state from the Master.
238    */
239   private void undeployRegion(HBaseAdmin admin, ServerName sn,
240       HRegionInfo hri) throws IOException, InterruptedException {
241     try {
242       HBaseFsckRepair.closeRegionSilentlyAndWait(admin, sn, hri);
243       if (!hri.isMetaTable()) {
244         admin.offline(hri.getRegionName());
245       }
246     } catch (IOException ioe) {
247       LOG.warn("Got exception when attempting to offline region "
248           + Bytes.toString(hri.getRegionName()), ioe);
249     }
250   }
251   /**
252    * Delete a region from assignments, meta, or completely from hdfs.
253    * @param unassign if true unassign region if assigned
254    * @param metaRow  if true remove region's row from META
255    * @param hdfs if true remove region's dir in HDFS
256    */
257   private void deleteRegion(Configuration conf, final HTableDescriptor htd,
258       byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
259       boolean hdfs) throws IOException, InterruptedException {
260     deleteRegion(conf, htd, startKey, endKey, unassign, metaRow, hdfs, false);
261   }
262 
263   /**
264    * Delete a region from assignments, meta, or completely from hdfs.
265    * @param unassign if true unassign region if assigned
266    * @param metaRow  if true remove region's row from META
267    * @param hdfs if true remove region's dir in HDFS
268    * @param regionInfoOnly if true remove a region dir's .regioninfo file
269    */
270   private void deleteRegion(Configuration conf, final HTableDescriptor htd,
271       byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
272       boolean hdfs, boolean regionInfoOnly) throws IOException, InterruptedException {
273     LOG.info("** Before delete:");
274     dumpMeta(htd.getTableName());
275 
276     Map<HRegionInfo, ServerName> hris = tbl.getRegionLocations();
277     for (Entry<HRegionInfo, ServerName> e: hris.entrySet()) {
278       HRegionInfo hri = e.getKey();
279       ServerName hsa = e.getValue();
280       if (Bytes.compareTo(hri.getStartKey(), startKey) == 0
281           && Bytes.compareTo(hri.getEndKey(), endKey) == 0) {
282 
283         LOG.info("RegionName: " +hri.getRegionNameAsString());
284         byte[] deleteRow = hri.getRegionName();
285 
286         if (unassign) {
287           LOG.info("Undeploying region " + hri + " from server " + hsa);
288           undeployRegion(new HBaseAdmin(conf), hsa, hri);
289         }
290 
291         if (regionInfoOnly) {
292           LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
293           Path rootDir = FSUtils.getRootDir(conf);
294           FileSystem fs = rootDir.getFileSystem(conf);
295           Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
296               hri.getEncodedName());
297           Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
298           fs.delete(hriPath, true);
299         }
300 
301         if (hdfs) {
302           LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
303           Path rootDir = FSUtils.getRootDir(conf);
304           FileSystem fs = rootDir.getFileSystem(conf);
305           Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
306               hri.getEncodedName());
307           HBaseFsck.debugLsr(conf, p);
308           boolean success = fs.delete(p, true);
309           LOG.info("Deleted " + p + " sucessfully? " + success);
310           HBaseFsck.debugLsr(conf, p);
311         }
312 
313         if (metaRow) {
314           HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
315           Delete delete = new Delete(deleteRow);
316           meta.delete(delete);
317         }
318       }
319       LOG.info(hri.toString() + hsa.toString());
320     }
321 
322     TEST_UTIL.getMetaTableRows(htd.getTableName());
323     LOG.info("*** After delete:");
324     dumpMeta(htd.getTableName());
325   }
326 
327   /**
328    * Setup a clean table before we start mucking with it.
329    *
330    * @throws IOException
331    * @throws InterruptedException
332    * @throws KeeperException
333    */
334   HTable setupTable(TableName tablename) throws Exception {
335     HTableDescriptor desc = new HTableDescriptor(tablename);
336     HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
337     desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
338     TEST_UTIL.getHBaseAdmin().createTable(desc, SPLITS);
339     tbl = new HTable(TEST_UTIL.getConfiguration(), tablename, executorService);
340 
341     List<Put> puts = new ArrayList<Put>();
342     for (byte[] row : ROWKEYS) {
343       Put p = new Put(row);
344       p.add(FAM, Bytes.toBytes("val"), row);
345       puts.add(p);
346     }
347     tbl.put(puts);
348     tbl.flushCommits();
349     return tbl;
350   }
351 
352   /**
353    * Counts the number of row to verify data loss or non-dataloss.
354    */
355   int countRows() throws IOException {
356      Scan s = new Scan();
357      ResultScanner rs = tbl.getScanner(s);
358      int i = 0;
359      while(rs.next() !=null) {
360        i++;
361      }
362      return i;
363   }
364 
365   /**
366    * delete table in preparation for next test
367    *
368    * @param tablename
369    * @throws IOException
370    */
371   void deleteTable(TableName tablename) throws IOException {
372     HBaseAdmin admin = new HBaseAdmin(conf);
373     admin.getConnection().clearRegionCache();
374     admin.disableTableAsync(tablename);
375     long totalWait = 0;
376     long maxWait = 30*1000;
377     long sleepTime = 250;
378     while (!admin.isTableDisabled(tablename)) {
379       try {
380         Thread.sleep(sleepTime);
381         totalWait += sleepTime;
382         if (totalWait >= maxWait) {
383           fail("Waited too long for table to be disabled + " + tablename);
384         }
385       } catch (InterruptedException e) {
386         e.printStackTrace();
387         fail("Interrupted when trying to disable table " + tablename);
388       }
389     }
390     admin.deleteTable(tablename);
391   }
392 
393   /**
394    * This creates a clean table and confirms that the table is clean.
395    */
396   @Test
397   public void testHBaseFsckClean() throws Exception {
398     assertNoErrors(doFsck(conf, false));
399     TableName table = TableName.valueOf("tableClean");
400     try {
401       HBaseFsck hbck = doFsck(conf, false);
402       assertNoErrors(hbck);
403 
404       setupTable(table);
405       assertEquals(ROWKEYS.length, countRows());
406 
407       // We created 1 table, should be fine
408       hbck = doFsck(conf, false);
409       assertNoErrors(hbck);
410       assertEquals(0, hbck.getOverlapGroups(table).size());
411       assertEquals(ROWKEYS.length, countRows());
412     } finally {
413       deleteTable(table);
414     }
415   }
416 
417   /**
418    * Test thread pooling in the case where there are more regions than threads
419    */
420   @Test
421   public void testHbckThreadpooling() throws Exception {
422     TableName table =
423         TableName.valueOf("tableDupeStartKey");
424     try {
425       // Create table with 4 regions
426       setupTable(table);
427 
428       // limit number of threads to 1.
429       Configuration newconf = new Configuration(conf);
430       newconf.setInt("hbasefsck.numthreads", 1);
431       assertNoErrors(doFsck(newconf, false));
432 
433       // We should pass without triggering a RejectedExecutionException
434     } finally {
435       deleteTable(table);
436     }
437   }
438 
439   @Test
440   public void testHbckFixOrphanTable() throws Exception {
441     TableName table = TableName.valueOf("tableInfo");
442     FileSystem fs = null;
443     Path tableinfo = null;
444     try {
445       setupTable(table);
446       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
447 
448       Path hbaseTableDir = FSUtils.getTableDir(
449           FSUtils.getRootDir(conf), table);
450       fs = hbaseTableDir.getFileSystem(conf);
451       FileStatus status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
452       tableinfo = status.getPath();
453       fs.rename(tableinfo, new Path("/.tableinfo"));
454 
455       //to report error if .tableinfo is missing.
456       HBaseFsck hbck = doFsck(conf, false);
457       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_TABLEINFO_FILE });
458 
459       // fix OrphanTable with default .tableinfo (htd not yet cached on master)
460       hbck = doFsck(conf, true);
461       assertNoErrors(hbck);
462       status = null;
463       status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
464       assertNotNull(status);
465 
466       HTableDescriptor htd = admin.getTableDescriptor(table);
467       htd.setValue("NOT_DEFAULT", "true");
468       admin.disableTable(table);
469       admin.modifyTable(table, htd);
470       admin.enableTable(table);
471       fs.delete(status.getPath(), true);
472 
473       // fix OrphanTable with cache
474       htd = admin.getTableDescriptor(table); // warms up cached htd on master
475       hbck = doFsck(conf, true);
476       assertNoErrors(hbck);
477       status = null;
478       status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
479       assertNotNull(status);
480       htd = admin.getTableDescriptor(table);
481       assertEquals(htd.getValue("NOT_DEFAULT"), "true");
482     } finally {
483       fs.rename(new Path("/.tableinfo"), tableinfo);
484       deleteTable(table);
485     }
486   }
487 
488   /**
489    * This create and fixes a bad table with regions that have a duplicate
490    * start key
491    */
492   @Test
493   public void testDupeStartKey() throws Exception {
494     TableName table =
495         TableName.valueOf("tableDupeStartKey");
496     try {
497       setupTable(table);
498       assertNoErrors(doFsck(conf, false));
499       assertEquals(ROWKEYS.length, countRows());
500 
501       // Now let's mess it up, by adding a region with a duplicate startkey
502       HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
503           Bytes.toBytes("A"), Bytes.toBytes("A2"));
504       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
505       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
506           .waitForAssignment(hriDupe);
507       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
508       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
509 
510       HBaseFsck hbck = doFsck(conf, false);
511       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
512             ERROR_CODE.DUPE_STARTKEYS});
513       assertEquals(2, hbck.getOverlapGroups(table).size());
514       assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.
515 
516       // fix the degenerate region.
517       doFsck(conf,true);
518 
519       // check that the degenerate region is gone and no data loss
520       HBaseFsck hbck2 = doFsck(conf,false);
521       assertNoErrors(hbck2);
522       assertEquals(0, hbck2.getOverlapGroups(table).size());
523       assertEquals(ROWKEYS.length, countRows());
524     } finally {
525       deleteTable(table);
526     }
527   }
528 
529   /**
530    * Get region info from local cluster.
531    */
532   Map<ServerName, List<String>> getDeployedHRIs(
533       final HBaseAdmin admin) throws IOException {
534     ClusterStatus status = admin.getClusterStatus();
535     Collection<ServerName> regionServers = status.getServers();
536     Map<ServerName, List<String>> mm =
537         new HashMap<ServerName, List<String>>();
538     HConnection connection = admin.getConnection();
539     for (ServerName hsi : regionServers) {
540       AdminProtos.AdminService.BlockingInterface server = connection.getAdmin(hsi);
541 
542       // list all online regions from this region server
543       List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(server);
544       List<String> regionNames = new ArrayList<String>();
545       for (HRegionInfo hri : regions) {
546         regionNames.add(hri.getRegionNameAsString());
547       }
548       mm.put(hsi, regionNames);
549     }
550     return mm;
551   }
552 
553   /**
554    * Returns the HSI a region info is on.
555    */
556   ServerName findDeployedHSI(Map<ServerName, List<String>> mm, HRegionInfo hri) {
557     for (Map.Entry<ServerName,List <String>> e : mm.entrySet()) {
558       if (e.getValue().contains(hri.getRegionNameAsString())) {
559         return e.getKey();
560       }
561     }
562     return null;
563   }
564 
565   /**
566    * This create and fixes a bad table with regions that have a duplicate
567    * start key
568    */
569   @Test
570   public void testDupeRegion() throws Exception {
571     TableName table =
572         TableName.valueOf("tableDupeRegion");
573     try {
574       setupTable(table);
575       assertNoErrors(doFsck(conf, false));
576       assertEquals(ROWKEYS.length, countRows());
577 
578       // Now let's mess it up, by adding a region with a duplicate startkey
579       HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
580           Bytes.toBytes("A"), Bytes.toBytes("B"));
581 
582       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
583       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
584           .waitForAssignment(hriDupe);
585       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
586       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
587 
588       // Yikes! The assignment manager can't tell between diff between two
589       // different regions with the same start/endkeys since it doesn't
590       // differentiate on ts/regionId!  We actually need to recheck
591       // deployments!
592       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
593       while (findDeployedHSI(getDeployedHRIs(admin), hriDupe) == null) {
594         Thread.sleep(250);
595       }
596 
597       LOG.debug("Finished assignment of dupe region");
598 
599       // TODO why is dupe region different from dupe start keys?
600       HBaseFsck hbck = doFsck(conf, false);
601       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
602             ERROR_CODE.DUPE_STARTKEYS});
603       assertEquals(2, hbck.getOverlapGroups(table).size());
604       assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.
605 
606       // fix the degenerate region.
607       doFsck(conf,true);
608 
609       // check that the degenerate region is gone and no data loss
610       HBaseFsck hbck2 = doFsck(conf,false);
611       assertNoErrors(hbck2);
612       assertEquals(0, hbck2.getOverlapGroups(table).size());
613       assertEquals(ROWKEYS.length, countRows());
614     } finally {
615       deleteTable(table);
616     }
617   }
618 
619   /**
620    * This creates and fixes a bad table with regions that has startkey == endkey
621    */
622   @Test
623   public void testDegenerateRegions() throws Exception {
624     TableName table =
625         TableName.valueOf("tableDegenerateRegions");
626     try {
627       setupTable(table);
628       assertNoErrors(doFsck(conf,false));
629       assertEquals(ROWKEYS.length, countRows());
630 
631       // Now let's mess it up, by adding a region with a duplicate startkey
632       HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
633           Bytes.toBytes("B"), Bytes.toBytes("B"));
634       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
635       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
636           .waitForAssignment(hriDupe);
637       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
638       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
639 
640       HBaseFsck hbck = doFsck(conf,false);
641       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DEGENERATE_REGION,
642           ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.DUPE_STARTKEYS});
643       assertEquals(2, hbck.getOverlapGroups(table).size());
644       assertEquals(ROWKEYS.length, countRows());
645 
646       // fix the degenerate region.
647       doFsck(conf,true);
648 
649       // check that the degenerate region is gone and no data loss
650       HBaseFsck hbck2 = doFsck(conf,false);
651       assertNoErrors(hbck2);
652       assertEquals(0, hbck2.getOverlapGroups(table).size());
653       assertEquals(ROWKEYS.length, countRows());
654     } finally {
655       deleteTable(table);
656     }
657   }
658 
659   /**
660    * This creates and fixes a bad table where a region is completely contained
661    * by another region.
662    */
663   @Test
664   public void testContainedRegionOverlap() throws Exception {
665     TableName table =
666         TableName.valueOf("tableContainedRegionOverlap");
667     try {
668       setupTable(table);
669       assertEquals(ROWKEYS.length, countRows());
670 
671       // Mess it up by creating an overlap in the metadata
672       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
673           Bytes.toBytes("A2"), Bytes.toBytes("B"));
674       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
675       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
676           .waitForAssignment(hriOverlap);
677       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
678       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
679 
680       HBaseFsck hbck = doFsck(conf, false);
681       assertErrors(hbck, new ERROR_CODE[] {
682           ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
683       assertEquals(2, hbck.getOverlapGroups(table).size());
684       assertEquals(ROWKEYS.length, countRows());
685 
686       // fix the problem.
687       doFsck(conf, true);
688 
689       // verify that overlaps are fixed
690       HBaseFsck hbck2 = doFsck(conf,false);
691       assertNoErrors(hbck2);
692       assertEquals(0, hbck2.getOverlapGroups(table).size());
693       assertEquals(ROWKEYS.length, countRows());
694     } finally {
695        deleteTable(table);
696     }
697   }
698 
699   /**
700    * This creates and fixes a bad table where an overlap group of
701    * 3 regions. Set HBaseFsck.maxMerge to 2 to trigger sideline overlapped
702    * region. Mess around the meta data so that closeRegion/offlineRegion
703    * throws exceptions.
704    */
705   @Test
706   public void testSidelineOverlapRegion() throws Exception {
707     TableName table =
708         TableName.valueOf("testSidelineOverlapRegion");
709     try {
710       setupTable(table);
711       assertEquals(ROWKEYS.length, countRows());
712 
713       // Mess it up by creating an overlap
714       MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
715       HMaster master = cluster.getMaster();
716       HRegionInfo hriOverlap1 = createRegion(conf, tbl.getTableDescriptor(),
717         Bytes.toBytes("A"), Bytes.toBytes("AB"));
718       master.assignRegion(hriOverlap1);
719       master.getAssignmentManager().waitForAssignment(hriOverlap1);
720       HRegionInfo hriOverlap2 = createRegion(conf, tbl.getTableDescriptor(),
721         Bytes.toBytes("AB"), Bytes.toBytes("B"));
722       master.assignRegion(hriOverlap2);
723       master.getAssignmentManager().waitForAssignment(hriOverlap2);
724 
725       HBaseFsck hbck = doFsck(conf, false);
726       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.DUPE_STARTKEYS,
727         ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.OVERLAP_IN_REGION_CHAIN});
728       assertEquals(3, hbck.getOverlapGroups(table).size());
729       assertEquals(ROWKEYS.length, countRows());
730 
731       // mess around the overlapped regions, to trigger NotServingRegionException
732       Multimap<byte[], HbckInfo> overlapGroups = hbck.getOverlapGroups(table);
733       ServerName serverName = null;
734       byte[] regionName = null;
735       for (HbckInfo hbi: overlapGroups.values()) {
736         if ("A".equals(Bytes.toString(hbi.getStartKey()))
737             && "B".equals(Bytes.toString(hbi.getEndKey()))) {
738           regionName = hbi.getRegionName();
739 
740           // get an RS not serving the region to force bad assignment info in to META.
741           int k = cluster.getServerWith(regionName);
742           for (int i = 0; i < 3; i++) {
743             if (i != k) {
744               HRegionServer rs = cluster.getRegionServer(i);
745               serverName = rs.getServerName();
746               break;
747             }
748           }
749 
750           HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
751           HBaseFsckRepair.closeRegionSilentlyAndWait(admin,
752             cluster.getRegionServer(k).getServerName(), hbi.getHdfsHRI());
753           admin.offline(regionName);
754           break;
755         }
756       }
757 
758       assertNotNull(regionName);
759       assertNotNull(serverName);
760       HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
761       Put put = new Put(regionName);
762       put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
763         Bytes.toBytes(serverName.getHostAndPort()));
764       meta.put(put);
765 
766       // fix the problem.
767       HBaseFsck fsck = new HBaseFsck(conf);
768       fsck.connect();
769       fsck.setDisplayFullReport(); // i.e. -details
770       fsck.setTimeLag(0);
771       fsck.setFixAssignments(true);
772       fsck.setFixMeta(true);
773       fsck.setFixHdfsHoles(true);
774       fsck.setFixHdfsOverlaps(true);
775       fsck.setFixHdfsOrphans(true);
776       fsck.setFixVersionFile(true);
777       fsck.setSidelineBigOverlaps(true);
778       fsck.setMaxMerge(2);
779       fsck.onlineHbck();
780 
781       // verify that overlaps are fixed, and there are less rows
782       // since one region is sidelined.
783       HBaseFsck hbck2 = doFsck(conf,false);
784       assertNoErrors(hbck2);
785       assertEquals(0, hbck2.getOverlapGroups(table).size());
786       assertTrue(ROWKEYS.length > countRows());
787     } finally {
788        deleteTable(table);
789     }
790   }
791 
792   /**
793    * This creates and fixes a bad table: deleteRegion is run over [A,B) and a new region [A2,B)
794    * is created and assigned, so hbck reports an orphan hdfs region and a hole (like a bad split).
795    */
796   @Test
797   public void testOverlapAndOrphan() throws Exception {
798     TableName table =
799         TableName.valueOf("tableOverlapAndOrphan");
800     try {
801       setupTable(table);
802       assertEquals(ROWKEYS.length, countRows()); // sanity check before breaking anything
803 
804       // Mess it up: with the table disabled, run deleteRegion over the [A,B) region
805       TEST_UTIL.getHBaseAdmin().disableTable(table);
806       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
807           Bytes.toBytes("B"), true, true, false, true); // NOTE(review): flag meanings are defined by deleteRegion -- confirm there
808       TEST_UTIL.getHBaseAdmin().enableTable(table);
809 
810       // Create a narrower region [A2,B), assign it through the master and wait until it is online
811       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
812           Bytes.toBytes("A2"), Bytes.toBytes("B"));
813       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
814       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
815           .waitForAssignment(hriOverlap);
816       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
817       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
818 
819       HBaseFsck hbck = doFsck(conf, false); // check-only pass, must report exactly these errors
820       assertErrors(hbck, new ERROR_CODE[] {
821           ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
822           ERROR_CODE.HOLE_IN_REGION_CHAIN});
823 
824       // fix the problem: same check again, this time with fixing switched on.
825       doFsck(conf, true);
826 
827       // verify that overlaps are fixed and that no rows were lost
828       HBaseFsck hbck2 = doFsck(conf,false);
829       assertNoErrors(hbck2);
830       assertEquals(0, hbck2.getOverlapGroups(table).size());
831       assertEquals(ROWKEYS.length, countRows());
832     } finally {
833        deleteTable(table);
834     }
835   }
835 
836   /**
837    * This creates and fixes a bad table where an extra region [A2,B2) overlaps two regions --
838    * its start key is contained in one region and its end key is contained in
839    * yet another region.
840    */
841   @Test
842   public void testCoveredStartKey() throws Exception {
843     TableName table =
844         TableName.valueOf("tableCoveredStartKey");
845     try {
846       setupTable(table);
847       assertEquals(ROWKEYS.length, countRows()); // sanity check before breaking anything
848 
849       // Mess it up by creating and assigning an extra region [A2,B2) on top of the existing ones
850       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
851           Bytes.toBytes("A2"), Bytes.toBytes("B2"));
852       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
853       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
854           .waitForAssignment(hriOverlap);
855       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
856       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
857 
858       HBaseFsck hbck = doFsck(conf, false); // check-only pass
859       assertErrors(hbck, new ERROR_CODE[] {
860           ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
861           ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
862       assertEquals(3, hbck.getOverlapGroups(table).size());
863       assertEquals(ROWKEYS.length, countRows()); // the overlap alone must not change the row count
864 
865       // fix the problem: same check again, this time with fixing switched on.
866       doFsck(conf, true);
867 
868       // verify that overlaps are fixed: empty error list, no overlap groups, all rows present
869       HBaseFsck hbck2 = doFsck(conf, false);
870       assertErrors(hbck2, new ERROR_CODE[0]);
871       assertEquals(0, hbck2.getOverlapGroups(table).size());
872       assertEquals(ROWKEYS.length, countRows());
873     } finally {
874       deleteTable(table);
875     }
876   }
877 
878   /**
879    * This creates and fixes a bad table with a missing region [B,C) -- hole in meta
880    * and data missing in the fs. The repaired table is expected to be two rows short.
881    */
882   @Test
883   public void testRegionHole() throws Exception {
884     TableName table =
885         TableName.valueOf("tableRegionHole");
886     try {
887       setupTable(table);
888       assertEquals(ROWKEYS.length, countRows()); // sanity check before breaking anything
889 
890       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
891       TEST_UTIL.getHBaseAdmin().disableTable(table);
892       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
893           Bytes.toBytes("C"), true, true, true); // all three flags set: assignment, meta and hdfs
894       TEST_UTIL.getHBaseAdmin().enableTable(table);
895 
896       HBaseFsck hbck = doFsck(conf, false); // check-only pass
897       assertErrors(hbck, new ERROR_CODE[] {
898           ERROR_CODE.HOLE_IN_REGION_CHAIN});
899       // holes are separate from overlap groups
900       assertEquals(0, hbck.getOverlapGroups(table).size());
901 
902       // fix hole
903       doFsck(conf, true);
904 
905       // check that hole fixed
906       assertNoErrors(doFsck(conf,false));
907       assertEquals(ROWKEYS.length - 2 , countRows()); // lost a region, and with it 2 rows
908     } finally {
909       deleteTable(table);
910     }
911   }
912 
913   /**
914    * This creates and fixes a bad table with a missing region -- hole in meta
915    * and data present but .regioninfo missing (an orphan hdfs region) in the fs.
916    */
917   @Test
918   public void testHDFSRegioninfoMissing() throws Exception {
919     TableName table =
920         TableName.valueOf("tableHDFSRegioininfoMissing");
921     try {
922       setupTable(table);
923       assertEquals(ROWKEYS.length, countRows()); // sanity check before breaking anything
924 
925       // Mess it up by leaving a hole in the meta data
926       TEST_UTIL.getHBaseAdmin().disableTable(table);
927       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
928           Bytes.toBytes("C"), true, true, false, true); // NOTE(review): last flag presumably limits the hdfs delete to .regioninfo -- confirm
929       TEST_UTIL.getHBaseAdmin().enableTable(table);
930 
931       HBaseFsck hbck = doFsck(conf, false); // check-only pass
932       assertErrors(hbck, new ERROR_CODE[] {
933           ERROR_CODE.ORPHAN_HDFS_REGION,
934           ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
935           ERROR_CODE.HOLE_IN_REGION_CHAIN});
936       // holes are separate from overlap groups
937       assertEquals(0, hbck.getOverlapGroups(table).size());
938 
939       // fix hole
940       doFsck(conf, true);
941 
942       // check that hole fixed and that no rows were lost
943       assertNoErrors(doFsck(conf, false));
944       assertEquals(ROWKEYS.length, countRows());
945     } finally {
946       deleteTable(table);
947     }
948   }
949 
950   /**
951    * This creates and fixes a bad table with a region [B,C) that is missing from meta and
952    * not assigned to a region server, while its data is still in the fs.
953    */
954   @Test
955   public void testNotInMetaOrDeployedHole() throws Exception {
956     TableName table =
957         TableName.valueOf("tableNotInMetaOrDeployedHole");
958     try {
959       setupTable(table);
960       assertEquals(ROWKEYS.length, countRows()); // sanity check before breaking anything
961 
962       // Mess it up by leaving a hole in the meta data
963       TEST_UTIL.getHBaseAdmin().disableTable(table);
964       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
965           Bytes.toBytes("C"), true, true, false); // don't rm from fs
966       TEST_UTIL.getHBaseAdmin().enableTable(table);
967 
968       HBaseFsck hbck = doFsck(conf, false); // check-only pass
969       assertErrors(hbck, new ERROR_CODE[] {
970           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
971       // holes are separate from overlap groups
972       assertEquals(0, hbck.getOverlapGroups(table).size());
973 
974       // fix hole; the fixing run is itself expected to report the errors it found
975       assertErrors(doFsck(conf, true) , new ERROR_CODE[] {
976           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
977 
978       // check that hole fixed and that no rows were lost
979       assertNoErrors(doFsck(conf,false));
980       assertEquals(ROWKEYS.length, countRows());
981     } finally {
982       deleteTable(table);
983     }
984   }
985 
986   /**
987    * This creates and fixes a bad table with a hole in meta: the [B,C) row is removed, fs data kept.
988    */
989   @Test
990   public void testNotInMetaHole() throws Exception {
991     TableName table =
992         TableName.valueOf("tableNotInMetaHole");
993     try {
994       setupTable(table);
995       assertEquals(ROWKEYS.length, countRows()); // sanity check before breaking anything
996 
997       // Mess it up by leaving a hole in the meta data
998       TEST_UTIL.getHBaseAdmin().disableTable(table);
999       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1000           Bytes.toBytes("C"), false, true, false); // don't rm from fs
1001       TEST_UTIL.getHBaseAdmin().enableTable(table);
1002 
1003       HBaseFsck hbck = doFsck(conf, false); // check-only pass
1004       assertErrors(hbck, new ERROR_CODE[] {
1005           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1006       // holes are separate from overlap groups
1007       assertEquals(0, hbck.getOverlapGroups(table).size());
1008 
1009       // fix hole; the fixing run is itself expected to report the errors it found
1010       assertErrors(doFsck(conf, true) , new ERROR_CODE[] {
1011           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1012 
1013       // check that hole fixed and that no rows were lost
1014       assertNoErrors(doFsck(conf,false));
1015       assertEquals(ROWKEYS.length, countRows());
1016     } finally {
1017       deleteTable(table);
1018     }
1019   }
1020 
1021   /**
1022    * This creates and fixes a bad table with a region [B,C) that is in meta but has
1023    * no data in hdfs. The repaired table is expected to be two rows short.
1024    */
1025   @Test
1026   public void testNotInHdfs() throws Exception {
1027     TableName table =
1028         TableName.valueOf("tableNotInHdfs");
1029     try {
1030       setupTable(table);
1031       assertEquals(ROWKEYS.length, countRows()); // sanity check before breaking anything
1032 
1033       // make sure data in regions, if in hlog only there is no data loss
1034       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1035 
1036       // Mess it up by leaving a hole in the hdfs data
1037       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1038           Bytes.toBytes("C"), false, false, true); // don't rm meta
1039 
1040       HBaseFsck hbck = doFsck(conf, false); // check-only pass
1041       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1042       // holes are separate from overlap groups
1043       assertEquals(0, hbck.getOverlapGroups(table).size());
1044 
1045       // fix hole
1046       doFsck(conf, true);
1047 
1048       // check that hole fixed; the deleted region's 2 rows are gone for good
1049       assertNoErrors(doFsck(conf,false));
1050       assertEquals(ROWKEYS.length - 2, countRows());
1051     } finally {
1052       deleteTable(table);
1053     }
1054   }
1055 
1056   /**
1057    * This creates entries in META with no hdfs data.  Fixing this should cleanly
1058    * remove the table, which is why there is no deleteTable cleanup here.
1059    */
1060   @Test
1061   public void testNoHdfsTable() throws Exception {
1062     TableName table = TableName.valueOf("NoHdfsTable");
1063     setupTable(table);
1064     assertEquals(ROWKEYS.length, countRows()); // sanity check before breaking anything
1065 
1066     // make sure data in regions, if in hlog only there is no data loss
1067     TEST_UTIL.getHBaseAdmin().flush(table.getName());
1068 
1069     // Mess it up by removing the hdfs data of all four regions while keeping their meta rows
1070     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""),
1071         Bytes.toBytes("A"), false, false, true); // don't rm meta
1072     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1073         Bytes.toBytes("B"), false, false, true); // don't rm meta
1074     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1075         Bytes.toBytes("C"), false, false, true); // don't rm meta
1076     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"),
1077         Bytes.toBytes(""), false, false, true); // don't rm meta
1078 
1079     HBaseFsck hbck = doFsck(conf, false); // check-only pass: one NOT_IN_HDFS per deleted region
1080     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS,
1081         ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS,
1082         ERROR_CODE.NOT_IN_HDFS,});
1083     // holes are separate from overlap groups
1084     assertEquals(0, hbck.getOverlapGroups(table).size());
1085 
1086     // fix hole
1087     doFsck(conf, true); // in 0.92+, meta entries auto create regiondirs
1088 
1089     // check that hole fixed and that the table no longer exists
1090     assertNoErrors(doFsck(conf,false));
1091     assertFalse("Table "+ table + " should have been deleted",
1092         TEST_UTIL.getHBaseAdmin().tableExists(table));
1093   }
1094 
1095   /**
1096    * When the hbase.version file is missing, hbck should report NO_VERSION_FILE and be able to fix it.
1097    */
1098   @Test
1099   public void testNoVersionFile() throws Exception {
1100     // delete the hbase.version file
1101     Path rootDir = FSUtils.getRootDir(conf);
1102     FileSystem fs = rootDir.getFileSystem(conf);
1103     Path versionFile = new Path(rootDir, HConstants.VERSION_FILE_NAME);
1104     fs.delete(versionFile, true);
1105 
1106     // test: a check-only pass must report the missing file
1107     HBaseFsck hbck = doFsck(conf, false);
1108     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_VERSION_FILE });
1109     // fix hbase.version missing
1110     doFsck(conf, true);
1111 
1112     // no version file fixed
1113     assertNoErrors(doFsck(conf, false));
1114   }
1115 
1116   /**
1117    * A region of a disabled table that is nevertheless open on a region server must be reported as SHOULD_NOT_BE_DEPLOYED and be fixable.
1118    */
1119   @Test
1120   public void testRegionShouldNotBeDeployed() throws Exception {
1121     TableName table =
1122         TableName.valueOf("tableRegionShouldNotBeDeployed");
1123     try {
1124       LOG.info("Starting testRegionShouldNotBeDeployed.");
1125       MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
1126       assertTrue(cluster.waitForActiveAndReadyMaster());
1127 
1128 
1129       byte[][] SPLIT_KEYS = new byte[][] { new byte[0], Bytes.toBytes("aaa"),
1130           Bytes.toBytes("bbb"), Bytes.toBytes("ccc"), Bytes.toBytes("ddd") };
1131       HTableDescriptor htdDisabled = new HTableDescriptor(table);
1132       htdDisabled.addFamily(new HColumnDescriptor(FAM));
1133 
1134       // Write the .tableinfo
1135       FSTableDescriptors fstd = new FSTableDescriptors(conf);
1136       fstd.createTableDescriptor(htdDisabled);
1137       List<HRegionInfo> disabledRegions = TEST_UTIL.createMultiRegionsInMeta(
1138           TEST_UTIL.getConfiguration(), htdDisabled, SPLIT_KEYS);
1139 
1140       // The first RS is the one the stray region gets opened on further down
1141       HRegionServer hrs = cluster.getRegionServer(0);
1142 
1143       // Create region files.
1144       TEST_UTIL.getHBaseAdmin().disableTable(table);
1145       TEST_UTIL.getHBaseAdmin().enableTable(table);
1146 
1147       // Disable the table and close its regions
1148       TEST_UTIL.getHBaseAdmin().disableTable(table);
1149       HRegionInfo region = disabledRegions.remove(0);
1150       byte[] regionName = region.getRegionName();
1151 
1152       // The region should not be assigned currently; assertEquals shows the server index on failure
1153       assertEquals(-1, cluster.getServerWith(regionName));
1154 
1155       // Directly open a region on a region server.
1156       // If going through AM/ZK, the region won't be open.
1157       // Even it is opened, AM will close it which causes
1158       // flakiness of this test.
1159       HRegion r = HRegion.openHRegion(
1160         region, htdDisabled, hrs.getWAL(region), conf);
1161       hrs.addToOnlineRegions(r);
1162 
1163       HBaseFsck hbck = doFsck(conf, false); // check-only pass
1164       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.SHOULD_NOT_BE_DEPLOYED });
1165 
1166       // fix this fault
1167       doFsck(conf, true);
1168 
1169       // check result
1170       assertNoErrors(doFsck(conf, false));
1171     } finally {
1172       TEST_UTIL.getHBaseAdmin().enableTable(table); // the table is disabled at this point; re-enable before cleanup
1173       deleteTable(table);
1174     }
1175   }
1176 
1177   /**
1178    * This creates two tables, messes up both of them, and then fixes them one at a time.
1179    */
1180   @Test
1181   public void testFixByTable() throws Exception {
1182     TableName table1 =
1183         TableName.valueOf("testFixByTable1");
1184     TableName table2 =
1185         TableName.valueOf("testFixByTable2");
1186     try {
1187       setupTable(table1);
1188       // make sure data in regions, if in hlog only there is no data loss
1189       TEST_UTIL.getHBaseAdmin().flush(table1.getName());
1190       // Mess them up by leaving a hole in the hdfs data
1191       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1192         Bytes.toBytes("C"), false, false, true); // don't rm meta
1193 
1194       setupTable(table2);
1195       // make sure data in regions, if in hlog only there is no data loss
1196       TEST_UTIL.getHBaseAdmin().flush(table2.getName());
1197       // Mess them up by leaving a hole in the hdfs data
1198       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1199         Bytes.toBytes("C"), false, false, true); // don't rm meta
1200 
1201       HBaseFsck hbck = doFsck(conf, false); // check-only pass over everything: one error per table
1202       assertErrors(hbck, new ERROR_CODE[] {
1203         ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS});
1204 
1205       // fix hole in table 1
1206       doFsck(conf, true, table1);
1207       // check that hole in table 1 fixed
1208       assertNoErrors(doFsck(conf, false, table1));
1209       // check that hole in table 2 still there
1210       assertErrors(doFsck(conf, false, table2),
1211         new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1212 
1213       // fix hole in table 2
1214       doFsck(conf, true, table2);
1215       // check that hole in both tables fixed
1216       assertNoErrors(doFsck(conf, false));
1217       assertEquals(ROWKEYS.length - 2, countRows()); // NOTE(review): presumably counts the table set up last (table2) -- confirm
1218     } finally {
1219       deleteTable(table1);
1220       deleteTable(table2);
1221     }
1222   }
1223   /**
1224    * A split parent in meta, in hdfs, and not deployed. Plain repair must leave it; only the split-parent fix clears it.
1225    */
1226   @Test
1227   public void testLingeringSplitParent() throws Exception {
1228     TableName table =
1229         TableName.valueOf("testLingeringSplitParent");
1230     HTable meta = null; // opened inside the try, closed in the finally block
1231     try {
1232       setupTable(table);
1233       assertEquals(ROWKEYS.length, countRows()); // sanity check before breaking anything
1234 
1235       // make sure data in regions, if in hlog only there is no data loss
1236       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1237       HRegionLocation location = tbl.getRegionLocation("B");
1238 
1239       // Delete one region from meta, but not hdfs, unassign it.
1240       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1241         Bytes.toBytes("C"), true, true, false);
1242 
1243       // Create a new meta entry to fake it as a split parent.
1244       meta = new HTable(conf, HTableDescriptor.META_TABLEDESC.getTableName(),
1245           executorService);
1246       HRegionInfo hri = location.getRegionInfo();
1247 
1248       HRegionInfo a = new HRegionInfo(tbl.getName(),
1249         Bytes.toBytes("B"), Bytes.toBytes("BM")); // fake daughter [B,BM)
1250       HRegionInfo b = new HRegionInfo(tbl.getName(),
1251         Bytes.toBytes("BM"), Bytes.toBytes("C")); // fake daughter [BM,C)
1252 
1253       hri.setOffline(true);
1254       hri.setSplit(true);
1255 
1256       MetaEditor.addRegionToMeta(meta, hri, a, b);
1257       meta.flushCommits();
1258       TEST_UTIL.getHBaseAdmin().flush(TableName.META_TABLE_NAME.getName());
1259 
1260       HBaseFsck hbck = doFsck(conf, false); // check-only pass
1261       assertErrors(hbck, new ERROR_CODE[] {
1262         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1263 
1264       // regular repair cannot fix lingering split parent
1265       hbck = doFsck(conf, true);
1266       assertErrors(hbck, new ERROR_CODE[] {
1267         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1268       assertFalse(hbck.shouldRerun());
1269       hbck = doFsck(conf, false);
1270       assertErrors(hbck, new ERROR_CODE[] {
1271         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1272 
1273       // fix lingering split parent, using a hand-configured hbck with only that fix switched on
1274       hbck = new HBaseFsck(conf);
1275       hbck.connect();
1276       hbck.setDisplayFullReport(); // i.e. -details
1277       hbck.setTimeLag(0);
1278       hbck.setFixSplitParents(true);
1279       hbck.onlineHbck();
1280       assertTrue(hbck.shouldRerun());
1281 
1282       // the parent's meta row must no longer carry the two daughter (split A / split B) columns
1283       Get get = new Get(hri.getRegionName());
1284       Result result = meta.get(get);
1285       assertTrue(result.getColumn(HConstants.CATALOG_FAMILY,
1286         HConstants.SPLITA_QUALIFIER).isEmpty());
1287       assertTrue(result.getColumn(HConstants.CATALOG_FAMILY,
1288         HConstants.SPLITB_QUALIFIER).isEmpty());
1289       TEST_UTIL.getHBaseAdmin().flush(TableName.META_TABLE_NAME.getName());
1290 
1291       // fix other issues
1292       doFsck(conf, true);
1293 
1294       // check that all are fixed and that no rows were lost
1295       assertNoErrors(doFsck(conf, false));
1296       assertEquals(ROWKEYS.length, countRows());
1297     } finally {
1298       deleteTable(table);
1299       IOUtils.closeQuietly(meta);
1300     }
1301   }
1301 
1302   /**
1303    * Tests that LINGERING_SPLIT_PARENT is not erroneously reported for
1304    * valid cases where the daughters are there (region B is really split at "BM" through the admin).
1305    */
1306   @Test
1307   public void testValidLingeringSplitParent() throws Exception {
1308     TableName table =
1309         TableName.valueOf("testValidLingeringSplitParent");
1310     HTable meta = null; // opened inside the try, closed in the finally block
1311     try {
1312       setupTable(table);
1313       assertEquals(ROWKEYS.length, countRows()); // sanity check before splitting
1314 
1315       // make sure data in regions, if in hlog only there is no data loss
1316       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1317       HRegionLocation location = tbl.getRegionLocation("B");
1318 
1319       meta = new HTable(conf, HTableDescriptor.META_TABLEDESC.getTableName());
1320       HRegionInfo hri = location.getRegionInfo();
1321 
1322       // do a regular split
1323       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
1324       byte[] regionName = location.getRegionInfo().getRegionName();
1325       admin.split(regionName, Bytes.toBytes("BM"));
1326       TestEndToEndSplitTransaction.blockUntilRegionSplit(
1327           TEST_UTIL.getConfiguration(), 60000, regionName, true);
1328 
1329       // TODO: fixHdfsHoles does not work against splits, since the parent dir lingers on
1330       // for some time until children references are deleted. HBCK erroneously sees this as
1331       // overlapping regions
1332       HBaseFsck hbck = doFsck(conf, true, true, false, false, false, true, true, true, false, false, null);
1333       assertErrors(hbck, new ERROR_CODE[] {}); //no LINGERING_SPLIT_PARENT reported
1334 
1335       // assert that the split META entry is still there.
1336       Get get = new Get(hri.getRegionName());
1337       Result result = meta.get(get);
1338       assertNotNull(result);
1339       assertNotNull(HRegionInfo.getHRegionInfo(result));
1340 
1341       assertEquals(ROWKEYS.length, countRows());
1342 
1343       // assert that we still have the split regions (expected value first, as JUnit reports it)
1344       assertEquals(SPLITS.length + 1 + 1, tbl.getStartKeys().length); //SPLITS + 1 is # regions pre-split.
1345       assertNoErrors(doFsck(conf, false));
1346     } finally {
1347       deleteTable(table);
1348       IOUtils.closeQuietly(meta);
1349     }
1350   }
1351 
1352   /**
1353    * Split crashed after write to META finished for the parent region, but
1354    * failed to write daughters (pre HBASE-7721 codebase). Simulated by deleting the daughters' meta rows.
1355    */
1356   @Test
1357   public void testSplitDaughtersNotInMeta() throws Exception {
1358     TableName table =
1359         TableName.valueOf("testSplitdaughtersNotInMeta");
1360     HTable meta = null; // opened inside the try, closed in the finally block
1361     try {
1362       setupTable(table);
1363       assertEquals(ROWKEYS.length, countRows()); // sanity check before splitting
1364 
1365       // make sure data in regions, if in hlog only there is no data loss
1366       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1367       HRegionLocation location = tbl.getRegionLocation("B");
1368 
1369       meta = new HTable(conf, HTableDescriptor.META_TABLEDESC.getTableName());
1370       HRegionInfo hri = location.getRegionInfo();
1371 
1372       // do a regular split
1373       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
1374       byte[] regionName = location.getRegionInfo().getRegionName();
1375       admin.split(regionName, Bytes.toBytes("BM"));
1376       TestEndToEndSplitTransaction.blockUntilRegionSplit(
1377           TEST_UTIL.getConfiguration(), 60000, regionName, true);
1378 
1379       PairOfSameType<HRegionInfo> daughters = HRegionInfo.getDaughterRegions(meta.get(new Get(regionName)));
1380 
1381       // Delete daughter regions from meta, but not hdfs, unassign it.
1382       Map<HRegionInfo, ServerName> hris = tbl.getRegionLocations();
1383       undeployRegion(admin, hris.get(daughters.getFirst()), daughters.getFirst());
1384       undeployRegion(admin, hris.get(daughters.getSecond()), daughters.getSecond());
1385 
1386       meta.delete(new Delete(daughters.getFirst().getRegionName()));
1387       meta.delete(new Delete(daughters.getSecond().getRegionName()));
1388       meta.flushCommits();
1389 
1390       HBaseFsck hbck = doFsck(conf, false); // check-only pass
1391       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1392           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN}); //no LINGERING_SPLIT_PARENT
1393 
1394       // now fix it. The fix should not revert the region split, but add daughters to META
1395       hbck = doFsck(conf, true, true, false, false, false, false, false, false, false, false, null);
1396       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1397           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1398 
1399       // assert that the split META entry is still there.
1400       Get get = new Get(hri.getRegionName());
1401       Result result = meta.get(get);
1402       assertNotNull(result);
1403       assertNotNull(HRegionInfo.getHRegionInfo(result));
1404 
1405       assertEquals(ROWKEYS.length, countRows());
1406 
1407       // assert that we still have the split regions (expected value first, as JUnit reports it)
1408       assertEquals(SPLITS.length + 1 + 1, tbl.getStartKeys().length); //SPLITS + 1 is # regions pre-split.
1409       assertNoErrors(doFsck(conf, false)); //should be fixed by now
1410     } finally {
1411       deleteTable(table);
1412       IOUtils.closeQuietly(meta);
1413     }
1414   }
1415 
1416   /**
1417    * This creates and fixes a bad table with a missing region which is the 1st region -- hole in
1418    * meta and data missing in the fs. The test is given a 120 second timeout.
1419    */
1420   @Test(timeout=120000)
1421   public void testMissingFirstRegion() throws Exception {
1422     TableName table =
1423         TableName.valueOf("testMissingFirstRegion");
1424     try {
1425       setupTable(table);
1426       assertEquals(ROWKEYS.length, countRows()); // sanity check before breaking anything
1427 
1428       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1429       TEST_UTIL.getHBaseAdmin().disableTable(table);
1430       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""), Bytes.toBytes("A"), true,
1431           true, true); // the region whose start key is empty, i.e. the first one
1432       TEST_UTIL.getHBaseAdmin().enableTable(table);
1433 
1434       HBaseFsck hbck = doFsck(conf, false); // check-only pass
1435       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY });
1436       // fix hole
1437       doFsck(conf, true);
1438       // check that hole fixed
1439       assertNoErrors(doFsck(conf, false));
1440     } finally {
1441       deleteTable(table);
1442     }
1443   }
1444 
1445   /**
1446    * This creates and fixes a bad table with missing last region -- hole in meta and data missing in
1447    * the fs. The test is given a 120 second timeout.
1448    */
1449   @Test(timeout=120000)
1450   public void testMissingLastRegion() throws Exception {
1451     TableName table =
1452         TableName.valueOf("testMissingLastRegion");
1453     try {
1454       setupTable(table);
1455       assertEquals(ROWKEYS.length, countRows()); // sanity check before breaking anything
1456 
1457       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1458       TEST_UTIL.getHBaseAdmin().disableTable(table);
1459       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"), Bytes.toBytes(""), true,
1460           true, true); // the region whose end key is empty, i.e. the last one
1461       TEST_UTIL.getHBaseAdmin().enableTable(table);
1462 
1463       HBaseFsck hbck = doFsck(conf, false); // check-only pass
1464       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY });
1465       // fix hole
1466       doFsck(conf, true);
1467       // check that hole fixed
1468       assertNoErrors(doFsck(conf, false));
1469     } finally {
1470       deleteTable(table);
1471     }
1472   }
1473 
1474   /**
1475    * Test that hbck with hdfs checking switched off (-noHdfsChecking) can detect and fix an assignment issue.
1476    */
1477   @Test
1478   public void testFixAssignmentsAndNoHdfsChecking() throws Exception {
1479     TableName table =
1480         TableName.valueOf("testFixAssignmentsAndNoHdfsChecking");
1481     try {
1482       setupTable(table);
1483       assertEquals(ROWKEYS.length, countRows()); // sanity check before breaking anything
1484 
1485       // Mess it up by closing a region
1486       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1487         Bytes.toBytes("B"), true, false, false, false); // only the first flag is set
1488 
1489       // verify there is no other errors
1490       HBaseFsck hbck = doFsck(conf, false);
1491       assertErrors(hbck, new ERROR_CODE[] {
1492         ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1493 
1494       // verify that noHdfsChecking report the same errors
1495       HBaseFsck fsck = new HBaseFsck(conf);
1496       fsck.connect();
1497       fsck.setDisplayFullReport(); // i.e. -details
1498       fsck.setTimeLag(0);
1499       fsck.setCheckHdfs(false);
1500       fsck.onlineHbck();
1501       assertErrors(fsck, new ERROR_CODE[] {
1502         ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1503 
1504       // verify that fixAssignments works fine with noHdfsChecking
1505       fsck = new HBaseFsck(conf);
1506       fsck.connect();
1507       fsck.setDisplayFullReport(); // i.e. -details
1508       fsck.setTimeLag(0);
1509       fsck.setCheckHdfs(false);
1510       fsck.setFixAssignments(true);
1511       fsck.onlineHbck();
1512       assertTrue(fsck.shouldRerun()); // the fixing run asks for a rerun ...
1513       fsck.onlineHbck(); // ... and the rerun on the same instance must come back clean
1514       assertNoErrors(fsck);
1515 
1516       assertEquals(ROWKEYS.length, countRows());
1517     } finally {
1518       deleteTable(table);
1519     }
1520   }
1521 
1522   /**
1523    * Test -noHdfsChecking option can detect region is not in meta but deployed.
1524    * However, it cannot fix it without checking Hdfs because we need to get
1525    * the region info from Hdfs in this case, and then patch the meta.
1526    */
1527   @Test
1528   public void testFixMetaNotWorkingWithNoHdfsChecking() throws Exception {
1529     TableName table =
1530         TableName.valueOf("testFixMetaNotWorkingWithNoHdfsChecking");
1531     try {
1532       setupTable(table);
1533       assertEquals(ROWKEYS.length, countRows()); // sanity check before breaking anything
1534 
1535       // Mess it up by deleting a region from the metadata
1536       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1537         Bytes.toBytes("B"), false, true, false, false); // only the second flag is set
1538 
1539       // verify there is no other errors
1540       HBaseFsck hbck = doFsck(conf, false);
1541       assertErrors(hbck, new ERROR_CODE[] {
1542         ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1543 
1544       // verify that noHdfsChecking report the same errors
1545       HBaseFsck fsck = new HBaseFsck(conf);
1546       fsck.connect();
1547       fsck.setDisplayFullReport(); // i.e. -details
1548       fsck.setTimeLag(0);
1549       fsck.setCheckHdfs(false);
1550       fsck.onlineHbck();
1551       assertErrors(fsck, new ERROR_CODE[] {
1552         ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1553 
1554       // verify that fixMeta doesn't work with noHdfsChecking
1555       fsck = new HBaseFsck(conf);
1556       fsck.connect();
1557       fsck.setDisplayFullReport(); // i.e. -details
1558       fsck.setTimeLag(0);
1559       fsck.setCheckHdfs(false);
1560       fsck.setFixAssignments(true);
1561       fsck.setFixMeta(true);
1562       fsck.onlineHbck();
1563       assertFalse(fsck.shouldRerun()); // no rerun is requested, and the same errors remain
1564       assertErrors(fsck, new ERROR_CODE[] {
1565         ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1566     } finally {
1567       deleteTable(table);
1568     }
1569   }
1570 
1571   /**
1572    * Test -fixHdfsHoles doesn't work with -noHdfsChecking option,
1573    * and -noHdfsChecking can't detect orphan Hdfs region.
1574    */
1575   @Test
1576   public void testFixHdfsHolesNotWorkingWithNoHdfsChecking() throws Exception {
1577     TableName table =
1578         TableName.valueOf("testFixHdfsHolesNotWorkingWithNoHdfsChecking");
1579     try {
1580       setupTable(table);
1581       assertEquals(ROWKEYS.length, countRows());
1582 
1583       // Mess it up by creating an overlap in the metadata
1584       TEST_UTIL.getHBaseAdmin().disableTable(table);
1585       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1586         Bytes.toBytes("B"), true, true, false, true);
1587       TEST_UTIL.getHBaseAdmin().enableTable(table);
1588 
1589       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
1590         Bytes.toBytes("A2"), Bytes.toBytes("B"));
1591       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
1592       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
1593         .waitForAssignment(hriOverlap);
1594       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
1595       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
1596 
1597       HBaseFsck hbck = doFsck(conf, false);
1598       assertErrors(hbck, new ERROR_CODE[] {
1599         ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1600         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1601 
1602       // verify that noHdfsChecking can't detect ORPHAN_HDFS_REGION
1603       HBaseFsck fsck = new HBaseFsck(conf);
1604       fsck.connect();
1605       fsck.setDisplayFullReport(); // i.e. -details
1606       fsck.setTimeLag(0);
1607       fsck.setCheckHdfs(false);
1608       fsck.onlineHbck();
1609       assertErrors(fsck, new ERROR_CODE[] {
1610         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1611 
1612       // verify that fixHdfsHoles doesn't work with noHdfsChecking
1613       fsck = new HBaseFsck(conf);
1614       fsck.connect();
1615       fsck.setDisplayFullReport(); // i.e. -details
1616       fsck.setTimeLag(0);
1617       fsck.setCheckHdfs(false);
1618       fsck.setFixHdfsHoles(true);
1619       fsck.setFixHdfsOverlaps(true);
1620       fsck.setFixHdfsOrphans(true);
1621       fsck.onlineHbck();
1622       assertFalse(fsck.shouldRerun());
1623       assertErrors(fsck, new ERROR_CODE[] {
1624         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1625     } finally {
1626       if (TEST_UTIL.getHBaseAdmin().isTableDisabled(table)) {
1627         TEST_UTIL.getHBaseAdmin().enableTable(table);
1628       }
1629       deleteTable(table);
1630     }
1631   }
1632 
1633   /**
1634    * We don't have an easy way to verify that a flush completed, so we loop until we find a
1635    * legitimate hfile and return it.
1636    * @param fs
1637    * @param table
1638    * @return Path of a flushed hfile.
1639    * @throws IOException
1640    */
1641   Path getFlushedHFile(FileSystem fs, TableName table) throws IOException {
1642     Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
1643     Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
1644     Path famDir = new Path(regionDir, FAM_STR);
1645 
1646     // keep doing this until we get a legit hfile
1647     while (true) {
1648       FileStatus[] hfFss = fs.listStatus(famDir);
1649       if (hfFss.length == 0) {
1650         continue;
1651       }
1652       for (FileStatus hfs : hfFss) {
1653         if (!hfs.isDir()) {
1654           return hfs.getPath();
1655         }
1656       }
1657     }
1658   }
1659 
1660   /**
1661    * This creates a table and then corrupts an hfile.  Hbck should quarantine the file.
1662    */
1663   @Test(timeout=120000)
1664   public void testQuarantineCorruptHFile() throws Exception {
1665     TableName table = TableName.valueOf(name.getMethodName());
1666     try {
1667       setupTable(table);
1668       assertEquals(ROWKEYS.length, countRows());
1669       TEST_UTIL.getHBaseAdmin().flush(table.getName()); // flush is async.
1670 
1671       FileSystem fs = FileSystem.get(conf);
1672       Path hfile = getFlushedHFile(fs, table);
1673 
1674       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1675       TEST_UTIL.getHBaseAdmin().disableTable(table);
1676 
1677       // create new corrupt file called deadbeef (valid hfile name)
1678       Path corrupt = new Path(hfile.getParent(), "deadbeef");
1679       TestHFile.truncateFile(fs, hfile, corrupt);
1680       LOG.info("Created corrupted file " + corrupt);
1681       HBaseFsck.debugLsr(conf, FSUtils.getRootDir(conf));
1682 
1683       // we cannot enable here because enable never finished due to the corrupt region.
1684       HBaseFsck res = HbckTestingUtil.doHFileQuarantine(conf, table);
1685       assertEquals(res.getRetCode(), 0);
1686       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
1687       assertEquals(hfcc.getHFilesChecked(), 5);
1688       assertEquals(hfcc.getCorrupted().size(), 1);
1689       assertEquals(hfcc.getFailures().size(), 0);
1690       assertEquals(hfcc.getQuarantined().size(), 1);
1691       assertEquals(hfcc.getMissing().size(), 0);
1692 
1693       // Its been fixed, verify that we can enable.
1694       TEST_UTIL.getHBaseAdmin().enableTable(table);
1695     } finally {
1696       deleteTable(table);
1697     }
1698   }
1699 
1700   /**
1701   * Test that use this should have a timeout, because this method could potentially wait forever.
1702   */
1703   private void doQuarantineTest(TableName table, HBaseFsck hbck, int check,
1704                                 int corrupt, int fail, int quar, int missing) throws Exception {
1705     try {
1706       setupTable(table);
1707       assertEquals(ROWKEYS.length, countRows());
1708       TEST_UTIL.getHBaseAdmin().flush(table.getName()); // flush is async.
1709 
1710       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1711       TEST_UTIL.getHBaseAdmin().disableTable(table);
1712 
1713       String[] args = {"-sidelineCorruptHFiles", "-repairHoles", "-ignorePreCheckPermission",
1714           table.getNameAsString()};
1715       ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1716       HBaseFsck res = hbck.exec(exec, args);
1717 
1718       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
1719       assertEquals(hfcc.getHFilesChecked(), check);
1720       assertEquals(hfcc.getCorrupted().size(), corrupt);
1721       assertEquals(hfcc.getFailures().size(), fail);
1722       assertEquals(hfcc.getQuarantined().size(), quar);
1723       assertEquals(hfcc.getMissing().size(), missing);
1724 
1725       // its been fixed, verify that we can enable
1726       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
1727       admin.enableTableAsync(table);
1728       while (!admin.isTableEnabled(table)) {
1729         try {
1730           Thread.sleep(250);
1731         } catch (InterruptedException e) {
1732           e.printStackTrace();
1733           fail("Interrupted when trying to enable table " + table);
1734         }
1735       }
1736     } finally {
1737       deleteTable(table);
1738     }
1739   }
1740 
1741   /**
1742    * This creates a table and simulates the race situation where a concurrent compaction or split
1743    * has removed an hfile after the corruption checker learned about it.
1744    */
1745   @Test(timeout=120000)
1746   public void testQuarantineMissingHFile() throws Exception {
1747     TableName table = TableName.valueOf(name.getMethodName());
1748     ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1749     // inject a fault in the hfcc created.
1750     final FileSystem fs = FileSystem.get(conf);
1751     HBaseFsck hbck = new HBaseFsck(conf, exec) {
1752       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
1753         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1754           boolean attemptedFirstHFile = false;
1755           protected void checkHFile(Path p) throws IOException {
1756             if (!attemptedFirstHFile) {
1757               attemptedFirstHFile = true;
1758               assertTrue(fs.delete(p, true)); // make sure delete happened.
1759             }
1760             super.checkHFile(p);
1761           }
1762         };
1763       }
1764     };
1765     doQuarantineTest(table, hbck, 4, 0, 0, 0, 1); // 4 attempted, but 1 missing.
1766   }
1767 
1768   /**
1769    * This creates a table and simulates the race situation where a concurrent compaction or split
1770    * has removed an colfam dir before the corruption checker got to it.
1771    */
1772   // Disabled because fails sporadically.  Is this test right?  Timing-wise, there could be no
1773   // files in a column family on initial creation -- as suggested by Matteo.
1774   @Ignore @Test(timeout=120000)
1775   public void testQuarantineMissingFamdir() throws Exception {
1776     TableName table = TableName.valueOf(name.getMethodName());
1777     ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1778     // inject a fault in the hfcc created.
1779     final FileSystem fs = FileSystem.get(conf);
1780     HBaseFsck hbck = new HBaseFsck(conf, exec) {
1781       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
1782         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1783           boolean attemptedFirstFamDir = false;
1784           protected void checkColFamDir(Path p) throws IOException {
1785             if (!attemptedFirstFamDir) {
1786               attemptedFirstFamDir = true;
1787               assertTrue(fs.delete(p, true)); // make sure delete happened.
1788             }
1789             super.checkColFamDir(p);
1790           }
1791         };
1792       }
1793     };
1794     doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
1795   }
1796 
1797   /**
1798    * This creates a table and simulates the race situation where a concurrent compaction or split
1799    * has removed a region dir before the corruption checker got to it.
1800    */
1801   @Test(timeout=120000)
1802   public void testQuarantineMissingRegionDir() throws Exception {
1803     TableName table = TableName.valueOf(name.getMethodName());
1804     ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1805     // inject a fault in the hfcc created.
1806     final FileSystem fs = FileSystem.get(conf);
1807     HBaseFsck hbck = new HBaseFsck(conf, exec) {
1808       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
1809         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1810           boolean attemptedFirstRegionDir = false;
1811           protected void checkRegionDir(Path p) throws IOException {
1812             if (!attemptedFirstRegionDir) {
1813               attemptedFirstRegionDir = true;
1814               assertTrue(fs.delete(p, true)); // make sure delete happened.
1815             }
1816             super.checkRegionDir(p);
1817           }
1818         };
1819       }
1820     };
1821     doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
1822   }
1823 
1824   /**
1825    * Test fixing lingering reference file.
1826    */
1827   @Test
1828   public void testLingeringReferenceFile() throws Exception {
1829     TableName table =
1830         TableName.valueOf("testLingeringReferenceFile");
1831     try {
1832       setupTable(table);
1833       assertEquals(ROWKEYS.length, countRows());
1834 
1835       // Mess it up by creating a fake reference file
1836       FileSystem fs = FileSystem.get(conf);
1837       Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
1838       Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
1839       Path famDir = new Path(regionDir, FAM_STR);
1840       Path fakeReferenceFile = new Path(famDir, "fbce357483ceea.12144538");
1841       fs.create(fakeReferenceFile);
1842 
1843       HBaseFsck hbck = doFsck(conf, false);
1844       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LINGERING_REFERENCE_HFILE });
1845       // fix reference file
1846       doFsck(conf, true);
1847       // check that reference file fixed
1848       assertNoErrors(doFsck(conf, false));
1849     } finally {
1850       deleteTable(table);
1851     }
1852   }
1853 
1854   /**
1855    * Test mission REGIONINFO_QUALIFIER in .META.
1856    */
1857   @Test
1858   public void testMissingRegionInfoQualifier() throws Exception {
1859     TableName table =
1860         TableName.valueOf("testMissingRegionInfoQualifier");
1861     try {
1862       setupTable(table);
1863 
1864       // Mess it up by removing the RegionInfo for one region.
1865       final List<Delete> deletes = new LinkedList<Delete>();
1866       HTable meta = new HTable(conf, HTableDescriptor.META_TABLEDESC.getTableName());
1867       MetaScanner.metaScan(conf, new MetaScanner.MetaScannerVisitor() {
1868 
1869         @Override
1870         public boolean processRow(Result rowResult) throws IOException {
1871           if(!HTableDescriptor.isSystemTable(MetaScanner.getHRegionInfo(rowResult)
1872               .getTableName())) {
1873             Delete delete = new Delete(rowResult.getRow());
1874             delete.deleteColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
1875             deletes.add(delete);
1876           }
1877           return true;
1878         }
1879 
1880         @Override
1881         public void close() throws IOException {
1882         }
1883       });
1884       meta.delete(deletes);
1885 
1886       // Mess it up by creating a fake META entry with no associated RegionInfo
1887       meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
1888         HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER, Bytes.toBytes("node1:60020")));
1889       meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
1890         HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER, Bytes.toBytes(1362150791183L)));
1891       meta.close();
1892 
1893       HBaseFsck hbck = doFsck(conf, false);
1894       assertTrue(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
1895 
1896       // fix reference file
1897       hbck = doFsck(conf, true);
1898 
1899       // check that reference file fixed
1900       assertFalse(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
1901     } finally {
1902       deleteTable(table);
1903     }
1904   }
1905 
1906 
1907   /**
1908    * Test pluggable error reporter. It can be plugged in
1909    * from system property or configuration.
1910    */
1911   @Test
1912   public void testErrorReporter() throws Exception {
1913     try {
1914       MockErrorReporter.calledCount = 0;
1915       doFsck(conf, false);
1916       assertEquals(MockErrorReporter.calledCount, 0);
1917 
1918       conf.set("hbasefsck.errorreporter", MockErrorReporter.class.getName());
1919       doFsck(conf, false);
1920       assertTrue(MockErrorReporter.calledCount > 20);
1921     } finally {
1922       conf.set("hbasefsck.errorreporter",
1923         PrintingErrorReporter.class.getName());
1924       MockErrorReporter.calledCount = 0;
1925     }
1926   }
1927 
1928   static class MockErrorReporter implements ErrorReporter {
1929     static int calledCount = 0;
1930 
1931     public void clear() {
1932       calledCount++;
1933     }
1934 
1935     public void report(String message) {
1936       calledCount++;
1937     }
1938 
1939     public void reportError(String message) {
1940       calledCount++;
1941     }
1942 
1943     public void reportError(ERROR_CODE errorCode, String message) {
1944       calledCount++;
1945     }
1946 
1947     public void reportError(ERROR_CODE errorCode, String message, TableInfo table) {
1948       calledCount++;
1949     }
1950 
1951     public void reportError(ERROR_CODE errorCode,
1952         String message, TableInfo table, HbckInfo info) {
1953       calledCount++;
1954     }
1955 
1956     public void reportError(ERROR_CODE errorCode, String message,
1957         TableInfo table, HbckInfo info1, HbckInfo info2) {
1958       calledCount++;
1959     }
1960 
1961     public int summarize() {
1962       return ++calledCount;
1963     }
1964 
1965     public void detail(String details) {
1966       calledCount++;
1967     }
1968 
1969     public ArrayList<ERROR_CODE> getErrorList() {
1970       calledCount++;
1971       return new ArrayList<ERROR_CODE>();
1972     }
1973 
1974     public void progress() {
1975       calledCount++;
1976     }
1977 
1978     public void print(String message) {
1979       calledCount++;
1980     }
1981 
1982     public void resetErrors() {
1983       calledCount++;
1984     }
1985 
1986     public boolean tableHasErrors(TableInfo table) {
1987       calledCount++;
1988       return false;
1989     }
1990   }
1991 
1992   @Test(timeout=60000)
1993   public void testCheckTableLocks() throws Exception {
1994     IncrementingEnvironmentEdge edge = new IncrementingEnvironmentEdge(0);
1995     EnvironmentEdgeManager.injectEdge(edge);
1996     // check no errors
1997     HBaseFsck hbck = doFsck(conf, false);
1998     assertNoErrors(hbck);
1999 
2000     ServerName mockName = new ServerName("localhost", 60000, 1);
2001 
2002     // obtain one lock
2003     final TableLockManager tableLockManager = TableLockManager.createTableLockManager(conf, TEST_UTIL.getZooKeeperWatcher(), mockName);
2004     TableLock writeLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2005         "testCheckTableLocks");
2006     writeLock.acquire();
2007     hbck = doFsck(conf, false);
2008     assertNoErrors(hbck); // should not have expired, no problems
2009 
2010     edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2011         TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2012 
2013     hbck = doFsck(conf, false);
2014     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK});
2015 
2016     final CountDownLatch latch = new CountDownLatch(1);
2017     new Thread() {
2018       public void run() {
2019         TableLock readLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2020             "testCheckTableLocks");
2021         try {
2022           latch.countDown();
2023           readLock.acquire();
2024         } catch (IOException ex) {
2025           fail();
2026         } catch (IllegalStateException ex) {
2027           return; // expected, since this will be reaped under us.
2028         }
2029         fail("should not have come here");
2030       };
2031     }.start();
2032 
2033     latch.await(); // wait until thread starts
2034     Threads.sleep(300); // wait some more to ensure writeLock.acquire() is called
2035 
2036     hbck = doFsck(conf, false);
2037     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK}); // still one expired, one not-expired
2038 
2039     edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2040         TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2041 
2042     hbck = doFsck(conf, false);
2043     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK, ERROR_CODE.EXPIRED_TABLE_LOCK}); // both are expired
2044 
2045     conf.setLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT, 1); // reaping from ZKInterProcessWriteLock uses znode cTime,
2046                                                                  // which is not injectable through EnvironmentEdge
2047     Threads.sleep(10);
2048     hbck = doFsck(conf, true); // now fix both cases
2049 
2050     hbck = doFsck(conf, false);
2051     assertNoErrors(hbck);
2052 
2053     // ensure that locks are deleted
2054     writeLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2055         "should acquire without blocking");
2056     writeLock.acquire(); // this should not block.
2057     writeLock.release(); // release for clean state
2058   }
2059 
2060   @Test
2061   public void testMetaOffline() throws Exception {
2062     // check no errors
2063     HBaseFsck hbck = doFsck(conf, false);
2064     assertNoErrors(hbck);
2065     deleteMetaRegion(conf, true, false, false);
2066     hbck = doFsck(conf, false);
2067     // ERROR_CODE.UNKNOWN is coming because we reportError with a message for the .META.
2068     // inconsistency and whether we will be fixing it or not.
2069     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
2070     hbck = doFsck(conf, true);
2071     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
2072     hbck = doFsck(conf, false);
2073     assertNoErrors(hbck);
2074   }
2075   
2076   private void deleteMetaRegion(Configuration conf, boolean unassign, boolean hdfs,
2077       boolean regionInfoOnly) throws IOException, InterruptedException {
2078     HConnection connection = HConnectionManager.getConnection(conf);
2079     HRegionLocation metaLocation = connection.locateRegion(TableName.META_TABLE_NAME,
2080         HConstants.EMPTY_START_ROW);
2081     ServerName hsa = new ServerName(metaLocation.getHostnamePort(), 0L);
2082     HRegionInfo hri = metaLocation.getRegionInfo();
2083     if (unassign) {
2084       LOG.info("Undeploying meta region " + hri + " from server " + hsa);
2085       undeployRegion(new HBaseAdmin(conf), hsa, hri);
2086     }
2087 
2088     if (regionInfoOnly) {
2089       LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
2090       Path rootDir = FSUtils.getRootDir(conf);
2091       FileSystem fs = rootDir.getFileSystem(conf);
2092       Path p = new Path(rootDir + "/" + HTableDescriptor.META_TABLEDESC.getNameAsString(),
2093           hri.getEncodedName());
2094       Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
2095       fs.delete(hriPath, true);
2096     }
2097 
2098     if (hdfs) {
2099       LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
2100       Path rootDir = FSUtils.getRootDir(conf);
2101       FileSystem fs = rootDir.getFileSystem(conf);
2102       Path p = new Path(rootDir + "/" + HTableDescriptor.META_TABLEDESC.getNameAsString(),
2103           hri.getEncodedName());
2104       HBaseFsck.debugLsr(conf, p);
2105       boolean success = fs.delete(p, true);
2106       LOG.info("Deleted " + p + " sucessfully? " + success);
2107       HBaseFsck.debugLsr(conf, p);
2108     }
2109   }
2110   
2111   @org.junit.Rule
2112   public TestName name = new TestName();
2113 }