View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.util;
20  
21  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertErrors;
22  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertNoErrors;
23  import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.doFsck;
24  import static org.junit.Assert.assertEquals;
25  import static org.junit.Assert.assertFalse;
26  import static org.junit.Assert.assertNotEquals;
27  import static org.junit.Assert.assertNotNull;
28  import static org.junit.Assert.assertTrue;
29  import static org.junit.Assert.fail;
30  
31  import java.io.IOException;
32  import java.util.ArrayList;
33  import java.util.Collection;
34  import java.util.HashMap;
35  import java.util.LinkedList;
36  import java.util.List;
37  import java.util.Map;
38  import java.util.Map.Entry;
39  import java.util.concurrent.Callable;
40  import java.util.concurrent.CountDownLatch;
41  import java.util.concurrent.ExecutorService;
42  import java.util.concurrent.Executors;
43  import java.util.concurrent.Future;
44  import java.util.concurrent.ScheduledThreadPoolExecutor;
45  import java.util.concurrent.SynchronousQueue;
46  import java.util.concurrent.ThreadPoolExecutor;
47  import java.util.concurrent.TimeUnit;
48  import java.util.concurrent.atomic.AtomicBoolean;
49  
50  import org.apache.commons.io.IOUtils;
51  import org.apache.commons.logging.Log;
52  import org.apache.commons.logging.LogFactory;
53  import org.apache.hadoop.conf.Configuration;
54  import org.apache.hadoop.fs.FileStatus;
55  import org.apache.hadoop.fs.FileSystem;
56  import org.apache.hadoop.fs.Path;
57  import org.apache.hadoop.hbase.ClusterStatus;
58  import org.apache.hadoop.hbase.HBaseTestingUtility;
59  import org.apache.hadoop.hbase.HColumnDescriptor;
60  import org.apache.hadoop.hbase.HConstants;
61  import org.apache.hadoop.hbase.HRegionInfo;
62  import org.apache.hadoop.hbase.HRegionLocation;
63  import org.apache.hadoop.hbase.HTableDescriptor;
64  import org.apache.hadoop.hbase.TableExistsException;
65  import org.apache.hadoop.hbase.testclassification.LargeTests;
66  import org.apache.hadoop.hbase.MiniHBaseCluster;
67  import org.apache.hadoop.hbase.ServerName;
68  import org.apache.hadoop.hbase.TableName;
69  import org.apache.hadoop.hbase.catalog.MetaEditor;
70  import org.apache.hadoop.hbase.client.Delete;
71  import org.apache.hadoop.hbase.client.Durability;
72  import org.apache.hadoop.hbase.client.Get;
73  import org.apache.hadoop.hbase.client.HBaseAdmin;
74  import org.apache.hadoop.hbase.client.HConnection;
75  import org.apache.hadoop.hbase.client.HConnectionManager;
76  import org.apache.hadoop.hbase.client.HTable;
77  import org.apache.hadoop.hbase.client.MetaScanner;
78  import org.apache.hadoop.hbase.client.Put;
79  import org.apache.hadoop.hbase.client.Result;
80  import org.apache.hadoop.hbase.client.ResultScanner;
81  import org.apache.hadoop.hbase.client.Scan;
82  import org.apache.hadoop.hbase.io.hfile.TestHFile;
83  import org.apache.hadoop.hbase.master.AssignmentManager;
84  import org.apache.hadoop.hbase.master.HMaster;
85  import org.apache.hadoop.hbase.master.RegionState;
86  import org.apache.hadoop.hbase.master.RegionStates;
87  import org.apache.hadoop.hbase.master.TableLockManager;
88  import org.apache.hadoop.hbase.master.TableLockManager.TableLock;
89  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
90  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
91  import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
92  import org.apache.hadoop.hbase.regionserver.HRegion;
93  import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
94  import org.apache.hadoop.hbase.regionserver.HRegionServer;
95  import org.apache.hadoop.hbase.regionserver.SplitTransaction;
96  import org.apache.hadoop.hbase.regionserver.TestEndToEndSplitTransaction;
97  import org.apache.hadoop.hbase.testclassification.LargeTests;
98  import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter;
99  import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
100 import org.apache.hadoop.hbase.util.HBaseFsck.HbckInfo;
101 import org.apache.hadoop.hbase.util.HBaseFsck.PrintingErrorReporter;
102 import org.apache.hadoop.hbase.util.HBaseFsck.TableInfo;
103 import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
104 import org.apache.hadoop.hbase.util.hbck.HbckTestingUtil;
105 import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
106 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
107 import org.apache.zookeeper.KeeperException;
108 import org.junit.AfterClass;
109 import org.junit.Assert;
110 import org.junit.Before;
111 import org.junit.BeforeClass;
112 import org.junit.Ignore;
113 import org.junit.Test;
114 import org.junit.experimental.categories.Category;
115 import org.junit.rules.TestName;
116 
117 import com.google.common.collect.Multimap;
118 
119 /**
120  * This tests HBaseFsck's ability to detect reasons for inconsistent tables.
121  */
122 @Category(LargeTests.class)
123 public class TestHBaseFsck {
124   final static Log LOG = LogFactory.getLog(TestHBaseFsck.class);
125   private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
126   private final static Configuration conf = TEST_UTIL.getConfiguration();
127   private final static String FAM_STR = "fam";
128   private final static byte[] FAM = Bytes.toBytes(FAM_STR);
129   private final static int REGION_ONLINE_TIMEOUT = 800;
130   private static RegionStates regionStates;
131   private static ExecutorService executorService;
132 
133   // for the instance, reset every test run
134   private HTable tbl;
135   private final static byte[][] SPLITS = new byte[][] { Bytes.toBytes("A"),
136     Bytes.toBytes("B"), Bytes.toBytes("C") };
137   // one row per region.
138   private final static byte[][] ROWKEYS= new byte[][] {
139     Bytes.toBytes("00"), Bytes.toBytes("50"), Bytes.toBytes("A0"), Bytes.toBytes("A5"),
140     Bytes.toBytes("B0"), Bytes.toBytes("B5"), Bytes.toBytes("C0"), Bytes.toBytes("C5") };
141 
142   @SuppressWarnings("deprecation")
143   @BeforeClass
144   public static void setUpBeforeClass() throws Exception {
145     TEST_UTIL.getConfiguration().setInt("hbase.regionserver.handler.count", 2);
146     TEST_UTIL.getConfiguration().setInt("hbase.regionserver.metahandler.count", 2);
147     TEST_UTIL.getConfiguration().setInt("hbase.hbck.close.timeout", 2 * REGION_ONLINE_TIMEOUT);
148     TEST_UTIL.startMiniCluster(3);
149     TEST_UTIL.setHDFSClientRetry(0);
150 
151     executorService = new ThreadPoolExecutor(1, Integer.MAX_VALUE, 60, TimeUnit.SECONDS,
152         new SynchronousQueue<Runnable>(), Threads.newDaemonThreadFactory("testhbck"));
153 
154     AssignmentManager assignmentManager =
155       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager();
156     regionStates = assignmentManager.getRegionStates();
157     TEST_UTIL.getHBaseAdmin().setBalancerRunning(false, true);
158   }
159 
160   @AfterClass
161   public static void tearDownAfterClass() throws Exception {
162     TEST_UTIL.shutdownMiniCluster();
163   }
164 
165   @Test
166   public void testHBaseFsck() throws Exception {
167     assertNoErrors(doFsck(conf, false));
168     String table = "tableBadMetaAssign";
169     TEST_UTIL.createTable(Bytes.toBytes(table), FAM);
170 
171     // We created 1 table, should be fine
172     assertNoErrors(doFsck(conf, false));
173 
174     // Now let's mess it up and change the assignment in hbase:meta to
175     // point to a different region server
176     HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
177     Scan scan = new Scan();
178     scan.setStartRow(Bytes.toBytes(table+",,"));
179     ResultScanner scanner = meta.getScanner(scan);
180     HRegionInfo hri = null;
181 
182     Result res = scanner.next();
183     ServerName currServer =
184       ServerName.parseFrom(res.getValue(HConstants.CATALOG_FAMILY,
185           HConstants.SERVER_QUALIFIER));
186     long startCode = Bytes.toLong(res.getValue(HConstants.CATALOG_FAMILY,
187         HConstants.STARTCODE_QUALIFIER));
188 
189     for (JVMClusterUtil.RegionServerThread rs :
190         TEST_UTIL.getHBaseCluster().getRegionServerThreads()) {
191 
192       ServerName sn = rs.getRegionServer().getServerName();
193 
194       // When we find a diff RS, change the assignment and break
195       if (!currServer.getHostAndPort().equals(sn.getHostAndPort()) ||
196           startCode != sn.getStartcode()) {
197         Put put = new Put(res.getRow());
198         put.setDurability(Durability.SKIP_WAL);
199         put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
200           Bytes.toBytes(sn.getHostAndPort()));
201         put.add(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER,
202           Bytes.toBytes(sn.getStartcode()));
203         meta.put(put);
204         hri = HRegionInfo.getHRegionInfo(res);
205         break;
206       }
207     }
208 
209     // Try to fix the data
210     assertErrors(doFsck(conf, true), new ERROR_CODE[]{
211         ERROR_CODE.SERVER_DOES_NOT_MATCH_META});
212 
213     TEST_UTIL.getHBaseCluster().getMaster()
214       .getAssignmentManager().waitForAssignment(hri);
215 
216     // Should be fixed now
217     assertNoErrors(doFsck(conf, false));
218 
219     // comment needed - what is the purpose of this line
220     HTable t = new HTable(conf, Bytes.toBytes(table), executorService);
221     ResultScanner s = t.getScanner(new Scan());
222     s.close();
223     t.close();
224 
225     scanner.close();
226     meta.close();
227   }
228 
229   @Test(timeout=180000)
230   public void testFixAssignmentsWhenMETAinTransition() throws Exception {
231     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
232     HBaseAdmin admin = null;
233     try {
234       admin = new HBaseAdmin(TEST_UTIL.getConfiguration());
235       admin.closeRegion(cluster.getServerHoldingMeta(),
236           HRegionInfo.FIRST_META_REGIONINFO);
237     } finally {
238       if (admin != null) {
239         admin.close();
240       }
241     }
242     regionStates.regionOffline(HRegionInfo.FIRST_META_REGIONINFO);
243     MetaRegionTracker.deleteMetaLocation(cluster.getMaster().getZooKeeper());
244     assertFalse(regionStates.isRegionOnline(HRegionInfo.FIRST_META_REGIONINFO));
245     HBaseFsck hbck = doFsck(conf, true);
246     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.UNKNOWN, ERROR_CODE.NO_META_REGION,
247         ERROR_CODE.NULL_META_REGION });
248     assertNoErrors(doFsck(conf, false));
249   }
250 
251   /**
252    * Create a new region in META.
253    */
254   private HRegionInfo createRegion(Configuration conf, final HTableDescriptor
255       htd, byte[] startKey, byte[] endKey)
256       throws IOException {
257     HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
258     HRegionInfo hri = new HRegionInfo(htd.getTableName(), startKey, endKey);
259     MetaEditor.addRegionToMeta(meta, hri);
260     meta.close();
261     return hri;
262   }
263 
264   /**
265    * Debugging method to dump the contents of meta.
266    */
267   private void dumpMeta(TableName tableName) throws IOException {
268     List<byte[]> metaRows = TEST_UTIL.getMetaTableRows(tableName);
269     for (byte[] row : metaRows) {
270       LOG.info(Bytes.toString(row));
271     }
272   }
273 
274   /**
275    * This method is used to undeploy a region -- close it and attempt to
276    * remove its state from the Master.
277    */
278   private void undeployRegion(HBaseAdmin admin, ServerName sn,
279       HRegionInfo hri) throws IOException, InterruptedException {
280     try {
281       HBaseFsckRepair.closeRegionSilentlyAndWait(admin, sn, hri);
282       if (!hri.isMetaTable()) {
283         admin.offline(hri.getRegionName());
284       }
285     } catch (IOException ioe) {
286       LOG.warn("Got exception when attempting to offline region "
287           + Bytes.toString(hri.getRegionName()), ioe);
288     }
289   }
290   /**
291    * Delete a region from assignments, meta, or completely from hdfs.
292    * @param unassign if true unassign region if assigned
293    * @param metaRow  if true remove region's row from META
294    * @param hdfs if true remove region's dir in HDFS
295    */
296   private void deleteRegion(Configuration conf, final HTableDescriptor htd,
297       byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
298       boolean hdfs) throws IOException, InterruptedException {
299     deleteRegion(conf, htd, startKey, endKey, unassign, metaRow, hdfs, false);
300   }
301 
302   /**
303    * Delete a region from assignments, meta, or completely from hdfs.
304    * @param unassign if true unassign region if assigned
305    * @param metaRow  if true remove region's row from META
306    * @param hdfs if true remove region's dir in HDFS
307    * @param regionInfoOnly if true remove a region dir's .regioninfo file
308    */
309   private void deleteRegion(Configuration conf, final HTableDescriptor htd,
310       byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
311       boolean hdfs, boolean regionInfoOnly) throws IOException, InterruptedException {
312     LOG.info("** Before delete:");
313     dumpMeta(htd.getTableName());
314 
315     Map<HRegionInfo, ServerName> hris = tbl.getRegionLocations();
316     for (Entry<HRegionInfo, ServerName> e: hris.entrySet()) {
317       HRegionInfo hri = e.getKey();
318       ServerName hsa = e.getValue();
319       if (Bytes.compareTo(hri.getStartKey(), startKey) == 0
320           && Bytes.compareTo(hri.getEndKey(), endKey) == 0) {
321 
322         LOG.info("RegionName: " +hri.getRegionNameAsString());
323         byte[] deleteRow = hri.getRegionName();
324 
325         if (unassign) {
326           LOG.info("Undeploying region " + hri + " from server " + hsa);
327           undeployRegion(new HBaseAdmin(conf), hsa, hri);
328         }
329 
330         if (regionInfoOnly) {
331           LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
332           Path rootDir = FSUtils.getRootDir(conf);
333           FileSystem fs = rootDir.getFileSystem(conf);
334           Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
335               hri.getEncodedName());
336           Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
337           fs.delete(hriPath, true);
338         }
339 
340         if (hdfs) {
341           LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
342           Path rootDir = FSUtils.getRootDir(conf);
343           FileSystem fs = rootDir.getFileSystem(conf);
344           Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
345               hri.getEncodedName());
346           HBaseFsck.debugLsr(conf, p);
347           boolean success = fs.delete(p, true);
348           LOG.info("Deleted " + p + " sucessfully? " + success);
349           HBaseFsck.debugLsr(conf, p);
350         }
351 
352         if (metaRow) {
353           HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
354           Delete delete = new Delete(deleteRow);
355           meta.delete(delete);
356         }
357       }
358       LOG.info(hri.toString() + hsa.toString());
359     }
360 
361     TEST_UTIL.getMetaTableRows(htd.getTableName());
362     LOG.info("*** After delete:");
363     dumpMeta(htd.getTableName());
364   }
365 
366   /**
367    * Setup a clean table before we start mucking with it.
368    *
369    * @throws IOException
370    * @throws InterruptedException
371    * @throws KeeperException
372    */
373   HTable setupTable(TableName tablename) throws Exception {
374     HTableDescriptor desc = new HTableDescriptor(tablename);
375     HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
376     desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
377     TEST_UTIL.getHBaseAdmin().createTable(desc, SPLITS);
378     tbl = new HTable(TEST_UTIL.getConfiguration(), tablename, executorService);
379 
380     List<Put> puts = new ArrayList<Put>();
381     for (byte[] row : ROWKEYS) {
382       Put p = new Put(row);
383       p.add(FAM, Bytes.toBytes("val"), row);
384       puts.add(p);
385     }
386     tbl.put(puts);
387     tbl.flushCommits();
388     return tbl;
389   }
390 
391   /**
392    * Counts the number of rows to verify data loss or non-dataloss.
393    */
394   int countRows() throws IOException {
395      Scan s = new Scan();
396      ResultScanner rs = tbl.getScanner(s);
397      int i = 0;
398      while(rs.next() !=null) {
399        i++;
400      }
401      return i;
402   }
403 
404   /**
405    * Counts the number of rows to verify data loss or non-dataloss.
406    */
407   int countRows(byte[] start, byte[] end) throws IOException {
408     Scan s = new Scan(start, end);
409     ResultScanner rs = tbl.getScanner(s);
410     int i = 0;
411     while (rs.next() != null) {
412       i++;
413     }
414     return i;
415   }  
416   /**
417    * delete table in preparation for next test
418    *
419    * @param tablename
420    * @throws IOException
421    */
422   void deleteTable(TableName tablename) throws IOException {
423     HBaseAdmin admin = new HBaseAdmin(conf);
424     admin.getConnection().clearRegionCache();
425     if (admin.isTableEnabled(tablename)) {
426       admin.disableTableAsync(tablename);
427     }
428     long totalWait = 0;
429     long maxWait = 30*1000;
430     long sleepTime = 250;
431     while (!admin.isTableDisabled(tablename)) {
432       try {
433         Thread.sleep(sleepTime);
434         totalWait += sleepTime;
435         if (totalWait >= maxWait) {
436           fail("Waited too long for table to be disabled + " + tablename);
437         }
438       } catch (InterruptedException e) {
439         e.printStackTrace();
440         fail("Interrupted when trying to disable table " + tablename);
441       }
442     }
443     admin.deleteTable(tablename);
444   }
445 
446   /**
447    * This creates a clean table and confirms that the table is clean.
448    */
449   @Test
450   public void testHBaseFsckClean() throws Exception {
451     assertNoErrors(doFsck(conf, false));
452     TableName table = TableName.valueOf("tableClean");
453     try {
454       HBaseFsck hbck = doFsck(conf, false);
455       assertNoErrors(hbck);
456 
457       setupTable(table);
458       assertEquals(ROWKEYS.length, countRows());
459 
460       // We created 1 table, should be fine
461       hbck = doFsck(conf, false);
462       assertNoErrors(hbck);
463       assertEquals(0, hbck.getOverlapGroups(table).size());
464       assertEquals(ROWKEYS.length, countRows());
465     } finally {
466       deleteTable(table);
467     }
468   }
469 
470   /**
471    * Test thread pooling in the case where there are more regions than threads
472    */
473   @Test
474   public void testHbckThreadpooling() throws Exception {
475     TableName table =
476         TableName.valueOf("tableDupeStartKey");
477     try {
478       // Create table with 4 regions
479       setupTable(table);
480 
481       // limit number of threads to 1.
482       Configuration newconf = new Configuration(conf);
483       newconf.setInt("hbasefsck.numthreads", 1);
484       assertNoErrors(doFsck(newconf, false));
485 
486       // We should pass without triggering a RejectedExecutionException
487     } finally {
488       deleteTable(table);
489     }
490   }
491 
492   @Test
493   public void testHbckFixOrphanTable() throws Exception {
494     TableName table = TableName.valueOf("tableInfo");
495     FileSystem fs = null;
496     Path tableinfo = null;
497     try {
498       setupTable(table);
499       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
500 
501       Path hbaseTableDir = FSUtils.getTableDir(
502           FSUtils.getRootDir(conf), table);
503       fs = hbaseTableDir.getFileSystem(conf);
504       FileStatus status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
505       tableinfo = status.getPath();
506       fs.rename(tableinfo, new Path("/.tableinfo"));
507 
508       //to report error if .tableinfo is missing.
509       HBaseFsck hbck = doFsck(conf, false);
510       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_TABLEINFO_FILE });
511 
512       // fix OrphanTable with default .tableinfo (htd not yet cached on master)
513       hbck = doFsck(conf, true);
514       assertNoErrors(hbck);
515       status = null;
516       status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
517       assertNotNull(status);
518 
519       HTableDescriptor htd = admin.getTableDescriptor(table);
520       htd.setValue("NOT_DEFAULT", "true");
521       admin.disableTable(table);
522       admin.modifyTable(table, htd);
523       admin.enableTable(table);
524       fs.delete(status.getPath(), true);
525 
526       // fix OrphanTable with cache
527       htd = admin.getTableDescriptor(table); // warms up cached htd on master
528       hbck = doFsck(conf, true);
529       assertNoErrors(hbck);
530       status = null;
531       status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
532       assertNotNull(status);
533       htd = admin.getTableDescriptor(table);
534       assertEquals(htd.getValue("NOT_DEFAULT"), "true");
535     } finally {
536       fs.rename(new Path("/.tableinfo"), tableinfo);
537       deleteTable(table);
538     }
539   }
540 
541   /**
542    * This test makes sure that parallel instances of Hbck is disabled.
543    *
544    * @throws Exception
545    */
546   @Test
547   public void testParallelHbck() throws Exception {
548     final ExecutorService service;
549     final Future<HBaseFsck> hbck1,hbck2;
550 
551     class RunHbck implements Callable<HBaseFsck>{
552       boolean fail = true;
553       @Override
554       public HBaseFsck call(){
555         try{
556           return doFsck(conf, false);
557         } catch(Exception e){
558           if (e.getMessage().contains("Duplicate hbck")) {
559             fail = false;
560           } else {
561             LOG.fatal("hbck failed.", e);
562           }
563         }
564         // If we reach here, then an exception was caught
565         if (fail) fail();
566         return null;
567       }
568     }
569     service = Executors.newFixedThreadPool(2);
570     hbck1 = service.submit(new RunHbck());
571     hbck2 = service.submit(new RunHbck());
572     service.shutdown();
573     //wait for 15 seconds, for both hbck calls finish
574     service.awaitTermination(15, TimeUnit.SECONDS);
575     HBaseFsck h1 = hbck1.get();
576     HBaseFsck h2 = hbck2.get();
577     // Make sure only one of the calls was successful
578     assert(h1 == null || h2 == null);
579     if (h1 != null) {
580       assert(h1.getRetCode() >= 0);
581     }
582     if (h2 != null) {
583       assert(h2.getRetCode() >= 0);
584     }
585   }
586 
587   /**
588    * This create and fixes a bad table with regions that have a duplicate
589    * start key
590    */
591   @Test
592   public void testDupeStartKey() throws Exception {
593     TableName table =
594         TableName.valueOf("tableDupeStartKey");
595     try {
596       setupTable(table);
597       assertNoErrors(doFsck(conf, false));
598       assertEquals(ROWKEYS.length, countRows());
599 
600       // Now let's mess it up, by adding a region with a duplicate startkey
601       HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
602           Bytes.toBytes("A"), Bytes.toBytes("A2"));
603       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
604       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
605           .waitForAssignment(hriDupe);
606       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
607       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
608 
609       HBaseFsck hbck = doFsck(conf, false);
610       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
611             ERROR_CODE.DUPE_STARTKEYS});
612       assertEquals(2, hbck.getOverlapGroups(table).size());
613       assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.
614 
615       // fix the degenerate region.
616       doFsck(conf,true);
617 
618       // check that the degenerate region is gone and no data loss
619       HBaseFsck hbck2 = doFsck(conf,false);
620       assertNoErrors(hbck2);
621       assertEquals(0, hbck2.getOverlapGroups(table).size());
622       assertEquals(ROWKEYS.length, countRows());
623     } finally {
624       deleteTable(table);
625     }
626   }
627 
628   /**
629    * Get region info from local cluster.
630    */
631   Map<ServerName, List<String>> getDeployedHRIs(
632       final HBaseAdmin admin) throws IOException {
633     ClusterStatus status = admin.getClusterStatus();
634     Collection<ServerName> regionServers = status.getServers();
635     Map<ServerName, List<String>> mm =
636         new HashMap<ServerName, List<String>>();
637     HConnection connection = admin.getConnection();
638     for (ServerName hsi : regionServers) {
639       AdminProtos.AdminService.BlockingInterface server = connection.getAdmin(hsi);
640 
641       // list all online regions from this region server
642       List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(server);
643       List<String> regionNames = new ArrayList<String>();
644       for (HRegionInfo hri : regions) {
645         regionNames.add(hri.getRegionNameAsString());
646       }
647       mm.put(hsi, regionNames);
648     }
649     return mm;
650   }
651 
652   /**
653    * Returns the HSI a region info is on.
654    */
655   ServerName findDeployedHSI(Map<ServerName, List<String>> mm, HRegionInfo hri) {
656     for (Map.Entry<ServerName,List <String>> e : mm.entrySet()) {
657       if (e.getValue().contains(hri.getRegionNameAsString())) {
658         return e.getKey();
659       }
660     }
661     return null;
662   }
663 
664   /**
665    * This create and fixes a bad table with regions that have a duplicate
666    * start key
667    */
668   @Test
669   public void testDupeRegion() throws Exception {
670     TableName table =
671         TableName.valueOf("tableDupeRegion");
672     try {
673       setupTable(table);
674       assertNoErrors(doFsck(conf, false));
675       assertEquals(ROWKEYS.length, countRows());
676 
677       // Now let's mess it up, by adding a region with a duplicate startkey
678       HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
679           Bytes.toBytes("A"), Bytes.toBytes("B"));
680 
681       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
682       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
683           .waitForAssignment(hriDupe);
684       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
685       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
686 
687       // Yikes! The assignment manager can't tell between diff between two
688       // different regions with the same start/endkeys since it doesn't
689       // differentiate on ts/regionId!  We actually need to recheck
690       // deployments!
691       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
692       while (findDeployedHSI(getDeployedHRIs(admin), hriDupe) == null) {
693         Thread.sleep(250);
694       }
695 
696       LOG.debug("Finished assignment of dupe region");
697 
698       // TODO why is dupe region different from dupe start keys?
699       HBaseFsck hbck = doFsck(conf, false);
700       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
701             ERROR_CODE.DUPE_STARTKEYS});
702       assertEquals(2, hbck.getOverlapGroups(table).size());
703       assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.
704 
705       // fix the degenerate region.
706       doFsck(conf,true);
707 
708       // check that the degenerate region is gone and no data loss
709       HBaseFsck hbck2 = doFsck(conf,false);
710       assertNoErrors(hbck2);
711       assertEquals(0, hbck2.getOverlapGroups(table).size());
712       assertEquals(ROWKEYS.length, countRows());
713     } finally {
714       deleteTable(table);
715     }
716   }
717 
718   /**
719    * This creates and fixes a bad table with regions that has startkey == endkey
720    */
721   @Test
722   public void testDegenerateRegions() throws Exception {
723     TableName table =
724         TableName.valueOf("tableDegenerateRegions");
725     try {
726       setupTable(table);
727       assertNoErrors(doFsck(conf,false));
728       assertEquals(ROWKEYS.length, countRows());
729 
730       // Now let's mess it up, by adding a region with a duplicate startkey
731       HRegionInfo hriDupe = createRegion(conf, tbl.getTableDescriptor(),
732           Bytes.toBytes("B"), Bytes.toBytes("B"));
733       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
734       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
735           .waitForAssignment(hriDupe);
736       ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
737       TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);
738 
739       HBaseFsck hbck = doFsck(conf,false);
740       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DEGENERATE_REGION,
741           ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.DUPE_STARTKEYS});
742       assertEquals(2, hbck.getOverlapGroups(table).size());
743       assertEquals(ROWKEYS.length, countRows());
744 
745       // fix the degenerate region.
746       doFsck(conf,true);
747 
748       // check that the degenerate region is gone and no data loss
749       HBaseFsck hbck2 = doFsck(conf,false);
750       assertNoErrors(hbck2);
751       assertEquals(0, hbck2.getOverlapGroups(table).size());
752       assertEquals(ROWKEYS.length, countRows());
753     } finally {
754       deleteTable(table);
755     }
756   }
757 
758   /**
759    * This creates and fixes a bad table where a region is completely contained
760    * by another region.
761    */
762   @Test
763   public void testContainedRegionOverlap() throws Exception {
764     TableName table =
765         TableName.valueOf("tableContainedRegionOverlap");
766     try {
767       setupTable(table);
768       assertEquals(ROWKEYS.length, countRows());
769 
770       // Mess it up by creating an overlap in the metadata
771       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
772           Bytes.toBytes("A2"), Bytes.toBytes("B"));
773       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
774       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
775           .waitForAssignment(hriOverlap);
776       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
777       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
778 
779       HBaseFsck hbck = doFsck(conf, false);
780       assertErrors(hbck, new ERROR_CODE[] {
781           ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
782       assertEquals(2, hbck.getOverlapGroups(table).size());
783       assertEquals(ROWKEYS.length, countRows());
784 
785       // fix the problem.
786       doFsck(conf, true);
787 
788       // verify that overlaps are fixed
789       HBaseFsck hbck2 = doFsck(conf,false);
790       assertNoErrors(hbck2);
791       assertEquals(0, hbck2.getOverlapGroups(table).size());
792       assertEquals(ROWKEYS.length, countRows());
793     } finally {
794        deleteTable(table);
795     }
796   }
797 
798   /**
799    * This creates and fixes a bad table where an overlap group of
800    * 3 regions. Set HBaseFsck.maxMerge to 2 to trigger sideline overlapped
801    * region. Mess around the meta data so that closeRegion/offlineRegion
802    * throws exceptions.
803    */
804   @Test
805   public void testSidelineOverlapRegion() throws Exception {
806     TableName table =
807         TableName.valueOf("testSidelineOverlapRegion");
808     try {
809       setupTable(table);
810       assertEquals(ROWKEYS.length, countRows());
811 
812       // Mess it up by creating an overlap
813       MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
814       HMaster master = cluster.getMaster();
815       HRegionInfo hriOverlap1 = createRegion(conf, tbl.getTableDescriptor(),
816         Bytes.toBytes("A"), Bytes.toBytes("AB"));
817       master.assignRegion(hriOverlap1);
818       master.getAssignmentManager().waitForAssignment(hriOverlap1);
819       HRegionInfo hriOverlap2 = createRegion(conf, tbl.getTableDescriptor(),
820         Bytes.toBytes("AB"), Bytes.toBytes("B"));
821       master.assignRegion(hriOverlap2);
822       master.getAssignmentManager().waitForAssignment(hriOverlap2);
823 
824       HBaseFsck hbck = doFsck(conf, false);
825       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.DUPE_STARTKEYS,
826         ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.OVERLAP_IN_REGION_CHAIN});
827       assertEquals(3, hbck.getOverlapGroups(table).size());
828       assertEquals(ROWKEYS.length, countRows());
829 
830       // mess around the overlapped regions, to trigger NotServingRegionException
831       Multimap<byte[], HbckInfo> overlapGroups = hbck.getOverlapGroups(table);
832       ServerName serverName = null;
833       byte[] regionName = null;
834       for (HbckInfo hbi: overlapGroups.values()) {
835         if ("A".equals(Bytes.toString(hbi.getStartKey()))
836             && "B".equals(Bytes.toString(hbi.getEndKey()))) {
837           regionName = hbi.getRegionName();
838 
839           // get an RS not serving the region to force bad assignment info in to META.
840           int k = cluster.getServerWith(regionName);
841           for (int i = 0; i < 3; i++) {
842             if (i != k) {
843               HRegionServer rs = cluster.getRegionServer(i);
844               serverName = rs.getServerName();
845               break;
846             }
847           }
848 
849           HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
850           HBaseFsckRepair.closeRegionSilentlyAndWait(admin,
851             cluster.getRegionServer(k).getServerName(), hbi.getHdfsHRI());
852           admin.offline(regionName);
853           break;
854         }
855       }
856 
857       assertNotNull(regionName);
858       assertNotNull(serverName);
859       HTable meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
860       Put put = new Put(regionName);
861       put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
862         Bytes.toBytes(serverName.getHostAndPort()));
863       meta.put(put);
864 
865       // fix the problem.
866       HBaseFsck fsck = new HBaseFsck(conf);
867       fsck.connect();
868       fsck.setDisplayFullReport(); // i.e. -details
869       fsck.setTimeLag(0);
870       fsck.setFixAssignments(true);
871       fsck.setFixMeta(true);
872       fsck.setFixHdfsHoles(true);
873       fsck.setFixHdfsOverlaps(true);
874       fsck.setFixHdfsOrphans(true);
875       fsck.setFixVersionFile(true);
876       fsck.setSidelineBigOverlaps(true);
877       fsck.setMaxMerge(2);
878       fsck.onlineHbck();
879 
880       // verify that overlaps are fixed, and there are less rows
881       // since one region is sidelined.
882       HBaseFsck hbck2 = doFsck(conf,false);
883       assertNoErrors(hbck2);
884       assertEquals(0, hbck2.getOverlapGroups(table).size());
885       assertTrue(ROWKEYS.length > countRows());
886     } finally {
887        deleteTable(table);
888     }
889   }
890 
891   /**
892    * This creates and fixes a bad table where a region is completely contained
893    * by another region, and there is a hole (sort of like a bad split)
894    */
895   @Test
896   public void testOverlapAndOrphan() throws Exception {
897     TableName table =
898         TableName.valueOf("tableOverlapAndOrphan");
899     try {
900       setupTable(table);
901       assertEquals(ROWKEYS.length, countRows());
902 
903       // Mess it up by creating an overlap in the metadata
904       TEST_UTIL.getHBaseAdmin().disableTable(table);
905       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
906           Bytes.toBytes("B"), true, true, false, true);
907       TEST_UTIL.getHBaseAdmin().enableTable(table);
908 
909       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
910           Bytes.toBytes("A2"), Bytes.toBytes("B"));
911       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
912       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
913           .waitForAssignment(hriOverlap);
914       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
915       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
916 
917       HBaseFsck hbck = doFsck(conf, false);
918       assertErrors(hbck, new ERROR_CODE[] {
919           ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
920           ERROR_CODE.HOLE_IN_REGION_CHAIN});
921 
922       // fix the problem.
923       doFsck(conf, true);
924 
925       // verify that overlaps are fixed
926       HBaseFsck hbck2 = doFsck(conf,false);
927       assertNoErrors(hbck2);
928       assertEquals(0, hbck2.getOverlapGroups(table).size());
929       assertEquals(ROWKEYS.length, countRows());
930     } finally {
931        deleteTable(table);
932     }
933   }
934 
935   /**
936    * This creates and fixes a bad table where a region overlaps two regions --
937    * a start key contained in another region and its end key is contained in
938    * yet another region.
939    */
940   @Test
941   public void testCoveredStartKey() throws Exception {
942     TableName table =
943         TableName.valueOf("tableCoveredStartKey");
944     try {
945       setupTable(table);
946       assertEquals(ROWKEYS.length, countRows());
947 
948       // Mess it up by creating an overlap in the metadata
949       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
950           Bytes.toBytes("A2"), Bytes.toBytes("B2"));
951       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
952       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
953           .waitForAssignment(hriOverlap);
954       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
955       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
956 
957       HBaseFsck hbck = doFsck(conf, false);
958       assertErrors(hbck, new ERROR_CODE[] {
959           ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
960           ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
961       assertEquals(3, hbck.getOverlapGroups(table).size());
962       assertEquals(ROWKEYS.length, countRows());
963 
964       // fix the problem.
965       doFsck(conf, true);
966 
967       // verify that overlaps are fixed
968       HBaseFsck hbck2 = doFsck(conf, false);
969       assertErrors(hbck2, new ERROR_CODE[0]);
970       assertEquals(0, hbck2.getOverlapGroups(table).size());
971       assertEquals(ROWKEYS.length, countRows());
972     } finally {
973       deleteTable(table);
974     }
975   }
976 
977   /**
978    * This creates and fixes a bad table with a missing region -- hole in meta
979    * and data missing in the fs.
980    */
981   @Test
982   public void testRegionHole() throws Exception {
983     TableName table =
984         TableName.valueOf("tableRegionHole");
985     try {
986       setupTable(table);
987       assertEquals(ROWKEYS.length, countRows());
988 
989       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
990       TEST_UTIL.getHBaseAdmin().disableTable(table);
991       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
992           Bytes.toBytes("C"), true, true, true);
993       TEST_UTIL.getHBaseAdmin().enableTable(table);
994 
995       HBaseFsck hbck = doFsck(conf, false);
996       assertErrors(hbck, new ERROR_CODE[] {
997           ERROR_CODE.HOLE_IN_REGION_CHAIN});
998       // holes are separate from overlap groups
999       assertEquals(0, hbck.getOverlapGroups(table).size());
1000 
1001       // fix hole
1002       doFsck(conf, true);
1003 
1004       // check that hole fixed
1005       assertNoErrors(doFsck(conf,false));
1006       assertEquals(ROWKEYS.length - 2 , countRows()); // lost a region so lost a row
1007     } finally {
1008       deleteTable(table);
1009     }
1010   }
1011 
1012   /**
1013    * This creates and fixes a bad table with a missing region -- hole in meta
1014    * and data present but .regioinfino missing (an orphan hdfs region)in the fs.
1015    */
1016   @Test
1017   public void testHDFSRegioninfoMissing() throws Exception {
1018     TableName table =
1019         TableName.valueOf("tableHDFSRegioininfoMissing");
1020     try {
1021       setupTable(table);
1022       assertEquals(ROWKEYS.length, countRows());
1023 
1024       // Mess it up by leaving a hole in the meta data
1025       TEST_UTIL.getHBaseAdmin().disableTable(table);
1026       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1027           Bytes.toBytes("C"), true, true, false, true);
1028       TEST_UTIL.getHBaseAdmin().enableTable(table);
1029 
1030       HBaseFsck hbck = doFsck(conf, false);
1031       assertErrors(hbck, new ERROR_CODE[] {
1032           ERROR_CODE.ORPHAN_HDFS_REGION,
1033           ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1034           ERROR_CODE.HOLE_IN_REGION_CHAIN});
1035       // holes are separate from overlap groups
1036       assertEquals(0, hbck.getOverlapGroups(table).size());
1037 
1038       // fix hole
1039       doFsck(conf, true);
1040 
1041       // check that hole fixed
1042       assertNoErrors(doFsck(conf, false));
1043       assertEquals(ROWKEYS.length, countRows());
1044     } finally {
1045       deleteTable(table);
1046     }
1047   }
1048 
1049   /**
1050    * This creates and fixes a bad table with a missing region -- hole in meta and data present but
1051    * .regioninfo missing (an orphan hdfs region)in the fs. At last we check every row was present
1052    * at the correct region.
1053    */
1054   @Test(timeout = 180000)
1055   public void testHDFSRegioninfoMissingAndCheckRegionBoundary() throws Exception {
1056     TableName table = TableName.valueOf("testHDFSRegioninfoMissingAndCheckRegionBoundary");
1057     try {
1058       setupTable(table);
1059       assertEquals(ROWKEYS.length, countRows());
1060   
1061       // Mess it up by leaving a hole in the meta data
1062       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
1063       admin.disableTable(table);
1064       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"), Bytes.toBytes("C"), true,
1065         true, false, true);
1066       admin.enableTable(table);
1067   
1068       HBaseFsck hbck = doFsck(conf, false);
1069       assertErrors(hbck,
1070         new HBaseFsck.ErrorReporter.ERROR_CODE[] {
1071             HBaseFsck.ErrorReporter.ERROR_CODE.ORPHAN_HDFS_REGION,
1072             HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1073             HBaseFsck.ErrorReporter.ERROR_CODE.HOLE_IN_REGION_CHAIN });
1074       // holes are separate from overlap groups
1075       assertEquals(0, hbck.getOverlapGroups(table).size());
1076   
1077       // fix hole
1078       doFsck(conf, true);
1079   
1080       // check that hole fixed
1081       assertNoErrors(doFsck(conf, false));
1082   
1083       // check data belong to the correct region,every scan should get one row.
1084       for (int i = 0; i < ROWKEYS.length; i++) {
1085         if (i != ROWKEYS.length - 1) {
1086           assertEquals(1, countRows(ROWKEYS[i], ROWKEYS[i + 1]));
1087         } else {
1088           assertEquals(1, countRows(ROWKEYS[i], null));
1089         }
1090       }
1091   
1092     } finally {
1093       deleteTable(table);
1094     }
1095   }
1096     
1097   /**
1098    * This creates and fixes a bad table with a region that is missing meta and
1099    * not assigned to a region server.
1100    */
1101   @Test
1102   public void testNotInMetaOrDeployedHole() throws Exception {
1103     TableName table =
1104         TableName.valueOf("tableNotInMetaOrDeployedHole");
1105     try {
1106       setupTable(table);
1107       assertEquals(ROWKEYS.length, countRows());
1108 
1109       // Mess it up by leaving a hole in the meta data
1110       TEST_UTIL.getHBaseAdmin().disableTable(table);
1111       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1112           Bytes.toBytes("C"), true, true, false); // don't rm from fs
1113       TEST_UTIL.getHBaseAdmin().enableTable(table);
1114 
1115       HBaseFsck hbck = doFsck(conf, false);
1116       assertErrors(hbck, new ERROR_CODE[] {
1117           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1118       // holes are separate from overlap groups
1119       assertEquals(0, hbck.getOverlapGroups(table).size());
1120 
1121       // fix hole
1122       assertErrors(doFsck(conf, true) , new ERROR_CODE[] {
1123           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1124 
1125       // check that hole fixed
1126       assertNoErrors(doFsck(conf,false));
1127       assertEquals(ROWKEYS.length, countRows());
1128     } finally {
1129       deleteTable(table);
1130     }
1131   }
1132 
1133   /**
1134    * This creates fixes a bad table with a hole in meta.
1135    */
1136   @Test
1137   public void testNotInMetaHole() throws Exception {
1138     TableName table =
1139         TableName.valueOf("tableNotInMetaHole");
1140     try {
1141       setupTable(table);
1142       assertEquals(ROWKEYS.length, countRows());
1143 
1144       // Mess it up by leaving a hole in the meta data
1145       TEST_UTIL.getHBaseAdmin().disableTable(table);
1146       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1147           Bytes.toBytes("C"), false, true, false); // don't rm from fs
1148       TEST_UTIL.getHBaseAdmin().enableTable(table);
1149 
1150       HBaseFsck hbck = doFsck(conf, false);
1151       assertErrors(hbck, new ERROR_CODE[] {
1152           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1153       // holes are separate from overlap groups
1154       assertEquals(0, hbck.getOverlapGroups(table).size());
1155 
1156       // fix hole
1157       assertErrors(doFsck(conf, true) , new ERROR_CODE[] {
1158           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1159 
1160       // check that hole fixed
1161       assertNoErrors(doFsck(conf,false));
1162       assertEquals(ROWKEYS.length, countRows());
1163     } finally {
1164       deleteTable(table);
1165     }
1166   }
1167 
1168   /**
1169    * This creates and fixes a bad table with a region that is in meta but has
1170    * no deployment or data hdfs
1171    */
1172   @Test
1173   public void testNotInHdfs() throws Exception {
1174     TableName table =
1175         TableName.valueOf("tableNotInHdfs");
1176     try {
1177       setupTable(table);
1178       assertEquals(ROWKEYS.length, countRows());
1179 
1180       // make sure data in regions, if in hlog only there is no data loss
1181       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1182 
1183       // Mess it up by leaving a hole in the hdfs data
1184       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1185           Bytes.toBytes("C"), false, false, true); // don't rm meta
1186 
1187       HBaseFsck hbck = doFsck(conf, false);
1188       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1189       // holes are separate from overlap groups
1190       assertEquals(0, hbck.getOverlapGroups(table).size());
1191 
1192       // fix hole
1193       doFsck(conf, true);
1194 
1195       // check that hole fixed
1196       assertNoErrors(doFsck(conf,false));
1197       assertEquals(ROWKEYS.length - 2, countRows());
1198     } finally {
1199       deleteTable(table);
1200     }
1201   }
1202 
1203   /**
1204    * This creates entries in hbase:meta with no hdfs data.  This should cleanly
1205    * remove the table.
1206    */
1207   @Test
1208   public void testNoHdfsTable() throws Exception {
1209     TableName table = TableName.valueOf("NoHdfsTable");
1210     setupTable(table);
1211     assertEquals(ROWKEYS.length, countRows());
1212 
1213     // make sure data in regions, if in hlog only there is no data loss
1214     TEST_UTIL.getHBaseAdmin().flush(table.getName());
1215 
1216     // Mess it up by deleting hdfs dirs
1217     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""),
1218         Bytes.toBytes("A"), false, false, true); // don't rm meta
1219     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1220         Bytes.toBytes("B"), false, false, true); // don't rm meta
1221     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1222         Bytes.toBytes("C"), false, false, true); // don't rm meta
1223     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"),
1224         Bytes.toBytes(""), false, false, true); // don't rm meta
1225 
1226     // also remove the table directory in hdfs
1227     deleteTableDir(table);
1228 
1229     HBaseFsck hbck = doFsck(conf, false);
1230     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS,
1231         ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS,
1232         ERROR_CODE.NOT_IN_HDFS,});
1233     // holes are separate from overlap groups
1234     assertEquals(0, hbck.getOverlapGroups(table).size());
1235 
1236     // fix hole
1237     doFsck(conf, true); // detect dangling regions and remove those
1238 
1239     // check that hole fixed
1240     assertNoErrors(doFsck(conf,false));
1241     assertFalse("Table "+ table + " should have been deleted",
1242         TEST_UTIL.getHBaseAdmin().tableExists(table));
1243   }
1244 
1245   public void deleteTableDir(TableName table) throws IOException {
1246     Path rootDir = FSUtils.getRootDir(conf);
1247     FileSystem fs = rootDir.getFileSystem(conf);
1248     Path p = FSUtils.getTableDir(rootDir, table);
1249     HBaseFsck.debugLsr(conf, p);
1250     boolean success = fs.delete(p, true);
1251     LOG.info("Deleted " + p + " sucessfully? " + success);
1252   }
1253 
1254   /**
1255    * when the hbase.version file missing, It is fix the fault.
1256    */
1257   @Test
1258   public void testNoVersionFile() throws Exception {
1259     // delete the hbase.version file
1260     Path rootDir = FSUtils.getRootDir(conf);
1261     FileSystem fs = rootDir.getFileSystem(conf);
1262     Path versionFile = new Path(rootDir, HConstants.VERSION_FILE_NAME);
1263     fs.delete(versionFile, true);
1264 
1265     // test
1266     HBaseFsck hbck = doFsck(conf, false);
1267     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_VERSION_FILE });
1268     // fix hbase.version missing
1269     doFsck(conf, true);
1270 
1271     // no version file fixed
1272     assertNoErrors(doFsck(conf, false));
1273   }
1274 
1275   /**
1276    * The region is not deployed when the table is disabled.
1277    */
1278   @Test
1279   public void testRegionShouldNotBeDeployed() throws Exception {
1280     TableName table =
1281         TableName.valueOf("tableRegionShouldNotBeDeployed");
1282     try {
1283       LOG.info("Starting testRegionShouldNotBeDeployed.");
1284       MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
1285       assertTrue(cluster.waitForActiveAndReadyMaster());
1286 
1287 
1288       byte[][] SPLIT_KEYS = new byte[][] { new byte[0], Bytes.toBytes("aaa"),
1289           Bytes.toBytes("bbb"), Bytes.toBytes("ccc"), Bytes.toBytes("ddd") };
1290       HTableDescriptor htdDisabled = new HTableDescriptor(table);
1291       htdDisabled.addFamily(new HColumnDescriptor(FAM));
1292 
1293       // Write the .tableinfo
1294       FSTableDescriptors fstd = new FSTableDescriptors(conf);
1295       fstd.createTableDescriptor(htdDisabled);
1296       List<HRegionInfo> disabledRegions = TEST_UTIL.createMultiRegionsInMeta(
1297           TEST_UTIL.getConfiguration(), htdDisabled, SPLIT_KEYS);
1298 
1299       // Let's just assign everything to first RS
1300       HRegionServer hrs = cluster.getRegionServer(0);
1301 
1302       // Create region files.
1303       TEST_UTIL.getHBaseAdmin().disableTable(table);
1304       TEST_UTIL.getHBaseAdmin().enableTable(table);
1305 
1306       // Disable the table and close its regions
1307       TEST_UTIL.getHBaseAdmin().disableTable(table);
1308       HRegionInfo region = disabledRegions.remove(0);
1309       byte[] regionName = region.getRegionName();
1310 
1311       // The region should not be assigned currently
1312       assertTrue(cluster.getServerWith(regionName) == -1);
1313 
1314       // Directly open a region on a region server.
1315       // If going through AM/ZK, the region won't be open.
1316       // Even it is opened, AM will close it which causes
1317       // flakiness of this test.
1318       HRegion r = HRegion.openHRegion(
1319         region, htdDisabled, hrs.getWAL(region), conf);
1320       hrs.addToOnlineRegions(r);
1321 
1322       HBaseFsck hbck = doFsck(conf, false);
1323       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.SHOULD_NOT_BE_DEPLOYED });
1324 
1325       // fix this fault
1326       doFsck(conf, true);
1327 
1328       // check result
1329       assertNoErrors(doFsck(conf, false));
1330     } finally {
1331       TEST_UTIL.getHBaseAdmin().enableTable(table);
1332       deleteTable(table);
1333     }
1334   }
1335 
1336   /**
1337    * This creates two tables and mess both of them and fix them one by one
1338    */
1339   @Test
1340   public void testFixByTable() throws Exception {
1341     TableName table1 =
1342         TableName.valueOf("testFixByTable1");
1343     TableName table2 =
1344         TableName.valueOf("testFixByTable2");
1345     try {
1346       setupTable(table1);
1347       // make sure data in regions, if in hlog only there is no data loss
1348       TEST_UTIL.getHBaseAdmin().flush(table1.getName());
1349       // Mess them up by leaving a hole in the hdfs data
1350       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1351         Bytes.toBytes("C"), false, false, true); // don't rm meta
1352 
1353       setupTable(table2);
1354       // make sure data in regions, if in hlog only there is no data loss
1355       TEST_UTIL.getHBaseAdmin().flush(table2.getName());
1356       // Mess them up by leaving a hole in the hdfs data
1357       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1358         Bytes.toBytes("C"), false, false, true); // don't rm meta
1359 
1360       HBaseFsck hbck = doFsck(conf, false);
1361       assertErrors(hbck, new ERROR_CODE[] {
1362         ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS});
1363 
1364       // fix hole in table 1
1365       doFsck(conf, true, table1);
1366       // check that hole in table 1 fixed
1367       assertNoErrors(doFsck(conf, false, table1));
1368       // check that hole in table 2 still there
1369       assertErrors(doFsck(conf, false, table2),
1370         new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1371 
1372       // fix hole in table 2
1373       doFsck(conf, true, table2);
1374       // check that hole in both tables fixed
1375       assertNoErrors(doFsck(conf, false));
1376       assertEquals(ROWKEYS.length - 2, countRows());
1377     } finally {
1378       deleteTable(table1);
1379       deleteTable(table2);
1380     }
1381   }
1382   /**
1383    * A split parent in meta, in hdfs, and not deployed
1384    */
1385   @Test
1386   public void testLingeringSplitParent() throws Exception {
1387     TableName table =
1388         TableName.valueOf("testLingeringSplitParent");
1389     HTable meta = null;
1390     try {
1391       setupTable(table);
1392       assertEquals(ROWKEYS.length, countRows());
1393 
1394       // make sure data in regions, if in hlog only there is no data loss
1395       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1396       HRegionLocation location = tbl.getRegionLocation("B");
1397 
1398       // Delete one region from meta, but not hdfs, unassign it.
1399       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1400         Bytes.toBytes("C"), true, true, false);
1401 
1402       // Create a new meta entry to fake it as a split parent.
1403       meta = new HTable(conf, TableName.META_TABLE_NAME, executorService);
1404       HRegionInfo hri = location.getRegionInfo();
1405 
1406       HRegionInfo a = new HRegionInfo(tbl.getName(),
1407         Bytes.toBytes("B"), Bytes.toBytes("BM"));
1408       HRegionInfo b = new HRegionInfo(tbl.getName(),
1409         Bytes.toBytes("BM"), Bytes.toBytes("C"));
1410 
1411       hri.setOffline(true);
1412       hri.setSplit(true);
1413 
1414       MetaEditor.addRegionToMeta(meta, hri, a, b);
1415       meta.flushCommits();
1416       TEST_UTIL.getHBaseAdmin().flush(TableName.META_TABLE_NAME.getName());
1417 
1418       HBaseFsck hbck = doFsck(conf, false);
1419       assertErrors(hbck, new ERROR_CODE[] {
1420         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1421 
1422       // regular repair cannot fix lingering split parent
1423       hbck = doFsck(conf, true);
1424       assertErrors(hbck, new ERROR_CODE[] {
1425         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1426       assertFalse(hbck.shouldRerun());
1427       hbck = doFsck(conf, false);
1428       assertErrors(hbck, new ERROR_CODE[] {
1429         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1430 
1431       // fix lingering split parent
1432       hbck = new HBaseFsck(conf);
1433       hbck.connect();
1434       hbck.setDisplayFullReport(); // i.e. -details
1435       hbck.setTimeLag(0);
1436       hbck.setFixSplitParents(true);
1437       hbck.onlineHbck();
1438       assertTrue(hbck.shouldRerun());
1439 
1440       Get get = new Get(hri.getRegionName());
1441       Result result = meta.get(get);
1442       assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
1443         HConstants.SPLITA_QUALIFIER).isEmpty());
1444       assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
1445         HConstants.SPLITB_QUALIFIER).isEmpty());
1446       TEST_UTIL.getHBaseAdmin().flush(TableName.META_TABLE_NAME.getName());
1447 
1448       // fix other issues
1449       doFsck(conf, true);
1450 
1451       // check that all are fixed
1452       assertNoErrors(doFsck(conf, false));
1453       assertEquals(ROWKEYS.length, countRows());
1454     } finally {
1455       deleteTable(table);
1456       IOUtils.closeQuietly(meta);
1457     }
1458   }
1459 
1460   /**
1461    * Tests that LINGERING_SPLIT_PARENT is not erroneously reported for
1462    * valid cases where the daughters are there.
1463    */
1464   @Test
1465   public void testValidLingeringSplitParent() throws Exception {
1466     TableName table =
1467         TableName.valueOf("testLingeringSplitParent");
1468     HTable meta = null;
1469     try {
1470       setupTable(table);
1471       assertEquals(ROWKEYS.length, countRows());
1472 
1473       // make sure data in regions, if in hlog only there is no data loss
1474       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1475       HRegionLocation location = tbl.getRegionLocation("B");
1476 
1477       meta = new HTable(conf, TableName.META_TABLE_NAME);
1478       HRegionInfo hri = location.getRegionInfo();
1479 
1480       // do a regular split
1481       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
1482       byte[] regionName = location.getRegionInfo().getRegionName();
1483       admin.split(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
1484       TestEndToEndSplitTransaction.blockUntilRegionSplit(
1485           TEST_UTIL.getConfiguration(), 60000, regionName, true);
1486 
1487       // TODO: fixHdfsHoles does not work against splits, since the parent dir lingers on
1488       // for some time until children references are deleted. HBCK erroneously sees this as
1489       // overlapping regions
1490       HBaseFsck hbck = doFsck(
1491         conf, true, true, false, false, false, true, true, true, false, false, false, null);
1492       assertErrors(hbck, new ERROR_CODE[] {}); //no LINGERING_SPLIT_PARENT reported
1493 
1494       // assert that the split hbase:meta entry is still there.
1495       Get get = new Get(hri.getRegionName());
1496       Result result = meta.get(get);
1497       assertNotNull(result);
1498       assertNotNull(HRegionInfo.getHRegionInfo(result));
1499 
1500       assertEquals(ROWKEYS.length, countRows());
1501 
1502       // assert that we still have the split regions
1503       assertEquals(tbl.getStartKeys().length, SPLITS.length + 1 + 1); //SPLITS + 1 is # regions pre-split.
1504       assertNoErrors(doFsck(conf, false));
1505     } finally {
1506       deleteTable(table);
1507       IOUtils.closeQuietly(meta);
1508     }
1509   }
1510 
1511   /**
1512    * Split crashed after write to hbase:meta finished for the parent region, but
1513    * failed to write daughters (pre HBASE-7721 codebase)
1514    */
1515   @Test(timeout=75000)
1516   public void testSplitDaughtersNotInMeta() throws Exception {
1517     TableName table =
1518         TableName.valueOf("testSplitdaughtersNotInMeta");
1519     HTable meta = null;
1520     try {
1521       setupTable(table);
1522       assertEquals(ROWKEYS.length, countRows());
1523 
1524       // make sure data in regions, if in hlog only there is no data loss
1525       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1526       HRegionLocation location = tbl.getRegionLocation("B");
1527 
1528       meta = new HTable(conf, TableName.META_TABLE_NAME);
1529       HRegionInfo hri = location.getRegionInfo();
1530 
1531       // do a regular split
1532       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
1533       byte[] regionName = location.getRegionInfo().getRegionName();
1534       admin.split(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
1535       TestEndToEndSplitTransaction.blockUntilRegionSplit(
1536           TEST_UTIL.getConfiguration(), 60000, regionName, true);
1537 
1538       PairOfSameType<HRegionInfo> daughters = HRegionInfo.getDaughterRegions(meta.get(new Get(regionName)));
1539 
1540       // Delete daughter regions from meta, but not hdfs, unassign it.
1541       Map<HRegionInfo, ServerName> hris = tbl.getRegionLocations();
1542       undeployRegion(admin, hris.get(daughters.getFirst()), daughters.getFirst());
1543       undeployRegion(admin, hris.get(daughters.getSecond()), daughters.getSecond());
1544 
1545       meta.delete(new Delete(daughters.getFirst().getRegionName()));
1546       meta.delete(new Delete(daughters.getSecond().getRegionName()));
1547       meta.flushCommits();
1548 
1549       HBaseFsck hbck = doFsck(conf, false);
1550       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1551           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN}); //no LINGERING_SPLIT_PARENT
1552 
1553       // now fix it. The fix should not revert the region split, but add daughters to META
1554       hbck = doFsck(
1555         conf, true, true, false, false, false, false, false, false, false, false, false, null);
1556       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1557           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1558 
1559       // assert that the split hbase:meta entry is still there.
1560       Get get = new Get(hri.getRegionName());
1561       Result result = meta.get(get);
1562       assertNotNull(result);
1563       assertNotNull(HRegionInfo.getHRegionInfo(result));
1564 
1565       assertEquals(ROWKEYS.length, countRows());
1566 
1567       // assert that we still have the split regions
1568       assertEquals(tbl.getStartKeys().length, SPLITS.length + 1 + 1); //SPLITS + 1 is # regions pre-split.
1569       assertNoErrors(doFsck(conf, false)); //should be fixed by now
1570     } finally {
1571       deleteTable(table);
1572       IOUtils.closeQuietly(meta);
1573     }
1574   }
1575 
1576   /**
1577    * This creates and fixes a bad table with a missing region which is the 1st region -- hole in
1578    * meta and data missing in the fs.
1579    */
1580   @Test(timeout=120000)
1581   public void testMissingFirstRegion() throws Exception {
1582     TableName table =
1583         TableName.valueOf("testMissingFirstRegion");
1584     try {
1585       setupTable(table);
1586       assertEquals(ROWKEYS.length, countRows());
1587 
1588       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1589       TEST_UTIL.getHBaseAdmin().disableTable(table);
1590       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""), Bytes.toBytes("A"), true,
1591           true, true);
1592       TEST_UTIL.getHBaseAdmin().enableTable(table);
1593 
1594       HBaseFsck hbck = doFsck(conf, false);
1595       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY });
1596       // fix hole
1597       doFsck(conf, true);
1598       // check that hole fixed
1599       assertNoErrors(doFsck(conf, false));
1600     } finally {
1601       deleteTable(table);
1602     }
1603   }
1604 
1605   /**
1606    * This creates and fixes a bad table with a missing region which is the 1st region -- hole in
1607    * meta and data missing in the fs.
1608    */
1609   @Test(timeout=120000)
1610   public void testRegionDeployedNotInHdfs() throws Exception {
1611     TableName table =
1612         TableName.valueOf("testSingleRegionDeployedNotInHdfs");
1613     try {
1614       setupTable(table);
1615       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1616 
1617       // Mess it up by deleting region dir
1618       deleteRegion(conf, tbl.getTableDescriptor(),
1619         HConstants.EMPTY_START_ROW, Bytes.toBytes("A"), false,
1620         false, true);
1621 
1622       HBaseFsck hbck = doFsck(conf, false);
1623       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_HDFS });
1624       // fix hole
1625       doFsck(conf, true);
1626       // check that hole fixed
1627       assertNoErrors(doFsck(conf, false));
1628     } finally {
1629       deleteTable(table);
1630     }
1631   }
1632 
1633   /**
1634    * This creates and fixes a bad table with missing last region -- hole in meta and data missing in
1635    * the fs.
1636    */
1637   @Test(timeout=120000)
1638   public void testMissingLastRegion() throws Exception {
1639     TableName table =
1640         TableName.valueOf("testMissingLastRegion");
1641     try {
1642       setupTable(table);
1643       assertEquals(ROWKEYS.length, countRows());
1644 
1645       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1646       TEST_UTIL.getHBaseAdmin().disableTable(table);
1647       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"), Bytes.toBytes(""), true,
1648           true, true);
1649       TEST_UTIL.getHBaseAdmin().enableTable(table);
1650 
1651       HBaseFsck hbck = doFsck(conf, false);
1652       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY });
1653       // fix hole
1654       doFsck(conf, true);
1655       // check that hole fixed
1656       assertNoErrors(doFsck(conf, false));
1657     } finally {
1658       deleteTable(table);
1659     }
1660   }
1661 
1662   /**
1663    * Test -noHdfsChecking option can detect and fix assignments issue.
1664    */
1665   @Test
1666   public void testFixAssignmentsAndNoHdfsChecking() throws Exception {
1667     TableName table =
1668         TableName.valueOf("testFixAssignmentsAndNoHdfsChecking");
1669     try {
1670       setupTable(table);
1671       assertEquals(ROWKEYS.length, countRows());
1672 
1673       // Mess it up by closing a region
1674       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1675         Bytes.toBytes("B"), true, false, false, false);
1676 
1677       // verify there is no other errors
1678       HBaseFsck hbck = doFsck(conf, false);
1679       assertErrors(hbck, new ERROR_CODE[] {
1680         ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1681 
1682       // verify that noHdfsChecking report the same errors
1683       HBaseFsck fsck = new HBaseFsck(conf);
1684       fsck.connect();
1685       fsck.setDisplayFullReport(); // i.e. -details
1686       fsck.setTimeLag(0);
1687       fsck.setCheckHdfs(false);
1688       fsck.onlineHbck();
1689       assertErrors(fsck, new ERROR_CODE[] {
1690         ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1691 
1692       // verify that fixAssignments works fine with noHdfsChecking
1693       fsck = new HBaseFsck(conf);
1694       fsck.connect();
1695       fsck.setDisplayFullReport(); // i.e. -details
1696       fsck.setTimeLag(0);
1697       fsck.setCheckHdfs(false);
1698       fsck.setFixAssignments(true);
1699       fsck.onlineHbck();
1700       assertTrue(fsck.shouldRerun());
1701       fsck.onlineHbck();
1702       assertNoErrors(fsck);
1703 
1704       assertEquals(ROWKEYS.length, countRows());
1705     } finally {
1706       deleteTable(table);
1707     }
1708   }
1709 
1710   /**
1711    * Test -noHdfsChecking option can detect region is not in meta but deployed.
1712    * However, it can not fix it without checking Hdfs because we need to get
1713    * the region info from Hdfs in this case, then to patch the meta.
1714    */
1715   @Test
1716   public void testFixMetaNotWorkingWithNoHdfsChecking() throws Exception {
1717     TableName table =
1718         TableName.valueOf("testFixMetaNotWorkingWithNoHdfsChecking");
1719     try {
1720       setupTable(table);
1721       assertEquals(ROWKEYS.length, countRows());
1722 
1723       // Mess it up by deleting a region from the metadata
1724       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1725         Bytes.toBytes("B"), false, true, false, false);
1726 
1727       // verify there is no other errors
1728       HBaseFsck hbck = doFsck(conf, false);
1729       assertErrors(hbck, new ERROR_CODE[] {
1730         ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1731 
1732       // verify that noHdfsChecking report the same errors
1733       HBaseFsck fsck = new HBaseFsck(conf);
1734       fsck.connect();
1735       fsck.setDisplayFullReport(); // i.e. -details
1736       fsck.setTimeLag(0);
1737       fsck.setCheckHdfs(false);
1738       fsck.onlineHbck();
1739       assertErrors(fsck, new ERROR_CODE[] {
1740         ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1741 
1742       // verify that fixMeta doesn't work with noHdfsChecking
1743       fsck = new HBaseFsck(conf);
1744       fsck.connect();
1745       fsck.setDisplayFullReport(); // i.e. -details
1746       fsck.setTimeLag(0);
1747       fsck.setCheckHdfs(false);
1748       fsck.setFixAssignments(true);
1749       fsck.setFixMeta(true);
1750       fsck.onlineHbck();
1751       assertFalse(fsck.shouldRerun());
1752       assertErrors(fsck, new ERROR_CODE[] {
1753         ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1754 
1755       // fix the cluster so other tests won't be impacted
1756       fsck = doFsck(conf, true);
1757       assertTrue(fsck.shouldRerun());
1758       fsck = doFsck(conf, true);
1759       assertNoErrors(fsck);
1760     } finally {
1761       deleteTable(table);
1762     }
1763   }
1764 
1765   /**
1766    * Test -fixHdfsHoles doesn't work with -noHdfsChecking option,
1767    * and -noHdfsChecking can't detect orphan Hdfs region.
1768    */
1769   @Test
1770   public void testFixHdfsHolesNotWorkingWithNoHdfsChecking() throws Exception {
1771     TableName table =
1772         TableName.valueOf("testFixHdfsHolesNotWorkingWithNoHdfsChecking");
1773     try {
1774       setupTable(table);
1775       assertEquals(ROWKEYS.length, countRows());
1776 
1777       // Mess it up by creating an overlap in the metadata
1778       TEST_UTIL.getHBaseAdmin().disableTable(table);
1779       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1780         Bytes.toBytes("B"), true, true, false, true);
1781       TEST_UTIL.getHBaseAdmin().enableTable(table);
1782 
1783       HRegionInfo hriOverlap = createRegion(conf, tbl.getTableDescriptor(),
1784         Bytes.toBytes("A2"), Bytes.toBytes("B"));
1785       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
1786       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
1787         .waitForAssignment(hriOverlap);
1788       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
1789       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
1790 
1791       HBaseFsck hbck = doFsck(conf, false);
1792       assertErrors(hbck, new ERROR_CODE[] {
1793         ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1794         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1795 
1796       // verify that noHdfsChecking can't detect ORPHAN_HDFS_REGION
1797       HBaseFsck fsck = new HBaseFsck(conf);
1798       fsck.connect();
1799       fsck.setDisplayFullReport(); // i.e. -details
1800       fsck.setTimeLag(0);
1801       fsck.setCheckHdfs(false);
1802       fsck.onlineHbck();
1803       assertErrors(fsck, new ERROR_CODE[] {
1804         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1805 
1806       // verify that fixHdfsHoles doesn't work with noHdfsChecking
1807       fsck = new HBaseFsck(conf);
1808       fsck.connect();
1809       fsck.setDisplayFullReport(); // i.e. -details
1810       fsck.setTimeLag(0);
1811       fsck.setCheckHdfs(false);
1812       fsck.setFixHdfsHoles(true);
1813       fsck.setFixHdfsOverlaps(true);
1814       fsck.setFixHdfsOrphans(true);
1815       fsck.onlineHbck();
1816       assertFalse(fsck.shouldRerun());
1817       assertErrors(fsck, new ERROR_CODE[] {
1818         ERROR_CODE.HOLE_IN_REGION_CHAIN});
1819     } finally {
1820       if (TEST_UTIL.getHBaseAdmin().isTableDisabled(table)) {
1821         TEST_UTIL.getHBaseAdmin().enableTable(table);
1822       }
1823       deleteTable(table);
1824     }
1825   }
1826 
1827   /**
1828    * We don't have an easy way to verify that a flush completed, so we loop until we find a
1829    * legitimate hfile and return it.
1830    * @param fs
1831    * @param table
1832    * @return Path of a flushed hfile.
1833    * @throws IOException
1834    */
1835   Path getFlushedHFile(FileSystem fs, TableName table) throws IOException {
1836     Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
1837     Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
1838     Path famDir = new Path(regionDir, FAM_STR);
1839 
1840     // keep doing this until we get a legit hfile
1841     while (true) {
1842       FileStatus[] hfFss = fs.listStatus(famDir);
1843       if (hfFss.length == 0) {
1844         continue;
1845       }
1846       for (FileStatus hfs : hfFss) {
1847         if (!hfs.isDir()) {
1848           return hfs.getPath();
1849         }
1850       }
1851     }
1852   }
1853 
1854   /**
1855    * This creates a table and then corrupts an hfile.  Hbck should quarantine the file.
1856    */
1857   @Test(timeout=180000)
1858   public void testQuarantineCorruptHFile() throws Exception {
1859     TableName table = TableName.valueOf(name.getMethodName());
1860     try {
1861       setupTable(table);
1862       assertEquals(ROWKEYS.length, countRows());
1863       TEST_UTIL.getHBaseAdmin().flush(table.getName()); // flush is async.
1864 
1865       FileSystem fs = FileSystem.get(conf);
1866       Path hfile = getFlushedHFile(fs, table);
1867 
1868       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1869       TEST_UTIL.getHBaseAdmin().disableTable(table);
1870 
1871       // create new corrupt file called deadbeef (valid hfile name)
1872       Path corrupt = new Path(hfile.getParent(), "deadbeef");
1873       TestHFile.truncateFile(fs, hfile, corrupt);
1874       LOG.info("Created corrupted file " + corrupt);
1875       HBaseFsck.debugLsr(conf, FSUtils.getRootDir(conf));
1876 
1877       // we cannot enable here because enable never finished due to the corrupt region.
1878       HBaseFsck res = HbckTestingUtil.doHFileQuarantine(conf, table);
1879       assertEquals(res.getRetCode(), 0);
1880       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
1881       assertEquals(hfcc.getHFilesChecked(), 5);
1882       assertEquals(hfcc.getCorrupted().size(), 1);
1883       assertEquals(hfcc.getFailures().size(), 0);
1884       assertEquals(hfcc.getQuarantined().size(), 1);
1885       assertEquals(hfcc.getMissing().size(), 0);
1886 
1887       // Its been fixed, verify that we can enable.
1888       TEST_UTIL.getHBaseAdmin().enableTable(table);
1889     } finally {
1890       deleteTable(table);
1891     }
1892   }
1893 
1894   /**
1895   * Test that use this should have a timeout, because this method could potentially wait forever.
1896   */
1897   private void doQuarantineTest(TableName table, HBaseFsck hbck, int check,
1898                                 int corrupt, int fail, int quar, int missing) throws Exception {
1899     try {
1900       setupTable(table);
1901       assertEquals(ROWKEYS.length, countRows());
1902       TEST_UTIL.getHBaseAdmin().flush(table.getName()); // flush is async.
1903 
1904       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1905       TEST_UTIL.getHBaseAdmin().disableTable(table);
1906 
1907       String[] args = {"-sidelineCorruptHFiles", "-repairHoles", "-ignorePreCheckPermission",
1908           table.getNameAsString()};
1909       ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1910       HBaseFsck res = hbck.exec(exec, args);
1911 
1912       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
1913       assertEquals(hfcc.getHFilesChecked(), check);
1914       assertEquals(hfcc.getCorrupted().size(), corrupt);
1915       assertEquals(hfcc.getFailures().size(), fail);
1916       assertEquals(hfcc.getQuarantined().size(), quar);
1917       assertEquals(hfcc.getMissing().size(), missing);
1918 
1919       // its been fixed, verify that we can enable
1920       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
1921       admin.enableTableAsync(table);
1922       while (!admin.isTableEnabled(table)) {
1923         try {
1924           Thread.sleep(250);
1925         } catch (InterruptedException e) {
1926           e.printStackTrace();
1927           fail("Interrupted when trying to enable table " + table);
1928         }
1929       }
1930     } finally {
1931       deleteTable(table);
1932     }
1933   }
1934 
1935   /**
1936    * This creates a table and simulates the race situation where a concurrent compaction or split
1937    * has removed an hfile after the corruption checker learned about it.
1938    */
1939   @Test(timeout=180000)
1940   public void testQuarantineMissingHFile() throws Exception {
1941     TableName table = TableName.valueOf(name.getMethodName());
1942     ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1943     // inject a fault in the hfcc created.
1944     final FileSystem fs = FileSystem.get(conf);
1945     HBaseFsck hbck = new HBaseFsck(conf, exec) {
1946       @Override
1947       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
1948         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1949           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
1950           @Override
1951           protected void checkHFile(Path p) throws IOException {
1952             if (attemptedFirstHFile.compareAndSet(false, true)) {
1953               assertTrue(fs.delete(p, true)); // make sure delete happened.
1954             }
1955             super.checkHFile(p);
1956           }
1957         };
1958       }
1959     };
1960     doQuarantineTest(table, hbck, 4, 0, 0, 0, 1); // 4 attempted, but 1 missing.
1961   }
1962 
1963   /**
1964    * This creates a table and simulates the race situation where a concurrent compaction or split
1965    * has removed an colfam dir before the corruption checker got to it.
1966    */
1967   // Disabled because fails sporadically.  Is this test right?  Timing-wise, there could be no
1968   // files in a column family on initial creation -- as suggested by Matteo.
1969   @Ignore @Test(timeout=180000)
1970   public void testQuarantineMissingFamdir() throws Exception {
1971     TableName table = TableName.valueOf(name.getMethodName());
1972     ExecutorService exec = new ScheduledThreadPoolExecutor(10);
1973     // inject a fault in the hfcc created.
1974     final FileSystem fs = FileSystem.get(conf);
1975     HBaseFsck hbck = new HBaseFsck(conf, exec) {
1976       @Override
1977       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
1978         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
1979           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
1980           @Override
1981           protected void checkColFamDir(Path p) throws IOException {
1982             if (attemptedFirstHFile.compareAndSet(false, true)) {
1983               assertTrue(fs.delete(p, true)); // make sure delete happened.
1984             }
1985             super.checkColFamDir(p);
1986           }
1987         };
1988       }
1989     };
1990     doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
1991   }
1992 
1993   /**
1994    * This creates a table and simulates the race situation where a concurrent compaction or split
1995    * has removed a region dir before the corruption checker got to it.
1996    */
1997   @Test(timeout=180000)
1998   public void testQuarantineMissingRegionDir() throws Exception {
1999     TableName table = TableName.valueOf(name.getMethodName());
2000     ExecutorService exec = new ScheduledThreadPoolExecutor(10);
2001     // inject a fault in the hfcc created.
2002     final FileSystem fs = FileSystem.get(conf);
2003     HBaseFsck hbck = new HBaseFsck(conf, exec) {
2004       @Override
2005       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
2006         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
2007           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
2008           @Override
2009           protected void checkRegionDir(Path p) throws IOException {
2010             if (attemptedFirstHFile.compareAndSet(false, true)) {
2011               assertTrue(fs.delete(p, true)); // make sure delete happened.
2012             }
2013             super.checkRegionDir(p);
2014           }
2015         };
2016       }
2017     };
2018     doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
2019   }
2020 
2021   /**
2022    * Test fixing lingering reference file.
2023    */
2024   @Test
2025   public void testLingeringReferenceFile() throws Exception {
2026     TableName table =
2027         TableName.valueOf("testLingeringReferenceFile");
2028     try {
2029       setupTable(table);
2030       assertEquals(ROWKEYS.length, countRows());
2031 
2032       // Mess it up by creating a fake reference file
2033       FileSystem fs = FileSystem.get(conf);
2034       Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
2035       Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
2036       Path famDir = new Path(regionDir, FAM_STR);
2037       Path fakeReferenceFile = new Path(famDir, "fbce357483ceea.12144538");
2038       fs.create(fakeReferenceFile);
2039 
2040       HBaseFsck hbck = doFsck(conf, false);
2041       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LINGERING_REFERENCE_HFILE });
2042       // fix reference file
2043       doFsck(conf, true);
2044       // check that reference file fixed
2045       assertNoErrors(doFsck(conf, false));
2046     } finally {
2047       deleteTable(table);
2048     }
2049   }
2050 
2051   /**
2052    * Test mission REGIONINFO_QUALIFIER in hbase:meta
2053    */
2054   @Test
2055   public void testMissingRegionInfoQualifier() throws Exception {
2056     TableName table =
2057         TableName.valueOf("testMissingRegionInfoQualifier");
2058     try {
2059       setupTable(table);
2060 
2061       // Mess it up by removing the RegionInfo for one region.
2062       final List<Delete> deletes = new LinkedList<Delete>();
2063       HTable meta = new HTable(conf, TableName.META_TABLE_NAME);
2064       MetaScanner.metaScan(conf, new MetaScanner.MetaScannerVisitor() {
2065 
2066         @Override
2067         public boolean processRow(Result rowResult) throws IOException {
2068           HRegionInfo hri = MetaScanner.getHRegionInfo(rowResult);
2069           if (hri != null && !hri.getTable().isSystemTable()) {
2070             Delete delete = new Delete(rowResult.getRow());
2071             delete.deleteColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
2072             deletes.add(delete);
2073           }
2074           return true;
2075         }
2076 
2077         @Override
2078         public void close() throws IOException {
2079         }
2080       });
2081       meta.delete(deletes);
2082 
2083       // Mess it up by creating a fake hbase:meta entry with no associated RegionInfo
2084       meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
2085         HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER, Bytes.toBytes("node1:60020")));
2086       meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
2087         HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER, Bytes.toBytes(1362150791183L)));
2088       meta.close();
2089 
2090       HBaseFsck hbck = doFsck(conf, false);
2091       assertTrue(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
2092 
2093       // fix reference file
2094       hbck = doFsck(conf, true);
2095 
2096       // check that reference file fixed
2097       assertFalse(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
2098     } finally {
2099       deleteTable(table);
2100     }
2101   }
2102 
2103 
2104   /**
2105    * Test pluggable error reporter. It can be plugged in
2106    * from system property or configuration.
2107    */
2108   @Test
2109   public void testErrorReporter() throws Exception {
2110     try {
2111       MockErrorReporter.calledCount = 0;
2112       doFsck(conf, false);
2113       assertEquals(MockErrorReporter.calledCount, 0);
2114 
2115       conf.set("hbasefsck.errorreporter", MockErrorReporter.class.getName());
2116       doFsck(conf, false);
2117       assertTrue(MockErrorReporter.calledCount > 20);
2118     } finally {
2119       conf.set("hbasefsck.errorreporter",
2120         PrintingErrorReporter.class.getName());
2121       MockErrorReporter.calledCount = 0;
2122     }
2123   }
2124 
2125   static class MockErrorReporter implements ErrorReporter {
2126     static int calledCount = 0;
2127 
2128     @Override
2129     public void clear() {
2130       calledCount++;
2131     }
2132 
2133     @Override
2134     public void report(String message) {
2135       calledCount++;
2136     }
2137 
2138     @Override
2139     public void reportError(String message) {
2140       calledCount++;
2141     }
2142 
2143     @Override
2144     public void reportError(ERROR_CODE errorCode, String message) {
2145       calledCount++;
2146     }
2147 
2148     @Override
2149     public void reportError(ERROR_CODE errorCode, String message, TableInfo table) {
2150       calledCount++;
2151     }
2152 
2153     @Override
2154     public void reportError(ERROR_CODE errorCode,
2155         String message, TableInfo table, HbckInfo info) {
2156       calledCount++;
2157     }
2158 
2159     @Override
2160     public void reportError(ERROR_CODE errorCode, String message,
2161         TableInfo table, HbckInfo info1, HbckInfo info2) {
2162       calledCount++;
2163     }
2164 
2165     @Override
2166     public int summarize() {
2167       return ++calledCount;
2168     }
2169 
2170     @Override
2171     public void detail(String details) {
2172       calledCount++;
2173     }
2174 
2175     @Override
2176     public ArrayList<ERROR_CODE> getErrorList() {
2177       calledCount++;
2178       return new ArrayList<ERROR_CODE>();
2179     }
2180 
2181     @Override
2182     public void progress() {
2183       calledCount++;
2184     }
2185 
2186     @Override
2187     public void print(String message) {
2188       calledCount++;
2189     }
2190 
2191     @Override
2192     public void resetErrors() {
2193       calledCount++;
2194     }
2195 
2196     @Override
2197     public boolean tableHasErrors(TableInfo table) {
2198       calledCount++;
2199       return false;
2200     }
2201   }
2202 
2203   @Test(timeout=180000)
2204   public void testCheckTableLocks() throws Exception {
2205     IncrementingEnvironmentEdge edge = new IncrementingEnvironmentEdge(0);
2206     EnvironmentEdgeManager.injectEdge(edge);
2207     // check no errors
2208     HBaseFsck hbck = doFsck(conf, false);
2209     assertNoErrors(hbck);
2210 
2211     ServerName mockName = ServerName.valueOf("localhost", 60000, 1);
2212 
2213     // obtain one lock
2214     final TableLockManager tableLockManager = TableLockManager.createTableLockManager(conf, TEST_UTIL.getZooKeeperWatcher(), mockName);
2215     TableLock writeLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2216         "testCheckTableLocks");
2217     writeLock.acquire();
2218     hbck = doFsck(conf, false);
2219     assertNoErrors(hbck); // should not have expired, no problems
2220 
2221     edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2222         TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2223 
2224     hbck = doFsck(conf, false);
2225     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK});
2226 
2227     final CountDownLatch latch = new CountDownLatch(1);
2228     new Thread() {
2229       @Override
2230       public void run() {
2231         TableLock readLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2232             "testCheckTableLocks");
2233         try {
2234           latch.countDown();
2235           readLock.acquire();
2236         } catch (IOException ex) {
2237           fail();
2238         } catch (IllegalStateException ex) {
2239           return; // expected, since this will be reaped under us.
2240         }
2241         fail("should not have come here");
2242       };
2243     }.start();
2244 
2245     latch.await(); // wait until thread starts
2246     Threads.sleep(300); // wait some more to ensure writeLock.acquire() is called
2247 
2248     hbck = doFsck(conf, false);
2249     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK}); // still one expired, one not-expired
2250 
2251     edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2252         TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2253 
2254     hbck = doFsck(conf, false);
2255     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK, ERROR_CODE.EXPIRED_TABLE_LOCK}); // both are expired
2256 
2257     conf.setLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT, 1); // reaping from ZKInterProcessWriteLock uses znode cTime,
2258                                                                  // which is not injectable through EnvironmentEdge
2259     Threads.sleep(10);
2260     hbck = doFsck(conf, true); // now fix both cases
2261 
2262     hbck = doFsck(conf, false);
2263     assertNoErrors(hbck);
2264 
2265     // ensure that locks are deleted
2266     writeLock = tableLockManager.writeLock(TableName.valueOf("foo"),
2267         "should acquire without blocking");
2268     writeLock.acquire(); // this should not block.
2269     writeLock.release(); // release for clean state
2270   }
2271 
2272   /**
2273    * Test orphaned table ZNode (for table states)
2274    */
2275   @Test
2276   public void testOrphanedTableZNode() throws Exception {
2277     TableName table = TableName.valueOf("testOrphanedZKTableEntry");
2278 
2279     try {
2280       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager().getZKTable().
2281       setEnablingTable(table);
2282 
2283       try {
2284         setupTable(table);
2285         Assert.fail(
2286           "Create table should fail when its ZNode has already existed with ENABLING state.");
2287       } catch(TableExistsException t) {
2288         //Expected exception
2289       }
2290       // The setup table was interrupted in some state that needs to some cleanup.
2291       try {
2292         deleteTable(table);
2293       } catch (IOException e) {
2294         // Because create table failed, it is expected that the cleanup table would
2295         // throw some exception.  Ignore and continue.
2296       }
2297 
2298       HBaseFsck hbck = doFsck(conf, false);
2299       assertTrue(hbck.getErrors().getErrorList().contains(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY));
2300 
2301       // fix the orphaned ZK entry
2302       hbck = doFsck(conf, true);
2303 
2304       // check that orpahned ZK table entry is gone.
2305       hbck = doFsck(conf, false);
2306       assertFalse(hbck.getErrors().getErrorList().contains(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY));
2307       // Now create table should succeed.
2308       setupTable(table);
2309     } finally {
2310       // This code could be called that either a table was created successfully or set up
2311       // table failed in some unknown state.  Therefore, clean up can either succeed or fail.
2312       try {
2313         deleteTable(table);
2314       } catch (IOException e) {
2315         // The cleanup table would throw some exception if create table failed in some state.
2316         // Ignore this exception
2317       }
2318     }
2319   }
2320 
2321   @Test
2322   public void testMetaOffline() throws Exception {
2323     // check no errors
2324     HBaseFsck hbck = doFsck(conf, false);
2325     assertNoErrors(hbck);
2326     deleteMetaRegion(conf, true, false, false);
2327     hbck = doFsck(conf, false);
2328     // ERROR_CODE.UNKNOWN is coming because we reportError with a message for the hbase:meta
2329     // inconsistency and whether we will be fixing it or not.
2330     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
2331     hbck = doFsck(conf, true);
2332     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
2333     hbck = doFsck(conf, false);
2334     assertNoErrors(hbck);
2335   }
2336 
2337   private void deleteMetaRegion(Configuration conf, boolean unassign, boolean hdfs,
2338       boolean regionInfoOnly) throws IOException, InterruptedException {
2339     HConnection connection = HConnectionManager.getConnection(conf);
2340     HRegionLocation metaLocation = connection.locateRegion(TableName.META_TABLE_NAME,
2341         HConstants.EMPTY_START_ROW);
2342     ServerName hsa = metaLocation.getServerName();
2343     HRegionInfo hri = metaLocation.getRegionInfo();
2344     if (unassign) {
2345       LOG.info("Undeploying meta region " + hri + " from server " + hsa);
2346       undeployRegion(new HBaseAdmin(conf), hsa, hri);
2347     }
2348 
2349     if (regionInfoOnly) {
2350       LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
2351       Path rootDir = FSUtils.getRootDir(conf);
2352       FileSystem fs = rootDir.getFileSystem(conf);
2353       Path p = new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(),
2354           hri.getEncodedName());
2355       Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
2356       fs.delete(hriPath, true);
2357     }
2358 
2359     if (hdfs) {
2360       LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
2361       Path rootDir = FSUtils.getRootDir(conf);
2362       FileSystem fs = rootDir.getFileSystem(conf);
2363       Path p = new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(),
2364           hri.getEncodedName());
2365       HBaseFsck.debugLsr(conf, p);
2366       boolean success = fs.delete(p, true);
2367       LOG.info("Deleted " + p + " sucessfully? " + success);
2368       HBaseFsck.debugLsr(conf, p);
2369     }
2370   }
2371 
2372   @Test
       // Creates a table, removes the data of the region spanning the whole key range, and
       // checks that hbck reports NOT_IN_HDFS and is clean again after the runs started with
       // 'true'.
2373   public void testTableWithNoRegions() throws Exception {
2374     // We might end up with empty regions in a table
2375     // see also testNoHdfsTable()
         // The table is named after the running test method.
2376     TableName table =
2377         TableName.valueOf(name.getMethodName());
2378     try {
2379       // create the table without split keys
2380       HTableDescriptor desc = new HTableDescriptor(table);
2381       HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
2382       desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
2383       TEST_UTIL.getHBaseAdmin().createTable(desc);
2384       tbl = new HTable(TEST_UTIL.getConfiguration(), table, executorService);
2385 
2386       // Mess it up: the three trailing flags are (false, false, true). Presumably these are
           // (unassign, meta row, hdfs), i.e. only the HDFS data is removed - verify against
           // deleteRegion. That reading matches the single NOT_IN_HDFS error asserted below.
2387       deleteRegion(conf, tbl.getTableDescriptor(), HConstants.EMPTY_START_ROW, HConstants.EMPTY_END_ROW, false,
2388           false, true);
2389 
2390       HBaseFsck hbck = doFsck(conf, false);
2391       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_HDFS });
2392 
           // NOTE(review): doFsck(conf, true) is invoked twice in a row; confirm whether the
           // repair really needs two passes or the first call is a leftover.
2393       doFsck(conf, true);
2394 
2395       // fix hole
2396       doFsck(conf, true);
2397 
2398       // check that hole fixed
2399       assertNoErrors(doFsck(conf, false));
2400     } finally {
           // Always drop the table, whether or not the assertions passed.
2401       deleteTable(table);
2402     }
2403 
2404   }
2405 
2406   @Test
       // Merges two regions of a populated table and checks that no rows are lost and that
       // hbck reports no errors afterwards.
2407   public void testHbckAfterRegionMerge() throws Exception {
2408     TableName table = TableName.valueOf("testMergeRegionFilesInHdfs");
2410     try {
2411       // disable CatalogJanitor
2412       TEST_UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(false);
2413       setupTable(table);
2414       assertEquals(ROWKEYS.length, countRows());
2415 
2416       // make sure data in regions, if in hlog only there is no data loss
2417       TEST_UTIL.getHBaseAdmin().flush(table.getName());
2418       HRegionInfo region1 = tbl.getRegionLocation("A").getRegionInfo();
2419       HRegionInfo region2 = tbl.getRegionLocation("B").getRegionInfo();
2420 
2421       int regionCountBeforeMerge = tbl.getRegionLocations().size();
2422 
           // Rows "A" and "B" must live in different regions, otherwise there is nothing to merge.
2423       assertNotEquals(region1, region2);
2424 
2425       // do a region merge
2426       HBaseAdmin admin = TEST_UTIL.getHBaseAdmin();
2427       admin.mergeRegions(region1.getEncodedNameAsBytes(),
2428           region2.getEncodedNameAsBytes(), false);
2429 
2430       // wait until region merged: poll until the table has fewer region locations than
           // before the merge, and fail the test once 30 seconds have passed
2431       long timeout = System.currentTimeMillis() + 30 * 1000;
2432       while (true) {
2433         if (tbl.getRegionLocations().size() < regionCountBeforeMerge) {
2434           break;
2435         } else if (System.currentTimeMillis() > timeout) {
2436           fail("Time out waiting on region " + region1.getEncodedName()
2437               + " and " + region2.getEncodedName() + " be merged");
2438         }
2439         Thread.sleep(10);
2440       }
2441 
           // The merge must not lose any row.
2442       assertEquals(ROWKEYS.length, countRows());
2443 
2444       HBaseFsck hbck = doFsck(conf, false);
2445       assertNoErrors(hbck); // no errors
2446 
2447     } finally {
           // Re-enable the janitor that was switched off at the start, then drop the table.
2448       TEST_UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(true);
2449       deleteTable(table);
2451     }
2452   }
2453 
2454   @Test
       // Runs hbck's region boundary check on a clean cluster and fails only if it trips over
       // an invalid DFS file name.
2455   public void testRegionBoundariesCheck() throws Exception {
2456     HBaseFsck hbck = doFsck(conf, false);
2457     assertNoErrors(hbck); // no errors
2458     try {
2459       hbck.checkRegionBoundaries();
2460     } catch (IllegalArgumentException e) {
           // Only the "not a valid DFS filename." failure is turned into a test failure; any
           // other IllegalArgumentException is swallowed here.
           // NOTE(review): e.getMessage() may be null, in which case this check itself throws
           // a NullPointerException - confirm whether that can happen.
2461       if (e.getMessage().endsWith("not a valid DFS filename.")) {
2462         fail("Table directory path is not valid." + e.getMessage());
2463       }
2464     }
2465   }
2466 
2467   @org.junit.Rule
       // JUnit rule holding the running test's method name; testTableWithNoRegions reads it
       // through name.getMethodName() to name its table.
2468   public TestName name = new TestName();
2469 
2470   @Test
       // Checks the value of shouldIgnorePreCheckPermission() on the hbck instances returned
       // by doFsck for both values of its second argument, and after an explicit override.
2471   public void testReadOnlyProperty() throws Exception {
         // doFsck(conf, false): the returned hbck is expected to ignore the permission pre-check.
2472     HBaseFsck hbck = doFsck(conf, false);
2473     Assert.assertEquals("shouldIgnorePreCheckPermission", true,
2474       hbck.shouldIgnorePreCheckPermission());
2475 
         // doFsck(conf, true): the returned hbck is expected not to ignore it.
2476     hbck = doFsck(conf, true);
2477     Assert.assertEquals("shouldIgnorePreCheckPermission", false,
2478       hbck.shouldIgnorePreCheckPermission());
2479 
         // Setting the flag explicitly on such an instance must be reflected by the getter.
2480     hbck = doFsck(conf, true);
2481     hbck.setIgnorePreCheckPermission(true);
2482     Assert.assertEquals("shouldIgnorePreCheckPermission", true,
2483       hbck.shouldIgnorePreCheckPermission());
2484   }
2485 
2486   @Before
       // Runs before each test method and calls EnvironmentEdgeManager.reset().
       // Presumably this undoes a clock injected by an earlier test - TODO confirm.
2487   public void setUp() {
2488     EnvironmentEdgeManager.reset();
2489   }
2490 
2491   @Test (timeout=180000)
       // Starts a region split, abandons it after SplitTransaction.stepsBeforePONR, brings the
       // parent region back online, and checks that hbck reports and then cleans up what the
       // abandoned split left behind without losing any of the five rows written.
2492   public void testCleanUpDaughtersNotInMetaAfterFailedSplit() throws Exception {
2493     TableName table = TableName.valueOf("testCleanUpDaughtersNotInMetaAfterFailedSplit");
2494     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
2495     try {
           // Create a table with family "f" and write rows r0..r4.
2496       HTableDescriptor desc = new HTableDescriptor(table);
2497       desc.addFamily(new HColumnDescriptor(Bytes.toBytes("f")));
2498       TEST_UTIL.getHBaseAdmin().createTable(desc);
2499       tbl = new HTable(cluster.getConfiguration(), desc.getTableName());
2500       for (int i = 0; i < 5; i++) {
             // Encode through Bytes.toBytes, like the family name above, instead of
             // String.getBytes(), which depends on the platform default charset.
2501         Put p1 = new Put(Bytes.toBytes("r" + i));
2502         p1.add(Bytes.toBytes("f"), Bytes.toBytes("q1"), Bytes.toBytes("v"));
2503         tbl.put(p1);
2504       }
2505       TEST_UTIL.getHBaseAdmin().flush(desc.getTableName().toString());
           // Find the region server hosting the table's first region.
2506       List<HRegion> regions = cluster.getRegions(desc.getTableName());
2507       int serverWith = cluster.getServerWith(regions.get(0).getRegionName());
2508       HRegionServer regionServer = cluster.getRegionServer(serverWith);
           // Prepare a split at "r3" and run only the steps named "before PONR".
2510       SplitTransaction st = new SplitTransaction(regions.get(0), Bytes.toBytes("r3"));
2511       st.prepare();
2512       st.stepsBeforePONR(regionServer, regionServer, false);
           // Abandon the split: mark every region in transition offline, delete the parent's
           // ZK assignment node, then assign the parent back to the same server and wait.
2513       AssignmentManager am = cluster.getMaster().getAssignmentManager();
2514       Map<String, RegionState> regionsInTransition = am.getRegionStates().getRegionsInTransition();
2515       for (RegionState state : regionsInTransition.values()) {
2516         am.regionOffline(state.getRegion());
2517       }
2518       ZKAssign.deleteNodeFailSilent(regionServer.getZooKeeper(), regions.get(0).getRegionInfo());
2519       Map<HRegionInfo, ServerName> regionsMap = new HashMap<HRegionInfo, ServerName>();
2520       regionsMap.put(regions.get(0).getRegionInfo(), regionServer.getServerName());
2521       am.assign(regionsMap);
2522       am.waitForAssignment(regions.get(0).getRegionInfo());
           // Two NOT_IN_META_OR_DEPLOYED errors are expected - presumably one per daughter
           // region left behind by the abandoned split (TODO confirm).
2523       HBaseFsck hbck = doFsck(conf, false);
2524       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
2525           ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
2526       // the leftovers must not be reported as overlap groups
2527       assertEquals(0, hbck.getOverlapGroups(table).size());
2528 
2529       // run hbck with only the third doFsck argument set to true (presumably a single fix
           // option - verify against HbckTestingUtil.doFsck); it reports the same two errors
2530       assertErrors(
2531         doFsck(
2532           conf, false, true, false, false, false, false, false, false, false, false, false, null),
2533         new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
2534           ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
2535 
2536       // check that the cluster is clean again and all five rows are still readable
2537       assertNoErrors(doFsck(conf, false));
2538       assertEquals(5, countRows());
2539     } finally {
           // Close the table before dropping it; tbl stays null if creation failed early.
2540       if (tbl != null) {
2541         tbl.close();
2542         tbl = null;
2543       }
2544       deleteTable(table);
2545     }
2546   }
2547 }