/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */
package org.apache.hadoop.hbase.util;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.hbase.Abortable;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.ClusterStatus;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.catalog.MetaEditor;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HConnectable;
import org.apache.hadoop.hbase.client.HConnection;
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.MetaScanner;
import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitor;
import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitorBase;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.RowMutations;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.master.MasterFileSystem;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService.BlockingInterface;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
import org.apache.hadoop.hbase.regionserver.StoreFileInfo;
import org.apache.hadoop.hbase.regionserver.wal.HLogUtil;
import org.apache.hadoop.hbase.security.UserProvider;
import org.apache.hadoop.hbase.util.Bytes.ByteArrayComparator;
import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
import org.apache.hadoop.hbase.util.hbck.TableIntegrityErrorHandler;
import org.apache.hadoop.hbase.util.hbck.TableIntegrityErrorHandlerImpl;
import org.apache.hadoop.hbase.util.hbck.TableLockChecker;
import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
import org.apache.hadoop.hbase.zookeeper.ZKTableReadOnly;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.zookeeper.KeeperException;

import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimap;
import com.google.common.collect.TreeMultimap;
import com.google.protobuf.ServiceException;

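/*
 * Illustrative usage sketch (an assumption about typical callers, not a
 * requirement of this class): a minimal programmatic run against a reachable
 * cluster using the default configuration.  Command-line invocations normally
 * go through a Tool/ToolRunner entry point instead (see the imports above).
 *
 *   HBaseFsck fsck = new HBaseFsck(HBaseConfiguration.create());
 *   fsck.connect();                           // admin, meta table, cluster status handles
 *   int inconsistencies = fsck.onlineHbck();  // check, and repair if fix options are set
 *   System.exit(inconsistencies == 0 ? 0 : 1);
 */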
/**
 * HBaseFsck (hbck) is a tool for checking and repairing region consistency and
 * table integrity problems in a corrupted HBase installation.
 * <p>
 * Region consistency checks verify that hbase:meta, region deployment on the
 * region servers, and the state of data in HDFS (.regioninfo files) all agree.
 * Table integrity checks verify that all possible row keys resolve to exactly
 * one region of a table: no degenerate or backwards regions, no holes between
 * regions, and no overlapping regions.
 * <p>
 * The general repair strategy works in two phases: first repair table
 * integrity on HDFS (merging or fabricating regions as needed), then repair
 * region consistency with hbase:meta and the deployed region assignments.
 */
167 @InterfaceAudience.Public
168 @InterfaceStability.Evolving
169 public class HBaseFsck extends Configured {
170 public static final long DEFAULT_TIME_LAG = 60000;
171 public static final long DEFAULT_SLEEP_BEFORE_RERUN = 10000;
172 private static final int MAX_NUM_THREADS = 50;
173 private static boolean rsSupportsOffline = true;
174 private static final int DEFAULT_OVERLAPS_TO_SIDELINE = 2;
175 private static final int DEFAULT_MAX_MERGE = 5;
176 private static final String TO_BE_LOADED = "to_be_loaded";
177
178
179
180
181 private static final Log LOG = LogFactory.getLog(HBaseFsck.class.getName());
182 private ClusterStatus status;
183 private HConnection connection;
184 private HBaseAdmin admin;
185 private HTable meta;
186
187 protected ExecutorService executor;
188 private long startMillis = System.currentTimeMillis();
189 private HFileCorruptionChecker hfcc;
190 private int retcode = 0;

  /***********
   * Options
   ***********/
195 private static boolean details = false;
196 private long timelag = DEFAULT_TIME_LAG;
197 private boolean fixAssignments = false;
198 private boolean fixMeta = false;
199 private boolean checkHdfs = true;
200 private boolean fixHdfsHoles = false;
201 private boolean fixHdfsOverlaps = false;
202 private boolean fixHdfsOrphans = false;
203 private boolean fixTableOrphans = false;
204 private boolean fixVersionFile = false;
205 private boolean fixSplitParents = false;
206 private boolean fixReferenceFiles = false;
207 private boolean fixEmptyMetaCells = false;
208 private boolean fixTableLocks = false;
209
210
211
212 private Set<TableName> tablesIncluded = new HashSet<TableName>();
213 private int maxMerge = DEFAULT_MAX_MERGE;
214 private int maxOverlapsToSideline = DEFAULT_OVERLAPS_TO_SIDELINE;
215 private boolean sidelineBigOverlaps = false;
216 private Path sidelineDir = null;
217
218 private boolean rerun = false;
219 private static boolean summary = false;
220 private boolean checkMetaOnly = false;
221 private boolean checkRegionBoundaries = false;
222 private boolean ignorePreCheckPermission = false;
223
224
225
226
227 final private ErrorReporter errors;
228 int fixes = 0;

  /**
   * This map contains the state of all hbck items.  It maps from encoded
   * region name to HbckInfo structure.  The information contained in HbckInfo
   * is used to detect and correct consistency (hdfs/meta/deployment) problems.
   */
235 private TreeMap<String, HbckInfo> regionInfoMap = new TreeMap<String, HbckInfo>();
236 private TreeSet<TableName> disabledTables =
237 new TreeSet<TableName>();
238
239 private Set<Result> emptyRegionInfoQualifiers = new HashSet<Result>();

  /**
   * This map from table name to TableInfo contains the structures necessary to
   * detect table consistency problems (holes, dupes, overlaps).  It is sorted
   * to prevent dupes.
   *
   * If tablesIncluded is empty, this map contains all tables.
   * Otherwise, it contains only meta tables and tables in tablesIncluded,
   * unless checkMetaOnly is specified.
   */
251 private SortedMap<TableName, TableInfo> tablesInfo =
252 new ConcurrentSkipListMap<TableName, TableInfo>();
253
254
255
256
257 private List<HbckInfo> orphanHdfsDirs = Collections.synchronizedList(new ArrayList<HbckInfo>());
258
259 private Map<TableName, Set<String>> orphanTableDirs =
260 new HashMap<TableName, Set<String>>();

  /**
   * Constructor
   *
   * @param conf Configuration object
   * @throws MasterNotRunningException if the master is not running
   * @throws ZooKeeperConnectionException if unable to connect to ZooKeeper
   */
269 public HBaseFsck(Configuration conf) throws MasterNotRunningException,
270 ZooKeeperConnectionException, IOException, ClassNotFoundException {
271 super(conf);
272
273 setConf(HBaseConfiguration.create(getConf()));
274
275 getConf().setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0);
276 errors = getErrorReporter(conf);
277
278 int numThreads = conf.getInt("hbasefsck.numthreads", MAX_NUM_THREADS);
279 executor = new ScheduledThreadPoolExecutor(numThreads, Threads.newDaemonThreadFactory("hbasefsck"));
280 }

  /**
   * Constructor
   *
   * @param conf Configuration object
   * @param exec executor service used for the parallel scan work items
   * @throws MasterNotRunningException if the master is not running
   * @throws ZooKeeperConnectionException if unable to connect to ZooKeeper
   */
292 public HBaseFsck(Configuration conf, ExecutorService exec) throws MasterNotRunningException,
293 ZooKeeperConnectionException, IOException, ClassNotFoundException {
294 super(conf);
295 errors = getErrorReporter(getConf());
296 this.executor = exec;
297 }

  /**
   * To repair region consistency, one must call connect() in order to repair
   * online state.
   */
303 public void connect() throws IOException {
304 admin = new HBaseAdmin(getConf());
305 meta = new HTable(getConf(), TableName.META_TABLE_NAME);
306 status = admin.getClusterStatus();
307 connection = admin.getConnection();
308 }

  /**
   * Get deployed regions according to the region servers.
   */
313 private void loadDeployedRegions() throws IOException, InterruptedException {
314
315 Collection<ServerName> regionServers = status.getServers();
316 errors.print("Number of live region servers: " + regionServers.size());
317 if (details) {
318 for (ServerName rsinfo: regionServers) {
319 errors.print(" " + rsinfo.getServerName());
320 }
321 }
322
323
324 Collection<ServerName> deadRegionServers = status.getDeadServerNames();
325 errors.print("Number of dead region servers: " + deadRegionServers.size());
326 if (details) {
327 for (ServerName name: deadRegionServers) {
328 errors.print(" " + name);
329 }
330 }
331
332
333 errors.print("Master: " + status.getMaster());
334
335
336 Collection<ServerName> backupMasters = status.getBackupMasters();
337 errors.print("Number of backup masters: " + backupMasters.size());
338 if (details) {
339 for (ServerName name: backupMasters) {
340 errors.print(" " + name);
341 }
342 }
343
344 errors.print("Average load: " + status.getAverageLoad());
345 errors.print("Number of requests: " + status.getRequestsCount());
346 errors.print("Number of regions: " + status.getRegionsCount());
347
348 Map<String, RegionState> rits = status.getRegionsInTransition();
349 errors.print("Number of regions in transition: " + rits.size());
350 if (details) {
351 for (RegionState state: rits.values()) {
352 errors.print(" " + state.toDescriptiveString());
353 }
354 }
355
356
357 processRegionServers(regionServers);
358 }

  /**
   * Clear the current state of hbck so that a fresh pass can be made.
   */
363 private void clearState() {
364
365 fixes = 0;
366 regionInfoMap.clear();
367 emptyRegionInfoQualifiers.clear();
368 disabledTables.clear();
369 errors.clear();
370 tablesInfo.clear();
371 orphanHdfsDirs.clear();
372 }

  /**
   * This repair method analyzes hbase data in hdfs and repairs it to satisfy
   * the table integrity rules.  HBase doesn't need to be online for this
   * operation to work.
   */
379 public void offlineHdfsIntegrityRepair() throws IOException, InterruptedException {
380
381 if (shouldCheckHdfs() && (shouldFixHdfsOrphans() || shouldFixHdfsHoles()
382 || shouldFixHdfsOverlaps() || shouldFixTableOrphans())) {
      LOG.info("Loading regioninfos from HDFS");
384
385 int maxIterations = getConf().getInt("hbase.hbck.integrityrepair.iterations.max", 3);
386 int curIter = 0;
387 do {
388 clearState();
389
390 restoreHdfsIntegrity();
391 curIter++;
392 } while (fixes > 0 && curIter <= maxIterations);
393
394
395
396 if (curIter > 2) {
397 if (curIter == maxIterations) {
          LOG.warn("Exiting integrity repairs after max " + curIter + " iterations. "
              + "Table integrity may not be fully repaired!");
400 } else {
401 LOG.info("Successfully exiting integrity repairs after " + curIter + " iterations");
402 }
403 }
404 }
405 }

  /**
   * This repair method requires the cluster to be online since it contacts
   * region servers and the masters.  It makes each region's state in HDFS, in
   * hbase:meta, and its deployment consistent.
   *
   * @return the number of inconsistencies found, or a negative value if a
   *         fatal hbase:meta problem prevented the checks from running
   */
415 public int onlineConsistencyRepair() throws IOException, KeeperException,
416 InterruptedException {
417 clearState();
418
419
420 loadDeployedRegions();
421
422 recordMetaRegion();
423
424 if (!checkMetaRegion()) {
425 String errorMsg = "hbase:meta table is not consistent. ";
426 if (shouldFixAssignments()) {
427 errorMsg += "HBCK will try fixing it. Rerun once hbase:meta is back to consistent state.";
428 } else {
429 errorMsg += "Run HBCK with proper fix options to fix hbase:meta inconsistency.";
430 }
431 errors.reportError(errorMsg + " Exiting...");
432 return -2;
433 }
434
    LOG.info("Loading regioninfo entries from the hbase:meta table");
436 boolean success = loadMetaEntries();
437 if (!success) return -1;
438
439
440 reportEmptyMetaCells();
441
442
443 if (shouldFixEmptyMetaCells()) {
444 fixEmptyMetaCells();
445 }
446
447
448 if (!checkMetaOnly) {
449 reportTablesInFlux();
450 }
451
452
453 if (shouldCheckHdfs()) {
454 loadHdfsRegionDirs();
455 loadHdfsRegionInfos();
456 }
457
458
459 loadDisabledTables();
460
461
462 fixOrphanTables();
463
464
465 checkAndFixConsistency();
466
467
468 checkIntegrity();
469 return errors.getErrorList().size();
470 }

  /**
   * Contacts the master and prints out cluster-wide information, then runs the
   * offline HDFS integrity repair phase followed by the online consistency
   * repair phase.
   *
   * @return 0 on success, non-zero on failure
   */
476 public int onlineHbck() throws IOException, KeeperException, InterruptedException, ServiceException {
477
478 errors.print("Version: " + status.getHBaseVersion());
479 offlineHdfsIntegrityRepair();
480
481
482 boolean oldBalancer = admin.setBalancerRunning(false, true);
483 try {
484 onlineConsistencyRepair();
485 }
486 finally {
487 admin.setBalancerRunning(oldBalancer, false);
488 }
489
490 if (checkRegionBoundaries) {
491 checkRegionBoundaries();
492 }
493
494 offlineReferenceFileRepair();
495
496 checkAndFixTableLocks();
497
498
499 printTableSummary(tablesInfo);
500 return errors.summarize();
501 }
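  // Typical command-line runs (illustrative only; the flag names are assumptions
  // based on the fix options declared above, and the option parsing itself lives
  // elsewhere in this class):
  //   $ bin/hbase hbck                              # report-only check
  //   $ bin/hbase hbck -details                     # include per-region detail
  //   $ bin/hbase hbck -fixAssignments -fixMeta     # attempt online repairs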

  /**
   * Extract the row key from a serialized KeyValue key: the first two bytes
   * are the row length (a short), followed by the row bytes themselves.
   */
503 public static byte[] keyOnly (byte[] b) {
504 if (b == null)
505 return b;
506 int rowlength = Bytes.toShort(b, 0);
507 byte[] result = new byte[rowlength];
508 System.arraycopy(b, Bytes.SIZEOF_SHORT, result, 0, rowlength);
509 return result;
510 }
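
  // Worked example for keyOnly() (hypothetical bytes, for illustration): a
  // serialized KeyValue key of the form
  //   [0x00, 0x03, 'r', 'o', 'w', <family/qualifier/timestamp/type bytes>...]
  // carries a 2-byte row length of 3, so keyOnly() returns just {'r', 'o', 'w'}.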
511
512 private static class RegionBoundariesInformation {
513 public byte [] regionName;
514 public byte [] metaFirstKey;
515 public byte [] metaLastKey;
516 public byte [] storesFirstKey;
517 public byte [] storesLastKey;
518 public String toString () {
519 return "regionName=" + Bytes.toStringBinary(regionName) +
520 "\nmetaFirstKey=" + Bytes.toStringBinary(metaFirstKey) +
521 "\nmetaLastKey=" + Bytes.toStringBinary(metaLastKey) +
522 "\nstoresFirstKey=" + Bytes.toStringBinary(storesFirstKey) +
523 "\nstoresLastKey=" + Bytes.toStringBinary(storesLastKey);
524 }
525 }
526
527 public void checkRegionBoundaries() {
528 try {
529 ByteArrayComparator comparator = new ByteArrayComparator();
530 List<HRegionInfo> regions = MetaScanner.listAllRegions(getConf(), false);
531 final RegionBoundariesInformation currentRegionBoundariesInformation =
532 new RegionBoundariesInformation();
533 Path hbaseRoot = FSUtils.getRootDir(getConf());
534 for (HRegionInfo regionInfo : regions) {
535 Path tableDir = FSUtils.getTableDir(hbaseRoot, regionInfo.getTable());
536 currentRegionBoundariesInformation.regionName = regionInfo.getRegionName();
537
538
539 Path path = new Path(tableDir, regionInfo.getEncodedName());
540 FileSystem fs = path.getFileSystem(getConf());
541 FileStatus[] files = fs.listStatus(path);
542
543 byte[] storeFirstKey = null;
544 byte[] storeLastKey = null;
545 for (FileStatus file : files) {
546 String fileName = file.getPath().toString();
547 fileName = fileName.substring(fileName.lastIndexOf("/") + 1);
548 if (!fileName.startsWith(".") && !fileName.endsWith("recovered.edits")) {
549 FileStatus[] storeFiles = fs.listStatus(file.getPath());
550
551 for (FileStatus storeFile : storeFiles) {
552 HFile.Reader reader = HFile.createReader(fs, storeFile.getPath(), new CacheConfig(
553 getConf()));
554 if ((reader.getFirstKey() != null)
555 && ((storeFirstKey == null) || (comparator.compare(storeFirstKey,
556 reader.getFirstKey()) > 0))) {
557 storeFirstKey = reader.getFirstKey();
558 }
559 if ((reader.getLastKey() != null)
560 && ((storeLastKey == null) || (comparator.compare(storeLastKey,
561 reader.getLastKey())) < 0)) {
562 storeLastKey = reader.getLastKey();
563 }
564 reader.close();
565 }
566 }
567 }
568 currentRegionBoundariesInformation.metaFirstKey = regionInfo.getStartKey();
569 currentRegionBoundariesInformation.metaLastKey = regionInfo.getEndKey();
570 currentRegionBoundariesInformation.storesFirstKey = keyOnly(storeFirstKey);
571 currentRegionBoundariesInformation.storesLastKey = keyOnly(storeLastKey);
572 if (currentRegionBoundariesInformation.metaFirstKey.length == 0)
573 currentRegionBoundariesInformation.metaFirstKey = null;
574 if (currentRegionBoundariesInformation.metaLastKey.length == 0)
575 currentRegionBoundariesInformation.metaLastKey = null;
576
577
578
579
580
581
582 boolean valid = true;
583
584 if ((currentRegionBoundariesInformation.storesFirstKey != null)
585 && (currentRegionBoundariesInformation.metaFirstKey != null)) {
586 valid = valid
587 && comparator.compare(currentRegionBoundariesInformation.storesFirstKey,
588 currentRegionBoundariesInformation.metaFirstKey) >= 0;
589 }
590
591 if ((currentRegionBoundariesInformation.storesLastKey != null)
592 && (currentRegionBoundariesInformation.metaLastKey != null)) {
593 valid = valid
594 && comparator.compare(currentRegionBoundariesInformation.storesLastKey,
595 currentRegionBoundariesInformation.metaLastKey) < 0;
596 }
597 if (!valid) {
598 errors.reportError(ERROR_CODE.BOUNDARIES_ERROR, "Found issues with regions boundaries",
599 tablesInfo.get(regionInfo.getTable()));
          LOG.warn("Region's boundaries not aligned between stores and META for:");
601 LOG.warn(currentRegionBoundariesInformation);
602 }
603 }
604 } catch (IOException e) {
605 LOG.error(e);
606 }
607 }

  /**
   * Iterates through the list of all orphan/invalid regiondirs.
   */
612 private void adoptHdfsOrphans(Collection<HbckInfo> orphanHdfsDirs) throws IOException {
613 for (HbckInfo hi : orphanHdfsDirs) {
614 LOG.info("Attempting to handle orphan hdfs dir: " + hi.getHdfsRegionDir());
615 adoptHdfsOrphan(hi);
616 }
617 }

  /**
   * Orphaned regions are regions without a .regioninfo file in them.  We "adopt"
   * these orphans by creating a new region, and moving the column families,
   * recovered edits, and HLogs into the new region dir.  We determine the region
   * start and end keys by looking at all of the hfiles inside the column
   * families to identify the min and max keys.  The resulting region will
   * likely violate table integrity but will be dealt with by merging
   * overlapping regions.
   */
628 @SuppressWarnings("deprecation")
629 private void adoptHdfsOrphan(HbckInfo hi) throws IOException {
630 Path p = hi.getHdfsRegionDir();
631 FileSystem fs = p.getFileSystem(getConf());
632 FileStatus[] dirs = fs.listStatus(p);
633 if (dirs == null) {
      LOG.warn("Attempt to adopt orphan hdfs region skipped because no files present in " +
          p + ". This dir could probably be deleted.");
      return;
637 }
638
639 TableName tableName = hi.getTableName();
640 TableInfo tableInfo = tablesInfo.get(tableName);
641 Preconditions.checkNotNull(tableInfo, "Table '" + tableName + "' not present!");
642 HTableDescriptor template = tableInfo.getHTD();
643
644
645 Pair<byte[],byte[]> orphanRegionRange = null;
646 for (FileStatus cf : dirs) {
647 String cfName= cf.getPath().getName();
648
649 if (cfName.startsWith(".") || cfName.equals(HConstants.SPLIT_LOGDIR_NAME)) continue;
650
651 FileStatus[] hfiles = fs.listStatus(cf.getPath());
652 for (FileStatus hfile : hfiles) {
653 byte[] start, end;
654 HFile.Reader hf = null;
655 try {
656 CacheConfig cacheConf = new CacheConfig(getConf());
657 hf = HFile.createReader(fs, hfile.getPath(), cacheConf);
658 hf.loadFileInfo();
659 KeyValue startKv = KeyValue.createKeyValueFromKey(hf.getFirstKey());
660 start = startKv.getRow();
661 KeyValue endKv = KeyValue.createKeyValueFromKey(hf.getLastKey());
662 end = endKv.getRow();
663 } catch (IOException ioe) {
664 LOG.warn("Problem reading orphan file " + hfile + ", skipping");
665 continue;
666 } catch (NullPointerException ioe) {
667 LOG.warn("Orphan file " + hfile + " is possibly corrupted HFile, skipping");
668 continue;
669 } finally {
670 if (hf != null) {
671 hf.close();
672 }
673 }
674
675
676 if (orphanRegionRange == null) {
677
678 orphanRegionRange = new Pair<byte[], byte[]>(start, end);
679 } else {
680
681
682
683 if (Bytes.compareTo(orphanRegionRange.getFirst(), start) > 0) {
684 orphanRegionRange.setFirst(start);
685 }
686 if (Bytes.compareTo(orphanRegionRange.getSecond(), end) < 0 ) {
687 orphanRegionRange.setSecond(end);
688 }
689 }
690 }
691 }
692 if (orphanRegionRange == null) {
693 LOG.warn("No data in dir " + p + ", sidelining data");
694 fixes++;
695 sidelineRegionDir(fs, hi);
696 return;
697 }
698 LOG.info("Min max keys are : [" + Bytes.toString(orphanRegionRange.getFirst()) + ", " +
699 Bytes.toString(orphanRegionRange.getSecond()) + ")");
700
701
702 HRegionInfo hri = new HRegionInfo(template.getTableName(), orphanRegionRange.getFirst(), orphanRegionRange.getSecond());
703 LOG.info("Creating new region : " + hri);
704 HRegion region = HBaseFsckRepair.createHDFSRegionDir(getConf(), hri, template);
705 Path target = region.getRegionFileSystem().getRegionDir();
706
707
708 mergeRegionDirs(target, hi);
709 fixes++;
710 }

  /**
   * This method determines if there are table integrity errors in HDFS.  If
   * there are errors and the appropriate "fix" options are enabled, the method
   * will first correct orphan regions, making them into legitimate regiondirs,
   * and then reload to merge potentially overlapping regions.
   *
   * @return number of table integrity errors found
   */
720 private int restoreHdfsIntegrity() throws IOException, InterruptedException {
721
722 LOG.info("Loading HBase regioninfo from HDFS...");
723 loadHdfsRegionDirs();
724
725 int errs = errors.getErrorList().size();
726
727 tablesInfo = loadHdfsRegionInfos();
728 checkHdfsIntegrity(false, false);
729
730 if (errors.getErrorList().size() == errs) {
731 LOG.info("No integrity errors. We are done with this phase. Glorious.");
732 return 0;
733 }
734
735 if (shouldFixHdfsOrphans() && orphanHdfsDirs.size() > 0) {
736 adoptHdfsOrphans(orphanHdfsDirs);
737
738 }
739
740
741 if (shouldFixHdfsHoles()) {
742 clearState();
743 loadHdfsRegionDirs();
744 tablesInfo = loadHdfsRegionInfos();
745 tablesInfo = checkHdfsIntegrity(shouldFixHdfsHoles(), false);
746 }
747
748
749 if (shouldFixHdfsOverlaps()) {
750
751 clearState();
752 loadHdfsRegionDirs();
753 tablesInfo = loadHdfsRegionInfos();
754 tablesInfo = checkHdfsIntegrity(false, shouldFixHdfsOverlaps());
755 }
756
757 return errors.getErrorList().size();
758 }

  /**
   * Scan all the store file names to find any lingering reference files, i.e.
   * references whose referred-to file no longer exists.  If the "fix" option
   * is enabled, any lingering reference file found is sidelined.
   * <p>
   * A lingering reference file prevents a region from opening; it has to be
   * fixed before a cluster can start properly.
   */
768 private void offlineReferenceFileRepair() throws IOException {
769 Configuration conf = getConf();
770 Path hbaseRoot = FSUtils.getRootDir(conf);
771 FileSystem fs = hbaseRoot.getFileSystem(conf);
772 Map<String, Path> allFiles = FSUtils.getTableStoreFilePathMap(fs, hbaseRoot);
773 for (Path path: allFiles.values()) {
774 boolean isReference = false;
775 try {
776 isReference = StoreFileInfo.isReference(path);
777 } catch (Throwable t) {
778
779
780
781
782 }
783 if (!isReference) continue;
784
785 Path referredToFile = StoreFileInfo.getReferredToFile(path);
786 if (fs.exists(referredToFile)) continue;
787
788
789 errors.reportError(ERROR_CODE.LINGERING_REFERENCE_HFILE,
790 "Found lingering reference file " + path);
791 if (!shouldFixReferenceFiles()) continue;
792
793
794 boolean success = false;
795 String pathStr = path.toString();

      // A reference file path should look like
      //   ${hbase.rootdir}/data/<namespace>/<table>/<region>/<family>/<referred_file>.<referring_region>
      // Walk up five path separators so the file is sidelined under a similar
      // data/namespace/table/region/family layout in the sideline dir.
801 int index = pathStr.lastIndexOf(Path.SEPARATOR_CHAR);
802 for (int i = 0; index > 0 && i < 5; i++) {
803 index = pathStr.lastIndexOf(Path.SEPARATOR_CHAR, index - 1);
804 }
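      // Worked example (hypothetical path, for illustration): for a reference file at
      //   /hbase/data/default/t1/1588230740/f1/abc123.referringregion
      // the loop above walks back five separators, so the substring used below is
      //   data/default/t1/1588230740/f1/abc123.referringregion
      // and the file is recreated under that relative layout in the sideline dir.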
805 if (index > 0) {
806 Path rootDir = getSidelineDir();
807 Path dst = new Path(rootDir, pathStr.substring(index + 1));
808 fs.mkdirs(dst.getParent());
        LOG.info("Trying to sideline reference file "
            + path + " to " + dst);
811 setShouldRerun();
812
813 success = fs.rename(path, dst);
814 }
815 if (!success) {
816 LOG.error("Failed to sideline reference file " + path);
817 }
818 }
819 }

  /**
   * Report rows in hbase:meta whose REGIONINFO_QUALIFIER cell is empty.
   */
824 private void reportEmptyMetaCells() {
825 errors.print("Number of empty REGIONINFO_QUALIFIER rows in hbase:meta: " +
826 emptyRegionInfoQualifiers.size());
827 if (details) {
828 for (Result r: emptyRegionInfoQualifiers) {
829 errors.print(" " + r);
830 }
831 }
832 }

  /**
   * Report the tables found and note any that appear to be in flux (skipped
   * while being created or deleted).
   */
837 private void reportTablesInFlux() {
838 AtomicInteger numSkipped = new AtomicInteger(0);
839 HTableDescriptor[] allTables = getTables(numSkipped);
840 errors.print("Number of Tables: " + allTables.length);
841 if (details) {
842 if (numSkipped.get() > 0) {
843 errors.detail("Number of Tables in flux: " + numSkipped.get());
844 }
845 for (HTableDescriptor td : allTables) {
846 errors.detail(" Table: " + td.getTableName() + "\t" +
847 (td.isReadOnly() ? "ro" : "rw") + "\t" +
848 (td.isMetaRegion() ? "META" : " ") + "\t" +
849 " families: " + td.getFamilies().size());
850 }
851 }
852 }
853
854 public ErrorReporter getErrors() {
855 return errors;
856 }

  /**
   * Read the .regioninfo file from the region dir in HDFS and populate the
   * HbckInfo's hdfs entry with it.
   */
862 private void loadHdfsRegioninfo(HbckInfo hbi) throws IOException {
863 Path regionDir = hbi.getHdfsRegionDir();
864 if (regionDir == null) {
865 LOG.warn("No HDFS region dir found: " + hbi + " meta=" + hbi.metaEntry);
866 return;
867 }
868
869 if (hbi.hdfsEntry.hri != null) {
870
871 return;
872 }
873
874 FileSystem fs = FileSystem.get(getConf());
875 HRegionInfo hri = HRegionFileSystem.loadRegionInfoFileContent(fs, regionDir);
876 LOG.debug("HRegionInfo read: " + hri.toString());
877 hbi.hdfsEntry.hri = hri;
878 }

  /**
   * Exception thrown when a region repair fails; wraps the original IOException.
   */
884 public static class RegionRepairException extends IOException {
885 private static final long serialVersionUID = 1L;
886 final IOException ioe;
887 public RegionRepairException(String s, IOException ioe) {
888 super(s);
889 this.ioe = ioe;
890 }
891 }

  /**
   * Populate hbi.hdfsEntry for every known region by reading .regioninfo files
   * from HDFS in parallel, and build the per-table TableInfo map from the
   * results.
   */
896 private SortedMap<TableName, TableInfo> loadHdfsRegionInfos()
897 throws IOException, InterruptedException {
898 tablesInfo.clear();
899
900 Collection<HbckInfo> hbckInfos = regionInfoMap.values();
901
902
903 List<WorkItemHdfsRegionInfo> hbis = new ArrayList<WorkItemHdfsRegionInfo>(hbckInfos.size());
904 List<Future<Void>> hbiFutures;
905
906 for (HbckInfo hbi : hbckInfos) {
907 WorkItemHdfsRegionInfo work = new WorkItemHdfsRegionInfo(hbi, this, errors);
908 hbis.add(work);
909 }
910
911
912 hbiFutures = executor.invokeAll(hbis);
913
914 for(int i=0; i<hbiFutures.size(); i++) {
915 WorkItemHdfsRegionInfo work = hbis.get(i);
916 Future<Void> f = hbiFutures.get(i);
917 try {
918 f.get();
919 } catch(ExecutionException e) {
920 LOG.warn("Failed to read .regioninfo file for region " +
921 work.hbi.getRegionNameAsString(), e.getCause());
922 }
923 }
924
925 Path hbaseRoot = FSUtils.getRootDir(getConf());
926 FileSystem fs = hbaseRoot.getFileSystem(getConf());
927
928 for (HbckInfo hbi: hbckInfos) {
929
930 if (hbi.getHdfsHRI() == null) {
931
932 continue;
933 }
934
935
936
937 TableName tableName = hbi.getTableName();
938 if (tableName == null) {
939
940 LOG.warn("tableName was null for: " + hbi);
941 continue;
942 }
943
944 TableInfo modTInfo = tablesInfo.get(tableName);
945 if (modTInfo == null) {
946
947 modTInfo = new TableInfo(tableName);
948 tablesInfo.put(tableName, modTInfo);
949 try {
950 HTableDescriptor htd =
951 FSTableDescriptors.getTableDescriptorFromFs(fs, hbaseRoot, tableName);
952 modTInfo.htds.add(htd);
953 } catch (IOException ioe) {
954 if (!orphanTableDirs.containsKey(tableName)) {
955 LOG.warn("Unable to read .tableinfo from " + hbaseRoot, ioe);
956
957 errors.reportError(ERROR_CODE.NO_TABLEINFO_FILE,
958 "Unable to read .tableinfo from " + hbaseRoot + "/" + tableName);
959 Set<String> columns = new HashSet<String>();
960 orphanTableDirs.put(tableName, getColumnFamilyList(columns, hbi));
961 }
962 }
963 }
964 if (!hbi.isSkipChecks()) {
965 modTInfo.addRegionInfo(hbi);
966 }
967 }
968
969 loadTableInfosForTablesWithNoRegion();
970
971 return tablesInfo;
972 }

  /**
   * To get the column family list according to the column family dirs.
   *
   * @param columns set to which the discovered column family names are added
   * @param hbi the region whose column family dirs are listed
   * @return the set of column family names
   */
981 private Set<String> getColumnFamilyList(Set<String> columns, HbckInfo hbi) throws IOException {
982 Path regionDir = hbi.getHdfsRegionDir();
983 FileSystem fs = regionDir.getFileSystem(getConf());
984 FileStatus[] subDirs = fs.listStatus(regionDir, new FSUtils.FamilyDirFilter(fs));
985 for (FileStatus subdir : subDirs) {
986 String columnfamily = subdir.getPath().getName();
987 columns.add(columnfamily);
988 }
989 return columns;
990 }

  /**
   * To fabricate a .tableinfo file with the following contents:
   * 1. the correct table name
   * 2. the correct column family list
   * 3. the default properties for both {@link HTableDescriptor} and
   *    {@link HColumnDescriptor}
   *
   * @return true if the descriptor was fabricated, false if there was no
   *         column family information to work from
   */
999 private boolean fabricateTableInfo(FSTableDescriptors fstd, TableName tableName,
1000 Set<String> columns) throws IOException {
    if (columns == null || columns.isEmpty()) return false;
    HTableDescriptor htd = new HTableDescriptor(tableName);
    for (String columnfamily : columns) {
      htd.addFamily(new HColumnDescriptor(columnfamily));
    }
1006 fstd.createTableDescriptor(htd, true);
1007 return true;
1008 }

  /**
   * To fix the empty REGIONINFO_QUALIFIER rows in hbase:meta.
   */
1014 public void fixEmptyMetaCells() throws IOException {
1015 if (shouldFixEmptyMetaCells() && !emptyRegionInfoQualifiers.isEmpty()) {
1016 LOG.info("Trying to fix empty REGIONINFO_QUALIFIER hbase:meta rows.");
1017 for (Result region : emptyRegionInfoQualifiers) {
1018 deleteMetaRegion(region.getRow());
1019 errors.getErrorList().remove(ERROR_CODE.EMPTY_META_CELL);
1020 }
1021 emptyRegionInfoQualifiers.clear();
1022 }
1023 }

  /**
   * To fix an orphan table by creating a .tableinfo file under its tableDir:
   * 1. if the TableInfo is cached, recover the .tableinfo from it accordingly
   * 2. else create a default .tableinfo file with the following items:
   *    2.1 the correct table name
   *    2.2 the correct column family list
   *    2.3 the default properties for both {@link HTableDescriptor} and
   *        {@link HColumnDescriptor}
   */
1034 public void fixOrphanTables() throws IOException {
1035 if (shouldFixTableOrphans() && !orphanTableDirs.isEmpty()) {
1036
1037 List<TableName> tmpList = new ArrayList<TableName>();
1038 tmpList.addAll(orphanTableDirs.keySet());
1039 HTableDescriptor[] htds = getHTableDescriptors(tmpList);
1040 Iterator<Entry<TableName, Set<String>>> iter =
1041 orphanTableDirs.entrySet().iterator();
1042 int j = 0;
1043 int numFailedCase = 0;
1044 FSTableDescriptors fstd = new FSTableDescriptors(getConf());
1045 while (iter.hasNext()) {
1046 Entry<TableName, Set<String>> entry =
1047 (Entry<TableName, Set<String>>) iter.next();
1048 TableName tableName = entry.getKey();
1049 LOG.info("Trying to fix orphan table error: " + tableName);
1050 if (j < htds.length) {
1051 if (tableName.equals(htds[j].getTableName())) {
1052 HTableDescriptor htd = htds[j];
1053 LOG.info("fixing orphan table: " + tableName + " from cache");
1054 fstd.createTableDescriptor(htd, true);
1055 j++;
1056 iter.remove();
1057 }
1058 } else {
1059 if (fabricateTableInfo(fstd, tableName, entry.getValue())) {
1060 LOG.warn("fixing orphan table: " + tableName + " with a default .tableinfo file");
            LOG.warn("Strongly recommend modifying the HTableDescriptor if necessary for: " + tableName);
1062 iter.remove();
1063 } else {
1064 LOG.error("Unable to create default .tableinfo for " + tableName + " while missing column family information");
1065 numFailedCase++;
1066 }
1067 }
1068 fixes++;
1069 }
1070
1071 if (orphanTableDirs.isEmpty()) {
1072
1073
1074 setShouldRerun();
        LOG.warn("Strongly recommend re-running hbck manually after all orphan table dirs have been fixed");
1076 } else if (numFailedCase > 0) {
1077 LOG.error("Failed to fix " + numFailedCase
1078 + " OrphanTables with default .tableinfo files");
1079 }
1080
1081 }
1082
1083 orphanTableDirs.clear();
1084
1085 }

  /**
   * This borrows code from MasterFileSystem.bootstrap().  Creates a fresh
   * hbase:meta region on HDFS; be sure to close the returned region when
   * finished with it.
   *
   * @return an open hbase:meta HRegion
   */
1092 private HRegion createNewMeta() throws IOException {
1093 Path rootdir = FSUtils.getRootDir(getConf());
1094 Configuration c = getConf();
1095 HRegionInfo metaHRI = new HRegionInfo(HRegionInfo.FIRST_META_REGIONINFO);
1096 MasterFileSystem.setInfoFamilyCachingForMeta(false);
1097 HRegion meta = HRegion.createHRegion(metaHRI, rootdir, c,
1098 HTableDescriptor.META_TABLEDESC);
1099 MasterFileSystem.setInfoFamilyCachingForMeta(true);
1100 return meta;
1101 }

  /**
   * Generate the set of puts to add to a new hbase:meta from the region info
   * found in HDFS.  Returns null if integrity problems are found (for example,
   * a start key that maps to more than one region).
   */
1109 private ArrayList<Put> generatePuts(
1110 SortedMap<TableName, TableInfo> tablesInfo) throws IOException {
1111 ArrayList<Put> puts = new ArrayList<Put>();
1112 boolean hasProblems = false;
1113 for (Entry<TableName, TableInfo> e : tablesInfo.entrySet()) {
1114 TableName name = e.getKey();
1115
1116
1117 if (name.compareTo(TableName.META_TABLE_NAME) == 0) {
1118 continue;
1119 }
1120
1121 TableInfo ti = e.getValue();
1122 for (Entry<byte[], Collection<HbckInfo>> spl : ti.sc.getStarts().asMap()
1123 .entrySet()) {
1124 Collection<HbckInfo> his = spl.getValue();
1125 int sz = his.size();
1126 if (sz != 1) {
1127
1128 LOG.error("Split starting at " + Bytes.toStringBinary(spl.getKey())
1129 + " had " + sz + " regions instead of exactly 1." );
1130 hasProblems = true;
1131 continue;
1132 }
1133
1134
1135 HbckInfo hi = his.iterator().next();
1136 HRegionInfo hri = hi.getHdfsHRI();
1137 Put p = MetaEditor.makePutFromRegionInfo(hri);
1138 puts.add(p);
1139 }
1140 }
1141 return hasProblems ? null : puts;
1142 }

  /**
   * Suggest fixes for each table by walking its region chain with an
   * IntegrityFixSuggester.
   */
1147 private void suggestFixes(
1148 SortedMap<TableName, TableInfo> tablesInfo) throws IOException {
1149 for (TableInfo tInfo : tablesInfo.values()) {
1150 TableIntegrityErrorHandler handler = tInfo.new IntegrityFixSuggester(tInfo, errors);
1151 tInfo.checkRegionChain(handler);
1152 }
1153 }

  /**
   * Rebuilds meta from information in hdfs/fs.  Depends on configuration
   * settings passed into the hbck constructor to point to a particular fs/dir.
   *
   * @param fix flag that determines if the method should attempt to fix holes
   * @return true if meta is rebuilt, false otherwise
   */
1162 public boolean rebuildMeta(boolean fix) throws IOException,
1163 InterruptedException {
1164
1165
1166
1167
1168
1169 LOG.info("Loading HBase regioninfo from HDFS...");
1170 loadHdfsRegionDirs();
1171
1172 int errs = errors.getErrorList().size();
1173 tablesInfo = loadHdfsRegionInfos();
1174 checkHdfsIntegrity(false, false);
1175
1176
1177 if (errors.getErrorList().size() != errs) {
1178
1179 while(true) {
1180 fixes = 0;
1181 suggestFixes(tablesInfo);
1182 errors.clear();
1183 loadHdfsRegionInfos();
1184 checkHdfsIntegrity(shouldFixHdfsHoles(), shouldFixHdfsOverlaps());
1185
1186 int errCount = errors.getErrorList().size();
1187
1188 if (fixes == 0) {
1189 if (errCount > 0) {
1190 return false;
1191 } else {
1192 break;
1193 }
1194 }
1195 }
1196 }
1197
1198
    LOG.info("HDFS regioninfos look good. Sidelining old hbase:meta");
1200 Path backupDir = sidelineOldMeta();
1201
1202 LOG.info("Creating new hbase:meta");
1203 HRegion meta = createNewMeta();
1204
1205
1206 List<Put> puts = generatePuts(tablesInfo);
1207 if (puts == null) {
1208 LOG.fatal("Problem encountered when creating new hbase:meta entries. " +
1209 "You may need to restore the previously sidelined hbase:meta");
1210 return false;
1211 }
1212 meta.batchMutate(puts.toArray(new Put[0]));
1213 HRegion.closeHRegion(meta);
1214 LOG.info("Success! hbase:meta table rebuilt.");
1215 LOG.info("Old hbase:meta is moved into " + backupDir);
1216 return true;
1217 }
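
  // Illustrative caller sketch (an assumption about how an offline meta rebuild
  // is typically driven; the offline entry point itself is not part of this class):
  //
  //   HBaseFsck fsck = new HBaseFsck(conf);   // cluster should be offline for this
  //   if (!fsck.rebuildMeta(false)) {
  //     // inspect the reported errors and, if needed, restore the sidelined
  //     // hbase:meta from the backup dir logged above before restarting.
  //   }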
1218
1219 private SortedMap<TableName, TableInfo> checkHdfsIntegrity(boolean fixHoles,
1220 boolean fixOverlaps) throws IOException {
1221 LOG.info("Checking HBase region split map from HDFS data...");
1222 for (TableInfo tInfo : tablesInfo.values()) {
1223 TableIntegrityErrorHandler handler;
1224 if (fixHoles || fixOverlaps) {
1225 handler = tInfo.new HDFSIntegrityFixer(tInfo, errors, getConf(),
1226 fixHoles, fixOverlaps);
1227 } else {
1228 handler = tInfo.new IntegrityFixSuggester(tInfo, errors);
1229 }
1230 if (!tInfo.checkRegionChain(handler)) {
1231
1232 errors.report("Found inconsistency in table " + tInfo.getName());
1233 }
1234 }
1235 return tablesInfo;
1236 }
1237
1238 private Path getSidelineDir() throws IOException {
1239 if (sidelineDir == null) {
1240 Path hbaseDir = FSUtils.getRootDir(getConf());
1241 Path hbckDir = new Path(hbaseDir, HConstants.HBCK_SIDELINEDIR_NAME);
1242 sidelineDir = new Path(hbckDir, hbaseDir.getName() + "-"
1243 + startMillis);
1244 }
1245 return sidelineDir;
1246 }
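
  // For illustration: with an hbase root of /hbase and a start time of
  // 1400000000000, sidelined data ends up under a per-run directory such as
  //   /hbase/.hbck/hbase-1400000000000/...
  // (assuming HConstants.HBCK_SIDELINEDIR_NAME resolves to ".hbck"; the exact
  // constant value is an assumption here).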

  /**
   * Sideline a region dir (instead of deleting it).
   */
1251 Path sidelineRegionDir(FileSystem fs, HbckInfo hi) throws IOException {
1252 return sidelineRegionDir(fs, null, hi);
1253 }

  /**
   * Sideline a region dir (instead of deleting it).
   *
   * @param parentDir if specified, the region is sidelined to a folder like
   *     {@literal .../parentDir/<table name>/<region name>}.  The purpose is
   *     to group together regions sidelined for the same reason, for example
   *     so they can be bulk loaded back later.  If null, it is ignored.
   */
1263 Path sidelineRegionDir(FileSystem fs,
1264 String parentDir, HbckInfo hi) throws IOException {
1265 TableName tableName = hi.getTableName();
1266 Path regionDir = hi.getHdfsRegionDir();
1267
1268 if (!fs.exists(regionDir)) {
1269 LOG.warn("No previous " + regionDir + " exists. Continuing.");
1270 return null;
1271 }
1272
1273 Path rootDir = getSidelineDir();
1274 if (parentDir != null) {
1275 rootDir = new Path(rootDir, parentDir);
1276 }
1277 Path sidelineTableDir= FSUtils.getTableDir(rootDir, tableName);
1278 Path sidelineRegionDir = new Path(sidelineTableDir, regionDir.getName());
1279 fs.mkdirs(sidelineRegionDir);
1280 boolean success = false;
1281 FileStatus[] cfs = fs.listStatus(regionDir);
1282 if (cfs == null) {
1283 LOG.info("Region dir is empty: " + regionDir);
1284 } else {
1285 for (FileStatus cf : cfs) {
1286 Path src = cf.getPath();
1287 Path dst = new Path(sidelineRegionDir, src.getName());
1288 if (fs.isFile(src)) {
1289
1290 success = fs.rename(src, dst);
1291 if (!success) {
1292 String msg = "Unable to rename file " + src + " to " + dst;
1293 LOG.error(msg);
1294 throw new IOException(msg);
1295 }
1296 continue;
1297 }
1298
1299
1300 fs.mkdirs(dst);
1301
1302 LOG.info("Sidelining files from " + src + " into containing region " + dst);
1303
1304
1305
1306
1307 FileStatus[] hfiles = fs.listStatus(src);
1308 if (hfiles != null && hfiles.length > 0) {
1309 for (FileStatus hfile : hfiles) {
1310 success = fs.rename(hfile.getPath(), dst);
1311 if (!success) {
1312 String msg = "Unable to rename file " + src + " to " + dst;
1313 LOG.error(msg);
1314 throw new IOException(msg);
1315 }
1316 }
1317 }
1318 LOG.debug("Sideline directory contents:");
1319 debugLsr(sidelineRegionDir);
1320 }
1321 }
1322
1323 LOG.info("Removing old region dir: " + regionDir);
1324 success = fs.delete(regionDir, true);
1325 if (!success) {
1326 String msg = "Unable to delete dir " + regionDir;
1327 LOG.error(msg);
1328 throw new IOException(msg);
1329 }
1330 return sidelineRegionDir;
1331 }

  /**
   * Side line an entire table.
   */
1336 void sidelineTable(FileSystem fs, TableName tableName, Path hbaseDir,
1337 Path backupHbaseDir) throws IOException {
1338 Path tableDir = FSUtils.getTableDir(hbaseDir, tableName);
1339 if (fs.exists(tableDir)) {
1340 Path backupTableDir= FSUtils.getTableDir(backupHbaseDir, tableName);
1341 fs.mkdirs(backupTableDir.getParent());
1342 boolean success = fs.rename(tableDir, backupTableDir);
1343 if (!success) {
1344 throw new IOException("Failed to move " + tableName + " from "
1345 + tableDir + " to " + backupTableDir);
1346 }
1347 } else {
1348 LOG.info("No previous " + tableName + " exists. Continuing.");
1349 }
1350 }

  /**
   * Sideline the current hbase:meta table so that a new one can be rebuilt.
   *
   * @return the backup directory the old hbase:meta was moved into
   */
1355 Path sidelineOldMeta() throws IOException {
1356
1357 Path hbaseDir = FSUtils.getRootDir(getConf());
1358 FileSystem fs = hbaseDir.getFileSystem(getConf());
1359 Path backupDir = getSidelineDir();
1360 fs.mkdirs(backupDir);
1361 try {
1362 sidelineTable(fs, TableName.META_TABLE_NAME, hbaseDir, backupDir);
1363 } catch (IOException e) {
1364 LOG.fatal("... failed to sideline meta. Currently in inconsistent state. To restore "
1365 + "try to rename .META. in " + backupDir.getName() + " to "
1366 + hbaseDir.getName() + ".", e);
1367 throw e;
1368 }
1369 return backupDir;
1370 }

  /**
   * Load the list of disabled tables in ZK into the local set.
   *
   * @throws ZooKeeperConnectionException
   * @throws IOException
   */
1377 private void loadDisabledTables()
1378 throws ZooKeeperConnectionException, IOException {
1379 HConnectionManager.execute(new HConnectable<Void>(getConf()) {
1380 @Override
1381 public Void connect(HConnection connection) throws IOException {
1382 ZooKeeperWatcher zkw = createZooKeeperWatcher();
1383 try {
1384 for (TableName tableName :
1385 ZKTableReadOnly.getDisabledOrDisablingTables(zkw)) {
1386 disabledTables.add(tableName);
1387 }
1388 } catch (KeeperException ke) {
1389 throw new IOException(ke);
1390 } finally {
1391 zkw.close();
1392 }
1393 return null;
1394 }
1395 });
1396 }

  /**
   * Check if the specified region's table is disabled.
   */
1401 private boolean isTableDisabled(HRegionInfo regionInfo) {
1402 return disabledTables.contains(regionInfo.getTable());
1403 }

  /**
   * Scan HDFS for all regions, recording their information into
   * regionInfoMap.
   */
1409 public void loadHdfsRegionDirs() throws IOException, InterruptedException {
1410 Path rootDir = FSUtils.getRootDir(getConf());
1411 FileSystem fs = rootDir.getFileSystem(getConf());
1412
1413
1414 List<FileStatus> tableDirs = Lists.newArrayList();
1415
1416 boolean foundVersionFile = fs.exists(new Path(rootDir, HConstants.VERSION_FILE_NAME));
1417
1418 List<Path> paths = FSUtils.getTableDirs(fs, rootDir);
1419 for (Path path : paths) {
1420 TableName tableName = FSUtils.getTableName(path);
1421 if ((!checkMetaOnly &&
1422 isTableIncluded(tableName)) ||
1423 tableName.equals(TableName.META_TABLE_NAME)) {
1424 tableDirs.add(fs.getFileStatus(path));
1425 }
1426 }
1427
1428
1429 if (!foundVersionFile) {
1430 errors.reportError(ERROR_CODE.NO_VERSION_FILE,
1431 "Version file does not exist in root dir " + rootDir);
1432 if (shouldFixVersionFile()) {
1433 LOG.info("Trying to create a new " + HConstants.VERSION_FILE_NAME
1434 + " file.");
1435 setShouldRerun();
1436 FSUtils.setVersion(fs, rootDir, getConf().getInt(
1437 HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000), getConf().getInt(
1438 HConstants.VERSION_FILE_WRITE_ATTEMPTS,
1439 HConstants.DEFAULT_VERSION_FILE_WRITE_ATTEMPTS));
1440 }
1441 }

    // Level 1: <HBASE_DIR>/*
    // Scan each table dir in parallel.  WorkItemHdfsDir is assumed to be the
    // per-table-dir callable defined later in this class, following the same
    // pattern as WorkItemHdfsRegionInfo and WorkItemRegion.
    List<WorkItemHdfsDir> dirs = new ArrayList<WorkItemHdfsDir>(tableDirs.size());
    List<Future<Void>> dirsFutures;

    for (FileStatus tableDir : tableDirs) {
      LOG.debug("Loading region dirs from " + tableDir.getPath());
      dirs.add(new WorkItemHdfsDir(this, fs, errors, tableDir));
    }

    // Invoke and wait for the callables to complete.
    dirsFutures = executor.invokeAll(dirs);
    for (Future<Void> f : dirsFutures) {
      try {
        f.get();
      } catch (ExecutionException e) {
        LOG.warn("Could not load region dir ", e.getCause());
      }
    }
  }

  /**
   * Record the location of the hbase:meta region, as reported by the
   * connection and ZooKeeper.
   */
1467 private boolean recordMetaRegion() throws IOException {
1468 HRegionLocation metaLocation = connection.locateRegion(
1469 TableName.META_TABLE_NAME, HConstants.EMPTY_START_ROW);
1470
1471
1472 if (metaLocation == null || metaLocation.getRegionInfo() == null ||
1473 metaLocation.getHostname() == null) {
1474 errors.reportError(ERROR_CODE.NULL_META_REGION,
1475 "META region or some of its attributes are null.");
1476 return false;
1477 }
1478 ServerName sn;
1479 try {
1480 sn = getMetaRegionServerName();
1481 } catch (KeeperException e) {
1482 throw new IOException(e);
1483 }
1484 MetaEntry m = new MetaEntry(metaLocation.getRegionInfo(), sn, System.currentTimeMillis());
1485 HbckInfo hbckInfo = regionInfoMap.get(metaLocation.getRegionInfo().getEncodedName());
1486 if (hbckInfo == null) {
1487 regionInfoMap.put(metaLocation.getRegionInfo().getEncodedName(), new HbckInfo(m));
1488 } else {
1489 hbckInfo.metaEntry = m;
1490 }
1491 return true;
1492 }
1493
1494 private ZooKeeperWatcher createZooKeeperWatcher() throws IOException {
1495 return new ZooKeeperWatcher(getConf(), "hbase Fsck", new Abortable() {
1496 @Override
1497 public void abort(String why, Throwable e) {
1498 LOG.error(why, e);
1499 System.exit(1);
1500 }
1501
1502 @Override
1503 public boolean isAborted() {
1504 return false;
1505 }
1506
1507 });
1508 }
1509
1510 private ServerName getMetaRegionServerName()
1511 throws IOException, KeeperException {
1512 ZooKeeperWatcher zkw = createZooKeeperWatcher();
1513 ServerName sn = null;
1514 try {
1515 sn = MetaRegionTracker.getMetaRegionLocation(zkw);
1516 } finally {
1517 zkw.close();
1518 }
1519 return sn;
1520 }

  /**
   * Contacts each regionserver and fetches metadata about regions.
   *
   * @param regionServerList the list of region servers to connect to
   * @throws IOException if a remote or network exception occurs
   */
1527 void processRegionServers(Collection<ServerName> regionServerList)
1528 throws IOException, InterruptedException {
1529
1530 List<WorkItemRegion> workItems = new ArrayList<WorkItemRegion>(regionServerList.size());
1531 List<Future<Void>> workFutures;
1532
1533
1534 for (ServerName rsinfo: regionServerList) {
1535 workItems.add(new WorkItemRegion(this, rsinfo, errors, connection));
1536 }
1537
1538 workFutures = executor.invokeAll(workItems);
1539
1540 for(int i=0; i<workFutures.size(); i++) {
1541 WorkItemRegion item = workItems.get(i);
1542 Future<Void> f = workFutures.get(i);
1543 try {
1544 f.get();
1545 } catch(ExecutionException e) {
1546 LOG.warn("Could not process regionserver " + item.rsinfo.getHostAndPort(),
1547 e.getCause());
1548 }
1549 }
1550 }

  /**
   * Check consistency of all regions that have been found in previous phases.
   */
1555 private void checkAndFixConsistency()
1556 throws IOException, KeeperException, InterruptedException {
1557 for (java.util.Map.Entry<String, HbckInfo> e: regionInfoMap.entrySet()) {
1558 checkRegionConsistency(e.getKey(), e.getValue());
1559 }
1560 }
1561
1562 private void preCheckPermission() throws IOException, AccessControlException {
1563 if (shouldIgnorePreCheckPermission()) {
1564 return;
1565 }
1566
1567 Path hbaseDir = FSUtils.getRootDir(getConf());
1568 FileSystem fs = hbaseDir.getFileSystem(getConf());
1569 UserProvider userProvider = UserProvider.instantiate(getConf());
1570 UserGroupInformation ugi = userProvider.getCurrent().getUGI();
1571 FileStatus[] files = fs.listStatus(hbaseDir);
1572 for (FileStatus file : files) {
1573 try {
1574 FSUtils.checkAccess(ugi, file, FsAction.WRITE);
1575 } catch (AccessControlException ace) {
1576 LOG.warn("Got AccessControlException when preCheckPermission ", ace);
1577 errors.reportError(ERROR_CODE.WRONG_USAGE, "Current user " + ugi.getUserName()
1578 + " does not have write perms to " + file.getPath()
1579 + ". Please rerun hbck as hdfs user " + file.getOwner());
1580 throw new AccessControlException(ace);
1581 }
1582 }
1583 }

  /**
   * Deletes a region from the meta table.
   */
1588 private void deleteMetaRegion(HbckInfo hi) throws IOException {
1589 deleteMetaRegion(hi.metaEntry.getRegionName());
1590 }

  /**
   * Deletes a region from the meta table.
   *
   * @param metaKey HRI key in the meta table
   */
1595 private void deleteMetaRegion(byte[] metaKey) throws IOException {
1596 Delete d = new Delete(metaKey);
1597 meta.delete(d);
1598 meta.flushCommits();
1599 LOG.info("Deleted " + Bytes.toString(metaKey) + " from META" );
1600 }

  /**
   * Reset the split parent region info in the meta table.
   */
1605 private void resetSplitParent(HbckInfo hi) throws IOException {
1606 RowMutations mutations = new RowMutations(hi.metaEntry.getRegionName());
1607 Delete d = new Delete(hi.metaEntry.getRegionName());
1608 d.deleteColumn(HConstants.CATALOG_FAMILY, HConstants.SPLITA_QUALIFIER);
1609 d.deleteColumn(HConstants.CATALOG_FAMILY, HConstants.SPLITB_QUALIFIER);
1610 mutations.add(d);
1611
1612 HRegionInfo hri = new HRegionInfo(hi.metaEntry);
1613 hri.setOffline(false);
1614 hri.setSplit(false);
1615 Put p = MetaEditor.makePutFromRegionInfo(hri);
1616 mutations.add(p);
1617
1618 meta.mutateRow(mutations);
1619 meta.flushCommits();
1620 LOG.info("Reset split parent " + hi.metaEntry.getRegionNameAsString() + " in META" );
1621 }

  /**
   * This is a backwards-compatibility wrapper for permanently offlining a
   * region that should not be alive.  If the region server does not support
   * the "offline" method, it uses the closest unassign method instead.  This
   * basically works until one attempts to disable or delete the affected
   * table.  The problem has to do with in-memory-only master state, so
   * restarting the HMaster or failing over to another one should fix it.
   */
1631 private void offline(byte[] regionName) throws IOException {
1632 String regionString = Bytes.toStringBinary(regionName);
1633 if (!rsSupportsOffline) {
      LOG.warn("Using unassign for region " + regionString
          + " instead of the offline method; you should"
          + " restart the HMaster after these repairs");
1637 admin.unassign(regionName, true);
1638 return;
1639 }
1640
1641
1642 try {
1643 LOG.info("Offlining region " + regionString);
1644 admin.offline(regionName);
1645 } catch (IOException ioe) {
1646 String notFoundMsg = "java.lang.NoSuchMethodException: " +
1647 "org.apache.hadoop.hbase.master.HMaster.offline([B)";
1648 if (ioe.getMessage().contains(notFoundMsg)) {
        LOG.warn("Using unassign for region " + regionString
            + " instead of the offline method; you should"
            + " restart the HMaster after these repairs");
1652 rsSupportsOffline = false;
1653 admin.unassign(regionName, true);
1654 return;
1655 }
1656 throw ioe;
1657 }
1658 }
1659
1660 private void undeployRegions(HbckInfo hi) throws IOException, InterruptedException {
1661 for (OnlineEntry rse : hi.deployedEntries) {
1662 LOG.debug("Undeploy region " + rse.hri + " from " + rse.hsa);
1663 try {
1664 HBaseFsckRepair.closeRegionSilentlyAndWait(admin, rse.hsa, rse.hri);
1665 offline(rse.hri.getRegionName());
1666 } catch (IOException ioe) {
1667 LOG.warn("Got exception when attempting to offline region "
1668 + Bytes.toString(rse.hri.getRegionName()), ioe);
1669 }
1670 }
1671 }

  /**
   * Attempts to undeploy a region from a region server based on information in
   * META.  Any operation that modifies the file system should make sure its
   * corresponding region is not deployed, to prevent data races.
   *
   * A separate call is required to update the master's in-memory region state
   * kept in the AssignmentManager.  Because disable uses that state instead of
   * what is found in META, tables that have been hbck-fixed may not cleanly
   * disable or delete.  On a version of HBase that does not expose the offline
   * ipc call on the master, a master restart or failover may be required.
   */
1685 private void closeRegion(HbckInfo hi) throws IOException, InterruptedException {
1686 if (hi.metaEntry == null && hi.hdfsEntry == null) {
1687 undeployRegions(hi);
1688 return;
1689 }
1690
1691
1692 Get get = new Get(hi.getRegionName());
1693 get.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
1694 get.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
1695 get.addColumn(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER);
1696 Result r = meta.get(get);
1697 ServerName serverName = HRegionInfo.getServerName(r);
1698 if (serverName == null) {
1699 errors.reportError("Unable to close region "
1700 + hi.getRegionNameAsString() + " because meta does not "
1701 + "have handle to reach it.");
1702 return;
1703 }
1704
1705 HRegionInfo hri = HRegionInfo.getHRegionInfo(r);
1706 if (hri == null) {
1707 LOG.warn("Unable to close region " + hi.getRegionNameAsString()
1708 + " because hbase:meta had invalid or missing "
1709 + HConstants.CATALOG_FAMILY_STR + ":"
1710 + Bytes.toString(HConstants.REGIONINFO_QUALIFIER)
1711 + " qualifier value.");
1712 return;
1713 }
1714
1715
1716 HBaseFsckRepair.closeRegionSilentlyAndWait(admin, serverName, hri);
1717 }
1718
1719 private void tryAssignmentRepair(HbckInfo hbi, String msg) throws IOException,
1720 KeeperException, InterruptedException {
1721
1722 if (shouldFixAssignments()) {
1723 errors.print(msg);
1724 undeployRegions(hbi);
1725 setShouldRerun();
1726 HRegionInfo hri = hbi.getHdfsHRI();
1727 if (hri == null) {
1728 hri = hbi.metaEntry;
1729 }
1730 HBaseFsckRepair.fixUnassigned(admin, hri);
1731 HBaseFsckRepair.waitUntilAssigned(admin, hri);
1732 }
1733 }

  /**
   * Check a single region for consistency and correct deployment.
   */
1738 private void checkRegionConsistency(final String key, final HbckInfo hbi)
1739 throws IOException, KeeperException, InterruptedException {
1740 String descriptiveName = hbi.toString();
1741
1742 boolean inMeta = hbi.metaEntry != null;
1743
1744 boolean inHdfs = !shouldCheckHdfs() || hbi.getHdfsRegionDir() != null;
1745 boolean hasMetaAssignment = inMeta && hbi.metaEntry.regionServer != null;
1746 boolean isDeployed = !hbi.deployedOn.isEmpty();
1747 boolean isMultiplyDeployed = hbi.deployedOn.size() > 1;
1748 boolean deploymentMatchesMeta =
1749 hasMetaAssignment && isDeployed && !isMultiplyDeployed &&
1750 hbi.metaEntry.regionServer.equals(hbi.deployedOn.get(0));
1751 boolean splitParent =
1752 (hbi.metaEntry == null)? false: hbi.metaEntry.isSplit() && hbi.metaEntry.isOffline();
1753 boolean shouldBeDeployed = inMeta && !isTableDisabled(hbi.metaEntry);
1754 boolean recentlyModified = inHdfs &&
1755 hbi.getModTime() + timelag > System.currentTimeMillis();
1756
1757
1758 if (hbi.containsOnlyHdfsEdits()) {
1759 return;
1760 }
1761 if (inMeta && inHdfs && isDeployed && deploymentMatchesMeta && shouldBeDeployed) {
1762 return;
1763 } else if (inMeta && inHdfs && !shouldBeDeployed && !isDeployed) {
      LOG.info("Region " + descriptiveName + " is in META, and in a disabled " +
          "table that is not deployed");
1766 return;
1767 } else if (recentlyModified) {
1768 LOG.warn("Region " + descriptiveName + " was recently modified -- skipping");
1769 return;
1770 }
1771
1772 else if (!inMeta && !inHdfs && !isDeployed) {
1773
1774 assert false : "Entry for region with no data";
1775 } else if (!inMeta && !inHdfs && isDeployed) {
1776 errors.reportError(ERROR_CODE.NOT_IN_META_HDFS, "Region "
1777 + descriptiveName + ", key=" + key + ", not on HDFS or in hbase:meta but " +
1778 "deployed on " + Joiner.on(", ").join(hbi.deployedOn));
1779 if (shouldFixAssignments()) {
1780 undeployRegions(hbi);
1781 }
1782
1783 } else if (!inMeta && inHdfs && !isDeployed) {
1784 if (hbi.isMerged()) {
1785
1786
1787 hbi.setSkipChecks(true);
        LOG.info("Region " + descriptiveName
            + " was merged recently; its file(s) will be cleaned up by the CatalogJanitor later");
1790 return;
1791 }
1792 errors.reportError(ERROR_CODE.NOT_IN_META_OR_DEPLOYED, "Region "
1793 + descriptiveName + " on HDFS, but not listed in hbase:meta " +
1794 "or deployed on any region server");
1795
1796 if (shouldFixMeta()) {
1797 if (!hbi.isHdfsRegioninfoPresent()) {
1798 LOG.error("Region " + hbi.getHdfsHRI() + " could have been repaired"
1799 + " in table integrity repair phase if -fixHdfsOrphans was" +
1800 " used.");
1801 return;
1802 }
1803
1804 LOG.info("Patching hbase:meta with .regioninfo: " + hbi.getHdfsHRI());
1805 HBaseFsckRepair.fixMetaHoleOnline(getConf(), hbi.getHdfsHRI());
1806
1807 tryAssignmentRepair(hbi, "Trying to reassign region...");
1808 }
1809
1810 } else if (!inMeta && inHdfs && isDeployed) {
1811 errors.reportError(ERROR_CODE.NOT_IN_META, "Region " + descriptiveName
1812 + " not in META, but deployed on " + Joiner.on(", ").join(hbi.deployedOn));
1813 debugLsr(hbi.getHdfsRegionDir());
1814 if (shouldFixMeta()) {
1815 if (!hbi.isHdfsRegioninfoPresent()) {
1816 LOG.error("This should have been repaired in table integrity repair phase");
1817 return;
1818 }
1819
        LOG.info("Patching hbase:meta with .regioninfo: " + hbi.getHdfsHRI());
1821 HBaseFsckRepair.fixMetaHoleOnline(getConf(), hbi.getHdfsHRI());
1822
1823 tryAssignmentRepair(hbi, "Trying to fix unassigned region...");
1824 }
1825
1826
1827 } else if (inMeta && inHdfs && !isDeployed && splitParent) {
1828
1829
1830 if (hbi.metaEntry.splitA != null && hbi.metaEntry.splitB != null) {
1831
1832 HbckInfo infoA = this.regionInfoMap.get(hbi.metaEntry.splitA.getEncodedName());
1833 HbckInfo infoB = this.regionInfoMap.get(hbi.metaEntry.splitB.getEncodedName());
1834 if (infoA != null && infoB != null) {
1835
1836 hbi.setSkipChecks(true);
1837 return;
1838 }
1839 }
1840 errors.reportError(ERROR_CODE.LINGERING_SPLIT_PARENT, "Region "
1841 + descriptiveName + " is a split parent in META, in HDFS, "
1842 + "and not deployed on any region server. This could be transient.");
1843 if (shouldFixSplitParents()) {
1844 setShouldRerun();
1845 resetSplitParent(hbi);
1846 }
1847 } else if (inMeta && !inHdfs && !isDeployed) {
1848 errors.reportError(ERROR_CODE.NOT_IN_HDFS_OR_DEPLOYED, "Region "
1849 + descriptiveName + " found in META, but not in HDFS "
1850 + "or deployed on any region server.");
1851 if (shouldFixMeta()) {
1852 deleteMetaRegion(hbi);
1853 }
1854 } else if (inMeta && !inHdfs && isDeployed) {
1855 errors.reportError(ERROR_CODE.NOT_IN_HDFS, "Region " + descriptiveName
1856 + " found in META, but not in HDFS, " +
1857 "and deployed on " + Joiner.on(", ").join(hbi.deployedOn));
1858
1859
1860
1861 if (shouldFixAssignments()) {
1862 errors.print("Trying to fix unassigned region...");
1863 closeRegion(hbi);
1864 }
1865 if (shouldFixMeta()) {
1866
1867 deleteMetaRegion(hbi);
1868 }
1869 } else if (inMeta && inHdfs && !isDeployed && shouldBeDeployed) {
1870 errors.reportError(ERROR_CODE.NOT_DEPLOYED, "Region " + descriptiveName
1871 + " not deployed on any region server.");
1872 tryAssignmentRepair(hbi, "Trying to fix unassigned region...");
1873 } else if (inMeta && inHdfs && isDeployed && !shouldBeDeployed) {
1874 errors.reportError(ERROR_CODE.SHOULD_NOT_BE_DEPLOYED,
1875 "Region " + descriptiveName + " should not be deployed according " +
1876 "to META, but is deployed on " + Joiner.on(", ").join(hbi.deployedOn));
1877 if (shouldFixAssignments()) {
1878 errors.print("Trying to close the region " + descriptiveName);
1879 setShouldRerun();
1880 HBaseFsckRepair.fixMultiAssignment(admin, hbi.metaEntry, hbi.deployedOn);
1881 }
1882 } else if (inMeta && inHdfs && isMultiplyDeployed) {
1883 errors.reportError(ERROR_CODE.MULTI_DEPLOYED, "Region " + descriptiveName
1884 + " is listed in hbase:meta on region server " + hbi.metaEntry.regionServer
1885 + " but is multiply assigned to region servers " +
1886 Joiner.on(", ").join(hbi.deployedOn));
1887
1888 if (shouldFixAssignments()) {
1889 errors.print("Trying to fix assignment error...");
1890 setShouldRerun();
1891 HBaseFsckRepair.fixMultiAssignment(admin, hbi.metaEntry, hbi.deployedOn);
1892 }
1893 } else if (inMeta && inHdfs && isDeployed && !deploymentMatchesMeta) {
1894 errors.reportError(ERROR_CODE.SERVER_DOES_NOT_MATCH_META, "Region "
1895 + descriptiveName + " listed in hbase:meta on region server " +
1896 hbi.metaEntry.regionServer + " but found on region server " +
1897 hbi.deployedOn.get(0));
1898
1899 if (shouldFixAssignments()) {
1900 errors.print("Trying to fix assignment error...");
1901 setShouldRerun();
1902 HBaseFsckRepair.fixMultiAssignment(admin, hbi.metaEntry, hbi.deployedOn);
1903 HBaseFsckRepair.waitUntilAssigned(admin, hbi.getHdfsHRI());
1904 }
1905 } else {
1906 errors.reportError(ERROR_CODE.UNKNOWN, "Region " + descriptiveName +
1907 " is in an unforeseen state:" +
1908 " inMeta=" + inMeta +
1909 " inHdfs=" + inHdfs +
1910 " isDeployed=" + isDeployed +
1911 " isMultiplyDeployed=" + isMultiplyDeployed +
1912 " deploymentMatchesMeta=" + deploymentMatchesMeta +
1913 " shouldBeDeployed=" + shouldBeDeployed);
1914 }
1915 }

  /**
   * Checks table integrity.  Goes over all regions found so far, collects the
   * pieces for each table and checks whether there are missing, repeated or
   * overlapping ones.
   *
   * @throws IOException if an HDFS access error occurs
   */
1923 SortedMap<TableName, TableInfo> checkIntegrity() throws IOException {
1924 tablesInfo = new TreeMap<TableName,TableInfo> ();
1925 List<HbckInfo> noHDFSRegionInfos = new ArrayList<HbckInfo>();
1926 LOG.debug("There are " + regionInfoMap.size() + " region info entries");
1927 for (HbckInfo hbi : regionInfoMap.values()) {
1928
1929 if (hbi.metaEntry == null) {
1930
1931 noHDFSRegionInfos.add(hbi);
1932 Path p = hbi.getHdfsRegionDir();
1933 if (p == null) {
1934 errors.report("No regioninfo in Meta or HDFS. " + hbi);
1935 }
1936
1937
1938 continue;
1939 }
1940 if (hbi.metaEntry.regionServer == null) {
1941 errors.detail("Skipping region because no region server: " + hbi);
1942 continue;
1943 }
1944 if (hbi.metaEntry.isOffline()) {
1945 errors.detail("Skipping region because it is offline: " + hbi);
1946 continue;
1947 }
1948 if (hbi.containsOnlyHdfsEdits()) {
        errors.detail("Skipping region because it only contains edits: " + hbi);
1950 continue;
1951 }
1952
1953
1954
1955
1956
1957
1958 if (hbi.deployedOn.size() == 0) continue;
1959
1960
1961 TableName tableName = hbi.metaEntry.getTable();
1962 TableInfo modTInfo = tablesInfo.get(tableName);
1963 if (modTInfo == null) {
1964 modTInfo = new TableInfo(tableName);
1965 }
1966 for (ServerName server : hbi.deployedOn) {
1967 modTInfo.addServer(server);
1968 }
1969
1970 if (!hbi.isSkipChecks()) {
1971 modTInfo.addRegionInfo(hbi);
1972 }
1973
1974 tablesInfo.put(tableName, modTInfo);
1975 }
1976
1977 loadTableInfosForTablesWithNoRegion();
1978
1979 for (TableInfo tInfo : tablesInfo.values()) {
1980 TableIntegrityErrorHandler handler = tInfo.new IntegrityFixSuggester(tInfo, errors);
1981 if (!tInfo.checkRegionChain(handler)) {
1982 errors.report("Found inconsistency in table " + tInfo.getName());
1983 }
1984 }
1985 return tablesInfo;
1986 }
1987
1988
1989
1990
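/**
 * Loads table descriptors for tables that have a descriptor on HDFS but no regions reported,
 * so they still show up in the integrity check.
 */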
1991 private void loadTableInfosForTablesWithNoRegion() throws IOException {
1992 Map<String, HTableDescriptor> allTables = new FSTableDescriptors(getConf()).getAll();
1993 for (HTableDescriptor htd : allTables.values()) {
1994 if (checkMetaOnly && !htd.isMetaTable()) {
1995 continue;
1996 }
1997
1998 TableName tableName = htd.getTableName();
1999 if (isTableIncluded(tableName) && !tablesInfo.containsKey(tableName)) {
2000 TableInfo tableInfo = new TableInfo(tableName);
2001 tableInfo.htds.add(htd);
2002 tablesInfo.put(htd.getTableName(), tableInfo);
2003 }
2004 }
2005 }
2006
2007
2008
2009
2010
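/**
 * Merges HDFS data by moving the store files of the contained region into the target region
 * dir, then sidelines the now-empty contained region dir.
 * @return number of files moved
 */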
2011 public int mergeRegionDirs(Path targetRegionDir, HbckInfo contained) throws IOException {
2012 int fileMoves = 0;
2013 String thread = Thread.currentThread().getName();
2014 LOG.debug("[" + thread + "] Contained region dir after close and pause");
2015 debugLsr(contained.getHdfsRegionDir());
2016
2017
2018 FileSystem fs = targetRegionDir.getFileSystem(getConf());
2019 FileStatus[] dirs = null;
2020 try {
2021 dirs = fs.listStatus(contained.getHdfsRegionDir());
2022 } catch (FileNotFoundException fnfe) {
2023
2024
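// the contained region dir may already be gone; only sideline it if it still exists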
2025 if (!fs.exists(contained.getHdfsRegionDir())) {
2026 LOG.warn("[" + thread + "] HDFS region dir " + contained.getHdfsRegionDir()
2027 + " is missing. Assuming already sidelined or moved.");
2028 } else {
2029 sidelineRegionDir(fs, contained);
2030 }
2031 return fileMoves;
2032 }
2033
2034 if (dirs == null) {
2035 if (!fs.exists(contained.getHdfsRegionDir())) {
2036 LOG.warn("[" + thread + "] HDFS region dir " + contained.getHdfsRegionDir()
2037 + " already sidelined.");
2038 } else {
2039 sidelineRegionDir(fs, contained);
2040 }
2041 return fileMoves;
2042 }
2043
2044 for (FileStatus cf : dirs) {
2045 Path src = cf.getPath();
2046 Path dst = new Path(targetRegionDir, src.getName());
2047
2048 if (src.getName().equals(HRegionFileSystem.REGION_INFO_FILE)) {
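// do not move the contained region's .regioninfo file into the target region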
2049
2050 continue;
2051 }
2052
2053 if (src.getName().equals(HConstants.HREGION_OLDLOGDIR_NAME)) {
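// do not move the old log directory either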
2054
2055 continue;
2056 }
2057
2058 LOG.info("[" + thread + "] Moving files from " + src + " into containing region " + dst);
2059
2060
2061
2062
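// move each store file individually into the matching family dir of the target region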
2063 for (FileStatus hfile : fs.listStatus(src)) {
2064 boolean success = fs.rename(hfile.getPath(), dst);
2065 if (success) {
2066 fileMoves++;
2067 }
2068 }
2069 LOG.debug("[" + thread + "] Sideline directory contents:");
2070 debugLsr(targetRegionDir);
2071 }
2072
2073
2074 sidelineRegionDir(fs, contained);
2075 LOG.info("[" + thread + "] Sidelined region dir "+ contained.getHdfsRegionDir() + " into " +
2076 getSidelineDir());
2077 debugLsr(contained.getHdfsRegionDir());
2078
2079 return fileMoves;
2080 }
2081
2082
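/** Callable that asks the table integrity handler to repair a single overlap group. */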
2083 static class WorkItemOverlapMerge implements Callable<Void> {
2084 private TableIntegrityErrorHandler handler;
2085 Collection<HbckInfo> overlapgroup;
2086
2087 WorkItemOverlapMerge(Collection<HbckInfo> overlapgroup, TableIntegrityErrorHandler handler) {
2088 this.handler = handler;
2089 this.overlapgroup = overlapgroup;
2090 }
2091
2092 @Override
2093 public Void call() throws Exception {
2094 handler.handleOverlapGroup(overlapgroup);
2095 return null;
2096 }
2097 };
2098
2099
2100
2101
2102
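/**
 * Maintains information about one table: the regions and servers it is deployed on, the table
 * descriptors found for it, and any backwards, sidelined or overlapping regions.
 */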
2103 public class TableInfo {
2104 TableName tableName;
2105 TreeSet <ServerName> deployedOn;
2106
2107
2108 final List<HbckInfo> backwards = new ArrayList<HbckInfo>();
2109
2110
2111 final Map<Path, HbckInfo> sidelinedRegions = new HashMap<Path, HbckInfo>();
2112
2113
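// key-range calculator used to detect holes and overlaps in the region chain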
2114 final RegionSplitCalculator<HbckInfo> sc = new RegionSplitCalculator<HbckInfo>(cmp);
2115
2116
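// set of table descriptors seen for this table; ideally exactly one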
2117 final Set<HTableDescriptor> htds = new HashSet<HTableDescriptor>();
2118
2119
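// key = start key of a problem group, values = the overlapping regions in that group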
2120 final Multimap<byte[], HbckInfo> overlapGroups =
2121 TreeMultimap.create(RegionSplitCalculator.BYTES_COMPARATOR, cmp);
2122
2123 TableInfo(TableName name) {
2124 this.tableName = name;
2125 deployedOn = new TreeSet <ServerName>();
2126 }
2127
2128
2129
2130
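/** @return the table descriptor common to all regions, or null if none or multiple were found */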
2131 private HTableDescriptor getHTD() {
2132 if (htds.size() == 1) {
2133 return (HTableDescriptor)htds.toArray()[0];
2134 } else {
2135 LOG.error("None/Multiple table descriptors found for table '"
2136 + tableName + "' regions: " + htds);
2137 }
2138 return null;
2139 }
2140
2141 public void addRegionInfo(HbckInfo hir) {
2142 if (Bytes.equals(hir.getEndKey(), HConstants.EMPTY_END_ROW)) {
2143
2144 sc.add(hir);
2145 return;
2146 }
2147
2148
2149 if (Bytes.compareTo(hir.getStartKey(), hir.getEndKey()) > 0) {
2150 errors.reportError(
2151 ERROR_CODE.REGION_CYCLE,
2152 String.format("The endkey for this region comes before the "
2153 + "startkey, startkey=%s, endkey=%s",
2154 Bytes.toStringBinary(hir.getStartKey()),
2155 Bytes.toStringBinary(hir.getEndKey())), this, hir);
2156 backwards.add(hir);
2157 return;
2158 }
2159
2160
2161 sc.add(hir);
2162 }
2163
2164 public void addServer(ServerName server) {
2165 this.deployedOn.add(server);
2166 }
2167
2168 public TableName getName() {
2169 return tableName;
2170 }
2171
2172 public int getNumRegions() {
2173 return sc.getStarts().size() + backwards.size();
2174 }
2175
2176 private class IntegrityFixSuggester extends TableIntegrityErrorHandlerImpl {
2177 ErrorReporter errors;
2178
2179 IntegrityFixSuggester(TableInfo ti, ErrorReporter errors) {
2180 this.errors = errors;
2181 setTableInfo(ti);
2182 }
2183
2184 @Override
2185 public void handleRegionStartKeyNotEmpty(HbckInfo hi) throws IOException{
2186 errors.reportError(ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY,
2187 "First region should start with an empty key. You need to "
2188 + " create a new region and regioninfo in HDFS to plug the hole.",
2189 getTableInfo(), hi);
2190 }
2191
2192 @Override
2193 public void handleRegionEndKeyNotEmpty(byte[] curEndKey) throws IOException {
2194 errors.reportError(ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY,
2195 "Last region should end with an empty key. You need to "
2196 + "create a new region and regioninfo in HDFS to plug the hole.", getTableInfo());
2197 }
2198
2199 @Override
2200 public void handleDegenerateRegion(HbckInfo hi) throws IOException{
2201 errors.reportError(ERROR_CODE.DEGENERATE_REGION,
2202 "Region has the same start and end key.", getTableInfo(), hi);
2203 }
2204
2205 @Override
2206 public void handleDuplicateStartKeys(HbckInfo r1, HbckInfo r2) throws IOException{
2207 byte[] key = r1.getStartKey();
2208
2209 errors.reportError(ERROR_CODE.DUPE_STARTKEYS,
2210 "Multiple regions have the same startkey: "
2211 + Bytes.toStringBinary(key), getTableInfo(), r1);
2212 errors.reportError(ERROR_CODE.DUPE_STARTKEYS,
2213 "Multiple regions have the same startkey: "
2214 + Bytes.toStringBinary(key), getTableInfo(), r2);
2215 }
2216
2217 @Override
2218 public void handleOverlapInRegionChain(HbckInfo hi1, HbckInfo hi2) throws IOException{
2219 errors.reportError(ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
2220 "There is an overlap in the region chain.",
2221 getTableInfo(), hi1, hi2);
2222 }
2223
2224 @Override
2225 public void handleHoleInRegionChain(byte[] holeStart, byte[] holeStop) throws IOException{
2226 errors.reportError(
2227 ERROR_CODE.HOLE_IN_REGION_CHAIN,
2228 "There is a hole in the region chain between "
2229 + Bytes.toStringBinary(holeStart) + " and "
2230 + Bytes.toStringBinary(holeStop)
2231 + ". You need to create a new .regioninfo and region "
2232 + "dir in hdfs to plug the hole.");
2233 }
2234 };
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
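/**
 * Handler that, unlike IntegrityFixSuggester, actively repairs integrity errors found in HDFS:
 * it plugs holes by creating new empty regions and merges or sidelines overlapping regions.
 */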
2248 private class HDFSIntegrityFixer extends IntegrityFixSuggester {
2249 Configuration conf;
2250
2251 boolean fixOverlaps = true;
2252
2253 HDFSIntegrityFixer(TableInfo ti, ErrorReporter errors, Configuration conf,
2254 boolean fixHoles, boolean fixOverlaps) {
2255 super(ti, errors);
2256 this.conf = conf;
2257 this.fixOverlaps = fixOverlaps;
2258
2259 }
2260
2261
2262
2263
2264
2265
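// The first region of the table is missing: create a new region covering
// [EMPTY_START_ROW, startKey of the current first region) to plug the hole.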
2266 @Override
2267 public void handleRegionStartKeyNotEmpty(HbckInfo next) throws IOException {
2268 errors.reportError(ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY,
2269 "First region should start with an empty key. Creating a new " +
2270 "region and regioninfo in HDFS to plug the hole.",
2271 getTableInfo(), next);
2272 HTableDescriptor htd = getTableInfo().getHTD();
2273
2274 HRegionInfo newRegion = new HRegionInfo(htd.getTableName(),
2275 HConstants.EMPTY_START_ROW, next.getStartKey());
2276
2277
2278 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2279 LOG.info("Table region start key was not empty. Created new empty region: "
2280 + newRegion + " " + region);
2281 fixes++;
2282 }
2283
2284 @Override
2285 public void handleRegionEndKeyNotEmpty(byte[] curEndKey) throws IOException {
2286 errors.reportError(ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY,
2287 "Last region should end with an empty key. Creating a new "
2288 + "region and regioninfo in HDFS to plug the hole.", getTableInfo());
2289 HTableDescriptor htd = getTableInfo().getHTD();
2290
2291 HRegionInfo newRegion = new HRegionInfo(htd.getTableName(), curEndKey,
2292 HConstants.EMPTY_START_ROW);
2293
2294 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2295 LOG.info("Table region end key was not empty. Created new empty region: " + newRegion
2296 + " " + region);
2297 fixes++;
2298 }
2299
2300
2301
2302
2303
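// A hole between two existing regions: plug it with a new empty region spanning the gap.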
2304 @Override
2305 public void handleHoleInRegionChain(byte[] holeStartKey, byte[] holeStopKey) throws IOException {
2306 errors.reportError(
2307 ERROR_CODE.HOLE_IN_REGION_CHAIN,
2308 "There is a hole in the region chain between "
2309 + Bytes.toStringBinary(holeStartKey) + " and "
2310 + Bytes.toStringBinary(holeStopKey)
2311 + ". Creating a new regioninfo and region "
2312 + "dir in hdfs to plug the hole.");
2313 HTableDescriptor htd = getTableInfo().getHTD();
2314 HRegionInfo newRegion = new HRegionInfo(htd.getTableName(), holeStartKey, holeStopKey);
2315 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2316 LOG.info("Plugged hold by creating new empty region: "+ newRegion + " " +region);
2317 fixes++;
2318 }
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
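// Repairs a group of overlapping regions, either by merging them into a single
// region or, if the group is too large, by sidelining the biggest ones.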
2331 @Override
2332 public void handleOverlapGroup(Collection<HbckInfo> overlap)
2333 throws IOException {
2334 Preconditions.checkNotNull(overlap);
2335 Preconditions.checkArgument(overlap.size() > 0);
2336
2337 if (!this.fixOverlaps) {
2338 LOG.warn("Not attempting to repair overlaps.");
2339 return;
2340 }
2341
2342 if (overlap.size() > maxMerge) {
2343 LOG.warn("Overlap group has " + overlap.size() + " overlapping " +
2344 "regions which is greater than " + maxMerge + ", the max number of regions to merge");
2345 if (sidelineBigOverlaps) {
2346
2347 sidelineBigOverlaps(overlap);
2348 }
2349 return;
2350 }
2351
2352 mergeOverlaps(overlap);
2353 }
2354
2355 void mergeOverlaps(Collection<HbckInfo> overlap)
2356 throws IOException {
2357 String thread = Thread.currentThread().getName();
2358 LOG.info("== [" + thread + "] Merging regions into one region: "
2359 + Joiner.on(",").join(overlap));
2360
2361 Pair<byte[], byte[]> range = null;
2362 for (HbckInfo hi : overlap) {
2363 if (range == null) {
2364 range = new Pair<byte[], byte[]>(hi.getStartKey(), hi.getEndKey());
2365 } else {
2366 if (RegionSplitCalculator.BYTES_COMPARATOR
2367 .compare(hi.getStartKey(), range.getFirst()) < 0) {
2368 range.setFirst(hi.getStartKey());
2369 }
2370 if (RegionSplitCalculator.BYTES_COMPARATOR
2371 .compare(hi.getEndKey(), range.getSecond()) > 0) {
2372 range.setSecond(hi.getEndKey());
2373 }
2374 }
2375
2376 LOG.debug("[" + thread + "] Closing region before moving data around: " + hi);
2377 LOG.debug("[" + thread + "] Contained region dir before close");
2378 debugLsr(hi.getHdfsRegionDir());
2379 try {
2380 LOG.info("[" + thread + "] Closing region: " + hi);
2381 closeRegion(hi);
2382 } catch (IOException ioe) {
2383 LOG.warn("[" + thread + "] Was unable to close region " + hi
2384 + ". Just continuing... ", ioe);
2385 } catch (InterruptedException e) {
2386 LOG.warn("[" + thread + "] Was unable to close region " + hi
2387 + ". Just continuing... ", e);
2388 }
2389
2390 try {
2391 LOG.info("[" + thread + "] Offlining region: " + hi);
2392 offline(hi.getRegionName());
2393 } catch (IOException ioe) {
2394 LOG.warn("[" + thread + "] Unable to offline region from master: " + hi
2395 + ". Just continuing... ", ioe);
2396 }
2397 }
2398
2399
2400 HTableDescriptor htd = getTableInfo().getHTD();
2401
2402 HRegionInfo newRegion = new HRegionInfo(htd.getTableName(), range.getFirst(),
2403 range.getSecond());
2404 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2405 LOG.info("[" + thread + "] Created new empty container region: " +
2406 newRegion + " to contain regions: " + Joiner.on(",").join(overlap));
2407 debugLsr(region.getRegionFileSystem().getRegionDir());
2408
2409
2410 boolean didFix = false;
2411 Path target = region.getRegionFileSystem().getRegionDir();
2412 for (HbckInfo contained : overlap) {
2413 LOG.info("[" + thread + "] Merging " + contained + " into " + target );
2414 int merges = mergeRegionDirs(target, contained);
2415 if (merges > 0) {
2416 didFix = true;
2417 }
2418 }
2419 if (didFix) {
2420 fixes++;
2421 }
2422 }
2423
2424
2425
2426
2427
2428
2429
2430
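/**
 * Sidelines the largest regions of an overlap group so that at most maxMerge regions remain
 * to be merged. Sidelined region dirs can be bulk loaded back later.
 */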
2431 void sidelineBigOverlaps(
2432 Collection<HbckInfo> bigOverlap) throws IOException {
2433 int overlapsToSideline = bigOverlap.size() - maxMerge;
2434 if (overlapsToSideline > maxOverlapsToSideline) {
2435 overlapsToSideline = maxOverlapsToSideline;
2436 }
2437 List<HbckInfo> regionsToSideline =
2438 RegionSplitCalculator.findBigRanges(bigOverlap, overlapsToSideline);
2439 FileSystem fs = FileSystem.get(conf);
2440 for (HbckInfo regionToSideline: regionsToSideline) {
2441 try {
2442 LOG.info("Closing region: " + regionToSideline);
2443 closeRegion(regionToSideline);
2444 } catch (IOException ioe) {
2445 LOG.warn("Was unable to close region " + regionToSideline
2446 + ". Just continuing... ", ioe);
2447 } catch (InterruptedException e) {
2448 LOG.warn("Was unable to close region " + regionToSideline
2449 + ". Just continuing... ", e);
2450 }
2451
2452 try {
2453 LOG.info("Offlining region: " + regionToSideline);
2454 offline(regionToSideline.getRegionName());
2455 } catch (IOException ioe) {
2456 LOG.warn("Unable to offline region from master: " + regionToSideline
2457 + ". Just continuing... ", ioe);
2458 }
2459
2460 LOG.info("Before sideline big overlapped region: " + regionToSideline.toString());
2461 Path sidelineRegionDir = sidelineRegionDir(fs, TO_BE_LOADED, regionToSideline);
2462 if (sidelineRegionDir != null) {
2463 sidelinedRegions.put(sidelineRegionDir, regionToSideline);
2464 LOG.info("After sidelined big overlapped region: "
2465 + regionToSideline.getRegionNameAsString()
2466 + " to " + sidelineRegionDir.toString());
2467 fixes++;
2468 }
2469 }
2470 }
2471 }
2472
2473
2474
2475
2476
2477
2478
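/**
 * Checks the region chain of this table for holes, overlaps, degenerate and duplicate regions,
 * invoking the given handler for each problem found.
 * @return false if any new errors were reported
 */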
2479 public boolean checkRegionChain(TableIntegrityErrorHandler handler) throws IOException {
2480
2481
2482
2483 if (disabledTables.contains(this.tableName)) {
2484 return true;
2485 }
2486 int originalErrorsCount = errors.getErrorList().size();
2487 Multimap<byte[], HbckInfo> regions = sc.calcCoverage();
2488 SortedSet<byte[]> splits = sc.getSplits();
2489
2490 byte[] prevKey = null;
2491 byte[] problemKey = null;
2492
2493 if (splits.size() == 0) {
2494
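// no regions at all for this table: the whole key space is one big hole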
2495 handler.handleHoleInRegionChain(HConstants.EMPTY_START_ROW, HConstants.EMPTY_END_ROW);
2496 }
2497
2498 for (byte[] key : splits) {
2499 Collection<HbckInfo> ranges = regions.get(key);
2500 if (prevKey == null && !Bytes.equals(key, HConstants.EMPTY_BYTE_ARRAY)) {
2501 for (HbckInfo rng : ranges) {
2502 handler.handleRegionStartKeyNotEmpty(rng);
2503 }
2504 }
2505
2506
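// check for degenerate ranges where the start key equals the end key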
2507 for (HbckInfo rng : ranges) {
2508
2509 byte[] endKey = rng.getEndKey();
2510 endKey = (endKey.length == 0) ? null : endKey;
2511 if (Bytes.equals(rng.getStartKey(),endKey)) {
2512 handler.handleDegenerateRegion(rng);
2513 }
2514 }
2515
2516 if (ranges.size() == 1) {
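// exactly one region covers this key: no problem here, close any open problem group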
2517
2518 if (problemKey != null) {
2519 LOG.warn("reached end of problem group: " + Bytes.toStringBinary(key));
2520 }
2521 problemKey = null;
2522 } else if (ranges.size() > 1) {
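// several regions cover this key: they overlap, so record them as a problem group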
2523
2524
2525 if (problemKey == null) {
2526
2527 LOG.warn("Naming new problem group: " + Bytes.toStringBinary(key));
2528 problemKey = key;
2529 }
2530 overlapGroups.putAll(problemKey, ranges);
2531
2532
2533 ArrayList<HbckInfo> subRange = new ArrayList<HbckInfo>(ranges);
2534
2535 for (HbckInfo r1 : ranges) {
2536 subRange.remove(r1);
2537 for (HbckInfo r2 : subRange) {
2538 if (Bytes.compareTo(r1.getStartKey(), r2.getStartKey())==0) {
2539 handler.handleDuplicateStartKeys(r1,r2);
2540 } else {
2541
2542 handler.handleOverlapInRegionChain(r1, r2);
2543 }
2544 }
2545 }
2546
2547 } else if (ranges.size() == 0) {
2548 if (problemKey != null) {
2549 LOG.warn("reached end of problem group: " + Bytes.toStringBinary(key));
2550 }
2551 problemKey = null;
2552
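// nothing covers the span from this key to the next split point: that is a hole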
2553 byte[] holeStopKey = sc.getSplits().higher(key);
2554
2555 if (holeStopKey != null) {
2556
2557 handler.handleHoleInRegionChain(key, holeStopKey);
2558 }
2559 }
2560 prevKey = key;
2561 }
2562
2563
2564
2565 if (prevKey != null) {
2566 handler.handleRegionEndKeyNotEmpty(prevKey);
2567 }
2568
2569
2570 if (getConf().getBoolean("hbasefsck.overlap.merge.parallel", true)) {
2571 LOG.info("Handling overlap merges in parallel. set hbasefsck.overlap.merge.parallel to" +
2572 " false to run serially.");
2573 boolean ok = handleOverlapsParallel(handler, prevKey);
2574 if (!ok) {
2575 return false;
2576 }
2577 } else {
2578 LOG.info("Handling overlap merges serially. set hbasefsck.overlap.merge.parallel to" +
2579 " true to run in parallel.");
2580 for (Collection<HbckInfo> overlap : overlapGroups.asMap().values()) {
2581 handler.handleOverlapGroup(overlap);
2582 }
2583 }
2584
2585 if (details) {
2586
2587 errors.print("---- Table '" + this.tableName
2588 + "': region split map");
2589 dump(splits, regions);
2590 errors.print("---- Table '" + this.tableName
2591 + "': overlap groups");
2592 dumpOverlapProblems(overlapGroups);
2593 errors.print("There are " + overlapGroups.keySet().size()
2594 + " overlap groups with " + overlapGroups.size()
2595 + " overlapping regions");
2596 }
2597 if (!sidelinedRegions.isEmpty()) {
2598 LOG.warn("Sidelined big overlapped regions, please bulk load them!");
2599 errors.print("---- Table '" + this.tableName
2600 + "': sidelined big overlapped regions");
2601 dumpSidelinedRegions(sidelinedRegions);
2602 }
2603 return errors.getErrorList().size() == originalErrorsCount;
2604 }
2605
2606 private boolean handleOverlapsParallel(TableIntegrityErrorHandler handler, byte[] prevKey)
2607 throws IOException {
2608
2609
2610 List<WorkItemOverlapMerge> merges = new ArrayList<WorkItemOverlapMerge>(overlapGroups.size());
2611 List<Future<Void>> rets;
2612 for (Collection<HbckInfo> overlap : overlapGroups.asMap().values()) {
2613
2614 merges.add(new WorkItemOverlapMerge(overlap, handler));
2615 }
2616 try {
2617 rets = executor.invokeAll(merges);
2618 } catch (InterruptedException e) {
2619 e.printStackTrace();
2620 LOG.error("Overlap merges were interrupted", e);
2621 return false;
2622 }
2623 for(int i=0; i<merges.size(); i++) {
2624 WorkItemOverlapMerge work = merges.get(i);
2625 Future<Void> f = rets.get(i);
2626 try {
2627 f.get();
2628 } catch(ExecutionException e) {
2629 LOG.warn("Failed to merge overlap group" + work, e.getCause());
2630 } catch (InterruptedException e) {
2631 LOG.error("Waiting for overlap merges was interrupted", e);
2632 return false;
2633 }
2634 }
2635 return true;
2636 }
2637
2638
2639
2640
2641
2642
2643
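/** Dumps the split points and the regions covering each of them, for visual debugging. */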
2644 void dump(SortedSet<byte[]> splits, Multimap<byte[], HbckInfo> regions) {
2645
2646 StringBuilder sb = new StringBuilder();
2647 for (byte[] k : splits) {
2648 sb.setLength(0);
2649 sb.append(Bytes.toStringBinary(k) + ":\t");
2650 for (HbckInfo r : regions.get(k)) {
2651 sb.append("[ "+ r.toString() + ", "
2652 + Bytes.toStringBinary(r.getEndKey())+ "]\t");
2653 }
2654 errors.print(sb.toString());
2655 }
2656 }
2657 }
2658
2659 public void dumpOverlapProblems(Multimap<byte[], HbckInfo> regions) {
2660
2661
2662 for (byte[] k : regions.keySet()) {
2663 errors.print(Bytes.toStringBinary(k) + ":");
2664 for (HbckInfo r : regions.get(k)) {
2665 errors.print("[ " + r.toString() + ", "
2666 + Bytes.toStringBinary(r.getEndKey()) + "]");
2667 }
2668 errors.print("----");
2669 }
2670 }
2671
2672 public void dumpSidelinedRegions(Map<Path, HbckInfo> regions) {
2673 for (Map.Entry<Path, HbckInfo> entry: regions.entrySet()) {
2674 TableName tableName = entry.getValue().getTableName();
2675 Path path = entry.getKey();
2676 errors.print("This sidelined region dir should be bulk loaded: "
2677 + path.toString());
2678 errors.print("Bulk load command looks like: "
2679 + "hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles "
2680 + path.toUri().getPath() + " "+ tableName);
2681 }
2682 }
2683
2684 public Multimap<byte[], HbckInfo> getOverlapGroups(
2685 TableName table) {
2686 TableInfo ti = tablesInfo.get(table);
2687 return ti.overlapGroups;
2688 }
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
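/**
 * Returns descriptors of user-space tables whose first region's hbase:meta entry has not been
 * modified within the configured timelag; tables modified more recently are counted in
 * numSkipped.
 */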
2699 HTableDescriptor[] getTables(AtomicInteger numSkipped) {
2700 List<TableName> tableNames = new ArrayList<TableName>();
2701 long now = System.currentTimeMillis();
2702
2703 for (HbckInfo hbi : regionInfoMap.values()) {
2704 MetaEntry info = hbi.metaEntry;
2705
2706
2707
2708 if (info != null && info.getStartKey().length == 0 && !info.isMetaRegion()) {
2709 if (info.modTime + timelag < now) {
2710 tableNames.add(info.getTable());
2711 } else {
2712 numSkipped.incrementAndGet();
2713 }
2714 }
2715 }
2716 return getHTableDescriptors(tableNames);
2717 }
2718
2719 HTableDescriptor[] getHTableDescriptors(List<TableName> tableNames) {
2720 HTableDescriptor[] htd = new HTableDescriptor[0];
2721 try {
2722 LOG.info("getHTableDescriptors == tableNames => " + tableNames);
2723 htd = new HBaseAdmin(getConf()).getTableDescriptorsByTableName(tableNames);
2724 } catch (IOException e) {
2725 LOG.debug("Exception getting table descriptors", e);
2726 }
2727 return htd;
2728 }
2729
2730
2731
2732
2733
2734
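/** Gets the HbckInfo for the given encoded region name, creating and registering it if absent. */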
2735 private synchronized HbckInfo getOrCreateInfo(String name) {
2736 HbckInfo hbi = regionInfoMap.get(name);
2737 if (hbi == null) {
2738 hbi = new HbckInfo(null);
2739 regionInfoMap.put(name, hbi);
2740 }
2741 return hbi;
2742 }
2743
2744 private void checkAndFixTableLocks() throws IOException {
2745 TableLockChecker checker = new TableLockChecker(createZooKeeperWatcher(), errors);
2746 checker.checkTableLocks();
2747
2748 if (this.fixTableLocks) {
2749 checker.fixExpiredTableLocks();
2750 }
2751 }
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
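/**
 * Checks that hbase:meta is assigned to exactly one region server; otherwise reports the
 * problem and, when fixing assignments, reassigns or re-deploys it.
 * @return true if hbase:meta is deployed on exactly one server
 */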
2762 boolean checkMetaRegion() throws IOException, KeeperException, InterruptedException {
2763 List<HbckInfo> metaRegions = Lists.newArrayList();
2764 for (HbckInfo value : regionInfoMap.values()) {
2765 if (value.metaEntry != null && value.metaEntry.isMetaRegion()) {
2766 metaRegions.add(value);
2767 }
2768 }
2769
2770
2771
2772 List<ServerName> servers = new ArrayList<ServerName>();
2773 HbckInfo metaHbckInfo = null;
2774 if (!metaRegions.isEmpty()) {
2775 metaHbckInfo = metaRegions.get(0);
2776 servers = metaHbckInfo.deployedOn;
2777 }
2778 if (servers.size() != 1) {
2779 if (servers.size() == 0) {
2780 errors.reportError(ERROR_CODE.NO_META_REGION, "hbase:meta is not found on any region server.");
2781 if (shouldFixAssignments()) {
2782 errors.print("Trying to fix a problem with hbase:meta..");
2783 setShouldRerun();
2784
2785 HBaseFsckRepair.fixUnassigned(admin, HRegionInfo.FIRST_META_REGIONINFO);
2786 HBaseFsckRepair.waitUntilAssigned(admin, HRegionInfo.FIRST_META_REGIONINFO);
2787 }
2788 } else if (servers.size() > 1) {
2789 errors.reportError(ERROR_CODE.MULTI_META_REGION,
2790 "hbase:meta is deployed on more than one region server.");
2791 if (shouldFixAssignments()) {
2792 errors.print("Trying to fix a problem with hbase:meta..");
2793 setShouldRerun();
2794
2795 HBaseFsckRepair.fixMultiAssignment(admin, metaHbckInfo.metaEntry, servers);
2796 }
2797 }
2798
2799 return false;
2800 }
2801
2802 return true;
2803 }
2804
2805
2806
2807
2808
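/** Scans hbase:meta and populates regionInfoMap with a MetaEntry for every region found. */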
2809 boolean loadMetaEntries() throws IOException {
2810 MetaScannerVisitor visitor = new MetaScannerVisitorBase() {
2811 int countRecord = 1;
2812
2813
2814 final Comparator<Cell> comp = new Comparator<Cell>() {
2815 @Override
2816 public int compare(Cell k1, Cell k2) {
2817 return (int)(k1.getTimestamp() - k2.getTimestamp());
2818 }
2819 };
2820
2821 @Override
2822 public boolean processRow(Result result) throws IOException {
2823 try {
2824
2825
2826 long ts = Collections.max(result.listCells(), comp).getTimestamp();
2827 Pair<HRegionInfo, ServerName> pair = HRegionInfo.getHRegionInfoAndServerName(result);
2828 if (pair == null || pair.getFirst() == null) {
2829 emptyRegionInfoQualifiers.add(result);
2830 errors.reportError(ERROR_CODE.EMPTY_META_CELL,
2831 "Empty REGIONINFO_QUALIFIER found in hbase:meta");
2832 return true;
2833 }
2834 ServerName sn = null;
2835 if (pair.getSecond() != null) {
2836 sn = pair.getSecond();
2837 }
2838 HRegionInfo hri = pair.getFirst();
2839 if (!(isTableIncluded(hri.getTable())
2840 || hri.isMetaRegion())) {
2841 return true;
2842 }
2843 PairOfSameType<HRegionInfo> daughters = HRegionInfo.getDaughterRegions(result);
2844 MetaEntry m = new MetaEntry(hri, sn, ts, daughters.getFirst(), daughters.getSecond());
2845 HbckInfo previous = regionInfoMap.get(hri.getEncodedName());
2846 if (previous == null) {
2847 regionInfoMap.put(hri.getEncodedName(), new HbckInfo(m));
2848 } else if (previous.metaEntry == null) {
2849 previous.metaEntry = m;
2850 } else {
2851 throw new IOException("Two entries in hbase:meta are the same: " + previous);
2852 }
2853
2854 PairOfSameType<HRegionInfo> mergeRegions = HRegionInfo.getMergeRegions(result);
2855 for (HRegionInfo mergeRegion : new HRegionInfo[] {
2856 mergeRegions.getFirst(), mergeRegions.getSecond() }) {
2857 if (mergeRegion != null) {
2858
2859 HbckInfo hbInfo = getOrCreateInfo(mergeRegion.getEncodedName());
2860 hbInfo.setMerged(true);
2861 }
2862 }
2863
2864
2865 if (countRecord % 100 == 0) {
2866 errors.progress();
2867 }
2868 countRecord++;
2869 return true;
2870 } catch (RuntimeException e) {
2871 LOG.error("Result=" + result);
2872 throw e;
2873 }
2874 }
2875 };
2876 if (!checkMetaOnly) {
2877
2878 MetaScanner.metaScan(getConf(), visitor);
2879 }
2880
2881 errors.print("");
2882 return true;
2883 }
2884
2885
2886
2887
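/** Stores a region's entry as read from hbase:meta, plus the server and split daughters. */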
2888 static class MetaEntry extends HRegionInfo {
2889 ServerName regionServer;
2890 long modTime;
2891 HRegionInfo splitA, splitB;
2892
2893 public MetaEntry(HRegionInfo rinfo, ServerName regionServer, long modTime) {
2894 this(rinfo, regionServer, modTime, null, null);
2895 }
2896
2897 public MetaEntry(HRegionInfo rinfo, ServerName regionServer, long modTime,
2898 HRegionInfo splitA, HRegionInfo splitB) {
2899 super(rinfo);
2900 this.regionServer = regionServer;
2901 this.modTime = modTime;
2902 this.splitA = splitA;
2903 this.splitB = splitB;
2904 }
2905
2906 @Override
2907 public boolean equals(Object o) {
2908 boolean superEq = super.equals(o);
2909 if (!superEq) {
2910 return superEq;
2911 }
2912
2913 MetaEntry me = (MetaEntry) o;
2914 if (!regionServer.equals(me.regionServer)) {
2915 return false;
2916 }
2917 return (modTime == me.modTime);
2918 }
2919
2920 @Override
2921 public int hashCode() {
2922 int hash = Arrays.hashCode(getRegionName());
2923 hash ^= getRegionId();
2924 hash ^= Arrays.hashCode(getStartKey());
2925 hash ^= Arrays.hashCode(getEndKey());
2926 hash ^= Boolean.valueOf(isOffline()).hashCode();
2927 hash ^= getTable().hashCode();
2928 if (regionServer != null) {
2929 hash ^= regionServer.hashCode();
2930 }
2931 hash ^= modTime;
2932 return hash;
2933 }
2934 }
2935
2936
2937
2938
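/** Stores what was found for a region on HDFS. */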
2939 static class HdfsEntry {
2940 HRegionInfo hri;
2941 Path hdfsRegionDir = null;
2942 long hdfsRegionDirModTime = 0;
2943 boolean hdfsRegioninfoFilePresent = false;
2944 boolean hdfsOnlyEdits = false;
2945 }
2946
2947
2948
2949
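/** Stores a region deployment as reported by an online region server. */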
2950 static class OnlineEntry {
2951 HRegionInfo hri;
2952 ServerName hsa;
2953
2954 @Override
2955 public String toString() {
2956 return hsa.toString() + ";" + hri.getRegionNameAsString();
2957 }
2958 }
2959
2960
2961
2962
2963
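/**
 * Maintains everything known about one region, gathered from three sources: hbase:meta,
 * HDFS, and the region servers it is deployed on.
 */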
2964 public static class HbckInfo implements KeyRange {
2965 private MetaEntry metaEntry = null;
2966 private HdfsEntry hdfsEntry = null;
2967 private List<OnlineEntry> deployedEntries = Lists.newArrayList();
2968 private List<ServerName> deployedOn = Lists.newArrayList();
2969 private boolean skipChecks = false;
2970 private boolean isMerged = false;
2971
2972 HbckInfo(MetaEntry metaEntry) {
2973 this.metaEntry = metaEntry;
2974 }
2975
2976 public synchronized void addServer(HRegionInfo hri, ServerName server) {
2977 OnlineEntry rse = new OnlineEntry() ;
2978 rse.hri = hri;
2979 rse.hsa = server;
2980 this.deployedEntries.add(rse);
2981 this.deployedOn.add(server);
2982 }
2983
2984 @Override
2985 public synchronized String toString() {
2986 StringBuilder sb = new StringBuilder();
2987 sb.append("{ meta => ");
2988 sb.append((metaEntry != null)? metaEntry.getRegionNameAsString() : "null");
2989 sb.append( ", hdfs => " + getHdfsRegionDir());
2990 sb.append( ", deployed => " + Joiner.on(", ").join(deployedEntries));
2991 sb.append(" }");
2992 return sb.toString();
2993 }
2994
2995 @Override
2996 public byte[] getStartKey() {
2997 if (this.metaEntry != null) {
2998 return this.metaEntry.getStartKey();
2999 } else if (this.hdfsEntry != null) {
3000 return this.hdfsEntry.hri.getStartKey();
3001 } else {
3002 LOG.error("Entry " + this + " has no meta or hdfs region start key.");
3003 return null;
3004 }
3005 }
3006
3007 @Override
3008 public byte[] getEndKey() {
3009 if (this.metaEntry != null) {
3010 return this.metaEntry.getEndKey();
3011 } else if (this.hdfsEntry != null) {
3012 return this.hdfsEntry.hri.getEndKey();
3013 } else {
3014 LOG.error("Entry " + this + " has no meta or hdfs region start key.");
3015 return null;
3016 }
3017 }
3018
3019 public TableName getTableName() {
3020 if (this.metaEntry != null) {
3021 return this.metaEntry.getTable();
3022 } else if (this.hdfsEntry != null) {
3023
3024
3025 Path tableDir = this.hdfsEntry.hdfsRegionDir.getParent();
3026 return FSUtils.getTableName(tableDir);
3027 } else {
3028
3029
3030 return null;
3031 }
3032 }
3033
3034 public String getRegionNameAsString() {
3035 if (metaEntry != null) {
3036 return metaEntry.getRegionNameAsString();
3037 } else if (hdfsEntry != null) {
3038 if (hdfsEntry.hri != null) {
3039 return hdfsEntry.hri.getRegionNameAsString();
3040 }
3041 }
3042 return null;
3043 }
3044
3045 public byte[] getRegionName() {
3046 if (metaEntry != null) {
3047 return metaEntry.getRegionName();
3048 } else if (hdfsEntry != null) {
3049 return hdfsEntry.hri.getRegionName();
3050 } else {
3051 return null;
3052 }
3053 }
3054
3055 Path getHdfsRegionDir() {
3056 if (hdfsEntry == null) {
3057 return null;
3058 }
3059 return hdfsEntry.hdfsRegionDir;
3060 }
3061
3062 boolean containsOnlyHdfsEdits() {
3063 if (hdfsEntry == null) {
3064 return false;
3065 }
3066 return hdfsEntry.hdfsOnlyEdits;
3067 }
3068
3069 boolean isHdfsRegioninfoPresent() {
3070 if (hdfsEntry == null) {
3071 return false;
3072 }
3073 return hdfsEntry.hdfsRegioninfoFilePresent;
3074 }
3075
3076 long getModTime() {
3077 if (hdfsEntry == null) {
3078 return 0;
3079 }
3080 return hdfsEntry.hdfsRegionDirModTime;
3081 }
3082
3083 HRegionInfo getHdfsHRI() {
3084 if (hdfsEntry == null) {
3085 return null;
3086 }
3087 return hdfsEntry.hri;
3088 }
3089
3090 public void setSkipChecks(boolean skipChecks) {
3091 this.skipChecks = skipChecks;
3092 }
3093
3094 public boolean isSkipChecks() {
3095 return skipChecks;
3096 }
3097
3098 public void setMerged(boolean isMerged) {
3099 this.isMerged = isMerged;
3100 }
3101
3102 public boolean isMerged() {
3103 return this.isMerged;
3104 }
3105 }
3106
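/** Orders HbckInfo entries by table, then start key, then end key, then HDFS region id. */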
3107 final static Comparator<HbckInfo> cmp = new Comparator<HbckInfo>() {
3108 @Override
3109 public int compare(HbckInfo l, HbckInfo r) {
3110 if (l == r) {
3111
3112 return 0;
3113 }
3114
3115 int tableCompare = l.getTableName().compareTo(r.getTableName());
3116 if (tableCompare != 0) {
3117 return tableCompare;
3118 }
3119
3120 int startComparison = RegionSplitCalculator.BYTES_COMPARATOR.compare(
3121 l.getStartKey(), r.getStartKey());
3122 if (startComparison != 0) {
3123 return startComparison;
3124 }
3125
3126
3127 byte[] endKey = r.getEndKey();
3128 endKey = (endKey.length == 0) ? null : endKey;
3129 byte[] endKey2 = l.getEndKey();
3130 endKey2 = (endKey2.length == 0) ? null : endKey2;
3131 int endComparison = RegionSplitCalculator.BYTES_COMPARATOR.compare(
3132 endKey2, endKey);
3133
3134 if (endComparison != 0) {
3135 return endComparison;
3136 }
3137
3138
3139
3140 if (l.hdfsEntry == null && r.hdfsEntry == null) {
3141 return 0;
3142 }
3143 if (l.hdfsEntry == null && r.hdfsEntry != null) {
3144 return 1;
3145 }
3146
3147 if (r.hdfsEntry == null) {
3148 return -1;
3149 }
3150
3151 long leftRegionId = l.hdfsEntry.hri.getRegionId();
long rightRegionId = r.hdfsEntry.hri.getRegionId();
// region ids are timestamps; compare directly instead of casting their difference to int
return leftRegionId < rightRegionId ? -1 : (leftRegionId > rightRegionId ? 1 : 0);
3152 }
3153 };
3154
3155
3156
3157
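/** Prints a per-table summary: consistency, number of regions, and deployment servers. */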
3158 private void printTableSummary(SortedMap<TableName, TableInfo> tablesInfo) {
3159 StringBuilder sb = new StringBuilder();
3160 errors.print("Summary:");
3161 for (TableInfo tInfo : tablesInfo.values()) {
3162 if (errors.tableHasErrors(tInfo)) {
3163 errors.print("Table " + tInfo.getName() + " is inconsistent.");
3164 } else {
3165 errors.print(" " + tInfo.getName() + " is okay.");
3166 }
3167 errors.print(" Number of regions: " + tInfo.getNumRegions());
3168 sb.setLength(0);
3169 sb.append(" Deployed on: ");
3170 for (ServerName server : tInfo.deployedOn) {
3171 sb.append(" " + server.toString());
3172 }
3173 errors.print(sb.toString());
3174 }
3175 }
3176
3177 static ErrorReporter getErrorReporter(
3178 final Configuration conf) throws ClassNotFoundException {
3179 Class<? extends ErrorReporter> reporter = conf.getClass("hbasefsck.errorreporter", PrintingErrorReporter.class, ErrorReporter.class);
3180 return (ErrorReporter)ReflectionUtils.newInstance(reporter, conf);
3181 }
3182
3183 public interface ErrorReporter {
3184 enum ERROR_CODE {
3185 UNKNOWN, NO_META_REGION, NULL_META_REGION, NO_VERSION_FILE, NOT_IN_META_HDFS, NOT_IN_META,
3186 NOT_IN_META_OR_DEPLOYED, NOT_IN_HDFS_OR_DEPLOYED, NOT_IN_HDFS, SERVER_DOES_NOT_MATCH_META, NOT_DEPLOYED,
3187 MULTI_DEPLOYED, SHOULD_NOT_BE_DEPLOYED, MULTI_META_REGION, RS_CONNECT_FAILURE,
3188 FIRST_REGION_STARTKEY_NOT_EMPTY, LAST_REGION_ENDKEY_NOT_EMPTY, DUPE_STARTKEYS,
3189 HOLE_IN_REGION_CHAIN, OVERLAP_IN_REGION_CHAIN, REGION_CYCLE, DEGENERATE_REGION,
3190 ORPHAN_HDFS_REGION, LINGERING_SPLIT_PARENT, NO_TABLEINFO_FILE, LINGERING_REFERENCE_HFILE,
3191 WRONG_USAGE, EMPTY_META_CELL, EXPIRED_TABLE_LOCK, BOUNDARIES_ERROR
3192 }
3193 void clear();
3194 void report(String message);
3195 void reportError(String message);
3196 void reportError(ERROR_CODE errorCode, String message);
3197 void reportError(ERROR_CODE errorCode, String message, TableInfo table);
3198 void reportError(ERROR_CODE errorCode, String message, TableInfo table, HbckInfo info);
3199 void reportError(
3200 ERROR_CODE errorCode,
3201 String message,
3202 TableInfo table,
3203 HbckInfo info1,
3204 HbckInfo info2
3205 );
3206 int summarize();
3207 void detail(String details);
3208 ArrayList<ERROR_CODE> getErrorList();
3209 void progress();
3210 void print(String message);
3211 void resetErrors();
3212 boolean tableHasErrors(TableInfo table);
3213 }
3214
3215 static class PrintingErrorReporter implements ErrorReporter {
3216 public int errorCount = 0;
3217 private int showProgress;
3218
3219 Set<TableInfo> errorTables = new HashSet<TableInfo>();
3220
3221
3222 private ArrayList<ERROR_CODE> errorList = new ArrayList<ERROR_CODE>();
3223
3224 @Override
3225 public void clear() {
3226 errorTables.clear();
3227 errorList.clear();
3228 errorCount = 0;
3229 }
3230
3231 @Override
3232 public synchronized void reportError(ERROR_CODE errorCode, String message) {
3233 if (errorCode == ERROR_CODE.WRONG_USAGE) {
3234 System.err.println(message);
3235 return;
3236 }
3237
3238 errorList.add(errorCode);
3239 if (!summary) {
3240 System.out.println("ERROR: " + message);
3241 }
3242 errorCount++;
3243 showProgress = 0;
3244 }
3245
3246 @Override
3247 public synchronized void reportError(ERROR_CODE errorCode, String message, TableInfo table) {
3248 errorTables.add(table);
3249 reportError(errorCode, message);
3250 }
3251
3252 @Override
3253 public synchronized void reportError(ERROR_CODE errorCode, String message, TableInfo table,
3254 HbckInfo info) {
3255 errorTables.add(table);
3256 String reference = "(region " + info.getRegionNameAsString() + ")";
3257 reportError(errorCode, reference + " " + message);
3258 }
3259
3260 @Override
3261 public synchronized void reportError(ERROR_CODE errorCode, String message, TableInfo table,
3262 HbckInfo info1, HbckInfo info2) {
3263 errorTables.add(table);
3264 String reference = "(regions " + info1.getRegionNameAsString()
3265 + " and " + info2.getRegionNameAsString() + ")";
3266 reportError(errorCode, reference + " " + message);
3267 }
3268
3269 @Override
3270 public synchronized void reportError(String message) {
3271 reportError(ERROR_CODE.UNKNOWN, message);
3272 }
3273
3274
3275
3276
3277
3278
3279 @Override
3280 public synchronized void report(String message) {
3281 if (! summary) {
3282 System.out.println("ERROR: " + message);
3283 }
3284 showProgress = 0;
3285 }
3286
3287 @Override
3288 public synchronized int summarize() {
3289 System.out.println(Integer.toString(errorCount) +
3290 " inconsistencies detected.");
3291 if (errorCount == 0) {
3292 System.out.println("Status: OK");
3293 return 0;
3294 } else {
3295 System.out.println("Status: INCONSISTENT");
3296 return -1;
3297 }
3298 }
3299
3300 @Override
3301 public ArrayList<ERROR_CODE> getErrorList() {
3302 return errorList;
3303 }
3304
3305 @Override
3306 public synchronized void print(String message) {
3307 if (!summary) {
3308 System.out.println(message);
3309 }
3310 }
3311
3312 @Override
3313 public boolean tableHasErrors(TableInfo table) {
3314 return errorTables.contains(table);
3315 }
3316
3317 @Override
3318 public void resetErrors() {
3319 errorCount = 0;
3320 }
3321
3322 @Override
3323 public synchronized void detail(String message) {
3324 if (details) {
3325 System.out.println(message);
3326 }
3327 showProgress = 0;
3328 }
3329
3330 @Override
3331 public synchronized void progress() {
3332 if (showProgress++ == 10) {
3333 if (!summary) {
3334 System.out.print(".");
3335 }
3336 showProgress = 0;
3337 }
3338 }
3339 }
3340
3341
3342
3343
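/** Contacts a region server and records every region it is currently serving. */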
3344 static class WorkItemRegion implements Callable<Void> {
3345 private HBaseFsck hbck;
3346 private ServerName rsinfo;
3347 private ErrorReporter errors;
3348 private HConnection connection;
3349
3350 WorkItemRegion(HBaseFsck hbck, ServerName info,
3351 ErrorReporter errors, HConnection connection) {
3352 this.hbck = hbck;
3353 this.rsinfo = info;
3354 this.errors = errors;
3355 this.connection = connection;
3356 }
3357
3358 @Override
3359 public synchronized Void call() throws IOException {
3360 errors.progress();
3361 try {
3362 BlockingInterface server = connection.getAdmin(rsinfo);
3363
3364
3365 List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(server);
3366 regions = filterRegions(regions);
3367
3368 if (details) {
3369 errors.detail("RegionServer: " + rsinfo.getServerName() +
3370 " number of regions: " + regions.size());
3371 for (HRegionInfo rinfo: regions) {
3372 errors.detail(" " + rinfo.getRegionNameAsString() +
3373 " id: " + rinfo.getRegionId() +
3374 " encoded_name: " + rinfo.getEncodedName() +
3375 " start: " + Bytes.toStringBinary(rinfo.getStartKey()) +
3376 " end: " + Bytes.toStringBinary(rinfo.getEndKey()));
3377 }
3378 }
3379
3380
3381 for (HRegionInfo r:regions) {
3382 HbckInfo hbi = hbck.getOrCreateInfo(r.getEncodedName());
3383 hbi.addServer(r, rsinfo);
3384 }
3385 } catch (IOException e) {
3386 errors.reportError(ERROR_CODE.RS_CONNECT_FAILURE, "RegionServer: " + rsinfo.getServerName() +
3387 " Unable to fetch region information. " + e);
3388 throw e;
3389 }
3390 return null;
3391 }
3392
3393 private List<HRegionInfo> filterRegions(List<HRegionInfo> regions) {
3394 List<HRegionInfo> ret = Lists.newArrayList();
3395 for (HRegionInfo hri : regions) {
3396 if (hri.isMetaTable() || (!hbck.checkMetaOnly
3397 && hbck.isTableIncluded(hri.getTable()))) {
3398 ret.add(hri);
3399 }
3400 }
3401 return ret;
3402 }
3403 }
3404
3405
3406
3407
3408
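/** Scans one table directory on HDFS and records what is found for each region dir. */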
3409 static class WorkItemHdfsDir implements Callable<Void> {
3410 private HBaseFsck hbck;
3411 private FileStatus tableDir;
3412 private ErrorReporter errors;
3413 private FileSystem fs;
3414
3415 WorkItemHdfsDir(HBaseFsck hbck, FileSystem fs, ErrorReporter errors,
3416 FileStatus status) {
3417 this.hbck = hbck;
3418 this.fs = fs;
3419 this.tableDir = status;
3420 this.errors = errors;
3421 }
3422
3423 @Override
3424 public synchronized Void call() throws IOException {
3425 try {
// NOTE: the body of this work item was missing from this listing. The sketch below is a
// reconstruction based on how HdfsEntry is consumed elsewhere in this class and may differ
// from the original in detail.
// level 2: <HBASE_DIR>/<table>/* -- one directory per region
FileStatus[] regionDirs = fs.listStatus(tableDir.getPath());
for (FileStatus regionDir : regionDirs) {
String encodedName = regionDir.getPath().getName();
// region directories are hex encoded names; skip everything else (e.g. .tabledesc)
if (!encodedName.toLowerCase().matches("[0-9a-f]+")) {
continue;
}

LOG.debug("Loading region info from hdfs: " + regionDir.getPath());
HbckInfo hbi = hbck.getOrCreateInfo(encodedName);
HdfsEntry he = new HdfsEntry();
synchronized (hbi) {
he.hdfsRegionDir = regionDir.getPath();
he.hdfsRegionDirModTime = regionDir.getModificationTime();
Path regioninfoFile = new Path(he.hdfsRegionDir, HRegionFileSystem.REGION_INFO_FILE);
he.hdfsRegioninfoFilePresent = fs.exists(regioninfoFile);

// flag regions whose directory holds nothing but recovered edits
// ("recovered.edits" is assumed here to be the recovered-edits dir name)
he.hdfsOnlyEdits = true;
for (FileStatus subDir : fs.listStatus(regionDir.getPath())) {
String sdName = subDir.getPath().getName();
if (!sdName.startsWith(".") && !sdName.equals("recovered.edits")) {
he.hdfsOnlyEdits = false;
break;
}
}
hbi.hdfsEntry = he;
}
}
} catch (IOException e) {
errors.reportError(ERROR_CODE.RS_CONNECT_FAILURE, "Table Directory: "
+ tableDir.getPath().getName() + " Unable to fetch region information. " + e);
throw e;
}
return null;
}
}
3476
3477
3478
3479
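/** Loads the .regioninfo file for one region from HDFS, flagging orphaned region dirs. */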
3480 static class WorkItemHdfsRegionInfo implements Callable<Void> {
3481 private HbckInfo hbi;
3482 private HBaseFsck hbck;
3483 private ErrorReporter errors;
3484
3485 WorkItemHdfsRegionInfo(HbckInfo hbi, HBaseFsck hbck, ErrorReporter errors) {
3486 this.hbi = hbi;
3487 this.hbck = hbck;
3488 this.errors = errors;
3489 }
3490
3491 @Override
3492 public synchronized Void call() throws IOException {
3493
3494 if (hbi.getHdfsHRI() == null) {
3495 try {
3496 hbck.loadHdfsRegioninfo(hbi);
3497 } catch (IOException ioe) {
3498 String msg = "Orphan region in HDFS: Unable to load .regioninfo from table "
3499 + hbi.getTableName() + " in hdfs dir "
3500 + hbi.getHdfsRegionDir()
3501 + "! It may be an invalid format or version file. Treating as "
3502 + "an orphaned regiondir.";
3503 errors.reportError(ERROR_CODE.ORPHAN_HDFS_REGION, msg);
3504 try {
3505 hbck.debugLsr(hbi.getHdfsRegionDir());
3506 } catch (IOException ioe2) {
3507 LOG.error("Unable to read directory " + hbi.getHdfsRegionDir(), ioe2);
3508 throw ioe2;
3509 }
3510 hbck.orphanHdfsDirs.add(hbi);
3511 throw ioe;
3512 }
3513 }
3514 return null;
3515 }
3516 };
3517
3518
3519
3520
3521
3522 public static void setDisplayFullReport() {
3523 details = true;
3524 }
3525
3526
3527
3528
3529
3530 void setSummary() {
3531 summary = true;
3532 }
3533
3534
3535
3536
3537
3538 void setCheckMetaOnly() {
3539 checkMetaOnly = true;
3540 }
3541
3542
3543
3544
3545 void setRegionBoundariesCheck() {
3546 checkRegionBoundaries = true;
3547 }
3548
3549
3550
3551
3552
3553 public void setFixTableLocks(boolean shouldFix) {
3554 fixTableLocks = shouldFix;
3555 }
3556
3557
3558
3559
3560
3561
3562
3563 void setShouldRerun() {
3564 rerun = true;
3565 }
3566
3567 boolean shouldRerun() {
3568 return rerun;
3569 }
3570
3571
3572
3573
3574
3575 public void setFixAssignments(boolean shouldFix) {
3576 fixAssignments = shouldFix;
3577 }
3578
3579 boolean shouldFixAssignments() {
3580 return fixAssignments;
3581 }
3582
3583 public void setFixMeta(boolean shouldFix) {
3584 fixMeta = shouldFix;
3585 }
3586
3587 boolean shouldFixMeta() {
3588 return fixMeta;
3589 }
3590
3591 public void setFixEmptyMetaCells(boolean shouldFix) {
3592 fixEmptyMetaCells = shouldFix;
3593 }
3594
3595 boolean shouldFixEmptyMetaCells() {
3596 return fixEmptyMetaCells;
3597 }
3598
3599 public void setCheckHdfs(boolean checking) {
3600 checkHdfs = checking;
3601 }
3602
3603 boolean shouldCheckHdfs() {
3604 return checkHdfs;
3605 }
3606
3607 public void setFixHdfsHoles(boolean shouldFix) {
3608 fixHdfsHoles = shouldFix;
3609 }
3610
3611 boolean shouldFixHdfsHoles() {
3612 return fixHdfsHoles;
3613 }
3614
3615 public void setFixTableOrphans(boolean shouldFix) {
3616 fixTableOrphans = shouldFix;
3617 }
3618
3619 boolean shouldFixTableOrphans() {
3620 return fixTableOrphans;
3621 }
3622
3623 public void setFixHdfsOverlaps(boolean shouldFix) {
3624 fixHdfsOverlaps = shouldFix;
3625 }
3626
3627 boolean shouldFixHdfsOverlaps() {
3628 return fixHdfsOverlaps;
3629 }
3630
3631 public void setFixHdfsOrphans(boolean shouldFix) {
3632 fixHdfsOrphans = shouldFix;
3633 }
3634
3635 boolean shouldFixHdfsOrphans() {
3636 return fixHdfsOrphans;
3637 }
3638
3639 public void setFixVersionFile(boolean shouldFix) {
3640 fixVersionFile = shouldFix;
3641 }
3642
3643 public boolean shouldFixVersionFile() {
3644 return fixVersionFile;
3645 }
3646
3647 public void setSidelineBigOverlaps(boolean sbo) {
3648 this.sidelineBigOverlaps = sbo;
3649 }
3650
3651 public boolean shouldSidelineBigOverlaps() {
3652 return sidelineBigOverlaps;
3653 }
3654
3655 public void setFixSplitParents(boolean shouldFix) {
3656 fixSplitParents = shouldFix;
3657 }
3658
3659 boolean shouldFixSplitParents() {
3660 return fixSplitParents;
3661 }
3662
3663 public void setFixReferenceFiles(boolean shouldFix) {
3664 fixReferenceFiles = shouldFix;
3665 }
3666
3667 boolean shouldFixReferenceFiles() {
3668 return fixReferenceFiles;
3669 }
3670
3671 public boolean shouldIgnorePreCheckPermission() {
3672 return ignorePreCheckPermission;
3673 }
3674
3675 public void setIgnorePreCheckPermission(boolean ignorePreCheckPermission) {
3676 this.ignorePreCheckPermission = ignorePreCheckPermission;
3677 }
3678
3679
3680
3681
3682 public void setMaxMerge(int mm) {
3683 this.maxMerge = mm;
3684 }
3685
3686 public int getMaxMerge() {
3687 return maxMerge;
3688 }
3689
3690 public void setMaxOverlapsToSideline(int mo) {
3691 this.maxOverlapsToSideline = mo;
3692 }
3693
3694 public int getMaxOverlapsToSideline() {
3695 return maxOverlapsToSideline;
3696 }
3697
3698
3699
3700
3701
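// only check/fix the tables explicitly included; an empty list means all tables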
3702 boolean isTableIncluded(TableName table) {
3703 return (tablesIncluded.size() == 0) || tablesIncluded.contains(table);
3704 }
3705
3706 public void includeTable(TableName table) {
3707 tablesIncluded.add(table);
3708 }
3709
3710 Set<TableName> getIncludedTables() {
3711 return new HashSet<TableName>(tablesIncluded);
3712 }
3713
3714
3715
3716
3717
3718
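/** Only consider regions whose hbase:meta entries have not changed in the last given seconds. */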
3719 public void setTimeLag(long seconds) {
3720 timelag = seconds * 1000;
3721 }
3722
3723
3724
3725
3726
3727 public void setSidelineDir(String sidelineDir) {
3728 this.sidelineDir = new Path(sidelineDir);
3729 }
3730
3731 protected HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
3732 return new HFileCorruptionChecker(getConf(), executor, sidelineCorruptHFiles);
3733 }
3734
3735 public HFileCorruptionChecker getHFilecorruptionChecker() {
3736 return hfcc;
3737 }
3738
3739 public void setHFileCorruptionChecker(HFileCorruptionChecker hfcc) {
3740 this.hfcc = hfcc;
3741 }
3742
3743 public void setRetCode(int code) {
3744 this.retcode = code;
3745 }
3746
3747 public int getRetCode() {
3748 return retcode;
3749 }
3750
3751 protected HBaseFsck printUsageAndExit() {
3752 StringWriter sw = new StringWriter(2048);
3753 PrintWriter out = new PrintWriter(sw);
3754 out.println("Usage: fsck [opts] {only tables}");
3755 out.println(" where [opts] are:");
3756 out.println(" -help Display help options (this)");
3757 out.println(" -details Display full report of all regions.");
3758 out.println(" -timelag <timeInSeconds> Process only regions that " +
3759 " have not experienced any metadata updates in the last " +
3760 " <timeInSeconds> seconds.");
3761 out.println(" -sleepBeforeRerun <timeInSeconds> Sleep this many seconds" +
3762 " before checking if the fix worked if run with -fix");
3763 out.println(" -summary Print only summary of the tables and status.");
3764 out.println(" -metaonly Only check the state of the hbase:meta table.");
3765 out.println(" -sidelineDir <hdfs://> HDFS path to backup existing meta.");
3766 out.println(" -boundaries Verify that regions boundaries are the same between META and store files.");
3767
3768 out.println("");
3769 out.println(" Metadata Repair options: (expert features, use with caution!)");
3770 out.println(" -fix Try to fix region assignments. This is for backwards compatiblity");
3771 out.println(" -fixAssignments Try to fix region assignments. Replaces the old -fix");
3772 out.println(" -fixMeta Try to fix meta problems. This assumes HDFS region info is good.");
3773 out.println(" -noHdfsChecking Don't load/check region info from HDFS."
3774 + " Assumes hbase:meta region info is good. Won't check/fix any HDFS issue, e.g. hole, orphan, or overlap");
3775 out.println(" -fixHdfsHoles Try to fix region holes in hdfs.");
3776 out.println(" -fixHdfsOrphans Try to fix region dirs with no .regioninfo file in hdfs");
3777 out.println(" -fixTableOrphans Try to fix table dirs with no .tableinfo file in hdfs (online mode only)");
3778 out.println(" -fixHdfsOverlaps Try to fix region overlaps in hdfs.");
3779 out.println(" -fixVersionFile Try to fix missing hbase.version file in hdfs.");
3780 out.println(" -maxMerge <n> When fixing region overlaps, allow at most <n> regions to merge. (n=" + DEFAULT_MAX_MERGE +" by default)");
3781 out.println(" -sidelineBigOverlaps When fixing region overlaps, allow to sideline big overlaps");
3782 out.println(" -maxOverlapsToSideline <n> When fixing region overlaps, allow at most <n> regions to sideline per group. (n=" + DEFAULT_OVERLAPS_TO_SIDELINE +" by default)");
3783 out.println(" -fixSplitParents Try to force offline split parents to be online.");
3784 out.println(" -ignorePreCheckPermission ignore filesystem permission pre-check");
3785 out.println(" -fixReferenceFiles Try to offline lingering reference store files");
3786 out.println(" -fixEmptyMetaCells Try to fix hbase:meta entries not referencing any region"
3787 + " (empty REGIONINFO_QUALIFIER rows)");
3788
3789 out.println("");
3790 out.println(" Datafile Repair options: (expert features, use with caution!)");
3791 out.println(" -checkCorruptHFiles Check all Hfiles by opening them to make sure they are valid");
3792 out.println(" -sidelineCorruptHFiles Quarantine corrupted HFiles. implies -checkCorruptHFiles");
3793
3794 out.println("");
3795 out.println(" Metadata Repair shortcuts");
3796 out.println(" -repair Shortcut for -fixAssignments -fixMeta -fixHdfsHoles " +
3797 "-fixHdfsOrphans -fixHdfsOverlaps -fixVersionFile -sidelineBigOverlaps -fixReferenceFiles -fixTableLocks");
3798 out.println(" -repairHoles Shortcut for -fixAssignments -fixMeta -fixHdfsHoles");
3799
3800 out.println("");
3801 out.println(" Table lock options");
3802 out.println(" -fixTableLocks Deletes table locks held for a long time (hbase.table.lock.expire.ms, 10min by default)");
3803
3804 out.flush();
3805 errors.reportError(ERROR_CODE.WRONG_USAGE, sw.toString());
3806
3807 setRetCode(-2);
3808 return this;
3809 }
3810
3811
3812
3813
3814
3815
3816
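/** Command-line entry point; runs HBaseFsckTool through ToolRunner. */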
3817 public static void main(String[] args) throws Exception {
3818
3819 Configuration conf = HBaseConfiguration.create();
3820 Path hbasedir = FSUtils.getRootDir(conf);
3821 URI defaultFs = hbasedir.getFileSystem(conf).getUri();
3822 FSUtils.setFsDefault(conf, new Path(defaultFs));
3823
3824 int ret = ToolRunner.run(new HBaseFsckTool(conf), args);
3825 System.exit(ret);
3826 }
3827
3828
3829
3830
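/** Tool wrapper so that generic -D configuration options are parsed from the command line. */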
3831 static class HBaseFsckTool extends Configured implements Tool {
3832 HBaseFsckTool(Configuration conf) { super(conf); }
3833 @Override
3834 public int run(String[] args) throws Exception {
3835 HBaseFsck hbck = new HBaseFsck(getConf());
3836 hbck.exec(hbck.executor, args);
3837 return hbck.getRetCode();
3838 }
3839 };
3840
3841
3842 public HBaseFsck exec(ExecutorService exec, String[] args) throws KeeperException, IOException,
3843 ServiceException, InterruptedException {
3844 long sleepBeforeRerun = DEFAULT_SLEEP_BEFORE_RERUN;
3845
3846 boolean checkCorruptHFiles = false;
3847 boolean sidelineCorruptHFiles = false;
3848
3849
3850 for (int i = 0; i < args.length; i++) {
3851 String cmd = args[i];
3852 if (cmd.equals("-help") || cmd.equals("-h")) {
3853 return printUsageAndExit();
3854 } else if (cmd.equals("-details")) {
3855 setDisplayFullReport();
3856 } else if (cmd.equals("-timelag")) {
3857 if (i == args.length - 1) {
3858 errors.reportError(ERROR_CODE.WRONG_USAGE, "HBaseFsck: -timelag needs a value.");
3859 return printUsageAndExit();
3860 }
3861 try {
3862 long timelag = Long.parseLong(args[i+1]);
3863 setTimeLag(timelag);
3864 } catch (NumberFormatException e) {
3865 errors.reportError(ERROR_CODE.WRONG_USAGE, "-timelag needs a numeric value.");
3866 return printUsageAndExit();
3867 }
3868 i++;
3869 } else if (cmd.equals("-sleepBeforeRerun")) {
3870 if (i == args.length - 1) {
3871 errors.reportError(ERROR_CODE.WRONG_USAGE,
3872 "HBaseFsck: -sleepBeforeRerun needs a value.");
3873 return printUsageAndExit();
3874 }
3875 try {
3876 sleepBeforeRerun = Long.parseLong(args[i+1]);
3877 } catch (NumberFormatException e) {
3878 errors.reportError(ERROR_CODE.WRONG_USAGE, "-sleepBeforeRerun needs a numeric value.");
3879 return printUsageAndExit();
3880 }
3881 i++;
3882 } else if (cmd.equals("-sidelineDir")) {
3883 if (i == args.length - 1) {
3884 errors.reportError(ERROR_CODE.WRONG_USAGE, "HBaseFsck: -sidelineDir needs a value.");
3885 return printUsageAndExit();
3886 }
3887 i++;
3888 setSidelineDir(args[i]);
3889 } else if (cmd.equals("-fix")) {
3890 errors.reportError(ERROR_CODE.WRONG_USAGE,
3891 "This option is deprecated, please use -fixAssignments instead.");
3892 setFixAssignments(true);
3893 } else if (cmd.equals("-fixAssignments")) {
3894 setFixAssignments(true);
3895 } else if (cmd.equals("-fixMeta")) {
3896 setFixMeta(true);
3897 } else if (cmd.equals("-noHdfsChecking")) {
3898 setCheckHdfs(false);
3899 } else if (cmd.equals("-fixHdfsHoles")) {
3900 setFixHdfsHoles(true);
3901 } else if (cmd.equals("-fixHdfsOrphans")) {
3902 setFixHdfsOrphans(true);
3903 } else if (cmd.equals("-fixTableOrphans")) {
3904 setFixTableOrphans(true);
3905 } else if (cmd.equals("-fixHdfsOverlaps")) {
3906 setFixHdfsOverlaps(true);
3907 } else if (cmd.equals("-fixVersionFile")) {
3908 setFixVersionFile(true);
3909 } else if (cmd.equals("-sidelineBigOverlaps")) {
3910 setSidelineBigOverlaps(true);
3911 } else if (cmd.equals("-fixSplitParents")) {
3912 setFixSplitParents(true);
3913 } else if (cmd.equals("-ignorePreCheckPermission")) {
3914 setIgnorePreCheckPermission(true);
3915 } else if (cmd.equals("-checkCorruptHFiles")) {
3916 checkCorruptHFiles = true;
3917 } else if (cmd.equals("-sidelineCorruptHFiles")) {
3918 sidelineCorruptHFiles = true;
3919 } else if (cmd.equals("-fixReferenceFiles")) {
3920 setFixReferenceFiles(true);
3921 } else if (cmd.equals("-fixEmptyMetaCells")) {
3922 setFixEmptyMetaCells(true);
3923 } else if (cmd.equals("-repair")) {
3924
3925
3926 setFixHdfsHoles(true);
3927 setFixHdfsOrphans(true);
3928 setFixMeta(true);
3929 setFixAssignments(true);
3930 setFixHdfsOverlaps(true);
3931 setFixVersionFile(true);
3932 setSidelineBigOverlaps(true);
3933 setFixSplitParents(false);
3934 setCheckHdfs(true);
3935 setFixReferenceFiles(true);
3936 setFixTableLocks(true);
3937 } else if (cmd.equals("-repairHoles")) {
3938
3939 setFixHdfsHoles(true);
3940 setFixHdfsOrphans(false);
3941 setFixMeta(true);
3942 setFixAssignments(true);
3943 setFixHdfsOverlaps(false);
3944 setSidelineBigOverlaps(false);
3945 setFixSplitParents(false);
3946 setCheckHdfs(true);
3947 } else if (cmd.equals("-maxOverlapsToSideline")) {
3948 if (i == args.length - 1) {
3949 errors.reportError(ERROR_CODE.WRONG_USAGE,
3950 "-maxOverlapsToSideline needs a numeric value argument.");
3951 return printUsageAndExit();
3952 }
3953 try {
3954 int maxOverlapsToSideline = Integer.parseInt(args[i+1]);
3955 setMaxOverlapsToSideline(maxOverlapsToSideline);
3956 } catch (NumberFormatException e) {
3957 errors.reportError(ERROR_CODE.WRONG_USAGE,
3958 "-maxOverlapsToSideline needs a numeric value argument.");
3959 return printUsageAndExit();
3960 }
3961 i++;
3962 } else if (cmd.equals("-maxMerge")) {
3963 if (i == args.length - 1) {
3964 errors.reportError(ERROR_CODE.WRONG_USAGE,
3965 "-maxMerge needs a numeric value argument.");
3966 return printUsageAndExit();
3967 }
3968 try {
3969 int maxMerge = Integer.parseInt(args[i+1]);
3970 setMaxMerge(maxMerge);
3971 } catch (NumberFormatException e) {
3972 errors.reportError(ERROR_CODE.WRONG_USAGE,
3973 "-maxMerge needs a numeric value argument.");
3974 return printUsageAndExit();
3975 }
3976 i++;
3977 } else if (cmd.equals("-summary")) {
3978 setSummary();
3979 } else if (cmd.equals("-metaonly")) {
3980 setCheckMetaOnly();
3981 } else if (cmd.equals("-boundaries")) {
3982 setRegionBoundariesCheck();
3983 } else if (cmd.equals("-fixTableLocks")) {
3984 setFixTableLocks(true);
3985 } else if (cmd.startsWith("-")) {
3986 errors.reportError(ERROR_CODE.WRONG_USAGE, "Unrecognized option:" + cmd);
3987 return printUsageAndExit();
3988 } else {
3989 includeTable(TableName.valueOf(cmd));
3990 errors.print("Allow checking/fixes for table: " + cmd);
3991 }
3992 }
3993
3994
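// pre-check filesystem permissions before attempting any repairs (see -ignorePreCheckPermission)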
3995 try {
3996 preCheckPermission();
3997 } catch (AccessControlException ace) {
3998 Runtime.getRuntime().exit(-1);
3999 } catch (IOException ioe) {
4000 Runtime.getRuntime().exit(-1);
4001 }
4002
4003
4004 connect();
4005
4006 try {
4007
4008 if (checkCorruptHFiles || sidelineCorruptHFiles) {
4009 LOG.info("Checking all hfiles for corruption");
4010 HFileCorruptionChecker hfcc = createHFileCorruptionChecker(sidelineCorruptHFiles);
4011 setHFileCorruptionChecker(hfcc);
4012 Collection<TableName> tables = getIncludedTables();
4013 Collection<Path> tableDirs = new ArrayList<Path>();
4014 Path rootdir = FSUtils.getRootDir(getConf());
4015 if (tables.size() > 0) {
4016 for (TableName t : tables) {
4017 tableDirs.add(FSUtils.getTableDir(rootdir, t));
4018 }
4019 } else {
4020 tableDirs = FSUtils.getTableDirs(FSUtils.getCurrentFileSystem(getConf()), rootdir);
4021 }
4022 hfcc.checkTables(tableDirs);
4023 hfcc.report(errors);
4024 }
4025
4026
4027 int code = onlineHbck();
4028 setRetCode(code);
4029
4030
4031
4032
4033 if (shouldRerun()) {
4034 try {
4035 LOG.info("Sleeping " + sleepBeforeRerun + "ms before re-checking after fix...");
4036 Thread.sleep(sleepBeforeRerun);
4037 } catch (InterruptedException ie) {
4038 return this;
4039 }
4040
4041 setFixAssignments(false);
4042 setFixMeta(false);
4043 setFixHdfsHoles(false);
4044 setFixHdfsOverlaps(false);
4045 setFixVersionFile(false);
4046 setFixTableOrphans(false);
4047 errors.resetErrors();
4048 code = onlineHbck();
4049 setRetCode(code);
4050 }
4051 } finally {
4052 IOUtils.cleanup(null, connection, meta, admin);
4053 }
4054 return this;
4055 }
4056
4057
4058
4059
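/** Recursively logs the contents of the given path ("ls -r") for debugging. */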
4060 void debugLsr(Path p) throws IOException {
4061 debugLsr(getConf(), p, errors);
4062 }
4063
4064
4065
4066
4067 public static void debugLsr(Configuration conf,
4068 Path p) throws IOException {
4069 debugLsr(conf, p, new PrintingErrorReporter());
4070 }
4071
4072
4073
4074
4075 public static void debugLsr(Configuration conf,
4076 Path p, ErrorReporter errors) throws IOException {
4077 if (!LOG.isDebugEnabled() || p == null) {
4078 return;
4079 }
4080 FileSystem fs = p.getFileSystem(conf);
4081
4082 if (!fs.exists(p)) {
4083
4084 return;
4085 }
4086 errors.print(p.toString());
4087
4088 if (fs.isFile(p)) {
4089 return;
4090 }
4091
4092 if (fs.getFileStatus(p).isDir()) {
4093 FileStatus[] fss= fs.listStatus(p);
4094 for (FileStatus status : fss) {
4095 debugLsr(conf, status.getPath(), errors);
4096 }
4097 }
4098 }
4099 }