/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
18 package org.apache.hadoop.hbase.util;
19
20 import java.io.FileNotFoundException;
21 import java.io.IOException;
22 import java.io.PrintWriter;
23 import java.io.StringWriter;
24 import java.net.InetAddress;
25 import java.net.URI;
26 import java.util.ArrayList;
27 import java.util.Collection;
28 import java.util.Collections;
29 import java.util.Comparator;
30 import java.util.HashMap;
31 import java.util.HashSet;
32 import java.util.Iterator;
33 import java.util.List;
34 import java.util.Map;
35 import java.util.Map.Entry;
36 import java.util.Set;
37 import java.util.SortedMap;
38 import java.util.SortedSet;
39 import java.util.TreeMap;
40 import java.util.TreeSet;
41 import java.util.concurrent.Callable;
42 import java.util.concurrent.ConcurrentSkipListMap;
43 import java.util.concurrent.ExecutionException;
44 import java.util.concurrent.ExecutorService;
45 import java.util.concurrent.Future;
46 import java.util.concurrent.ScheduledThreadPoolExecutor;
47 import java.util.concurrent.atomic.AtomicBoolean;
48 import java.util.concurrent.atomic.AtomicInteger;
49
50 import org.apache.commons.logging.Log;
51 import org.apache.commons.logging.LogFactory;
52 import org.apache.hadoop.conf.Configuration;
53 import org.apache.hadoop.conf.Configured;
54 import org.apache.hadoop.fs.FSDataInputStream;
55 import org.apache.hadoop.fs.FSDataOutputStream;
56 import org.apache.hadoop.fs.FileStatus;
57 import org.apache.hadoop.fs.FileSystem;
58 import org.apache.hadoop.fs.Path;
59 import org.apache.hadoop.fs.permission.FsAction;
60 import org.apache.hadoop.fs.permission.FsPermission;
61 import org.apache.hadoop.hbase.Abortable;
62 import org.apache.hadoop.hbase.ClusterStatus;
63 import org.apache.hadoop.hbase.HBaseConfiguration;
64 import org.apache.hadoop.hbase.HColumnDescriptor;
65 import org.apache.hadoop.hbase.HConstants;
66 import org.apache.hadoop.hbase.HRegionInfo;
67 import org.apache.hadoop.hbase.HRegionLocation;
68 import org.apache.hadoop.hbase.HTableDescriptor;
69 import org.apache.hadoop.hbase.KeyValue;
70 import org.apache.hadoop.hbase.MasterNotRunningException;
71 import org.apache.hadoop.hbase.ServerName;
72 import org.apache.hadoop.hbase.ZooKeeperConnectionException;
73 import org.apache.hadoop.hbase.catalog.MetaReader;
74 import org.apache.hadoop.hbase.client.Delete;
75 import org.apache.hadoop.hbase.client.Get;
76 import org.apache.hadoop.hbase.client.HBaseAdmin;
77 import org.apache.hadoop.hbase.client.HConnection;
78 import org.apache.hadoop.hbase.client.HConnectionManager;
79 import org.apache.hadoop.hbase.client.HConnectionManager.HConnectable;
80 import org.apache.hadoop.hbase.client.HTable;
81 import org.apache.hadoop.hbase.client.MetaScanner;
82 import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitor;
83 import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitorBase;
84 import org.apache.hadoop.hbase.client.Put;
85 import org.apache.hadoop.hbase.client.Result;
86 import org.apache.hadoop.hbase.client.RowMutations;
87 import org.apache.hadoop.hbase.client.UserProvider;
88 import org.apache.hadoop.hbase.io.hfile.CacheConfig;
89 import org.apache.hadoop.hbase.io.hfile.HFile;
90 import org.apache.hadoop.hbase.ipc.HRegionInterface;
91 import org.apache.hadoop.hbase.master.MasterFileSystem;
92 import org.apache.hadoop.hbase.regionserver.HRegion;
93 import org.apache.hadoop.hbase.regionserver.StoreFile;
94 import org.apache.hadoop.hbase.regionserver.wal.HLog;
95 import org.apache.hadoop.hbase.security.User;
96 import org.apache.hadoop.hbase.util.Bytes.ByteArrayComparator;
97 import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
98 import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
99 import org.apache.hadoop.hbase.util.hbck.TableIntegrityErrorHandler;
100 import org.apache.hadoop.hbase.util.hbck.TableIntegrityErrorHandlerImpl;
101 import org.apache.hadoop.hbase.zookeeper.RootRegionTracker;
102 import org.apache.hadoop.hbase.zookeeper.ZKTableReadOnly;
103 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
104 import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
105 import org.apache.hadoop.io.IOUtils;
106 import org.apache.hadoop.ipc.RemoteException;
107 import org.apache.hadoop.security.AccessControlException;
108 import org.apache.hadoop.util.ReflectionUtils;
109 import org.apache.hadoop.util.Tool;
110 import org.apache.hadoop.util.ToolRunner;
111 import org.apache.zookeeper.KeeperException;
112
113 import com.google.common.base.Joiner;
114 import com.google.common.base.Preconditions;
115 import com.google.common.collect.Lists;
116 import com.google.common.collect.Multimap;
117 import com.google.common.collect.TreeMultimap;
118
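/**
 * HBaseFsck (hbck) checks and optionally repairs the integrity and
 * consistency of an HBase cluster.  It compares three views of region
 * state: the entries in the .META. table, the regions reported as deployed
 * by the live region servers, and the region directories present in HDFS.
 * Disagreements are reported as errors and, when the corresponding fix
 * options are enabled, repaired.
 */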
164 public class HBaseFsck extends Configured implements Tool {
165 public static final long DEFAULT_TIME_LAG = 60000;
166 public static final long DEFAULT_SLEEP_BEFORE_RERUN = 10000;
167 private static final int MAX_NUM_THREADS = 50;
168 private static boolean rsSupportsOffline = true;
169 private static final int DEFAULT_OVERLAPS_TO_SIDELINE = 2;
170 private static final int DEFAULT_MAX_MERGE = 5;
171 private static final String TO_BE_LOADED = "to_be_loaded";
172 private static final String HBCK_LOCK_FILE = "hbase-hbck.lock";
173
174
175
176
177
178 private static final Log LOG = LogFactory.getLog(HBaseFsck.class.getName());
179 private ClusterStatus status;
180 private HConnection connection;
181 private HBaseAdmin admin;
182 private HTable meta;
183
184 protected ExecutorService executor;
185 private long startMillis = System.currentTimeMillis();
186 private HFileCorruptionChecker hfcc;
187 private int retcode = 0;
188 private Path HBCK_LOCK_PATH;
189 private FSDataOutputStream hbckOutFd;
190
191
192
193 private final AtomicBoolean hbckLockCleanup = new AtomicBoolean(false);
194
195
196
197
198 private static boolean details = false;
199 private long timelag = DEFAULT_TIME_LAG;
200 private boolean fixAssignments = false;
201 private boolean fixMeta = false;
202 private boolean checkHdfs = true;
203 private boolean fixHdfsHoles = false;
204 private boolean fixHdfsOverlaps = false;
205 private boolean fixHdfsOrphans = false;
206 private boolean fixTableOrphans = false;
207 private boolean fixVersionFile = false;
208 private boolean fixSplitParents = false;
209 private boolean fixReferenceFiles = false;
210
211
212
213 private Set<String> tablesIncluded = new HashSet<String>();
214 private int maxMerge = DEFAULT_MAX_MERGE;
215 private int maxOverlapsToSideline = DEFAULT_OVERLAPS_TO_SIDELINE;
216 private boolean sidelineBigOverlaps = false;
217 private Path sidelineDir = null;
218
219 private boolean rerun = false;
220 private static boolean summary = false;
221 private boolean checkMetaOnly = false;
222 private boolean checkRegionBoundaries = false;
223 private boolean ignorePreCheckPermission = false;
224
225
226
227
228 final private ErrorReporter errors;
229 int fixes = 0;
230
231
232
233
234
235
236 private TreeMap<String, HbckInfo> regionInfoMap = new TreeMap<String, HbckInfo>();
237 private TreeSet<byte[]> disabledTables =
238 new TreeSet<byte[]>(Bytes.BYTES_COMPARATOR);
239
240 private Set<Result> emptyRegionInfoQualifiers = new HashSet<Result>();
241
242
243
244
245
246
247
248
249
250
251
252 private SortedMap<String, TableInfo> tablesInfo = new ConcurrentSkipListMap<String,TableInfo>();
253
254
255
256
257 private List<HbckInfo> orphanHdfsDirs = Collections.synchronizedList(new ArrayList<HbckInfo>());
258
259 private Map<String, Set<String>> orphanTableDirs = new HashMap<String, Set<String>>();
260
261
262
263
264
265
266
267
268 public HBaseFsck(Configuration conf) throws MasterNotRunningException,
269 ZooKeeperConnectionException, IOException, ClassNotFoundException {
270 super(conf);
271 errors = getErrorReporter(conf);
272
273 initialPoolNumThreads();
274 }
275
276
277
278
279
280
281
282
283
284
285
286 public HBaseFsck(Configuration conf, ExecutorService exec) throws MasterNotRunningException,
287 ZooKeeperConnectionException, IOException, ClassNotFoundException {
288 super(conf);
289 errors = getErrorReporter(getConf());
290 this.executor = exec;
291 }
292
293
294
295
296
297
298
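/**
 * Creates the hbase-hbck.lock file under the HBase temp directory so that
 * only one hbck instance runs against the cluster at a time.
 * @return an open stream on the lock file, or null if another hbck
 *         instance already holds the lock
 */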
299 private FSDataOutputStream checkAndMarkRunningHbck() throws IOException {
300 try {
301 FileSystem fs = FSUtils.getCurrentFileSystem(getConf());
302 FsPermission defaultPerms = FSUtils.getFilePermissions(fs, getConf(),
303 HConstants.DATA_FILE_UMASK_KEY);
304 Path tmpDir = new Path(FSUtils.getRootDir(getConf()), HConstants.HBASE_TEMP_DIRECTORY);
305 fs.mkdirs(tmpDir);
306 HBCK_LOCK_PATH = new Path(tmpDir, HBCK_LOCK_FILE);
307 final FSDataOutputStream out = FSUtils.create(fs, HBCK_LOCK_PATH, defaultPerms, false);
308 out.writeBytes(InetAddress.getLocalHost().toString());
309 out.flush();
310 return out;
311 } catch (IOException exception) {
312 RemoteException e = null;
313 if (exception instanceof RemoteException) {
314 e = (RemoteException)exception;
315 } else if (exception.getCause() instanceof RemoteException) {
316 e = (RemoteException)(exception.getCause());
317 }
318 if(null != e && AlreadyBeingCreatedException.class.getName().equals(e.getClassName())){
319 return null;
320 }
321 throw exception;
322 }
323 }
324
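/**
 * Deletes the hbck lock file, if this instance is the one that created it.
 */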
325 private void unlockHbck() {
326 if(hbckLockCleanup.compareAndSet(true, false)){
327 IOUtils.closeStream(hbckOutFd);
328 try{
329 FSUtils.delete(FSUtils.getCurrentFileSystem(getConf()), HBCK_LOCK_PATH, true);
330 } catch(IOException ioe) {
331 LOG.warn("Failed to delete " + HBCK_LOCK_PATH);
332 LOG.debug(ioe);
333 }
334 }
335 }
336
337
338
339
340
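/**
 * Acquires the hbck lock, registers a shutdown hook that releases it, and
 * initializes the HBaseAdmin, .META. table, cluster status and connection
 * used by the subsequent checks.
 */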
341 public void connect() throws IOException {
342
343
344 hbckOutFd = checkAndMarkRunningHbck();
345 if (hbckOutFd == null) {
346 setRetCode(-1);
347 LOG.error("Another instance of hbck is running, exiting this instance.[If you are sure" +
348 " no other instance is running, delete the lock file " +
349 HBCK_LOCK_PATH + " and rerun the tool]");
350 throw new IOException("Duplicate hbck - Abort");
351 }
352
353
354 hbckLockCleanup.set(true);
355
356
357
358
359 Runtime.getRuntime().addShutdownHook(new Thread() {
360 public void run() {
361 unlockHbck();
362 }
363 });
364 LOG.debug("Launching hbck");
365
366 admin = new HBaseAdmin(getConf());
367 meta = new HTable(getConf(), HConstants.META_TABLE_NAME);
368 status = admin.getMaster().getClusterStatus();
369 connection = admin.getConnection();
370 }
371
372
373
374
375 private void initialPoolNumThreads() {
376 if (executor != null) {
377 executor.shutdown();
378 }
379
380 int numThreads = getConf().getInt("hbasefsck.numthreads", MAX_NUM_THREADS);
381 executor = new ScheduledThreadPoolExecutor(numThreads);
382 }
383
384
385
386
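/**
 * Prints the live and dead region servers and the masters from the cluster
 * status, then asks every live region server which regions it is serving.
 */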
387 private void loadDeployedRegions() throws IOException, InterruptedException {
388
389 Collection<ServerName> regionServers = status.getServers();
390 errors.print("Number of live region servers: " + regionServers.size());
391 if (details) {
392 for (ServerName rsinfo: regionServers) {
393 errors.print(" " + rsinfo.getServerName());
394 }
395 }
396
397
398 Collection<ServerName> deadRegionServers = status.getDeadServerNames();
399 errors.print("Number of dead region servers: " + deadRegionServers.size());
400 if (details) {
401 for (ServerName name: deadRegionServers) {
402 errors.print(" " + name);
403 }
404 }
405
406
407 errors.print("Master: " + status.getMaster());
408
409
410 Collection<ServerName> backupMasters = status.getBackupMasters();
411 errors.print("Number of backup masters: " + backupMasters.size());
412 if (details) {
413 for (ServerName name: backupMasters) {
414 errors.print(" " + name);
415 }
416 }
417
418
419 processRegionServers(regionServers);
420 }
421
422
423
424
425 private void clearState() {
426
427 fixes = 0;
428 regionInfoMap.clear();
429 emptyRegionInfoQualifiers.clear();
430 disabledTables.clear();
431 errors.clear();
432 tablesInfo.clear();
433 orphanHdfsDirs.clear();
434 }
435
436
437
438
439
440
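/**
 * Repairs HDFS-level table integrity problems (orphan regions, holes and
 * overlaps), iterating up to hbase.hbck.integrityrepair.iterations.max
 * times as long as repairs keep being applied.
 */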
441 public void offlineHdfsIntegrityRepair() throws IOException, InterruptedException {
442
443 if (shouldCheckHdfs() && (shouldFixHdfsOrphans() || shouldFixHdfsHoles()
444 || shouldFixHdfsOverlaps() || shouldFixTableOrphans())) {
445 LOG.info("Loading regioninfos from HDFS");
446
447 int maxIterations = getConf().getInt("hbase.hbck.integrityrepair.iterations.max", 3);
448 int curIter = 0;
449 do {
450 clearState();
451
452 restoreHdfsIntegrity();
453 curIter++;
454 } while (fixes > 0 && curIter <= maxIterations);
455
456
457
458 if (curIter > 2) {
459 if (curIter == maxIterations) {
460 LOG.warn("Exiting integrity repairs after max " + curIter + " iterations. "
461 + "Tables integrity may not be fully repaired!");
462 } else {
463 LOG.info("Successfully exiting integrity repairs after " + curIter + " iterations");
464 }
465 }
466 }
467 }
468
469
470
471
472
473
474
475
476
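/**
 * Loads region state from .META., the region servers and (if enabled)
 * HDFS, then checks and optionally fixes per-region consistency and
 * per-table integrity.
 * @return the number of errors found, or a negative value if .META.
 *         itself could not be processed
 */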
477 public int onlineConsistencyRepair() throws IOException, KeeperException,
478 InterruptedException {
479 clearState();
480
481 LOG.info("Loading regionsinfo from the .META. table");
482 boolean success = loadMetaEntries();
483 if (!success) return -1;
484
485
486 if (!checkMetaRegion()) {
487
488 errors.reportError("Encountered fatal error. Exiting...");
489 return -2;
490 }
491
492
493 if (!checkMetaOnly) {
494 reportTablesInFlux();
495 }
496
497
498 loadDeployedRegions();
499
500
501 if (shouldCheckHdfs()) {
502 loadHdfsRegionDirs();
503 loadHdfsRegionInfos();
504 }
505
506
507 reportEmptyMetaCells();
508
509
510 loadDisabledTables();
511
512
513 fixOrphanTables();
514
515
516 checkAndFixConsistency();
517
518
519 checkIntegrity();
520 return errors.getErrorList().size();
521 }
522
523
524
525
526
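/**
 * Top-level check: runs the offline HDFS integrity repair, then the online
 * consistency checks with the balancer temporarily disabled, then the
 * region boundary and reference file checks, and finally prints a table
 * summary.
 * @return the result of errors.summarize(), used as the tool's exit code
 */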
527 public int onlineHbck() throws IOException, KeeperException, InterruptedException {
528
529 errors.print("Version: " + status.getHBaseVersion());
530 offlineHdfsIntegrityRepair();
531
532
533 boolean oldBalancer = admin.setBalancerRunning(false, true);
534 try {
535 onlineConsistencyRepair();
536 }
537 finally {
538 admin.setBalancerRunning(oldBalancer, false);
539 }
540
541 if (checkRegionBoundaries) {
542 checkRegionBoundaries();
543 }
544
545 offlineReferenceFileRepair();
546
547
548 unlockHbck();
549
550
551 printTableSummary(tablesInfo);
552 return errors.summarize();
553 }
554
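/**
 * Extracts the row from a serialized KeyValue key; the first two bytes of
 * the key encode the row length.
 */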
555 public static byte[] keyOnly (byte[] b) {
556 if (b == null)
557 return b;
558 int rowlength = Bytes.toShort(b, 0);
559 byte[] result = new byte[rowlength];
560 System.arraycopy(b, Bytes.SIZEOF_SHORT, result, 0, rowlength);
561 return result;
562 }
563
564 private static class RegionBoundariesInformation {
565 public byte [] regionName;
566 public byte [] metaFirstKey;
567 public byte [] metaLastKey;
568 public byte [] storesFirstKey;
569 public byte [] storesLastKey;
570 public String toString () {
571 return "regionName=" + Bytes.toStringBinary(regionName) +
572 "\nmetaFirstKey=" + Bytes.toStringBinary(metaFirstKey) +
573 "\nmetaLastKey=" + Bytes.toStringBinary(metaLastKey) +
574 "\nstoresFirstKey=" + Bytes.toStringBinary(storesFirstKey) +
575 "\nstoresLastKey=" + Bytes.toStringBinary(storesLastKey);
576 }
577 }
578
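/**
 * For every region listed in .META., verifies that the first and last keys
 * found in the region's store files fall within the start and end keys
 * recorded in .META., reporting a BOUNDARIES_ERROR otherwise.
 */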
579 public void checkRegionBoundaries() {
580 try {
581 ByteArrayComparator comparator = new ByteArrayComparator();
582 List<HRegionInfo> regions = MetaScanner.listAllRegions(getConf(), false);
583 final RegionBoundariesInformation currentRegionBoundariesInformation =
584 new RegionBoundariesInformation();
585 for (HRegionInfo regionInfo : regions) {
586 currentRegionBoundariesInformation.regionName = regionInfo.getRegionName();
587
588
589 Path path = new Path(getConf().get(HConstants.HBASE_DIR) + "/"
590 + Bytes.toString(regionInfo.getTableName()) + "/"
591 + regionInfo.getEncodedName() + "/");
592 FileSystem fs = path.getFileSystem(getConf());
593 FileStatus[] files = fs.listStatus(path);
594
595 byte[] storeFirstKey = null;
596 byte[] storeLastKey = null;
597 for (FileStatus file : files) {
598 String fileName = file.getPath().toString();
599 fileName = fileName.substring(fileName.lastIndexOf("/") + 1);
600 if (!fileName.startsWith(".") && !fileName.endsWith("recovered.edits")) {
601 FileStatus[] storeFiles = fs.listStatus(file.getPath());
602
603 for (FileStatus storeFile : storeFiles) {
604 HFile.Reader reader = HFile.createReader(fs, storeFile.getPath(), new CacheConfig(
605 getConf()));
606 if ((reader.getFirstKey() != null)
607 && ((storeFirstKey == null) || (comparator.compare(storeFirstKey,
608 reader.getFirstKey()) > 0))) {
609 storeFirstKey = reader.getFirstKey();
610 }
611 if ((reader.getLastKey() != null)
612 && ((storeLastKey == null) || (comparator.compare(storeLastKey,
613 reader.getLastKey())) < 0)) {
614 storeLastKey = reader.getLastKey();
615 }
616 reader.close();
617 }
618 }
619 }
620 currentRegionBoundariesInformation.metaFirstKey = regionInfo.getStartKey();
621 currentRegionBoundariesInformation.metaLastKey = regionInfo.getEndKey();
622 currentRegionBoundariesInformation.storesFirstKey = keyOnly(storeFirstKey);
623 currentRegionBoundariesInformation.storesLastKey = keyOnly(storeLastKey);
624 if (currentRegionBoundariesInformation.metaFirstKey.length == 0)
625 currentRegionBoundariesInformation.metaFirstKey = null;
626 if (currentRegionBoundariesInformation.metaLastKey.length == 0)
627 currentRegionBoundariesInformation.metaLastKey = null;
628
629
630
631
632
633
634 boolean valid = true;
635
636 if ((currentRegionBoundariesInformation.storesFirstKey != null)
637 && (currentRegionBoundariesInformation.metaFirstKey != null)) {
638 valid = valid
639 && comparator.compare(currentRegionBoundariesInformation.storesFirstKey,
640 currentRegionBoundariesInformation.metaFirstKey) >= 0;
641 }
642
643 if ((currentRegionBoundariesInformation.storesLastKey != null)
644 && (currentRegionBoundariesInformation.metaLastKey != null)) {
645 valid = valid
646 && comparator.compare(currentRegionBoundariesInformation.storesLastKey,
647 currentRegionBoundariesInformation.metaLastKey) < 0;
648 }
649 if (!valid) {
650 errors.reportError(ERROR_CODE.BOUNDARIES_ERROR, "Found issues with regions boundaries",
651 tablesInfo.get(Bytes.toString(regionInfo.getTableName())));
652 LOG.warn("Region's boundaries not alligned between stores and META for:");
653 LOG.warn(currentRegionBoundariesInformation);
654 }
655 }
656 } catch (IOException e) {
657 LOG.error(e);
658 }
659 }
660
661
662
663
664 private void adoptHdfsOrphans(Collection<HbckInfo> orphanHdfsDirs) throws IOException {
665 for (HbckInfo hi : orphanHdfsDirs) {
666 LOG.info("Attempting to handle orphan hdfs dir: " + hi.getHdfsRegionDir());
667 adoptHdfsOrphan(hi);
668 }
669 }
670
671
672
673
674
675
676
677
678
679
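/**
 * Adopts one orphan HDFS region directory: reads its HFiles to determine
 * the minimum start key and maximum end key, creates a new region covering
 * that range, and merges the orphan's files into it.  Directories with no
 * data are sidelined instead.
 */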
680 private void adoptHdfsOrphan(HbckInfo hi) throws IOException {
681 Path p = hi.getHdfsRegionDir();
682 FileSystem fs = p.getFileSystem(getConf());
683 FileStatus[] dirs = fs.listStatus(p);
684 if (dirs == null) {
685 LOG.warn("Attempt to adopt ophan hdfs region skipped becuase no files present in " +
686 p + ". This dir could probably be deleted.");
687 return ;
688 }
689
690 String tableName = Bytes.toString(hi.getTableName());
691 TableInfo tableInfo = tablesInfo.get(tableName);
692 Preconditions.checkNotNull(tableInfo, "Table '" + tableName + "' not present!");
693 HTableDescriptor template = tableInfo.getHTD();
694
695
696 Pair<byte[],byte[]> orphanRegionRange = null;
697 for (FileStatus cf : dirs) {
698 String cfName= cf.getPath().getName();
699
700 if (cfName.startsWith(".") || cfName.equals("splitlog")) continue;
701
702 FileStatus[] hfiles = fs.listStatus(cf.getPath());
703 for (FileStatus hfile : hfiles) {
704 byte[] start, end;
705 HFile.Reader hf = null;
706 try {
707 CacheConfig cacheConf = new CacheConfig(getConf());
708 hf = HFile.createReader(fs, hfile.getPath(), cacheConf);
709 hf.loadFileInfo();
710 KeyValue startKv = KeyValue.createKeyValueFromKey(hf.getFirstKey());
711 start = startKv.getRow();
712 KeyValue endKv = KeyValue.createKeyValueFromKey(hf.getLastKey());
713 end = endKv.getRow();
714 } catch (IOException ioe) {
715 LOG.warn("Problem reading orphan file " + hfile + ", skipping");
716 continue;
717 } catch (NullPointerException ioe) {
718 LOG.warn("Orphan file " + hfile + " is possibly corrupted HFile, skipping");
719 continue;
720 } finally {
721 if (hf != null) {
722 hf.close();
723 }
724 }
725
726
727 if (orphanRegionRange == null) {
728
729 orphanRegionRange = new Pair<byte[], byte[]>(start, end);
730 } else {
731
732
733
734 if (Bytes.compareTo(orphanRegionRange.getFirst(), start) > 0) {
735 orphanRegionRange.setFirst(start);
736 }
737 if (Bytes.compareTo(orphanRegionRange.getSecond(), end) < 0 ) {
738 orphanRegionRange.setSecond(end);
739 }
740 }
741 }
742 }
743 if (orphanRegionRange == null) {
744 LOG.warn("No data in dir " + p + ", sidelining data");
745 fixes++;
746 sidelineRegionDir(fs, hi);
747 return;
748 }
749 LOG.info("Min max keys are : [" + Bytes.toString(orphanRegionRange.getFirst()) + ", " +
750 Bytes.toString(orphanRegionRange.getSecond()) + ")");
751
752
753 HRegionInfo hri = new HRegionInfo(template.getName(), orphanRegionRange.getFirst(), orphanRegionRange.getSecond());
754 LOG.info("Creating new region : " + hri);
755 HRegion region = HBaseFsckRepair.createHDFSRegionDir(getConf(), hri, template);
756 Path target = region.getRegionDir();
757
758
759 mergeRegionDirs(target, hi);
760 fixes++;
761 }
762
763
764
765
766
767
768
769
770
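/**
 * Rebuilds the HDFS view of all regions and, depending on the enabled fix
 * options, adopts orphan regions and repairs holes and overlaps in each
 * table's region chain.
 * @return the number of errors remaining after the repair passes
 */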
771 private int restoreHdfsIntegrity() throws IOException, InterruptedException {
772
773 LOG.info("Loading HBase regioninfo from HDFS...");
774 loadHdfsRegionDirs();
775
776 int errs = errors.getErrorList().size();
777
778 tablesInfo = loadHdfsRegionInfos();
779 checkHdfsIntegrity(false, false);
780
781 if (errors.getErrorList().size() == errs) {
782 LOG.info("No integrity errors. We are done with this phase. Glorious.");
783 return 0;
784 }
785
786 if (shouldFixHdfsOrphans() && orphanHdfsDirs.size() > 0) {
787 adoptHdfsOrphans(orphanHdfsDirs);
788
789 }
790
791
792 if (shouldFixHdfsHoles()) {
793 clearState();
794 loadHdfsRegionDirs();
795 tablesInfo = loadHdfsRegionInfos();
796 tablesInfo = checkHdfsIntegrity(shouldFixHdfsHoles(), false);
797 }
798
799
800 if (shouldFixHdfsOverlaps()) {
801
802 clearState();
803 loadHdfsRegionDirs();
804 tablesInfo = loadHdfsRegionInfos();
805 tablesInfo = checkHdfsIntegrity(false, shouldFixHdfsOverlaps());
806 }
807
808 return errors.getErrorList().size();
809 }
810
811
812
813
814
815
816
817
818
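/**
 * Scans all store files for reference files whose referred-to file no
 * longer exists and, when reference file fixing is enabled, sidelines the
 * lingering references so the regions can be opened.
 */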
819 private void offlineReferenceFileRepair() throws IOException {
820 Configuration conf = getConf();
821 Path hbaseRoot = FSUtils.getRootDir(conf);
822 FileSystem fs = hbaseRoot.getFileSystem(conf);
823 Map<String, Path> allFiles = FSUtils.getTableStoreFilePathMap(fs, hbaseRoot);
824 for (Path path: allFiles.values()) {
825 boolean isReference = false;
826 try {
827 isReference = StoreFile.isReference(path);
828 } catch (Throwable t) {
829
830
831
832
833 }
834 if (!isReference) continue;
835
836 Path referredToFile = StoreFile.getReferredToFile(path);
837 if (fs.exists(referredToFile)) continue;
838
839
840 errors.reportError(ERROR_CODE.LINGERING_REFERENCE_HFILE,
841 "Found lingering reference file " + path);
842 if (!shouldFixReferenceFiles()) continue;
843
844
845 boolean success = false;
846 String pathStr = path.toString();
847
848
849
850
851
852 int index = pathStr.lastIndexOf(Path.SEPARATOR_CHAR);
853 for (int i = 0; index > 0 && i < 3; i++) {
854 index = pathStr.lastIndexOf(Path.SEPARATOR_CHAR, index - 1);
855 }
856 if (index > 0) {
857 Path rootDir = getSidelineDir();
858 Path dst = new Path(rootDir, pathStr.substring(index));
859 fs.mkdirs(dst.getParent());
860 LOG.info("Trying to sildeline reference file"
861 + path + " to " + dst);
862 setShouldRerun();
863
864 success = fs.rename(path, dst);
865 }
866 if (!success) {
867 LOG.error("Failed to sideline reference file " + path);
868 }
869 }
870 }
871
872
873
874
875 private void reportEmptyMetaCells() {
876 errors.print("Number of empty REGIONINFO_QUALIFIER rows in .META.: " +
877 emptyRegionInfoQualifiers.size());
878 if (details) {
879 for (Result r: emptyRegionInfoQualifiers) {
880 errors.print(" " + r);
881 }
882 }
883 }
884
885
886
887
888 private void reportTablesInFlux() {
889 AtomicInteger numSkipped = new AtomicInteger(0);
890 HTableDescriptor[] allTables = getTables(numSkipped);
891 errors.print("Number of Tables: " + allTables.length);
892 if (details) {
893 if (numSkipped.get() > 0) {
894 errors.detail("Number of Tables in flux: " + numSkipped.get());
895 }
896 for (HTableDescriptor td : allTables) {
897 String tableName = td.getNameAsString();
898 errors.detail(" Table: " + tableName + "\t" +
899 (td.isReadOnly() ? "ro" : "rw") + "\t" +
900 (td.isRootRegion() ? "ROOT" :
901 (td.isMetaRegion() ? "META" : " ")) + "\t" +
902 " families: " + td.getFamilies().size());
903 }
904 }
905 }
906
907 public ErrorReporter getErrors() {
908 return errors;
909 }
910
911
912
913
914
915 private void loadHdfsRegioninfo(HbckInfo hbi) throws IOException {
916 Path regionDir = hbi.getHdfsRegionDir();
917 if (regionDir == null) {
918 LOG.warn("No HDFS region dir found: " + hbi + " meta=" + hbi.metaEntry);
919 return;
920 }
921
922 if (hbi.hdfsEntry.hri != null) {
923
924 return;
925 }
926
927 Path regioninfo = new Path(regionDir, HRegion.REGIONINFO_FILE);
928 FileSystem fs = regioninfo.getFileSystem(getConf());
929
930 FSDataInputStream in = fs.open(regioninfo);
931 HRegionInfo hri = new HRegionInfo();
932 hri.readFields(in);
933 in.close();
934 LOG.debug("HRegionInfo read: " + hri.toString());
935 hbi.hdfsEntry.hri = hri;
936 }
937
938
939
940
941
942 public static class RegionRepairException extends IOException {
943 private static final long serialVersionUID = 1L;
944 final IOException ioe;
945 public RegionRepairException(String s, IOException ioe) {
946 super(s);
947 this.ioe = ioe;
948 }
949 }
950
951
952
953
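/**
 * Reads the .regioninfo file of every known region in parallel and groups
 * the regions into TableInfo entries; tables whose .tableinfo descriptor
 * cannot be read are recorded as orphan table directories.
 */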
954 private SortedMap<String, TableInfo> loadHdfsRegionInfos() throws IOException, InterruptedException {
955 tablesInfo.clear();
956
957 Collection<HbckInfo> hbckInfos = regionInfoMap.values();
958
959
960 List<WorkItemHdfsRegionInfo> hbis = new ArrayList<WorkItemHdfsRegionInfo>(hbckInfos.size());
961 List<Future<Void>> hbiFutures;
962
963 for (HbckInfo hbi : hbckInfos) {
964 WorkItemHdfsRegionInfo work = new WorkItemHdfsRegionInfo(hbi, this, errors);
965 hbis.add(work);
966 }
967
968
969 hbiFutures = executor.invokeAll(hbis);
970
971 for(int i=0; i<hbiFutures.size(); i++) {
972 WorkItemHdfsRegionInfo work = hbis.get(i);
973 Future<Void> f = hbiFutures.get(i);
974 try {
975 f.get();
976 } catch(ExecutionException e) {
977 LOG.warn("Failed to read .regioninfo file for region " +
978 work.hbi.getRegionNameAsString(), e.getCause());
979 }
980 }
981
982
983 for (HbckInfo hbi: hbckInfos) {
984
985 if (hbi.getHdfsHRI() == null) {
986
987 continue;
988 }
989
990
991
992 String tableName = Bytes.toString(hbi.getTableName());
993 if (tableName == null) {
994
995 LOG.warn("tableName was null for: " + hbi);
996 continue;
997 }
998
999 TableInfo modTInfo = tablesInfo.get(tableName);
1000 if (modTInfo == null) {
1001
1002 modTInfo = new TableInfo(tableName);
1003 Path hbaseRoot = FSUtils.getRootDir(getConf());
1004 tablesInfo.put(tableName, modTInfo);
1005 try {
1006 HTableDescriptor htd =
1007 FSTableDescriptors.getTableDescriptorFromFs(hbaseRoot.getFileSystem(getConf()),
1008 hbaseRoot, tableName);
1009 modTInfo.htds.add(htd);
1010 } catch (IOException ioe) {
1011 if (!orphanTableDirs.containsKey(tableName)) {
1012 LOG.warn("Unable to read .tableinfo from " + hbaseRoot, ioe);
1013
1014 errors.reportError(ERROR_CODE.NO_TABLEINFO_FILE,
1015 "Unable to read .tableinfo from " + hbaseRoot + "/" + tableName);
1016 Set<String> columns = new HashSet<String>();
1017 orphanTableDirs.put(tableName, getColumnFamilyList(columns, hbi));
1018 }
1019 }
1020 }
1021 if (!hbi.isSkipChecks()) {
1022 modTInfo.addRegionInfo(hbi);
1023 }
1024 }
1025
1026 return tablesInfo;
1027 }
1028
1029
1030
1031
1032
1033
1034
1035
1036 private Set<String> getColumnFamilyList(Set<String> columns, HbckInfo hbi) throws IOException {
1037 Path regionDir = hbi.getHdfsRegionDir();
1038 FileSystem fs = regionDir.getFileSystem(getConf());
1039 FileStatus[] subDirs = fs.listStatus(regionDir, new FSUtils.FamilyDirFilter(fs));
1040 for (FileStatus subdir : subDirs) {
1041 String columnfamily = subdir.getPath().getName();
1042 columns.add(columnfamily);
1043 }
1044 return columns;
1045 }
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055 private boolean fabricateTableInfo(String tableName, Set<String> columns) throws IOException {
1056 if (columns ==null || columns.isEmpty()) return false;
1057 HTableDescriptor htd = new HTableDescriptor(tableName);
1058 for (String columnfamimly : columns) {
1059 htd.addFamily(new HColumnDescriptor(columnfamimly));
1060 }
1061 FSTableDescriptors.createTableDescriptor(htd, getConf(), true);
1062 return true;
1063 }
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
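/**
 * Restores missing .tableinfo files for orphan table directories, either
 * from a descriptor that can still be retrieved for the table or, failing
 * that, by fabricating a default descriptor from the column families found
 * on disk.
 */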
1074 public void fixOrphanTables() throws IOException {
1075 if (shouldFixTableOrphans() && !orphanTableDirs.isEmpty()) {
1076
1077 Path hbaseRoot = FSUtils.getRootDir(getConf());
1078 List<String> tmpList = new ArrayList<String>();
1079 tmpList.addAll(orphanTableDirs.keySet());
1080 HTableDescriptor[] htds = getHTableDescriptors(tmpList);
1081 Iterator<Entry<String, Set<String>>> iter = orphanTableDirs.entrySet().iterator();
1082 int j = 0;
1083 int numFailedCase = 0;
1084 while (iter.hasNext()) {
1085 Entry<String, Set<String>> entry = (Entry<String, Set<String>>) iter.next();
1086 String tableName = entry.getKey();
1087 LOG.info("Trying to fix orphan table error: " + tableName);
1088 if (j < htds.length) {
1089 if (tableName.equals(Bytes.toString(htds[j].getName()))) {
1090 HTableDescriptor htd = htds[j];
1091 LOG.info("fixing orphan table: " + tableName + " from cache");
1092 FSTableDescriptors.createTableDescriptor(
1093 hbaseRoot.getFileSystem(getConf()), hbaseRoot, htd, true);
1094 j++;
1095 iter.remove();
1096 }
1097 } else {
1098 if (fabricateTableInfo(tableName, entry.getValue())) {
1099 LOG.warn("fixing orphan table: " + tableName + " with a default .tableinfo file");
1100 LOG.warn("Strongly recommend to modify the HTableDescriptor if necessary for: " + tableName);
1101 iter.remove();
1102 } else {
1103 LOG.error("Unable to create default .tableinfo for " + tableName + " while missing column family information");
1104 numFailedCase++;
1105 }
1106 }
1107 fixes++;
1108 }
1109
1110 if (orphanTableDirs.isEmpty()) {
1111
1112
1113 setShouldRerun();
1114 LOG.warn("Strongly recommend to re-run manually hfsck after all orphanTableDirs being fixed");
1115 } else if (numFailedCase > 0) {
1116 LOG.error("Failed to fix " + numFailedCase
1117 + " OrphanTables with default .tableinfo files");
1118 }
1119
1120 }
1121
1122 orphanTableDirs.clear();
1123
1124 }
1125
1126
1127
1128
1129
1130
1131 private HRegion createNewRootAndMeta() throws IOException {
1132 Path rootdir = new Path(getConf().get(HConstants.HBASE_DIR));
1133 Configuration c = getConf();
1134 HRegionInfo rootHRI = new HRegionInfo(HRegionInfo.ROOT_REGIONINFO);
1135 MasterFileSystem.setInfoFamilyCachingForRoot(false);
1136 HRegionInfo metaHRI = new HRegionInfo(HRegionInfo.FIRST_META_REGIONINFO);
1137 MasterFileSystem.setInfoFamilyCachingForMeta(false);
1138 HRegion root = HRegion.createHRegion(rootHRI, rootdir, c,
1139 HTableDescriptor.ROOT_TABLEDESC);
1140 HRegion meta = HRegion.createHRegion(metaHRI, rootdir, c,
1141 HTableDescriptor.META_TABLEDESC);
1142 MasterFileSystem.setInfoFamilyCachingForRoot(true);
1143 MasterFileSystem.setInfoFamilyCachingForMeta(true);
1144
1145
1146 HRegion.addRegionToMETA(root, meta);
1147 root.close();
1148 root.getLog().closeAndDelete();
1149 return meta;
1150 }
1151
1152
1153
1154
1155
1156
1157
1158 private ArrayList<Put> generatePuts(SortedMap<String, TableInfo> tablesInfo) throws IOException {
1159 ArrayList<Put> puts = new ArrayList<Put>();
1160 boolean hasProblems = false;
1161 for (Entry<String, TableInfo> e : tablesInfo.entrySet()) {
1162 String name = e.getKey();
1163
1164
1165 if (Bytes.compareTo(Bytes.toBytes(name), HConstants.ROOT_TABLE_NAME) == 0
1166 || Bytes.compareTo(Bytes.toBytes(name), HConstants.META_TABLE_NAME) == 0) {
1167 continue;
1168 }
1169
1170 TableInfo ti = e.getValue();
1171 for (Entry<byte[], Collection<HbckInfo>> spl : ti.sc.getStarts().asMap()
1172 .entrySet()) {
1173 Collection<HbckInfo> his = spl.getValue();
1174 int sz = his.size();
1175 if (sz != 1) {
1176
1177 LOG.error("Split starting at " + Bytes.toStringBinary(spl.getKey())
1178 + " had " + sz + " regions instead of exactly 1." );
1179 hasProblems = true;
1180 continue;
1181 }
1182
1183
1184 HbckInfo hi = his.iterator().next();
1185 HRegionInfo hri = hi.getHdfsHRI();
1186 Put p = new Put(hri.getRegionName());
1187 p.add(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER,
1188 Writables.getBytes(hri));
1189 puts.add(p);
1190 }
1191 }
1192 return hasProblems ? null : puts;
1193 }
1194
1195
1196
1197
1198 private void suggestFixes(SortedMap<String, TableInfo> tablesInfo) throws IOException {
1199 for (TableInfo tInfo : tablesInfo.values()) {
1200 TableIntegrityErrorHandler handler = tInfo.new IntegrityFixSuggester(tInfo, errors);
1201 tInfo.checkRegionChain(handler);
1202 }
1203 }
1204
1205
1206
1207
1208
1209
1210
1211
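/**
 * Rebuilds -ROOT- and .META. from the region information in HDFS: checks
 * HDFS integrity, sidelines the old catalog tables, creates fresh ones and
 * repopulates .META. from the per-region .regioninfo files.
 * @return true if the rebuild completed, false if unresolved integrity
 *         problems prevented it
 */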
1212 public boolean rebuildMeta(boolean fix) throws IOException,
1213 InterruptedException {
1214
1215
1216
1217
1218
1219 LOG.info("Loading HBase regioninfo from HDFS...");
1220 loadHdfsRegionDirs();
1221
1222 int errs = errors.getErrorList().size();
1223 tablesInfo = loadHdfsRegionInfos();
1224 checkHdfsIntegrity(false, false);
1225
1226
1227 if (errors.getErrorList().size() != errs) {
1228
1229 while(true) {
1230 fixes = 0;
1231 suggestFixes(tablesInfo);
1232 errors.clear();
1233 loadHdfsRegionInfos();
1234 checkHdfsIntegrity(shouldFixHdfsHoles(), shouldFixHdfsOverlaps());
1235
1236 int errCount = errors.getErrorList().size();
1237
1238 if (fixes == 0) {
1239 if (errCount > 0) {
1240 return false;
1241 } else {
1242 break;
1243 }
1244 }
1245 }
1246 }
1247
1248
1249 LOG.info("HDFS regioninfo's seems good. Sidelining old .META.");
1250 Path backupDir = sidelineOldRootAndMeta();
1251
1252 LOG.info("Creating new .META.");
1253 HRegion meta = createNewRootAndMeta();
1254
1255
1256 List<Put> puts = generatePuts(tablesInfo);
1257 if (puts == null) {
1258 LOG.fatal("Problem encountered when creating new .META. entries. " +
1259 "You may need to restore the previously sidelined -ROOT- and .META.");
1260 return false;
1261 }
1262 meta.put(puts.toArray(new Put[0]));
1263 meta.close();
1264 meta.getLog().closeAndDelete();
1265 LOG.info("Success! .META. table rebuilt.");
1266 LOG.info("Old -ROOT- and .META. are moved into " + backupDir);
1267 return true;
1268 }
1269
1270 private SortedMap<String, TableInfo> checkHdfsIntegrity(boolean fixHoles,
1271 boolean fixOverlaps) throws IOException {
1272 LOG.info("Checking HBase region split map from HDFS data...");
1273 for (TableInfo tInfo : tablesInfo.values()) {
1274 TableIntegrityErrorHandler handler;
1275 if (fixHoles || fixOverlaps) {
1276 handler = tInfo.new HDFSIntegrityFixer(tInfo, errors, getConf(),
1277 fixHoles, fixOverlaps);
1278 } else {
1279 handler = tInfo.new IntegrityFixSuggester(tInfo, errors);
1280 }
1281 if (!tInfo.checkRegionChain(handler)) {
1282
1283 errors.report("Found inconsistency in table " + tInfo.getName());
1284 }
1285 }
1286 return tablesInfo;
1287 }
1288
1289 private Path getSidelineDir() throws IOException {
1290 if (sidelineDir == null) {
1291 Path hbaseDir = FSUtils.getRootDir(getConf());
1292 Path hbckDir = new Path(hbaseDir, HConstants.HBCK_SIDELINEDIR_NAME);
1293 sidelineDir = new Path(hbckDir, hbaseDir.getName() + "-"
1294 + startMillis);
1295 }
1296 return sidelineDir;
1297 }
1298
1299
1300
1301
1302 Path sidelineRegionDir(FileSystem fs, HbckInfo hi) throws IOException {
1303 return sidelineRegionDir(fs, null, hi);
1304 }
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314 Path sidelineRegionDir(FileSystem fs,
1315 String parentDir, HbckInfo hi) throws IOException {
1316 String tableName = Bytes.toString(hi.getTableName());
1317 Path regionDir = hi.getHdfsRegionDir();
1318
1319 if (!fs.exists(regionDir)) {
1320 LOG.warn("No previous " + regionDir + " exists. Continuing.");
1321 return null;
1322 }
1323
1324 Path rootDir = getSidelineDir();
1325 if (parentDir != null) {
1326 rootDir = new Path(rootDir, parentDir);
1327 }
1328 Path sidelineTableDir= new Path(rootDir, tableName);
1329 Path sidelineRegionDir = new Path(sidelineTableDir, regionDir.getName());
1330 fs.mkdirs(sidelineRegionDir);
1331 boolean success = false;
1332 FileStatus[] cfs = fs.listStatus(regionDir);
1333 if (cfs == null) {
1334 LOG.info("Region dir is empty: " + regionDir);
1335 } else {
1336 for (FileStatus cf : cfs) {
1337 Path src = cf.getPath();
1338 Path dst = new Path(sidelineRegionDir, src.getName());
1339 if (fs.isFile(src)) {
1340
1341 success = fs.rename(src, dst);
1342 if (!success) {
1343 String msg = "Unable to rename file " + src + " to " + dst;
1344 LOG.error(msg);
1345 throw new IOException(msg);
1346 }
1347 continue;
1348 }
1349
1350
1351 fs.mkdirs(dst);
1352
1353 LOG.info("Sidelining files from " + src + " into containing region " + dst);
1354
1355
1356
1357
1358 FileStatus[] hfiles = fs.listStatus(src);
1359 if (hfiles != null && hfiles.length > 0) {
1360 for (FileStatus hfile : hfiles) {
1361 success = fs.rename(hfile.getPath(), dst);
1362 if (!success) {
1363 String msg = "Unable to rename file " + src + " to " + dst;
1364 LOG.error(msg);
1365 throw new IOException(msg);
1366 }
1367 }
1368 }
1369 LOG.debug("Sideline directory contents:");
1370 debugLsr(sidelineRegionDir);
1371 }
1372 }
1373
1374 LOG.info("Removing old region dir: " + regionDir);
1375 success = fs.delete(regionDir, true);
1376 if (!success) {
1377 String msg = "Unable to delete dir " + regionDir;
1378 LOG.error(msg);
1379 throw new IOException(msg);
1380 }
1381 return sidelineRegionDir;
1382 }
1383
1384
1385
1386
1387 void sidelineTable(FileSystem fs, byte[] table, Path hbaseDir,
1388 Path backupHbaseDir) throws IOException {
1389 String tableName = Bytes.toString(table);
1390 Path tableDir = new Path(hbaseDir, tableName);
1391 if (fs.exists(tableDir)) {
1392 Path backupTableDir= new Path(backupHbaseDir, tableName);
1393 boolean success = fs.rename(tableDir, backupTableDir);
1394 if (!success) {
1395 throw new IOException("Failed to move " + tableName + " from "
1396 + tableDir.getName() + " to " + backupTableDir.getName());
1397 }
1398 } else {
1399 LOG.info("No previous " + tableName + " exists. Continuing.");
1400 }
1401 }
1402
1403
1404
1405
1406 Path sidelineOldRootAndMeta() throws IOException {
1407
1408 Path hbaseDir = new Path(getConf().get(HConstants.HBASE_DIR));
1409 FileSystem fs = hbaseDir.getFileSystem(getConf());
1410 Path backupDir = getSidelineDir();
1411 fs.mkdirs(backupDir);
1412
1413 sidelineTable(fs, HConstants.ROOT_TABLE_NAME, hbaseDir, backupDir);
1414 try {
1415 sidelineTable(fs, HConstants.META_TABLE_NAME, hbaseDir, backupDir);
1416 } catch (IOException e) {
1417 LOG.error("Attempt to sideline meta failed, attempt to revert...", e);
1418 try {
1419
1420 sidelineTable(fs, HConstants.ROOT_TABLE_NAME, backupDir, hbaseDir);
1421 LOG.warn("... revert succeed. -ROOT- and .META. still in "
1422 + "original state.");
1423 } catch (IOException ioe) {
1424 LOG.fatal("... failed to sideline root and meta and failed to restore "
1425 + "prevoius state. Currently in inconsistent state. To restore "
1426 + "try to rename -ROOT- in " + backupDir.getName() + " to "
1427 + hbaseDir.getName() + ".", ioe);
1428 }
1429 throw e;
1430 }
1431 return backupDir;
1432 }
1433
1434
1435
1436
1437
1438
1439 private void loadDisabledTables()
1440 throws ZooKeeperConnectionException, IOException {
1441 HConnectionManager.execute(new HConnectable<Void>(getConf()) {
1442 @Override
1443 public Void connect(HConnection connection) throws IOException {
1444 ZooKeeperWatcher zkw = connection.getZooKeeperWatcher();
1445 try {
1446 for (String tableName : ZKTableReadOnly.getDisabledOrDisablingTables(zkw)) {
1447 disabledTables.add(Bytes.toBytes(tableName));
1448 }
1449 } catch (KeeperException ke) {
1450 throw new IOException(ke);
1451 }
1452 return null;
1453 }
1454 });
1455 }
1456
1457
1458
1459
1460 private boolean isTableDisabled(HRegionInfo regionInfo) {
1461 return disabledTables.contains(regionInfo.getTableName());
1462 }
1463
1464
1465
1466
1467
1468 public void loadHdfsRegionDirs() throws IOException, InterruptedException {
1469 Path rootDir = new Path(getConf().get(HConstants.HBASE_DIR));
1470 FileSystem fs = rootDir.getFileSystem(getConf());
1471
1472
1473 List<FileStatus> tableDirs = Lists.newArrayList();
1474
1475 boolean foundVersionFile = false;
1476 FileStatus[] files = fs.listStatus(rootDir);
1477 for (FileStatus file : files) {
1478 String dirName = file.getPath().getName();
1479 if (dirName.equals(HConstants.VERSION_FILE_NAME)) {
1480 foundVersionFile = true;
1481 } else {
1482 if ((!checkMetaOnly && isTableIncluded(dirName)) ||
1483 dirName.equals("-ROOT-") ||
1484 dirName.equals(".META.")) {
1485 tableDirs.add(file);
1486 }
1487 }
1488 }
1489
1490
1491 if (!foundVersionFile) {
1492 errors.reportError(ERROR_CODE.NO_VERSION_FILE,
1493 "Version file does not exist in root dir " + rootDir);
1494 if (shouldFixVersionFile()) {
1495 LOG.info("Trying to create a new " + HConstants.VERSION_FILE_NAME
1496 + " file.");
1497 setShouldRerun();
1498 FSUtils.setVersion(fs, rootDir, getConf().getInt(
1499 HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000), getConf().getInt(
1500 HConstants.VERSION_FILE_WRITE_ATTEMPTS,
1501 HConstants.DEFAULT_VERSION_FILE_WRITE_ATTEMPTS));
1502 }
1503 }
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531 private boolean recordRootRegion() throws IOException {
1532 HRegionLocation rootLocation = connection.locateRegion(
1533 HConstants.ROOT_TABLE_NAME, HConstants.EMPTY_START_ROW);
1534
1535
1536 if (rootLocation == null || rootLocation.getRegionInfo() == null ||
1537 rootLocation.getHostname() == null) {
1538 errors.reportError(ERROR_CODE.NULL_ROOT_REGION,
1539 "Root Region or some of its attributes are null.");
1540 return false;
1541 }
1542 ServerName sn;
1543 try {
1544 sn = getRootRegionServerName();
1545 } catch (InterruptedException e) {
1546 throw new IOException("Interrupted", e);
1547 }
1548 MetaEntry m =
1549 new MetaEntry(rootLocation.getRegionInfo(), sn, System.currentTimeMillis());
1550 HbckInfo hbInfo = new HbckInfo(m);
1551 regionInfoMap.put(rootLocation.getRegionInfo().getEncodedName(), hbInfo);
1552 return true;
1553 }
1554
1555 private ServerName getRootRegionServerName()
1556 throws IOException, InterruptedException {
1557 RootRegionTracker rootRegionTracker =
1558 new RootRegionTracker(this.connection.getZooKeeperWatcher(), new Abortable() {
1559 @Override
1560 public void abort(String why, Throwable e) {
1561 LOG.error(why, e);
1562 System.exit(1);
1563 }
1564 @Override
1565 public boolean isAborted(){
1566 return false;
1567 }
1568
1569 });
1570 rootRegionTracker.start();
1571 ServerName sn = null;
1572 try {
1573 sn = rootRegionTracker.getRootRegionLocation();
1574 } finally {
1575 rootRegionTracker.stop();
1576 }
1577 return sn;
1578 }
1579
1580
1581
1582
1583
1584
1585 void processRegionServers(Collection<ServerName> regionServerList)
1586 throws IOException, InterruptedException {
1587
1588 List<WorkItemRegion> workItems = new ArrayList<WorkItemRegion>(regionServerList.size());
1589 List<Future<Void>> workFutures;
1590
1591
1592 for (ServerName rsinfo: regionServerList) {
1593 workItems.add(new WorkItemRegion(this, rsinfo, errors, connection));
1594 }
1595
1596 workFutures = executor.invokeAll(workItems);
1597
1598 for(int i=0; i<workFutures.size(); i++) {
1599 WorkItemRegion item = workItems.get(i);
1600 Future<Void> f = workFutures.get(i);
1601 try {
1602 f.get();
1603 } catch(ExecutionException e) {
1604 LOG.warn("Could not process regionserver " + item.rsinfo.getHostAndPort(),
1605 e.getCause());
1606 }
1607 }
1608 }
1609
1610
1611
1612
1613 private void checkAndFixConsistency()
1614 throws IOException, KeeperException, InterruptedException {
1615 for (java.util.Map.Entry<String, HbckInfo> e: regionInfoMap.entrySet()) {
1616 checkRegionConsistency(e.getKey(), e.getValue());
1617 }
1618 }
1619
1620 private void preCheckPermission() throws IOException, AccessControlException {
1621 if (shouldIgnorePreCheckPermission()) {
1622 return;
1623 }
1624
1625 Configuration conf = getConf();
1626 Path hbaseDir = new Path(conf.get(HConstants.HBASE_DIR));
1627 FileSystem fs = hbaseDir.getFileSystem(conf);
1628 UserProvider provider = UserProvider.instantiate(conf);
1629 User user = provider.getCurrent();
1630 FileStatus[] files = fs.listStatus(hbaseDir);
1631 for (FileStatus file : files) {
1632 try {
1633 FSUtils.checkAccess(user, file, FsAction.WRITE);
1634 } catch (AccessControlException ace) {
1635 LOG.warn("Got AccessControlException when preCheckPermission ", ace);
1636 errors.reportError(ERROR_CODE.WRONG_USAGE, "Current user " + user.getShortName()
1637 + " does not have write perms to " + file.getPath()
1638 + ". Please rerun hbck as hdfs user " + file.getOwner());
1639 throw new AccessControlException(ace);
1640 }
1641 }
1642 }
1643
1644
1645
1646
1647 private void deleteMetaRegion(HbckInfo hi) throws IOException {
1648 Delete d = new Delete(hi.metaEntry.getRegionName());
1649 meta.delete(d);
1650 meta.flushCommits();
1651 LOG.info("Deleted " + hi.metaEntry.getRegionNameAsString() + " from META" );
1652 }
1653
1654
1655
1656
1657 private void resetSplitParent(HbckInfo hi) throws IOException {
1658 RowMutations mutations = new RowMutations(hi.metaEntry.getRegionName());
1659 Delete d = new Delete(hi.metaEntry.getRegionName());
1660 d.deleteColumn(HConstants.CATALOG_FAMILY, HConstants.SPLITA_QUALIFIER);
1661 d.deleteColumn(HConstants.CATALOG_FAMILY, HConstants.SPLITB_QUALIFIER);
1662 mutations.add(d);
1663
1664 Put p = new Put(hi.metaEntry.getRegionName());
1665 HRegionInfo hri = new HRegionInfo(hi.metaEntry);
1666 hri.setOffline(false);
1667 hri.setSplit(false);
1668 p.add(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER,
1669 Writables.getBytes(hri));
1670 mutations.add(p);
1671
1672 meta.mutateRow(mutations);
1673 meta.flushCommits();
1674 LOG.info("Reset split parent " + hi.metaEntry.getRegionNameAsString() + " in META" );
1675 }
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685 private void offline(byte[] regionName) throws IOException {
1686 String regionString = Bytes.toStringBinary(regionName);
1687 if (!rsSupportsOffline) {
1688 LOG.warn("Using unassign region " + regionString
1689 + " instead of using offline method, you should"
1690 + " restart HMaster after these repairs");
1691 admin.unassign(regionName, true);
1692 return;
1693 }
1694
1695
1696 try {
1697 LOG.info("Offlining region " + regionString);
1698 admin.getMaster().offline(regionName);
1699 } catch (IOException ioe) {
1700 String notFoundMsg = "java.lang.NoSuchMethodException: " +
1701 "org.apache.hadoop.hbase.master.HMaster.offline([B)";
1702 if (ioe.getMessage().contains(notFoundMsg)) {
1703 LOG.warn("Using unassign region " + regionString
1704 + " instead of using offline method, you should"
1705 + " restart HMaster after these repairs");
1706 rsSupportsOffline = false;
1707 admin.unassign(regionName, true);
1708 return;
1709 }
1710 throw ioe;
1711 }
1712 }
1713
1714 private void undeployRegions(HbckInfo hi) throws IOException, InterruptedException {
1715 for (OnlineEntry rse : hi.deployedEntries) {
1716 LOG.debug("Undeploy region " + rse.hri + " from " + rse.hsa);
1717 try {
1718 HBaseFsckRepair.closeRegionSilentlyAndWait(admin, rse.hsa, rse.hri);
1719 offline(rse.hri.getRegionName());
1720 } catch (IOException ioe) {
1721 LOG.warn("Got exception when attempting to offline region "
1722 + Bytes.toString(rse.hri.getRegionName()), ioe);
1723 }
1724 }
1725 }
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739 @SuppressWarnings("deprecation")
1740 private void closeRegion(HbckInfo hi) throws IOException, InterruptedException {
1741 if (hi.metaEntry == null && hi.hdfsEntry == null) {
1742 undeployRegions(hi);
1743 return;
1744 }
1745
1746
1747 Get get = new Get(hi.getRegionName());
1748 get.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
1749 get.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
1750 get.addColumn(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER);
1751 Result r = meta.get(get);
1752 byte[] value = r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
1753 byte[] startcodeBytes = r.getValue(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER);
1754 if (value == null || startcodeBytes == null) {
1755 errors.reportError("Unable to close region "
1756 + hi.getRegionNameAsString() + " because .META. does not "
1757 + "have a handle to reach it.");
1758 return;
1759 }
1760 long startcode = Bytes.toLong(startcodeBytes);
1761
1762 ServerName hsa = new ServerName(Bytes.toString(value), startcode);
1763 byte[] hriVal = r.getValue(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
1764 HRegionInfo hri= Writables.getHRegionInfoOrNull(hriVal);
1765 if (hri == null) {
1766 LOG.warn("Unable to close region " + hi.getRegionNameAsString()
1767 + " because META had invalid or missing "
1768 + HConstants.CATALOG_FAMILY_STR + ":"
1769 + Bytes.toString(HConstants.REGIONINFO_QUALIFIER)
1770 + " qualifier value.");
1771 return;
1772 }
1773
1774
1775 HBaseFsckRepair.closeRegionSilentlyAndWait(admin, hsa, hri);
1776 }
1777
1778 private void tryAssignmentRepair(HbckInfo hbi, String msg) throws IOException,
1779 KeeperException, InterruptedException {
1780
1781 if (shouldFixAssignments()) {
1782 errors.print(msg);
1783 undeployRegions(hbi);
1784 setShouldRerun();
1785 HRegionInfo hri = hbi.getHdfsHRI();
1786 if (hri == null) {
1787 hri = hbi.metaEntry;
1788 }
1789 HBaseFsckRepair.fixUnassigned(admin, hri);
1790 HBaseFsckRepair.waitUntilAssigned(admin, hri);
1791 }
1792 }
1793
1794
1795
1796
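/**
 * Classifies one region by where it is known (.META., HDFS, deployed on
 * region servers) and, for each inconsistent combination, reports an error
 * and applies the matching repair when the corresponding fix option is
 * enabled.
 */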
1797 private void checkRegionConsistency(final String key, final HbckInfo hbi)
1798 throws IOException, KeeperException, InterruptedException {
1799 String descriptiveName = hbi.toString();
1800
1801 boolean inMeta = hbi.metaEntry != null;
1802
1803 boolean inHdfs = !shouldCheckHdfs() || hbi.getHdfsRegionDir() != null;
1804 boolean hasMetaAssignment = inMeta && hbi.metaEntry.regionServer != null;
1805 boolean isDeployed = !hbi.deployedOn.isEmpty();
1806 boolean isMultiplyDeployed = hbi.deployedOn.size() > 1;
1807 boolean deploymentMatchesMeta =
1808 hasMetaAssignment && isDeployed && !isMultiplyDeployed &&
1809 hbi.metaEntry.regionServer.equals(hbi.deployedOn.get(0));
1810 boolean splitParent =
1811 (hbi.metaEntry == null)? false: hbi.metaEntry.isSplit() && hbi.metaEntry.isOffline();
1812 boolean shouldBeDeployed = inMeta && !isTableDisabled(hbi.metaEntry);
1813 boolean recentlyModified = inHdfs &&
1814 hbi.getModTime() + timelag > System.currentTimeMillis();
1815
1816
1817 if (hbi.containsOnlyHdfsEdits()) {
1818 return;
1819 }
1820 if (inMeta && inHdfs && isDeployed && deploymentMatchesMeta && shouldBeDeployed) {
1821 return;
1822 } else if (inMeta && inHdfs && !shouldBeDeployed && !isDeployed) {
1823 LOG.info("Region " + descriptiveName + " is in META, and in a disabled " +
1824 "tabled that is not deployed");
1825 return;
1826 } else if (recentlyModified) {
1827 LOG.warn("Region " + descriptiveName + " was recently modified -- skipping");
1828 return;
1829 }
1830
1831 else if (!inMeta && !inHdfs && !isDeployed) {
1832
1833 assert false : "Entry for region with no data";
1834 } else if (!inMeta && !inHdfs && isDeployed) {
1835 errors.reportError(ERROR_CODE.NOT_IN_META_HDFS, "Region "
1836 + descriptiveName + ", key=" + key + ", not on HDFS or in META but " +
1837 "deployed on " + Joiner.on(", ").join(hbi.deployedOn));
1838 if (shouldFixAssignments()) {
1839 undeployRegions(hbi);
1840 }
1841
1842 } else if (!inMeta && inHdfs && !isDeployed) {
1843 errors.reportError(ERROR_CODE.NOT_IN_META_OR_DEPLOYED, "Region "
1844 + descriptiveName + " on HDFS, but not listed in META " +
1845 "or deployed on any region server");
1846
1847 if (shouldFixMeta()) {
1848 if (!hbi.isHdfsRegioninfoPresent()) {
1849 LOG.error("Region " + hbi.getHdfsHRI() + " could have been repaired"
1850 + " in table integrity repair phase if -fixHdfsOrphans was" +
1851 " used.");
1852 return;
1853 }
1854
1855 LOG.info("Patching .META. with .regioninfo: " + hbi.getHdfsHRI());
1856 HBaseFsckRepair.fixMetaHoleOnline(getConf(), hbi.getHdfsHRI());
1857
1858 tryAssignmentRepair(hbi, "Trying to reassign region...");
1859 }
1860
1861 } else if (!inMeta && inHdfs && isDeployed) {
1862 errors.reportError(ERROR_CODE.NOT_IN_META, "Region " + descriptiveName
1863 + " not in META, but deployed on " + Joiner.on(", ").join(hbi.deployedOn));
1864 debugLsr(hbi.getHdfsRegionDir());
1865 if (shouldFixMeta()) {
1866 if (!hbi.isHdfsRegioninfoPresent()) {
1867 LOG.error("This should have been repaired in table integrity repair phase");
1868 return;
1869 }
1870
1871 LOG.info("Patching .META. with with .regioninfo: " + hbi.getHdfsHRI());
1872 HBaseFsckRepair.fixMetaHoleOnline(getConf(), hbi.getHdfsHRI());
1873
1874 tryAssignmentRepair(hbi, "Trying to fix unassigned region...");
1875 }
1876
1877
1878 } else if (inMeta && inHdfs && !isDeployed && splitParent) {
1879
1880
1881 if (hbi.metaEntry.splitA != null && hbi.metaEntry.splitB != null) {
1882
1883 HbckInfo infoA = this.regionInfoMap.get(hbi.metaEntry.splitA.getEncodedName());
1884 HbckInfo infoB = this.regionInfoMap.get(hbi.metaEntry.splitB.getEncodedName());
1885 if (infoA != null && infoB != null) {
1886
1887 hbi.setSkipChecks(true);
1888 return;
1889 }
1890 }
1891 errors.reportError(ERROR_CODE.LINGERING_SPLIT_PARENT, "Region "
1892 + descriptiveName + " is a split parent in META, in HDFS, "
1893 + "and not deployed on any region server. This could be transient.");
1894 if (shouldFixSplitParents()) {
1895 setShouldRerun();
1896 resetSplitParent(hbi);
1897 }
1898 } else if (inMeta && !inHdfs && !isDeployed) {
1899 errors.reportError(ERROR_CODE.NOT_IN_HDFS_OR_DEPLOYED, "Region "
1900 + descriptiveName + " found in META, but not in HDFS "
1901 + "or deployed on any region server.");
1902 if (shouldFixMeta()) {
1903 deleteMetaRegion(hbi);
1904 }
1905 } else if (inMeta && !inHdfs && isDeployed) {
1906 errors.reportError(ERROR_CODE.NOT_IN_HDFS, "Region " + descriptiveName
1907 + " found in META, but not in HDFS, " +
1908 "and deployed on " + Joiner.on(", ").join(hbi.deployedOn));
1909
1910
1911
1912 if (shouldFixAssignments()) {
1913 errors.print("Trying to fix unassigned region...");
1914 closeRegion(hbi);
1915 }
1916 if (shouldFixMeta()) {
1917
1918 deleteMetaRegion(hbi);
1919 }
1920 } else if (inMeta && inHdfs && !isDeployed && shouldBeDeployed) {
1921 errors.reportError(ERROR_CODE.NOT_DEPLOYED, "Region " + descriptiveName
1922 + " not deployed on any region server.");
1923 tryAssignmentRepair(hbi, "Trying to fix unassigned region...");
1924 } else if (inMeta && inHdfs && isDeployed && !shouldBeDeployed) {
1925 errors.reportError(ERROR_CODE.SHOULD_NOT_BE_DEPLOYED,
1926 "Region " + descriptiveName + " should not be deployed according " +
1927 "to META, but is deployed on " + Joiner.on(", ").join(hbi.deployedOn));
1928 if (shouldFixAssignments()) {
1929 errors.print("Trying to close the region " + descriptiveName);
1930 setShouldRerun();
1931 HBaseFsckRepair.fixMultiAssignment(admin, hbi.metaEntry, hbi.deployedOn);
1932 }
1933 } else if (inMeta && inHdfs && isMultiplyDeployed) {
1934 errors.reportError(ERROR_CODE.MULTI_DEPLOYED, "Region " + descriptiveName
1935 + " is listed in META on region server " + hbi.metaEntry.regionServer
1936 + " but is multiply assigned to region servers " +
1937 Joiner.on(", ").join(hbi.deployedOn));
1938
1939 if (shouldFixAssignments()) {
1940 errors.print("Trying to fix assignment error...");
1941 setShouldRerun();
1942 HBaseFsckRepair.fixMultiAssignment(admin, hbi.metaEntry, hbi.deployedOn);
1943 }
1944 } else if (inMeta && inHdfs && isDeployed && !deploymentMatchesMeta) {
1945 errors.reportError(ERROR_CODE.SERVER_DOES_NOT_MATCH_META, "Region "
1946 + descriptiveName + " listed in META on region server " +
1947 hbi.metaEntry.regionServer + " but found on region server " +
1948 hbi.deployedOn.get(0));
1949
1950 if (shouldFixAssignments()) {
1951 errors.print("Trying to fix assignment error...");
1952 setShouldRerun();
1953 HBaseFsckRepair.fixMultiAssignment(admin, hbi.metaEntry, hbi.deployedOn);
1954 HBaseFsckRepair.waitUntilAssigned(admin, hbi.getHdfsHRI());
1955 }
1956 } else {
1957 errors.reportError(ERROR_CODE.UNKNOWN, "Region " + descriptiveName +
1958 " is in an unforeseen state:" +
1959 " inMeta=" + inMeta +
1960 " inHdfs=" + inHdfs +
1961 " isDeployed=" + isDeployed +
1962 " isMultiplyDeployed=" + isMultiplyDeployed +
1963 " deploymentMatchesMeta=" + deploymentMatchesMeta +
1964 " shouldBeDeployed=" + shouldBeDeployed);
1965 }
1966 }
1967
1968
1969
1970
1971
1972
1973
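/**
 * Checks table integrity: groups every known region by table, skips regions that
 * cannot be checked (no META entry, no assigned server, offline, or edits-only),
 * and runs the region-chain check for each table, reporting holes and overlaps.
 *
 * @return map of table name to the collected TableInfo
 */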
1974 SortedMap<String, TableInfo> checkIntegrity() throws IOException {
1975 tablesInfo = new TreeMap<String, TableInfo>();
1976 List<HbckInfo> noHDFSRegionInfos = new ArrayList<HbckInfo>();
1977 LOG.debug("There are " + regionInfoMap.size() + " region info entries");
1978 for (HbckInfo hbi : regionInfoMap.values()) {
1979
1980 if (hbi.metaEntry == null) {
1981
1982 noHDFSRegionInfos.add(hbi);
1983 Path p = hbi.getHdfsRegionDir();
1984 if (p == null) {
1985 errors.report("No regioninfo in Meta or HDFS. " + hbi);
1986 }
1987
1988
1989 continue;
1990 }
1991 if (hbi.metaEntry.regionServer == null) {
1992 errors.detail("Skipping region because no region server: " + hbi);
1993 continue;
1994 }
1995 if (hbi.metaEntry.isOffline()) {
1996 errors.detail("Skipping region because it is offline: " + hbi);
1997 continue;
1998 }
1999 if (hbi.containsOnlyHdfsEdits()) {
2000 errors.detail("Skipping region because it only contains edits: " + hbi);
2001 continue;
2002 }
2003
2004
2005
2006
2007
2008
2009 if (hbi.deployedOn.size() == 0) continue;
2010
2011
2012 String tableName = hbi.metaEntry.getTableNameAsString();
2013 TableInfo modTInfo = tablesInfo.get(tableName);
2014 if (modTInfo == null) {
2015 modTInfo = new TableInfo(tableName);
2016 }
2017 for (ServerName server : hbi.deployedOn) {
2018 modTInfo.addServer(server);
2019 }
2020
2021 if (!hbi.isSkipChecks()) {
2022 modTInfo.addRegionInfo(hbi);
2023 }
2024
2025 tablesInfo.put(tableName, modTInfo);
2026 }
2027
2028 for (TableInfo tInfo : tablesInfo.values()) {
2029 TableIntegrityErrorHandler handler = tInfo.new IntegrityFixSuggester(tInfo, errors);
2030 if (!tInfo.checkRegionChain(handler)) {
2031 errors.report("Found inconsistency in table " + tInfo.getName());
2032 }
2033 }
2034 return tablesInfo;
2035 }
2036
2037
2038
2039
2040
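/**
 * Moves the store files from the contained region's column family directories into
 * the corresponding directories of the target region, then sidelines what remains of
 * the contained region directory.
 *
 * @return number of files moved into the target region directory
 */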
2041 public int mergeRegionDirs(Path targetRegionDir, HbckInfo contained) throws IOException {
2042 int fileMoves = 0;
2043 String thread = Thread.currentThread().getName();
2044 LOG.debug("[" + thread + "] Contained region dir after close and pause");
2045 debugLsr(contained.getHdfsRegionDir());
2046
2047
2048 FileSystem fs = targetRegionDir.getFileSystem(getConf());
2049 FileStatus[] dirs = null;
2050 try {
2051 dirs = fs.listStatus(contained.getHdfsRegionDir());
2052 } catch (FileNotFoundException fnfe) {
2053
2054
2055 if (!fs.exists(contained.getHdfsRegionDir())) {
2056 LOG.warn("[" + thread + "] HDFS region dir " + contained.getHdfsRegionDir()
2057 + " is missing. Assuming already sidelined or moved.");
2058 } else {
2059 sidelineRegionDir(fs, contained);
2060 }
2061 return fileMoves;
2062 }
2063
2064 if (dirs == null) {
2065 if (!fs.exists(contained.getHdfsRegionDir())) {
2066 LOG.warn("[" + thread + "] HDFS region dir " + contained.getHdfsRegionDir()
2067 + " already sidelined.");
2068 } else {
2069 sidelineRegionDir(fs, contained);
2070 }
2071 return fileMoves;
2072 }
2073
2074 for (FileStatus cf : dirs) {
2075 Path src = cf.getPath();
2076 Path dst = new Path(targetRegionDir, src.getName());
2077
2078 if (src.getName().equals(HRegion.REGIONINFO_FILE)) {
2079
2080 continue;
2081 }
2082
2083 if (src.getName().equals(HConstants.HREGION_OLDLOGDIR_NAME)) {
2084
2085 continue;
2086 }
2087
2088 LOG.info("[" + thread + "] Moving files from " + src + " into containing region " + dst);
2089
2090
2091
2092
2093 for (FileStatus hfile : fs.listStatus(src)) {
2094 boolean success = fs.rename(hfile.getPath(), dst);
2095 if (success) {
2096 fileMoves++;
2097 }
2098 }
2099 LOG.debug("[" + thread + "] Sideline directory contents:");
2100 debugLsr(targetRegionDir);
2101 }
2102
2103
2104 sidelineRegionDir(fs, contained);
2105 LOG.info("[" + thread + "] Sidelined region dir " + contained.getHdfsRegionDir() + " into " +
2106 getSidelineDir());
2107 debugLsr(contained.getHdfsRegionDir());
2108
2109 return fileMoves;
2110 }
2111
2112
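/**
 * Executor work item that hands one group of overlapping regions to the table
 * integrity handler, so overlap groups can be repaired in parallel.
 */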
2113 static class WorkItemOverlapMerge implements Callable<Void> {
2114 private TableIntegrityErrorHandler handler;
2115 Collection<HbckInfo> overlapgroup;
2116
2117 WorkItemOverlapMerge(Collection<HbckInfo> overlapgroup, TableIntegrityErrorHandler handler) {
2118 this.handler = handler;
2119 this.overlapgroup = overlapgroup;
2120 }
2121
2122 @Override
2123 public Void call() throws Exception {
2124 handler.handleOverlapGroup(overlapgroup);
2125 return null;
2126 }
2127 };
2128
2129
2130
2131
2132
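/**
 * Maintains everything hbck knows about a single table: the regions discovered for
 * it, the servers it is deployed on, its table descriptors, regions with backwards
 * start/end keys, sidelined regions, and the detected overlap groups.
 */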
2133 public class TableInfo {
2134 String tableName;
2135 TreeSet<ServerName> deployedOn;
2136
2137
2138 final List<HbckInfo> backwards = new ArrayList<HbckInfo>();
2139
2140
2141 final Map<Path, HbckInfo> sidelinedRegions = new HashMap<Path, HbckInfo>();
2142
2143
2144 final RegionSplitCalculator<HbckInfo> sc = new RegionSplitCalculator<HbckInfo>(cmp);
2145
2146
2147 final Set<HTableDescriptor> htds = new HashSet<HTableDescriptor>();
2148
2149
2150 final Multimap<byte[], HbckInfo> overlapGroups =
2151 TreeMultimap.create(RegionSplitCalculator.BYTES_COMPARATOR, cmp);
2152
2153 TableInfo(String name) {
2154 this.tableName = name;
2155 deployedOn = new TreeSet<ServerName>();
2156 }
2157
2158
2159
2160
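/**
 * @return the table descriptor if exactly one was collected for this table,
 *         otherwise logs an error and returns null
 */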
2161 private HTableDescriptor getHTD() {
2162 if (htds.size() == 1) {
2163 return (HTableDescriptor)htds.toArray()[0];
2164 } else {
2165 LOG.error("None/Multiple table descriptors found for table '"
2166 + tableName + "' regions: " + htds);
2167 }
2168 return null;
2169 }
2170
2171 public void addRegionInfo(HbckInfo hir) {
2172 if (Bytes.equals(hir.getEndKey(), HConstants.EMPTY_END_ROW)) {
2173
2174 sc.add(hir);
2175 return;
2176 }
2177
2178
2179 if (Bytes.compareTo(hir.getStartKey(), hir.getEndKey()) > 0) {
2180 errors.reportError(
2181 ERROR_CODE.REGION_CYCLE,
2182 String.format("The endkey for this region comes before the "
2183 + "startkey, startkey=%s, endkey=%s",
2184 Bytes.toStringBinary(hir.getStartKey()),
2185 Bytes.toStringBinary(hir.getEndKey())), this, hir);
2186 backwards.add(hir);
2187 return;
2188 }
2189
2190
2191 sc.add(hir);
2192 }
2193
2194 public void addServer(ServerName server) {
2195 this.deployedOn.add(server);
2196 }
2197
2198 public String getName() {
2199 return tableName;
2200 }
2201
2202 public int getNumRegions() {
2203 return sc.getStarts().size() + backwards.size();
2204 }
2205
2206 private class IntegrityFixSuggester extends TableIntegrityErrorHandlerImpl {
2207 ErrorReporter errors;
2208
2209 IntegrityFixSuggester(TableInfo ti, ErrorReporter errors) {
2210 this.errors = errors;
2211 setTableInfo(ti);
2212 }
2213
2214 @Override
2215 public void handleRegionStartKeyNotEmpty(HbckInfo hi) throws IOException{
2216 errors.reportError(ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY,
2217 "First region should start with an empty key. You need to "
2218 + " create a new region and regioninfo in HDFS to plug the hole.",
2219 getTableInfo(), hi);
2220 }
2221
2222 @Override
2223 public void handleRegionEndKeyNotEmpty(byte[] curEndKey) throws IOException {
2224 errors.reportError(ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY,
2225 "Last region should end with an empty key. You need to "
2226 + "create a new region and regioninfo in HDFS to plug the hole.", getTableInfo());
2227 }
2228
2229 @Override
2230 public void handleDegenerateRegion(HbckInfo hi) throws IOException{
2231 errors.reportError(ERROR_CODE.DEGENERATE_REGION,
2232 "Region has the same start and end key.", getTableInfo(), hi);
2233 }
2234
2235 @Override
2236 public void handleDuplicateStartKeys(HbckInfo r1, HbckInfo r2) throws IOException{
2237 byte[] key = r1.getStartKey();
2238
2239 errors.reportError(ERROR_CODE.DUPE_STARTKEYS,
2240 "Multiple regions have the same startkey: "
2241 + Bytes.toStringBinary(key), getTableInfo(), r1);
2242 errors.reportError(ERROR_CODE.DUPE_STARTKEYS,
2243 "Multiple regions have the same startkey: "
2244 + Bytes.toStringBinary(key), getTableInfo(), r2);
2245 }
2246
2247 @Override
2248 public void handleOverlapInRegionChain(HbckInfo hi1, HbckInfo hi2) throws IOException{
2249 errors.reportError(ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
2250 "There is an overlap in the region chain.",
2251 getTableInfo(), hi1, hi2);
2252 }
2253
2254 @Override
2255 public void handleHoleInRegionChain(byte[] holeStart, byte[] holeStop) throws IOException{
2256 errors.reportError(
2257 ERROR_CODE.HOLE_IN_REGION_CHAIN,
2258 "There is a hole in the region chain between "
2259 + Bytes.toStringBinary(holeStart) + " and "
2260 + Bytes.toStringBinary(holeStop)
2261 + ". You need to create a new .regioninfo and region "
2262 + "dir in hdfs to plug the hole.");
2263 }
2264 };
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
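/**
 * Integrity error handler that, unlike the suggester above, actively repairs
 * problems in HDFS: it plugs holes and missing boundary regions by creating new
 * empty regions, and merges or sidelines overlapping regions.
 */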
2278 private class HDFSIntegrityFixer extends IntegrityFixSuggester {
2279 Configuration conf;
2280
2281 boolean fixOverlaps = true;
2282
2283 HDFSIntegrityFixer(TableInfo ti, ErrorReporter errors, Configuration conf,
2284 boolean fixHoles, boolean fixOverlaps) {
2285 super(ti, errors);
2286 this.conf = conf;
2287 this.fixOverlaps = fixOverlaps;
2288
2289 }
2290
2291
2292
2293
2294
2295
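/**
 * Handles the special hole where the first region of the table is missing: creates
 * a new empty region spanning from the empty start key up to the start key of the
 * first known region.
 */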
2296 public void handleRegionStartKeyNotEmpty(HbckInfo next) throws IOException {
2297 errors.reportError(ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY,
2298 "First region should start with an empty key. Creating a new " +
2299 "region and regioninfo in HDFS to plug the hole.",
2300 getTableInfo(), next);
2301 HTableDescriptor htd = getTableInfo().getHTD();
2302
2303 HRegionInfo newRegion = new HRegionInfo(htd.getName(),
2304 HConstants.EMPTY_START_ROW, next.getStartKey());
2305
2306
2307 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2308 LOG.info("Table region start key was not empty. Created new empty region: "
2309 + newRegion + " " + region);
2310 fixes++;
2311 }
2312
2313 public void handleRegionEndKeyNotEmpty(byte[] curEndKey) throws IOException {
2314 errors.reportError(ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY,
2315 "Last region should end with an empty key. Creating a new "
2316 + "region and regioninfo in HDFS to plug the hole.", getTableInfo());
2317 HTableDescriptor htd = getTableInfo().getHTD();
2318
2319 HRegionInfo newRegion = new HRegionInfo(htd.getName(), curEndKey,
2320 HConstants.EMPTY_START_ROW);
2321
2322 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2323 LOG.info("Table region end key was not empty. Created new empty region: " + newRegion
2324 + " " + region);
2325 fixes++;
2326 }
2327
2328
2329
2330
2331
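/**
 * Plugs a hole in the region chain by creating a new empty region, and its region
 * directory in HDFS, covering exactly the missing key range.
 */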
2332 public void handleHoleInRegionChain(byte[] holeStartKey, byte[] holeStopKey) throws IOException {
2333 errors.reportError(
2334 ERROR_CODE.HOLE_IN_REGION_CHAIN,
2335 "There is a hole in the region chain between "
2336 + Bytes.toStringBinary(holeStartKey) + " and "
2337 + Bytes.toStringBinary(holeStopKey)
2338 + ". Creating a new regioninfo and region "
2339 + "dir in hdfs to plug the hole.");
2340 HTableDescriptor htd = getTableInfo().getHTD();
2341 HRegionInfo newRegion = new HRegionInfo(htd.getName(), holeStartKey, holeStopKey);
2342 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2343 LOG.info("Plugged hole by creating new empty region: " + newRegion + " " + region);
2344 fixes++;
2345 }
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
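/**
 * Repairs a group of overlapping regions. Small groups are merged into a single
 * container region; groups larger than maxMerge are optionally sidelined instead
 * (see sidelineBigOverlaps).
 */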
2358 @Override
2359 public void handleOverlapGroup(Collection<HbckInfo> overlap)
2360 throws IOException {
2361 Preconditions.checkNotNull(overlap);
2362 Preconditions.checkArgument(overlap.size() > 0);
2363
2364 if (!this.fixOverlaps) {
2365 LOG.warn("Not attempting to repair overlaps.");
2366 return;
2367 }
2368
2369 if (overlap.size() > maxMerge) {
2370 LOG.warn("Overlap group has " + overlap.size() + " overlapping " +
2371 "regions which is greater than " + maxMerge + ", the max number of regions to merge");
2372 if (sidelineBigOverlaps) {
2373
2374 sidelineBigOverlaps(overlap);
2375 }
2376 return;
2377 }
2378
2379 mergeOverlaps(overlap);
2380 }
2381
2382 void mergeOverlaps(Collection<HbckInfo> overlap)
2383 throws IOException {
2384 String thread = Thread.currentThread().getName();
2385 LOG.info("== [" + thread + "] Merging regions into one region: "
2386 + Joiner.on(",").join(overlap));
2387
2388 Pair<byte[], byte[]> range = null;
2389 for (HbckInfo hi : overlap) {
2390 if (range == null) {
2391 range = new Pair<byte[], byte[]>(hi.getStartKey(), hi.getEndKey());
2392 } else {
2393 if (RegionSplitCalculator.BYTES_COMPARATOR
2394 .compare(hi.getStartKey(), range.getFirst()) < 0) {
2395 range.setFirst(hi.getStartKey());
2396 }
2397 if (RegionSplitCalculator.BYTES_COMPARATOR
2398 .compare(hi.getEndKey(), range.getSecond()) > 0) {
2399 range.setSecond(hi.getEndKey());
2400 }
2401 }
2402
2403 LOG.debug("[" + thread + "] Closing region before moving data around: " + hi);
2404 LOG.debug("[" + thread + "] Contained region dir before close");
2405 debugLsr(hi.getHdfsRegionDir());
2406 try {
2407 LOG.info("[" + thread + "] Closing region: " + hi);
2408 closeRegion(hi);
2409 } catch (IOException ioe) {
2410 LOG.warn("[" + thread + "] Was unable to close region " + hi
2411 + ". Just continuing... ", ioe);
2412 } catch (InterruptedException e) {
2413 LOG.warn("[" + thread + "] Was unable to close region " + hi
2414 + ". Just continuing... ", e);
2415 }
2416
2417 try {
2418 LOG.info("[" + thread + "] Offlining region: " + hi);
2419 offline(hi.getRegionName());
2420 } catch (IOException ioe) {
2421 LOG.warn("[" + thread + "] Unable to offline region from master: " + hi
2422 + ". Just continuing... ", ioe);
2423 }
2424 }
2425
2426
2427 HTableDescriptor htd = getTableInfo().getHTD();
2428
2429 HRegionInfo newRegion = new HRegionInfo(htd.getName(), range.getFirst(),
2430 range.getSecond());
2431 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2432 LOG.info("[" + thread + "] Created new empty container region: " +
2433 newRegion + " to contain regions: " + Joiner.on(",").join(overlap));
2434 debugLsr(region.getRegionDir());
2435
2436
2437 boolean didFix = false;
2438 Path target = region.getRegionDir();
2439 for (HbckInfo contained : overlap) {
2440 LOG.info("[" + thread + "] Merging " + contained + " into " + target);
2441 int merges = mergeRegionDirs(target, contained);
2442 if (merges > 0) {
2443 didFix = true;
2444 }
2445 }
2446 if (didFix) {
2447 fixes++;
2448 }
2449 }
2450
2451
2452
2453
2454
2455
2456
2457
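/**
 * Sidelines the largest regions from an oversized overlap group so that the
 * remaining regions can still be merged. At most maxOverlapsToSideline regions are
 * closed, offlined and moved to the sideline directory for later bulk loading.
 */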
2458 void sidelineBigOverlaps(
2459 Collection<HbckInfo> bigOverlap) throws IOException {
2460 int overlapsToSideline = bigOverlap.size() - maxMerge;
2461 if (overlapsToSideline > maxOverlapsToSideline) {
2462 overlapsToSideline = maxOverlapsToSideline;
2463 }
2464 List<HbckInfo> regionsToSideline =
2465 RegionSplitCalculator.findBigRanges(bigOverlap, overlapsToSideline);
2466 FileSystem fs = FileSystem.get(conf);
2467 for (HbckInfo regionToSideline: regionsToSideline) {
2468 try {
2469 LOG.info("Closing region: " + regionToSideline);
2470 closeRegion(regionToSideline);
2471 } catch (IOException ioe) {
2472 LOG.warn("Was unable to close region " + regionToSideline
2473 + ". Just continuing... ", ioe);
2474 } catch (InterruptedException e) {
2475 LOG.warn("Was unable to close region " + regionToSideline
2476 + ". Just continuing... ", e);
2477 }
2478
2479 try {
2480 LOG.info("Offlining region: " + regionToSideline);
2481 offline(regionToSideline.getRegionName());
2482 } catch (IOException ioe) {
2483 LOG.warn("Unable to offline region from master: " + regionToSideline
2484 + ". Just continuing... ", ioe);
2485 }
2486
2487 LOG.info("Before sidelining big overlapped region: " + regionToSideline.toString());
2488 Path sidelineRegionDir = sidelineRegionDir(fs, TO_BE_LOADED, regionToSideline);
2489 if (sidelineRegionDir != null) {
2490 sidelinedRegions.put(sidelineRegionDir, regionToSideline);
2491 LOG.info("After sidelining big overlapped region: "
2492 + regionToSideline.getRegionNameAsString()
2493 + " to " + sidelineRegionDir.toString());
2494 fixes++;
2495 }
2496 }
2497 }
2498 }
2499
2500
2501
2502
2503
2504
2505
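/**
 * Walks the split points of this table and checks that its regions form a single
 * continuous, non-overlapping chain: reports non-empty first/last boundary keys,
 * degenerate regions, duplicate start keys, overlaps and holes, then hands the
 * collected overlap groups to the supplied handler.
 *
 * @return true if no new integrity errors were found for this table
 */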
2506 public boolean checkRegionChain(TableIntegrityErrorHandler handler) throws IOException {
2507
2508
2509
2510 if (disabledTables.contains(this.tableName.getBytes())) {
2511 return true;
2512 }
2513 int originalErrorsCount = errors.getErrorList().size();
2514 Multimap<byte[], HbckInfo> regions = sc.calcCoverage();
2515 SortedSet<byte[]> splits = sc.getSplits();
2516
2517 byte[] prevKey = null;
2518 byte[] problemKey = null;
2519 for (byte[] key : splits) {
2520 Collection<HbckInfo> ranges = regions.get(key);
2521 if (prevKey == null && !Bytes.equals(key, HConstants.EMPTY_BYTE_ARRAY)) {
2522 for (HbckInfo rng : ranges) {
2523 handler.handleRegionStartKeyNotEmpty(rng);
2524 }
2525 }
2526
2527
2528 for (HbckInfo rng : ranges) {
2529
2530 byte[] endKey = rng.getEndKey();
2531 endKey = (endKey.length == 0) ? null : endKey;
2532 if (Bytes.equals(rng.getStartKey(),endKey)) {
2533 handler.handleDegenerateRegion(rng);
2534 }
2535 }
2536
2537 if (ranges.size() == 1) {
2538
2539 if (problemKey != null) {
2540 LOG.warn("Reached end of problem group: " + Bytes.toStringBinary(key));
2541 }
2542 problemKey = null;
2543 } else if (ranges.size() > 1) {
2544
2545
2546 if (problemKey == null) {
2547
2548 LOG.warn("Naming new problem group: " + Bytes.toStringBinary(key));
2549 problemKey = key;
2550 }
2551 overlapGroups.putAll(problemKey, ranges);
2552
2553
2554 ArrayList<HbckInfo> subRange = new ArrayList<HbckInfo>(ranges);
2555
2556 for (HbckInfo r1 : ranges) {
2557 subRange.remove(r1);
2558 for (HbckInfo r2 : subRange) {
2559 if (Bytes.compareTo(r1.getStartKey(), r2.getStartKey())==0) {
2560 handler.handleDuplicateStartKeys(r1,r2);
2561 } else {
2562
2563 handler.handleOverlapInRegionChain(r1, r2);
2564 }
2565 }
2566 }
2567
2568 } else if (ranges.size() == 0) {
2569 if (problemKey != null) {
2570 LOG.warn("Reached end of problem group: " + Bytes.toStringBinary(key));
2571 }
2572 problemKey = null;
2573
2574 byte[] holeStopKey = sc.getSplits().higher(key);
2575
2576 if (holeStopKey != null) {
2577
2578 handler.handleHoleInRegionChain(key, holeStopKey);
2579 }
2580 }
2581 prevKey = key;
2582 }
2583
2584
2585
2586 if (prevKey != null) {
2587 handler.handleRegionEndKeyNotEmpty(prevKey);
2588 }
2589
2590
2591 if (getConf().getBoolean("hbasefsck.overlap.merge.parallel", true)) {
2592 LOG.info("Handling overlap merges in parallel. Set hbasefsck.overlap.merge.parallel to" +
2593 " false to run serially.");
2594 boolean ok = handleOverlapsParallel(handler, prevKey);
2595 if (!ok) {
2596 return false;
2597 }
2598 } else {
2599 LOG.info("Handling overlap merges serially. Set hbasefsck.overlap.merge.parallel to" +
2600 " true to run in parallel.");
2601 for (Collection<HbckInfo> overlap : overlapGroups.asMap().values()) {
2602 handler.handleOverlapGroup(overlap);
2603 }
2604 }
2605
2606 if (details) {
2607
2608 errors.print("---- Table '" + this.tableName
2609 + "': region split map");
2610 dump(splits, regions);
2611 errors.print("---- Table '" + this.tableName
2612 + "': overlap groups");
2613 dumpOverlapProblems(overlapGroups);
2614 errors.print("There are " + overlapGroups.keySet().size()
2615 + " overlap groups with " + overlapGroups.size()
2616 + " overlapping regions");
2617 }
2618 if (!sidelinedRegions.isEmpty()) {
2619 LOG.warn("Sidelined big overlapped regions, please bulk load them!");
2620 errors.print("---- Table '" + this.tableName
2621 + "': sidelined big overlapped regions");
2622 dumpSidelinedRegions(sidelinedRegions);
2623 }
2624 return errors.getErrorList().size() == originalErrorsCount;
2625 }
2626
2627 private boolean handleOverlapsParallel(TableIntegrityErrorHandler handler, byte[] prevKey)
2628 throws IOException {
2629
2630
2631 List<WorkItemOverlapMerge> merges = new ArrayList<WorkItemOverlapMerge>(overlapGroups.size());
2632 List<Future<Void>> rets;
2633 for (Collection<HbckInfo> overlap : overlapGroups.asMap().values()) {
2634
2635 merges.add(new WorkItemOverlapMerge(overlap, handler));
2636 }
2637 try {
2638 rets = executor.invokeAll(merges);
2639 } catch (InterruptedException e) {
2640 e.printStackTrace();
2641 LOG.error("Overlap merges were interrupted", e);
2642 return false;
2643 }
2644 for(int i=0; i<merges.size(); i++) {
2645 WorkItemOverlapMerge work = merges.get(i);
2646 Future<Void> f = rets.get(i);
2647 try {
2648 f.get();
2649 } catch(ExecutionException e) {
2650 LOG.warn("Failed to merge overlap group " + work, e.getCause());
2651 } catch (InterruptedException e) {
2652 LOG.error("Waiting for overlap merges was interrupted", e);
2653 return false;
2654 }
2655 }
2656 return true;
2657 }
2658
2659
2660
2661
2662
2663
2664
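/**
 * Dumps the computed split-point coverage for this table: for every split key,
 * prints the regions that start at that key together with their end keys.
 */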
2665 void dump(SortedSet<byte[]> splits, Multimap<byte[], HbckInfo> regions) {
2666
2667 StringBuilder sb = new StringBuilder();
2668 for (byte[] k : splits) {
2669 sb.setLength(0);
2670 sb.append(Bytes.toStringBinary(k) + ":\t");
2671 for (HbckInfo r : regions.get(k)) {
2672 sb.append("[ "+ r.toString() + ", "
2673 + Bytes.toStringBinary(r.getEndKey())+ "]\t");
2674 }
2675 errors.print(sb.toString());
2676 }
2677 }
2678 }
2679
2680 public void dumpOverlapProblems(Multimap<byte[], HbckInfo> regions) {
2681
2682
2683 for (byte[] k : regions.keySet()) {
2684 errors.print(Bytes.toStringBinary(k) + ":");
2685 for (HbckInfo r : regions.get(k)) {
2686 errors.print("[ " + r.toString() + ", "
2687 + Bytes.toStringBinary(r.getEndKey()) + "]");
2688 }
2689 errors.print("----");
2690 }
2691 }
2692
2693 public void dumpSidelinedRegions(Map<Path, HbckInfo> regions) {
2694 for (Map.Entry<Path, HbckInfo> entry: regions.entrySet()) {
2695 String tableName = Bytes.toStringBinary(entry.getValue().getTableName());
2696 Path path = entry.getKey();
2697 errors.print("This sidelined region dir should be bulk loaded: "
2698 + path.toString());
2699 errors.print("Bulk load command looks like: "
2700 + "hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles "
2701 + path.toUri().getPath() + " "+ tableName);
2702 }
2703 }
2704
2705 public Multimap<byte[], HbckInfo> getOverlapGroups(
2706 String table) {
2707 TableInfo ti = tablesInfo.get(table);
2708 return ti.overlapGroups;
2709 }
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
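/**
 * Returns the descriptors of user tables whose first region has not seen a metadata
 * update within the configured timelag; tables modified more recently are counted
 * in numSkipped instead.
 */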
2720 HTableDescriptor[] getTables(AtomicInteger numSkipped) {
2721 List<String> tableNames = new ArrayList<String>();
2722 long now = System.currentTimeMillis();
2723
2724 for (HbckInfo hbi : regionInfoMap.values()) {
2725 MetaEntry info = hbi.metaEntry;
2726
2727
2728
2729 if (info != null && info.getStartKey().length == 0 && !info.isMetaRegion()) {
2730 if (info.modTime + timelag < now) {
2731 tableNames.add(info.getTableNameAsString());
2732 } else {
2733 numSkipped.incrementAndGet();
2734 }
2735 }
2736 }
2737 return getHTableDescriptors(tableNames);
2738 }
2739
2740 HTableDescriptor[] getHTableDescriptors(List<String> tableNames) {
2741 HTableDescriptor[] htd = new HTableDescriptor[0];
2742 try {
2743 LOG.info("getHTableDescriptors == tableNames => " + tableNames);
2744 htd = new HBaseAdmin(getConf()).getTableDescriptors(tableNames);
2745 } catch (IOException e) {
2746 LOG.debug("Exception getting table descriptors", e);
2747 }
2748 return htd;
2749 }
2750
2751
2752
2753
2754
2755
2756
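/**
 * Gets the entry in regionInfoMap for the given encoded region name, creating an
 * empty HbckInfo for it if none exists yet.
 */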
2757 private synchronized HbckInfo getOrCreateInfo(String name) {
2758 HbckInfo hbi = regionInfoMap.get(name);
2759 if (hbi == null) {
2760 hbi = new HbckInfo(null);
2761 regionInfoMap.put(name, hbi);
2762 }
2763 return hbi;
2764 }
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
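/**
 * Checks that exactly one .META. region is known. If .META. is unassigned or
 * multiply assigned and assignment fixing is enabled, attempts to repair the
 * assignment and flags the run for a re-check.
 *
 * @return true when exactly one .META. region was found
 */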
2775 boolean checkMetaRegion()
2776 throws IOException, KeeperException, InterruptedException {
2777 List <HbckInfo> metaRegions = Lists.newArrayList();
2778 for (HbckInfo value : regionInfoMap.values()) {
2779 if (value.metaEntry != null && value.metaEntry.isMetaRegion()) {
2780 metaRegions.add(value);
2781 }
2782 }
2783
2784
2785 if (metaRegions.size() != 1) {
2786 HRegionLocation rootLocation = connection.locateRegion(
2787 HConstants.ROOT_TABLE_NAME, HConstants.EMPTY_START_ROW);
2788 HbckInfo root =
2789 regionInfoMap.get(rootLocation.getRegionInfo().getEncodedName());
2790
2791
2792 if (metaRegions.size() == 0) {
2793 errors.reportError(ERROR_CODE.NO_META_REGION, ".META. is not found on any region.");
2794 if (shouldFixAssignments()) {
2795 errors.print("Trying to fix a problem with .META...");
2796 setShouldRerun();
2797
2798 HBaseFsckRepair.fixUnassigned(admin, root.metaEntry);
2799 HBaseFsckRepair.waitUntilAssigned(admin, root.getHdfsHRI());
2800 }
2801 }
2802
2803 else if (metaRegions.size() > 1) {
2804 errors.reportError(ERROR_CODE.MULTI_META_REGION, ".META. is found on more than one region.");
2805 if (shouldFixAssignments()) {
2806 errors.print("Trying to fix a problem with .META...");
2807 setShouldRerun();
2808
2809 List <ServerName> deployedOn = Lists.newArrayList();
2810 for (HbckInfo mRegion : metaRegions) {
2811 deployedOn.add(mRegion.metaEntry.regionServer);
2812 }
2813 HBaseFsckRepair.fixMultiAssignment(admin, root.metaEntry, deployedOn);
2814 }
2815 }
2816
2817 return false;
2818 }
2819
2820 return true;
2821 }
2822
2823
2824
2825
2826
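/**
 * Scans -ROOT- for the .META. entries and, unless -metaonly was given, scans .META.
 * itself, populating regionInfoMap with an entry for every catalog row.
 *
 * @return false if the root region location could not be determined
 */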
2827 boolean loadMetaEntries() throws IOException {
2828
2829
2830
2831 if (!recordRootRegion()) {
2832
2833 errors.reportError("Fatal error: unable to get root region location. Exiting...");
2834 return false;
2835 }
2836
2837 MetaScannerVisitor visitor = new MetaScannerVisitorBase() {
2838 int countRecord = 1;
2839
2840
2841 final Comparator<KeyValue> comp = new Comparator<KeyValue>() {
2842 public int compare(KeyValue k1, KeyValue k2) {
2843 return Long.signum(k1.getTimestamp() - k2.getTimestamp());
2844 }
2845 };
2846
2847 public boolean processRow(Result result) throws IOException {
2848 try {
2849
2850
2851 long ts = Collections.max(result.list(), comp).getTimestamp();
2852 Pair<HRegionInfo, ServerName> pair = MetaReader.parseCatalogResult(result);
2853 if (pair == null || pair.getFirst() == null) {
2854 emptyRegionInfoQualifiers.add(result);
2855 return true;
2856 }
2857 ServerName sn = null;
2858 if (pair.getSecond() != null) {
2859 sn = pair.getSecond();
2860 }
2861 HRegionInfo hri = pair.getFirst();
2862 if (!(isTableIncluded(hri.getTableNameAsString())
2863 || hri.isMetaRegion() || hri.isRootRegion())) {
2864 return true;
2865 }
2866 PairOfSameType<HRegionInfo> daughters = MetaReader.getDaughterRegions(result);
2867 MetaEntry m = new MetaEntry(hri, sn, ts, daughters.getFirst(), daughters.getSecond());
2868 HbckInfo hbInfo = new HbckInfo(m);
2869 HbckInfo previous = regionInfoMap.put(hri.getEncodedName(), hbInfo);
2870 if (previous != null) {
2871 throw new IOException("Two entries in META are the same: " + previous);
2872 }
2873
2874
2875 if (countRecord % 100 == 0) {
2876 errors.progress();
2877 }
2878 countRecord++;
2879 return true;
2880 } catch (RuntimeException e) {
2881 LOG.error("Result=" + result);
2882 throw e;
2883 }
2884 }
2885 };
2886
2887
2888 MetaScanner.metaScan(getConf(), null, visitor, null, null,
2889 Integer.MAX_VALUE, HConstants.ROOT_TABLE_NAME);
2890
2891 if (!checkMetaOnly) {
2892
2893 MetaScanner.metaScan(getConf(), visitor);
2894 }
2895
2896 errors.print("");
2897 return true;
2898 }
2899
2900
2901
2902
2903 static class MetaEntry extends HRegionInfo {
2904 ServerName regionServer;
2905 long modTime;
2906 HRegionInfo splitA, splitB;
2907
2908 public MetaEntry(HRegionInfo rinfo, ServerName regionServer, long modTime) {
2909 this(rinfo, regionServer, modTime, null, null);
2910 }
2911
2912 public MetaEntry(HRegionInfo rinfo, ServerName regionServer, long modTime,
2913 HRegionInfo splitA, HRegionInfo splitB) {
2914 super(rinfo);
2915 this.regionServer = regionServer;
2916 this.modTime = modTime;
2917 this.splitA = splitA;
2918 this.splitB = splitB;
2919 }
2920
2921 public boolean equals(Object o) {
2922 boolean superEq = super.equals(o);
2923 if (!superEq) {
2924 return superEq;
2925 }
2926
2927 MetaEntry me = (MetaEntry) o;
2928 if (!regionServer.equals(me.regionServer)) {
2929 return false;
2930 }
2931 return (modTime == me.modTime);
2932 }
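
@Override
public int hashCode() {
// Keep the hashCode()/equals() contract: hash the same fields equals() compares.
int hash = super.hashCode();
hash = 31 * hash + (regionServer != null ? regionServer.hashCode() : 0);
hash = 31 * hash + (int) (modTime ^ (modTime >>> 32));
return hash;
}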
2933 }
2934
2935
2936
2937
2938 static class HdfsEntry {
2939 HRegionInfo hri;
2940 Path hdfsRegionDir = null;
2941 long hdfsRegionDirModTime = 0;
2942 boolean hdfsRegioninfoFilePresent = false;
2943 boolean hdfsOnlyEdits = false;
2944 }
2945
2946
2947
2948
2949 static class OnlineEntry {
2950 HRegionInfo hri;
2951 ServerName hsa;
2952
2953 public String toString() {
2954 return hsa.toString() + ";" + hri.getRegionNameAsString();
2955 }
2956 }
2957
2958
2959
2960
2961
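/**
 * Aggregates everything hbck knows about one region: its entry in .META., what was
 * found for it in HDFS, and the region servers it is deployed on.
 */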
2962 public static class HbckInfo implements KeyRange {
2963 private MetaEntry metaEntry = null;
2964 private HdfsEntry hdfsEntry = null;
2965 private List<OnlineEntry> deployedEntries = Lists.newArrayList();
2966 private List<ServerName> deployedOn = Lists.newArrayList();
2967 private boolean skipChecks = false;
2968
2969 HbckInfo(MetaEntry metaEntry) {
2970 this.metaEntry = metaEntry;
2971 }
2972
2973 public synchronized void addServer(HRegionInfo hri, ServerName server) {
2974 OnlineEntry rse = new OnlineEntry();
2975 rse.hri = hri;
2976 rse.hsa = server;
2977 this.deployedEntries.add(rse);
2978 this.deployedOn.add(server);
2979 }
2980
2981 public synchronized String toString() {
2982 StringBuilder sb = new StringBuilder();
2983 sb.append("{ meta => ");
2984 sb.append((metaEntry != null)? metaEntry.getRegionNameAsString() : "null");
2985 sb.append( ", hdfs => " + getHdfsRegionDir());
2986 sb.append( ", deployed => " + Joiner.on(", ").join(deployedEntries));
2987 sb.append(" }");
2988 return sb.toString();
2989 }
2990
2991 @Override
2992 public byte[] getStartKey() {
2993 if (this.metaEntry != null) {
2994 return this.metaEntry.getStartKey();
2995 } else if (this.hdfsEntry != null) {
2996 return this.hdfsEntry.hri.getStartKey();
2997 } else {
2998 LOG.error("Entry " + this + " has no meta or hdfs region start key.");
2999 return null;
3000 }
3001 }
3002
3003 @Override
3004 public byte[] getEndKey() {
3005 if (this.metaEntry != null) {
3006 return this.metaEntry.getEndKey();
3007 } else if (this.hdfsEntry != null) {
3008 return this.hdfsEntry.hri.getEndKey();
3009 } else {
3010 LOG.error("Entry " + this + " has no meta or hdfs region end key.");
3011 return null;
3012 }
3013 }
3014
3015 public byte[] getTableName() {
3016 if (this.metaEntry != null) {
3017 return this.metaEntry.getTableName();
3018 } else if (this.hdfsEntry != null) {
3019
3020
3021 Path tableDir = this.hdfsEntry.hdfsRegionDir.getParent();
3022 return Bytes.toBytes(tableDir.getName());
3023 } else {
3024
3025
3026 return null;
3027 }
3028 }
3029
3030 public String getRegionNameAsString() {
3031 if (metaEntry != null) {
3032 return metaEntry.getRegionNameAsString();
3033 } else if (hdfsEntry != null) {
3034 if (hdfsEntry.hri != null) {
3035 return hdfsEntry.hri.getRegionNameAsString();
3036 }
3037 }
3038 return null;
3039 }
3040
3041 public byte[] getRegionName() {
3042 if (metaEntry != null) {
3043 return metaEntry.getRegionName();
3044 } else if (hdfsEntry != null) {
3045 return hdfsEntry.hri.getRegionName();
3046 } else {
3047 return null;
3048 }
3049 }
3050
3051 Path getHdfsRegionDir() {
3052 if (hdfsEntry == null) {
3053 return null;
3054 }
3055 return hdfsEntry.hdfsRegionDir;
3056 }
3057
3058 boolean containsOnlyHdfsEdits() {
3059 if (hdfsEntry == null) {
3060 return false;
3061 }
3062 return hdfsEntry.hdfsOnlyEdits;
3063 }
3064
3065 boolean isHdfsRegioninfoPresent() {
3066 if (hdfsEntry == null) {
3067 return false;
3068 }
3069 return hdfsEntry.hdfsRegioninfoFilePresent;
3070 }
3071
3072 long getModTime() {
3073 if (hdfsEntry == null) {
3074 return 0;
3075 }
3076 return hdfsEntry.hdfsRegionDirModTime;
3077 }
3078
3079 HRegionInfo getHdfsHRI() {
3080 if (hdfsEntry == null) {
3081 return null;
3082 }
3083 return hdfsEntry.hri;
3084 }
3085
3086 public void setSkipChecks(boolean skipChecks) {
3087 this.skipChecks = skipChecks;
3088 }
3089
3090 public boolean isSkipChecks() {
3091 return skipChecks;
3092 }
3093 }
3094
3095 final static Comparator<HbckInfo> cmp = new Comparator<HbckInfo>() {
3096 @Override
3097 public int compare(HbckInfo l, HbckInfo r) {
3098 if (l == r) {
3099
3100 return 0;
3101 }
3102
3103 int tableCompare = RegionSplitCalculator.BYTES_COMPARATOR.compare(
3104 l.getTableName(), r.getTableName());
3105 if (tableCompare != 0) {
3106 return tableCompare;
3107 }
3108
3109 int startComparison = RegionSplitCalculator.BYTES_COMPARATOR.compare(
3110 l.getStartKey(), r.getStartKey());
3111 if (startComparison != 0) {
3112 return startComparison;
3113 }
3114
3115
3116 byte[] endKey = r.getEndKey();
3117 endKey = (endKey.length == 0) ? null : endKey;
3118 byte[] endKey2 = l.getEndKey();
3119 endKey2 = (endKey2.length == 0) ? null : endKey2;
3120 int endComparison = RegionSplitCalculator.BYTES_COMPARATOR.compare(
3121 endKey2, endKey);
3122
3123 if (endComparison != 0) {
3124 return endComparison;
3125 }
3126
3127
3128
3129 if (l.hdfsEntry == null && r.hdfsEntry == null) {
3130 return 0;
3131 }
3132 if (l.hdfsEntry == null && r.hdfsEntry != null) {
3133 return 1;
3134 }
3135
3136 if (r.hdfsEntry == null) {
3137 return -1;
3138 }
3139
3140 return Long.signum(l.hdfsEntry.hri.getRegionId() - r.hdfsEntry.hri.getRegionId());
3141 }
3142 };
3143
3144
3145
3146
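/**
 * Prints a per-table summary: whether the table is consistent, how many regions it
 * has, and the servers it is deployed on.
 */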
3147 private void printTableSummary(SortedMap<String, TableInfo> tablesInfo) {
3148 StringBuilder sb = new StringBuilder();
3149 errors.print("Summary:");
3150 for (TableInfo tInfo : tablesInfo.values()) {
3151 if (errors.tableHasErrors(tInfo)) {
3152 errors.print("Table " + tInfo.getName() + " is inconsistent.");
3153 } else {
3154 errors.print(" " + tInfo.getName() + " is okay.");
3155 }
3156 errors.print(" Number of regions: " + tInfo.getNumRegions());
3157 sb.setLength(0);
3158 sb.append(" Deployed on: ");
3159 for (ServerName server : tInfo.deployedOn) {
3160 sb.append(" " + server.toString());
3161 }
3162 errors.print(sb.toString());
3163 }
3164 }
3165
3166 static ErrorReporter getErrorReporter(
3167 final Configuration conf) throws ClassNotFoundException {
3168 Class<? extends ErrorReporter> reporter = conf.getClass("hbasefsck.errorreporter", PrintingErrorReporter.class, ErrorReporter.class);
3169 return (ErrorReporter)ReflectionUtils.newInstance(reporter, conf);
3170 }
3171
3172 public interface ErrorReporter {
3173 public static enum ERROR_CODE {
3174 UNKNOWN, NO_META_REGION, NULL_ROOT_REGION, NO_VERSION_FILE, NOT_IN_META_HDFS, NOT_IN_META,
3175 NOT_IN_META_OR_DEPLOYED, NOT_IN_HDFS_OR_DEPLOYED, NOT_IN_HDFS, SERVER_DOES_NOT_MATCH_META, NOT_DEPLOYED,
3176 MULTI_DEPLOYED, SHOULD_NOT_BE_DEPLOYED, MULTI_META_REGION, RS_CONNECT_FAILURE,
3177 FIRST_REGION_STARTKEY_NOT_EMPTY, LAST_REGION_ENDKEY_NOT_EMPTY, DUPE_STARTKEYS,
3178 HOLE_IN_REGION_CHAIN, OVERLAP_IN_REGION_CHAIN, REGION_CYCLE, DEGENERATE_REGION,
3179 ORPHAN_HDFS_REGION, LINGERING_SPLIT_PARENT, NO_TABLEINFO_FILE, LINGERING_REFERENCE_HFILE,
3180 WRONG_USAGE, BOUNDARIES_ERROR
3181 }
3182 public void clear();
3183 public void report(String message);
3184 public void reportError(String message);
3185 public void reportError(ERROR_CODE errorCode, String message);
3186 public void reportError(ERROR_CODE errorCode, String message, TableInfo table);
3187 public void reportError(ERROR_CODE errorCode, String message, TableInfo table, HbckInfo info);
3188 public void reportError(ERROR_CODE errorCode, String message, TableInfo table, HbckInfo info1, HbckInfo info2);
3189 public int summarize();
3190 public void detail(String details);
3191 public ArrayList<ERROR_CODE> getErrorList();
3192 public void progress();
3193 public void print(String message);
3194 public void resetErrors();
3195 public boolean tableHasErrors(TableInfo table);
3196 }
3197
3198 static class PrintingErrorReporter implements ErrorReporter {
3199 public int errorCount = 0;
3200 private int showProgress;
3201
3202 Set<TableInfo> errorTables = new HashSet<TableInfo>();
3203
3204
3205 private ArrayList<ERROR_CODE> errorList = new ArrayList<ERROR_CODE>();
3206
3207 public void clear() {
3208 errorTables.clear();
3209 errorList.clear();
3210 errorCount = 0;
3211 }
3212
3213 public synchronized void reportError(ERROR_CODE errorCode, String message) {
3214 if (errorCode == ERROR_CODE.WRONG_USAGE) {
3215 System.err.println(message);
3216 return;
3217 }
3218
3219 errorList.add(errorCode);
3220 if (!summary) {
3221 System.out.println("ERROR: " + message);
3222 }
3223 errorCount++;
3224 showProgress = 0;
3225 }
3226
3227 public synchronized void reportError(ERROR_CODE errorCode, String message, TableInfo table) {
3228 errorTables.add(table);
3229 reportError(errorCode, message);
3230 }
3231
3232 public synchronized void reportError(ERROR_CODE errorCode, String message, TableInfo table,
3233 HbckInfo info) {
3234 errorTables.add(table);
3235 String reference = "(region " + info.getRegionNameAsString() + ")";
3236 reportError(errorCode, reference + " " + message);
3237 }
3238
3239 public synchronized void reportError(ERROR_CODE errorCode, String message, TableInfo table,
3240 HbckInfo info1, HbckInfo info2) {
3241 errorTables.add(table);
3242 String reference = "(regions " + info1.getRegionNameAsString()
3243 + " and " + info2.getRegionNameAsString() + ")";
3244 reportError(errorCode, reference + " " + message);
3245 }
3246
3247 public synchronized void reportError(String message) {
3248 reportError(ERROR_CODE.UNKNOWN, message);
3249 }
3250
3251
3252
3253
3254
3255
3256 public synchronized void report(String message) {
3257 if (! summary) {
3258 System.out.println("ERROR: " + message);
3259 }
3260 showProgress = 0;
3261 }
3262
3263 public synchronized int summarize() {
3264 System.out.println(Integer.toString(errorCount) +
3265 " inconsistencies detected.");
3266 if (errorCount == 0) {
3267 System.out.println("Status: OK");
3268 return 0;
3269 } else {
3270 System.out.println("Status: INCONSISTENT");
3271 return -1;
3272 }
3273 }
3274
3275 public ArrayList<ERROR_CODE> getErrorList() {
3276 return errorList;
3277 }
3278
3279 public synchronized void print(String message) {
3280 if (!summary) {
3281 System.out.println(message);
3282 }
3283 }
3284
3285 @Override
3286 public boolean tableHasErrors(TableInfo table) {
3287 return errorTables.contains(table);
3288 }
3289
3290 @Override
3291 public void resetErrors() {
3292 errorCount = 0;
3293 }
3294
3295 public synchronized void detail(String message) {
3296 if (details) {
3297 System.out.println(message);
3298 }
3299 showProgress = 0;
3300 }
3301
3302 public synchronized void progress() {
3303 if (showProgress++ == 10) {
3304 if (!summary) {
3305 System.out.print(".");
3306 }
3307 showProgress = 0;
3308 }
3309 }
3310 }
3311
3312
3313
3314
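/**
 * Executor work item that contacts one region server and records every region it is
 * currently serving into the hbck region map.
 */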
3315 static class WorkItemRegion implements Callable<Void> {
3316 private HBaseFsck hbck;
3317 private ServerName rsinfo;
3318 private ErrorReporter errors;
3319 private HConnection connection;
3320
3321 WorkItemRegion(HBaseFsck hbck, ServerName info,
3322 ErrorReporter errors, HConnection connection) {
3323 this.hbck = hbck;
3324 this.rsinfo = info;
3325 this.errors = errors;
3326 this.connection = connection;
3327 }
3328
3329 @Override
3330 public synchronized Void call() throws IOException {
3331 errors.progress();
3332 try {
3333 HRegionInterface server =
3334 connection.getHRegionConnection(rsinfo.getHostname(), rsinfo.getPort());
3335
3336
3337 List<HRegionInfo> regions = server.getOnlineRegions();
3338 regions = filterRegions(regions);
3339 if (details) {
3340 errors.detail("RegionServer: " + rsinfo.getServerName() +
3341 " number of regions: " + regions.size());
3342 for (HRegionInfo rinfo: regions) {
3343 errors.detail(" " + rinfo.getRegionNameAsString() +
3344 " id: " + rinfo.getRegionId() +
3345 " encoded_name: " + rinfo.getEncodedName() +
3346 " start: " + Bytes.toStringBinary(rinfo.getStartKey()) +
3347 " end: " + Bytes.toStringBinary(rinfo.getEndKey()));
3348 }
3349 }
3350
3351
3352 for (HRegionInfo r:regions) {
3353 HbckInfo hbi = hbck.getOrCreateInfo(r.getEncodedName());
3354 hbi.addServer(r, rsinfo);
3355 }
3356 } catch (IOException e) {
3357 errors.reportError(ERROR_CODE.RS_CONNECT_FAILURE, "RegionServer: " + rsinfo.getServerName() +
3358 " Unable to fetch region information. " + e);
3359 throw e;
3360 }
3361 return null;
3362 }
3363
3364 private List<HRegionInfo> filterRegions(List<HRegionInfo> regions) {
3365 List<HRegionInfo> ret = Lists.newArrayList();
3366 for (HRegionInfo hri : regions) {
3367 if (hri.isMetaTable() || (!hbck.checkMetaOnly
3368 && hbck.isTableIncluded(hri.getTableNameAsString()))) {
3369 ret.add(hri);
3370 }
3371 }
3372 return ret;
3373 }
3374 }
3375
3376
3377
3378
3379
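/**
 * Executor work item that scans one table directory in HDFS and records, for each
 * region directory found, its location, modification time, whether a .regioninfo
 * file is present, and whether it only contains recovered edits.
 */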
3380 static class WorkItemHdfsDir implements Callable<Void> {
3381 private HBaseFsck hbck;
3382 private FileStatus tableDir;
3383 private ErrorReporter errors;
3384 private FileSystem fs;
3385
3386 WorkItemHdfsDir(HBaseFsck hbck, FileSystem fs, ErrorReporter errors,
3387 FileStatus status) {
3388 this.hbck = hbck;
3389 this.fs = fs;
3390 this.tableDir = status;
3391 this.errors = errors;
3392 }
3393
3394 @Override
3395 public synchronized Void call() throws IOException {
3396 try {
3397 String tableName = tableDir.getPath().getName();
3398
3399 if (tableName.startsWith(".") &&
3400 !tableName.equals( Bytes.toString(HConstants.META_TABLE_NAME))) {
3401 return null;
3402 }
3403
3404 // Level 2: <HBASE_DIR>/<table>/* -- scan every region directory under this table.
3405 FileStatus[] regionDirs = fs.listStatus(tableDir.getPath());
3406 for (FileStatus regionDir : regionDirs) {
3407 String encodedName = regionDir.getPath().getName();
3408 // Region directories are named by the region's encoded name (a hex string); skip anything else.
3409 if (!encodedName.toLowerCase().matches("[0-9a-f]+")) {
3410 continue;
3411 }
3412
3413 LOG.debug("Loading region info from hdfs: " + regionDir.getPath());
3414 HbckInfo hbi = hbck.getOrCreateInfo(encodedName);
3415 HdfsEntry he = new HdfsEntry();
3416 synchronized (hbi) {
3417 if (hbi.getHdfsRegionDir() != null) {
3418 errors.print("Directory " + encodedName + " duplicate?? " + hbi.getHdfsRegionDir());
3419 }
3420
3421 he.hdfsRegionDir = regionDir.getPath();
3422 he.hdfsRegionDirModTime = regionDir.getModificationTime();
3423 Path regioninfoFile = new Path(he.hdfsRegionDir, HRegion.REGIONINFO_FILE);
3424 he.hdfsRegioninfoFilePresent = fs.exists(regioninfoFile);
3425
3426 // Flag regions that contain nothing but recovered edits (for example, leftovers
3427 // from a split); the integrity checks skip such regions.
3428 he.hdfsOnlyEdits = true;
3429 FileStatus[] subDirs = fs.listStatus(regionDir.getPath());
3430 Path ePath = HLog.getRegionDirRecoveredEditsDir(regionDir.getPath());
3431 for (FileStatus subDir : subDirs) {
3432 String sdName = subDir.getPath().getName();
3433 if (!sdName.startsWith(".") && !sdName.equals(ePath.getName())) {
3434 he.hdfsOnlyEdits = false;
3435 break;
3436 }
3437 }
3438 hbi.hdfsEntry = he;
3439 }
3440 }
3441 } catch (IOException e) {
3442 // Unable to read the table directory; report it and let the executor surface the failure.
3443 errors.reportError(ERROR_CODE.RS_CONNECT_FAILURE, "Table Directory: "
3444 + tableDir.getPath().getName()
3445 + " Unable to fetch region information. " + e);
3446 throw e;
3447 }
3448 return null;
3449 }
3450 }
3451
3452
3453
3454
3455
3456
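/**
 * Executor work item that loads the .regioninfo file for one region from HDFS;
 * regions whose .regioninfo cannot be read are reported and queued as orphaned
 * HDFS directories.
 */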
3457 static class WorkItemHdfsRegionInfo implements Callable<Void> {
3458 private HbckInfo hbi;
3459 private HBaseFsck hbck;
3460 private ErrorReporter errors;
3461
3462 WorkItemHdfsRegionInfo(HbckInfo hbi, HBaseFsck hbck, ErrorReporter errors) {
3463 this.hbi = hbi;
3464 this.hbck = hbck;
3465 this.errors = errors;
3466 }
3467
3468 @Override
3469 public synchronized Void call() throws IOException {
3470
3471 if (hbi.getHdfsHRI() == null) {
3472 try {
3473 hbck.loadHdfsRegioninfo(hbi);
3474 } catch (IOException ioe) {
3475 String msg = "Orphan region in HDFS: Unable to load .regioninfo from table "
3476 + Bytes.toString(hbi.getTableName()) + " in hdfs dir "
3477 + hbi.getHdfsRegionDir()
3478 + "! It may be an invalid format or version file. Treating as "
3479 + "an orphaned regiondir.";
3480 errors.reportError(ERROR_CODE.ORPHAN_HDFS_REGION, msg);
3481 try {
3482 hbck.debugLsr(hbi.getHdfsRegionDir());
3483 } catch (IOException ioe2) {
3484 LOG.error("Unable to read directory " + hbi.getHdfsRegionDir(), ioe2);
3485 throw ioe2;
3486 }
3487 hbck.orphanHdfsDirs.add(hbi);
3488 throw ioe;
3489 }
3490 }
3491 return null;
3492 }
3493 };
3494
3495
3496
3497
3498
3499 public void setDisplayFullReport() {
3500 details = true;
3501 }
3502
3503
3504
3505
3506
3507 void setSummary() {
3508 summary = true;
3509 }
3510
3511
3512
3513
3514
3515 void setCheckMetaOnly() {
3516 checkMetaOnly = true;
3517 }
3518
3519
3520
3521
3522
3523
3524
3525 void setShouldRerun() {
3526 rerun = true;
3527 }
3528
3529 boolean shouldRerun() {
3530 return rerun;
3531 }
3532
3533
3534
3535
3536
3537 public void setFixAssignments(boolean shouldFix) {
3538 fixAssignments = shouldFix;
3539 }
3540
3541 boolean shouldFixAssignments() {
3542 return fixAssignments;
3543 }
3544
3545 public void setFixMeta(boolean shouldFix) {
3546 fixMeta = shouldFix;
3547 }
3548
3549 boolean shouldFixMeta() {
3550 return fixMeta;
3551 }
3552
3553 public void setCheckHdfs(boolean checking) {
3554 checkHdfs = checking;
3555 }
3556
3557 boolean shouldCheckHdfs() {
3558 return checkHdfs;
3559 }
3560
3561 public void setFixHdfsHoles(boolean shouldFix) {
3562 fixHdfsHoles = shouldFix;
3563 }
3564
3565 boolean shouldFixHdfsHoles() {
3566 return fixHdfsHoles;
3567 }
3568
3569 public void setFixTableOrphans(boolean shouldFix) {
3570 fixTableOrphans = shouldFix;
3571 }
3572
3573 boolean shouldFixTableOrphans() {
3574 return fixTableOrphans;
3575 }
3576
3577 public void setFixHdfsOverlaps(boolean shouldFix) {
3578 fixHdfsOverlaps = shouldFix;
3579 }
3580
3581 boolean shouldFixHdfsOverlaps() {
3582 return fixHdfsOverlaps;
3583 }
3584
3585 public void setFixHdfsOrphans(boolean shouldFix) {
3586 fixHdfsOrphans = shouldFix;
3587 }
3588
3589 boolean shouldFixHdfsOrphans() {
3590 return fixHdfsOrphans;
3591 }
3592
3593 public void setFixVersionFile(boolean shouldFix) {
3594 fixVersionFile = shouldFix;
3595 }
3596
3597 public boolean shouldFixVersionFile() {
3598 return fixVersionFile;
3599 }
3600
3601 public void setSidelineBigOverlaps(boolean sbo) {
3602 this.sidelineBigOverlaps = sbo;
3603 }
3604
3605 public boolean shouldSidelineBigOverlaps() {
3606 return sidelineBigOverlaps;
3607 }
3608
3609 public void setFixSplitParents(boolean shouldFix) {
3610 fixSplitParents = shouldFix;
3611 }
3612
3613 boolean shouldFixSplitParents() {
3614 return fixSplitParents;
3615 }
3616
3617 public void setFixReferenceFiles(boolean shouldFix) {
3618 fixReferenceFiles = shouldFix;
3619 }
3620
3621 boolean shouldFixReferenceFiles() {
3622 return fixReferenceFiles;
3623 }
3624
3625 public boolean shouldIgnorePreCheckPermission() {
3626 return ignorePreCheckPermission;
3627 }
3628
3629 public void setIgnorePreCheckPermission(boolean ignorePreCheckPermission) {
3630 this.ignorePreCheckPermission = ignorePreCheckPermission;
3631 }
3632
3633
3634
3635
3636 public void setMaxMerge(int mm) {
3637 this.maxMerge = mm;
3638 }
3639
3640 public int getMaxMerge() {
3641 return maxMerge;
3642 }
3643
3644 public void setMaxOverlapsToSideline(int mo) {
3645 this.maxOverlapsToSideline = mo;
3646 }
3647
3648 public int getMaxOverlapsToSideline() {
3649 return maxOverlapsToSideline;
3650 }
3651
3652
3653
3654
3655
3656 boolean isTableIncluded(String table) {
3657 return (tablesIncluded.size() == 0) || tablesIncluded.contains(table);
3658 }
3659
3660 public void includeTable(String table) {
3661 tablesIncluded.add(table);
3662 }
3663
3664 Set<String> getIncludedTables() {
3665 return new HashSet<String>(tablesIncluded);
3666 }
3667
3668
3669
3670
3671
3672
3673 public void setTimeLag(long seconds) {
3674 timelag = seconds * 1000;
3675 }
3676
3677
3678
3679
3680
3681 public void setSidelineDir(String sidelineDir) {
3682 this.sidelineDir = new Path(sidelineDir);
3683 }
3684
3685 protected HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
3686 return new HFileCorruptionChecker(getConf(), executor, sidelineCorruptHFiles);
3687 }
3688
3689 public HFileCorruptionChecker getHFilecorruptionChecker() {
3690 return hfcc;
3691 }
3692
3693 public void setHFileCorruptionChecker(HFileCorruptionChecker hfcc) {
3694 this.hfcc = hfcc;
3695 }
3696
3697
3698
3699
3700 void setRegionBoundariesCheck() {
3701 checkRegionBoundaries = true;
3702 }
3703
3704 public void setRetCode(int code) {
3705 this.retcode = code;
3706 }
3707
3708 public int getRetCode() {
3709 return retcode;
3710 }
3711
3712 protected HBaseFsck printUsageAndExit() {
3713 StringWriter sw = new StringWriter(2048);
3714 PrintWriter out = new PrintWriter(sw);
3715 out.println("Usage: fsck [opts] {only tables}");
3716 out.println(" where [opts] are:");
3717 out.println(" -help Display help options (this)");
3718 out.println(" -details Display full report of all regions.");
3719 out.println(" -timelag <timeInSeconds> Process only regions that " +
3720 " have not experienced any metadata updates in the last " +
3721 " <timeInSeconds> seconds.");
3722 out.println(" -sleepBeforeRerun <timeInSeconds> Sleep this many seconds" +
3723 " before checking if the fix worked if run with -fix");
3724 out.println(" -summary Print only summary of the tables and status.");
3725 out.println(" -metaonly Only check the state of ROOT and META tables.");
3726 out.println(" -sidelineDir <hdfs://> HDFS path to backup existing meta and root.");
3727
3728 out.println("");
3729 out.println(" Metadata Repair options: (expert features, use with caution!)");
3730 out.println(" -fix Try to fix region assignments. This is for backwards compatibility");
3731 out.println(" -fixAssignments Try to fix region assignments. Replaces the old -fix");
3732 out.println(" -fixMeta Try to fix meta problems. This assumes HDFS region info is good.");
3733 out.println(" -noHdfsChecking Don't load/check region info from HDFS."
3734 + " Assumes META region info is good. Won't check/fix any HDFS issue, e.g. hole, orphan, or overlap");
3735 out.println(" -fixHdfsHoles Try to fix region holes in hdfs.");
3736 out.println(" -fixHdfsOrphans Try to fix region dirs with no .regioninfo file in hdfs");
3737 out.println(" -fixTableOrphans Try to fix table dirs with no .tableinfo file in hdfs (online mode only)");
3738 out.println(" -fixHdfsOverlaps Try to fix region overlaps in hdfs.");
3739 out.println(" -fixVersionFile Try to fix missing hbase.version file in hdfs.");
3740 out.println(" -maxMerge <n> When fixing region overlaps, allow at most <n> regions to merge. (n=" + DEFAULT_MAX_MERGE +" by default)");
3741 out.println(" -sidelineBigOverlaps When fixing region overlaps, allow sidelining big overlaps");
3742 out.println(" -maxOverlapsToSideline <n> When fixing region overlaps, allow at most <n> regions to sideline per group. (n=" + DEFAULT_OVERLAPS_TO_SIDELINE +" by default)");
3743 out.println(" -fixSplitParents Try to force offline split parents to be online.");
3744 out.println(" -ignorePreCheckPermission ignore filesystem permission pre-check");
3745 out.println(" -fixReferenceFiles Try to offline lingering reference store files");
3746 out.println(" -boundaries Verify that region boundaries are the same between META and store files.");
3747
3748 out.println("");
3749 out.println(" Datafile Repair options: (expert features, use with caution!)");
3750 out.println(" -checkCorruptHFiles Check all Hfiles by opening them to make sure they are valid");
3751 out.println(" -sidelineCorruptHFiles Quarantine corrupted HFiles. Implies -checkCorruptHFiles");
3752
3753 out.println("");
3754 out.println(" Metadata Repair shortcuts");
3755 out.println(" -repair Shortcut for -fixAssignments -fixMeta -fixHdfsHoles " +
3756 "-fixHdfsOrphans -fixHdfsOverlaps -fixVersionFile -sidelineBigOverlaps -fixReferenceFiles");
3757 out.println(" -repairHoles Shortcut for -fixAssignments -fixMeta -fixHdfsHoles");
3758
3759 out.flush();
3760 errors.reportError(ERROR_CODE.WRONG_USAGE, sw.toString());
3761
3762 setRetCode(-2);
3763 return this;
3764 }
3765
3766
3767
3768
3769
3770
3771
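/**
 * Main program: points the default filesystem at the configured HBase root
 * directory and runs hbck through ToolRunner.
 *
 * @throws Exception if anything goes wrong
 */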
3772 public static void main(String[] args) throws Exception {
3773
3774 Configuration conf = HBaseConfiguration.create();
3775 Path hbasedir = new Path(conf.get(HConstants.HBASE_DIR));
3776 URI defaultFs = hbasedir.getFileSystem(conf).getUri();
3777 conf.set("fs.defaultFS", defaultFs.toString());
3778 conf.set("fs.default.name", defaultFs.toString());
3779 int ret = ToolRunner.run(new HBaseFsck(conf), args);
3780 System.exit(ret);
3781 }
3782
3783 @Override
3784 public int run(String[] args) throws Exception {
3785
3786 initialPoolNumThreads();
3787
3788 exec(executor, args);
3789 return getRetCode();
3790 }
3791
3792 public HBaseFsck exec(ExecutorService exec, String[] args) throws KeeperException, IOException,
3793 InterruptedException {
3794 long sleepBeforeRerun = DEFAULT_SLEEP_BEFORE_RERUN;
3795
3796 boolean checkCorruptHFiles = false;
3797 boolean sidelineCorruptHFiles = false;
3798
3799
3800 for (int i = 0; i < args.length; i++) {
3801 String cmd = args[i];
3802 if (cmd.equals("-help") || cmd.equals("-h")) {
3803 return printUsageAndExit();
3804 } else if (cmd.equals("-details")) {
3805 setDisplayFullReport();
3806 } else if (cmd.equals("-timelag")) {
3807 if (i == args.length - 1) {
3808 errors.reportError(ERROR_CODE.WRONG_USAGE, "HBaseFsck: -timelag needs a value.");
3809 return printUsageAndExit();
3810 }
3811 try {
3812 long timelag = Long.parseLong(args[i+1]);
3813 setTimeLag(timelag);
3814 } catch (NumberFormatException e) {
3815 errors.reportError(ERROR_CODE.WRONG_USAGE, "-timelag needs a numeric value.");
3816 return printUsageAndExit();
3817 }
3818 i++;
3819 } else if (cmd.equals("-sleepBeforeRerun")) {
3820 if (i == args.length - 1) {
3821 errors.reportError(ERROR_CODE.WRONG_USAGE,
3822 "HBaseFsck: -sleepBeforeRerun needs a value.");
3823 return printUsageAndExit();
3824 }
3825 try {
3826 sleepBeforeRerun = Long.parseLong(args[i+1]);
3827 } catch (NumberFormatException e) {
3828 errors.reportError(ERROR_CODE.WRONG_USAGE, "-sleepBeforeRerun needs a numeric value.");
3829 return printUsageAndExit();
3830 }
3831 i++;
3832 } else if (cmd.equals("-sidelineDir")) {
3833 if (i == args.length - 1) {
3834 errors.reportError(ERROR_CODE.WRONG_USAGE, "HBaseFsck: -sidelineDir needs a value.");
3835 return printUsageAndExit();
3836 }
3837 i++;
3838 setSidelineDir(args[i]);
3839 } else if (cmd.equals("-fix")) {
3840 errors.reportError(ERROR_CODE.WRONG_USAGE,
3841 "This option is deprecated, please use -fixAssignments instead.");
3842 setFixAssignments(true);
3843 } else if (cmd.equals("-fixAssignments")) {
3844 setFixAssignments(true);
3845 } else if (cmd.equals("-fixMeta")) {
3846 setFixMeta(true);
3847 } else if (cmd.equals("-noHdfsChecking")) {
3848 setCheckHdfs(false);
3849 } else if (cmd.equals("-fixHdfsHoles")) {
3850 setFixHdfsHoles(true);
3851 } else if (cmd.equals("-fixHdfsOrphans")) {
3852 setFixHdfsOrphans(true);
3853 } else if (cmd.equals("-fixTableOrphans")) {
3854 setFixTableOrphans(true);
3855 } else if (cmd.equals("-fixHdfsOverlaps")) {
3856 setFixHdfsOverlaps(true);
3857 } else if (cmd.equals("-fixVersionFile")) {
3858 setFixVersionFile(true);
3859 } else if (cmd.equals("-sidelineBigOverlaps")) {
3860 setSidelineBigOverlaps(true);
3861 } else if (cmd.equals("-fixSplitParents")) {
3862 setFixSplitParents(true);
3863 } else if (cmd.equals("-ignorePreCheckPermission")) {
3864 setIgnorePreCheckPermission(true);
3865 } else if (cmd.equals("-checkCorruptHFiles")) {
3866 checkCorruptHFiles = true;
3867 } else if (cmd.equals("-sidelineCorruptHFiles")) {
3868 sidelineCorruptHFiles = true;
3869 } else if (cmd.equals("-fixReferenceFiles")) {
3870 setFixReferenceFiles(true);
3871 } else if (cmd.equals("-repair")) {
3872
3873
3874 setFixHdfsHoles(true);
3875 setFixHdfsOrphans(true);
3876 setFixMeta(true);
3877 setFixAssignments(true);
3878 setFixHdfsOverlaps(true);
3879 setFixVersionFile(true);
3880 setSidelineBigOverlaps(true);
3881 setFixSplitParents(false);
3882 setCheckHdfs(true);
3883 setFixReferenceFiles(true);
3884 } else if (cmd.equals("-repairHoles")) {
3885
3886 setFixHdfsHoles(true);
3887 setFixHdfsOrphans(false);
3888 setFixMeta(true);
3889 setFixAssignments(true);
3890 setFixHdfsOverlaps(false);
3891 setSidelineBigOverlaps(false);
3892 setFixSplitParents(false);
3893 setCheckHdfs(true);
3894 } else if (cmd.equals("-maxOverlapsToSideline")) {
3895 if (i == args.length - 1) {
3896 errors.reportError(ERROR_CODE.WRONG_USAGE,
3897 "-maxOverlapsToSideline needs a numeric value argument.");
3898 return printUsageAndExit();
3899 }
3900 try {
3901 int maxOverlapsToSideline = Integer.parseInt(args[i+1]);
3902 setMaxOverlapsToSideline(maxOverlapsToSideline);
3903 } catch (NumberFormatException e) {
3904 errors.reportError(ERROR_CODE.WRONG_USAGE,
3905 "-maxOverlapsToSideline needs a numeric value argument.");
3906 return printUsageAndExit();
3907 }
3908 i++;
3909 } else if (cmd.equals("-maxMerge")) {
3910 if (i == args.length - 1) {
3911 errors.reportError(ERROR_CODE.WRONG_USAGE,
3912 "-maxMerge needs a numeric value argument.");
3913 return printUsageAndExit();
3914 }
3915 try {
3916 int maxMerge = Integer.parseInt(args[i+1]);
3917 setMaxMerge(maxMerge);
3918 } catch (NumberFormatException e) {
3919 errors.reportError(ERROR_CODE.WRONG_USAGE,
3920 "-maxMerge needs a numeric value argument.");
3921 return printUsageAndExit();
3922 }
3923 i++;
3924 } else if (cmd.equals("-summary")) {
3925 setSummary();
3926 } else if (cmd.equals("-metaonly")) {
3927 setCheckMetaOnly();
3928 } else if (cmd.equals("-boundaries")) {
3929 setRegionBoundariesCheck();
3930 } else if (cmd.startsWith("-")) {
3931 errors.reportError(ERROR_CODE.WRONG_USAGE, "Unrecognized option:" + cmd);
3932 return printUsageAndExit();
3933 } else {
3934 includeTable(cmd);
3935 errors.print("Allow checking/fixes for table: " + cmd);
3936 }
3937 }
3938
3939
3940 try {
3941 preCheckPermission();
3942 } catch (AccessControlException ace) {
3943 Runtime.getRuntime().exit(-1);
3944 } catch (IOException ioe) {
3945 Runtime.getRuntime().exit(-1);
3946 }
3947
3948
3949 connect();
3950
3951 try {
3952
3953 if (checkCorruptHFiles || sidelineCorruptHFiles) {
3954 LOG.info("Checking all hfiles for corruption");
3955 HFileCorruptionChecker hfcc = createHFileCorruptionChecker(sidelineCorruptHFiles);
3956 setHFileCorruptionChecker(hfcc);
3957 Collection<String> tables = getIncludedTables();
3958 Collection<Path> tableDirs = new ArrayList<Path>();
3959 Path rootdir = FSUtils.getRootDir(getConf());
3960 if (tables.size() > 0) {
3961 for (String t : tables) {
3962 tableDirs.add(FSUtils.getTablePath(rootdir, t));
3963 }
3964 } else {
3965 tableDirs = FSUtils.getTableDirs(FSUtils.getCurrentFileSystem(getConf()), rootdir);
3966 }
3967 hfcc.checkTables(tableDirs);
3968 hfcc.report(errors);
3969 }
3970
3971
3972 int code = onlineHbck();
3973 setRetCode(code);
3974
3975
3976
3977
3978 if (shouldRerun()) {
3979 try {
3980 LOG.info("Sleeping " + sleepBeforeRerun + "ms before re-checking after fix...");
3981 Thread.sleep(sleepBeforeRerun);
3982 } catch (InterruptedException ie) {
3983 return this;
3984 }
3985
3986 setFixAssignments(false);
3987 setFixMeta(false);
3988 setFixHdfsHoles(false);
3989 setFixHdfsOverlaps(false);
3990 setFixVersionFile(false);
3991 setFixTableOrphans(false);
3992 errors.resetErrors();
3993 code = onlineHbck();
3994 setRetCode(code);
3995 }
3996 } finally {
3997 IOUtils.cleanup(null, connection, meta, admin);
3998 }
3999 return this;
4000 }
4001
4002
4003
4004
4005 void debugLsr(Path p) throws IOException {
4006 debugLsr(getConf(), p, errors);
4007 }
4008
4009
4010
4011
4012 public static void debugLsr(Configuration conf,
4013 Path p) throws IOException {
4014 debugLsr(conf, p, new PrintingErrorReporter());
4015 }
4016
4017
4018
4019
4020 public static void debugLsr(Configuration conf,
4021 Path p, ErrorReporter errors) throws IOException {
4022 if (!LOG.isDebugEnabled() || p == null) {
4023 return;
4024 }
4025 FileSystem fs = p.getFileSystem(conf);
4026
4027 if (!fs.exists(p)) {
4028
4029 return;
4030 }
4031 errors.print(p.toString());
4032
4033 if (fs.isFile(p)) {
4034 return;
4035 }
4036
4037 if (fs.getFileStatus(p).isDir()) {
4038 FileStatus[] fss= fs.listStatus(p);
4039 for (FileStatus status : fss) {
4040 debugLsr(conf, status.getPath(), errors);
4041 }
4042 }
4043 }
4044 }