1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.hadoop.hbase.util;
19
20 import java.io.FileNotFoundException;
21 import java.io.IOException;
22 import java.io.PrintWriter;
23 import java.io.StringWriter;
24 import java.net.InetAddress;
25 import java.net.URI;
26 import java.util.ArrayList;
27 import java.util.Arrays;
28 import java.util.Collection;
29 import java.util.Collections;
30 import java.util.Comparator;
31 import java.util.HashMap;
32 import java.util.HashSet;
33 import java.util.Iterator;
34 import java.util.List;
35 import java.util.Map;
36 import java.util.Map.Entry;
37 import java.util.Set;
38 import java.util.SortedMap;
39 import java.util.SortedSet;
40 import java.util.TreeMap;
41 import java.util.TreeSet;
42 import java.util.concurrent.Callable;
43 import java.util.concurrent.ConcurrentSkipListMap;
44 import java.util.concurrent.ExecutionException;
45 import java.util.concurrent.ExecutorService;
46 import java.util.concurrent.Future;
47 import java.util.concurrent.ScheduledThreadPoolExecutor;
48 import java.util.concurrent.atomic.AtomicInteger;
49 import java.util.concurrent.atomic.AtomicBoolean;
50
51 import org.apache.commons.lang.StringUtils;
52 import org.apache.commons.logging.Log;
53 import org.apache.commons.logging.LogFactory;
54 import org.apache.hadoop.hbase.classification.InterfaceAudience;
55 import org.apache.hadoop.hbase.classification.InterfaceStability;
56 import org.apache.hadoop.conf.Configuration;
57 import org.apache.hadoop.conf.Configured;
58 import org.apache.hadoop.fs.FSDataOutputStream;
59 import org.apache.hadoop.fs.FileStatus;
60 import org.apache.hadoop.fs.FileSystem;
61 import org.apache.hadoop.fs.Path;
62 import org.apache.hadoop.fs.permission.FsAction;
63 import org.apache.hadoop.fs.permission.FsPermission;
64 import org.apache.hadoop.hbase.Abortable;
65 import org.apache.hadoop.hbase.Cell;
66 import org.apache.hadoop.hbase.ClusterStatus;
67 import org.apache.hadoop.hbase.HBaseConfiguration;
68 import org.apache.hadoop.hbase.HColumnDescriptor;
69 import org.apache.hadoop.hbase.HConstants;
70 import org.apache.hadoop.hbase.HRegionInfo;
71 import org.apache.hadoop.hbase.HRegionLocation;
72 import org.apache.hadoop.hbase.HTableDescriptor;
73 import org.apache.hadoop.hbase.KeyValue;
74 import org.apache.hadoop.hbase.MasterNotRunningException;
75 import org.apache.hadoop.hbase.ServerName;
76 import org.apache.hadoop.hbase.TableName;
77 import org.apache.hadoop.hbase.ZooKeeperConnectionException;
78 import org.apache.hadoop.hbase.catalog.MetaEditor;
79 import org.apache.hadoop.hbase.client.Delete;
80 import org.apache.hadoop.hbase.client.Get;
81 import org.apache.hadoop.hbase.client.HBaseAdmin;
82 import org.apache.hadoop.hbase.client.HConnectable;
83 import org.apache.hadoop.hbase.client.HConnection;
84 import org.apache.hadoop.hbase.client.HConnectionManager;
85 import org.apache.hadoop.hbase.client.HTable;
86 import org.apache.hadoop.hbase.client.MetaScanner;
87 import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitor;
88 import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitorBase;
89 import org.apache.hadoop.hbase.client.Put;
90 import org.apache.hadoop.hbase.client.Result;
91 import org.apache.hadoop.hbase.client.RowMutations;
92 import org.apache.hadoop.hbase.io.hfile.CacheConfig;
93 import org.apache.hadoop.hbase.io.hfile.HFile;
94 import org.apache.hadoop.hbase.master.MasterFileSystem;
95 import org.apache.hadoop.hbase.master.RegionState;
96 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
97 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService.BlockingInterface;
98 import org.apache.hadoop.hbase.regionserver.HRegion;
99 import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
100 import org.apache.hadoop.hbase.regionserver.StoreFileInfo;
101 import org.apache.hadoop.hbase.regionserver.wal.HLogUtil;
102 import org.apache.hadoop.hbase.security.UserProvider;
103 import org.apache.hadoop.hbase.util.Bytes.ByteArrayComparator;
104 import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
105 import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
106 import org.apache.hadoop.hbase.util.hbck.TableIntegrityErrorHandler;
107 import org.apache.hadoop.hbase.util.hbck.TableIntegrityErrorHandlerImpl;
108 import org.apache.hadoop.hbase.util.hbck.TableLockChecker;
109 import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
110 import org.apache.hadoop.hbase.zookeeper.ZKTableReadOnly;
111 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
112 import org.apache.hadoop.hbase.security.AccessDeniedException;
113 import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
114 import org.apache.hadoop.io.IOUtils;
115 import org.apache.hadoop.ipc.RemoteException;
116 import org.apache.hadoop.security.UserGroupInformation;
117 import org.apache.hadoop.util.ReflectionUtils;
118 import org.apache.hadoop.util.Tool;
119 import org.apache.hadoop.util.ToolRunner;
120 import org.apache.zookeeper.KeeperException;
121
122 import com.google.common.base.Joiner;
123 import com.google.common.base.Preconditions;
124 import com.google.common.collect.Lists;
125 import com.google.common.collect.Multimap;
126 import com.google.common.collect.TreeMultimap;
127 import com.google.protobuf.ServiceException;
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
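/**
 * HBaseFsck (hbck) is a tool that checks, and optionally repairs, inconsistencies
 * between the three places region state is kept: region deployment on the region
 * servers, region entries in the hbase:meta table, and region directories in HDFS.
 * The fix* flags below control which classes of problems the tool may repair; with
 * no fix options enabled it only reports inconsistencies.
 */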
174 @InterfaceAudience.Public
175 @InterfaceStability.Evolving
176 public class HBaseFsck extends Configured {
177 public static final long DEFAULT_TIME_LAG = 60000;
178 public static final long DEFAULT_SLEEP_BEFORE_RERUN = 10000;
179 private static final int MAX_NUM_THREADS = 50;
180 private static boolean rsSupportsOffline = true;
181 private static final int DEFAULT_OVERLAPS_TO_SIDELINE = 2;
182 private static final int DEFAULT_MAX_MERGE = 5;
183 private static final String TO_BE_LOADED = "to_be_loaded";
184 private static final String HBCK_LOCK_FILE = "hbase-hbck.lock";
185
186
187
188
189
190 private static final Log LOG = LogFactory.getLog(HBaseFsck.class.getName());
191 private ClusterStatus status;
192 private HConnection connection;
193 private HBaseAdmin admin;
194 private HTable meta;
195
196 protected ExecutorService executor;
197 private long startMillis = System.currentTimeMillis();
198 private HFileCorruptionChecker hfcc;
199 private int retcode = 0;
200 private Path HBCK_LOCK_PATH;
201 private FSDataOutputStream hbckOutFd;
202
203
204
205 private final AtomicBoolean hbckLockCleanup = new AtomicBoolean(false);
206
207
208
209
210 private static boolean details = false;
211 private long timelag = DEFAULT_TIME_LAG;
212 private boolean fixAssignments = false;
213 private boolean fixMeta = false;
214 private boolean checkHdfs = true;
215 private boolean fixHdfsHoles = false;
216 private boolean fixHdfsOverlaps = false;
217 private boolean fixHdfsOrphans = false;
218 private boolean fixTableOrphans = false;
219 private boolean fixVersionFile = false;
220 private boolean fixSplitParents = false;
221 private boolean fixReferenceFiles = false;
222 private boolean fixEmptyMetaCells = false;
223 private boolean fixTableLocks = false;
224 private boolean fixAny = false;
225
226
227
228 private Set<TableName> tablesIncluded = new HashSet<TableName>();
229 private int maxMerge = DEFAULT_MAX_MERGE;
230 private int maxOverlapsToSideline = DEFAULT_OVERLAPS_TO_SIDELINE;
231 private boolean sidelineBigOverlaps = false;
232 private Path sidelineDir = null;
233
234 private boolean rerun = false;
235 private static boolean summary = false;
236 private boolean checkMetaOnly = false;
237 private boolean checkRegionBoundaries = false;
238 private boolean ignorePreCheckPermission = false;
239
240
241
242
243 final private ErrorReporter errors;
244 int fixes = 0;
245
246
247
248
249
250
251 private TreeMap<String, HbckInfo> regionInfoMap = new TreeMap<String, HbckInfo>();
252 private TreeSet<TableName> disabledTables =
253 new TreeSet<TableName>();
254
255 private Set<Result> emptyRegionInfoQualifiers = new HashSet<Result>();
256
257
258
259
260
261
262
263
264
265
266
267 private SortedMap<TableName, TableInfo> tablesInfo =
268 new ConcurrentSkipListMap<TableName, TableInfo>();
269
270
271
272
273 private List<HbckInfo> orphanHdfsDirs = Collections.synchronizedList(new ArrayList<HbckInfo>());
274
275 private Map<TableName, Set<String>> orphanTableDirs =
276 new HashMap<TableName, Set<String>>();
277
278
279
280
281
282
283
284
285 public HBaseFsck(Configuration conf) throws MasterNotRunningException,
286 ZooKeeperConnectionException, IOException, ClassNotFoundException {
287 super(conf);
288
289 setConf(HBaseConfiguration.create(getConf()));
290
291 getConf().setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0);
292 errors = getErrorReporter(conf);
293
294 int numThreads = conf.getInt("hbasefsck.numthreads", MAX_NUM_THREADS);
295 executor = new ScheduledThreadPoolExecutor(numThreads, Threads.newDaemonThreadFactory("hbasefsck"));
296 }
297
298
299
300
301
302
303
304
305
306
307
308 public HBaseFsck(Configuration conf, ExecutorService exec) throws MasterNotRunningException,
309 ZooKeeperConnectionException, IOException, ClassNotFoundException {
310 super(conf);
311 errors = getErrorReporter(getConf());
312 this.executor = exec;
313 }
314
315
316
317
318
319
320
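/**
 * Creates the hbck lock file under the HBase temp directory and writes the local
 * host name into it. Returns the open output stream on success, or null if the lock
 * file is already being created by another hbck instance.
 */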
321 private FSDataOutputStream checkAndMarkRunningHbck() throws IOException {
322 try {
323 FileSystem fs = FSUtils.getCurrentFileSystem(getConf());
324 FsPermission defaultPerms = FSUtils.getFilePermissions(fs, getConf(),
325 HConstants.DATA_FILE_UMASK_KEY);
326 Path tmpDir = new Path(FSUtils.getRootDir(getConf()), HConstants.HBASE_TEMP_DIRECTORY);
327 fs.mkdirs(tmpDir);
328 HBCK_LOCK_PATH = new Path(tmpDir, HBCK_LOCK_FILE);
329 final FSDataOutputStream out = FSUtils.create(fs, HBCK_LOCK_PATH, defaultPerms, false);
330 out.writeBytes(InetAddress.getLocalHost().toString());
331 out.flush();
332 return out;
333 } catch(RemoteException e) {
334 if(AlreadyBeingCreatedException.class.getName().equals(e.getClassName())){
335 return null;
336 } else {
337 throw e;
338 }
339 }
340 }
341
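/**
 * Releases the hbck lock taken in checkAndMarkRunningHbck(): closes the stream and
 * deletes the lock file. Guarded by hbckLockCleanup so cleanup runs at most once.
 */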
342 private void unlockHbck() {
343 if(hbckLockCleanup.compareAndSet(true, false)){
344 IOUtils.closeStream(hbckOutFd);
345 try{
346 FSUtils.delete(FSUtils.getCurrentFileSystem(getConf()), HBCK_LOCK_PATH, true);
347 } catch(IOException ioe) {
348 LOG.warn("Failed to delete " + HBCK_LOCK_PATH);
349 LOG.debug(ioe);
350 }
351 }
352 }
353
354
355
356
357
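/**
 * Sets up an online fsck run: acquires the exclusive hbck lock (failing fast if
 * another hbck instance holds it), registers a shutdown hook to release the lock,
 * and opens the cluster connection, admin, hbase:meta table and cluster status used
 * by the rest of the checks.
 */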
358 public void connect() throws IOException {
359
360
361 hbckOutFd = checkAndMarkRunningHbck();
362 if (hbckOutFd == null) {
363 setRetCode(-1);
364 LOG.error("Another instance of hbck is running, exiting this instance.[If you are sure" +
365 " no other instance is running, delete the lock file " +
366 HBCK_LOCK_PATH + " and rerun the tool]");
367 throw new IOException("Duplicate hbck - Abort");
368 }
369
370
371 hbckLockCleanup.set(true);
372
373
374
375
376 Runtime.getRuntime().addShutdownHook(new Thread() {
377 @Override
378 public void run() {
379 unlockHbck();
380 }
381 });
382 LOG.debug("Launching hbck");
383
384 connection = HConnectionManager.createConnection(getConf());
385 admin = new HBaseAdmin(connection);
386 meta = new HTable(TableName.META_TABLE_NAME, connection);
387 status = admin.getClusterStatus();
388 }
389
390
391
392
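/**
 * Prints a summary of the cluster (live and dead region servers, masters, load,
 * regions in transition) from the cached ClusterStatus, then contacts each region
 * server to record which regions it is currently serving.
 */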
393 private void loadDeployedRegions() throws IOException, InterruptedException {
394
395 Collection<ServerName> regionServers = status.getServers();
396 errors.print("Number of live region servers: " + regionServers.size());
397 if (details) {
398 for (ServerName rsinfo: regionServers) {
399 errors.print(" " + rsinfo.getServerName());
400 }
401 }
402
403
404 Collection<ServerName> deadRegionServers = status.getDeadServerNames();
405 errors.print("Number of dead region servers: " + deadRegionServers.size());
406 if (details) {
407 for (ServerName name: deadRegionServers) {
408 errors.print(" " + name);
409 }
410 }
411
412
413 errors.print("Master: " + status.getMaster());
414
415
416 Collection<ServerName> backupMasters = status.getBackupMasters();
417 errors.print("Number of backup masters: " + backupMasters.size());
418 if (details) {
419 for (ServerName name: backupMasters) {
420 errors.print(" " + name);
421 }
422 }
423
424 errors.print("Average load: " + status.getAverageLoad());
425 errors.print("Number of requests: " + status.getRequestsCount());
426 errors.print("Number of regions: " + status.getRegionsCount());
427
428 Map<String, RegionState> rits = status.getRegionsInTransition();
429 errors.print("Number of regions in transition: " + rits.size());
430 if (details) {
431 for (RegionState state: rits.values()) {
432 errors.print(" " + state.toDescriptiveString());
433 }
434 }
435
436
437 processRegionServers(regionServers);
438 }
439
440
441
442
443 private void clearState() {
444
445 fixes = 0;
446 regionInfoMap.clear();
447 emptyRegionInfoQualifiers.clear();
448 disabledTables.clear();
449 errors.clear();
450 tablesInfo.clear();
451 orphanHdfsDirs.clear();
452 }
453
454
455
456
457
458
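/**
 * Iteratively repairs HDFS-level problems (orphan region dirs, holes, overlaps,
 * orphan tables) while fixes are still being made, up to
 * hbase.hbck.integrityrepair.iterations.max iterations. Only runs when HDFS checking
 * and at least one of the HDFS fix options are enabled.
 */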
459 public void offlineHdfsIntegrityRepair() throws IOException, InterruptedException {
460
461 if (shouldCheckHdfs() && (shouldFixHdfsOrphans() || shouldFixHdfsHoles()
462 || shouldFixHdfsOverlaps() || shouldFixTableOrphans())) {
463 LOG.info("Loading regioninfos HDFS");
464
465 int maxIterations = getConf().getInt("hbase.hbck.integrityrepair.iterations.max", 3);
466 int curIter = 0;
467 do {
468 clearState();
469
470 restoreHdfsIntegrity();
471 curIter++;
472 } while (fixes > 0 && curIter <= maxIterations);
473
474
475
476 if (curIter > 2) {
477 if (curIter == maxIterations) {
478 LOG.warn("Exiting integrity repairs after max " + curIter + " iterations. "
479 + "Tables integrity may not be fully repaired!");
480 } else {
481 LOG.info("Successfully exiting integrity repairs after " + curIter + " iterations");
482 }
483 }
484 }
485 }
486
487
488
489
490
491
492
493
494
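/**
 * Single pass of the online consistency checks: verifies hbase:meta itself, loads
 * region entries from META (and from HDFS when enabled), then checks and optionally
 * fixes per-region consistency and per-table integrity. Returns the number of errors
 * found, or a negative value if hbase:meta could not be processed.
 */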
495 public int onlineConsistencyRepair() throws IOException, KeeperException,
496 InterruptedException {
497 clearState();
498
499
500 loadDeployedRegions();
501
502 recordMetaRegion();
503
504 if (!checkMetaRegion()) {
505 String errorMsg = "hbase:meta table is not consistent. ";
506 if (shouldFixAssignments()) {
507 errorMsg += "HBCK will try fixing it. Rerun once hbase:meta is back to consistent state.";
508 } else {
509 errorMsg += "Run HBCK with proper fix options to fix hbase:meta inconsistency.";
510 }
511 errors.reportError(errorMsg + " Exiting...");
512 return -2;
513 }
514
515 LOG.info("Loading regionsinfo from the hbase:meta table");
516 boolean success = loadMetaEntries();
517 if (!success) return -1;
518
519
520 reportEmptyMetaCells();
521
522
523 if (shouldFixEmptyMetaCells()) {
524 fixEmptyMetaCells();
525 }
526
527
528 if (!checkMetaOnly) {
529 reportTablesInFlux();
530 }
531
532
533 if (shouldCheckHdfs()) {
534 loadHdfsRegionDirs();
535 loadHdfsRegionInfos();
536 }
537
538
539 loadDisabledTables();
540
541
542 fixOrphanTables();
543
544
545 checkAndFixConsistency();
546
547
548 checkIntegrity();
549 return errors.getErrorList().size();
550 }
551
552
553
554
555
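/**
 * Top-level entry point for an online hbck run: performs the offline HDFS integrity
 * repairs, runs the consistency checks with the balancer temporarily disabled, then
 * the optional region boundary check, lingering reference file repair and table lock
 * check, releases the hbck lock and prints the summary.
 */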
556 public int onlineHbck() throws IOException, KeeperException, InterruptedException, ServiceException {
557
558 errors.print("Version: " + status.getHBaseVersion());
559 offlineHdfsIntegrityRepair();
560
561
562 boolean oldBalancer = admin.setBalancerRunning(false, true);
563 try {
564 onlineConsistencyRepair();
565 }
566 finally {
567 admin.setBalancerRunning(oldBalancer, false);
568 }
569
570 if (checkRegionBoundaries) {
571 checkRegionBoundaries();
572 }
573
574 offlineReferenceFileRepair();
575
576 checkAndFixTableLocks();
577
578
579 unlockHbck();
580
581
582 printTableSummary(tablesInfo);
583 return errors.summarize();
584 }
585
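/**
 * Extracts the row portion from a serialized KeyValue key: the first short is the
 * row length, followed by the row bytes themselves.
 */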
586 public static byte[] keyOnly (byte[] b) {
587 if (b == null)
588 return b;
589 int rowlength = Bytes.toShort(b, 0);
590 byte[] result = new byte[rowlength];
591 System.arraycopy(b, Bytes.SIZEOF_SHORT, result, 0, rowlength);
592 return result;
593 }
594
595 private static class RegionBoundariesInformation {
596 public byte [] regionName;
597 public byte [] metaFirstKey;
598 public byte [] metaLastKey;
599 public byte [] storesFirstKey;
600 public byte [] storesLastKey;
601 @Override
602 public String toString () {
603 return "regionName=" + Bytes.toStringBinary(regionName) +
604 "\nmetaFirstKey=" + Bytes.toStringBinary(metaFirstKey) +
605 "\nmetaLastKey=" + Bytes.toStringBinary(metaLastKey) +
606 "\nstoresFirstKey=" + Bytes.toStringBinary(storesFirstKey) +
607 "\nstoresLastKey=" + Bytes.toStringBinary(storesLastKey);
608 }
609 }
610
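/**
 * For every region listed in META, compares the start/end keys recorded in META with
 * the smallest first key and largest last key found in the region's store files, and
 * reports a BOUNDARIES_ERROR when the store file keys fall outside the META range.
 */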
611 public void checkRegionBoundaries() {
612 try {
613 ByteArrayComparator comparator = new ByteArrayComparator();
614 List<HRegionInfo> regions = MetaScanner.listAllRegions(getConf(), false);
615 final RegionBoundariesInformation currentRegionBoundariesInformation =
616 new RegionBoundariesInformation();
617 Path hbaseRoot = FSUtils.getRootDir(getConf());
618 for (HRegionInfo regionInfo : regions) {
619 Path tableDir = FSUtils.getTableDir(hbaseRoot, regionInfo.getTable());
620 currentRegionBoundariesInformation.regionName = regionInfo.getRegionName();
621
622
623 Path path = new Path(tableDir, regionInfo.getEncodedName());
624 FileSystem fs = path.getFileSystem(getConf());
625 FileStatus[] files = fs.listStatus(path);
626
627 byte[] storeFirstKey = null;
628 byte[] storeLastKey = null;
629 for (FileStatus file : files) {
630 String fileName = file.getPath().toString();
631 fileName = fileName.substring(fileName.lastIndexOf("/") + 1);
632 if (!fileName.startsWith(".") && !fileName.endsWith("recovered.edits")) {
633 FileStatus[] storeFiles = fs.listStatus(file.getPath());
634
635 for (FileStatus storeFile : storeFiles) {
636 HFile.Reader reader = HFile.createReader(fs, storeFile.getPath(), new CacheConfig(
637 getConf()), getConf());
638 if ((reader.getFirstKey() != null)
639 && ((storeFirstKey == null) || (comparator.compare(storeFirstKey,
640 reader.getFirstKey()) > 0))) {
641 storeFirstKey = reader.getFirstKey();
642 }
643 if ((reader.getLastKey() != null)
644 && ((storeLastKey == null) || (comparator.compare(storeLastKey,
645 reader.getLastKey())) < 0)) {
646 storeLastKey = reader.getLastKey();
647 }
648 reader.close();
649 }
650 }
651 }
652 currentRegionBoundariesInformation.metaFirstKey = regionInfo.getStartKey();
653 currentRegionBoundariesInformation.metaLastKey = regionInfo.getEndKey();
654 currentRegionBoundariesInformation.storesFirstKey = keyOnly(storeFirstKey);
655 currentRegionBoundariesInformation.storesLastKey = keyOnly(storeLastKey);
656 if (currentRegionBoundariesInformation.metaFirstKey.length == 0)
657 currentRegionBoundariesInformation.metaFirstKey = null;
658 if (currentRegionBoundariesInformation.metaLastKey.length == 0)
659 currentRegionBoundariesInformation.metaLastKey = null;
660
661
662
663
664
665
666 boolean valid = true;
667
668 if ((currentRegionBoundariesInformation.storesFirstKey != null)
669 && (currentRegionBoundariesInformation.metaFirstKey != null)) {
670 valid = valid
671 && comparator.compare(currentRegionBoundariesInformation.storesFirstKey,
672 currentRegionBoundariesInformation.metaFirstKey) >= 0;
673 }
674
675 if ((currentRegionBoundariesInformation.storesLastKey != null)
676 && (currentRegionBoundariesInformation.metaLastKey != null)) {
677 valid = valid
678 && comparator.compare(currentRegionBoundariesInformation.storesLastKey,
679 currentRegionBoundariesInformation.metaLastKey) < 0;
680 }
681 if (!valid) {
682 errors.reportError(ERROR_CODE.BOUNDARIES_ERROR, "Found issues with regions boundaries",
683 tablesInfo.get(regionInfo.getTable()));
684 LOG.warn("Region's boundaries not alligned between stores and META for:");
685 LOG.warn(currentRegionBoundariesInformation);
686 }
687 }
688 } catch (IOException e) {
689 LOG.error(e);
690 }
691 }
692
693
694
695
696 private void adoptHdfsOrphans(Collection<HbckInfo> orphanHdfsDirs) throws IOException {
697 for (HbckInfo hi : orphanHdfsDirs) {
698 LOG.info("Attempting to handle orphan hdfs dir: " + hi.getHdfsRegionDir());
699 adoptHdfsOrphan(hi);
700 }
701 }
702
703
704
705
706
707
708
709
710
711
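/**
 * Adopts a single orphan HDFS region directory: scans its HFiles to compute the key
 * range they cover, creates a fresh region for that range from the table's
 * descriptor, and merges the orphan's data into it. Directories with no data are
 * sidelined instead.
 */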
712 @SuppressWarnings("deprecation")
713 private void adoptHdfsOrphan(HbckInfo hi) throws IOException {
714 Path p = hi.getHdfsRegionDir();
715 FileSystem fs = p.getFileSystem(getConf());
716 FileStatus[] dirs = fs.listStatus(p);
717 if (dirs == null) {
718 LOG.warn("Attempt to adopt ophan hdfs region skipped becuase no files present in " +
719 p + ". This dir could probably be deleted.");
720 return ;
721 }
722
723 TableName tableName = hi.getTableName();
724 TableInfo tableInfo = tablesInfo.get(tableName);
725 Preconditions.checkNotNull(tableInfo, "Table '" + tableName + "' not present!");
726 HTableDescriptor template = tableInfo.getHTD();
727
728
729 Pair<byte[],byte[]> orphanRegionRange = null;
730 for (FileStatus cf : dirs) {
731 String cfName= cf.getPath().getName();
732
733 if (cfName.startsWith(".") || cfName.equals(HConstants.SPLIT_LOGDIR_NAME)) continue;
734
735 FileStatus[] hfiles = fs.listStatus(cf.getPath());
736 for (FileStatus hfile : hfiles) {
737 byte[] start, end;
738 HFile.Reader hf = null;
739 try {
740 CacheConfig cacheConf = new CacheConfig(getConf());
741 hf = HFile.createReader(fs, hfile.getPath(), cacheConf, getConf());
742 hf.loadFileInfo();
743 KeyValue startKv = KeyValue.createKeyValueFromKey(hf.getFirstKey());
744 start = startKv.getRow();
745 KeyValue endKv = KeyValue.createKeyValueFromKey(hf.getLastKey());
746 end = endKv.getRow();
747 } catch (IOException ioe) {
748 LOG.warn("Problem reading orphan file " + hfile + ", skipping");
749 continue;
750 } catch (NullPointerException ioe) {
751 LOG.warn("Orphan file " + hfile + " is possibly corrupted HFile, skipping");
752 continue;
753 } finally {
754 if (hf != null) {
755 hf.close();
756 }
757 }
758
759
760 if (orphanRegionRange == null) {
761
762 orphanRegionRange = new Pair<byte[], byte[]>(start, end);
763 } else {
764
765
766
767 if (Bytes.compareTo(orphanRegionRange.getFirst(), start) > 0) {
768 orphanRegionRange.setFirst(start);
769 }
770 if (Bytes.compareTo(orphanRegionRange.getSecond(), end) < 0 ) {
771 orphanRegionRange.setSecond(end);
772 }
773 }
774 }
775 }
776 if (orphanRegionRange == null) {
777 LOG.warn("No data in dir " + p + ", sidelining data");
778 fixes++;
779 sidelineRegionDir(fs, hi);
780 return;
781 }
782 LOG.info("Min max keys are : [" + Bytes.toString(orphanRegionRange.getFirst()) + ", " +
783 Bytes.toString(orphanRegionRange.getSecond()) + ")");
784
785
786 HRegionInfo hri = new HRegionInfo(template.getTableName(), orphanRegionRange.getFirst(), orphanRegionRange.getSecond());
787 LOG.info("Creating new region : " + hri);
788 HRegion region = HBaseFsckRepair.createHDFSRegionDir(getConf(), hri, template);
789 Path target = region.getRegionFileSystem().getRegionDir();
790
791
792 mergeRegionDirs(target, hi);
793 fixes++;
794 }
795
796
797
798
799
800
801
802
803
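/**
 * One repair pass over region metadata in HDFS: loads region dirs and .regioninfo
 * files, checks table integrity, and, depending on the fix options, adopts orphan
 * region dirs and fixes holes and overlaps. Returns the number of outstanding errors.
 */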
804 private int restoreHdfsIntegrity() throws IOException, InterruptedException {
805
806 LOG.info("Loading HBase regioninfo from HDFS...");
807 loadHdfsRegionDirs();
808
809 int errs = errors.getErrorList().size();
810
811 tablesInfo = loadHdfsRegionInfos();
812 checkHdfsIntegrity(false, false);
813
814 if (errors.getErrorList().size() == errs) {
815 LOG.info("No integrity errors. We are done with this phase. Glorious.");
816 return 0;
817 }
818
819 if (shouldFixHdfsOrphans() && orphanHdfsDirs.size() > 0) {
820 adoptHdfsOrphans(orphanHdfsDirs);
821
822 }
823
824
825 if (shouldFixHdfsHoles()) {
826 clearState();
827 loadHdfsRegionDirs();
828 tablesInfo = loadHdfsRegionInfos();
829 tablesInfo = checkHdfsIntegrity(shouldFixHdfsHoles(), false);
830 }
831
832
833 if (shouldFixHdfsOverlaps()) {
834
835 clearState();
836 loadHdfsRegionDirs();
837 tablesInfo = loadHdfsRegionInfos();
838 tablesInfo = checkHdfsIntegrity(false, shouldFixHdfsOverlaps());
839 }
840
841 return errors.getErrorList().size();
842 }
843
844
845
846
847
848
849
850
851
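/**
 * Scans all store files under the HBase root dir for reference files whose
 * referred-to file no longer exists. Such lingering references are reported and, when
 * reference file fixing is enabled, sidelined so the region can open.
 */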
852 private void offlineReferenceFileRepair() throws IOException {
853 Configuration conf = getConf();
854 Path hbaseRoot = FSUtils.getRootDir(conf);
855 FileSystem fs = hbaseRoot.getFileSystem(conf);
856 Map<String, Path> allFiles = FSUtils.getTableStoreFilePathMap(fs, hbaseRoot);
857 for (Path path: allFiles.values()) {
858 boolean isReference = false;
859 try {
860 isReference = StoreFileInfo.isReference(path);
861 } catch (Throwable t) {
862
863
864
865
866 }
867 if (!isReference) continue;
868
869 Path referredToFile = StoreFileInfo.getReferredToFile(path);
870 if (fs.exists(referredToFile)) continue;
871
872
873 errors.reportError(ERROR_CODE.LINGERING_REFERENCE_HFILE,
874 "Found lingering reference file " + path);
875 if (!shouldFixReferenceFiles()) continue;
876
877
878 boolean success = false;
879 String pathStr = path.toString();
880
881
882
883
884
885 int index = pathStr.lastIndexOf(Path.SEPARATOR_CHAR);
886 for (int i = 0; index > 0 && i < 5; i++) {
887 index = pathStr.lastIndexOf(Path.SEPARATOR_CHAR, index - 1);
888 }
889 if (index > 0) {
890 Path rootDir = getSidelineDir();
891 Path dst = new Path(rootDir, pathStr.substring(index + 1));
892 fs.mkdirs(dst.getParent());
893 LOG.info("Trying to sildeline reference file "
894 + path + " to " + dst);
895 setShouldRerun();
896
897 success = fs.rename(path, dst);
898 }
899 if (!success) {
900 LOG.error("Failed to sideline reference file " + path);
901 }
902 }
903 }
904
905
906
907
908 private void reportEmptyMetaCells() {
909 errors.print("Number of empty REGIONINFO_QUALIFIER rows in hbase:meta: " +
910 emptyRegionInfoQualifiers.size());
911 if (details) {
912 for (Result r: emptyRegionInfoQualifiers) {
913 errors.print(" " + r);
914 }
915 }
916 }
917
918
919
920
921 private void reportTablesInFlux() {
922 AtomicInteger numSkipped = new AtomicInteger(0);
923 HTableDescriptor[] allTables = getTables(numSkipped);
924 errors.print("Number of Tables: " + allTables.length);
925 if (details) {
926 if (numSkipped.get() > 0) {
927 errors.detail("Number of Tables in flux: " + numSkipped.get());
928 }
929 for (HTableDescriptor td : allTables) {
930 errors.detail(" Table: " + td.getTableName() + "\t" +
931 (td.isReadOnly() ? "ro" : "rw") + "\t" +
932 (td.isMetaRegion() ? "META" : " ") + "\t" +
933 " families: " + td.getFamilies().size());
934 }
935 }
936 }
937
938 public ErrorReporter getErrors() {
939 return errors;
940 }
941
942
943
944
945
946 private void loadHdfsRegioninfo(HbckInfo hbi) throws IOException {
947 Path regionDir = hbi.getHdfsRegionDir();
948 if (regionDir == null) {
949 LOG.warn("No HDFS region dir found: " + hbi + " meta=" + hbi.metaEntry);
950 return;
951 }
952
953 if (hbi.hdfsEntry.hri != null) {
954
955 return;
956 }
957
958 FileSystem fs = FileSystem.get(getConf());
959 HRegionInfo hri = HRegionFileSystem.loadRegionInfoFileContent(fs, regionDir);
960 LOG.debug("HRegionInfo read: " + hri.toString());
961 hbi.hdfsEntry.hri = hri;
962 }
963
964
965
966
967
968 public static class RegionRepairException extends IOException {
969 private static final long serialVersionUID = 1L;
970 final IOException ioe;
971 public RegionRepairException(String s, IOException ioe) {
972 super(s);
973 this.ioe = ioe;
974 }
975 }
976
977
978
979
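/**
 * Reads the .regioninfo file of every region found in HDFS (in parallel via the
 * executor), groups the regions into TableInfo entries keyed by table name, and
 * records tables whose .tableinfo file cannot be read as orphan table dirs.
 */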
980 private SortedMap<TableName, TableInfo> loadHdfsRegionInfos()
981 throws IOException, InterruptedException {
982 tablesInfo.clear();
983
984 Collection<HbckInfo> hbckInfos = regionInfoMap.values();
985
986
987 List<WorkItemHdfsRegionInfo> hbis = new ArrayList<WorkItemHdfsRegionInfo>(hbckInfos.size());
988 List<Future<Void>> hbiFutures;
989
990 for (HbckInfo hbi : hbckInfos) {
991 WorkItemHdfsRegionInfo work = new WorkItemHdfsRegionInfo(hbi, this, errors);
992 hbis.add(work);
993 }
994
995
996 hbiFutures = executor.invokeAll(hbis);
997
998 for(int i=0; i<hbiFutures.size(); i++) {
999 WorkItemHdfsRegionInfo work = hbis.get(i);
1000 Future<Void> f = hbiFutures.get(i);
1001 try {
1002 f.get();
1003 } catch(ExecutionException e) {
1004 LOG.warn("Failed to read .regioninfo file for region " +
1005 work.hbi.getRegionNameAsString(), e.getCause());
1006 }
1007 }
1008
1009 Path hbaseRoot = FSUtils.getRootDir(getConf());
1010 FileSystem fs = hbaseRoot.getFileSystem(getConf());
1011
1012 for (HbckInfo hbi: hbckInfos) {
1013
1014 if (hbi.getHdfsHRI() == null) {
1015
1016 continue;
1017 }
1018
1019
1020
1021 TableName tableName = hbi.getTableName();
1022 if (tableName == null) {
1023
1024 LOG.warn("tableName was null for: " + hbi);
1025 continue;
1026 }
1027
1028 TableInfo modTInfo = tablesInfo.get(tableName);
1029 if (modTInfo == null) {
1030
1031 modTInfo = new TableInfo(tableName);
1032 tablesInfo.put(tableName, modTInfo);
1033 try {
1034 HTableDescriptor htd =
1035 FSTableDescriptors.getTableDescriptorFromFs(fs, hbaseRoot, tableName);
1036 modTInfo.htds.add(htd);
1037 } catch (IOException ioe) {
1038 if (!orphanTableDirs.containsKey(tableName)) {
1039 LOG.warn("Unable to read .tableinfo from " + hbaseRoot, ioe);
1040
1041 errors.reportError(ERROR_CODE.NO_TABLEINFO_FILE,
1042 "Unable to read .tableinfo from " + hbaseRoot + "/" + tableName);
1043 Set<String> columns = new HashSet<String>();
1044 orphanTableDirs.put(tableName, getColumnFamilyList(columns, hbi));
1045 }
1046 }
1047 }
1048 if (!hbi.isSkipChecks()) {
1049 modTInfo.addRegionInfo(hbi);
1050 }
1051 }
1052
1053 loadTableInfosForTablesWithNoRegion();
1054
1055 return tablesInfo;
1056 }
1057
1058
1059
1060
1061
1062
1063
1064
1065 private Set<String> getColumnFamilyList(Set<String> columns, HbckInfo hbi) throws IOException {
1066 Path regionDir = hbi.getHdfsRegionDir();
1067 FileSystem fs = regionDir.getFileSystem(getConf());
1068 FileStatus[] subDirs = fs.listStatus(regionDir, new FSUtils.FamilyDirFilter(fs));
1069 for (FileStatus subdir : subDirs) {
1070 String columnfamily = subdir.getPath().getName();
1071 columns.add(columnfamily);
1072 }
1073 return columns;
1074 }
1075
1076
1077
1078
1079
1080
1081
1082
1083 private boolean fabricateTableInfo(FSTableDescriptors fstd, TableName tableName,
1084 Set<String> columns) throws IOException {
1085 if (columns ==null || columns.isEmpty()) return false;
1086 HTableDescriptor htd = new HTableDescriptor(tableName);
1087 for (String columnFamily : columns) {
1088 htd.addFamily(new HColumnDescriptor(columnFamily));
1089 }
1090 fstd.createTableDescriptor(htd, true);
1091 return true;
1092 }
1093
1094
1095
1096
1097
1098 public void fixEmptyMetaCells() throws IOException {
1099 if (shouldFixEmptyMetaCells() && !emptyRegionInfoQualifiers.isEmpty()) {
1100 LOG.info("Trying to fix empty REGIONINFO_QUALIFIER hbase:meta rows.");
1101 for (Result region : emptyRegionInfoQualifiers) {
1102 deleteMetaRegion(region.getRow());
1103 errors.getErrorList().remove(ERROR_CODE.EMPTY_META_CELL);
1104 }
1105 emptyRegionInfoQualifiers.clear();
1106 }
1107 }
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
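/**
 * Attempts to restore a .tableinfo file for every orphan table directory: reuses the
 * cached HTableDescriptor when one is available, otherwise fabricates a default
 * descriptor from the column family directories found on disk.
 */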
1118 public void fixOrphanTables() throws IOException {
1119 if (shouldFixTableOrphans() && !orphanTableDirs.isEmpty()) {
1120
1121 List<TableName> tmpList = new ArrayList<TableName>();
1122 tmpList.addAll(orphanTableDirs.keySet());
1123 HTableDescriptor[] htds = getHTableDescriptors(tmpList);
1124 Iterator<Entry<TableName, Set<String>>> iter =
1125 orphanTableDirs.entrySet().iterator();
1126 int j = 0;
1127 int numFailedCase = 0;
1128 FSTableDescriptors fstd = new FSTableDescriptors(getConf());
1129 while (iter.hasNext()) {
1130 Entry<TableName, Set<String>> entry =
1131 iter.next();
1132 TableName tableName = entry.getKey();
1133 LOG.info("Trying to fix orphan table error: " + tableName);
1134 if (j < htds.length) {
1135 if (tableName.equals(htds[j].getTableName())) {
1136 HTableDescriptor htd = htds[j];
1137 LOG.info("fixing orphan table: " + tableName + " from cache");
1138 fstd.createTableDescriptor(htd, true);
1139 j++;
1140 iter.remove();
1141 }
1142 } else {
1143 if (fabricateTableInfo(fstd, tableName, entry.getValue())) {
1144 LOG.warn("fixing orphan table: " + tableName + " with a default .tableinfo file");
1145 LOG.warn("Strongly recommend to modify the HTableDescriptor if necessary for: " + tableName);
1146 iter.remove();
1147 } else {
1148 LOG.error("Unable to create default .tableinfo for " + tableName + " while missing column family information");
1149 numFailedCase++;
1150 }
1151 }
1152 fixes++;
1153 }
1154
1155 if (orphanTableDirs.isEmpty()) {
1156
1157
1158 setShouldRerun();
1159 LOG.warn("Strongly recommend to re-run manually hfsck after all orphanTableDirs being fixed");
1160 } else if (numFailedCase > 0) {
1161 LOG.error("Failed to fix " + numFailedCase
1162 + " OrphanTables with default .tableinfo files");
1163 }
1164
1165 }
1166
1167 orphanTableDirs.clear();
1168
1169 }
1170
1171
1172
1173
1174
1175
1176 private HRegion createNewMeta() throws IOException {
1177 Path rootdir = FSUtils.getRootDir(getConf());
1178 Configuration c = getConf();
1179 HRegionInfo metaHRI = new HRegionInfo(HRegionInfo.FIRST_META_REGIONINFO);
1180 HTableDescriptor metaDescriptor = new FSTableDescriptors(c).get(TableName.META_TABLE_NAME);
1181 MasterFileSystem.setInfoFamilyCachingForMeta(metaDescriptor, false);
1182 HRegion meta = HRegion.createHRegion(metaHRI, rootdir, c, metaDescriptor);
1183 MasterFileSystem.setInfoFamilyCachingForMeta(metaDescriptor, true);
1184 return meta;
1185 }
1186
1187
1188
1189
1190
1191
1192
1193 private ArrayList<Put> generatePuts(
1194 SortedMap<TableName, TableInfo> tablesInfo) throws IOException {
1195 ArrayList<Put> puts = new ArrayList<Put>();
1196 boolean hasProblems = false;
1197 for (Entry<TableName, TableInfo> e : tablesInfo.entrySet()) {
1198 TableName name = e.getKey();
1199
1200
1201 if (name.compareTo(TableName.META_TABLE_NAME) == 0) {
1202 continue;
1203 }
1204
1205 TableInfo ti = e.getValue();
1206 for (Entry<byte[], Collection<HbckInfo>> spl : ti.sc.getStarts().asMap()
1207 .entrySet()) {
1208 Collection<HbckInfo> his = spl.getValue();
1209 int sz = his.size();
1210 if (sz != 1) {
1211
1212 LOG.error("Split starting at " + Bytes.toStringBinary(spl.getKey())
1213 + " had " + sz + " regions instead of exactly 1." );
1214 hasProblems = true;
1215 continue;
1216 }
1217
1218
1219 HbckInfo hi = his.iterator().next();
1220 HRegionInfo hri = hi.getHdfsHRI();
1221 Put p = MetaEditor.makePutFromRegionInfo(hri);
1222 puts.add(p);
1223 }
1224 }
1225 return hasProblems ? null : puts;
1226 }
1227
1228
1229
1230
1231 private void suggestFixes(
1232 SortedMap<TableName, TableInfo> tablesInfo) throws IOException {
1233 for (TableInfo tInfo : tablesInfo.values()) {
1234 TableIntegrityErrorHandler handler = tInfo.new IntegrityFixSuggester(tInfo, errors);
1235 tInfo.checkRegionChain(handler);
1236 }
1237 }
1238
1239
1240
1241
1242
1243
1244
1245
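/**
 * Rebuilds hbase:meta purely from the region information found in HDFS: verifies
 * (and optionally repairs) table integrity first, sidelines the existing hbase:meta,
 * then creates a new META region and populates it with one entry per HDFS region.
 */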
1246 public boolean rebuildMeta(boolean fix) throws IOException,
1247 InterruptedException {
1248
1249
1250
1251
1252
1253 LOG.info("Loading HBase regioninfo from HDFS...");
1254 loadHdfsRegionDirs();
1255
1256 int errs = errors.getErrorList().size();
1257 tablesInfo = loadHdfsRegionInfos();
1258 checkHdfsIntegrity(false, false);
1259
1260
1261 if (errors.getErrorList().size() != errs) {
1262
1263 while(true) {
1264 fixes = 0;
1265 suggestFixes(tablesInfo);
1266 errors.clear();
1267 loadHdfsRegionInfos();
1268 checkHdfsIntegrity(shouldFixHdfsHoles(), shouldFixHdfsOverlaps());
1269
1270 int errCount = errors.getErrorList().size();
1271
1272 if (fixes == 0) {
1273 if (errCount > 0) {
1274 return false;
1275 } else {
1276 break;
1277 }
1278 }
1279 }
1280 }
1281
1282
1283 LOG.info("HDFS regioninfo's seems good. Sidelining old hbase:meta");
1284 Path backupDir = sidelineOldMeta();
1285
1286 LOG.info("Creating new hbase:meta");
1287 HRegion meta = createNewMeta();
1288
1289
1290 List<Put> puts = generatePuts(tablesInfo);
1291 if (puts == null) {
1292 LOG.fatal("Problem encountered when creating new hbase:meta entries. " +
1293 "You may need to restore the previously sidelined hbase:meta");
1294 return false;
1295 }
1296 meta.batchMutate(puts.toArray(new Put[puts.size()]));
1297 HRegion.closeHRegion(meta);
1298 LOG.info("Success! hbase:meta table rebuilt.");
1299 LOG.info("Old hbase:meta is moved into " + backupDir);
1300 return true;
1301 }
1302
1303 private SortedMap<TableName, TableInfo> checkHdfsIntegrity(boolean fixHoles,
1304 boolean fixOverlaps) throws IOException {
1305 LOG.info("Checking HBase region split map from HDFS data...");
1306 for (TableInfo tInfo : tablesInfo.values()) {
1307 TableIntegrityErrorHandler handler;
1308 if (fixHoles || fixOverlaps) {
1309 handler = tInfo.new HDFSIntegrityFixer(tInfo, errors, getConf(),
1310 fixHoles, fixOverlaps);
1311 } else {
1312 handler = tInfo.new IntegrityFixSuggester(tInfo, errors);
1313 }
1314 if (!tInfo.checkRegionChain(handler)) {
1315
1316 errors.report("Found inconsistency in table " + tInfo.getName());
1317 }
1318 }
1319 return tablesInfo;
1320 }
1321
1322 private Path getSidelineDir() throws IOException {
1323 if (sidelineDir == null) {
1324 Path hbaseDir = FSUtils.getRootDir(getConf());
1325 Path hbckDir = new Path(hbaseDir, HConstants.HBCK_SIDELINEDIR_NAME);
1326 sidelineDir = new Path(hbckDir, hbaseDir.getName() + "-"
1327 + startMillis);
1328 }
1329 return sidelineDir;
1330 }
1331
1332
1333
1334
1335 Path sidelineRegionDir(FileSystem fs, HbckInfo hi) throws IOException {
1336 return sidelineRegionDir(fs, null, hi);
1337 }
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347 Path sidelineRegionDir(FileSystem fs,
1348 String parentDir, HbckInfo hi) throws IOException {
1349 TableName tableName = hi.getTableName();
1350 Path regionDir = hi.getHdfsRegionDir();
1351
1352 if (!fs.exists(regionDir)) {
1353 LOG.warn("No previous " + regionDir + " exists. Continuing.");
1354 return null;
1355 }
1356
1357 Path rootDir = getSidelineDir();
1358 if (parentDir != null) {
1359 rootDir = new Path(rootDir, parentDir);
1360 }
1361 Path sidelineTableDir= FSUtils.getTableDir(rootDir, tableName);
1362 Path sidelineRegionDir = new Path(sidelineTableDir, regionDir.getName());
1363 fs.mkdirs(sidelineRegionDir);
1364 boolean success = false;
1365 FileStatus[] cfs = fs.listStatus(regionDir);
1366 if (cfs == null) {
1367 LOG.info("Region dir is empty: " + regionDir);
1368 } else {
1369 for (FileStatus cf : cfs) {
1370 Path src = cf.getPath();
1371 Path dst = new Path(sidelineRegionDir, src.getName());
1372 if (fs.isFile(src)) {
1373
1374 success = fs.rename(src, dst);
1375 if (!success) {
1376 String msg = "Unable to rename file " + src + " to " + dst;
1377 LOG.error(msg);
1378 throw new IOException(msg);
1379 }
1380 continue;
1381 }
1382
1383
1384 fs.mkdirs(dst);
1385
1386 LOG.info("Sidelining files from " + src + " into containing region " + dst);
1387
1388
1389
1390
1391 FileStatus[] hfiles = fs.listStatus(src);
1392 if (hfiles != null && hfiles.length > 0) {
1393 for (FileStatus hfile : hfiles) {
1394 success = fs.rename(hfile.getPath(), dst);
1395 if (!success) {
1396 String msg = "Unable to rename file " + src + " to " + dst;
1397 LOG.error(msg);
1398 throw new IOException(msg);
1399 }
1400 }
1401 }
1402 LOG.debug("Sideline directory contents:");
1403 debugLsr(sidelineRegionDir);
1404 }
1405 }
1406
1407 LOG.info("Removing old region dir: " + regionDir);
1408 success = fs.delete(regionDir, true);
1409 if (!success) {
1410 String msg = "Unable to delete dir " + regionDir;
1411 LOG.error(msg);
1412 throw new IOException(msg);
1413 }
1414 return sidelineRegionDir;
1415 }
1416
1417
1418
1419
1420 void sidelineTable(FileSystem fs, TableName tableName, Path hbaseDir,
1421 Path backupHbaseDir) throws IOException {
1422 Path tableDir = FSUtils.getTableDir(hbaseDir, tableName);
1423 if (fs.exists(tableDir)) {
1424 Path backupTableDir= FSUtils.getTableDir(backupHbaseDir, tableName);
1425 fs.mkdirs(backupTableDir.getParent());
1426 boolean success = fs.rename(tableDir, backupTableDir);
1427 if (!success) {
1428 throw new IOException("Failed to move " + tableName + " from "
1429 + tableDir + " to " + backupTableDir);
1430 }
1431 } else {
1432 LOG.info("No previous " + tableName + " exists. Continuing.");
1433 }
1434 }
1435
1436
1437
1438
1439 Path sidelineOldMeta() throws IOException {
1440
1441 Path hbaseDir = FSUtils.getRootDir(getConf());
1442 FileSystem fs = hbaseDir.getFileSystem(getConf());
1443 Path backupDir = getSidelineDir();
1444 fs.mkdirs(backupDir);
1445
1446 try {
1447 sidelineTable(fs, TableName.META_TABLE_NAME, hbaseDir, backupDir);
1448 } catch (IOException e) {
1449 LOG.fatal("... failed to sideline meta. Currently in inconsistent state. To restore "
1450 + "try to rename hbase:meta in " + backupDir.getName() + " to "
1451 + hbaseDir.getName() + ".", e);
1452 throw e;
1453 }
1454 return backupDir;
1455 }
1456
1457
1458
1459
1460
1461
1462 private void loadDisabledTables()
1463 throws ZooKeeperConnectionException, IOException {
1464 HConnectionManager.execute(new HConnectable<Void>(getConf()) {
1465 @Override
1466 public Void connect(HConnection connection) throws IOException {
1467 ZooKeeperWatcher zkw = createZooKeeperWatcher();
1468 try {
1469 for (TableName tableName :
1470 ZKTableReadOnly.getDisabledOrDisablingTables(zkw)) {
1471 disabledTables.add(tableName);
1472 }
1473 } catch (KeeperException ke) {
1474 throw new IOException(ke);
1475 } finally {
1476 zkw.close();
1477 }
1478 return null;
1479 }
1480 });
1481 }
1482
1483
1484
1485
1486 private boolean isTableDisabled(HRegionInfo regionInfo) {
1487 return disabledTables.contains(regionInfo.getTable());
1488 }
1489
1490
1491
1492
1493
1494 public void loadHdfsRegionDirs() throws IOException, InterruptedException {
1495 Path rootDir = FSUtils.getRootDir(getConf());
1496 FileSystem fs = rootDir.getFileSystem(getConf());
1497
1498
1499 List<FileStatus> tableDirs = Lists.newArrayList();
1500
1501 boolean foundVersionFile = fs.exists(new Path(rootDir, HConstants.VERSION_FILE_NAME));
1502
1503 List<Path> paths = FSUtils.getTableDirs(fs, rootDir);
1504 for (Path path : paths) {
1505 TableName tableName = FSUtils.getTableName(path);
1506 if ((!checkMetaOnly &&
1507 isTableIncluded(tableName)) ||
1508 tableName.equals(TableName.META_TABLE_NAME)) {
1509 tableDirs.add(fs.getFileStatus(path));
1510 }
1511 }
1512
1513
1514 if (!foundVersionFile) {
1515 errors.reportError(ERROR_CODE.NO_VERSION_FILE,
1516 "Version file does not exist in root dir " + rootDir);
1517 if (shouldFixVersionFile()) {
1518 LOG.info("Trying to create a new " + HConstants.VERSION_FILE_NAME
1519 + " file.");
1520 setShouldRerun();
1521 FSUtils.setVersion(fs, rootDir, getConf().getInt(
1522 HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000), getConf().getInt(
1523 HConstants.VERSION_FILE_WRITE_ATTEMPTS,
1524 HConstants.DEFAULT_VERSION_FILE_WRITE_ATTEMPTS));
1525 }
1526 }
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
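/**
 * Looks up the location of the hbase:meta region, taking the server name from the
 * ZooKeeper meta region tracker, and records it in regionInfoMap. Reports a
 * NULL_META_REGION error and returns false if the location is incomplete.
 */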
1552 private boolean recordMetaRegion() throws IOException {
1553 HRegionLocation metaLocation = connection.locateRegion(
1554 TableName.META_TABLE_NAME, HConstants.EMPTY_START_ROW);
1555
1556
1557 if (metaLocation == null || metaLocation.getRegionInfo() == null ||
1558 metaLocation.getHostname() == null) {
1559 errors.reportError(ERROR_CODE.NULL_META_REGION,
1560 "META region or some of its attributes are null.");
1561 return false;
1562 }
1563 ServerName sn;
1564 try {
1565 sn = getMetaRegionServerName();
1566 } catch (KeeperException e) {
1567 throw new IOException(e);
1568 }
1569 MetaEntry m = new MetaEntry(metaLocation.getRegionInfo(), sn, System.currentTimeMillis());
1570 HbckInfo hbckInfo = regionInfoMap.get(metaLocation.getRegionInfo().getEncodedName());
1571 if (hbckInfo == null) {
1572 regionInfoMap.put(metaLocation.getRegionInfo().getEncodedName(), new HbckInfo(m));
1573 } else {
1574 hbckInfo.metaEntry = m;
1575 }
1576 return true;
1577 }
1578
1579 private ZooKeeperWatcher createZooKeeperWatcher() throws IOException {
1580 return new ZooKeeperWatcher(getConf(), "hbase Fsck", new Abortable() {
1581 @Override
1582 public void abort(String why, Throwable e) {
1583 LOG.error(why, e);
1584 System.exit(1);
1585 }
1586
1587 @Override
1588 public boolean isAborted() {
1589 return false;
1590 }
1591
1592 });
1593 }
1594
1595 private ServerName getMetaRegionServerName()
1596 throws IOException, KeeperException {
1597 ZooKeeperWatcher zkw = createZooKeeperWatcher();
1598 ServerName sn = null;
1599 try {
1600 sn = MetaRegionTracker.getMetaRegionLocation(zkw);
1601 } finally {
1602 zkw.close();
1603 }
1604 return sn;
1605 }
1606
1607
1608
1609
1610
1611
1612 void processRegionServers(Collection<ServerName> regionServerList)
1613 throws IOException, InterruptedException {
1614
1615 List<WorkItemRegion> workItems = new ArrayList<WorkItemRegion>(regionServerList.size());
1616 List<Future<Void>> workFutures;
1617
1618
1619 for (ServerName rsinfo: regionServerList) {
1620 workItems.add(new WorkItemRegion(this, rsinfo, errors, connection));
1621 }
1622
1623 workFutures = executor.invokeAll(workItems);
1624
1625 for(int i=0; i<workFutures.size(); i++) {
1626 WorkItemRegion item = workItems.get(i);
1627 Future<Void> f = workFutures.get(i);
1628 try {
1629 f.get();
1630 } catch(ExecutionException e) {
1631 LOG.warn("Could not process regionserver " + item.rsinfo.getHostAndPort(),
1632 e.getCause());
1633 }
1634 }
1635 }
1636
1637
1638
1639
1640 private void checkAndFixConsistency()
1641 throws IOException, KeeperException, InterruptedException {
1642 for (java.util.Map.Entry<String, HbckInfo> e: regionInfoMap.entrySet()) {
1643 checkRegionConsistency(e.getKey(), e.getValue());
1644 }
1645 }
1646
1647 private void preCheckPermission() throws IOException, AccessDeniedException {
1648 if (shouldIgnorePreCheckPermission()) {
1649 return;
1650 }
1651
1652 Path hbaseDir = FSUtils.getRootDir(getConf());
1653 FileSystem fs = hbaseDir.getFileSystem(getConf());
1654 UserProvider userProvider = UserProvider.instantiate(getConf());
1655 UserGroupInformation ugi = userProvider.getCurrent().getUGI();
1656 FileStatus[] files = fs.listStatus(hbaseDir);
1657 for (FileStatus file : files) {
1658 try {
1659 FSUtils.checkAccess(ugi, file, FsAction.WRITE);
1660 } catch (AccessDeniedException ace) {
1661 LOG.warn("Got AccessDeniedException when preCheckPermission ", ace);
1662 errors.reportError(ERROR_CODE.WRONG_USAGE, "Current user " + ugi.getUserName()
1663 + " does not have write perms to " + file.getPath()
1664 + ". Please rerun hbck as hdfs user " + file.getOwner());
1665 throw ace;
1666 }
1667 }
1668 }
1669
1670
1671
1672
1673 private void deleteMetaRegion(HbckInfo hi) throws IOException {
1674 deleteMetaRegion(hi.metaEntry.getRegionName());
1675 }
1676
1677
1678
1679
1680 private void deleteMetaRegion(byte[] metaKey) throws IOException {
1681 Delete d = new Delete(metaKey);
1682 meta.delete(d);
1683 meta.flushCommits();
1684 LOG.info("Deleted " + Bytes.toString(metaKey) + " from META" );
1685 }
1686
1687
1688
1689
1690 private void resetSplitParent(HbckInfo hi) throws IOException {
1691 RowMutations mutations = new RowMutations(hi.metaEntry.getRegionName());
1692 Delete d = new Delete(hi.metaEntry.getRegionName());
1693 d.deleteColumn(HConstants.CATALOG_FAMILY, HConstants.SPLITA_QUALIFIER);
1694 d.deleteColumn(HConstants.CATALOG_FAMILY, HConstants.SPLITB_QUALIFIER);
1695 mutations.add(d);
1696
1697 HRegionInfo hri = new HRegionInfo(hi.metaEntry);
1698 hri.setOffline(false);
1699 hri.setSplit(false);
1700 Put p = MetaEditor.makePutFromRegionInfo(hri);
1701 mutations.add(p);
1702
1703 meta.mutateRow(mutations);
1704 meta.flushCommits();
1705 LOG.info("Reset split parent " + hi.metaEntry.getRegionNameAsString() + " in META" );
1706 }
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716 private void offline(byte[] regionName) throws IOException {
1717 String regionString = Bytes.toStringBinary(regionName);
1718 if (!rsSupportsOffline) {
1719 LOG.warn("Using unassign region " + regionString
1720 + " instead of using offline method, you should"
1721 + " restart HMaster after these repairs");
1722 admin.unassign(regionName, true);
1723 return;
1724 }
1725
1726
1727 try {
1728 LOG.info("Offlining region " + regionString);
1729 admin.offline(regionName);
1730 } catch (IOException ioe) {
1731 String notFoundMsg = "java.lang.NoSuchMethodException: " +
1732 "org.apache.hadoop.hbase.master.HMaster.offline([B)";
1733 if (ioe.getMessage().contains(notFoundMsg)) {
1734 LOG.warn("Using unassign region " + regionString
1735 + " instead of using offline method, you should"
1736 + " restart HMaster after these repairs");
1737 rsSupportsOffline = false;
1738 admin.unassign(regionName, true);
1739 return;
1740 }
1741 throw ioe;
1742 }
1743 }
1744
1745 private void undeployRegions(HbckInfo hi) throws IOException, InterruptedException {
1746 for (OnlineEntry rse : hi.deployedEntries) {
1747 LOG.debug("Undeploy region " + rse.hri + " from " + rse.hsa);
1748 try {
1749 HBaseFsckRepair.closeRegionSilentlyAndWait(admin, rse.hsa, rse.hri);
1750 offline(rse.hri.getRegionName());
1751 } catch (IOException ioe) {
1752 LOG.warn("Got exception when attempting to offline region "
1753 + Bytes.toString(rse.hri.getRegionName()), ioe);
1754 }
1755 }
1756 }
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770 private void closeRegion(HbckInfo hi) throws IOException, InterruptedException {
1771 if (hi.metaEntry == null && hi.hdfsEntry == null) {
1772 undeployRegions(hi);
1773 return;
1774 }
1775
1776
1777 Get get = new Get(hi.getRegionName());
1778 get.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
1779 get.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
1780 get.addColumn(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER);
1781 Result r = meta.get(get);
1782 ServerName serverName = HRegionInfo.getServerName(r);
1783 if (serverName == null) {
1784 errors.reportError("Unable to close region "
1785 + hi.getRegionNameAsString() + " because meta does not "
1786 + "have handle to reach it.");
1787 return;
1788 }
1789
1790 HRegionInfo hri = HRegionInfo.getHRegionInfo(r);
1791 if (hri == null) {
1792 LOG.warn("Unable to close region " + hi.getRegionNameAsString()
1793 + " because hbase:meta had invalid or missing "
1794 + HConstants.CATALOG_FAMILY_STR + ":"
1795 + Bytes.toString(HConstants.REGIONINFO_QUALIFIER)
1796 + " qualifier value.");
1797 return;
1798 }
1799
1800
1801 HBaseFsckRepair.closeRegionSilentlyAndWait(admin, serverName, hri);
1802 }
1803
1804 private void tryAssignmentRepair(HbckInfo hbi, String msg) throws IOException,
1805 KeeperException, InterruptedException {
1806
1807 if (shouldFixAssignments()) {
1808 errors.print(msg);
1809 undeployRegions(hbi);
1810 setShouldRerun();
1811 HRegionInfo hri = hbi.getHdfsHRI();
1812 if (hri == null) {
1813 hri = hbi.metaEntry;
1814 }
1815 HBaseFsckRepair.fixUnassigned(admin, hri);
1816 HBaseFsckRepair.waitUntilAssigned(admin, hri);
1817 }
1818 }
1819
1820
1821
1822
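/**
 * Checks a single region's consistency across hbase:meta, HDFS and its deployment on
 * region servers, reporting the matching error code for each bad combination and,
 * when the corresponding fix options are enabled, repairing assignments or META
 * entries.
 */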
1823 private void checkRegionConsistency(final String key, final HbckInfo hbi)
1824 throws IOException, KeeperException, InterruptedException {
1825 String descriptiveName = hbi.toString();
1826
1827 boolean inMeta = hbi.metaEntry != null;
1828
1829 boolean inHdfs = !shouldCheckHdfs() || hbi.getHdfsRegionDir() != null;
1830 boolean hasMetaAssignment = inMeta && hbi.metaEntry.regionServer != null;
1831 boolean isDeployed = !hbi.deployedOn.isEmpty();
1832 boolean isMultiplyDeployed = hbi.deployedOn.size() > 1;
1833 boolean deploymentMatchesMeta =
1834 hasMetaAssignment && isDeployed && !isMultiplyDeployed &&
1835 hbi.metaEntry.regionServer.equals(hbi.deployedOn.get(0));
1836 boolean splitParent =
1837 (hbi.metaEntry == null)? false: hbi.metaEntry.isSplit() && hbi.metaEntry.isOffline();
1838 boolean shouldBeDeployed = inMeta && !isTableDisabled(hbi.metaEntry);
1839 boolean recentlyModified = inHdfs &&
1840 hbi.getModTime() + timelag > System.currentTimeMillis();
1841
1842
1843 if (hbi.containsOnlyHdfsEdits()) {
1844 return;
1845 }
1846 if (inMeta && inHdfs && isDeployed && deploymentMatchesMeta && shouldBeDeployed) {
1847 return;
1848 } else if (inMeta && inHdfs && !shouldBeDeployed && !isDeployed) {
1849 LOG.info("Region " + descriptiveName + " is in META, and in a disabled " +
1850 "tabled that is not deployed");
1851 return;
1852 } else if (recentlyModified) {
1853 LOG.warn("Region " + descriptiveName + " was recently modified -- skipping");
1854 return;
1855 }
1856
1857 else if (!inMeta && !inHdfs && !isDeployed) {
1858
1859 assert false : "Entry for region with no data";
1860 } else if (!inMeta && !inHdfs && isDeployed) {
1861 errors.reportError(ERROR_CODE.NOT_IN_META_HDFS, "Region "
1862 + descriptiveName + ", key=" + key + ", not on HDFS or in hbase:meta but " +
1863 "deployed on " + Joiner.on(", ").join(hbi.deployedOn));
1864 if (shouldFixAssignments()) {
1865 undeployRegions(hbi);
1866 }
1867
1868 } else if (!inMeta && inHdfs && !isDeployed) {
1869 if (hbi.isMerged()) {
1870
1871
1872 hbi.setSkipChecks(true);
1873 LOG.info("Region " + descriptiveName
1874 + " got merge recently, its file(s) will be cleaned by CatalogJanitor later");
1875 return;
1876 }
1877 errors.reportError(ERROR_CODE.NOT_IN_META_OR_DEPLOYED, "Region "
1878 + descriptiveName + " on HDFS, but not listed in hbase:meta " +
1879 "or deployed on any region server");
1880
1881 if (shouldFixMeta()) {
1882 if (!hbi.isHdfsRegioninfoPresent()) {
1883 LOG.error("Region " + hbi.getHdfsHRI() + " could have been repaired"
1884 + " in table integrity repair phase if -fixHdfsOrphans was" +
1885 " used.");
1886 return;
1887 }
1888
1889 LOG.info("Patching hbase:meta with .regioninfo: " + hbi.getHdfsHRI());
1890 HBaseFsckRepair.fixMetaHoleOnline(getConf(), hbi.getHdfsHRI());
1891
1892 tryAssignmentRepair(hbi, "Trying to reassign region...");
1893 }
1894
1895 } else if (!inMeta && inHdfs && isDeployed) {
1896 errors.reportError(ERROR_CODE.NOT_IN_META, "Region " + descriptiveName
1897 + " not in META, but deployed on " + Joiner.on(", ").join(hbi.deployedOn));
1898 debugLsr(hbi.getHdfsRegionDir());
1899 if (shouldFixMeta()) {
1900 if (!hbi.isHdfsRegioninfoPresent()) {
1901 LOG.error("This should have been repaired in table integrity repair phase");
1902 return;
1903 }
1904
1905 LOG.info("Patching hbase:meta with with .regioninfo: " + hbi.getHdfsHRI());
1906 HBaseFsckRepair.fixMetaHoleOnline(getConf(), hbi.getHdfsHRI());
1907
1908 tryAssignmentRepair(hbi, "Trying to fix unassigned region...");
1909 }
1910
1911
1912 } else if (inMeta && inHdfs && !isDeployed && splitParent) {
1913
1914
1915 if (hbi.metaEntry.splitA != null && hbi.metaEntry.splitB != null) {
1916
1917 HbckInfo infoA = this.regionInfoMap.get(hbi.metaEntry.splitA.getEncodedName());
1918 HbckInfo infoB = this.regionInfoMap.get(hbi.metaEntry.splitB.getEncodedName());
1919 if (infoA != null && infoB != null) {
1920
1921 hbi.setSkipChecks(true);
1922 return;
1923 }
1924 }
1925 errors.reportError(ERROR_CODE.LINGERING_SPLIT_PARENT, "Region "
1926 + descriptiveName + " is a split parent in META, in HDFS, "
1927 + "and not deployed on any region server. This could be transient.");
1928 if (shouldFixSplitParents()) {
1929 setShouldRerun();
1930 resetSplitParent(hbi);
1931 }
1932 } else if (inMeta && !inHdfs && !isDeployed) {
1933 errors.reportError(ERROR_CODE.NOT_IN_HDFS_OR_DEPLOYED, "Region "
1934 + descriptiveName + " found in META, but not in HDFS "
1935 + "or deployed on any region server.");
1936 if (shouldFixMeta()) {
1937 deleteMetaRegion(hbi);
1938 }
1939 } else if (inMeta && !inHdfs && isDeployed) {
1940 errors.reportError(ERROR_CODE.NOT_IN_HDFS, "Region " + descriptiveName
1941 + " found in META, but not in HDFS, " +
1942 "and deployed on " + Joiner.on(", ").join(hbi.deployedOn));
1943
1944
1945
1946 if (shouldFixAssignments()) {
1947 errors.print("Trying to fix unassigned region...");
1948 undeployRegions(hbi);
1949 }
1950 if (shouldFixMeta()) {
1951
1952 deleteMetaRegion(hbi);
1953 }
1954 } else if (inMeta && inHdfs && !isDeployed && shouldBeDeployed) {
1955 errors.reportError(ERROR_CODE.NOT_DEPLOYED, "Region " + descriptiveName
1956 + " not deployed on any region server.");
1957 tryAssignmentRepair(hbi, "Trying to fix unassigned region...");
1958 } else if (inMeta && inHdfs && isDeployed && !shouldBeDeployed) {
1959 errors.reportError(ERROR_CODE.SHOULD_NOT_BE_DEPLOYED,
1960 "Region " + descriptiveName + " should not be deployed according " +
1961 "to META, but is deployed on " + Joiner.on(", ").join(hbi.deployedOn));
1962 if (shouldFixAssignments()) {
1963 errors.print("Trying to close the region " + descriptiveName);
1964 setShouldRerun();
1965 HBaseFsckRepair.fixMultiAssignment(admin, hbi.metaEntry, hbi.deployedOn);
1966 }
1967 } else if (inMeta && inHdfs && isMultiplyDeployed) {
1968 errors.reportError(ERROR_CODE.MULTI_DEPLOYED, "Region " + descriptiveName
1969 + " is listed in hbase:meta on region server " + hbi.metaEntry.regionServer
1970 + " but is multiply assigned to region servers " +
1971 Joiner.on(", ").join(hbi.deployedOn));
1972
1973 if (shouldFixAssignments()) {
1974 errors.print("Trying to fix assignment error...");
1975 setShouldRerun();
1976 HBaseFsckRepair.fixMultiAssignment(admin, hbi.metaEntry, hbi.deployedOn);
1977 }
1978 } else if (inMeta && inHdfs && isDeployed && !deploymentMatchesMeta) {
1979 errors.reportError(ERROR_CODE.SERVER_DOES_NOT_MATCH_META, "Region "
1980 + descriptiveName + " listed in hbase:meta on region server " +
1981 hbi.metaEntry.regionServer + " but found on region server " +
1982 hbi.deployedOn.get(0));
1983
1984 if (shouldFixAssignments()) {
1985 errors.print("Trying to fix assignment error...");
1986 setShouldRerun();
1987 HBaseFsckRepair.fixMultiAssignment(admin, hbi.metaEntry, hbi.deployedOn);
1988 HBaseFsckRepair.waitUntilAssigned(admin, hbi.getHdfsHRI());
1989 }
1990 } else {
1991 errors.reportError(ERROR_CODE.UNKNOWN, "Region " + descriptiveName +
1992 " is in an unforeseen state:" +
1993 " inMeta=" + inMeta +
1994 " inHdfs=" + inHdfs +
1995 " isDeployed=" + isDeployed +
1996 " isMultiplyDeployed=" + isMultiplyDeployed +
1997 " deploymentMatchesMeta=" + deploymentMatchesMeta +
1998 " shouldBeDeployed=" + shouldBeDeployed);
1999 }
2000 }
2001
2002
2003 /**
2004 * Checks table integrity. Goes over all regions and collects the pieces of each
2005 * table, then checks whether any regions are missing, repeated or overlapping.
2006 * @return map of table name to the collected table info
2007 */
2008 SortedMap<TableName, TableInfo> checkIntegrity() throws IOException {
2009 tablesInfo = new TreeMap<TableName,TableInfo> ();
2010 List<HbckInfo> noHDFSRegionInfos = new ArrayList<HbckInfo>();
2011 LOG.debug("There are " + regionInfoMap.size() + " region info entries");
2012 for (HbckInfo hbi : regionInfoMap.values()) {
2013
2014 if (hbi.metaEntry == null) {
2015
2016 noHDFSRegionInfos.add(hbi);
2017 Path p = hbi.getHdfsRegionDir();
2018 if (p == null) {
2019 errors.report("No regioninfo in Meta or HDFS. " + hbi);
2020 }
2021
2022
2023 continue;
2024 }
2025 if (hbi.metaEntry.regionServer == null) {
2026 errors.detail("Skipping region because no region server: " + hbi);
2027 continue;
2028 }
2029 if (hbi.metaEntry.isOffline()) {
2030 errors.detail("Skipping region because it is offline: " + hbi);
2031 continue;
2032 }
2033 if (hbi.containsOnlyHdfsEdits()) {
2034 errors.detail("Skipping region because it only contains edits: " + hbi);
2035 continue;
2036 }
2037
2038
2039
2040
2041
2042
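// Regions that are not deployed on any server are skipped here; the integrity check only considers live regions.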
2043 if (hbi.deployedOn.size() == 0) continue;
2044
2045
2046 TableName tableName = hbi.metaEntry.getTable();
2047 TableInfo modTInfo = tablesInfo.get(tableName);
2048 if (modTInfo == null) {
2049 modTInfo = new TableInfo(tableName);
2050 }
2051 for (ServerName server : hbi.deployedOn) {
2052 modTInfo.addServer(server);
2053 }
2054
2055 if (!hbi.isSkipChecks()) {
2056 modTInfo.addRegionInfo(hbi);
2057 }
2058
2059 tablesInfo.put(tableName, modTInfo);
2060 }
2061
2062 loadTableInfosForTablesWithNoRegion();
2063
2064 for (TableInfo tInfo : tablesInfo.values()) {
2065 TableIntegrityErrorHandler handler = tInfo.new IntegrityFixSuggester(tInfo, errors);
2066 if (!tInfo.checkRegionChain(handler)) {
2067 errors.report("Found inconsistency in table " + tInfo.getName());
2068 }
2069 }
2070 return tablesInfo;
2071 }
2072
2073 /**
2074 * Loads table info for tables that have a directory in HDFS but no regions listed in hbase:meta.
2075 */
2076 private void loadTableInfosForTablesWithNoRegion() throws IOException {
2077 Map<String, HTableDescriptor> allTables = new FSTableDescriptors(getConf()).getAll();
2078 for (HTableDescriptor htd : allTables.values()) {
2079 if (checkMetaOnly && !htd.isMetaTable()) {
2080 continue;
2081 }
2082
2083 TableName tableName = htd.getTableName();
2084 if (isTableIncluded(tableName) && !tablesInfo.containsKey(tableName)) {
2085 TableInfo tableInfo = new TableInfo(tableName);
2086 tableInfo.htds.add(htd);
2087 tablesInfo.put(htd.getTableName(), tableInfo);
2088 }
2089 }
2090 }
2091
2092 /**
2093 * Merge HDFS data by moving the store files of the contained region into
2094 * targetRegionDir, then sideline the contained region dir. Returns the number of files moved.
2095 */
2096 public int mergeRegionDirs(Path targetRegionDir, HbckInfo contained) throws IOException {
2097 int fileMoves = 0;
2098 String thread = Thread.currentThread().getName();
2099 LOG.debug("[" + thread + "] Contained region dir after close and pause");
2100 debugLsr(contained.getHdfsRegionDir());
2101
2102
2103 FileSystem fs = targetRegionDir.getFileSystem(getConf());
2104 FileStatus[] dirs = null;
2105 try {
2106 dirs = fs.listStatus(contained.getHdfsRegionDir());
2107 } catch (FileNotFoundException fnfe) {
2108
2109
2110 if (!fs.exists(contained.getHdfsRegionDir())) {
2111 LOG.warn("[" + thread + "] HDFS region dir " + contained.getHdfsRegionDir()
2112 + " is missing. Assuming already sidelined or moved.");
2113 } else {
2114 sidelineRegionDir(fs, contained);
2115 }
2116 return fileMoves;
2117 }
2118
2119 if (dirs == null) {
2120 if (!fs.exists(contained.getHdfsRegionDir())) {
2121 LOG.warn("[" + thread + "] HDFS region dir " + contained.getHdfsRegionDir()
2122 + " already sidelined.");
2123 } else {
2124 sidelineRegionDir(fs, contained);
2125 }
2126 return fileMoves;
2127 }
2128
2129 for (FileStatus cf : dirs) {
2130 Path src = cf.getPath();
2131 Path dst = new Path(targetRegionDir, src.getName());
2132
2133 if (src.getName().equals(HRegionFileSystem.REGION_INFO_FILE)) {
2134
2135 continue;
2136 }
2137
2138 if (src.getName().equals(HConstants.HREGION_OLDLOGDIR_NAME)) {
2139
2140 continue;
2141 }
2142
2143 LOG.info("[" + thread + "] Moving files from " + src + " into containing region " + dst);
2144
2145
2146
2147
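// Move each store file from this column family dir of the contained region into the matching dir of the target region.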
2148 for (FileStatus hfile : fs.listStatus(src)) {
2149 boolean success = fs.rename(hfile.getPath(), dst);
2150 if (success) {
2151 fileMoves++;
2152 }
2153 }
2154 LOG.debug("[" + thread + "] Sideline directory contents:");
2155 debugLsr(targetRegionDir);
2156 }
2157
2158
2159 sidelineRegionDir(fs, contained);
2160 LOG.info("[" + thread + "] Sidelined region dir "+ contained.getHdfsRegionDir() + " into " +
2161 getSidelineDir());
2162 debugLsr(contained.getHdfsRegionDir());
2163
2164 return fileMoves;
2165 }
2166
2167 // Callable that asks the table integrity handler to process one group of overlapping regions.
2168 static class WorkItemOverlapMerge implements Callable<Void> {
2169 private TableIntegrityErrorHandler handler;
2170 Collection<HbckInfo> overlapgroup;
2171
2172 WorkItemOverlapMerge(Collection<HbckInfo> overlapgroup, TableIntegrityErrorHandler handler) {
2173 this.handler = handler;
2174 this.overlapgroup = overlapgroup;
2175 }
2176
2177 @Override
2178 public Void call() throws Exception {
2179 handler.handleOverlapGroup(overlapgroup);
2180 return null;
2181 }
2182 };
2183
2184
2185 /**
2186 * Maintains information about a single table: its regions, servers, descriptors and overlap groups.
2187 */
2188 public class TableInfo {
2189 TableName tableName;
2190 TreeSet <ServerName> deployedOn;
2191
2192 // regions whose start key sorts after their end key
2193 final List<HbckInfo> backwards = new ArrayList<HbckInfo>();
2194
2195
2196 final Map<Path, HbckInfo> sidelinedRegions = new HashMap<Path, HbckInfo>();
2197
2198
2199 final RegionSplitCalculator<HbckInfo> sc = new RegionSplitCalculator<HbckInfo>(cmp);
2200
2201 // table descriptors seen for this table's regions; ideally there is exactly one
2202 final Set<HTableDescriptor> htds = new HashSet<HTableDescriptor>();
2203
2204 // key = start key of a problem group, values = the overlapping regions in that group
2205 final Multimap<byte[], HbckInfo> overlapGroups =
2206 TreeMultimap.create(RegionSplitCalculator.BYTES_COMPARATOR, cmp);
2207
2208 TableInfo(TableName name) {
2209 this.tableName = name;
2210 deployedOn = new TreeSet <ServerName>();
2211 }
2212
2213 /**
2214 * @return the table descriptor shared by this table's regions, or null if there are none or several
2215 */
2216 private HTableDescriptor getHTD() {
2217 if (htds.size() == 1) {
2218 return (HTableDescriptor)htds.toArray()[0];
2219 } else {
2220 LOG.error("None/Multiple table descriptors found for table '"
2221 + tableName + "' regions: " + htds);
2222 }
2223 return null;
2224 }
2225
2226 public void addRegionInfo(HbckInfo hir) {
2227 if (Bytes.equals(hir.getEndKey(), HConstants.EMPTY_END_ROW)) {
2228
2229 sc.add(hir);
2230 return;
2231 }
2232
2233
2234 if (Bytes.compareTo(hir.getStartKey(), hir.getEndKey()) > 0) {
2235 errors.reportError(
2236 ERROR_CODE.REGION_CYCLE,
2237 String.format("The endkey for this region comes before the "
2238 + "startkey, startkey=%s, endkey=%s",
2239 Bytes.toStringBinary(hir.getStartKey()),
2240 Bytes.toStringBinary(hir.getEndKey())), this, hir);
2241 backwards.add(hir);
2242 return;
2243 }
2244
2245
2246 sc.add(hir);
2247 }
2248
2249 public void addServer(ServerName server) {
2250 this.deployedOn.add(server);
2251 }
2252
2253 public TableName getName() {
2254 return tableName;
2255 }
2256
2257 public int getNumRegions() {
2258 return sc.getStarts().size() + backwards.size();
2259 }
2260
2261 private class IntegrityFixSuggester extends TableIntegrityErrorHandlerImpl {
2262 ErrorReporter errors;
2263
2264 IntegrityFixSuggester(TableInfo ti, ErrorReporter errors) {
2265 this.errors = errors;
2266 setTableInfo(ti);
2267 }
2268
2269 @Override
2270 public void handleRegionStartKeyNotEmpty(HbckInfo hi) throws IOException{
2271 errors.reportError(ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY,
2272 "First region should start with an empty key. You need to "
2273 + "create a new region and regioninfo in HDFS to plug the hole.",
2274 getTableInfo(), hi);
2275 }
2276
2277 @Override
2278 public void handleRegionEndKeyNotEmpty(byte[] curEndKey) throws IOException {
2279 errors.reportError(ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY,
2280 "Last region should end with an empty key. You need to "
2281 + "create a new region and regioninfo in HDFS to plug the hole.", getTableInfo());
2282 }
2283
2284 @Override
2285 public void handleDegenerateRegion(HbckInfo hi) throws IOException{
2286 errors.reportError(ERROR_CODE.DEGENERATE_REGION,
2287 "Region has the same start and end key.", getTableInfo(), hi);
2288 }
2289
2290 @Override
2291 public void handleDuplicateStartKeys(HbckInfo r1, HbckInfo r2) throws IOException{
2292 byte[] key = r1.getStartKey();
2293
2294 errors.reportError(ERROR_CODE.DUPE_STARTKEYS,
2295 "Multiple regions have the same startkey: "
2296 + Bytes.toStringBinary(key), getTableInfo(), r1);
2297 errors.reportError(ERROR_CODE.DUPE_STARTKEYS,
2298 "Multiple regions have the same startkey: "
2299 + Bytes.toStringBinary(key), getTableInfo(), r2);
2300 }
2301
2302 @Override
2303 public void handleOverlapInRegionChain(HbckInfo hi1, HbckInfo hi2) throws IOException{
2304 errors.reportError(ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
2305 "There is an overlap in the region chain.",
2306 getTableInfo(), hi1, hi2);
2307 }
2308
2309 @Override
2310 public void handleHoleInRegionChain(byte[] holeStart, byte[] holeStop) throws IOException{
2311 errors.reportError(
2312 ERROR_CODE.HOLE_IN_REGION_CHAIN,
2313 "There is a hole in the region chain between "
2314 + Bytes.toStringBinary(holeStart) + " and "
2315 + Bytes.toStringBinary(holeStop)
2316 + ". You need to create a new .regioninfo and region "
2317 + "dir in hdfs to plug the hole.");
2318 }
2319 };
2320
2321
2322
2323
2324
2325
2326 /**
2327 * Handler that repairs integrity errors from the HDFS point of view: it plugs holes
2328 * (including a missing first or last region) by creating new empty regions on HDFS and,
2329 * when fixOverlaps is enabled, merges or sidelines overlapping regions. It extends
2330 * IntegrityFixSuggester, so every problem is still reported before a fix is attempted.
2331 */
2332
2333 private class HDFSIntegrityFixer extends IntegrityFixSuggester {
2334 Configuration conf;
2335
2336 boolean fixOverlaps = true;
2337
2338 HDFSIntegrityFixer(TableInfo ti, ErrorReporter errors, Configuration conf,
2339 boolean fixHoles, boolean fixOverlaps) {
2340 super(ti, errors);
2341 this.conf = conf;
2342 this.fixOverlaps = fixOverlaps;
2343
2344 }
2345
2346
2347 /**
2348 * The first region of the table does not start with the empty key. Plug the hole by
2349 * creating a new empty region from the empty start row up to the observed start key.
2350 */
2351 @Override
2352 public void handleRegionStartKeyNotEmpty(HbckInfo next) throws IOException {
2353 errors.reportError(ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY,
2354 "First region should start with an empty key. Creating a new " +
2355 "region and regioninfo in HDFS to plug the hole.",
2356 getTableInfo(), next);
2357 HTableDescriptor htd = getTableInfo().getHTD();
2358
2359 HRegionInfo newRegion = new HRegionInfo(htd.getTableName(),
2360 HConstants.EMPTY_START_ROW, next.getStartKey());
2361
2362
2363 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2364 LOG.info("Table region start key was not empty. Created new empty region: "
2365 + newRegion + " " + region);
2366 fixes++;
2367 }
2368
2369 @Override
2370 public void handleRegionEndKeyNotEmpty(byte[] curEndKey) throws IOException {
2371 errors.reportError(ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY,
2372 "Last region should end with an empty key. Creating a new "
2373 + "region and regioninfo in HDFS to plug the hole.", getTableInfo());
2374 HTableDescriptor htd = getTableInfo().getHTD();
2375
2376 HRegionInfo newRegion = new HRegionInfo(htd.getTableName(), curEndKey,
2377 HConstants.EMPTY_START_ROW);
2378
2379 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2380 LOG.info("Table region end key was not empty. Created new empty region: " + newRegion
2381 + " " + region);
2382 fixes++;
2383 }
2384
2385
2386 /**
2387 * A hole was found between two regions. Create a new empty region on HDFS to cover it.
2388 */
2389 @Override
2390 public void handleHoleInRegionChain(byte[] holeStartKey, byte[] holeStopKey) throws IOException {
2391 errors.reportError(
2392 ERROR_CODE.HOLE_IN_REGION_CHAIN,
2393 "There is a hole in the region chain between "
2394 + Bytes.toStringBinary(holeStartKey) + " and "
2395 + Bytes.toStringBinary(holeStopKey)
2396 + ". Creating a new regioninfo and region "
2397 + "dir in hdfs to plug the hole.");
2398 HTableDescriptor htd = getTableInfo().getHTD();
2399 HRegionInfo newRegion = new HRegionInfo(htd.getTableName(), holeStartKey, holeStopKey);
2400 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2401 LOG.info("Plugged hole by creating new empty region: " + newRegion + " " + region);
2402 fixes++;
2403 }
2404
2405
2406
2407
2408
2409
2410
2411 /**
2412 * Fix an overlap group by merging all of its regions into a single new region.
2413 * Groups larger than maxMerge are not merged; when sidelineBigOverlaps is set,
2414 * some of their regions are sidelined instead.
2415 */
2416 @Override
2417 public void handleOverlapGroup(Collection<HbckInfo> overlap)
2418 throws IOException {
2419 Preconditions.checkNotNull(overlap);
2420 Preconditions.checkArgument(overlap.size() > 0);
2421
2422 if (!this.fixOverlaps) {
2423 LOG.warn("Not attempting to repair overlaps.");
2424 return;
2425 }
2426
2427 if (overlap.size() > maxMerge) {
2428 LOG.warn("Overlap group has " + overlap.size() + " overlapping " +
2429 "regions which is greater than " + maxMerge + ", the max number of regions to merge");
2430 if (sidelineBigOverlaps) {
2431
2432 sidelineBigOverlaps(overlap);
2433 }
2434 return;
2435 }
2436
2437 mergeOverlaps(overlap);
2438 }
2439
2440 void mergeOverlaps(Collection<HbckInfo> overlap)
2441 throws IOException {
2442 String thread = Thread.currentThread().getName();
2443 LOG.info("== [" + thread + "] Merging regions into one region: "
2444 + Joiner.on(",").join(overlap));
2445
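// Grow the target key range so the merged region will span every region in the overlap group.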
2446 Pair<byte[], byte[]> range = null;
2447 for (HbckInfo hi : overlap) {
2448 if (range == null) {
2449 range = new Pair<byte[], byte[]>(hi.getStartKey(), hi.getEndKey());
2450 } else {
2451 if (RegionSplitCalculator.BYTES_COMPARATOR
2452 .compare(hi.getStartKey(), range.getFirst()) < 0) {
2453 range.setFirst(hi.getStartKey());
2454 }
2455 if (RegionSplitCalculator.BYTES_COMPARATOR
2456 .compare(hi.getEndKey(), range.getSecond()) > 0) {
2457 range.setSecond(hi.getEndKey());
2458 }
2459 }
2460
2461 LOG.debug("[" + thread + "] Closing region before moving data around: " + hi);
2462 LOG.debug("[" + thread + "] Contained region dir before close");
2463 debugLsr(hi.getHdfsRegionDir());
2464 try {
2465 LOG.info("[" + thread + "] Closing region: " + hi);
2466 closeRegion(hi);
2467 } catch (IOException ioe) {
2468 LOG.warn("[" + thread + "] Was unable to close region " + hi
2469 + ". Just continuing... ", ioe);
2470 } catch (InterruptedException e) {
2471 LOG.warn("[" + thread + "] Was unable to close region " + hi
2472 + ". Just continuing... ", e);
2473 }
2474
2475 try {
2476 LOG.info("[" + thread + "] Offlining region: " + hi);
2477 offline(hi.getRegionName());
2478 } catch (IOException ioe) {
2479 LOG.warn("[" + thread + "] Unable to offline region from master: " + hi
2480 + ". Just continuing... ", ioe);
2481 }
2482 }
2483
2484
2485 HTableDescriptor htd = getTableInfo().getHTD();
2486
2487 HRegionInfo newRegion = new HRegionInfo(htd.getTableName(), range.getFirst(),
2488 range.getSecond());
2489 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2490 LOG.info("[" + thread + "] Created new empty container region: " +
2491 newRegion + " to contain regions: " + Joiner.on(",").join(overlap));
2492 debugLsr(region.getRegionFileSystem().getRegionDir());
2493
2494
2495 boolean didFix = false;
2496 Path target = region.getRegionFileSystem().getRegionDir();
2497 for (HbckInfo contained : overlap) {
2498 LOG.info("[" + thread + "] Merging " + contained + " into " + target);
2499 int merges = mergeRegionDirs(target, contained);
2500 if (merges > 0) {
2501 didFix = true;
2502 }
2503 }
2504 if (didFix) {
2505 fixes++;
2506 }
2507 }
2508
2509
2510
2511 /**
2512 * Sideline some regions in a big overlap group so that the remaining group is small
2513 * enough (at most maxMerge regions) to merge. At most maxOverlapsToSideline regions are
2514 * sidelined; the sidelined region dirs need to be bulk loaded back afterwards.
2515 */
2516 void sidelineBigOverlaps(
2517 Collection<HbckInfo> bigOverlap) throws IOException {
2518 int overlapsToSideline = bigOverlap.size() - maxMerge;
2519 if (overlapsToSideline > maxOverlapsToSideline) {
2520 overlapsToSideline = maxOverlapsToSideline;
2521 }
2522 List<HbckInfo> regionsToSideline =
2523 RegionSplitCalculator.findBigRanges(bigOverlap, overlapsToSideline);
2524 FileSystem fs = FileSystem.get(conf);
2525 for (HbckInfo regionToSideline: regionsToSideline) {
2526 try {
2527 LOG.info("Closing region: " + regionToSideline);
2528 closeRegion(regionToSideline);
2529 } catch (IOException ioe) {
2530 LOG.warn("Was unable to close region " + regionToSideline
2531 + ". Just continuing... ", ioe);
2532 } catch (InterruptedException e) {
2533 LOG.warn("Was unable to close region " + regionToSideline
2534 + ". Just continuing... ", e);
2535 }
2536
2537 try {
2538 LOG.info("Offlining region: " + regionToSideline);
2539 offline(regionToSideline.getRegionName());
2540 } catch (IOException ioe) {
2541 LOG.warn("Unable to offline region from master: " + regionToSideline
2542 + ". Just continuing... ", ioe);
2543 }
2544
2545 LOG.info("Before sideline big overlapped region: " + regionToSideline.toString());
2546 Path sidelineRegionDir = sidelineRegionDir(fs, TO_BE_LOADED, regionToSideline);
2547 if (sidelineRegionDir != null) {
2548 sidelinedRegions.put(sidelineRegionDir, regionToSideline);
2549 LOG.info("After sidelined big overlapped region: "
2550 + regionToSideline.getRegionNameAsString()
2551 + " to " + sidelineRegionDir.toString());
2552 fixes++;
2553 }
2554 }
2555 }
2556 }
2557
2558
2559 /**
2560 * Check the region chain (from hbase:meta) of this table: look for holes, overlaps,
2561 * duplicate start keys and degenerate regions.
2562 * @return false if any new errors were reported
2563 */
2564 public boolean checkRegionChain(TableIntegrityErrorHandler handler) throws IOException {
2565
2566
2567
2568 if (disabledTables.contains(this.tableName)) {
2569 return true;
2570 }
2571 int originalErrorsCount = errors.getErrorList().size();
2572 Multimap<byte[], HbckInfo> regions = sc.calcCoverage();
2573 SortedSet<byte[]> splits = sc.getSplits();
2574
2575 byte[] prevKey = null;
2576 byte[] problemKey = null;
2577
2578 if (splits.size() == 0) {
2579
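// No split keys at all means no regions were loaded for this table, so the entire key space is a hole.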
2580 handler.handleHoleInRegionChain(HConstants.EMPTY_START_ROW, HConstants.EMPTY_END_ROW);
2581 }
2582
2583 for (byte[] key : splits) {
2584 Collection<HbckInfo> ranges = regions.get(key);
2585 if (prevKey == null && !Bytes.equals(key, HConstants.EMPTY_BYTE_ARRAY)) {
2586 for (HbckInfo rng : ranges) {
2587 handler.handleRegionStartKeyNotEmpty(rng);
2588 }
2589 }
2590
2591
2592 for (HbckInfo rng : ranges) {
2593
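// An empty end key means the region runs to the end of the table; map it to null so such a
// region is not mistaken for a degenerate (start == end) region.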
2594 byte[] endKey = rng.getEndKey();
2595 endKey = (endKey.length == 0) ? null : endKey;
2596 if (Bytes.equals(rng.getStartKey(),endKey)) {
2597 handler.handleDegenerateRegion(rng);
2598 }
2599 }
2600
2601 if (ranges.size() == 1) {
2602
2603 if (problemKey != null) {
2604 LOG.warn("reached end of problem group: " + Bytes.toStringBinary(key));
2605 }
2606 problemKey = null;
2607 } else if (ranges.size() > 1) {
2608
2609
2610 if (problemKey == null) {
2611
2612 LOG.warn("Naming new problem group: " + Bytes.toStringBinary(key));
2613 problemKey = key;
2614 }
2615 overlapGroups.putAll(problemKey, ranges);
2616
2617
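// Compare each pair of regions in the problem group once, classifying each pair as duplicate start keys or a plain overlap.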
2618 ArrayList<HbckInfo> subRange = new ArrayList<HbckInfo>(ranges);
2619
2620 for (HbckInfo r1 : ranges) {
2621 subRange.remove(r1);
2622 for (HbckInfo r2 : subRange) {
2623 if (Bytes.compareTo(r1.getStartKey(), r2.getStartKey())==0) {
2624 handler.handleDuplicateStartKeys(r1,r2);
2625 } else {
2626
2627 handler.handleOverlapInRegionChain(r1, r2);
2628 }
2629 }
2630 }
2631
2632 } else if (ranges.size() == 0) {
2633 if (problemKey != null) {
2634 LOG.warn("reached end of problem group: " + Bytes.toStringBinary(key));
2635 }
2636 problemKey = null;
2637
2638 byte[] holeStopKey = sc.getSplits().higher(key);
2639
2640 if (holeStopKey != null) {
2641
2642 handler.handleHoleInRegionChain(key, holeStopKey);
2643 }
2644 }
2645 prevKey = key;
2646 }
2647
2648
2649
2650 if (prevKey != null) {
2651 handler.handleRegionEndKeyNotEmpty(prevKey);
2652 }
2653
2654
2655 if (getConf().getBoolean("hbasefsck.overlap.merge.parallel", true)) {
2656 LOG.info("Handling overlap merges in parallel. Set hbasefsck.overlap.merge.parallel to" +
2657 " false to run serially.");
2658 boolean ok = handleOverlapsParallel(handler, prevKey);
2659 if (!ok) {
2660 return false;
2661 }
2662 } else {
2663 LOG.info("Handling overlap merges serially. Set hbasefsck.overlap.merge.parallel to" +
2664 " true to run in parallel.");
2665 for (Collection<HbckInfo> overlap : overlapGroups.asMap().values()) {
2666 handler.handleOverlapGroup(overlap);
2667 }
2668 }
2669
2670 if (details) {
2671
2672 errors.print("---- Table '" + this.tableName
2673 + "': region split map");
2674 dump(splits, regions);
2675 errors.print("---- Table '" + this.tableName
2676 + "': overlap groups");
2677 dumpOverlapProblems(overlapGroups);
2678 errors.print("There are " + overlapGroups.keySet().size()
2679 + " overlap groups with " + overlapGroups.size()
2680 + " overlapping regions");
2681 }
2682 if (!sidelinedRegions.isEmpty()) {
2683 LOG.warn("Sidelined big overlapped regions, please bulk load them!");
2684 errors.print("---- Table '" + this.tableName
2685 + "': sidelined big overlapped regions");
2686 dumpSidelinedRegions(sidelinedRegions);
2687 }
2688 return errors.getErrorList().size() == originalErrorsCount;
2689 }
2690
2691 private boolean handleOverlapsParallel(TableIntegrityErrorHandler handler, byte[] prevKey)
2692 throws IOException {
2693
2694
2695 List<WorkItemOverlapMerge> merges = new ArrayList<WorkItemOverlapMerge>(overlapGroups.size());
2696 List<Future<Void>> rets;
2697 for (Collection<HbckInfo> overlap : overlapGroups.asMap().values()) {
2698
2699 merges.add(new WorkItemOverlapMerge(overlap, handler));
2700 }
2701 try {
2702 rets = executor.invokeAll(merges);
2703 } catch (InterruptedException e) {
2704 LOG.error("Overlap merges were interrupted", e);
2705 return false;
2706 }
2707 for(int i=0; i<merges.size(); i++) {
2708 WorkItemOverlapMerge work = merges.get(i);
2709 Future<Void> f = rets.get(i);
2710 try {
2711 f.get();
2712 } catch(ExecutionException e) {
2713 LOG.warn("Failed to merge overlap group " + work, e.getCause());
2714 } catch (InterruptedException e) {
2715 LOG.error("Waiting for overlap merges was interrupted", e);
2716 return false;
2717 }
2718 }
2719 return true;
2720 }
2721
2722
2723
2724 /**
2725 * Dumps each split key and the regions starting at it, in a form convenient for
2726 * visual debugging of the region chain.
2727 */
2728 void dump(SortedSet<byte[]> splits, Multimap<byte[], HbckInfo> regions) {
2729
2730 StringBuilder sb = new StringBuilder();
2731 for (byte[] k : splits) {
2732 sb.setLength(0);
2733 sb.append(Bytes.toStringBinary(k) + ":\t");
2734 for (HbckInfo r : regions.get(k)) {
2735 sb.append("[ "+ r.toString() + ", "
2736 + Bytes.toStringBinary(r.getEndKey())+ "]\t");
2737 }
2738 errors.print(sb.toString());
2739 }
2740 }
2741 }
2742
2743 public void dumpOverlapProblems(Multimap<byte[], HbckInfo> regions) {
2744
2745
2746 for (byte[] k : regions.keySet()) {
2747 errors.print(Bytes.toStringBinary(k) + ":");
2748 for (HbckInfo r : regions.get(k)) {
2749 errors.print("[ " + r.toString() + ", "
2750 + Bytes.toStringBinary(r.getEndKey()) + "]");
2751 }
2752 errors.print("----");
2753 }
2754 }
2755
2756 public void dumpSidelinedRegions(Map<Path, HbckInfo> regions) {
2757 for (Map.Entry<Path, HbckInfo> entry: regions.entrySet()) {
2758 TableName tableName = entry.getValue().getTableName();
2759 Path path = entry.getKey();
2760 errors.print("This sidelined region dir should be bulk loaded: "
2761 + path.toString());
2762 errors.print("Bulk load command looks like: "
2763 + "hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles "
2764 + path.toUri().getPath() + " "+ tableName);
2765 }
2766 }
2767
2768 public Multimap<byte[], HbckInfo> getOverlapGroups(
2769 TableName table) {
2770 TableInfo ti = tablesInfo.get(table);
2771 return ti.overlapGroups;
2772 }
2773
2774
2775
2776
2777 /**
2778 * Return descriptors of the user-space tables whose hbase:meta entries have not been
2779 * modified within the configured timelag. Tables modified more recently are skipped
2780 * and counted in numSkipped.
2781 * @return descriptors of tables that have not been modified recently
2782 */
2783 HTableDescriptor[] getTables(AtomicInteger numSkipped) {
2784 List<TableName> tableNames = new ArrayList<TableName>();
2785 long now = System.currentTimeMillis();
2786
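// Only the first region (empty start key) of each non-meta table is considered, so every table is evaluated exactly once.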
2787 for (HbckInfo hbi : regionInfoMap.values()) {
2788 MetaEntry info = hbi.metaEntry;
2789
2790
2791
2792 if (info != null && info.getStartKey().length == 0 && !info.isMetaRegion()) {
2793 if (info.modTime + timelag < now) {
2794 tableNames.add(info.getTable());
2795 } else {
2796 numSkipped.incrementAndGet();
2797 }
2798 }
2799 }
2800 return getHTableDescriptors(tableNames);
2801 }
2802
2803 HTableDescriptor[] getHTableDescriptors(List<TableName> tableNames) {
2804 HTableDescriptor[] htd = new HTableDescriptor[0];
2805 try {
2806 LOG.info("getHTableDescriptors == tableNames => " + tableNames);
2807 htd = new HBaseAdmin(getConf()).getTableDescriptorsByTableName(tableNames);
2808 } catch (IOException e) {
2809 LOG.debug("Exception getting table descriptors", e);
2810 }
2811 return htd;
2812 }
2813
2814
2815 /**
2816 * Gets the entry in regionInfoMap for the given encoded region name.
2817 * If the region has not been seen yet, a new entry is added and returned.
2818 */
2819 private synchronized HbckInfo getOrCreateInfo(String name) {
2820 HbckInfo hbi = regionInfoMap.get(name);
2821 if (hbi == null) {
2822 hbi = new HbckInfo(null);
2823 regionInfoMap.put(name, hbi);
2824 }
2825 return hbi;
2826 }
2827
2828 private void checkAndFixTableLocks() throws IOException {
2829 TableLockChecker checker = new TableLockChecker(createZooKeeperWatcher(), errors);
2830 checker.checkTableLocks();
2831
2832 if (this.fixTableLocks) {
2833 checker.fixExpiredTableLocks();
2834 }
2835 }
2836
2837
2838
2839
2840 /**
2841 * Check that hbase:meta is deployed on exactly one region server. If it is deployed
2842 * nowhere, or on more than one server, report an error and, with -fixAssignments,
2843 * try to repair the assignment.
2844 * @return true if hbase:meta is deployed on exactly one region server
2845 */
2846 boolean checkMetaRegion() throws IOException, KeeperException, InterruptedException {
2847 List<HbckInfo> metaRegions = Lists.newArrayList();
2848 for (HbckInfo value : regionInfoMap.values()) {
2849 if (value.metaEntry != null && value.metaEntry.isMetaRegion()) {
2850 metaRegions.add(value);
2851 }
2852 }
2853
2854
2855
2856 List<ServerName> servers = new ArrayList<ServerName>();
2857 HbckInfo metaHbckInfo = null;
2858 if (!metaRegions.isEmpty()) {
2859 metaHbckInfo = metaRegions.get(0);
2860 servers = metaHbckInfo.deployedOn;
2861 }
2862 if (servers.size() != 1) {
2863 if (servers.size() == 0) {
2864 errors.reportError(ERROR_CODE.NO_META_REGION, "hbase:meta is not deployed on any region server.");
2865 if (shouldFixAssignments()) {
2866 errors.print("Trying to fix a problem with hbase:meta...");
2867 setShouldRerun();
2868
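// Assign hbase:meta to a server and wait for the assignment to complete.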
2869 HBaseFsckRepair.fixUnassigned(admin, HRegionInfo.FIRST_META_REGIONINFO);
2870 HBaseFsckRepair.waitUntilAssigned(admin, HRegionInfo.FIRST_META_REGIONINFO);
2871 }
2872 } else if (servers.size() > 1) {
2873 errors
2874 .reportError(ERROR_CODE.MULTI_META_REGION, "hbase:meta is deployed on more than one region server.");
2875 if (shouldFixAssignments()) {
2876 if (metaHbckInfo == null) {
2877 errors.print(
2878 "Unable to fix problem with hbase:meta due to hbase:meta region info missing");
2879 return false;
2880 }
2881 errors.print("Trying to fix a problem with hbase:meta..");
2882 setShouldRerun();
2883
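// Close the extra deployments of hbase:meta and reassign it to a single server.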
2884 HBaseFsckRepair.fixMultiAssignment(admin, metaHbckInfo.metaEntry, servers);
2885 }
2886 }
2887
2888 return false;
2889 }
2890
2891 return true;
2892 }
2893
2894
2895 /**
2896 * Scan hbase:meta, adding every region found to the regionInfo map.
2897 */
2898 boolean loadMetaEntries() throws IOException {
2899 MetaScannerVisitor visitor = new MetaScannerVisitorBase() {
2900 int countRecord = 1;
2901
2902
2903 final Comparator<Cell> comp = new Comparator<Cell>() {
2904 @Override
2905 public int compare(Cell k1, Cell k2) {
2906 return Long.compare(k1.getTimestamp(), k2.getTimestamp()); // avoids int overflow on large timestamp gaps
2907 }
2908 };
2909
2910 @Override
2911 public boolean processRow(Result result) throws IOException {
2912 try {
2913
2914
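// The newest cell timestamp in this row is used as the meta entry's modification time.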
2915 long ts = Collections.max(result.listCells(), comp).getTimestamp();
2916 Pair<HRegionInfo, ServerName> pair = HRegionInfo.getHRegionInfoAndServerName(result);
2917 if (pair == null || pair.getFirst() == null) {
2918 emptyRegionInfoQualifiers.add(result);
2919 errors.reportError(ERROR_CODE.EMPTY_META_CELL,
2920 "Empty REGIONINFO_QUALIFIER found in hbase:meta");
2921 return true;
2922 }
2923 ServerName sn = null;
2924 if (pair.getSecond() != null) {
2925 sn = pair.getSecond();
2926 }
2927 HRegionInfo hri = pair.getFirst();
2928 if (!(isTableIncluded(hri.getTable())
2929 || hri.isMetaRegion())) {
2930 return true;
2931 }
2932 PairOfSameType<HRegionInfo> daughters = HRegionInfo.getDaughterRegions(result);
2933 MetaEntry m = new MetaEntry(hri, sn, ts, daughters.getFirst(), daughters.getSecond());
2934 HbckInfo previous = regionInfoMap.get(hri.getEncodedName());
2935 if (previous == null) {
2936 regionInfoMap.put(hri.getEncodedName(), new HbckInfo(m));
2937 } else if (previous.metaEntry == null) {
2938 previous.metaEntry = m;
2939 } else {
2940 throw new IOException("Two entries in hbase:meta are the same: " + previous);
2941 }
2942
2943 PairOfSameType<HRegionInfo> mergeRegions = HRegionInfo.getMergeRegions(result);
2944 for (HRegionInfo mergeRegion : new HRegionInfo[] {
2945 mergeRegions.getFirst(), mergeRegions.getSecond() }) {
2946 if (mergeRegion != null) {
2947
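// Remember that this region was an input to a merge so later checks can treat it accordingly.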
2948 HbckInfo hbInfo = getOrCreateInfo(mergeRegion.getEncodedName());
2949 hbInfo.setMerged(true);
2950 }
2951 }
2952
2953
2954 if (countRecord % 100 == 0) {
2955 errors.progress();
2956 }
2957 countRecord++;
2958 return true;
2959 } catch (RuntimeException e) {
2960 LOG.error("Result=" + result);
2961 throw e;
2962 }
2963 }
2964 };
2965 if (!checkMetaOnly) {
2966
2967 MetaScanner.metaScan(getConf(), visitor);
2968 }
2969
2970 errors.print("");
2971 return true;
2972 }
2973
2974 /**
2975 * Stores a regioninfo entry scanned from hbase:meta, plus the server and modification time.
2976 */
2977 static class MetaEntry extends HRegionInfo {
2978 ServerName regionServer;
2979 long modTime;
2980 HRegionInfo splitA, splitB;
2981
2982 public MetaEntry(HRegionInfo rinfo, ServerName regionServer, long modTime) {
2983 this(rinfo, regionServer, modTime, null, null);
2984 }
2985
2986 public MetaEntry(HRegionInfo rinfo, ServerName regionServer, long modTime,
2987 HRegionInfo splitA, HRegionInfo splitB) {
2988 super(rinfo);
2989 this.regionServer = regionServer;
2990 this.modTime = modTime;
2991 this.splitA = splitA;
2992 this.splitB = splitB;
2993 }
2994
2995 @Override
2996 public boolean equals(Object o) {
2997 boolean superEq = super.equals(o);
2998 if (!superEq) {
2999 return superEq;
3000 }
3001
3002 MetaEntry me = (MetaEntry) o;
3003 if (!regionServer.equals(me.regionServer)) {
3004 return false;
3005 }
3006 return (modTime == me.modTime);
3007 }
3008
3009 @Override
3010 public int hashCode() {
3011 int hash = Arrays.hashCode(getRegionName());
3012 hash ^= getRegionId();
3013 hash ^= Arrays.hashCode(getStartKey());
3014 hash ^= Arrays.hashCode(getEndKey());
3015 hash ^= Boolean.valueOf(isOffline()).hashCode();
3016 hash ^= getTable().hashCode();
3017 if (regionServer != null) {
3018 hash ^= regionServer.hashCode();
3019 }
3020 hash ^= modTime;
3021 return hash;
3022 }
3023 }
3024
3025 /**
3026 * Stores the region information found on HDFS for one region.
3027 */
3028 static class HdfsEntry {
3029 HRegionInfo hri;
3030 Path hdfsRegionDir = null;
3031 long hdfsRegionDirModTime = 0;
3032 boolean hdfsRegioninfoFilePresent = false;
3033 boolean hdfsOnlyEdits = false;
3034 }
3035
3036 /**
3037 * Stores the region information reported by an online region server.
3038 */
3039 static class OnlineEntry {
3040 HRegionInfo hri;
3041 ServerName hsa;
3042
3043 @Override
3044 public String toString() {
3045 return hsa.toString() + ";" + hri.getRegionNameAsString();
3046 }
3047 }
3048
3049 /**
3050 * Maintains information about a particular region, gathered from three places:
3051 * HDFS, hbase:meta, and the online region servers.
3052 */
3053 public static class HbckInfo implements KeyRange {
3054 private MetaEntry metaEntry = null;
3055 private HdfsEntry hdfsEntry = null;
3056 private List<OnlineEntry> deployedEntries = Lists.newArrayList();
3057 private List<ServerName> deployedOn = Lists.newArrayList();
3058 private boolean skipChecks = false;
3059 private boolean isMerged = false;
3060
3061 HbckInfo(MetaEntry metaEntry) {
3062 this.metaEntry = metaEntry;
3063 }
3064
3065 public synchronized void addServer(HRegionInfo hri, ServerName server) {
3066 OnlineEntry rse = new OnlineEntry() ;
3067 rse.hri = hri;
3068 rse.hsa = server;
3069 this.deployedEntries.add(rse);
3070 this.deployedOn.add(server);
3071 }
3072
3073 @Override
3074 public synchronized String toString() {
3075 StringBuilder sb = new StringBuilder();
3076 sb.append("{ meta => ");
3077 sb.append((metaEntry != null)? metaEntry.getRegionNameAsString() : "null");
3078 sb.append( ", hdfs => " + getHdfsRegionDir());
3079 sb.append( ", deployed => " + Joiner.on(", ").join(deployedEntries));
3080 sb.append(" }");
3081 return sb.toString();
3082 }
3083
3084 @Override
3085 public byte[] getStartKey() {
3086 if (this.metaEntry != null) {
3087 return this.metaEntry.getStartKey();
3088 } else if (this.hdfsEntry != null) {
3089 return this.hdfsEntry.hri.getStartKey();
3090 } else {
3091 LOG.error("Entry " + this + " has no meta or hdfs region start key.");
3092 return null;
3093 }
3094 }
3095
3096 @Override
3097 public byte[] getEndKey() {
3098 if (this.metaEntry != null) {
3099 return this.metaEntry.getEndKey();
3100 } else if (this.hdfsEntry != null) {
3101 return this.hdfsEntry.hri.getEndKey();
3102 } else {
3103 LOG.error("Entry " + this + " has no meta or hdfs region end key.");
3104 return null;
3105 }
3106 }
3107
3108 public TableName getTableName() {
3109 if (this.metaEntry != null) {
3110 return this.metaEntry.getTable();
3111 } else if (this.hdfsEntry != null) {
3112
3113
3114 Path tableDir = this.hdfsEntry.hdfsRegionDir.getParent();
3115 return FSUtils.getTableName(tableDir);
3116 } else {
3117
3118
3119 return null;
3120 }
3121 }
3122
3123 public String getRegionNameAsString() {
3124 if (metaEntry != null) {
3125 return metaEntry.getRegionNameAsString();
3126 } else if (hdfsEntry != null) {
3127 if (hdfsEntry.hri != null) {
3128 return hdfsEntry.hri.getRegionNameAsString();
3129 }
3130 }
3131 return null;
3132 }
3133
3134 public byte[] getRegionName() {
3135 if (metaEntry != null) {
3136 return metaEntry.getRegionName();
3137 } else if (hdfsEntry != null) {
3138 return hdfsEntry.hri.getRegionName();
3139 } else {
3140 return null;
3141 }
3142 }
3143
3144 Path getHdfsRegionDir() {
3145 if (hdfsEntry == null) {
3146 return null;
3147 }
3148 return hdfsEntry.hdfsRegionDir;
3149 }
3150
3151 boolean containsOnlyHdfsEdits() {
3152 if (hdfsEntry == null) {
3153 return false;
3154 }
3155 return hdfsEntry.hdfsOnlyEdits;
3156 }
3157
3158 boolean isHdfsRegioninfoPresent() {
3159 if (hdfsEntry == null) {
3160 return false;
3161 }
3162 return hdfsEntry.hdfsRegioninfoFilePresent;
3163 }
3164
3165 long getModTime() {
3166 if (hdfsEntry == null) {
3167 return 0;
3168 }
3169 return hdfsEntry.hdfsRegionDirModTime;
3170 }
3171
3172 HRegionInfo getHdfsHRI() {
3173 if (hdfsEntry == null) {
3174 return null;
3175 }
3176 return hdfsEntry.hri;
3177 }
3178
3179 public void setSkipChecks(boolean skipChecks) {
3180 this.skipChecks = skipChecks;
3181 }
3182
3183 public boolean isSkipChecks() {
3184 return skipChecks;
3185 }
3186
3187 public void setMerged(boolean isMerged) {
3188 this.isMerged = isMerged;
3189 }
3190
3191 public boolean isMerged() {
3192 return this.isMerged;
3193 }
3194 }
3195
3196 final static Comparator<HbckInfo> cmp = new Comparator<HbckInfo>() {
3197 @Override
3198 public int compare(HbckInfo l, HbckInfo r) {
3199 if (l == r) {
3200
3201 return 0;
3202 }
3203
3204 int tableCompare = l.getTableName().compareTo(r.getTableName());
3205 if (tableCompare != 0) {
3206 return tableCompare;
3207 }
3208
3209 int startComparison = RegionSplitCalculator.BYTES_COMPARATOR.compare(
3210 l.getStartKey(), r.getStartKey());
3211 if (startComparison != 0) {
3212 return startComparison;
3213 }
3214
3215
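// Normalize empty end keys (last region of a table) to null before comparing them.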
3216 byte[] endKey = r.getEndKey();
3217 endKey = (endKey.length == 0) ? null : endKey;
3218 byte[] endKey2 = l.getEndKey();
3219 endKey2 = (endKey2.length == 0) ? null : endKey2;
3220 int endComparison = RegionSplitCalculator.BYTES_COMPARATOR.compare(
3221 endKey2, endKey);
3222
3223 if (endComparison != 0) {
3224 return endComparison;
3225 }
3226
3227
3228
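// Tie-break by presence of an HDFS entry (entries with HDFS info sort first), then by HDFS region id.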
3229 if (l.hdfsEntry == null && r.hdfsEntry == null) {
3230 return 0;
3231 }
3232 if (l.hdfsEntry == null && r.hdfsEntry != null) {
3233 return 1;
3234 }
3235
3236 if (r.hdfsEntry == null) {
3237 return -1;
3238 }
3239
3240 return Long.compare(l.hdfsEntry.hri.getRegionId(), r.hdfsEntry.hri.getRegionId()); // avoids int overflow
3241 }
3242 };
3243
3244 /**
3245 * Prints a per-table summary: consistency, region count, and the servers it is deployed on.
3246 */
3247 private void printTableSummary(SortedMap<TableName, TableInfo> tablesInfo) {
3248 StringBuilder sb = new StringBuilder();
3249 errors.print("Summary:");
3250 for (TableInfo tInfo : tablesInfo.values()) {
3251 if (errors.tableHasErrors(tInfo)) {
3252 errors.print("Table " + tInfo.getName() + " is inconsistent.");
3253 } else {
3254 errors.print(" " + tInfo.getName() + " is okay.");
3255 }
3256 errors.print(" Number of regions: " + tInfo.getNumRegions());
3257 sb.setLength(0);
3258 sb.append(" Deployed on: ");
3259 for (ServerName server : tInfo.deployedOn) {
3260 sb.append(" " + server.toString());
3261 }
3262 errors.print(sb.toString());
3263 }
3264 }
3265
3266 static ErrorReporter getErrorReporter(
3267 final Configuration conf) throws ClassNotFoundException {
3268 Class<? extends ErrorReporter> reporter = conf.getClass("hbasefsck.errorreporter", PrintingErrorReporter.class, ErrorReporter.class);
3269 return ReflectionUtils.newInstance(reporter, conf);
3270 }
3271
3272 public interface ErrorReporter {
3273 enum ERROR_CODE {
3274 UNKNOWN, NO_META_REGION, NULL_META_REGION, NO_VERSION_FILE, NOT_IN_META_HDFS, NOT_IN_META,
3275 NOT_IN_META_OR_DEPLOYED, NOT_IN_HDFS_OR_DEPLOYED, NOT_IN_HDFS, SERVER_DOES_NOT_MATCH_META, NOT_DEPLOYED,
3276 MULTI_DEPLOYED, SHOULD_NOT_BE_DEPLOYED, MULTI_META_REGION, RS_CONNECT_FAILURE,
3277 FIRST_REGION_STARTKEY_NOT_EMPTY, LAST_REGION_ENDKEY_NOT_EMPTY, DUPE_STARTKEYS,
3278 HOLE_IN_REGION_CHAIN, OVERLAP_IN_REGION_CHAIN, REGION_CYCLE, DEGENERATE_REGION,
3279 ORPHAN_HDFS_REGION, LINGERING_SPLIT_PARENT, NO_TABLEINFO_FILE, LINGERING_REFERENCE_HFILE,
3280 WRONG_USAGE, EMPTY_META_CELL, EXPIRED_TABLE_LOCK, BOUNDARIES_ERROR
3281 }
3282 void clear();
3283 void report(String message);
3284 void reportError(String message);
3285 void reportError(ERROR_CODE errorCode, String message);
3286 void reportError(ERROR_CODE errorCode, String message, TableInfo table);
3287 void reportError(ERROR_CODE errorCode, String message, TableInfo table, HbckInfo info);
3288 void reportError(
3289 ERROR_CODE errorCode,
3290 String message,
3291 TableInfo table,
3292 HbckInfo info1,
3293 HbckInfo info2
3294 );
3295 int summarize();
3296 void detail(String details);
3297 ArrayList<ERROR_CODE> getErrorList();
3298 void progress();
3299 void print(String message);
3300 void resetErrors();
3301 boolean tableHasErrors(TableInfo table);
3302 }
3303
3304 static class PrintingErrorReporter implements ErrorReporter {
3305 public int errorCount = 0;
3306 private int showProgress;
3307
3308 Set<TableInfo> errorTables = new HashSet<TableInfo>();
3309
3310
3311 private ArrayList<ERROR_CODE> errorList = new ArrayList<ERROR_CODE>();
3312
3313 @Override
3314 public void clear() {
3315 errorTables.clear();
3316 errorList.clear();
3317 errorCount = 0;
3318 }
3319
3320 @Override
3321 public synchronized void reportError(ERROR_CODE errorCode, String message) {
3322 if (errorCode == ERROR_CODE.WRONG_USAGE) {
3323 System.err.println(message);
3324 return;
3325 }
3326
3327 errorList.add(errorCode);
3328 if (!summary) {
3329 System.out.println("ERROR: " + message);
3330 }
3331 errorCount++;
3332 showProgress = 0;
3333 }
3334
3335 @Override
3336 public synchronized void reportError(ERROR_CODE errorCode, String message, TableInfo table) {
3337 errorTables.add(table);
3338 reportError(errorCode, message);
3339 }
3340
3341 @Override
3342 public synchronized void reportError(ERROR_CODE errorCode, String message, TableInfo table,
3343 HbckInfo info) {
3344 errorTables.add(table);
3345 String reference = "(region " + info.getRegionNameAsString() + ")";
3346 reportError(errorCode, reference + " " + message);
3347 }
3348
3349 @Override
3350 public synchronized void reportError(ERROR_CODE errorCode, String message, TableInfo table,
3351 HbckInfo info1, HbckInfo info2) {
3352 errorTables.add(table);
3353 String reference = "(regions " + info1.getRegionNameAsString()
3354 + " and " + info2.getRegionNameAsString() + ")";
3355 reportError(errorCode, reference + " " + message);
3356 }
3357
3358 @Override
3359 public synchronized void reportError(String message) {
3360 reportError(ERROR_CODE.UNKNOWN, message);
3361 }
3362
3363
3364 /**
3365 * Print an error message without recording an error code or incrementing the error
3366 * count; used for problems that are already reported elsewhere.
3367 */
3368 @Override
3369 public synchronized void report(String message) {
3370 if (! summary) {
3371 System.out.println("ERROR: " + message);
3372 }
3373 showProgress = 0;
3374 }
3375
3376 @Override
3377 public synchronized int summarize() {
3378 System.out.println(Integer.toString(errorCount) +
3379 " inconsistencies detected.");
3380 if (errorCount == 0) {
3381 System.out.println("Status: OK");
3382 return 0;
3383 } else {
3384 System.out.println("Status: INCONSISTENT");
3385 return -1;
3386 }
3387 }
3388
3389 @Override
3390 public ArrayList<ERROR_CODE> getErrorList() {
3391 return errorList;
3392 }
3393
3394 @Override
3395 public synchronized void print(String message) {
3396 if (!summary) {
3397 System.out.println(message);
3398 }
3399 }
3400
3401 @Override
3402 public boolean tableHasErrors(TableInfo table) {
3403 return errorTables.contains(table);
3404 }
3405
3406 @Override
3407 public void resetErrors() {
3408 errorCount = 0;
3409 }
3410
3411 @Override
3412 public synchronized void detail(String message) {
3413 if (details) {
3414 System.out.println(message);
3415 }
3416 showProgress = 0;
3417 }
3418
3419 @Override
3420 public synchronized void progress() {
3421 if (showProgress++ == 10) {
3422 if (!summary) {
3423 System.out.print(".");
3424 }
3425 showProgress = 0;
3426 }
3427 }
3428 }
3429
3430 /**
3431 * Contacts a region server and records all the regions it reports as online.
3432 */
3433 static class WorkItemRegion implements Callable<Void> {
3434 private HBaseFsck hbck;
3435 private ServerName rsinfo;
3436 private ErrorReporter errors;
3437 private HConnection connection;
3438
3439 WorkItemRegion(HBaseFsck hbck, ServerName info,
3440 ErrorReporter errors, HConnection connection) {
3441 this.hbck = hbck;
3442 this.rsinfo = info;
3443 this.errors = errors;
3444 this.connection = connection;
3445 }
3446
3447 @Override
3448 public synchronized Void call() throws IOException {
3449 errors.progress();
3450 try {
3451 BlockingInterface server = connection.getAdmin(rsinfo);
3452
3453
3454 List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(server);
3455 regions = filterRegions(regions);
3456
3457 if (details) {
3458 errors.detail("RegionServer: " + rsinfo.getServerName() +
3459 " number of regions: " + regions.size());
3460 for (HRegionInfo rinfo: regions) {
3461 errors.detail(" " + rinfo.getRegionNameAsString() +
3462 " id: " + rinfo.getRegionId() +
3463 " encoded_name: " + rinfo.getEncodedName() +
3464 " start: " + Bytes.toStringBinary(rinfo.getStartKey()) +
3465 " end: " + Bytes.toStringBinary(rinfo.getEndKey()));
3466 }
3467 }
3468
3469
3470 for (HRegionInfo r:regions) {
3471 HbckInfo hbi = hbck.getOrCreateInfo(r.getEncodedName());
3472 hbi.addServer(r, rsinfo);
3473 }
3474 } catch (IOException e) {
3475 errors.reportError(ERROR_CODE.RS_CONNECT_FAILURE, "RegionServer: " + rsinfo.getServerName() +
3476 " Unable to fetch region information. " + e);
3477 throw e;
3478 }
3479 return null;
3480 }
3481
3482 private List<HRegionInfo> filterRegions(List<HRegionInfo> regions) {
3483 List<HRegionInfo> ret = Lists.newArrayList();
3484 for (HRegionInfo hri : regions) {
3485 if (hri.isMetaTable() || (!hbck.checkMetaOnly
3486 && hbck.isTableIncluded(hri.getTable()))) {
3487 ret.add(hri);
3488 }
3489 }
3490 return ret;
3491 }
3492 }
3493
3494
3495 /**
3496 * Contacts HDFS and gathers the region directories of one table into the region info map.
3497 */
3498 static class WorkItemHdfsDir implements Callable<Void> {
3499 private HBaseFsck hbck;
3500 private FileStatus tableDir;
3501 private ErrorReporter errors;
3502 private FileSystem fs;
3503
3504 WorkItemHdfsDir(HBaseFsck hbck, FileSystem fs, ErrorReporter errors,
3505 FileStatus status) {
3506 this.hbck = hbck;
3507 this.fs = fs;
3508 this.tableDir = status;
3509 this.errors = errors;
3510 }
3511
3512 @Override
3513 public synchronized Void call() throws IOException {
3514 try {
3515 // NOTE: the original body of this method was missing from this copy of the source.
3516 // The following is a reconstruction sketch based on the fields and helpers defined
3517 // elsewhere in this class (getOrCreateInfo, HdfsEntry, ERROR_CODE.RS_CONNECT_FAILURE);
3518 // details such as the directory-name filter and the "recovered.edits" literal are assumptions.
3519 // Level 2: <HBASE_DIR>/<table>/* -- one entry per region directory.
3520 FileStatus[] regionDirs = fs.listStatus(tableDir.getPath());
3521 for (FileStatus regionDir : regionDirs) {
3522 String encodedName = regionDir.getPath().getName();
3523 // region directories are named by the region's hex encoded name; skip anything else
3524 if (!encodedName.toLowerCase().matches("[0-9a-f]+")) {
3525 continue;
3526 }
3527 LOG.debug("Loading region info from hdfs: " + regionDir.getPath());
3528 HbckInfo hbi = hbck.getOrCreateInfo(encodedName);
3529 HdfsEntry he = new HdfsEntry();
3530 synchronized (hbi) {
3531 he.hdfsRegionDir = regionDir.getPath();
3532 he.hdfsRegionDirModTime = regionDir.getModificationTime();
3533 he.hdfsRegioninfoFilePresent =
3534 fs.exists(new Path(he.hdfsRegionDir, HRegionFileSystem.REGION_INFO_FILE));
3535 // flag regions whose directory holds nothing but recovered edits; such leftovers
3536 // (e.g. from a split) are skipped by the integrity checks
3537 he.hdfsOnlyEdits = true;
3538 for (FileStatus subDir : fs.listStatus(regionDir.getPath())) {
3539 String name = subDir.getPath().getName();
3540 if (!name.startsWith(".") && !name.equals("recovered.edits")) {
3541 he.hdfsOnlyEdits = false;
3542 break;
3543 }
3544 }
3545 hbi.hdfsEntry = he;
3546 }
3547 }
3548 } catch (IOException e) {
3549 // unable to list the table directory; report it and let the caller handle the failure
3550 errors.reportError(ERROR_CODE.RS_CONNECT_FAILURE, "Table Directory: "
3551 + tableDir.getPath().getName()
3552 + " Unable to fetch region information. " + e);
3553 throw e;
3554 }
3555 return null;
3556 }
3557 }
3568
3569 static class WorkItemHdfsRegionInfo implements Callable<Void> {
3570 private HbckInfo hbi;
3571 private HBaseFsck hbck;
3572 private ErrorReporter errors;
3573
3574 WorkItemHdfsRegionInfo(HbckInfo hbi, HBaseFsck hbck, ErrorReporter errors) {
3575 this.hbi = hbi;
3576 this.hbck = hbck;
3577 this.errors = errors;
3578 }
3579
3580 @Override
3581 public synchronized Void call() throws IOException {
3582
3583 if (hbi.getHdfsHRI() == null) {
3584 try {
3585 hbck.loadHdfsRegioninfo(hbi);
3586 } catch (IOException ioe) {
3587 String msg = "Orphan region in HDFS: Unable to load .regioninfo from table "
3588 + hbi.getTableName() + " in hdfs dir "
3589 + hbi.getHdfsRegionDir()
3590 + "! It may be an invalid format or version file. Treating as "
3591 + "an orphaned regiondir.";
3592 errors.reportError(ERROR_CODE.ORPHAN_HDFS_REGION, msg);
3593 try {
3594 hbck.debugLsr(hbi.getHdfsRegionDir());
3595 } catch (IOException ioe2) {
3596 LOG.error("Unable to read directory " + hbi.getHdfsRegionDir(), ioe2);
3597 throw ioe2;
3598 }
3599 hbck.orphanHdfsDirs.add(hbi);
3600 throw ioe;
3601 }
3602 }
3603 return null;
3604 }
3605 };
3606
3607
3608 /**
3609 * Display the full report from fsck, including per-region details.
3610 */
3611 public static void setDisplayFullReport() {
3612 details = true;
3613 }
3614
3615
3616
3617
3618
3619 void setSummary() {
3620 summary = true;
3621 }
3622
3623
3624
3625
3626
3627 void setCheckMetaOnly() {
3628 checkMetaOnly = true;
3629 }
3630
3631
3632
3633
3634 void setRegionBoundariesCheck() {
3635 checkRegionBoundaries = true;
3636 }
3637
3638
3639
3640
3641
3642 public void setFixTableLocks(boolean shouldFix) {
3643 fixTableLocks = shouldFix;
3644 fixAny |= shouldFix;
3645 }
3646
3647
3648
3649
3650
3651
3652
3653 void setShouldRerun() {
3654 rerun = true;
3655 }
3656
3657 boolean shouldRerun() {
3658 return rerun;
3659 }
3660
3661
3662
3663
3664
3665 public void setFixAssignments(boolean shouldFix) {
3666 fixAssignments = shouldFix;
3667 fixAny |= shouldFix;
3668 }
3669
3670 boolean shouldFixAssignments() {
3671 return fixAssignments;
3672 }
3673
3674 public void setFixMeta(boolean shouldFix) {
3675 fixMeta = shouldFix;
3676 fixAny |= shouldFix;
3677 }
3678
3679 boolean shouldFixMeta() {
3680 return fixMeta;
3681 }
3682
3683 public void setFixEmptyMetaCells(boolean shouldFix) {
3684 fixEmptyMetaCells = shouldFix;
3685 fixAny |= shouldFix;
3686 }
3687
3688 boolean shouldFixEmptyMetaCells() {
3689 return fixEmptyMetaCells;
3690 }
3691
3692 public void setCheckHdfs(boolean checking) {
3693 checkHdfs = checking;
3694 }
3695
3696 boolean shouldCheckHdfs() {
3697 return checkHdfs;
3698 }
3699
3700 public void setFixHdfsHoles(boolean shouldFix) {
3701 fixHdfsHoles = shouldFix;
3702 fixAny |= shouldFix;
3703 }
3704
3705 boolean shouldFixHdfsHoles() {
3706 return fixHdfsHoles;
3707 }
3708
3709 public void setFixTableOrphans(boolean shouldFix) {
3710 fixTableOrphans = shouldFix;
3711 fixAny |= shouldFix;
3712 }
3713
3714 boolean shouldFixTableOrphans() {
3715 return fixTableOrphans;
3716 }
3717
3718 public void setFixHdfsOverlaps(boolean shouldFix) {
3719 fixHdfsOverlaps = shouldFix;
3720 fixAny |= shouldFix;
3721 }
3722
3723 boolean shouldFixHdfsOverlaps() {
3724 return fixHdfsOverlaps;
3725 }
3726
3727 public void setFixHdfsOrphans(boolean shouldFix) {
3728 fixHdfsOrphans = shouldFix;
3729 fixAny |= shouldFix;
3730 }
3731
3732 boolean shouldFixHdfsOrphans() {
3733 return fixHdfsOrphans;
3734 }
3735
3736 public void setFixVersionFile(boolean shouldFix) {
3737 fixVersionFile = shouldFix;
3738 fixAny |= shouldFix;
3739 }
3740
3741 public boolean shouldFixVersionFile() {
3742 return fixVersionFile;
3743 }
3744
3745 public void setSidelineBigOverlaps(boolean sbo) {
3746 this.sidelineBigOverlaps = sbo;
3747 }
3748
3749 public boolean shouldSidelineBigOverlaps() {
3750 return sidelineBigOverlaps;
3751 }
3752
3753 public void setFixSplitParents(boolean shouldFix) {
3754 fixSplitParents = shouldFix;
3755 fixAny |= shouldFix;
3756 }
3757
3758 boolean shouldFixSplitParents() {
3759 return fixSplitParents;
3760 }
3761
3762 public void setFixReferenceFiles(boolean shouldFix) {
3763 fixReferenceFiles = shouldFix;
3764 fixAny |= shouldFix;
3765 }
3766
3767 boolean shouldFixReferenceFiles() {
3768 return fixReferenceFiles;
3769 }
3770
3771 public boolean shouldIgnorePreCheckPermission() {
3772 return !fixAny || ignorePreCheckPermission;
3773 }
3774
3775 public void setIgnorePreCheckPermission(boolean ignorePreCheckPermission) {
3776 this.ignorePreCheckPermission = ignorePreCheckPermission;
3777 }
3778
3779
3780
3781
3782 public void setMaxMerge(int mm) {
3783 this.maxMerge = mm;
3784 }
3785
3786 public int getMaxMerge() {
3787 return maxMerge;
3788 }
3789
3790 public void setMaxOverlapsToSideline(int mo) {
3791 this.maxOverlapsToSideline = mo;
3792 }
3793
3794 public int getMaxOverlapsToSideline() {
3795 return maxOverlapsToSideline;
3796 }
3797
3798
3799
3800
3801
3802 boolean isTableIncluded(TableName table) {
3803 return (tablesIncluded.size() == 0) || tablesIncluded.contains(table);
3804 }
3805
3806 public void includeTable(TableName table) {
3807 tablesIncluded.add(table);
3808 }
3809
3810 Set<TableName> getIncludedTables() {
3811 return new HashSet<TableName>(tablesIncluded);
3812 }
3813
3814
3815
3816
3817
3818
3819 public void setTimeLag(long seconds) {
3820 timelag = seconds * 1000;
3821 }
3822
3823
3824
3825
3826
3827 public void setSidelineDir(String sidelineDir) {
3828 this.sidelineDir = new Path(sidelineDir);
3829 }
3830
3831 protected HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
3832 return new HFileCorruptionChecker(getConf(), executor, sidelineCorruptHFiles);
3833 }
3834
3835 public HFileCorruptionChecker getHFilecorruptionChecker() {
3836 return hfcc;
3837 }
3838
3839 public void setHFileCorruptionChecker(HFileCorruptionChecker hfcc) {
3840 this.hfcc = hfcc;
3841 }
3842
3843 public void setRetCode(int code) {
3844 this.retcode = code;
3845 }
3846
3847 public int getRetCode() {
3848 return retcode;
3849 }
3850
3851 protected HBaseFsck printUsageAndExit() {
3852 StringWriter sw = new StringWriter(2048);
3853 PrintWriter out = new PrintWriter(sw);
3854 out.println("Usage: fsck [opts] {only tables}");
3855 out.println(" where [opts] are:");
3856 out.println(" -help Display help options (this)");
3857 out.println(" -details Display full report of all regions.");
3858 out.println(" -timelag <timeInSeconds> Process only regions that " +
3859 " have not experienced any metadata updates in the last " +
3860 " <timeInSeconds> seconds.");
3861 out.println(" -sleepBeforeRerun <timeInSeconds> Sleep this many seconds" +
3862 " before checking if the fix worked if run with -fix");
3863 out.println(" -summary Print only summary of the tables and status.");
3864 out.println(" -metaonly Only check the state of the hbase:meta table.");
3865 out.println(" -sidelineDir <hdfs://> HDFS path to backup existing meta.");
3866 out.println(" -boundaries Verify that region boundaries are the same between META and store files.");
3867
3868 out.println("");
3869 out.println(" Metadata Repair options: (expert features, use with caution!)");
3870 out.println(" -fix Try to fix region assignments. This is for backwards compatibility");
3871 out.println(" -fixAssignments Try to fix region assignments. Replaces the old -fix");
3872 out.println(" -fixMeta Try to fix meta problems. This assumes HDFS region info is good.");
3873 out.println(" -noHdfsChecking Don't load/check region info from HDFS."
3874 + " Assumes hbase:meta region info is good. Won't check/fix any HDFS issue, e.g. hole, orphan, or overlap");
3875 out.println(" -fixHdfsHoles Try to fix region holes in hdfs.");
3876 out.println(" -fixHdfsOrphans Try to fix region dirs with no .regioninfo file in hdfs");
3877 out.println(" -fixTableOrphans Try to fix table dirs with no .tableinfo file in hdfs (online mode only)");
3878 out.println(" -fixHdfsOverlaps Try to fix region overlaps in hdfs.");
3879 out.println(" -fixVersionFile Try to fix missing hbase.version file in hdfs.");
3880 out.println(" -maxMerge <n> When fixing region overlaps, allow at most <n> regions to merge. (n=" + DEFAULT_MAX_MERGE +" by default)");
3881 out.println(" -sidelineBigOverlaps When fixing region overlaps, allow to sideline big overlaps");
3882 out.println(" -maxOverlapsToSideline <n> When fixing region overlaps, allow at most <n> regions to sideline per group. (n=" + DEFAULT_OVERLAPS_TO_SIDELINE +" by default)");
3883 out.println(" -fixSplitParents Try to force offline split parents to be online.");
3884 out.println(" -ignorePreCheckPermission ignore filesystem permission pre-check");
3885 out.println(" -fixReferenceFiles Try to offline lingering reference store files");
3886 out.println(" -fixEmptyMetaCells Try to fix hbase:meta entries not referencing any region"
3887 + " (empty REGIONINFO_QUALIFIER rows)");
3888
3889 out.println("");
3890 out.println(" Datafile Repair options: (expert features, use with caution!)");
3891 out.println(" -checkCorruptHFiles Check all HFiles by opening them to make sure they are valid");
3892 out.println(" -sidelineCorruptHFiles Quarantine corrupted HFiles. Implies -checkCorruptHFiles");
3893
3894 out.println("");
3895 out.println(" Metadata Repair shortcuts");
3896 out.println(" -repair Shortcut for -fixAssignments -fixMeta -fixHdfsHoles " +
3897 "-fixHdfsOrphans -fixHdfsOverlaps -fixVersionFile -sidelineBigOverlaps -fixReferenceFiles -fixTableLocks");
3898 out.println(" -repairHoles Shortcut for -fixAssignments -fixMeta -fixHdfsHoles");
3899
3900 out.println("");
3901 out.println(" Table lock options");
3902 out.println(" -fixTableLocks Deletes table locks held for a long time (hbase.table.lock.expire.ms, 10min by default)");
3903
3904 out.flush();
3905 errors.reportError(ERROR_CODE.WRONG_USAGE, sw.toString());
3906
3907 setRetCode(-2);
3908 return this;
3909 }
3910
3911
3912
3913
3914 /**
3915 * Main program: points the default filesystem at the HBase root dir and runs HBaseFsckTool.
3916 */
3917 public static void main(String[] args) throws Exception {
3918
3919 Configuration conf = HBaseConfiguration.create();
3920 Path hbasedir = FSUtils.getRootDir(conf);
3921 URI defaultFs = hbasedir.getFileSystem(conf).getUri();
3922 FSUtils.setFsDefault(conf, new Path(defaultFs));
3923 int ret = ToolRunner.run(new HBaseFsckTool(conf), args);
3924 System.exit(ret);
3925 }
3926
3927 /**
3928 * Tool wrapper so HBaseFsck can be run through ToolRunner and pick up -D configuration options.
3929 */
3930 static class HBaseFsckTool extends Configured implements Tool {
3931 HBaseFsckTool(Configuration conf) { super(conf); }
3932 @Override
3933 public int run(String[] args) throws Exception {
3934 HBaseFsck hbck = new HBaseFsck(getConf());
3935 hbck.exec(hbck.executor, args);
3936 return hbck.getRetCode();
3937 }
3938 };
3939
3940
3941 public HBaseFsck exec(ExecutorService exec, String[] args) throws KeeperException, IOException,
3942 ServiceException, InterruptedException {
3943 long sleepBeforeRerun = DEFAULT_SLEEP_BEFORE_RERUN;
3944
3945 boolean checkCorruptHFiles = false;
3946 boolean sidelineCorruptHFiles = false;
3947
3948
3949 for (int i = 0; i < args.length; i++) {
3950 String cmd = args[i];
3951 if (cmd.equals("-help") || cmd.equals("-h")) {
3952 return printUsageAndExit();
3953 } else if (cmd.equals("-details")) {
3954 setDisplayFullReport();
3955 } else if (cmd.equals("-timelag")) {
3956 if (i == args.length - 1) {
3957 errors.reportError(ERROR_CODE.WRONG_USAGE, "HBaseFsck: -timelag needs a value.");
3958 return printUsageAndExit();
3959 }
3960 try {
3961 long timelag = Long.parseLong(args[i+1]);
3962 setTimeLag(timelag);
3963 } catch (NumberFormatException e) {
3964 errors.reportError(ERROR_CODE.WRONG_USAGE, "-timelag needs a numeric value.");
3965 return printUsageAndExit();
3966 }
3967 i++;
3968 } else if (cmd.equals("-sleepBeforeRerun")) {
3969 if (i == args.length - 1) {
3970 errors.reportError(ERROR_CODE.WRONG_USAGE,
3971 "HBaseFsck: -sleepBeforeRerun needs a value.");
3972 return printUsageAndExit();
3973 }
3974 try {
3975 sleepBeforeRerun = Long.parseLong(args[i+1]);
3976 } catch (NumberFormatException e) {
3977 errors.reportError(ERROR_CODE.WRONG_USAGE, "-sleepBeforeRerun needs a numeric value.");
3978 return printUsageAndExit();
3979 }
3980 i++;
3981 } else if (cmd.equals("-sidelineDir")) {
3982 if (i == args.length - 1) {
3983 errors.reportError(ERROR_CODE.WRONG_USAGE, "HBaseFsck: -sidelineDir needs a value.");
3984 return printUsageAndExit();
3985 }
3986 i++;
3987 setSidelineDir(args[i]);
3988 } else if (cmd.equals("-fix")) {
3989 errors.reportError(ERROR_CODE.WRONG_USAGE,
3990 "This option is deprecated, please use -fixAssignments instead.");
3991 setFixAssignments(true);
3992 } else if (cmd.equals("-fixAssignments")) {
3993 setFixAssignments(true);
3994 } else if (cmd.equals("-fixMeta")) {
3995 setFixMeta(true);
3996 } else if (cmd.equals("-noHdfsChecking")) {
3997 setCheckHdfs(false);
3998 } else if (cmd.equals("-fixHdfsHoles")) {
3999 setFixHdfsHoles(true);
4000 } else if (cmd.equals("-fixHdfsOrphans")) {
4001 setFixHdfsOrphans(true);
4002 } else if (cmd.equals("-fixTableOrphans")) {
4003 setFixTableOrphans(true);
4004 } else if (cmd.equals("-fixHdfsOverlaps")) {
4005 setFixHdfsOverlaps(true);
4006 } else if (cmd.equals("-fixVersionFile")) {
4007 setFixVersionFile(true);
4008 } else if (cmd.equals("-sidelineBigOverlaps")) {
4009 setSidelineBigOverlaps(true);
4010 } else if (cmd.equals("-fixSplitParents")) {
4011 setFixSplitParents(true);
4012 } else if (cmd.equals("-ignorePreCheckPermission")) {
4013 setIgnorePreCheckPermission(true);
4014 } else if (cmd.equals("-checkCorruptHFiles")) {
4015 checkCorruptHFiles = true;
4016 } else if (cmd.equals("-sidelineCorruptHFiles")) {
4017 sidelineCorruptHFiles = true;
4018 } else if (cmd.equals("-fixReferenceFiles")) {
4019 setFixReferenceFiles(true);
4020 } else if (cmd.equals("-fixEmptyMetaCells")) {
4021 setFixEmptyMetaCells(true);
4022 } else if (cmd.equals("-repair")) {
4023 // -repair is a shortcut that turns on the full set of repair options listed in the
4024 // usage text; note that -fixSplitParents stays disabled here.
4025 setFixHdfsHoles(true);
4026 setFixHdfsOrphans(true);
4027 setFixMeta(true);
4028 setFixAssignments(true);
4029 setFixHdfsOverlaps(true);
4030 setFixVersionFile(true);
4031 setSidelineBigOverlaps(true);
4032 setFixSplitParents(false);
4033 setCheckHdfs(true);
4034 setFixReferenceFiles(true);
4035 setFixTableLocks(true);
4036 } else if (cmd.equals("-repairHoles")) {
4037 // -repairHoles is the lighter-weight shortcut: fix region holes, hbase:meta and assignments only.
4038 setFixHdfsHoles(true);
4039 setFixHdfsOrphans(false);
4040 setFixMeta(true);
4041 setFixAssignments(true);
4042 setFixHdfsOverlaps(false);
4043 setSidelineBigOverlaps(false);
4044 setFixSplitParents(false);
4045 setCheckHdfs(true);
4046 } else if (cmd.equals("-maxOverlapsToSideline")) {
4047 if (i == args.length - 1) {
4048 errors.reportError(ERROR_CODE.WRONG_USAGE,
4049 "-maxOverlapsToSideline needs a numeric value argument.");
4050 return printUsageAndExit();
4051 }
4052 try {
4053 int maxOverlapsToSideline = Integer.parseInt(args[i+1]);
4054 setMaxOverlapsToSideline(maxOverlapsToSideline);
4055 } catch (NumberFormatException e) {
4056 errors.reportError(ERROR_CODE.WRONG_USAGE,
4057 "-maxOverlapsToSideline needs a numeric value argument.");
4058 return printUsageAndExit();
4059 }
4060 i++;
4061 } else if (cmd.equals("-maxMerge")) {
4062 if (i == args.length - 1) {
4063 errors.reportError(ERROR_CODE.WRONG_USAGE,
4064 "-maxMerge needs a numeric value argument.");
4065 return printUsageAndExit();
4066 }
4067 try {
4068 int maxMerge = Integer.parseInt(args[i+1]);
4069 setMaxMerge(maxMerge);
4070 } catch (NumberFormatException e) {
4071 errors.reportError(ERROR_CODE.WRONG_USAGE,
4072 "-maxMerge needs a numeric value argument.");
4073 return printUsageAndExit();
4074 }
4075 i++;
4076 } else if (cmd.equals("-summary")) {
4077 setSummary();
4078 } else if (cmd.equals("-metaonly")) {
4079 setCheckMetaOnly();
4080 } else if (cmd.equals("-boundaries")) {
4081 setRegionBoundariesCheck();
4082 } else if (cmd.equals("-fixTableLocks")) {
4083 setFixTableLocks(true);
4084 } else if (cmd.startsWith("-")) {
4085 errors.reportError(ERROR_CODE.WRONG_USAGE, "Unrecognized option:" + cmd);
4086 return printUsageAndExit();
4087 } else {
4088 includeTable(TableName.valueOf(cmd));
4089 errors.print("Allow checking/fixes for table: " + cmd);
4090 }
4091 }
4092
4093 errors.print("HBaseFsck command line options: " + StringUtils.join(args, " "));
4094
4095 // Pre-check filesystem permissions; a failure below aborts the tool with exit code -1.
4096 try {
4097 preCheckPermission();
4098 } catch (AccessDeniedException ace) {
4099 LOG.error("Insufficient filesystem permissions to run hbck", ace);
4100 Runtime.getRuntime().exit(-1);
4101 } catch (IOException ioe) {
4102 LOG.error("I/O error during the permission pre-check", ioe);
4103 Runtime.getRuntime().exit(-1);
4104 }
4105 connect();
4106
4107 try {
4108 // If requested, scan HFiles for corruption (and quarantine them with -sidelineCorruptHFiles) before the online checks.
4109 if (checkCorruptHFiles || sidelineCorruptHFiles) {
4110 LOG.info("Checking all hfiles for corruption");
4111 HFileCorruptionChecker hfcc = createHFileCorruptionChecker(sidelineCorruptHFiles);
4112 setHFileCorruptionChecker(hfcc);
4113 Collection<TableName> tables = getIncludedTables();
4114 Collection<Path> tableDirs = new ArrayList<Path>();
4115 Path rootdir = FSUtils.getRootDir(getConf());
4116 if (tables.size() > 0) {
4117 for (TableName t : tables) {
4118 tableDirs.add(FSUtils.getTableDir(rootdir, t));
4119 }
4120 } else {
4121 tableDirs = FSUtils.getTableDirs(FSUtils.getCurrentFileSystem(getConf()), rootdir);
4122 }
4123 hfcc.checkTables(tableDirs);
4124 hfcc.report(errors);
4125 }
4126
4127 // Run the online consistency checks and any enabled fixes; the result becomes the exit code.
4128 int code = onlineHbck();
4129 setRetCode(code);
4130
4131 // If fixes were attempted, wait briefly and then run hbck once more with the fix
4132 // options turned off, so the exit code reflects the state after the repairs.
4133
4134 if (shouldRerun()) {
4135 try {
4136 LOG.info("Sleeping " + sleepBeforeRerun + "ms before re-checking after fix...");
4137 Thread.sleep(sleepBeforeRerun);
4138 } catch (InterruptedException ie) {
4139 LOG.warn("Interrupted while waiting to re-check; returning prematurely", ie);
4140 return this;
4141 }
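// Verification pass only: turn the fix options off before re-checking.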
4142 setFixAssignments(false);
4143 setFixMeta(false);
4144 setFixHdfsHoles(false);
4145 setFixHdfsOverlaps(false);
4146 setFixVersionFile(false);
4147 setFixTableOrphans(false);
4148 errors.resetErrors();
4149 code = onlineHbck();
4150 setRetCode(code);
4151 }
4152 } finally {
4153 IOUtils.cleanup(null, connection, meta, admin);
4154 }
4155 return this;
4156 }
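// Illustrative sketch (not from the original source) of what the Tool wrapper above ends
// up calling for the command line `hbck -fixAssignments MyTable`; MyTable is a made-up name:
//   hbck.exec(hbck.executor, new String[] { "-fixAssignments", "MyTable" });
//   int ret = hbck.getRetCode();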
4157
4158 /**
4159  * Recursive ls for debugging: prints everything under p through this hbck's error reporter.
4160  */
4161 void debugLsr(Path p) throws IOException {
4162 debugLsr(getConf(), p, errors);
4163 }
4164
4165 /**
4166  * Recursive ls for debugging, reporting through a new PrintingErrorReporter.
4167  */
4168 public static void debugLsr(Configuration conf,
4169 Path p) throws IOException {
4170 debugLsr(conf, p, new PrintingErrorReporter());
4171 }
4172
4173 /**
4174  * Recursive ls for debugging: prints p and everything beneath it; does nothing unless DEBUG logging is enabled.
4175  */
4176 public static void debugLsr(Configuration conf,
4177 Path p, ErrorReporter errors) throws IOException {
4178 if (!LOG.isDebugEnabled() || p == null) {
4179 return;
4180 }
4181 FileSystem fs = p.getFileSystem(conf);
4182
4183 if (!fs.exists(p)) {
4184 // nothing to report for a path that no longer exists
4185 return;
4186 }
4187 errors.print(p.toString());
4188
4189 if (fs.isFile(p)) {
4190 return;
4191 }
4192
4193 if (fs.getFileStatus(p).isDir()) {
4194 FileStatus[] fss= fs.listStatus(p);
4195 for (FileStatus status : fss) {
4196 debugLsr(conf, status.getPath(), errors);
4197 }
4198 }
4199 }
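// Illustrative only (the path is hypothetical): with DEBUG logging enabled, dump the
// layout of a suspect table directory while investigating an inconsistency:
//   HBaseFsck.debugLsr(conf, new Path("/hbase/data/default/MyTable"));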
4200 }