1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.hadoop.hbase.util;
19
20 import java.io.FileNotFoundException;
21 import java.io.IOException;
22 import java.io.PrintWriter;
23 import java.io.StringWriter;
24 import java.net.InetAddress;
25 import java.net.URI;
26 import java.util.ArrayList;
27 import java.util.Arrays;
28 import java.util.Collection;
29 import java.util.Collections;
30 import java.util.Comparator;
31 import java.util.HashMap;
32 import java.util.HashSet;
33 import java.util.Iterator;
34 import java.util.List;
35 import java.util.Map;
36 import java.util.Map.Entry;
37 import java.util.Set;
38 import java.util.SortedMap;
39 import java.util.SortedSet;
40 import java.util.TreeMap;
41 import java.util.TreeSet;
42 import java.util.concurrent.Callable;
43 import java.util.concurrent.ConcurrentSkipListMap;
44 import java.util.concurrent.ExecutionException;
45 import java.util.concurrent.ExecutorService;
46 import java.util.concurrent.Future;
47 import java.util.concurrent.ScheduledThreadPoolExecutor;
48 import java.util.concurrent.atomic.AtomicInteger;
49 import java.util.concurrent.atomic.AtomicBoolean;
50
51 import org.apache.commons.lang.StringUtils;
52 import org.apache.commons.logging.Log;
53 import org.apache.commons.logging.LogFactory;
54 import org.apache.hadoop.hbase.classification.InterfaceAudience;
55 import org.apache.hadoop.hbase.classification.InterfaceStability;
56 import org.apache.hadoop.conf.Configuration;
57 import org.apache.hadoop.conf.Configured;
58 import org.apache.hadoop.fs.FSDataOutputStream;
59 import org.apache.hadoop.fs.FileStatus;
60 import org.apache.hadoop.fs.FileSystem;
61 import org.apache.hadoop.fs.Path;
62 import org.apache.hadoop.fs.permission.FsAction;
63 import org.apache.hadoop.fs.permission.FsPermission;
64 import org.apache.hadoop.hbase.Abortable;
65 import org.apache.hadoop.hbase.Cell;
66 import org.apache.hadoop.hbase.ClusterStatus;
67 import org.apache.hadoop.hbase.HBaseConfiguration;
68 import org.apache.hadoop.hbase.HColumnDescriptor;
69 import org.apache.hadoop.hbase.HConstants;
70 import org.apache.hadoop.hbase.HRegionInfo;
71 import org.apache.hadoop.hbase.HRegionLocation;
72 import org.apache.hadoop.hbase.HTableDescriptor;
73 import org.apache.hadoop.hbase.KeyValue;
74 import org.apache.hadoop.hbase.MasterNotRunningException;
75 import org.apache.hadoop.hbase.ServerName;
76 import org.apache.hadoop.hbase.TableName;
77 import org.apache.hadoop.hbase.ZooKeeperConnectionException;
78 import org.apache.hadoop.hbase.catalog.MetaEditor;
79 import org.apache.hadoop.hbase.client.Delete;
80 import org.apache.hadoop.hbase.client.Get;
81 import org.apache.hadoop.hbase.client.HBaseAdmin;
82 import org.apache.hadoop.hbase.client.HConnectable;
83 import org.apache.hadoop.hbase.client.HConnection;
84 import org.apache.hadoop.hbase.client.HConnectionManager;
85 import org.apache.hadoop.hbase.client.HTable;
86 import org.apache.hadoop.hbase.client.MetaScanner;
87 import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitor;
88 import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitorBase;
89 import org.apache.hadoop.hbase.client.Put;
90 import org.apache.hadoop.hbase.client.Result;
91 import org.apache.hadoop.hbase.client.RowMutations;
92 import org.apache.hadoop.hbase.io.hfile.CacheConfig;
93 import org.apache.hadoop.hbase.io.hfile.HFile;
94 import org.apache.hadoop.hbase.master.MasterFileSystem;
95 import org.apache.hadoop.hbase.master.RegionState;
96 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
97 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService.BlockingInterface;
98 import org.apache.hadoop.hbase.regionserver.HRegion;
99 import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
100 import org.apache.hadoop.hbase.regionserver.StoreFileInfo;
101 import org.apache.hadoop.hbase.regionserver.wal.HLogUtil;
102 import org.apache.hadoop.hbase.security.UserProvider;
103 import org.apache.hadoop.hbase.util.Bytes.ByteArrayComparator;
104 import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
105 import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
106 import org.apache.hadoop.hbase.util.hbck.TableIntegrityErrorHandler;
107 import org.apache.hadoop.hbase.util.hbck.TableIntegrityErrorHandlerImpl;
108 import org.apache.hadoop.hbase.util.hbck.TableLockChecker;
109 import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
110 import org.apache.hadoop.hbase.zookeeper.ZKTableReadOnly;
111 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
112 import org.apache.hadoop.hbase.security.AccessDeniedException;
113 import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
114 import org.apache.hadoop.io.IOUtils;
115 import org.apache.hadoop.ipc.RemoteException;
116 import org.apache.hadoop.security.UserGroupInformation;
117 import org.apache.hadoop.util.ReflectionUtils;
118 import org.apache.hadoop.util.Tool;
119 import org.apache.hadoop.util.ToolRunner;
120 import org.apache.zookeeper.KeeperException;
121
122 import com.google.common.annotations.VisibleForTesting;
123 import com.google.common.base.Joiner;
124 import com.google.common.base.Preconditions;
125 import com.google.common.collect.Lists;
126 import com.google.common.collect.Multimap;
127 import com.google.common.collect.TreeMultimap;
128 import com.google.protobuf.ServiceException;
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175 @InterfaceAudience.Public
176 @InterfaceStability.Evolving
177 public class HBaseFsck extends Configured {
178 public static final long DEFAULT_TIME_LAG = 60000;
179 public static final long DEFAULT_SLEEP_BEFORE_RERUN = 10000;
180 private static final int MAX_NUM_THREADS = 50;
181 private static boolean rsSupportsOffline = true;
182 private static final int DEFAULT_OVERLAPS_TO_SIDELINE = 2;
183 private static final int DEFAULT_MAX_MERGE = 5;
184 private static final String TO_BE_LOADED = "to_be_loaded";
185 private static final String HBCK_LOCK_FILE = "hbase-hbck.lock";
186
187
188
189
190
191 private static final Log LOG = LogFactory.getLog(HBaseFsck.class.getName());
192 private ClusterStatus status;
193 private HConnection connection;
194 private HBaseAdmin admin;
195 private HTable meta;
196
197 protected ExecutorService executor;
198 private long startMillis = System.currentTimeMillis();
199 private HFileCorruptionChecker hfcc;
200 private int retcode = 0;
201 private Path HBCK_LOCK_PATH;
202 private FSDataOutputStream hbckOutFd;
203
204
205
206 private final AtomicBoolean hbckLockCleanup = new AtomicBoolean(false);
207
208
209
210
211 private static boolean details = false;
212 private long timelag = DEFAULT_TIME_LAG;
213 private boolean fixAssignments = false;
214 private boolean fixMeta = false;
215 private boolean checkHdfs = true;
216 private boolean fixHdfsHoles = false;
217 private boolean fixHdfsOverlaps = false;
218 private boolean fixHdfsOrphans = false;
219 private boolean fixTableOrphans = false;
220 private boolean fixVersionFile = false;
221 private boolean fixSplitParents = false;
222 private boolean fixReferenceFiles = false;
223 private boolean fixEmptyMetaCells = false;
224 private boolean fixTableLocks = false;
225 private boolean fixAny = false;
226
227
228
229 private Set<TableName> tablesIncluded = new HashSet<TableName>();
230 private int maxMerge = DEFAULT_MAX_MERGE;
231 private int maxOverlapsToSideline = DEFAULT_OVERLAPS_TO_SIDELINE;
232 private boolean sidelineBigOverlaps = false;
233 private Path sidelineDir = null;
234
235 private boolean rerun = false;
236 private static boolean summary = false;
237 private boolean checkMetaOnly = false;
238 private boolean checkRegionBoundaries = false;
239 private boolean ignorePreCheckPermission = false;
240
241
242
243
244 final private ErrorReporter errors;
245 int fixes = 0;
246
247
248
249
250
251
252 private TreeMap<String, HbckInfo> regionInfoMap = new TreeMap<String, HbckInfo>();
253 private TreeSet<TableName> disabledTables =
254 new TreeSet<TableName>();
255
256 private Set<Result> emptyRegionInfoQualifiers = new HashSet<Result>();
257
258
259
260
261
262
263
264
265
266
267
268 private SortedMap<TableName, TableInfo> tablesInfo =
269 new ConcurrentSkipListMap<TableName, TableInfo>();
270
271
272
273
274 private List<HbckInfo> orphanHdfsDirs = Collections.synchronizedList(new ArrayList<HbckInfo>());
275
276 private Map<TableName, Set<String>> orphanTableDirs =
277 new HashMap<TableName, Set<String>>();
278
279
280
281
282
283
284
285
286 public HBaseFsck(Configuration conf) throws MasterNotRunningException,
287 ZooKeeperConnectionException, IOException, ClassNotFoundException {
288 super(conf);
289
290 setConf(HBaseConfiguration.create(getConf()));
291
292 getConf().setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0);
293 errors = getErrorReporter(conf);
294
295 int numThreads = conf.getInt("hbasefsck.numthreads", MAX_NUM_THREADS);
296 executor = new ScheduledThreadPoolExecutor(numThreads, Threads.newDaemonThreadFactory("hbasefsck"));
297 }
298
299
300
301
302
303
304
305
306
307
308
309 public HBaseFsck(Configuration conf, ExecutorService exec) throws MasterNotRunningException,
310 ZooKeeperConnectionException, IOException, ClassNotFoundException {
311 super(conf);
312 errors = getErrorReporter(getConf());
313 this.executor = exec;
314 }
315
316
317
318
319
320
321
322 private FSDataOutputStream checkAndMarkRunningHbck() throws IOException {
323 long start = EnvironmentEdgeManager.currentTimeMillis();
324 try {
325 FileSystem fs = FSUtils.getCurrentFileSystem(getConf());
326 FsPermission defaultPerms = FSUtils.getFilePermissions(fs, getConf(),
327 HConstants.DATA_FILE_UMASK_KEY);
328 Path tmpDir = new Path(FSUtils.getRootDir(getConf()), HConstants.HBASE_TEMP_DIRECTORY);
329 fs.mkdirs(tmpDir);
330 HBCK_LOCK_PATH = new Path(tmpDir, HBCK_LOCK_FILE);
331 final FSDataOutputStream out = FSUtils.create(fs, HBCK_LOCK_PATH, defaultPerms, false);
332 out.writeBytes(InetAddress.getLocalHost().toString());
333 out.flush();
334 return out;
335 } catch(RemoteException e) {
336 if(AlreadyBeingCreatedException.class.getName().equals(e.getClassName())){
337 return null;
338 } else {
339 throw e;
340 }
341 } finally {
342 long duration = EnvironmentEdgeManager.currentTimeMillis() - start;
343 if (duration > 30000) {
344 LOG.warn("Took " + duration + " milliseconds to obtain lock");
345
346 return null;
347 }
348 }
349 }
350
351 private void unlockHbck() {
352 if(hbckLockCleanup.compareAndSet(true, false)){
353 IOUtils.closeStream(hbckOutFd);
354 try{
355 FSUtils.delete(FSUtils.getCurrentFileSystem(getConf()), HBCK_LOCK_PATH, true);
356 } catch(IOException ioe) {
357 LOG.warn("Failed to delete " + HBCK_LOCK_PATH);
358 LOG.debug(ioe);
359 }
360 }
361 }
362
363
364
365
366
367 public void connect() throws IOException {
368
369
370 hbckOutFd = checkAndMarkRunningHbck();
371 if (hbckOutFd == null) {
372 setRetCode(-1);
373 LOG.error("Another instance of hbck is running, exiting this instance.[If you are sure" +
374 " no other instance is running, delete the lock file " +
375 HBCK_LOCK_PATH + " and rerun the tool]");
376 throw new IOException("Duplicate hbck - Abort");
377 }
378
379
380 hbckLockCleanup.set(true);
381
382
383
384
385 Runtime.getRuntime().addShutdownHook(new Thread() {
386 @Override
387 public void run() {
388 unlockHbck();
389 }
390 });
391 LOG.debug("Launching hbck");
392
393 connection = HConnectionManager.createConnection(getConf());
394 admin = new HBaseAdmin(connection);
395 meta = new HTable(TableName.META_TABLE_NAME, connection);
396 status = admin.getClusterStatus();
397 }
398
399
400
401
402 private void loadDeployedRegions() throws IOException, InterruptedException {
403
404 Collection<ServerName> regionServers = status.getServers();
405 errors.print("Number of live region servers: " + regionServers.size());
406 if (details) {
407 for (ServerName rsinfo: regionServers) {
408 errors.print(" " + rsinfo.getServerName());
409 }
410 }
411
412
413 Collection<ServerName> deadRegionServers = status.getDeadServerNames();
414 errors.print("Number of dead region servers: " + deadRegionServers.size());
415 if (details) {
416 for (ServerName name: deadRegionServers) {
417 errors.print(" " + name);
418 }
419 }
420
421
422 errors.print("Master: " + status.getMaster());
423
424
425 Collection<ServerName> backupMasters = status.getBackupMasters();
426 errors.print("Number of backup masters: " + backupMasters.size());
427 if (details) {
428 for (ServerName name: backupMasters) {
429 errors.print(" " + name);
430 }
431 }
432
433 errors.print("Average load: " + status.getAverageLoad());
434 errors.print("Number of requests: " + status.getRequestsCount());
435 errors.print("Number of regions: " + status.getRegionsCount());
436
437 Map<String, RegionState> rits = status.getRegionsInTransition();
438 errors.print("Number of regions in transition: " + rits.size());
439 if (details) {
440 for (RegionState state: rits.values()) {
441 errors.print(" " + state.toDescriptiveString());
442 }
443 }
444
445
446 processRegionServers(regionServers);
447 }
448
449
450
451
452 private void clearState() {
453
454 fixes = 0;
455 regionInfoMap.clear();
456 emptyRegionInfoQualifiers.clear();
457 disabledTables.clear();
458 errors.clear();
459 tablesInfo.clear();
460 orphanHdfsDirs.clear();
461 }
462
463
464
465
466
467
468 public void offlineHdfsIntegrityRepair() throws IOException, InterruptedException {
469
470 if (shouldCheckHdfs() && (shouldFixHdfsOrphans() || shouldFixHdfsHoles()
471 || shouldFixHdfsOverlaps() || shouldFixTableOrphans())) {
472 LOG.info("Loading regioninfos HDFS");
473
474 int maxIterations = getConf().getInt("hbase.hbck.integrityrepair.iterations.max", 3);
475 int curIter = 0;
476 do {
477 clearState();
478
479 restoreHdfsIntegrity();
480 curIter++;
481 } while (fixes > 0 && curIter <= maxIterations);
482
483
484
485 if (curIter > 2) {
486 if (curIter == maxIterations) {
487 LOG.warn("Exiting integrity repairs after max " + curIter + " iterations. "
488 + "Tables integrity may not be fully repaired!");
489 } else {
490 LOG.info("Successfully exiting integrity repairs after " + curIter + " iterations");
491 }
492 }
493 }
494 }
495
496
497
498
499
500
501
502
503
504 public int onlineConsistencyRepair() throws IOException, KeeperException,
505 InterruptedException {
506 clearState();
507
508
509 loadDeployedRegions();
510
511 recordMetaRegion();
512
513 if (!checkMetaRegion()) {
514 String errorMsg = "hbase:meta table is not consistent. ";
515 if (shouldFixAssignments()) {
516 errorMsg += "HBCK will try fixing it. Rerun once hbase:meta is back to consistent state.";
517 } else {
518 errorMsg += "Run HBCK with proper fix options to fix hbase:meta inconsistency.";
519 }
520 errors.reportError(errorMsg + " Exiting...");
521 return -2;
522 }
523
524 LOG.info("Loading regionsinfo from the hbase:meta table");
525 boolean success = loadMetaEntries();
526 if (!success) return -1;
527
528
529 reportEmptyMetaCells();
530
531
532 if (shouldFixEmptyMetaCells()) {
533 fixEmptyMetaCells();
534 }
535
536
537 if (!checkMetaOnly) {
538 reportTablesInFlux();
539 }
540
541
542 if (shouldCheckHdfs()) {
543 loadHdfsRegionDirs();
544 loadHdfsRegionInfos();
545 }
546
547
548 loadDisabledTables();
549
550
551 fixOrphanTables();
552
553
554 checkAndFixConsistency();
555
556
557 checkIntegrity();
558 return errors.getErrorList().size();
559 }
560
561
562
563
564
565 public int onlineHbck() throws IOException, KeeperException, InterruptedException, ServiceException {
566
567 errors.print("Version: " + status.getHBaseVersion());
568 offlineHdfsIntegrityRepair();
569
570
571 boolean oldBalancer = admin.setBalancerRunning(false, true);
572 try {
573 onlineConsistencyRepair();
574 }
575 finally {
576 admin.setBalancerRunning(oldBalancer, false);
577 }
578
579 if (checkRegionBoundaries) {
580 checkRegionBoundaries();
581 }
582
583 offlineReferenceFileRepair();
584
585 checkAndFixTableLocks();
586
587
588 unlockHbck();
589
590
591 printTableSummary(tablesInfo);
592 return errors.summarize();
593 }
594
595 public static byte[] keyOnly (byte[] b) {
596 if (b == null)
597 return b;
598 int rowlength = Bytes.toShort(b, 0);
599 byte[] result = new byte[rowlength];
600 System.arraycopy(b, Bytes.SIZEOF_SHORT, result, 0, rowlength);
601 return result;
602 }
603
604 private static class RegionBoundariesInformation {
605 public byte [] regionName;
606 public byte [] metaFirstKey;
607 public byte [] metaLastKey;
608 public byte [] storesFirstKey;
609 public byte [] storesLastKey;
610 @Override
611 public String toString () {
612 return "regionName=" + Bytes.toStringBinary(regionName) +
613 "\nmetaFirstKey=" + Bytes.toStringBinary(metaFirstKey) +
614 "\nmetaLastKey=" + Bytes.toStringBinary(metaLastKey) +
615 "\nstoresFirstKey=" + Bytes.toStringBinary(storesFirstKey) +
616 "\nstoresLastKey=" + Bytes.toStringBinary(storesLastKey);
617 }
618 }
619
620 public void checkRegionBoundaries() {
621 try {
622 ByteArrayComparator comparator = new ByteArrayComparator();
623 List<HRegionInfo> regions = MetaScanner.listAllRegions(getConf(), false);
624 final RegionBoundariesInformation currentRegionBoundariesInformation =
625 new RegionBoundariesInformation();
626 Path hbaseRoot = FSUtils.getRootDir(getConf());
627 for (HRegionInfo regionInfo : regions) {
628 Path tableDir = FSUtils.getTableDir(hbaseRoot, regionInfo.getTable());
629 currentRegionBoundariesInformation.regionName = regionInfo.getRegionName();
630
631
632 Path path = new Path(tableDir, regionInfo.getEncodedName());
633 FileSystem fs = path.getFileSystem(getConf());
634 FileStatus[] files = fs.listStatus(path);
635
636 byte[] storeFirstKey = null;
637 byte[] storeLastKey = null;
638 for (FileStatus file : files) {
639 String fileName = file.getPath().toString();
640 fileName = fileName.substring(fileName.lastIndexOf("/") + 1);
641 if (!fileName.startsWith(".") && !fileName.endsWith("recovered.edits")) {
642 FileStatus[] storeFiles = fs.listStatus(file.getPath());
643
644 for (FileStatus storeFile : storeFiles) {
645 HFile.Reader reader = HFile.createReader(fs, storeFile.getPath(), new CacheConfig(
646 getConf()), getConf());
647 if ((reader.getFirstKey() != null)
648 && ((storeFirstKey == null) || (comparator.compare(storeFirstKey,
649 reader.getFirstKey()) > 0))) {
650 storeFirstKey = reader.getFirstKey();
651 }
652 if ((reader.getLastKey() != null)
653 && ((storeLastKey == null) || (comparator.compare(storeLastKey,
654 reader.getLastKey())) < 0)) {
655 storeLastKey = reader.getLastKey();
656 }
657 reader.close();
658 }
659 }
660 }
661 currentRegionBoundariesInformation.metaFirstKey = regionInfo.getStartKey();
662 currentRegionBoundariesInformation.metaLastKey = regionInfo.getEndKey();
663 currentRegionBoundariesInformation.storesFirstKey = keyOnly(storeFirstKey);
664 currentRegionBoundariesInformation.storesLastKey = keyOnly(storeLastKey);
665 if (currentRegionBoundariesInformation.metaFirstKey.length == 0)
666 currentRegionBoundariesInformation.metaFirstKey = null;
667 if (currentRegionBoundariesInformation.metaLastKey.length == 0)
668 currentRegionBoundariesInformation.metaLastKey = null;
669
670
671
672
673
674
675 boolean valid = true;
676
677 if ((currentRegionBoundariesInformation.storesFirstKey != null)
678 && (currentRegionBoundariesInformation.metaFirstKey != null)) {
679 valid = valid
680 && comparator.compare(currentRegionBoundariesInformation.storesFirstKey,
681 currentRegionBoundariesInformation.metaFirstKey) >= 0;
682 }
683
684 if ((currentRegionBoundariesInformation.storesLastKey != null)
685 && (currentRegionBoundariesInformation.metaLastKey != null)) {
686 valid = valid
687 && comparator.compare(currentRegionBoundariesInformation.storesLastKey,
688 currentRegionBoundariesInformation.metaLastKey) < 0;
689 }
690 if (!valid) {
691 errors.reportError(ERROR_CODE.BOUNDARIES_ERROR, "Found issues with regions boundaries",
692 tablesInfo.get(regionInfo.getTable()));
693 LOG.warn("Region's boundaries not alligned between stores and META for:");
694 LOG.warn(currentRegionBoundariesInformation);
695 }
696 }
697 } catch (IOException e) {
698 LOG.error(e);
699 }
700 }
701
702
703
704
705 private void adoptHdfsOrphans(Collection<HbckInfo> orphanHdfsDirs) throws IOException {
706 for (HbckInfo hi : orphanHdfsDirs) {
707 LOG.info("Attempting to handle orphan hdfs dir: " + hi.getHdfsRegionDir());
708 adoptHdfsOrphan(hi);
709 }
710 }
711
712
713
714
715
716
717
718
719
720
721 @SuppressWarnings("deprecation")
722 private void adoptHdfsOrphan(HbckInfo hi) throws IOException {
723 Path p = hi.getHdfsRegionDir();
724 FileSystem fs = p.getFileSystem(getConf());
725 FileStatus[] dirs = fs.listStatus(p);
726 if (dirs == null) {
727 LOG.warn("Attempt to adopt ophan hdfs region skipped becuase no files present in " +
728 p + ". This dir could probably be deleted.");
729 return ;
730 }
731
732 TableName tableName = hi.getTableName();
733 TableInfo tableInfo = tablesInfo.get(tableName);
734 Preconditions.checkNotNull(tableInfo, "Table '" + tableName + "' not present!");
735 HTableDescriptor template = tableInfo.getHTD();
736
737
738 Pair<byte[],byte[]> orphanRegionRange = null;
739 for (FileStatus cf : dirs) {
740 String cfName= cf.getPath().getName();
741
742 if (cfName.startsWith(".") || cfName.equals(HConstants.SPLIT_LOGDIR_NAME)) continue;
743
744 FileStatus[] hfiles = fs.listStatus(cf.getPath());
745 for (FileStatus hfile : hfiles) {
746 byte[] start, end;
747 HFile.Reader hf = null;
748 try {
749 CacheConfig cacheConf = new CacheConfig(getConf());
750 hf = HFile.createReader(fs, hfile.getPath(), cacheConf, getConf());
751 hf.loadFileInfo();
752 KeyValue startKv = KeyValue.createKeyValueFromKey(hf.getFirstKey());
753 start = startKv.getRow();
754 KeyValue endKv = KeyValue.createKeyValueFromKey(hf.getLastKey());
755 end = endKv.getRow();
756 } catch (IOException ioe) {
757 LOG.warn("Problem reading orphan file " + hfile + ", skipping");
758 continue;
759 } catch (NullPointerException ioe) {
760 LOG.warn("Orphan file " + hfile + " is possibly corrupted HFile, skipping");
761 continue;
762 } finally {
763 if (hf != null) {
764 hf.close();
765 }
766 }
767
768
769 if (orphanRegionRange == null) {
770
771 orphanRegionRange = new Pair<byte[], byte[]>(start, end);
772 } else {
773
774
775
776 if (Bytes.compareTo(orphanRegionRange.getFirst(), start) > 0) {
777 orphanRegionRange.setFirst(start);
778 }
779 if (Bytes.compareTo(orphanRegionRange.getSecond(), end) < 0 ) {
780 orphanRegionRange.setSecond(end);
781 }
782 }
783 }
784 }
785 if (orphanRegionRange == null) {
786 LOG.warn("No data in dir " + p + ", sidelining data");
787 fixes++;
788 sidelineRegionDir(fs, hi);
789 return;
790 }
791 LOG.info("Min max keys are : [" + Bytes.toString(orphanRegionRange.getFirst()) + ", " +
792 Bytes.toString(orphanRegionRange.getSecond()) + ")");
793
794
795 HRegionInfo hri = new HRegionInfo(template.getTableName(), orphanRegionRange.getFirst(), orphanRegionRange.getSecond());
796 LOG.info("Creating new region : " + hri);
797 HRegion region = HBaseFsckRepair.createHDFSRegionDir(getConf(), hri, template);
798 Path target = region.getRegionFileSystem().getRegionDir();
799
800
801 mergeRegionDirs(target, hi);
802 fixes++;
803 }
804
805
806
807
808
809
810
811
812
813 private int restoreHdfsIntegrity() throws IOException, InterruptedException {
814
815 LOG.info("Loading HBase regioninfo from HDFS...");
816 loadHdfsRegionDirs();
817
818 int errs = errors.getErrorList().size();
819
820 tablesInfo = loadHdfsRegionInfos();
821 checkHdfsIntegrity(false, false);
822
823 if (errors.getErrorList().size() == errs) {
824 LOG.info("No integrity errors. We are done with this phase. Glorious.");
825 return 0;
826 }
827
828 if (shouldFixHdfsOrphans() && orphanHdfsDirs.size() > 0) {
829 adoptHdfsOrphans(orphanHdfsDirs);
830
831 }
832
833
834 if (shouldFixHdfsHoles()) {
835 clearState();
836 loadHdfsRegionDirs();
837 tablesInfo = loadHdfsRegionInfos();
838 tablesInfo = checkHdfsIntegrity(shouldFixHdfsHoles(), false);
839 }
840
841
842 if (shouldFixHdfsOverlaps()) {
843
844 clearState();
845 loadHdfsRegionDirs();
846 tablesInfo = loadHdfsRegionInfos();
847 tablesInfo = checkHdfsIntegrity(false, shouldFixHdfsOverlaps());
848 }
849
850 return errors.getErrorList().size();
851 }
852
853
854
855
856
857
858
859
860
861 private void offlineReferenceFileRepair() throws IOException {
862 Configuration conf = getConf();
863 Path hbaseRoot = FSUtils.getRootDir(conf);
864 FileSystem fs = hbaseRoot.getFileSystem(conf);
865 Map<String, Path> allFiles = FSUtils.getTableStoreFilePathMap(fs, hbaseRoot);
866 for (Path path: allFiles.values()) {
867 boolean isReference = false;
868 try {
869 isReference = StoreFileInfo.isReference(path);
870 } catch (Throwable t) {
871
872
873
874
875 }
876 if (!isReference) continue;
877
878 Path referredToFile = StoreFileInfo.getReferredToFile(path);
879 if (fs.exists(referredToFile)) continue;
880
881
882 errors.reportError(ERROR_CODE.LINGERING_REFERENCE_HFILE,
883 "Found lingering reference file " + path);
884 if (!shouldFixReferenceFiles()) continue;
885
886
887 boolean success = false;
888 String pathStr = path.toString();
889
890
891
892
893
894 int index = pathStr.lastIndexOf(Path.SEPARATOR_CHAR);
895 for (int i = 0; index > 0 && i < 5; i++) {
896 index = pathStr.lastIndexOf(Path.SEPARATOR_CHAR, index - 1);
897 }
898 if (index > 0) {
899 Path rootDir = getSidelineDir();
900 Path dst = new Path(rootDir, pathStr.substring(index + 1));
901 fs.mkdirs(dst.getParent());
902 LOG.info("Trying to sildeline reference file "
903 + path + " to " + dst);
904 setShouldRerun();
905
906 success = fs.rename(path, dst);
907 }
908 if (!success) {
909 LOG.error("Failed to sideline reference file " + path);
910 }
911 }
912 }
913
914
915
916
917 private void reportEmptyMetaCells() {
918 errors.print("Number of empty REGIONINFO_QUALIFIER rows in hbase:meta: " +
919 emptyRegionInfoQualifiers.size());
920 if (details) {
921 for (Result r: emptyRegionInfoQualifiers) {
922 errors.print(" " + r);
923 }
924 }
925 }
926
927
928
929
930 private void reportTablesInFlux() {
931 AtomicInteger numSkipped = new AtomicInteger(0);
932 HTableDescriptor[] allTables = getTables(numSkipped);
933 errors.print("Number of Tables: " + allTables.length);
934 if (details) {
935 if (numSkipped.get() > 0) {
936 errors.detail("Number of Tables in flux: " + numSkipped.get());
937 }
938 for (HTableDescriptor td : allTables) {
939 errors.detail(" Table: " + td.getTableName() + "\t" +
940 (td.isReadOnly() ? "ro" : "rw") + "\t" +
941 (td.isMetaRegion() ? "META" : " ") + "\t" +
942 " families: " + td.getFamilies().size());
943 }
944 }
945 }
946
947 public ErrorReporter getErrors() {
948 return errors;
949 }
950
951
952
953
954
955 private void loadHdfsRegioninfo(HbckInfo hbi) throws IOException {
956 Path regionDir = hbi.getHdfsRegionDir();
957 if (regionDir == null) {
958 LOG.warn("No HDFS region dir found: " + hbi + " meta=" + hbi.metaEntry);
959 return;
960 }
961
962 if (hbi.hdfsEntry.hri != null) {
963
964 return;
965 }
966
967 FileSystem fs = FileSystem.get(getConf());
968 HRegionInfo hri = HRegionFileSystem.loadRegionInfoFileContent(fs, regionDir);
969 LOG.debug("HRegionInfo read: " + hri.toString());
970 hbi.hdfsEntry.hri = hri;
971 }
972
973
974
975
976
977 public static class RegionRepairException extends IOException {
978 private static final long serialVersionUID = 1L;
979 final IOException ioe;
980 public RegionRepairException(String s, IOException ioe) {
981 super(s);
982 this.ioe = ioe;
983 }
984 }
985
986
987
988
989 private SortedMap<TableName, TableInfo> loadHdfsRegionInfos()
990 throws IOException, InterruptedException {
991 tablesInfo.clear();
992
993 Collection<HbckInfo> hbckInfos = regionInfoMap.values();
994
995
996 List<WorkItemHdfsRegionInfo> hbis = new ArrayList<WorkItemHdfsRegionInfo>(hbckInfos.size());
997 List<Future<Void>> hbiFutures;
998
999 for (HbckInfo hbi : hbckInfos) {
1000 WorkItemHdfsRegionInfo work = new WorkItemHdfsRegionInfo(hbi, this, errors);
1001 hbis.add(work);
1002 }
1003
1004
1005 hbiFutures = executor.invokeAll(hbis);
1006
1007 for(int i=0; i<hbiFutures.size(); i++) {
1008 WorkItemHdfsRegionInfo work = hbis.get(i);
1009 Future<Void> f = hbiFutures.get(i);
1010 try {
1011 f.get();
1012 } catch(ExecutionException e) {
1013 LOG.warn("Failed to read .regioninfo file for region " +
1014 work.hbi.getRegionNameAsString(), e.getCause());
1015 }
1016 }
1017
1018 Path hbaseRoot = FSUtils.getRootDir(getConf());
1019 FileSystem fs = hbaseRoot.getFileSystem(getConf());
1020
1021 for (HbckInfo hbi: hbckInfos) {
1022
1023 if (hbi.getHdfsHRI() == null) {
1024
1025 continue;
1026 }
1027
1028
1029
1030 TableName tableName = hbi.getTableName();
1031 if (tableName == null) {
1032
1033 LOG.warn("tableName was null for: " + hbi);
1034 continue;
1035 }
1036
1037 TableInfo modTInfo = tablesInfo.get(tableName);
1038 if (modTInfo == null) {
1039
1040 modTInfo = new TableInfo(tableName);
1041 tablesInfo.put(tableName, modTInfo);
1042 try {
1043 HTableDescriptor htd =
1044 FSTableDescriptors.getTableDescriptorFromFs(fs, hbaseRoot, tableName);
1045 modTInfo.htds.add(htd);
1046 } catch (IOException ioe) {
1047 if (!orphanTableDirs.containsKey(tableName)) {
1048 LOG.warn("Unable to read .tableinfo from " + hbaseRoot, ioe);
1049
1050 errors.reportError(ERROR_CODE.NO_TABLEINFO_FILE,
1051 "Unable to read .tableinfo from " + hbaseRoot + "/" + tableName);
1052 Set<String> columns = new HashSet<String>();
1053 orphanTableDirs.put(tableName, getColumnFamilyList(columns, hbi));
1054 }
1055 }
1056 }
1057 if (!hbi.isSkipChecks()) {
1058 modTInfo.addRegionInfo(hbi);
1059 }
1060 }
1061
1062 loadTableInfosForTablesWithNoRegion();
1063
1064 return tablesInfo;
1065 }
1066
1067
1068
1069
1070
1071
1072
1073
1074 private Set<String> getColumnFamilyList(Set<String> columns, HbckInfo hbi) throws IOException {
1075 Path regionDir = hbi.getHdfsRegionDir();
1076 FileSystem fs = regionDir.getFileSystem(getConf());
1077 FileStatus[] subDirs = fs.listStatus(regionDir, new FSUtils.FamilyDirFilter(fs));
1078 for (FileStatus subdir : subDirs) {
1079 String columnfamily = subdir.getPath().getName();
1080 columns.add(columnfamily);
1081 }
1082 return columns;
1083 }
1084
1085
1086
1087
1088
1089
1090
1091
1092 private boolean fabricateTableInfo(FSTableDescriptors fstd, TableName tableName,
1093 Set<String> columns) throws IOException {
1094 if (columns ==null || columns.isEmpty()) return false;
1095 HTableDescriptor htd = new HTableDescriptor(tableName);
1096 for (String columnfamimly : columns) {
1097 htd.addFamily(new HColumnDescriptor(columnfamimly));
1098 }
1099 fstd.createTableDescriptor(htd, true);
1100 return true;
1101 }
1102
1103
1104
1105
1106
1107 public void fixEmptyMetaCells() throws IOException {
1108 if (shouldFixEmptyMetaCells() && !emptyRegionInfoQualifiers.isEmpty()) {
1109 LOG.info("Trying to fix empty REGIONINFO_QUALIFIER hbase:meta rows.");
1110 for (Result region : emptyRegionInfoQualifiers) {
1111 deleteMetaRegion(region.getRow());
1112 errors.getErrorList().remove(ERROR_CODE.EMPTY_META_CELL);
1113 }
1114 emptyRegionInfoQualifiers.clear();
1115 }
1116 }
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127 public void fixOrphanTables() throws IOException {
1128 if (shouldFixTableOrphans() && !orphanTableDirs.isEmpty()) {
1129
1130 List<TableName> tmpList = new ArrayList<TableName>();
1131 tmpList.addAll(orphanTableDirs.keySet());
1132 HTableDescriptor[] htds = getHTableDescriptors(tmpList);
1133 Iterator<Entry<TableName, Set<String>>> iter =
1134 orphanTableDirs.entrySet().iterator();
1135 int j = 0;
1136 int numFailedCase = 0;
1137 FSTableDescriptors fstd = new FSTableDescriptors(getConf());
1138 while (iter.hasNext()) {
1139 Entry<TableName, Set<String>> entry =
1140 iter.next();
1141 TableName tableName = entry.getKey();
1142 LOG.info("Trying to fix orphan table error: " + tableName);
1143 if (j < htds.length) {
1144 if (tableName.equals(htds[j].getTableName())) {
1145 HTableDescriptor htd = htds[j];
1146 LOG.info("fixing orphan table: " + tableName + " from cache");
1147 fstd.createTableDescriptor(htd, true);
1148 j++;
1149 iter.remove();
1150 }
1151 } else {
1152 if (fabricateTableInfo(fstd, tableName, entry.getValue())) {
1153 LOG.warn("fixing orphan table: " + tableName + " with a default .tableinfo file");
1154 LOG.warn("Strongly recommend to modify the HTableDescriptor if necessary for: " + tableName);
1155 iter.remove();
1156 } else {
1157 LOG.error("Unable to create default .tableinfo for " + tableName + " while missing column family information");
1158 numFailedCase++;
1159 }
1160 }
1161 fixes++;
1162 }
1163
1164 if (orphanTableDirs.isEmpty()) {
1165
1166
1167 setShouldRerun();
1168 LOG.warn("Strongly recommend to re-run manually hfsck after all orphanTableDirs being fixed");
1169 } else if (numFailedCase > 0) {
1170 LOG.error("Failed to fix " + numFailedCase
1171 + " OrphanTables with default .tableinfo files");
1172 }
1173
1174 }
1175
1176 orphanTableDirs.clear();
1177
1178 }
1179
1180
1181
1182
1183
1184
1185 private HRegion createNewMeta() throws IOException {
1186 Path rootdir = FSUtils.getRootDir(getConf());
1187 Configuration c = getConf();
1188 HRegionInfo metaHRI = new HRegionInfo(HRegionInfo.FIRST_META_REGIONINFO);
1189 HTableDescriptor metaDescriptor = new FSTableDescriptors(c).get(TableName.META_TABLE_NAME);
1190 MasterFileSystem.setInfoFamilyCachingForMeta(metaDescriptor, false);
1191 HRegion meta = HRegion.createHRegion(metaHRI, rootdir, c, metaDescriptor);
1192 MasterFileSystem.setInfoFamilyCachingForMeta(metaDescriptor, true);
1193 return meta;
1194 }
1195
1196
1197
1198
1199
1200
1201
1202 private ArrayList<Put> generatePuts(
1203 SortedMap<TableName, TableInfo> tablesInfo) throws IOException {
1204 ArrayList<Put> puts = new ArrayList<Put>();
1205 boolean hasProblems = false;
1206 for (Entry<TableName, TableInfo> e : tablesInfo.entrySet()) {
1207 TableName name = e.getKey();
1208
1209
1210 if (name.compareTo(TableName.META_TABLE_NAME) == 0) {
1211 continue;
1212 }
1213
1214 TableInfo ti = e.getValue();
1215 for (Entry<byte[], Collection<HbckInfo>> spl : ti.sc.getStarts().asMap()
1216 .entrySet()) {
1217 Collection<HbckInfo> his = spl.getValue();
1218 int sz = his.size();
1219 if (sz != 1) {
1220
1221 LOG.error("Split starting at " + Bytes.toStringBinary(spl.getKey())
1222 + " had " + sz + " regions instead of exactly 1." );
1223 hasProblems = true;
1224 continue;
1225 }
1226
1227
1228 HbckInfo hi = his.iterator().next();
1229 HRegionInfo hri = hi.getHdfsHRI();
1230 Put p = MetaEditor.makePutFromRegionInfo(hri);
1231 puts.add(p);
1232 }
1233 }
1234 return hasProblems ? null : puts;
1235 }
1236
1237
1238
1239
1240 private void suggestFixes(
1241 SortedMap<TableName, TableInfo> tablesInfo) throws IOException {
1242 for (TableInfo tInfo : tablesInfo.values()) {
1243 TableIntegrityErrorHandler handler = tInfo.new IntegrityFixSuggester(tInfo, errors);
1244 tInfo.checkRegionChain(handler);
1245 }
1246 }
1247
1248
1249
1250
1251
1252
1253
1254
1255 public boolean rebuildMeta(boolean fix) throws IOException,
1256 InterruptedException {
1257
1258
1259
1260
1261
1262 LOG.info("Loading HBase regioninfo from HDFS...");
1263 loadHdfsRegionDirs();
1264
1265 int errs = errors.getErrorList().size();
1266 tablesInfo = loadHdfsRegionInfos();
1267 checkHdfsIntegrity(false, false);
1268
1269
1270 if (errors.getErrorList().size() != errs) {
1271
1272 while(true) {
1273 fixes = 0;
1274 suggestFixes(tablesInfo);
1275 errors.clear();
1276 loadHdfsRegionInfos();
1277 checkHdfsIntegrity(shouldFixHdfsHoles(), shouldFixHdfsOverlaps());
1278
1279 int errCount = errors.getErrorList().size();
1280
1281 if (fixes == 0) {
1282 if (errCount > 0) {
1283 return false;
1284 } else {
1285 break;
1286 }
1287 }
1288 }
1289 }
1290
1291
1292 LOG.info("HDFS regioninfo's seems good. Sidelining old hbase:meta");
1293 Path backupDir = sidelineOldMeta();
1294
1295 LOG.info("Creating new hbase:meta");
1296 HRegion meta = createNewMeta();
1297
1298
1299 List<Put> puts = generatePuts(tablesInfo);
1300 if (puts == null) {
1301 LOG.fatal("Problem encountered when creating new hbase:meta entries. " +
1302 "You may need to restore the previously sidelined hbase:meta");
1303 return false;
1304 }
1305 meta.batchMutate(puts.toArray(new Put[puts.size()]));
1306 HRegion.closeHRegion(meta);
1307 LOG.info("Success! hbase:meta table rebuilt.");
1308 LOG.info("Old hbase:meta is moved into " + backupDir);
1309 return true;
1310 }
1311
1312 private SortedMap<TableName, TableInfo> checkHdfsIntegrity(boolean fixHoles,
1313 boolean fixOverlaps) throws IOException {
1314 LOG.info("Checking HBase region split map from HDFS data...");
1315 for (TableInfo tInfo : tablesInfo.values()) {
1316 TableIntegrityErrorHandler handler;
1317 if (fixHoles || fixOverlaps) {
1318 handler = tInfo.new HDFSIntegrityFixer(tInfo, errors, getConf(),
1319 fixHoles, fixOverlaps);
1320 } else {
1321 handler = tInfo.new IntegrityFixSuggester(tInfo, errors);
1322 }
1323 if (!tInfo.checkRegionChain(handler)) {
1324
1325 errors.report("Found inconsistency in table " + tInfo.getName());
1326 }
1327 }
1328 return tablesInfo;
1329 }
1330
1331 private Path getSidelineDir() throws IOException {
1332 if (sidelineDir == null) {
1333 Path hbaseDir = FSUtils.getRootDir(getConf());
1334 Path hbckDir = new Path(hbaseDir, HConstants.HBCK_SIDELINEDIR_NAME);
1335 sidelineDir = new Path(hbckDir, hbaseDir.getName() + "-"
1336 + startMillis);
1337 }
1338 return sidelineDir;
1339 }
1340
1341
1342
1343
1344 Path sidelineRegionDir(FileSystem fs, HbckInfo hi) throws IOException {
1345 return sidelineRegionDir(fs, null, hi);
1346 }
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356 Path sidelineRegionDir(FileSystem fs,
1357 String parentDir, HbckInfo hi) throws IOException {
1358 TableName tableName = hi.getTableName();
1359 Path regionDir = hi.getHdfsRegionDir();
1360
1361 if (!fs.exists(regionDir)) {
1362 LOG.warn("No previous " + regionDir + " exists. Continuing.");
1363 return null;
1364 }
1365
1366 Path rootDir = getSidelineDir();
1367 if (parentDir != null) {
1368 rootDir = new Path(rootDir, parentDir);
1369 }
1370 Path sidelineTableDir= FSUtils.getTableDir(rootDir, tableName);
1371 Path sidelineRegionDir = new Path(sidelineTableDir, regionDir.getName());
1372 fs.mkdirs(sidelineRegionDir);
1373 boolean success = false;
1374 FileStatus[] cfs = fs.listStatus(regionDir);
1375 if (cfs == null) {
1376 LOG.info("Region dir is empty: " + regionDir);
1377 } else {
1378 for (FileStatus cf : cfs) {
1379 Path src = cf.getPath();
1380 Path dst = new Path(sidelineRegionDir, src.getName());
1381 if (fs.isFile(src)) {
1382
1383 success = fs.rename(src, dst);
1384 if (!success) {
1385 String msg = "Unable to rename file " + src + " to " + dst;
1386 LOG.error(msg);
1387 throw new IOException(msg);
1388 }
1389 continue;
1390 }
1391
1392
1393 fs.mkdirs(dst);
1394
1395 LOG.info("Sidelining files from " + src + " into containing region " + dst);
1396
1397
1398
1399
1400 FileStatus[] hfiles = fs.listStatus(src);
1401 if (hfiles != null && hfiles.length > 0) {
1402 for (FileStatus hfile : hfiles) {
1403 success = fs.rename(hfile.getPath(), dst);
1404 if (!success) {
1405 String msg = "Unable to rename file " + src + " to " + dst;
1406 LOG.error(msg);
1407 throw new IOException(msg);
1408 }
1409 }
1410 }
1411 LOG.debug("Sideline directory contents:");
1412 debugLsr(sidelineRegionDir);
1413 }
1414 }
1415
1416 LOG.info("Removing old region dir: " + regionDir);
1417 success = fs.delete(regionDir, true);
1418 if (!success) {
1419 String msg = "Unable to delete dir " + regionDir;
1420 LOG.error(msg);
1421 throw new IOException(msg);
1422 }
1423 return sidelineRegionDir;
1424 }
1425
1426
1427
1428
1429 void sidelineTable(FileSystem fs, TableName tableName, Path hbaseDir,
1430 Path backupHbaseDir) throws IOException {
1431 Path tableDir = FSUtils.getTableDir(hbaseDir, tableName);
1432 if (fs.exists(tableDir)) {
1433 Path backupTableDir= FSUtils.getTableDir(backupHbaseDir, tableName);
1434 fs.mkdirs(backupTableDir.getParent());
1435 boolean success = fs.rename(tableDir, backupTableDir);
1436 if (!success) {
1437 throw new IOException("Failed to move " + tableName + " from "
1438 + tableDir + " to " + backupTableDir);
1439 }
1440 } else {
1441 LOG.info("No previous " + tableName + " exists. Continuing.");
1442 }
1443 }
1444
1445
1446
1447
1448 Path sidelineOldMeta() throws IOException {
1449
1450 Path hbaseDir = FSUtils.getRootDir(getConf());
1451 FileSystem fs = hbaseDir.getFileSystem(getConf());
1452 Path backupDir = getSidelineDir();
1453 fs.mkdirs(backupDir);
1454
1455 try {
1456 sidelineTable(fs, TableName.META_TABLE_NAME, hbaseDir, backupDir);
1457 } catch (IOException e) {
1458 LOG.fatal("... failed to sideline meta. Currently in inconsistent state. To restore "
1459 + "try to rename hbase:meta in " + backupDir.getName() + " to "
1460 + hbaseDir.getName() + ".", e);
1461 throw e;
1462 }
1463 return backupDir;
1464 }
1465
1466
1467
1468
1469
1470
1471 private void loadDisabledTables()
1472 throws ZooKeeperConnectionException, IOException {
1473 HConnectionManager.execute(new HConnectable<Void>(getConf()) {
1474 @Override
1475 public Void connect(HConnection connection) throws IOException {
1476 ZooKeeperWatcher zkw = createZooKeeperWatcher();
1477 try {
1478 for (TableName tableName :
1479 ZKTableReadOnly.getDisabledOrDisablingTables(zkw)) {
1480 disabledTables.add(tableName);
1481 }
1482 } catch (KeeperException ke) {
1483 throw new IOException(ke);
1484 } finally {
1485 zkw.close();
1486 }
1487 return null;
1488 }
1489 });
1490 }
1491
1492
1493
1494
1495 private boolean isTableDisabled(HRegionInfo regionInfo) {
1496 return disabledTables.contains(regionInfo.getTable());
1497 }
1498
1499
1500
1501
1502
1503 public void loadHdfsRegionDirs() throws IOException, InterruptedException {
1504 Path rootDir = FSUtils.getRootDir(getConf());
1505 FileSystem fs = rootDir.getFileSystem(getConf());
1506
1507
1508 List<FileStatus> tableDirs = Lists.newArrayList();
1509
1510 boolean foundVersionFile = fs.exists(new Path(rootDir, HConstants.VERSION_FILE_NAME));
1511
1512 List<Path> paths = FSUtils.getTableDirs(fs, rootDir);
1513 for (Path path : paths) {
1514 TableName tableName = FSUtils.getTableName(path);
1515 if ((!checkMetaOnly &&
1516 isTableIncluded(tableName)) ||
1517 tableName.equals(TableName.META_TABLE_NAME)) {
1518 tableDirs.add(fs.getFileStatus(path));
1519 }
1520 }
1521
1522
1523 if (!foundVersionFile) {
1524 errors.reportError(ERROR_CODE.NO_VERSION_FILE,
1525 "Version file does not exist in root dir " + rootDir);
1526 if (shouldFixVersionFile()) {
1527 LOG.info("Trying to create a new " + HConstants.VERSION_FILE_NAME
1528 + " file.");
1529 setShouldRerun();
1530 FSUtils.setVersion(fs, rootDir, getConf().getInt(
1531 HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000), getConf().getInt(
1532 HConstants.VERSION_FILE_WRITE_ATTEMPTS,
1533 HConstants.DEFAULT_VERSION_FILE_WRITE_ATTEMPTS));
1534 }
1535 }
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561 private boolean recordMetaRegion() throws IOException {
1562 HRegionLocation metaLocation = connection.locateRegion(
1563 TableName.META_TABLE_NAME, HConstants.EMPTY_START_ROW);
1564
1565
1566 if (metaLocation == null || metaLocation.getRegionInfo() == null ||
1567 metaLocation.getHostname() == null) {
1568 errors.reportError(ERROR_CODE.NULL_META_REGION,
1569 "META region or some of its attributes are null.");
1570 return false;
1571 }
1572 ServerName sn;
1573 try {
1574 sn = getMetaRegionServerName();
1575 } catch (KeeperException e) {
1576 throw new IOException(e);
1577 }
1578 MetaEntry m = new MetaEntry(metaLocation.getRegionInfo(), sn, System.currentTimeMillis());
1579 HbckInfo hbckInfo = regionInfoMap.get(metaLocation.getRegionInfo().getEncodedName());
1580 if (hbckInfo == null) {
1581 regionInfoMap.put(metaLocation.getRegionInfo().getEncodedName(), new HbckInfo(m));
1582 } else {
1583 hbckInfo.metaEntry = m;
1584 }
1585 return true;
1586 }
1587
1588 private ZooKeeperWatcher createZooKeeperWatcher() throws IOException {
1589 return new ZooKeeperWatcher(getConf(), "hbase Fsck", new Abortable() {
1590 @Override
1591 public void abort(String why, Throwable e) {
1592 LOG.error(why, e);
1593 System.exit(1);
1594 }
1595
1596 @Override
1597 public boolean isAborted() {
1598 return false;
1599 }
1600
1601 });
1602 }
1603
1604 private ServerName getMetaRegionServerName()
1605 throws IOException, KeeperException {
1606 ZooKeeperWatcher zkw = createZooKeeperWatcher();
1607 ServerName sn = null;
1608 try {
1609 sn = MetaRegionTracker.getMetaRegionLocation(zkw);
1610 } finally {
1611 zkw.close();
1612 }
1613 return sn;
1614 }
1615
1616
1617
1618
1619
1620
1621 void processRegionServers(Collection<ServerName> regionServerList)
1622 throws IOException, InterruptedException {
1623
1624 List<WorkItemRegion> workItems = new ArrayList<WorkItemRegion>(regionServerList.size());
1625 List<Future<Void>> workFutures;
1626
1627
1628 for (ServerName rsinfo: regionServerList) {
1629 workItems.add(new WorkItemRegion(this, rsinfo, errors, connection));
1630 }
1631
1632 workFutures = executor.invokeAll(workItems);
1633
1634 for(int i=0; i<workFutures.size(); i++) {
1635 WorkItemRegion item = workItems.get(i);
1636 Future<Void> f = workFutures.get(i);
1637 try {
1638 f.get();
1639 } catch(ExecutionException e) {
1640 LOG.warn("Could not process regionserver " + item.rsinfo.getHostAndPort(),
1641 e.getCause());
1642 }
1643 }
1644 }
1645
1646
1647
1648
1649 private void checkAndFixConsistency()
1650 throws IOException, KeeperException, InterruptedException {
1651 for (java.util.Map.Entry<String, HbckInfo> e: regionInfoMap.entrySet()) {
1652 checkRegionConsistency(e.getKey(), e.getValue());
1653 }
1654 }
1655
1656 private void preCheckPermission() throws IOException, AccessDeniedException {
1657 if (shouldIgnorePreCheckPermission()) {
1658 return;
1659 }
1660
1661 Path hbaseDir = FSUtils.getRootDir(getConf());
1662 FileSystem fs = hbaseDir.getFileSystem(getConf());
1663 UserProvider userProvider = UserProvider.instantiate(getConf());
1664 UserGroupInformation ugi = userProvider.getCurrent().getUGI();
1665 FileStatus[] files = fs.listStatus(hbaseDir);
1666 for (FileStatus file : files) {
1667 try {
1668 FSUtils.checkAccess(ugi, file, FsAction.WRITE);
1669 } catch (AccessDeniedException ace) {
1670 LOG.warn("Got AccessDeniedException when preCheckPermission ", ace);
1671 errors.reportError(ERROR_CODE.WRONG_USAGE, "Current user " + ugi.getUserName()
1672 + " does not have write perms to " + file.getPath()
1673 + ". Please rerun hbck as hdfs user " + file.getOwner());
1674 throw ace;
1675 }
1676 }
1677 }
1678
1679
1680
1681
1682 private void deleteMetaRegion(HbckInfo hi) throws IOException {
1683 deleteMetaRegion(hi.metaEntry.getRegionName());
1684 }
1685
1686
1687
1688
1689 private void deleteMetaRegion(byte[] metaKey) throws IOException {
1690 Delete d = new Delete(metaKey);
1691 meta.delete(d);
1692 meta.flushCommits();
1693 LOG.info("Deleted " + Bytes.toString(metaKey) + " from META" );
1694 }
1695
1696
1697
1698
1699 private void resetSplitParent(HbckInfo hi) throws IOException {
1700 RowMutations mutations = new RowMutations(hi.metaEntry.getRegionName());
1701 Delete d = new Delete(hi.metaEntry.getRegionName());
1702 d.deleteColumn(HConstants.CATALOG_FAMILY, HConstants.SPLITA_QUALIFIER);
1703 d.deleteColumn(HConstants.CATALOG_FAMILY, HConstants.SPLITB_QUALIFIER);
1704 mutations.add(d);
1705
1706 HRegionInfo hri = new HRegionInfo(hi.metaEntry);
1707 hri.setOffline(false);
1708 hri.setSplit(false);
1709 Put p = MetaEditor.makePutFromRegionInfo(hri);
1710 mutations.add(p);
1711
1712 meta.mutateRow(mutations);
1713 meta.flushCommits();
1714 LOG.info("Reset split parent " + hi.metaEntry.getRegionNameAsString() + " in META" );
1715 }
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725 private void offline(byte[] regionName) throws IOException {
1726 String regionString = Bytes.toStringBinary(regionName);
1727 if (!rsSupportsOffline) {
1728 LOG.warn("Using unassign region " + regionString
1729 + " instead of using offline method, you should"
1730 + " restart HMaster after these repairs");
1731 admin.unassign(regionName, true);
1732 return;
1733 }
1734
1735
1736 try {
1737 LOG.info("Offlining region " + regionString);
1738 admin.offline(regionName);
1739 } catch (IOException ioe) {
1740 String notFoundMsg = "java.lang.NoSuchMethodException: " +
1741 "org.apache.hadoop.hbase.master.HMaster.offline([B)";
1742 if (ioe.getMessage().contains(notFoundMsg)) {
1743 LOG.warn("Using unassign region " + regionString
1744 + " instead of using offline method, you should"
1745 + " restart HMaster after these repairs");
1746 rsSupportsOffline = false;
1747 admin.unassign(regionName, true);
1748 return;
1749 }
1750 throw ioe;
1751 }
1752 }
1753
1754 private void undeployRegions(HbckInfo hi) throws IOException, InterruptedException {
1755 for (OnlineEntry rse : hi.deployedEntries) {
1756 LOG.debug("Undeploy region " + rse.hri + " from " + rse.hsa);
1757 try {
1758 HBaseFsckRepair.closeRegionSilentlyAndWait(admin, rse.hsa, rse.hri);
1759 offline(rse.hri.getRegionName());
1760 } catch (IOException ioe) {
1761 LOG.warn("Got exception when attempting to offline region "
1762 + Bytes.toString(rse.hri.getRegionName()), ioe);
1763 }
1764 }
1765 }
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779 private void closeRegion(HbckInfo hi) throws IOException, InterruptedException {
1780 if (hi.metaEntry == null && hi.hdfsEntry == null) {
1781 undeployRegions(hi);
1782 return;
1783 }
1784
1785
1786 Get get = new Get(hi.getRegionName());
1787 get.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
1788 get.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
1789 get.addColumn(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER);
1790 Result r = meta.get(get);
1791 ServerName serverName = HRegionInfo.getServerName(r);
1792 if (serverName == null) {
1793 errors.reportError("Unable to close region "
1794 + hi.getRegionNameAsString() + " because meta does not "
1795 + "have handle to reach it.");
1796 return;
1797 }
1798
1799 HRegionInfo hri = HRegionInfo.getHRegionInfo(r);
1800 if (hri == null) {
1801 LOG.warn("Unable to close region " + hi.getRegionNameAsString()
1802 + " because hbase:meta had invalid or missing "
1803 + HConstants.CATALOG_FAMILY_STR + ":"
1804 + Bytes.toString(HConstants.REGIONINFO_QUALIFIER)
1805 + " qualifier value.");
1806 return;
1807 }
1808
1809
1810 HBaseFsckRepair.closeRegionSilentlyAndWait(admin, serverName, hri);
1811 }
1812
1813 private void tryAssignmentRepair(HbckInfo hbi, String msg) throws IOException,
1814 KeeperException, InterruptedException {
1815
1816 if (shouldFixAssignments()) {
1817 errors.print(msg);
1818 undeployRegions(hbi);
1819 setShouldRerun();
1820 HRegionInfo hri = hbi.getHdfsHRI();
1821 if (hri == null) {
1822 hri = hbi.metaEntry;
1823 }
1824 HBaseFsckRepair.fixUnassigned(admin, hri);
1825 HBaseFsckRepair.waitUntilAssigned(admin, hri);
1826 }
1827 }
1828
1829
1830
1831
1832 private void checkRegionConsistency(final String key, final HbckInfo hbi)
1833 throws IOException, KeeperException, InterruptedException {
1834 String descriptiveName = hbi.toString();
1835
1836 boolean inMeta = hbi.metaEntry != null;
1837
1838 boolean inHdfs = !shouldCheckHdfs() || hbi.getHdfsRegionDir() != null;
1839 boolean hasMetaAssignment = inMeta && hbi.metaEntry.regionServer != null;
1840 boolean isDeployed = !hbi.deployedOn.isEmpty();
1841 boolean isMultiplyDeployed = hbi.deployedOn.size() > 1;
1842 boolean deploymentMatchesMeta =
1843 hasMetaAssignment && isDeployed && !isMultiplyDeployed &&
1844 hbi.metaEntry.regionServer.equals(hbi.deployedOn.get(0));
1845 boolean splitParent =
1846 (hbi.metaEntry == null)? false: hbi.metaEntry.isSplit() && hbi.metaEntry.isOffline();
1847 boolean shouldBeDeployed = inMeta && !isTableDisabled(hbi.metaEntry);
1848 boolean recentlyModified = inHdfs &&
1849 hbi.getModTime() + timelag > System.currentTimeMillis();
1850
1851
1852 if (hbi.containsOnlyHdfsEdits()) {
1853 return;
1854 }
1855 if (inMeta && inHdfs && isDeployed && deploymentMatchesMeta && shouldBeDeployed) {
1856 return;
1857 } else if (inMeta && inHdfs && !shouldBeDeployed && !isDeployed) {
1858 LOG.info("Region " + descriptiveName + " is in META, and in a disabled " +
1859 "tabled that is not deployed");
1860 return;
1861 } else if (recentlyModified) {
1862 LOG.warn("Region " + descriptiveName + " was recently modified -- skipping");
1863 return;
1864 }
1865
1866 else if (!inMeta && !inHdfs && !isDeployed) {
1867
1868 assert false : "Entry for region with no data";
1869 } else if (!inMeta && !inHdfs && isDeployed) {
1870 errors.reportError(ERROR_CODE.NOT_IN_META_HDFS, "Region "
1871 + descriptiveName + ", key=" + key + ", not on HDFS or in hbase:meta but " +
1872 "deployed on " + Joiner.on(", ").join(hbi.deployedOn));
1873 if (shouldFixAssignments()) {
1874 undeployRegions(hbi);
1875 }
1876
1877 } else if (!inMeta && inHdfs && !isDeployed) {
1878 if (hbi.isMerged()) {
1879
1880
1881 hbi.setSkipChecks(true);
1882 LOG.info("Region " + descriptiveName
1883 + " got merge recently, its file(s) will be cleaned by CatalogJanitor later");
1884 return;
1885 }
1886 errors.reportError(ERROR_CODE.NOT_IN_META_OR_DEPLOYED, "Region "
1887 + descriptiveName + " on HDFS, but not listed in hbase:meta " +
1888 "or deployed on any region server");
1889
1890 if (shouldFixMeta()) {
1891 if (!hbi.isHdfsRegioninfoPresent()) {
1892 LOG.error("Region " + hbi.getHdfsHRI() + " could have been repaired"
1893 + " in table integrity repair phase if -fixHdfsOrphans was" +
1894 " used.");
1895 return;
1896 }
1897
1898 HRegionInfo hri = hbi.getHdfsHRI();
1899 TableInfo tableInfo = tablesInfo.get(hri.getTable());
1900 if (tableInfo.regionsFromMeta.isEmpty()) {
1901 for (HbckInfo h : regionInfoMap.values()) {
1902 if (hri.getTable().equals(h.getTableName())) {
1903 if (h.metaEntry != null) tableInfo.regionsFromMeta
1904 .add((HRegionInfo) h.metaEntry);
1905 }
1906 }
1907 Collections.sort(tableInfo.regionsFromMeta);
1908 }
1909 for (HRegionInfo region : tableInfo.regionsFromMeta) {
1910 if (Bytes.compareTo(region.getStartKey(), hri.getStartKey()) <= 0
1911 && (region.getEndKey().length == 0 || Bytes.compareTo(region.getEndKey(),
1912 hri.getEndKey()) >= 0)
1913 && Bytes.compareTo(region.getStartKey(), hri.getEndKey()) <= 0) {
1914 if(region.isSplit() || region.isOffline()) continue;
1915 Path regionDir = hbi.getHdfsRegionDir();
1916 FileSystem fs = regionDir.getFileSystem(getConf());
1917 List<Path> familyDirs = FSUtils.getFamilyDirs(fs, regionDir);
1918 for (Path familyDir : familyDirs) {
1919 List<Path> referenceFilePaths = FSUtils.getReferenceFilePaths(fs, familyDir);
1920 for (Path referenceFilePath : referenceFilePaths) {
1921 Path parentRegionDir =
1922 StoreFileInfo.getReferredToFile(referenceFilePath).getParent().getParent();
1923 if (parentRegionDir.toString().endsWith(region.getEncodedName())) {
1924 LOG.warn(hri + " start and stop keys are in the range of " + region
1925 + ". The region might not be cleaned up from hdfs when region " + region
1926 + " split failed. Hence deleting from hdfs.");
1927 HRegionFileSystem.deleteRegionFromFileSystem(getConf(), fs,
1928 regionDir.getParent(), hri);
1929 return;
1930 }
1931 }
1932 }
1933 }
1934 }
1935
1936 LOG.info("Patching hbase:meta with .regioninfo: " + hbi.getHdfsHRI());
1937 HBaseFsckRepair.fixMetaHoleOnline(getConf(), hbi.getHdfsHRI());
1938
1939 tryAssignmentRepair(hbi, "Trying to reassign region...");
1940 }
1941
1942 } else if (!inMeta && inHdfs && isDeployed) {
1943 errors.reportError(ERROR_CODE.NOT_IN_META, "Region " + descriptiveName
1944 + " not in META, but deployed on " + Joiner.on(", ").join(hbi.deployedOn));
1945 debugLsr(hbi.getHdfsRegionDir());
1946 if (shouldFixMeta()) {
1947 if (!hbi.isHdfsRegioninfoPresent()) {
1948 LOG.error("This should have been repaired in table integrity repair phase");
1949 return;
1950 }
1951
1952 LOG.info("Patching hbase:meta with with .regioninfo: " + hbi.getHdfsHRI());
1953 HBaseFsckRepair.fixMetaHoleOnline(getConf(), hbi.getHdfsHRI());
1954
1955 tryAssignmentRepair(hbi, "Trying to fix unassigned region...");
1956 }
1957
1958
1959 } else if (inMeta && inHdfs && !isDeployed && splitParent) {
1960
1961
1962 if (hbi.metaEntry.splitA != null && hbi.metaEntry.splitB != null) {
1963
1964 HbckInfo infoA = this.regionInfoMap.get(hbi.metaEntry.splitA.getEncodedName());
1965 HbckInfo infoB = this.regionInfoMap.get(hbi.metaEntry.splitB.getEncodedName());
1966 if (infoA != null && infoB != null) {
1967
1968 hbi.setSkipChecks(true);
1969 return;
1970 }
1971 }
1972 errors.reportError(ERROR_CODE.LINGERING_SPLIT_PARENT, "Region "
1973 + descriptiveName + " is a split parent in META, in HDFS, "
1974 + "and not deployed on any region server. This could be transient.");
1975 if (shouldFixSplitParents()) {
1976 setShouldRerun();
1977 resetSplitParent(hbi);
1978 }
1979 } else if (inMeta && !inHdfs && !isDeployed) {
1980 errors.reportError(ERROR_CODE.NOT_IN_HDFS_OR_DEPLOYED, "Region "
1981 + descriptiveName + " found in META, but not in HDFS "
1982 + "or deployed on any region server.");
1983 if (shouldFixMeta()) {
1984 deleteMetaRegion(hbi);
1985 }
1986 } else if (inMeta && !inHdfs && isDeployed) {
1987 errors.reportError(ERROR_CODE.NOT_IN_HDFS, "Region " + descriptiveName
1988 + " found in META, but not in HDFS, " +
1989 "and deployed on " + Joiner.on(", ").join(hbi.deployedOn));
1990
1991
1992
1993 if (shouldFixAssignments()) {
1994 errors.print("Trying to fix unassigned region...");
1995 undeployRegions(hbi);
1996 }
1997 if (shouldFixMeta()) {
1998
1999 deleteMetaRegion(hbi);
2000 }
2001 } else if (inMeta && inHdfs && !isDeployed && shouldBeDeployed) {
2002 errors.reportError(ERROR_CODE.NOT_DEPLOYED, "Region " + descriptiveName
2003 + " not deployed on any region server.");
2004 tryAssignmentRepair(hbi, "Trying to fix unassigned region...");
2005 } else if (inMeta && inHdfs && isDeployed && !shouldBeDeployed) {
2006 errors.reportError(ERROR_CODE.SHOULD_NOT_BE_DEPLOYED,
2007 "Region " + descriptiveName + " should not be deployed according " +
2008 "to META, but is deployed on " + Joiner.on(", ").join(hbi.deployedOn));
2009 if (shouldFixAssignments()) {
2010 errors.print("Trying to close the region " + descriptiveName);
2011 setShouldRerun();
2012 HBaseFsckRepair.fixMultiAssignment(admin, hbi.metaEntry, hbi.deployedOn);
2013 }
2014 } else if (inMeta && inHdfs && isMultiplyDeployed) {
2015 errors.reportError(ERROR_CODE.MULTI_DEPLOYED, "Region " + descriptiveName
2016 + " is listed in hbase:meta on region server " + hbi.metaEntry.regionServer
2017 + " but is multiply assigned to region servers " +
2018 Joiner.on(", ").join(hbi.deployedOn));
2019
2020 if (shouldFixAssignments()) {
2021 errors.print("Trying to fix assignment error...");
2022 setShouldRerun();
2023 HBaseFsckRepair.fixMultiAssignment(admin, hbi.metaEntry, hbi.deployedOn);
2024 }
2025 } else if (inMeta && inHdfs && isDeployed && !deploymentMatchesMeta) {
2026 errors.reportError(ERROR_CODE.SERVER_DOES_NOT_MATCH_META, "Region "
2027 + descriptiveName + " listed in hbase:meta on region server " +
2028 hbi.metaEntry.regionServer + " but found on region server " +
2029 hbi.deployedOn.get(0));
2030
2031 if (shouldFixAssignments()) {
2032 errors.print("Trying to fix assignment error...");
2033 setShouldRerun();
2034 HBaseFsckRepair.fixMultiAssignment(admin, hbi.metaEntry, hbi.deployedOn);
2035 HBaseFsckRepair.waitUntilAssigned(admin, hbi.getHdfsHRI());
2036 }
2037 } else {
2038 errors.reportError(ERROR_CODE.UNKNOWN, "Region " + descriptiveName +
2039 " is in an unforeseen state:" +
2040 " inMeta=" + inMeta +
2041 " inHdfs=" + inHdfs +
2042 " isDeployed=" + isDeployed +
2043 " isMultiplyDeployed=" + isMultiplyDeployed +
2044 " deploymentMatchesMeta=" + deploymentMatchesMeta +
2045 " shouldBeDeployed=" + shouldBeDeployed);
2046 }
2047 }
2048
2049
2050
2051
2052
2053
2054
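/**
 * Checks tables integrity. Groups every region collected so far into a per-table
 * TableInfo (skipping entries that are offline, only contain edits, have no assigned
 * region server or are not deployed anywhere), adds tables that have no regions at all,
 * and then runs a region chain check on each table.
 * @return a map from table name to the TableInfo that was examined
 */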
2055 SortedMap<TableName, TableInfo> checkIntegrity() throws IOException {
2056 tablesInfo = new TreeMap<TableName, TableInfo>();
2057 List<HbckInfo> noHDFSRegionInfos = new ArrayList<HbckInfo>();
2058 LOG.debug("There are " + regionInfoMap.size() + " region info entries");
2059 for (HbckInfo hbi : regionInfoMap.values()) {
2060
2061 if (hbi.metaEntry == null) {
2062
2063 noHDFSRegionInfos.add(hbi);
2064 Path p = hbi.getHdfsRegionDir();
2065 if (p == null) {
2066 errors.report("No regioninfo in Meta or HDFS. " + hbi);
2067 }
2068
2069
2070 continue;
2071 }
2072 if (hbi.metaEntry.regionServer == null) {
2073 errors.detail("Skipping region because no region server: " + hbi);
2074 continue;
2075 }
2076 if (hbi.metaEntry.isOffline()) {
2077 errors.detail("Skipping region because it is offline: " + hbi);
2078 continue;
2079 }
2080 if (hbi.containsOnlyHdfsEdits()) {
2081 errors.detail("Skipping region because it only contains edits: " + hbi);
2082 continue;
2083 }
2084
2085
2086
2087
2088
2089
2090 if (hbi.deployedOn.size() == 0) continue;
2091
2092
2093 TableName tableName = hbi.metaEntry.getTable();
2094 TableInfo modTInfo = tablesInfo.get(tableName);
2095 if (modTInfo == null) {
2096 modTInfo = new TableInfo(tableName);
2097 }
2098 for (ServerName server : hbi.deployedOn) {
2099 modTInfo.addServer(server);
2100 }
2101
2102 if (!hbi.isSkipChecks()) {
2103 modTInfo.addRegionInfo(hbi);
2104 }
2105
2106 tablesInfo.put(tableName, modTInfo);
2107 }
2108
2109 loadTableInfosForTablesWithNoRegion();
2110
2111 for (TableInfo tInfo : tablesInfo.values()) {
2112 TableIntegrityErrorHandler handler = tInfo.new IntegrityFixSuggester(tInfo, errors);
2113 if (!tInfo.checkRegionChain(handler)) {
2114 errors.report("Found inconsistency in table " + tInfo.getName());
2115 }
2116 }
2117 return tablesInfo;
2118 }
2119
2120
2121
2122
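/**
 * Adds a TableInfo entry for every table that has a descriptor on disk but no regions
 * reported so far, so that such tables still show up in the integrity results.
 */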
2123 private void loadTableInfosForTablesWithNoRegion() throws IOException {
2124 Map<String, HTableDescriptor> allTables = new FSTableDescriptors(getConf()).getAll();
2125 for (HTableDescriptor htd : allTables.values()) {
2126 if (checkMetaOnly && !htd.isMetaTable()) {
2127 continue;
2128 }
2129
2130 TableName tableName = htd.getTableName();
2131 if (isTableIncluded(tableName) && !tablesInfo.containsKey(tableName)) {
2132 TableInfo tableInfo = new TableInfo(tableName);
2133 tableInfo.htds.add(htd);
2134 tablesInfo.put(htd.getTableName(), tableInfo);
2135 }
2136 }
2137 }
2138
2139
2140
2141
2142
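/**
 * Merges the contents of an overlapping region directory into the target region
 * directory: every store file of the contained region is moved into the matching
 * column family directory of the target (the .regioninfo file and old log dir are
 * skipped), and the contained region dir is sidelined afterwards.
 * @return the number of files that were moved
 */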
2143 public int mergeRegionDirs(Path targetRegionDir, HbckInfo contained) throws IOException {
2144 int fileMoves = 0;
2145 String thread = Thread.currentThread().getName();
2146 LOG.debug("[" + thread + "] Contained region dir after close and pause");
2147 debugLsr(contained.getHdfsRegionDir());
2148
2149
2150 FileSystem fs = targetRegionDir.getFileSystem(getConf());
2151 FileStatus[] dirs = null;
2152 try {
2153 dirs = fs.listStatus(contained.getHdfsRegionDir());
2154 } catch (FileNotFoundException fnfe) {
2155
2156
2157 if (!fs.exists(contained.getHdfsRegionDir())) {
2158 LOG.warn("[" + thread + "] HDFS region dir " + contained.getHdfsRegionDir()
2159 + " is missing. Assuming already sidelined or moved.");
2160 } else {
2161 sidelineRegionDir(fs, contained);
2162 }
2163 return fileMoves;
2164 }
2165
2166 if (dirs == null) {
2167 if (!fs.exists(contained.getHdfsRegionDir())) {
2168 LOG.warn("[" + thread + "] HDFS region dir " + contained.getHdfsRegionDir()
2169 + " already sidelined.");
2170 } else {
2171 sidelineRegionDir(fs, contained);
2172 }
2173 return fileMoves;
2174 }
2175
2176 for (FileStatus cf : dirs) {
2177 Path src = cf.getPath();
2178 Path dst = new Path(targetRegionDir, src.getName());
2179
2180 if (src.getName().equals(HRegionFileSystem.REGION_INFO_FILE)) {
2181
2182 continue;
2183 }
2184
2185 if (src.getName().equals(HConstants.HREGION_OLDLOGDIR_NAME)) {
2186
2187 continue;
2188 }
2189
2190 LOG.info("[" + thread + "] Moving files from " + src + " into containing region " + dst);
2191
2192
2193
2194
2195 for (FileStatus hfile : fs.listStatus(src)) {
2196 boolean success = fs.rename(hfile.getPath(), dst);
2197 if (success) {
2198 fileMoves++;
2199 }
2200 }
2201 LOG.debug("[" + thread + "] Sideline directory contents:");
2202 debugLsr(targetRegionDir);
2203 }
2204
2205
2206 sidelineRegionDir(fs, contained);
2207 LOG.info("[" + thread + "] Sidelined region dir " + contained.getHdfsRegionDir() + " into " +
2208 getSidelineDir());
2209 debugLsr(contained.getHdfsRegionDir());
2210
2211 return fileMoves;
2212 }
2213
2214
2215 static class WorkItemOverlapMerge implements Callable<Void> {
2216 private TableIntegrityErrorHandler handler;
2217 Collection<HbckInfo> overlapgroup;
2218
2219 WorkItemOverlapMerge(Collection<HbckInfo> overlapgroup, TableIntegrityErrorHandler handler) {
2220 this.handler = handler;
2221 this.overlapgroup = overlapgroup;
2222 }
2223
2224 @Override
2225 public Void call() throws Exception {
2226 handler.handleOverlapGroup(overlapgroup);
2227 return null;
2228 }
2229 };
2230
2231
2232
2233
2234
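/**
 * Maintains the information hbck gathers about one table: the descriptors seen for it,
 * the servers it is deployed on, its regions (fed into a RegionSplitCalculator), any
 * backwards regions, the overlap groups found, and regions that had to be sidelined.
 */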
2235 public class TableInfo {
2236 TableName tableName;
2237 TreeSet <ServerName> deployedOn;
2238
2239
2240 final List<HbckInfo> backwards = new ArrayList<HbckInfo>();
2241
2242
2243 final Map<Path, HbckInfo> sidelinedRegions = new HashMap<Path, HbckInfo>();
2244
2245
2246 final RegionSplitCalculator<HbckInfo> sc = new RegionSplitCalculator<HbckInfo>(cmp);
2247
2248
2249 final Set<HTableDescriptor> htds = new HashSet<HTableDescriptor>();
2250
2251
2252 final Multimap<byte[], HbckInfo> overlapGroups =
2253 TreeMultimap.create(RegionSplitCalculator.BYTES_COMPARATOR, cmp);
2254
2255
2256 final List<HRegionInfo> regionsFromMeta = new ArrayList<HRegionInfo>();
2257
2258 TableInfo(TableName name) {
2259 this.tableName = name;
2260 deployedOn = new TreeSet <ServerName>();
2261 }
2262
2263
2264
2265
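/** @return the table descriptor, or null when none or more than one descriptor was found */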
2266 private HTableDescriptor getHTD() {
2267 if (htds.size() == 1) {
2268 return (HTableDescriptor)htds.toArray()[0];
2269 } else {
2270 LOG.error("None/Multiple table descriptors found for table '"
2271 + tableName + "' regions: " + htds);
2272 }
2273 return null;
2274 }
2275
2276 public void addRegionInfo(HbckInfo hir) {
2277 if (Bytes.equals(hir.getEndKey(), HConstants.EMPTY_END_ROW)) {
2278
2279 sc.add(hir);
2280 return;
2281 }
2282
2283
2284 if (Bytes.compareTo(hir.getStartKey(), hir.getEndKey()) > 0) {
2285 errors.reportError(
2286 ERROR_CODE.REGION_CYCLE,
2287 String.format("The endkey for this region comes before the "
2288 + "startkey, startkey=%s, endkey=%s",
2289 Bytes.toStringBinary(hir.getStartKey()),
2290 Bytes.toStringBinary(hir.getEndKey())), this, hir);
2291 backwards.add(hir);
2292 return;
2293 }
2294
2295
2296 sc.add(hir);
2297 }
2298
2299 public void addServer(ServerName server) {
2300 this.deployedOn.add(server);
2301 }
2302
2303 public TableName getName() {
2304 return tableName;
2305 }
2306
2307 public int getNumRegions() {
2308 return sc.getStarts().size() + backwards.size();
2309 }
2310
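/**
 * Default handler for table integrity errors: it reports each problem found by
 * checkRegionChain together with a hint on how to repair it, but changes nothing itself.
 */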
2311 private class IntegrityFixSuggester extends TableIntegrityErrorHandlerImpl {
2312 ErrorReporter errors;
2313
2314 IntegrityFixSuggester(TableInfo ti, ErrorReporter errors) {
2315 this.errors = errors;
2316 setTableInfo(ti);
2317 }
2318
2319 @Override
2320 public void handleRegionStartKeyNotEmpty(HbckInfo hi) throws IOException{
2321 errors.reportError(ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY,
2322 "First region should start with an empty key. You need to "
2323 + "create a new region and regioninfo in HDFS to plug the hole.",
2324 getTableInfo(), hi);
2325 }
2326
2327 @Override
2328 public void handleRegionEndKeyNotEmpty(byte[] curEndKey) throws IOException {
2329 errors.reportError(ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY,
2330 "Last region should end with an empty key. You need to "
2331 + "create a new region and regioninfo in HDFS to plug the hole.", getTableInfo());
2332 }
2333
2334 @Override
2335 public void handleDegenerateRegion(HbckInfo hi) throws IOException{
2336 errors.reportError(ERROR_CODE.DEGENERATE_REGION,
2337 "Region has the same start and end key.", getTableInfo(), hi);
2338 }
2339
2340 @Override
2341 public void handleDuplicateStartKeys(HbckInfo r1, HbckInfo r2) throws IOException{
2342 byte[] key = r1.getStartKey();
2343
2344 errors.reportError(ERROR_CODE.DUPE_STARTKEYS,
2345 "Multiple regions have the same startkey: "
2346 + Bytes.toStringBinary(key), getTableInfo(), r1);
2347 errors.reportError(ERROR_CODE.DUPE_STARTKEYS,
2348 "Multiple regions have the same startkey: "
2349 + Bytes.toStringBinary(key), getTableInfo(), r2);
2350 }
2351
2352 @Override
2353 public void handleOverlapInRegionChain(HbckInfo hi1, HbckInfo hi2) throws IOException{
2354 errors.reportError(ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
2355 "There is an overlap in the region chain.",
2356 getTableInfo(), hi1, hi2);
2357 }
2358
2359 @Override
2360 public void handleHoleInRegionChain(byte[] holeStart, byte[] holeStop) throws IOException{
2361 errors.reportError(
2362 ERROR_CODE.HOLE_IN_REGION_CHAIN,
2363 "There is a hole in the region chain between "
2364 + Bytes.toStringBinary(holeStart) + " and "
2365 + Bytes.toStringBinary(holeStop)
2366 + ". You need to create a new .regioninfo and region "
2367 + "dir in hdfs to plug the hole.");
2368 }
2369 };
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
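/**
 * Handler that repairs table integrity errors on the HDFS side: holes at the start, end
 * or middle of the region chain are plugged by creating new empty regions, and overlap
 * groups are merged or, when too large, partially sidelined.
 */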
2383 private class HDFSIntegrityFixer extends IntegrityFixSuggester {
2384 Configuration conf;
2385
2386 boolean fixOverlaps = true;
2387
2388 HDFSIntegrityFixer(TableInfo ti, ErrorReporter errors, Configuration conf,
2389 boolean fixHoles, boolean fixOverlaps) {
2390 super(ti, errors);
2391 this.conf = conf;
2392 this.fixOverlaps = fixOverlaps;
2393
2394 }
2395
2396
2397
2398
2399
2400
2401 @Override
2402 public void handleRegionStartKeyNotEmpty(HbckInfo next) throws IOException {
2403 errors.reportError(ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY,
2404 "First region should start with an empty key. Creating a new " +
2405 "region and regioninfo in HDFS to plug the hole.",
2406 getTableInfo(), next);
2407 HTableDescriptor htd = getTableInfo().getHTD();
2408
2409 HRegionInfo newRegion = new HRegionInfo(htd.getTableName(),
2410 HConstants.EMPTY_START_ROW, next.getStartKey());
2411
2412
2413 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2414 LOG.info("Table region start key was not empty. Created new empty region: "
2415 + newRegion + " " +region);
2416 fixes++;
2417 }
2418
2419 @Override
2420 public void handleRegionEndKeyNotEmpty(byte[] curEndKey) throws IOException {
2421 errors.reportError(ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY,
2422 "Last region should end with an empty key. Creating a new "
2423 + "region and regioninfo in HDFS to plug the hole.", getTableInfo());
2424 HTableDescriptor htd = getTableInfo().getHTD();
2425
2426 HRegionInfo newRegion = new HRegionInfo(htd.getTableName(), curEndKey,
2427 HConstants.EMPTY_START_ROW);
2428
2429 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2430 LOG.info("Table region end key was not empty. Created new empty region: " + newRegion
2431 + " " + region);
2432 fixes++;
2433 }
2434
2435
2436
2437
2438
2439 @Override
2440 public void handleHoleInRegionChain(byte[] holeStartKey, byte[] holeStopKey) throws IOException {
2441 errors.reportError(
2442 ERROR_CODE.HOLE_IN_REGION_CHAIN,
2443 "There is a hole in the region chain between "
2444 + Bytes.toStringBinary(holeStartKey) + " and "
2445 + Bytes.toStringBinary(holeStopKey)
2446 + ". Creating a new regioninfo and region "
2447 + "dir in hdfs to plug the hole.");
2448 HTableDescriptor htd = getTableInfo().getHTD();
2449 HRegionInfo newRegion = new HRegionInfo(htd.getTableName(), holeStartKey, holeStopKey);
2450 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2451 LOG.info("Plugged hole by creating new empty region: " + newRegion + " " + region);
2452 fixes++;
2453 }
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
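/**
 * Repairs a group of overlapping regions. Groups larger than maxMerge are not merged;
 * instead their biggest members may be sidelined when sidelineBigOverlaps is enabled.
 */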
2466 @Override
2467 public void handleOverlapGroup(Collection<HbckInfo> overlap)
2468 throws IOException {
2469 Preconditions.checkNotNull(overlap);
2470 Preconditions.checkArgument(overlap.size() > 0);
2471
2472 if (!this.fixOverlaps) {
2473 LOG.warn("Not attempting to repair overlaps.");
2474 return;
2475 }
2476
2477 if (overlap.size() > maxMerge) {
2478 LOG.warn("Overlap group has " + overlap.size() + " overlapping " +
2479 "regions which is greater than " + maxMerge + ", the max number of regions to merge");
2480 if (sidelineBigOverlaps) {
2481
2482 sidelineBigOverlaps(overlap);
2483 }
2484 return;
2485 }
2486
2487 mergeOverlaps(overlap);
2488 }
2489
2490 void mergeOverlaps(Collection<HbckInfo> overlap)
2491 throws IOException {
2492 String thread = Thread.currentThread().getName();
2493 LOG.info("== [" + thread + "] Merging regions into one region: "
2494 + Joiner.on(",").join(overlap));
2495
2496 Pair<byte[], byte[]> range = null;
2497 for (HbckInfo hi : overlap) {
2498 if (range == null) {
2499 range = new Pair<byte[], byte[]>(hi.getStartKey(), hi.getEndKey());
2500 } else {
2501 if (RegionSplitCalculator.BYTES_COMPARATOR
2502 .compare(hi.getStartKey(), range.getFirst()) < 0) {
2503 range.setFirst(hi.getStartKey());
2504 }
2505 if (RegionSplitCalculator.BYTES_COMPARATOR
2506 .compare(hi.getEndKey(), range.getSecond()) > 0) {
2507 range.setSecond(hi.getEndKey());
2508 }
2509 }
2510
2511 LOG.debug("[" + thread + "] Closing region before moving data around: " + hi);
2512 LOG.debug("[" + thread + "] Contained region dir before close");
2513 debugLsr(hi.getHdfsRegionDir());
2514 try {
2515 LOG.info("[" + thread + "] Closing region: " + hi);
2516 closeRegion(hi);
2517 } catch (IOException ioe) {
2518 LOG.warn("[" + thread + "] Was unable to close region " + hi
2519 + ". Just continuing... ", ioe);
2520 } catch (InterruptedException e) {
2521 LOG.warn("[" + thread + "] Was unable to close region " + hi
2522 + ". Just continuing... ", e);
2523 }
2524
2525 try {
2526 LOG.info("[" + thread + "] Offlining region: " + hi);
2527 offline(hi.getRegionName());
2528 } catch (IOException ioe) {
2529 LOG.warn("[" + thread + "] Unable to offline region from master: " + hi
2530 + ". Just continuing... ", ioe);
2531 }
2532 }
2533
2534
2535 HTableDescriptor htd = getTableInfo().getHTD();
2536
2537 HRegionInfo newRegion = new HRegionInfo(htd.getTableName(), range.getFirst(),
2538 range.getSecond());
2539 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2540 LOG.info("[" + thread + "] Created new empty container region: " +
2541 newRegion + " to contain regions: " + Joiner.on(",").join(overlap));
2542 debugLsr(region.getRegionFileSystem().getRegionDir());
2543
2544
2545 boolean didFix = false;
2546 Path target = region.getRegionFileSystem().getRegionDir();
2547 for (HbckInfo contained : overlap) {
2548 LOG.info("[" + thread + "] Merging " + contained + " into " + target );
2549 int merges = mergeRegionDirs(target, contained);
2550 if (merges > 0) {
2551 didFix = true;
2552 }
2553 }
2554 if (didFix) {
2555 fixes++;
2556 }
2557 }
2558
2559
2560
2561
2562
2563
2564
2565
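/**
 * Sidelines the largest regions of an overlap group that is too big to merge, limited by
 * maxOverlapsToSideline. Sidelined region dirs have to be bulk loaded back afterwards.
 */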
2566 void sidelineBigOverlaps(
2567 Collection<HbckInfo> bigOverlap) throws IOException {
2568 int overlapsToSideline = bigOverlap.size() - maxMerge;
2569 if (overlapsToSideline > maxOverlapsToSideline) {
2570 overlapsToSideline = maxOverlapsToSideline;
2571 }
2572 List<HbckInfo> regionsToSideline =
2573 RegionSplitCalculator.findBigRanges(bigOverlap, overlapsToSideline);
2574 FileSystem fs = FileSystem.get(conf);
2575 for (HbckInfo regionToSideline: regionsToSideline) {
2576 try {
2577 LOG.info("Closing region: " + regionToSideline);
2578 closeRegion(regionToSideline);
2579 } catch (IOException ioe) {
2580 LOG.warn("Was unable to close region " + regionToSideline
2581 + ". Just continuing... ", ioe);
2582 } catch (InterruptedException e) {
2583 LOG.warn("Was unable to close region " + regionToSideline
2584 + ". Just continuing... ", e);
2585 }
2586
2587 try {
2588 LOG.info("Offlining region: " + regionToSideline);
2589 offline(regionToSideline.getRegionName());
2590 } catch (IOException ioe) {
2591 LOG.warn("Unable to offline region from master: " + regionToSideline
2592 + ". Just continuing... ", ioe);
2593 }
2594
2595 LOG.info("Before sidelining big overlapped region: " + regionToSideline.toString());
2596 Path sidelineRegionDir = sidelineRegionDir(fs, TO_BE_LOADED, regionToSideline);
2597 if (sidelineRegionDir != null) {
2598 sidelinedRegions.put(sidelineRegionDir, regionToSideline);
2599 LOG.info("After sidelining big overlapped region: "
2600 + regionToSideline.getRegionNameAsString()
2601 + " to " + sidelineRegionDir.toString());
2602 fixes++;
2603 }
2604 }
2605 }
2606 }
2607
2608
2609
2610
2611
2612
2613
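/**
 * Checks the region chain of this table (start/end key coverage) for problems: a first
 * region whose start key is not empty, a last region whose end key is not empty,
 * degenerate regions, duplicate start keys, overlaps and holes. Each problem is handed
 * to the given handler, which may just report it or actively repair it.
 * @return true when no new errors were reported while checking this table
 */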
2614 public boolean checkRegionChain(TableIntegrityErrorHandler handler) throws IOException {
2615
2616
2617
2618 if (disabledTables.contains(this.tableName)) {
2619 return true;
2620 }
2621 int originalErrorsCount = errors.getErrorList().size();
2622 Multimap<byte[], HbckInfo> regions = sc.calcCoverage();
2623 SortedSet<byte[]> splits = sc.getSplits();
2624
2625 byte[] prevKey = null;
2626 byte[] problemKey = null;
2627
2628 if (splits.size() == 0) {
2629
2630 handler.handleHoleInRegionChain(HConstants.EMPTY_START_ROW, HConstants.EMPTY_END_ROW);
2631 }
2632
2633 for (byte[] key : splits) {
2634 Collection<HbckInfo> ranges = regions.get(key);
2635 if (prevKey == null && !Bytes.equals(key, HConstants.EMPTY_BYTE_ARRAY)) {
2636 for (HbckInfo rng : ranges) {
2637 handler.handleRegionStartKeyNotEmpty(rng);
2638 }
2639 }
2640
2641
2642 for (HbckInfo rng : ranges) {
2643
2644 byte[] endKey = rng.getEndKey();
2645 endKey = (endKey.length == 0) ? null : endKey;
2646 if (Bytes.equals(rng.getStartKey(),endKey)) {
2647 handler.handleDegenerateRegion(rng);
2648 }
2649 }
2650
2651 if (ranges.size() == 1) {
2652
2653 if (problemKey != null) {
2654 LOG.warn("reached end of problem group: " + Bytes.toStringBinary(key));
2655 }
2656 problemKey = null;
2657 } else if (ranges.size() > 1) {
2658
2659
2660 if (problemKey == null) {
2661
2662 LOG.warn("Naming new problem group: " + Bytes.toStringBinary(key));
2663 problemKey = key;
2664 }
2665 overlapGroups.putAll(problemKey, ranges);
2666
2667
2668 ArrayList<HbckInfo> subRange = new ArrayList<HbckInfo>(ranges);
2669
2670 for (HbckInfo r1 : ranges) {
2671 subRange.remove(r1);
2672 for (HbckInfo r2 : subRange) {
2673 if (Bytes.compareTo(r1.getStartKey(), r2.getStartKey())==0) {
2674 handler.handleDuplicateStartKeys(r1,r2);
2675 } else {
2676
2677 handler.handleOverlapInRegionChain(r1, r2);
2678 }
2679 }
2680 }
2681
2682 } else if (ranges.size() == 0) {
2683 if (problemKey != null) {
2684 LOG.warn("reached end of problem group: " + Bytes.toStringBinary(key));
2685 }
2686 problemKey = null;
2687
2688 byte[] holeStopKey = sc.getSplits().higher(key);
2689
2690 if (holeStopKey != null) {
2691
2692 handler.handleHoleInRegionChain(key, holeStopKey);
2693 }
2694 }
2695 prevKey = key;
2696 }
2697
2698
2699
2700 if (prevKey != null) {
2701 handler.handleRegionEndKeyNotEmpty(prevKey);
2702 }
2703
2704
2705 if (getConf().getBoolean("hbasefsck.overlap.merge.parallel", true)) {
2706 LOG.info("Handling overlap merges in parallel. Set hbasefsck.overlap.merge.parallel to" +
2707 " false to run serially.");
2708 boolean ok = handleOverlapsParallel(handler, prevKey);
2709 if (!ok) {
2710 return false;
2711 }
2712 } else {
2713 LOG.info("Handling overlap merges serially. Set hbasefsck.overlap.merge.parallel to" +
2714 " true to run in parallel.");
2715 for (Collection<HbckInfo> overlap : overlapGroups.asMap().values()) {
2716 handler.handleOverlapGroup(overlap);
2717 }
2718 }
2719
2720 if (details) {
2721
2722 errors.print("---- Table '" + this.tableName
2723 + "': region split map");
2724 dump(splits, regions);
2725 errors.print("---- Table '" + this.tableName
2726 + "': overlap groups");
2727 dumpOverlapProblems(overlapGroups);
2728 errors.print("There are " + overlapGroups.keySet().size()
2729 + " overlap groups with " + overlapGroups.size()
2730 + " overlapping regions");
2731 }
2732 if (!sidelinedRegions.isEmpty()) {
2733 LOG.warn("Sidelined big overlapped regions, please bulk load them!");
2734 errors.print("---- Table '" + this.tableName
2735 + "': sidelined big overlapped regions");
2736 dumpSidelinedRegions(sidelinedRegions);
2737 }
2738 return errors.getErrorList().size() == originalErrorsCount;
2739 }
2740
2741 private boolean handleOverlapsParallel(TableIntegrityErrorHandler handler, byte[] prevKey)
2742 throws IOException {
2743
2744
2745 List<WorkItemOverlapMerge> merges = new ArrayList<WorkItemOverlapMerge>(overlapGroups.size());
2746 List<Future<Void>> rets;
2747 for (Collection<HbckInfo> overlap : overlapGroups.asMap().values()) {
2748
2749 merges.add(new WorkItemOverlapMerge(overlap, handler));
2750 }
2751 try {
2752 rets = executor.invokeAll(merges);
2753 } catch (InterruptedException e) {
2754 LOG.error("Overlap merges were interrupted", e);
2755 return false;
2756 }
2757 for(int i=0; i<merges.size(); i++) {
2758 WorkItemOverlapMerge work = merges.get(i);
2759 Future<Void> f = rets.get(i);
2760 try {
2761 f.get();
2762 } catch(ExecutionException e) {
2763 LOG.warn("Failed to merge overlap group " + work, e.getCause());
2764 } catch (InterruptedException e) {
2765 LOG.error("Waiting for overlap merges was interrupted", e);
2766 return false;
2767 }
2768 }
2769 return true;
2770 }
2771
2772
2773
2774
2775
2776
2777
2778 void dump(SortedSet<byte[]> splits, Multimap<byte[], HbckInfo> regions) {
2779
2780 StringBuilder sb = new StringBuilder();
2781 for (byte[] k : splits) {
2782 sb.setLength(0);
2783 sb.append(Bytes.toStringBinary(k) + ":\t");
2784 for (HbckInfo r : regions.get(k)) {
2785 sb.append("[ "+ r.toString() + ", "
2786 + Bytes.toStringBinary(r.getEndKey())+ "]\t");
2787 }
2788 errors.print(sb.toString());
2789 }
2790 }
2791 }
2792
2793 public void dumpOverlapProblems(Multimap<byte[], HbckInfo> regions) {
2794
2795
2796 for (byte[] k : regions.keySet()) {
2797 errors.print(Bytes.toStringBinary(k) + ":");
2798 for (HbckInfo r : regions.get(k)) {
2799 errors.print("[ " + r.toString() + ", "
2800 + Bytes.toStringBinary(r.getEndKey()) + "]");
2801 }
2802 errors.print("----");
2803 }
2804 }
2805
2806 public void dumpSidelinedRegions(Map<Path, HbckInfo> regions) {
2807 for (Map.Entry<Path, HbckInfo> entry: regions.entrySet()) {
2808 TableName tableName = entry.getValue().getTableName();
2809 Path path = entry.getKey();
2810 errors.print("This sidelined region dir should be bulk loaded: "
2811 + path.toString());
2812 errors.print("Bulk load command looks like: "
2813 + "hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles "
2814 + path.toUri().getPath() + " "+ tableName);
2815 }
2816 }
2817
2818 public Multimap<byte[], HbckInfo> getOverlapGroups(
2819 TableName table) {
2820 TableInfo ti = tablesInfo.get(table);
2821 return ti.overlapGroups;
2822 }
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
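/**
 * Returns the descriptors of user tables whose hbase:meta entries have not been updated
 * within the configured time lag; tables modified more recently are skipped and counted
 * in numSkipped.
 */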
2833 HTableDescriptor[] getTables(AtomicInteger numSkipped) {
2834 List<TableName> tableNames = new ArrayList<TableName>();
2835 long now = System.currentTimeMillis();
2836
2837 for (HbckInfo hbi : regionInfoMap.values()) {
2838 MetaEntry info = hbi.metaEntry;
2839
2840
2841
2842 if (info != null && info.getStartKey().length == 0 && !info.isMetaRegion()) {
2843 if (info.modTime + timelag < now) {
2844 tableNames.add(info.getTable());
2845 } else {
2846 numSkipped.incrementAndGet();
2847 }
2848 }
2849 }
2850 return getHTableDescriptors(tableNames);
2851 }
2852
2853 HTableDescriptor[] getHTableDescriptors(List<TableName> tableNames) {
2854 HTableDescriptor[] htd = new HTableDescriptor[0];
2855 try {
2856 LOG.info("getHTableDescriptors == tableNames => " + tableNames);
2857 htd = new HBaseAdmin(getConf()).getTableDescriptorsByTableName(tableNames);
2858 } catch (IOException e) {
2859 LOG.debug("Exception getting table descriptors", e);
2860 }
2861 return htd;
2862 }
2863
2864
2865
2866
2867
2868
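/**
 * Gets the HbckInfo in regionInfoMap for the given encoded region name, creating an
 * empty entry if none is present yet.
 */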
2869 private synchronized HbckInfo getOrCreateInfo(String name) {
2870 HbckInfo hbi = regionInfoMap.get(name);
2871 if (hbi == null) {
2872 hbi = new HbckInfo(null);
2873 regionInfoMap.put(name, hbi);
2874 }
2875 return hbi;
2876 }
2877
2878 private void checkAndFixTableLocks() throws IOException {
2879 TableLockChecker checker = new TableLockChecker(createZooKeeperWatcher(), errors);
2880 checker.checkTableLocks();
2881
2882 if (this.fixTableLocks) {
2883 checker.fixExpiredTableLocks();
2884 }
2885 }
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
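/**
 * Checks that hbase:meta is deployed on exactly one region server. When it is unassigned
 * or multiply assigned and -fixAssignments is enabled, an assignment repair is attempted
 * and a rerun is scheduled.
 * @return true when hbase:meta is assigned to exactly one server
 */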
2896 boolean checkMetaRegion() throws IOException, KeeperException, InterruptedException {
2897 List<HbckInfo> metaRegions = Lists.newArrayList();
2898 for (HbckInfo value : regionInfoMap.values()) {
2899 if (value.metaEntry != null && value.metaEntry.isMetaRegion()) {
2900 metaRegions.add(value);
2901 }
2902 }
2903
2904
2905
2906 List<ServerName> servers = new ArrayList<ServerName>();
2907 HbckInfo metaHbckInfo = null;
2908 if (!metaRegions.isEmpty()) {
2909 metaHbckInfo = metaRegions.get(0);
2910 servers = metaHbckInfo.deployedOn;
2911 }
2912 if (servers.size() != 1) {
2913 if (servers.size() == 0) {
2914 errors.reportError(ERROR_CODE.NO_META_REGION, "hbase:meta is not deployed on any region server.");
2915 if (shouldFixAssignments()) {
2916 errors.print("Trying to fix a problem with hbase:meta..");
2917 setShouldRerun();
2918
2919 HBaseFsckRepair.fixUnassigned(admin, HRegionInfo.FIRST_META_REGIONINFO);
2920 HBaseFsckRepair.waitUntilAssigned(admin, HRegionInfo.FIRST_META_REGIONINFO);
2921 }
2922 } else if (servers.size() > 1) {
2923 errors.reportError(ERROR_CODE.MULTI_META_REGION,
2924 "hbase:meta is deployed on more than one region server.");
2925 if (shouldFixAssignments()) {
2926 if (metaHbckInfo == null) {
2927 errors.print(
2928 "Unable to fix problem with hbase:meta due to hbase:meta region info missing");
2929 return false;
2930 }
2931 errors.print("Trying to fix a problem with hbase:meta..");
2932 setShouldRerun();
2933
2934 HBaseFsckRepair.fixMultiAssignment(admin, metaHbckInfo.metaEntry, servers);
2935 }
2936 }
2937
2938 return false;
2939 }
2940
2941 return true;
2942 }
2943
2944
2945
2946
2947
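/**
 * Scans hbase:meta and records a MetaEntry for every region of the included tables,
 * including daughter information for split parents; regions referenced as merge parents
 * are flagged as merged. Rows with an empty REGIONINFO_QUALIFIER are reported as errors.
 */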
2948 boolean loadMetaEntries() throws IOException {
2949 MetaScannerVisitor visitor = new MetaScannerVisitorBase() {
2950 int countRecord = 1;
2951
2952
2953 final Comparator<Cell> comp = new Comparator<Cell>() {
2954 @Override
2955 public int compare(Cell k1, Cell k2) {
2956 return k1.getTimestamp() < k2.getTimestamp() ? -1 : (k1.getTimestamp() > k2.getTimestamp() ? 1 : 0);
2957 }
2958 };
2959
2960 @Override
2961 public boolean processRow(Result result) throws IOException {
2962 try {
2963
2964
2965 long ts = Collections.max(result.listCells(), comp).getTimestamp();
2966 Pair<HRegionInfo, ServerName> pair = HRegionInfo.getHRegionInfoAndServerName(result);
2967 if (pair == null || pair.getFirst() == null) {
2968 emptyRegionInfoQualifiers.add(result);
2969 errors.reportError(ERROR_CODE.EMPTY_META_CELL,
2970 "Empty REGIONINFO_QUALIFIER found in hbase:meta");
2971 return true;
2972 }
2973 ServerName sn = null;
2974 if (pair.getSecond() != null) {
2975 sn = pair.getSecond();
2976 }
2977 HRegionInfo hri = pair.getFirst();
2978 if (!(isTableIncluded(hri.getTable())
2979 || hri.isMetaRegion())) {
2980 return true;
2981 }
2982 PairOfSameType<HRegionInfo> daughters = HRegionInfo.getDaughterRegions(result);
2983 MetaEntry m = new MetaEntry(hri, sn, ts, daughters.getFirst(), daughters.getSecond());
2984 HbckInfo previous = regionInfoMap.get(hri.getEncodedName());
2985 if (previous == null) {
2986 regionInfoMap.put(hri.getEncodedName(), new HbckInfo(m));
2987 } else if (previous.metaEntry == null) {
2988 previous.metaEntry = m;
2989 } else {
2990 throw new IOException("Two entries in hbase:meta are the same: " + previous);
2991 }
2992
2993 PairOfSameType<HRegionInfo> mergeRegions = HRegionInfo.getMergeRegions(result);
2994 for (HRegionInfo mergeRegion : new HRegionInfo[] {
2995 mergeRegions.getFirst(), mergeRegions.getSecond() }) {
2996 if (mergeRegion != null) {
2997
2998 HbckInfo hbInfo = getOrCreateInfo(mergeRegion.getEncodedName());
2999 hbInfo.setMerged(true);
3000 }
3001 }
3002
3003
3004 if (countRecord % 100 == 0) {
3005 errors.progress();
3006 }
3007 countRecord++;
3008 return true;
3009 } catch (RuntimeException e) {
3010 LOG.error("Result=" + result);
3011 throw e;
3012 }
3013 }
3014 };
3015 if (!checkMetaOnly) {
3016
3017 MetaScanner.metaScan(getConf(), visitor);
3018 }
3019
3020 errors.print("");
3021 return true;
3022 }
3023
3024
3025
3026
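/**
 * Entry for a region as stored in hbase:meta, plus the server it is assigned to, the
 * modification time of the row, and any daughter regions recorded for a split parent.
 */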
3027 static class MetaEntry extends HRegionInfo {
3028 ServerName regionServer;
3029 long modTime;
3030 HRegionInfo splitA, splitB;
3031
3032 public MetaEntry(HRegionInfo rinfo, ServerName regionServer, long modTime) {
3033 this(rinfo, regionServer, modTime, null, null);
3034 }
3035
3036 public MetaEntry(HRegionInfo rinfo, ServerName regionServer, long modTime,
3037 HRegionInfo splitA, HRegionInfo splitB) {
3038 super(rinfo);
3039 this.regionServer = regionServer;
3040 this.modTime = modTime;
3041 this.splitA = splitA;
3042 this.splitB = splitB;
3043 }
3044
3045 @Override
3046 public boolean equals(Object o) {
3047 boolean superEq = super.equals(o);
3048 if (!superEq) {
3049 return superEq;
3050 }
3051 if (!(o instanceof MetaEntry)) return false;
3052 MetaEntry me = (MetaEntry) o;
3053 if (!regionServer.equals(me.regionServer)) {
3054 return false;
3055 }
3056 return (modTime == me.modTime);
3057 }
3058
3059 @Override
3060 public int hashCode() {
3061 int hash = Arrays.hashCode(getRegionName());
3062 hash ^= getRegionId();
3063 hash ^= Arrays.hashCode(getStartKey());
3064 hash ^= Arrays.hashCode(getEndKey());
3065 hash ^= Boolean.valueOf(isOffline()).hashCode();
3066 hash ^= getTable().hashCode();
3067 if (regionServer != null) {
3068 hash ^= regionServer.hashCode();
3069 }
3070 hash ^= modTime;
3071 return hash;
3072 }
3073 }
3074
3075
3076
3077
3078 static class HdfsEntry {
3079 HRegionInfo hri;
3080 Path hdfsRegionDir = null;
3081 long hdfsRegionDirModTime = 0;
3082 boolean hdfsRegioninfoFilePresent = false;
3083 boolean hdfsOnlyEdits = false;
3084 }
3085
3086
3087
3088
3089 static class OnlineEntry {
3090 HRegionInfo hri;
3091 ServerName hsa;
3092
3093 @Override
3094 public String toString() {
3095 return hsa.toString() + ";" + hri.getRegionNameAsString();
3096 }
3097 }
3098
3099
3100
3101
3102
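/**
 * Aggregates everything hbck knows about one region: its entry in hbase:meta, its state
 * in HDFS and the servers it is reported deployed on. Any of these views may be missing.
 */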
3103 public static class HbckInfo implements KeyRange {
3104 private MetaEntry metaEntry = null;
3105 private HdfsEntry hdfsEntry = null;
3106 private List<OnlineEntry> deployedEntries = Lists.newArrayList();
3107 private List<ServerName> deployedOn = Lists.newArrayList();
3108 private boolean skipChecks = false;
3109 private boolean isMerged = false;
3110
3111 HbckInfo(MetaEntry metaEntry) {
3112 this.metaEntry = metaEntry;
3113 }
3114
3115 public synchronized void addServer(HRegionInfo hri, ServerName server) {
3116 OnlineEntry rse = new OnlineEntry() ;
3117 rse.hri = hri;
3118 rse.hsa = server;
3119 this.deployedEntries.add(rse);
3120 this.deployedOn.add(server);
3121 }
3122
3123 @Override
3124 public synchronized String toString() {
3125 StringBuilder sb = new StringBuilder();
3126 sb.append("{ meta => ");
3127 sb.append((metaEntry != null)? metaEntry.getRegionNameAsString() : "null");
3128 sb.append( ", hdfs => " + getHdfsRegionDir());
3129 sb.append( ", deployed => " + Joiner.on(", ").join(deployedEntries));
3130 sb.append(" }");
3131 return sb.toString();
3132 }
3133
3134 @Override
3135 public byte[] getStartKey() {
3136 if (this.metaEntry != null) {
3137 return this.metaEntry.getStartKey();
3138 } else if (this.hdfsEntry != null) {
3139 return this.hdfsEntry.hri.getStartKey();
3140 } else {
3141 LOG.error("Entry " + this + " has no meta or hdfs region start key.");
3142 return null;
3143 }
3144 }
3145
3146 @Override
3147 public byte[] getEndKey() {
3148 if (this.metaEntry != null) {
3149 return this.metaEntry.getEndKey();
3150 } else if (this.hdfsEntry != null) {
3151 return this.hdfsEntry.hri.getEndKey();
3152 } else {
3153 LOG.error("Entry " + this + " has no meta or hdfs region end key.");
3154 return null;
3155 }
3156 }
3157
3158 public TableName getTableName() {
3159 if (this.metaEntry != null) {
3160 return this.metaEntry.getTable();
3161 } else if (this.hdfsEntry != null) {
3162
3163
3164 Path tableDir = this.hdfsEntry.hdfsRegionDir.getParent();
3165 return FSUtils.getTableName(tableDir);
3166 } else {
3167
3168
3169 return null;
3170 }
3171 }
3172
3173 public String getRegionNameAsString() {
3174 if (metaEntry != null) {
3175 return metaEntry.getRegionNameAsString();
3176 } else if (hdfsEntry != null) {
3177 if (hdfsEntry.hri != null) {
3178 return hdfsEntry.hri.getRegionNameAsString();
3179 }
3180 }
3181 return null;
3182 }
3183
3184 public byte[] getRegionName() {
3185 if (metaEntry != null) {
3186 return metaEntry.getRegionName();
3187 } else if (hdfsEntry != null) {
3188 return hdfsEntry.hri.getRegionName();
3189 } else {
3190 return null;
3191 }
3192 }
3193
3194 Path getHdfsRegionDir() {
3195 if (hdfsEntry == null) {
3196 return null;
3197 }
3198 return hdfsEntry.hdfsRegionDir;
3199 }
3200
3201 boolean containsOnlyHdfsEdits() {
3202 if (hdfsEntry == null) {
3203 return false;
3204 }
3205 return hdfsEntry.hdfsOnlyEdits;
3206 }
3207
3208 boolean isHdfsRegioninfoPresent() {
3209 if (hdfsEntry == null) {
3210 return false;
3211 }
3212 return hdfsEntry.hdfsRegioninfoFilePresent;
3213 }
3214
3215 long getModTime() {
3216 if (hdfsEntry == null) {
3217 return 0;
3218 }
3219 return hdfsEntry.hdfsRegionDirModTime;
3220 }
3221
3222 HRegionInfo getHdfsHRI() {
3223 if (hdfsEntry == null) {
3224 return null;
3225 }
3226 return hdfsEntry.hri;
3227 }
3228
3229 public void setSkipChecks(boolean skipChecks) {
3230 this.skipChecks = skipChecks;
3231 }
3232
3233 public boolean isSkipChecks() {
3234 return skipChecks;
3235 }
3236
3237 public void setMerged(boolean isMerged) {
3238 this.isMerged = isMerged;
3239 }
3240
3241 public boolean isMerged() {
3242 return this.isMerged;
3243 }
3244 }
3245
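/**
 * Orders HbckInfo entries by table name, then start key, then end key (an empty end key
 * is treated as larger than any other), and finally by the region id found in HDFS.
 */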
3246 final static Comparator<HbckInfo> cmp = new Comparator<HbckInfo>() {
3247 @Override
3248 public int compare(HbckInfo l, HbckInfo r) {
3249 if (l == r) {
3250
3251 return 0;
3252 }
3253
3254 int tableCompare = l.getTableName().compareTo(r.getTableName());
3255 if (tableCompare != 0) {
3256 return tableCompare;
3257 }
3258
3259 int startComparison = RegionSplitCalculator.BYTES_COMPARATOR.compare(
3260 l.getStartKey(), r.getStartKey());
3261 if (startComparison != 0) {
3262 return startComparison;
3263 }
3264
3265
3266 byte[] endKey = r.getEndKey();
3267 endKey = (endKey.length == 0) ? null : endKey;
3268 byte[] endKey2 = l.getEndKey();
3269 endKey2 = (endKey2.length == 0) ? null : endKey2;
3270 int endComparison = RegionSplitCalculator.BYTES_COMPARATOR.compare(
3271 endKey2, endKey);
3272
3273 if (endComparison != 0) {
3274 return endComparison;
3275 }
3276
3277
3278
3279 if (l.hdfsEntry == null && r.hdfsEntry == null) {
3280 return 0;
3281 }
3282 if (l.hdfsEntry == null && r.hdfsEntry != null) {
3283 return 1;
3284 }
3285
3286 if (r.hdfsEntry == null) {
3287 return -1;
3288 }
3289
3290 return l.hdfsEntry.hri.getRegionId() < r.hdfsEntry.hri.getRegionId() ? -1 : (l.hdfsEntry.hri.getRegionId() > r.hdfsEntry.hri.getRegionId() ? 1 : 0);
3291 }
3292 };
3293
3294
3295
3296
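/**
 * Prints a summary line per table: whether it is consistent, how many regions it has and
 * which servers it is deployed on.
 */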
3297 private void printTableSummary(SortedMap<TableName, TableInfo> tablesInfo) {
3298 StringBuilder sb = new StringBuilder();
3299 errors.print("Summary:");
3300 for (TableInfo tInfo : tablesInfo.values()) {
3301 if (errors.tableHasErrors(tInfo)) {
3302 errors.print("Table " + tInfo.getName() + " is inconsistent.");
3303 } else {
3304 errors.print(" " + tInfo.getName() + " is okay.");
3305 }
3306 errors.print(" Number of regions: " + tInfo.getNumRegions());
3307 sb.setLength(0);
3308 sb.append(" Deployed on: ");
3309 for (ServerName server : tInfo.deployedOn) {
3310 sb.append(" " + server.toString());
3311 }
3312 errors.print(sb.toString());
3313 }
3314 }
3315
3316 static ErrorReporter getErrorReporter(
3317 final Configuration conf) throws ClassNotFoundException {
3318 Class<? extends ErrorReporter> reporter = conf.getClass("hbasefsck.errorreporter", PrintingErrorReporter.class, ErrorReporter.class);
3319 return ReflectionUtils.newInstance(reporter, conf);
3320 }
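// The reporter implementation is pluggable through the "hbasefsck.errorreporter"
// configuration key. For example (com.example.MyReporter is an illustrative class name,
// not part of HBase):
//   conf.setClass("hbasefsck.errorreporter", MyReporter.class, ErrorReporter.class);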
3321
3322 public interface ErrorReporter {
3323 enum ERROR_CODE {
3324 UNKNOWN, NO_META_REGION, NULL_META_REGION, NO_VERSION_FILE, NOT_IN_META_HDFS, NOT_IN_META,
3325 NOT_IN_META_OR_DEPLOYED, NOT_IN_HDFS_OR_DEPLOYED, NOT_IN_HDFS, SERVER_DOES_NOT_MATCH_META, NOT_DEPLOYED,
3326 MULTI_DEPLOYED, SHOULD_NOT_BE_DEPLOYED, MULTI_META_REGION, RS_CONNECT_FAILURE,
3327 FIRST_REGION_STARTKEY_NOT_EMPTY, LAST_REGION_ENDKEY_NOT_EMPTY, DUPE_STARTKEYS,
3328 HOLE_IN_REGION_CHAIN, OVERLAP_IN_REGION_CHAIN, REGION_CYCLE, DEGENERATE_REGION,
3329 ORPHAN_HDFS_REGION, LINGERING_SPLIT_PARENT, NO_TABLEINFO_FILE, LINGERING_REFERENCE_HFILE,
3330 WRONG_USAGE, EMPTY_META_CELL, EXPIRED_TABLE_LOCK, BOUNDARIES_ERROR
3331 }
3332 void clear();
3333 void report(String message);
3334 void reportError(String message);
3335 void reportError(ERROR_CODE errorCode, String message);
3336 void reportError(ERROR_CODE errorCode, String message, TableInfo table);
3337 void reportError(ERROR_CODE errorCode, String message, TableInfo table, HbckInfo info);
3338 void reportError(
3339 ERROR_CODE errorCode,
3340 String message,
3341 TableInfo table,
3342 HbckInfo info1,
3343 HbckInfo info2
3344 );
3345 int summarize();
3346 void detail(String details);
3347 ArrayList<ERROR_CODE> getErrorList();
3348 void progress();
3349 void print(String message);
3350 void resetErrors();
3351 boolean tableHasErrors(TableInfo table);
3352 }
3353
3354 static class PrintingErrorReporter implements ErrorReporter {
3355 public int errorCount = 0;
3356 private int showProgress;
3357
3358 Set<TableInfo> errorTables = new HashSet<TableInfo>();
3359
3360
3361 private ArrayList<ERROR_CODE> errorList = new ArrayList<ERROR_CODE>();
3362
3363 @Override
3364 public void clear() {
3365 errorTables.clear();
3366 errorList.clear();
3367 errorCount = 0;
3368 }
3369
3370 @Override
3371 public synchronized void reportError(ERROR_CODE errorCode, String message) {
3372 if (errorCode == ERROR_CODE.WRONG_USAGE) {
3373 System.err.println(message);
3374 return;
3375 }
3376
3377 errorList.add(errorCode);
3378 if (!summary) {
3379 System.out.println("ERROR: " + message);
3380 }
3381 errorCount++;
3382 showProgress = 0;
3383 }
3384
3385 @Override
3386 public synchronized void reportError(ERROR_CODE errorCode, String message, TableInfo table) {
3387 errorTables.add(table);
3388 reportError(errorCode, message);
3389 }
3390
3391 @Override
3392 public synchronized void reportError(ERROR_CODE errorCode, String message, TableInfo table,
3393 HbckInfo info) {
3394 errorTables.add(table);
3395 String reference = "(region " + info.getRegionNameAsString() + ")";
3396 reportError(errorCode, reference + " " + message);
3397 }
3398
3399 @Override
3400 public synchronized void reportError(ERROR_CODE errorCode, String message, TableInfo table,
3401 HbckInfo info1, HbckInfo info2) {
3402 errorTables.add(table);
3403 String reference = "(regions " + info1.getRegionNameAsString()
3404 + " and " + info2.getRegionNameAsString() + ")";
3405 reportError(errorCode, reference + " " + message);
3406 }
3407
3408 @Override
3409 public synchronized void reportError(String message) {
3410 reportError(ERROR_CODE.UNKNOWN, message);
3411 }
3412
3413
3414
3415
3416
3417
3418 @Override
3419 public synchronized void report(String message) {
3420 if (! summary) {
3421 System.out.println("ERROR: " + message);
3422 }
3423 showProgress = 0;
3424 }
3425
3426 @Override
3427 public synchronized int summarize() {
3428 System.out.println(Integer.toString(errorCount) +
3429 " inconsistencies detected.");
3430 if (errorCount == 0) {
3431 System.out.println("Status: OK");
3432 return 0;
3433 } else {
3434 System.out.println("Status: INCONSISTENT");
3435 return -1;
3436 }
3437 }
3438
3439 @Override
3440 public ArrayList<ERROR_CODE> getErrorList() {
3441 return errorList;
3442 }
3443
3444 @Override
3445 public synchronized void print(String message) {
3446 if (!summary) {
3447 System.out.println(message);
3448 }
3449 }
3450
3451 @Override
3452 public boolean tableHasErrors(TableInfo table) {
3453 return errorTables.contains(table);
3454 }
3455
3456 @Override
3457 public void resetErrors() {
3458 errorCount = 0;
3459 }
3460
3461 @Override
3462 public synchronized void detail(String message) {
3463 if (details) {
3464 System.out.println(message);
3465 }
3466 showProgress = 0;
3467 }
3468
3469 @Override
3470 public synchronized void progress() {
3471 if (showProgress++ == 10) {
3472 if (!summary) {
3473 System.out.print(".");
3474 }
3475 showProgress = 0;
3476 }
3477 }
3478 }
3479
3480
3481
3482
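/**
 * Callable that contacts one region server, asks for its online regions and records each
 * of them in the hbck region map.
 */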
3483 static class WorkItemRegion implements Callable<Void> {
3484 private HBaseFsck hbck;
3485 private ServerName rsinfo;
3486 private ErrorReporter errors;
3487 private HConnection connection;
3488
3489 WorkItemRegion(HBaseFsck hbck, ServerName info,
3490 ErrorReporter errors, HConnection connection) {
3491 this.hbck = hbck;
3492 this.rsinfo = info;
3493 this.errors = errors;
3494 this.connection = connection;
3495 }
3496
3497 @Override
3498 public synchronized Void call() throws IOException {
3499 errors.progress();
3500 try {
3501 BlockingInterface server = connection.getAdmin(rsinfo);
3502
3503
3504 List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(server);
3505 regions = filterRegions(regions);
3506
3507 if (details) {
3508 errors.detail("RegionServer: " + rsinfo.getServerName() +
3509 " number of regions: " + regions.size());
3510 for (HRegionInfo rinfo: regions) {
3511 errors.detail(" " + rinfo.getRegionNameAsString() +
3512 " id: " + rinfo.getRegionId() +
3513 " encoded_name: " + rinfo.getEncodedName() +
3514 " start: " + Bytes.toStringBinary(rinfo.getStartKey()) +
3515 " end: " + Bytes.toStringBinary(rinfo.getEndKey()));
3516 }
3517 }
3518
3519
3520 for (HRegionInfo r:regions) {
3521 HbckInfo hbi = hbck.getOrCreateInfo(r.getEncodedName());
3522 hbi.addServer(r, rsinfo);
3523 }
3524 } catch (IOException e) {
3525 errors.reportError(ERROR_CODE.RS_CONNECT_FAILURE, "RegionServer: " + rsinfo.getServerName() +
3526 " Unable to fetch region information. " + e);
3527 throw e;
3528 }
3529 return null;
3530 }
3531
3532 private List<HRegionInfo> filterRegions(List<HRegionInfo> regions) {
3533 List<HRegionInfo> ret = Lists.newArrayList();
3534 for (HRegionInfo hri : regions) {
3535 if (hri.isMetaTable() || (!hbck.checkMetaOnly
3536 && hbck.isTableIncluded(hri.getTable()))) {
3537 ret.add(hri);
3538 }
3539 }
3540 return ret;
3541 }
3542 }
3543
3544
3545
3546
3547
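/**
 * Callable that examines one table directory in HDFS and records what it finds about the
 * region directories under it.
 */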
3548 static class WorkItemHdfsDir implements Callable<Void> {
3549 private HBaseFsck hbck;
3550 private FileStatus tableDir;
3551 private ErrorReporter errors;
3552 private FileSystem fs;
3553
3554 WorkItemHdfsDir(HBaseFsck hbck, FileSystem fs, ErrorReporter errors,
3555 FileStatus status) {
3556 this.hbck = hbck;
3557 this.fs = fs;
3558 this.tableDir = status;
3559 this.errors = errors;
3560 }
3561
3562 @Override
3563 public synchronized Void call() throws IOException {
3564 try {
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
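/**
 * Callable that loads the .regioninfo file of one region from HDFS; regions whose
 * .regioninfo cannot be read are reported and remembered as orphaned region dirs.
 */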
3619 static class WorkItemHdfsRegionInfo implements Callable<Void> {
3620 private HbckInfo hbi;
3621 private HBaseFsck hbck;
3622 private ErrorReporter errors;
3623
3624 WorkItemHdfsRegionInfo(HbckInfo hbi, HBaseFsck hbck, ErrorReporter errors) {
3625 this.hbi = hbi;
3626 this.hbck = hbck;
3627 this.errors = errors;
3628 }
3629
3630 @Override
3631 public synchronized Void call() throws IOException {
3632
3633 if (hbi.getHdfsHRI() == null) {
3634 try {
3635 hbck.loadHdfsRegioninfo(hbi);
3636 } catch (IOException ioe) {
3637 String msg = "Orphan region in HDFS: Unable to load .regioninfo from table "
3638 + hbi.getTableName() + " in hdfs dir "
3639 + hbi.getHdfsRegionDir()
3640 + "! The .regioninfo file may be missing or corrupt. Treating it as "
3641 + "an orphaned regiondir.";
3642 errors.reportError(ERROR_CODE.ORPHAN_HDFS_REGION, msg);
3643 try {
3644 hbck.debugLsr(hbi.getHdfsRegionDir());
3645 } catch (IOException ioe2) {
3646 LOG.error("Unable to read directory " + hbi.getHdfsRegionDir(), ioe2);
3647 throw ioe2;
3648 }
3649 hbck.orphanHdfsDirs.add(hbi);
3650 throw ioe;
3651 }
3652 }
3653 return null;
3654 }
3655 };
3656
3657
3658
3659
3660
3661 public static void setDisplayFullReport() {
3662 details = true;
3663 }
3664
3665
3666
3667
3668
3669 void setSummary() {
3670 summary = true;
3671 }
3672
3673
3674
3675
3676
3677 void setCheckMetaOnly() {
3678 checkMetaOnly = true;
3679 }
3680
3681
3682
3683
3684 void setRegionBoundariesCheck() {
3685 checkRegionBoundaries = true;
3686 }
3687
3688
3689
3690
3691
3692 public void setFixTableLocks(boolean shouldFix) {
3693 fixTableLocks = shouldFix;
3694 fixAny |= shouldFix;
3695 }
3696
3697
3698
3699
3700
3701
3702
3703 void setShouldRerun() {
3704 rerun = true;
3705 }
3706
3707 boolean shouldRerun() {
3708 return rerun;
3709 }
3710
3711
3712
3713
3714
3715 public void setFixAssignments(boolean shouldFix) {
3716 fixAssignments = shouldFix;
3717 fixAny |= shouldFix;
3718 }
3719
3720 boolean shouldFixAssignments() {
3721 return fixAssignments;
3722 }
3723
3724 public void setFixMeta(boolean shouldFix) {
3725 fixMeta = shouldFix;
3726 fixAny |= shouldFix;
3727 }
3728
3729 boolean shouldFixMeta() {
3730 return fixMeta;
3731 }
3732
3733 public void setFixEmptyMetaCells(boolean shouldFix) {
3734 fixEmptyMetaCells = shouldFix;
3735 fixAny |= shouldFix;
3736 }
3737
3738 boolean shouldFixEmptyMetaCells() {
3739 return fixEmptyMetaCells;
3740 }
3741
3742 public void setCheckHdfs(boolean checking) {
3743 checkHdfs = checking;
3744 }
3745
3746 boolean shouldCheckHdfs() {
3747 return checkHdfs;
3748 }
3749
3750 public void setFixHdfsHoles(boolean shouldFix) {
3751 fixHdfsHoles = shouldFix;
3752 fixAny |= shouldFix;
3753 }
3754
3755 boolean shouldFixHdfsHoles() {
3756 return fixHdfsHoles;
3757 }
3758
3759 public void setFixTableOrphans(boolean shouldFix) {
3760 fixTableOrphans = shouldFix;
3761 fixAny |= shouldFix;
3762 }
3763
3764 boolean shouldFixTableOrphans() {
3765 return fixTableOrphans;
3766 }
3767
3768 public void setFixHdfsOverlaps(boolean shouldFix) {
3769 fixHdfsOverlaps = shouldFix;
3770 fixAny |= shouldFix;
3771 }
3772
3773 boolean shouldFixHdfsOverlaps() {
3774 return fixHdfsOverlaps;
3775 }
3776
3777 public void setFixHdfsOrphans(boolean shouldFix) {
3778 fixHdfsOrphans = shouldFix;
3779 fixAny |= shouldFix;
3780 }
3781
3782 boolean shouldFixHdfsOrphans() {
3783 return fixHdfsOrphans;
3784 }
3785
3786 public void setFixVersionFile(boolean shouldFix) {
3787 fixVersionFile = shouldFix;
3788 fixAny |= shouldFix;
3789 }
3790
3791 public boolean shouldFixVersionFile() {
3792 return fixVersionFile;
3793 }
3794
3795 public void setSidelineBigOverlaps(boolean sbo) {
3796 this.sidelineBigOverlaps = sbo;
3797 }
3798
3799 public boolean shouldSidelineBigOverlaps() {
3800 return sidelineBigOverlaps;
3801 }
3802
3803 public void setFixSplitParents(boolean shouldFix) {
3804 fixSplitParents = shouldFix;
3805 fixAny |= shouldFix;
3806 }
3807
3808 boolean shouldFixSplitParents() {
3809 return fixSplitParents;
3810 }
3811
3812 public void setFixReferenceFiles(boolean shouldFix) {
3813 fixReferenceFiles = shouldFix;
3814 fixAny |= shouldFix;
3815 }
3816
3817 boolean shouldFixReferenceFiles() {
3818 return fixReferenceFiles;
3819 }
3820
3821 public boolean shouldIgnorePreCheckPermission() {
3822 return !fixAny || ignorePreCheckPermission;
3823 }
3824
3825 public void setIgnorePreCheckPermission(boolean ignorePreCheckPermission) {
3826 this.ignorePreCheckPermission = ignorePreCheckPermission;
3827 }
3828
3829
3830
3831
3832 public void setMaxMerge(int mm) {
3833 this.maxMerge = mm;
3834 }
3835
3836 public int getMaxMerge() {
3837 return maxMerge;
3838 }
3839
3840 public void setMaxOverlapsToSideline(int mo) {
3841 this.maxOverlapsToSideline = mo;
3842 }
3843
3844 public int getMaxOverlapsToSideline() {
3845 return maxOverlapsToSideline;
3846 }
3847
3848
3849
3850
3851
3852 boolean isTableIncluded(TableName table) {
3853 return (tablesIncluded.size() == 0) || tablesIncluded.contains(table);
3854 }
3855
3856 public void includeTable(TableName table) {
3857 tablesIncluded.add(table);
3858 }
3859
3860 Set<TableName> getIncludedTables() {
3861 return new HashSet<TableName>(tablesIncluded);
3862 }
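// When hbck is embedded programmatically, the check can be limited to specific tables,
// e.g. (variable and table names below are illustrative only, exception handling omitted):
//   HBaseFsck fsck = new HBaseFsck(conf);
//   fsck.includeTable(TableName.valueOf("usertable"));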
3863
3864
3865
3866
3867
3868
3869 public void setTimeLag(long seconds) {
3870 timelag = seconds * 1000;
3871 }
3872
3873
3874
3875
3876
3877 public void setSidelineDir(String sidelineDir) {
3878 this.sidelineDir = new Path(sidelineDir);
3879 }
3880
3881 protected HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
3882 return new HFileCorruptionChecker(getConf(), executor, sidelineCorruptHFiles);
3883 }
3884
3885 public HFileCorruptionChecker getHFilecorruptionChecker() {
3886 return hfcc;
3887 }
3888
3889 public void setHFileCorruptionChecker(HFileCorruptionChecker hfcc) {
3890 this.hfcc = hfcc;
3891 }
3892
3893 public void setRetCode(int code) {
3894 this.retcode = code;
3895 }
3896
3897 public int getRetCode() {
3898 return retcode;
3899 }
3900
3901 protected HBaseFsck printUsageAndExit() {
3902 StringWriter sw = new StringWriter(2048);
3903 PrintWriter out = new PrintWriter(sw);
3904 out.println("Usage: fsck [opts] {only tables}");
3905 out.println(" where [opts] are:");
3906 out.println(" -help Display help options (this)");
3907 out.println(" -details Display full report of all regions.");
3908 out.println(" -timelag <timeInSeconds> Process only regions that " +
3909 "have not experienced any metadata updates in the last " +
3910 "<timeInSeconds> seconds.");
3911 out.println(" -sleepBeforeRerun <timeInSeconds> Sleep this many seconds" +
3912 " before checking if the fix worked if run with -fix");
3913 out.println(" -summary Print only summary of the tables and status.");
3914 out.println(" -metaonly Only check the state of the hbase:meta table.");
3915 out.println(" -sidelineDir <hdfs://> HDFS path to backup existing meta.");
3916 out.println(" -boundaries Verify that regions boundaries are the same between META and store files.");
3917
3918 out.println("");
3919 out.println(" Metadata Repair options: (expert features, use with caution!)");
3920 out.println(" -fix Try to fix region assignments. This is for backwards compatibility");
3921 out.println(" -fixAssignments Try to fix region assignments. Replaces the old -fix");
3922 out.println(" -fixMeta Try to fix meta problems. This assumes HDFS region info is good.");
3923 out.println(" -noHdfsChecking Don't load/check region info from HDFS."
3924 + " Assumes hbase:meta region info is good. Won't check/fix any HDFS issue, e.g. hole, orphan, or overlap");
3925 out.println(" -fixHdfsHoles Try to fix region holes in hdfs.");
3926 out.println(" -fixHdfsOrphans Try to fix region dirs with no .regioninfo file in hdfs");
3927 out.println(" -fixTableOrphans Try to fix table dirs with no .tableinfo file in hdfs (online mode only)");
3928 out.println(" -fixHdfsOverlaps Try to fix region overlaps in hdfs.");
3929 out.println(" -fixVersionFile Try to fix missing hbase.version file in hdfs.");
3930 out.println(" -maxMerge <n> When fixing region overlaps, allow at most <n> regions to merge. (n=" + DEFAULT_MAX_MERGE +" by default)");
3931 out.println(" -sidelineBigOverlaps When fixing region overlaps, allow big overlaps to be sidelined");
3932 out.println(" -maxOverlapsToSideline <n> When fixing region overlaps, allow at most <n> regions to sideline per group. (n=" + DEFAULT_OVERLAPS_TO_SIDELINE +" by default)");
3933 out.println(" -fixSplitParents Try to force offline split parents to be online.");
3934 out.println(" -ignorePreCheckPermission ignore filesystem permission pre-check");
3935 out.println(" -fixReferenceFiles Try to offline lingering reference store files");
3936 out.println(" -fixEmptyMetaCells Try to fix hbase:meta entries not referencing any region"
3937 + " (empty REGIONINFO_QUALIFIER rows)");
3938
3939 out.println("");
3940 out.println(" Datafile Repair options: (expert features, use with caution!)");
3941 out.println(" -checkCorruptHFiles Check all Hfiles by opening them to make sure they are valid");
3942 out.println(" -sidelineCorruptHFiles Quarantine corrupted HFiles. implies -checkCorruptHFiles");
3943
3944 out.println("");
3945 out.println(" Metadata Repair shortcuts");
3946 out.println(" -repair Shortcut for -fixAssignments -fixMeta -fixHdfsHoles " +
3947 "-fixHdfsOrphans -fixHdfsOverlaps -fixVersionFile -sidelineBigOverlaps -fixReferenceFiles -fixTableLocks");
3948 out.println(" -repairHoles Shortcut for -fixAssignments -fixMeta -fixHdfsHoles");
3949
3950 out.println("");
3951 out.println(" Table lock options");
3952 out.println(" -fixTableLocks Deletes table locks held for a long time (hbase.table.lock.expire.ms, 10min by default)");
3953
3954 out.flush();
3955 errors.reportError(ERROR_CODE.WRONG_USAGE, sw.toString());
3956
3957 setRetCode(-2);
3958 return this;
3959 }
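// Typical invocations of this tool, matching the options printed above (shown for
// illustration only; "TableFoo" is a placeholder table name):
//   hbase hbck                        report inconsistencies without changing anything
//   hbase hbck -details TableFoo      full report restricted to one table
//   hbase hbck -repairHoles           shortcut for -fixAssignments -fixMeta -fixHdfsHoles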
3960
3961
3962
3963
3964
3965
3966
3967 public static void main(String[] args) throws Exception {
3968
3969 Configuration conf = HBaseConfiguration.create();
3970 Path hbasedir = FSUtils.getRootDir(conf);
3971 URI defaultFs = hbasedir.getFileSystem(conf).getUri();
3972 FSUtils.setFsDefault(conf, new Path(defaultFs));
3973 int ret = ToolRunner.run(new HBaseFsckTool(conf), args);
3974 System.exit(ret);
3975 }
3976
3977 /**
3978 * Tool wrapper that picks up -Dxxx=yyy configuration settings from the command line via ToolRunner.
3979 */
3980 static class HBaseFsckTool extends Configured implements Tool {
3981 HBaseFsckTool(Configuration conf) { super(conf); }
3982 @Override
3983 public int run(String[] args) throws Exception {
3984 HBaseFsck hbck = new HBaseFsck(getConf());
3985 hbck.exec(hbck.executor, args);
3986 return hbck.getRetCode();
3987 }
3988 };
3989
3990 /** Parses the command-line arguments, runs the online check (plus any requested fixes), and re-checks if needed. */
3991 public HBaseFsck exec(ExecutorService exec, String[] args) throws KeeperException, IOException,
3992 ServiceException, InterruptedException {
3993 long sleepBeforeRerun = DEFAULT_SLEEP_BEFORE_RERUN;
3994
3995 boolean checkCorruptHFiles = false;
3996 boolean sidelineCorruptHFiles = false;
3997
3998 // process command-line options
3999 for (int i = 0; i < args.length; i++) {
4000 String cmd = args[i];
4001 if (cmd.equals("-help") || cmd.equals("-h")) {
4002 return printUsageAndExit();
4003 } else if (cmd.equals("-details")) {
4004 setDisplayFullReport();
4005 } else if (cmd.equals("-timelag")) {
4006 if (i == args.length - 1) {
4007 errors.reportError(ERROR_CODE.WRONG_USAGE, "HBaseFsck: -timelag needs a value.");
4008 return printUsageAndExit();
4009 }
4010 try {
4011 long timelag = Long.parseLong(args[i+1]);
4012 setTimeLag(timelag);
4013 } catch (NumberFormatException e) {
4014 errors.reportError(ERROR_CODE.WRONG_USAGE, "-timelag needs a numeric value.");
4015 return printUsageAndExit();
4016 }
4017 i++;
4018 } else if (cmd.equals("-sleepBeforeRerun")) {
4019 if (i == args.length - 1) {
4020 errors.reportError(ERROR_CODE.WRONG_USAGE,
4021 "HBaseFsck: -sleepBeforeRerun needs a value.");
4022 return printUsageAndExit();
4023 }
4024 try {
4025 sleepBeforeRerun = Long.parseLong(args[i+1]);
4026 } catch (NumberFormatException e) {
4027 errors.reportError(ERROR_CODE.WRONG_USAGE, "-sleepBeforeRerun needs a numeric value.");
4028 return printUsageAndExit();
4029 }
4030 i++;
4031 } else if (cmd.equals("-sidelineDir")) {
4032 if (i == args.length - 1) {
4033 errors.reportError(ERROR_CODE.WRONG_USAGE, "HBaseFsck: -sidelineDir needs a value.");
4034 return printUsageAndExit();
4035 }
4036 i++;
4037 setSidelineDir(args[i]);
4038 } else if (cmd.equals("-fix")) {
4039 errors.reportError(ERROR_CODE.WRONG_USAGE,
4040 "This option is deprecated, please use -fixAssignments instead.");
4041 setFixAssignments(true);
4042 } else if (cmd.equals("-fixAssignments")) {
4043 setFixAssignments(true);
4044 } else if (cmd.equals("-fixMeta")) {
4045 setFixMeta(true);
4046 } else if (cmd.equals("-noHdfsChecking")) {
4047 setCheckHdfs(false);
4048 } else if (cmd.equals("-fixHdfsHoles")) {
4049 setFixHdfsHoles(true);
4050 } else if (cmd.equals("-fixHdfsOrphans")) {
4051 setFixHdfsOrphans(true);
4052 } else if (cmd.equals("-fixTableOrphans")) {
4053 setFixTableOrphans(true);
4054 } else if (cmd.equals("-fixHdfsOverlaps")) {
4055 setFixHdfsOverlaps(true);
4056 } else if (cmd.equals("-fixVersionFile")) {
4057 setFixVersionFile(true);
4058 } else if (cmd.equals("-sidelineBigOverlaps")) {
4059 setSidelineBigOverlaps(true);
4060 } else if (cmd.equals("-fixSplitParents")) {
4061 setFixSplitParents(true);
4062 } else if (cmd.equals("-ignorePreCheckPermission")) {
4063 setIgnorePreCheckPermission(true);
4064 } else if (cmd.equals("-checkCorruptHFiles")) {
4065 checkCorruptHFiles = true;
4066 } else if (cmd.equals("-sidelineCorruptHFiles")) {
4067 sidelineCorruptHFiles = true;
4068 } else if (cmd.equals("-fixReferenceFiles")) {
4069 setFixReferenceFiles(true);
4070 } else if (cmd.equals("-fixEmptyMetaCells")) {
4071 setFixEmptyMetaCells(true);
4072 } else if (cmd.equals("-repair")) {
4073 // turn on the full set of generic repair options; note that -fixSplitParents
4074 // stays disabled and must be requested explicitly
4075 setFixHdfsHoles(true);
4076 setFixHdfsOrphans(true);
4077 setFixMeta(true);
4078 setFixAssignments(true);
4079 setFixHdfsOverlaps(true);
4080 setFixVersionFile(true);
4081 setSidelineBigOverlaps(true);
4082 setFixSplitParents(false);
4083 setCheckHdfs(true);
4084 setFixReferenceFiles(true);
4085 setFixTableLocks(true);
4086 } else if (cmd.equals("-repairHoles")) {
4087 // fix region holes only: repair meta/assignments and hdfs holes, but leave orphans and overlaps alone
4088 setFixHdfsHoles(true);
4089 setFixHdfsOrphans(false);
4090 setFixMeta(true);
4091 setFixAssignments(true);
4092 setFixHdfsOverlaps(false);
4093 setSidelineBigOverlaps(false);
4094 setFixSplitParents(false);
4095 setCheckHdfs(true);
4096 } else if (cmd.equals("-maxOverlapsToSideline")) {
4097 if (i == args.length - 1) {
4098 errors.reportError(ERROR_CODE.WRONG_USAGE,
4099 "-maxOverlapsToSideline needs a numeric value argument.");
4100 return printUsageAndExit();
4101 }
4102 try {
4103 int maxOverlapsToSideline = Integer.parseInt(args[i+1]);
4104 setMaxOverlapsToSideline(maxOverlapsToSideline);
4105 } catch (NumberFormatException e) {
4106 errors.reportError(ERROR_CODE.WRONG_USAGE,
4107 "-maxOverlapsToSideline needs a numeric value argument.");
4108 return printUsageAndExit();
4109 }
4110 i++;
4111 } else if (cmd.equals("-maxMerge")) {
4112 if (i == args.length - 1) {
4113 errors.reportError(ERROR_CODE.WRONG_USAGE,
4114 "-maxMerge needs a numeric value argument.");
4115 return printUsageAndExit();
4116 }
4117 try {
4118 int maxMerge = Integer.parseInt(args[i+1]);
4119 setMaxMerge(maxMerge);
4120 } catch (NumberFormatException e) {
4121 errors.reportError(ERROR_CODE.WRONG_USAGE,
4122 "-maxMerge needs a numeric value argument.");
4123 return printUsageAndExit();
4124 }
4125 i++;
4126 } else if (cmd.equals("-summary")) {
4127 setSummary();
4128 } else if (cmd.equals("-metaonly")) {
4129 setCheckMetaOnly();
4130 } else if (cmd.equals("-boundaries")) {
4131 setRegionBoundariesCheck();
4132 } else if (cmd.equals("-fixTableLocks")) {
4133 setFixTableLocks(true);
4134 } else if (cmd.startsWith("-")) {
4135 errors.reportError(ERROR_CODE.WRONG_USAGE, "Unrecognized option: " + cmd);
4136 return printUsageAndExit();
4137 } else {
4138 includeTable(TableName.valueOf(cmd));
4139 errors.print("Allow checking/fixes for table: " + cmd);
4140 }
4141 }
4142
4143 errors.print("HBaseFsck command line options: " + StringUtils.join(args, " "));
4144
4145 // pre-check filesystem permissions for the current user
4146 try {
4147 preCheckPermission();
4148 } catch (AccessDeniedException ace) {
4149 Runtime.getRuntime().exit(-1);
4150 } catch (IOException ioe) {
4151 Runtime.getRuntime().exit(-1);
4152 }
4153
4154 // do the real work of hbck
4155 connect();
4156
4157 try {
4158 // if corrupt-HFile checking or sidelining was requested, do that first
4159 if (checkCorruptHFiles || sidelineCorruptHFiles) {
4160 LOG.info("Checking all hfiles for corruption");
4161 HFileCorruptionChecker hfcc = createHFileCorruptionChecker(sidelineCorruptHFiles);
4162 setHFileCorruptionChecker(hfcc);
4163 Collection<TableName> tables = getIncludedTables();
4164 Collection<Path> tableDirs = new ArrayList<Path>();
4165 Path rootdir = FSUtils.getRootDir(getConf());
4166 if (tables.size() > 0) {
4167 for (TableName t : tables) {
4168 tableDirs.add(FSUtils.getTableDir(rootdir, t));
4169 }
4170 } else {
4171 tableDirs = FSUtils.getTableDirs(FSUtils.getCurrentFileSystem(getConf()), rootdir);
4172 }
4173 hfcc.checkTables(tableDirs);
4174 hfcc.report(errors);
4175 }
4176
4177 // check (and optionally fix) table integrity and region consistency
4178 int code = onlineHbck();
4179 setRetCode(code);
4180
4181
4182 // If fixes were applied, wait a bit (hbase:meta may still be catching up) and then
4183 // re-run the check to verify that the cluster is now consistent.
4184 if (shouldRerun()) {
4185 try {
4186 LOG.info("Sleeping " + sleepBeforeRerun + "ms before re-checking after fix...");
4187 Thread.sleep(sleepBeforeRerun);
4188 } catch (InterruptedException ie) {
4189 return this;
4190 }
4191 // the second pass only reports: disable all fix options
4192 setFixAssignments(false);
4193 setFixMeta(false);
4194 setFixHdfsHoles(false);
4195 setFixHdfsOverlaps(false);
4196 setFixVersionFile(false);
4197 setFixTableOrphans(false);
4198 errors.resetErrors();
4199 code = onlineHbck();
4200 setRetCode(code);
4201 }
4202 } finally {
4203 IOUtils.cleanup(null, connection, meta, admin);
4204 }
4205 return this;
4206 }
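  // Sketch (illustrative, not called anywhere in this class): running the checker programmatically
  // instead of via main()/ToolRunner, assuming a properly configured Configuration instance.
  //
  //   HBaseFsck fsck = new HBaseFsck(HBaseConfiguration.create());
  //   fsck.exec(fsck.executor, new String[] { "-details" });  // report-only pass
  //   int rc = fsck.getRetCode();                             // non-zero indicates problems or bad usage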
4207
4208 /**
4209 * ls -r of the given path, printed through this instance's error reporter (debugging aid).
4210 */
4211 void debugLsr(Path p) throws IOException {
4212 debugLsr(getConf(), p, errors);
4213 }
4214
4215 /**
4216 * ls -r of the given path using a fresh {@link PrintingErrorReporter} (debugging aid).
4217 */
4218 public static void debugLsr(Configuration conf,
4219 Path p) throws IOException {
4220 debugLsr(conf, p, new PrintingErrorReporter());
4221 }
4222
4223 /**
4224 * Recursively prints the given path to the supplied error reporter; a no-op unless DEBUG logging is enabled.
4225 */
4226 public static void debugLsr(Configuration conf,
4227 Path p, ErrorReporter errors) throws IOException {
4228 if (!LOG.isDebugEnabled() || p == null) {
4229 return;
4230 }
4231 FileSystem fs = p.getFileSystem(conf);
4232
4233 if (!fs.exists(p)) {
4234 // path does not exist; nothing to list
4235 return;
4236 }
4237 errors.print(p.toString());
4238
4239 if (fs.isFile(p)) {
4240 return;
4241 }
4242
4243 if (fs.getFileStatus(p).isDir()) {
4244 FileStatus[] fss = fs.listStatus(p);
4245 for (FileStatus status : fss) {
4246 debugLsr(conf, status.getPath(), errors);
4247 }
4248 }
4249 }
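  // Example (illustrative; the path below is hypothetical): recursively log a table directory's
  // layout when DEBUG logging is enabled for this class.
  //
  //   HBaseFsck.debugLsr(conf, new Path("/hbase/data/default/SomeTable"));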
4250 }