package org.apache.hadoop.hbase.util;

import java.io.IOException;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.LinkedBlockingQueue;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.ClusterStatus;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.HServerAddress;
import org.apache.hadoop.hbase.HServerInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HConnection;
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.client.MetaScanner;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitor;
import org.apache.hadoop.hbase.ipc.HRegionInterface;
import org.apache.hadoop.hbase.regionserver.wal.HLog;
import org.apache.hadoop.hbase.zookeeper.ZKTable;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.zookeeper.KeeperException;

import com.google.common.base.Joiner;
import com.google.common.collect.Lists;

import static org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;

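/**
 * Check consistency among the in-memory states of the master and the
 * region server(s) and the state of data in HDFS.  Inconsistencies are
 * reported through an {@link ErrorReporter}; when run with -fix, a subset
 * of assignment problems is repaired and the check is re-run once.
 *
 * A rough usage sketch (the exact launcher script depends on the install;
 * the hbase wrapper shown here is the usual one):
 *
 * <pre>
 *   $ bin/hbase hbck -details        # full report of all regions
 *   $ bin/hbase hbck -fix            # attempt to repair assignment errors
 * </pre>
 */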
public class HBaseFsck {
  public static final long DEFAULT_TIME_LAG = 60000; // default time lag of one minute
  public static final long DEFAULT_SLEEP_BEFORE_RERUN = 10000;

  private static final int MAX_NUM_THREADS = 50; // upper bound on worker threads
  private static final long THREADS_KEEP_ALIVE_SECONDS = 60;

  private static final Log LOG = LogFactory.getLog(HBaseFsck.class.getName());
  private Configuration conf;

  private ClusterStatus status;
  private HConnection connection;

  // State gathered from the catalog tables, HDFS and the region servers.
  private TreeMap<String, HbckInfo> regionInfo = new TreeMap<String, HbckInfo>();
  private TreeMap<String, TInfo> tablesInfo = new TreeMap<String, TInfo>();
  private TreeSet<byte[]> disabledTables =
    new TreeSet<byte[]>(Bytes.BYTES_COMPARATOR);
  ErrorReporter errors = new PrintingErrorReporter();

  // Command-line options.
  private static boolean details = false; // display the full region report?
  private long timelag = DEFAULT_TIME_LAG; // region modification time lag, in milliseconds
  private boolean fix = false; // attempt to fix errors found?
  private boolean rerun = false; // rerun hbck after a fix was attempted?
  private static boolean summary = false; // print only the per-table summary?

  // Rows in .META. whose REGIONINFO_QUALIFIER is empty.
  private Set<Result> emptyRegionInfoQualifiers = new HashSet<Result>();
  private int numThreads = MAX_NUM_THREADS;

  ThreadPoolExecutor executor; // threads used to contact region servers and HDFS

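  /**
   * Constructor
   *
   * @param conf Configuration object
   * @throws MasterNotRunningException if the master is not running
   * @throws ZooKeeperConnectionException if unable to connect to ZooKeeper
   */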
  public HBaseFsck(Configuration conf)
    throws MasterNotRunningException, ZooKeeperConnectionException, IOException {
    this.conf = conf;

    HBaseAdmin admin = new HBaseAdmin(conf);
    status = admin.getMaster().getClusterStatus();
    connection = admin.getConnection();

    numThreads = conf.getInt("hbasefsck.numthreads", numThreads);
    executor = new ThreadPoolExecutor(0, numThreads,
        THREADS_KEEP_ALIVE_SECONDS, TimeUnit.SECONDS,
        new LinkedBlockingQueue<Runnable>());
  }

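  /**
   * Contacts the master, scans the catalog tables, the region servers and
   * HDFS, and checks the collected information for inconsistencies.
   *
   * @return 0 if no inconsistencies were found, -1 otherwise
   */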
  int doWork() throws IOException, KeeperException, InterruptedException {
    // print hbase server version
    errors.print("Version: " + status.getHBaseVersion());

    // make sure all accumulated state is empty before starting
    regionInfo.clear();
    tablesInfo.clear();
    emptyRegionInfoQualifiers.clear();
    disabledTables.clear();
    errors.clear();

    // get a list of all regions from the master; this involves
    // scanning the catalog tables
    if (!recordRootRegion()) {
      errors.reportError("Encountered fatal error. Exiting...");
      return -1;
    }
    getMetaEntries();

    // check that .META. is found exactly once and in the right place
    if (!checkMetaEntries()) {
      errors.reportError("Encountered fatal error. Exiting...");
      return -1;
    }

    // get a list of all tables that have not changed recently
    AtomicInteger numSkipped = new AtomicInteger(0);
    HTableDescriptor[] allTables = getTables(numSkipped);
    errors.print("Number of Tables: " + allTables.length);
    if (details) {
      if (numSkipped.get() > 0) {
        errors.detail("Number of Tables in flux: " + numSkipped.get());
      }
      for (HTableDescriptor td : allTables) {
        String tableName = td.getNameAsString();
        errors.detail(" Table: " + tableName + "\t" +
            (td.isReadOnly() ? "ro" : "rw") + "\t" +
            (td.isRootRegion() ? "ROOT" :
             (td.isMetaRegion() ? "META" : " ")) + "\t" +
            " families: " + td.getFamilies().size());
      }
    }

    // from the master, get a list of all known live region servers
    Collection<HServerInfo> regionServers = status.getServerInfo();
    errors.print("Number of live region servers: " +
        regionServers.size());
    if (details) {
      for (HServerInfo rsinfo: regionServers) {
        errors.print(" " + rsinfo.getServerName());
      }
    }

    // from the master, get a list of all dead region servers
    Collection<String> deadRegionServers = status.getDeadServerNames();
    errors.print("Number of dead region servers: " +
        deadRegionServers.size());
    if (details) {
      for (String name: deadRegionServers) {
        errors.print(" " + name);
      }
    }

    // determine what is deployed on each live region server
    processRegionServers(regionServers);

    // determine what is present on HDFS
    checkHdfs();

    // report rows in .META. that have an empty REGIONINFO_QUALIFIER
    errors.print("Number of empty REGIONINFO_QUALIFIER rows in .META.: " +
        emptyRegionInfoQualifiers.size());
    if (details) {
      for (Result r: emptyRegionInfoQualifiers) {
        errors.print(" " + r);
      }
    }

    // get the set of disabled tables from ZooKeeper
    loadDisabledTables();

    // check consistency of each region individually
    checkConsistency();

    // check the integrity of each table's region chain
    checkIntegrity();

    // print table summary
    printTableSummary();

    return errors.summarize();
  }

  public ErrorReporter getErrors() {
    return errors;
  }

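  /**
   * Load the list of disabled or disabling tables from ZooKeeper into the
   * local disabledTables set.
   */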
  private void loadDisabledTables()
  throws ZooKeeperConnectionException, IOException, KeeperException {
    ZooKeeperWatcher zkw =
      HConnectionManager.getConnection(conf).getZooKeeperWatcher();
    for (String tableName : ZKTable.getDisabledOrDisablingTables(zkw)) {
      disabledTables.add(Bytes.toBytes(tableName));
    }
  }

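  /**
   * Check if the specified region belongs to a disabled table.
   */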
  private boolean isTableDisabled(HRegionInfo regionInfo) {
    return disabledTables.contains(regionInfo.getTableDesc().getName());
  }

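  /**
   * Scan HDFS for all regions, recording their information into
   * regionInfo.
   */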
  void checkHdfs() throws IOException, InterruptedException {
    Path rootDir = new Path(conf.get(HConstants.HBASE_DIR));
    FileSystem fs = rootDir.getFileSystem(conf);

    // list all table directories under the hbase root directory
    List<FileStatus> tableDirs = Lists.newArrayList();

    boolean foundVersionFile = false;
    FileStatus[] files = fs.listStatus(rootDir);
    for (FileStatus file : files) {
      if (file.getPath().getName().equals(HConstants.VERSION_FILE_NAME)) {
        foundVersionFile = true;
      } else {
        tableDirs.add(file);
      }
    }

    // verify that the version file exists
    if (!foundVersionFile) {
      errors.reportError(ERROR_CODE.NO_VERSION_FILE,
          "Version file does not exist in root dir " + rootDir);
    }

    // scan each table directory in parallel, one work item per directory
    WorkItemHdfsDir[] dirs = new WorkItemHdfsDir[tableDirs.size()];
    int num = 0;
    for (FileStatus tableDir : tableDirs) {
      dirs[num] = new WorkItemHdfsDir(this, fs, errors, tableDir);
      executor.execute(dirs[num]);
      num++;
    }

    // wait for all directory scans to finish
    for (int i = 0; i < num; i++) {
      synchronized (dirs[i]) {
        while (!dirs[i].isDone()) {
          dirs[i].wait();
        }
      }
    }
  }

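  /**
   * Record the location of the -ROOT- region as if it were a region listed
   * in a catalog table, so that its deployment can be checked like any
   * other region.
   *
   * @return false if -ROOT- or any of its attributes could not be located
   */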
  boolean recordRootRegion() throws IOException {
    HRegionLocation rootLocation = connection.locateRegion(
      HConstants.ROOT_TABLE_NAME, HConstants.EMPTY_START_ROW);

    if (rootLocation == null || rootLocation.getRegionInfo() == null ||
        rootLocation.getServerAddress() == null) {
      errors.reportError(ERROR_CODE.NULL_ROOT_REGION,
        "Root Region or some of its attributes are null.");
      return false;
    }

    MetaEntry m = new MetaEntry(rootLocation.getRegionInfo(),
      rootLocation.getServerAddress(), null, System.currentTimeMillis());
    HbckInfo hbInfo = new HbckInfo(m);
    regionInfo.put(rootLocation.getRegionInfo().getEncodedName(), hbInfo);
    return true;
  }

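  /**
   * Contacts each region server and fetches metadata about the regions it
   * is currently serving.
   *
   * @param regionServerList the list of region servers to contact
   */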
  void processRegionServers(Collection<HServerInfo> regionServerList)
    throws IOException, InterruptedException {

    WorkItemRegion[] work = new WorkItemRegion[regionServerList.size()];
    int num = 0;

    // contact each region server in parallel
    for (HServerInfo rsinfo:regionServerList) {
      work[num] = new WorkItemRegion(this, rsinfo, errors, connection);
      executor.execute(work[num]);
      num++;
    }

    // wait for all submitted work items to finish
    for (int i = 0; i < num; i++) {
      synchronized (work[i]) {
        while (!work[i].isDone()) {
          work[i].wait();
        }
      }
    }
  }

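  /**
   * Check consistency of all regions that have been found in previous
   * phases.
   */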
  void checkConsistency()
  throws IOException, KeeperException, InterruptedException {
    for (java.util.Map.Entry<String, HbckInfo> e: regionInfo.entrySet()) {
      doConsistencyCheck(e.getKey(), e.getValue());
    }
  }

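  /**
   * Check a single region for consistency between .META., HDFS and the
   * region servers it is reported to be deployed on, reporting (and
   * optionally fixing) any discrepancy.
   */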
  void doConsistencyCheck(final String key, final HbckInfo hbi)
  throws IOException, KeeperException, InterruptedException {
    String descriptiveName = hbi.toString();

    boolean inMeta = hbi.metaEntry != null;
    boolean inHdfs = hbi.foundRegionDir != null;
    boolean hasMetaAssignment = inMeta && hbi.metaEntry.regionServer != null;
    boolean isDeployed = !hbi.deployedOn.isEmpty();
    boolean isMultiplyDeployed = hbi.deployedOn.size() > 1;
    boolean deploymentMatchesMeta =
      hasMetaAssignment && isDeployed && !isMultiplyDeployed &&
      hbi.metaEntry.regionServer.equals(hbi.deployedOn.get(0));
    boolean splitParent =
      (hbi.metaEntry == null)? false: hbi.metaEntry.isSplit() && hbi.metaEntry.isOffline();
    boolean shouldBeDeployed = inMeta && !isTableDisabled(hbi.metaEntry);
    boolean recentlyModified = hbi.foundRegionDir != null &&
      hbi.foundRegionDir.getModificationTime() + timelag > System.currentTimeMillis();

    // ========== First the healthy cases ==========
    if (hbi.onlyEdits) {
      return;
    }
    if (inMeta && inHdfs && isDeployed && deploymentMatchesMeta && shouldBeDeployed) {
      return;
    } else if (inMeta && !isDeployed && splitParent) {
      // offline parent of a split: not an error
      return;
    } else if (inMeta && !shouldBeDeployed && !isDeployed) {
      // region of a disabled table that is correctly not deployed
      return;
    } else if (recentlyModified) {
      LOG.warn("Region " + descriptiveName + " was recently modified -- skipping");
      return;
    }
    // ========== Cases where the region is not in META ==========
    else if (!inMeta && !inHdfs && !isDeployed) {
      // we should not have a record of this region at all
      assert false : "Entry for region with no data";
    } else if (!inMeta && !inHdfs && isDeployed) {
      errors.reportError(ERROR_CODE.NOT_IN_META_HDFS, "Region "
        + descriptiveName + ", key=" + key + ", not on HDFS or in META but " +
        "deployed on " + Joiner.on(", ").join(hbi.deployedOn));
    } else if (!inMeta && inHdfs && !isDeployed) {
      errors.reportError(ERROR_CODE.NOT_IN_META_OR_DEPLOYED, "Region "
        + descriptiveName + " on HDFS, but not listed in META " +
        "or deployed on any region server");
    } else if (!inMeta && inHdfs && isDeployed) {
      errors.reportError(ERROR_CODE.NOT_IN_META, "Region " + descriptiveName
        + " not in META, but deployed on " + Joiner.on(", ").join(hbi.deployedOn));

    // ========== Cases where the region is in META ==========
    } else if (inMeta && !inHdfs && !isDeployed) {
      errors.reportError(ERROR_CODE.NOT_IN_HDFS_OR_DEPLOYED, "Region "
        + descriptiveName + " found in META, but not in HDFS "
        + "or deployed on any region server.");
    } else if (inMeta && !inHdfs && isDeployed) {
      errors.reportError(ERROR_CODE.NOT_IN_HDFS, "Region " + descriptiveName
        + " found in META, but not in HDFS, " +
        "and deployed on " + Joiner.on(", ").join(hbi.deployedOn));
    } else if (inMeta && inHdfs && !isDeployed && shouldBeDeployed) {
      errors.reportError(ERROR_CODE.NOT_DEPLOYED, "Region " + descriptiveName
        + " not deployed on any region server.");
      // if we are trying to fix the errors
      if (shouldFix()) {
        errors.print("Trying to fix unassigned region...");
        setShouldRerun();
        HBaseFsckRepair.fixUnassigned(this.conf, hbi.metaEntry);
      }
    } else if (inMeta && inHdfs && isDeployed && !shouldBeDeployed) {
      errors.reportError(ERROR_CODE.SHOULD_NOT_BE_DEPLOYED, "Region "
        + descriptiveName + " should not be deployed according " +
        "to META, but is deployed on " + Joiner.on(", ").join(hbi.deployedOn));
    } else if (inMeta && inHdfs && isMultiplyDeployed) {
      errors.reportError(ERROR_CODE.MULTI_DEPLOYED, "Region " + descriptiveName
        + " is listed in META on region server " + hbi.metaEntry.regionServer
        + " but is multiply assigned to region servers " +
        Joiner.on(", ").join(hbi.deployedOn));
      // if we are trying to fix the errors
      if (shouldFix()) {
        errors.print("Trying to fix assignment error...");
        setShouldRerun();
        HBaseFsckRepair.fixDupeAssignment(this.conf, hbi.metaEntry, hbi.deployedOn);
      }
    } else if (inMeta && inHdfs && isDeployed && !deploymentMatchesMeta) {
      errors.reportError(ERROR_CODE.SERVER_DOES_NOT_MATCH_META, "Region "
        + descriptiveName + " listed in META on region server " +
        hbi.metaEntry.regionServer + " but found on region server " +
        hbi.deployedOn.get(0));
      // if we are trying to fix the errors
      if (shouldFix()) {
        errors.print("Trying to fix assignment error...");
        setShouldRerun();
        HBaseFsckRepair.fixDupeAssignment(this.conf, hbi.metaEntry, hbi.deployedOn);
      }
    } else {
      errors.reportError(ERROR_CODE.UNKNOWN, "Region " + descriptiveName +
        " is in an unforeseen state:" +
        " inMeta=" + inMeta +
        " inHdfs=" + inHdfs +
        " isDeployed=" + isDeployed +
        " isMultiplyDeployed=" + isMultiplyDeployed +
        " deploymentMatchesMeta=" + deploymentMatchesMeta +
        " shouldBeDeployed=" + shouldBeDeployed);
    }
  }

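  /**
   * Checks tables integrity. Goes over all regions and scans the tables.
   * Collects all the pieces for each table and checks if there are missing,
   * repeated or overlapped regions.
   */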
  void checkIntegrity() {
    for (HbckInfo hbi : regionInfo.values()) {
      // skip regions that are not recorded in META, have no server
      // assignment in META, or contain only recovered edits
      if (hbi.metaEntry == null) continue;
      if (hbi.metaEntry.regionServer == null) continue;
      if (hbi.onlyEdits) continue;

      // skip regions that are not deployed anywhere; missing deployment
      // is reported by the per-region consistency check
      if (hbi.deployedOn.size() == 0) continue;

      // group the region under its table
      String tableName = hbi.metaEntry.getTableDesc().getNameAsString();
      TInfo modTInfo = tablesInfo.get(tableName);
      if (modTInfo == null) {
        modTInfo = new TInfo(tableName);
      }
      for (HServerAddress server : hbi.deployedOn) {
        modTInfo.addServer(server);
      }
      modTInfo.addRegionInfo(hbi);

      tablesInfo.put(tableName, modTInfo);
    }

    // verify that each table's region chain is complete and non-overlapping
    for (TInfo tInfo : tablesInfo.values()) {
      if (!tInfo.checkRegionChain()) {
        errors.report("Found inconsistency in table " + tInfo.getName());
      }
    }
  }

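  /**
   * Maintains information about a single table: the regions that belong to
   * it and the servers those regions are deployed on.
   */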
  private class TInfo {
    String tableName;
    TreeSet <HServerAddress> deployedOn;

    List<HbckInfo> regions = new ArrayList<HbckInfo>();

    TInfo(String name) {
      this.tableName = name;
      deployedOn = new TreeSet <HServerAddress>();
    }

    public void addRegionInfo (HbckInfo r) {
      regions.add(r);
    }

    public void addServer(HServerAddress server) {
      this.deployedOn.add(server);
    }

    public String getName() {
      return tableName;
    }

    public int getNumRegions() {
      return regions.size();
    }

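    /**
     * Check the region chain (from META) of this table. We are looking for
     * holes, overlaps and cycles.
     *
     * @return false if any error was reported for this table
     */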
    public boolean checkRegionChain() {
      Collections.sort(regions);
      HbckInfo last = null;
      int originalErrorsCount = errors.getErrorList().size();

      for (HbckInfo r : regions) {
        if (last == null) {
          // first region of the table: its start key must be empty
          if (! Bytes.equals(r.metaEntry.getStartKey(), HConstants.EMPTY_BYTE_ARRAY)) {
            errors.reportError(ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY,
                "First region should start with an empty key.",
                this, r);
          }
        } else {
          // only the last region may have an empty end key; any other region
          // whose end key sorts before its start key folds the chain back on
          // itself
          if (! Bytes.equals(r.metaEntry.getEndKey(), HConstants.EMPTY_BYTE_ARRAY)) {
            int cmpRegionKeys = Bytes.compareTo(r.metaEntry.getStartKey(),
                r.metaEntry.getEndKey());
            if (cmpRegionKeys > 0) {
              errors.reportError(ERROR_CODE.REGION_CYCLE,
                  String.format("The endkey for this region comes before the "
                      + "startkey, startkey=%s, endkey=%s",
                      Bytes.toStringBinary(r.metaEntry.getStartKey()),
                      Bytes.toStringBinary(r.metaEntry.getEndKey())),
                  this, r, last);
            }
          }

          // two regions with the same start key are duplicates
          if (Bytes.equals(r.metaEntry.getStartKey(), last.metaEntry.getStartKey())) {
            errors.reportError(ERROR_CODE.DUPE_STARTKEYS,
                "Two regions have the same startkey: "
                + Bytes.toStringBinary(r.metaEntry.getStartKey()),
                this, r, last);
          } else {
            // this region's start key must equal the previous region's end key
            int cmp = Bytes.compareTo(r.metaEntry.getStartKey(),
                last.metaEntry.getEndKey());
            if (cmp > 0) {
              // gap between the previous end key and this start key
              errors.reportError(ERROR_CODE.HOLE_IN_REGION_CHAIN,
                  "There is a hole in the region chain.",
                  this, r, last);
            } else if (cmp < 0) {
              // this region starts before the previous region ends
              errors.reportError(ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
                  "There is an overlap in the region chain.",
                  this, r, last);
            }
          }
        }

        last = r;
      }

      return errors.getErrorList().size() == originalErrorsCount;
    }
  }

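  /**
   * Return the descriptors of user tables whose catalog entries have not
   * been modified within the configured timelag; tables modified more
   * recently are skipped and counted in numSkipped.
   *
   * @param numSkipped incremented once per table skipped as "in flux"
   * @return descriptors of tables that have not been modified recently
   */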
  HTableDescriptor[] getTables(AtomicInteger numSkipped) {
    TreeSet<HTableDescriptor> uniqueTables = new TreeSet<HTableDescriptor>();
    long now = System.currentTimeMillis();

    for (HbckInfo hbi : regionInfo.values()) {
      MetaEntry info = hbi.metaEntry;

      // if the start key is empty, this is the first region of a table;
      // pick only those tables that were not modified in the last timelag ms
      if (info != null && info.getStartKey().length == 0 && !info.isMetaRegion()) {
        if (info.modTime + timelag < now) {
          uniqueTables.add(info.getTableDesc());
        } else {
          numSkipped.incrementAndGet();
        }
      }
    }
    return uniqueTables.toArray(new HTableDescriptor[uniqueTables.size()]);
  }

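  /**
   * Gets the entry in regionInfo corresponding to the given encoded region
   * name. If the region has not been seen yet, a new entry is added and
   * returned.
   */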
  private synchronized HbckInfo getOrCreateInfo(String name) {
    HbckInfo hbi = regionInfo.get(name);
    if (hbi == null) {
      hbi = new HbckInfo(null);
      regionInfo.put(name, hbi);
    }
    return hbi;
  }

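  /**
   * Check whether .META. is deployed on exactly one region. If it is
   * missing or multiply assigned, report the problem and, when -fix is
   * enabled, try to reassign it.
   *
   * @return true if .META. looks sane, false if a fatal problem was found
   */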
  boolean checkMetaEntries()
  throws IOException, KeeperException, InterruptedException {
    List <HbckInfo> metaRegions = Lists.newArrayList();
    for (HbckInfo value : regionInfo.values()) {
      if (value.metaEntry.isMetaTable()) {
        metaRegions.add(value);
      }
    }

    // there should be exactly one region holding .META.
    if (metaRegions.size() != 1) {
      HRegionLocation rootLocation = connection.locateRegion(
        HConstants.ROOT_TABLE_NAME, HConstants.EMPTY_START_ROW);
      HbckInfo root =
        regionInfo.get(rootLocation.getRegionInfo().getEncodedName());

      // no region holds .META.
      if (metaRegions.size() == 0) {
        errors.reportError(ERROR_CODE.NO_META_REGION, ".META. is not found on any region.");
        if (shouldFix()) {
          errors.print("Trying to fix a problem with .META...");
          setShouldRerun();
          HBaseFsckRepair.fixUnassigned(conf, root.metaEntry);
        }
      }
      // more than one region claims to hold .META.
      else if (metaRegions.size() > 1) {
        errors.reportError(ERROR_CODE.MULTI_META_REGION, ".META. is found on more than one region.");
        if (shouldFix()) {
          errors.print("Trying to fix a problem with .META...");
          setShouldRerun();
          List <HServerAddress> deployedOn = Lists.newArrayList();
          for (HbckInfo mRegion : metaRegions) {
            deployedOn.add(mRegion.metaEntry.regionServer);
          }
          HBaseFsckRepair.fixDupeAssignment(conf, root.metaEntry, deployedOn);
        }
      }
      // rerun hbck with hopefully fixed META
      return false;
    }
    // no errors, so continue normally
    return true;
  }

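  /**
   * Scan -ROOT- and .META., adding every region found to the regionInfo
   * map.
   */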
  void getMetaEntries() throws IOException {
    MetaScannerVisitor visitor = new MetaScannerVisitor() {
      int countRecord = 1;

      // comparator ordering KeyValues by timestamp, ascending
      final Comparator<KeyValue> comp = new Comparator<KeyValue>() {
        public int compare(KeyValue k1, KeyValue k2) {
          // compare the timestamps directly to avoid integer overflow
          long ts1 = k1.getTimestamp();
          long ts2 = k2.getTimestamp();
          return ts1 < ts2 ? -1 : (ts1 > ts2 ? 1 : 0);
        }
      };

      public boolean processRow(Result result) throws IOException {
        try {
          // record the latest modification time of this catalog row
          long ts = Collections.max(result.list(), comp).getTimestamp();

          // record region details
          byte [] value = result.getValue(HConstants.CATALOG_FAMILY,
              HConstants.REGIONINFO_QUALIFIER);
          if (value == null || value.length == 0) {
            emptyRegionInfoQualifiers.add(result);
            return true;
          }
          HRegionInfo info = Writables.getHRegionInfo(value);
          HServerAddress server = null;
          byte[] startCode = null;

          // record the assigned region server
          value = result.getValue(HConstants.CATALOG_FAMILY,
              HConstants.SERVER_QUALIFIER);
          if (value != null && value.length > 0) {
            String address = Bytes.toString(value);
            server = new HServerAddress(address);
          }

          // record the region's start code
          value = result.getValue(HConstants.CATALOG_FAMILY,
              HConstants.STARTCODE_QUALIFIER);
          if (value != null) {
            startCode = value;
          }
          MetaEntry m = new MetaEntry(info, server, startCode, ts);
          HbckInfo hbInfo = new HbckInfo(m);
          HbckInfo previous = regionInfo.put(info.getEncodedName(), hbInfo);
          if (previous != null) {
            throw new IOException("Two entries in META are same " + previous);
          }

          // show progress on the console
          if (countRecord % 100 == 0) {
            errors.progress();
          }
          countRecord++;
          return true;
        } catch (RuntimeException e) {
          LOG.error("Result=" + result);
          throw e;
        }
      }
    };

    // scan -ROOT- to pick up the .META. region entry
    MetaScanner.metaScan(conf, visitor, null, null,
      Integer.MAX_VALUE, HConstants.ROOT_TABLE_NAME);

    // scan .META. to pick up user regions
    MetaScanner.metaScan(conf, visitor);
    errors.print("");
  }

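  /**
   * Stores the regioninfo entries scanned from the catalog tables, together
   * with the server the region is assigned to and the modification time of
   * the record.
   */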
  private static class MetaEntry extends HRegionInfo {
    HServerAddress regionServer;   // server hosting this region
    long modTime;                  // timestamp of the most recent modification

    public MetaEntry(HRegionInfo rinfo, HServerAddress regionServer,
        byte[] startCode, long modTime) {
      super(rinfo);
      this.regionServer = regionServer;
      this.modTime = modTime;
    }
  }

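  /**
   * Maintains information about the disposition of a single region: its
   * entry in .META., its directory in HDFS, and the region servers it is
   * deployed on.
   */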
  static class HbckInfo implements Comparable {
    boolean onlyEdits = false;
    MetaEntry metaEntry = null;
    FileStatus foundRegionDir = null;
    List<HServerAddress> deployedOn = Lists.newArrayList();

    HbckInfo(MetaEntry metaEntry) {
      this.metaEntry = metaEntry;
    }

    public synchronized void addServer(HServerAddress server) {
      this.deployedOn.add(server);
    }

    public synchronized String toString() {
      if (metaEntry != null) {
        return metaEntry.getRegionNameAsString();
      } else if (foundRegionDir != null) {
        return foundRegionDir.getPath().toString();
      } else {
        return "UNKNOWN_REGION on " + Joiner.on(", ").join(deployedOn);
      }
    }

    @Override
    public int compareTo(Object o) {
      HbckInfo other = (HbckInfo) o;
      int startComparison = Bytes.compareTo(this.metaEntry.getStartKey(),
          other.metaEntry.getStartKey());
      if (startComparison != 0)
        return startComparison;
      else
        return Bytes.compareTo(this.metaEntry.getEndKey(), other.metaEntry.getEndKey());
    }
  }

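  /**
   * Prints a summary of all tables found on the system.
   */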
  private void printTableSummary() {
    System.out.println("Summary:");
    for (TInfo tInfo : tablesInfo.values()) {
      if (errors.tableHasErrors(tInfo)) {
        System.out.println("Table " + tInfo.getName() + " is inconsistent.");
      } else {
        System.out.println(" " + tInfo.getName() + " is okay.");
      }
      System.out.println(" Number of regions: " + tInfo.getNumRegions());
      System.out.print(" Deployed on: ");
      for (HServerAddress server : tInfo.deployedOn) {
        System.out.print(" " + server.toString());
      }
      System.out.println();
    }
  }

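  /**
   * Collects and reports the inconsistencies found during a run. The
   * default implementation prints to stdout; summarize() yields the
   * process exit code.
   */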
  interface ErrorReporter {
    public static enum ERROR_CODE {
      UNKNOWN, NO_META_REGION, NULL_ROOT_REGION, NO_VERSION_FILE, NOT_IN_META_HDFS, NOT_IN_META,
      NOT_IN_META_OR_DEPLOYED, NOT_IN_HDFS_OR_DEPLOYED, NOT_IN_HDFS, SERVER_DOES_NOT_MATCH_META, NOT_DEPLOYED,
      MULTI_DEPLOYED, SHOULD_NOT_BE_DEPLOYED, MULTI_META_REGION, RS_CONNECT_FAILURE,
      FIRST_REGION_STARTKEY_NOT_EMPTY, DUPE_STARTKEYS,
      HOLE_IN_REGION_CHAIN, OVERLAP_IN_REGION_CHAIN, REGION_CYCLE
    }
    public void clear();
    public void report(String message);
    public void reportError(String message);
    public void reportError(ERROR_CODE errorCode, String message);
    public void reportError(ERROR_CODE errorCode, String message, TInfo table, HbckInfo info);
    public void reportError(ERROR_CODE errorCode, String message, TInfo table, HbckInfo info1, HbckInfo info2);
    public int summarize();
    public void detail(String details);
    public ArrayList<ERROR_CODE> getErrorList();
    public void progress();
    public void print(String message);
    public void resetErrors();
    public boolean tableHasErrors(TInfo table);
  }

  private static class PrintingErrorReporter implements ErrorReporter {
    public int errorCount = 0;
    private int showProgress;

    Set<TInfo> errorTables = new HashSet<TInfo>();

    // ordered list of error codes seen, exposed through getErrorList()
    private ArrayList<ERROR_CODE> errorList = new ArrayList<ERROR_CODE>();

    public void clear() {
      errorTables.clear();
      errorList.clear();
      errorCount = 0;
    }

    public synchronized void reportError(ERROR_CODE errorCode, String message) {
      errorList.add(errorCode);
      if (!summary) {
        System.out.println("ERROR: " + message);
      }
      errorCount++;
      showProgress = 0;
    }

    public synchronized void reportError(ERROR_CODE errorCode, String message, TInfo table,
        HbckInfo info) {
      errorTables.add(table);
      String reference = "(region " + info.metaEntry.getRegionNameAsString() + ")";
      reportError(errorCode, reference + " " + message);
    }

    public synchronized void reportError(ERROR_CODE errorCode, String message, TInfo table,
        HbckInfo info1, HbckInfo info2) {
      errorTables.add(table);
      String reference = "(regions " + info1.metaEntry.getRegionNameAsString()
          + " and " + info2.metaEntry.getRegionNameAsString() + ")";
      reportError(errorCode, reference + " " + message);
    }

    public synchronized void reportError(String message) {
      reportError(ERROR_CODE.UNKNOWN, message);
    }

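    /**
     * Report error information, but do not increment the error count.
     * Intended for cases where the actual error was reported previously.
     */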
    public synchronized void report(String message) {
      if (! summary) {
        System.out.println("ERROR: " + message);
      }
      showProgress = 0;
    }

    public synchronized int summarize() {
      System.out.println(Integer.toString(errorCount) +
          " inconsistencies detected.");
      if (errorCount == 0) {
        System.out.println("Status: OK");
        return 0;
      } else {
        System.out.println("Status: INCONSISTENT");
        return -1;
      }
    }

    public ArrayList<ERROR_CODE> getErrorList() {
      return errorList;
    }

    public synchronized void print(String message) {
      if (!summary) {
        System.out.println(message);
      }
    }

    @Override
    public boolean tableHasErrors(TInfo table) {
      return errorTables.contains(table);
    }

    @Override
    public void resetErrors() {
      errorCount = 0;
    }

    public synchronized void detail(String message) {
      if (details) {
        System.out.println(message);
      }
      showProgress = 0;
    }

    public synchronized void progress() {
      if (showProgress++ == 10) {
        if (!summary) {
          System.out.print(".");
        }
        showProgress = 0;
      }
    }
  }

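  /**
   * Work item that contacts a single region server and records the regions
   * it is currently hosting.
   */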
  static class WorkItemRegion implements Runnable {
    private HBaseFsck hbck;
    private HServerInfo rsinfo;
    private ErrorReporter errors;
    private HConnection connection;
    private boolean done;

    WorkItemRegion(HBaseFsck hbck, HServerInfo info,
        ErrorReporter errors, HConnection connection) {
      this.hbck = hbck;
      this.rsinfo = info;
      this.errors = errors;
      this.connection = connection;
      this.done = false;
    }

    // is this work item done?
    synchronized boolean isDone() {
      return done;
    }

    @Override
    public synchronized void run() {
      errors.progress();
      try {
        HRegionInterface server = connection.getHRegionConnection(
            rsinfo.getServerAddress());

        // list all online regions from this region server
        List<HRegionInfo> regions = server.getOnlineRegions();
        if (details) {
          errors.detail("RegionServer: " + rsinfo.getServerName() +
              " number of regions: " + regions.size());
          for (HRegionInfo rinfo: regions) {
            errors.detail(" " + rinfo.getRegionNameAsString() +
                " id: " + rinfo.getRegionId() +
                " encoded_name: " + rinfo.getEncodedName() +
                " start: " + Bytes.toStringBinary(rinfo.getStartKey()) +
                " end: " + Bytes.toStringBinary(rinfo.getEndKey()));
          }
        }

        // record each online region against the shared region map
        for (HRegionInfo r:regions) {
          HbckInfo hbi = hbck.getOrCreateInfo(r.getEncodedName());
          hbi.addServer(rsinfo.getServerAddress());
        }
      } catch (IOException e) {
        // unable to connect to the region server
        errors.reportError(ERROR_CODE.RS_CONNECT_FAILURE, "RegionServer: " + rsinfo.getServerName() +
          " Unable to fetch region information. " + e);
      } finally {
        done = true;
        notifyAll(); // wake up anybody waiting for this item to be done
      }
    }
  }

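  /**
   * Work item that examines one table directory on HDFS.
   */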
  static class WorkItemHdfsDir implements Runnable {
    private HBaseFsck hbck;
    private FileStatus tableDir;
    private ErrorReporter errors;
    private FileSystem fs;
    private boolean done;

    WorkItemHdfsDir(HBaseFsck hbck, FileSystem fs, ErrorReporter errors,
        FileStatus status) {
      this.hbck = hbck;
      this.fs = fs;
      this.tableDir = status;
      this.errors = errors;
      this.done = false;
    }

    synchronized boolean isDone() {
      return done;
    }

    @Override
    public synchronized void run() {
      try {
        String tableName = tableDir.getPath().getName();
        // ignore hidden directories, except for the .META. directory
        if (tableName.startsWith(".") &&
            !tableName.equals( Bytes.toString(HConstants.META_TABLE_NAME)))
          return;

        // record each region directory found under this table directory so
        // the consistency check knows the region exists on HDFS
        FileStatus[] regionDirs = fs.listStatus(tableDir.getPath());
        for (FileStatus regionDir : regionDirs) {
          String encodedName = regionDir.getPath().getName();
          if (encodedName.startsWith(".")) continue; // skip hidden entries

          HbckInfo hbi = hbck.getOrCreateInfo(encodedName);
          synchronized (hbi) {
            hbi.foundRegionDir = regionDir;
          }
        }
      } catch (IOException e) {
        // unable to list the table directory on HDFS
        errors.reportError("Table directory " + tableDir.getPath().getName() +
          ": unable to fetch region information. " + e);
      } finally {
        done = true;
        notifyAll(); // wake up anybody waiting for this item to be done
      }
    }
  }

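  /**
   * Display the full report from fsck: all live and dead region servers,
   * and all known regions.
   */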
  void displayFullReport() {
    details = true;
  }

  /**
   * Set summary mode: print only the per-table summary and status.
   */
  void setSummary() {
    summary = true;
  }

  /**
   * Indicate that fsck should be rerun after the current pass, typically
   * because a fix was attempted.
   */
  void setShouldRerun() {
    rerun = true;
  }

  boolean shouldRerun() {
    return rerun;
  }

  /**
   * Enable or disable automatic fixing of the inconsistencies found.
   */
  void setFixErrors(boolean shouldFix) {
    fix = shouldFix;
  }

  boolean shouldFix() {
    return fix;
  }

  /**
   * Only consider tables whose state in META has not changed during the
   * last few seconds specified here.
   *
   * @param seconds the time lag in seconds
   */
  void setTimeLag(long seconds) {
    timelag = seconds * 1000; // convert to milliseconds
  }

  protected static void printUsageAndExit() {
    System.err.println("Usage: fsck [opts] ");
    System.err.println(" where [opts] are:");
    System.err.println(" -details Display full report of all regions.");
    System.err.println(" -timelag {timeInSeconds} Process only regions that " +
        " have not experienced any metadata updates in the last " +
        " {timeInSeconds} seconds.");
    System.err.println(" -fix Try to fix some of the errors.");
    System.err.println(" -sleepBeforeRerun {timeInSeconds} Sleep this many seconds" +
        " before checking if the fix worked if run with -fix");
    System.err.println(" -summary Print only summary of the tables and status.");

    Runtime.getRuntime().exit(-2);
  }

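  /**
   * Main program: parses the command-line options, runs the check once
   * and, if a fix was attempted, sleeps and re-runs the check to verify
   * the result.
   *
   * @param args command-line arguments
   */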
  public static void main(String [] args) throws Exception {
    // create a fsck object
    Configuration conf = HBaseConfiguration.create();
    conf.set("fs.defaultFS", conf.get("hbase.rootdir"));
    HBaseFsck fsck = new HBaseFsck(conf);
    long sleepBeforeRerun = DEFAULT_SLEEP_BEFORE_RERUN;

    // parse the command-line options
    for (int i = 0; i < args.length; i++) {
      String cmd = args[i];
      if (cmd.equals("-details")) {
        fsck.displayFullReport();
      } else if (cmd.equals("-timelag")) {
        if (i == args.length - 1) {
          System.err.println("HBaseFsck: -timelag needs a value.");
          printUsageAndExit();
        }
        try {
          long timelag = Long.parseLong(args[i+1]);
          fsck.setTimeLag(timelag);
        } catch (NumberFormatException e) {
          System.err.println("-timelag needs a numeric value.");
          printUsageAndExit();
        }
        i++;
      } else if (cmd.equals("-sleepBeforeRerun")) {
        if (i == args.length - 1) {
          System.err.println("HBaseFsck: -sleepBeforeRerun needs a value.");
          printUsageAndExit();
        }
        try {
          sleepBeforeRerun = Long.parseLong(args[i+1]);
        } catch (NumberFormatException e) {
          System.err.println("-sleepBeforeRerun needs a numeric value.");
          printUsageAndExit();
        }
        i++;
      } else if (cmd.equals("-fix")) {
        fsck.setFixErrors(true);
      } else if (cmd.equals("-summary")) {
        fsck.setSummary();
      } else {
        String str = "Unknown command line option : " + cmd;
        LOG.info(str);
        System.out.println(str);
        printUsageAndExit();
      }
    }

    // do the real work of fsck
    int code = fsck.doWork();

    // If we attempted any fixes, sleep to give the cluster time to settle
    // and then re-run the check to see whether the problems are gone.
    if (fsck.shouldRerun()) {
      try {
        LOG.info("Sleeping " + sleepBeforeRerun + "ms before re-checking after fix...");
        Thread.sleep(sleepBeforeRerun);
      } catch (InterruptedException ie) {
        Runtime.getRuntime().exit(code);
      }
      // just report, don't fix, on the second pass
      fsck.setFixErrors(false);
      fsck.errors.resetErrors();
      code = fsck.doWork();
    }

    Runtime.getRuntime().exit(code);
  }
}