1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package org.apache.hadoop.hbase.client;
21
22 import java.io.IOException;
23 import java.util.Collection;
24 import java.util.Collections;
25 import java.util.Comparator;
26 import java.util.TreeMap;
27 import java.util.TreeSet;
28 import java.util.concurrent.atomic.AtomicInteger;
29
30 import org.apache.commons.logging.Log;
31 import org.apache.commons.logging.LogFactory;
32 import org.apache.hadoop.conf.Configuration;
33 import org.apache.hadoop.fs.FileStatus;
34 import org.apache.hadoop.fs.FileSystem;
35 import org.apache.hadoop.fs.Path;
36 import org.apache.hadoop.hbase.ClusterStatus;
37 import org.apache.hadoop.hbase.HBaseConfiguration;
38 import org.apache.hadoop.hbase.HConstants;
39 import org.apache.hadoop.hbase.HRegionInfo;
40 import org.apache.hadoop.hbase.HServerAddress;
41 import org.apache.hadoop.hbase.HServerInfo;
42 import org.apache.hadoop.hbase.HTableDescriptor;
43 import org.apache.hadoop.hbase.KeyValue;
44 import org.apache.hadoop.hbase.MasterNotRunningException;
45 import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitor;
46 import org.apache.hadoop.hbase.ipc.HMasterInterface;
47 import org.apache.hadoop.hbase.ipc.HRegionInterface;
48 import org.apache.hadoop.hbase.util.Bytes;
49 import org.apache.hadoop.hbase.util.Writables;
50
51
52
53
54
55 public class HBaseFsck extends HBaseAdmin {
56 public static final long DEFAULT_TIME_LAG = 60000;
57
58 private static final Log LOG = LogFactory.getLog(HBaseFsck.class.getName());
59 private Configuration conf;
60 private FileSystem fs;
61 private Path rootDir;
62
63 private ClusterStatus status;
64 private HMasterInterface master;
65 private HConnection connection;
66 private TreeMap<HRegionInfo, MetaEntry> metaEntries;
67
68 private boolean details = false;
69 private long timelag = DEFAULT_TIME_LAG;
70
71
72
73
74
75
76
  /**
   * Constructor: connects to the running cluster via the superclass
   * (HBaseAdmin), then caches the filesystem, HBase root directory,
   * master handle and cluster status used by the checks.
   *
   * @param conf Configuration object; fs.defaultFS is expected to point at
   *             the HBase root filesystem (main() sets it from hbase.rootdir)
   * @throws MasterNotRunningException if the master is not running
   * @throws IOException if a remote or network exception occurs
   */
  public HBaseFsck(Configuration conf)
    throws MasterNotRunningException, IOException {
    super(conf);
    this.conf = conf;

    // fetch information from master — super(conf) must run first so that
    // getMaster()/getConnection() below have a live cluster connection.
    this.fs = FileSystem.get(conf);
    this.rootDir = new Path(conf.get(HConstants.HBASE_DIR));

    master = getMaster();
    status = master.getClusterStatus();
    connection = getConnection();
    // Sorted map of every region found while scanning META; populated later
    // by getMetaEntries().
    this.metaEntries = new TreeMap<HRegionInfo, MetaEntry>();
  }
93
94
95
96
97
98
99 int doWork() throws IOException {
100
101 System.out.println("Version: " + status.getHBaseVersion());
102
103
104
105 getMetaEntries(metaEntries);
106
107
108 AtomicInteger numSkipped = new AtomicInteger(0);
109 HTableDescriptor[] allTables = getTables(metaEntries, numSkipped);
110 System.out.println("Number of Tables: " + allTables.length);
111 if (details) {
112 if (numSkipped.get() > 0) {
113 System.out.println("\n Number of Tables in flux: " + numSkipped.get());
114 }
115 for (HTableDescriptor td : allTables) {
116 String tableName = td.getNameAsString();
117 System.out.println("\t Table: " + tableName + "\t" +
118 (td.isReadOnly() ? "ro" : "rw") + "\t" +
119 (td.isRootRegion() ? "ROOT" :
120 (td.isMetaRegion() ? "META" : " ")) + "\t" +
121 " families:" + td.getFamilies().size());
122 }
123 }
124
125
126 Collection<HServerInfo> regionServers = status.getServerInfo();
127 System.out.println("Number of live region servers:" +
128 regionServers.size());
129 if (details) {
130 for (HServerInfo rsinfo: regionServers) {
131 System.out.println("\t RegionServer:" + rsinfo.getServerName());
132 }
133 }
134
135
136 Collection<String> deadRegionServers = status.getDeadServerNames();
137 System.out.println("Number of dead region servers:" +
138 deadRegionServers.size());
139 if (details) {
140 for (String name: deadRegionServers) {
141 System.out.println("\t RegionServer(dead):" + name);
142 }
143 }
144
145
146 boolean status1 = processRegionServers(regionServers);
147
148
149 boolean status2 = checkHdfs();
150
151 if (status1 == true && status2 == true) {
152 System.out.println("\nRest easy, buddy! HBase is clean. ");
153 return 0;
154 } else {
155 System.out.println("\nInconsistencies detected.");
156 return -1;
157 }
158 }
159
160
161
162
163
  /**
   * Checks HDFS region directories against the cached META entries:
   * every table directory under the HBase root must be known to META,
   * and vice versa.  Also verifies the ROOT/META directories and the
   * version file exist.
   *
   * @return true if no inconsistencies were found
   * @throws IOException if a filesystem operation fails
   */
  boolean checkHdfs() throws IOException {

    boolean status = true; // success until proven otherwise

    // NOTE(review): this map is keyed by TABLE name, so when a table has
    // several regions only the last MetaEntry survives — the final check
    // below is effectively per-table, although its message names a region.
    // Verify this is intentional.
    TreeMap<String, MetaEntry> regions = new TreeMap<String, MetaEntry>();
    for (MetaEntry meta: metaEntries.values()) {
      regions.put(meta.getTableDesc().getNameAsString(), meta);
    }

    // list all directories directly under the HBase root dir.
    // (FileStatus[] is null if the path does not exist.)
    TreeMap<Path, FileStatus> allTableDirs = new TreeMap<Path, FileStatus>();
    FileStatus[] files = fs.listStatus(rootDir);
    for (int i = 0; files != null && i < files.length; i++) {
      allTableDirs.put(files[i].getPath(), files[i]);
    }

    // the catalog tables ROOT and META are not listed in META itself,
    // so pull their directories out of the working set and check them here.
    Path rdir = new Path(rootDir, Bytes.toString(HConstants.ROOT_TABLE_NAME));
    FileStatus ignore = allTableDirs.remove(rdir);
    if (ignore == null) {
      status = false;
      System.out.print("\nERROR: Path " + rdir + " for ROOT table does not exist.");
    }
    Path mdir = new Path(rootDir, Bytes.toString(HConstants.META_TABLE_NAME));
    ignore = allTableDirs.remove(mdir);
    if (ignore == null) {
      status = false;
      System.out.print("\nERROR: Path " + mdir + " for META table does not exist.");
    }

    // the version file lives beside the table directories; it is expected,
    // so remove it from the working set too.
    Path vfile = new Path(rootDir, HConstants.VERSION_FILE_NAME);
    ignore = allTableDirs.remove(vfile);
    if (ignore == null) {
      status = false;
      System.out.print("\nERROR: Version file " + vfile + " does not exist.");
    }

    // Cross off every table directory that some META region accounts for.
    for (HRegionInfo rinfo: metaEntries.values()) {
      Path tableDir = HTableDescriptor.getTableDir(rootDir,
                        rinfo.getTableDesc().getName());
      FileStatus found = allTableDirs.remove(tableDir);
      if (found != null) {
        // directory exists in HDFS; mark the table as accounted for in META
        regions.remove(tableDir.getName());
      }
    }

    // Whatever is left in allTableDirs exists in HDFS but not in META.
    // Skip entries modified within `timelag` ms (could be mid-create/split)
    // and dot-prefixed system paths (e.g. .logs).
    long now = System.currentTimeMillis();
    for (FileStatus region: allTableDirs.values()) {
      if (region.getModificationTime() + timelag < now) {
        String finalComponent = region.getPath().getName();
        if (!finalComponent.startsWith(".")) {
          System.out.print("\nERROR: Path " + region.getPath() +
                           " does not have a corresponding entry in META.");
          status = false;
        }
      }
    }

    // Whatever is left in `regions` is in META but has no HDFS directory.
    for (HRegionInfo rinfo: regions.values()) {
      System.out.println("\nERROR: Region " + rinfo.getRegionNameAsString() +
                         " does not have a corresponding entry in HDFS.");
      status = false;
    }
    return status;
  }
240
241
242
243
244
245
246
247 boolean processRegionServers(Collection<HServerInfo> regionServerList)
248 throws IOException {
249
250
251 TreeMap<HRegionInfo, MetaEntry> tmp =
252 new TreeMap<HRegionInfo, MetaEntry>(metaEntries);
253 long errorCount = 0;
254 int showProgress = 0;
255
256
257 for (HServerInfo rsinfo: regionServerList) {
258 showProgress++;
259 try {
260 HRegionInterface server = connection.getHRegionConnection(
261 rsinfo.getServerAddress());
262
263
264 HRegionInfo[] regions = server.getRegionsAssignment();
265 if (details) {
266 System.out.print("\nRegionServer:" + rsinfo.getServerName() +
267 " number of regions:" + regions.length);
268 for (HRegionInfo rinfo: regions) {
269 System.out.print("\n\t name:" + rinfo.getRegionNameAsString() +
270 " id:" + rinfo.getRegionId() +
271 " encoded name:" + rinfo.getEncodedName() +
272 " start :" + Bytes.toStringBinary(rinfo.getStartKey()) +
273 " end :" + Bytes.toStringBinary(rinfo.getEndKey()));
274 }
275 showProgress = 0;
276 }
277
278
279 for (HRegionInfo r: regions) {
280 MetaEntry metaEntry = metaEntries.get(r);
281
282
283 if (metaEntry == null) {
284 if (r.isMetaRegion()) {
285 continue;
286 }
287 System.out.print("\nERROR: Region " + r.getRegionNameAsString() +
288 " found on server " + rsinfo.getServerAddress() +
289 " but is not listed in META.");
290 errorCount++;
291 showProgress = 0;
292 continue;
293 }
294 if (!metaEntry.regionServer.equals(rsinfo.getServerAddress())) {
295 System.out.print("\nERROR: Region " + r.getRegionNameAsString() +
296 " found on server " + rsinfo.getServerAddress() +
297 " but is listed in META to be on server " +
298 metaEntry.regionServer);
299 errorCount++;
300 showProgress = 0;
301 }
302
303
304 tmp.remove(r);
305 }
306 } catch (IOException e) {
307 if (details) {
308 System.out.print("\nRegionServer:" + rsinfo.getServerName() +
309 " Unable to fetch region information. " + e);
310 }
311 }
312 if (showProgress % 10 == 0) {
313 System.out.print(".");
314 showProgress = 0;
315 }
316 }
317
318
319 for (MetaEntry metaEntry: tmp.values()) {
320
321
322
323
324 if (metaEntry.isOffline()) continue;
325 System.out.print("\nERROR: Region " + metaEntry.getRegionNameAsString() +
326 " is not served by any region server " +
327 " but is listed in META to be on server " +
328 metaEntry.regionServer);
329 errorCount++;
330 }
331
332 if (errorCount > 0) {
333 System.out.println("\nDetected " + errorCount + " inconsistencies. " +
334 "This might not indicate a real problem because these regions " +
335 "could be in the midst of a split. Consider re-running with a " +
336 "larger value of -timelag.");
337 return false;
338 }
339 return true;
340 }
341
342
343
344
345
346
347
348
349
350
351
352 HTableDescriptor[] getTables(final TreeMap<HRegionInfo, MetaEntry> regionList,
353 AtomicInteger numSkipped) {
354 TreeSet<HTableDescriptor> uniqueTables = new TreeSet<HTableDescriptor>();
355 long now = System.currentTimeMillis();
356
357 for (MetaEntry m: regionList.values()) {
358 HRegionInfo info = m;
359
360
361
362 if (info != null && info.getStartKey().length == 0) {
363 if (m.modTime + timelag < now) {
364 uniqueTables.add(info.getTableDesc());
365 } else {
366 numSkipped.incrementAndGet();
367 }
368 }
369 }
370 return uniqueTables.toArray(new HTableDescriptor[uniqueTables.size()]);
371 }
372
373
374
375
376
377
378 void getMetaEntries(final TreeMap<HRegionInfo,MetaEntry> regionList) throws IOException {
379 MetaScannerVisitor visitor = new MetaScannerVisitor() {
380 int countRecord = 1;
381
382
383 final Comparator<KeyValue> comp = new Comparator<KeyValue>() {
384 public int compare(KeyValue k1, KeyValue k2) {
385 return (int)(k1.getTimestamp() - k2.getTimestamp());
386 }
387 };
388
389 public boolean processRow(Result result) throws IOException {
390 try {
391
392
393 long ts = Collections.max(result.list(), comp).getTimestamp();
394
395
396 byte[] value = result.getValue(HConstants.CATALOG_FAMILY,
397 HConstants.REGIONINFO_QUALIFIER);
398 HRegionInfo info = null;
399 HServerAddress server = null;
400 byte[] startCode = null;
401 if (value != null) {
402 info = Writables.getHRegionInfo(value);
403 }
404
405
406 value = result.getValue(HConstants.CATALOG_FAMILY,
407 HConstants.SERVER_QUALIFIER);
408 if (value != null && value.length > 0) {
409 String address = Bytes.toString(value);
410 server = new HServerAddress(address);
411 }
412
413
414 value = result.getValue(HConstants.CATALOG_FAMILY,
415 HConstants.STARTCODE_QUALIFIER);
416 if (value != null) {
417 startCode = value;
418 }
419 MetaEntry m = new MetaEntry(info, server, startCode, ts);
420 m = regionList.put(m ,m);
421 if (m != null) {
422 throw new IOException("Two entries in META are same " + m);
423 }
424
425
426 if (countRecord % 100 == 0) {
427 System.out.print(".");
428 }
429 countRecord++;
430 return true;
431 } catch (RuntimeException e) {
432 LOG.error("Result=" + result);
433 throw e;
434 }
435 }
436 };
437 MetaScanner.metaScan(conf, visitor);
438 System.out.println("");
439 }
440
441
442
443
444 private static class MetaEntry extends HRegionInfo {
445 HServerAddress regionServer;
446 byte[] startCode;
447 long modTime;
448
449 public MetaEntry(HRegionInfo rinfo, HServerAddress regionServer,
450 byte[] startCode, long modTime) {
451 super(rinfo);
452 this.regionServer = regionServer;
453 this.startCode = startCode;
454 this.modTime = modTime;
455 }
456 }
457
458
459
460
461
462 void displayFullReport() {
463 details = true;
464 }
465
466
467
468
469
470
471 void setTimeLag(long seconds) {
472 timelag = seconds * 1000;
473 }
474
475 protected static void printUsageAndExit() {
476 System.err.println("Usage: fsck [opts] ");
477 System.err.println(" where [opts] are:");
478 System.err.println(" -details Display full report of all regions.");
479 System.err.println(" -timelag {timeInSeconds} Process only regions that " +
480 " have not experienced any metadata updates in the last " +
481 " {{timeInSeconds} seconds.");
482 Runtime.getRuntime().exit(-2);
483 }
484
485
486
487
488
489 public static void main(String [] args)
490 throws IOException, MasterNotRunningException {
491
492
493 Configuration conf = HBaseConfiguration.create();
494 conf.set("fs.defaultFS", conf.get("hbase.rootdir"));
495 HBaseFsck fsck = new HBaseFsck(conf);
496
497
498 for (int i = 0; i < args.length; i++) {
499 String cmd = args[i];
500 if (cmd.equals("-details")) {
501 fsck.displayFullReport();
502 } else if (cmd.equals("-timelag")) {
503 if (i == args.length - 1) {
504 System.err.println("HBaseFsck: -timelag needs a value.");
505 printUsageAndExit();
506 }
507 try {
508 long timelag = Long.parseLong(args[i+1]);
509 fsck.setTimeLag(timelag);
510 } catch (NumberFormatException e) {
511 System.err.println("-timelag needs a numeric value.");
512 printUsageAndExit();
513 }
514 i++;
515 } else {
516 String str = "Unknown command line option : " + cmd;
517 LOG.info(str);
518 System.out.println(str);
519 printUsageAndExit();
520 }
521 }
522
523 int code = fsck.doWork();
524 Runtime.getRuntime().exit(code);
525 }
526 }
527