1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.hadoop.hbase.util.hbck;
19
20 import java.io.FileNotFoundException;
21 import java.io.IOException;
22 import java.util.ArrayList;
23 import java.util.Collection;
24 import java.util.HashSet;
25 import java.util.List;
26 import java.util.Set;
27 import java.util.concurrent.Callable;
28 import java.util.concurrent.ConcurrentSkipListSet;
29 import java.util.concurrent.ExecutionException;
30 import java.util.concurrent.ExecutorService;
31 import java.util.concurrent.Future;
32 import java.util.concurrent.atomic.AtomicInteger;
33
34 import org.apache.commons.logging.Log;
35 import org.apache.commons.logging.LogFactory;
36 import org.apache.hadoop.conf.Configuration;
37 import org.apache.hadoop.fs.FileStatus;
38 import org.apache.hadoop.fs.FileSystem;
39 import org.apache.hadoop.fs.Path;
40 import org.apache.hadoop.hbase.HConstants;
41 import org.apache.hadoop.hbase.io.hfile.CacheConfig;
42 import org.apache.hadoop.hbase.io.hfile.CorruptHFileException;
43 import org.apache.hadoop.hbase.io.hfile.HFile;
44 import org.apache.hadoop.hbase.util.FSUtils.FamilyDirFilter;
45 import org.apache.hadoop.hbase.util.FSUtils.HFileFilter;
46 import org.apache.hadoop.hbase.util.FSUtils.RegionDirFilter;
47 import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter;
48
49
50
51
52
53
54
55
56
57 public class HFileCorruptionChecker {
58 private static final Log LOG = LogFactory.getLog(HFileCorruptionChecker.class);
59
60 final Configuration conf;
61 final FileSystem fs;
62 final CacheConfig cacheConf;
63 final ExecutorService executor;
64 final Set<Path> corrupted = new ConcurrentSkipListSet<Path>();
65 final Set<Path> failures = new ConcurrentSkipListSet<Path>();
66 final Set<Path> quarantined = new ConcurrentSkipListSet<Path>();
67 final Set<Path> missing = new ConcurrentSkipListSet<Path>();
68 final boolean inQuarantineMode;
69 final AtomicInteger hfilesChecked = new AtomicInteger();
70
71 public HFileCorruptionChecker(Configuration conf, ExecutorService executor,
72 boolean quarantine) throws IOException {
73 this.conf = conf;
74 this.fs = FileSystem.get(conf);
75 this.cacheConf = new CacheConfig(conf);
76 this.executor = executor;
77 this.inQuarantineMode = quarantine;
78 }
79
80
81
82
83
84
85
86
87
88 protected void checkHFile(Path p) throws IOException {
89 HFile.Reader r = null;
90 try {
91 r = HFile.createReader(fs, p, cacheConf);
92 } catch (CorruptHFileException che) {
93 LOG.warn("Found corrupt HFile " + p, che);
94 corrupted.add(p);
95 if (inQuarantineMode) {
96 Path dest = createQuarantinePath(p);
97 LOG.warn("Quarantining corrupt HFile " + p + " into " + dest);
98 boolean success = fs.mkdirs(dest.getParent());
99 success = success ? fs.rename(p, dest): false;
100 if (!success) {
101 failures.add(p);
102 } else {
103 quarantined.add(dest);
104 }
105 }
106 return;
107 } catch (FileNotFoundException fnfe) {
108 LOG.warn("HFile " + p + " was missing. Likely removed due to compaction/split?");
109 missing.add(p);
110 } finally {
111 hfilesChecked.addAndGet(1);
112 if (r != null) {
113 r.close(true);
114 }
115 }
116 }
117
118
119
120
121
122
123
124
125
126
127
128 Path createQuarantinePath(Path hFile) {
129
130 Path cfDir = hFile.getParent();
131 Path regionDir = cfDir.getParent();
132 Path tableDir = regionDir.getParent();
133
134
135 Path corruptBaseDir = new Path(conf.get(HConstants.HBASE_DIR), conf.get(
136 "hbase.hfile.quarantine.dir", HConstants.CORRUPT_DIR_NAME));
137 Path corruptTableDir = new Path(corruptBaseDir, tableDir.getName());
138 Path corruptRegionDir = new Path(corruptTableDir, regionDir.getName());
139 Path corruptFamilyDir = new Path(corruptRegionDir, cfDir.getName());
140 Path corruptHfile = new Path(corruptFamilyDir, hFile.getName());
141 return corruptHfile;
142 }
143
144
145
146
147
148
149
150
151 protected void checkColFamDir(Path cfDir) throws IOException {
152 FileStatus[] hfs = null;
153 try {
154 hfs = fs.listStatus(cfDir, new HFileFilter(fs));
155 } catch (FileNotFoundException fnfe) {
156
157 LOG.warn("Colfam Directory " + cfDir +
158 " does not exist. Likely due to concurrent split/compaction. Skipping.");
159 missing.add(cfDir);
160 return;
161 }
162
163
164 if (hfs.length == 0 && !fs.exists(cfDir)) {
165 LOG.warn("Colfam Directory " + cfDir +
166 " does not exist. Likely due to concurrent split/compaction. Skipping.");
167 missing.add(cfDir);
168 return;
169 }
170 for (FileStatus hfFs : hfs) {
171 Path hf = hfFs.getPath();
172 checkHFile(hf);
173 }
174 }
175
176
177
178
179
180
181
182
183 protected void checkRegionDir(Path regionDir) throws IOException {
184 FileStatus[] cfs = null;
185 try {
186 cfs = fs.listStatus(regionDir, new FamilyDirFilter(fs));
187 } catch (FileNotFoundException fnfe) {
188
189 LOG.warn("Region Directory " + regionDir +
190 " does not exist. Likely due to concurrent split/compaction. Skipping.");
191 missing.add(regionDir);
192 return;
193 }
194
195
196 if (cfs.length == 0 && !fs.exists(regionDir)) {
197 LOG.warn("Region Directory " + regionDir +
198 " does not exist. Likely due to concurrent split/compaction. Skipping.");
199 missing.add(regionDir);
200 return;
201 }
202
203 for (FileStatus cfFs : cfs) {
204 Path cfDir = cfFs.getPath();
205 checkColFamDir(cfDir);
206 }
207 }
208
209
210
211
212
213
214
215
216 void checkTableDir(Path tableDir) throws IOException {
217 FileStatus[] rds = fs.listStatus(tableDir, new RegionDirFilter(fs));
218 if (rds.length == 0 && !fs.exists(tableDir)) {
219
220 LOG.warn("Table Directory " + tableDir +
221 " does not exist. Likely due to concurrent delete. Skipping.");
222 missing.add(tableDir);
223 return;
224 }
225
226
227 List<RegionDirChecker> rdcs = new ArrayList<RegionDirChecker>();
228 List<Future<Void>> rdFutures;
229
230 for (FileStatus rdFs : rds) {
231 Path rdDir = rdFs.getPath();
232 RegionDirChecker work = new RegionDirChecker(rdDir);
233 rdcs.add(work);
234 }
235
236
237 try {
238 rdFutures = executor.invokeAll(rdcs);
239 } catch (InterruptedException ie) {
240 Thread.currentThread().interrupt();
241 LOG.warn("Region dirs checking interrupted!", ie);
242 return;
243 }
244
245 for (int i = 0; i < rdFutures.size(); i++) {
246 Future<Void> f = rdFutures.get(i);
247 try {
248 f.get();
249 } catch (ExecutionException e) {
250 LOG.warn("Failed to quaratine an HFile in regiondir "
251 + rdcs.get(i).regionDir, e.getCause());
252
253 if (e.getCause() instanceof IOException) {
254 throw (IOException) e.getCause();
255 }
256
257
258 if (e.getCause() instanceof RuntimeException) {
259 throw (RuntimeException) e.getCause();
260 }
261
262
263 LOG.error("Unexpected exception encountered", e);
264 return;
265 } catch (InterruptedException ie) {
266 Thread.currentThread().interrupt();
267 LOG.warn("Region dirs check interrupted!", ie);
268
269 return;
270 }
271 }
272 }
273
274
275
276
277
278 private class RegionDirChecker implements Callable<Void> {
279 final Path regionDir;
280
281 RegionDirChecker(Path regionDir) {
282 this.regionDir = regionDir;
283 }
284
285 @Override
286 public Void call() throws IOException {
287 checkRegionDir(regionDir);
288 return null;
289 }
290 }
291
292
293
294
295 public void checkTables(Collection<Path> tables) throws IOException {
296 for (Path t : tables) {
297 checkTableDir(t);
298 }
299 }
300
301
302
303
304 public Collection<Path> getFailures() {
305 return new HashSet<Path>(failures);
306 }
307
308
309
310
311 public Collection<Path> getCorrupted() {
312 return new HashSet<Path>(corrupted);
313 }
314
315
316
317
318 public int getHFilesChecked() {
319 return hfilesChecked.get();
320 }
321
322
323
324
325 public Collection<Path> getQuarantined() {
326 return new HashSet<Path>(quarantined);
327 }
328
329
330
331
332
333 public Collection<Path> getMissing() {
334 return new HashSet<Path>(missing);
335 }
336
337
338
339
340
341 public void report(ErrorReporter out) {
342 out.print("Checked " + hfilesChecked.get() + " hfile for corruption");
343 out.print(" HFiles corrupted: " + corrupted.size());
344 if (inQuarantineMode) {
345 out.print(" HFiles successfully quarantined: " + quarantined.size());
346 for (Path sq : quarantined) {
347 out.print(" " + sq);
348 }
349 out.print(" HFiles failed quarantine: " + failures.size());
350 for (Path fq : failures) {
351 out.print(" " + fq);
352 }
353 }
354 out.print(" HFiles moved while checking: " + missing.size());
355 for (Path mq : missing) {
356 out.print(" " + mq);
357 }
358
359 String initialState = (corrupted.size() == 0) ? "OK" : "CORRUPTED";
360 String fixedState = (corrupted.size() == quarantined.size()) ? "OK"
361 : "CORRUPTED";
362
363 if (inQuarantineMode) {
364 out.print("Summary: " + initialState + " => " + fixedState);
365 } else {
366 out.print("Summary: " + initialState);
367 }
368 }
369 }