1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  package org.apache.hadoop.hbase.regionserver;
21  
22  import java.io.FileNotFoundException;
23  import java.io.IOException;
24  import java.util.ArrayList;
25  import java.util.Collection;
26  import java.util.List;
27  import java.util.Map;
28  import java.util.UUID;
29  
30  import org.apache.commons.logging.Log;
31  import org.apache.commons.logging.LogFactory;
32  import org.apache.hadoop.hbase.classification.InterfaceAudience;
33  import org.apache.hadoop.conf.Configuration;
34  import org.apache.hadoop.fs.FSDataInputStream;
35  import org.apache.hadoop.fs.FSDataOutputStream;
36  import org.apache.hadoop.fs.FileStatus;
37  import org.apache.hadoop.fs.FileSystem;
38  import org.apache.hadoop.fs.FileUtil;
39  import org.apache.hadoop.fs.Path;
40  import org.apache.hadoop.fs.PathFilter;
41  import org.apache.hadoop.fs.permission.FsPermission;
42  import org.apache.hadoop.hbase.HColumnDescriptor;
43  import org.apache.hadoop.hbase.HConstants;
44  import org.apache.hadoop.hbase.HRegionInfo;
45  import org.apache.hadoop.hbase.HTableDescriptor;
46  import org.apache.hadoop.hbase.KeyValue;
47  import org.apache.hadoop.hbase.backup.HFileArchiver;
48  import org.apache.hadoop.hbase.fs.HFileSystem;
49  import org.apache.hadoop.hbase.io.Reference;
50  import org.apache.hadoop.hbase.util.Bytes;
51  import org.apache.hadoop.hbase.util.FSHDFSUtils;
52  import org.apache.hadoop.hbase.util.FSUtils;
53  import org.apache.hadoop.hbase.util.Threads;
54  
55  /**
56   * View to an on-disk Region.
57   * Provides the set of methods necessary to interact with the on-disk region data.
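 *
 * <p>A minimal usage sketch, not taken from the original source: the configuration,
 * file system, table dir, region info and the family name "f1" are assumed to exist
 * in the caller's scope.
 * <pre>
 *   // Open an existing on-disk region read-only and list one family's store files.
 *   HRegionFileSystem regionFs =
 *       HRegionFileSystem.openRegionFromFileSystem(conf, fs, tableDir, regionInfo, true);
 *   Collection&lt;StoreFileInfo&gt; files = regionFs.getStoreFiles("f1"); // may be null
 *   if (files != null) {
 *     for (StoreFileInfo storeFile : files) {
 *       LOG.debug("store file: " + storeFile.getPath());
 *     }
 *   }
 * </pre>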
58   */
59  @InterfaceAudience.Private
60  public class HRegionFileSystem {
61    public static final Log LOG = LogFactory.getLog(HRegionFileSystem.class);
62  
63    /** Name of the region info file that resides just under the region directory. */
64    public final static String REGION_INFO_FILE = ".regioninfo";
65  
66    /** Temporary subdirectory of the region directory used for merges. */
67    public static final String REGION_MERGES_DIR = ".merges";
68  
69    /** Temporary subdirectory of the region directory used for splits. */
70    public static final String REGION_SPLITS_DIR = ".splits";
71  
72    /** Temporary subdirectory of the region directory used for compaction output. */
73    private static final String REGION_TEMP_DIR = ".tmp";
74  
75    private final HRegionInfo regionInfo;
76    private final Configuration conf;
77    private final Path tableDir;
78    private final FileSystem fs;
79  
80    /**
81     * In order to handle NN connectivity hiccups, one needs to retry non-idempotent operations at
82     * the client level.
83     */
84    private final int hdfsClientRetriesNumber;
85    private final int baseSleepBeforeRetries;
86    private static final int DEFAULT_HDFS_CLIENT_RETRIES_NUMBER = 10;
87    private static final int DEFAULT_BASE_SLEEP_BEFORE_RETRIES = 1000;
88  
89    /**
90     * Create a view to the on-disk region
91     * @param conf the {@link Configuration} to use
92     * @param fs {@link FileSystem} that contains the region
93     * @param tableDir {@link Path} to where the table is being stored
94     * @param regionInfo {@link HRegionInfo} for region
95     */
96    HRegionFileSystem(final Configuration conf, final FileSystem fs, final Path tableDir,
97        final HRegionInfo regionInfo) {
98      this.fs = fs;
99      this.conf = conf;
100     this.tableDir = tableDir;
101     this.regionInfo = regionInfo;
102     this.hdfsClientRetriesNumber = conf.getInt("hdfs.client.retries.number",
103       DEFAULT_HDFS_CLIENT_RETRIES_NUMBER);
104     this.baseSleepBeforeRetries = conf.getInt("hdfs.client.sleep.before.retries",
105       DEFAULT_BASE_SLEEP_BEFORE_RETRIES);
106  }
107 
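  // Illustrative sketch (not part of the original class): the retry behaviour wired up in the
  // constructor above is controlled by two configuration keys; the values below are examples.
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   conf.setInt("hdfs.client.retries.number", 5);          // default 10
  //   conf.setInt("hdfs.client.sleep.before.retries", 500);  // base sleep in ms, default 1000
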
108   /** @return the underlying {@link FileSystem} */
109   public FileSystem getFileSystem() {
110     return this.fs;
111   }
112 
113   /** @return the {@link HRegionInfo} that describes this on-disk region view */
114   public HRegionInfo getRegionInfo() {
115     return this.regionInfo;
116   }
117 
118   /** @return {@link Path} to the table directory that contains this region. */
119   public Path getTableDir() {
120     return this.tableDir;
121   }
122 
123   /** @return {@link Path} to the region directory. */
124   public Path getRegionDir() {
125     return new Path(this.tableDir, this.regionInfo.getEncodedName());
126   }
127 
128   // ===========================================================================
129   //  Temp Helpers
130   // ===========================================================================
131   /** @return {@link Path} to the region's temp directory, used for file creations */
132   Path getTempDir() {
133     return new Path(getRegionDir(), REGION_TEMP_DIR);
134   }
135 
136   /**
137    * Clean up any temp detritus that may have been left around from previous operation attempts.
138    */
139   void cleanupTempDir() throws IOException {
140     deleteDir(getTempDir());
141   }
142 
143   // ===========================================================================
144   //  Store/StoreFile Helpers
145   // ===========================================================================
146   /**
147    * Returns the directory path of the specified family
148    * @param familyName Column Family Name
149    * @return {@link Path} to the directory of the specified family
150    */
151   public Path getStoreDir(final String familyName) {
152     return new Path(this.getRegionDir(), familyName);
153   }
154 
155   /**
156    * Create the store directory for the specified family name
157    * @param familyName Column Family Name
158    * @return {@link Path} to the directory of the specified family
159    * @throws IOException if the directory creation fails.
160    */
161   Path createStoreDir(final String familyName) throws IOException {
162     Path storeDir = getStoreDir(familyName);
163     if(!fs.exists(storeDir) && !createDir(storeDir))
164       throw new IOException("Failed creating "+storeDir);
165     return storeDir;
166   }
167 
168   /**
169    * Returns the store files available for the family.
170    * This method filters the listing so that only valid store files are returned.
171    * @param familyName Column Family Name
172    * @return a set of {@link StoreFileInfo} for the specified family.
173    */
174   public Collection<StoreFileInfo> getStoreFiles(final byte[] familyName) throws IOException {
175     return getStoreFiles(Bytes.toString(familyName));
176   }
177 
178   public Collection<StoreFileInfo> getStoreFiles(final String familyName) throws IOException {
179     return getStoreFiles(familyName, true);
180   }
181 
182   /**
183    * Returns the store files available for the family.
184    * @param familyName Column Family Name
185    * @param validate if true, filter out invalid store files from the result
186    * @return a set of {@link StoreFileInfo} for the specified family.
187    */
188   public Collection<StoreFileInfo> getStoreFiles(final String familyName, final boolean validate)
189       throws IOException {
190     Path familyDir = getStoreDir(familyName);
191     FileStatus[] files = FSUtils.listStatus(this.fs, familyDir);
192     if (files == null) {
193       LOG.debug("No StoreFiles for: " + familyDir);
194       return null;
195     }
196 
197     ArrayList<StoreFileInfo> storeFiles = new ArrayList<StoreFileInfo>(files.length);
198     for (FileStatus status: files) {
199       if (validate && !StoreFileInfo.isValid(status)) {
200         LOG.warn("Invalid StoreFile: " + status.getPath());
201         continue;
202       }
203 
204       storeFiles.add(new StoreFileInfo(this.conf, this.fs, status));
205     }
206     return storeFiles;
207   }
208 
209   /**
210    * Return Qualified Path of the specified family/file
211    *
212    * @param familyName Column Family Name
213    * @param fileName File Name
214    * @return The qualified Path for the specified family/file
215    */
216   Path getStoreFilePath(final String familyName, final String fileName) {
217     Path familyDir = getStoreDir(familyName);
218     return new Path(familyDir, fileName).makeQualified(this.fs);
219   }
220 
221   /**
222    * Return the store file information of the specified family/file.
223    *
224    * @param familyName Column Family Name
225    * @param fileName File Name
226    * @return The {@link StoreFileInfo} for the specified family/file
227    */
228   StoreFileInfo getStoreFileInfo(final String familyName, final String fileName)
229       throws IOException {
230     Path familyDir = getStoreDir(familyName);
231     FileStatus status = fs.getFileStatus(new Path(familyDir, fileName));
232     return new StoreFileInfo(this.conf, this.fs, status);
233   }
234 
235   /**
236    * Returns true if the specified family has reference files
237    * @param familyName Column Family Name
238    * @return true if family contains reference files
239    * @throws IOException
240    */
241   public boolean hasReferences(final String familyName) throws IOException {
242     FileStatus[] files = FSUtils.listStatus(fs, getStoreDir(familyName),
243       new PathFilter () {
244         public boolean accept(Path path) {
245           return StoreFileInfo.isReference(path);
246         }
247       }
248     );
249     return files != null && files.length > 0;
250   }
251 
252   /**
253    * Check whether region has Reference file
254    * @param htd table descriptor of the region
255    * @return true if region has reference file
256    * @throws IOException
257    */
258   public boolean hasReferences(final HTableDescriptor htd) throws IOException {
259     for (HColumnDescriptor family : htd.getFamilies()) {
260       if (hasReferences(family.getNameAsString())) {
261         return true;
262       }
263     }
264     return false;
265   }
266 
267   /**
268    * @return the set of families present on disk
269    * @throws IOException
270    */
271   public Collection<String> getFamilies() throws IOException {
272     FileStatus[] fds = FSUtils.listStatus(fs, getRegionDir(), new FSUtils.FamilyDirFilter(fs));
273     if (fds == null) return null;
274 
275     ArrayList<String> families = new ArrayList<String>(fds.length);
276     for (FileStatus status: fds) {
277       families.add(status.getPath().getName());
278     }
279 
280     return families;
281   }
282 
283   /**
284    * Remove the region family from disk, archiving the store files.
285    * @param familyName Column Family Name
286    * @throws IOException if an error occurs during the archiving
287    */
288   public void deleteFamily(final String familyName) throws IOException {
289     // archive family store files
290     HFileArchiver.archiveFamily(fs, conf, regionInfo, tableDir, Bytes.toBytes(familyName));
291 
292     // delete the family folder
293     Path familyDir = getStoreDir(familyName);
294     if(fs.exists(familyDir) && !deleteDir(familyDir))
295       throw new IOException("Could not delete family " + familyName
296           + " from FileSystem for region " + regionInfo.getRegionNameAsString() + "("
297           + regionInfo.getEncodedName() + ")");
298   }
299 
300   /**
301    * Generate a unique file name, used by createTempName() and commitStoreFile()
302    * @param suffix extra information to append to the generated name
303    * @return Unique file name
304    */
305   private static String generateUniqueName(final String suffix) {
306     String name = UUID.randomUUID().toString().replaceAll("-", "");
307     if (suffix != null) name += suffix;
308     return name;
309   }
310 
311   /**
312    * Generate a unique temporary Path. Used in conjunction with commitStoreFile()
313    * to get a safer file creation.
314    * <pre>
315    * Path file = regionFs.createTempName();
316    * ...StoreFile.Writer(file)...
317    * regionFs.commitStoreFile("family", file);
318    * </pre>
319    *
320    * @return Unique {@link Path} of the temporary file
321    */
322   public Path createTempName() {
323     return createTempName(null);
324   }
325 
326   /**
327    * Generate a unique temporary Path. Used in conjunction with commitStoreFile()
328    * to get a safer file creation.
329    * <pre>
330    * Path file = regionFs.createTempName();
331    * ...StoreFile.Writer(file)...
332    * regionFs.commitStoreFile("family", file);
333    * </pre>
334    *
335    * @param suffix extra information to append to the generated name
336    * @return Unique {@link Path} of the temporary file
337    */
338   public Path createTempName(final String suffix) {
339     return new Path(getTempDir(), generateUniqueName(suffix));
340   }
341 
342   /**
343    * Move the file from a build/temp location to the main family store directory.
344    * @param familyName Family that will gain the file
345    * @param buildPath {@link Path} to the file to commit.
346    * @return The new {@link Path} of the committed file
347    * @throws IOException
348    */
349   public Path commitStoreFile(final String familyName, final Path buildPath) throws IOException {
350     return commitStoreFile(familyName, buildPath, -1, false);
351   }
352 
353   /**
354    * Move the file from a build/temp location to the main family store directory.
355    * @param familyName Family that will gain the file
356    * @param buildPath {@link Path} to the file to commit.
357    * @param seqNum Sequence Number to append to the file name (less than 0 if no sequence number)
358    * @param generateNewName False if you want to keep the buildPath name
359    * @return The new {@link Path} of the committed file
360    * @throws IOException
361    */
362   private Path commitStoreFile(final String familyName, final Path buildPath,
363       final long seqNum, final boolean generateNewName) throws IOException {
364     Path storeDir = getStoreDir(familyName);
365     if(!fs.exists(storeDir) && !createDir(storeDir))
366       throw new IOException("Failed creating " + storeDir);
367 
368     String name = buildPath.getName();
369     if (generateNewName) {
370       name = generateUniqueName((seqNum < 0) ? null : "_SeqId_" + seqNum + "_");
371     }
372     Path dstPath = new Path(storeDir, name);
373     if (!fs.exists(buildPath)) {
374       throw new FileNotFoundException(buildPath.toString());
375     }
376     LOG.debug("Committing store file " + buildPath + " as " + dstPath);
377     // buildPath exists, therefore not doing an exists() check.
378     if (!rename(buildPath, dstPath)) {
379       throw new IOException("Failed rename of " + buildPath + " to " + dstPath);
380     }
381     return dstPath;
382   }
383 
384 
385   /**
386    * Moves multiple store files into this region's family store directories.
387    * @param storeFiles list of store files divided by family
388    * @throws IOException
389    */
390   void commitStoreFiles(final Map<byte[], List<StoreFile>> storeFiles) throws IOException {
391     for (Map.Entry<byte[], List<StoreFile>> es: storeFiles.entrySet()) {
392       String familyName = Bytes.toString(es.getKey());
393       for (StoreFile sf: es.getValue()) {
394         commitStoreFile(familyName, sf.getPath());
395       }
396     }
397   }
398 
399   /**
400    * Archives the specified store file from the specified family.
401    * @param familyName Family that contains the store files
402    * @param filePath {@link Path} to the store file to remove
403    * @throws IOException if the archiving fails
404    */
405   public void removeStoreFile(final String familyName, final Path filePath)
406       throws IOException {
407     HFileArchiver.archiveStoreFile(this.conf, this.fs, this.regionInfo,
408         this.tableDir, Bytes.toBytes(familyName), filePath);
409   }
410 
411   /**
412    * Closes and archives the specified store files from the specified family.
413    * @param familyName Family that contains the store files
414    * @param storeFiles set of store files to remove
415    * @throws IOException if the archiving fails
416    */
417   public void removeStoreFiles(final String familyName, final Collection<StoreFile> storeFiles)
418       throws IOException {
419     HFileArchiver.archiveStoreFiles(this.conf, this.fs, this.regionInfo,
420         this.tableDir, Bytes.toBytes(familyName), storeFiles);
421   }
422 
423   /**
424    * Bulk load: Add a specified store file to the specified family.
425    * If the source file is on the same file-system it is moved from the
426    * source location to the destination location; otherwise it is copied over.
427    *
428    * @param familyName Family that will gain the file
429    * @param srcPath {@link Path} to the file to import
430    * @param seqNum Bulk Load sequence number
431    * @return The destination {@link Path} of the bulk loaded file
432    * @throws IOException
433    */
434   Path bulkLoadStoreFile(final String familyName, Path srcPath, long seqNum)
435       throws IOException {
436     // Copy the file if it's on another filesystem
437     FileSystem srcFs = srcPath.getFileSystem(conf);
438     FileSystem desFs = fs instanceof HFileSystem ? ((HFileSystem)fs).getBackingFs() : fs;
439 
440     // We can't compare FileSystem instances as equals() includes UGI instance
441     // as part of the comparison and won't work when doing SecureBulkLoad
442     // TODO deal with viewFS
443     if (!FSHDFSUtils.isSameHdfs(conf, srcFs, desFs)) {
444       LOG.info("Bulk-load file " + srcPath + " is on different filesystem than " +
445           "the destination store. Copying file over to destination filesystem.");
446       Path tmpPath = createTempName();
447       FileUtil.copy(srcFs, srcPath, fs, tmpPath, false, conf);
448       LOG.info("Copied " + srcPath + " to temporary path on destination filesystem: " + tmpPath);
449       srcPath = tmpPath;
450     }
451 
452     return commitStoreFile(familyName, srcPath, seqNum, true);
453   }
454 
455   // ===========================================================================
456   //  Splits Helpers
457   // ===========================================================================
458   /** @return {@link Path} to the temp directory used during split operations */
459   Path getSplitsDir() {
460     return new Path(getRegionDir(), REGION_SPLITS_DIR);
461   }
462 
463   Path getSplitsDir(final HRegionInfo hri) {
464     return new Path(getSplitsDir(), hri.getEncodedName());
465   }
466 
467   /**
468    * Clean up any split detritus that may have been left around from previous split attempts.
469    */
470   void cleanupSplitsDir() throws IOException {
471     deleteDir(getSplitsDir());
472   }
473 
474   /**
475    * Clean up any split detritus that may have been left around from previous
476    * split attempts.
477    * Call this method on initial region deploy.
478    * @throws IOException
479    */
480   void cleanupAnySplitDetritus() throws IOException {
481     Path splitdir = this.getSplitsDir();
482     if (!fs.exists(splitdir)) return;
483     // Look at the splitdir.  It could have the encoded names of the daughter
484     // regions we tried to make.  See if the daughter regions actually got made
485     // out under the tabledir.  If here under splitdir still, then the split did
486     // not complete.  Try and do cleanup.  This code WILL NOT catch the case
487     // where we successfully created daughter a but regionserver crashed during
488     // the creation of region b.  In this case, there'll be an orphan daughter
489     // dir in the filesystem.  TODO: Fix.
490     FileStatus[] daughters = FSUtils.listStatus(fs, splitdir, new FSUtils.DirFilter(fs));
491     if (daughters != null) {
492       for (FileStatus daughter: daughters) {
493         Path daughterDir = new Path(getTableDir(), daughter.getPath().getName());
494         if (fs.exists(daughterDir) && !deleteDir(daughterDir)) {
495           throw new IOException("Failed delete of " + daughterDir);
496         }
497       }
498     }
499     cleanupSplitsDir();
500     LOG.info("Cleaned up old failed split transaction detritus: " + splitdir);
501   }
502 
503   /**
504    * Remove daughter region
505    * @param regionInfo daughter {@link HRegionInfo}
506    * @throws IOException
507    */
508   void cleanupDaughterRegion(final HRegionInfo regionInfo) throws IOException {
509     Path regionDir = new Path(this.tableDir, regionInfo.getEncodedName());
510     if (this.fs.exists(regionDir) && !deleteDir(regionDir)) {
511       throw new IOException("Failed delete of " + regionDir);
512     }
513   }
514 
515   /**
516    * Commit a daughter region, moving it from the split temporary directory
517    * to the proper location in the filesystem.
518    * @param regionInfo daughter {@link HRegionInfo}
519    * @throws IOException
520    */
521   Path commitDaughterRegion(final HRegionInfo regionInfo) throws IOException {
522     Path regionDir = new Path(this.tableDir, regionInfo.getEncodedName());
523     Path daughterTmpDir = this.getSplitsDir(regionInfo);
524     if (fs.exists(daughterTmpDir)) {
525       // Write HRI to a file in case we need to recover hbase:meta
526       Path regionInfoFile = new Path(daughterTmpDir, REGION_INFO_FILE);
527       byte[] regionInfoContent = getRegionInfoFileContent(regionInfo);
528       writeRegionInfoFileContent(conf, fs, regionInfoFile, regionInfoContent);
529 
530       // Move the daughter temp dir to the table dir
531       if (!rename(daughterTmpDir, regionDir)) {
532         throw new IOException("Unable to rename " + daughterTmpDir + " to " + regionDir);
533       }
534     }
535     return regionDir;
536   }
537 
538   /**
539    * Create the region splits directory.
540    */
541   void createSplitsDir() throws IOException {
542     Path splitdir = getSplitsDir();
543     if (fs.exists(splitdir)) {
544       LOG.info("The " + splitdir + " directory exists.  Hence deleting it to recreate it");
545       if (!deleteDir(splitdir)) {
546         throw new IOException("Failed deletion of " + splitdir
547             + " before creating it again.");
548       }
549     }
550     // splitDir doesn't exist now. No need to do an exists() call for it.
551     if (!createDir(splitdir)) {
552       throw new IOException("Failed create of " + splitdir);
553     }
554   }
555 
556   /**
557    * Write out a split reference. Package-local so it doesn't leak out of the
558    * regionserver.
559    * @param hri {@link HRegionInfo} of the destination
560    * @param familyName Column Family Name
561    * @param f File to split.
562    * @param splitRow Split Row
563    * @param top True if we are referring to the top half of the hfile.
564    * @return Path to created reference.
565    * @throws IOException
566    */
567   Path splitStoreFile(final HRegionInfo hri, final String familyName,
568       final StoreFile f, final byte[] splitRow, final boolean top) throws IOException {
569 
570     // Check whether the split row lies in the range of the store file
571     // If it is outside the range, return directly.
572     if (top) {
573       //check if larger than last key.
574       KeyValue splitKey = KeyValue.createFirstOnRow(splitRow);
575       byte[] lastKey = f.createReader().getLastKey();
576       // If lastKey is null, the store file is empty.
577       if (lastKey == null) return null;
578       if (f.getReader().getComparator().compareFlatKey(splitKey.getBuffer(),
579           splitKey.getKeyOffset(), splitKey.getKeyLength(), lastKey, 0, lastKey.length) > 0) {
580         return null;
581       }
582     } else {
583       //check if smaller than first key
584       KeyValue splitKey = KeyValue.createLastOnRow(splitRow);
585       byte[] firstKey = f.createReader().getFirstKey();
586       // If firstKey is null, the store file is empty.
587       if (firstKey == null) return null;
588       if (f.getReader().getComparator().compareFlatKey(splitKey.getBuffer(),
589           splitKey.getKeyOffset(), splitKey.getKeyLength(), firstKey, 0, firstKey.length) < 0) {
590         return null;
591       }
592     }
593 
594     f.getReader().close(true);
595 
596     Path splitDir = new Path(getSplitsDir(hri), familyName);
597     // A reference to the top or bottom half of the store file, depending on the requested half.
598     Reference r =
599       top ? Reference.createTopReference(splitRow): Reference.createBottomReference(splitRow);
600     // Add the referred-to region's name as a dot-separated suffix.
601     // See the REF_NAME_REGEX regex.  The referred-to region's name is
602     // up in the path of the passed-in <code>f</code> -- its parent dir is the family,
603     // and the directory above that is the region name.
604     String parentRegionName = regionInfo.getEncodedName();
605     // Write a reference that keeps the same file id, adding only the parent region's name
606     // as a suffix, into the new region's location (under the same family).
607     Path p = new Path(splitDir, f.getPath().getName() + "." + parentRegionName);
608     return r.write(fs, p);
609   }
610 
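  // Layout sketch for the reference written above (encoded names P, D and family "f1" are
  // illustrative): for parent region P splitting store file F for daughter D, the reference
  // lands at
  //
  //   <tableDir>/<P>/.splits/<D>/f1/F.<P>
  //
  // i.e. the parent's encoded name becomes the dot-separated suffix of the reference file name.
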
611   // ===========================================================================
612   //  Merge Helpers
613   // ===========================================================================
614   /** @return {@link Path} to the temp directory used during merge operations */
615   Path getMergesDir() {
616     return new Path(getRegionDir(), REGION_MERGES_DIR);
617   }
618 
619   Path getMergesDir(final HRegionInfo hri) {
620     return new Path(getMergesDir(), hri.getEncodedName());
621   }
622 
623   /**
624    * Clean up any merge detritus that may have been left around from previous merge attempts.
625    */
626   void cleanupMergesDir() throws IOException {
627     deleteDir(getMergesDir());
628   }
629 
630   /**
631    * Remove merged region
632    * @param mergedRegion {@link HRegionInfo}
633    * @throws IOException
634    */
635   void cleanupMergedRegion(final HRegionInfo mergedRegion) throws IOException {
636     Path regionDir = new Path(this.tableDir, mergedRegion.getEncodedName());
637     if (this.fs.exists(regionDir) && !this.fs.delete(regionDir, true)) {
638       throw new IOException("Failed delete of " + regionDir);
639     }
640   }
641 
642   /**
643    * Create the region merges directory.
644    * @throws IOException if we fail to delete an existing merges dir, or fail to create it.
645    * @see HRegionFileSystem#cleanupMergesDir()
646    */
647   void createMergesDir() throws IOException {
648     Path mergesdir = getMergesDir();
649     if (fs.exists(mergesdir)) {
650       LOG.info("The " + mergesdir
651           + " directory exists.  Hence deleting it to recreate it");
652       if (!fs.delete(mergesdir, true)) {
653         throw new IOException("Failed deletion of " + mergesdir
654             + " before creating it again.");
655       }
656     }
657     if (!fs.mkdirs(mergesdir))
658       throw new IOException("Failed create of " + mergesdir);
659   }
660 
661   /**
662    * Write out a merge reference under the given merges directory. Package-local
663    * so it doesn't leak out of the regionserver.
664    * @param mergedRegion {@link HRegionInfo} of the merged region
665    * @param familyName Column Family Name
666    * @param f store file to create the reference for
667    * @param mergedDir the temporary merges directory the reference is written under
668    * @return Path to created reference.
669    * @throws IOException
670    */
671   Path mergeStoreFile(final HRegionInfo mergedRegion, final String familyName,
672       final StoreFile f, final Path mergedDir)
673       throws IOException {
674     Path referenceDir = new Path(new Path(mergedDir,
675         mergedRegion.getEncodedName()), familyName);
676     // A whole reference to the store file.
677     Reference r = Reference.createTopReference(regionInfo.getStartKey());
678     // Add the referred-to region's name as a dot-separated suffix.
679     // See the REF_NAME_REGEX regex. The referred-to region's name is
680     // up in the path of the passed-in <code>f</code> -- its parent dir is the family,
681     // and the directory above that is the region name.
682     String mergingRegionName = regionInfo.getEncodedName();
683     // Write a reference that keeps the same file id, adding only the merging region's name
684     // as a suffix, into the new region's location (under the same family).
685     Path p = new Path(referenceDir, f.getPath().getName() + "."
686         + mergingRegionName);
687     return r.write(fs, p);
688   }
689 
690   /**
691    * Commit a merged region, moving it from the merges temporary directory to
692    * the proper location in the filesystem.
693    * @param mergedRegionInfo merged region {@link HRegionInfo}
694    * @throws IOException
695    */
696   void commitMergedRegion(final HRegionInfo mergedRegionInfo) throws IOException {
697     Path regionDir = new Path(this.tableDir, mergedRegionInfo.getEncodedName());
698     Path mergedRegionTmpDir = this.getMergesDir(mergedRegionInfo);
699     // Move the tmp dir to the expected location
700     if (mergedRegionTmpDir != null && fs.exists(mergedRegionTmpDir)) {
701       if (!fs.rename(mergedRegionTmpDir, regionDir)) {
702         throw new IOException("Unable to rename " + mergedRegionTmpDir + " to "
703             + regionDir);
704       }
705     }
706   }
707 
708   // ===========================================================================
709   //  Create/Open/Delete Helpers
710   // ===========================================================================
711   /**
712    * Log the current state of the region
713    * @param LOG log to output information
714    * @throws IOException if an unexpected exception occurs
715    */
716   void logFileSystemState(final Log LOG) throws IOException {
717     FSUtils.logFileSystemState(fs, this.getRegionDir(), LOG);
718   }
719 
720   /**
721    * @param hri the {@link HRegionInfo} to serialize
722    * @return Content of the file we write out to the filesystem under a region
723    * @throws IOException
724    */
725   private static byte[] getRegionInfoFileContent(final HRegionInfo hri) throws IOException {
726     return hri.toDelimitedByteArray();
727   }
728 
729   /**
730    * Create a {@link HRegionInfo} from the serialized version on-disk.
731    * @param fs {@link FileSystem} that contains the Region Info file
732    * @param regionDir {@link Path} to the Region Directory that contains the Info file
733    * @return An {@link HRegionInfo} instance gotten from the Region Info file.
734    * @throws IOException if an error occurred during file open/read operation.
735    */
736   public static HRegionInfo loadRegionInfoFileContent(final FileSystem fs, final Path regionDir)
737       throws IOException {
738     FSDataInputStream in = fs.open(new Path(regionDir, REGION_INFO_FILE));
739     try {
740       return HRegionInfo.parseFrom(in);
741     } finally {
742       in.close();
743     }
744   }
745 
746   /**
747    * Write the .regioninfo file on-disk.
748    */
749   private static void writeRegionInfoFileContent(final Configuration conf, final FileSystem fs,
750       final Path regionInfoFile, final byte[] content) throws IOException {
751     // First check to get the permissions
752     FsPermission perms = FSUtils.getFilePermissions(fs, conf, HConstants.DATA_FILE_UMASK_KEY);
753     // Write the RegionInfo file content
754     FSDataOutputStream out = FSUtils.create(fs, regionInfoFile, perms, null);
755     try {
756       out.write(content);
757     } finally {
758       out.close();
759     }
760   }
761 
762   /**
763    * Write out an info file under the stored region directory. Useful for recovering mangled regions.
764    * If the regioninfo file already exists on disk, we fast exit.
765    */
766   void checkRegionInfoOnFilesystem() throws IOException {
767     // Compose the content of the file so we can compare to length in filesystem. If not same,
768     // rewrite it (it may have been written in the old format using Writables instead of pb). The
769     // pb version is much shorter -- we write now w/o the toString version -- so checking length
770     // only should be sufficient. I don't want to read the file every time to check if it is pb
771     // serialized.
772     byte[] content = getRegionInfoFileContent(regionInfo);
773     try {
774       Path regionInfoFile = new Path(getRegionDir(), REGION_INFO_FILE);
775 
776       FileStatus status = fs.getFileStatus(regionInfoFile);
777       if (status != null && status.getLen() == content.length) {
778         // Then assume the content is good and move on.
779         // NOTE: the length alone is not sufficient to verify that the content matches.
780         return;
781       }
782 
783       LOG.info("Rewriting .regioninfo file at: " + regionInfoFile);
784       if (!fs.delete(regionInfoFile, false)) {
785         throw new IOException("Unable to remove existing " + regionInfoFile);
786       }
787     } catch (FileNotFoundException e) {
788       LOG.warn(REGION_INFO_FILE + " file not found for region: " + regionInfo.getEncodedName());
789     }
790 
791     // Write HRI to a file in case we need to recover hbase:meta
792     writeRegionInfoOnFilesystem(content, true);
793   }
794 
795   /**
796    * Write out an info file under the region directory. Useful for recovering mangled regions.
797    * @param useTempDir whether to use the region's .tmp dir for safer file creation.
798    */
799   private void writeRegionInfoOnFilesystem(boolean useTempDir) throws IOException {
800     byte[] content = getRegionInfoFileContent(regionInfo);
801     writeRegionInfoOnFilesystem(content, useTempDir);
802   }
803 
804   /**
805    * Write out an info file under the region directory. Useful for recovering mangled regions.
806    * @param regionInfoContent serialized version of the {@link HRegionInfo}
807    * @param useTempDir whether to use the region's .tmp dir for safer file creation.
808    */
809   private void writeRegionInfoOnFilesystem(final byte[] regionInfoContent,
810       final boolean useTempDir) throws IOException {
811     Path regionInfoFile = new Path(getRegionDir(), REGION_INFO_FILE);
812     if (useTempDir) {
813       // Create in tmpDir and then move into place in case we crash after
814       // create but before close. If we don't successfully close the file,
815       // subsequent region reopens will fail below because the create is already
816       // registered in the NN.
817 
818       // And then create the file
819       Path tmpPath = new Path(getTempDir(), REGION_INFO_FILE);
820 
821       // If a datanode crashes, or the RS goes down just before close is called on the
822       // regioninfo file being written in the .tmp directory, then the next creation
823       // attempt would get an AlreadyBeingCreatedException.
824       // Hence delete the file first if it exists, then create it.
825       if (FSUtils.isExists(fs, tmpPath)) {
826         FSUtils.delete(fs, tmpPath, true);
827       }
828 
829       // Write HRI to a file in case we need to recover hbase:meta
830       writeRegionInfoFileContent(conf, fs, tmpPath, regionInfoContent);
831 
832       // Move the created file to the original path
833       if (fs.exists(tmpPath) &&  !rename(tmpPath, regionInfoFile)) {
834         throw new IOException("Unable to rename " + tmpPath + " to " + regionInfoFile);
835       }
836     } else {
837       // Write HRI to a file in case we need to recover hbase:meta
838       writeRegionInfoFileContent(conf, fs, regionInfoFile, regionInfoContent);
839     }
840   }
841 
842   /**
843    * Create a new Region on file-system.
844    * @param conf the {@link Configuration} to use
845    * @param fs {@link FileSystem} on which to create the region
846    * @param tableDir {@link Path} to where the table is being stored
847    * @param regionInfo {@link HRegionInfo} for region to be added
848    * @throws IOException if the region creation fails due to a FileSystem exception.
849    */
850   public static HRegionFileSystem createRegionOnFileSystem(final Configuration conf,
851       final FileSystem fs, final Path tableDir, final HRegionInfo regionInfo) throws IOException {
852     HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tableDir, regionInfo);
853     Path regionDir = regionFs.getRegionDir();
854 
855     if (fs.exists(regionDir)) {
856       LOG.warn("Trying to create a region that already exists on disk: " + regionDir);
857       throw new IOException("The specified region already exists on disk: " + regionDir);
858     }
859 
860     // Create the region directory
861     if (!createDirOnFileSystem(fs, conf, regionDir)) {
862       LOG.warn("Unable to create the region directory: " + regionDir);
863       throw new IOException("Unable to create region directory: " + regionDir);
864     }
865 
866     // Write HRI to a file in case we need to recover hbase:meta
867     regionFs.writeRegionInfoOnFilesystem(false);
868     return regionFs;
869   }
870 
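  // Sketch of the on-disk lifecycle using the static helpers in this class (conf, fs,
  // tableDir and regionInfo are assumed to exist in the caller's scope):
  //
  //   HRegionFileSystem created =
  //       HRegionFileSystem.createRegionOnFileSystem(conf, fs, tableDir, regionInfo);
  //   HRegionFileSystem reopened =
  //       HRegionFileSystem.openRegionFromFileSystem(conf, fs, tableDir, regionInfo, false);
  //   HRegionFileSystem.deleteRegionFromFileSystem(conf, fs, tableDir, regionInfo);
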
871   /**
872    * Open Region from file-system.
873    * @param conf the {@link Configuration} to use
874    * @param fs {@link FileSystem} from which to open the region
875    * @param tableDir {@link Path} to where the table is being stored
876    * @param regionInfo {@link HRegionInfo} for region to be added
877    * @param readOnly True if you don't want to edit the region data
878    * @throws IOException if the region creation fails due to a FileSystem exception.
879    */
880   public static HRegionFileSystem openRegionFromFileSystem(final Configuration conf,
881       final FileSystem fs, final Path tableDir, final HRegionInfo regionInfo, boolean readOnly)
882       throws IOException {
883     HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tableDir, regionInfo);
884     Path regionDir = regionFs.getRegionDir();
885 
886     if (!fs.exists(regionDir)) {
887       LOG.warn("Trying to open a region that does not exist on disk: " + regionDir);
888       throw new IOException("The specified region does not exist on disk: " + regionDir);
889     }
890 
891     if (!readOnly) {
892       // Cleanup temporary directories
893       regionFs.cleanupTempDir();
894       regionFs.cleanupSplitsDir();
895       regionFs.cleanupMergesDir();
896 
897       // if it doesn't exist, write the HRI to a file, in case we need to recover hbase:meta
898       regionFs.checkRegionInfoOnFilesystem();
899     }
900 
901     return regionFs;
902   }
903 
904   /**
905    * Remove the region from the table directory, archiving the region's hfiles.
906    * @param conf the {@link Configuration} to use
907    * @param fs {@link FileSystem} from which to remove the region
908    * @param tableDir {@link Path} to where the table is being stored
909    * @param regionInfo {@link HRegionInfo} for region to be deleted
910    * @throws IOException if the request cannot be completed
911    */
912   public static void deleteRegionFromFileSystem(final Configuration conf,
913       final FileSystem fs, final Path tableDir, final HRegionInfo regionInfo) throws IOException {
914     HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tableDir, regionInfo);
915     Path regionDir = regionFs.getRegionDir();
916 
917     if (!fs.exists(regionDir)) {
918       LOG.warn("Trying to delete a region that does not exist on disk: " + regionDir);
919       return;
920     }
921 
922     if (LOG.isDebugEnabled()) {
923       LOG.debug("DELETING region " + regionDir);
924     }
925 
926     // Archive region
927     Path rootDir = FSUtils.getRootDir(conf);
928     HFileArchiver.archiveRegion(fs, rootDir, tableDir, regionDir);
929 
930     // Delete empty region dir
931     if (!fs.delete(regionDir, true)) {
932       LOG.warn("Failed delete of " + regionDir);
933     }
934   }
935 
936   /**
937    * Creates a directory. Assumes the caller has already checked for this directory's existence.
938    * @param dir
939    * @return the result of fs.mkdirs(). In case underlying fs throws an IOException, it checks
940    *         whether the directory exists or not, and returns true if it exists.
941    * @throws IOException
942    */
943   boolean createDir(Path dir) throws IOException {
944     int i = 0;
945     IOException lastIOE = null;
946     do {
947       try {
948         return fs.mkdirs(dir);
949       } catch (IOException ioe) {
950         lastIOE = ioe;
951         if (fs.exists(dir)) return true; // directory is present
952         sleepBeforeRetry("Create Directory", i+1);
953       }
954     } while (++i <= hdfsClientRetriesNumber);
955     throw new IOException("Exception in createDir", lastIOE);
956   }
957 
958   /**
959    * Renames a directory. Assumes the caller has already checked for this directory's existence.
960    * @param srcpath
961    * @param dstPath
962    * @return true if rename is successful.
963    * @throws IOException
964    */
965   boolean rename(Path srcpath, Path dstPath) throws IOException {
966     IOException lastIOE = null;
967     int i = 0;
968     do {
969       try {
970         return fs.rename(srcpath, dstPath);
971       } catch (IOException ioe) {
972         lastIOE = ioe;
973         if (!fs.exists(srcpath) && fs.exists(dstPath)) return true; // successful move
974         // dir is not there, retry after some time.
975         sleepBeforeRetry("Rename Directory", i+1);
976       }
977     } while (++i <= hdfsClientRetriesNumber);
978     throw new IOException("Exception in rename", lastIOE);
979   }
980 
981   /**
982    * Deletes a directory. Assumes the caller has already checked for this directory's existence.
983    * @param dir
984    * @return true if the directory is deleted.
985    * @throws IOException
986    */
987   boolean deleteDir(Path dir) throws IOException {
988     IOException lastIOE = null;
989     int i = 0;
990     do {
991       try {
992         return fs.delete(dir, true);
993       } catch (IOException ioe) {
994         lastIOE = ioe;
995         if (!fs.exists(dir)) return true;
996         // dir is there, retry deleting after some time.
997         sleepBeforeRetry("Delete Directory", i+1);
998       }
999     } while (++i <= hdfsClientRetriesNumber);
1000     throw new IOException("Exception in DeleteDir", lastIOE);
1001   }
1002 
1003   /**
1004    * sleeping logic; handles the interrupt exception.
1005    */
1006   private void sleepBeforeRetry(String msg, int sleepMultiplier) {
1007     sleepBeforeRetry(msg, sleepMultiplier, baseSleepBeforeRetries, hdfsClientRetriesNumber);
1008   }
1009 
1010   /**
1011    * Creates a directory using the given filesystem and configuration. Assumes the caller has
1012    * already checked for this directory's existence.
1013    * @param fs
1014    * @param conf
1015    * @param dir
1016    * @return the result of fs.mkdirs(). In case underlying fs throws an IOException, it checks
1017    *         whether the directory exists or not, and returns true if it exists.
1018    * @throws IOException
1019    */
1020   private static boolean createDirOnFileSystem(FileSystem fs, Configuration conf, Path dir)
1021       throws IOException {
1022     int i = 0;
1023     IOException lastIOE = null;
1024     int hdfsClientRetriesNumber = conf.getInt("hdfs.client.retries.number",
1025       DEFAULT_HDFS_CLIENT_RETRIES_NUMBER);
1026     int baseSleepBeforeRetries = conf.getInt("hdfs.client.sleep.before.retries",
1027       DEFAULT_BASE_SLEEP_BEFORE_RETRIES);
1028     do {
1029       try {
1030         return fs.mkdirs(dir);
1031       } catch (IOException ioe) {
1032         lastIOE = ioe;
1033         if (fs.exists(dir)) return true; // directory is present
1034         sleepBeforeRetry("Create Directory", i+1, baseSleepBeforeRetries, hdfsClientRetriesNumber);
1035       }
1036     } while (++i <= hdfsClientRetriesNumber);
1037     throw new IOException("Exception in createDir", lastIOE);
1038   }
1039 
1040   /**
1041    * Sleeping logic for static methods; handles the interrupt exception. A static version is kept
1042    * to avoid re-reading the configured retry values.
1043    */
1044   private static void sleepBeforeRetry(String msg, int sleepMultiplier, int baseSleepBeforeRetries,
1045       int hdfsClientRetriesNumber) {
1046     if (sleepMultiplier > hdfsClientRetriesNumber) {
1047       LOG.debug(msg + ", retries exhausted");
1048       return;
1049     }
1050     LOG.debug(msg + ", sleeping " + baseSleepBeforeRetries + " times " + sleepMultiplier);
1051     Threads.sleep((long)baseSleepBeforeRetries * sleepMultiplier);
1052   }
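
  // Rough worst-case arithmetic for the retry helpers above, assuming the default settings
  // (hdfs.client.retries.number = 10, base sleep = 1000 ms): attempt i sleeps i * 1000 ms,
  // so a persistently failing operation waits about 1000 * (1 + 2 + ... + 10) = 55,000 ms
  // in total before the last IOException is rethrown.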
1053 }