1   /**
2    * Copyright 2010 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  package org.apache.hadoop.hbase.regionserver.wal;
21  
22  import static org.apache.hadoop.hbase.util.FSUtils.recoverFileLease;
23  
24  import java.io.DataInput;
25  import java.io.DataOutput;
26  import java.io.EOFException;
27  import java.io.FileNotFoundException;
28  import java.io.IOException;
29  import java.io.OutputStream;
30  import java.io.UnsupportedEncodingException;
31  import java.lang.reflect.Method;
32  import java.net.URLEncoder;
33  import java.util.ArrayList;
34  import java.util.Collections;
35  import java.util.HashMap;
36  import java.util.LinkedList;
37  import java.util.List;
38  import java.util.Map;
39  import java.util.NavigableSet;
40  import java.util.SortedMap;
41  import java.util.TreeMap;
42  import java.util.TreeSet;
43  import java.util.concurrent.Callable;
44  import java.util.concurrent.ConcurrentSkipListMap;
45  import java.util.concurrent.CopyOnWriteArrayList;
46  import java.util.concurrent.ExecutionException;
47  import java.util.concurrent.Executors;
48  import java.util.concurrent.Future;
49  import java.util.concurrent.ThreadPoolExecutor;
50  import java.util.concurrent.TimeUnit;
51  import java.util.concurrent.atomic.AtomicInteger;
52  import java.util.concurrent.atomic.AtomicLong;
53  import java.util.concurrent.locks.Condition;
54  import java.util.concurrent.locks.Lock;
55  import java.util.concurrent.locks.ReentrantLock;
56  import java.util.regex.Matcher;
57  import java.util.regex.Pattern;
58  
59  import org.apache.commons.logging.Log;
60  import org.apache.commons.logging.LogFactory;
61  import org.apache.hadoop.conf.Configuration;
62  import org.apache.hadoop.fs.FileStatus;
63  import org.apache.hadoop.fs.FileSystem;
64  import org.apache.hadoop.fs.Path;
65  import org.apache.hadoop.fs.PathFilter;
66  import org.apache.hadoop.fs.Syncable;
67  import org.apache.hadoop.hbase.HBaseConfiguration;
68  import org.apache.hadoop.hbase.HConstants;
69  import org.apache.hadoop.hbase.HRegionInfo;
70  import org.apache.hadoop.hbase.HServerInfo;
71  import org.apache.hadoop.hbase.HTableDescriptor;
72  import org.apache.hadoop.hbase.KeyValue;
73  import org.apache.hadoop.hbase.RemoteExceptionHandler;
74  import org.apache.hadoop.hbase.regionserver.HRegion;
75  import org.apache.hadoop.hbase.util.Bytes;
76  import org.apache.hadoop.hbase.util.ClassSize;
77  import org.apache.hadoop.hbase.util.FSUtils;
78  import org.apache.hadoop.hbase.util.Threads;
79  import org.apache.hadoop.io.Writable;
80  
81  import com.google.common.util.concurrent.NamingThreadFactory;
82  
83  /**
84   * HLog stores all the edits to the HStore.  It's the HBase write-ahead-log
85   * implementation.
86   *
87   * It performs logfile-rolling, so external callers are not aware that the
88   * underlying file is being rolled.
89   *
90   * <p>
91   * There is one HLog per RegionServer.  All edits for all Regions carried by
92   * a particular RegionServer are entered first in the HLog.
93   *
94   * <p>
95   * Each HRegion is identified by a unique <code>long</code> id. HRegions do
96   * not need to declare themselves before using the HLog; they simply include
97   * their HRegion-id in the <code>append</code> or
98   * <code>completeCacheFlush</code> calls.
99   *
100  * <p>
101  * An HLog consists of multiple on-disk files, which have a chronological order.
102  * As data is flushed to other (better) on-disk structures, the log becomes
103  * obsolete. We can destroy all the log messages for a given HRegion-id up to
104  * the most-recent CACHEFLUSH message from that HRegion.
105  *
106  * <p>
107  * It's only practical to delete entire files. Thus, we delete an entire on-disk
108  * file F when all of the messages in F have a log-sequence-id that's older
109  * (smaller) than the most-recent CACHEFLUSH message for every HRegion that has
110  * a message in F.
111  *
112  * <p>
113  * Synchronized methods can never execute in parallel. However, between the
114  * start of a cache flush and the completion point, appends are allowed but log
115  * rolling is not. To prevent log rolling taking place during this period, a
116  * separate reentrant lock is used.
117  *
118  * <p>To read an HLog, call {@link #getReader(org.apache.hadoop.fs.FileSystem,
119  * org.apache.hadoop.fs.Path, org.apache.hadoop.conf.Configuration)}.
120  *
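 * <p>A minimal read sketch (the log path here is illustrative):
 * <pre>
 * Configuration conf = HBaseConfiguration.create();
 * FileSystem fs = FileSystem.get(conf);
 * HLog.Reader reader = HLog.getReader(fs,
 *     new Path("/hbase/.logs/server,60020,1234/hlog.1234567890"), conf);
 * try {
 *   HLog.Entry entry;
 *   while ((entry = reader.next()) != null) {
 *     // entry.getKey() carries region/table/sequenceid; entry.getEdit() the edits
 *   }
 * } finally {
 *   reader.close();
 * }
 * </pre>
 *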
121  */
122 public class HLog implements Syncable {
123   static final Log LOG = LogFactory.getLog(HLog.class);
124   public static final byte [] METAFAMILY = Bytes.toBytes("METAFAMILY");
125   static final byte [] METAROW = Bytes.toBytes("METAROW");
126 
127   /*
128    * Name of directory that holds recovered edits written by the wal log
129    * splitting code, one per region
130    */
131   private static final String RECOVERED_EDITS_DIR = "recovered.edits";
132   private static final Pattern EDITFILES_NAME_PATTERN =
133     Pattern.compile("-?[0-9]+");
134   
135   private final FileSystem fs;
136   private final Path dir;
137   private final Configuration conf;
138   private final LogRollListener listener;
139   private final long optionalFlushInterval;
140   private final long blocksize;
141   private final int flushlogentries;
142   private final String prefix;
143   private final AtomicInteger unflushedEntries = new AtomicInteger(0);
144   private final Path oldLogDir;
145   private final List<LogActionsListener> actionListeners =
146       Collections.synchronizedList(new ArrayList<LogActionsListener>());
147 
148 
149   private static Class<? extends Writer> logWriterClass;
150   private static Class<? extends Reader> logReaderClass;
151 
152   private OutputStream hdfs_out;     // OutputStream associated with the current SequenceFile.writer
153   private int initialReplication;    // initial replication factor of SequenceFile.writer
154   private Method getNumCurrentReplicas; // refers to DFSOutputStream.getNumCurrentReplicas
155   final static Object [] NO_ARGS = new Object []{};
156 
157   // used to indirectly tell syncFs to force the sync
158   private boolean forceSync = false;
159 
160   public interface Reader {
161     void init(FileSystem fs, Path path, Configuration c) throws IOException;
162     void close() throws IOException;
163     Entry next() throws IOException;
164     Entry next(Entry reuse) throws IOException;
165     void seek(long pos) throws IOException;
166     long getPosition() throws IOException;
167   }
168 
169   public interface Writer {
170     void init(FileSystem fs, Path path, Configuration c) throws IOException;
171     void close() throws IOException;
172     void sync() throws IOException;
173     void append(Entry entry) throws IOException;
174     long getLength() throws IOException;
175   }
176 
177   /*
178    * Current log file.
179    */
180   Writer writer;
181 
182   /*
183    * Map of all log files but the current one.
184    */
185   final SortedMap<Long, Path> outputfiles =
186     Collections.synchronizedSortedMap(new TreeMap<Long, Path>());
187 
188   /*
189    * Map of regions to first sequence/edit id in their memstore.
190    */
191   private final ConcurrentSkipListMap<byte [], Long> lastSeqWritten =
192     new ConcurrentSkipListMap<byte [], Long>(Bytes.BYTES_COMPARATOR);
193 
194   private volatile boolean closed = false;
195 
196   private final AtomicLong logSeqNum = new AtomicLong(0);
197 
198   // The timestamp (in ms) when the log file was created.
199   private volatile long filenum = -1;
200 
201   // Number of transactions in the current HLog.
202   private final AtomicInteger numEntries = new AtomicInteger(0);
203 
204   // If bigger than this size, roll the log. This is typically 0.95 times the
205   // size of the default HDFS block size.
206   private final long logrollsize;
207 
208   // This lock prevents starting a log roll during a cache flush.
209   // synchronized is insufficient because a cache flush spans two method calls.
210   private final Lock cacheFlushLock = new ReentrantLock();
211 
212   // We synchronize on updateLock to prevent updates and to prevent a log roll
213   // during an update
214   private final Object updateLock = new Object();
215 
216   private final boolean enabled;
217 
218   /*
219    * If more than this many logs, force flush of the oldest region so that its
220    * oldest edit goes to disk.  If there are too many when we crash, replay will
221    * take forever.  Keep the number of logs tidy.
222    */
223   private final int maxLogs;
224 
225   /**
226    * Thread that handles group commit
227    */
228   private final LogSyncer logSyncerThread;
229 
230   private final List<LogEntryVisitor> logEntryVisitors =
231       new CopyOnWriteArrayList<LogEntryVisitor>();
232 
233   /**
234    * Pattern used to validate an HLog file name
235    */
236   private static final Pattern pattern = Pattern.compile(".*\\.\\d*");
237 
238   static byte [] COMPLETE_CACHE_FLUSH;
239   static {
240     try {
241       COMPLETE_CACHE_FLUSH =
242         "HBASE::CACHEFLUSH".getBytes(HConstants.UTF8_ENCODING);
243     } catch (UnsupportedEncodingException e) {
244       assert(false); // UTF-8 is always supported, so this cannot happen
245     }
246   }
247 
248   // For measuring latency of writes
249   private static volatile long writeOps;
250   private static volatile long writeTime;
251   // For measuring latency of syncs
252   private static volatile long syncOps;
253   private static volatile long syncTime;
254 
255   public static long getWriteOps() {
256     long ret = writeOps;
257     writeOps = 0;
258     return ret;
259   }
260 
261   public static long getWriteTime() {
262     long ret = writeTime;
263     writeTime = 0;
264     return ret;
265   }
266 
267   public static long getSyncOps() {
268     long ret = syncOps;
269     syncOps = 0;
270     return ret;
271   }
272 
273   public static long getSyncTime() {
274     long ret = syncTime;
275     syncTime = 0;
276     return ret;
277   }
278 
279   /**
280    * Create an HLog with a null actions listener.
281    *
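   * <p>For example (directories illustrative; <code>listener</code> is the
   * region server's log-roll listener):
   * <pre>
   * HLog wal = new HLog(fs, new Path("/hbase/.logs/myserver,60020,1234"),
   *     new Path("/hbase/.oldlogs"), conf, listener);
   * </pre>
   *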
282    * @param fs filesystem handle
283    * @param dir path to where hlogs are stored
284    * @param oldLogDir path to where hlogs are archived
285    * @param conf configuration to use
286    * @param listener listener used to request log rolls
287    * @throws IOException
288    */
289   public HLog(final FileSystem fs, final Path dir, final Path oldLogDir,
290               final Configuration conf, final LogRollListener listener)
291   throws IOException {
292     this(fs, dir, oldLogDir, conf, listener, null, null);
293   }
294 
295   /**
296    * Create an edit log at the given <code>dir</code> location.
297    *
298    * You should never have to load an existing log. If there is a log at
299    * startup, it should have already been processed and deleted by the time the
300    * HLog object is started up.
301    *
302    * @param fs filesystem handle
303    * @param dir path to where hlogs are stored
304    * @param oldLogDir path to where hlogs are archived
305    * @param conf configuration to use
306    * @param listener listener used to request log rolls
307    * @param actionListener optional listener for hlog actions like archiving
308    * @param prefix should always be hostname and port in distributed env and
309    *        it will be URL encoded before being used.
310    *        If prefix is null, "hlog" will be used
311    * @throws IOException
312    */
313   public HLog(final FileSystem fs, final Path dir, final Path oldLogDir,
314               final Configuration conf, final LogRollListener listener,
315               final LogActionsListener actionListener, final String prefix)
316   throws IOException {
317     super();
318     this.fs = fs;
319     this.dir = dir;
320     this.conf = conf;
321     this.listener = listener;
322     this.flushlogentries =
323       conf.getInt("hbase.regionserver.flushlogentries", 1);
324     this.blocksize = conf.getLong("hbase.regionserver.hlog.blocksize",
325       this.fs.getDefaultBlockSize());
326     // Roll at 95% of block size.
327     float multi = conf.getFloat("hbase.regionserver.logroll.multiplier", 0.95f);
328     this.logrollsize = (long)(this.blocksize * multi);
329     this.optionalFlushInterval =
330       conf.getLong("hbase.regionserver.optionallogflushinterval", 1 * 1000);
331     if (fs.exists(dir)) {
332       throw new IOException("Target HLog directory already exists: " + dir);
333     }
334     fs.mkdirs(dir);
335     this.oldLogDir = oldLogDir;
336     if (!fs.exists(oldLogDir)) {
337       fs.mkdirs(this.oldLogDir);
338     }
339     this.maxLogs = conf.getInt("hbase.regionserver.maxlogs", 32);
340     this.enabled = conf.getBoolean("hbase.regionserver.hlog.enabled", true);
341     LOG.info("HLog configuration: blocksize=" + this.blocksize +
342       ", rollsize=" + this.logrollsize +
343       ", enabled=" + this.enabled +
344       ", flushlogentries=" + this.flushlogentries +
345       ", optionallogflushinterval=" + this.optionalFlushInterval + "ms");
346     if (actionListener != null) {
347       addLogActionsListerner(actionListener);
348     }
349     // If prefix is null||empty then just name it hlog
350     this.prefix = prefix == null || prefix.isEmpty() ?
351         "hlog" : URLEncoder.encode(prefix, "UTF8");
352     // rollWriter sets this.hdfs_out if it can.
353     rollWriter();
354 
355     // handle the reflection necessary to call getNumCurrentReplicas()
356     this.getNumCurrentReplicas = null;
357     if(this.hdfs_out != null) {
358       try {
359         this.getNumCurrentReplicas = this.hdfs_out.getClass().
360           getMethod("getNumCurrentReplicas", new Class<?> []{});
361         this.getNumCurrentReplicas.setAccessible(true);
362       } catch (NoSuchMethodException e) {
363         // Thrown if getNumCurrentReplicas() function isn't available
364       } catch (SecurityException e) {
365         // Thrown if we can't get access to getNumCurrentReplicas()
366         this.getNumCurrentReplicas = null; // could happen on setAccessible()
367       }
368     }
369     if(this.getNumCurrentReplicas != null) {
370       LOG.info("Using getNumCurrentReplicas--HDFS-826");
371     } else {
372       LOG.info("getNumCurrentReplicas--HDFS-826 not available");
373     }
374 
375     logSyncerThread = new LogSyncer(this.optionalFlushInterval);
376     Threads.setDaemonThreadRunning(logSyncerThread,
377         Thread.currentThread().getName() + ".logSyncer");
378   }
379 
380   /**
381    * @return Current state of the monotonically increasing file id.
382    */
383   public long getFilenum() {
384     return this.filenum;
385   }
386 
387   /**
388    * Called by HRegionServer when it opens a new region to ensure that log
389    * sequence numbers are always greater than the latest sequence number of the
390    * region being brought on-line.
391    *
392    * @param newvalue We'll set log edit/sequence number to this value if it
393    * is greater than the current value.
394    */
395   public void setSequenceNumber(final long newvalue) {
396     for (long id = this.logSeqNum.get(); id < newvalue &&
397         !this.logSeqNum.compareAndSet(id, newvalue); id = this.logSeqNum.get()) {
398       // This could spin on occasion but better the occasional spin than locking
399       // every increment of sequence number.
400       LOG.debug("Changed sequenceid from " + logSeqNum + " to " + newvalue);
401     }
402   }
403 
404   /**
405    * @return log sequence number
406    */
407   public long getSequenceNumber() {
408     return logSeqNum.get();
409   }
410 
411   // usage: see TestLogRolling.java
412   OutputStream getOutputStream() {
413     return this.hdfs_out;
414   }
415 
416   /**
417    * Roll the log writer. That is, start writing log messages to a new file.
418    *
419    * Because a log cannot be rolled during a cache flush, and a cache flush
420    * spans two method calls, a special lock needs to be obtained so that a cache
421    * flush cannot start when the log is being rolled and the log cannot be
422    * rolled during a cache flush.
423    *
424    * <p>Note that this method cannot be synchronized: if startCacheFlush ran
425    * first and obtained the cacheFlushLock, a synchronized rollWriter would take
426    * the lock on this but block on the cacheFlushLock, and completeCacheFlush
427    * would then wait forever for the lock on this, never releasing the
428    * cacheFlushLock.
429    *
430    * @return The regions to flush if there are too many outstanding logs, so
431    * that next time through we can clean them; null if nothing to flush.
432    * @throws org.apache.hadoop.hbase.regionserver.wal.FailedLogCloseException
433    * @throws IOException
434    */
435   public byte [][] rollWriter() throws FailedLogCloseException, IOException {
436     // Return if nothing to flush.
437     if (this.writer != null && this.numEntries.get() <= 0) {
438       return null;
439     }
440     byte [][] regionsToFlush = null;
441     this.cacheFlushLock.lock();
442     try {
443       if (closed) {
444         return regionsToFlush;
445       }
446       synchronized (updateLock) {
447         // Clean up current writer.
448         Path oldFile = cleanupCurrentWriter(this.filenum);
449         this.filenum = System.currentTimeMillis();
450         Path newPath = computeFilename();
451         this.writer = createWriter(fs, newPath, HBaseConfiguration.create(conf));
452         this.initialReplication = fs.getFileStatus(newPath).getReplication();
453 
454         // Can we get at the dfsclient outputstream?  If an instance of
455         // SFLW, it'll have done the necessary reflection to get at the
456         // protected field name.
457         this.hdfs_out = null;
458         if (this.writer instanceof SequenceFileLogWriter) {
459           this.hdfs_out =
460             ((SequenceFileLogWriter)this.writer).getDFSCOutputStream();
461         }
462 
463         LOG.info((oldFile != null?
464             "Roll " + FSUtils.getPath(oldFile) + ", entries=" +
465             this.numEntries.get() +
466             ", filesize=" +
467             this.fs.getFileStatus(oldFile).getLen() + ". ": "") +
468           "New hlog " + FSUtils.getPath(newPath));
469         // Tell our listeners that a new log was created
470         if (!this.actionListeners.isEmpty()) {
471           for (LogActionsListener list : this.actionListeners) {
472             list.logRolled(newPath);
473           }
474         }
475         // Can we delete any of the old log files?
476         if (this.outputfiles.size() > 0) {
477           if (this.lastSeqWritten.size() <= 0) {
478             LOG.debug("Last sequenceid written is empty. Deleting all old hlogs");
479             // If so, then no new writes have come in since all regions were
480             // flushed (and removed from the lastSeqWritten map). Means can
481             // remove all but currently open log file.
482             for (Map.Entry<Long, Path> e : this.outputfiles.entrySet()) {
483               archiveLogFile(e.getValue(), e.getKey());
484             }
485             this.outputfiles.clear();
486           } else {
487             regionsToFlush = cleanOldLogs();
488           }
489         }
490         this.numEntries.set(0);
491       }
492     } finally {
493       this.cacheFlushLock.unlock();
494     }
495     return regionsToFlush;
496   }
497 
498   /**
499    * Get a reader for the WAL.
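   * <p>The reader implementation is pluggable through
   * <code>hbase.regionserver.hlog.reader.impl</code> (it defaults to
   * SequenceFileLogReader).  A sketch of selecting it explicitly:
   * <pre>
   * conf.setClass("hbase.regionserver.hlog.reader.impl",
   *     SequenceFileLogReader.class, HLog.Reader.class);
   * HLog.Reader reader = HLog.getReader(fs, path, conf);
   * </pre>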
500    * @param fs
501    * @param path
502    * @param conf
503    * @return A WAL reader.  Close when done with it.
504    * @throws IOException
505    */
506   public static Reader getReader(final FileSystem fs,
507     final Path path, Configuration conf)
508   throws IOException {
509     try {
510       if (logReaderClass == null) {
511         logReaderClass = conf.getClass("hbase.regionserver.hlog.reader.impl",
512                 SequenceFileLogReader.class, Reader.class);
513       }
514 
515       HLog.Reader reader = logReaderClass.newInstance();
516       reader.init(fs, path, conf);
517       return reader;
518     } catch (IOException e) {
519       throw e;
520     }
521     catch (Exception e) {
522       throw new IOException("Cannot get log reader", e);
523     }
524   }
525 
526   /**
527    * Get a writer for the WAL.
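   * <p>A sketch of writing entries directly (normally done through
   * {@link #append}; <code>key</code> and <code>edit</code> stand for an
   * HLogKey and a WALEdit already in hand):
   * <pre>
   * HLog.Writer writer = HLog.createWriter(fs, path, conf);
   * try {
   *   writer.append(new HLog.Entry(key, edit));
   *   writer.sync();
   * } finally {
   *   writer.close();
   * }
   * </pre>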
528    * @param path
529    * @param conf
530    * @return A WAL writer.  Close when done with it.
531    * @throws IOException
532    */
533   public static Writer createWriter(final FileSystem fs,
534       final Path path, Configuration conf)
535   throws IOException {
536     try {
537       if (logWriterClass == null) {
538         logWriterClass = conf.getClass("hbase.regionserver.hlog.writer.impl",
539                 SequenceFileLogWriter.class, Writer.class);
540       }
541       HLog.Writer writer = (HLog.Writer) logWriterClass.newInstance();
542       writer.init(fs, path, conf);
543       return writer;
544     } catch (Exception e) {
545       IOException ie = new IOException("cannot get log writer");
546       ie.initCause(e);
547       throw ie;
548     }
549   }
550 
551   /*
552    * Clean up old commit logs.
553    * @return If lots of logs, flush the returned region so next time through
554    * we can clean logs. Returns null if nothing to flush.
555    * @throws IOException
556    */
557   private byte [][] cleanOldLogs() throws IOException {
558     Long oldestOutstandingSeqNum = getOldestOutstandingSeqNum();
559     // Get the set of all log files whose final ID is older than or
560     // equal to the oldest pending region operation
561     TreeSet<Long> sequenceNumbers =
562       new TreeSet<Long>(this.outputfiles.headMap(
563         (Long.valueOf(oldestOutstandingSeqNum.longValue() + 1L))).keySet());
564     // Now remove old log files (if any)
565     int logsToRemove = sequenceNumbers.size();
566     if (logsToRemove > 0) {
567       if (LOG.isDebugEnabled()) {
568         // Find associated region; helps debugging.
569         byte [] oldestRegion = getOldestRegion(oldestOutstandingSeqNum);
570         LOG.debug("Found " + logsToRemove + " hlogs to remove" +
571           " out of total " + this.outputfiles.size() + "; " +
572           "oldest outstanding sequenceid is " + oldestOutstandingSeqNum +
573           " from region " + Bytes.toString(oldestRegion));
574       }
575       for (Long seq : sequenceNumbers) {
576         archiveLogFile(this.outputfiles.remove(seq), seq);
577       }
578     }
579 
580     // If too many log files, figure which regions we need to flush.
581     byte [][] regions = null;
582       // Logs archived above were already removed from outputfiles.
583       int logCount = this.outputfiles.size();
584       if (logCount > this.maxLogs) {
585       regions = findMemstoresWithEditsOlderThan(this.outputfiles.firstKey(),
586         this.lastSeqWritten);
587       StringBuilder sb = new StringBuilder();
588       for (int i = 0; i < regions.length; i++) {
589         if (i > 0) sb.append(", ");
590         sb.append(Bytes.toStringBinary(regions[i]));
591       }
592       LOG.info("Too many hlogs: logs=" + logCount + ", maxlogs=" +
593         this.maxLogs + "; forcing flush of " + regions.length + " region(s): " +
594         sb.toString());
595     }
596     return regions;
597   }
598 
599   /**
600    * Return regions (memstores) that have edits that are less than the passed
601    * <code>oldestWALseqid</code>.
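   * <p>For example (region names illustrative):
   * <pre>
   * Map&lt;byte [], Long&gt; seqids =
   *   new TreeMap&lt;byte [], Long&gt;(Bytes.BYTES_COMPARATOR);
   * seqids.put(Bytes.toBytes("regionA"), 5L);
   * seqids.put(Bytes.toBytes("regionB"), 12L);
   * // Returns only "regionA": its oldest edit (5) predates seqid 10.
   * byte [][] stale = findMemstoresWithEditsOlderThan(10, seqids);
   * </pre>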
602    * @param oldestWALseqid
603    * @param regionsToSeqids
604    * @return All regions whose seqid is less than <code>oldestWALseqid</code> (Not
605    * necessarily in order).  Null if no regions found.
606    */
607   static byte [][] findMemstoresWithEditsOlderThan(final long oldestWALseqid,
608       final Map<byte [], Long> regionsToSeqids) {
609     //  This method is static so it can be unit tested more easily.
610     List<byte []> regions = null;
611     for (Map.Entry<byte [], Long> e: regionsToSeqids.entrySet()) {
612       if (e.getValue().longValue() < oldestWALseqid) {
613         if (regions == null) regions = new ArrayList<byte []>();
614         regions.add(e.getKey());
615       }
616     }
617     return regions == null?
618       null: regions.toArray(new byte [][] {HConstants.EMPTY_BYTE_ARRAY});
619   }
620 
621   /*
622    * @return Logs older than this id are safe to remove.
623    */
624   private Long getOldestOutstandingSeqNum() {
625     return Collections.min(this.lastSeqWritten.values());
626   }
627 
628   private byte [] getOldestRegion(final Long oldestOutstandingSeqNum) {
629     byte [] oldestRegion = null;
630     for (Map.Entry<byte [], Long> e: this.lastSeqWritten.entrySet()) {
631       if (e.getValue().longValue() == oldestOutstandingSeqNum.longValue()) {
632         oldestRegion = e.getKey();
633         break;
634       }
635     }
636     return oldestRegion;
637   }
638 
639   /*
640    * Cleans up the current writer, closing it and adding it to outputfiles.
641    * Presumes we're operating inside an updateLock scope.
642    * @return Path to current writer or null if none.
643    * @throws IOException
644    */
645   private Path cleanupCurrentWriter(final long currentfilenum)
646   throws IOException {
647     Path oldFile = null;
648     if (this.writer != null) {
649       // Close the current writer, get a new one.
650       try {
651         this.writer.close();
652       } catch (IOException e) {
653         // Failed close of log file.  Means we're losing edits.  For now,
654         // shut ourselves down to minimize loss.  Alternative is to try and
655         // keep going.  See HBASE-930.
656         FailedLogCloseException flce =
657           new FailedLogCloseException("#" + currentfilenum);
658         flce.initCause(e);
659         throw flce;
660       }
661       if (currentfilenum >= 0) {
662         oldFile = computeFilename();
663         this.outputfiles.put(Long.valueOf(this.logSeqNum.get() - 1), oldFile);
664       }
665     }
666     return oldFile;
667   }
668 
669   private void archiveLogFile(final Path p, final Long seqno) throws IOException {
670     Path newPath = getHLogArchivePath(this.oldLogDir, p);
671     LOG.info("moving old hlog file " + FSUtils.getPath(p) +
672       " whose highest sequenceid is " + seqno + " to " +
673       FSUtils.getPath(newPath));
674     this.fs.rename(p, newPath);
675   }
676 
677   /**
678    * This is a convenience method that computes a new filename with a given
679    * file-number.
680    * @return Path
681    */
682   protected Path computeFilename() {
683     if (filenum < 0) {
684       throw new RuntimeException("hlog file number can't be < 0");
685     }
686     return new Path(dir, prefix + "." + filenum);
687   }
688 
689   /**
690    * Shut down the log and delete the log directory
691    *
692    * @throws IOException
693    */
694   public void closeAndDelete() throws IOException {
695     close();
696     FileStatus[] files = fs.listStatus(this.dir);
697     for(FileStatus file : files) {
698       fs.rename(file.getPath(),
699           getHLogArchivePath(this.oldLogDir, file.getPath()));
700     }
701     LOG.debug("Moved " + files.length + " log files to " +
702         FSUtils.getPath(this.oldLogDir));
703     fs.delete(dir, true);
704   }
705 
706   /**
707    * Shut down the log.
708    *
709    * @throws IOException
710    */
711   public void close() throws IOException {
712     try {
713       logSyncerThread.interrupt();
714       // Make sure we synced everything
715       logSyncerThread.join(this.optionalFlushInterval*2);
716     } catch (InterruptedException e) {
717       LOG.error("Exception while waiting for syncer thread to die", e);
718     }
719 
720     cacheFlushLock.lock();
721     try {
722       synchronized (updateLock) {
723         this.closed = true;
724         if (LOG.isDebugEnabled()) {
725           LOG.debug("closing hlog writer in " + this.dir.toString());
726         }
727         this.writer.close();
728       }
729     } finally {
730       cacheFlushLock.unlock();
731     }
732   }
733 
734   /** Append an entry to the log.
735    *
736    * @param regionInfo
737    * @param logEdit
738    * @param now Time of this edit write.
739    * @throws IOException
740    */
741   public void append(HRegionInfo regionInfo, WALEdit logEdit,
742     final long now,
743     final boolean isMetaRegion)
744   throws IOException {
745     byte [] regionName = regionInfo.getRegionName();
746     byte [] tableName = regionInfo.getTableDesc().getName();
747     this.append(regionInfo, makeKey(regionName, tableName, -1, now), logEdit);
748   }
749 
750   /**
751    * @param regionName
752    * @param tableName
753    * @param seqnum
754    * @param now
755    * @return New log key.
755    */
756   protected HLogKey makeKey(byte[] regionName, byte[] tableName, long seqnum, long now) {
757     return new HLogKey(regionName, tableName, seqnum, now);
758   }
759 
760 
761 
762   /** Append an entry to the log.
763    *
764    * @param regionInfo
765    * @param logEdit
766    * @param logKey
767    * @throws IOException
768    */
769   public void append(HRegionInfo regionInfo, HLogKey logKey, WALEdit logEdit)
770   throws IOException {
771     if (this.closed) {
772       throw new IOException("Cannot append; log is closed");
773     }
774     byte [] regionName = regionInfo.getRegionName();
775     synchronized (updateLock) {
776       long seqNum = obtainSeqNum();
777       logKey.setLogSeqNum(seqNum);
778       // The 'lastSeqWritten' map holds the sequence number of the oldest
779       // write for each region (i.e. the first edit added to the particular
780       // memstore). When the cache is flushed, the entry for the
781       // region being flushed is removed if the sequence number of the flush
782       // is greater than or equal to the value in lastSeqWritten.
783       this.lastSeqWritten.putIfAbsent(regionName, Long.valueOf(seqNum));
784       doWrite(regionInfo, logKey, logEdit);
785       this.unflushedEntries.incrementAndGet();
786       this.numEntries.incrementAndGet();
787     }
788 
789     // sync txn to file system
790     this.sync(regionInfo.isMetaRegion());
791   }
792 
793   /**
794    * Append a set of edits to the log. Log edits are keyed by regionName,
795    * rowname, and log-sequence-id.
796    *
797    * Later, if we sort by these keys, we obtain all the relevant edits for a
798    * given key-range of the HRegion (TODO). Any edits that do not have a
799    * matching COMPLETE_CACHEFLUSH message can be discarded.
800    *
801    * <p>
802    * Logs cannot be restarted once closed, or once the HLog process dies. Each
803    * time the HLog starts, it must create a new log. This means that other
804    * systems should process the log appropriately upon each startup (and prior
805    * to initializing HLog).
806    *
807    * synchronized prevents appends during the completion of a cache flush or for
808    * the duration of a log roll.
809    *
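   * <p>A sketch of appending one edit (<code>kv</code> is a KeyValue for the
   * row being written; <code>wal</code> is the region server's HLog):
   * <pre>
   * WALEdit edits = new WALEdit();
   * edits.add(kv);
   * wal.append(info, tableName, edits, System.currentTimeMillis());
   * </pre>
   *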
810    * @param info
811    * @param tableName
812    * @param edits
813    * @param now
814    * @throws IOException
815    */
816   public void append(HRegionInfo info, byte [] tableName, WALEdit edits,
817     final long now)
818   throws IOException {
819     if (edits.isEmpty()) return;
820     
821     byte[] regionName = info.getRegionName();
822     if (this.closed) {
823       throw new IOException("Cannot append; log is closed");
824     }
825     synchronized (this.updateLock) {
826       long seqNum = obtainSeqNum();
827       // The 'lastSeqWritten' map holds the sequence number of the oldest
828       // write for each region (i.e. the first edit added to the particular
829       // memstore). When the cache is flushed, the entry for the
830       // region being flushed is removed if the sequence number of the flush
831       // is greater than or equal to the value in lastSeqWritten.
832       this.lastSeqWritten.putIfAbsent(regionName, seqNum);
833       HLogKey logKey = makeKey(regionName, tableName, seqNum, now);
834       doWrite(info, logKey, edits);
835       this.numEntries.incrementAndGet();
836 
837       // Only count 1 row as an unflushed entry.
838       this.unflushedEntries.incrementAndGet();
839     }
840     // sync txn to file system
841     this.sync(info.isMetaRegion());
842   }
843 
844   /**
845    * This thread is responsible for calling syncFs and buffering up the
846    * writers while that happens.
847    */
848    class LogSyncer extends Thread {
849 
850     // Using fairness to make sure locks are given in order
851     private final ReentrantLock lock = new ReentrantLock(true);
852 
853     // Condition used to wait until we have something to sync
854     private final Condition queueEmpty = lock.newCondition();
855 
856     // Condition used to signal that the sync is done
857     private final Condition syncDone = lock.newCondition();
858 
859     private final long optionalFlushInterval;
860 
861     private boolean syncerShuttingDown = false;
862 
863     LogSyncer(long optionalFlushInterval) {
864       this.optionalFlushInterval = optionalFlushInterval;
865     }
866 
867     @Override
868     public void run() {
869       try {
870         lock.lock();
871         // awaiting with a timeout doesn't always
872         // throw exceptions on interrupt
873         while(!this.isInterrupted()) {
874 
875           // Wait until something has to be hflushed or do it if we waited
876           // enough time (useful if something appends but does not hflush).
877           // 0 or less means that it timed out and maybe waited a bit more.
878           if (!(queueEmpty.awaitNanos(
879               this.optionalFlushInterval*1000000) <= 0)) {
880             forceSync = true;
881           }
882 
883           // We got the signal, let's hflush. We currently own the lock so new
884           // writes are waiting to acquire it in addToSyncQueue while the ones
885           // we hflush are waiting on await()
886           hflush();
887 
888           // Release all the clients waiting on the hflush. Notice that we still
889           // own the lock until we get back to await at which point all the
890           // other threads waiting will first acquire and release locks
891           syncDone.signalAll();
892         }
893       } catch (IOException e) {
894         LOG.error("Error while syncing, requesting close of hlog ", e);
895         requestLogRoll();
896       } catch (InterruptedException e) {
897         LOG.debug(getName() + " interrupted while waiting for sync requests");
898       } finally {
899         syncerShuttingDown = true;
900         syncDone.signalAll();
901         lock.unlock();
902         LOG.info(getName() + " exiting");
903       }
904     }
905 
906     /**
907      * This method first signals the thread that there's a sync needed
908      * and then waits for it to happen before returning.
909      */
910     public void addToSyncQueue(boolean force) {
911 
912       // Don't bother if somehow our append was already hflushed
913       if (unflushedEntries.get() == 0) {
914         return;
915       }
916       lock.lock();
917       try {
918         if (syncerShuttingDown) {
919           LOG.warn(getName() + " was shut down while waiting for sync");
920           return;
921         }
922         if(force) {
923           forceSync = true;
924         }
925         // Wake the thread
926         queueEmpty.signal();
927 
928         // Wait for it to hflush
929         syncDone.await();
930       } catch (InterruptedException e) {
931         LOG.debug(getName() + " was interrupted while waiting for sync", e);
932       }
933       finally {
934         lock.unlock();
935       }
936     }
937   }
938 
939   public void sync() {
940     sync(false);
941   }
942 
943   /**
944    * This method calls the LogSyncer in order to group commit the sync
945    * with other threads.
946    * @param force For catalog regions, force the sync to happen
947    */
948   public void sync(boolean force) {
949     logSyncerThread.addToSyncQueue(force);
950   }
951 
952   public void hflush() throws IOException {
953     synchronized (this.updateLock) {
954       if (this.closed) {
955         return;
956       }
957       boolean logRollRequested = false;
958       if (this.forceSync ||
959           this.unflushedEntries.get() >= this.flushlogentries) {
960         try {
961           long now = System.currentTimeMillis();
962           this.writer.sync();
963           syncTime += System.currentTimeMillis() - now;
964           syncOps++;
965           this.forceSync = false;
966           this.unflushedEntries.set(0);
967 
968           // if the number of replicas in HDFS has fallen below the initial
969           // value, then roll logs.
970           try {
971             int numCurrentReplicas = getLogReplication();
972             if (numCurrentReplicas != 0 &&
973                 numCurrentReplicas < this.initialReplication) {
974               LOG.warn("HDFS pipeline error detected. " +
975                   "Found " + numCurrentReplicas + " replicas but expecting " +
976                   this.initialReplication + " replicas. " +
977                   " Requesting close of hlog.");
978               requestLogRoll();
979               logRollRequested = true;
980             }
981           } catch (Exception e) {
982             LOG.warn("Unable to invoke DFSOutputStream.getNumCurrentReplicas: " + e +
983               "; still proceeding ahead...");
984           }
985         } catch (IOException e) {
986           LOG.fatal("Could not append. Requesting close of hlog", e);
987           requestLogRoll();
988           throw e;
989         }
990       }
991 
992       if (!logRollRequested && (this.writer.getLength() > this.logrollsize)) {
993         requestLogRoll();
994       }
995     }
996   }
997 
998   /**
999    * This method gets the datanode replication count for the current HLog.
1000    *
1001    * If the pipeline isn't started yet or is empty, you will get the default
1002    * replication factor.  Therefore, if this function returns 0, it means you
1003    * are not properly running with the HDFS-826 patch.
1004    *
1005    * @throws Exception
1006    */
1007   int getLogReplication() throws Exception {
1008     if(this.getNumCurrentReplicas != null && this.hdfs_out != null) {
1009       Object repl = this.getNumCurrentReplicas.invoke(this.hdfs_out, NO_ARGS);
1010       if (repl instanceof Integer) {
1011         return ((Integer)repl).intValue();
1012       }
1013     }
1014     return 0;
1015   }
1016 
1017   boolean canGetCurReplicas() {
1018     return this.getNumCurrentReplicas != null;
1019   }
1020 
1021   public void hsync() throws IOException {
1022     // Not yet implemented up in hdfs so just call hflush.
1023     hflush();
1024   }
1025 
1026   private void requestLogRoll() {
1027     if (this.listener != null) {
1028       this.listener.logRollRequested();
1029     }
1030   }
1031 
1032   protected void doWrite(HRegionInfo info, HLogKey logKey, WALEdit logEdit)
1033   throws IOException {
1034     if (!this.enabled) {
1035       return;
1036     }
1037     if (!this.logEntryVisitors.isEmpty()) {
1038       for (LogEntryVisitor visitor : this.logEntryVisitors) {
1039         visitor.visitLogEntryBeforeWrite(info, logKey, logEdit);
1040       }
1041     }
1042     try {
1043       long now = System.currentTimeMillis();
1044       this.writer.append(new HLog.Entry(logKey, logEdit));
1045       long took = System.currentTimeMillis() - now;
1046       writeTime += took;
1047       writeOps++;
1048       if (took > 1000) {
1049         LOG.warn(Thread.currentThread().getName() + " took " + took +
1050           "ms appending an edit to hlog; editcount=" + this.numEntries.get());
1051       }
1052     } catch (IOException e) {
1053       LOG.fatal("Could not append. Requesting close of hlog", e);
1054       requestLogRoll();
1055       throw e;
1056     }
1057   }
1058 
1059   /** @return How many items have been added to the log */
1060   int getNumEntries() {
1061     return numEntries.get();
1062   }
1063 
1064   /**
1065    * Obtain a log sequence number.
1066    */
1067   private long obtainSeqNum() {
1068     return this.logSeqNum.incrementAndGet();
1069   }
1070 
1071   /** @return the number of log files in use */
1072   int getNumLogFiles() {
1073     return outputfiles.size();
1074   }
1075 
1076   /**
1077    * By acquiring a log sequence ID, we can allow log messages to continue while
1078    * we flush the cache.
1079    *
1080    * Acquire a lock so that we do not roll the log between the start and
1081    * completion of a cache-flush. Otherwise the log-seq-id for the flush will
1082    * not appear in the correct logfile.
1083    *
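   * <p>The expected calling pattern, as a sketch (names illustrative); exactly
   * one of completeCacheFlush or abortCacheFlush must follow so the lock taken
   * here is released:
   * <pre>
   * long seqId = wal.startCacheFlush();
   * boolean flushed = false;
   * try {
   *   // write the memstore snapshot out to store files...
   *   flushed = true;
   * } finally {
   *   if (flushed) {
   *     wal.completeCacheFlush(regionName, tableName, seqId, isMetaRegion);
   *   } else {
   *     wal.abortCacheFlush();
   *   }
   * }
   * </pre>
   *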
1084    * @return sequence ID to pass
1085    * {@link #completeCacheFlush(byte[], byte[], long, boolean)}
1086    * @see #completeCacheFlush(byte[], byte[], long, boolean)
1087    * @see #abortCacheFlush()
1088    */
1089   public long startCacheFlush() {
1090     this.cacheFlushLock.lock();
1091     return obtainSeqNum();
1092   }
1093 
1094   /**
1095    * Complete the cache flush
1096    *
1097    * Protected by cacheFlushLock
1098    *
1099    * @param regionName
1100    * @param tableName
1101    * @param logSeqId
1102    * @throws IOException
1103    */
1104   public void completeCacheFlush(final byte [] regionName, final byte [] tableName,
1105     final long logSeqId,
1106     final boolean isMetaRegion)
1107   throws IOException {
1108     try {
1109       if (this.closed) {
1110         return;
1111       }
1112       synchronized (updateLock) {
1113         long now = System.currentTimeMillis();
1114         WALEdit edit = completeCacheFlushLogEdit();
1115         HLogKey key = makeKey(regionName, tableName, logSeqId, now);
1117         this.writer.append(new Entry(key, edit));
1118         writeTime += System.currentTimeMillis() - now;
1119         writeOps++;
1120         this.numEntries.incrementAndGet();
1121         Long seq = this.lastSeqWritten.get(regionName);
1122         if (seq != null && logSeqId >= seq.longValue()) {
1123           this.lastSeqWritten.remove(regionName);
1124         }
1125       }
1126       // sync txn to file system
1127       this.sync(isMetaRegion);
1128 
1129     } finally {
1130       this.cacheFlushLock.unlock();
1131     }
1132   }
1133 
1134   private WALEdit completeCacheFlushLogEdit() {
1135     KeyValue kv = new KeyValue(METAROW, METAFAMILY, null,
1136       System.currentTimeMillis(), COMPLETE_CACHE_FLUSH);
1137     WALEdit e = new WALEdit();
1138     e.add(kv);
1139     return e;
1140   }
1141 
1142   /**
1143    * Abort a cache flush.
1144    * Call if the flush fails. Note that the only recovery for an aborted flush
1145    * currently is a restart of the regionserver so the snapshot content dropped
1146    * by the failure gets restored to the memstore.
1147    */
1148   public void abortCacheFlush() {
1149     this.cacheFlushLock.unlock();
1150   }
1151 
1152   /**
1153    * @param family
1154    * @return true if the column is a meta column
1155    */
1156   public static boolean isMetaFamily(byte [] family) {
1157     return Bytes.equals(METAFAMILY, family);
1158   }
1159 
1160   /**
1161    * Split up a bunch of regionserver commit log files that are no longer
1162    * being written to, into new files, one per region, for regions to replay on
1163    * startup. Delete the old log files when finished.
1164    *
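   * <p>A sketch of an invocation (directory layout illustrative):
   * <pre>
   * Path rootDir = new Path("/hbase");
   * Path srcDir = new Path(rootDir, ".logs/myserver,60020,1234567890");
   * Path oldLogDir = new Path(rootDir, ".oldlogs");
   * List&lt;Path&gt; splits = HLog.splitLog(rootDir, srcDir, oldLogDir, fs, conf);
   * </pre>
   *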
1165    * @param rootDir qualified root directory of the HBase instance
1166    * @param srcDir Directory of log files to split: e.g.
1167    *                <code>${ROOTDIR}/log_HOST_PORT</code>
1168    * @param oldLogDir directory where processed (split) logs will be archived to
1169    * @param fs FileSystem
1170    * @param conf Configuration
1171    * @throws IOException will throw if corrupted hlogs aren't tolerated
1172    * @return the list of splits
1173    */
1174   public static List<Path> splitLog(final Path rootDir, final Path srcDir,
1175     Path oldLogDir, final FileSystem fs, final Configuration conf)
1176   throws IOException {
1177 
1178     long millis = System.currentTimeMillis();
1179     List<Path> splits = null;
1180     if (!fs.exists(srcDir)) {
1181       // Nothing to do
1182       return splits;
1183     }
1184     FileStatus [] logfiles = fs.listStatus(srcDir);
1185     if (logfiles == null || logfiles.length == 0) {
1186       // Nothing to do
1187       return splits;
1188     }
1189     LOG.info("Splitting " + logfiles.length + " hlog(s) in " +
1190       srcDir.toString());
1191     splits = splitLog(rootDir, srcDir, oldLogDir, logfiles, fs, conf);
1192     try {
1193       FileStatus[] files = fs.listStatus(srcDir);
1194       for(FileStatus file : files) {
1195         Path newPath = getHLogArchivePath(oldLogDir, file.getPath());
1196         LOG.info("Moving " +  FSUtils.getPath(file.getPath()) + " to " +
1197                    FSUtils.getPath(newPath));
1198         fs.rename(file.getPath(), newPath);
1199       }
1200       LOG.debug("Moved " + files.length + " log files to " +
1201         FSUtils.getPath(oldLogDir));
1202       fs.delete(srcDir, true);
1203     } catch (IOException e) {
1204       e = RemoteExceptionHandler.checkIOException(e);
1205       IOException io = new IOException("Cannot delete: " + srcDir);
1206       io.initCause(e);
1207       throw io;
1208     }
1209     long endMillis = System.currentTimeMillis();
1210     LOG.info("hlog file splitting completed in " + (endMillis - millis) +
1211         " millis for " + srcDir.toString());
1212     return splits;
1213   }
1214 
1215   // Private immutable datastructure to hold Writer and its Path.
1216   private final static class WriterAndPath {
1217     final Path p;
1218     final Writer w;
1219     WriterAndPath(final Path p, final Writer w) {
1220       this.p = p;
1221       this.w = w;
1222     }
1223   }
1224 
1225   @SuppressWarnings("unchecked")
1226   public static Class<? extends HLogKey> getKeyClass(Configuration conf) {
1227      return (Class<? extends HLogKey>)
1228        conf.getClass("hbase.regionserver.hlog.keyclass", HLogKey.class);
1229   }
1230 
1231   public static HLogKey newKey(Configuration conf) throws IOException {
1232     Class<? extends HLogKey> keyClass = getKeyClass(conf);
1233     try {
1234       return keyClass.newInstance();
1235     } catch (InstantiationException e) {
1236       throw new IOException("cannot create hlog key", e);
1237     } catch (IllegalAccessException e) {
1238       throw new IOException("cannot create hlog key", e);
1239     }
1240   }
1241 
1242   /**
1243    * Sorts the HLog edits in the given list of logfiles (that are a mix of edits
1244    * on multiple regions) by region and then splits them per region directory,
1245    * in batches of hbase.hlog.split.batch.size.
1246    *
1247    * A batch consists of a set of log files that will be sorted into a single map
1248    * of edits indexed by region; the resulting map is concurrently written by
1249    * multiple threads to their corresponding regions.
1250    *
1251    * Each batch consists of one or more log files that are
1252    *  - recovered (each file is opened for append then closed, to ensure no process is still writing to it)
1253    *  - parsed (each edit in the log is appended to a list of edits indexed by region; see {@link #parseHLog} for more details)
1254    *  - marked as either processed or corrupt depending on parsing outcome
1255    *  - written: the resulting edits, indexed by region, are concurrently written to their corresponding region directories
1256    *  - archived: the original files are then moved to a different directory
1257    *
1260    * @param rootDir  hbase directory
1261    * @param srcDir   logs directory
1262    * @param oldLogDir directory where processed logs are archived to
1263    * @param logfiles the list of log files to split
1264    * @param fs
1265    * @param conf
1266    * @return the list of splits
1267    * @throws IOException
1268    */
1269   private static List<Path> splitLog(final Path rootDir, final Path srcDir,
1270     Path oldLogDir, final FileStatus[] logfiles, final FileSystem fs,
1271     final Configuration conf)
1272   throws IOException {
1273     List<Path> processedLogs = new ArrayList<Path>();
1274     List<Path> corruptedLogs = new ArrayList<Path>();
1275     final Map<byte [], WriterAndPath> logWriters =
1276       Collections.synchronizedMap(
1277         new TreeMap<byte [], WriterAndPath>(Bytes.BYTES_COMPARATOR));
1278     List<Path> splits = null;
1279 
1280     // Number of logs in a read batch
1281     // More means faster but bigger mem consumption
1282     //TODO make a note on the conf rename and update hbase-site.xml if needed
1283     int logFilesPerStep = conf.getInt("hbase.hlog.split.batch.size", 3);
1284     boolean skipErrors = conf.getBoolean("hbase.hlog.split.skip.errors", false);
1285 
1286 
1287     try {
1288       int i = -1;
1289       while (i < logfiles.length) {
1290         final Map<byte[], LinkedList<Entry>> editsByRegion =
1291           new TreeMap<byte[], LinkedList<Entry>>(Bytes.BYTES_COMPARATOR);
1292         for (int j = 0; j < logFilesPerStep; j++) {
1293           i++;
1294           if (i == logfiles.length) {
1295             break;
1296           }
1297           FileStatus log = logfiles[i];
1298           Path logPath = log.getPath();
1299           long logLength = log.getLen();
1300           LOG.debug("Splitting hlog " + (i + 1) + " of " + logfiles.length +
1301             ": " + logPath + ", length=" + logLength );
1302           try {
1303             recoverFileLease(fs, logPath, conf);
1304             parseHLog(log, editsByRegion, fs, conf);
1305             processedLogs.add(logPath);
1306           } catch (IOException e) {
1307             if (skipErrors) {
1308               LOG.warn("Got an IOException while parsing hlog " + logPath +
1309                 ". Marking as corrupted", e);
1310               corruptedLogs.add(logPath);
1311             } else {
1312               throw e;
1313             }
1314           }
1315         }
1316         writeEditsBatchToRegions(editsByRegion, logWriters, rootDir, fs, conf);
1317       }
1318       if (fs.listStatus(srcDir).length > processedLogs.size() + corruptedLogs.size()) {
1319         throw new IOException("Discovered orphan hlog after split. Maybe " +
1320           "HRegionServer was not dead when we started");
1321       }
1322       archiveLogs(corruptedLogs, processedLogs, oldLogDir, fs, conf);
1323     } finally {
1324       splits = new ArrayList<Path>(logWriters.size());
1325       for (WriterAndPath wap : logWriters.values()) {
1326         wap.w.close();
1327         splits.add(wap.p);
1328         LOG.debug("Closed " + wap.p);
1329       }
1330     }
1331     return splits;
1332   }
1333 
1334 
1335   /**
1336    * Utility class that lets us keep track of the edit together with its key.
1337    * Only used when splitting logs.
1338    */
1339   public static class Entry implements Writable {
1340     private WALEdit edit;
1341     private HLogKey key;
1342 
1343     public Entry() {
1344       edit = new WALEdit();
1345       key = new HLogKey();
1346     }
1347 
1348     /**
1349      * Constructor for both params
1350      * @param key log's key
1351      * @param edit log's edit
1352      */
1353     public Entry(HLogKey key, WALEdit edit) {
1354       super();
1355       this.key = key;
1356       this.edit = edit;
1357     }
1358     /**
1359      * Gets the edit
1360      * @return edit
1361      */
1362     public WALEdit getEdit() {
1363       return edit;
1364     }
1365     /**
1366      * Gets the key
1367      * @return key
1368      */
1369     public HLogKey getKey() {
1370       return key;
1371     }
1372 
1373     @Override
1374     public String toString() {
1375       return this.key + "=" + this.edit;
1376     }
1377 
1378     @Override
1379     public void write(DataOutput dataOutput) throws IOException {
1380       this.key.write(dataOutput);
1381       this.edit.write(dataOutput);
1382     }
1383 
1384     @Override
1385     public void readFields(DataInput dataInput) throws IOException {
1386       this.key.readFields(dataInput);
1387       this.edit.readFields(dataInput);
1388     }
1389   }
1390 
1391   /**
1392    * Construct the HLog directory name
1393    *
1394    * @param info HServerInfo for server
1395    * @return the HLog directory name
1396    */
1397   public static String getHLogDirectoryName(HServerInfo info) {
1398     return getHLogDirectoryName(info.getServerName());
1399   }
1400 
1401   /**
1402    * Construct the HLog directory name
1403    *
1404    * @param serverAddress
1405    * @param startCode
1406    * @return the HLog directory name
1407    */
1408   public static String getHLogDirectoryName(String serverAddress,
1409       long startCode) {
1410     if (serverAddress == null || serverAddress.length() == 0) {
1411       return null;
1412     }
1413     return getHLogDirectoryName(
1414         HServerInfo.getServerName(serverAddress, startCode));
1415   }
1416 
1417   /**
1418    * Construct the HLog directory name
1419    *
1420    * @param serverName
1421    * @return the HLog directory name
1422    */
1423   public static String getHLogDirectoryName(String serverName) {
1424     StringBuilder dirName = new StringBuilder(HConstants.HREGION_LOGDIR_NAME);
1425     dirName.append("/");
1426     dirName.append(serverName);
1427     return dirName.toString();
1428   }
1429 
1430   public static boolean validateHLogFilename(String filename) {
1431     return pattern.matcher(filename).matches();
1432   }
1433 
1434   private static Path getHLogArchivePath(Path oldLogDir, Path p) {
1435     return new Path(oldLogDir, p.getName());
1436   }
1437 
1438   /**
1439    * Takes splitLogsMap and concurrently writes them to region directories using a thread pool
1440    *
1441    * @param splitLogsMap map that contains the log splitting result indexed by region
1442    * @param logWriters map that contains a writer per region
1443    * @param rootDir hbase root dir
1444    * @param fs
1445    * @param conf
1446    * @throws IOException
1447    */
1448   private static void writeEditsBatchToRegions(
1449     final Map<byte[], LinkedList<Entry>> splitLogsMap,
1450     final Map<byte[], WriterAndPath> logWriters,
1451     final Path rootDir, final FileSystem fs, final Configuration conf)
1452   throws IOException {
1453     // Number of threads to use when log splitting to rewrite the logs.
1454     // More means faster but bigger mem consumption.
1455     int logWriterThreads =
1456       conf.getInt("hbase.regionserver.hlog.splitlog.writer.threads", 3);
1457     boolean skipErrors = conf.getBoolean("hbase.skip.errors", false);
1458     HashMap<byte[], Future> writeFutureResult = new HashMap<byte[], Future>();
1459     NamingThreadFactory f  = new NamingThreadFactory(
1460             "SplitWriter-%1$d", Executors.defaultThreadFactory());
1461     ThreadPoolExecutor threadPool = (ThreadPoolExecutor)Executors.newFixedThreadPool(logWriterThreads, f);
1462     for (final byte [] region : splitLogsMap.keySet()) {
1463       Callable splitter = createNewSplitter(rootDir, logWriters, splitLogsMap, region, fs, conf);
1464       writeFutureResult.put(region, threadPool.submit(splitter));
1465     }
1466 
1467     threadPool.shutdown();
1468     // Wait for all threads to terminate
1469     try {
1470       for (int j = 0; !threadPool.awaitTermination(5, TimeUnit.SECONDS); j++) {
1471         String message = "Waiting for hlog writers to terminate, elapsed " + j * 5 + " seconds";
1472         if (j < 30) {
1473           LOG.debug(message);
1474         } else {
1475           LOG.info(message);
1476         }
1477 
1478       }
1479     } catch (InterruptedException ex) {
1480       LOG.warn("Hlog writers were interrupted, possible data loss!");
1481       if (!skipErrors) {
1482         //TODO maybe we should fail here regardless of whether skipErrors is set
1483         throw new IOException("Could not finish writing log entries", ex);
1484       }
1485     }
1486 
1487     for (Map.Entry<byte[], Future<Void>> entry : writeFutureResult.entrySet()) {
1488       try {
1489         entry.getValue().get();
1490       } catch (ExecutionException e) {
1491         throw new IOException(e.getCause());
1492       } catch (InterruptedException e1) {
1493         LOG.warn("Writer for region " + Bytes.toString(entry.getKey()) +
1494                 " was interrupted, however the write process should have " +
1495                 "finished. Rethrowing as IOException", e1);
1496         throw new IOException(e1);
1497       }
1498     }
1499   }
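         // Tuning sketch (settings shown are illustrative): the configuration
         // keys read above trade memory for split speed and error tolerance:
         //   Configuration conf = HBaseConfiguration.create();
         //   conf.setInt("hbase.regionserver.hlog.splitlog.writer.threads", 8);
         //   conf.setBoolean("hbase.skip.errors", false);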
1500 
1501   /*
1502    * Parse a single hlog and put the edits into splitLogsMap
1503    *
1504    * @param logfile the log file to split
1505    * @param splitLogsMap output parameter: a map with region names as keys and a
1506    * list of edits as values
1507    * @param fs the filesystem
1508    * @param conf the configuration
1509    * @throws IOException if the hlog is corrupted, or cannot be opened
1510    */
1511   private static void parseHLog(final FileStatus logfile,
1512     final Map<byte[], LinkedList<Entry>> splitLogsMap, final FileSystem fs,
1513     final Configuration conf)
1514   throws IOException {
1515     // Check for possibly empty file. With appends, currently Hadoop reports a
1516     // zero length even if the file has been sync'd. Revisit if HDFS-376 or
1517     // HDFS-878 is committed.
1518     long length = logfile.getLen();
1519     if (length <= 0) {
1520       LOG.warn("File " + logfile.getPath() + " might still be open, length is 0");
1521     }
1522     Path path = logfile.getPath();
1523     Reader in;
1524     int editsCount = 0;
1525     try {
1526       in = HLog.getReader(fs, path, conf);
1527     } catch (EOFException e) {
1528       if (length <= 0) {
1529         //TODO should we ignore an empty, not-last log file if skip.errors is false?
1530         //Either way, the caller should decide what to do. E.g. ignore if this is the last
1531         //log in sequence.
1532         //TODO is this scenario still possible if the log has been recovered (i.e. closed)?
1533         LOG.warn("Could not open " + path + " for reading. File is empty", e);
1534         return;
1535       } else {
1536         throw e;
1537       }
1538     }
1539     try {
1540       Entry entry;
1541       while ((entry = in.next()) != null) {
1542         byte[] region = entry.getKey().getRegionName();
1543         LinkedList<Entry> queue = splitLogsMap.get(region);
1544         if (queue == null) {
1545           queue = new LinkedList<Entry>();
1546           splitLogsMap.put(region, queue);
1547         }
1548         queue.addLast(entry);
1549         editsCount++;
1550       }
1551       LOG.debug("Pushed=" + editsCount + " entries from " + path);
1552     } finally {
1553       try {
1554         if (in != null) {
1555           in.close();
1556         }
1557       } catch (IOException e) {
1558         LOG.warn("Close log reader in finally threw exception -- continuing", e);
1559       }
1560     }
1561   }
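         // Usage sketch (caller code is illustrative): keys are byte[] region
         // names, so supply a comparator-based map to make later lookups work
         // by value rather than array identity:
         //   Map<byte[], LinkedList<Entry>> splitLogsMap =
         //     new TreeMap<byte[], LinkedList<Entry>>(Bytes.BYTES_COMPARATOR);
         //   parseHLog(logfile, splitLogsMap, fs, conf);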
1562 
1563   private static Callable<Void> createNewSplitter(final Path rootDir,
1564     final Map<byte[], WriterAndPath> logWriters,
1565     final Map<byte[], LinkedList<Entry>> logEntries,
1566     final byte[] region, final FileSystem fs, final Configuration conf) {
1567     return new Callable<Void>() {
1568       public String getName() {
1569         return "Split writer thread for region " + Bytes.toStringBinary(region);
1570       }
1571 
1572       @Override
1573       public Void call() throws IOException {
1574         LinkedList<Entry> entries = logEntries.get(region);
1575         LOG.debug(this.getName() + " got " + entries.size() + " edits to process");
1576         long threadTime = System.currentTimeMillis();
1577         try {
1578           int editsCount = 0;
1579           WriterAndPath wap = logWriters.get(region);
1580           for (Entry logEntry: entries) {
1581             if (wap == null) {
1582               Path regionedits = getRegionSplitEditsPath(fs, logEntry, rootDir);
1583               if (fs.exists(regionedits)) {
1584                 LOG.warn("Found existing old edits file. It could be the " +
1585                   "result of a previous failed split attempt. Deleting " +
1586                   regionedits + ", length=" + fs.getFileStatus(regionedits).getLen());
1587                 if (!fs.delete(regionedits, false)) {
1588                   LOG.warn("Failed delete of old " + regionedits);
1589                 }
1590               }
1591               Writer w = createWriter(fs, regionedits, conf);
1592               wap = new WriterAndPath(regionedits, w);
1593               logWriters.put(region, wap);
1594               LOG.debug("Creating writer path=" + regionedits +
1595                 " region=" + Bytes.toStringBinary(region));
1596             }
1597             wap.w.append(logEntry);
1598             editsCount++;
1599           }
1600           LOG.debug(this.getName() + " Applied " + editsCount +
1601             " total edits to " + Bytes.toStringBinary(region) +
1602             " in " + (System.currentTimeMillis() - threadTime) + "ms");
1603         } catch (IOException e) {
1604           e = RemoteExceptionHandler.checkIOException(e);
1605           LOG.fatal(this.getName() + " got an exception while writing log entries", e);
1606           throw e;
1607         }
1608         return null;
1609       }
1610     };
1611   }
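         // Note: split writer tasks run concurrently, and each may put into the
         // shared logWriters map; this is only safe if the caller supplies a
         // thread-safe map (e.g. a ConcurrentSkipListMap keyed with
         // Bytes.BYTES_COMPARATOR), since plain maps do not tolerate concurrent
         // modification.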
1612 
1613   /**
1614    * Moves processed logs to the oldLogDir after successful processing.
1615    * Moves corrupted logs (any log that could not be successfully parsed)
1616    * to corruptDir (.corrupt) for later investigation.
1617    *
1618    * @param corruptedLogs logs that failed to parse
1619    * @param processedLogs logs that were successfully parsed
1620    * @param oldLogDir directory to archive the processed logs into
1621    * @param fs the filesystem
1622    * @param conf the configuration
1623    * @throws IOException if a log cannot be moved
1624    */
1625   private static void archiveLogs(final List<Path> corruptedLogs,
1626     final List<Path> processedLogs, final Path oldLogDir,
1627     final FileSystem fs, final Configuration conf)
1628   throws IOException{
1629     final Path corruptDir = new Path(conf.get(HConstants.HBASE_DIR),
1630       conf.get("hbase.regionserver.hlog.splitlog.corrupt.dir", ".corrupt"));
1631 
1632     fs.mkdirs(corruptDir);
1633     fs.mkdirs(oldLogDir);
1634 
1635     for (Path corrupted: corruptedLogs) {
1636       Path p = new Path(corruptDir, corrupted.getName());
1637       LOG.info("Moving corrupted log " + corrupted + " to " + p);
1638       fs.rename(corrupted, p);
1639     }
1640 
1641     for (Path p: processedLogs) {
1642       Path newPath = getHLogArchivePath(oldLogDir, p);
1643       fs.rename(p, newPath);
1644       LOG.info("Archived processed log " + p + " to " + newPath);
1645     }
1646   }
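         // Resulting layout (paths illustrative; .corrupt is the default and is
         // overridable via hbase.regionserver.hlog.splitlog.corrupt.dir):
         //   <hbase.rootdir>/.corrupt/<logfile>  -- logs that failed to parse
         //   <oldLogDir>/<logfile>               -- successfully processed logs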
1647 
1648   /*
1649    * Path to a file under RECOVERED_EDITS_DIR directory of the region found in
1650    * <code>logEntry</code> named for the sequenceid in the passed
1651    * <code>logEntry</code>: e.g. /hbase/some_table/2323432434/recovered.edits/2332.
1652    * This method also ensures existence of RECOVERED_EDITS_DIR under the region
1653    * creating it if necessary.
1654    * @param fs the filesystem
1655    * @param logEntry the entry whose region determines the edits path
1656    * @param rootDir HBase root dir.
1657    * @return Path to file into which to dump split log edits.
1658    * @throws IOException
1659    */
1660   private static Path getRegionSplitEditsPath(final FileSystem fs,
1661       final Entry logEntry, final Path rootDir)
1662   throws IOException {
1663     Path tableDir = HTableDescriptor.getTableDir(rootDir,
1664       logEntry.getKey().getTablename());
1665     Path regiondir = HRegion.getRegionDir(tableDir,
1666       HRegionInfo.encodeRegionName(logEntry.getKey().getRegionName()));
1667     Path dir = getRegionDirRecoveredEditsDir(regiondir);
1668     if (!fs.exists(dir)) {
1669       if (!fs.mkdirs(dir)) LOG.warn("mkdir failed on " + dir);
1670     }
1671     return new Path(dir,
1672       formatRecoveredEditsFileName(logEntry.getKey().getLogSeqNum()));
1673   }
1674 
1675   static String formatRecoveredEditsFileName(final long seqid) {
1676     return String.format("%019d", seqid);
1677   }
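         // Example: formatRecoveredEditsFileName(2332) returns
         // "0000000000000002332"; padding to 19 digits makes recovered-edits
         // file names sort in sequence-id order as plain strings.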
1678 
1680   /**
1681    * Returns sorted set of edit files made by wal-log splitter.
1682    * @param fs the filesystem
1683    * @param regiondir the region directory to look under
1684    * @return Files in passed <code>regiondir</code> as a sorted set.
1685    * @throws IOException
1686    */
1687   public static NavigableSet<Path> getSplitEditFilesSorted(final FileSystem fs,
1688       final Path regiondir)
1689   throws IOException {
1690     Path editsdir = getRegionDirRecoveredEditsDir(regiondir);
1691     FileStatus[] files = fs.listStatus(editsdir, new PathFilter() {
1692       @Override
1693       public boolean accept(Path p) {
1694         boolean result = false;
1695         try {
1696           // Return files and only files that match the editfile names pattern.
1697           // There can be other files in this directory other than edit files.
1698           // In particular, on error, we'll move aside the bad edit file giving
1699           // it a timestamp suffix.  See moveAsideBadEditsFile.
1700           Matcher m = EDITFILES_NAME_PATTERN.matcher(p.getName());
1701           result = fs.isFile(p) && m.matches();
1702         } catch (IOException e) {
1703           LOG.warn("Failed isFile check on " + p, e);
1704         }
1705         return result;
1706       }
1707     });
1708     NavigableSet<Path> filesSorted = new TreeSet<Path>();
1709     if (files == null) return filesSorted;
1710     for (FileStatus status: files) {
1711       filesSorted.add(status.getPath());
1712     }
1713     return filesSorted;
1714   }
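         // Usage sketch (caller code is illustrative): replay recovered edits
         // in sequence-id order, which the zero-padded names guarantee:
         //   for (Path edits : HLog.getSplitEditFilesSorted(fs, regiondir)) {
         //     // open a reader on "edits" and apply its entries
         //   }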
1715 
1716   /**
1717    * Move aside a bad edits file.
1718    * @param fs the filesystem
1719    * @param edits Edits file to move aside.
1720    * @return The name of the moved aside file.
1721    * @throws IOException
1722    */
1723   public static Path moveAsideBadEditsFile(final FileSystem fs,
1724       final Path edits)
1725   throws IOException {
1726     Path moveAsideName = new Path(edits.getParent(), edits.getName() + "." +
1727       System.currentTimeMillis());
1728     if (!fs.rename(edits, moveAsideName)) {
1729       LOG.warn("Rename failed from " + edits + " to " + moveAsideName);
1730     }
1731     return moveAsideName;
1732   }
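         // Example: a bad file "0000000000000002332" becomes something like
         // "0000000000000002332.1285348800000" (timestamp suffix illustrative);
         // the suffix keeps it from matching EDITFILES_NAME_PATTERN, so
         // getSplitEditFilesSorted skips it on the next pass.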
1733 
1734   /**
1735    * @param regiondir This regions directory in the filesystem.
1736    * @return The directory that holds recovered edits files for the region
1737    * <code>regiondir</code>
1738    */
1739   public static Path getRegionDirRecoveredEditsDir(final Path regiondir) {
1740     return new Path(regiondir, RECOVERED_EDITS_DIR);
1741   }
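         // Example (assuming RECOVERED_EDITS_DIR is "recovered.edits", matching
         // the sample path in getRegionSplitEditsPath above): a region directory
         // /hbase/some_table/2323432434 yields
         // /hbase/some_table/2323432434/recovered.edits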
1742 
1743   /**
1744    * Register a log entry visitor.
1745    * @param visitor the visitor to add
1746    */
1747   public void addLogEntryVisitor(LogEntryVisitor visitor) {
1748     this.logEntryVisitors.add(visitor);
1749   }
1750 
1751   /**
1752    * Unregister a previously added log entry visitor.
1753    * @param visitor the visitor to remove
1754    */
1755   public void removeLogEntryVisitor(LogEntryVisitor visitor) {
1756     this.logEntryVisitors.remove(visitor);
1757   }
1758 
1759   /**
1760    * Register a log actions listener.
1761    * @param list the listener to add
1762    */
1763   public void addLogActionsListerner(LogActionsListener list) {
1764     LOG.info("Adding a log actions listener");
1762     this.actionListeners.add(list);
1763   }
1764 
1765   /**
1766    * Unregister a previously added log actions listener.
1767    * @param list the listener to remove
1768    * @return true if the listener was registered and removed
1769    */
1770   public boolean removeLogActionsListener(LogActionsListener list) {
1766     return this.actionListeners.remove(list);
1767   }
1768 
1769   private static void usage() {
1770     System.err.println("Usage: java org.apache.hadoop.hbase.regionserver.wal.HLog" +
1771         " {--dump <logfile>... | --split <logdir>...}");
1772   }
1773 
1774   /**
1775    * Pass one or more log file names and it will either dump out a text version
1776    * on <code>stdout</code> or split the specified log files.
1777    *
1778    * @param args command line: --dump or --split followed by one or more paths
1779    * @throws IOException
1780    */
1781   public static void main(String[] args) throws IOException {
1782     if (args.length < 2) {
1783       usage();
1784       System.exit(-1);
1785     }
1786     boolean dump = true;
1787     if (!args[0].equals("--dump")) {
1788       if (args[0].equals("--split")) {
1789         dump = false;
1791       } else {
1792         usage();
1793         System.exit(-1);
1794       }
1795     }
1796     Configuration conf = HBaseConfiguration.create();
1797     FileSystem fs = FileSystem.get(conf);
1798     final Path baseDir = new Path(conf.get(HConstants.HBASE_DIR));
1799     final Path oldLogDir = new Path(baseDir, HConstants.HREGION_OLDLOGDIR_NAME);
1800     for (int i = 1; i < args.length; i++) {
1801       Path logPath = new Path(args[i]);
1802       if (!fs.exists(logPath)) {
1803         throw new FileNotFoundException(args[i] + " does not exist");
1804       }
1805       if (dump) {
1806         if (!fs.isFile(logPath)) {
1807           throw new IOException(args[i] + " is not a file");
1808         }
1809         Reader log = getReader(fs, logPath, conf);
1810         try {
1811           HLog.Entry entry;
1812           while ((entry = log.next()) != null) {
1813             System.out.println(entry.toString());
1814           }
1815         } finally {
1816           log.close();
1817         }
1818       } else {
1819         if (!fs.getFileStatus(logPath).isDir()) {
1820           throw new IOException(args[i] + " is not a directory");
1821         }
1822         splitLog(baseDir, logPath, oldLogDir, fs, conf);
1823       }
1824     }
1825   }
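         // Example invocations (paths are illustrative):
         //   java org.apache.hadoop.hbase.regionserver.wal.HLog --dump \
         //       /hbase/.logs/host.example.org,60020,1285348800000/somelog
         //   java org.apache.hadoop.hbase.regionserver.wal.HLog --split \
         //       /hbase/.logs/host.example.org,60020,1285348800000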
1826 
1827   public static final long FIXED_OVERHEAD = ClassSize.align(
1828       ClassSize.OBJECT + (5 * ClassSize.REFERENCE) +
1829       ClassSize.ATOMIC_INTEGER + Bytes.SIZEOF_INT + (3 * Bytes.SIZEOF_LONG));
1830 
1831 }