1   /**
2    * Copyright 2010 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  package org.apache.hadoop.hbase.regionserver.wal;
21  
22  import java.io.DataInput;
23  import java.io.DataOutput;
24  import java.io.FileNotFoundException;
25  import java.io.IOException;
26  import java.io.OutputStream;
27  import java.io.UnsupportedEncodingException;
28  import java.lang.reflect.InvocationTargetException;
29  import java.lang.reflect.Method;
30  import java.net.URLEncoder;
31  import java.util.ArrayList;
32  import java.util.Arrays;
33  import java.util.Collections;
34  import java.util.LinkedList;
35  import java.util.List;
36  import java.util.Map;
37  import java.util.NavigableSet;
38  import java.util.SortedMap;
39  import java.util.TreeMap;
40  import java.util.TreeSet;
41  import java.util.UUID;
42  import java.util.concurrent.ConcurrentSkipListMap;
43  import java.util.concurrent.CopyOnWriteArrayList;
44  import java.util.concurrent.atomic.AtomicBoolean;
45  import java.util.concurrent.atomic.AtomicInteger;
46  import java.util.concurrent.atomic.AtomicLong;
47  import java.util.concurrent.locks.Lock;
48  import java.util.concurrent.locks.ReentrantLock;
49  import java.util.regex.Matcher;
50  import java.util.regex.Pattern;
51  
52  import org.apache.commons.logging.Log;
53  import org.apache.commons.logging.LogFactory;
54  import org.apache.hadoop.conf.Configuration;
55  import org.apache.hadoop.fs.FSDataOutputStream;
56  import org.apache.hadoop.fs.FileStatus;
57  import org.apache.hadoop.fs.FileSystem;
58  import org.apache.hadoop.fs.Path;
59  import org.apache.hadoop.fs.PathFilter;
60  import org.apache.hadoop.fs.Syncable;
61  import org.apache.hadoop.hbase.HBaseConfiguration;
62  import org.apache.hadoop.hbase.HBaseFileSystem;
63  import org.apache.hadoop.hbase.HConstants;
64  import org.apache.hadoop.hbase.HRegionInfo;
65  import org.apache.hadoop.hbase.HTableDescriptor;
66  import org.apache.hadoop.hbase.KeyValue;
67  import org.apache.hadoop.hbase.ServerName;
68  import org.apache.hadoop.hbase.util.Bytes;
69  import org.apache.hadoop.hbase.util.ClassSize;
70  import org.apache.hadoop.hbase.util.FSUtils;
71  import org.apache.hadoop.hbase.util.HasThread;
72  import org.apache.hadoop.hbase.util.Threads;
73  import org.apache.hadoop.io.Writable;
74  import org.apache.hadoop.util.StringUtils;
75  
76  /**
77   * HLog stores all the edits to the HStore.  It is the HBase write-ahead log
78   * implementation.
79   *
80   * It performs logfile-rolling, so external callers are not aware that the
81   * underlying file is being rolled.
82   *
83   * <p>
84   * There is one HLog per RegionServer.  All edits for all Regions carried by
85   * a particular RegionServer are entered first in the HLog.
86   *
87   * <p>
88   * Each HRegion is identified by a unique <code>long</code> id. HRegions do
89   * not need to declare themselves before using the HLog; they simply include
90   * their HRegion-id in the <code>append</code> or
91   * <code>completeCacheFlush</code> calls.
92   *
93   * <p>
94   * An HLog consists of multiple on-disk files, which have a chronological order.
95   * As data is flushed to other (better) on-disk structures, the log becomes
96   * obsolete. We can destroy all the log messages for a given HRegion-id up to
97   * the most-recent CACHEFLUSH message from that HRegion.
98   *
99   * <p>
100  * It's only practical to delete entire files. Thus, we delete an entire on-disk
101  * file F when all of the messages in F have a log-sequence-id that's older
102  * (smaller) than the most-recent CACHEFLUSH message for every HRegion that has
103  * a message in F.
104  *
105  * <p>
106  * Synchronized methods can never execute in parallel. However, between the
107  * start of a cache flush and the completion point, appends are allowed but log
108  * rolling is not. To prevent log rolling taking place during this period, a
109  * separate reentrant lock is used.
110  *
111  * <p>To read an HLog, call {@link #getReader(org.apache.hadoop.fs.FileSystem,
112  * org.apache.hadoop.fs.Path, org.apache.hadoop.conf.Configuration)}.
113  *
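 * <p>A minimal usage sketch (illustrative only; the directory Paths below are
 * hypothetical, they are normally supplied by the region server, and error
 * handling is omitted):
 * <pre>
 *   Configuration conf = HBaseConfiguration.create();
 *   FileSystem fs = FileSystem.get(conf);
 *   Path logDir = new Path("/hbase/.logs/example-regionserver");   // hypothetical
 *   Path oldLogDir = new Path("/hbase/.oldlogs");                  // hypothetical
 *   HLog wal = new HLog(fs, logDir, oldLogDir, conf);
 *   // ... append edits via append()/appendNoSync() and sync() ...
 *   wal.rollWriter();   // start writing to a new log file
 *   wal.close();
 * </pre>
 *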
114  */
115 public class HLog implements Syncable {
116   static final Log LOG = LogFactory.getLog(HLog.class);
117   public static final byte [] METAFAMILY = Bytes.toBytes("METAFAMILY");
118   static final byte [] METAROW = Bytes.toBytes("METAROW");
119 
120   /** File Extension used while splitting an HLog into regions (HBASE-2312) */
121   public static final String SPLITTING_EXT = "-splitting";
122   public static final boolean SPLIT_SKIP_ERRORS_DEFAULT = false;
123   /** The META region's HLog filename extension */
124   public static final String META_HLOG_FILE_EXTN = ".meta";
125   public static final String SEPARATE_HLOG_FOR_META = "hbase.regionserver.separate.hlog.for.meta";
126 
127   /*
128    * Name of directory that holds recovered edits written by the wal log
129    * splitting code, one per region
130    */
131   public static final String RECOVERED_EDITS_DIR = "recovered.edits";
132   private static final Pattern EDITFILES_NAME_PATTERN =
133     Pattern.compile("-?[0-9]+");
134   public static final String RECOVERED_LOG_TMPFILE_SUFFIX = ".temp";
135   
136   private final FileSystem fs;
137   private final Path dir;
138   private final Configuration conf;
139   private final HLogFileSystem hlogFs;
140   // Listeners that are called on WAL events.
141   private List<WALActionsListener> listeners =
142     new CopyOnWriteArrayList<WALActionsListener>();
143   private final long optionalFlushInterval;
144   private final long blocksize;
145   private final String prefix;
146   private final AtomicLong unflushedEntries = new AtomicLong(0);
147   private volatile long syncedTillHere = 0;
148   private long lastDeferredTxid;
149   private final Path oldLogDir;
150   private volatile boolean logRollRunning;
151 
152   private static Class<? extends Writer> logWriterClass;
153   private static Class<? extends Reader> logReaderClass;
154 
155   private WALCoprocessorHost coprocessorHost;
156 
157   static void resetLogReaderClass() {
158     HLog.logReaderClass = null;
159   }
160 
161   private FSDataOutputStream hdfs_out; // FSDataOutputStream associated with the current SequenceFile.writer
162   // Minimum tolerable replicas; if the actual value is lower than this,
163   // rollWriter will be triggered.
164   private int minTolerableReplication;
165   private Method getNumCurrentReplicas; // refers to DFSOutputStream.getNumCurrentReplicas
166   final static Object [] NO_ARGS = new Object []{};
167 
168   public interface Reader {
169     void init(FileSystem fs, Path path, Configuration c) throws IOException;
170     void close() throws IOException;
171     Entry next() throws IOException;
172     Entry next(Entry reuse) throws IOException;
173     void seek(long pos) throws IOException;
174     long getPosition() throws IOException;
175     void reset() throws IOException;
176   }
177 
178   public interface Writer {
179     void init(FileSystem fs, Path path, Configuration c) throws IOException;
180     void close() throws IOException;
181     void sync() throws IOException;
182     void append(Entry entry) throws IOException;
183     long getLength() throws IOException;
184   }
185 
186   /*
187    * Current log file.
188    */
189   Writer writer;
190 
191   /*
192    * Map of all log files but the current one.
193    */
194   final SortedMap<Long, Path> outputfiles =
195     Collections.synchronizedSortedMap(new TreeMap<Long, Path>());
196 
197   /*
198    * Map of encoded region names to their most recent sequence/edit id in their
199    * memstore.
200    */
201   private final ConcurrentSkipListMap<byte [], Long> lastSeqWritten =
202     new ConcurrentSkipListMap<byte [], Long>(Bytes.BYTES_COMPARATOR);
203 
204   private volatile boolean closed = false;
205 
206   private final AtomicLong logSeqNum = new AtomicLong(0);
207 
208   private boolean forMeta = false;
209 
210   // The timestamp (in ms) when the log file was created.
211   private volatile long filenum = -1;
212 
213   // Number of transactions in the current HLog.
214   private final AtomicInteger numEntries = new AtomicInteger(0);
215 
216   // If the live datanode count is lower than the default replica count,
217   // rollWriter will be triggered on each sync (so rolls would be requested
218   // repeatedly within a short time). This counter is used as a workaround to
219   // slow down the roll frequency triggered by checkLowReplication().
220   private volatile int consecutiveLogRolls = 0;
221   private final int lowReplicationRollLimit;
222 
223   // If consecutiveLogRolls is larger than lowReplicationRollLimit,
224   // then disable the rolling in checkLowReplication().
225   // Re-enable it once replication recovers.
226   private volatile boolean lowReplicationRollEnabled = true;
227 
228   // If the log grows larger than this size, roll it. This is typically 0.95 times
229   // the default HDFS block size.
230   private final long logrollsize;
231 
232   // This lock prevents starting a log roll during a cache flush.
233   // synchronized is insufficient because a cache flush spans two method calls.
234   private final Lock cacheFlushLock = new ReentrantLock();
235 
236   // We synchronize on updateLock to prevent updates and to prevent a log roll
237   // during an update.
238   // The lock is held during appends.
239   private final Object updateLock = new Object();
240   private final Object flushLock = new Object();
241 
242   private final boolean enabled;
243 
244   /*
245    * If there are more than this many logs, force a flush of the oldest region so
246    * its oldest edit goes to disk.  If too many accumulate and we crash, replay
247    * will take forever.  Keeps the number of logs tidy.
248    */
249   private final int maxLogs;
250 
251   /**
252    * Thread that handles optional sync'ing
253    */
254   private final LogSyncer logSyncer;
255 
256   /** Number of log close errors tolerated before we abort */
257   private final int closeErrorsTolerated;
258 
259   private final AtomicInteger closeErrorCount = new AtomicInteger();
260 
261   /**
262    * Pattern used to validate an HLog file name
263    */
264   private static final Pattern pattern = 
265       Pattern.compile(".*\\.\\d*("+HLog.META_HLOG_FILE_EXTN+")*");
266 
267   static byte [] COMPLETE_CACHE_FLUSH;
268   static {
269     try {
270       COMPLETE_CACHE_FLUSH =
271         "HBASE::CACHEFLUSH".getBytes(HConstants.UTF8_ENCODING);
272     } catch (UnsupportedEncodingException e) {
273       assert(false);
274     }
275   }
276 
277   public static class Metric {
278     public long min = Long.MAX_VALUE;
279     public long max = 0;
280     public long total = 0;
281     public int count = 0;
282 
283     synchronized void inc(final long val) {
284       min = Math.min(min, val);
285       max = Math.max(max, val);
286       total += val;
287       ++count;
288     }
289 
290     synchronized Metric get() {
291       Metric copy = new Metric();
292       copy.min = min;
293       copy.max = max;
294       copy.total = total;
295       copy.count = count;
296       this.min = Long.MAX_VALUE;
297       this.max = 0;
298       this.total = 0;
299       this.count = 0;
300       return copy;
301     }
302   }
303 
304   // For measuring latency of writes
305   private static Metric writeTime = new Metric();
306   private static Metric writeSize = new Metric();
307   // For measuring latency of syncs
308   private static Metric syncTime = new Metric();
309   // For measuring slow HLog appends
310   private static AtomicLong slowHLogAppendCount = new AtomicLong();
311   private static Metric slowHLogAppendTime = new Metric();
312   
313   public static Metric getWriteTime() {
314     return writeTime.get();
315   }
316 
317   public static Metric getWriteSize() {
318     return writeSize.get();
319   }
320 
321   public static Metric getSyncTime() {
322     return syncTime.get();
323   }
324 
325   public static long getSlowAppendCount() {
326     return slowHLogAppendCount.get();
327   }
328   
329   public static Metric getSlowAppendTime() {
330     return slowHLogAppendTime.get();
331   }
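  // Illustrative note: each Metric getter above returns a snapshot and resets the
  // underlying counters, so a caller typically derives a per-interval average.
  // A minimal sketch (variable names hypothetical):
  //   HLog.Metric syncStats = HLog.getSyncTime();
  //   long avgSyncMs = syncStats.count == 0 ? 0 : syncStats.total / syncStats.count;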
332 
333   /**
334    * Constructor.
335    *
336    * @param fs filesystem handle
337    * @param dir path to where hlogs are stored
338    * @param oldLogDir path to where hlogs are archived
339    * @param conf configuration to use
340    * @throws IOException
341    */
342   public HLog(final FileSystem fs, final Path dir, final Path oldLogDir,
343               final Configuration conf)
344   throws IOException {
345     this(fs, dir, oldLogDir, conf, null, true, null, false);
346   }
347 
348   /**
349    * Create an edit log at the given <code>dir</code> location.
350    *
351    * You should never have to load an existing log. If there is a log at
352    * startup, it should have already been processed and deleted by the time the
353    * HLog object is started up.
354    *
355    * @param fs filesystem handle
356    * @param dir path to where hlogs are stored
357    * @param oldLogDir path to where hlogs are archived
358    * @param conf configuration to use
359    * @param listeners Listeners on WAL events. Listeners passed here will
360    * be registered before we do anything else; e.g. before the constructor
361    * calls {@link #rollWriter()}.
362    * @param prefix should always be the hostname and port in a distributed
363    *        environment, and it will be URL encoded before being used.
364    *        If prefix is null, "hlog" will be used.
365    * @throws IOException
366    */
367   public HLog(final FileSystem fs, final Path dir, final Path oldLogDir,
368       final Configuration conf, final List<WALActionsListener> listeners,
369       final String prefix) throws IOException {
370     this(fs, dir, oldLogDir, conf, listeners, true, prefix, false);
371   }
372 
373   /**
374    * Create an edit log at the given <code>dir</code> location.
375    *
376    * You should never have to load an existing log. If there is a log at
377    * startup, it should have already been processed and deleted by the time the
378    * HLog object is started up.
379    *
380    * @param fs filesystem handle
381    * @param dir path to where hlogs are stored
382    * @param oldLogDir path to where hlogs are archived
383    * @param conf configuration to use
384    * @param listeners Listeners on WAL events. Listeners passed here will
385    * be registered before we do anything else; e.g. before the constructor
386    * calls {@link #rollWriter()}.
387    * @param failIfLogDirExists If true, an IOException will be thrown if dir already exists.
388    * @param prefix should always be the hostname and port in a distributed
389    *        environment, and it will be URL encoded before being used.
390    *        If prefix is null, "hlog" will be used.
391    * @param forMeta if this hlog is meant for meta updates
392    * @throws IOException
393    */
394   public HLog(final FileSystem fs, final Path dir, final Path oldLogDir,
395       final Configuration conf, final List<WALActionsListener> listeners,
396       final boolean failIfLogDirExists, final String prefix, boolean forMeta)
397   throws IOException {
398     super();
399     this.fs = fs;
400     this.dir = dir;
401     this.conf = conf;
402     this.hlogFs = new HLogFileSystem(conf);
403     if (listeners != null) {
404       for (WALActionsListener i: listeners) {
405         registerWALActionsListener(i);
406       }
407     }
408     this.blocksize = conf.getLong("hbase.regionserver.hlog.blocksize",
409         FSUtils.getDefaultBlockSize(this.fs, this.dir));
410     // Roll at 95% of block size.
411     float multi = conf.getFloat("hbase.regionserver.logroll.multiplier", 0.95f);
412     this.logrollsize = (long)(this.blocksize * multi);
413     this.optionalFlushInterval =
414       conf.getLong("hbase.regionserver.optionallogflushinterval", 1 * 1000);
415     boolean dirExists = false;
416     if (failIfLogDirExists && (dirExists = this.fs.exists(dir))) {
417       throw new IOException("Target HLog directory already exists: " + dir);
418     }
419     if (!dirExists && !HBaseFileSystem.makeDirOnFileSystem(fs, dir)) {
420       throw new IOException("Unable to mkdir " + dir);
421     }
422     this.oldLogDir = oldLogDir;
423     if (!fs.exists(oldLogDir) && !HBaseFileSystem.makeDirOnFileSystem(fs, oldLogDir)) {
424       throw new IOException("Unable to mkdir " + this.oldLogDir);
425     }
426     this.forMeta = forMeta;
427     this.maxLogs = conf.getInt("hbase.regionserver.maxlogs", 32);
428     this.minTolerableReplication = conf.getInt(
429         "hbase.regionserver.hlog.tolerable.lowreplication",
430         FSUtils.getDefaultReplication(this.fs, this.dir));
431     this.lowReplicationRollLimit = conf.getInt(
432         "hbase.regionserver.hlog.lowreplication.rolllimit", 5);
433     this.enabled = conf.getBoolean("hbase.regionserver.hlog.enabled", true);
434     this.closeErrorsTolerated = conf.getInt(
435         "hbase.regionserver.logroll.errors.tolerated", 0);
436 
437     LOG.info("HLog configuration: blocksize=" +
438       StringUtils.byteDesc(this.blocksize) +
439       ", rollsize=" + StringUtils.byteDesc(this.logrollsize) +
440       ", enabled=" + this.enabled +
441       ", optionallogflushinterval=" + this.optionalFlushInterval + "ms");
442     // If prefix is null||empty then just name it hlog
443     this.prefix = prefix == null || prefix.isEmpty() ?
444         "hlog" : URLEncoder.encode(prefix, "UTF8");
445     // rollWriter sets this.hdfs_out if it can.
446     rollWriter();
447 
448     // handle the reflection necessary to call getNumCurrentReplicas()
449     this.getNumCurrentReplicas = getGetNumCurrentReplicas(this.hdfs_out);
450 
451     logSyncer = new LogSyncer(this.optionalFlushInterval);
452     // When optionalFlushInterval is set as 0, don't start a thread for deferred log sync.
453     if (this.optionalFlushInterval > 0) {
454       Threads.setDaemonThreadRunning(logSyncer.getThread(), Thread.currentThread().getName()
455           + ".logSyncer");
456     } else {
457       LOG.info("hbase.regionserver.optionallogflushinterval is set as "
458           + this.optionalFlushInterval + ". Deferred log syncing won't work. "
459           + "Any Mutation, marked to be deferred synced, will be flushed immediately.");
460     }
461     coprocessorHost = new WALCoprocessorHost(this, conf);
462   }
463 
464   /**
465    * Find the 'getNumCurrentReplicas' method on the passed <code>os</code> stream.
466    * @return Method or null.
467    */
468   private Method getGetNumCurrentReplicas(final FSDataOutputStream os) {
469     Method m = null;
470     if (os != null) {
471       Class<? extends OutputStream> wrappedStreamClass = os.getWrappedStream()
472           .getClass();
473       try {
474         m = wrappedStreamClass.getDeclaredMethod("getNumCurrentReplicas",
475             new Class<?>[] {});
476         m.setAccessible(true);
477       } catch (NoSuchMethodException e) {
478         LOG.info("FileSystem's output stream doesn't support"
479             + " getNumCurrentReplicas; --HDFS-826 not available; fsOut="
480             + wrappedStreamClass.getName());
481       } catch (SecurityException e) {
482         LOG.info("Doesn't have access to getNumCurrentReplicas on "
483           + "FileSystem's output stream --HDFS-826 not available; fsOut="
484             + wrappedStreamClass.getName(), e);
485         m = null; // could happen on setAccessible()
486       }
487     }
488     if (m != null) {
489       LOG.info("Using getNumCurrentReplicas--HDFS-826");
490     }
491     return m;
492   }
493 
494   public void registerWALActionsListener(final WALActionsListener listener) {
495     this.listeners.add(listener);
496   }
497 
498   public boolean unregisterWALActionsListener(final WALActionsListener listener) {
499     return this.listeners.remove(listener);
500   }
501 
502   /**
503    * @return Current state of the monotonically increasing file id.
504    */
505   public long getFilenum() {
506     return this.filenum;
507   }
508 
509   /**
510    * Called by HRegionServer when it opens a new region to ensure that log
511    * sequence numbers are always greater than the latest sequence number of the
512    * region being brought on-line.
513    *
514    * @param newvalue We'll set log edit/sequence number to this value if it
515    * is greater than the current value.
516    */
517   public void setSequenceNumber(final long newvalue) {
518     for (long id = this.logSeqNum.get(); id < newvalue &&
519         !this.logSeqNum.compareAndSet(id, newvalue); id = this.logSeqNum.get()) {
520       // This could spin on occasion but better the occasional spin than locking
521       // every increment of sequence number.
522       LOG.debug("Changed sequenceid from " + logSeqNum + " to " + newvalue);
523     }
524   }
525 
526   /**
527    * @return log sequence number
528    */
529   public long getSequenceNumber() {
530     return logSeqNum.get();
531   }
532 
533   /**
534    * Method used internally by this class and for tests only.
535    * @return The wrapped stream our writer is using; it is not the
536    * writer's 'out' FSDataOutputStream but the stream that 'out' wraps
537    * (in HDFS it is an instance of DFSDataOutputStream).
538    */
539   // usage: see TestLogRolling.java
540   OutputStream getOutputStream() {
541     return this.hdfs_out.getWrappedStream();
542   }
543 
544   /**
545    * Roll the log writer. That is, start writing log messages to a new file.
546    *
547    * Because a log cannot be rolled during a cache flush, and a cache flush
548    * spans two method calls, a special lock needs to be obtained so that a cache
549    * flush cannot start when the log is being rolled and the log cannot be
550    * rolled during a cache flush.
551    *
552    * <p>Note that this method cannot be synchronized because it is possible that
553    * startCacheFlush runs, obtaining the cacheFlushLock, then this method could
554    * start, which would obtain the lock on this but block on obtaining the
555    * cacheFlushLock; completeCacheFlush could then be called, which would wait
556    * for the lock on this and consequently never release the cacheFlushLock.
557    *
558    * @return If lots of logs, flush the returned regions so next time through
559    * we can clean logs. Returns null if nothing to flush.  Names are actual
560    * region names as returned by {@link HRegionInfo#getEncodedName()}
561    * @throws org.apache.hadoop.hbase.regionserver.wal.FailedLogCloseException
562    * @throws IOException
563    */
564   public byte [][] rollWriter() throws FailedLogCloseException, IOException {
565     return rollWriter(false);
566   }
567 
568   /**
569    * Roll the log writer. That is, start writing log messages to a new file.
570    *
571    * Because a log cannot be rolled during a cache flush, and a cache flush
572    * spans two method calls, a special lock needs to be obtained so that a cache
573    * flush cannot start when the log is being rolled and the log cannot be
574    * rolled during a cache flush.
575    *
576    * <p>Note that this method cannot be synchronized because it is possible that
577    * startCacheFlush runs, obtaining the cacheFlushLock, then this method could
578    * start, which would obtain the lock on this but block on obtaining the
579    * cacheFlushLock; completeCacheFlush could then be called, which would wait
580    * for the lock on this and consequently never release the cacheFlushLock.
581    *
582    * @param force If true, force creation of a new writer even if no entries
583    * have been written to the current writer
584    * @return If lots of logs, flush the returned regions so next time through
585    * we can clean logs. Returns null if nothing to flush.  Names are actual
586    * region names as returned by {@link HRegionInfo#getEncodedName()}
587    * @throws org.apache.hadoop.hbase.regionserver.wal.FailedLogCloseException
588    * @throws IOException
589    */
590   public byte [][] rollWriter(boolean force)
591       throws FailedLogCloseException, IOException {
592     // Return if nothing to flush.
593     if (!force && this.writer != null && this.numEntries.get() <= 0) {
594       return null;
595     }
596     byte [][] regionsToFlush = null;
597     this.cacheFlushLock.lock();
598     this.logRollRunning = true;
599     try {
600       if (closed) {
601         LOG.debug("HLog closed.  Skipping rolling of writer");
602         return regionsToFlush;
603       }
604       // Do all the preparation outside of the updateLock to block
605       // incoming writes for as little time as possible
606       long currentFilenum = this.filenum;
607       Path oldPath = null;
608       if (currentFilenum > 0) {
609         // computeFilename will take care of the meta hlog filename
610         oldPath = computeFilename(currentFilenum);
611       }
612       this.filenum = System.currentTimeMillis();
613       Path newPath = computeFilename();
614 
615       // Tell our listeners that a new log is about to be created
616       if (!this.listeners.isEmpty()) {
617         for (WALActionsListener i : this.listeners) {
618           i.preLogRoll(oldPath, newPath);
619         }
620       }
621       HLog.Writer nextWriter = this.createWriterInstance(fs, newPath, conf);
622       // Can we get at the dfsclient outputstream?  If an instance of
623       // SFLW, it'll have done the necessary reflection to get at the
624       // protected field name.
625       FSDataOutputStream nextHdfsOut = null;
626       if (nextWriter instanceof SequenceFileLogWriter) {
627         nextHdfsOut = ((SequenceFileLogWriter)nextWriter).getWriterFSDataOutputStream();
628       }
629 
630       synchronized (updateLock) {
631         // Clean up current writer.
632         Path oldFile = cleanupCurrentWriter(currentFilenum);
633         this.writer = nextWriter;
634         this.hdfs_out = nextHdfsOut;
635 
636         LOG.info((oldFile != null?
637             "Roll " + FSUtils.getPath(oldFile) + ", entries=" +
638             this.numEntries.get() +
639             ", filesize=" +
640             this.fs.getFileStatus(oldFile).getLen() + ". ": "") +
641           " for " + FSUtils.getPath(newPath));
642         this.numEntries.set(0);
643       }
644       // Tell our listeners that a new log was created
645       if (!this.listeners.isEmpty()) {
646         for (WALActionsListener i : this.listeners) {
647           i.postLogRoll(oldPath, newPath);
648         }
649       }
650 
651       // Can we delete any of the old log files?
652       if (this.outputfiles.size() > 0) {
653         if (this.lastSeqWritten.isEmpty()) {
654           LOG.debug("Last sequenceid written is empty. Deleting all old hlogs");
655           // If so, then no new writes have come in since all regions were
656           // flushed (and removed from the lastSeqWritten map). Means can
657           // remove all but currently open log file.
658           for (Map.Entry<Long, Path> e : this.outputfiles.entrySet()) {
659             archiveLogFile(e.getValue(), e.getKey());
660           }
661           this.outputfiles.clear();
662         } else {
663           regionsToFlush = cleanOldLogs();
664         }
665       }
666     } finally {
667       this.logRollRunning = false;
668       this.cacheFlushLock.unlock();
669     }
670     return regionsToFlush;
671   }
672 
673   /**
674    * This method allows subclasses to inject different writers without having to
675    * extend other methods like rollWriter().
676    * 
677    * @param fs
678    * @param path
679    * @param conf
680    * @return Writer instance
681    * @throws IOException
682    */
683   protected Writer createWriterInstance(final FileSystem fs, final Path path,
684       final Configuration conf) throws IOException {
685     if (forMeta) {
686       //TODO: set a higher replication for the hlog files (HBASE-6773)
687     }
688     return this.hlogFs.createWriter(fs, conf, path);
689   }
690 
691   /**
692    * Get a reader for the WAL.
693    * The proper way to tail a log that can be under construction is to first use this method
694    * to get a reader then call {@link HLog.Reader#reset()} to see the new data. It will also
695    * take care of keeping implementation-specific context (like compression).
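   * <p>Illustrative sketch only (the hlog Path is assumed to be supplied by the
   * caller and to point at an existing log file):
   * <pre>
   *   HLog.Reader reader = HLog.getReader(fs, hlogPath, conf);
   *   HLog.Entry entry;
   *   while ((entry = reader.next()) != null) {
   *     // process entry.getKey() and entry.getEdit()
   *   }
   *   reader.reset();   // pick up any data appended since the reader was opened
   *   reader.close();
   * </pre>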
696    * @param fs
697    * @param path
698    * @param conf
699    * @return A WAL reader.  Close when done with it.
700    * @throws IOException
701    */
702   public static Reader getReader(final FileSystem fs, final Path path,
703                                  Configuration conf)
704       throws IOException {
705     try {
706 
707       if (logReaderClass == null) {
708 
709         logReaderClass = conf.getClass("hbase.regionserver.hlog.reader.impl",
710             SequenceFileLogReader.class, Reader.class);
711       }
712 
713 
714       HLog.Reader reader = logReaderClass.newInstance();
715       reader.init(fs, path, conf);
716       return reader;
717     } catch (IOException e) {
718       throw e;
719     }
720     catch (Exception e) {
721       throw new IOException("Cannot get log reader", e);
722     }
723   }
724 
725   /**
726    * Get a writer for the WAL.
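   * <p>Illustrative sketch only (the target Path and the logKey/walEdit pair are
   * assumed to be supplied by the caller):
   * <pre>
   *   HLog.Writer writer = HLog.createWriter(fs, walPath, conf);
   *   writer.append(new HLog.Entry(logKey, walEdit));
   *   writer.sync();
   *   writer.close();
   * </pre>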
727    * @param path
728    * @param conf
729    * @return A WAL writer.  Close when done with it.
730    * @throws IOException
731    */
732   public static Writer createWriter(final FileSystem fs,
733       final Path path, Configuration conf)
734   throws IOException {
735     try {
736       if (logWriterClass == null) {
737         logWriterClass = conf.getClass("hbase.regionserver.hlog.writer.impl",
738             SequenceFileLogWriter.class, Writer.class);
739       }
740       HLog.Writer writer = (HLog.Writer) logWriterClass.newInstance();
741       writer.init(fs, path, conf);
742       return writer;
743     } catch (Exception e) {
744       throw new IOException("cannot get log writer", e);
745     }
746   }
747 
748   /*
749    * Clean up old commit logs.
750    * @return If lots of logs, flush the returned region so next time through
751    * we can clean logs. Returns null if nothing to flush.  Returns array of
752    * encoded region names to flush.
753    * @throws IOException
754    */
755   private byte [][] cleanOldLogs() throws IOException {
756     Long oldestOutstandingSeqNum = getOldestOutstandingSeqNum();
757     // Get the set of all log files whose last sequence number is smaller than
758     // the oldest edit's sequence number.
759     TreeSet<Long> sequenceNumbers =
760       new TreeSet<Long>(this.outputfiles.headMap(
761         (Long.valueOf(oldestOutstandingSeqNum.longValue()))).keySet());
762     // Now remove old log files (if any)
763     int logsToRemove = sequenceNumbers.size();
764     if (logsToRemove > 0) {
765       if (LOG.isDebugEnabled()) {
766         // Find associated region; helps debugging.
767         byte [] oldestRegion = getOldestRegion(oldestOutstandingSeqNum);
768         LOG.debug("Found " + logsToRemove + " hlogs to remove" +
769           " out of total " + this.outputfiles.size() + ";" +
770           " oldest outstanding sequenceid is " + oldestOutstandingSeqNum +
771           " from region " + Bytes.toStringBinary(oldestRegion));
772       }
773       for (Long seq : sequenceNumbers) {
774         archiveLogFile(this.outputfiles.remove(seq), seq);
775       }
776     }
777 
778     // If too many log files, figure which regions we need to flush.
779     // Array is an array of encoded region names.
780     byte [][] regions = null;
781     int logCount = this.outputfiles == null? 0: this.outputfiles.size();
782     if (logCount > this.maxLogs && logCount > 0) {
783       // This is an array of encoded region names.
784       regions = findMemstoresWithEditsEqualOrOlderThan(this.outputfiles.firstKey(),
785         this.lastSeqWritten);
786       if (regions != null) {
787         StringBuilder sb = new StringBuilder();
788         for (int i = 0; i < regions.length; i++) {
789           if (i > 0) sb.append(", ");
790           sb.append(Bytes.toStringBinary(regions[i]));
791         }
792         LOG.info("Too many hlogs: logs=" + logCount + ", maxlogs=" +
793            this.maxLogs + "; forcing flush of " + regions.length + " region(s): " +
794            sb.toString());
795       }
796     }
797     return regions;
798   }
799 
800   /**
801    * Return regions (memstores) whose edits have sequence ids less than or
802    * equal to the passed <code>oldestWALseqid</code>.
803    * @param oldestWALseqid
804    * @param regionsToSeqids Encoded region names to sequence ids
805    * @return All regions whose seqid is <= <code>oldestWALseqid</code> (Not
806    * necessarily in order).  Null if no regions found.
807    */
808   static byte [][] findMemstoresWithEditsEqualOrOlderThan(final long oldestWALseqid,
809       final Map<byte [], Long> regionsToSeqids) {
810     //  This method is static so it can be unit tested more easily.
811     List<byte []> regions = null;
812     for (Map.Entry<byte [], Long> e: regionsToSeqids.entrySet()) {
813       if (e.getValue().longValue() <= oldestWALseqid) {
814         if (regions == null) regions = new ArrayList<byte []>();
815         // Key is encoded region name.
816         regions.add(e.getKey());
817       }
818     }
819     return regions == null?
820       null: regions.toArray(new byte [][] {HConstants.EMPTY_BYTE_ARRAY});
821   }
822 
823   /*
824    * @return Logs older than this id are safe to remove.
825    */
826   private Long getOldestOutstandingSeqNum() {
827     return Collections.min(this.lastSeqWritten.values());
828   }
829 
830   /**
831    * @param oldestOutstandingSeqNum
832    * @return (Encoded) name of oldest outstanding region.
833    */
834   private byte [] getOldestRegion(final Long oldestOutstandingSeqNum) {
835     byte [] oldestRegion = null;
836     for (Map.Entry<byte [], Long> e: this.lastSeqWritten.entrySet()) {
837       if (e.getValue().longValue() == oldestOutstandingSeqNum.longValue()) {
838         // Key is encoded region name.
839         oldestRegion = e.getKey();
840         break;
841       }
842     }
843     return oldestRegion;
844   }
845 
846   /*
847    * Cleans up the current writer, closing it and adding it to outputfiles.
848    * Presumes we're operating inside an updateLock scope.
849    * @return Path to current writer or null if none.
850    * @throws IOException
851    */
852   Path cleanupCurrentWriter(final long currentfilenum) throws IOException {
853     Path oldFile = null;
854     if (this.writer != null) {
855       // Close the current writer, get a new one.
856       try {
857         // Wait till all current transactions are written to the hlog.
858         // No new transactions can occur because we have the updatelock.
859         if (this.unflushedEntries.get() != this.syncedTillHere) {
860           LOG.debug("cleanupCurrentWriter " +
861                    " waiting for transactions to get synced " +
862                    " total " + this.unflushedEntries.get() +
863                    " synced till here " + syncedTillHere);
864           sync();
865         }
866         this.writer.close();
867         this.writer = null;
868         closeErrorCount.set(0);
869       } catch (IOException e) {
870         LOG.error("Failed close of HLog writer", e);
871         int errors = closeErrorCount.incrementAndGet();
872         if (errors <= closeErrorsTolerated && !hasDeferredEntries()) {
873           LOG.warn("Riding over HLog close failure! error count="+errors);
874         } else {
875           if (hasDeferredEntries()) {
876             LOG.error("Aborting due to unflushed edits in HLog");
877           }
878           // Failed close of log file.  Means we're losing edits.  For now,
879           // shut ourselves down to minimize loss.  Alternative is to try and
880           // keep going.  See HBASE-930.
881           FailedLogCloseException flce =
882             new FailedLogCloseException("#" + currentfilenum);
883           flce.initCause(e);
884           throw flce;
885         }
886       }
887       if (currentfilenum >= 0) {
888         oldFile = computeFilename(currentfilenum);
889         this.outputfiles.put(Long.valueOf(this.logSeqNum.get()), oldFile);
890       }
891     }
892     return oldFile;
893   }
894 
895   private void archiveLogFile(final Path p, final Long seqno) throws IOException {
896     Path newPath = getHLogArchivePath(this.oldLogDir, p);
897     LOG.info("moving old hlog file " + FSUtils.getPath(p) +
898       " whose highest sequenceid is " + seqno + " to " +
899       FSUtils.getPath(newPath));
900 
901     // Tell our listeners that a log is going to be archived.
902     if (!this.listeners.isEmpty()) {
903       for (WALActionsListener i : this.listeners) {
904         i.preLogArchive(p, newPath);
905       }
906     }
907     if (!HBaseFileSystem.renameAndSetModifyTime(this.fs, p, newPath)) {
908       throw new IOException("Unable to rename " + p + " to " + newPath);
909     }
910     // Tell our listeners that a log has been archived.
911     if (!this.listeners.isEmpty()) {
912       for (WALActionsListener i : this.listeners) {
913         i.postLogArchive(p, newPath);
914       }
915     }
916   }
917 
918   /**
919    * This is a convenience method that computes a new filename
920    * using the current HLog file-number.
921    * @return Path
922    */
923   protected Path computeFilename() {
924     return computeFilename(this.filenum);
925   }
926 
927   /**
928    * This is a convenience method that computes a new filename with a given
929    * file-number.
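   * For example (hypothetical names), with prefix "example-rs%2C60020" and
   * filenum 1357923456789 the resulting path is
   * &lt;dir&gt;/example-rs%2C60020.1357923456789, with the ".meta" extension
   * appended when this is a meta HLog.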
930    * @param filenum to use
931    * @return Path
932    */
933   protected Path computeFilename(long filenum) {
934     if (filenum < 0) {
935       throw new RuntimeException("hlog file number can't be < 0");
936     }
937     String child = prefix + "." + filenum;
938     if (forMeta) {
939       child += HLog.META_HLOG_FILE_EXTN;
940     }
941     return new Path(dir, child);
942   }
943 
944   public static boolean isMetaFile(Path p) {
945     if (p.getName().endsWith(HLog.META_HLOG_FILE_EXTN)) {
946       return true;
947     }
948     return false;
949   }
950 
951   /**
952    * Shut down the log and delete the log directory
953    *
954    * @throws IOException
955    */
956   public void closeAndDelete() throws IOException {
957     close();
958     if (!fs.exists(this.dir)) return;
959     FileStatus[] files = fs.listStatus(this.dir);
960     for(FileStatus file : files) {
961 
962       Path p = getHLogArchivePath(this.oldLogDir, file.getPath());
963       // Tell our listeners that a log is going to be archived.
964       if (!this.listeners.isEmpty()) {
965         for (WALActionsListener i : this.listeners) {
966           i.preLogArchive(file.getPath(), p);
967         }
968       }
969       if (!HBaseFileSystem.renameAndSetModifyTime(fs, file.getPath(), p)) {
970         throw new IOException("Unable to rename " + file.getPath() + " to " + p);
971       }
972       // Tell our listeners that a log was archived.
973       if (!this.listeners.isEmpty()) {
974         for (WALActionsListener i : this.listeners) {
975           i.postLogArchive(file.getPath(), p);
976         }
977       }
978     }
979     LOG.debug("Moved " + files.length + " log files to " +
980       FSUtils.getPath(this.oldLogDir));
981     if (!HBaseFileSystem.deleteDirFromFileSystem(fs, dir)) {
982       LOG.info("Unable to delete " + dir);
983     }
984   }
985 
986   /**
987    * Shut down the log.
988    *
989    * @throws IOException
990    */
991   public void close() throws IOException {
992     // When optionalFlushInterval is 0, the logSyncer is not started as a Thread.
993     if (this.optionalFlushInterval > 0) {
994       try {
995         logSyncer.close();
996         // Make sure we synced everything
997         logSyncer.join(this.optionalFlushInterval * 2);
998       } catch (InterruptedException e) {
999         LOG.error("Exception while waiting for syncer thread to die", e);
1000       }
1001     }
1002 
1003     cacheFlushLock.lock();
1004     try {
1005       // Tell our listeners that the log is closing
1006       if (!this.listeners.isEmpty()) {
1007         for (WALActionsListener i : this.listeners) {
1008           i.logCloseRequested();
1009         }
1010       }
1011       synchronized (updateLock) {
1012         this.closed = true;
1013         if (LOG.isDebugEnabled()) {
1014           LOG.debug("closing hlog writer in " + this.dir.toString());
1015         }
1016         if (this.writer != null) {
1017           this.writer.close();
1018         }
1019       }
1020     } finally {
1021       cacheFlushLock.unlock();
1022     }
1023   }
1024 
1025   /**
1026    * @param now
1027    * @param regionName
1028    * @param tableName
1029    * @param clusterId
1030    * @return New log key.
1031    */
1032   protected HLogKey makeKey(byte[] regionName, byte[] tableName, long seqnum,
1033       long now, UUID clusterId) {
1034     return new HLogKey(regionName, tableName, seqnum, now, clusterId);
1035   }
1036 
1037 
1038   /** Append an entry to the log.
1039    *
1040    * @param regionInfo
1041    * @param logEdit
1042    * @param logKey
1043    * @param doSync shall we sync after writing the transaction
1044    * @return The txid of this transaction
1045    * @throws IOException
1046    */
1047   public long append(HRegionInfo regionInfo, HLogKey logKey, WALEdit logEdit,
1048                      HTableDescriptor htd, boolean doSync)
1049   throws IOException {
1050     if (this.closed) {
1051       throw new IOException("Cannot append; log is closed");
1052     }
1053     long txid = 0;
1054     synchronized (updateLock) {
1055       long seqNum = obtainSeqNum();
1056       logKey.setLogSeqNum(seqNum);
1057       // The 'lastSeqWritten' map holds the sequence number of the oldest
1058       // write for each region (i.e. the first edit added to the particular
1059       // memstore). When the cache is flushed, the entry for the
1060       // region being flushed is removed if the sequence number of the flush
1061       // is greater than or equal to the value in lastSeqWritten.
1062       this.lastSeqWritten.putIfAbsent(regionInfo.getEncodedNameAsBytes(),
1063         Long.valueOf(seqNum));
1064       doWrite(regionInfo, logKey, logEdit, htd);
1065       txid = this.unflushedEntries.incrementAndGet();
1066       this.numEntries.incrementAndGet();
1067       if (htd.isDeferredLogFlush()) {
1068         lastDeferredTxid = txid;
1069       }
1070     }
1071 
1072     // Sync if catalog region, and if not then check if that table supports
1073     // deferred log flushing
1074     if (doSync &&
1075         (regionInfo.isMetaRegion() ||
1076         !htd.isDeferredLogFlush())) {
1077       // sync txn to file system
1078       this.sync(txid);
1079     }
1080     return txid;
1081   }
1082 
1083   /**
1084    * Only used in tests.
1085    *
1086    * @param info
1087    * @param tableName
1088    * @param edits
1089    * @param now
1090    * @param htd
1091    * @throws IOException
1092    */
1093   public void append(HRegionInfo info, byte [] tableName, WALEdit edits,
1094     final long now, HTableDescriptor htd)
1095   throws IOException {
1096     append(info, tableName, edits, HConstants.DEFAULT_CLUSTER_ID, now, htd);
1097   }
1098 
1099   /**
1100    * Append a set of edits to the log. Log edits are keyed by (encoded)
1101    * regionName, rowname, and log-sequence-id.
1102    *
1103    * Later, if we sort by these keys, we obtain all the relevant edits for a
1104    * given key-range of the HRegion (TODO). Any edits that do not have a
1105    * matching COMPLETE_CACHEFLUSH message can be discarded.
1106    *
1107    * <p>
1108    * Logs cannot be restarted once closed, or once the HLog process dies. Each
1109    * time the HLog starts, it must create a new log. This means that other
1110    * systems should process the log appropriately upon each startup (and prior
1111    * to initializing HLog).
1112    *
1113    * synchronized prevents appends during the completion of a cache flush or for
1114    * the duration of a log roll.
1115    *
1116    * @param info
1117    * @param tableName
1118    * @param edits
1119    * @param clusterId The originating clusterId for this edit (for replication)
1120    * @param now
1121    * @param doSync shall we sync?
1122    * @return txid of this transaction
1123    * @throws IOException
1124    */
1125   private long append(HRegionInfo info, byte [] tableName, WALEdit edits, UUID clusterId,
1126       final long now, HTableDescriptor htd, boolean doSync)
1127     throws IOException {
1128       if (edits.isEmpty()) return this.unflushedEntries.get();
1129       if (this.closed) {
1130         throw new IOException("Cannot append; log is closed");
1131       }
1132       long txid = 0;
1133       synchronized (this.updateLock) {
1134         long seqNum = obtainSeqNum();
1135         // The 'lastSeqWritten' map holds the sequence number of the oldest
1136         // write for each region (i.e. the first edit added to the particular
1137         // memstore). When the cache is flushed, the entry for the
1138         // region being flushed is removed if the sequence number of the flush
1139         // is greater than or equal to the value in lastSeqWritten.
1140         // Use the encoded name.  It is shorter, guaranteed unique and a subset
1141         // of the actual name.
1142         byte [] encodedRegionName = info.getEncodedNameAsBytes();
1143         this.lastSeqWritten.putIfAbsent(encodedRegionName, seqNum);
1144         HLogKey logKey = makeKey(encodedRegionName, tableName, seqNum, now, clusterId);
1145         doWrite(info, logKey, edits, htd);
1146         this.numEntries.incrementAndGet();
1147         txid = this.unflushedEntries.incrementAndGet();
1148         if (htd.isDeferredLogFlush()) {
1149           lastDeferredTxid = txid;
1150         }
1151       }
1152       // Sync if catalog region, and if not then check if that table supports
1153       // deferred log flushing
1154       if (doSync && 
1155           (info.isMetaRegion() ||
1156           !htd.isDeferredLogFlush())) {
1157         // sync txn to file system
1158         this.sync(txid);
1159       }
1160       return txid;
1161     }
1162 
1163   /**
1164    * Append a set of edits to the log. Log edits are keyed by (encoded)
1165    * regionName, rowname, and log-sequence-id. The HLog is not flushed
1166    * after this transaction is written to the log.
1167    *
1168    * @param info
1169    * @param tableName
1170    * @param edits
1171    * @param clusterId The originating clusterId for this edit (for replication)
1172    * @param now
1173    * @return txid of this transaction
1174    * @throws IOException
1175    */
1176   public long appendNoSync(HRegionInfo info, byte [] tableName, WALEdit edits, 
1177     UUID clusterId, final long now, HTableDescriptor htd)
1178     throws IOException {
1179     return append(info, tableName, edits, clusterId, now, htd, false);
1180   }
1181 
1182   /**
1183    * Append a set of edits to the log. Log edits are keyed by (encoded)
1184    * regionName, rowname, and log-sequence-id. The HLog is flushed
1185    * after this transaction is written to the log.
1186    *
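   * <p>Illustrative sketch only (the regionInfo, tableName, tableDesc, row,
   * family, qualifier and value variables are assumed to exist in the caller's
   * scope):
   * <pre>
   *   WALEdit edit = new WALEdit();
   *   edit.add(new KeyValue(row, family, qualifier, System.currentTimeMillis(), value));
   *   long txid = wal.append(regionInfo, tableName, edit,
   *       HConstants.DEFAULT_CLUSTER_ID, System.currentTimeMillis(), tableDesc);
   * </pre>
   *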
1187    * @param info
1188    * @param tableName
1189    * @param edits
1190    * @param clusterId The originating clusterId for this edit (for replication)
1191    * @param now
1192    * @return txid of this transaction
1193    * @throws IOException
1194    */
1195   public long append(HRegionInfo info, byte [] tableName, WALEdit edits, 
1196     UUID clusterId, final long now, HTableDescriptor htd)
1197     throws IOException {
1198     return append(info, tableName, edits, clusterId, now, htd, true);
1199   }
1200 
1201   /**
1202    * This class is responsible for holding the HLog's appended Entry list
1203    * and for syncing the entries at a configurable interval.
1204    *
1205    * Deferred log flushing works by piggybacking on this process, simply not
1206    * sync'ing the appended Entry right away. Entries can also be sync'd by
1207    * other, non-deferred log flushes that happen outside of this thread.
1208    */
1209   class LogSyncer extends HasThread {
1210 
1211     private final long optionalFlushInterval;
1212 
1213     private AtomicBoolean closeLogSyncer = new AtomicBoolean(false);
1214 
1215     // List of pending writes to the HLog. These correspond to transactions
1216     // that have not yet returned to the client. We keep them cached here
1217     // instead of writing them to HDFS piecemeal, because the HDFS write 
1218     // method is pretty heavyweight as far as locking is concerned. The 
1219     // goal is to increase the batchsize for writing-to-hdfs as well as
1220     // sync-to-hdfs, so that we can get better system throughput.
1221     private List<Entry> pendingWrites = new LinkedList<Entry>();
1222 
1223     LogSyncer(long optionalFlushInterval) {
1224       this.optionalFlushInterval = optionalFlushInterval;
1225     }
1226 
1227     @Override
1228     public void run() {
1229       try {
1230         // awaiting with a timeout doesn't always
1231         // throw exceptions on interrupt
1232         while(!this.isInterrupted() && !closeLogSyncer.get()) {
1233 
1234           try {
1235             if (unflushedEntries.get() <= syncedTillHere) {
1236               synchronized (closeLogSyncer) {
1237                 closeLogSyncer.wait(this.optionalFlushInterval);
1238               }
1239             }
1240             // Calling sync since we waited or had unflushed entries.
1241             // Entries appended but not sync'd are taken care of here AKA
1242             // deferred log flush
1243             sync();
1244           } catch (IOException e) {
1245             LOG.error("Error while syncing, requesting close of hlog ", e);
1246             requestLogRoll();
1247           }
1248         }
1249       } catch (InterruptedException e) {
1250         LOG.debug(getName() + " interrupted while waiting for sync requests");
1251       } finally {
1252         LOG.info(getName() + " exiting");
1253       }
1254     }
1255 
1256     // Appends new writes to pendingWrites. It is better to keep them in
1257     // our own queue rather than writing them to the HDFS output stream because
1258     // HDFSOutputStream.writeChunk is not lightweight at all.
1259     synchronized void append(Entry e) throws IOException {
1260       pendingWrites.add(e);
1261     }
1262 
1263     // Returns all currently pending writes. New writes
1264     // will accumulate in a new list.
1265     synchronized List<Entry> getPendingWrites() {
1266       List<Entry> save = this.pendingWrites;
1267       this.pendingWrites = new LinkedList<Entry>();
1268       return save;
1269     }
1270 
1271     // writes out pending entries to the HLog
1272     void hlogFlush(Writer writer, List<Entry> pending) throws IOException {
1273       if (pending == null) return;
1274 
1275       // write out all accumulated Entries to hdfs.
1276       for (Entry e : pending) {
1277         writer.append(e);
1278       }
1279     }
1280 
1281     void close() {
1282       synchronized (closeLogSyncer) {
1283         closeLogSyncer.set(true);
1284         closeLogSyncer.notifyAll();
1285       }
1286     }
1287   }
1288 
1289   // sync all known transactions
1290   private void syncer() throws IOException {
1291     syncer(this.unflushedEntries.get()); // sync all pending items
1292   }
1293 
1294   // sync all transactions up to the specified txid
1295   private void syncer(long txid) throws IOException {
1296     // if the transaction that we are interested in is already
1297     // synced, then return immediately.
1298     if (txid <= this.syncedTillHere) {
1299       return;
1300     }
1301     Writer tempWriter;
1302     synchronized (this.updateLock) {
1303       if (this.closed) return;
1304       tempWriter = this.writer; // guaranteed non-null
1305     }
1306     try {
1307       long doneUpto;
1308       long now = System.currentTimeMillis();
1309       // First flush all the pending writes to HDFS. Then 
1310       // issue the sync to HDFS. If sync is successful, then update
1311       // syncedTillHere to indicate that transactions up to this
1312       // number have been successfully synced.
1313       IOException ioe = null;
1314       List<Entry> pending = null;
1315       synchronized (flushLock) {
1316         if (txid <= this.syncedTillHere) {
1317           return;
1318         }
1319         doneUpto = this.unflushedEntries.get();
1320         pending = logSyncer.getPendingWrites();
1321         try {
1322           logSyncer.hlogFlush(tempWriter, pending);
1323         } catch(IOException io) {
1324           ioe = io;
1325           LOG.error("syncer encountered error, will retry. txid=" + txid, ioe);
1326         }
1327       }
1328       if (ioe != null && pending != null) {
1329         synchronized (this.updateLock) {
1330           synchronized (flushLock) {
1331             // HBASE-4387, HBASE-5623, retry with updateLock held
1332             tempWriter = this.writer;
1333             logSyncer.hlogFlush(tempWriter, pending);
1334           }
1335         }
1336       }
1337       // another thread might have sync'ed; avoid double-sync'ing
1338       if (txid <= this.syncedTillHere) {
1339         return;
1340       }
1341       try {
1342         tempWriter.sync();
1343       } catch (IOException io) {
1344         synchronized (this.updateLock) {
1345           // HBASE-4387, HBASE-5623, retry with updateLock held
1346           tempWriter = this.writer;
1347           tempWriter.sync();
1348         }
1349       }
1350       this.syncedTillHere = Math.max(this.syncedTillHere, doneUpto);
1351 
1352       syncTime.inc(System.currentTimeMillis() - now);
1353       if (!this.logRollRunning) {
1354         checkLowReplication();
1355         try {
1356           if (tempWriter.getLength() > this.logrollsize) {
1357             requestLogRoll();
1358           }
1359         } catch (IOException x) {
1360           LOG.debug("Log roll failed and will be retried. (This is not an error)");
1361         }
1362       }
1363     } catch (IOException e) {
1364       LOG.fatal("Could not sync. Requesting close of hlog", e);
1365       requestLogRoll();
1366       throw e;
1367     }
1368   }
1369 
1370   private void checkLowReplication() {
1371     // if the number of replicas in HDFS has fallen below the configured
1372     // value, then roll logs.
1373     try {
1374       int numCurrentReplicas = getLogReplication();
1375       if (numCurrentReplicas != 0
1376           && numCurrentReplicas < this.minTolerableReplication) {
1377         if (this.lowReplicationRollEnabled) {
1378           if (this.consecutiveLogRolls < this.lowReplicationRollLimit) {
1379             LOG.warn("HDFS pipeline error detected. " + "Found "
1380                 + numCurrentReplicas + " replicas but expecting no less than "
1381                 + this.minTolerableReplication + " replicas. "
1382                 + " Requesting close of hlog.");
1383             requestLogRoll();
1384             // If rollWriter is requested, increase consecutiveLogRolls. Once it
1385             // is larger than lowReplicationRollLimit, disable the
1386             // LowReplication-Roller
1387             this.consecutiveLogRolls++;
1388           } else {
1389             LOG.warn("Too many consecutive RollWriter requests, it's a sign that "
1390                 + "the total number of live datanodes is lower than the tolerable replicas.");
1391             this.consecutiveLogRolls = 0;
1392             this.lowReplicationRollEnabled = false;
1393           }
1394         }
1395       } else if (numCurrentReplicas >= this.minTolerableReplication) {
1396 
1397         if (!this.lowReplicationRollEnabled) {
1398           // A new writer's replica count is always the default value, so we
1399           // should not enable the LowReplication-Roller yet. If numEntries
1400           // is lower than or equal to 1, we consider this a new writer.
1401           if (this.numEntries.get() <= 1) {
1402             return;
1403           }
1404           // Once the live datanode number and the replicas return to normal,
1405           // enable the LowReplication-Roller.
1406           this.lowReplicationRollEnabled = true;
1407           LOG.info("LowReplication-Roller was enabled.");
1408         }
1409       }
1410     } catch (Exception e) {
1411       LOG.warn("Unable to invoke DFSOutputStream.getNumCurrentReplicas: " + e +
1412           "; still proceeding...");
1413     }
1414   }
1415 
1416   /**
1417    * This method gets the datanode replication count for the current HLog,
1418    * using reflection to invoke DFSOutputStream#getNumCurrentReplicas (HDFS-826).
1419    * If the pipeline isn't started yet or is empty, you will get the default
1420    * replication factor.  Therefore, if this function returns 0, it means you
1421    * are not properly running with the HDFS-826 patch.
1422    * @return number of replicas in the current write pipeline, or 0 if it
1423    *         cannot be determined
1424    * @throws IllegalArgumentException
1425    * @throws IllegalAccessException
1426    * @throws InvocationTargetException
1427    */
1428   int getLogReplication()
1429   throws IllegalArgumentException, IllegalAccessException, InvocationTargetException {
1430     if (this.getNumCurrentReplicas != null && this.hdfs_out != null) {
1431       Object repl = this.getNumCurrentReplicas.invoke(getOutputStream(), NO_ARGS);
1432       if (repl instanceof Integer) {
1433         return ((Integer)repl).intValue();
1434       }
1435     }
1436     return 0;
1437   }
1438 
1439   boolean canGetCurReplicas() {
1440     return this.getNumCurrentReplicas != null;
1441   }
1442 
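       // hsync(), hflush() and sync() below all delegate to syncer(); at this
       // level there is currently no distinction between an hsync and an hflush.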
1443   public void hsync() throws IOException {
1444     syncer();
1445   }
1446 
1447   public void hflush() throws IOException {
1448     syncer();
1449   }
1450 
1451   public void sync() throws IOException {
1452     syncer();
1453   }
1454 
1455   public void sync(long txid) throws IOException {
1456     syncer(txid);
1457   }
1458 
1459   private void requestLogRoll() {
1460     if (!this.listeners.isEmpty()) {
1461       for (WALActionsListener i: this.listeners) {
1462         i.logRollRequested();
1463       }
1464     }
1465   }
1466 
1467   protected void doWrite(HRegionInfo info, HLogKey logKey, WALEdit logEdit,
1468                            HTableDescriptor htd)
1469   throws IOException {
1470     if (!this.enabled) {
1471       return;
1472     }
1473     if (!this.listeners.isEmpty()) {
1474       for (WALActionsListener i: this.listeners) {
1475         i.visitLogEntryBeforeWrite(htd, logKey, logEdit);
1476       }
1477     }
1478     try {
1479       long now = System.currentTimeMillis();
1480       // coprocessor hook:
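           // (the edit is only appended when preWALWrite returns false, i.e. when
           // no coprocessor chose to bypass the write)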
1481       if (!coprocessorHost.preWALWrite(info, logKey, logEdit)) {
1482         // write to our buffer for the Hlog file.
1483         logSyncer.append(new HLog.Entry(logKey, logEdit));
1484       }
1485       long took = System.currentTimeMillis() - now;
1486       coprocessorHost.postWALWrite(info, logKey, logEdit);
1487       writeTime.inc(took);
1488       long len = 0;
1489       for (KeyValue kv : logEdit.getKeyValues()) {
1490         len += kv.getLength();
1491       }
1492       writeSize.inc(len);
1493       if (took > 1000) {
1494         LOG.warn(String.format(
1495           "%s took %d ms appending an edit to hlog; editcount=%d, len~=%s",
1496           Thread.currentThread().getName(), took, this.numEntries.get(),
1497           StringUtils.humanReadableInt(len)));
1498         slowHLogAppendCount.incrementAndGet();
1499         slowHLogAppendTime.inc(took);
1500       }
1501     } catch (IOException e) {
1502       LOG.fatal("Could not append. Requesting close of hlog", e);
1503       requestLogRoll();
1504       throw e;
1505     }
1506   }
1507 
1508 
1509   /** @return How many items have been added to the log */
1510   int getNumEntries() {
1511     return numEntries.get();
1512   }
1513 
1514   /**
1515    * Obtain a log sequence number.
1516    */
1517   public long obtainSeqNum() {
1518     return this.logSeqNum.incrementAndGet();
1519   }
1520 
1521   /** @return the number of log files in use */
1522   int getNumLogFiles() {
1523     return outputfiles.size();
1524   }
1525 
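       // Builds the snapshot-map key for a region: the encoded region name
       // prefixed with "snp", e.g. "1028785192" -> "snp1028785192"
       // (illustrative region name).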
1526   private byte[] getSnapshotName(byte[] encodedRegionName) {
1527     byte snp[] = new byte[encodedRegionName.length + 3];
1528     // An encoded region name contains only hex digits; 's', 'n' and 'p' are
1529     // not hex digits, and therefore snapshot names will never collide with
1530     // encoded region names.
1531     snp[0] = 's'; snp[1] = 'n'; snp[2] = 'p';
1532     for (int i = 0; i < encodedRegionName.length; i++) {
1533       snp[i+3] = encodedRegionName[i];
1534     }
1535     return snp;
1536   }
1537 
1538   /**
1539    * By acquiring a log sequence ID, we can allow log messages to continue while
1540    * we flush the cache.
1541    *
1542    * Acquire a lock so that we do not roll the log between the start and
1543    * completion of a cache-flush. Otherwise the log-seq-id for the flush will
1544    * not appear in the correct logfile.
1545    *
1546    * Ensuring that flushes and log-rolls don't happen concurrently also allows
1547    * us to temporarily put a log-seq-number in lastSeqWritten against the region
1548    * being flushed that might not be the earliest in-memory log-seq-number for
1549    * that region. By the time the flush is completed or aborted and before the
1550    * cacheFlushLock is released it is ensured that lastSeqWritten again has the
1551    * oldest in-memory edit's lsn for the region that was being flushed.
1552    *
1553    * In this method, by removing the entry in lastSeqWritten for the region
1554    * being flushed we ensure that the next edit inserted in this region will be
1555    * correctly recorded in {@link #append(HRegionInfo, byte[], WALEdit, long, HTableDescriptor)}.
1556    * The lsn of the earliest in-memory edit - which is now in the memstore snapshot -
1557    * is saved temporarily in the lastSeqWritten map while the flush is active.
1558    *
1559    * @return sequence ID to pass to
1560    *         {@link #completeCacheFlush(byte[], byte[], long, boolean)}
1561    *
1562    * @see #completeCacheFlush(byte[], byte[], long, boolean)
1563    * @see #abortCacheFlush(byte[])
1564    */
1565   public long startCacheFlush(final byte[] encodedRegionName) {
1566     this.cacheFlushLock.lock();
1567     Long seq = this.lastSeqWritten.remove(encodedRegionName);
1568     // seq is the lsn of the oldest edit associated with this region. If a
1569     // snapshot already exists - because the last flush failed - then seq will
1570     // be the lsn of the oldest edit in the snapshot
1571     if (seq != null) {
1572       // keeping the earliest sequence number of the snapshot in
1573       // lastSeqWritten maintains the correctness of
1574       // getOldestOutstandingSeqNum(). But it doesn't matter really because
1575       // everything is being done inside of cacheFlush lock.
1576       Long oldseq =
1577         lastSeqWritten.put(getSnapshotName(encodedRegionName), seq);
1578       if (oldseq != null) {
1579         LOG.error("Logic error: snapshot seq id from an earlier flush is still" +
1580             " present for region " + Bytes.toString(encodedRegionName) +
1581             "; overwrote oldseq=" + oldseq + " with new seq=" + seq);
1582         Runtime.getRuntime().halt(1);
1583       }
1584     }
1585     return obtainSeqNum();
1586   }
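
       // Illustrative flush protocol from a caller's point of view (a sketch,
       // not a verbatim excerpt of HRegion):
       //   long seqId = log.startCacheFlush(encodedRegionName);
       //   try {
       //     // ... flush the memstore snapshot to store files ...
       //     log.completeCacheFlush(encodedRegionName, tableName, seqId, isMetaRegion);
       //   } catch (IOException ioe) {
       //     log.abortCacheFlush(encodedRegionName);
       //   }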
1587 
1588 
1589   /**
1590    * Complete the cache flush
1591    *
1592    * Protected by cacheFlushLock
1593    *
1594    * @param encodedRegionName
1595    * @param tableName
1596    * @param logSeqId
        * @param isMetaRegion
1597    * @throws IOException
1598    */
1599   public void completeCacheFlush(final byte [] encodedRegionName,
1600       final byte [] tableName, final long logSeqId, final boolean isMetaRegion)
1601   throws IOException {
1602     try {
1603       if (this.closed) {
1604         return;
1605       }
1606       long txid = 0;
1607       synchronized (updateLock) {
1608         long now = System.currentTimeMillis();
1609         WALEdit edit = completeCacheFlushLogEdit();
1610         HLogKey key = makeKey(encodedRegionName, tableName, logSeqId,
1611             System.currentTimeMillis(), HConstants.DEFAULT_CLUSTER_ID);
1612         logSyncer.append(new Entry(key, edit));
1613         txid = this.unflushedEntries.incrementAndGet();
1614         writeTime.inc(System.currentTimeMillis() - now);
1615         long len = 0;
1616         for (KeyValue kv : edit.getKeyValues()) {
1617           len += kv.getLength();
1618         }
1619         writeSize.inc(len);
1620         this.numEntries.incrementAndGet();
1621       }
1622       // sync txn to file system
1623       this.sync(txid);
1624 
1625     } finally {
1626       // updateLock not needed for removing snapshot's entry
1627       // Cleaning up of lastSeqWritten is in the finally clause because we
1628       // don't want to confuse getOldestOutstandingSeqNum()
1629       this.lastSeqWritten.remove(getSnapshotName(encodedRegionName));
1630       this.cacheFlushLock.unlock();
1631     }
1632   }
1633 
1634   private WALEdit completeCacheFlushLogEdit() {
1635     KeyValue kv = new KeyValue(METAROW, METAFAMILY, null,
1636       System.currentTimeMillis(), COMPLETE_CACHE_FLUSH);
1637     WALEdit e = new WALEdit();
1638     e.add(kv);
1639     return e;
1640   }
1641 
1642   /**
1643    * Abort a cache flush.
1644    * Call if the flush fails. Note that the only recovery for an aborted flush
1645    * currently is a restart of the regionserver so the snapshot content dropped
1646    * by the failure gets restored to the memstore.
1647    */
1648   public void abortCacheFlush(byte[] encodedRegionName) {
1649     Long snapshot_seq =
1650       this.lastSeqWritten.remove(getSnapshotName(encodedRegionName));
1651     if (snapshot_seq != null) {
1652       // updateLock not necessary because we are racing against
1653       // lastSeqWritten.putIfAbsent() in append() and we will always win.
1654       // Before releasing cacheFlushLock, make sure that the region's entry in
1655       // lastSeqWritten points to the earliest edit in the region.
1656       Long current_memstore_earliest_seq =
1657         this.lastSeqWritten.put(encodedRegionName, snapshot_seq);
1658       if (current_memstore_earliest_seq != null &&
1659           (current_memstore_earliest_seq.longValue() <=
1660             snapshot_seq.longValue())) {
1661         LOG.error("Logic Error region " + Bytes.toString(encodedRegionName) +
1662             " acquired edits out of order; current memstore seq=" +
1663             current_memstore_earliest_seq + " snapshot seq=" + snapshot_seq);
1664         Runtime.getRuntime().halt(1);
1665       }
1666     }
1667     this.cacheFlushLock.unlock();
1668   }
1669 
1670   /**
1671    * @param family
1672    * @return true if the column is a meta column
1673    */
1674   public static boolean isMetaFamily(byte [] family) {
1675     return Bytes.equals(METAFAMILY, family);
1676   }
1677 
1678   /**
1679    * Get LowReplication-Roller status
1680    * 
1681    * @return lowReplicationRollEnabled
1682    */
1683   public boolean isLowReplicationRollEnabled() {
1684     return lowReplicationRollEnabled;
1685   }
1686 
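       // The WAL key implementation is pluggable: getKeyClass() reads
       // "hbase.regionserver.hlog.keyclass" from the configuration, defaulting to
       // HLogKey, and newKey() instantiates it reflectively.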
1687   @SuppressWarnings("unchecked")
1688   public static Class<? extends HLogKey> getKeyClass(Configuration conf) {
1689      return (Class<? extends HLogKey>)
1690        conf.getClass("hbase.regionserver.hlog.keyclass", HLogKey.class);
1691   }
1692 
1693   public static HLogKey newKey(Configuration conf) throws IOException {
1694     Class<? extends HLogKey> keyClass = getKeyClass(conf);
1695     try {
1696       return keyClass.newInstance();
1697     } catch (InstantiationException e) {
1698       throw new IOException("cannot create hlog key", e);
1699     } catch (IllegalAccessException e) {
1700       throw new IOException("cannot create hlog key", e);
1701     }
1702   }
1703 
1704   /**
1705    * Utility class that lets us keep track of the edit with its key.
1706    * Only used when splitting logs.
1707    */
1708   public static class Entry implements Writable {
1709     private WALEdit edit;
1710     private HLogKey key;
1711 
1712     public Entry() {
1713       edit = new WALEdit();
1714       key = new HLogKey();
1715     }
1716 
1717     /**
1718      * Constructor taking the log key and its associated edit.
1719      * @param key log's key
1720      * @param edit log's edit
1721      */
1722     public Entry(HLogKey key, WALEdit edit) {
1723       super();
1724       this.key = key;
1725       this.edit = edit;
1726     }
1727     /**
1728      * Gets the edit
1729      * @return edit
1730      */
1731     public WALEdit getEdit() {
1732       return edit;
1733     }
1734     /**
1735      * Gets the key
1736      * @return key
1737      */
1738     public HLogKey getKey() {
1739       return key;
1740     }
1741 
1742     @Override
1743     public String toString() {
1744       return this.key + "=" + this.edit;
1745     }
1746 
1747     @Override
1748     public void write(DataOutput dataOutput) throws IOException {
1749       this.key.write(dataOutput);
1750       this.edit.write(dataOutput);
1751     }
1752 
1753     @Override
1754     public void readFields(DataInput dataInput) throws IOException {
1755       this.key.readFields(dataInput);
1756       this.edit.readFields(dataInput);
1757     }
1758   }
1759 
1760   /**
1761    * Construct the HLog directory name
1762    *
1763    * @param serverName Server name formatted as described in {@link ServerName}
1764    * @return the HLog directory name
1765    */
1766   public static String getHLogDirectoryName(final String serverName) {
1767     StringBuilder dirName = new StringBuilder(HConstants.HREGION_LOGDIR_NAME);
1768     dirName.append("/");
1769     dirName.append(serverName);
1770     return dirName.toString();
1771   }
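
       // e.g. for a server named "host.example.org,60020,1331234567890" this returns
       // ".logs/host.example.org,60020,1331234567890" (assuming HREGION_LOGDIR_NAME
       // is ".logs"); the path is relative to the HBase root directory.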
1772 
1773   /**
1774    * Get the directory we are making logs in.
1775    * 
1776    * @return dir
1777    */
1778   protected Path getDir() {
1779     return dir;
1780   }
1781   
1782   /**
1783    * @param filename name of the file to validate
1784    * @return <tt>true</tt> if the filename matches an HLog, <tt>false</tt>
1785    *         otherwise
1786    */
1787   public static boolean validateHLogFilename(String filename) {
1788     return pattern.matcher(filename).matches();
1789   }
1790 
1791   static Path getHLogArchivePath(Path oldLogDir, Path p) {
1792     return new Path(oldLogDir, p.getName());
1793   }
1794 
1795   static String formatRecoveredEditsFileName(final long seqid) {
1796     return String.format("%019d", seqid);
1797   }
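
       // e.g. formatRecoveredEditsFileName(123) returns "0000000000000000123",
       // the sequence id zero-padded to 19 digits.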
1798 
1799   /**
1800    * Returns sorted set of edit files made by wal-log splitter, excluding files
1801    * with '.temp' suffix.
1802    * @param fs
1803    * @param regiondir
1804    * @return Files in passed <code>regiondir</code> as a sorted set.
1805    * @throws IOException
1806    */
1807   public static NavigableSet<Path> getSplitEditFilesSorted(final FileSystem fs,
1808       final Path regiondir)
1809   throws IOException {
1810     NavigableSet<Path> filesSorted = new TreeSet<Path>();
1811     Path editsdir = getRegionDirRecoveredEditsDir(regiondir);
1812     if (!fs.exists(editsdir)) return filesSorted;
1813     FileStatus[] files = FSUtils.listStatus(fs, editsdir, new PathFilter() {
1814       @Override
1815       public boolean accept(Path p) {
1816         boolean result = false;
1817         try {
1818           // Return files and only files that match the editfile names pattern.
1819           // There can be other files in this directory other than edit files.
1820           // In particular, on error, we'll move aside the bad edit file giving
1821           // it a timestamp suffix.  See moveAsideBadEditsFile.
1822           Matcher m = EDITFILES_NAME_PATTERN.matcher(p.getName());
1823           result = fs.isFile(p) && m.matches();
1824           // Skip files whose names end with RECOVERED_LOG_TMPFILE_SUFFIX,
1825           // because it means the hlog-splitting thread is still writing them.
1826           if (p.getName().endsWith(RECOVERED_LOG_TMPFILE_SUFFIX)) {
1827             result = false;
1828           }
1829         } catch (IOException e) {
1830           LOG.warn("Failed isFile check on " + p);
1831         }
1832         return result;
1833       }
1834     });
1835     if (files == null) return filesSorted;
1836     for (FileStatus status: files) {
1837       filesSorted.add(status.getPath());
1838     }
1839     return filesSorted;
1840   }
1841 
1842   /**
1843    * Move aside a bad edits file.
1844    * @param fs
1845    * @param edits Edits file to move aside.
1846    * @return The name of the moved aside file.
1847    * @throws IOException
1848    */
1849   public static Path moveAsideBadEditsFile(final FileSystem fs,
1850       final Path edits)
1851   throws IOException {
1852     Path moveAsideName = new Path(edits.getParent(), edits.getName() + "." +
1853       System.currentTimeMillis());
1854     if (!HBaseFileSystem.renameDirForFileSystem(fs, edits, moveAsideName)) {
1855       LOG.warn("Rename failed from " + edits + " to " + moveAsideName);
1856     }
1857     return moveAsideName;
1858   }
1859 
1860   /**
1861    * @param regiondir This regions directory in the filesystem.
1862    * @return The directory that holds recovered edits files for the region
1863    * <code>regiondir</code>
1864    */
1865   public static Path getRegionDirRecoveredEditsDir(final Path regiondir) {
1866     return new Path(regiondir, RECOVERED_EDITS_DIR);
1867   }
1868 
1869   public static final long FIXED_OVERHEAD = ClassSize.align(
1870     ClassSize.OBJECT + (5 * ClassSize.REFERENCE) +
1871     ClassSize.ATOMIC_INTEGER + Bytes.SIZEOF_INT + (3 * Bytes.SIZEOF_LONG));
1872 
1873   private static void usage() {
1874     System.err.println("Usage: HLog <ARGS>");
1875     System.err.println("Arguments:");
1876     System.err.println(" --dump  Dump a textual representation of one or more passed files");
1877     System.err.println("         For example: HLog --dump hdfs://example.com:9000/hbase/.logs/MACHINE/LOGFILE");
1878     System.err.println(" --split Split the passed directory of WAL logs");
1879     System.err.println("         For example: HLog --split hdfs://example.com:9000/hbase/.logs/DIR");
1880   }
1881 
1882   private static void split(final Configuration conf, final Path p)
1883   throws IOException {
1884     FileSystem fs = FileSystem.get(conf);
1885     if (!fs.exists(p)) {
1886       throw new FileNotFoundException(p.toString());
1887     }
1888     final Path baseDir = new Path(conf.get(HConstants.HBASE_DIR));
1889     final Path oldLogDir = new Path(baseDir, HConstants.HREGION_OLDLOGDIR_NAME);
1890     if (!fs.getFileStatus(p).isDir()) {
1891       throw new IOException(p + " is not a directory");
1892     }
1893 
1894     HLogSplitter logSplitter = HLogSplitter.createLogSplitter(
1895         conf, baseDir, p, oldLogDir, fs);
1896     logSplitter.splitLog();
1897   }
1898 
1899   /**
1900    * @return Coprocessor host.
1901    */
1902   public WALCoprocessorHost getCoprocessorHost() {
1903     return coprocessorHost;
1904   }
1905 
1906   /** Provide access to currently deferred sequence num for tests */
1907   boolean hasDeferredEntries() {
1908     return lastDeferredTxid > syncedTillHere;
1909   }
1910 
1911   /**
1912    * Pass one or more log file names and it will either dump out a text version
1913    * on <code>stdout</code> or split the specified log files.
1914    *
1915    * @param args
1916    * @throws IOException
1917    */
1918   public static void main(String[] args) throws IOException {
1919     if (args.length < 2) {
1920       usage();
1921       System.exit(-1);
1922     }
1923     // either dump using the HLogPrettyPrinter or split, depending on args
1924     if (args[0].compareTo("--dump") == 0) {
1925       HLogPrettyPrinter.run(Arrays.copyOfRange(args, 1, args.length));
1926     } else if (args[0].compareTo("--split") == 0) {
1927       Configuration conf = HBaseConfiguration.create();
1928       for (int i = 1; i < args.length; i++) {
1929         try {
1930           conf.set("fs.default.name", args[i]);
1931           conf.set("fs.defaultFS", args[i]);
1932           Path logPath = new Path(args[i]);
1933           split(conf, logPath);
1934         } catch (Throwable t) {
1935           t.printStackTrace(System.err);
1936           System.exit(-1);
1937         }
1938       }
1939     } else {
1940       usage();
1941       System.exit(-1);
1942     }
1943   }
1944 }