
/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.regionserver.wal;

import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.Callable;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.RemoteExceptionHandler;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableNotFoundException;
import org.apache.hadoop.hbase.client.ConnectionUtils;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.HConnection;
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Row;
import org.apache.hadoop.hbase.exceptions.RegionOpeningException;
import org.apache.hadoop.hbase.io.HeapSize;
import org.apache.hadoop.hbase.master.SplitLogManager;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.monitoring.TaskMonitor;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.RequestConverter;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.GetRegionInfoRequest;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.GetRegionInfoResponse;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService.BlockingInterface;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos.RegionStoreSequenceIds;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos.StoreSequenceId;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.LastSequenceId;
import org.apache.hadoop.hbase.regionserver.wal.HLog.Entry;
import org.apache.hadoop.hbase.regionserver.wal.HLog.Reader;
import org.apache.hadoop.hbase.regionserver.wal.HLog.Writer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.CancelableProgressable;
import org.apache.hadoop.hbase.util.ClassSize;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.zookeeper.ZKSplitLog;
import org.apache.hadoop.hbase.zookeeper.ZKTable;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.io.MultipleIOException;
import org.apache.zookeeper.KeeperException;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.protobuf.ServiceException;

/**
 * This class is responsible for splitting up a bunch of regionserver commit log
 * files that are no longer being written to, into new files, one per region, for
 * the region to replay on startup. It deletes the old log files when finished.
 */
@InterfaceAudience.Private
public class HLogSplitter {
  static final Log LOG = LogFactory.getLog(HLogSplitter.class);

  // Parameters for split process
  protected final Path rootDir;
  protected final FileSystem fs;
  protected final Configuration conf;

  // Major subcomponents of the split process.
  // These are separated into inner classes to make testing easier.
  OutputSink outputSink;
  EntryBuffers entryBuffers;

  private Set<TableName> disablingOrDisabledTables =
      new HashSet<TableName>();
  private ZooKeeperWatcher watcher;

  // If an exception is thrown by one of the other threads, it will be
  // stored here.
  protected AtomicReference<Throwable> thrown = new AtomicReference<Throwable>();

  // Wait/notify for when data has been produced by the reader thread,
  // consumed by the writer threads, or an exception occurred
  final Object dataAvailable = new Object();

  private MonitoredTask status;

  // For checking the latest flushed sequence id
  protected final LastSequenceId sequenceIdChecker;

  protected boolean distributedLogReplay;

  // Map encodedRegionName -> lastFlushedSequenceId
  protected Map<String, Long> lastFlushedSequenceIds = new ConcurrentHashMap<String, Long>();

  // Map encodedRegionName -> maxSeqIdInStores
  protected Map<String, Map<byte[], Long>> regionMaxSeqIdInStores =
      new ConcurrentHashMap<String, Map<byte[], Long>>();

  // Failed region server that the wal file being split belongs to
  protected String failedServerName = "";

  // Number of writer threads
  private final int numWriterThreads;

  // Min batch size when replaying WAL edits
  private final int minBatchSize;

  HLogSplitter(Configuration conf, Path rootDir,
      FileSystem fs, LastSequenceId idChecker, ZooKeeperWatcher zkw) {
    this.conf = conf;
    this.rootDir = rootDir;
    this.fs = fs;
    this.sequenceIdChecker = idChecker;
    this.watcher = zkw;

    entryBuffers = new EntryBuffers(
        conf.getInt("hbase.regionserver.hlog.splitlog.buffersize",
            128*1024*1024));

    this.minBatchSize = conf.getInt("hbase.regionserver.wal.logreplay.batch.size", 512);
    this.distributedLogReplay = this.conf.getBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY,
      HConstants.DEFAULT_DISTRIBUTED_LOG_REPLAY_CONFIG);

    this.numWriterThreads = conf.getInt("hbase.regionserver.hlog.splitlog.writer.threads", 3);
    if (zkw != null && this.distributedLogReplay) {
      outputSink = new LogReplayOutputSink(numWriterThreads);
    } else {
      if (this.distributedLogReplay) {
        LOG.info("ZooKeeperWatcher is passed in as NULL, so disabling distributedLogReplay.");
      }
      this.distributedLogReplay = false;
      outputSink = new LogRecoveredEditsOutputSink(numWriterThreads);
    }
  }

  /**
   * Splits an HLog file into a region's recovered-edits directory.
   * This is the main entry point for distributed log splitting from SplitLogWorker.
   * <p>
   * If the log file has N regions then N recovered.edits files will be produced.
   * <p>
   * @param rootDir
   * @param logfile
   * @param fs
   * @param conf
   * @param reporter
   * @param idChecker
   * @param zkw ZooKeeperWatcher; if it is null, we fall back to the old-style log splitting
   *          where we dump out recovered.edits files for regions to replay.
   * @return false if it is interrupted by the progressable.
   * @throws IOException
   */
  public static boolean splitLogFile(Path rootDir, FileStatus logfile, FileSystem fs,
      Configuration conf, CancelableProgressable reporter, LastSequenceId idChecker,
      ZooKeeperWatcher zkw) throws IOException {
    HLogSplitter s = new HLogSplitter(conf, rootDir, fs, idChecker, zkw);
    return s.splitLogFile(logfile, reporter);
  }
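
  // A minimal usage sketch (illustrative only; the log path below is hypothetical
  // and "conf" is assumed to be an HBase-configured Configuration, e.g. as set up
  // by a SplitLogWorker):
  //
  //   FileSystem fs = FileSystem.get(conf);
  //   FileStatus logfile = fs.getFileStatus(new Path("/hbase/.logs/rs1,60020,1/wal.123"));
  //   // Passing null for reporter, idChecker and zkw falls back to old-style
  //   // splitting that dumps recovered.edits files.
  //   boolean done = HLogSplitter.splitLogFile(FSUtils.getRootDir(conf), logfile,
  //       fs, conf, null, null, null);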

  // A wrapper to split one log folder using the method used by distributed
  // log splitting. Used by tools and unit tests. It should be package private.
  // It is public only because TestWALObserver is in a different package,
  // which uses this method to do log splitting.
  public static List<Path> split(Path rootDir, Path logDir, Path oldLogDir,
      FileSystem fs, Configuration conf) throws IOException {
    FileStatus[] logfiles = fs.listStatus(logDir);
    List<Path> splits = new ArrayList<Path>();
    if (logfiles != null && logfiles.length > 0) {
      for (FileStatus logfile: logfiles) {
        HLogSplitter s = new HLogSplitter(conf, rootDir, fs, null, null);
        if (s.splitLogFile(logfile, null)) {
          finishSplitLogFile(rootDir, oldLogDir, logfile.getPath(), conf);
          if (s.outputSink.splits != null) {
            splits.addAll(s.outputSink.splits);
          }
        }
      }
    }
    if (!fs.delete(logDir, true)) {
      throw new IOException("Unable to delete src dir: " + logDir);
    }
    return splits;
  }

  // The real log splitter. It just splits one log file.
  boolean splitLogFile(FileStatus logfile,
      CancelableProgressable reporter) throws IOException {
    boolean isCorrupted = false;
    Preconditions.checkState(status == null);
    boolean skipErrors = conf.getBoolean("hbase.hlog.split.skip.errors",
      HLog.SPLIT_SKIP_ERRORS_DEFAULT);
    int interval = conf.getInt("hbase.splitlog.report.interval.loglines", 1024);
    Path logPath = logfile.getPath();
    boolean outputSinkStarted = false;
    boolean progress_failed = false;
    int editsCount = 0;
    int editsSkipped = 0;

    try {
      status = TaskMonitor.get().createStatus(
        "Splitting log file " + logfile.getPath() +
        " into a temporary staging area.");
      long logLength = logfile.getLen();
      LOG.info("Splitting hlog: " + logPath + ", length=" + logLength);
      LOG.info("DistributedLogReplay = " + this.distributedLogReplay);
      status.setStatus("Opening log file");
      if (reporter != null && !reporter.progress()) {
        progress_failed = true;
        return false;
      }
      Reader in = null;
      try {
        in = getReader(fs, logfile, conf, skipErrors, reporter);
      } catch (CorruptedLogFileException e) {
        LOG.warn("Could not get reader, corrupted log file " + logPath, e);
        ZKSplitLog.markCorrupted(rootDir, logfile.getPath().getName(), fs);
        isCorrupted = true;
      }
      if (in == null) {
        status.markComplete("Nothing to split in log file");
        LOG.warn("Nothing to split in log file " + logPath);
        return true;
      }
      if (watcher != null) {
        try {
          disablingOrDisabledTables = ZKTable.getDisabledOrDisablingTables(watcher);
        } catch (KeeperException e) {
          throw new IOException("Can't get disabling/disabled tables", e);
        }
      }
      int numOpenedFilesBeforeReporting = conf.getInt("hbase.splitlog.report.openedfiles", 3);
      int numOpenedFilesLastCheck = 0;
      outputSink.setReporter(reporter);
      outputSink.startWriterThreads();
      outputSinkStarted = true;
      Entry entry;
      Long lastFlushedSequenceId = -1L;
      ServerName serverName = HLogUtil.getServerNameFromHLogDirectoryName(logPath);
      failedServerName = (serverName == null) ? "" : serverName.getServerName();
      while ((entry = getNextLogLine(in, logPath, skipErrors)) != null) {
        byte[] region = entry.getKey().getEncodedRegionName();
        String key = Bytes.toString(region);
        lastFlushedSequenceId = lastFlushedSequenceIds.get(key);
        if (lastFlushedSequenceId == null) {
          if (this.distributedLogReplay) {
            RegionStoreSequenceIds ids =
                SplitLogManager.getRegionFlushedSequenceId(this.watcher, failedServerName, key);
            if (ids != null) {
              lastFlushedSequenceId = ids.getLastFlushedSequenceId();
            }
          } else if (sequenceIdChecker != null) {
            lastFlushedSequenceId = sequenceIdChecker.getLastSequenceId(region);
          }
          if (lastFlushedSequenceId == null) {
            lastFlushedSequenceId = -1L;
          }
          lastFlushedSequenceIds.put(key, lastFlushedSequenceId);
        }
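        // Edits with a sequence id at or below the region's last flushed
        // sequence id were already persisted to HFiles before the server
        // failed; replaying them would be redundant, so they are skipped.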
        if (lastFlushedSequenceId >= entry.getKey().getLogSeqNum()) {
          editsSkipped++;
          continue;
        }
        entryBuffers.appendEntry(entry);
        editsCount++;
        int moreWritersFromLastCheck = this.getNumOpenWriters() - numOpenedFilesLastCheck;
        // If sufficient edits have passed, check if we should report progress.
        if (editsCount % interval == 0
            || moreWritersFromLastCheck > numOpenedFilesBeforeReporting) {
          numOpenedFilesLastCheck = this.getNumOpenWriters();
          String countsStr = (editsCount - (editsSkipped + outputSink.getSkippedEdits()))
              + " edits, skipped " + editsSkipped + " edits.";
          status.setStatus("Split " + countsStr);
          if (reporter != null && !reporter.progress()) {
            progress_failed = true;
            return false;
          }
        }
      }
    } catch (InterruptedException ie) {
      IOException iie = new InterruptedIOException();
      iie.initCause(ie);
      throw iie;
    } catch (CorruptedLogFileException e) {
      LOG.warn("Could not parse, corrupted log file " + logPath, e);
      ZKSplitLog.markCorrupted(rootDir, logfile.getPath().getName(), fs);
      isCorrupted = true;
    } catch (IOException e) {
      e = RemoteExceptionHandler.checkIOException(e);
      throw e;
    } finally {
      LOG.info("Finishing writing output logs and closing down.");
      if (outputSinkStarted) {
        progress_failed = outputSink.finishWritingAndClose() == null;
      }
      String msg = "Processed " + editsCount + " edits across "
          + outputSink.getNumberOfRecoveredRegions() + " regions; log file=" + logPath
          + " is corrupted = " + isCorrupted + " progress failed = " + progress_failed;
      LOG.info(msg);
      status.markComplete(msg);
    }
    return !progress_failed;
  }
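
  // Overall flow of splitLogFile above: this thread reads entries from the WAL
  // and appends them into EntryBuffers (one in-memory buffer per region); the
  // OutputSink's writer threads drain those buffers, either into per-region
  // recovered.edits files or, in distributedLogReplay mode, directly to the
  // region servers now hosting the regions.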

  /**
   * Completes the work done by splitLogFile by archiving logs
   * <p>
   * It is invoked by SplitLogManager once it knows that one of the
   * SplitLogWorkers has completed the splitLogFile() part. If the master
   * crashes then this function might get called multiple times.
   * <p>
   * @param logfile
   * @param conf
   * @throws IOException
   */
  public static void finishSplitLogFile(String logfile,
      Configuration conf) throws IOException {
    Path rootdir = FSUtils.getRootDir(conf);
    Path oldLogDir = new Path(rootdir, HConstants.HREGION_OLDLOGDIR_NAME);
    Path logPath;
    if (FSUtils.isStartingWithPath(rootdir, logfile)) {
      logPath = new Path(logfile);
    } else {
      logPath = new Path(rootdir, logfile);
    }
    finishSplitLogFile(rootdir, oldLogDir, logPath, conf);
  }

  static void finishSplitLogFile(Path rootdir, Path oldLogDir,
      Path logPath, Configuration conf) throws IOException {
    List<Path> processedLogs = new ArrayList<Path>();
    List<Path> corruptedLogs = new ArrayList<Path>();
    FileSystem fs = rootdir.getFileSystem(conf);
    if (ZKSplitLog.isCorrupted(rootdir, logPath.getName(), fs)) {
      corruptedLogs.add(logPath);
    } else {
      processedLogs.add(logPath);
    }
    archiveLogs(corruptedLogs, processedLogs, oldLogDir, fs, conf);
    Path stagingDir = ZKSplitLog.getSplitLogDir(rootdir, logPath.getName());
    fs.delete(stagingDir, true);
  }

  /**
   * Moves processed logs to oldLogDir after successful processing. Moves
   * corrupted logs (any log that couldn't be successfully parsed) to corruptDir
   * (.corrupt) for later investigation.
   *
   * @param corruptedLogs
   * @param processedLogs
   * @param oldLogDir
   * @param fs
   * @param conf
   * @throws IOException
   */
  private static void archiveLogs(
      final List<Path> corruptedLogs,
      final List<Path> processedLogs, final Path oldLogDir,
      final FileSystem fs, final Configuration conf) throws IOException {
    final Path corruptDir = new Path(FSUtils.getRootDir(conf), conf.get(
        "hbase.regionserver.hlog.splitlog.corrupt.dir", HConstants.CORRUPT_DIR_NAME));

    if (!fs.mkdirs(corruptDir)) {
      LOG.info("Unable to mkdir " + corruptDir);
    }
    fs.mkdirs(oldLogDir);

    // This method can get restarted or called multiple times for archiving
    // the same log files.
    for (Path corrupted : corruptedLogs) {
      Path p = new Path(corruptDir, corrupted.getName());
      if (fs.exists(corrupted)) {
        if (!fs.rename(corrupted, p)) {
          LOG.warn("Unable to move corrupted log " + corrupted + " to " + p);
        } else {
          LOG.warn("Moved corrupted log " + corrupted + " to " + p);
        }
      }
    }

    for (Path p : processedLogs) {
      Path newPath = FSHLog.getHLogArchivePath(oldLogDir, p);
      if (fs.exists(p)) {
        if (!FSUtils.renameAndSetModifyTime(fs, p, newPath)) {
          LOG.warn("Unable to move " + p + " to " + newPath);
        } else {
          LOG.debug("Archived processed log " + p + " to " + newPath);
        }
      }
    }
  }

  /**
   * Path to a file under RECOVERED_EDITS_DIR directory of the region found in
   * <code>logEntry</code> named for the sequenceid in the passed
   * <code>logEntry</code>: e.g. /hbase/some_table/2323432434/recovered.edits/2332.
   * This method also ensures the existence of RECOVERED_EDITS_DIR under the region,
   * creating it if necessary.
   * @param fs
   * @param logEntry
   * @param rootDir HBase root dir.
   * @param isCreate if true, create the recovered-edits directory if it is missing.
   * @return Path to file into which to dump split log edits.
   * @throws IOException
   */
  @SuppressWarnings("deprecation")
  static Path getRegionSplitEditsPath(final FileSystem fs,
      final Entry logEntry, final Path rootDir, boolean isCreate)
  throws IOException {
    Path tableDir = FSUtils.getTableDir(rootDir, logEntry.getKey().getTablename());
    String encodedRegionName = Bytes.toString(logEntry.getKey().getEncodedRegionName());
    Path regiondir = HRegion.getRegionDir(tableDir, encodedRegionName);
    Path dir = HLogUtil.getRegionDirRecoveredEditsDir(regiondir);

    if (!fs.exists(regiondir)) {
      LOG.info("This region's directory doesn't exist: "
          + regiondir.toString() + ". It is very likely that it was" +
          " already split so it's safe to discard those edits.");
      return null;
    }
    if (fs.exists(dir) && fs.isFile(dir)) {
      Path tmp = new Path("/tmp");
      if (!fs.exists(tmp)) {
        fs.mkdirs(tmp);
      }
      tmp = new Path(tmp,
        HConstants.RECOVERED_EDITS_DIR + "_" + encodedRegionName);
      LOG.warn("Found existing old file: " + dir + ". It could be some "
        + "leftover of an old installation. It should be a folder instead. "
        + "So moving it to " + tmp);
      if (!fs.rename(dir, tmp)) {
        LOG.warn("Failed to sideline old file " + dir);
      }
    }

    if (isCreate && !fs.exists(dir)) {
      if (!fs.mkdirs(dir)) LOG.warn("mkdir failed on " + dir);
    }
    // The file name ends with RECOVERED_LOG_TMPFILE_SUFFIX to ensure the
    // region's replayRecoveredEdits will not delete it.
    String fileName = formatRecoveredEditsFileName(logEntry.getKey().getLogSeqNum());
    fileName = getTmpRecoveredEditsFileName(fileName);
    return new Path(dir, fileName);
  }

  static String getTmpRecoveredEditsFileName(String fileName) {
    return fileName + HLog.RECOVERED_LOG_TMPFILE_SUFFIX;
  }

  /**
   * Get the completed recovered-edits file path, renaming the file so it is named
   * for the last edit in the file rather than its first. The name can then be used
   * to skip recovered edits when doing {@link HRegion#replayRecoveredEditsIfAny}.
   * @param srcPath
   * @param maximumEditLogSeqNum
   * @return dstPath, which takes the file's last edit log seq num as the name
   */
  static Path getCompletedRecoveredEditsFilePath(Path srcPath,
      Long maximumEditLogSeqNum) {
    String fileName = formatRecoveredEditsFileName(maximumEditLogSeqNum);
    return new Path(srcPath.getParent(), fileName);
  }

  static String formatRecoveredEditsFileName(final long seqid) {
    return String.format("%019d", seqid);
  }
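
  // For example, formatRecoveredEditsFileName(2332) returns the zero-padded,
  // 19-digit name "0000000000000002332"; the fixed width makes recovered-edits
  // file names sort lexicographically in sequence-id order.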

  /**
   * Create a new {@link Reader} for reading logs to split.
   *
   * @param fs
   * @param file
   * @param conf
   * @return A new Reader instance
   * @throws IOException
   * @throws CorruptedLogFileException
   */
  protected Reader getReader(FileSystem fs, FileStatus file, Configuration conf,
      boolean skipErrors, CancelableProgressable reporter)
      throws IOException, CorruptedLogFileException {
    Path path = file.getPath();
    long length = file.getLen();
    Reader in;

    // Check for possibly empty file. With appends, currently Hadoop reports a
    // zero length even if the file has been sync'd. Revisit if HDFS-376 or
    // HDFS-878 is committed.
    if (length <= 0) {
      LOG.warn("File " + path + " might still be open, length is 0");
    }

    try {
      FSUtils.getInstance(fs, conf).recoverFileLease(fs, path, conf, reporter);
      try {
        in = getReader(fs, path, conf, reporter);
      } catch (EOFException e) {
        if (length <= 0) {
          // TODO should we ignore an empty, not-last log file if skip.errors
          // is false? Either way, the caller should decide what to do. E.g.
          // ignore if this is the last log in sequence.
          // TODO is this scenario still possible if the log has been
          // recovered (i.e. closed)?
          LOG.warn("Could not open " + path + " for reading. File is empty", e);
          return null;
        } else {
          // EOFException being ignored
          return null;
        }
      }
    } catch (IOException e) {
      if (e instanceof FileNotFoundException) {
        // A wal file may not exist anymore. Nothing can be recovered so move on
        LOG.warn("File " + path + " doesn't exist anymore.", e);
        return null;
      }
      if (!skipErrors || e instanceof InterruptedIOException) {
        throw e; // Don't mark the file corrupted if interrupted, or not skipErrors
      }
      CorruptedLogFileException t =
        new CorruptedLogFileException("skipErrors=true Could not open hlog " +
            path + " ignoring");
      t.initCause(e);
      throw t;
    }
    return in;
  }

  private static Entry getNextLogLine(Reader in, Path path, boolean skipErrors)
  throws CorruptedLogFileException, IOException {
    try {
      return in.next();
    } catch (EOFException eof) {
      // truncated files are expected if a RS crashes (see HBASE-2643)
      LOG.info("EOF from hlog " + path + ".  continuing");
      return null;
    } catch (IOException e) {
      // If the IOE resulted from bad file format,
      // then this problem is idempotent and retrying won't help
      if (e.getCause() != null &&
          (e.getCause() instanceof ParseException ||
           e.getCause() instanceof org.apache.hadoop.fs.ChecksumException)) {
        LOG.warn("Parse exception " + e.getCause().toString() + " from hlog "
           + path + ".  continuing");
        return null;
      }
      if (!skipErrors) {
        throw e;
      }
      CorruptedLogFileException t =
        new CorruptedLogFileException("skipErrors=true Ignoring exception" +
            " while parsing hlog " + path + ". Marking as corrupted");
      t.initCause(e);
      throw t;
    }
  }

  private void writerThreadError(Throwable t) {
    thrown.compareAndSet(null, t);
  }

  /**
   * Check for errors in the writer threads. If any is found, rethrow it.
   */
  private void checkForErrors() throws IOException {
    Throwable thrown = this.thrown.get();
    if (thrown == null) return;
    if (thrown instanceof IOException) {
      throw new IOException(thrown);
    } else {
      throw new RuntimeException(thrown);
    }
  }

  /**
   * Create a new {@link Writer} for writing log splits.
   */
  protected Writer createWriter(FileSystem fs, Path logfile, Configuration conf)
      throws IOException {
    return HLogFactory.createWriter(fs, logfile, conf);
  }

  /**
   * Create a new {@link Reader} for reading logs to split.
   */
  protected Reader getReader(FileSystem fs, Path curLogFile,
      Configuration conf, CancelableProgressable reporter) throws IOException {
    return HLogFactory.createReader(fs, curLogFile, conf, reporter);
  }

  /**
   * @return the number of currently open writers
   */
  private int getNumOpenWriters() {
    int result = 0;
    if (this.outputSink != null) {
      result += this.outputSink.getNumOpenWriters();
    }
    return result;
  }

  /**
   * Class which accumulates edits and separates them into a buffer per region
   * while simultaneously accounting for RAM usage. Blocks if the RAM usage crosses
   * a predefined threshold.
   *
   * Writer threads then pull region-specific buffers from this class.
   */
  class EntryBuffers {
    Map<byte[], RegionEntryBuffer> buffers =
      new TreeMap<byte[], RegionEntryBuffer>(Bytes.BYTES_COMPARATOR);

    /* Track which regions are currently in the middle of writing. We don't allow
       an IO thread to pick up bytes from a region if we're already writing
       data for that region in a different IO thread. */
    Set<byte[]> currentlyWriting = new TreeSet<byte[]>(Bytes.BYTES_COMPARATOR);

    long totalBuffered = 0;
    long maxHeapUsage;

    EntryBuffers(long maxHeapUsage) {
      this.maxHeapUsage = maxHeapUsage;
    }

    /**
     * Append a log entry into the corresponding region buffer.
     * Blocks if the total heap usage has crossed the specified threshold.
     *
     * @throws InterruptedException
     * @throws IOException
     */
    void appendEntry(Entry entry) throws InterruptedException, IOException {
      HLogKey key = entry.getKey();

      RegionEntryBuffer buffer;
      long incrHeap;
      synchronized (this) {
        buffer = buffers.get(key.getEncodedRegionName());
        if (buffer == null) {
          buffer = new RegionEntryBuffer(key.getTablename(), key.getEncodedRegionName());
          buffers.put(key.getEncodedRegionName(), buffer);
        }
        incrHeap = buffer.appendEntry(entry);
      }

      // If we crossed the chunk threshold, wait for more space to be available
      synchronized (dataAvailable) {
        totalBuffered += incrHeap;
        while (totalBuffered > maxHeapUsage && thrown.get() == null) {
          LOG.debug("Used " + totalBuffered + " bytes of buffered edits, waiting for IO threads...");
          dataAvailable.wait(2000);
        }
        dataAvailable.notifyAll();
      }
      checkForErrors();
    }

    /**
     * @return RegionEntryBuffer a buffer of edits to be written or replayed.
     */
    synchronized RegionEntryBuffer getChunkToWrite() {
      long biggestSize = 0;
      byte[] biggestBufferKey = null;

      for (Map.Entry<byte[], RegionEntryBuffer> entry : buffers.entrySet()) {
        long size = entry.getValue().heapSize();
        if (size > biggestSize && (!currentlyWriting.contains(entry.getKey()))) {
          biggestSize = size;
          biggestBufferKey = entry.getKey();
        }
      }
      if (biggestBufferKey == null) {
        return null;
      }

      RegionEntryBuffer buffer = buffers.remove(biggestBufferKey);
      currentlyWriting.add(biggestBufferKey);
      return buffer;
    }

    void doneWriting(RegionEntryBuffer buffer) {
      synchronized (this) {
        boolean removed = currentlyWriting.remove(buffer.encodedRegionName);
        assert removed;
      }
      long size = buffer.heapSize();

      synchronized (dataAvailable) {
        totalBuffered -= size;
        // We may unblock writers
        dataAvailable.notifyAll();
      }
    }

    synchronized boolean isRegionCurrentlyWriting(byte[] region) {
      return currentlyWriting.contains(region);
    }
  }
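
  // EntryBuffers hand-off in brief: the split thread calls appendEntry() and
  // blocks once totalBuffered exceeds maxHeapUsage; each WriterThread claims the
  // largest buffer not already being written via getChunkToWrite(), and
  // doneWriting() releases the claim and wakes any blocked appender.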

  /**
   * A buffer of some number of edits for a given region.
   * This accumulates edits and also provides a memory optimization in order to
   * share a single byte array instance for the table and region name.
   * Also tracks memory usage of the accumulated edits.
   */
  static class RegionEntryBuffer implements HeapSize {
    long heapInBuffer = 0;
    List<Entry> entryBuffer;
    TableName tableName;
    byte[] encodedRegionName;

    RegionEntryBuffer(TableName tableName, byte[] region) {
      this.tableName = tableName;
      this.encodedRegionName = region;
      this.entryBuffer = new LinkedList<Entry>();
    }

    long appendEntry(Entry entry) {
      internify(entry);
      entryBuffer.add(entry);
      long incrHeap = entry.getEdit().heapSize() +
        ClassSize.align(2 * ClassSize.REFERENCE) + // HLogKey pointers
        0; // TODO linkedlist entry
      heapInBuffer += incrHeap;
      return incrHeap;
    }

    private void internify(Entry entry) {
      HLogKey k = entry.getKey();
      k.internTableName(this.tableName);
      k.internEncodedRegionName(this.encodedRegionName);
    }

    public long heapSize() {
      return heapInBuffer;
    }
  }

  class WriterThread extends Thread {
    private volatile boolean shouldStop = false;
    private OutputSink outputSink = null;

    WriterThread(OutputSink sink, int i) {
      super("WriterThread-" + i);
      outputSink = sink;
    }

    public void run() {
      try {
        doRun();
      } catch (Throwable t) {
        LOG.error("Exiting thread", t);
        writerThreadError(t);
      }
    }

    private void doRun() throws IOException {
      LOG.debug("Writer thread " + this + ": starting");
      while (true) {
        RegionEntryBuffer buffer = entryBuffers.getChunkToWrite();
        if (buffer == null) {
          // No data currently available, wait on some more to show up
          synchronized (dataAvailable) {
            if (shouldStop && !this.outputSink.flush()) {
              return;
            }
            try {
              dataAvailable.wait(500);
            } catch (InterruptedException ie) {
              if (!shouldStop) {
                throw new RuntimeException(ie);
              }
            }
          }
          continue;
        }

        assert buffer != null;
        try {
          writeBuffer(buffer);
        } finally {
          entryBuffers.doneWriting(buffer);
        }
      }
    }

    private void writeBuffer(RegionEntryBuffer buffer) throws IOException {
      outputSink.append(buffer);
    }

    void finish() {
      synchronized (dataAvailable) {
        shouldStop = true;
        dataAvailable.notifyAll();
      }
    }
  }

  /**
   * An abstraction that provides a common interface for both the existing
   * recovered-edits file sink and the region server WAL edits replay sink.
   */
  abstract class OutputSink {

    protected Map<byte[], SinkWriter> writers = Collections
        .synchronizedMap(new TreeMap<byte[], SinkWriter>(Bytes.BYTES_COMPARATOR));

    protected final Map<byte[], Long> regionMaximumEditLogSeqNum = Collections
        .synchronizedMap(new TreeMap<byte[], Long>(Bytes.BYTES_COMPARATOR));

    protected final List<WriterThread> writerThreads = Lists.newArrayList();

    /* Set of regions which we've decided should not output edits */
    protected final Set<byte[]> blacklistedRegions = Collections
        .synchronizedSet(new TreeSet<byte[]>(Bytes.BYTES_COMPARATOR));

    protected boolean closeAndCleanCompleted = false;

    protected boolean writersClosed = false;

    protected final int numThreads;

    protected CancelableProgressable reporter = null;

    protected AtomicLong skippedEdits = new AtomicLong();

    protected List<Path> splits = null;

    public OutputSink(int numWriters) {
      numThreads = numWriters;
    }

    void setReporter(CancelableProgressable reporter) {
      this.reporter = reporter;
    }

    /**
     * Start the threads that will pump data from the entryBuffers to the output files.
     */
    synchronized void startWriterThreads() {
      for (int i = 0; i < numThreads; i++) {
        WriterThread t = new WriterThread(this, i);
        t.start();
        writerThreads.add(t);
      }
    }

    /**
     * Update the region's maximum edit log sequence number.
     */
    void updateRegionMaximumEditLogSeqNum(Entry entry) {
      synchronized (regionMaximumEditLogSeqNum) {
        Long currentMaxSeqNum = regionMaximumEditLogSeqNum.get(entry.getKey()
            .getEncodedRegionName());
        if (currentMaxSeqNum == null || entry.getKey().getLogSeqNum() > currentMaxSeqNum) {
          regionMaximumEditLogSeqNum.put(entry.getKey().getEncodedRegionName(), entry.getKey()
              .getLogSeqNum());
        }
      }
    }

    Long getRegionMaximumEditLogSeqNum(byte[] region) {
      return regionMaximumEditLogSeqNum.get(region);
    }

    /**
     * @return the number of currently opened writers
     */
    int getNumOpenWriters() {
      return this.writers.size();
    }

    long getSkippedEdits() {
      return this.skippedEdits.get();
    }

    /**
     * Wait for writer threads to dump all info to the sink
     * @return true when there is no error
     * @throws IOException
     */
    protected boolean finishWriting() throws IOException {
      LOG.info("Waiting for split writer threads to finish");
      boolean progress_failed = false;
      for (WriterThread t : writerThreads) {
        t.finish();
      }
      for (WriterThread t : writerThreads) {
        if (!progress_failed && reporter != null && !reporter.progress()) {
          progress_failed = true;
        }
        try {
          t.join();
        } catch (InterruptedException ie) {
          IOException iie = new InterruptedIOException();
          iie.initCause(ie);
          throw iie;
        }
      }
      checkForErrors();
      LOG.info("Split writers finished");
      return (!progress_failed);
    }

    abstract List<Path> finishWritingAndClose() throws IOException;

    /**
     * @return a map from encoded region ID to the number of edits written out for that region.
     */
    abstract Map<byte[], Long> getOutputCounts();

    /**
     * @return number of regions we've recovered
     */
    abstract int getNumberOfRecoveredRegions();

    /**
     * @param buffer A WAL Edit Entry
     * @throws IOException
     */
    abstract void append(RegionEntryBuffer buffer) throws IOException;

    /**
     * WriterThread calls this method to flush any edits still buffered inside
     * the sink before closing.
     * @return true when the underlying sink has something to flush
     */
    protected boolean flush() throws IOException {
      return false;
    }
  }
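
  // Two concrete sinks follow. LogRecoveredEditsOutputSink writes per-region
  // recovered.edits files for regions to replay when they next open;
  // LogReplayOutputSink (used when distributedLogReplay is enabled and a
  // ZooKeeperWatcher is available) ships edits directly to the region servers
  // now hosting the recovering regions.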

  /**
   * Class that manages the output streams from the log splitting process.
   */
  class LogRecoveredEditsOutputSink extends OutputSink {

    public LogRecoveredEditsOutputSink(int numWriters) {
      // More threads could potentially write faster at the expense
      // of causing more disk seeks as the logs are split.
      // After a certain point (probably around 3 threads) the
      // process will be bound on the reader in the current
      // implementation anyway.
      super(numWriters);
    }

    /**
     * @return null if failed to report progress
     * @throws IOException
     */
    @Override
    List<Path> finishWritingAndClose() throws IOException {
      boolean isSuccessful = false;
      List<Path> result = null;
      try {
        isSuccessful = finishWriting();
      } finally {
        result = close();
        List<IOException> thrown = closeLogWriters(null);
        if (thrown != null && !thrown.isEmpty()) {
          throw MultipleIOException.createIOException(thrown);
        }
      }
      if (isSuccessful) {
        splits = result;
      }
      return splits;
    }

    /**
     * Close all of the output streams.
     * @return the list of paths written.
     */
    private List<Path> close() throws IOException {
      Preconditions.checkState(!closeAndCleanCompleted);

      final List<Path> paths = new ArrayList<Path>();
      final List<IOException> thrown = Lists.newArrayList();
      ThreadPoolExecutor closeThreadPool = Threads.getBoundedCachedThreadPool(numThreads, 30L,
        TimeUnit.SECONDS, new ThreadFactory() {
          private int count = 1;

          public Thread newThread(Runnable r) {
            Thread t = new Thread(r, "split-log-closeStream-" + count++);
            return t;
          }
        });
      CompletionService<Void> completionService =
        new ExecutorCompletionService<Void>(closeThreadPool);
      for (final Map.Entry<byte[], ? extends SinkWriter> writersEntry : writers.entrySet()) {
        LOG.debug("Submitting close of " + ((WriterAndPath)writersEntry.getValue()).p);
        completionService.submit(new Callable<Void>() {
          public Void call() throws Exception {
            WriterAndPath wap = (WriterAndPath) writersEntry.getValue();
            LOG.debug("Closing " + wap.p);
            try {
              wap.w.close();
            } catch (IOException ioe) {
              LOG.error("Couldn't close log at " + wap.p, ioe);
              thrown.add(ioe);
              return null;
            }
            LOG.info("Closed wap " + wap.p + " (wrote " + wap.editsWritten + " edits in "
                + (wap.nanosSpent / 1000 / 1000) + "ms)");

            if (wap.editsWritten == 0) {
              // just remove the empty recovered.edits file
              if (fs.exists(wap.p) && !fs.delete(wap.p, false)) {
                LOG.warn("Failed deleting empty " + wap.p);
                throw new IOException("Failed deleting empty " + wap.p);
              }
              return null;
            }

            Path dst = getCompletedRecoveredEditsFilePath(wap.p,
              regionMaximumEditLogSeqNum.get(writersEntry.getKey()));
            try {
              if (!dst.equals(wap.p) && fs.exists(dst)) {
                LOG.warn("Found existing old edits file. It could be the "
                    + "result of a previous failed split attempt. Deleting " + dst + ", length="
                    + fs.getFileStatus(dst).getLen());
                if (!fs.delete(dst, false)) {
                  LOG.warn("Failed deleting of old " + dst);
                  throw new IOException("Failed deleting of old " + dst);
                }
              }
              // Skip the unit tests which create a splitter that reads and
              // writes the data without touching disk.
              // TestHLogSplit#testThreading is an example.
              if (fs.exists(wap.p)) {
                if (!fs.rename(wap.p, dst)) {
                  throw new IOException("Failed renaming " + wap.p + " to " + dst);
                }
                LOG.debug("Rename " + wap.p + " to " + dst);
              }
            } catch (IOException ioe) {
              LOG.error("Couldn't rename " + wap.p + " to " + dst, ioe);
              thrown.add(ioe);
              return null;
            }
            paths.add(dst);
            return null;
          }
        });
      }

      boolean progress_failed = false;
      try {
        for (int i = 0, n = this.writers.size(); i < n; i++) {
          Future<Void> future = completionService.take();
          future.get();
          if (!progress_failed && reporter != null && !reporter.progress()) {
            progress_failed = true;
          }
        }
      } catch (InterruptedException e) {
        IOException iie = new InterruptedIOException();
        iie.initCause(e);
        throw iie;
      } catch (ExecutionException e) {
        throw new IOException(e.getCause());
      } finally {
        closeThreadPool.shutdownNow();
      }

      if (!thrown.isEmpty()) {
        throw MultipleIOException.createIOException(thrown);
      }
      writersClosed = true;
      closeAndCleanCompleted = true;
      if (progress_failed) {
        return null;
      }
      return paths;
    }

    private List<IOException> closeLogWriters(List<IOException> thrown) throws IOException {
      if (writersClosed) {
        return thrown;
      }

      if (thrown == null) {
        thrown = Lists.newArrayList();
      }
      try {
        for (WriterThread t : writerThreads) {
          while (t.isAlive()) {
            t.shouldStop = true;
            t.interrupt();
            try {
              t.join(10);
            } catch (InterruptedException e) {
              IOException iie = new InterruptedIOException();
              iie.initCause(e);
              throw iie;
            }
          }
        }
      } finally {
        synchronized (writers) {
          WriterAndPath wap = null;
          for (SinkWriter tmpWAP : writers.values()) {
            try {
              wap = (WriterAndPath) tmpWAP;
              wap.w.close();
            } catch (IOException ioe) {
              LOG.error("Couldn't close log at " + wap.p, ioe);
              thrown.add(ioe);
              continue;
            }
            LOG.info("Closed log " + wap.p + " (wrote " + wap.editsWritten + " edits in "
                + (wap.nanosSpent / 1000 / 1000) + "ms)");
          }
        }
        writersClosed = true;
      }

      return thrown;
    }

    /**
     * Get a writer and path for a log starting at the given entry. This function is threadsafe so
     * long as multiple threads are always acting on different regions.
     * @return null if this region shouldn't output any logs
     */
    private WriterAndPath getWriterAndPath(Entry entry) throws IOException {
      byte[] region = entry.getKey().getEncodedRegionName();
      WriterAndPath ret = (WriterAndPath) writers.get(region);
      if (ret != null) {
        return ret;
      }
      // If we already decided that this region doesn't get any output
      // we don't need to check again.
      if (blacklistedRegions.contains(region)) {
        return null;
      }
      ret = createWAP(region, entry, rootDir, fs, conf);
      if (ret == null) {
        blacklistedRegions.add(region);
        return null;
      }
      writers.put(region, ret);
      return ret;
    }

    private WriterAndPath createWAP(byte[] region, Entry entry, Path rootdir, FileSystem fs,
        Configuration conf) throws IOException {
      Path regionedits = getRegionSplitEditsPath(fs, entry, rootdir, true);
      if (regionedits == null) {
        return null;
      }
      if (fs.exists(regionedits)) {
        LOG.warn("Found old edits file. It could be the "
            + "result of a previous failed split attempt. Deleting " + regionedits + ", length="
            + fs.getFileStatus(regionedits).getLen());
        if (!fs.delete(regionedits, false)) {
          LOG.warn("Failed delete of old " + regionedits);
        }
      }
      Writer w = createWriter(fs, regionedits, conf);
      LOG.debug("Creating writer path=" + regionedits + " region=" + Bytes.toStringBinary(region));
      return (new WriterAndPath(regionedits, w));
    }

    @Override
    void append(RegionEntryBuffer buffer) throws IOException {
      List<Entry> entries = buffer.entryBuffer;
      if (entries.isEmpty()) {
        LOG.warn("got an empty buffer, skipping");
        return;
      }

      WriterAndPath wap = null;

      long startTime = System.nanoTime();
      try {
        int editsCount = 0;

        for (Entry logEntry : entries) {
          if (wap == null) {
            wap = getWriterAndPath(logEntry);
            if (wap == null) {
              // getWriterAndPath decided we don't need to write these edits
              return;
            }
          }
          wap.w.append(logEntry);
          this.updateRegionMaximumEditLogSeqNum(logEntry);
          editsCount++;
        }
        // Pass along summary statistics
        wap.incrementEdits(editsCount);
        wap.incrementNanoTime(System.nanoTime() - startTime);
      } catch (IOException e) {
        e = RemoteExceptionHandler.checkIOException(e);
        LOG.fatal("Got while writing log entry to log", e);
        throw e;
      }
    }

    /**
     * @return a map from encoded region ID to the number of edits written out for that region.
     */
    @Override
    Map<byte[], Long> getOutputCounts() {
      TreeMap<byte[], Long> ret = new TreeMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
      synchronized (writers) {
        for (Map.Entry<byte[], ? extends SinkWriter> entry : writers.entrySet()) {
          ret.put(entry.getKey(), entry.getValue().editsWritten);
        }
      }
      return ret;
    }

    @Override
    int getNumberOfRecoveredRegions() {
      return writers.size();
    }
  }

  /**
   * Wraps the actual writer that writes data out, plus related statistics.
   */
  private abstract static class SinkWriter {
    /* Count of edits written to this path */
    long editsWritten = 0;
    /* Number of nanos spent writing to this log */
    long nanosSpent = 0;

    void incrementEdits(int edits) {
      editsWritten += edits;
    }

    void incrementNanoTime(long nanos) {
      nanosSpent += nanos;
    }
  }

  /**
   * Private data structure that wraps a Writer and its Path, also collecting statistics about the
   * data written to this output.
   */
  private static final class WriterAndPath extends SinkWriter {
    final Path p;
    final Writer w;

    WriterAndPath(final Path p, final Writer w) {
      this.p = p;
      this.w = w;
    }
  }

  /**
   * Class that manages replaying edits from WAL files directly to the region
   * servers where the failed server's regions have been reassigned.
   */
  class LogReplayOutputSink extends OutputSink {
    private static final double BUFFER_THRESHOLD = 0.35;
    private static final String KEY_DELIMITER = "#";

    private long waitRegionOnlineTimeOut;
    private final Set<String> recoveredRegions = Collections.synchronizedSet(new HashSet<String>());
    private final Map<String, RegionServerWriter> writers =
        new ConcurrentHashMap<String, RegionServerWriter>();
    // online encoded region name -> region location map
    private final Map<String, HRegionLocation> onlineRegions =
        new ConcurrentHashMap<String, HRegionLocation>();

    private Map<TableName, HConnection> tableNameToHConnectionMap = Collections
        .synchronizedMap(new TreeMap<TableName, HConnection>());
    /**
     * Map key -> value layout
     * <servername>:<table name> -> Queue<Row>
     */
    private Map<String, List<Pair<HRegionLocation, Row>>> serverToBufferQueueMap =
        new ConcurrentHashMap<String, List<Pair<HRegionLocation, Row>>>();
    private List<Throwable> thrown = new ArrayList<Throwable>();

    // The following sink is used in distributedLogReplay mode for entries of regions in a
    // disabling table. It's a limitation of distributedLogReplay: log replay needs a region
    // to be assigned and online before it can replay WAL edits, while regions of a
    // disabling/disabled table won't be assigned by the AM. We can retire this code after
    // HBASE-8234.
    private LogRecoveredEditsOutputSink logRecoveredEditsOutputSink;
    private boolean hasEditsInDisablingOrDisabledTables = false;

    public LogReplayOutputSink(int numWriters) {
      super(numWriters);
      this.waitRegionOnlineTimeOut = conf.getInt("hbase.splitlog.manager.timeout",
        SplitLogManager.DEFAULT_TIMEOUT);
      this.logRecoveredEditsOutputSink = new LogRecoveredEditsOutputSink(numWriters);
      this.logRecoveredEditsOutputSink.setReporter(reporter);
    }
1340 
    void append(RegionEntryBuffer buffer) throws IOException {
      List<Entry> entries = buffer.entryBuffer;
      if (entries.isEmpty()) {
        LOG.warn("Got an empty buffer, skipping");
        return;
      }

      // check if the current region is in a disabling or disabled table
      if (disablingOrDisabledTables.contains(buffer.tableName)) {
        // fall back to the old way of writing recovered edits
        logRecoveredEditsOutputSink.append(buffer);
        hasEditsInDisablingOrDisabledTables = true;
        // store regions we have recovered so far
        addToRecoveredRegions(Bytes.toString(buffer.encodedRegionName));
        return;
      }

      // group entries by region server
      groupEditsByServer(entries);

      // process work items: ship the largest per-server queue once it holds at least
      // minBatchSize entries or overall buffered data approaches the heap limit;
      // otherwise keep buffering so each RPC carries a bigger batch
      String maxLocKey = null;
      int maxSize = 0;
      List<Pair<HRegionLocation, Row>> maxQueue = null;
      synchronized (this.serverToBufferQueueMap) {
        for (String key : this.serverToBufferQueueMap.keySet()) {
          List<Pair<HRegionLocation, Row>> curQueue = this.serverToBufferQueueMap.get(key);
          if (curQueue.size() > maxSize) {
            maxSize = curQueue.size();
            maxQueue = curQueue;
            maxLocKey = key;
          }
        }
        if (maxSize < minBatchSize
            && entryBuffers.totalBuffered < BUFFER_THRESHOLD * entryBuffers.maxHeapUsage) {
          // buffer more to process
          return;
        } else if (maxSize > 0) {
          this.serverToBufferQueueMap.remove(maxLocKey);
        }
      }

      if (maxSize > 0) {
        processWorkItems(maxLocKey, maxQueue);
      }
    }

    private void addToRecoveredRegions(String encodedRegionName) {
      if (!recoveredRegions.contains(encodedRegionName)) {
        recoveredRegions.add(encodedRegionName);
      }
    }

    /**
     * Helper function that groups WAL entries by their destination region servers, skipping
     * edits of deleted tables and edits already covered by a flush.
     * @param entries WAL entries of a single region, in sequence order
     * @throws IOException on connection or region location failure
     */
    private void groupEditsByServer(List<Entry> entries) throws IOException {
      Set<TableName> nonExistentTables = null;
      Long cachedLastFlushedSequenceId = -1L;
      for (HLog.Entry entry : entries) {
        WALEdit edit = entry.getEdit();
        TableName table = entry.getKey().getTablename();
        String encodeRegionNameStr = Bytes.toString(entry.getKey().getEncodedRegionName());
        // skip edits of non-existent tables
        if (nonExistentTables != null && nonExistentTables.contains(table)) {
          this.skippedEdits.incrementAndGet();
          continue;
        }

        Map<byte[], Long> maxStoreSequenceIds = null;
        boolean needSkip = false;
        Put put = null;
        Delete del = null;
        KeyValue lastKV = null;
        HRegionLocation loc = null;
        Row preRow = null;
        HRegionLocation preLoc = null;
        Row lastAddedRow = null; // not strictly needed here; kept to be conservative
        String preKey = null;
        List<KeyValue> kvs = edit.getKeyValues();
        HConnection hconn = this.getConnectionByTableName(table);

        for (KeyValue kv : kvs) {
          // filter out HLog meta entries
          // We don't handle HBASE-2231 because we may or may not replay a compaction event.
          // Details at https://issues.apache.org/jira/browse/HBASE-2231?focusedCommentId=13647143&
          // page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-13647143
          if (kv.matchingFamily(WALEdit.METAFAMILY)) continue;

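          // start a new mutation whenever the row or the mutation type changes; the previous
          // mutation (preRow) is first queued for its destination server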
          if (lastKV == null || lastKV.getType() != kv.getType() || !lastKV.matchingRow(kv)) {
            if (preRow != null) {
              synchronized (serverToBufferQueueMap) {
                List<Pair<HRegionLocation, Row>> queue = serverToBufferQueueMap.get(preKey);
                if (queue == null) {
                  queue = Collections.synchronizedList(new ArrayList<Pair<HRegionLocation, Row>>());
                  serverToBufferQueueMap.put(preKey, queue);
                }
                queue.add(new Pair<HRegionLocation, Row>(preLoc, preRow));
                lastAddedRow = preRow;
              }
              // store regions we have recovered so far
              addToRecoveredRegions(preLoc.getRegionInfo().getEncodedName());
            }

            try {
              loc = locateRegionAndRefreshLastFlushedSequenceId(hconn, table, kv.getRow(),
                encodeRegionNameStr);
            } catch (TableNotFoundException ex) {
              // the table has been deleted, so skip its edits
              LOG.info("Table " + table
                  + " doesn't exist. Skip log replay for region " + encodeRegionNameStr);
              lastFlushedSequenceIds.put(encodeRegionNameStr, Long.MAX_VALUE);
              if (nonExistentTables == null) {
                nonExistentTables = new TreeSet<TableName>();
              }
              nonExistentTables.add(table);
              this.skippedEdits.incrementAndGet();
              needSkip = true;
              break;
            }

            cachedLastFlushedSequenceId =
                lastFlushedSequenceIds.get(loc.getRegionInfo().getEncodedName());
            if (cachedLastFlushedSequenceId != null
                && cachedLastFlushedSequenceId >= entry.getKey().getLogSeqNum()) {
              // skip the whole HLog entry
              this.skippedEdits.incrementAndGet();
              needSkip = true;
              break;
            } else {
              if (maxStoreSequenceIds == null) {
                maxStoreSequenceIds =
                    regionMaxSeqIdInStores.get(loc.getRegionInfo().getEncodedName());
              }
              if (maxStoreSequenceIds != null) {
                Long maxStoreSeqId = maxStoreSequenceIds.get(kv.getFamily());
                if (maxStoreSeqId == null || maxStoreSeqId >= entry.getKey().getLogSeqNum()) {
                  // skip the current kv if its column family no longer exists or was already flushed
                  continue;
                }
              }
            }

            if (kv.isDelete()) {
              del = new Delete(kv.getRow());
              del.setClusterId(entry.getKey().getClusterId());
              preRow = del;
            } else {
              put = new Put(kv.getRow());
              put.setClusterId(entry.getKey().getClusterId());
              preRow = put;
            }
            preKey = loc.getHostnamePort() + KEY_DELIMITER + table;
            preLoc = loc;
          }
          if (kv.isDelete()) {
            del.addDeleteMarker(kv);
          } else {
            put.add(kv);
          }
          lastKV = kv;
        }

        // skip the rest of this WAL entry
        if (needSkip) continue;

        // add the last mutation of the entry
        if (preRow != null && lastAddedRow != preRow) {
          synchronized (serverToBufferQueueMap) {
            List<Pair<HRegionLocation, Row>> queue = serverToBufferQueueMap.get(preKey);
            if (queue == null) {
              queue = Collections.synchronizedList(new ArrayList<Pair<HRegionLocation, Row>>());
              serverToBufferQueueMap.put(preKey, queue);
            }
            queue.add(new Pair<HRegionLocation, Row>(preLoc, preRow));
          }
          // store regions we have recovered so far
          addToRecoveredRegions(preLoc.getRegionInfo().getEncodedName());
        }
      }
    }

    /**
     * Locate the destination region based on table name and row; also makes sure the
     * destination region is online for replay.
     * @param hconn connection of the table the edit belongs to
     * @param table table of the edit
     * @param row a row of the original region
     * @param originalEncodedRegionName encoded name of the region the edit was written against
     * @return the location of the region currently serving the row
     * @throws IOException when the region cannot be located or does not come online in time
     */
    private HRegionLocation locateRegionAndRefreshLastFlushedSequenceId(HConnection hconn,
        TableName table, byte[] row, String originalEncodedRegionName) throws IOException {
      // fetch the location from the cache first
      HRegionLocation loc = onlineRegions.get(originalEncodedRegionName);
      if (loc != null) return loc;
      // fetch the location from .META. directly, bypassing the cache, to avoid hitting a
      // server that is already dead
      loc = hconn.getRegionLocation(table, row, true);
      if (loc == null) {
        throw new IOException("Can't locate location for row:" + Bytes.toString(row)
            + " of table:" + table);
      }
      // check if the current row has moved to a different region due to a region merge/split
      if (!originalEncodedRegionName.equalsIgnoreCase(loc.getRegionInfo().getEncodedName())) {
        // the original region should have already been flushed
        lastFlushedSequenceIds.put(originalEncodedRegionName, Long.MAX_VALUE);
        HRegionLocation tmpLoc = onlineRegions.get(loc.getRegionInfo().getEncodedName());
        if (tmpLoc != null) return tmpLoc;
      }

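      // wait for the destination region to come online and find out whether it is still
      // marked as recovering; if it is not, its edits can be skipped below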
      Long lastFlushedSequenceId = -1L;
      AtomicBoolean isRecovering = new AtomicBoolean(true);
      loc = waitUntilRegionOnline(loc, row, this.waitRegionOnlineTimeOut, isRecovering);
      if (!isRecovering.get()) {
        // the region isn't recovering at all, because the WAL file may contain a region
        // that had been moved elsewhere before the hosting RS failed
        lastFlushedSequenceIds.put(loc.getRegionInfo().getEncodedName(), Long.MAX_VALUE);
        LOG.info("logReplay skip region: " + loc.getRegionInfo().getEncodedName()
            + " because it's not in recovering.");
      } else {
        Long cachedLastFlushedSequenceId =
            lastFlushedSequenceIds.get(loc.getRegionInfo().getEncodedName());

        // retrieve the last flushed sequence id from ZK; the region's postOpenDeployTasks
        // keeps this value up to date
        RegionStoreSequenceIds ids =
            SplitLogManager.getRegionFlushedSequenceId(watcher, failedServerName, loc
                .getRegionInfo().getEncodedName());
        if (ids != null) {
          lastFlushedSequenceId = ids.getLastFlushedSequenceId();
          Map<byte[], Long> storeIds = new TreeMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
          List<StoreSequenceId> maxSeqIdInStores = ids.getStoreSequenceIdList();
          for (StoreSequenceId id : maxSeqIdInStores) {
            storeIds.put(id.getFamilyName().toByteArray(), id.getSequenceId());
          }
          regionMaxSeqIdInStores.put(loc.getRegionInfo().getEncodedName(), storeIds);
        }

        if (cachedLastFlushedSequenceId == null
            || lastFlushedSequenceId > cachedLastFlushedSequenceId) {
          lastFlushedSequenceIds.put(loc.getRegionInfo().getEncodedName(), lastFlushedSequenceId);
        }
      }

      onlineRegions.put(loc.getRegionInfo().getEncodedName(), loc);
      return loc;
    }

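    /**
     * Ships one batch of grouped mutations to its destination region server through the
     * embedded WALEditsReplaySink and records the edit count and elapsed time.
     */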
    private void processWorkItems(String key, List<Pair<HRegionLocation, Row>> actions)
        throws IOException {
      RegionServerWriter rsw = null;

      long startTime = System.nanoTime();
      try {
        rsw = getRegionServerWriter(key);
        rsw.sink.replayEntries(actions);

        // Pass along summary statistics
        rsw.incrementEdits(actions.size());
        rsw.incrementNanoTime(System.nanoTime() - startTime);
      } catch (IOException e) {
        e = RemoteExceptionHandler.checkIOException(e);
        LOG.fatal("Got IOException while writing log entry to a region server", e);
        throw e;
      }
    }

    /**
     * Wait until the region is online on the destination region server.
     * @param loc last known location of the region
     * @param row a row belonging to the region
     * @param timeout how long to wait, in milliseconds
     * @param isRecovering set to the recovering state of the region on the destination server
     * @return the location where the region is online
     * @throws IOException when the wait times out or is interrupted
     */
    private HRegionLocation waitUntilRegionOnline(HRegionLocation loc, byte[] row,
        final long timeout, AtomicBoolean isRecovering)
        throws IOException {
      final long endTime = EnvironmentEdgeManager.currentTimeMillis() + timeout;
      final long pause = conf.getLong(HConstants.HBASE_CLIENT_PAUSE,
        HConstants.DEFAULT_HBASE_CLIENT_PAUSE);
      boolean reloadLocation = false;
      TableName tableName = loc.getRegionInfo().getTableName();
      int tries = 0;
      Throwable cause = null;
      while (endTime > EnvironmentEdgeManager.currentTimeMillis()) {
        try {
          // Try to get the region info from the hosting server.
          HConnection hconn = getConnectionByTableName(tableName);
          if (reloadLocation) {
            loc = hconn.getRegionLocation(tableName, row, true);
          }
          BlockingInterface remoteSvr = hconn.getAdmin(loc.getServerName());
          HRegionInfo region = loc.getRegionInfo();
          try {
            GetRegionInfoRequest request =
                RequestConverter.buildGetRegionInfoRequest(region.getRegionName());
            GetRegionInfoResponse response = remoteSvr.getRegionInfo(null, request);
            if (HRegionInfo.convert(response.getRegionInfo()) != null) {
              isRecovering.set((response.hasIsRecovering()) ? response.getIsRecovering() : true);
              return loc;
            }
          } catch (ServiceException se) {
            throw ProtobufUtil.getRemoteException(se);
          }
        } catch (IOException e) {
          cause = e.getCause();
          if (!(cause instanceof RegionOpeningException)) {
            reloadLocation = true;
          }
        }
        long expectedSleep = ConnectionUtils.getPauseTime(pause, tries);
        try {
          Thread.sleep(expectedSleep);
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
          throw new IOException("Interrupted when waiting for region " +
              loc.getRegionInfo().getEncodedName() + " online.", e);
        }
        tries++;
      }

      throw new IOException("Timeout when waiting for region " + loc.getRegionInfo().getEncodedName()
        + " online for " + timeout + " milliseconds.", cause);
    }

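    /**
     * Flushes one pending per-server queue, if any.
     * @return true when a queue was found and shipped, false when nothing was buffered
     */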
    @Override
    protected boolean flush() throws IOException {
      String curLoc = null;
      int curSize = 0;
      List<Pair<HRegionLocation, Row>> curQueue = null;
      synchronized (this.serverToBufferQueueMap) {
        for (String locationKey : this.serverToBufferQueueMap.keySet()) {
          curQueue = this.serverToBufferQueueMap.get(locationKey);
          if (!curQueue.isEmpty()) {
            curSize = curQueue.size();
            curLoc = locationKey;
            break;
          }
        }
        if (curSize > 0) {
          this.serverToBufferQueueMap.remove(curLoc);
        }
      }

      if (curSize > 0) {
        this.processWorkItems(curLoc, curQueue);
        // make sure we hold the monitor of dataAvailable when notifying waiters;
        // synchronization is reentrant, so this is safe even if the caller already holds it
        synchronized (dataAvailable) {
          dataAvailable.notifyAll();
        }
        return true;
      }
      return false;
    }

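    /** Records an asynchronous writer failure for later rethrow. */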
    void addWriterError(Throwable t) {
      thrown.add(t);
    }

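    /**
     * Waits for writer threads to finish, closes the fallback recovered.edits sink when it was
     * used for disabling/disabled tables, and releases all region server writers and cached
     * connections.
     */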
    @Override
    List<Path> finishWritingAndClose() throws IOException {
      try {
        if (!finishWriting()) {
          return null;
        }
        if (hasEditsInDisablingOrDisabledTables) {
          splits = logRecoveredEditsOutputSink.finishWritingAndClose();
        } else {
          splits = new ArrayList<Path>();
        }
        // return an empty list to keep the interface the same as the old way
        return splits;
      } finally {
        List<IOException> thrown = closeRegionServerWriters();
        if (thrown != null && !thrown.isEmpty()) {
          throw MultipleIOException.createIOException(thrown);
        }
      }
    }

    @Override
    int getNumOpenWriters() {
      return this.writers.size() + this.logRecoveredEditsOutputSink.getNumOpenWriters();
    }

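    /**
     * Stops all writer threads, then closes every region server writer and every cached
     * HConnection, collecting individual close failures instead of failing fast.
     * @return the IOExceptions collected while closing, or null if already closed
     */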
    private List<IOException> closeRegionServerWriters() throws IOException {
      List<IOException> result = null;
      if (!writersClosed) {
        result = Lists.newArrayList();
        try {
          for (WriterThread t : writerThreads) {
            while (t.isAlive()) {
              t.shouldStop = true;
              t.interrupt();
              try {
                t.join(10);
              } catch (InterruptedException e) {
                IOException iie = new InterruptedIOException();
                iie.initCause(e);
                throw iie;
              }
            }
          }
        } finally {
          synchronized (writers) {
            for (String locationKey : writers.keySet()) {
              RegionServerWriter tmpW = writers.get(locationKey);
              try {
                tmpW.close();
              } catch (IOException ioe) {
                LOG.error("Couldn't close writer for region server:" + locationKey, ioe);
                result.add(ioe);
              }
            }
          }

          // close connections
          synchronized (this.tableNameToHConnectionMap) {
            for (TableName tableName : this.tableNameToHConnectionMap.keySet()) {
              HConnection hconn = this.tableNameToHConnectionMap.get(tableName);
              try {
                hconn.clearRegionCache();
                hconn.close();
              } catch (IOException ioe) {
                result.add(ioe);
              }
            }
          }
          writersClosed = true;
        }
      }
      return result;
    }

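    /**
     * @return a map from location key (host:port#table) to the number of edits written there
     */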
    Map<byte[], Long> getOutputCounts() {
      TreeMap<byte[], Long> ret = new TreeMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
      synchronized (writers) {
        for (Map.Entry<String, RegionServerWriter> entry : writers.entrySet()) {
          ret.put(Bytes.toBytes(entry.getKey()), entry.getValue().editsWritten);
        }
      }
      return ret;
    }

    @Override
    int getNumberOfRecoveredRegions() {
      return this.recoveredRegions.size();
    }

    /**
     * Get a writer for the given location key, creating it when absent. This function is
     * threadsafe so long as multiple threads are always acting on different regions.
     * @return a writer for the location; never null
     * @throws IOException when the location key does not contain a valid table name
     */
    private RegionServerWriter getRegionServerWriter(String loc) throws IOException {
      RegionServerWriter ret = writers.get(loc);
      if (ret != null) {
        return ret;
      }

      TableName tableName = getTableFromLocationStr(loc);
      if (tableName == null) {
        // a null table name means the location key was malformed; abort the replay
        throw new IOException("Invalid location string:" + loc + " found. Replay aborted.");
      }

      HConnection hconn = getConnectionByTableName(tableName);
      synchronized (writers) {
        ret = writers.get(loc);
        if (ret == null) {
          ret = new RegionServerWriter(conf, tableName, hconn);
          writers.put(loc, ret);
        }
      }
      return ret;
    }

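    /**
     * Lazily creates and caches one shared HConnection per table; double-checks under the map
     * lock so concurrent callers end up sharing a single connection.
     */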
    private HConnection getConnectionByTableName(final TableName tableName) throws IOException {
      HConnection hconn = this.tableNameToHConnectionMap.get(tableName);
      if (hconn == null) {
        synchronized (this.tableNameToHConnectionMap) {
          hconn = this.tableNameToHConnectionMap.get(tableName);
          if (hconn == null) {
            hconn = HConnectionManager.getConnection(conf);
            this.tableNameToHConnectionMap.put(tableName, hconn);
          }
        }
      }
      return hconn;
    }

    private TableName getTableFromLocationStr(String loc) {
      /**
       * location key is in format <server name:port>#<table name>
       */
      String[] splits = loc.split(KEY_DELIMITER);
      if (splits.length != 2) {
        return null;
      }
      return TableName.valueOf(splits[1]);
    }
  }

  /**
   * Private data structure that wraps a receiving RS and collects statistics about the data
   * written to this newly assigned RS.
   */
  private final static class RegionServerWriter extends SinkWriter {
    final WALEditsReplaySink sink;

    RegionServerWriter(final Configuration conf, final TableName tableName, final HConnection conn)
        throws IOException {
      this.sink = new WALEditsReplaySink(conf, tableName, conn);
    }

    void close() throws IOException {
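      // nothing to release here; the shared HConnection used by the sink is closed
      // by LogReplayOutputSink.closeRegionServerWriters()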
    }
  }

  static class CorruptedLogFileException extends Exception {
    private static final long serialVersionUID = 1L;

    CorruptedLogFileException(String s) {
      super(s);
    }
  }
}