1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master;
19  
20  import static org.apache.hadoop.hbase.master.SplitLogManager.ResubmitDirective.CHECK;
21  import static org.apache.hadoop.hbase.master.SplitLogManager.ResubmitDirective.FORCE;
22  import static org.apache.hadoop.hbase.master.SplitLogManager.TerminationStatus.DELETED;
23  import static org.apache.hadoop.hbase.master.SplitLogManager.TerminationStatus.FAILURE;
24  import static org.apache.hadoop.hbase.master.SplitLogManager.TerminationStatus.IN_PROGRESS;
25  import static org.apache.hadoop.hbase.master.SplitLogManager.TerminationStatus.SUCCESS;
26  
27  import java.io.IOException;
28  import java.util.ArrayList;
29  import java.util.Collections;
30  import java.util.HashSet;
31  import java.util.List;
32  import java.util.Map;
33  import java.util.Set;
34  import java.util.concurrent.ConcurrentHashMap;
35  import java.util.concurrent.ConcurrentMap;
36  import java.util.concurrent.locks.ReentrantLock;
37  
38  import org.apache.commons.logging.Log;
39  import org.apache.commons.logging.LogFactory;
40  import org.apache.hadoop.classification.InterfaceAudience;
41  import org.apache.hadoop.conf.Configuration;
42  import org.apache.hadoop.fs.FileStatus;
43  import org.apache.hadoop.fs.FileSystem;
44  import org.apache.hadoop.fs.Path;
45  import org.apache.hadoop.fs.PathFilter;
46  import org.apache.hadoop.hbase.Chore;
47  import org.apache.hadoop.hbase.HConstants;
48  import org.apache.hadoop.hbase.HRegionInfo;
49  import org.apache.hadoop.hbase.ServerName;
50  import org.apache.hadoop.hbase.SplitLogCounters;
51  import org.apache.hadoop.hbase.SplitLogTask;
52  import org.apache.hadoop.hbase.Stoppable;
53  import org.apache.hadoop.hbase.exceptions.DeserializationException;
54  import org.apache.hadoop.hbase.master.SplitLogManager.TaskFinisher.Status;
55  import org.apache.hadoop.hbase.monitoring.MonitoredTask;
56  import org.apache.hadoop.hbase.monitoring.TaskMonitor;
57  import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos.RegionStoreSequenceIds;
58  import org.apache.hadoop.hbase.regionserver.SplitLogWorker;
59  import org.apache.hadoop.hbase.regionserver.wal.HLogSplitter;
60  import org.apache.hadoop.hbase.regionserver.wal.HLogUtil;
61  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
62  import org.apache.hadoop.hbase.util.FSUtils;
63  import org.apache.hadoop.hbase.util.Pair;
64  import org.apache.hadoop.hbase.util.Threads;
65  import org.apache.hadoop.hbase.zookeeper.ZKSplitLog;
66  import org.apache.hadoop.hbase.zookeeper.ZKUtil;
67  import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener;
68  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
69  import org.apache.hadoop.util.StringUtils;
70  import org.apache.zookeeper.AsyncCallback;
71  import org.apache.zookeeper.CreateMode;
72  import org.apache.zookeeper.KeeperException;
73  import org.apache.zookeeper.KeeperException.NoNodeException;
74  import org.apache.zookeeper.ZooDefs.Ids;
75  import org.apache.zookeeper.data.Stat;
76  
77  /**
78   * Distributes the task of log splitting to the available region servers.
79   * Coordination happens via zookeeper. For every log file that has to be split a
80   * znode is created under <code>/hbase/splitlog</code>. SplitLogWorkers race to grab a task.
81   *
82   * <p>SplitLogManager monitors the task znodes that it creates using the
83   * timeoutMonitor thread. If a task's progress is slow then
84   * {@link #resubmit(String, Task, ResubmitDirective)} will take away the task from the owner
85   * {@link SplitLogWorker} and the task will be up for grabs again. When the task is done then the
86   * task's znode is deleted by SplitLogManager.
87   *
88   * <p>Clients call {@link #splitLogDistributed(Path)} to split a region server's
89   * log files. The caller thread waits in this method until all the log files
90   * have been split.
91   *
92   * <p>All the zookeeper calls made by this class are asynchronous. This is mainly
93   * to help reduce response time seen by the callers.
94   *
95  * <p>There is a race in this design between the SplitLogManager and the
96   * SplitLogWorker. SplitLogManager might re-queue a task that has in reality
97   * already been completed by a SplitLogWorker. We rely on the idempotency of
98   * the log splitting task for correctness.
99   *
100  * <p>It is also assumed that every log splitting task is unique and once
101  * completed (either with success or with error) it will not be submitted
102  * again. If a task is resubmitted then there is a risk that the old "delete task"
103  * can delete the re-submission.
104  */
105 @InterfaceAudience.Private
106 public class SplitLogManager extends ZooKeeperListener {
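  // Rough sketch (illustrative, assuming the default znode paths) of the ZooKeeper layout this
  // class drives, gathered from the javadoc and methods below:
  //
  //   /hbase/splitlog/<url-encoded WAL path>     one task znode per log file to be split
  //   /hbase/splitlog/RESCAN<seq#>               short-lived node that nudges idle SplitLogWorkers
  //   /hbase/recovering-regions/<encoded region name>/<failed server name>
  //                                              used only in distributedLogReplay mode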
107   private static final Log LOG = LogFactory.getLog(SplitLogManager.class);
108 
109   public static final int DEFAULT_TIMEOUT = 120000;
110   public static final int DEFAULT_ZK_RETRIES = 3;
111   public static final int DEFAULT_MAX_RESUBMIT = 3;
112   public static final int DEFAULT_UNASSIGNED_TIMEOUT = (3 * 60 * 1000); //3 min
113 
114   private final Stoppable stopper;
115   private final MasterServices master;
116   private final ServerName serverName;
117   private final TaskFinisher taskFinisher;
118   private FileSystem fs;
119   private Configuration conf;
120 
121   private long zkretries;
122   private long resubmit_threshold;
123   private long timeout;
124   private long unassignedTimeout;
125   private long lastNodeCreateTime = Long.MAX_VALUE;
126   public boolean ignoreZKDeleteForTesting = false;
127   private volatile long lastRecoveringNodeCreationTime = 0;
128   // When lastRecoveringNodeCreationTime is older than the following threshold, we'll check
129   // whether to GC stale recovering znodes
130   private long checkRecoveringTimeThreshold = 15000; // 15 seconds
131   private final List<Pair<Set<ServerName>, Boolean>> failedRecoveringRegionDeletions = Collections
132       .synchronizedList(new ArrayList<Pair<Set<ServerName>, Boolean>>());
133 
134   /**
135    * In distributedLogReplay mode, we need to touch both splitlog and recovering-regions znodes in one
136    * operation. So the lock is used to guard such cases.
137    */
138   protected final ReentrantLock recoveringRegionLock = new ReentrantLock();
139 
140   final boolean distributedLogReplay;
141 
142   private final ConcurrentMap<String, Task> tasks = new ConcurrentHashMap<String, Task>();
143   private TimeoutMonitor timeoutMonitor;
144 
145   private volatile Set<ServerName> deadWorkers = null;
146   private final Object deadWorkersLock = new Object();
147 
148   private Set<String> failedDeletions = null;
149 
150   /**
151    * Wrapper around {@link #SplitLogManager(ZooKeeperWatcher zkw, Configuration conf,
152    *   Stoppable stopper, MasterServices master, ServerName serverName,
153    *   boolean masterRecovery, TaskFinisher tf)}
154    * with masterRecovery = false, and tf = null.  Used in unit tests.
155    *
156    * @param zkw the ZK watcher
157    * @param conf the HBase configuration
158    * @param stopper the stoppable in case anything is wrong
159    * @param master the master services
160    * @param serverName the master server name
161    */
162   public SplitLogManager(ZooKeeperWatcher zkw, final Configuration conf,
163       Stoppable stopper, MasterServices master, ServerName serverName) {
164     this(zkw, conf, stopper, master, serverName, false, null);
165   }
166 
167   /**
168    * Wrapper around {@link #SplitLogManager(ZooKeeperWatcher zkw, Configuration conf,
169    *   Stoppable stopper, MasterServices master, ServerName serverName,
170    *   boolean masterRecovery, TaskFinisher tf)}
171    * that provides a task finisher for copying recovered edits to their final destination.
172    * The task finisher has to be robust because it can be arbitrarily restarted or called
173    * multiple times.
174    *
175    * @param zkw the ZK watcher
176    * @param conf the HBase configuration
177    * @param stopper the stoppable in case anything is wrong
178    * @param master the master services
179    * @param serverName the master server name
180    * @param masterRecovery an indication if the master is in recovery
181    */
182   public SplitLogManager(ZooKeeperWatcher zkw, final Configuration conf,
183       Stoppable stopper, MasterServices master, ServerName serverName, boolean masterRecovery) {
184     this(zkw, conf, stopper, master, serverName, masterRecovery, new TaskFinisher() {
185       @Override
186       public Status finish(ServerName workerName, String logfile) {
187         try {
188           HLogSplitter.finishSplitLogFile(logfile, conf);
189         } catch (IOException e) {
190           LOG.warn("Could not finish splitting of log file " + logfile, e);
191           return Status.ERR;
192         }
193         return Status.DONE;
194       }
195     });
196   }
197 
198   /**
199    * It's OK to construct this object even when region servers are not online. It
200    * does look up the orphan tasks in ZK but it doesn't block waiting for them
201    * to be done.
202    *
203    * @param zkw the ZK watcher
204    * @param conf the HBase configuration
205    * @param stopper the stoppable in case anything is wrong
206    * @param master the master services
207    * @param serverName the master server name
208    * @param masterRecovery an indication if the master is in recovery
209    * @param tf task finisher
210    */
211   public SplitLogManager(ZooKeeperWatcher zkw, Configuration conf,
212         Stoppable stopper, MasterServices master,
213         ServerName serverName, boolean masterRecovery, TaskFinisher tf) {
214     super(zkw);
215     this.taskFinisher = tf;
216     this.conf = conf;
217     this.stopper = stopper;
218     this.master = master;
219     this.zkretries = conf.getLong("hbase.splitlog.zk.retries", DEFAULT_ZK_RETRIES);
220     this.resubmit_threshold = conf.getLong("hbase.splitlog.max.resubmit", DEFAULT_MAX_RESUBMIT);
221     this.timeout = conf.getInt("hbase.splitlog.manager.timeout", DEFAULT_TIMEOUT);
222     this.unassignedTimeout =
223       conf.getInt("hbase.splitlog.manager.unassigned.timeout", DEFAULT_UNASSIGNED_TIMEOUT);
224     LOG.info("timeout=" + timeout + ", unassigned timeout=" + unassignedTimeout);
225 
226     this.serverName = serverName;
227     this.timeoutMonitor = new TimeoutMonitor(
228       conf.getInt("hbase.splitlog.manager.timeoutmonitor.period", 1000), stopper);
229 
230     this.failedDeletions = Collections.synchronizedSet(new HashSet<String>());
231     this.distributedLogReplay = this.conf.getBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY,
232       HConstants.DEFAULT_DISTRIBUTED_LOG_REPLAY_CONFIG);
233     LOG.info("distributedLogReplay = " + this.distributedLogReplay);
234 
235     if (!masterRecovery) {
236       Threads.setDaemonThreadRunning(timeoutMonitor.getThread(), serverName
237           + ".splitLogManagerTimeoutMonitor");
238     }
239     // Watcher can be null during tests with Mock'd servers.
240     if (this.watcher != null) {
241       this.watcher.registerListener(this);
242       lookForOrphans();
243     }
244   }
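  // A hedged configuration sketch: the constructor above reads the following keys, so callers
  // (tests, for example) could tune them as shown. The values here are just the defaults
  // defined in this class, not recommendations.
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   conf.setLong("hbase.splitlog.zk.retries", 3);
  //   conf.setLong("hbase.splitlog.max.resubmit", 3);
  //   conf.setInt("hbase.splitlog.manager.timeout", 120000);                    // ms
  //   conf.setInt("hbase.splitlog.manager.unassigned.timeout", 3 * 60 * 1000);  // ms
  //   conf.setInt("hbase.splitlog.manager.timeoutmonitor.period", 1000);        // ms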
245 
246   private FileStatus[] getFileList(List<Path> logDirs, PathFilter filter) throws IOException {
247     List<FileStatus> fileStatus = new ArrayList<FileStatus>();
248     for (Path hLogDir : logDirs) {
249       this.fs = hLogDir.getFileSystem(conf);
250       if (!fs.exists(hLogDir)) {
251         LOG.warn(hLogDir + " doesn't exist. Nothing to do!");
252         continue;
253       }
254       FileStatus[] logfiles = FSUtils.listStatus(fs, hLogDir, filter);
255       if (logfiles == null || logfiles.length == 0) {
256         LOG.info(hLogDir + " is empty dir, no logs to split");
257       } else {
258         for (FileStatus status : logfiles)
259           fileStatus.add(status);
260       }
261     }
262     FileStatus[] a = new FileStatus[fileStatus.size()];
263     return fileStatus.toArray(a);
264   }
265 
266   /**
267    * @param logDir
268    *            one region server hlog dir path in .logs
269    * @throws IOException
270    *             if there was an error while splitting any log file
271    * @return cumulative size of the logfiles split
273    */
274   public long splitLogDistributed(final Path logDir) throws IOException {
275     List<Path> logDirs = new ArrayList<Path>();
276     logDirs.add(logDir);
277     return splitLogDistributed(logDirs);
278   }
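  // A hedged usage sketch (not an actual call site): master-side code holding a SplitLogManager
  // instance "slm" and the WAL directory of a dead region server could block on a split like so.
  // "rootDir" and "deadServerLogDirName" are hypothetical names used only for this illustration.
  //
  //   Path logDir = new Path(rootDir, deadServerLogDirName);
  //   long splitBytes = slm.splitLogDistributed(logDir);  // returns cumulative size of split logs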
279 
280   /**
281    * The caller will block until all the log files of the given region server
282    * have been processed - successfully split or an error is encountered - by an
283    * available worker region server. This method must only be called after the
284    * region servers have been brought online.
285    *
286    * @param logDirs List of log dirs to split
287    * @throws IOException If there was an error while splitting any log file
288    * @return cumulative size of the logfiles split
289    */
290   public long splitLogDistributed(final List<Path> logDirs) throws IOException {
291     if (logDirs.isEmpty()) {
292       return 0;
293     }
294     Set<ServerName> serverNames = new HashSet<ServerName>();
295     for (Path logDir : logDirs) {
296       try {
297         ServerName serverName = HLogUtil.getServerNameFromHLogDirectoryName(logDir);
298         if (serverName != null) {
299           serverNames.add(serverName);
300         }
301       } catch (IllegalArgumentException e) {
302         // ignore invalid format error.
303         LOG.warn("Cannot parse server name from " + logDir);
304       }
305     }
306     return splitLogDistributed(serverNames, logDirs, null);
307   }
308 
309   /**
310    * The caller will block until all the log files of the given region servers
311    * have been processed - successfully split or an error is encountered - by an
312    * available worker region server. This method must only be called after the
313    * region servers have been brought online.
314    *
315    * @param logDirs List of log dirs to split
316    * @param filter the PathFilter used to select which log files to split
317    * @throws IOException If there was an error while splitting any log file
318    * @return cumulative size of the logfiles split
319    */
320   public long splitLogDistributed(final Set<ServerName> serverNames, final List<Path> logDirs,
321       PathFilter filter) throws IOException {
322     MonitoredTask status = TaskMonitor.get().createStatus(
323           "Doing distributed log split in " + logDirs);
324     FileStatus[] logfiles = getFileList(logDirs, filter);
325     status.setStatus("Checking directory contents...");
326     LOG.debug("Scheduling batch of logs to split");
327     SplitLogCounters.tot_mgr_log_split_batch_start.incrementAndGet();
328     LOG.info("started splitting " + logfiles.length + " logs in " + logDirs);
329     long t = EnvironmentEdgeManager.currentTimeMillis();
330     long totalSize = 0;
331     TaskBatch batch = new TaskBatch();
332     Boolean isMetaRecovery = (filter == null) ? null : false;
333     for (FileStatus lf : logfiles) {
334       // TODO If the log file is still being written to - which is most likely
335       // the case for the last log file - then its length will show up here
336       // as zero. The size of such a file can only be retrieved after
337       // recover-lease is done. totalSize will be under in most cases and the
338       // metrics that it drives will also be under-reported.
339       totalSize += lf.getLen();
340       String pathToLog = FSUtils.removeRootPath(lf.getPath(), conf);
341       if (!enqueueSplitTask(pathToLog, batch)) {
342         throw new IOException("duplicate log split scheduled for " + lf.getPath());
343       }
344     }
345     waitForSplittingCompletion(batch, status);
346     // remove recovering regions from ZK
347     if (filter == MasterFileSystem.META_FILTER /* reference comparison */) {
348       // we split meta regions and user regions separately, therefore the logfiles are either
349       // all for meta or all for user regions but never both (tests could have mixed situations)
350       isMetaRecovery = true;
351     }
352     this.removeRecoveringRegionsFromZK(serverNames, isMetaRecovery);
353 
354     if (batch.done != batch.installed) {
355       batch.isDead = true;
356       SplitLogCounters.tot_mgr_log_split_batch_err.incrementAndGet();
357       LOG.warn("error while splitting logs in " + logDirs +
358       " installed = " + batch.installed + " but only " + batch.done + " done");
359       String msg = "error or interrupted while splitting logs in "
360         + logDirs + " Task = " + batch;
361       status.abort(msg);
362       throw new IOException(msg);
363     }
364     for(Path logDir: logDirs){
365       status.setStatus("Cleaning up log directory...");
366       try {
367         if (fs.exists(logDir) && !fs.delete(logDir, false)) {
368           LOG.warn("Unable to delete log src dir. Ignoring. " + logDir);
369         }
370       } catch (IOException ioe) {
371         FileStatus[] files = fs.listStatus(logDir);
372         if (files != null && files.length > 0) {
373           LOG.warn("returning success without actually splitting and " + 
374               "deleting all the log files in path " + logDir);
375         } else {
376           LOG.warn("Unable to delete log src dir. Ignoring. " + logDir, ioe);
377         }
378       }
379       SplitLogCounters.tot_mgr_log_split_batch_success.incrementAndGet();
380     }
381     String msg = "finished splitting (more than or equal to) " + totalSize +
382         " bytes in " + batch.installed + " log files in " + logDirs + " in " +
383         (EnvironmentEdgeManager.currentTimeMillis() - t) + "ms";
384     status.markComplete(msg);
385     LOG.info(msg);
386     return totalSize;
387   }
388 
389   /**
390    * Add a task entry to splitlog znode if it is not already there.
391    * 
392    * @param taskname the path of the log to be split
393    * @param batch the batch this task belongs to
394    * @return true if a new entry is created, false if it is already there.
395    */
396   boolean enqueueSplitTask(String taskname, TaskBatch batch) {
397     SplitLogCounters.tot_mgr_log_split_start.incrementAndGet();
398     // This is a znode path under the splitlog dir with the rest of the path made up of an
399     // url encoding of the passed in log to split.
400     String path = ZKSplitLog.getEncodedNodeName(watcher, taskname);
401     Task oldtask = createTaskIfAbsent(path, batch);
402     if (oldtask == null) {
403       // publish the task in zk
404       createNode(path, zkretries);
405       return true;
406     }
407     return false;
408   }
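  // Illustration: the task for a log file becomes a single child of the splitlog znode whose
  // name is the URL-encoded path of that log file, e.g. (the encoded value is hypothetical):
  //   /hbase/splitlog/hdfs%3A%2F%2Fnamenode%2Fhbase%2F.logs%2Fserver%2C60020%2C1%2Flogfile.123
  // SplitLogWorkers watch the splitlog directory and race to own such task nodes.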
409 
410   private void waitForSplittingCompletion(TaskBatch batch, MonitoredTask status) {
411     synchronized (batch) {
412       while ((batch.done + batch.error) != batch.installed) {
413         try {
414           status.setStatus("Waiting for distributed tasks to finish. "
415               + " scheduled=" + batch.installed
416               + " done=" + batch.done
417               + " error=" + batch.error);
418           int remaining = batch.installed - (batch.done + batch.error);
419           int actual = activeTasks(batch);
420           if (remaining != actual) {
421             LOG.warn("Expected " + remaining
422               + " active tasks, but actually there are " + actual);
423           }
424           int remainingInZK = remainingTasksInZK();
425           if (remainingInZK >= 0 && actual > remainingInZK) {
426             LOG.warn("Expected at least " + actual
427               + " tasks in ZK, but actually there are " + remainingInZK);
428           }
429           if (remainingInZK == 0 || actual == 0) {
430             LOG.warn("No more tasks remaining (ZK or task map), splitting "
431               + "should have completed. Remaining tasks in ZK " + remainingInZK
432               + ", active tasks in map " + actual);
433             if (remainingInZK == 0 && actual == 0) {
434               return;
435             }
436           }
437           batch.wait(100);
438           if (stopper.isStopped()) {
439             LOG.warn("Stopped while waiting for log splits to be completed");
440             return;
441           }
442         } catch (InterruptedException e) {
443           LOG.warn("Interrupted while waiting for log splits to be completed");
444           Thread.currentThread().interrupt();
445           return;
446         }
447       }
448     }
449   }
450 
451   private int activeTasks(final TaskBatch batch) {
452     int count = 0;
453     for (Task t: tasks.values()) {
454       if (t.batch == batch && t.status == TerminationStatus.IN_PROGRESS) {
455         count++;
456       }
457     }
458     return count;
459   }
460 
461   private int remainingTasksInZK() {
462     int count = 0;
463     try {
464       List<String> tasks =
465         ZKUtil.listChildrenNoWatch(watcher, watcher.splitLogZNode);
466       if (tasks != null) {
467         for (String t: tasks) {
468           if (!ZKSplitLog.isRescanNode(watcher, t)) {
469             count++;
470           }
471         }
472       }
473     } catch (KeeperException ke) {
474       LOG.warn("Failed to check remaining tasks", ke);
475       count = -1;
476     }
477     return count;
478   }
479 
480   /**
481    * It removes recovering regions under /hbase/recovering-regions/[encoded region name] so that the
482    * region server hosting the region can allow reads to the recovered region
483    * @param serverNames servers which are just recovered
484    * @param isMetaRecovery whether current recovery is for the meta region on
485    *          <code>serverNames</code>
486    */
487   private void
488       removeRecoveringRegionsFromZK(final Set<ServerName> serverNames, Boolean isMetaRecovery) {
489 
490     if (!this.distributedLogReplay) {
491       // the function is only used in WALEdit direct replay mode
492       return;
493     }
494 
495     final String metaEncodeRegionName = HRegionInfo.FIRST_META_REGIONINFO.getEncodedName();
496     int count = 0;
497     Set<String> recoveredServerNameSet = new HashSet<String>();
498     if (serverNames != null) {
499       for (ServerName tmpServerName : serverNames) {
500         recoveredServerNameSet.add(tmpServerName.getServerName());
501       }
502     }
503 
504     try {
505       this.recoveringRegionLock.lock();
506 
507       List<String> tasks = ZKUtil.listChildrenNoWatch(watcher, watcher.splitLogZNode);
508       if (tasks != null) {
509         for (String t : tasks) {
510           if (!ZKSplitLog.isRescanNode(watcher, t)) {
511             count++;
512           }
513         }
514       }
515       if (count == 0 && this.master.isInitialized()
516           && !this.master.getServerManager().areDeadServersInProgress()) {
517         // no splitting work items left
518         deleteRecoveringRegionZNodes(null);
519         // reset lastRecoveringNodeCreationTime because we cleared all recovering znodes at
520         // this point.
521         lastRecoveringNodeCreationTime = Long.MAX_VALUE;
522       } else if (!recoveredServerNameSet.isEmpty()) {
523         // remove recovering regions that don't have any RS associated with them
524         List<String> regions = ZKUtil.listChildrenNoWatch(watcher, watcher.recoveringRegionsZNode);
525         if (regions != null) {
526           for (String region : regions) {
527             if(isMetaRecovery != null) {
528               if ((isMetaRecovery && !region.equalsIgnoreCase(metaEncodeRegionName))
529                   || (!isMetaRecovery && region.equalsIgnoreCase(metaEncodeRegionName))) {
530                 // skip non-meta regions when recovering the meta region or 
531                 // skip the meta region when recovering user regions
532                 continue;
533               }
534             }
535             String nodePath = ZKUtil.joinZNode(watcher.recoveringRegionsZNode, region);
536             List<String> failedServers = ZKUtil.listChildrenNoWatch(watcher, nodePath);
537             if (failedServers == null || failedServers.isEmpty()) {
538               ZKUtil.deleteNode(watcher, nodePath);
539               continue;
540             }
541             if (recoveredServerNameSet.containsAll(failedServers)) {
542               ZKUtil.deleteNodeRecursively(watcher, nodePath);
543             } else {
544               for (String failedServer : failedServers) {
545                 if (recoveredServerNameSet.contains(failedServer)) {
546                   String tmpPath = ZKUtil.joinZNode(nodePath, failedServer);
547                   ZKUtil.deleteNode(watcher, tmpPath);
548                 }
549               }
550             }
551           }
552         }
553       }
554     } catch (KeeperException ke) {
555       LOG.warn("removeRecoveringRegionsFromZK got zookeeper exception. Will retry", ke);
556       if (serverNames != null && !serverNames.isEmpty()) {
557         this.failedRecoveringRegionDeletions.add(new Pair<Set<ServerName>, Boolean>(serverNames,
558             isMetaRecovery));
559       }
560     } finally {
561       this.recoveringRegionLock.unlock();
562     }
563   }
564 
565   /**
566    * It removes stale recovering regions under /hbase/recovering-regions/[encoded region name]
567    * during master initialization phase.
568    * @param failedServers A set of known failed servers
569    * @throws KeeperException
570    */
571   void removeStaleRecoveringRegionsFromZK(final Set<ServerName> failedServers)
572       throws KeeperException {
573 
574     if (!this.distributedLogReplay) {
575       // the function is only used in distributedLogReplay mode when the master is initializing
576       return;
577     }
578 
579     Set<String> knownFailedServers = new HashSet<String>();
580     if (failedServers != null) {
581       for (ServerName tmpServerName : failedServers) {
582         knownFailedServers.add(tmpServerName.getServerName());
583       }
584     }
585 
586     this.recoveringRegionLock.lock();
587     try {
588       List<String> tasks = ZKUtil.listChildrenNoWatch(watcher, watcher.splitLogZNode);
589       if (tasks != null) {
590         for (String t : tasks) {
591           byte[] data = ZKUtil.getData(this.watcher, ZKUtil.joinZNode(watcher.splitLogZNode, t));
592           if (data != null) {
593             SplitLogTask slt = null;
594             try {
595               slt = SplitLogTask.parseFrom(data);
596             } catch (DeserializationException e) {
597               LOG.warn("Failed to parse data for znode " + t, e);
598             }
599             if (slt != null && slt.isDone()) {
600               continue;
601             }
602           }
603           // decode the file name
604           t = ZKSplitLog.getFileName(t);
605           ServerName serverName = HLogUtil.getServerNameFromHLogDirectoryName(new Path(t));
606           if (serverName != null) {
607             knownFailedServers.add(serverName.getServerName());
608           } else {
609             LOG.warn("Found invalid WAL log file name: " + t);
610           }
611         }
612       }
613 
614       // remove recovering regions that don't have any RS associated with them
615       List<String> regions = ZKUtil.listChildrenNoWatch(watcher, watcher.recoveringRegionsZNode);
616       if (regions != null) {
617         for (String region : regions) {
618           String nodePath = ZKUtil.joinZNode(watcher.recoveringRegionsZNode, region);
619           List<String> regionFailedServers = ZKUtil.listChildrenNoWatch(watcher, nodePath);
620           if (regionFailedServers == null || regionFailedServers.isEmpty()) {
621             ZKUtil.deleteNode(watcher, nodePath);
622             continue;
623           }
624           boolean needMoreRecovery = false;
625           for (String tmpFailedServer : regionFailedServers) {
626             if (knownFailedServers.contains(tmpFailedServer)) {
627               needMoreRecovery = true;
628               break;
629             }
630           }
631           if (!needMoreRecovery) {
632             ZKUtil.deleteNodeRecursively(watcher, nodePath);
633           }
634         }
635       }
636     } finally {
637       this.recoveringRegionLock.unlock();
638     }
639   }
640 
641   private void deleteRecoveringRegionZNodes(List<String> regions) {
642     try {
643       if (regions == null) {
644         // remove all children under /hbase/recovering-regions
645         LOG.info("Garbage collecting all recovering regions.");
646         ZKUtil.deleteChildrenRecursively(watcher, watcher.recoveringRegionsZNode);
647       } else {
648         for (String curRegion : regions) {
649           String nodePath = ZKUtil.joinZNode(watcher.recoveringRegionsZNode, curRegion);
650           ZKUtil.deleteNodeRecursively(watcher, nodePath);
651         }
652       }
653     } catch (KeeperException e) {
654       LOG.warn("Cannot remove recovering regions from ZooKeeper", e);
655     }
656   }
657 
658   private void setDone(String path, TerminationStatus status) {
659     Task task = tasks.get(path);
660     if (task == null) {
661       if (!ZKSplitLog.isRescanNode(watcher, path)) {
662         SplitLogCounters.tot_mgr_unacquired_orphan_done.incrementAndGet();
663         LOG.debug("unacquired orphan task is done " + path);
664       }
665     } else {
666       synchronized (task) {
667         if (task.status == IN_PROGRESS) {
668           if (status == SUCCESS) {
669             SplitLogCounters.tot_mgr_log_split_success.incrementAndGet();
670             LOG.info("Done splitting " + path);
671           } else {
672             SplitLogCounters.tot_mgr_log_split_err.incrementAndGet();
673             LOG.warn("Error splitting " + path);
674           }
675           task.status = status;
676           if (task.batch != null) {
677             synchronized (task.batch) {
678               if (status == SUCCESS) {
679                 task.batch.done++;
680               } else {
681                 task.batch.error++;
682               }
683               task.batch.notify();
684             }
685           }
686         }
687       }
688     }
689     // delete the task node in zk. It's an async
690     // call and no one is blocked waiting for this node to be deleted. All
691     // task names are unique (log.<timestamp>) so there is no risk of deleting
692     // a future task.
693     // if a deletion fails, TimeoutMonitor will retry the same deletion later
694     deleteNode(path, zkretries);
695     return;
696   }
697 
698   private void createNode(String path, Long retry_count) {
699     SplitLogTask slt = new SplitLogTask.Unassigned(serverName);
700     ZKUtil.asyncCreate(this.watcher, path, slt.toByteArray(), new CreateAsyncCallback(), retry_count);
701     SplitLogCounters.tot_mgr_node_create_queued.incrementAndGet();
702     return;
703   }
704 
705   private void createNodeSuccess(String path) {
706     lastNodeCreateTime = EnvironmentEdgeManager.currentTimeMillis();
707     LOG.debug("put up splitlog task at znode " + path);
708     getDataSetWatch(path, zkretries);
709   }
710 
711   private void createNodeFailure(String path) {
712     // TODO the Manager should split the log locally instead of giving up
713     LOG.warn("failed to create task node " + path);
714     setDone(path, FAILURE);
715   }
716 
717 
718   private void getDataSetWatch(String path, Long retry_count) {
719     this.watcher.getRecoverableZooKeeper().getZooKeeper().
720         getData(path, this.watcher,
721         new GetDataAsyncCallback(), retry_count);
722     SplitLogCounters.tot_mgr_get_data_queued.incrementAndGet();
723   }
724 
725   private void tryGetDataSetWatch(String path) {
726     // A negative retry count will lead to ignoring all error processing.
727     this.watcher.getRecoverableZooKeeper().getZooKeeper().
728         getData(path, this.watcher,
729         new GetDataAsyncCallback(), Long.valueOf(-1) /* retry count */);
730     SplitLogCounters.tot_mgr_get_data_queued.incrementAndGet();
731   }
732 
733   private void getDataSetWatchSuccess(String path, byte[] data, int version)
734   throws DeserializationException {
735     if (data == null) {
736       if (version == Integer.MIN_VALUE) {
737         // assume all done. The task znode suddenly disappeared.
738         setDone(path, SUCCESS);
739         return;
740       }
741       SplitLogCounters.tot_mgr_null_data.incrementAndGet();
742       LOG.fatal("logic error - got null data " + path);
743       setDone(path, FAILURE);
744       return;
745     }
746     data = this.watcher.getRecoverableZooKeeper().removeMetaData(data);
747     SplitLogTask slt = SplitLogTask.parseFrom(data);
748     if (slt.isUnassigned()) {
749       LOG.debug("task not yet acquired " + path + " ver = " + version);
750       handleUnassignedTask(path);
751     } else if (slt.isOwned()) {
752       heartbeat(path, version, slt.getServerName());
753     } else if (slt.isResigned()) {
754       LOG.info("task " + path + " entered state: " + slt.toString());
755       resubmitOrFail(path, FORCE);
756     } else if (slt.isDone()) {
757       LOG.info("task " + path + " entered state: " + slt.toString());
758       if (taskFinisher != null && !ZKSplitLog.isRescanNode(watcher, path)) {
759         if (taskFinisher.finish(slt.getServerName(), ZKSplitLog.getFileName(path)) == Status.DONE) {
760           setDone(path, SUCCESS);
761         } else {
762           resubmitOrFail(path, CHECK);
763         }
764       } else {
765         setDone(path, SUCCESS);
766       }
767     } else if (slt.isErr()) {
768       LOG.info("task " + path + " entered state: " + slt.toString());
769       resubmitOrFail(path, CHECK);
770     } else {
771       LOG.fatal("logic error - unexpected zk state for path = " + path + " data = " + slt.toString());
772       setDone(path, FAILURE);
773     }
774   }
775 
776   private void getDataSetWatchFailure(String path) {
777     LOG.warn("failed to set data watch " + path);
778     setDone(path, FAILURE);
779   }
780 
781   /**
782    * It is possible for a task to stay in UNASSIGNED state indefinitely - say
783    * SplitLogManager wants to resubmit a task. It forces the task to UNASSIGNED
784    * state but it dies before it could create the RESCAN task node to signal
785    * the SplitLogWorkers to pick up the task. To prevent this scenario the
786    * SplitLogManager resubmits all orphan and UNASSIGNED tasks at startup.
787    *
788    * @param path
789    */
790   private void handleUnassignedTask(String path) {
791     if (ZKSplitLog.isRescanNode(watcher, path)) {
792       return;
793     }
794     Task task = findOrCreateOrphanTask(path);
795     if (task.isOrphan() && (task.incarnation == 0)) {
796       LOG.info("resubmitting unassigned orphan task " + path);
797       // ignore failure to resubmit. The timeout-monitor will handle it later
798       // albeit in a more crude fashion
799       resubmit(path, task, FORCE);
800     }
801   }
802 
803   /**
804    * Helper function to check whether to abandon retries in ZooKeeper AsyncCallback functions
805    * @param statusCode integer value of a ZooKeeper exception code
806    * @param action description message about the retried action
807    * @return true when retries should be abandoned, otherwise false
808    */
809   private boolean needAbandonRetries(int statusCode, String action) {
810     if (statusCode == KeeperException.Code.SESSIONEXPIRED.intValue()) {
811       LOG.error("ZK session expired. Master is expected to shut down. Abandoning retries for "
812           + "action=" + action);
813       return true;
814     }
815     return false;
816   }
817 
818   private void heartbeat(String path, int new_version, ServerName workerName) {
819     Task task = findOrCreateOrphanTask(path);
820     if (new_version != task.last_version) {
821       if (task.isUnassigned()) {
822         LOG.info("task " + path + " acquired by " + workerName);
823       }
824       task.heartbeat(EnvironmentEdgeManager.currentTimeMillis(), new_version, workerName);
825       SplitLogCounters.tot_mgr_heartbeat.incrementAndGet();
826     } else {
827       // duplicate heartbeats - heartbeats w/o zk node version
828       // changing - are possible. The timeout thread does
829       // getDataSetWatch() just to check whether a node still
830       // exists or not
831     }
832     return;
833   }
834 
835   private boolean resubmit(String path, Task task, ResubmitDirective directive) {
836     // it's OK if this thread misses the update to task.deleted. It will fail later
837     if (task.status != IN_PROGRESS) {
838       return false;
839     }
840     int version;
841     if (directive != FORCE) {
842       // We're going to resubmit:
843       //  1) immediately if the worker server is now marked as dead
844       //  2) after a configurable timeout if the server is not marked as dead but has still not
845       //       finished the task. This allows us to continue even if the worker cannot actually handle it,
846       //       for any reason.
847       final long time = EnvironmentEdgeManager.currentTimeMillis() - task.last_update;
848       final boolean alive = master.getServerManager() != null ?
849           master.getServerManager().isServerOnline(task.cur_worker_name) : true;
850       if (alive && time < timeout) {
851         LOG.trace("Skipping the resubmit of " + task.toString() + "  because the server " +
852             task.cur_worker_name + " is not marked as dead, we waited for " + time +
853             " while the timeout is " + timeout);
854         return false;
855       }
856       if (task.unforcedResubmits >= resubmit_threshold) {
857         if (!task.resubmitThresholdReached) {
858           task.resubmitThresholdReached = true;
859           SplitLogCounters.tot_mgr_resubmit_threshold_reached.incrementAndGet();
860           LOG.info("Skipping resubmissions of task " + path +
861               " because threshold " + resubmit_threshold + " reached");
862         }
863         return false;
864       }
865       // race with heartbeat() that might be changing last_version
866       version = task.last_version;
867     } else {
868       SplitLogCounters.tot_mgr_resubmit_force.incrementAndGet();
869       version = -1;
870     }
871     LOG.info("resubmitting task " + path);
872     task.incarnation++;
873     try {
874       // blocking zk call but this is done from the timeout thread
875       SplitLogTask slt = new SplitLogTask.Unassigned(this.serverName);
876       if (ZKUtil.setData(this.watcher, path, slt.toByteArray(), version) == false) {
877         LOG.debug("failed to resubmit task " + path +
878             " version changed");
879         task.heartbeatNoDetails(EnvironmentEdgeManager.currentTimeMillis());
880         return false;
881       }
882     } catch (NoNodeException e) {
883       LOG.warn("failed to resubmit because znode doesn't exist " + path +
884           " task done (or forced done by removing the znode)");
885       try {
886         getDataSetWatchSuccess(path, null, Integer.MIN_VALUE);
887       } catch (DeserializationException e1) {
888         LOG.debug("Failed to re-resubmit task " + path + " because of deserialization issue", e1);
889         task.heartbeatNoDetails(EnvironmentEdgeManager.currentTimeMillis());
890         return false;
891       }
892       return false;
893     } catch (KeeperException.BadVersionException e) {
894       LOG.debug("failed to resubmit task " + path + " version changed");
895       task.heartbeatNoDetails(EnvironmentEdgeManager.currentTimeMillis());
896       return false;
897     } catch (KeeperException e) {
898       SplitLogCounters.tot_mgr_resubmit_failed.incrementAndGet();
899       LOG.warn("failed to resubmit " + path, e);
900       return false;
901     }
902     // don't count forced resubmits
903     if (directive != FORCE) {
904       task.unforcedResubmits++;
905     }
906     task.setUnassigned();
907     createRescanNode(Long.MAX_VALUE);
908     SplitLogCounters.tot_mgr_resubmit.incrementAndGet();
909     return true;
910   }
911 
912   private void resubmitOrFail(String path, ResubmitDirective directive) {
913     if (resubmit(path, findOrCreateOrphanTask(path), directive) == false) {
914       setDone(path, FAILURE);
915     }
916   }
917 
918   private void deleteNode(String path, Long retries) {
919     SplitLogCounters.tot_mgr_node_delete_queued.incrementAndGet();
920     // Once a task znode is ready for delete, that is it is in the TASK_DONE
921     // state, then no one should be writing to it anymore. That is no one
922     // will be updating the znode version any more.
923     this.watcher.getRecoverableZooKeeper().getZooKeeper().
924       delete(path, -1, new DeleteAsyncCallback(),
925         retries);
926   }
927 
928   private void deleteNodeSuccess(String path) {
929     if (ignoreZKDeleteForTesting) {
930       return;
931     }
932     Task task;
933     task = tasks.remove(path);
934     if (task == null) {
935       if (ZKSplitLog.isRescanNode(watcher, path)) {
936         SplitLogCounters.tot_mgr_rescan_deleted.incrementAndGet();
937       }
938       SplitLogCounters.tot_mgr_missing_state_in_delete.incrementAndGet();
939       LOG.debug("deleted task without in memory state " + path);
940       return;
941     }
942     synchronized (task) {
943       task.status = DELETED;
944       task.notify();
945     }
946     SplitLogCounters.tot_mgr_task_deleted.incrementAndGet();
947   }
948 
949   private void deleteNodeFailure(String path) {
950     LOG.info("Failed to delete node " + path + " and will retry soon.");
951     return;
952   }
953 
954   /**
955    * Signal the workers that a task was resubmitted by creating the
956    * RESCAN node.
958    */
959   private void createRescanNode(long retries) {
960     // The RESCAN node will be deleted almost immediately by the
961     // SplitLogManager as soon as it is created because it is being
962     // created in the DONE state. This behavior prevents a buildup
963     // of RESCAN nodes. But there is also a chance that a SplitLogWorker
964     // might miss the watch-trigger that creation of RESCAN node provides.
965     // Since the TimeoutMonitor will keep resubmitting UNASSIGNED tasks
966     // therefore this behavior is safe.
967     SplitLogTask slt = new SplitLogTask.Done(this.serverName);
968     this.watcher.getRecoverableZooKeeper().getZooKeeper().
969       create(ZKSplitLog.getRescanNode(watcher), slt.toByteArray(),
970         Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL_SEQUENTIAL,
971         new CreateRescanAsyncCallback(), Long.valueOf(retries));
972   }
973 
974   private void createRescanSuccess(String path) {
975     lastNodeCreateTime = EnvironmentEdgeManager.currentTimeMillis();
976     SplitLogCounters.tot_mgr_rescan.incrementAndGet();
977     getDataSetWatch(path, zkretries);
978   }
979 
980   private void createRescanFailure() {
981     LOG.fatal("logic failure, rescan failure must not happen");
982   }
983 
984   /**
985    * @param path
986    * @param batch
987    * @return null on success, existing task on error
988    */
989   private Task createTaskIfAbsent(String path, TaskBatch batch) {
990     Task oldtask;
991     // batch.installed is only changed via this function and
992     // a single thread touches batch.installed.
993     Task newtask = new Task();
994     newtask.batch = batch;
995     oldtask = tasks.putIfAbsent(path, newtask);
996     if (oldtask == null) {
997       batch.installed++;
998       return  null;
999     }
1000     // new task was not used.
1001     synchronized (oldtask) {
1002       if (oldtask.isOrphan()) {
1003         if (oldtask.status == SUCCESS) {
1004           // The task is already done. Do not install the batch for this
1005           // task because it might be too late for setDone() to update
1006           // batch.done. There is no need for the batch creator to wait for
1007           // this task to complete.
1008           return (null);
1009         }
1010         if (oldtask.status == IN_PROGRESS) {
1011           oldtask.batch = batch;
1012           batch.installed++;
1013           LOG.debug("Previously orphan task " + path + " is now being waited upon");
1014           return null;
1015         }
1016         while (oldtask.status == FAILURE) {
1017           LOG.debug("wait for status of task " + path + " to change to DELETED");
1018           SplitLogCounters.tot_mgr_wait_for_zk_delete.incrementAndGet();
1019           try {
1020             oldtask.wait();
1021           } catch (InterruptedException e) {
1022             Thread.currentThread().interrupt();
1023             LOG.warn("Interrupted when waiting for znode delete callback");
1024             // fall through to return failure
1025             break;
1026           }
1027         }
1028         if (oldtask.status != DELETED) {
1029           LOG.warn("Failure because previously failed task" +
1030               " state still present. Waiting for znode delete callback" +
1031               " path=" + path);
1032           return oldtask;
1033         }
1034         // reinsert the newTask and it must succeed this time
1035         Task t = tasks.putIfAbsent(path, newtask);
1036         if (t == null) {
1037           batch.installed++;
1038           return  null;
1039         }
1040         LOG.fatal("Logic error. Deleted task still present in tasks map");
1041         assert false : "Deleted task still present in tasks map";
1042         return t;
1043       }
1044       LOG.warn("Failure because two threads can't wait for the same task; path=" + path);
1045       return oldtask;
1046     }
1047   }
1048 
1049   Task findOrCreateOrphanTask(String path) {
1050     Task orphanTask = new Task();
1051     Task task;
1052     task = tasks.putIfAbsent(path, orphanTask);
1053     if (task == null) {
1054       LOG.info("creating orphan task " + path);
1055       SplitLogCounters.tot_mgr_orphan_task_acquired.incrementAndGet();
1056       task = orphanTask;
1057     }
1058     return task;
1059   }
1060 
1061   @Override
1062   public void nodeDataChanged(String path) {
1063     Task task;
1064     task = tasks.get(path);
1065     if (task != null || ZKSplitLog.isRescanNode(watcher, path)) {
1066       if (task != null) {
1067         task.heartbeatNoDetails(EnvironmentEdgeManager.currentTimeMillis());
1068       }
1069       getDataSetWatch(path, zkretries);
1070     }
1071   }
1072 
1073   public void stop() {
1074     if (timeoutMonitor != null) {
1075       timeoutMonitor.interrupt();
1076     }
1077   }
1078 
1079   private void lookForOrphans() {
1080     List<String> orphans;
1081     try {
1082        orphans = ZKUtil.listChildrenNoWatch(this.watcher,
1083           this.watcher.splitLogZNode);
1084       if (orphans == null) {
1085         LOG.warn("could not get children of " + this.watcher.splitLogZNode);
1086         return;
1087       }
1088     } catch (KeeperException e) {
1089       LOG.warn("could not get children of " + this.watcher.splitLogZNode +
1090           " " + StringUtils.stringifyException(e));
1091       return;
1092     }
1093     int rescan_nodes = 0;
1094     for (String path : orphans) {
1095       String nodepath = ZKUtil.joinZNode(watcher.splitLogZNode, path);
1096       if (ZKSplitLog.isRescanNode(watcher, nodepath)) {
1097         rescan_nodes++;
1098         LOG.debug("found orphan rescan node " + path);
1099       } else {
1100         LOG.info("found orphan task " + path);
1101       }
1102       getDataSetWatch(nodepath, zkretries);
1103     }
1104     LOG.info("Found " + (orphans.size() - rescan_nodes) + " orphan tasks and " +
1105         rescan_nodes + " rescan nodes");
1106   }
1107 
1108   /**
1109    * Create znodes /hbase/recovering-regions/[region_ids...]/[failed region server names ...] for
1110    * all regions of the passed in region servers
1111    * @param serverName the name of a region server
1112    * @param userRegions user regiones assigned on the region server
1113    */
1114   void markRegionsRecoveringInZK(final ServerName serverName, Set<HRegionInfo> userRegions)
1115       throws KeeperException {
1116     if (userRegions == null || !this.distributedLogReplay) {
1117       return;
1118     }
1119 
1120     try {
1121       this.recoveringRegionLock.lock();
1122       // mark that we're creating recovering znodes
1123       this.lastRecoveringNodeCreationTime = EnvironmentEdgeManager.currentTimeMillis();
1124 
1125       for (HRegionInfo region : userRegions) {
1126         String regionEncodeName = region.getEncodedName();
1127         long retries = this.zkretries;
1128 
1129         do {
1130           String nodePath = ZKUtil.joinZNode(watcher.recoveringRegionsZNode, regionEncodeName);
1131           long lastRecordedFlushedSequenceId = -1;
1132           try {
1133             long lastSequenceId = this.master.getServerManager().getLastFlushedSequenceId(
1134               regionEncodeName.getBytes());
1135 
1136             /*
1137              * znode layout: .../region_id[last known flushed sequence id]/failed server[last known
1138              * flushed sequence id for the server]
1139              */
1140             byte[] data = ZKUtil.getData(this.watcher, nodePath);
1141             if (data == null) {
1142               ZKUtil.createSetData(this.watcher, nodePath,
1143                 ZKUtil.positionToByteArray(lastSequenceId));
1144             } else {
1145               lastRecordedFlushedSequenceId = SplitLogManager.parseLastFlushedSequenceIdFrom(data);
1146               if (lastRecordedFlushedSequenceId < lastSequenceId) {
1147                 // update last flushed sequence id in the region level
1148                 ZKUtil.setData(this.watcher, nodePath, ZKUtil.positionToByteArray(lastSequenceId));
1149               }
1150             }
1151             // go one level deeper with server name
1152             nodePath = ZKUtil.joinZNode(nodePath, serverName.getServerName());
1153             if (lastSequenceId <= lastRecordedFlushedSequenceId) {
1154               // the newly assigned RS failed even before any flush to the region
1155               lastSequenceId = lastRecordedFlushedSequenceId;
1156             }
1157             ZKUtil.createSetData(this.watcher, nodePath,
1158               ZKUtil.regionSequenceIdsToByteArray(lastSequenceId, null));
1159             LOG.debug("Mark region " + regionEncodeName + " recovering from failed region server "
1160                 + serverName);
1161 
1162             // break retry loop
1163             break;
1164           } catch (KeeperException e) {
1165             // ignore ZooKeeper exceptions inside retry loop
1166             if (retries <= 1) {
1167               throw e;
1168             }
1169             // wait a little bit for retry
1170             try {
1171               Thread.sleep(20);
1172             } catch (Exception ignoreE) {
1173               // ignore
1174             }
1175           }
1176         } while ((--retries) > 0 && (!this.stopper.isStopped()));
1177       }
1178     } finally {
1179       this.recoveringRegionLock.unlock();
1180     }
1181   }
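  // Hypothetical illustration of the znode layout created above, for an encoded region name
  // "abc123" whose last flushed sequence id was 42 when server "rs1,60020,1" failed
  // (all names and values here are made up for illustration):
  //
  //   /hbase/recovering-regions/abc123               data = 42  (region-level last flushed id)
  //   /hbase/recovering-regions/abc123/rs1,60020,1   data = 42  (per failed-server record)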
1182 
1183   /**
1184    * @param bytes - Content of a failed region server or recovering region znode.
1185    * @return long - The last flushed sequence Id for the region server
1186    */
1187   public static long parseLastFlushedSequenceIdFrom(final byte[] bytes) {
1188     long lastRecordedFlushedSequenceId = -1L;
1189     try {
1190       lastRecordedFlushedSequenceId = ZKUtil.parseHLogPositionFrom(bytes);
1191     } catch (DeserializationException e) {
1192       lastRecordedFlushedSequenceId = -1L;
1193       LOG.warn("Can't parse last flushed sequence Id", e);
1194     }
1195     return lastRecordedFlushedSequenceId;
1196   }
1197 
1198   /**
1199    * Check if /hbase/recovering-regions/<current region encoded name> exists. Returns true if it exists
1200    * and sets a watcher as well.
1201    * @param zkw
1202    * @param regionEncodedName encoded region name
1203    * @return true when /hbase/recovering-regions/<current region encoded name> exists
1204    * @throws KeeperException
1205    */
1206   public static boolean
1207       isRegionMarkedRecoveringInZK(ZooKeeperWatcher zkw, String regionEncodedName)
1208           throws KeeperException {
1209     boolean result = false;
1210     String nodePath = ZKUtil.joinZNode(zkw.recoveringRegionsZNode, regionEncodedName);
1211 
1212     byte[] node = ZKUtil.getDataAndWatch(zkw, nodePath);
1213     if (node != null) {
1214       result = true;
1215     }
1216     return result;
1217   }
1218 
1219   /**
1220    * This function is used in distributedLogReplay to fetch last flushed sequence id from ZK
1221    * @param zkw
1222    * @param serverName
1223    * @param encodedRegionName
1224    * @return the last flushed sequence ids recorded in ZK of the region for <code>serverName</code>
1225    * @throws IOException
1226    */
1227   public static RegionStoreSequenceIds getRegionFlushedSequenceId(ZooKeeperWatcher zkw,
1228       String serverName, String encodedRegionName) throws IOException {
1229     // when SplitLogWorker recovers a region by directly replaying unflushed WAL edits,
1230     // last flushed sequence Id changes when newly assigned RS flushes writes to the region.
1231     // If the newly assigned RS fails again (a chained RS failure scenario), the last flushed
1232     // sequence Id name space (a sequence Id is only valid for a particular RS instance) changes
1233     // when a different newly assigned RS flushes the region.
1234     // Therefore, in this mode we need to fetch last sequence Ids from ZK where we keep history of
1235     // last flushed sequence Id for each failed RS instance.
1236     RegionStoreSequenceIds result = null;
1237     String nodePath = ZKUtil.joinZNode(zkw.recoveringRegionsZNode, encodedRegionName);
1238     nodePath = ZKUtil.joinZNode(nodePath, serverName);
1239     try {
1240       byte[] data = ZKUtil.getData(zkw, nodePath);
1241       if (data != null) {
1242         result = ZKUtil.parseRegionStoreSequenceIds(data);
1243       }
1244     } catch (KeeperException e) {
1245       throw new IOException("Cannot get lastFlushedSequenceId from ZooKeeper for server="
1246           + serverName + "; region=" + encodedRegionName, e);
1247     } catch (DeserializationException e) {
1248       LOG.warn("Can't parse last flushed sequence Id from znode:" + nodePath, e);
1249     }
1250     return result;
1251   }
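  // A hedged usage sketch: during distributedLogReplay a reader of the znodes written above
  // could recover the recorded sequence ids like this (variable names are illustrative):
  //
  //   RegionStoreSequenceIds ids =
  //       SplitLogManager.getRegionFlushedSequenceId(zkw, failedServerName, encodedRegionName);
  //   if (ids != null) {
  //     long lastFlushed = ids.getLastFlushedSequenceId();  // edits at or below this id were flushed
  //   }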
1252 
1253   /**
1254    * Keeps track of the batch of tasks submitted together by a caller in splitLogDistributed().
1255    * Clients threads use this object to wait for all their tasks to be done.
1256    * <p>
1257    * All access is synchronized.
1258    */
1259   static class TaskBatch {
1260     int installed = 0;
1261     int done = 0;
1262     int error = 0;
1263     volatile boolean isDead = false;
1264 
1265     @Override
1266     public String toString() {
1267       return ("installed = " + installed + " done = " + done + " error = " + error);
1268     }
1269   }
1270 
1271   /**
1272    * in memory state of an active task.
1273    */
1274   static class Task {
1275     volatile long last_update;
1276     volatile int last_version;
1277     volatile ServerName cur_worker_name;
1278     volatile TaskBatch batch;
1279     volatile TerminationStatus status;
1280     volatile int incarnation;
1281     volatile int unforcedResubmits;
1282     volatile boolean resubmitThresholdReached;
1283 
1284     @Override
1285     public String toString() {
1286       return ("last_update = " + last_update +
1287           " last_version = " + last_version +
1288           " cur_worker_name = " + cur_worker_name +
1289           " status = " + status +
1290           " incarnation = " + incarnation +
1291           " resubmits = " + unforcedResubmits +
1292           " batch = " + batch);
1293     }
1294 
1295     Task() {
1296       incarnation = 0;
1297       last_version = -1;
1298       status = IN_PROGRESS;
1299       setUnassigned();
1300     }
1301 
1302     public boolean isOrphan() {
1303       return (batch == null || batch.isDead);
1304     }
1305 
1306     public boolean isUnassigned() {
1307       return (cur_worker_name == null);
1308     }
1309 
1310     public void heartbeatNoDetails(long time) {
1311       last_update = time;
1312     }
1313 
1314     public void heartbeat(long time, int version, ServerName worker) {
1315       last_version = version;
1316       last_update = time;
1317       cur_worker_name = worker;
1318     }
1319 
1320     public void setUnassigned() {
1321       cur_worker_name = null;
1322       last_update = -1;
1323     }
1324   }
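  // A sketch, not code from this class, of how the volatile fields above combine: a worker
  // heartbeat refreshes last_update, and the TimeoutMonitor treats prolonged silence from an
  // assigned worker as grounds for a resubmit. The timeout variable is an illustrative assumption.
  //
  //   task.heartbeat(EnvironmentEdgeManager.currentTimeMillis(), version, workerName);
  //   // ... later, on the TimeoutMonitor thread ...
  //   long silence = EnvironmentEdgeManager.currentTimeMillis() - task.last_update;
  //   boolean candidate = !task.isUnassigned() && silence > timeout;  // resubmit(path, task, CHECK)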
1325 
1326   void handleDeadWorker(ServerName workerName) {
1327     // resubmit the tasks on the TimeoutMonitor thread; this makes it easier
1328     // to reason about concurrency and to retry.
1329     synchronized (deadWorkersLock) {
1330       if (deadWorkers == null) {
1331         deadWorkers = new HashSet<ServerName>(100);
1332       }
1333       deadWorkers.add(workerName);
1334     }
1335     LOG.info("dead splitlog worker " + workerName);
1336   }
1337 
1338   void handleDeadWorkers(Set<ServerName> serverNames) {
1339     synchronized (deadWorkersLock) {
1340       if (deadWorkers == null) {
1341         deadWorkers = new HashSet<ServerName>(100);
1342       }
1343       deadWorkers.addAll(serverNames);
1344     }
1345     LOG.info("dead splitlog workers " + serverNames);
1346   }
1347 
1348   /**
1349    * Periodically checks all active tasks and resubmits the ones that have timed
1350    * out
1351    */
1352   private class TimeoutMonitor extends Chore {
1353     private long lastLog = 0;
1354 
1355     public TimeoutMonitor(final int period, Stoppable stopper) {
1356       super("SplitLogManager Timeout Monitor", period, stopper);
1357     }
1358 
1359     @Override
1360     protected void chore() {
1361       int resubmitted = 0;
1362       int unassigned = 0;
1363       int tot = 0;
1364       boolean found_assigned_task = false;
1365       Set<ServerName> localDeadWorkers;
1366 
1367       synchronized (deadWorkersLock) {
1368         localDeadWorkers = deadWorkers;
1369         deadWorkers = null;
1370       }
1371 
1372       for (Map.Entry<String, Task> e : tasks.entrySet()) {
1373         String path = e.getKey();
1374         Task task = e.getValue();
1375         ServerName cur_worker = task.cur_worker_name;
1376         tot++;
1377         // don't easily resubmit a task which hasn't been picked up yet. It
1378         // might be a long while before a SplitLogWorker is free to pick up a
1379         // task. This is because a SplitLogWorker picks up tasks one at a
1380         // time. If we want progress when there are no region servers then we
1381         // will have to run a SplitLogWorker thread in the Master.
1382         if (task.isUnassigned()) {
1383           unassigned++;
1384           continue;
1385         }
1386         found_assigned_task = true;
1387         if (localDeadWorkers != null && localDeadWorkers.contains(cur_worker)) {
1388           SplitLogCounters.tot_mgr_resubmit_dead_server_task.incrementAndGet();
1389           if (resubmit(path, task, FORCE)) {
1390             resubmitted++;
1391           } else {
1392             handleDeadWorker(cur_worker);
1393             LOG.warn("Failed to resubmit task " + path + " owned by dead " +
1394                 cur_worker + ", will retry.");
1395           }
1396         } else if (resubmit(path, task, CHECK)) {
1397           resubmitted++;
1398         }
1399       }
1400       if (tot > 0) {
1401         long now = EnvironmentEdgeManager.currentTimeMillis();
1402         if (now > lastLog + 5000) {
1403           lastLog = now;
1404           LOG.info("total tasks = " + tot + " unassigned = " + unassigned + " tasks=" + tasks);
1405         }
1406       }
1407       if (resubmitted > 0) {
1408         LOG.info("resubmitted " + resubmitted + " out of " + tot + " tasks");
1409       }
1410       // If there are pending tasks and all of them have been unassigned for
1411       // some time then put up a RESCAN node to ping the workers.
1412       // ZKSplitLog.DEFAULT_UNASSIGNED_TIMEOUT is of the order of minutes
1413       // because (a) it is very unlikely that every worker had a
1414       // transient error when trying to grab the task and (b) if there are no
1415       // workers then all tasks will stay unassigned indefinitely and the
1416       // manager will be indefinitely creating RESCAN nodes. TODO maybe the
1417       // master should spawn both a manager and a worker thread to guarantee
1418       // that there is always one worker in the system.
1419       if (tot > 0 && !found_assigned_task &&
1420           ((EnvironmentEdgeManager.currentTimeMillis() - lastNodeCreateTime) >
1421           unassignedTimeout)) {
1422         for (Map.Entry<String, Task> e : tasks.entrySet()) {
1423           String path = e.getKey();
1424           Task task = e.getValue();
1425           // we have to do task.isUnassigned() check again because tasks might
1426           // have been asynchronously assigned. There is no locking required
1427           // for these checks ... it is OK even if tryGetDataSetWatch() is
1428           // called unnecessarily for a task
1429           if (task.isUnassigned() && (task.status != FAILURE)) {
1430             // We just touch the znode to make sure its still there
1431             tryGetDataSetWatch(path);
1432           }
1433         }
1434         createRescanNode(Long.MAX_VALUE);
1435         SplitLogCounters.tot_mgr_resubmit_unassigned.incrementAndGet();
1436         LOG.debug("resubmitting unassigned task(s) after timeout");
1437       }
1438 
1439       // Retry previously failed deletes
1440       if (failedDeletions.size() > 0) {
1441         List<String> tmpPaths = new ArrayList<String>(failedDeletions);
1442         for (String tmpPath : tmpPaths) {
1443           // deleteNode is an async call
1444           deleteNode(tmpPath, zkretries);
1445         }
1446         failedDeletions.removeAll(tmpPaths);
1447       }
1448 
1449       // Garbage collect left-over /hbase/recovering-regions/... znode
1450       long timeInterval = EnvironmentEdgeManager.currentTimeMillis()
1451           - lastRecoveringNodeCreationTime;
1452       if (!failedRecoveringRegionDeletions.isEmpty()
1453           || (tot == 0 && tasks.size() == 0 && (timeInterval > checkRecoveringTimeThreshold))) {
1454         // removeRecoveringRegionsFromZK() performs additional checks before GC'ing anything
1455         if (!failedRecoveringRegionDeletions.isEmpty()) {
1456           List<Pair<Set<ServerName>, Boolean>> previouslyFailedDeletions =
1457               new ArrayList<Pair<Set<ServerName>, Boolean>>(failedRecoveringRegionDeletions);
1458           failedRecoveringRegionDeletions.removeAll(previouslyFailedDeletions);
1459           for (Pair<Set<ServerName>, Boolean> failedDeletion : previouslyFailedDeletions) {
1460             removeRecoveringRegionsFromZK(failedDeletion.getFirst(), failedDeletion.getSecond());
1461           }
1462         } else {
1463           removeRecoveringRegionsFromZK(null, null);
1464         }
1465       }
1466     }
1467   }
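  // A sketch of how the manager typically starts this chore; the period value and thread name
  // below are illustrative assumptions rather than the exact configuration used here.
  //
  //   TimeoutMonitor timeoutMonitor = new TimeoutMonitor(1000, stopper);
  //   Threads.setDaemonThreadRunning(timeoutMonitor.getThread(),
  //       serverName + ".splitLogManagerTimeoutMonitor");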
1468 
1469   /**
1470    * Asynchronous handler for zk create node results.
1471    * Retries on failures.
1472    */
1473   class CreateAsyncCallback implements AsyncCallback.StringCallback {
1474     private final Log LOG = LogFactory.getLog(CreateAsyncCallback.class);
1475 
1476     @Override
1477     public void processResult(int rc, String path, Object ctx, String name) {
1478       SplitLogCounters.tot_mgr_node_create_result.incrementAndGet();
1479       if (rc != 0) {
1480         if (needAbandonRetries(rc, "Create znode " + path)) {
1481           createNodeFailure(path);
1482           return;
1483         }
1484         if (rc == KeeperException.Code.NODEEXISTS.intValue()) {
1485           // What if there is a delete pending against this pre-existing
1486           // znode? Then this soon-to-be-deleted task znode must be in TASK_DONE
1487           // state. Only operations that will be carried out on this node by
1488           // this manager are get-znode-data, task-finisher and delete-znode.
1489           // And all code pieces correctly handle the case of a suddenly
1490           // disappearing task-znode.
1491           LOG.debug("found pre-existing znode " + path);
1492           SplitLogCounters.tot_mgr_node_already_exists.incrementAndGet();
1493         } else {
1494           Long retry_count = (Long)ctx;
1495           LOG.warn("create rc =" + KeeperException.Code.get(rc) + " for " +
1496               path + " remaining retries=" + retry_count);
1497           if (retry_count == 0) {
1498             SplitLogCounters.tot_mgr_node_create_err.incrementAndGet();
1499             createNodeFailure(path);
1500           } else {
1501             SplitLogCounters.tot_mgr_node_create_retry.incrementAndGet();
1502             createNode(path, retry_count - 1);
1503           }
1504           return;
1505         }
1506       }
1507       createNodeSuccess(path);
1508     }
1509   }
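  // A sketch of the create call this callback pairs with; it assumes the ZKUtil.asyncCreate
  // helper and an unassigned SplitLogTask payload, and passes the remaining retry count as the
  // callback context that processResult() later reads back as ctx.
  //
  //   SplitLogTask slt = new SplitLogTask.Unassigned(serverName);
  //   ZKUtil.asyncCreate(watcher, path, slt.toByteArray(),
  //       new CreateAsyncCallback(), Long.valueOf(retryCount));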
1510 
1511   /**
1512    * Asynchronous handler for zk get-data-set-watch on node results.
1513    * Retries on failures.
1514    */
1515   class GetDataAsyncCallback implements AsyncCallback.DataCallback {
1516     private final Log LOG = LogFactory.getLog(GetDataAsyncCallback.class);
1517 
1518     @Override
1519     public void processResult(int rc, String path, Object ctx, byte[] data,
1520         Stat stat) {
1521       SplitLogCounters.tot_mgr_get_data_result.incrementAndGet();
1522       if (rc != 0) {
1523         if (needAbandonRetries(rc, "GetData from znode " + path)) {
1524           return;
1525         }
1526         if (rc == KeeperException.Code.NONODE.intValue()) {
1527           SplitLogCounters.tot_mgr_get_data_nonode.incrementAndGet();
1528           // The task znode has been deleted. Must be some pending delete
1529           // that deleted the task. Assume success because a task-znode is
1530           // only deleted after TaskFinisher is successful.
1531           LOG.warn("task znode " + path + " vanished.");
1532           try {
1533             getDataSetWatchSuccess(path, null, Integer.MIN_VALUE);
1534           } catch (DeserializationException e) {
1535             LOG.warn("Deserialization problem", e);
1536           }
1537           return;
1538         }
1539         Long retry_count = (Long) ctx;
1540 
1541         if (retry_count < 0) {
1542           LOG.warn("getdata rc = " + KeeperException.Code.get(rc) + " " +
1543               path + ". Ignoring error. No error handling. No retrying.");
1544           return;
1545         }
1546         LOG.warn("getdata rc = " + KeeperException.Code.get(rc) + " " +
1547             path + " remaining retries=" + retry_count);
1548         if (retry_count == 0) {
1549           SplitLogCounters.tot_mgr_get_data_err.incrementAndGet();
1550           getDataSetWatchFailure(path);
1551         } else {
1552           SplitLogCounters.tot_mgr_get_data_retry.incrementAndGet();
1553           getDataSetWatch(path, retry_count - 1);
1554         }
1555         return;
1556       }
1557       try {
1558         getDataSetWatchSuccess(path, data, stat.getVersion());
1559       } catch (DeserializationException e) {
1560         LOG.warn("Deserialization problem", e);
1561       }
1562       return;
1563     }
1564   }
1565 
1566   /**
1567    * Asynchronous handler for zk delete node results.
1568    * Retries on failures.
1569    */
1570   class DeleteAsyncCallback implements AsyncCallback.VoidCallback {
1571     private final Log LOG = LogFactory.getLog(DeleteAsyncCallback.class);
1572 
1573     @Override
1574     public void processResult(int rc, String path, Object ctx) {
1575       SplitLogCounters.tot_mgr_node_delete_result.incrementAndGet();
1576       if (rc != 0) {
1577         if (needAbandonRetries(rc, "Delete znode " + path)) {
1578           failedDeletions.add(path);
1579           return;
1580         }
1581         if (rc != KeeperException.Code.NONODE.intValue()) {
1582           SplitLogCounters.tot_mgr_node_delete_err.incrementAndGet();
1583           Long retry_count = (Long) ctx;
1584           LOG.warn("delete rc=" + KeeperException.Code.get(rc) + " for " +
1585               path + " remaining retries=" + retry_count);
1586           if (retry_count == 0) {
1587             LOG.warn("delete failed " + path);
1588             failedDeletions.add(path);
1589             deleteNodeFailure(path);
1590           } else {
1591             deleteNode(path, retry_count - 1);
1592           }
1593           return;
1594         } else {
1595           LOG.info(path +
1596             " does not exist. Either was created but deleted behind our" +
1597             " back by another pending delete OR was deleted" +
1598             " in earlier retry rounds. zkretries = " + (Long) ctx);
1599         }
1600       } else {
1601         LOG.debug("deleted " + path);
1602       }
1603       deleteNodeSuccess(path);
1604     }
1605   }
1606 
1607   /**
1608    * Asynchronous handler for zk create RESCAN-node results.
1609    * Retries on failures.
1610    * <p>
1611    * A RESCAN node is created using PERSISTENT_SEQUENTIAL flag. It is a signal
1612    * for all the {@link SplitLogWorker}s to rescan for new tasks.
1613    */
1614   class CreateRescanAsyncCallback implements AsyncCallback.StringCallback {
1615     private final Log LOG = LogFactory.getLog(CreateRescanAsyncCallback.class);
1616 
1617     @Override
1618     public void processResult(int rc, String path, Object ctx, String name) {
1619       if (rc != 0) {
1620         if (needAbandonRetries(rc, "CreateRescan znode " + path)) {
1621           return;
1622         }
1623         Long retry_count = (Long)ctx;
1624         LOG.warn("rc=" + KeeperException.Code.get(rc) + " for "+ path +
1625             " remaining retries=" + retry_count);
1626         if (retry_count == 0) {
1627           createRescanFailure();
1628         } else {
1629           createRescanNode(retry_count - 1);
1630         }
1631         return;
1632       }
1633       // path is the original arg, name is the actual name that was created
1634       createRescanSuccess(name);
1635     }
1636   }
1637 
1638   /**
1639    * {@link SplitLogManager} can use objects implementing this interface to
1640    * finish off a partially done task by {@link SplitLogWorker}. This provides
1641    * a serialization point at the end of the task processing. Must be
1642    * restartable and idempotent.
1643    */
1644   public interface TaskFinisher {
1645     /**
1646      * status that can be returned by finish()
1647      */
1648     enum Status {
1649       /**
1650        * task completed successfully
1651        */
1652       DONE(),
1653       /**
1654        * task completed with error
1655        */
1656       ERR();
1657     }
1658     /**
1659      * finish the partially done task. workerName provides a clue to where the
1660      * partial results of the partially done task are present. taskname is the
1661      * name of the task that was put up in zookeeper.
1662      * <p>
1663      * @param workerName
1664      * @param taskname
1665      * @return DONE if task completed successfully, ERR otherwise
1666      */
1667     Status finish(ServerName workerName, String taskname);
1668   }
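  // A sketch of a TaskFinisher along the lines of the default one the manager installs,
  // assuming HLogSplitter.finishSplitLogFile(logfile, conf) is available to archive the
  // processed WAL file; conf is an illustrative Configuration reference.
  //
  //   TaskFinisher finisher = new TaskFinisher() {
  //     @Override
  //     public Status finish(ServerName workerName, String logfile) {
  //       try {
  //         HLogSplitter.finishSplitLogFile(logfile, conf);
  //       } catch (IOException e) {
  //         LOG.warn("Could not finish splitting of log file " + logfile, e);
  //         return Status.ERR;
  //       }
  //       return Status.DONE;
  //     }
  //   };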
1669 
1670   enum ResubmitDirective {
1671     CHECK(),
1672     FORCE();
1673   }
1674 
1675   enum TerminationStatus {
1676     IN_PROGRESS("in_progress"),
1677     SUCCESS("success"),
1678     FAILURE("failure"),
1679     DELETED("deleted");
1680 
1681     String statusMsg;
1682     TerminationStatus(String msg) {
1683       statusMsg = msg;
1684     }
1685     
1686     @Override
1687     public String toString() {
1688       return statusMsg;
1689     }
1690   }
1691 }