View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master.snapshot;
19  
20  import java.io.FileNotFoundException;
21  import java.io.IOException;
22  import java.util.ArrayList;
23  import java.util.Collections;
24  import java.util.HashMap;
25  import java.util.HashSet;
26  import java.util.Iterator;
27  import java.util.List;
28  import java.util.Map;
29  import java.util.Set;
30  import java.util.concurrent.ThreadPoolExecutor;
31  
32  import org.apache.commons.logging.Log;
33  import org.apache.commons.logging.LogFactory;
34  import org.apache.hadoop.hbase.classification.InterfaceAudience;
35  import org.apache.hadoop.hbase.classification.InterfaceStability;
36  import org.apache.hadoop.conf.Configuration;
37  import org.apache.hadoop.fs.FSDataInputStream;
38  import org.apache.hadoop.fs.FileStatus;
39  import org.apache.hadoop.fs.FileSystem;
40  import org.apache.hadoop.fs.Path;
41  import org.apache.hadoop.hbase.TableName;
42  import org.apache.hadoop.hbase.HBaseInterfaceAudience;
43  import org.apache.hadoop.hbase.HConstants;
44  import org.apache.hadoop.hbase.HTableDescriptor;
45  import org.apache.hadoop.hbase.Stoppable;
46  import org.apache.hadoop.hbase.catalog.MetaReader;
47  import org.apache.hadoop.hbase.errorhandling.ForeignException;
48  import org.apache.hadoop.hbase.executor.ExecutorService;
49  import org.apache.hadoop.hbase.master.AssignmentManager;
50  import org.apache.hadoop.hbase.master.MasterCoprocessorHost;
51  import org.apache.hadoop.hbase.master.MasterFileSystem;
52  import org.apache.hadoop.hbase.master.MasterServices;
53  import org.apache.hadoop.hbase.master.MetricsMaster;
54  import org.apache.hadoop.hbase.master.SnapshotSentinel;
55  import org.apache.hadoop.hbase.master.cleaner.HFileCleaner;
56  import org.apache.hadoop.hbase.master.cleaner.HFileLinkCleaner;
57  import org.apache.hadoop.hbase.procedure.MasterProcedureManager;
58  import org.apache.hadoop.hbase.procedure.Procedure;
59  import org.apache.hadoop.hbase.procedure.ProcedureCoordinator;
60  import org.apache.hadoop.hbase.procedure.ProcedureCoordinatorRpcs;
61  import org.apache.hadoop.hbase.procedure.ZKProcedureCoordinatorRpcs;
62  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.NameStringPair;
63  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.ProcedureDescription;
64  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription;
65  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription.Type;
66  import org.apache.hadoop.hbase.snapshot.ClientSnapshotDescriptionUtils;
67  import org.apache.hadoop.hbase.snapshot.HBaseSnapshotException;
68  import org.apache.hadoop.hbase.snapshot.RestoreSnapshotException;
69  import org.apache.hadoop.hbase.snapshot.RestoreSnapshotHelper;
70  import org.apache.hadoop.hbase.snapshot.SnapshotCreationException;
71  import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
72  import org.apache.hadoop.hbase.snapshot.SnapshotDoesNotExistException;
73  import org.apache.hadoop.hbase.snapshot.SnapshotExistsException;
74  import org.apache.hadoop.hbase.snapshot.SnapshotManifest;
75  import org.apache.hadoop.hbase.snapshot.SnapshotReferenceUtil;
76  import org.apache.hadoop.hbase.snapshot.TablePartiallyOpenException;
77  import org.apache.hadoop.hbase.snapshot.UnknownSnapshotException;
78  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
79  import org.apache.hadoop.hbase.util.FSUtils;
80  import org.apache.zookeeper.KeeperException;
81  
82  /**
83   * This class manages the procedure of taking and restoring snapshots. There is only one
84   * SnapshotManager for the master.
85   * <p>
86   * The class provides methods for monitoring in-progress snapshot actions.
87   * <p>
88   * Note: Currently there can only be one snapshot being taken at a time over the cluster. This is a
89   * simplification in the current implementation.
90   */
91  @InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.CONFIG)
92  @InterfaceStability.Unstable
93  public class SnapshotManager extends MasterProcedureManager implements Stoppable {
94    private static final Log LOG = LogFactory.getLog(SnapshotManager.class);
95  
96    /** By default, check to see if the snapshot is complete every WAKE MILLIS (ms) */
97    private static final int SNAPSHOT_WAKE_MILLIS_DEFAULT = 500;
98  
99    /**
100    * Wait time before removing a finished sentinel from the in-progress map
101    *
102    * NOTE: This is used as a safety auto cleanup.
103    * The snapshot and restore handlers map entries are removed when a user asks if a snapshot or
104    * restore is completed. This operation is part of the HBaseAdmin snapshot/restore API flow.
105    * In case something fails on the client side and the snapshot/restore state is not reclaimed
106    * after a default timeout, the entry is removed from the in-progress map.
107    * At this point, if the user asks for the snapshot/restore status, the result will be
108    * "done" if the snapshot exists on disk, or "failed" if it doesn't exist.
109    */
110   private static final int SNAPSHOT_SENTINELS_CLEANUP_TIMEOUT = 60 * 1000;
111 
112   /** Enable or disable snapshot support */
113   public static final String HBASE_SNAPSHOT_ENABLED = "hbase.snapshot.enabled";
114 
115   /**
116    * Conf key for # of ms elapsed between checks for snapshot errors while waiting for
117    * completion.
118    */
119   private static final String SNAPSHOT_WAKE_MILLIS_KEY = "hbase.snapshot.master.wakeMillis";
120 
121   /** Name of the operation to use in the controller */
122   public static final String ONLINE_SNAPSHOT_CONTROLLER_DESCRIPTION = "online-snapshot";
123 
124   /** Conf key for # of threads used by the SnapshotManager thread pool */
125   private static final String SNAPSHOT_POOL_THREADS_KEY = "hbase.snapshot.master.threads";
126 
127   /** Default number of threads used by the SnapshotManager thread pool */
128   private static final int SNAPSHOT_POOL_THREADS_DEFAULT = 1;
129 
130   private boolean stopped;
131   private MasterServices master;  // Needed by TableEventHandlers
132   private MetricsMaster metricsMaster;
133   private ProcedureCoordinator coordinator;
134 
135   // Is snapshot feature enabled?
136   private boolean isSnapshotSupported = false;
137 
138   // Snapshot handlers map, with table name as key.
139   // The map is always accessed and modified under the object lock using synchronized.
140   // snapshotTable() will insert a handler into the map.
141   // isSnapshotDone() will remove the handler requested if the operation is finished.
142   private Map<TableName, SnapshotSentinel> snapshotHandlers =
143       new HashMap<TableName, SnapshotSentinel>();
144 
145   // Restore Sentinels map, with table name as key.
146   // The map is always accessed and modified under the object lock using synchronized.
147   // restoreSnapshot()/cloneSnapshot() will insert a handler into the map.
148   // isRestoreDone() will remove the handler requested if the operation is finished.
149   private Map<TableName, SnapshotSentinel> restoreHandlers =
150       new HashMap<TableName, SnapshotSentinel>();
151 
152   private Path rootDir;
153   private ExecutorService executorService;
154 
155   /**
156    * Snapshot layout version to use when writing a new snapshot.
157    */
158   private int snapshotLayoutVersion = SnapshotDescriptionUtils.SNAPSHOT_LAYOUT_LATEST_FORMAT;
159 
160   public SnapshotManager() {}
161 
162   /**
163    * Fully specify all necessary components of a snapshot manager. Exposed for testing.
164    * @param master services for the master where the manager is running
165    * @param coordinator procedure coordinator instance.  exposed for testing.
166    * @param pool HBase ExecutorServcie instance, exposed for testing.
167    */
168   public SnapshotManager(final MasterServices master, final MetricsMaster metricsMaster,
169       ProcedureCoordinator coordinator, ExecutorService pool)
170       throws IOException, UnsupportedOperationException {
171     this.master = master;
172     this.metricsMaster = metricsMaster;
173 
174     this.rootDir = master.getMasterFileSystem().getRootDir();
175     checkSnapshotSupport(master.getConfiguration(), master.getMasterFileSystem());
176 
177     this.snapshotLayoutVersion = SnapshotDescriptionUtils.getDefaultSnapshotLayoutFormat(
178         master.getConfiguration());
179 
180     this.coordinator = coordinator;
181     this.executorService = pool;
182     resetTempDir();
183   }
184 
185   /**
186    * Gets the list of all completed snapshots.
187    * @return list of SnapshotDescriptions
188    * @throws IOException File system exception
189    */
190   public List<SnapshotDescription> getCompletedSnapshots() throws IOException {
191     return getCompletedSnapshots(SnapshotDescriptionUtils.getSnapshotsDir(rootDir));
192   }
193 
194   /**
195    * Gets the list of all completed snapshots.
196    * @param snapshotDir snapshot directory
197    * @return list of SnapshotDescriptions
198    * @throws IOException File system exception
199    */
200   private List<SnapshotDescription> getCompletedSnapshots(Path snapshotDir) throws IOException {
201     List<SnapshotDescription> snapshotDescs = new ArrayList<SnapshotDescription>();
202     // first create the snapshot root path and check to see if it exists
203     FileSystem fs = master.getMasterFileSystem().getFileSystem();
204     if (snapshotDir == null) snapshotDir = SnapshotDescriptionUtils.getSnapshotsDir(rootDir);
205 
206     // if there are no snapshots, return an empty list
207     if (!fs.exists(snapshotDir)) {
208       return snapshotDescs;
209     }
210 
211     // ignore all the snapshots in progress
212     FileStatus[] snapshots = fs.listStatus(snapshotDir,
213       new SnapshotDescriptionUtils.CompletedSnaphotDirectoriesFilter(fs));
214     // loop through all the completed snapshots
215     for (FileStatus snapshot : snapshots) {
216       Path info = new Path(snapshot.getPath(), SnapshotDescriptionUtils.SNAPSHOTINFO_FILE);
217       // if the snapshot is bad
218       if (!fs.exists(info)) {
219         LOG.error("Snapshot information for " + snapshot.getPath() + " doesn't exist");
220         continue;
221       }
222       FSDataInputStream in = null;
223       try {
224         in = fs.open(info);
225         SnapshotDescription desc = SnapshotDescription.parseFrom(in);
226         snapshotDescs.add(desc);
227       } catch (IOException e) {
228         LOG.warn("Found a corrupted snapshot " + snapshot.getPath(), e);
229       } finally {
230         if (in != null) {
231           in.close();
232         }
233       }
234     }
235     return snapshotDescs;
236   }
237 
238   /**
239    * Cleans up any snapshots in the snapshot/.tmp directory that were left from failed
240    * snapshot attempts.
241    *
242    * @throws IOException if we can't reach the filesystem
243    */
244   void resetTempDir() throws IOException {
245     // cleanup any existing snapshots.
246     Path tmpdir = SnapshotDescriptionUtils.getWorkingSnapshotDir(rootDir);
247     if (master.getMasterFileSystem().getFileSystem().exists(tmpdir)) {
248       if (!master.getMasterFileSystem().getFileSystem().delete(tmpdir, true)) {
249         LOG.warn("Couldn't delete working snapshot directory: " + tmpdir);
250       }
251     }
252   }
253 
254   /**
255    * Delete the specified snapshot
256    * @param snapshot
257    * @throws SnapshotDoesNotExistException If the specified snapshot does not exist.
258    * @throws IOException For filesystem IOExceptions
259    */
260   public void deleteSnapshot(SnapshotDescription snapshot) throws SnapshotDoesNotExistException, IOException {
261 
262     // call coproc pre hook
263     MasterCoprocessorHost cpHost = master.getCoprocessorHost();
264     if (cpHost != null) {
265       cpHost.preDeleteSnapshot(snapshot);
266     }
267 
268     // check to see if it is completed
269     if (!isSnapshotCompleted(snapshot)) {
270       throw new SnapshotDoesNotExistException(snapshot);
271     }
272 
273     String snapshotName = snapshot.getName();
274     LOG.debug("Deleting snapshot: " + snapshotName);
275     // first create the snapshot description and check to see if it exists
276     MasterFileSystem fs = master.getMasterFileSystem();
277     Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshotName, rootDir);
278 
279     // delete the existing snapshot
280     if (!fs.getFileSystem().delete(snapshotDir, true)) {
281       throw new HBaseSnapshotException("Failed to delete snapshot directory: " + snapshotDir);
282     }
283 
284     // call coproc post hook
285     if (cpHost != null) {
286       cpHost.postDeleteSnapshot(snapshot);
287     }
288 
289   }
290 
291   /**
292    * Check if the specified snapshot is done
293    *
294    * @param expected
295    * @return true if snapshot is ready to be restored, false if it is still being taken.
296    * @throws IOException IOException if error from HDFS or RPC
297    * @throws UnknownSnapshotException if snapshot is invalid or does not exist.
298    */
299   public boolean isSnapshotDone(SnapshotDescription expected) throws IOException {
300     // check the request to make sure it has a snapshot
301     if (expected == null) {
302       throw new UnknownSnapshotException(
303          "No snapshot name passed in request, can't figure out which snapshot you want to check.");
304     }
305 
306     String ssString = ClientSnapshotDescriptionUtils.toString(expected);
307 
308     // check to see if the sentinel exists,
309     // and if the task is complete removes it from the in-progress snapshots map.
310     SnapshotSentinel handler = removeSentinelIfFinished(this.snapshotHandlers, expected);
311 
312     // stop tracking "abandoned" handlers
313     cleanupSentinels();
314 
315     if (handler == null) {
316       // If there's no handler in the in-progress map, it means one of the following:
317       //   - someone has already requested the snapshot state
318       //   - the requested snapshot was completed long time ago (cleanupSentinels() timeout)
319       //   - the snapshot was never requested
320       // In those cases returns to the user the "done state" if the snapshots exists on disk,
321       // otherwise raise an exception saying that the snapshot is not running and doesn't exist.
322       if (!isSnapshotCompleted(expected)) {
323         throw new UnknownSnapshotException("Snapshot " + ssString
324             + " is not currently running or one of the known completed snapshots.");
325       }
326       // was done, return true;
327       return true;
328     }
329 
330     // pass on any failure we find in the sentinel
331     try {
332       handler.rethrowExceptionIfFailed();
333     } catch (ForeignException e) {
334       // Give some procedure info on an exception.
335       String status;
336       Procedure p = coordinator.getProcedure(expected.getName());
337       if (p != null) {
338         status = p.getStatus();
339       } else {
340         status = expected.getName() + " not found in proclist " + coordinator.getProcedureNames();
341       }
342       throw new HBaseSnapshotException("Snapshot " + ssString +  " had an error.  " + status, e,
343           expected);
344     }
345 
346     // check to see if we are done
347     if (handler.isFinished()) {
348       LOG.debug("Snapshot '" + ssString + "' has completed, notifying client.");
349       return true;
350     } else if (LOG.isDebugEnabled()) {
351       LOG.debug("Snapshoting '" + ssString + "' is still in progress!");
352     }
353     return false;
354   }
355 
356   /**
357    * Check to see if there is a snapshot in progress with the same name or on the same table.
358    * Currently we have a limitation only allowing a single snapshot per table at a time. Also we
359    * don't allow snapshot with the same name.
360    * @param snapshot description of the snapshot being checked.
361    * @return <tt>true</tt> if there is a snapshot in progress with the same name or on the same
362    *         table.
363    */
364   synchronized boolean isTakingSnapshot(final SnapshotDescription snapshot) {
365     TableName snapshotTable = TableName.valueOf(snapshot.getTable());
366     if (isTakingSnapshot(snapshotTable)) {
367       return true;
368     }
369     Iterator<Map.Entry<TableName, SnapshotSentinel>> it = this.snapshotHandlers.entrySet().iterator();
370     while (it.hasNext()) {
371       Map.Entry<TableName, SnapshotSentinel> entry = it.next();
372       SnapshotSentinel sentinel = entry.getValue();
373       if (snapshot.getName().equals(sentinel.getSnapshot().getName()) && !sentinel.isFinished()) {
374         return true;
375       }
376     }
377     return false;
378   }
379 
380   /**
381    * Check to see if the specified table has a snapshot in progress.  Currently we have a
382    * limitation only allowing a single snapshot per table at a time.
383    * @param tableName name of the table being snapshotted.
384    * @return <tt>true</tt> if there is a snapshot in progress on the specified table.
385    */
386   synchronized boolean isTakingSnapshot(final TableName tableName) {
387     SnapshotSentinel handler = this.snapshotHandlers.get(tableName);
388     return handler != null && !handler.isFinished();
389   }
390 
391   /**
392    * Check to make sure that we are OK to run the passed snapshot. Checks to make sure that we
393    * aren't already running a snapshot or restore on the requested table.
394    * @param snapshot description of the snapshot we want to start
395    * @throws HBaseSnapshotException if the filesystem could not be prepared to start the snapshot
396    */
397   private synchronized void prepareToTakeSnapshot(SnapshotDescription snapshot)
398       throws HBaseSnapshotException {
399     FileSystem fs = master.getMasterFileSystem().getFileSystem();
400     Path workingDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir);
401     TableName snapshotTable =
402         TableName.valueOf(snapshot.getTable());
403 
404     // make sure we aren't already running a snapshot
405     if (isTakingSnapshot(snapshot)) {
406       SnapshotSentinel handler = this.snapshotHandlers.get(snapshotTable);
407       throw new SnapshotCreationException("Rejected taking "
408           + ClientSnapshotDescriptionUtils.toString(snapshot)
409           + " because we are already running another snapshot "
410           + (handler != null ? ("on the same table " +
411               ClientSnapshotDescriptionUtils.toString(handler.getSnapshot()))
412               : "with the same name"), snapshot);
413     }
414 
415     // make sure we aren't running a restore on the same table
416     if (isRestoringTable(snapshotTable)) {
417       SnapshotSentinel handler = restoreHandlers.get(snapshotTable);
418       throw new SnapshotCreationException("Rejected taking "
419           + ClientSnapshotDescriptionUtils.toString(snapshot)
420           + " because we are already have a restore in progress on the same snapshot "
421           + ClientSnapshotDescriptionUtils.toString(handler.getSnapshot()), snapshot);
422     }
423 
424     try {
425       // delete the working directory, since we aren't running the snapshot. Likely leftovers
426       // from a failed attempt.
427       fs.delete(workingDir, true);
428 
429       // recreate the working directory for the snapshot
430       if (!fs.mkdirs(workingDir)) {
431         throw new SnapshotCreationException("Couldn't create working directory (" + workingDir
432             + ") for snapshot" , snapshot);
433       }
434     } catch (HBaseSnapshotException e) {
435       throw e;
436     } catch (IOException e) {
437       throw new SnapshotCreationException(
438           "Exception while checking to see if snapshot could be started.", e, snapshot);
439     }
440   }
441 
442   /**
443    * Take a snapshot of a disabled table.
444    * @param snapshot description of the snapshot to take. Modified to be {@link Type#DISABLED}.
445    * @throws HBaseSnapshotException if the snapshot could not be started
446    */
447   private synchronized void snapshotDisabledTable(SnapshotDescription snapshot)
448       throws HBaseSnapshotException {
449     // setup the snapshot
450     prepareToTakeSnapshot(snapshot);
451 
452     // set the snapshot to be a disabled snapshot, since the client doesn't know about that
453     snapshot = snapshot.toBuilder().setType(Type.DISABLED).build();
454 
455     // Take the snapshot of the disabled table
456     DisabledTableSnapshotHandler handler =
457         new DisabledTableSnapshotHandler(snapshot, master);
458     snapshotTable(snapshot, handler);
459   }
460 
461   /**
462    * Take a snapshot of an enabled table.
463    * @param snapshot description of the snapshot to take.
464    * @throws HBaseSnapshotException if the snapshot could not be started
465    */
466   private synchronized void snapshotEnabledTable(SnapshotDescription snapshot)
467       throws HBaseSnapshotException {
468     // setup the snapshot
469     prepareToTakeSnapshot(snapshot);
470 
471     // Take the snapshot of the enabled table
472     EnabledTableSnapshotHandler handler =
473         new EnabledTableSnapshotHandler(snapshot, master, this);
474     snapshotTable(snapshot, handler);
475   }
476 
477   /**
478    * Take a snapshot using the specified handler.
479    * On failure the snapshot temporary working directory is removed.
480    * NOTE: prepareToTakeSnapshot() called before this one takes care of the rejecting the
481    *       snapshot request if the table is busy with another snapshot/restore operation.
482    * @param snapshot the snapshot description
483    * @param handler the snapshot handler
484    */
485   private synchronized void snapshotTable(SnapshotDescription snapshot,
486       final TakeSnapshotHandler handler) throws HBaseSnapshotException {
487     try {
488       handler.prepare();
489       this.executorService.submit(handler);
490       this.snapshotHandlers.put(TableName.valueOf(snapshot.getTable()), handler);
491     } catch (Exception e) {
492       // cleanup the working directory by trying to delete it from the fs.
493       Path workingDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir);
494       try {
495         if (!this.master.getMasterFileSystem().getFileSystem().delete(workingDir, true)) {
496           LOG.error("Couldn't delete working directory (" + workingDir + " for snapshot:" +
497               ClientSnapshotDescriptionUtils.toString(snapshot));
498         }
499       } catch (IOException e1) {
500         LOG.error("Couldn't delete working directory (" + workingDir + " for snapshot:" +
501             ClientSnapshotDescriptionUtils.toString(snapshot));
502       }
503       // fail the snapshot
504       throw new SnapshotCreationException("Could not build snapshot handler", e, snapshot);
505     }
506   }
507 
508   /**
509    * Take a snapshot based on the enabled/disabled state of the table.
510    *
511    * @param snapshot
512    * @throws HBaseSnapshotException when a snapshot specific exception occurs.
513    * @throws IOException when some sort of generic IO exception occurs.
514    */
515   public void takeSnapshot(SnapshotDescription snapshot) throws IOException {
516     // check to see if we already completed the snapshot
517     if (isSnapshotCompleted(snapshot)) {
518       throw new SnapshotExistsException("Snapshot '" + snapshot.getName()
519           + "' already stored on the filesystem.", snapshot);
520     }
521 
522     LOG.debug("No existing snapshot, attempting snapshot...");
523 
524     // stop tracking "abandoned" handlers
525     cleanupSentinels();
526 
527     // check to see if the table exists
528     HTableDescriptor desc = null;
529     try {
530       desc = master.getTableDescriptors().get(
531           TableName.valueOf(snapshot.getTable()));
532     } catch (FileNotFoundException e) {
533       String msg = "Table:" + snapshot.getTable() + " info doesn't exist!";
534       LOG.error(msg);
535       throw new SnapshotCreationException(msg, e, snapshot);
536     } catch (IOException e) {
537       throw new SnapshotCreationException("Error while geting table description for table "
538           + snapshot.getTable(), e, snapshot);
539     }
540     if (desc == null) {
541       throw new SnapshotCreationException("Table '" + snapshot.getTable()
542           + "' doesn't exist, can't take snapshot.", snapshot);
543     }
544 
545     // if not specified, set the snapshot format
546     if (!snapshot.hasVersion()) {
547       snapshot = snapshot.toBuilder()
548           .setVersion(snapshotLayoutVersion)
549           .build();
550     }
551 
552     // call pre coproc hook
553     MasterCoprocessorHost cpHost = master.getCoprocessorHost();
554     if (cpHost != null) {
555       cpHost.preSnapshot(snapshot, desc);
556     }
557 
558     // if the table is enabled, then have the RS run actually the snapshot work
559     TableName snapshotTable = TableName.valueOf(snapshot.getTable());
560     AssignmentManager assignmentMgr = master.getAssignmentManager();
561     if (assignmentMgr.getZKTable().isEnabledTable(snapshotTable)) {
562       LOG.debug("Table enabled, starting distributed snapshot.");
563       snapshotEnabledTable(snapshot);
564       LOG.debug("Started snapshot: " + ClientSnapshotDescriptionUtils.toString(snapshot));
565     }
566     // For disabled table, snapshot is created by the master
567     else if (assignmentMgr.getZKTable().isDisabledTable(snapshotTable)) {
568       LOG.debug("Table is disabled, running snapshot entirely on master.");
569       snapshotDisabledTable(snapshot);
570       LOG.debug("Started snapshot: " + ClientSnapshotDescriptionUtils.toString(snapshot));
571     } else {
572       LOG.error("Can't snapshot table '" + snapshot.getTable()
573           + "', isn't open or closed, we don't know what to do!");
574       TablePartiallyOpenException tpoe = new TablePartiallyOpenException(snapshot.getTable()
575           + " isn't fully open.");
576       throw new SnapshotCreationException("Table is not entirely open or closed", tpoe, snapshot);
577     }
578 
579     // call post coproc hook
580     if (cpHost != null) {
581       cpHost.postSnapshot(snapshot, desc);
582     }
583   }
584 
585   /**
586    * Set the handler for the current snapshot
587    * <p>
588    * Exposed for TESTING
589    * @param tableName
590    * @param handler handler the master should use
591    *
592    * TODO get rid of this if possible, repackaging, modify tests.
593    */
594   public synchronized void setSnapshotHandlerForTesting(
595       final TableName tableName,
596       final SnapshotSentinel handler) {
597     if (handler != null) {
598       this.snapshotHandlers.put(tableName, handler);
599     } else {
600       this.snapshotHandlers.remove(tableName);
601     }
602   }
603 
604   /**
605    * @return distributed commit coordinator for all running snapshots
606    */
607   ProcedureCoordinator getCoordinator() {
608     return coordinator;
609   }
610 
611   /**
612    * Check to see if the snapshot is one of the currently completed snapshots
613    * Returns true if the snapshot exists in the "completed snapshots folder".
614    *
615    * @param snapshot expected snapshot to check
616    * @return <tt>true</tt> if the snapshot is stored on the {@link FileSystem}, <tt>false</tt> if is
617    *         not stored
618    * @throws IOException if the filesystem throws an unexpected exception,
619    * @throws IllegalArgumentException if snapshot name is invalid.
620    */
621   private boolean isSnapshotCompleted(SnapshotDescription snapshot) throws IOException {
622     try {
623       final Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshot, rootDir);
624       FileSystem fs = master.getMasterFileSystem().getFileSystem();
625       // check to see if the snapshot already exists
626       return fs.exists(snapshotDir);
627     } catch (IllegalArgumentException iae) {
628       throw new UnknownSnapshotException("Unexpected exception thrown", iae);
629     }
630   }
631 
632   /**
633    * Clone the specified snapshot into a new table.
634    * The operation will fail if the destination table has a snapshot or restore in progress.
635    *
636    * @param snapshot Snapshot Descriptor
637    * @param hTableDescriptor Table Descriptor of the table to create
638    */
639   synchronized void cloneSnapshot(final SnapshotDescription snapshot,
640       final HTableDescriptor hTableDescriptor) throws HBaseSnapshotException {
641     TableName tableName = hTableDescriptor.getTableName();
642 
643     // make sure we aren't running a snapshot on the same table
644     if (isTakingSnapshot(tableName)) {
645       throw new RestoreSnapshotException("Snapshot in progress on the restore table=" + tableName);
646     }
647 
648     // make sure we aren't running a restore on the same table
649     if (isRestoringTable(tableName)) {
650       throw new RestoreSnapshotException("Restore already in progress on the table=" + tableName);
651     }
652 
653     try {
654       CloneSnapshotHandler handler =
655         new CloneSnapshotHandler(master, snapshot, hTableDescriptor).prepare();
656       this.executorService.submit(handler);
657       this.restoreHandlers.put(tableName, handler);
658     } catch (Exception e) {
659       String msg = "Couldn't clone the snapshot=" + ClientSnapshotDescriptionUtils.toString(snapshot) +
660         " on table=" + tableName;
661       LOG.error(msg, e);
662       throw new RestoreSnapshotException(msg, e);
663     }
664   }
665 
666   /**
667    * Restore the specified snapshot
668    * @param reqSnapshot
669    * @throws IOException
670    */
671   public void restoreSnapshot(SnapshotDescription reqSnapshot) throws IOException {
672     FileSystem fs = master.getMasterFileSystem().getFileSystem();
673     Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(reqSnapshot, rootDir);
674     MasterCoprocessorHost cpHost = master.getCoprocessorHost();
675 
676     // check if the snapshot exists
677     if (!fs.exists(snapshotDir)) {
678       LOG.error("A Snapshot named '" + reqSnapshot.getName() + "' does not exist.");
679       throw new SnapshotDoesNotExistException(reqSnapshot);
680     }
681 
682     // read snapshot information
683     SnapshotDescription fsSnapshot = SnapshotDescriptionUtils.readSnapshotInfo(fs, snapshotDir);
684     SnapshotManifest manifest = SnapshotManifest.open(master.getConfiguration(), fs,
685         snapshotDir, fsSnapshot);
686     HTableDescriptor snapshotTableDesc = manifest.getTableDescriptor();
687     TableName tableName = TableName.valueOf(reqSnapshot.getTable());
688 
689     // stop tracking "abandoned" handlers
690     cleanupSentinels();
691 
692     // Verify snapshot validity
693     SnapshotReferenceUtil.verifySnapshot(master.getConfiguration(), fs, manifest);
694 
695     // Execute the restore/clone operation
696     if (MetaReader.tableExists(master.getCatalogTracker(), tableName)) {
697       if (master.getAssignmentManager().getZKTable().isEnabledTable(
698           TableName.valueOf(fsSnapshot.getTable()))) {
699         throw new UnsupportedOperationException("Table '" +
700             TableName.valueOf(fsSnapshot.getTable()) + "' must be disabled in order to " +
701             "perform a restore operation" +
702             ".");
703       }
704 
705       // call coproc pre hook
706       if (cpHost != null) {
707         cpHost.preRestoreSnapshot(reqSnapshot, snapshotTableDesc);
708       }
709       restoreSnapshot(fsSnapshot, snapshotTableDesc);
710       LOG.info("Restore snapshot=" + fsSnapshot.getName() + " as table=" + tableName);
711 
712       if (cpHost != null) {
713         cpHost.postRestoreSnapshot(reqSnapshot, snapshotTableDesc);
714       }
715     } else {
716       HTableDescriptor htd = RestoreSnapshotHelper.cloneTableSchema(snapshotTableDesc, tableName);
717       if (cpHost != null) {
718         cpHost.preCloneSnapshot(reqSnapshot, htd);
719       }
720       cloneSnapshot(fsSnapshot, htd);
721       LOG.info("Clone snapshot=" + fsSnapshot.getName() + " as table=" + tableName);
722 
723       if (cpHost != null) {
724         cpHost.postCloneSnapshot(reqSnapshot, htd);
725       }
726     }
727   }
728 
729   /**
730    * Restore the specified snapshot.
731    * The restore will fail if the destination table has a snapshot or restore in progress.
732    *
733    * @param snapshot Snapshot Descriptor
734    * @param hTableDescriptor Table Descriptor
735    */
736   private synchronized void restoreSnapshot(final SnapshotDescription snapshot,
737       final HTableDescriptor hTableDescriptor) throws HBaseSnapshotException {
738     TableName tableName = hTableDescriptor.getTableName();
739 
740     // make sure we aren't running a snapshot on the same table
741     if (isTakingSnapshot(tableName)) {
742       throw new RestoreSnapshotException("Snapshot in progress on the restore table=" + tableName);
743     }
744 
745     // make sure we aren't running a restore on the same table
746     if (isRestoringTable(tableName)) {
747       throw new RestoreSnapshotException("Restore already in progress on the table=" + tableName);
748     }
749 
750     try {
751       RestoreSnapshotHandler handler =
752         new RestoreSnapshotHandler(master, snapshot, hTableDescriptor).prepare();
753       this.executorService.submit(handler);
754       restoreHandlers.put(tableName, handler);
755     } catch (Exception e) {
756       String msg = "Couldn't restore the snapshot=" + ClientSnapshotDescriptionUtils.toString(
757           snapshot)  +
758           " on table=" + tableName;
759       LOG.error(msg, e);
760       throw new RestoreSnapshotException(msg, e);
761     }
762   }
763 
764   /**
765    * Verify if the restore of the specified table is in progress.
766    *
767    * @param tableName table under restore
768    * @return <tt>true</tt> if there is a restore in progress of the specified table.
769    */
770   private synchronized boolean isRestoringTable(final TableName tableName) {
771     SnapshotSentinel sentinel = this.restoreHandlers.get(tableName);
772     return(sentinel != null && !sentinel.isFinished());
773   }
774 
775   /**
776    * Returns the status of a restore operation.
777    * If the in-progress restore is failed throws the exception that caused the failure.
778    *
779    * @param snapshot
780    * @return false if in progress, true if restore is completed or not requested.
781    * @throws IOException if there was a failure during the restore
782    */
783   public boolean isRestoreDone(final SnapshotDescription snapshot) throws IOException {
784     // check to see if the sentinel exists,
785     // and if the task is complete removes it from the in-progress restore map.
786     SnapshotSentinel sentinel = removeSentinelIfFinished(this.restoreHandlers, snapshot);
787 
788     // stop tracking "abandoned" handlers
789     cleanupSentinels();
790 
791     if (sentinel == null) {
792       // there is no sentinel so restore is not in progress.
793       return true;
794     }
795 
796     LOG.debug("Verify snapshot=" + snapshot.getName() + " against="
797         + sentinel.getSnapshot().getName() + " table=" +
798         TableName.valueOf(snapshot.getTable()));
799 
800     // If the restore is failed, rethrow the exception
801     sentinel.rethrowExceptionIfFailed();
802 
803     // check to see if we are done
804     if (sentinel.isFinished()) {
805       LOG.debug("Restore snapshot=" + ClientSnapshotDescriptionUtils.toString(snapshot) +
806           " has completed. Notifying the client.");
807       return true;
808     }
809 
810     if (LOG.isDebugEnabled()) {
811       LOG.debug("Sentinel is not yet finished with restoring snapshot=" +
812           ClientSnapshotDescriptionUtils.toString(snapshot));
813     }
814     return false;
815   }
816 
817   /**
818    * Return the handler if it is currently live and has the same snapshot target name.
819    * The handler is removed from the sentinels map if completed.
820    * @param sentinels live handlers
821    * @param snapshot snapshot description
822    * @return null if doesn't match, else a live handler.
823    */
824   private synchronized SnapshotSentinel removeSentinelIfFinished(
825       final Map<TableName, SnapshotSentinel> sentinels,
826       final SnapshotDescription snapshot) {
827     if (!snapshot.hasTable()) {
828       return null;
829     }
830 
831     TableName snapshotTable = TableName.valueOf(snapshot.getTable());
832     SnapshotSentinel h = sentinels.get(snapshotTable);
833     if (h == null) {
834       return null;
835     }
836 
837     if (!h.getSnapshot().getName().equals(snapshot.getName())) {
838       // specified snapshot is to the one currently running
839       return null;
840     }
841 
842     // Remove from the "in-progress" list once completed
843     if (h.isFinished()) {
844       sentinels.remove(snapshotTable);
845     }
846 
847     return h;
848   }
849 
850   /**
851    * Removes "abandoned" snapshot/restore requests.
852    * As part of the HBaseAdmin snapshot/restore API the operation status is checked until completed,
853    * and the in-progress maps are cleaned up when the status of a completed task is requested.
854    * To avoid having sentinels staying around for long time if something client side is failed,
855    * each operation tries to clean up the in-progress maps sentinels finished from a long time.
856    */
857   private void cleanupSentinels() {
858     cleanupSentinels(this.snapshotHandlers);
859     cleanupSentinels(this.restoreHandlers);
860   }
861 
862   /**
863    * Remove the sentinels that are marked as finished and the completion time
864    * has exceeded the removal timeout.
865    * @param sentinels map of sentinels to clean
866    */
867   private synchronized void cleanupSentinels(final Map<TableName, SnapshotSentinel> sentinels) {
868     long currentTime = EnvironmentEdgeManager.currentTimeMillis();
869     Iterator<Map.Entry<TableName, SnapshotSentinel>> it =
870         sentinels.entrySet().iterator();
871     while (it.hasNext()) {
872       Map.Entry<TableName, SnapshotSentinel> entry = it.next();
873       SnapshotSentinel sentinel = entry.getValue();
874       if (sentinel.isFinished() &&
875           (currentTime - sentinel.getCompletionTimestamp()) > SNAPSHOT_SENTINELS_CLEANUP_TIMEOUT)
876       {
877         it.remove();
878       }
879     }
880   }
881 
882   //
883   // Implementing Stoppable interface
884   //
885 
886   @Override
887   public void stop(String why) {
888     // short circuit
889     if (this.stopped) return;
890     // make sure we get stop
891     this.stopped = true;
892     // pass the stop onto take snapshot handlers
893     for (SnapshotSentinel snapshotHandler: this.snapshotHandlers.values()) {
894       snapshotHandler.cancel(why);
895     }
896 
897     // pass the stop onto all the restore handlers
898     for (SnapshotSentinel restoreHandler: this.restoreHandlers.values()) {
899       restoreHandler.cancel(why);
900     }
901     try {
902       if (coordinator != null) {
903         coordinator.close();
904       }
905     } catch (IOException e) {
906       LOG.error("stop ProcedureCoordinator error", e);
907     }
908   }
909 
910   @Override
911   public boolean isStopped() {
912     return this.stopped;
913   }
914 
915   /**
916    * Throws an exception if snapshot operations (take a snapshot, restore, clone) are not supported.
917    * Called at the beginning of snapshot() and restoreSnapshot() methods.
918    * @throws UnsupportedOperationException if snapshot are not supported
919    */
920   public void checkSnapshotSupport() throws UnsupportedOperationException {
921     if (!this.isSnapshotSupported) {
922       throw new UnsupportedOperationException(
923         "To use snapshots, You must add to the hbase-site.xml of the HBase Master: '" +
924           HBASE_SNAPSHOT_ENABLED + "' property with value 'true'.");
925     }
926   }
927 
928   /**
929    * Called at startup, to verify if snapshot operation is supported, and to avoid
930    * starting the master if there're snapshots present but the cleaners needed are missing.
931    * Otherwise we can end up with snapshot data loss.
932    * @param conf The {@link Configuration} object to use
933    * @param mfs The MasterFileSystem to use
934    * @throws IOException in case of file-system operation failure
935    * @throws UnsupportedOperationException in case cleaners are missing and
936    *         there're snapshot in the system
937    */
938   private void checkSnapshotSupport(final Configuration conf, final MasterFileSystem mfs)
939       throws IOException, UnsupportedOperationException {
940     // Verify if snapshot is disabled by the user
941     String enabled = conf.get(HBASE_SNAPSHOT_ENABLED);
942     boolean snapshotEnabled = conf.getBoolean(HBASE_SNAPSHOT_ENABLED, false);
943     boolean userDisabled = (enabled != null && enabled.trim().length() > 0 && !snapshotEnabled);
944 
945     // Extract cleaners from conf
946     Set<String> hfileCleaners = new HashSet<String>();
947     String[] cleaners = conf.getStrings(HFileCleaner.MASTER_HFILE_CLEANER_PLUGINS);
948     if (cleaners != null) Collections.addAll(hfileCleaners, cleaners);
949 
950     Set<String> logCleaners = new HashSet<String>();
951     cleaners = conf.getStrings(HConstants.HBASE_MASTER_LOGCLEANER_PLUGINS);
952     if (cleaners != null) Collections.addAll(logCleaners, cleaners);
953 
954     // check if an older version of snapshot directory was present
955     Path oldSnapshotDir = new Path(mfs.getRootDir(), HConstants.OLD_SNAPSHOT_DIR_NAME);
956     FileSystem fs = mfs.getFileSystem();
957     List<SnapshotDescription> ss = getCompletedSnapshots(new Path(rootDir, oldSnapshotDir));
958     if (ss != null && !ss.isEmpty()) {
959       LOG.error("Snapshots from an earlier release were found under: " + oldSnapshotDir);
960       LOG.error("Please rename the directory as " + HConstants.SNAPSHOT_DIR_NAME);
961     }
962 
963     // If the user has enabled the snapshot, we force the cleaners to be present
964     // otherwise we still need to check if cleaners are enabled or not and verify
965     // that there're no snapshot in the .snapshot folder.
966     if (snapshotEnabled) {
967       // Inject snapshot cleaners, if snapshot.enable is true
968       hfileCleaners.add(SnapshotHFileCleaner.class.getName());
969       hfileCleaners.add(HFileLinkCleaner.class.getName());
970       logCleaners.add(SnapshotLogCleaner.class.getName());
971 
972       // Set cleaners conf
973       conf.setStrings(HFileCleaner.MASTER_HFILE_CLEANER_PLUGINS,
974         hfileCleaners.toArray(new String[hfileCleaners.size()]));
975       conf.setStrings(HConstants.HBASE_MASTER_LOGCLEANER_PLUGINS,
976         logCleaners.toArray(new String[logCleaners.size()]));
977     } else {
978       // Verify if cleaners are present
979       snapshotEnabled = logCleaners.contains(SnapshotLogCleaner.class.getName()) &&
980         hfileCleaners.contains(SnapshotHFileCleaner.class.getName()) &&
981         hfileCleaners.contains(HFileLinkCleaner.class.getName());
982 
983       // Warn if the cleaners are enabled but the snapshot.enabled property is false/not set.
984       if (snapshotEnabled) {
985         LOG.warn("Snapshot log and hfile cleaners are present in the configuration, " +
986           "but the '" + HBASE_SNAPSHOT_ENABLED + "' property " +
987           (userDisabled ? "is set to 'false'." : "is not set."));
988       }
989     }
990 
991     // Mark snapshot feature as enabled if cleaners are present and user has not disabled it.
992     this.isSnapshotSupported = snapshotEnabled && !userDisabled;
993 
994     // If cleaners are not enabled, verify that there're no snapshot in the .snapshot folder
995     // otherwise we end up with snapshot data loss.
996     if (!snapshotEnabled) {
997       LOG.info("Snapshot feature is not enabled, missing log and hfile cleaners.");
998       Path snapshotDir = SnapshotDescriptionUtils.getSnapshotsDir(mfs.getRootDir());
999       if (fs.exists(snapshotDir)) {
1000         FileStatus[] snapshots = FSUtils.listStatus(fs, snapshotDir,
1001           new SnapshotDescriptionUtils.CompletedSnaphotDirectoriesFilter(fs));
1002         if (snapshots != null) {
1003           LOG.error("Snapshots are present, but cleaners are not enabled.");
1004           checkSnapshotSupport();
1005         }
1006       }
1007     }
1008   }
1009 
1010   @Override
1011   public void initialize(MasterServices master, MetricsMaster metricsMaster) throws KeeperException,
1012       IOException, UnsupportedOperationException {
1013     this.master = master;
1014     this.metricsMaster = metricsMaster;
1015 
1016     this.rootDir = master.getMasterFileSystem().getRootDir();
1017     checkSnapshotSupport(master.getConfiguration(), master.getMasterFileSystem());
1018     this.snapshotLayoutVersion = SnapshotDescriptionUtils.getDefaultSnapshotLayoutFormat(
1019       master.getConfiguration());
1020     
1021     // get the configuration for the coordinator
1022     Configuration conf = master.getConfiguration();
1023     long wakeFrequency = conf.getInt(SNAPSHOT_WAKE_MILLIS_KEY, SNAPSHOT_WAKE_MILLIS_DEFAULT);
1024     long timeoutMillis = Math.max(conf.getLong(SnapshotDescriptionUtils.SNAPSHOT_TIMEOUT_MILLIS_KEY,
1025                     SnapshotDescriptionUtils.SNAPSHOT_TIMEOUT_MILLIS_DEFAULT),
1026             conf.getLong(SnapshotDescriptionUtils.MASTER_SNAPSHOT_TIMEOUT_MILLIS,
1027                     SnapshotDescriptionUtils.DEFAULT_MAX_WAIT_TIME));
1028     int opThreads = conf.getInt(SNAPSHOT_POOL_THREADS_KEY, SNAPSHOT_POOL_THREADS_DEFAULT);
1029 
1030     // setup the default procedure coordinator
1031     String name = master.getServerName().toString();
1032     ThreadPoolExecutor tpool = ProcedureCoordinator.defaultPool(name, opThreads);
1033     ProcedureCoordinatorRpcs comms = new ZKProcedureCoordinatorRpcs(
1034         master.getZooKeeper(), SnapshotManager.ONLINE_SNAPSHOT_CONTROLLER_DESCRIPTION, name);
1035 
1036     this.coordinator = new ProcedureCoordinator(comms, tpool, timeoutMillis, wakeFrequency);
1037     this.executorService = master.getExecutorService();
1038     resetTempDir();
1039   }
1040 
1041   @Override
1042   public String getProcedureSignature() {
1043     return ONLINE_SNAPSHOT_CONTROLLER_DESCRIPTION;
1044   }
1045 
1046   @Override
1047   public void execProcedure(ProcedureDescription desc) throws IOException {
1048     takeSnapshot(toSnapshotDescription(desc));
1049   }
1050 
1051   @Override
1052   public boolean isProcedureDone(ProcedureDescription desc) throws IOException {
1053     return isSnapshotDone(toSnapshotDescription(desc));
1054   }
1055 
1056   private SnapshotDescription toSnapshotDescription(ProcedureDescription desc)
1057       throws IOException {
1058     SnapshotDescription.Builder builder = SnapshotDescription.newBuilder();
1059     if (!desc.hasInstance()) {
1060       throw new IOException("Snapshot name is not defined: " + desc.toString());
1061     }
1062     String snapshotName = desc.getInstance();
1063     List<NameStringPair> props = desc.getConfigurationList();
1064     String table = null;
1065     for (NameStringPair prop : props) {
1066       if ("table".equalsIgnoreCase(prop.getName())) {
1067         table = prop.getValue();
1068       }
1069     }
1070     if (table == null) {
1071       throw new IOException("Snapshot table is not defined: " + desc.toString());
1072     }
1073     TableName tableName = TableName.valueOf(table);
1074     builder.setTable(tableName.getNameAsString());
1075     builder.setName(snapshotName);
1076     builder.setType(SnapshotDescription.Type.FLUSH);
1077     return builder.build();
1078   }
1079 }