View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master.snapshot;
19  
20  import java.io.FileNotFoundException;
21  import java.io.IOException;
22  import java.util.ArrayList;
23  import java.util.Collections;
24  import java.util.HashMap;
25  import java.util.HashSet;
26  import java.util.Iterator;
27  import java.util.List;
28  import java.util.Map;
29  import java.util.Set;
30  import java.util.concurrent.ThreadPoolExecutor;
31  
32  import org.apache.commons.logging.Log;
33  import org.apache.commons.logging.LogFactory;
34  import org.apache.hadoop.classification.InterfaceAudience;
35  import org.apache.hadoop.classification.InterfaceStability;
36  import org.apache.hadoop.conf.Configuration;
37  import org.apache.hadoop.fs.FSDataInputStream;
38  import org.apache.hadoop.fs.FileStatus;
39  import org.apache.hadoop.fs.FileSystem;
40  import org.apache.hadoop.fs.Path;
41  import org.apache.hadoop.hbase.TableName;
42  import org.apache.hadoop.hbase.HConstants;
43  import org.apache.hadoop.hbase.HTableDescriptor;
44  import org.apache.hadoop.hbase.Stoppable;
45  import org.apache.hadoop.hbase.catalog.MetaReader;
46  import org.apache.hadoop.hbase.errorhandling.ForeignException;
47  import org.apache.hadoop.hbase.executor.ExecutorService;
48  import org.apache.hadoop.hbase.master.AssignmentManager;
49  import org.apache.hadoop.hbase.master.MasterCoprocessorHost;
50  import org.apache.hadoop.hbase.master.MasterFileSystem;
51  import org.apache.hadoop.hbase.master.MasterServices;
52  import org.apache.hadoop.hbase.master.MetricsMaster;
53  import org.apache.hadoop.hbase.master.SnapshotSentinel;
54  import org.apache.hadoop.hbase.master.cleaner.HFileCleaner;
55  import org.apache.hadoop.hbase.master.cleaner.HFileLinkCleaner;
56  import org.apache.hadoop.hbase.procedure.Procedure;
57  import org.apache.hadoop.hbase.procedure.ProcedureCoordinator;
58  import org.apache.hadoop.hbase.procedure.ProcedureCoordinatorRpcs;
59  import org.apache.hadoop.hbase.procedure.ZKProcedureCoordinatorRpcs;
60  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
61  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription;
62  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription.Type;
63  import org.apache.hadoop.hbase.snapshot.ClientSnapshotDescriptionUtils;
64  import org.apache.hadoop.hbase.snapshot.HBaseSnapshotException;
65  import org.apache.hadoop.hbase.snapshot.RestoreSnapshotException;
66  import org.apache.hadoop.hbase.snapshot.RestoreSnapshotHelper;
67  import org.apache.hadoop.hbase.snapshot.SnapshotCreationException;
68  import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
69  import org.apache.hadoop.hbase.snapshot.SnapshotDoesNotExistException;
70  import org.apache.hadoop.hbase.snapshot.SnapshotExistsException;
71  import org.apache.hadoop.hbase.snapshot.TablePartiallyOpenException;
72  import org.apache.hadoop.hbase.snapshot.UnknownSnapshotException;
73  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
74  import org.apache.hadoop.hbase.util.FSTableDescriptors;
75  import org.apache.hadoop.hbase.util.FSUtils;
76  import org.apache.zookeeper.KeeperException;
77  
78  /**
79   * This class manages the procedure of taking and restoring snapshots. There is only one
80   * SnapshotManager for the master.
81   * <p>
82   * The class provides methods for monitoring in-progress snapshot actions.
83   * <p>
84   * Note: Currently there can only be one snapshot being taken at a time over the cluster. This is a
85   * simplification in the current implementation.
86   */
87  @InterfaceAudience.Private
88  @InterfaceStability.Unstable
89  public class SnapshotManager implements Stoppable {
90    private static final Log LOG = LogFactory.getLog(SnapshotManager.class);
91  
92    /** Default interval (ms) between checks for snapshot completion or errors */
93    private static final int SNAPSHOT_WAKE_MILLIS_DEFAULT = 500;
94  
95    /**
96     * Wait time before removing a finished sentinel from the in-progress map
97     *
98     * NOTE: This is used as a safety auto cleanup.
99    * The snapshot and restore handlers map entries are removed when a user asks if a snapshot or
100    * restore is completed. This operation is part of the HBaseAdmin snapshot/restore API flow.
101    * In case something fails on the client side and the snapshot/restore state is not reclaimed
102    * after a default timeout, the entry is removed from the in-progress map.
103    * At this point, if the user asks for the snapshot/restore status, the result will be
104    * "done" if the snapshot exists, or "failed" if it doesn't exist.
105    */
106   private static final int SNAPSHOT_SENTINELS_CLEANUP_TIMEOUT = 60 * 1000;
107 
108   /** Enable or disable snapshot support */
109   public static final String HBASE_SNAPSHOT_ENABLED = "hbase.snapshot.enabled";
110 
111   /**
112    * Conf key for # of ms elapsed between checks for snapshot errors while waiting for
113    * completion.
114    */
115   private static final String SNAPSHOT_WAKE_MILLIS_KEY = "hbase.snapshot.master.wakeMillis";
116 
117   /** Default time (ms) to wait for completion before injecting a snapshot timeout error */
118   private static final int SNAPSHOT_TIMEOUT_MILLIS_DEFAULT = 60000;
119 
120   /**
121    * Conf key for # of ms elapsed before injecting a snapshot timeout error when waiting for
122    * completion.
123    */
124   private static final String SNAPSHOT_TIMEOUT_MILLIS_KEY = "hbase.snapshot.master.timeoutMillis";
125 
126   /** Name of the operation to use in the controller */
127   public static final String ONLINE_SNAPSHOT_CONTROLLER_DESCRIPTION = "online-snapshot";
128 
129   /** Conf key for # of threads used by the SnapshotManager thread pool */
130   private static final String SNAPSHOT_POOL_THREADS_KEY = "hbase.snapshot.master.threads";
131 
132   /** Default # of threads used by the SnapshotManager thread pool */
133   private static final int SNAPSHOT_POOL_THREADS_DEFAULT = 1;
134 
135   private boolean stopped;
136   private final MasterServices master;  // Needed by TableEventHandlers
137   private final MetricsMaster metricsMaster;
138   private final ProcedureCoordinator coordinator;
139 
140   // Is snapshot feature enabled?
141   private boolean isSnapshotSupported = false;
142 
143   // Snapshot handlers map, with table name as key.
144   // The map is always accessed and modified under the object lock using synchronized.
145   // snapshotTable() will insert a handler in the map.
146   // isSnapshotDone() will remove the handler requested if the operation is finished.
147   private Map<TableName, SnapshotSentinel> snapshotHandlers =
148       new HashMap<TableName, SnapshotSentinel>();
149 
150   // Restore Sentinels map, with table name as key.
151   // The map is always accessed and modified under the object lock using synchronized.
152   // restoreSnapshot()/cloneSnapshot() will insert a handler in the map.
153   // isRestoreDone() will remove the handler requested if the operation is finished.
154   private Map<TableName, SnapshotSentinel> restoreHandlers =
155       new HashMap<TableName, SnapshotSentinel>();
156 
157   private final Path rootDir;
158   private final ExecutorService executorService;
159 
160   /**
161    * Construct a snapshot manager.
162    * @param master
163    */
164   public SnapshotManager(final MasterServices master, final MetricsMaster metricsMaster)
165       throws KeeperException, IOException, UnsupportedOperationException {
166     this.master = master;
167     this.metricsMaster = metricsMaster;
168 
169     this.rootDir = master.getMasterFileSystem().getRootDir();
170     checkSnapshotSupport(master.getConfiguration(), master.getMasterFileSystem());
171 
172     // get the configuration for the coordinator
173     Configuration conf = master.getConfiguration();
174     long wakeFrequency = conf.getInt(SNAPSHOT_WAKE_MILLIS_KEY, SNAPSHOT_WAKE_MILLIS_DEFAULT);
175     long timeoutMillis = conf.getLong(SNAPSHOT_TIMEOUT_MILLIS_KEY, SNAPSHOT_TIMEOUT_MILLIS_DEFAULT);
176     int opThreads = conf.getInt(SNAPSHOT_POOL_THREADS_KEY, SNAPSHOT_POOL_THREADS_DEFAULT);
177 
178     // setup the default procedure coordinator
179     String name = master.getServerName().toString();
180     ThreadPoolExecutor tpool = ProcedureCoordinator.defaultPool(name, opThreads);
181     ProcedureCoordinatorRpcs comms = new ZKProcedureCoordinatorRpcs(
182         master.getZooKeeper(), SnapshotManager.ONLINE_SNAPSHOT_CONTROLLER_DESCRIPTION, name);
183 
184     this.coordinator = new ProcedureCoordinator(comms, tpool, timeoutMillis, wakeFrequency);
185     this.executorService = master.getExecutorService();
186     resetTempDir();
187   }
188 
189   /**
190    * Fully specify all necessary components of a snapshot manager. Exposed for testing.
191    * @param master services for the master where the manager is running
192    * @param coordinator procedure coordinator instance.  exposed for testing.
193    * @param pool HBase ExecutorServcie instance, exposed for testing.
194    */
195   public SnapshotManager(final MasterServices master, final MetricsMaster metricsMaster,
196       ProcedureCoordinator coordinator, ExecutorService pool)
197       throws IOException, UnsupportedOperationException {
198     this.master = master;
199     this.metricsMaster = metricsMaster;
200 
201     this.rootDir = master.getMasterFileSystem().getRootDir();
202     checkSnapshotSupport(master.getConfiguration(), master.getMasterFileSystem());
203 
204     this.coordinator = coordinator;
205     this.executorService = pool;
206     resetTempDir();
207   }
208 
209   /**
210    * Gets the list of all completed snapshots.
211    * @return list of SnapshotDescriptions
212    * @throws IOException File system exception
213    */
214   public List<SnapshotDescription> getCompletedSnapshots() throws IOException {
215     return getCompletedSnapshots(SnapshotDescriptionUtils.getSnapshotsDir(rootDir));
216   }
217 
218   /**
219    * Gets the list of all completed snapshots.
220    * @param snapshotDir snapshot directory
221    * @return list of SnapshotDescriptions
222    * @throws IOException File system exception
223    */
224   private List<SnapshotDescription> getCompletedSnapshots(Path snapshotDir) throws IOException {
225     List<SnapshotDescription> snapshotDescs = new ArrayList<SnapshotDescription>();
226     // first create the snapshot root path and check to see if it exists
227     FileSystem fs = master.getMasterFileSystem().getFileSystem();
228     if (snapshotDir == null) snapshotDir = SnapshotDescriptionUtils.getSnapshotsDir(rootDir);
229 
230     // if there are no snapshots, return an empty list
231     if (!fs.exists(snapshotDir)) {
232       return snapshotDescs;
233     }
234 
235     // ignore all the snapshots in progress
236     FileStatus[] snapshots = fs.listStatus(snapshotDir,
237       new SnapshotDescriptionUtils.CompletedSnaphotDirectoriesFilter(fs));
238     // loop through all the completed snapshots
239     for (FileStatus snapshot : snapshots) {
240       Path info = new Path(snapshot.getPath(), SnapshotDescriptionUtils.SNAPSHOTINFO_FILE);
241       // if the snapshot is bad
242       if (!fs.exists(info)) {
243         LOG.error("Snapshot information for " + snapshot.getPath() + " doesn't exist");
244         continue;
245       }
246       FSDataInputStream in = null;
247       try {
248         in = fs.open(info);
249         SnapshotDescription desc = SnapshotDescription.parseFrom(in);
250         snapshotDescs.add(desc);
251       } catch (IOException e) {
252         LOG.warn("Found a corrupted snapshot " + snapshot.getPath(), e);
253       } finally {
254         if (in != null) {
255           in.close();
256         }
257       }
258     }
259     return snapshotDescs;
260   }
261 
262   /**
263    * Cleans up any snapshots in the snapshot/.tmp directory that were left from failed
264    * snapshot attempts.
265    *
266    * @throws IOException if we can't reach the filesystem
267    */
268   void resetTempDir() throws IOException {
269     // cleanup any existing snapshots.
270     Path tmpdir = SnapshotDescriptionUtils.getWorkingSnapshotDir(rootDir);
271     if (!master.getMasterFileSystem().getFileSystem().delete(tmpdir, true)) {
272       LOG.warn("Couldn't delete working snapshot directory: " + tmpdir);
273     }
274   }
275 
276   /**
277    * Delete the specified snapshot
278    * @param snapshot
279    * @throws SnapshotDoesNotExistException If the specified snapshot does not exist.
280    * @throws IOException For filesystem IOExceptions
281    */
282   public void deleteSnapshot(SnapshotDescription snapshot) throws SnapshotDoesNotExistException, IOException {
283 
284     // call coproc pre hook
285     MasterCoprocessorHost cpHost = master.getCoprocessorHost();
286     if (cpHost != null) {
287       cpHost.preDeleteSnapshot(snapshot);
288     }
289 
290     // check to see if it is completed
291     if (!isSnapshotCompleted(snapshot)) {
292       throw new SnapshotDoesNotExistException(snapshot);
293     }
294 
295     String snapshotName = snapshot.getName();
296     LOG.debug("Deleting snapshot: " + snapshotName);
297     // first create the snapshot description and check to see if it exists
298     MasterFileSystem fs = master.getMasterFileSystem();
299     Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshotName, rootDir);
300 
301     // delete the existing snapshot
302     if (!fs.getFileSystem().delete(snapshotDir, true)) {
303       throw new HBaseSnapshotException("Failed to delete snapshot directory: " + snapshotDir);
304     }
305 
306     // call coproc post hook
307     if (cpHost != null) {
308       cpHost.postDeleteSnapshot(snapshot);
309     }
310 
311   }
312 
313   /**
314    * Check if the specified snapshot is done
315    *
316    * @param expected
317    * @return true if snapshot is ready to be restored, false if it is still being taken.
318    * @throws IOException IOException if error from HDFS or RPC
319    * @throws UnknownSnapshotException if snapshot is invalid or does not exist.
320    */
321   public boolean isSnapshotDone(SnapshotDescription expected) throws IOException {
322     // check the request to make sure it has a snapshot
323     if (expected == null) {
324       throw new UnknownSnapshotException(
325          "No snapshot name passed in request, can't figure out which snapshot you want to check.");
326     }
327 
328     String ssString = ClientSnapshotDescriptionUtils.toString(expected);
329 
330     // check to see if the sentinel exists,
331     // and if the task is complete removes it from the in-progress snapshots map.
332     SnapshotSentinel handler = removeSentinelIfFinished(this.snapshotHandlers, expected);
333 
334     // stop tracking "abandoned" handlers
335     cleanupSentinels();
336 
337     if (handler == null) {
338       // If there's no handler in the in-progress map, it means one of the following:
339       //   - someone has already requested the snapshot state
340       //   - the requested snapshot was completed long time ago (cleanupSentinels() timeout)
341       //   - the snapshot was never requested
342       // In those cases returns to the user the "done state" if the snapshots exists on disk,
343       // otherwise raise an exception saying that the snapshot is not running and doesn't exist.
344       if (!isSnapshotCompleted(expected)) {
345         throw new UnknownSnapshotException("Snapshot " + ssString
346             + " is not currently running or one of the known completed snapshots.");
347       }
348       // was done, return true;
349       return true;
350     }
351 
352     // pass on any failure we find in the sentinel
353     try {
354       handler.rethrowExceptionIfFailed();
355     } catch (ForeignException e) {
356       // Give some procedure info on an exception.
357       String status;
358       Procedure p = coordinator.getProcedure(expected.getName());
359       if (p != null) {
360         status = p.getStatus();
361       } else {
362         status = expected.getName() + " not found in proclist " + coordinator.getProcedureNames();
363       }
364       throw new HBaseSnapshotException("Snapshot " + ssString +  " had an error.  " + status, e,
365           expected);
366     }
367 
368     // check to see if we are done
369     if (handler.isFinished()) {
370       LOG.debug("Snapshot '" + ssString + "' has completed, notifying client.");
371       return true;
372     } else if (LOG.isDebugEnabled()) {
373       LOG.debug("Snapshoting '" + ssString + "' is still in progress!");
374     }
375     return false;
376   }
377 
378   /**
379    * Check to see if the specified table has a snapshot in progress.  Currently we have a
380    * limitation only allowing a single snapshot per table at a time.
381    * @param tableName name of the table being snapshotted.
382    * @return <tt>true</tt> if there is a snapshot in progress on the specified table.
383    */
384   synchronized boolean isTakingSnapshot(final TableName tableName) {
385     SnapshotSentinel handler = this.snapshotHandlers.get(tableName);
386     return handler != null && !handler.isFinished();
387   }
388 
389   /**
390    * Check to make sure that we are OK to run the passed snapshot. Checks to make sure that we
391    * aren't already running a snapshot or restore on the requested table.
392    * @param snapshot description of the snapshot we want to start
393    * @throws HBaseSnapshotException if the filesystem could not be prepared to start the snapshot
394    */
395   private synchronized void prepareToTakeSnapshot(SnapshotDescription snapshot)
396       throws HBaseSnapshotException {
397     FileSystem fs = master.getMasterFileSystem().getFileSystem();
398     Path workingDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir);
399     TableName snapshotTable =
400         TableName.valueOf(snapshot.getTable());
401 
402     // make sure we aren't already running a snapshot
403     if (isTakingSnapshot(snapshotTable)) {
404       SnapshotSentinel handler = this.snapshotHandlers.get(snapshotTable);
405       throw new SnapshotCreationException("Rejected taking "
406           + ClientSnapshotDescriptionUtils.toString(snapshot)
407           + " because we are already running another snapshot "
408           + ClientSnapshotDescriptionUtils.toString(handler.getSnapshot()), snapshot);
409     }
410 
411     // make sure we aren't running a restore on the same table
412     if (isRestoringTable(snapshotTable)) {
413       SnapshotSentinel handler = restoreHandlers.get(snapshotTable);
414       throw new SnapshotCreationException("Rejected taking "
415           + ClientSnapshotDescriptionUtils.toString(snapshot)
416           + " because we are already have a restore in progress on the same snapshot "
417           + ClientSnapshotDescriptionUtils.toString(handler.getSnapshot()), snapshot);
418     }
419 
420     try {
421       // delete the working directory, since we aren't running the snapshot. Likely leftovers
422       // from a failed attempt.
423       fs.delete(workingDir, true);
424 
425       // recreate the working directory for the snapshot
426       if (!fs.mkdirs(workingDir)) {
427         throw new SnapshotCreationException("Couldn't create working directory (" + workingDir
428             + ") for snapshot" , snapshot);
429       }
430     } catch (HBaseSnapshotException e) {
431       throw e;
432     } catch (IOException e) {
433       throw new SnapshotCreationException(
434           "Exception while checking to see if snapshot could be started.", e, snapshot);
435     }
436   }
437 
438   /**
439    * Take a snapshot of a disabled table.
440    * @param snapshot description of the snapshot to take. Modified to be {@link Type#DISABLED}.
441    * @throws HBaseSnapshotException if the snapshot could not be started
442    */
443   private synchronized void snapshotDisabledTable(SnapshotDescription snapshot)
444       throws HBaseSnapshotException {
445     // setup the snapshot
446     prepareToTakeSnapshot(snapshot);
447 
448     // set the snapshot to be a disabled snapshot, since the client doesn't know about that
449     snapshot = snapshot.toBuilder().setType(Type.DISABLED).build();
450 
451     // Take the snapshot of the disabled table
452     DisabledTableSnapshotHandler handler =
453         new DisabledTableSnapshotHandler(snapshot, master, metricsMaster);
454     snapshotTable(snapshot, handler);
455   }
456 
457   /**
458    * Take a snapshot of an enabled table.
459    * @param snapshot description of the snapshot to take.
460    * @throws HBaseSnapshotException if the snapshot could not be started
461    */
462   private synchronized void snapshotEnabledTable(SnapshotDescription snapshot)
463       throws HBaseSnapshotException {
464     // setup the snapshot
465     prepareToTakeSnapshot(snapshot);
466 
467     // Take the snapshot of the enabled table
468     EnabledTableSnapshotHandler handler =
469         new EnabledTableSnapshotHandler(snapshot, master, this, metricsMaster);
470     snapshotTable(snapshot, handler);
471   }
472 
473   /**
474    * Take a snapshot using the specified handler.
475    * On failure the snapshot temporary working directory is removed.
476    * NOTE: prepareToTakeSnapshot() called before this one takes care of the rejecting the
477    *       snapshot request if the table is busy with another snapshot/restore operation.
478    * @param snapshot the snapshot description
479    * @param handler the snapshot handler
480    */
481   private synchronized void snapshotTable(SnapshotDescription snapshot,
482       final TakeSnapshotHandler handler) throws HBaseSnapshotException {
483     try {
484       handler.prepare();
485       this.executorService.submit(handler);
486       this.snapshotHandlers.put(TableName.valueOf(snapshot.getTable()), handler);
487     } catch (Exception e) {
488       // cleanup the working directory by trying to delete it from the fs.
489       Path workingDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir);
490       try {
491         if (!this.master.getMasterFileSystem().getFileSystem().delete(workingDir, true)) {
492           LOG.error("Couldn't delete working directory (" + workingDir + " for snapshot:" +
493               ClientSnapshotDescriptionUtils.toString(snapshot));
494         }
495       } catch (IOException e1) {
496         LOG.error("Couldn't delete working directory (" + workingDir + " for snapshot:" +
497             ClientSnapshotDescriptionUtils.toString(snapshot));
498       }
499       // fail the snapshot
500       throw new SnapshotCreationException("Could not build snapshot handler", e, snapshot);
501     }
502   }
503 
504   /**
505    * Take a snapshot based on the enabled/disabled state of the table.
506    *
507    * @param snapshot
508    * @throws HBaseSnapshotException when a snapshot specific exception occurs.
509    * @throws IOException when some sort of generic IO exception occurs.
510    */
511   public void takeSnapshot(SnapshotDescription snapshot) throws IOException {
512     // check to see if we already completed the snapshot
513     if (isSnapshotCompleted(snapshot)) {
514       throw new SnapshotExistsException("Snapshot '" + snapshot.getName()
515           + "' already stored on the filesystem.", snapshot);
516     }
517 
518     LOG.debug("No existing snapshot, attempting snapshot...");
519 
520     // stop tracking "abandoned" handlers
521     cleanupSentinels();
522 
523     // check to see if the table exists
524     HTableDescriptor desc = null;
525     try {
526       desc = master.getTableDescriptors().get(
527           TableName.valueOf(snapshot.getTable()));
528     } catch (FileNotFoundException e) {
529       String msg = "Table:" + snapshot.getTable() + " info doesn't exist!";
530       LOG.error(msg);
531       throw new SnapshotCreationException(msg, e, snapshot);
532     } catch (IOException e) {
533       throw new SnapshotCreationException("Error while geting table description for table "
534           + snapshot.getTable(), e, snapshot);
535     }
536     if (desc == null) {
537       throw new SnapshotCreationException("Table '" + snapshot.getTable()
538           + "' doesn't exist, can't take snapshot.", snapshot);
539     }
540 
541     // set the snapshot version, now that we are ready to take it
542     snapshot = snapshot.toBuilder().setVersion(SnapshotDescriptionUtils.SNAPSHOT_LAYOUT_VERSION)
543         .build();
544 
545     // call pre coproc hook
546     MasterCoprocessorHost cpHost = master.getCoprocessorHost();
547     if (cpHost != null) {
548       cpHost.preSnapshot(snapshot, desc);
549     }
550 
551     // if the table is enabled, then have the RS run actually the snapshot work
552     TableName snapshotTable = TableName.valueOf(snapshot.getTable());
553     AssignmentManager assignmentMgr = master.getAssignmentManager();
554     if (assignmentMgr.getZKTable().isEnabledTable(snapshotTable)) {
555       LOG.debug("Table enabled, starting distributed snapshot.");
556       snapshotEnabledTable(snapshot);
557       LOG.debug("Started snapshot: " + ClientSnapshotDescriptionUtils.toString(snapshot));
558     }
559     // For disabled table, snapshot is created by the master
560     else if (assignmentMgr.getZKTable().isDisabledTable(snapshotTable)) {
561       LOG.debug("Table is disabled, running snapshot entirely on master.");
562       snapshotDisabledTable(snapshot);
563       LOG.debug("Started snapshot: " + ClientSnapshotDescriptionUtils.toString(snapshot));
564     } else {
565       LOG.error("Can't snapshot table '" + snapshot.getTable()
566           + "', isn't open or closed, we don't know what to do!");
567       TablePartiallyOpenException tpoe = new TablePartiallyOpenException(snapshot.getTable()
568           + " isn't fully open.");
569       throw new SnapshotCreationException("Table is not entirely open or closed", tpoe, snapshot);
570     }
571 
572     // call post coproc hook
573     if (cpHost != null) {
574       cpHost.postSnapshot(snapshot, desc);
575     }
576   }
577 
578   /**
579    * Set the handler for the current snapshot
580    * <p>
581    * Exposed for TESTING
582    * @param tableName
583    * @param handler handler the master should use
584    *
585    * TODO get rid of this if possible, repackaging, modify tests.
586    */
587   public synchronized void setSnapshotHandlerForTesting(
588       final TableName tableName,
589       final SnapshotSentinel handler) {
590     if (handler != null) {
591       this.snapshotHandlers.put(tableName, handler);
592     } else {
593       this.snapshotHandlers.remove(tableName);
594     }
595   }
596 
597   /**
598    * @return distributed commit coordinator for all running snapshots
599    */
600   ProcedureCoordinator getCoordinator() {
601     return coordinator;
602   }
603 
604   /**
605    * Check to see if the snapshot is one of the currently completed snapshots
606    * Returns true if the snapshot exists in the "completed snapshots folder".
607    *
608    * @param snapshot expected snapshot to check
609    * @return <tt>true</tt> if the snapshot is stored on the {@link FileSystem}, <tt>false</tt> if is
610    *         not stored
611    * @throws IOException if the filesystem throws an unexpected exception,
612    * @throws IllegalArgumentException if snapshot name is invalid.
613    */
614   private boolean isSnapshotCompleted(SnapshotDescription snapshot) throws IOException {
615     try {
616       final Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshot, rootDir);
617       FileSystem fs = master.getMasterFileSystem().getFileSystem();
618       // check to see if the snapshot already exists
619       return fs.exists(snapshotDir);
620     } catch (IllegalArgumentException iae) {
621       throw new UnknownSnapshotException("Unexpected exception thrown", iae);
622     }
623   }
624 
625   /**
626    * Clone the specified snapshot into a new table.
627    * The operation will fail if the destination table has a snapshot or restore in progress.
628    *
629    * @param snapshot Snapshot Descriptor
630    * @param hTableDescriptor Table Descriptor of the table to create
631    */
632   synchronized void cloneSnapshot(final SnapshotDescription snapshot,
633       final HTableDescriptor hTableDescriptor) throws HBaseSnapshotException {
634     TableName tableName = hTableDescriptor.getTableName();
635 
636     // make sure we aren't running a snapshot on the same table
637     if (isTakingSnapshot(tableName)) {
638       throw new RestoreSnapshotException("Snapshot in progress on the restore table=" + tableName);
639     }
640 
641     // make sure we aren't running a restore on the same table
642     if (isRestoringTable(tableName)) {
643       throw new RestoreSnapshotException("Restore already in progress on the table=" + tableName);
644     }
645 
646     try {
647       CloneSnapshotHandler handler =
648         new CloneSnapshotHandler(master, snapshot, hTableDescriptor, metricsMaster).prepare();
649       this.executorService.submit(handler);
650       this.restoreHandlers.put(tableName, handler);
651     } catch (Exception e) {
652       String msg = "Couldn't clone the snapshot=" + ClientSnapshotDescriptionUtils.toString(snapshot) +
653         " on table=" + tableName;
654       LOG.error(msg, e);
655       throw new RestoreSnapshotException(msg, e);
656     }
657   }
658 
659   /**
660    * Restore the specified snapshot
661    * @param reqSnapshot
662    * @throws IOException
663    */
664   public void restoreSnapshot(SnapshotDescription reqSnapshot) throws IOException {
665     FileSystem fs = master.getMasterFileSystem().getFileSystem();
666     Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(reqSnapshot, rootDir);
667     MasterCoprocessorHost cpHost = master.getCoprocessorHost();
668 
669     // check if the snapshot exists
670     if (!fs.exists(snapshotDir)) {
671       LOG.error("A Snapshot named '" + reqSnapshot.getName() + "' does not exist.");
672       throw new SnapshotDoesNotExistException(reqSnapshot);
673     }
674 
675     // read snapshot information
676     SnapshotDescription fsSnapshot = SnapshotDescriptionUtils.readSnapshotInfo(fs, snapshotDir);
677     HTableDescriptor snapshotTableDesc =
678         FSTableDescriptors.getTableDescriptorFromFs(fs, snapshotDir);
679     TableName tableName = TableName.valueOf(reqSnapshot.getTable());
680 
681     // stop tracking "abandoned" handlers
682     cleanupSentinels();
683 
684     // Execute the restore/clone operation
685     if (MetaReader.tableExists(master.getCatalogTracker(), tableName)) {
686       if (master.getAssignmentManager().getZKTable().isEnabledTable(
687           TableName.valueOf(fsSnapshot.getTable()))) {
688         throw new UnsupportedOperationException("Table '" +
689             TableName.valueOf(fsSnapshot.getTable()) + "' must be disabled in order to " +
690             "perform a restore operation" +
691             ".");
692       }
693 
694       // call coproc pre hook
695       if (cpHost != null) {
696         cpHost.preRestoreSnapshot(reqSnapshot, snapshotTableDesc);
697       }
698       restoreSnapshot(fsSnapshot, snapshotTableDesc);
699       LOG.info("Restore snapshot=" + fsSnapshot.getName() + " as table=" + tableName);
700 
701       if (cpHost != null) {
702         cpHost.postRestoreSnapshot(reqSnapshot, snapshotTableDesc);
703       }
704     } else {
705       HTableDescriptor htd = RestoreSnapshotHelper.cloneTableSchema(snapshotTableDesc, tableName);
706       if (cpHost != null) {
707         cpHost.preCloneSnapshot(reqSnapshot, htd);
708       }
709       cloneSnapshot(fsSnapshot, htd);
710       LOG.info("Clone snapshot=" + fsSnapshot.getName() + " as table=" + tableName);
711 
712       if (cpHost != null) {
713         cpHost.postCloneSnapshot(reqSnapshot, htd);
714       }
715     }
716   }
717 
718   /**
719    * Restore the specified snapshot.
720    * The restore will fail if the destination table has a snapshot or restore in progress.
721    *
722    * @param snapshot Snapshot Descriptor
723    * @param hTableDescriptor Table Descriptor
724    */
725   private synchronized void restoreSnapshot(final SnapshotDescription snapshot,
726       final HTableDescriptor hTableDescriptor) throws HBaseSnapshotException {
727     TableName tableName = hTableDescriptor.getTableName();
728 
729     // make sure we aren't running a snapshot on the same table
730     if (isTakingSnapshot(tableName)) {
731       throw new RestoreSnapshotException("Snapshot in progress on the restore table=" + tableName);
732     }
733 
734     // make sure we aren't running a restore on the same table
735     if (isRestoringTable(tableName)) {
736       throw new RestoreSnapshotException("Restore already in progress on the table=" + tableName);
737     }
738 
739     try {
740       RestoreSnapshotHandler handler =
741         new RestoreSnapshotHandler(master, snapshot, hTableDescriptor, metricsMaster).prepare();
742       this.executorService.submit(handler);
743       restoreHandlers.put(tableName, handler);
744     } catch (Exception e) {
745       String msg = "Couldn't restore the snapshot=" + ClientSnapshotDescriptionUtils.toString(
746           snapshot)  +
747           " on table=" + tableName;
748       LOG.error(msg, e);
749       throw new RestoreSnapshotException(msg, e);
750     }
751   }
752 
753   /**
754    * Verify if the restore of the specified table is in progress.
755    *
756    * @param tableName table under restore
757    * @return <tt>true</tt> if there is a restore in progress of the specified table.
758    */
759   private synchronized boolean isRestoringTable(final TableName tableName) {
760     SnapshotSentinel sentinel = this.restoreHandlers.get(tableName);
761     return(sentinel != null && !sentinel.isFinished());
762   }
763 
764   /**
765    * Returns the status of a restore operation.
766    * If the in-progress restore is failed throws the exception that caused the failure.
767    *
768    * @param snapshot
769    * @return false if in progress, true if restore is completed or not requested.
770    * @throws IOException if there was a failure during the restore
771    */
772   public boolean isRestoreDone(final SnapshotDescription snapshot) throws IOException {
773     // check to see if the sentinel exists,
774     // and if the task is complete removes it from the in-progress restore map.
775     SnapshotSentinel sentinel = removeSentinelIfFinished(this.restoreHandlers, snapshot);
776 
777     // stop tracking "abandoned" handlers
778     cleanupSentinels();
779 
780     if (sentinel == null) {
781       // there is no sentinel so restore is not in progress.
782       return true;
783     }
784 
785     LOG.debug("Verify snapshot=" + snapshot.getName() + " against="
786         + sentinel.getSnapshot().getName() + " table=" +
787         TableName.valueOf(snapshot.getTable()));
788 
789     // If the restore is failed, rethrow the exception
790     sentinel.rethrowExceptionIfFailed();
791 
792     // check to see if we are done
793     if (sentinel.isFinished()) {
794       LOG.debug("Restore snapshot=" + ClientSnapshotDescriptionUtils.toString(snapshot) +
795           " has completed. Notifying the client.");
796       return true;
797     }
798 
799     if (LOG.isDebugEnabled()) {
800       LOG.debug("Sentinel is not yet finished with restoring snapshot=" +
801           ClientSnapshotDescriptionUtils.toString(snapshot));
802     }
803     return false;
804   }
805 
806   /**
807    * Return the handler if it is currently live and has the same snapshot target name.
808    * The handler is removed from the sentinels map if completed.
809    * @param sentinels live handlers
810    * @param snapshot snapshot description
811    * @return null if doesn't match, else a live handler.
812    */
813   private synchronized SnapshotSentinel removeSentinelIfFinished(
814       final Map<TableName, SnapshotSentinel> sentinels,
815       final SnapshotDescription snapshot) {
816     if (!snapshot.hasTable()) {
817       return null;
818     }
819 
820     TableName snapshotTable = TableName.valueOf(snapshot.getTable());
821     SnapshotSentinel h = sentinels.get(snapshotTable);
822     if (h == null) {
823       return null;
824     }
825 
826     if (!h.getSnapshot().getName().equals(snapshot.getName())) {
827       // specified snapshot is to the one currently running
828       return null;
829     }
830 
831     // Remove from the "in-progress" list once completed
832     if (h.isFinished()) {
833       sentinels.remove(snapshotTable);
834     }
835 
836     return h;
837   }
838 
839   /**
840    * Removes "abandoned" snapshot/restore requests.
841    * As part of the HBaseAdmin snapshot/restore API the operation status is checked until completed,
842    * and the in-progress maps are cleaned up when the status of a completed task is requested.
843    * To avoid having sentinels staying around for long time if something client side is failed,
844    * each operation tries to clean up the in-progress maps sentinels finished from a long time.
845    */
846   private void cleanupSentinels() {
847     cleanupSentinels(this.snapshotHandlers);
848     cleanupSentinels(this.restoreHandlers);
849   }
850 
851   /**
852    * Remove the sentinels that are marked as finished and the completion time
853    * has exceeded the removal timeout.
854    * @param sentinels map of sentinels to clean
855    */
856   private synchronized void cleanupSentinels(final Map<TableName, SnapshotSentinel> sentinels) {
857     long currentTime = EnvironmentEdgeManager.currentTimeMillis();
858     Iterator<Map.Entry<TableName, SnapshotSentinel>> it =
859         sentinels.entrySet().iterator();
860     while (it.hasNext()) {
861       Map.Entry<TableName, SnapshotSentinel> entry = it.next();
862       SnapshotSentinel sentinel = entry.getValue();
863       if (sentinel.isFinished() &&
864           (currentTime - sentinel.getCompletionTimestamp()) > SNAPSHOT_SENTINELS_CLEANUP_TIMEOUT)
865       {
866         it.remove();
867       }
868     }
869   }
870 
871   //
872   // Implementing Stoppable interface
873   //
874 
875   @Override
876   public void stop(String why) {
877     // short circuit
878     if (this.stopped) return;
879     // make sure we get stop
880     this.stopped = true;
881     // pass the stop onto take snapshot handlers
882     for (SnapshotSentinel snapshotHandler: this.snapshotHandlers.values()) {
883       snapshotHandler.cancel(why);
884     }
885 
886     // pass the stop onto all the restore handlers
887     for (SnapshotSentinel restoreHandler: this.restoreHandlers.values()) {
888       restoreHandler.cancel(why);
889     }
890     try {
891       coordinator.close();
892     } catch (IOException e) {
893       LOG.error("stop ProcedureCoordinator error", e);
894     }
895   }
896 
897   @Override
898   public boolean isStopped() {
899     return this.stopped;
900   }
901 
902   /**
903    * Throws an exception if snapshot operations (take a snapshot, restore, clone) are not supported.
904    * Called at the beginning of snapshot() and restoreSnapshot() methods.
905    * @throws UnsupportedOperationException if snapshot are not supported
906    */
907   public void checkSnapshotSupport() throws UnsupportedOperationException {
908     if (!this.isSnapshotSupported) {
909       throw new UnsupportedOperationException(
910         "To use snapshots, You must add to the hbase-site.xml of the HBase Master: '" +
911           HBASE_SNAPSHOT_ENABLED + "' property with value 'true'.");
912     }
913   }
914 
915   /**
916    * Called at startup, to verify if snapshot operation is supported, and to avoid
917    * starting the master if there're snapshots present but the cleaners needed are missing.
918    * Otherwise we can end up with snapshot data loss.
919    * @param conf The {@link Configuration} object to use
920    * @param mfs The MasterFileSystem to use
921    * @throws IOException in case of file-system operation failure
922    * @throws UnsupportedOperationException in case cleaners are missing and
923    *         there're snapshot in the system
924    */
925   private void checkSnapshotSupport(final Configuration conf, final MasterFileSystem mfs)
926       throws IOException, UnsupportedOperationException {
927     // Verify if snapshot is disabled by the user
928     String enabled = conf.get(HBASE_SNAPSHOT_ENABLED);
929     boolean snapshotEnabled = conf.getBoolean(HBASE_SNAPSHOT_ENABLED, false);
930     boolean userDisabled = (enabled != null && enabled.trim().length() > 0 && !snapshotEnabled);
931 
932     // Extract cleaners from conf
933     Set<String> hfileCleaners = new HashSet<String>();
934     String[] cleaners = conf.getStrings(HFileCleaner.MASTER_HFILE_CLEANER_PLUGINS);
935     if (cleaners != null) Collections.addAll(hfileCleaners, cleaners);
936 
937     Set<String> logCleaners = new HashSet<String>();
938     cleaners = conf.getStrings(HConstants.HBASE_MASTER_LOGCLEANER_PLUGINS);
939     if (cleaners != null) Collections.addAll(logCleaners, cleaners);
940 
941     // check if an older version of snapshot directory was present
942     Path oldSnapshotDir = new Path(mfs.getRootDir(), HConstants.OLD_SNAPSHOT_DIR_NAME);
943     FileSystem fs = mfs.getFileSystem();
944     List<SnapshotDescription> ss = getCompletedSnapshots(new Path(rootDir, oldSnapshotDir));
945     if (ss != null && !ss.isEmpty()) {
946       LOG.error("Snapshots from an earlier release were found under: " + oldSnapshotDir);
947       LOG.error("Please rename the directory as " + HConstants.SNAPSHOT_DIR_NAME);
948     }
949 
950     // If the user has enabled the snapshot, we force the cleaners to be present
951     // otherwise we still need to check if cleaners are enabled or not and verify
952     // that there're no snapshot in the .snapshot folder.
953     if (snapshotEnabled) {
954       // Inject snapshot cleaners, if snapshot.enable is true
955       hfileCleaners.add(SnapshotHFileCleaner.class.getName());
956       hfileCleaners.add(HFileLinkCleaner.class.getName());
957       logCleaners.add(SnapshotLogCleaner.class.getName());
958 
959       // Set cleaners conf
960       conf.setStrings(HFileCleaner.MASTER_HFILE_CLEANER_PLUGINS,
961         hfileCleaners.toArray(new String[hfileCleaners.size()]));
962       conf.setStrings(HConstants.HBASE_MASTER_LOGCLEANER_PLUGINS,
963         logCleaners.toArray(new String[logCleaners.size()]));
964     } else {
965       // Verify if cleaners are present
966       snapshotEnabled = logCleaners.contains(SnapshotLogCleaner.class.getName()) &&
967         hfileCleaners.contains(SnapshotHFileCleaner.class.getName()) &&
968         hfileCleaners.contains(HFileLinkCleaner.class.getName());
969 
970       // Warn if the cleaners are enabled but the snapshot.enabled property is false/not set.
971       if (snapshotEnabled) {
972         LOG.warn("Snapshot log and hfile cleaners are present in the configuration, " +
973           "but the '" + HBASE_SNAPSHOT_ENABLED + "' property " +
974           (userDisabled ? "is set to 'false'." : "is not set."));
975       }
976     }
977 
978     // Mark snapshot feature as enabled if cleaners are present and user has not disabled it.
979     this.isSnapshotSupported = snapshotEnabled && !userDisabled;
980 
981     // If cleaners are not enabled, verify that there're no snapshot in the .snapshot folder
982     // otherwise we end up with snapshot data loss.
983     if (!snapshotEnabled) {
984       LOG.info("Snapshot feature is not enabled, missing log and hfile cleaners.");
985       Path snapshotDir = SnapshotDescriptionUtils.getSnapshotsDir(mfs.getRootDir());
986       if (fs.exists(snapshotDir)) {
987         FileStatus[] snapshots = FSUtils.listStatus(fs, snapshotDir,
988           new SnapshotDescriptionUtils.CompletedSnaphotDirectoriesFilter(fs));
989         if (snapshots != null) {
990           LOG.error("Snapshots are present, but cleaners are not enabled.");
991           checkSnapshotSupport();
992         }
993       }
994     }
995   }
996 }