View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master.snapshot;
19  
20  import java.io.FileNotFoundException;
21  import java.io.IOException;
22  import java.util.ArrayList;
23  import java.util.Collections;
24  import java.util.HashMap;
25  import java.util.HashSet;
26  import java.util.Iterator;
27  import java.util.List;
28  import java.util.Map;
29  import java.util.Set;
30  import java.util.concurrent.ThreadPoolExecutor;
31  
32  import org.apache.commons.logging.Log;
33  import org.apache.commons.logging.LogFactory;
34  import org.apache.hadoop.classification.InterfaceAudience;
35  import org.apache.hadoop.classification.InterfaceStability;
36  import org.apache.hadoop.conf.Configuration;
37  import org.apache.hadoop.fs.FSDataInputStream;
38  import org.apache.hadoop.fs.FileStatus;
39  import org.apache.hadoop.fs.FileSystem;
40  import org.apache.hadoop.fs.Path;
41  import org.apache.hadoop.hbase.TableName;
42  import org.apache.hadoop.hbase.HConstants;
43  import org.apache.hadoop.hbase.HTableDescriptor;
44  import org.apache.hadoop.hbase.Stoppable;
45  import org.apache.hadoop.hbase.catalog.MetaReader;
46  import org.apache.hadoop.hbase.errorhandling.ForeignException;
47  import org.apache.hadoop.hbase.executor.ExecutorService;
48  import org.apache.hadoop.hbase.master.AssignmentManager;
49  import org.apache.hadoop.hbase.master.MasterCoprocessorHost;
50  import org.apache.hadoop.hbase.master.MasterFileSystem;
51  import org.apache.hadoop.hbase.master.MasterServices;
52  import org.apache.hadoop.hbase.master.MetricsMaster;
53  import org.apache.hadoop.hbase.master.SnapshotSentinel;
54  import org.apache.hadoop.hbase.master.cleaner.HFileCleaner;
55  import org.apache.hadoop.hbase.master.cleaner.HFileLinkCleaner;
56  import org.apache.hadoop.hbase.procedure.Procedure;
57  import org.apache.hadoop.hbase.procedure.ProcedureCoordinator;
58  import org.apache.hadoop.hbase.procedure.ProcedureCoordinatorRpcs;
59  import org.apache.hadoop.hbase.procedure.ZKProcedureCoordinatorRpcs;
60  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription;
61  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription.Type;
62  import org.apache.hadoop.hbase.snapshot.ClientSnapshotDescriptionUtils;
63  import org.apache.hadoop.hbase.snapshot.HBaseSnapshotException;
64  import org.apache.hadoop.hbase.snapshot.RestoreSnapshotException;
65  import org.apache.hadoop.hbase.snapshot.RestoreSnapshotHelper;
66  import org.apache.hadoop.hbase.snapshot.SnapshotCreationException;
67  import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
68  import org.apache.hadoop.hbase.snapshot.SnapshotDoesNotExistException;
69  import org.apache.hadoop.hbase.snapshot.SnapshotExistsException;
70  import org.apache.hadoop.hbase.snapshot.TablePartiallyOpenException;
71  import org.apache.hadoop.hbase.snapshot.UnknownSnapshotException;
72  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
73  import org.apache.hadoop.hbase.util.FSTableDescriptors;
74  import org.apache.hadoop.hbase.util.FSUtils;
75  import org.apache.zookeeper.KeeperException;
76  
77  /**
78   * This class manages the procedure of taking and restoring snapshots. There is only one
79   * SnapshotManager for the master.
80   * <p>
81   * The class provides methods for monitoring in-progress snapshot actions.
82   * <p>
83   * Note: Currently there can only be one snapshot being taken at a time over the cluster. This is a
84   * simplification in the current implementation.
85   */
86  @InterfaceAudience.Private
87  @InterfaceStability.Unstable
88  public class SnapshotManager implements Stoppable {
89    private static final Log LOG = LogFactory.getLog(SnapshotManager.class);
90  
91    /** By default, check to see if the snapshot is complete every WAKE MILLIS (ms) */
92    private static final int SNAPSHOT_WAKE_MILLIS_DEFAULT = 500;
93  
94    /**
95     * Wait time before removing a finished sentinel from the in-progress map
96     *
97     * NOTE: This is used as a safety auto cleanup.
98     * The snapshot and restore handlers map entries are removed when a user asks if a snapshot or
99     * restore is completed. This operation is part of the HBaseAdmin snapshot/restore API flow.
100    * In case something fails on the client side and the snapshot/restore state is not reclaimed
101    * after a default timeout, the entry is removed from the in-progress map.
102    * At this point, if the user asks for the snapshot/restore status, the result will be
103    * snapshot done if it exists or failed if it doesn't exist.
104    */
105   private static final int SNAPSHOT_SENTINELS_CLEANUP_TIMEOUT = 60 * 1000;
106 
107   /** Enable or disable snapshot support */
108   public static final String HBASE_SNAPSHOT_ENABLED = "hbase.snapshot.enabled";
109 
110   /**
111    * Conf key for # of ms elapsed between checks for snapshot errors while waiting for
112    * completion.
113    */
114   private static final String SNAPSHOT_WAKE_MILLIS_KEY = "hbase.snapshot.master.wakeMillis";
115 
116   /** By default, inject a snapshot timeout error after waiting this long for completion (ms) */
117   private static final int SNAPSHOT_TIMEOUT_MILLIS_DEFAULT = 60000;
118 
119   /**
120    * Conf key for # of ms elapsed before injecting a snapshot timeout error when waiting for
121    * completion.
122    */
123   private static final String SNAPSHOT_TIMEOUT_MILLIS_KEY = "hbase.snapshot.master.timeoutMillis";
124 
125   /** Name of the operation to use in the controller */
126   public static final String ONLINE_SNAPSHOT_CONTROLLER_DESCRIPTION = "online-snapshot";
127 
128   /** Conf key for # of threads used by the SnapshotManager thread pool */
129   private static final String SNAPSHOT_POOL_THREADS_KEY = "hbase.snapshot.master.threads";
130 
131   /** Default number of threads used by the SnapshotManager thread pool */
132   private static final int SNAPSHOT_POOL_THREADS_DEFAULT = 1;
133 
134   private boolean stopped;
135   private final MasterServices master;  // Needed by TableEventHandlers
136   private final MetricsMaster metricsMaster;
137   private final ProcedureCoordinator coordinator;
138 
139   // Is snapshot feature enabled?
140   private boolean isSnapshotSupported = false;
141 
142   // Snapshot handlers map, with table name as key.
143   // The map is always accessed and modified under the object lock using synchronized.
144   // snapshotTable() will insert an Handler in the table.
145   // isSnapshotDone() will remove the handler requested if the operation is finished.
146   private Map<TableName, SnapshotSentinel> snapshotHandlers =
147       new HashMap<TableName, SnapshotSentinel>();
148 
149   // Restore Sentinels map, with table name as key.
150   // The map is always accessed and modified under the object lock using synchronized.
151   // restoreSnapshot()/cloneSnapshot() will insert an Handler in the table.
152   // isRestoreDone() will remove the handler requested if the operation is finished.
153   private Map<TableName, SnapshotSentinel> restoreHandlers =
154       new HashMap<TableName, SnapshotSentinel>();
155 
156   private final Path rootDir;
157   private final ExecutorService executorService;
158 
159   /**
160    * Construct a snapshot manager.
161    * @param master
162    */
163   public SnapshotManager(final MasterServices master, final MetricsMaster metricsMaster)
164       throws KeeperException, IOException, UnsupportedOperationException {
165     this.master = master;
166     this.metricsMaster = metricsMaster;
167 
168     this.rootDir = master.getMasterFileSystem().getRootDir();
169     checkSnapshotSupport(master.getConfiguration(), master.getMasterFileSystem());
170 
171     // get the configuration for the coordinator
172     Configuration conf = master.getConfiguration();
173     long wakeFrequency = conf.getInt(SNAPSHOT_WAKE_MILLIS_KEY, SNAPSHOT_WAKE_MILLIS_DEFAULT);
174     long timeoutMillis = conf.getLong(SNAPSHOT_TIMEOUT_MILLIS_KEY, SNAPSHOT_TIMEOUT_MILLIS_DEFAULT);
175     int opThreads = conf.getInt(SNAPSHOT_POOL_THREADS_KEY, SNAPSHOT_POOL_THREADS_DEFAULT);
176 
177     // setup the default procedure coordinator
178     String name = master.getServerName().toString();
179     ThreadPoolExecutor tpool = ProcedureCoordinator.defaultPool(name, opThreads);
180     ProcedureCoordinatorRpcs comms = new ZKProcedureCoordinatorRpcs(
181         master.getZooKeeper(), SnapshotManager.ONLINE_SNAPSHOT_CONTROLLER_DESCRIPTION, name);
182 
183     this.coordinator = new ProcedureCoordinator(comms, tpool, timeoutMillis, wakeFrequency);
184     this.executorService = master.getExecutorService();
185     resetTempDir();
186   }
187 
188   /**
189    * Fully specify all necessary components of a snapshot manager. Exposed for testing.
190    * @param master services for the master where the manager is running
191    * @param coordinator procedure coordinator instance.  exposed for testing.
192    * @param pool HBase ExecutorServcie instance, exposed for testing.
193    */
194   public SnapshotManager(final MasterServices master, final MetricsMaster metricsMaster,
195       ProcedureCoordinator coordinator, ExecutorService pool)
196       throws IOException, UnsupportedOperationException {
197     this.master = master;
198     this.metricsMaster = metricsMaster;
199 
200     this.rootDir = master.getMasterFileSystem().getRootDir();
201     checkSnapshotSupport(master.getConfiguration(), master.getMasterFileSystem());
202 
203     this.coordinator = coordinator;
204     this.executorService = pool;
205     resetTempDir();
206   }
207 
208   /**
209    * Gets the list of all completed snapshots.
210    * @return list of SnapshotDescriptions
211    * @throws IOException File system exception
212    */
213   public List<SnapshotDescription> getCompletedSnapshots() throws IOException {
214     return getCompletedSnapshots(SnapshotDescriptionUtils.getSnapshotsDir(rootDir));
215   }
216 
217   /**
218    * Gets the list of all completed snapshots.
219    * @param snapshotDir snapshot directory
220    * @return list of SnapshotDescriptions
221    * @throws IOException File system exception
222    */
223   private List<SnapshotDescription> getCompletedSnapshots(Path snapshotDir) throws IOException {
224     List<SnapshotDescription> snapshotDescs = new ArrayList<SnapshotDescription>();
225     // first create the snapshot root path and check to see if it exists
226     FileSystem fs = master.getMasterFileSystem().getFileSystem();
227     if (snapshotDir == null) snapshotDir = SnapshotDescriptionUtils.getSnapshotsDir(rootDir);
228 
229     // if there are no snapshots, return an empty list
230     if (!fs.exists(snapshotDir)) {
231       return snapshotDescs;
232     }
233 
234     // ignore all the snapshots in progress
235     FileStatus[] snapshots = fs.listStatus(snapshotDir,
236       new SnapshotDescriptionUtils.CompletedSnaphotDirectoriesFilter(fs));
237     // loop through all the completed snapshots
238     for (FileStatus snapshot : snapshots) {
239       Path info = new Path(snapshot.getPath(), SnapshotDescriptionUtils.SNAPSHOTINFO_FILE);
240       // if the snapshot is bad
241       if (!fs.exists(info)) {
242         LOG.error("Snapshot information for " + snapshot.getPath() + " doesn't exist");
243         continue;
244       }
245       FSDataInputStream in = null;
246       try {
247         in = fs.open(info);
248         SnapshotDescription desc = SnapshotDescription.parseFrom(in);
249         snapshotDescs.add(desc);
250       } catch (IOException e) {
251         LOG.warn("Found a corrupted snapshot " + snapshot.getPath(), e);
252       } finally {
253         if (in != null) {
254           in.close();
255         }
256       }
257     }
258     return snapshotDescs;
259   }
260 
261   /**
262    * Cleans up any snapshots in the snapshot/.tmp directory that were left from failed
263    * snapshot attempts.
264    *
265    * @throws IOException if we can't reach the filesystem
266    */
267   void resetTempDir() throws IOException {
268     // cleanup any existing snapshots.
269     Path tmpdir = SnapshotDescriptionUtils.getWorkingSnapshotDir(rootDir);
270     if (master.getMasterFileSystem().getFileSystem().exists(tmpdir)) {
271       if (!master.getMasterFileSystem().getFileSystem().delete(tmpdir, true)) {
272         LOG.warn("Couldn't delete working snapshot directory: " + tmpdir);
273       }
274     }
275   }
276 
277   /**
278    * Delete the specified snapshot
279    * @param snapshot
280    * @throws SnapshotDoesNotExistException If the specified snapshot does not exist.
281    * @throws IOException For filesystem IOExceptions
282    */
283   public void deleteSnapshot(SnapshotDescription snapshot) throws SnapshotDoesNotExistException, IOException {
284 
285     // call coproc pre hook
286     MasterCoprocessorHost cpHost = master.getCoprocessorHost();
287     if (cpHost != null) {
288       cpHost.preDeleteSnapshot(snapshot);
289     }
290 
291     // check to see if it is completed
292     if (!isSnapshotCompleted(snapshot)) {
293       throw new SnapshotDoesNotExistException(snapshot);
294     }
295 
296     String snapshotName = snapshot.getName();
297     LOG.debug("Deleting snapshot: " + snapshotName);
298     // first create the snapshot description and check to see if it exists
299     MasterFileSystem fs = master.getMasterFileSystem();
300     Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshotName, rootDir);
301 
302     // delete the existing snapshot
303     if (!fs.getFileSystem().delete(snapshotDir, true)) {
304       throw new HBaseSnapshotException("Failed to delete snapshot directory: " + snapshotDir);
305     }
306 
307     // call coproc post hook
308     if (cpHost != null) {
309       cpHost.postDeleteSnapshot(snapshot);
310     }
311 
312   }
313 
314   /**
315    * Check if the specified snapshot is done
316    *
317    * @param expected
318    * @return true if snapshot is ready to be restored, false if it is still being taken.
319    * @throws IOException IOException if error from HDFS or RPC
320    * @throws UnknownSnapshotException if snapshot is invalid or does not exist.
321    */
322   public boolean isSnapshotDone(SnapshotDescription expected) throws IOException {
323     // check the request to make sure it has a snapshot
324     if (expected == null) {
325       throw new UnknownSnapshotException(
326          "No snapshot name passed in request, can't figure out which snapshot you want to check.");
327     }
328 
329     String ssString = ClientSnapshotDescriptionUtils.toString(expected);
330 
331     // check to see if the sentinel exists,
332     // and if the task is complete removes it from the in-progress snapshots map.
333     SnapshotSentinel handler = removeSentinelIfFinished(this.snapshotHandlers, expected);
334 
335     // stop tracking "abandoned" handlers
336     cleanupSentinels();
337 
338     if (handler == null) {
339       // If there's no handler in the in-progress map, it means one of the following:
340       //   - someone has already requested the snapshot state
341       //   - the requested snapshot was completed long time ago (cleanupSentinels() timeout)
342       //   - the snapshot was never requested
343       // In those cases returns to the user the "done state" if the snapshots exists on disk,
344       // otherwise raise an exception saying that the snapshot is not running and doesn't exist.
345       if (!isSnapshotCompleted(expected)) {
346         throw new UnknownSnapshotException("Snapshot " + ssString
347             + " is not currently running or one of the known completed snapshots.");
348       }
349       // was done, return true;
350       return true;
351     }
352 
353     // pass on any failure we find in the sentinel
354     try {
355       handler.rethrowExceptionIfFailed();
356     } catch (ForeignException e) {
357       // Give some procedure info on an exception.
358       String status;
359       Procedure p = coordinator.getProcedure(expected.getName());
360       if (p != null) {
361         status = p.getStatus();
362       } else {
363         status = expected.getName() + " not found in proclist " + coordinator.getProcedureNames();
364       }
365       throw new HBaseSnapshotException("Snapshot " + ssString +  " had an error.  " + status, e,
366           expected);
367     }
368 
369     // check to see if we are done
370     if (handler.isFinished()) {
371       LOG.debug("Snapshot '" + ssString + "' has completed, notifying client.");
372       return true;
373     } else if (LOG.isDebugEnabled()) {
374       LOG.debug("Snapshoting '" + ssString + "' is still in progress!");
375     }
376     return false;
377   }
378 
379   /**
380    * Check to see if there is a snapshot in progress with the same name or on the same table.
381    * Currently we have a limitation only allowing a single snapshot per table at a time. Also we
382    * don't allow snapshot with the same name.
383    * @param snapshot description of the snapshot being checked.
384    * @return <tt>true</tt> if there is a snapshot in progress with the same name or on the same
385    *         table.
386    */
387   synchronized boolean isTakingSnapshot(final SnapshotDescription snapshot) {
388     TableName snapshotTable = TableName.valueOf(snapshot.getTable());
389     if (isTakingSnapshot(snapshotTable)) {
390       return true;
391     }
392     Iterator<Map.Entry<TableName, SnapshotSentinel>> it = this.snapshotHandlers.entrySet().iterator();
393     while (it.hasNext()) {
394       Map.Entry<TableName, SnapshotSentinel> entry = it.next();
395       SnapshotSentinel sentinel = entry.getValue();
396       if (snapshot.getName().equals(sentinel.getSnapshot().getName()) && !sentinel.isFinished()) {
397         return true;
398       }
399     }
400     return false;
401   }
402 
403   /**
404    * Check to see if the specified table has a snapshot in progress.  Currently we have a
405    * limitation only allowing a single snapshot per table at a time.
406    * @param tableName name of the table being snapshotted.
407    * @return <tt>true</tt> if there is a snapshot in progress on the specified table.
408    */
409   synchronized boolean isTakingSnapshot(final TableName tableName) {
410     SnapshotSentinel handler = this.snapshotHandlers.get(tableName);
411     return handler != null && !handler.isFinished();
412   }
413 
414   /**
415    * Check to make sure that we are OK to run the passed snapshot. Checks to make sure that we
416    * aren't already running a snapshot or restore on the requested table.
417    * @param snapshot description of the snapshot we want to start
418    * @throws HBaseSnapshotException if the filesystem could not be prepared to start the snapshot
419    */
420   private synchronized void prepareToTakeSnapshot(SnapshotDescription snapshot)
421       throws HBaseSnapshotException {
422     FileSystem fs = master.getMasterFileSystem().getFileSystem();
423     Path workingDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir);
424     TableName snapshotTable =
425         TableName.valueOf(snapshot.getTable());
426 
427     // make sure we aren't already running a snapshot
428     if (isTakingSnapshot(snapshot)) {
429       SnapshotSentinel handler = this.snapshotHandlers.get(snapshotTable);
430       throw new SnapshotCreationException("Rejected taking "
431           + ClientSnapshotDescriptionUtils.toString(snapshot)
432           + " because we are already running another snapshot "
433           + (handler != null ? ("on the same table " +
434               ClientSnapshotDescriptionUtils.toString(handler.getSnapshot()))
435               : "with the same name"), snapshot);
436     }
437 
438     // make sure we aren't running a restore on the same table
439     if (isRestoringTable(snapshotTable)) {
440       SnapshotSentinel handler = restoreHandlers.get(snapshotTable);
441       throw new SnapshotCreationException("Rejected taking "
442           + ClientSnapshotDescriptionUtils.toString(snapshot)
443           + " because we are already have a restore in progress on the same snapshot "
444           + ClientSnapshotDescriptionUtils.toString(handler.getSnapshot()), snapshot);
445     }
446 
447     try {
448       // delete the working directory, since we aren't running the snapshot. Likely leftovers
449       // from a failed attempt.
450       fs.delete(workingDir, true);
451 
452       // recreate the working directory for the snapshot
453       if (!fs.mkdirs(workingDir)) {
454         throw new SnapshotCreationException("Couldn't create working directory (" + workingDir
455             + ") for snapshot" , snapshot);
456       }
457     } catch (HBaseSnapshotException e) {
458       throw e;
459     } catch (IOException e) {
460       throw new SnapshotCreationException(
461           "Exception while checking to see if snapshot could be started.", e, snapshot);
462     }
463   }
464 
465   /**
466    * Take a snapshot of a disabled table.
467    * @param snapshot description of the snapshot to take. Modified to be {@link Type#DISABLED}.
468    * @throws HBaseSnapshotException if the snapshot could not be started
469    */
470   private synchronized void snapshotDisabledTable(SnapshotDescription snapshot)
471       throws HBaseSnapshotException {
472     // setup the snapshot
473     prepareToTakeSnapshot(snapshot);
474 
475     // set the snapshot to be a disabled snapshot, since the client doesn't know about that
476     snapshot = snapshot.toBuilder().setType(Type.DISABLED).build();
477 
478     // Take the snapshot of the disabled table
479     DisabledTableSnapshotHandler handler =
480         new DisabledTableSnapshotHandler(snapshot, master);
481     snapshotTable(snapshot, handler);
482   }
483 
484   /**
485    * Take a snapshot of an enabled table.
486    * @param snapshot description of the snapshot to take.
487    * @throws HBaseSnapshotException if the snapshot could not be started
488    */
489   private synchronized void snapshotEnabledTable(SnapshotDescription snapshot)
490       throws HBaseSnapshotException {
491     // setup the snapshot
492     prepareToTakeSnapshot(snapshot);
493 
494     // Take the snapshot of the enabled table
495     EnabledTableSnapshotHandler handler =
496         new EnabledTableSnapshotHandler(snapshot, master, this);
497     snapshotTable(snapshot, handler);
498   }
499 
500   /**
501    * Take a snapshot using the specified handler.
502    * On failure the snapshot temporary working directory is removed.
503    * NOTE: prepareToTakeSnapshot() called before this one takes care of the rejecting the
504    *       snapshot request if the table is busy with another snapshot/restore operation.
505    * @param snapshot the snapshot description
506    * @param handler the snapshot handler
507    */
508   private synchronized void snapshotTable(SnapshotDescription snapshot,
509       final TakeSnapshotHandler handler) throws HBaseSnapshotException {
510     try {
511       handler.prepare();
512       this.executorService.submit(handler);
513       this.snapshotHandlers.put(TableName.valueOf(snapshot.getTable()), handler);
514     } catch (Exception e) {
515       // cleanup the working directory by trying to delete it from the fs.
516       Path workingDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir);
517       try {
518         if (!this.master.getMasterFileSystem().getFileSystem().delete(workingDir, true)) {
519           LOG.error("Couldn't delete working directory (" + workingDir + " for snapshot:" +
520               ClientSnapshotDescriptionUtils.toString(snapshot));
521         }
522       } catch (IOException e1) {
523         LOG.error("Couldn't delete working directory (" + workingDir + " for snapshot:" +
524             ClientSnapshotDescriptionUtils.toString(snapshot));
525       }
526       // fail the snapshot
527       throw new SnapshotCreationException("Could not build snapshot handler", e, snapshot);
528     }
529   }
530 
531   /**
532    * Take a snapshot based on the enabled/disabled state of the table.
533    *
534    * @param snapshot
535    * @throws HBaseSnapshotException when a snapshot specific exception occurs.
536    * @throws IOException when some sort of generic IO exception occurs.
537    */
538   public void takeSnapshot(SnapshotDescription snapshot) throws IOException {
539     // check to see if we already completed the snapshot
540     if (isSnapshotCompleted(snapshot)) {
541       throw new SnapshotExistsException("Snapshot '" + snapshot.getName()
542           + "' already stored on the filesystem.", snapshot);
543     }
544 
545     LOG.debug("No existing snapshot, attempting snapshot...");
546 
547     // stop tracking "abandoned" handlers
548     cleanupSentinels();
549 
550     // check to see if the table exists
551     HTableDescriptor desc = null;
552     try {
553       desc = master.getTableDescriptors().get(
554           TableName.valueOf(snapshot.getTable()));
555     } catch (FileNotFoundException e) {
556       String msg = "Table:" + snapshot.getTable() + " info doesn't exist!";
557       LOG.error(msg);
558       throw new SnapshotCreationException(msg, e, snapshot);
559     } catch (IOException e) {
560       throw new SnapshotCreationException("Error while geting table description for table "
561           + snapshot.getTable(), e, snapshot);
562     }
563     if (desc == null) {
564       throw new SnapshotCreationException("Table '" + snapshot.getTable()
565           + "' doesn't exist, can't take snapshot.", snapshot);
566     }
567 
568     // set the snapshot version, now that we are ready to take it
569     snapshot = snapshot.toBuilder().setVersion(SnapshotDescriptionUtils.SNAPSHOT_LAYOUT_VERSION)
570         .build();
571 
572     // call pre coproc hook
573     MasterCoprocessorHost cpHost = master.getCoprocessorHost();
574     if (cpHost != null) {
575       cpHost.preSnapshot(snapshot, desc);
576     }
577 
578     // if the table is enabled, then have the RS run actually the snapshot work
579     TableName snapshotTable = TableName.valueOf(snapshot.getTable());
580     AssignmentManager assignmentMgr = master.getAssignmentManager();
581     if (assignmentMgr.getZKTable().isEnabledTable(snapshotTable)) {
582       LOG.debug("Table enabled, starting distributed snapshot.");
583       snapshotEnabledTable(snapshot);
584       LOG.debug("Started snapshot: " + ClientSnapshotDescriptionUtils.toString(snapshot));
585     }
586     // For disabled table, snapshot is created by the master
587     else if (assignmentMgr.getZKTable().isDisabledTable(snapshotTable)) {
588       LOG.debug("Table is disabled, running snapshot entirely on master.");
589       snapshotDisabledTable(snapshot);
590       LOG.debug("Started snapshot: " + ClientSnapshotDescriptionUtils.toString(snapshot));
591     } else {
592       LOG.error("Can't snapshot table '" + snapshot.getTable()
593           + "', isn't open or closed, we don't know what to do!");
594       TablePartiallyOpenException tpoe = new TablePartiallyOpenException(snapshot.getTable()
595           + " isn't fully open.");
596       throw new SnapshotCreationException("Table is not entirely open or closed", tpoe, snapshot);
597     }
598 
599     // call post coproc hook
600     if (cpHost != null) {
601       cpHost.postSnapshot(snapshot, desc);
602     }
603   }
604 
605   /**
606    * Set the handler for the current snapshot
607    * <p>
608    * Exposed for TESTING
609    * @param tableName
610    * @param handler handler the master should use
611    *
612    * TODO get rid of this if possible, repackaging, modify tests.
613    */
614   public synchronized void setSnapshotHandlerForTesting(
615       final TableName tableName,
616       final SnapshotSentinel handler) {
617     if (handler != null) {
618       this.snapshotHandlers.put(tableName, handler);
619     } else {
620       this.snapshotHandlers.remove(tableName);
621     }
622   }
623 
624   /**
625    * @return distributed commit coordinator for all running snapshots
626    */
627   ProcedureCoordinator getCoordinator() {
628     return coordinator;
629   }
630 
631   /**
632    * Check to see if the snapshot is one of the currently completed snapshots
633    * Returns true if the snapshot exists in the "completed snapshots folder".
634    *
635    * @param snapshot expected snapshot to check
636    * @return <tt>true</tt> if the snapshot is stored on the {@link FileSystem}, <tt>false</tt> if is
637    *         not stored
638    * @throws IOException if the filesystem throws an unexpected exception,
639    * @throws IllegalArgumentException if snapshot name is invalid.
640    */
641   private boolean isSnapshotCompleted(SnapshotDescription snapshot) throws IOException {
642     try {
643       final Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshot, rootDir);
644       FileSystem fs = master.getMasterFileSystem().getFileSystem();
645       // check to see if the snapshot already exists
646       return fs.exists(snapshotDir);
647     } catch (IllegalArgumentException iae) {
648       throw new UnknownSnapshotException("Unexpected exception thrown", iae);
649     }
650   }
651 
652   /**
653    * Clone the specified snapshot into a new table.
654    * The operation will fail if the destination table has a snapshot or restore in progress.
655    *
656    * @param snapshot Snapshot Descriptor
657    * @param hTableDescriptor Table Descriptor of the table to create
658    */
659   synchronized void cloneSnapshot(final SnapshotDescription snapshot,
660       final HTableDescriptor hTableDescriptor) throws HBaseSnapshotException {
661     TableName tableName = hTableDescriptor.getTableName();
662 
663     // make sure we aren't running a snapshot on the same table
664     if (isTakingSnapshot(tableName)) {
665       throw new RestoreSnapshotException("Snapshot in progress on the restore table=" + tableName);
666     }
667 
668     // make sure we aren't running a restore on the same table
669     if (isRestoringTable(tableName)) {
670       throw new RestoreSnapshotException("Restore already in progress on the table=" + tableName);
671     }
672 
673     try {
674       CloneSnapshotHandler handler =
675         new CloneSnapshotHandler(master, snapshot, hTableDescriptor).prepare();
676       this.executorService.submit(handler);
677       this.restoreHandlers.put(tableName, handler);
678     } catch (Exception e) {
679       String msg = "Couldn't clone the snapshot=" + ClientSnapshotDescriptionUtils.toString(snapshot) +
680         " on table=" + tableName;
681       LOG.error(msg, e);
682       throw new RestoreSnapshotException(msg, e);
683     }
684   }
685 
686   /**
687    * Restore the specified snapshot
688    * @param reqSnapshot
689    * @throws IOException
690    */
691   public void restoreSnapshot(SnapshotDescription reqSnapshot) throws IOException {
692     FileSystem fs = master.getMasterFileSystem().getFileSystem();
693     Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(reqSnapshot, rootDir);
694     MasterCoprocessorHost cpHost = master.getCoprocessorHost();
695 
696     // check if the snapshot exists
697     if (!fs.exists(snapshotDir)) {
698       LOG.error("A Snapshot named '" + reqSnapshot.getName() + "' does not exist.");
699       throw new SnapshotDoesNotExistException(reqSnapshot);
700     }
701 
702     // read snapshot information
703     SnapshotDescription fsSnapshot = SnapshotDescriptionUtils.readSnapshotInfo(fs, snapshotDir);
704     HTableDescriptor snapshotTableDesc =
705         FSTableDescriptors.getTableDescriptorFromFs(fs, snapshotDir);
706     TableName tableName = TableName.valueOf(reqSnapshot.getTable());
707 
708     // stop tracking "abandoned" handlers
709     cleanupSentinels();
710 
711     // Execute the restore/clone operation
712     if (MetaReader.tableExists(master.getCatalogTracker(), tableName)) {
713       if (master.getAssignmentManager().getZKTable().isEnabledTable(
714           TableName.valueOf(fsSnapshot.getTable()))) {
715         throw new UnsupportedOperationException("Table '" +
716             TableName.valueOf(fsSnapshot.getTable()) + "' must be disabled in order to " +
717             "perform a restore operation" +
718             ".");
719       }
720 
721       // call coproc pre hook
722       if (cpHost != null) {
723         cpHost.preRestoreSnapshot(reqSnapshot, snapshotTableDesc);
724       }
725       restoreSnapshot(fsSnapshot, snapshotTableDesc);
726       LOG.info("Restore snapshot=" + fsSnapshot.getName() + " as table=" + tableName);
727 
728       if (cpHost != null) {
729         cpHost.postRestoreSnapshot(reqSnapshot, snapshotTableDesc);
730       }
731     } else {
732       HTableDescriptor htd = RestoreSnapshotHelper.cloneTableSchema(snapshotTableDesc, tableName);
733       if (cpHost != null) {
734         cpHost.preCloneSnapshot(reqSnapshot, htd);
735       }
736       cloneSnapshot(fsSnapshot, htd);
737       LOG.info("Clone snapshot=" + fsSnapshot.getName() + " as table=" + tableName);
738 
739       if (cpHost != null) {
740         cpHost.postCloneSnapshot(reqSnapshot, htd);
741       }
742     }
743   }
744 
745   /**
746    * Restore the specified snapshot.
747    * The restore will fail if the destination table has a snapshot or restore in progress.
748    *
749    * @param snapshot Snapshot Descriptor
750    * @param hTableDescriptor Table Descriptor
751    */
752   private synchronized void restoreSnapshot(final SnapshotDescription snapshot,
753       final HTableDescriptor hTableDescriptor) throws HBaseSnapshotException {
754     TableName tableName = hTableDescriptor.getTableName();
755 
756     // make sure we aren't running a snapshot on the same table
757     if (isTakingSnapshot(tableName)) {
758       throw new RestoreSnapshotException("Snapshot in progress on the restore table=" + tableName);
759     }
760 
761     // make sure we aren't running a restore on the same table
762     if (isRestoringTable(tableName)) {
763       throw new RestoreSnapshotException("Restore already in progress on the table=" + tableName);
764     }
765 
766     try {
767       RestoreSnapshotHandler handler =
768         new RestoreSnapshotHandler(master, snapshot, hTableDescriptor).prepare();
769       this.executorService.submit(handler);
770       restoreHandlers.put(tableName, handler);
771     } catch (Exception e) {
772       String msg = "Couldn't restore the snapshot=" + ClientSnapshotDescriptionUtils.toString(
773           snapshot)  +
774           " on table=" + tableName;
775       LOG.error(msg, e);
776       throw new RestoreSnapshotException(msg, e);
777     }
778   }
779 
780   /**
781    * Verify if the restore of the specified table is in progress.
782    *
783    * @param tableName table under restore
784    * @return <tt>true</tt> if there is a restore in progress of the specified table.
785    */
786   private synchronized boolean isRestoringTable(final TableName tableName) {
787     SnapshotSentinel sentinel = this.restoreHandlers.get(tableName);
788     return(sentinel != null && !sentinel.isFinished());
789   }
790 
791   /**
792    * Returns the status of a restore operation.
793    * If the in-progress restore is failed throws the exception that caused the failure.
794    *
795    * @param snapshot
796    * @return false if in progress, true if restore is completed or not requested.
797    * @throws IOException if there was a failure during the restore
798    */
799   public boolean isRestoreDone(final SnapshotDescription snapshot) throws IOException {
800     // check to see if the sentinel exists,
801     // and if the task is complete removes it from the in-progress restore map.
802     SnapshotSentinel sentinel = removeSentinelIfFinished(this.restoreHandlers, snapshot);
803 
804     // stop tracking "abandoned" handlers
805     cleanupSentinels();
806 
807     if (sentinel == null) {
808       // there is no sentinel so restore is not in progress.
809       return true;
810     }
811 
812     LOG.debug("Verify snapshot=" + snapshot.getName() + " against="
813         + sentinel.getSnapshot().getName() + " table=" +
814         TableName.valueOf(snapshot.getTable()));
815 
816     // If the restore is failed, rethrow the exception
817     sentinel.rethrowExceptionIfFailed();
818 
819     // check to see if we are done
820     if (sentinel.isFinished()) {
821       LOG.debug("Restore snapshot=" + ClientSnapshotDescriptionUtils.toString(snapshot) +
822           " has completed. Notifying the client.");
823       return true;
824     }
825 
826     if (LOG.isDebugEnabled()) {
827       LOG.debug("Sentinel is not yet finished with restoring snapshot=" +
828           ClientSnapshotDescriptionUtils.toString(snapshot));
829     }
830     return false;
831   }
832 
833   /**
834    * Return the handler if it is currently live and has the same snapshot target name.
835    * The handler is removed from the sentinels map if completed.
836    * @param sentinels live handlers
837    * @param snapshot snapshot description
838    * @return null if doesn't match, else a live handler.
839    */
840   private synchronized SnapshotSentinel removeSentinelIfFinished(
841       final Map<TableName, SnapshotSentinel> sentinels,
842       final SnapshotDescription snapshot) {
843     if (!snapshot.hasTable()) {
844       return null;
845     }
846 
847     TableName snapshotTable = TableName.valueOf(snapshot.getTable());
848     SnapshotSentinel h = sentinels.get(snapshotTable);
849     if (h == null) {
850       return null;
851     }
852 
853     if (!h.getSnapshot().getName().equals(snapshot.getName())) {
854       // specified snapshot is to the one currently running
855       return null;
856     }
857 
858     // Remove from the "in-progress" list once completed
859     if (h.isFinished()) {
860       sentinels.remove(snapshotTable);
861     }
862 
863     return h;
864   }
865 
866   /**
867    * Removes "abandoned" snapshot/restore requests.
868    * As part of the HBaseAdmin snapshot/restore API the operation status is checked until completed,
869    * and the in-progress maps are cleaned up when the status of a completed task is requested.
870    * To avoid having sentinels staying around for long time if something client side is failed,
871    * each operation tries to clean up the in-progress maps sentinels finished from a long time.
872    */
873   private void cleanupSentinels() {
874     cleanupSentinels(this.snapshotHandlers);
875     cleanupSentinels(this.restoreHandlers);
876   }
877 
878   /**
879    * Remove the sentinels that are marked as finished and the completion time
880    * has exceeded the removal timeout.
881    * @param sentinels map of sentinels to clean
882    */
883   private synchronized void cleanupSentinels(final Map<TableName, SnapshotSentinel> sentinels) {
884     long currentTime = EnvironmentEdgeManager.currentTimeMillis();
885     Iterator<Map.Entry<TableName, SnapshotSentinel>> it =
886         sentinels.entrySet().iterator();
887     while (it.hasNext()) {
888       Map.Entry<TableName, SnapshotSentinel> entry = it.next();
889       SnapshotSentinel sentinel = entry.getValue();
890       if (sentinel.isFinished() &&
891           (currentTime - sentinel.getCompletionTimestamp()) > SNAPSHOT_SENTINELS_CLEANUP_TIMEOUT)
892       {
893         it.remove();
894       }
895     }
896   }
897 
898   //
899   // Implementing Stoppable interface
900   //
901 
902   @Override
903   public void stop(String why) {
904     // short circuit
905     if (this.stopped) return;
906     // make sure we get stop
907     this.stopped = true;
908     // pass the stop onto take snapshot handlers
909     for (SnapshotSentinel snapshotHandler: this.snapshotHandlers.values()) {
910       snapshotHandler.cancel(why);
911     }
912 
913     // pass the stop onto all the restore handlers
914     for (SnapshotSentinel restoreHandler: this.restoreHandlers.values()) {
915       restoreHandler.cancel(why);
916     }
917     try {
918       coordinator.close();
919     } catch (IOException e) {
920       LOG.error("stop ProcedureCoordinator error", e);
921     }
922   }
923 
924   @Override
925   public boolean isStopped() {
926     return this.stopped;
927   }
928 
929   /**
930    * Throws an exception if snapshot operations (take a snapshot, restore, clone) are not supported.
931    * Called at the beginning of snapshot() and restoreSnapshot() methods.
932    * @throws UnsupportedOperationException if snapshot are not supported
933    */
934   public void checkSnapshotSupport() throws UnsupportedOperationException {
935     if (!this.isSnapshotSupported) {
936       throw new UnsupportedOperationException(
937         "To use snapshots, You must add to the hbase-site.xml of the HBase Master: '" +
938           HBASE_SNAPSHOT_ENABLED + "' property with value 'true'.");
939     }
940   }
941 
942   /**
943    * Called at startup, to verify if snapshot operation is supported, and to avoid
944    * starting the master if there're snapshots present but the cleaners needed are missing.
945    * Otherwise we can end up with snapshot data loss.
946    * @param conf The {@link Configuration} object to use
947    * @param mfs The MasterFileSystem to use
948    * @throws IOException in case of file-system operation failure
949    * @throws UnsupportedOperationException in case cleaners are missing and
950    *         there're snapshot in the system
951    */
952   private void checkSnapshotSupport(final Configuration conf, final MasterFileSystem mfs)
953       throws IOException, UnsupportedOperationException {
954     // Verify if snapshot is disabled by the user
955     String enabled = conf.get(HBASE_SNAPSHOT_ENABLED);
956     boolean snapshotEnabled = conf.getBoolean(HBASE_SNAPSHOT_ENABLED, false);
957     boolean userDisabled = (enabled != null && enabled.trim().length() > 0 && !snapshotEnabled);
958 
959     // Extract cleaners from conf
960     Set<String> hfileCleaners = new HashSet<String>();
961     String[] cleaners = conf.getStrings(HFileCleaner.MASTER_HFILE_CLEANER_PLUGINS);
962     if (cleaners != null) Collections.addAll(hfileCleaners, cleaners);
963 
964     Set<String> logCleaners = new HashSet<String>();
965     cleaners = conf.getStrings(HConstants.HBASE_MASTER_LOGCLEANER_PLUGINS);
966     if (cleaners != null) Collections.addAll(logCleaners, cleaners);
967 
968     // check if an older version of snapshot directory was present
969     Path oldSnapshotDir = new Path(mfs.getRootDir(), HConstants.OLD_SNAPSHOT_DIR_NAME);
970     FileSystem fs = mfs.getFileSystem();
971     List<SnapshotDescription> ss = getCompletedSnapshots(new Path(rootDir, oldSnapshotDir));
972     if (ss != null && !ss.isEmpty()) {
973       LOG.error("Snapshots from an earlier release were found under: " + oldSnapshotDir);
974       LOG.error("Please rename the directory as " + HConstants.SNAPSHOT_DIR_NAME);
975     }
976 
977     // If the user has enabled the snapshot, we force the cleaners to be present
978     // otherwise we still need to check if cleaners are enabled or not and verify
979     // that there're no snapshot in the .snapshot folder.
980     if (snapshotEnabled) {
981       // Inject snapshot cleaners, if snapshot.enable is true
982       hfileCleaners.add(SnapshotHFileCleaner.class.getName());
983       hfileCleaners.add(HFileLinkCleaner.class.getName());
984       logCleaners.add(SnapshotLogCleaner.class.getName());
985 
986       // Set cleaners conf
987       conf.setStrings(HFileCleaner.MASTER_HFILE_CLEANER_PLUGINS,
988         hfileCleaners.toArray(new String[hfileCleaners.size()]));
989       conf.setStrings(HConstants.HBASE_MASTER_LOGCLEANER_PLUGINS,
990         logCleaners.toArray(new String[logCleaners.size()]));
991     } else {
992       // Verify if cleaners are present
993       snapshotEnabled = logCleaners.contains(SnapshotLogCleaner.class.getName()) &&
994         hfileCleaners.contains(SnapshotHFileCleaner.class.getName()) &&
995         hfileCleaners.contains(HFileLinkCleaner.class.getName());
996 
997       // Warn if the cleaners are enabled but the snapshot.enabled property is false/not set.
998       if (snapshotEnabled) {
999         LOG.warn("Snapshot log and hfile cleaners are present in the configuration, " +
1000           "but the '" + HBASE_SNAPSHOT_ENABLED + "' property " +
1001           (userDisabled ? "is set to 'false'." : "is not set."));
1002       }
1003     }
1004 
1005     // Mark snapshot feature as enabled if cleaners are present and user has not disabled it.
1006     this.isSnapshotSupported = snapshotEnabled && !userDisabled;
1007 
1008     // If cleaners are not enabled, verify that there're no snapshot in the .snapshot folder
1009     // otherwise we end up with snapshot data loss.
1010     if (!snapshotEnabled) {
1011       LOG.info("Snapshot feature is not enabled, missing log and hfile cleaners.");
1012       Path snapshotDir = SnapshotDescriptionUtils.getSnapshotsDir(mfs.getRootDir());
1013       if (fs.exists(snapshotDir)) {
1014         FileStatus[] snapshots = FSUtils.listStatus(fs, snapshotDir,
1015           new SnapshotDescriptionUtils.CompletedSnaphotDirectoriesFilter(fs));
1016         if (snapshots != null) {
1017           LOG.error("Snapshots are present, but cleaners are not enabled.");
1018           checkSnapshotSupport();
1019         }
1020       }
1021     }
1022   }
1023 }