View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master.snapshot;
19  
20  import java.io.FileNotFoundException;
21  import java.io.IOException;
22  import java.util.ArrayList;
23  import java.util.Collections;
24  import java.util.HashMap;
25  import java.util.HashSet;
26  import java.util.Iterator;
27  import java.util.List;
28  import java.util.Map;
29  import java.util.Set;
30  import java.util.concurrent.ThreadPoolExecutor;
31  
32  import org.apache.commons.logging.Log;
33  import org.apache.commons.logging.LogFactory;
34  import org.apache.hadoop.classification.InterfaceAudience;
35  import org.apache.hadoop.classification.InterfaceStability;
36  import org.apache.hadoop.conf.Configuration;
37  import org.apache.hadoop.fs.FSDataInputStream;
38  import org.apache.hadoop.fs.FileStatus;
39  import org.apache.hadoop.fs.FileSystem;
40  import org.apache.hadoop.fs.Path;
41  import org.apache.hadoop.hbase.TableName;
42  import org.apache.hadoop.hbase.HConstants;
43  import org.apache.hadoop.hbase.HTableDescriptor;
44  import org.apache.hadoop.hbase.Stoppable;
45  import org.apache.hadoop.hbase.catalog.MetaReader;
46  import org.apache.hadoop.hbase.errorhandling.ForeignException;
47  import org.apache.hadoop.hbase.executor.ExecutorService;
48  import org.apache.hadoop.hbase.master.AssignmentManager;
49  import org.apache.hadoop.hbase.master.MasterCoprocessorHost;
50  import org.apache.hadoop.hbase.master.MasterFileSystem;
51  import org.apache.hadoop.hbase.master.MasterServices;
52  import org.apache.hadoop.hbase.master.MetricsMaster;
53  import org.apache.hadoop.hbase.master.SnapshotSentinel;
54  import org.apache.hadoop.hbase.master.cleaner.HFileCleaner;
55  import org.apache.hadoop.hbase.master.cleaner.HFileLinkCleaner;
56  import org.apache.hadoop.hbase.procedure.Procedure;
57  import org.apache.hadoop.hbase.procedure.ProcedureCoordinator;
58  import org.apache.hadoop.hbase.procedure.ProcedureCoordinatorRpcs;
59  import org.apache.hadoop.hbase.procedure.ZKProcedureCoordinatorRpcs;
60  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription;
61  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription.Type;
62  import org.apache.hadoop.hbase.snapshot.ClientSnapshotDescriptionUtils;
63  import org.apache.hadoop.hbase.snapshot.HBaseSnapshotException;
64  import org.apache.hadoop.hbase.snapshot.RestoreSnapshotException;
65  import org.apache.hadoop.hbase.snapshot.RestoreSnapshotHelper;
66  import org.apache.hadoop.hbase.snapshot.SnapshotCreationException;
67  import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
68  import org.apache.hadoop.hbase.snapshot.SnapshotDoesNotExistException;
69  import org.apache.hadoop.hbase.snapshot.SnapshotExistsException;
70  import org.apache.hadoop.hbase.snapshot.SnapshotReferenceUtil;
71  import org.apache.hadoop.hbase.snapshot.TablePartiallyOpenException;
72  import org.apache.hadoop.hbase.snapshot.UnknownSnapshotException;
73  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
74  import org.apache.hadoop.hbase.util.FSTableDescriptors;
75  import org.apache.hadoop.hbase.util.FSUtils;
76  import org.apache.zookeeper.KeeperException;
77  
78  /**
79   * This class manages the procedure of taking and restoring snapshots. There is only one
80   * SnapshotManager for the master.
81   * <p>
82   * The class provides methods for monitoring in-progress snapshot actions.
83   * <p>
84   * Note: Currently there can only be one snapshot being taken at a time over the cluster. This is a
85   * simplification in the current implementation.
86   */
87  @InterfaceAudience.Private
88  @InterfaceStability.Unstable
89  public class SnapshotManager implements Stoppable {
90    private static final Log LOG = LogFactory.getLog(SnapshotManager.class);
91  
92    /** By default, check to see if the snapshot is complete every WAKE MILLIS (ms) */
93    private static final int SNAPSHOT_WAKE_MILLIS_DEFAULT = 500;
94  
95    /**
96     * Wait time before removing a finished sentinel from the in-progress map
97     *
98     * NOTE: This is used as a safety auto cleanup.
99     * The snapshot and restore handlers map entries are removed when a user asks if a snapshot or
100    * restore is completed. This operation is part of the HBaseAdmin snapshot/restore API flow.
101    * In case something fails on the client side and the snapshot/restore state is not reclaimed
102    * after a default timeout, the entry is removed from the in-progress map.
103    * At this point, if the user asks for the snapshot/restore status, the result will be
104    * "snapshot done" if the snapshot exists on disk, or "failed" if it doesn't exist.
105    */
106   private static final int SNAPSHOT_SENTINELS_CLEANUP_TIMEOUT = 60 * 1000;
107 
108   /** Enable or disable snapshot support */
109   public static final String HBASE_SNAPSHOT_ENABLED = "hbase.snapshot.enabled";
110 
111   /**
112    * Conf key for # of ms elapsed between checks for snapshot errors while waiting for
113    * completion.
114    */
115   private static final String SNAPSHOT_WAKE_MILLIS_KEY = "hbase.snapshot.master.wakeMillis";
116 
117   /** By default, inject a snapshot timeout error after waiting this long for completion (ms) */
118   private static final int SNAPSHOT_TIMEOUT_MILLIS_DEFAULT = 60000;
119 
120   /**
121    * Conf key for # of ms elapsed before injecting a snapshot timeout error when waiting for
122    * completion.
123    */
124   private static final String SNAPSHOT_TIMEOUT_MILLIS_KEY = "hbase.snapshot.master.timeoutMillis";
125 
126   /** Name of the operation to use in the controller */
127   public static final String ONLINE_SNAPSHOT_CONTROLLER_DESCRIPTION = "online-snapshot";
128 
129   /** Conf key for # of threads used by the SnapshotManager thread pool */
130   private static final String SNAPSHOT_POOL_THREADS_KEY = "hbase.snapshot.master.threads";
131 
132   /** Default number of threads in the SnapshotManager thread pool (concurrent operations) */
133   private static final int SNAPSHOT_POOL_THREADS_DEFAULT = 1;
134 
135   private boolean stopped;
136   private final MasterServices master;  // Needed by TableEventHandlers
137   private final MetricsMaster metricsMaster;
138   private final ProcedureCoordinator coordinator;
139 
140   // Is snapshot feature enabled?
141   private boolean isSnapshotSupported = false;
142 
143   // Snapshot handlers map, with table name as key.
144   // The map is always accessed and modified under the object lock using synchronized.
145   // snapshotTable() will insert a handler into the map.
146   // isSnapshotDone() will remove the handler requested if the operation is finished.
147   private Map<TableName, SnapshotSentinel> snapshotHandlers =
148       new HashMap<TableName, SnapshotSentinel>();
149 
150   // Restore Sentinels map, with table name as key.
151   // The map is always accessed and modified under the object lock using synchronized.
152   // restoreSnapshot()/cloneSnapshot() will insert a handler into the map.
153   // isRestoreDone() will remove the handler requested if the operation is finished.
154   private Map<TableName, SnapshotSentinel> restoreHandlers =
155       new HashMap<TableName, SnapshotSentinel>();
156 
157   private final Path rootDir;
158   private final ExecutorService executorService;
159 
160   /**
161    * Construct a snapshot manager.
162    * @param master
163    */
164   public SnapshotManager(final MasterServices master, final MetricsMaster metricsMaster)
165       throws KeeperException, IOException, UnsupportedOperationException {
166     this.master = master;
167     this.metricsMaster = metricsMaster;
168 
169     this.rootDir = master.getMasterFileSystem().getRootDir();
170     checkSnapshotSupport(master.getConfiguration(), master.getMasterFileSystem());
171 
172     // get the configuration for the coordinator
173     Configuration conf = master.getConfiguration();
174     long wakeFrequency = conf.getInt(SNAPSHOT_WAKE_MILLIS_KEY, SNAPSHOT_WAKE_MILLIS_DEFAULT);
175     long timeoutMillis = conf.getLong(SNAPSHOT_TIMEOUT_MILLIS_KEY, SNAPSHOT_TIMEOUT_MILLIS_DEFAULT);
176     int opThreads = conf.getInt(SNAPSHOT_POOL_THREADS_KEY, SNAPSHOT_POOL_THREADS_DEFAULT);
177 
178     // setup the default procedure coordinator
179     String name = master.getServerName().toString();
180     ThreadPoolExecutor tpool = ProcedureCoordinator.defaultPool(name, opThreads);
181     ProcedureCoordinatorRpcs comms = new ZKProcedureCoordinatorRpcs(
182         master.getZooKeeper(), SnapshotManager.ONLINE_SNAPSHOT_CONTROLLER_DESCRIPTION, name);
183 
184     this.coordinator = new ProcedureCoordinator(comms, tpool, timeoutMillis, wakeFrequency);
185     this.executorService = master.getExecutorService();
186     resetTempDir();
187   }
188 
189   /**
190    * Fully specify all necessary components of a snapshot manager. Exposed for testing.
191    * @param master services for the master where the manager is running
192    * @param coordinator procedure coordinator instance.  exposed for testing.
193    * @param pool HBase ExecutorServcie instance, exposed for testing.
194    */
195   public SnapshotManager(final MasterServices master, final MetricsMaster metricsMaster,
196       ProcedureCoordinator coordinator, ExecutorService pool)
197       throws IOException, UnsupportedOperationException {
198     this.master = master;
199     this.metricsMaster = metricsMaster;
200 
201     this.rootDir = master.getMasterFileSystem().getRootDir();
202     checkSnapshotSupport(master.getConfiguration(), master.getMasterFileSystem());
203 
204     this.coordinator = coordinator;
205     this.executorService = pool;
206     resetTempDir();
207   }
208 
209   /**
210    * Gets the list of all completed snapshots.
211    * @return list of SnapshotDescriptions
212    * @throws IOException File system exception
213    */
214   public List<SnapshotDescription> getCompletedSnapshots() throws IOException {
215     return getCompletedSnapshots(SnapshotDescriptionUtils.getSnapshotsDir(rootDir));
216   }
217 
218   /**
219    * Gets the list of all completed snapshots.
220    * @param snapshotDir snapshot directory
221    * @return list of SnapshotDescriptions
222    * @throws IOException File system exception
223    */
224   private List<SnapshotDescription> getCompletedSnapshots(Path snapshotDir) throws IOException {
225     List<SnapshotDescription> snapshotDescs = new ArrayList<SnapshotDescription>();
226     // first create the snapshot root path and check to see if it exists
227     FileSystem fs = master.getMasterFileSystem().getFileSystem();
228     if (snapshotDir == null) snapshotDir = SnapshotDescriptionUtils.getSnapshotsDir(rootDir);
229 
230     // if there are no snapshots, return an empty list
231     if (!fs.exists(snapshotDir)) {
232       return snapshotDescs;
233     }
234 
235     // ignore all the snapshots in progress
236     FileStatus[] snapshots = fs.listStatus(snapshotDir,
237       new SnapshotDescriptionUtils.CompletedSnaphotDirectoriesFilter(fs));
238     // loop through all the completed snapshots
239     for (FileStatus snapshot : snapshots) {
240       Path info = new Path(snapshot.getPath(), SnapshotDescriptionUtils.SNAPSHOTINFO_FILE);
241       // if the snapshot is bad
242       if (!fs.exists(info)) {
243         LOG.error("Snapshot information for " + snapshot.getPath() + " doesn't exist");
244         continue;
245       }
246       FSDataInputStream in = null;
247       try {
248         in = fs.open(info);
249         SnapshotDescription desc = SnapshotDescription.parseFrom(in);
250         snapshotDescs.add(desc);
251       } catch (IOException e) {
252         LOG.warn("Found a corrupted snapshot " + snapshot.getPath(), e);
253       } finally {
254         if (in != null) {
255           in.close();
256         }
257       }
258     }
259     return snapshotDescs;
260   }
261 
262   /**
263    * Cleans up any snapshots in the snapshot/.tmp directory that were left from failed
264    * snapshot attempts.
265    *
266    * @throws IOException if we can't reach the filesystem
267    */
268   void resetTempDir() throws IOException {
269     // cleanup any existing snapshots.
270     Path tmpdir = SnapshotDescriptionUtils.getWorkingSnapshotDir(rootDir);
271     if (master.getMasterFileSystem().getFileSystem().exists(tmpdir)) {
272       if (!master.getMasterFileSystem().getFileSystem().delete(tmpdir, true)) {
273         LOG.warn("Couldn't delete working snapshot directory: " + tmpdir);
274       }
275     }
276   }
277 
278   /**
279    * Delete the specified snapshot
280    * @param snapshot
281    * @throws SnapshotDoesNotExistException If the specified snapshot does not exist.
282    * @throws IOException For filesystem IOExceptions
283    */
284   public void deleteSnapshot(SnapshotDescription snapshot) throws SnapshotDoesNotExistException, IOException {
285 
286     // call coproc pre hook
287     MasterCoprocessorHost cpHost = master.getCoprocessorHost();
288     if (cpHost != null) {
289       cpHost.preDeleteSnapshot(snapshot);
290     }
291 
292     // check to see if it is completed
293     if (!isSnapshotCompleted(snapshot)) {
294       throw new SnapshotDoesNotExistException(snapshot);
295     }
296 
297     String snapshotName = snapshot.getName();
298     LOG.debug("Deleting snapshot: " + snapshotName);
299     // first create the snapshot description and check to see if it exists
300     MasterFileSystem fs = master.getMasterFileSystem();
301     Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshotName, rootDir);
302 
303     // delete the existing snapshot
304     if (!fs.getFileSystem().delete(snapshotDir, true)) {
305       throw new HBaseSnapshotException("Failed to delete snapshot directory: " + snapshotDir);
306     }
307 
308     // call coproc post hook
309     if (cpHost != null) {
310       cpHost.postDeleteSnapshot(snapshot);
311     }
312 
313   }
314 
315   /**
316    * Check if the specified snapshot is done
317    *
318    * @param expected
319    * @return true if snapshot is ready to be restored, false if it is still being taken.
320    * @throws IOException IOException if error from HDFS or RPC
321    * @throws UnknownSnapshotException if snapshot is invalid or does not exist.
322    */
323   public boolean isSnapshotDone(SnapshotDescription expected) throws IOException {
324     // check the request to make sure it has a snapshot
325     if (expected == null) {
326       throw new UnknownSnapshotException(
327          "No snapshot name passed in request, can't figure out which snapshot you want to check.");
328     }
329 
330     String ssString = ClientSnapshotDescriptionUtils.toString(expected);
331 
332     // check to see if the sentinel exists,
333     // and if the task is complete removes it from the in-progress snapshots map.
334     SnapshotSentinel handler = removeSentinelIfFinished(this.snapshotHandlers, expected);
335 
336     // stop tracking "abandoned" handlers
337     cleanupSentinels();
338 
339     if (handler == null) {
340       // If there's no handler in the in-progress map, it means one of the following:
341       //   - someone has already requested the snapshot state
342       //   - the requested snapshot was completed long time ago (cleanupSentinels() timeout)
343       //   - the snapshot was never requested
344       // In those cases returns to the user the "done state" if the snapshots exists on disk,
345       // otherwise raise an exception saying that the snapshot is not running and doesn't exist.
346       if (!isSnapshotCompleted(expected)) {
347         throw new UnknownSnapshotException("Snapshot " + ssString
348             + " is not currently running or one of the known completed snapshots.");
349       }
350       // was done, return true;
351       return true;
352     }
353 
354     // pass on any failure we find in the sentinel
355     try {
356       handler.rethrowExceptionIfFailed();
357     } catch (ForeignException e) {
358       // Give some procedure info on an exception.
359       String status;
360       Procedure p = coordinator.getProcedure(expected.getName());
361       if (p != null) {
362         status = p.getStatus();
363       } else {
364         status = expected.getName() + " not found in proclist " + coordinator.getProcedureNames();
365       }
366       throw new HBaseSnapshotException("Snapshot " + ssString +  " had an error.  " + status, e,
367           expected);
368     }
369 
370     // check to see if we are done
371     if (handler.isFinished()) {
372       LOG.debug("Snapshot '" + ssString + "' has completed, notifying client.");
373       return true;
374     } else if (LOG.isDebugEnabled()) {
375       LOG.debug("Snapshoting '" + ssString + "' is still in progress!");
376     }
377     return false;
378   }
379 
380   /**
381    * Check to see if there is a snapshot in progress with the same name or on the same table.
382    * Currently we have a limitation only allowing a single snapshot per table at a time. Also we
383    * don't allow snapshot with the same name.
384    * @param snapshot description of the snapshot being checked.
385    * @return <tt>true</tt> if there is a snapshot in progress with the same name or on the same
386    *         table.
387    */
388   synchronized boolean isTakingSnapshot(final SnapshotDescription snapshot) {
389     TableName snapshotTable = TableName.valueOf(snapshot.getTable());
390     if (isTakingSnapshot(snapshotTable)) {
391       return true;
392     }
393     Iterator<Map.Entry<TableName, SnapshotSentinel>> it = this.snapshotHandlers.entrySet().iterator();
394     while (it.hasNext()) {
395       Map.Entry<TableName, SnapshotSentinel> entry = it.next();
396       SnapshotSentinel sentinel = entry.getValue();
397       if (snapshot.getName().equals(sentinel.getSnapshot().getName()) && !sentinel.isFinished()) {
398         return true;
399       }
400     }
401     return false;
402   }
403 
404   /**
405    * Check to see if the specified table has a snapshot in progress.  Currently we have a
406    * limitation only allowing a single snapshot per table at a time.
407    * @param tableName name of the table being snapshotted.
408    * @return <tt>true</tt> if there is a snapshot in progress on the specified table.
409    */
410   synchronized boolean isTakingSnapshot(final TableName tableName) {
411     SnapshotSentinel handler = this.snapshotHandlers.get(tableName);
412     return handler != null && !handler.isFinished();
413   }
414 
415   /**
416    * Check to make sure that we are OK to run the passed snapshot. Checks to make sure that we
417    * aren't already running a snapshot or restore on the requested table.
418    * @param snapshot description of the snapshot we want to start
419    * @throws HBaseSnapshotException if the filesystem could not be prepared to start the snapshot
420    */
421   private synchronized void prepareToTakeSnapshot(SnapshotDescription snapshot)
422       throws HBaseSnapshotException {
423     FileSystem fs = master.getMasterFileSystem().getFileSystem();
424     Path workingDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir);
425     TableName snapshotTable =
426         TableName.valueOf(snapshot.getTable());
427 
428     // make sure we aren't already running a snapshot
429     if (isTakingSnapshot(snapshot)) {
430       SnapshotSentinel handler = this.snapshotHandlers.get(snapshotTable);
431       throw new SnapshotCreationException("Rejected taking "
432           + ClientSnapshotDescriptionUtils.toString(snapshot)
433           + " because we are already running another snapshot "
434           + (handler != null ? ("on the same table " +
435               ClientSnapshotDescriptionUtils.toString(handler.getSnapshot()))
436               : "with the same name"), snapshot);
437     }
438 
439     // make sure we aren't running a restore on the same table
440     if (isRestoringTable(snapshotTable)) {
441       SnapshotSentinel handler = restoreHandlers.get(snapshotTable);
442       throw new SnapshotCreationException("Rejected taking "
443           + ClientSnapshotDescriptionUtils.toString(snapshot)
444           + " because we are already have a restore in progress on the same snapshot "
445           + ClientSnapshotDescriptionUtils.toString(handler.getSnapshot()), snapshot);
446     }
447 
448     try {
449       // delete the working directory, since we aren't running the snapshot. Likely leftovers
450       // from a failed attempt.
451       fs.delete(workingDir, true);
452 
453       // recreate the working directory for the snapshot
454       if (!fs.mkdirs(workingDir)) {
455         throw new SnapshotCreationException("Couldn't create working directory (" + workingDir
456             + ") for snapshot" , snapshot);
457       }
458     } catch (HBaseSnapshotException e) {
459       throw e;
460     } catch (IOException e) {
461       throw new SnapshotCreationException(
462           "Exception while checking to see if snapshot could be started.", e, snapshot);
463     }
464   }
465 
466   /**
467    * Take a snapshot of a disabled table.
468    * @param snapshot description of the snapshot to take. Modified to be {@link Type#DISABLED}.
469    * @throws HBaseSnapshotException if the snapshot could not be started
470    */
471   private synchronized void snapshotDisabledTable(SnapshotDescription snapshot)
472       throws HBaseSnapshotException {
473     // setup the snapshot
474     prepareToTakeSnapshot(snapshot);
475 
476     // set the snapshot to be a disabled snapshot, since the client doesn't know about that
477     snapshot = snapshot.toBuilder().setType(Type.DISABLED).build();
478 
479     // Take the snapshot of the disabled table
480     DisabledTableSnapshotHandler handler =
481         new DisabledTableSnapshotHandler(snapshot, master);
482     snapshotTable(snapshot, handler);
483   }
484 
485   /**
486    * Take a snapshot of an enabled table.
487    * @param snapshot description of the snapshot to take.
488    * @throws HBaseSnapshotException if the snapshot could not be started
489    */
490   private synchronized void snapshotEnabledTable(SnapshotDescription snapshot)
491       throws HBaseSnapshotException {
492     // setup the snapshot
493     prepareToTakeSnapshot(snapshot);
494 
495     // Take the snapshot of the enabled table
496     EnabledTableSnapshotHandler handler =
497         new EnabledTableSnapshotHandler(snapshot, master, this);
498     snapshotTable(snapshot, handler);
499   }
500 
501   /**
502    * Take a snapshot using the specified handler.
503    * On failure the snapshot temporary working directory is removed.
504    * NOTE: prepareToTakeSnapshot() called before this one takes care of the rejecting the
505    *       snapshot request if the table is busy with another snapshot/restore operation.
506    * @param snapshot the snapshot description
507    * @param handler the snapshot handler
508    */
509   private synchronized void snapshotTable(SnapshotDescription snapshot,
510       final TakeSnapshotHandler handler) throws HBaseSnapshotException {
511     try {
512       handler.prepare();
513       this.executorService.submit(handler);
514       this.snapshotHandlers.put(TableName.valueOf(snapshot.getTable()), handler);
515     } catch (Exception e) {
516       // cleanup the working directory by trying to delete it from the fs.
517       Path workingDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir);
518       try {
519         if (!this.master.getMasterFileSystem().getFileSystem().delete(workingDir, true)) {
520           LOG.error("Couldn't delete working directory (" + workingDir + " for snapshot:" +
521               ClientSnapshotDescriptionUtils.toString(snapshot));
522         }
523       } catch (IOException e1) {
524         LOG.error("Couldn't delete working directory (" + workingDir + " for snapshot:" +
525             ClientSnapshotDescriptionUtils.toString(snapshot));
526       }
527       // fail the snapshot
528       throw new SnapshotCreationException("Could not build snapshot handler", e, snapshot);
529     }
530   }
531 
532   /**
533    * Take a snapshot based on the enabled/disabled state of the table.
534    *
535    * @param snapshot
536    * @throws HBaseSnapshotException when a snapshot specific exception occurs.
537    * @throws IOException when some sort of generic IO exception occurs.
538    */
539   public void takeSnapshot(SnapshotDescription snapshot) throws IOException {
540     // check to see if we already completed the snapshot
541     if (isSnapshotCompleted(snapshot)) {
542       throw new SnapshotExistsException("Snapshot '" + snapshot.getName()
543           + "' already stored on the filesystem.", snapshot);
544     }
545 
546     LOG.debug("No existing snapshot, attempting snapshot...");
547 
548     // stop tracking "abandoned" handlers
549     cleanupSentinels();
550 
551     // check to see if the table exists
552     HTableDescriptor desc = null;
553     try {
554       desc = master.getTableDescriptors().get(
555           TableName.valueOf(snapshot.getTable()));
556     } catch (FileNotFoundException e) {
557       String msg = "Table:" + snapshot.getTable() + " info doesn't exist!";
558       LOG.error(msg);
559       throw new SnapshotCreationException(msg, e, snapshot);
560     } catch (IOException e) {
561       throw new SnapshotCreationException("Error while geting table description for table "
562           + snapshot.getTable(), e, snapshot);
563     }
564     if (desc == null) {
565       throw new SnapshotCreationException("Table '" + snapshot.getTable()
566           + "' doesn't exist, can't take snapshot.", snapshot);
567     }
568 
569     // set the snapshot version, now that we are ready to take it
570     snapshot = snapshot.toBuilder().setVersion(SnapshotDescriptionUtils.SNAPSHOT_LAYOUT_VERSION)
571         .build();
572 
573     // call pre coproc hook
574     MasterCoprocessorHost cpHost = master.getCoprocessorHost();
575     if (cpHost != null) {
576       cpHost.preSnapshot(snapshot, desc);
577     }
578 
579     // if the table is enabled, then have the RS run actually the snapshot work
580     TableName snapshotTable = TableName.valueOf(snapshot.getTable());
581     AssignmentManager assignmentMgr = master.getAssignmentManager();
582     if (assignmentMgr.getZKTable().isEnabledTable(snapshotTable)) {
583       LOG.debug("Table enabled, starting distributed snapshot.");
584       snapshotEnabledTable(snapshot);
585       LOG.debug("Started snapshot: " + ClientSnapshotDescriptionUtils.toString(snapshot));
586     }
587     // For disabled table, snapshot is created by the master
588     else if (assignmentMgr.getZKTable().isDisabledTable(snapshotTable)) {
589       LOG.debug("Table is disabled, running snapshot entirely on master.");
590       snapshotDisabledTable(snapshot);
591       LOG.debug("Started snapshot: " + ClientSnapshotDescriptionUtils.toString(snapshot));
592     } else {
593       LOG.error("Can't snapshot table '" + snapshot.getTable()
594           + "', isn't open or closed, we don't know what to do!");
595       TablePartiallyOpenException tpoe = new TablePartiallyOpenException(snapshot.getTable()
596           + " isn't fully open.");
597       throw new SnapshotCreationException("Table is not entirely open or closed", tpoe, snapshot);
598     }
599 
600     // call post coproc hook
601     if (cpHost != null) {
602       cpHost.postSnapshot(snapshot, desc);
603     }
604   }
605 
606   /**
607    * Set the handler for the current snapshot
608    * <p>
609    * Exposed for TESTING
610    * @param tableName
611    * @param handler handler the master should use
612    *
613    * TODO get rid of this if possible, repackaging, modify tests.
614    */
615   public synchronized void setSnapshotHandlerForTesting(
616       final TableName tableName,
617       final SnapshotSentinel handler) {
618     if (handler != null) {
619       this.snapshotHandlers.put(tableName, handler);
620     } else {
621       this.snapshotHandlers.remove(tableName);
622     }
623   }
624 
625   /**
626    * @return distributed commit coordinator for all running snapshots
627    */
628   ProcedureCoordinator getCoordinator() {
629     return coordinator;
630   }
631 
632   /**
633    * Check to see if the snapshot is one of the currently completed snapshots
634    * Returns true if the snapshot exists in the "completed snapshots folder".
635    *
636    * @param snapshot expected snapshot to check
637    * @return <tt>true</tt> if the snapshot is stored on the {@link FileSystem}, <tt>false</tt> if is
638    *         not stored
639    * @throws IOException if the filesystem throws an unexpected exception,
640    * @throws IllegalArgumentException if snapshot name is invalid.
641    */
642   private boolean isSnapshotCompleted(SnapshotDescription snapshot) throws IOException {
643     try {
644       final Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshot, rootDir);
645       FileSystem fs = master.getMasterFileSystem().getFileSystem();
646       // check to see if the snapshot already exists
647       return fs.exists(snapshotDir);
648     } catch (IllegalArgumentException iae) {
649       throw new UnknownSnapshotException("Unexpected exception thrown", iae);
650     }
651   }
652 
653   /**
654    * Clone the specified snapshot into a new table.
655    * The operation will fail if the destination table has a snapshot or restore in progress.
656    *
657    * @param snapshot Snapshot Descriptor
658    * @param hTableDescriptor Table Descriptor of the table to create
659    */
660   synchronized void cloneSnapshot(final SnapshotDescription snapshot,
661       final HTableDescriptor hTableDescriptor) throws HBaseSnapshotException {
662     TableName tableName = hTableDescriptor.getTableName();
663 
664     // make sure we aren't running a snapshot on the same table
665     if (isTakingSnapshot(tableName)) {
666       throw new RestoreSnapshotException("Snapshot in progress on the restore table=" + tableName);
667     }
668 
669     // make sure we aren't running a restore on the same table
670     if (isRestoringTable(tableName)) {
671       throw new RestoreSnapshotException("Restore already in progress on the table=" + tableName);
672     }
673 
674     try {
675       CloneSnapshotHandler handler =
676         new CloneSnapshotHandler(master, snapshot, hTableDescriptor).prepare();
677       this.executorService.submit(handler);
678       this.restoreHandlers.put(tableName, handler);
679     } catch (Exception e) {
680       String msg = "Couldn't clone the snapshot=" + ClientSnapshotDescriptionUtils.toString(snapshot) +
681         " on table=" + tableName;
682       LOG.error(msg, e);
683       throw new RestoreSnapshotException(msg, e);
684     }
685   }
686 
687   /**
688    * Restore the specified snapshot
689    * @param reqSnapshot
690    * @throws IOException
691    */
692   public void restoreSnapshot(SnapshotDescription reqSnapshot) throws IOException {
693     FileSystem fs = master.getMasterFileSystem().getFileSystem();
694     Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(reqSnapshot, rootDir);
695     MasterCoprocessorHost cpHost = master.getCoprocessorHost();
696 
697     // check if the snapshot exists
698     if (!fs.exists(snapshotDir)) {
699       LOG.error("A Snapshot named '" + reqSnapshot.getName() + "' does not exist.");
700       throw new SnapshotDoesNotExistException(reqSnapshot);
701     }
702 
703     // read snapshot information
704     SnapshotDescription fsSnapshot = SnapshotDescriptionUtils.readSnapshotInfo(fs, snapshotDir);
705     HTableDescriptor snapshotTableDesc =
706         FSTableDescriptors.getTableDescriptorFromFs(fs, snapshotDir);
707     TableName tableName = TableName.valueOf(reqSnapshot.getTable());
708 
709     // stop tracking "abandoned" handlers
710     cleanupSentinels();
711 
712     // Verify snapshot validity
713     SnapshotReferenceUtil.verifySnapshot(master.getConfiguration(), fs, snapshotDir, fsSnapshot);
714 
715     // Execute the restore/clone operation
716     if (MetaReader.tableExists(master.getCatalogTracker(), tableName)) {
717       if (master.getAssignmentManager().getZKTable().isEnabledTable(
718           TableName.valueOf(fsSnapshot.getTable()))) {
719         throw new UnsupportedOperationException("Table '" +
720             TableName.valueOf(fsSnapshot.getTable()) + "' must be disabled in order to " +
721             "perform a restore operation" +
722             ".");
723       }
724 
725       // call coproc pre hook
726       if (cpHost != null) {
727         cpHost.preRestoreSnapshot(reqSnapshot, snapshotTableDesc);
728       }
729       restoreSnapshot(fsSnapshot, snapshotTableDesc);
730       LOG.info("Restore snapshot=" + fsSnapshot.getName() + " as table=" + tableName);
731 
732       if (cpHost != null) {
733         cpHost.postRestoreSnapshot(reqSnapshot, snapshotTableDesc);
734       }
735     } else {
736       HTableDescriptor htd = RestoreSnapshotHelper.cloneTableSchema(snapshotTableDesc, tableName);
737       if (cpHost != null) {
738         cpHost.preCloneSnapshot(reqSnapshot, htd);
739       }
740       cloneSnapshot(fsSnapshot, htd);
741       LOG.info("Clone snapshot=" + fsSnapshot.getName() + " as table=" + tableName);
742 
743       if (cpHost != null) {
744         cpHost.postCloneSnapshot(reqSnapshot, htd);
745       }
746     }
747   }
748 
749   /**
750    * Restore the specified snapshot.
751    * The restore will fail if the destination table has a snapshot or restore in progress.
752    *
753    * @param snapshot Snapshot Descriptor
754    * @param hTableDescriptor Table Descriptor
755    */
756   private synchronized void restoreSnapshot(final SnapshotDescription snapshot,
757       final HTableDescriptor hTableDescriptor) throws HBaseSnapshotException {
758     TableName tableName = hTableDescriptor.getTableName();
759 
760     // make sure we aren't running a snapshot on the same table
761     if (isTakingSnapshot(tableName)) {
762       throw new RestoreSnapshotException("Snapshot in progress on the restore table=" + tableName);
763     }
764 
765     // make sure we aren't running a restore on the same table
766     if (isRestoringTable(tableName)) {
767       throw new RestoreSnapshotException("Restore already in progress on the table=" + tableName);
768     }
769 
770     try {
771       RestoreSnapshotHandler handler =
772         new RestoreSnapshotHandler(master, snapshot, hTableDescriptor).prepare();
773       this.executorService.submit(handler);
774       restoreHandlers.put(tableName, handler);
775     } catch (Exception e) {
776       String msg = "Couldn't restore the snapshot=" + ClientSnapshotDescriptionUtils.toString(
777           snapshot)  +
778           " on table=" + tableName;
779       LOG.error(msg, e);
780       throw new RestoreSnapshotException(msg, e);
781     }
782   }
783 
784   /**
785    * Verify if the restore of the specified table is in progress.
786    *
787    * @param tableName table under restore
788    * @return <tt>true</tt> if there is a restore in progress of the specified table.
789    */
790   private synchronized boolean isRestoringTable(final TableName tableName) {
791     SnapshotSentinel sentinel = this.restoreHandlers.get(tableName);
792     return(sentinel != null && !sentinel.isFinished());
793   }
794 
795   /**
796    * Returns the status of a restore operation.
797    * If the in-progress restore is failed throws the exception that caused the failure.
798    *
799    * @param snapshot
800    * @return false if in progress, true if restore is completed or not requested.
801    * @throws IOException if there was a failure during the restore
802    */
803   public boolean isRestoreDone(final SnapshotDescription snapshot) throws IOException {
804     // check to see if the sentinel exists,
805     // and if the task is complete removes it from the in-progress restore map.
806     SnapshotSentinel sentinel = removeSentinelIfFinished(this.restoreHandlers, snapshot);
807 
808     // stop tracking "abandoned" handlers
809     cleanupSentinels();
810 
811     if (sentinel == null) {
812       // there is no sentinel so restore is not in progress.
813       return true;
814     }
815 
816     LOG.debug("Verify snapshot=" + snapshot.getName() + " against="
817         + sentinel.getSnapshot().getName() + " table=" +
818         TableName.valueOf(snapshot.getTable()));
819 
820     // If the restore is failed, rethrow the exception
821     sentinel.rethrowExceptionIfFailed();
822 
823     // check to see if we are done
824     if (sentinel.isFinished()) {
825       LOG.debug("Restore snapshot=" + ClientSnapshotDescriptionUtils.toString(snapshot) +
826           " has completed. Notifying the client.");
827       return true;
828     }
829 
830     if (LOG.isDebugEnabled()) {
831       LOG.debug("Sentinel is not yet finished with restoring snapshot=" +
832           ClientSnapshotDescriptionUtils.toString(snapshot));
833     }
834     return false;
835   }
836 
837   /**
838    * Return the handler if it is currently live and has the same snapshot target name.
839    * The handler is removed from the sentinels map if completed.
840    * @param sentinels live handlers
841    * @param snapshot snapshot description
842    * @return null if doesn't match, else a live handler.
843    */
844   private synchronized SnapshotSentinel removeSentinelIfFinished(
845       final Map<TableName, SnapshotSentinel> sentinels,
846       final SnapshotDescription snapshot) {
847     if (!snapshot.hasTable()) {
848       return null;
849     }
850 
851     TableName snapshotTable = TableName.valueOf(snapshot.getTable());
852     SnapshotSentinel h = sentinels.get(snapshotTable);
853     if (h == null) {
854       return null;
855     }
856 
857     if (!h.getSnapshot().getName().equals(snapshot.getName())) {
858       // specified snapshot is to the one currently running
859       return null;
860     }
861 
862     // Remove from the "in-progress" list once completed
863     if (h.isFinished()) {
864       sentinels.remove(snapshotTable);
865     }
866 
867     return h;
868   }
869 
870   /**
871    * Removes "abandoned" snapshot/restore requests.
872    * As part of the HBaseAdmin snapshot/restore API the operation status is checked until completed,
873    * and the in-progress maps are cleaned up when the status of a completed task is requested.
874    * To avoid having sentinels staying around for long time if something client side is failed,
875    * each operation tries to clean up the in-progress maps sentinels finished from a long time.
876    */
877   private void cleanupSentinels() {
878     cleanupSentinels(this.snapshotHandlers);
879     cleanupSentinels(this.restoreHandlers);
880   }
881 
882   /**
883    * Remove the sentinels that are marked as finished and the completion time
884    * has exceeded the removal timeout.
885    * @param sentinels map of sentinels to clean
886    */
887   private synchronized void cleanupSentinels(final Map<TableName, SnapshotSentinel> sentinels) {
888     long currentTime = EnvironmentEdgeManager.currentTimeMillis();
889     Iterator<Map.Entry<TableName, SnapshotSentinel>> it =
890         sentinels.entrySet().iterator();
891     while (it.hasNext()) {
892       Map.Entry<TableName, SnapshotSentinel> entry = it.next();
893       SnapshotSentinel sentinel = entry.getValue();
894       if (sentinel.isFinished() &&
895           (currentTime - sentinel.getCompletionTimestamp()) > SNAPSHOT_SENTINELS_CLEANUP_TIMEOUT)
896       {
897         it.remove();
898       }
899     }
900   }
901 
902   //
903   // Implementing Stoppable interface
904   //
905 
906   @Override
907   public void stop(String why) {
908     // short circuit
909     if (this.stopped) return;
910     // make sure we get stop
911     this.stopped = true;
912     // pass the stop onto take snapshot handlers
913     for (SnapshotSentinel snapshotHandler: this.snapshotHandlers.values()) {
914       snapshotHandler.cancel(why);
915     }
916 
917     // pass the stop onto all the restore handlers
918     for (SnapshotSentinel restoreHandler: this.restoreHandlers.values()) {
919       restoreHandler.cancel(why);
920     }
921     try {
922       coordinator.close();
923     } catch (IOException e) {
924       LOG.error("stop ProcedureCoordinator error", e);
925     }
926   }
927 
928   @Override
929   public boolean isStopped() {
930     return this.stopped;
931   }
932 
933   /**
934    * Throws an exception if snapshot operations (take a snapshot, restore, clone) are not supported.
935    * Called at the beginning of snapshot() and restoreSnapshot() methods.
936    * @throws UnsupportedOperationException if snapshot are not supported
937    */
938   public void checkSnapshotSupport() throws UnsupportedOperationException {
939     if (!this.isSnapshotSupported) {
940       throw new UnsupportedOperationException(
941         "To use snapshots, You must add to the hbase-site.xml of the HBase Master: '" +
942           HBASE_SNAPSHOT_ENABLED + "' property with value 'true'.");
943     }
944   }
945 
946   /**
947    * Called at startup, to verify if snapshot operation is supported, and to avoid
948    * starting the master if there're snapshots present but the cleaners needed are missing.
949    * Otherwise we can end up with snapshot data loss.
950    * @param conf The {@link Configuration} object to use
951    * @param mfs The MasterFileSystem to use
952    * @throws IOException in case of file-system operation failure
953    * @throws UnsupportedOperationException in case cleaners are missing and
954    *         there're snapshot in the system
955    */
956   private void checkSnapshotSupport(final Configuration conf, final MasterFileSystem mfs)
957       throws IOException, UnsupportedOperationException {
958     // Verify if snapshot is disabled by the user
959     String enabled = conf.get(HBASE_SNAPSHOT_ENABLED);
960     boolean snapshotEnabled = conf.getBoolean(HBASE_SNAPSHOT_ENABLED, false);
961     boolean userDisabled = (enabled != null && enabled.trim().length() > 0 && !snapshotEnabled);
962 
963     // Extract cleaners from conf
964     Set<String> hfileCleaners = new HashSet<String>();
965     String[] cleaners = conf.getStrings(HFileCleaner.MASTER_HFILE_CLEANER_PLUGINS);
966     if (cleaners != null) Collections.addAll(hfileCleaners, cleaners);
967 
968     Set<String> logCleaners = new HashSet<String>();
969     cleaners = conf.getStrings(HConstants.HBASE_MASTER_LOGCLEANER_PLUGINS);
970     if (cleaners != null) Collections.addAll(logCleaners, cleaners);
971 
972     // check if an older version of snapshot directory was present
973     Path oldSnapshotDir = new Path(mfs.getRootDir(), HConstants.OLD_SNAPSHOT_DIR_NAME);
974     FileSystem fs = mfs.getFileSystem();
975     List<SnapshotDescription> ss = getCompletedSnapshots(new Path(rootDir, oldSnapshotDir));
976     if (ss != null && !ss.isEmpty()) {
977       LOG.error("Snapshots from an earlier release were found under: " + oldSnapshotDir);
978       LOG.error("Please rename the directory as " + HConstants.SNAPSHOT_DIR_NAME);
979     }
980 
981     // If the user has enabled the snapshot, we force the cleaners to be present
982     // otherwise we still need to check if cleaners are enabled or not and verify
983     // that there're no snapshot in the .snapshot folder.
984     if (snapshotEnabled) {
985       // Inject snapshot cleaners, if snapshot.enable is true
986       hfileCleaners.add(SnapshotHFileCleaner.class.getName());
987       hfileCleaners.add(HFileLinkCleaner.class.getName());
988       logCleaners.add(SnapshotLogCleaner.class.getName());
989 
990       // Set cleaners conf
991       conf.setStrings(HFileCleaner.MASTER_HFILE_CLEANER_PLUGINS,
992         hfileCleaners.toArray(new String[hfileCleaners.size()]));
993       conf.setStrings(HConstants.HBASE_MASTER_LOGCLEANER_PLUGINS,
994         logCleaners.toArray(new String[logCleaners.size()]));
995     } else {
996       // Verify if cleaners are present
997       snapshotEnabled = logCleaners.contains(SnapshotLogCleaner.class.getName()) &&
998         hfileCleaners.contains(SnapshotHFileCleaner.class.getName()) &&
999         hfileCleaners.contains(HFileLinkCleaner.class.getName());
1000 
1001       // Warn if the cleaners are enabled but the snapshot.enabled property is false/not set.
1002       if (snapshotEnabled) {
1003         LOG.warn("Snapshot log and hfile cleaners are present in the configuration, " +
1004           "but the '" + HBASE_SNAPSHOT_ENABLED + "' property " +
1005           (userDisabled ? "is set to 'false'." : "is not set."));
1006       }
1007     }
1008 
1009     // Mark snapshot feature as enabled if cleaners are present and user has not disabled it.
1010     this.isSnapshotSupported = snapshotEnabled && !userDisabled;
1011 
1012     // If cleaners are not enabled, verify that there're no snapshot in the .snapshot folder
1013     // otherwise we end up with snapshot data loss.
1014     if (!snapshotEnabled) {
1015       LOG.info("Snapshot feature is not enabled, missing log and hfile cleaners.");
1016       Path snapshotDir = SnapshotDescriptionUtils.getSnapshotsDir(mfs.getRootDir());
1017       if (fs.exists(snapshotDir)) {
1018         FileStatus[] snapshots = FSUtils.listStatus(fs, snapshotDir,
1019           new SnapshotDescriptionUtils.CompletedSnaphotDirectoriesFilter(fs));
1020         if (snapshots != null) {
1021           LOG.error("Snapshots are present, but cleaners are not enabled.");
1022           checkSnapshotSupport();
1023         }
1024       }
1025     }
1026   }
1027 }