View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master.snapshot;
19  
20  import java.io.FileNotFoundException;
21  import java.io.IOException;
22  import java.util.ArrayList;
23  import java.util.Collections;
24  import java.util.HashMap;
25  import java.util.HashSet;
26  import java.util.Iterator;
27  import java.util.List;
28  import java.util.Map;
29  import java.util.Set;
30  import java.util.concurrent.ThreadPoolExecutor;
31  
32  import org.apache.commons.logging.Log;
33  import org.apache.commons.logging.LogFactory;
34  import org.apache.hadoop.classification.InterfaceAudience;
35  import org.apache.hadoop.classification.InterfaceStability;
36  import org.apache.hadoop.conf.Configuration;
37  import org.apache.hadoop.fs.FSDataInputStream;
38  import org.apache.hadoop.fs.FileStatus;
39  import org.apache.hadoop.fs.FileSystem;
40  import org.apache.hadoop.fs.Path;
41  import org.apache.hadoop.hbase.HConstants;
42  import org.apache.hadoop.hbase.HTableDescriptor;
43  import org.apache.hadoop.hbase.Stoppable;
44  import org.apache.hadoop.hbase.catalog.MetaReader;
45  import org.apache.hadoop.hbase.errorhandling.ForeignException;
46  import org.apache.hadoop.hbase.executor.ExecutorService;
47  import org.apache.hadoop.hbase.master.AssignmentManager;
48  import org.apache.hadoop.hbase.master.MasterCoprocessorHost;
49  import org.apache.hadoop.hbase.master.MasterFileSystem;
50  import org.apache.hadoop.hbase.master.MasterServices;
51  import org.apache.hadoop.hbase.master.SnapshotSentinel;
52  import org.apache.hadoop.hbase.master.cleaner.HFileCleaner;
53  import org.apache.hadoop.hbase.master.cleaner.HFileLinkCleaner;
54  import org.apache.hadoop.hbase.master.metrics.MasterMetrics;
55  import org.apache.hadoop.hbase.procedure.Procedure;
56  import org.apache.hadoop.hbase.procedure.ProcedureCoordinator;
57  import org.apache.hadoop.hbase.procedure.ProcedureCoordinatorRpcs;
58  import org.apache.hadoop.hbase.procedure.ZKProcedureCoordinatorRpcs;
59  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription;
60  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription.Type;
61  import org.apache.hadoop.hbase.snapshot.HBaseSnapshotException;
62  import org.apache.hadoop.hbase.snapshot.RestoreSnapshotException;
63  import org.apache.hadoop.hbase.snapshot.RestoreSnapshotHelper;
64  import org.apache.hadoop.hbase.snapshot.SnapshotCreationException;
65  import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
66  import org.apache.hadoop.hbase.snapshot.SnapshotDoesNotExistException;
67  import org.apache.hadoop.hbase.snapshot.SnapshotExistsException;
68  import org.apache.hadoop.hbase.snapshot.TablePartiallyOpenException;
69  import org.apache.hadoop.hbase.snapshot.UnknownSnapshotException;
70  import org.apache.hadoop.hbase.util.Bytes;
71  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
72  import org.apache.hadoop.hbase.util.FSTableDescriptors;
73  import org.apache.hadoop.hbase.util.FSUtils;
74  import org.apache.zookeeper.KeeperException;
75  
76  /**
77   * This class manages the procedure of taking and restoring snapshots. There is only one
78   * SnapshotManager for the master.
79   * <p>
80   * The class provides methods for monitoring in-progress snapshot actions.
81   * <p>
82   * Note: Currently there can only be one snapshot being taken at a time over the cluster. This is a
83   * simplification in the current implementation.
84   */
85  @InterfaceAudience.Private
86  @InterfaceStability.Unstable
87  public class SnapshotManager implements Stoppable {
88    private static final Log LOG = LogFactory.getLog(SnapshotManager.class);
89  
90    /** By default, check to see if the snapshot is complete every WAKE MILLIS (ms) */
91    private static final int SNAPSHOT_WAKE_MILLIS_DEFAULT = 500;
92  
93    /**
94     * Wait time before removing a finished sentinel from the in-progress map
95     *
96     * NOTE: This is used as a safety auto cleanup.
97     * The snapshot and restore handlers map entries are removed when a user asks if a snapshot or
98     * restore is completed. This operation is part of the HBaseAdmin snapshot/restore API flow.
99     * In case something fails on the client side and the snapshot/restore state is not reclaimed
100    * after a default timeout, the entry is removed from the in-progress map.
101    * At this point, if the user asks for the snapshot/restore status, the result will be
102    * snapshot done if exists or failed if it doesn't exists.
103    */
104   private static final int SNAPSHOT_SENTINELS_CLEANUP_TIMEOUT = 60 * 1000;
105 
106   /** Enable or disable snapshot support */
107   public static final String HBASE_SNAPSHOT_ENABLED = "hbase.snapshot.enabled";
108 
109   /**
110    * Conf key for # of ms elapsed between checks for snapshot errors while waiting for
111    * completion.
112    */
113   private static final String SNAPSHOT_WAKE_MILLIS_KEY = "hbase.snapshot.master.wakeMillis";
114 
115   /** By default, check to see if the snapshot is complete (ms) */
116   private static final int SNAPSHOT_TIMEOUT_MILLIS_DEFAULT = 60000;
117 
118   /**
119    * Conf key for # of ms elapsed before injecting a snapshot timeout error when waiting for
120    * completion.
121    */
122   private static final String SNAPSHOT_TIMEOUT_MILLIS_KEY = "hbase.snapshot.master.timeoutMillis";
123 
124   /** Name of the operation to use in the controller */
125   public static final String ONLINE_SNAPSHOT_CONTROLLER_DESCRIPTION = "online-snapshot";
126 
127   /** Conf key for # of threads used by the SnapshotManager thread pool */
128   private static final String SNAPSHOT_POOL_THREADS_KEY = "hbase.snapshot.master.threads";
129 
130   /** number of current operations running on the master */
131   private static final int SNAPSHOT_POOL_THREADS_DEFAULT = 1;
132 
133   private boolean stopped;
134   private final MasterServices master;  // Needed by TableEventHandlers
135   private final MasterMetrics metricsMaster;
136   private final ProcedureCoordinator coordinator;
137 
138   // Is snapshot feature enabled?
139   private boolean isSnapshotSupported = false;
140 
141   // Snapshot handlers map, with table name as key.
142   // The map is always accessed and modified under the object lock using synchronized.
143   // snapshotTable() will insert an Handler in the table.
144   // isSnapshotDone() will remove the handler requested if the operation is finished.
145   private Map<String, SnapshotSentinel> snapshotHandlers = new HashMap<String, SnapshotSentinel>();
146 
147   // Restore Sentinels map, with table name as key.
148   // The map is always accessed and modified under the object lock using synchronized.
149   // restoreSnapshot()/cloneSnapshot() will insert an Handler in the table.
150   // isRestoreDone() will remove the handler requested if the operation is finished.
151   private Map<String, SnapshotSentinel> restoreHandlers = new HashMap<String, SnapshotSentinel>();
152 
153   private final Path rootDir;
154   private final ExecutorService executorService;
155 
156   /**
157    * Construct a snapshot manager.
158    * @param master
159    */
160   public SnapshotManager(final MasterServices master, final MasterMetrics metricsMaster)
161       throws KeeperException, IOException, UnsupportedOperationException {
162     this.master = master;
163     this.metricsMaster = metricsMaster;
164 
165     this.rootDir = master.getMasterFileSystem().getRootDir();
166     checkSnapshotSupport(master.getConfiguration(), master.getMasterFileSystem());
167 
168     // get the configuration for the coordinator
169     Configuration conf = master.getConfiguration();
170     long wakeFrequency = conf.getInt(SNAPSHOT_WAKE_MILLIS_KEY, SNAPSHOT_WAKE_MILLIS_DEFAULT);
171     long timeoutMillis = conf.getLong(SNAPSHOT_TIMEOUT_MILLIS_KEY, SNAPSHOT_TIMEOUT_MILLIS_DEFAULT);
172     int opThreads = conf.getInt(SNAPSHOT_POOL_THREADS_KEY, SNAPSHOT_POOL_THREADS_DEFAULT);
173 
174     // setup the default procedure coordinator
175     String name = master.getServerName().toString();
176     ThreadPoolExecutor tpool = ProcedureCoordinator.defaultPool(name, opThreads);
177     ProcedureCoordinatorRpcs comms = new ZKProcedureCoordinatorRpcs(
178         master.getZooKeeper(), SnapshotManager.ONLINE_SNAPSHOT_CONTROLLER_DESCRIPTION, name);
179     this.coordinator = new ProcedureCoordinator(comms, tpool, timeoutMillis, wakeFrequency);
180     this.executorService = master.getExecutorService();
181     resetTempDir();
182   }
183 
184   /**
185    * Fully specify all necessary components of a snapshot manager. Exposed for testing.
186    * @param master services for the master where the manager is running
187    * @param coordinator procedure coordinator instance.  exposed for testing.
188    * @param pool HBase ExecutorServcie instance, exposed for testing.
189    */
190   public SnapshotManager(final MasterServices master, final MasterMetrics metricsMaster,
191       ProcedureCoordinator coordinator, ExecutorService pool)
192       throws IOException, UnsupportedOperationException {
193     this.master = master;
194     this.metricsMaster = metricsMaster;
195 
196     this.rootDir = master.getMasterFileSystem().getRootDir();
197     checkSnapshotSupport(master.getConfiguration(), master.getMasterFileSystem());
198 
199     this.coordinator = coordinator;
200     this.executorService = pool;
201     resetTempDir();
202   }
203 
204   /**
205    * Gets the list of all completed snapshots.
206    * @return list of SnapshotDescriptions
207    * @throws IOException File system exception
208    */
209   public List<SnapshotDescription> getCompletedSnapshots() throws IOException {
210     return getCompletedSnapshots(SnapshotDescriptionUtils.getSnapshotsDir(rootDir));
211   }
212 
213   /**
214    * Gets the list of all completed snapshots.
215    * @param snapshotDir snapshot directory
216    * @return list of SnapshotDescriptions
217    * @throws IOException File system exception
218    */
219   private List<SnapshotDescription> getCompletedSnapshots(Path snapshotDir) throws IOException {
220     List<SnapshotDescription> snapshotDescs = new ArrayList<SnapshotDescription>();
221     // first create the snapshot root path and check to see if it exists
222     FileSystem fs = master.getMasterFileSystem().getFileSystem();
223     if (snapshotDir == null) snapshotDir = SnapshotDescriptionUtils.getSnapshotsDir(rootDir);
224 
225     // if there are no snapshots, return an empty list
226     if (!fs.exists(snapshotDir)) {
227       return snapshotDescs;
228     }
229 
230     // ignore all the snapshots in progress
231     FileStatus[] snapshots = fs.listStatus(snapshotDir,
232       new SnapshotDescriptionUtils.CompletedSnaphotDirectoriesFilter(fs));
233     // loop through all the completed snapshots
234     for (FileStatus snapshot : snapshots) {
235       Path info = new Path(snapshot.getPath(), SnapshotDescriptionUtils.SNAPSHOTINFO_FILE);
236       // if the snapshot is bad
237       if (!fs.exists(info)) {
238         LOG.error("Snapshot information for " + snapshot.getPath() + " doesn't exist");
239         continue;
240       }
241       FSDataInputStream in = null;
242       try {
243         in = fs.open(info);
244         SnapshotDescription desc = SnapshotDescription.parseFrom(in);
245         snapshotDescs.add(desc);
246       } catch (IOException e) {
247         LOG.warn("Found a corrupted snapshot " + snapshot.getPath(), e);
248       } finally {
249         if (in != null) {
250           in.close();
251         }
252       }
253     }
254     return snapshotDescs;
255   }
256 
257   /**
258    * Cleans up any snapshots in the snapshot/.tmp directory that were left from failed
259    * snapshot attempts.
260    *
261    * @throws IOException if we can't reach the filesystem
262    */
263   void resetTempDir() throws IOException {
264     // cleanup any existing snapshots.
265     Path tmpdir = SnapshotDescriptionUtils.getWorkingSnapshotDir(rootDir);
266     if (master.getMasterFileSystem().getFileSystem().exists(tmpdir)) {
267       if (!master.getMasterFileSystem().getFileSystem().delete(tmpdir, true)) {
268         LOG.warn("Couldn't delete working snapshot directory: " + tmpdir);
269       }
270     }
271   }
272 
273   /**
274    * Delete the specified snapshot
275    * @param snapshot
276    * @throws SnapshotDoesNotExistException If the specified snapshot does not exist.
277    * @throws IOException For filesystem IOExceptions
278    */
279   public void deleteSnapshot(SnapshotDescription snapshot) throws SnapshotDoesNotExistException, IOException {
280 
281     // call coproc pre hook
282     MasterCoprocessorHost cpHost = master.getCoprocessorHost();
283     if (cpHost != null) {
284       cpHost.preDeleteSnapshot(snapshot);
285     }
286 
287     // check to see if it is completed
288     if (!isSnapshotCompleted(snapshot)) {
289       throw new SnapshotDoesNotExistException(snapshot);
290     }
291 
292     String snapshotName = snapshot.getName();
293     LOG.debug("Deleting snapshot: " + snapshotName);
294     // first create the snapshot description and check to see if it exists
295     MasterFileSystem fs = master.getMasterFileSystem();
296     Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshotName, rootDir);
297 
298     // delete the existing snapshot
299     if (!fs.getFileSystem().delete(snapshotDir, true)) {
300       throw new HBaseSnapshotException("Failed to delete snapshot directory: " + snapshotDir);
301     }
302 
303     // call coproc post hook
304     if (cpHost != null) {
305       cpHost.postDeleteSnapshot(snapshot);
306     }
307 
308   }
309 
310   /**
311    * Check if the specified snapshot is done
312    *
313    * @param expected
314    * @return true if snapshot is ready to be restored, false if it is still being taken.
315    * @throws IOException IOException if error from HDFS or RPC
316    * @throws UnknownSnapshotException if snapshot is invalid or does not exist.
317    */
318   public boolean isSnapshotDone(SnapshotDescription expected) throws IOException {
319     // check the request to make sure it has a snapshot
320     if (expected == null) {
321       throw new UnknownSnapshotException(
322          "No snapshot name passed in request, can't figure out which snapshot you want to check.");
323     }
324 
325     String ssString = SnapshotDescriptionUtils.toString(expected);
326 
327     // check to see if the sentinel exists,
328     // and if the task is complete removes it from the in-progress snapshots map.
329     SnapshotSentinel handler = removeSentinelIfFinished(this.snapshotHandlers, expected);
330 
331     // stop tracking "abandoned" handlers
332     cleanupSentinels();
333 
334     if (handler == null) {
335       // If there's no handler in the in-progress map, it means one of the following:
336       //   - someone has already requested the snapshot state
337       //   - the requested snapshot was completed long time ago (cleanupSentinels() timeout)
338       //   - the snapshot was never requested
339       // In those cases returns to the user the "done state" if the snapshots exists on disk,
340       // otherwise raise an exception saying that the snapshot is not running and doesn't exist.
341       if (!isSnapshotCompleted(expected)) {
342         throw new UnknownSnapshotException("Snapshot " + ssString
343             + " is not currently running or one of the known completed snapshots.");
344       }
345       // was done, return true;
346       return true;
347     }
348 
349     // pass on any failure we find in the sentinel
350     try {
351       handler.rethrowExceptionIfFailed();
352     } catch (ForeignException e) {
353       // Give some procedure info on an exception.
354       String status;
355       Procedure p = coordinator.getProcedure(expected.getName());
356       if (p != null) {
357         status = p.getStatus();
358       } else {
359         status = expected.getName() + " not found in proclist " + coordinator.getProcedureNames();
360       }
361       throw new HBaseSnapshotException("Snapshot " + ssString +  " had an error.  " + status, e,
362           expected);
363     }
364 
365     // check to see if we are done
366     if (handler.isFinished()) {
367       LOG.debug("Snapshot '" + ssString + "' has completed, notifying client.");
368       return true;
369     } else if (LOG.isDebugEnabled()) {
370       LOG.debug("Snapshoting '" + ssString + "' is still in progress!");
371     }
372     return false;
373   }
374 
375   /**
376    * Check to see if there is a snapshot in progress with the same name or on the same table.
377    * Currently we have a limitation only allowing a single snapshot per table at a time. Also we
378    * don't allow snapshot with the same name.
379    * @param snapshot description of the snapshot being checked.
380    * @return <tt>true</tt> if there is a snapshot in progress with the same name or on the same
381    *         table.
382    */
383   synchronized boolean isTakingSnapshot(final SnapshotDescription snapshot) {
384     if (isTakingSnapshot(snapshot.getTable())) {
385       return true;
386     }
387     Iterator<Map.Entry<String, SnapshotSentinel>> it = this.snapshotHandlers.entrySet().iterator();
388     while (it.hasNext()) {
389       Map.Entry<String, SnapshotSentinel> entry = it.next();
390       SnapshotSentinel sentinel = entry.getValue();
391       if (snapshot.getName().equals(sentinel.getSnapshot().getName()) && !sentinel.isFinished()) {
392         return true;
393       }
394     }
395     return false;
396   }
397 
398   /**
399    * Check to see if the specified table has a snapshot in progress.  Currently we have a
400    * limitation only allowing a single snapshot per table at a time.
401    * @param tableName name of the table being snapshotted.
402    * @return <tt>true</tt> if there is a snapshot in progress on the specified table.
403    */
404   synchronized boolean isTakingSnapshot(final String tableName) {
405     SnapshotSentinel handler = this.snapshotHandlers.get(tableName);
406     return handler != null && !handler.isFinished();
407   }
408 
409   /**
410    * Check to make sure that we are OK to run the passed snapshot. Checks to make sure that we
411    * aren't already running a snapshot or restore on the requested table.
412    * @param snapshot description of the snapshot we want to start
413    * @throws HBaseSnapshotException if the filesystem could not be prepared to start the snapshot
414    */
415   private synchronized void prepareToTakeSnapshot(SnapshotDescription snapshot)
416       throws HBaseSnapshotException {
417     FileSystem fs = master.getMasterFileSystem().getFileSystem();
418     Path workingDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir);
419     // make sure we aren't already running a snapshot
420     if (isTakingSnapshot(snapshot)) {
421       SnapshotSentinel handler = this.snapshotHandlers.get(snapshot.getTable());
422       throw new SnapshotCreationException("Rejected taking "
423           + SnapshotDescriptionUtils.toString(snapshot)
424           + " because we are already running another snapshot "
425           + (handler != null ? ("on the same table " +
426               SnapshotDescriptionUtils.toString(handler.getSnapshot()))
427               : "with the same name"), snapshot);
428     }
429 
430     // make sure we aren't running a restore on the same table
431     if (isRestoringTable(snapshot.getTable())) {
432       SnapshotSentinel handler = restoreHandlers.get(snapshot.getTable());
433       throw new SnapshotCreationException("Rejected taking "
434           + SnapshotDescriptionUtils.toString(snapshot)
435           + " because we are already have a restore in progress on the same snapshot "
436           + SnapshotDescriptionUtils.toString(handler.getSnapshot()), snapshot);
437     }
438 
439     try {
440       // delete the working directory, since we aren't running the snapshot. Likely leftovers
441       // from a failed attempt.
442       fs.delete(workingDir, true);
443 
444       // recreate the working directory for the snapshot
445       if (!fs.mkdirs(workingDir)) {
446         throw new SnapshotCreationException("Couldn't create working directory (" + workingDir
447             + ") for snapshot" , snapshot);
448       }
449     } catch (HBaseSnapshotException e) {
450       throw e;
451     } catch (IOException e) {
452       throw new SnapshotCreationException(
453           "Exception while checking to see if snapshot could be started.", e, snapshot);
454     }
455   }
456 
457   /**
458    * Take a snapshot of a disabled table.
459    * @param snapshot description of the snapshot to take. Modified to be {@link Type#DISABLED}.
460    * @throws HBaseSnapshotException if the snapshot could not be started
461    */
462   private synchronized void snapshotDisabledTable(SnapshotDescription snapshot)
463       throws HBaseSnapshotException {
464     // setup the snapshot
465     prepareToTakeSnapshot(snapshot);
466 
467     // set the snapshot to be a disabled snapshot, since the client doesn't know about that
468     snapshot = snapshot.toBuilder().setType(Type.DISABLED).build();
469 
470     // Take the snapshot of the disabled table
471     DisabledTableSnapshotHandler handler =
472         new DisabledTableSnapshotHandler(snapshot, master, metricsMaster);
473     snapshotTable(snapshot, handler);
474   }
475 
476   /**
477    * Take a snapshot of an enabled table.
478    * @param snapshot description of the snapshot to take.
479    * @throws HBaseSnapshotException if the snapshot could not be started
480    */
481   private synchronized void snapshotEnabledTable(SnapshotDescription snapshot)
482       throws HBaseSnapshotException {
483     // setup the snapshot
484     prepareToTakeSnapshot(snapshot);
485 
486     // Take the snapshot of the enabled table
487     EnabledTableSnapshotHandler handler =
488         new EnabledTableSnapshotHandler(snapshot, master, this, metricsMaster);
489     snapshotTable(snapshot, handler);
490   }
491 
492   /**
493    * Take a snapshot using the specified handler.
494    * On failure the snapshot temporary working directory is removed.
495    * NOTE: prepareToTakeSnapshot() called before this one takes care of the rejecting the
496    *       snapshot request if the table is busy with another snapshot/restore operation.
497    * @param snapshot the snapshot description
498    * @param handler the snapshot handler
499    */
500   private synchronized void snapshotTable(SnapshotDescription snapshot,
501       final TakeSnapshotHandler handler) throws HBaseSnapshotException {
502     try {
503       handler.prepare();
504       this.executorService.submit(handler);
505       this.snapshotHandlers.put(snapshot.getTable(), handler);
506     } catch (Exception e) {
507       // cleanup the working directory by trying to delete it from the fs.
508       Path workingDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir);
509       try {
510         if (!this.master.getMasterFileSystem().getFileSystem().delete(workingDir, true)) {
511           LOG.error("Couldn't delete working directory (" + workingDir + " for snapshot:" +
512               SnapshotDescriptionUtils.toString(snapshot));
513         }
514       } catch (IOException e1) {
515         LOG.error("Couldn't delete working directory (" + workingDir + " for snapshot:" +
516             SnapshotDescriptionUtils.toString(snapshot));
517       }
518       // fail the snapshot
519       throw new SnapshotCreationException("Could not build snapshot handler", e, snapshot);
520     }
521   }
522 
523   /**
524    * Take a snapshot based on the enabled/disabled state of the table.
525    *
526    * @param snapshot
527    * @throws HBaseSnapshotException when a snapshot specific exception occurs.
528    * @throws IOException when some sort of generic IO exception occurs.
529    */
530   public void takeSnapshot(SnapshotDescription snapshot) throws IOException {
531     // check to see if we already completed the snapshot
532     if (isSnapshotCompleted(snapshot)) {
533       throw new SnapshotExistsException("Snapshot '" + snapshot.getName()
534           + "' already stored on the filesystem.", snapshot);
535     }
536 
537     LOG.debug("No existing snapshot, attempting snapshot...");
538 
539     // stop tracking "abandoned" handlers
540     cleanupSentinels();
541 
542     // check to see if the table exists
543     HTableDescriptor desc = null;
544     try {
545       desc = master.getTableDescriptors().get(snapshot.getTable());
546     } catch (FileNotFoundException e) {
547       String msg = "Table:" + snapshot.getTable() + " info doesn't exist!";
548       LOG.error(msg);
549       throw new SnapshotCreationException(msg, e, snapshot);
550     } catch (IOException e) {
551       throw new SnapshotCreationException("Error while geting table description for table "
552           + snapshot.getTable(), e, snapshot);
553     }
554     if (desc == null) {
555       throw new SnapshotCreationException("Table '" + snapshot.getTable()
556           + "' doesn't exist, can't take snapshot.", snapshot);
557     }
558 
559     // set the snapshot version, now that we are ready to take it
560     snapshot = snapshot.toBuilder().setVersion(SnapshotDescriptionUtils.SNAPSHOT_LAYOUT_VERSION)
561         .build();
562 
563     // call pre coproc hook
564     MasterCoprocessorHost cpHost = master.getCoprocessorHost();
565     if (cpHost != null) {
566       cpHost.preSnapshot(snapshot, desc);
567     }
568 
569     // if the table is enabled, then have the RS run actually the snapshot work
570     AssignmentManager assignmentMgr = master.getAssignmentManager();
571     if (assignmentMgr.getZKTable().isEnabledTable(snapshot.getTable())) {
572       LOG.debug("Table enabled, starting distributed snapshot.");
573       snapshotEnabledTable(snapshot);
574       LOG.debug("Started snapshot: " + SnapshotDescriptionUtils.toString(snapshot));
575     }
576     // For disabled table, snapshot is created by the master
577     else if (assignmentMgr.getZKTable().isDisabledTable(snapshot.getTable())) {
578       LOG.debug("Table is disabled, running snapshot entirely on master.");
579       snapshotDisabledTable(snapshot);
580       LOG.debug("Started snapshot: " + SnapshotDescriptionUtils.toString(snapshot));
581     } else {
582       LOG.error("Can't snapshot table '" + snapshot.getTable()
583           + "', isn't open or closed, we don't know what to do!");
584       TablePartiallyOpenException tpoe = new TablePartiallyOpenException(snapshot.getTable()
585           + " isn't fully open.");
586       throw new SnapshotCreationException("Table is not entirely open or closed", tpoe, snapshot);
587     }
588 
589     // call post coproc hook
590     if (cpHost != null) {
591       cpHost.postSnapshot(snapshot, desc);
592     }
593   }
594 
595   /**
596    * Set the handler for the current snapshot
597    * <p>
598    * Exposed for TESTING
599    * @param tableName
600    * @param handler handler the master should use
601    *
602    * TODO get rid of this if possible, repackaging, modify tests.
603    */
604   public synchronized void setSnapshotHandlerForTesting(final String tableName,
605       final SnapshotSentinel handler) {
606     if (handler != null) {
607       this.snapshotHandlers.put(tableName, handler);
608     } else {
609       this.snapshotHandlers.remove(tableName);
610     }
611   }
612 
613   /**
614    * @return distributed commit coordinator for all running snapshots
615    */
616   ProcedureCoordinator getCoordinator() {
617     return coordinator;
618   }
619 
620   /**
621    * Check to see if the snapshot is one of the currently completed snapshots
622    * Returns true if the snapshot exists in the "completed snapshots folder".
623    *
624    * @param snapshot expected snapshot to check
625    * @return <tt>true</tt> if the snapshot is stored on the {@link FileSystem}, <tt>false</tt> if is
626    *         not stored
627    * @throws IOException if the filesystem throws an unexpected exception,
628    * @throws IllegalArgumentException if snapshot name is invalid.
629    */
630   private boolean isSnapshotCompleted(SnapshotDescription snapshot) throws IOException {
631     try {
632       final Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshot, rootDir);
633       FileSystem fs = master.getMasterFileSystem().getFileSystem();
634 
635       // check to see if the snapshot already exists
636       return fs.exists(snapshotDir);
637     } catch (IllegalArgumentException iae) {
638       throw new UnknownSnapshotException("Unexpected exception thrown", iae);
639     }
640   }
641 
642   /**
643    * Clone the specified snapshot into a new table.
644    * The operation will fail if the destination table has a snapshot or restore in progress.
645    *
646    * @param snapshot Snapshot Descriptor
647    * @param hTableDescriptor Table Descriptor of the table to create
648    */
649   synchronized void cloneSnapshot(final SnapshotDescription snapshot,
650       final HTableDescriptor hTableDescriptor) throws HBaseSnapshotException {
651     String tableName = hTableDescriptor.getNameAsString();
652 
653     // make sure we aren't running a snapshot on the same table
654     if (isTakingSnapshot(tableName)) {
655       throw new RestoreSnapshotException("Snapshot in progress on the restore table=" + tableName);
656     }
657 
658     // make sure we aren't running a restore on the same table
659     if (isRestoringTable(tableName)) {
660       throw new RestoreSnapshotException("Restore already in progress on the table=" + tableName);
661     }
662 
663     try {
664       CloneSnapshotHandler handler =
665         new CloneSnapshotHandler(master, snapshot, hTableDescriptor, metricsMaster);
666       this.executorService.submit(handler);
667       this.restoreHandlers.put(tableName, handler);
668     } catch (Exception e) {
669       String msg = "Couldn't clone the snapshot=" + SnapshotDescriptionUtils.toString(snapshot) +
670         " on table=" + tableName;
671       LOG.error(msg, e);
672       throw new RestoreSnapshotException(msg, e);
673     }
674   }
675 
676   /**
677    * Restore the specified snapshot
678    * @param reqSnapshot
679    * @throws IOException
680    */
681   public void restoreSnapshot(SnapshotDescription reqSnapshot) throws IOException {
682     FileSystem fs = master.getMasterFileSystem().getFileSystem();
683     Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(reqSnapshot, rootDir);
684     MasterCoprocessorHost cpHost = master.getCoprocessorHost();
685 
686     // check if the snapshot exists
687     if (!fs.exists(snapshotDir)) {
688       LOG.error("A Snapshot named '" + reqSnapshot.getName() + "' does not exist.");
689       throw new SnapshotDoesNotExistException(reqSnapshot);
690     }
691 
692     // read snapshot information
693     SnapshotDescription fsSnapshot = SnapshotDescriptionUtils.readSnapshotInfo(fs, snapshotDir);
694     HTableDescriptor snapshotTableDesc = FSTableDescriptors.getTableDescriptor(fs, snapshotDir);
695     String tableName = reqSnapshot.getTable();
696 
697     // stop tracking "abandoned" handlers
698     cleanupSentinels();
699 
700     // Execute the restore/clone operation
701     if (MetaReader.tableExists(master.getCatalogTracker(), tableName)) {
702       if (master.getAssignmentManager().getZKTable().isEnabledTable(fsSnapshot.getTable())) {
703         throw new UnsupportedOperationException("Table '" +
704           fsSnapshot.getTable() + "' must be disabled in order to perform a restore operation.");
705       }
706 
707       // call coproc pre hook
708       if (cpHost != null) {
709         cpHost.preRestoreSnapshot(reqSnapshot, snapshotTableDesc);
710       }
711       restoreSnapshot(fsSnapshot, snapshotTableDesc);
712       LOG.info("Restore snapshot=" + fsSnapshot.getName() + " as table=" + tableName);
713 
714       if (cpHost != null) {
715         cpHost.postRestoreSnapshot(reqSnapshot, snapshotTableDesc);
716       }
717     } else {
718       HTableDescriptor htd = RestoreSnapshotHelper.cloneTableSchema(snapshotTableDesc,
719                                                          Bytes.toBytes(tableName));
720       if (cpHost != null) {
721         cpHost.preCloneSnapshot(reqSnapshot, htd);
722       }
723       cloneSnapshot(fsSnapshot, htd);
724       LOG.info("Clone snapshot=" + fsSnapshot.getName() + " as table=" + tableName);
725 
726       if (cpHost != null) {
727         cpHost.postCloneSnapshot(reqSnapshot, htd);
728       }
729     }
730   }
731 
732   /**
733    * Restore the specified snapshot.
734    * The restore will fail if the destination table has a snapshot or restore in progress.
735    *
736    * @param snapshot Snapshot Descriptor
737    * @param hTableDescriptor Table Descriptor
738    */
739   private synchronized void restoreSnapshot(final SnapshotDescription snapshot,
740       final HTableDescriptor hTableDescriptor) throws HBaseSnapshotException {
741     String tableName = hTableDescriptor.getNameAsString();
742 
743     // make sure we aren't running a snapshot on the same table
744     if (isTakingSnapshot(tableName)) {
745       throw new RestoreSnapshotException("Snapshot in progress on the restore table=" + tableName);
746     }
747 
748     // make sure we aren't running a restore on the same table
749     if (isRestoringTable(tableName)) {
750       throw new RestoreSnapshotException("Restore already in progress on the table=" + tableName);
751     }
752 
753     try {
754       RestoreSnapshotHandler handler =
755         new RestoreSnapshotHandler(master, snapshot, hTableDescriptor, metricsMaster);
756       this.executorService.submit(handler);
757       restoreHandlers.put(hTableDescriptor.getNameAsString(), handler);
758     } catch (Exception e) {
759       String msg = "Couldn't restore the snapshot=" + SnapshotDescriptionUtils.toString(
760           snapshot)  +
761           " on table=" + tableName;
762       LOG.error(msg, e);
763       throw new RestoreSnapshotException(msg, e);
764     }
765   }
766 
767   /**
768    * Verify if the restore of the specified table is in progress.
769    *
770    * @param tableName table under restore
771    * @return <tt>true</tt> if there is a restore in progress of the specified table.
772    */
773   private synchronized boolean isRestoringTable(final String tableName) {
774     SnapshotSentinel sentinel = this.restoreHandlers.get(tableName);
775     return(sentinel != null && !sentinel.isFinished());
776   }
777 
778   /**
779    * Returns the status of a restore operation.
780    * If the in-progress restore is failed throws the exception that caused the failure.
781    *
782    * @param snapshot
783    * @return false if in progress, true if restore is completed or not requested.
784    * @throws IOException if there was a failure during the restore
785    */
786   public boolean isRestoreDone(final SnapshotDescription snapshot) throws IOException {
787     // check to see if the sentinel exists,
788     // and if the task is complete removes it from the in-progress restore map.
789     SnapshotSentinel sentinel = removeSentinelIfFinished(this.restoreHandlers, snapshot);
790 
791     // stop tracking "abandoned" handlers
792     cleanupSentinels();
793 
794     if (sentinel == null) {
795       // there is no sentinel so restore is not in progress.
796       return true;
797     }
798 
799     LOG.debug("Verify snapshot=" + snapshot.getName() + " against="
800         + sentinel.getSnapshot().getName() + " table=" + snapshot.getTable());
801 
802     // If the restore is failed, rethrow the exception
803     sentinel.rethrowExceptionIfFailed();
804 
805     // check to see if we are done
806     if (sentinel.isFinished()) {
807       LOG.debug("Restore snapshot=" + SnapshotDescriptionUtils.toString(snapshot) +
808           " has completed. Notifying the client.");
809       return true;
810     }
811 
812     if (LOG.isDebugEnabled()) {
813       LOG.debug("Sentinel is not yet finished with restoring snapshot=" +
814           SnapshotDescriptionUtils.toString(snapshot));
815     }
816     return false;
817   }
818 
819   /**
820    * Return the handler if it is currently live and has the same snapshot target name.
821    * The handler is removed from the sentinels map if completed.
822    * @param sentinels live handlers
823    * @param snapshot snapshot description
824    * @return null if doesn't match, else a live handler.
825    */
826   private synchronized SnapshotSentinel removeSentinelIfFinished(
827       final Map<String, SnapshotSentinel> sentinels, final SnapshotDescription snapshot) {
828     SnapshotSentinel h = sentinels.get(snapshot.getTable());
829     if (h == null) {
830       return null;
831     }
832 
833     if (!h.getSnapshot().getName().equals(snapshot.getName())) {
834       // specified snapshot is to the one currently running
835       return null;
836     }
837 
838     // Remove from the "in-progress" list once completed
839     if (h.isFinished()) {
840       sentinels.remove(snapshot.getTable());
841     }
842 
843     return h;
844   }
845 
846   /**
847    * Removes "abandoned" snapshot/restore requests.
848    * As part of the HBaseAdmin snapshot/restore API the operation status is checked until completed,
849    * and the in-progress maps are cleaned up when the status of a completed task is requested.
850    * To avoid having sentinels staying around for long time if something client side is failed,
851    * each operation tries to clean up the in-progress maps sentinels finished from a long time.
852    */
853   private void cleanupSentinels() {
854     cleanupSentinels(this.snapshotHandlers);
855     cleanupSentinels(this.restoreHandlers);
856   }
857 
858   /**
859    * Remove the sentinels that are marked as finished and the completion time
860    * has exceeded the removal timeout.
861    * @param sentinels map of sentinels to clean
862    */
863   private synchronized void cleanupSentinels(final Map<String, SnapshotSentinel> sentinels) {
864     long currentTime = EnvironmentEdgeManager.currentTimeMillis();
865     Iterator<Map.Entry<String, SnapshotSentinel>> it = sentinels.entrySet().iterator();
866     while (it.hasNext()) {
867       Map.Entry<String, SnapshotSentinel> entry = it.next();
868       SnapshotSentinel sentinel = entry.getValue();
869       if (sentinel.isFinished() &&
870           (currentTime - sentinel.getCompletionTimestamp()) > SNAPSHOT_SENTINELS_CLEANUP_TIMEOUT)
871       {
872         it.remove();
873       }
874     }
875   }
876 
877   //
878   // Implementing Stoppable interface
879   //
880 
881   @Override
882   public void stop(String why) {
883     // short circuit
884     if (this.stopped) return;
885     // make sure we get stop
886     this.stopped = true;
887     // pass the stop onto take snapshot handlers
888     for (SnapshotSentinel snapshotHandler: this.snapshotHandlers.values()) {
889       snapshotHandler.cancel(why);
890     }
891 
892     // pass the stop onto all the restore handlers
893     for (SnapshotSentinel restoreHandler: this.restoreHandlers.values()) {
894       restoreHandler.cancel(why);
895     }
896 
897     try {
898       coordinator.close();
899     } catch (IOException e) {
900       LOG.error("stop ProcedureCoordinator error", e);
901     }
902   }
903 
904   @Override
905   public boolean isStopped() {
906     return this.stopped;
907   }
908 
909   /**
910    * Throws an exception if snapshot operations (take a snapshot, restore, clone) are not supported.
911    * Called at the beginning of snapshot() and restoreSnapshot() methods.
912    * @throws UnsupportedOperationException if snapshot are not supported
913    */
914   public void checkSnapshotSupport() throws UnsupportedOperationException {
915     if (!this.isSnapshotSupported) {
916       throw new UnsupportedOperationException(
917         "To use snapshots, You must add to the hbase-site.xml of the HBase Master: '" +
918           HBASE_SNAPSHOT_ENABLED + "' property with value 'true'.");
919     }
920   }
921 
922   /**
923    * Called at startup, to verify if snapshot operation is supported, and to avoid
924    * starting the master if there're snapshots present but the cleaners needed are missing.
925    * Otherwise we can end up with snapshot data loss.
926    * @param conf The {@link Configuration} object to use
927    * @param mfs The MasterFileSystem to use
928    * @throws IOException in case of file-system operation failure
929    * @throws UnsupportedOperationException in case cleaners are missing and
930    *         there're snapshot in the system
931    */
932   private void checkSnapshotSupport(final Configuration conf, final MasterFileSystem mfs)
933       throws IOException, UnsupportedOperationException {
934     // Verify if snapshot is disabled by the user
935     String enabled = conf.get(HBASE_SNAPSHOT_ENABLED);
936     boolean snapshotEnabled = conf.getBoolean(HBASE_SNAPSHOT_ENABLED, false);
937     boolean userDisabled = (enabled != null && enabled.trim().length() > 0 && !snapshotEnabled);
938 
939     // Extract cleaners from conf
940     Set<String> hfileCleaners = new HashSet<String>();
941     String[] cleaners = conf.getStrings(HFileCleaner.MASTER_HFILE_CLEANER_PLUGINS);
942     if (cleaners != null) Collections.addAll(hfileCleaners, cleaners);
943 
944     Set<String> logCleaners = new HashSet<String>();
945     cleaners = conf.getStrings(HConstants.HBASE_MASTER_LOGCLEANER_PLUGINS);
946     if (cleaners != null) Collections.addAll(logCleaners, cleaners);
947 
948     // check if an older version of snapshot directory was present
949     Path oldSnapshotDir = new Path(mfs.getRootDir(), HConstants.OLD_SNAPSHOT_DIR_NAME);
950     FileSystem fs = mfs.getFileSystem();
951     List<SnapshotDescription> ss = getCompletedSnapshots(new Path(rootDir, oldSnapshotDir));
952     if (ss != null && !ss.isEmpty()) {
953       LOG.error("Snapshots from an earlier release were found under: " + oldSnapshotDir);
954       LOG.error("Please rename the directory as " + HConstants.SNAPSHOT_DIR_NAME);
955     }
956 
957     // If the user has enabled the snapshot, we force the cleaners to be present
958     // otherwise we still need to check if cleaners are enabled or not and verify
959     // that there're no snapshot in the .snapshot folder.
960     if (snapshotEnabled) {
961       // Inject snapshot cleaners, if snapshot.enable is true
962       hfileCleaners.add(SnapshotHFileCleaner.class.getName());
963       hfileCleaners.add(HFileLinkCleaner.class.getName());
964       logCleaners.add(SnapshotLogCleaner.class.getName());
965 
966       // Set cleaners conf
967       conf.setStrings(HFileCleaner.MASTER_HFILE_CLEANER_PLUGINS,
968         hfileCleaners.toArray(new String[hfileCleaners.size()]));
969       conf.setStrings(HConstants.HBASE_MASTER_LOGCLEANER_PLUGINS,
970         logCleaners.toArray(new String[logCleaners.size()]));
971     } else {
972       // Verify if cleaners are present
973       snapshotEnabled = logCleaners.contains(SnapshotLogCleaner.class.getName()) &&
974         hfileCleaners.contains(SnapshotHFileCleaner.class.getName()) &&
975         hfileCleaners.contains(HFileLinkCleaner.class.getName());
976 
977       // Warn if the cleaners are enabled but the snapshot.enabled property is false/not set.
978       if (snapshotEnabled) {
979         LOG.warn("Snapshot log and hfile cleaners are present in the configuration, " +
980           "but the '" + HBASE_SNAPSHOT_ENABLED + "' property " +
981           (userDisabled ? "is set to 'false'." : "is not set."));
982       }
983     }
984 
985     // Mark snapshot feature as enabled if cleaners are present and user has not disabled it.
986     this.isSnapshotSupported = snapshotEnabled && !userDisabled;
987 
988     // If cleaners are not enabled, verify that there're no snapshot in the .snapshot folder
989     // otherwise we end up with snapshot data loss.
990     if (!snapshotEnabled) {
991       LOG.info("Snapshot feature is not enabled, missing log and hfile cleaners.");
992       Path snapshotDir = SnapshotDescriptionUtils.getSnapshotsDir(mfs.getRootDir());
993       if (fs.exists(snapshotDir)) {
994         FileStatus[] snapshots = FSUtils.listStatus(fs, snapshotDir,
995           new SnapshotDescriptionUtils.CompletedSnaphotDirectoriesFilter(fs));
996         if (snapshots != null) {
997           LOG.error("Snapshots are present, but cleaners are not enabled.");
998           checkSnapshotSupport();
999         }
1000       }
1001     }
1002   }
1003 }