View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master.snapshot;
19  
20  import java.io.FileNotFoundException;
21  import java.io.IOException;
22  import java.util.ArrayList;
23  import java.util.Collections;
24  import java.util.HashMap;
25  import java.util.HashSet;
26  import java.util.Iterator;
27  import java.util.List;
28  import java.util.Map;
29  import java.util.Set;
30  import java.util.concurrent.ThreadPoolExecutor;
31  
32  import org.apache.commons.logging.Log;
33  import org.apache.commons.logging.LogFactory;
34  import org.apache.hadoop.classification.InterfaceAudience;
35  import org.apache.hadoop.classification.InterfaceStability;
36  import org.apache.hadoop.conf.Configuration;
37  import org.apache.hadoop.fs.FSDataInputStream;
38  import org.apache.hadoop.fs.FileStatus;
39  import org.apache.hadoop.fs.FileSystem;
40  import org.apache.hadoop.fs.Path;
41  import org.apache.hadoop.hbase.TableName;
42  import org.apache.hadoop.hbase.HConstants;
43  import org.apache.hadoop.hbase.HTableDescriptor;
44  import org.apache.hadoop.hbase.Stoppable;
45  import org.apache.hadoop.hbase.catalog.MetaReader;
46  import org.apache.hadoop.hbase.errorhandling.ForeignException;
47  import org.apache.hadoop.hbase.executor.ExecutorService;
48  import org.apache.hadoop.hbase.master.AssignmentManager;
49  import org.apache.hadoop.hbase.master.MasterCoprocessorHost;
50  import org.apache.hadoop.hbase.master.MasterFileSystem;
51  import org.apache.hadoop.hbase.master.MasterServices;
52  import org.apache.hadoop.hbase.master.MetricsMaster;
53  import org.apache.hadoop.hbase.master.SnapshotSentinel;
54  import org.apache.hadoop.hbase.master.cleaner.HFileCleaner;
55  import org.apache.hadoop.hbase.master.cleaner.HFileLinkCleaner;
56  import org.apache.hadoop.hbase.procedure.MasterProcedureManager;
57  import org.apache.hadoop.hbase.procedure.Procedure;
58  import org.apache.hadoop.hbase.procedure.ProcedureCoordinator;
59  import org.apache.hadoop.hbase.procedure.ProcedureCoordinatorRpcs;
60  import org.apache.hadoop.hbase.procedure.ZKProcedureCoordinatorRpcs;
61  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.NameStringPair;
62  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.ProcedureDescription;
63  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription;
64  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription.Type;
65  import org.apache.hadoop.hbase.snapshot.ClientSnapshotDescriptionUtils;
66  import org.apache.hadoop.hbase.snapshot.HBaseSnapshotException;
67  import org.apache.hadoop.hbase.snapshot.RestoreSnapshotException;
68  import org.apache.hadoop.hbase.snapshot.RestoreSnapshotHelper;
69  import org.apache.hadoop.hbase.snapshot.SnapshotCreationException;
70  import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
71  import org.apache.hadoop.hbase.snapshot.SnapshotDoesNotExistException;
72  import org.apache.hadoop.hbase.snapshot.SnapshotExistsException;
73  import org.apache.hadoop.hbase.snapshot.SnapshotManifest;
74  import org.apache.hadoop.hbase.snapshot.SnapshotReferenceUtil;
75  import org.apache.hadoop.hbase.snapshot.TablePartiallyOpenException;
76  import org.apache.hadoop.hbase.snapshot.UnknownSnapshotException;
77  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
78  import org.apache.hadoop.hbase.util.FSUtils;
79  import org.apache.zookeeper.KeeperException;
80  
81  /**
82   * This class manages the procedure of taking and restoring snapshots. There is only one
83   * SnapshotManager for the master.
84   * <p>
85   * The class provides methods for monitoring in-progress snapshot actions.
86   * <p>
87   * Note: Currently there can only be one snapshot being taken at a time over the cluster. This is a
88   * simplification in the current implementation.
89   */
90  @InterfaceAudience.Private
91  @InterfaceStability.Unstable
92  public class SnapshotManager extends MasterProcedureManager implements Stoppable {
93    private static final Log LOG = LogFactory.getLog(SnapshotManager.class);
94  
95    /** By default, check to see if the snapshot is complete every WAKE MILLIS (ms) */
96    private static final int SNAPSHOT_WAKE_MILLIS_DEFAULT = 500;
97  
98    /**
99     * Wait time before removing a finished sentinel from the in-progress map
100    *
101    * NOTE: This is used as a safety auto cleanup.
102    * The snapshot and restore handlers map entries are removed when a user asks if a snapshot or
103    * restore is completed. This operation is part of the HBaseAdmin snapshot/restore API flow.
104    * In case something fails on the client side and the snapshot/restore state is not reclaimed
105    * after a default timeout, the entry is removed from the in-progress map.
   * At this point, if the user asks for the snapshot/restore status, the result will be
   * "snapshot done" if the snapshot exists, or "failed" if it doesn't exist.
108    */
109   private static final int SNAPSHOT_SENTINELS_CLEANUP_TIMEOUT = 60 * 1000;
110 
111   /** Enable or disable snapshot support */
112   public static final String HBASE_SNAPSHOT_ENABLED = "hbase.snapshot.enabled";
113 
114   /**
115    * Conf key for # of ms elapsed between checks for snapshot errors while waiting for
116    * completion.
117    */
118   private static final String SNAPSHOT_WAKE_MILLIS_KEY = "hbase.snapshot.master.wakeMillis";
119 
  /** By default, time out a snapshot that has not completed after this many ms */
121   private static final int SNAPSHOT_TIMEOUT_MILLIS_DEFAULT = 60000;
122 
123   /**
124    * Conf key for # of ms elapsed before injecting a snapshot timeout error when waiting for
125    * completion.
126    */
127   private static final String SNAPSHOT_TIMEOUT_MILLIS_KEY = "hbase.snapshot.master.timeoutMillis";
128 
129   /** Name of the operation to use in the controller */
130   public static final String ONLINE_SNAPSHOT_CONTROLLER_DESCRIPTION = "online-snapshot";
131 
132   /** Conf key for # of threads used by the SnapshotManager thread pool */
133   private static final String SNAPSHOT_POOL_THREADS_KEY = "hbase.snapshot.master.threads";
134 
  /** Default number of threads used by the SnapshotManager thread pool */
136   private static final int SNAPSHOT_POOL_THREADS_DEFAULT = 1;
137 
138   private boolean stopped;
139   private MasterServices master;  // Needed by TableEventHandlers
140   private MetricsMaster metricsMaster;
141   private ProcedureCoordinator coordinator;
142 
143   // Is snapshot feature enabled?
144   private boolean isSnapshotSupported = false;
145 
146   // Snapshot handlers map, with table name as key.
147   // The map is always accessed and modified under the object lock using synchronized.
  // snapshotTable() will insert a handler in the table.
149   // isSnapshotDone() will remove the handler requested if the operation is finished.
150   private Map<TableName, SnapshotSentinel> snapshotHandlers =
151       new HashMap<TableName, SnapshotSentinel>();
152 
153   // Restore Sentinels map, with table name as key.
154   // The map is always accessed and modified under the object lock using synchronized.
  // restoreSnapshot()/cloneSnapshot() will insert a handler in the table.
156   // isRestoreDone() will remove the handler requested if the operation is finished.
157   private Map<TableName, SnapshotSentinel> restoreHandlers =
158       new HashMap<TableName, SnapshotSentinel>();
159 
160   private Path rootDir;
161   private ExecutorService executorService;
162 
163   /**
164    * Snapshot layout version to use when writing a new snapshot.
165    */
166   private int snapshotLayoutVersion = SnapshotDescriptionUtils.SNAPSHOT_LAYOUT_LATEST_FORMAT;
167 
168   public SnapshotManager() {}
169 
170   /**
171    * Fully specify all necessary components of a snapshot manager. Exposed for testing.
172    * @param master services for the master where the manager is running
173    * @param coordinator procedure coordinator instance.  exposed for testing.
174    * @param pool HBase ExecutorServcie instance, exposed for testing.
175    */
176   public SnapshotManager(final MasterServices master, final MetricsMaster metricsMaster,
177       ProcedureCoordinator coordinator, ExecutorService pool)
178       throws IOException, UnsupportedOperationException {
179     this.master = master;
180     this.metricsMaster = metricsMaster;
181 
182     this.rootDir = master.getMasterFileSystem().getRootDir();
183     checkSnapshotSupport(master.getConfiguration(), master.getMasterFileSystem());
184 
185     this.snapshotLayoutVersion = SnapshotDescriptionUtils.getDefaultSnapshotLayoutFormat(
186         master.getConfiguration());
187 
188     this.coordinator = coordinator;
189     this.executorService = pool;
190     resetTempDir();
191   }
192 
193   /**
194    * Gets the list of all completed snapshots.
195    * @return list of SnapshotDescriptions
196    * @throws IOException File system exception
197    */
198   public List<SnapshotDescription> getCompletedSnapshots() throws IOException {
199     return getCompletedSnapshots(SnapshotDescriptionUtils.getSnapshotsDir(rootDir));
200   }
201 
202   /**
203    * Gets the list of all completed snapshots.
204    * @param snapshotDir snapshot directory
205    * @return list of SnapshotDescriptions
206    * @throws IOException File system exception
207    */
208   private List<SnapshotDescription> getCompletedSnapshots(Path snapshotDir) throws IOException {
209     List<SnapshotDescription> snapshotDescs = new ArrayList<SnapshotDescription>();
210     // first create the snapshot root path and check to see if it exists
211     FileSystem fs = master.getMasterFileSystem().getFileSystem();
212     if (snapshotDir == null) snapshotDir = SnapshotDescriptionUtils.getSnapshotsDir(rootDir);
213 
214     // if there are no snapshots, return an empty list
215     if (!fs.exists(snapshotDir)) {
216       return snapshotDescs;
217     }
218 
219     // ignore all the snapshots in progress
220     FileStatus[] snapshots = fs.listStatus(snapshotDir,
221       new SnapshotDescriptionUtils.CompletedSnaphotDirectoriesFilter(fs));
222     // loop through all the completed snapshots
223     for (FileStatus snapshot : snapshots) {
224       Path info = new Path(snapshot.getPath(), SnapshotDescriptionUtils.SNAPSHOTINFO_FILE);
225       // if the snapshot is bad
226       if (!fs.exists(info)) {
227         LOG.error("Snapshot information for " + snapshot.getPath() + " doesn't exist");
228         continue;
229       }
230       FSDataInputStream in = null;
231       try {
232         in = fs.open(info);
233         SnapshotDescription desc = SnapshotDescription.parseFrom(in);
234         snapshotDescs.add(desc);
235       } catch (IOException e) {
236         LOG.warn("Found a corrupted snapshot " + snapshot.getPath(), e);
237       } finally {
238         if (in != null) {
239           in.close();
240         }
241       }
242     }
243     return snapshotDescs;
244   }
245 
246   /**
247    * Cleans up any snapshots in the snapshot/.tmp directory that were left from failed
248    * snapshot attempts.
249    *
250    * @throws IOException if we can't reach the filesystem
251    */
252   void resetTempDir() throws IOException {
253     // cleanup any existing snapshots.
254     Path tmpdir = SnapshotDescriptionUtils.getWorkingSnapshotDir(rootDir);
255     if (master.getMasterFileSystem().getFileSystem().exists(tmpdir)) {
256       if (!master.getMasterFileSystem().getFileSystem().delete(tmpdir, true)) {
257         LOG.warn("Couldn't delete working snapshot directory: " + tmpdir);
258       }
259     }
260   }
261 
262   /**
263    * Delete the specified snapshot
264    * @param snapshot
265    * @throws SnapshotDoesNotExistException If the specified snapshot does not exist.
266    * @throws IOException For filesystem IOExceptions
267    */
268   public void deleteSnapshot(SnapshotDescription snapshot) throws SnapshotDoesNotExistException, IOException {
269 
270     // call coproc pre hook
271     MasterCoprocessorHost cpHost = master.getCoprocessorHost();
272     if (cpHost != null) {
273       cpHost.preDeleteSnapshot(snapshot);
274     }
275 
276     // check to see if it is completed
277     if (!isSnapshotCompleted(snapshot)) {
278       throw new SnapshotDoesNotExistException(snapshot);
279     }
280 
281     String snapshotName = snapshot.getName();
282     LOG.debug("Deleting snapshot: " + snapshotName);
283     // first create the snapshot description and check to see if it exists
284     MasterFileSystem fs = master.getMasterFileSystem();
285     Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshotName, rootDir);
286 
287     // delete the existing snapshot
288     if (!fs.getFileSystem().delete(snapshotDir, true)) {
289       throw new HBaseSnapshotException("Failed to delete snapshot directory: " + snapshotDir);
290     }
291 
292     // call coproc post hook
293     if (cpHost != null) {
294       cpHost.postDeleteSnapshot(snapshot);
295     }
296 
297   }
298 
299   /**
300    * Check if the specified snapshot is done
301    *
302    * @param expected
303    * @return true if snapshot is ready to be restored, false if it is still being taken.
304    * @throws IOException IOException if error from HDFS or RPC
305    * @throws UnknownSnapshotException if snapshot is invalid or does not exist.
306    */
307   public boolean isSnapshotDone(SnapshotDescription expected) throws IOException {
308     // check the request to make sure it has a snapshot
309     if (expected == null) {
310       throw new UnknownSnapshotException(
311          "No snapshot name passed in request, can't figure out which snapshot you want to check.");
312     }
313 
314     String ssString = ClientSnapshotDescriptionUtils.toString(expected);
315 
316     // check to see if the sentinel exists,
317     // and if the task is complete removes it from the in-progress snapshots map.
318     SnapshotSentinel handler = removeSentinelIfFinished(this.snapshotHandlers, expected);
319 
320     // stop tracking "abandoned" handlers
321     cleanupSentinels();
322 
323     if (handler == null) {
324       // If there's no handler in the in-progress map, it means one of the following:
325       //   - someone has already requested the snapshot state
326       //   - the requested snapshot was completed long time ago (cleanupSentinels() timeout)
327       //   - the snapshot was never requested
328       // In those cases returns to the user the "done state" if the snapshots exists on disk,
329       // otherwise raise an exception saying that the snapshot is not running and doesn't exist.
330       if (!isSnapshotCompleted(expected)) {
331         throw new UnknownSnapshotException("Snapshot " + ssString
332             + " is not currently running or one of the known completed snapshots.");
333       }
334       // was done, return true;
335       return true;
336     }
337 
338     // pass on any failure we find in the sentinel
339     try {
340       handler.rethrowExceptionIfFailed();
341     } catch (ForeignException e) {
342       // Give some procedure info on an exception.
343       String status;
344       Procedure p = coordinator.getProcedure(expected.getName());
345       if (p != null) {
346         status = p.getStatus();
347       } else {
348         status = expected.getName() + " not found in proclist " + coordinator.getProcedureNames();
349       }
350       throw new HBaseSnapshotException("Snapshot " + ssString +  " had an error.  " + status, e,
351           expected);
352     }
353 
354     // check to see if we are done
355     if (handler.isFinished()) {
356       LOG.debug("Snapshot '" + ssString + "' has completed, notifying client.");
357       return true;
358     } else if (LOG.isDebugEnabled()) {
359       LOG.debug("Snapshoting '" + ssString + "' is still in progress!");
360     }
361     return false;
362   }
363 
364   /**
365    * Check to see if there is a snapshot in progress with the same name or on the same table.
366    * Currently we have a limitation only allowing a single snapshot per table at a time. Also we
367    * don't allow snapshot with the same name.
368    * @param snapshot description of the snapshot being checked.
369    * @return <tt>true</tt> if there is a snapshot in progress with the same name or on the same
370    *         table.
371    */
372   synchronized boolean isTakingSnapshot(final SnapshotDescription snapshot) {
373     TableName snapshotTable = TableName.valueOf(snapshot.getTable());
374     if (isTakingSnapshot(snapshotTable)) {
375       return true;
376     }
377     Iterator<Map.Entry<TableName, SnapshotSentinel>> it = this.snapshotHandlers.entrySet().iterator();
378     while (it.hasNext()) {
379       Map.Entry<TableName, SnapshotSentinel> entry = it.next();
380       SnapshotSentinel sentinel = entry.getValue();
381       if (snapshot.getName().equals(sentinel.getSnapshot().getName()) && !sentinel.isFinished()) {
382         return true;
383       }
384     }
385     return false;
386   }
387 
388   /**
389    * Check to see if the specified table has a snapshot in progress.  Currently we have a
390    * limitation only allowing a single snapshot per table at a time.
391    * @param tableName name of the table being snapshotted.
392    * @return <tt>true</tt> if there is a snapshot in progress on the specified table.
393    */
394   synchronized boolean isTakingSnapshot(final TableName tableName) {
395     SnapshotSentinel handler = this.snapshotHandlers.get(tableName);
396     return handler != null && !handler.isFinished();
397   }
398 
399   /**
400    * Check to make sure that we are OK to run the passed snapshot. Checks to make sure that we
401    * aren't already running a snapshot or restore on the requested table.
402    * @param snapshot description of the snapshot we want to start
403    * @throws HBaseSnapshotException if the filesystem could not be prepared to start the snapshot
404    */
405   private synchronized void prepareToTakeSnapshot(SnapshotDescription snapshot)
406       throws HBaseSnapshotException {
407     FileSystem fs = master.getMasterFileSystem().getFileSystem();
408     Path workingDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir);
409     TableName snapshotTable =
410         TableName.valueOf(snapshot.getTable());
411 
412     // make sure we aren't already running a snapshot
413     if (isTakingSnapshot(snapshot)) {
414       SnapshotSentinel handler = this.snapshotHandlers.get(snapshotTable);
415       throw new SnapshotCreationException("Rejected taking "
416           + ClientSnapshotDescriptionUtils.toString(snapshot)
417           + " because we are already running another snapshot "
418           + (handler != null ? ("on the same table " +
419               ClientSnapshotDescriptionUtils.toString(handler.getSnapshot()))
420               : "with the same name"), snapshot);
421     }
422 
423     // make sure we aren't running a restore on the same table
424     if (isRestoringTable(snapshotTable)) {
425       SnapshotSentinel handler = restoreHandlers.get(snapshotTable);
426       throw new SnapshotCreationException("Rejected taking "
427           + ClientSnapshotDescriptionUtils.toString(snapshot)
428           + " because we are already have a restore in progress on the same snapshot "
429           + ClientSnapshotDescriptionUtils.toString(handler.getSnapshot()), snapshot);
430     }
431 
432     try {
433       // delete the working directory, since we aren't running the snapshot. Likely leftovers
434       // from a failed attempt.
435       fs.delete(workingDir, true);
436 
437       // recreate the working directory for the snapshot
438       if (!fs.mkdirs(workingDir)) {
439         throw new SnapshotCreationException("Couldn't create working directory (" + workingDir
440             + ") for snapshot" , snapshot);
441       }
442     } catch (HBaseSnapshotException e) {
443       throw e;
444     } catch (IOException e) {
445       throw new SnapshotCreationException(
446           "Exception while checking to see if snapshot could be started.", e, snapshot);
447     }
448   }
449 
450   /**
451    * Take a snapshot of a disabled table.
452    * @param snapshot description of the snapshot to take. Modified to be {@link Type#DISABLED}.
453    * @throws HBaseSnapshotException if the snapshot could not be started
454    */
455   private synchronized void snapshotDisabledTable(SnapshotDescription snapshot)
456       throws HBaseSnapshotException {
457     // setup the snapshot
458     prepareToTakeSnapshot(snapshot);
459 
460     // set the snapshot to be a disabled snapshot, since the client doesn't know about that
461     snapshot = snapshot.toBuilder().setType(Type.DISABLED).build();
462 
463     // Take the snapshot of the disabled table
464     DisabledTableSnapshotHandler handler =
465         new DisabledTableSnapshotHandler(snapshot, master);
466     snapshotTable(snapshot, handler);
467   }
468 
469   /**
470    * Take a snapshot of an enabled table.
471    * @param snapshot description of the snapshot to take.
472    * @throws HBaseSnapshotException if the snapshot could not be started
473    */
474   private synchronized void snapshotEnabledTable(SnapshotDescription snapshot)
475       throws HBaseSnapshotException {
476     // setup the snapshot
477     prepareToTakeSnapshot(snapshot);
478 
479     // Take the snapshot of the enabled table
480     EnabledTableSnapshotHandler handler =
481         new EnabledTableSnapshotHandler(snapshot, master, this);
482     snapshotTable(snapshot, handler);
483   }
484 
485   /**
486    * Take a snapshot using the specified handler.
487    * On failure the snapshot temporary working directory is removed.
488    * NOTE: prepareToTakeSnapshot() called before this one takes care of the rejecting the
489    *       snapshot request if the table is busy with another snapshot/restore operation.
490    * @param snapshot the snapshot description
491    * @param handler the snapshot handler
492    */
493   private synchronized void snapshotTable(SnapshotDescription snapshot,
494       final TakeSnapshotHandler handler) throws HBaseSnapshotException {
495     try {
496       handler.prepare();
497       this.executorService.submit(handler);
498       this.snapshotHandlers.put(TableName.valueOf(snapshot.getTable()), handler);
499     } catch (Exception e) {
500       // cleanup the working directory by trying to delete it from the fs.
501       Path workingDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir);
502       try {
503         if (!this.master.getMasterFileSystem().getFileSystem().delete(workingDir, true)) {
504           LOG.error("Couldn't delete working directory (" + workingDir + " for snapshot:" +
505               ClientSnapshotDescriptionUtils.toString(snapshot));
506         }
507       } catch (IOException e1) {
508         LOG.error("Couldn't delete working directory (" + workingDir + " for snapshot:" +
509             ClientSnapshotDescriptionUtils.toString(snapshot));
510       }
511       // fail the snapshot
512       throw new SnapshotCreationException("Could not build snapshot handler", e, snapshot);
513     }
514   }
515 
516   /**
517    * Take a snapshot based on the enabled/disabled state of the table.
518    *
519    * @param snapshot
520    * @throws HBaseSnapshotException when a snapshot specific exception occurs.
521    * @throws IOException when some sort of generic IO exception occurs.
522    */
523   public void takeSnapshot(SnapshotDescription snapshot) throws IOException {
524     // check to see if we already completed the snapshot
525     if (isSnapshotCompleted(snapshot)) {
526       throw new SnapshotExistsException("Snapshot '" + snapshot.getName()
527           + "' already stored on the filesystem.", snapshot);
528     }
529 
530     LOG.debug("No existing snapshot, attempting snapshot...");
531 
532     // stop tracking "abandoned" handlers
533     cleanupSentinels();
534 
535     // check to see if the table exists
536     HTableDescriptor desc = null;
537     try {
538       desc = master.getTableDescriptors().get(
539           TableName.valueOf(snapshot.getTable()));
540     } catch (FileNotFoundException e) {
541       String msg = "Table:" + snapshot.getTable() + " info doesn't exist!";
542       LOG.error(msg);
543       throw new SnapshotCreationException(msg, e, snapshot);
544     } catch (IOException e) {
545       throw new SnapshotCreationException("Error while geting table description for table "
546           + snapshot.getTable(), e, snapshot);
547     }
548     if (desc == null) {
549       throw new SnapshotCreationException("Table '" + snapshot.getTable()
550           + "' doesn't exist, can't take snapshot.", snapshot);
551     }
552 
553     // if not specified, set the snapshot format
554     if (!snapshot.hasVersion()) {
555       snapshot = snapshot.toBuilder()
556           .setVersion(snapshotLayoutVersion)
557           .build();
558     }
559 
560     // call pre coproc hook
561     MasterCoprocessorHost cpHost = master.getCoprocessorHost();
562     if (cpHost != null) {
563       cpHost.preSnapshot(snapshot, desc);
564     }
565 
566     // if the table is enabled, then have the RS run actually the snapshot work
567     TableName snapshotTable = TableName.valueOf(snapshot.getTable());
568     AssignmentManager assignmentMgr = master.getAssignmentManager();
569     if (assignmentMgr.getZKTable().isEnabledTable(snapshotTable)) {
570       LOG.debug("Table enabled, starting distributed snapshot.");
571       snapshotEnabledTable(snapshot);
572       LOG.debug("Started snapshot: " + ClientSnapshotDescriptionUtils.toString(snapshot));
573     }
574     // For disabled table, snapshot is created by the master
575     else if (assignmentMgr.getZKTable().isDisabledTable(snapshotTable)) {
576       LOG.debug("Table is disabled, running snapshot entirely on master.");
577       snapshotDisabledTable(snapshot);
578       LOG.debug("Started snapshot: " + ClientSnapshotDescriptionUtils.toString(snapshot));
579     } else {
580       LOG.error("Can't snapshot table '" + snapshot.getTable()
581           + "', isn't open or closed, we don't know what to do!");
582       TablePartiallyOpenException tpoe = new TablePartiallyOpenException(snapshot.getTable()
583           + " isn't fully open.");
584       throw new SnapshotCreationException("Table is not entirely open or closed", tpoe, snapshot);
585     }
586 
587     // call post coproc hook
588     if (cpHost != null) {
589       cpHost.postSnapshot(snapshot, desc);
590     }
591   }
592 
593   /**
594    * Set the handler for the current snapshot
595    * <p>
596    * Exposed for TESTING
597    * @param tableName
598    * @param handler handler the master should use
599    *
600    * TODO get rid of this if possible, repackaging, modify tests.
601    */
602   public synchronized void setSnapshotHandlerForTesting(
603       final TableName tableName,
604       final SnapshotSentinel handler) {
605     if (handler != null) {
606       this.snapshotHandlers.put(tableName, handler);
607     } else {
608       this.snapshotHandlers.remove(tableName);
609     }
610   }
611 
612   /**
613    * @return distributed commit coordinator for all running snapshots
614    */
615   ProcedureCoordinator getCoordinator() {
616     return coordinator;
617   }
618 
619   /**
620    * Check to see if the snapshot is one of the currently completed snapshots
621    * Returns true if the snapshot exists in the "completed snapshots folder".
622    *
623    * @param snapshot expected snapshot to check
624    * @return <tt>true</tt> if the snapshot is stored on the {@link FileSystem}, <tt>false</tt> if is
625    *         not stored
626    * @throws IOException if the filesystem throws an unexpected exception,
627    * @throws IllegalArgumentException if snapshot name is invalid.
628    */
629   private boolean isSnapshotCompleted(SnapshotDescription snapshot) throws IOException {
630     try {
631       final Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshot, rootDir);
632       FileSystem fs = master.getMasterFileSystem().getFileSystem();
633       // check to see if the snapshot already exists
634       return fs.exists(snapshotDir);
635     } catch (IllegalArgumentException iae) {
636       throw new UnknownSnapshotException("Unexpected exception thrown", iae);
637     }
638   }
639 
640   /**
641    * Clone the specified snapshot into a new table.
642    * The operation will fail if the destination table has a snapshot or restore in progress.
643    *
644    * @param snapshot Snapshot Descriptor
645    * @param hTableDescriptor Table Descriptor of the table to create
646    */
647   synchronized void cloneSnapshot(final SnapshotDescription snapshot,
648       final HTableDescriptor hTableDescriptor) throws HBaseSnapshotException {
649     TableName tableName = hTableDescriptor.getTableName();
650 
651     // make sure we aren't running a snapshot on the same table
652     if (isTakingSnapshot(tableName)) {
653       throw new RestoreSnapshotException("Snapshot in progress on the restore table=" + tableName);
654     }
655 
656     // make sure we aren't running a restore on the same table
657     if (isRestoringTable(tableName)) {
658       throw new RestoreSnapshotException("Restore already in progress on the table=" + tableName);
659     }
660 
661     try {
662       CloneSnapshotHandler handler =
663         new CloneSnapshotHandler(master, snapshot, hTableDescriptor).prepare();
664       this.executorService.submit(handler);
665       this.restoreHandlers.put(tableName, handler);
666     } catch (Exception e) {
667       String msg = "Couldn't clone the snapshot=" + ClientSnapshotDescriptionUtils.toString(snapshot) +
668         " on table=" + tableName;
669       LOG.error(msg, e);
670       throw new RestoreSnapshotException(msg, e);
671     }
672   }
673 
674   /**
675    * Restore the specified snapshot
676    * @param reqSnapshot
677    * @throws IOException
678    */
679   public void restoreSnapshot(SnapshotDescription reqSnapshot) throws IOException {
680     FileSystem fs = master.getMasterFileSystem().getFileSystem();
681     Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(reqSnapshot, rootDir);
682     MasterCoprocessorHost cpHost = master.getCoprocessorHost();
683 
684     // check if the snapshot exists
685     if (!fs.exists(snapshotDir)) {
686       LOG.error("A Snapshot named '" + reqSnapshot.getName() + "' does not exist.");
687       throw new SnapshotDoesNotExistException(reqSnapshot);
688     }
689 
690     // read snapshot information
691     SnapshotDescription fsSnapshot = SnapshotDescriptionUtils.readSnapshotInfo(fs, snapshotDir);
692     SnapshotManifest manifest = SnapshotManifest.open(master.getConfiguration(), fs,
693         snapshotDir, fsSnapshot);
694     HTableDescriptor snapshotTableDesc = manifest.getTableDescriptor();
695     TableName tableName = TableName.valueOf(reqSnapshot.getTable());
696 
697     // stop tracking "abandoned" handlers
698     cleanupSentinels();
699 
700     // Verify snapshot validity
701     SnapshotReferenceUtil.verifySnapshot(master.getConfiguration(), fs, manifest);
702 
703     // Execute the restore/clone operation
704     if (MetaReader.tableExists(master.getCatalogTracker(), tableName)) {
705       if (master.getAssignmentManager().getZKTable().isEnabledTable(
706           TableName.valueOf(fsSnapshot.getTable()))) {
707         throw new UnsupportedOperationException("Table '" +
708             TableName.valueOf(fsSnapshot.getTable()) + "' must be disabled in order to " +
709             "perform a restore operation" +
710             ".");
711       }
712 
713       // call coproc pre hook
714       if (cpHost != null) {
715         cpHost.preRestoreSnapshot(reqSnapshot, snapshotTableDesc);
716       }
717       restoreSnapshot(fsSnapshot, snapshotTableDesc);
718       LOG.info("Restore snapshot=" + fsSnapshot.getName() + " as table=" + tableName);
719 
720       if (cpHost != null) {
721         cpHost.postRestoreSnapshot(reqSnapshot, snapshotTableDesc);
722       }
723     } else {
724       HTableDescriptor htd = RestoreSnapshotHelper.cloneTableSchema(snapshotTableDesc, tableName);
725       if (cpHost != null) {
726         cpHost.preCloneSnapshot(reqSnapshot, htd);
727       }
728       cloneSnapshot(fsSnapshot, htd);
729       LOG.info("Clone snapshot=" + fsSnapshot.getName() + " as table=" + tableName);
730 
731       if (cpHost != null) {
732         cpHost.postCloneSnapshot(reqSnapshot, htd);
733       }
734     }
735   }
736 
737   /**
738    * Restore the specified snapshot.
739    * The restore will fail if the destination table has a snapshot or restore in progress.
740    *
741    * @param snapshot Snapshot Descriptor
742    * @param hTableDescriptor Table Descriptor
743    */
744   private synchronized void restoreSnapshot(final SnapshotDescription snapshot,
745       final HTableDescriptor hTableDescriptor) throws HBaseSnapshotException {
746     TableName tableName = hTableDescriptor.getTableName();
747 
748     // make sure we aren't running a snapshot on the same table
749     if (isTakingSnapshot(tableName)) {
750       throw new RestoreSnapshotException("Snapshot in progress on the restore table=" + tableName);
751     }
752 
753     // make sure we aren't running a restore on the same table
754     if (isRestoringTable(tableName)) {
755       throw new RestoreSnapshotException("Restore already in progress on the table=" + tableName);
756     }
757 
758     try {
759       RestoreSnapshotHandler handler =
760         new RestoreSnapshotHandler(master, snapshot, hTableDescriptor).prepare();
761       this.executorService.submit(handler);
762       restoreHandlers.put(tableName, handler);
763     } catch (Exception e) {
764       String msg = "Couldn't restore the snapshot=" + ClientSnapshotDescriptionUtils.toString(
765           snapshot)  +
766           " on table=" + tableName;
767       LOG.error(msg, e);
768       throw new RestoreSnapshotException(msg, e);
769     }
770   }
771 
772   /**
773    * Verify if the restore of the specified table is in progress.
774    *
775    * @param tableName table under restore
776    * @return <tt>true</tt> if there is a restore in progress of the specified table.
777    */
778   private synchronized boolean isRestoringTable(final TableName tableName) {
779     SnapshotSentinel sentinel = this.restoreHandlers.get(tableName);
780     return(sentinel != null && !sentinel.isFinished());
781   }
782 
783   /**
784    * Returns the status of a restore operation.
785    * If the in-progress restore is failed throws the exception that caused the failure.
786    *
787    * @param snapshot
788    * @return false if in progress, true if restore is completed or not requested.
789    * @throws IOException if there was a failure during the restore
790    */
791   public boolean isRestoreDone(final SnapshotDescription snapshot) throws IOException {
792     // check to see if the sentinel exists,
793     // and if the task is complete removes it from the in-progress restore map.
794     SnapshotSentinel sentinel = removeSentinelIfFinished(this.restoreHandlers, snapshot);
795 
796     // stop tracking "abandoned" handlers
797     cleanupSentinels();
798 
799     if (sentinel == null) {
800       // there is no sentinel so restore is not in progress.
801       return true;
802     }
803 
804     LOG.debug("Verify snapshot=" + snapshot.getName() + " against="
805         + sentinel.getSnapshot().getName() + " table=" +
806         TableName.valueOf(snapshot.getTable()));
807 
808     // If the restore is failed, rethrow the exception
809     sentinel.rethrowExceptionIfFailed();
810 
811     // check to see if we are done
812     if (sentinel.isFinished()) {
813       LOG.debug("Restore snapshot=" + ClientSnapshotDescriptionUtils.toString(snapshot) +
814           " has completed. Notifying the client.");
815       return true;
816     }
817 
818     if (LOG.isDebugEnabled()) {
819       LOG.debug("Sentinel is not yet finished with restoring snapshot=" +
820           ClientSnapshotDescriptionUtils.toString(snapshot));
821     }
822     return false;
823   }
824 
825   /**
826    * Return the handler if it is currently live and has the same snapshot target name.
827    * The handler is removed from the sentinels map if completed.
828    * @param sentinels live handlers
829    * @param snapshot snapshot description
830    * @return null if doesn't match, else a live handler.
831    */
832   private synchronized SnapshotSentinel removeSentinelIfFinished(
833       final Map<TableName, SnapshotSentinel> sentinels,
834       final SnapshotDescription snapshot) {
835     if (!snapshot.hasTable()) {
836       return null;
837     }
838 
839     TableName snapshotTable = TableName.valueOf(snapshot.getTable());
840     SnapshotSentinel h = sentinels.get(snapshotTable);
841     if (h == null) {
842       return null;
843     }
844 
845     if (!h.getSnapshot().getName().equals(snapshot.getName())) {
846       // specified snapshot is to the one currently running
847       return null;
848     }
849 
850     // Remove from the "in-progress" list once completed
851     if (h.isFinished()) {
852       sentinels.remove(snapshotTable);
853     }
854 
855     return h;
856   }
857 
858   /**
859    * Removes "abandoned" snapshot/restore requests.
860    * As part of the HBaseAdmin snapshot/restore API the operation status is checked until completed,
861    * and the in-progress maps are cleaned up when the status of a completed task is requested.
862    * To avoid having sentinels staying around for long time if something client side is failed,
863    * each operation tries to clean up the in-progress maps sentinels finished from a long time.
864    */
865   private void cleanupSentinels() {
866     cleanupSentinels(this.snapshotHandlers);
867     cleanupSentinels(this.restoreHandlers);
868   }
869 
870   /**
871    * Remove the sentinels that are marked as finished and the completion time
872    * has exceeded the removal timeout.
873    * @param sentinels map of sentinels to clean
874    */
875   private synchronized void cleanupSentinels(final Map<TableName, SnapshotSentinel> sentinels) {
876     long currentTime = EnvironmentEdgeManager.currentTimeMillis();
877     Iterator<Map.Entry<TableName, SnapshotSentinel>> it =
878         sentinels.entrySet().iterator();
879     while (it.hasNext()) {
880       Map.Entry<TableName, SnapshotSentinel> entry = it.next();
881       SnapshotSentinel sentinel = entry.getValue();
882       if (sentinel.isFinished() &&
883           (currentTime - sentinel.getCompletionTimestamp()) > SNAPSHOT_SENTINELS_CLEANUP_TIMEOUT)
884       {
885         it.remove();
886       }
887     }
888   }
889 
890   //
891   // Implementing Stoppable interface
892   //
893 
894   @Override
895   public void stop(String why) {
896     // short circuit
897     if (this.stopped) return;
898     // make sure we get stop
899     this.stopped = true;
900     // pass the stop onto take snapshot handlers
901     for (SnapshotSentinel snapshotHandler: this.snapshotHandlers.values()) {
902       snapshotHandler.cancel(why);
903     }
904 
905     // pass the stop onto all the restore handlers
906     for (SnapshotSentinel restoreHandler: this.restoreHandlers.values()) {
907       restoreHandler.cancel(why);
908     }
909     try {
910       coordinator.close();
911     } catch (IOException e) {
912       LOG.error("stop ProcedureCoordinator error", e);
913     }
914   }
915 
916   @Override
917   public boolean isStopped() {
918     return this.stopped;
919   }
920 
921   /**
922    * Throws an exception if snapshot operations (take a snapshot, restore, clone) are not supported.
923    * Called at the beginning of snapshot() and restoreSnapshot() methods.
924    * @throws UnsupportedOperationException if snapshot are not supported
925    */
926   public void checkSnapshotSupport() throws UnsupportedOperationException {
927     if (!this.isSnapshotSupported) {
928       throw new UnsupportedOperationException(
929         "To use snapshots, You must add to the hbase-site.xml of the HBase Master: '" +
930           HBASE_SNAPSHOT_ENABLED + "' property with value 'true'.");
931     }
932   }
933 
934   /**
935    * Called at startup, to verify if snapshot operation is supported, and to avoid
936    * starting the master if there're snapshots present but the cleaners needed are missing.
937    * Otherwise we can end up with snapshot data loss.
938    * @param conf The {@link Configuration} object to use
939    * @param mfs The MasterFileSystem to use
940    * @throws IOException in case of file-system operation failure
941    * @throws UnsupportedOperationException in case cleaners are missing and
942    *         there're snapshot in the system
943    */
944   private void checkSnapshotSupport(final Configuration conf, final MasterFileSystem mfs)
945       throws IOException, UnsupportedOperationException {
946     // Verify if snapshot is disabled by the user
947     String enabled = conf.get(HBASE_SNAPSHOT_ENABLED);
948     boolean snapshotEnabled = conf.getBoolean(HBASE_SNAPSHOT_ENABLED, false);
949     boolean userDisabled = (enabled != null && enabled.trim().length() > 0 && !snapshotEnabled);
950 
951     // Extract cleaners from conf
952     Set<String> hfileCleaners = new HashSet<String>();
953     String[] cleaners = conf.getStrings(HFileCleaner.MASTER_HFILE_CLEANER_PLUGINS);
954     if (cleaners != null) Collections.addAll(hfileCleaners, cleaners);
955 
956     Set<String> logCleaners = new HashSet<String>();
957     cleaners = conf.getStrings(HConstants.HBASE_MASTER_LOGCLEANER_PLUGINS);
958     if (cleaners != null) Collections.addAll(logCleaners, cleaners);
959 
960     // check if an older version of snapshot directory was present
961     Path oldSnapshotDir = new Path(mfs.getRootDir(), HConstants.OLD_SNAPSHOT_DIR_NAME);
962     FileSystem fs = mfs.getFileSystem();
963     List<SnapshotDescription> ss = getCompletedSnapshots(new Path(rootDir, oldSnapshotDir));
964     if (ss != null && !ss.isEmpty()) {
965       LOG.error("Snapshots from an earlier release were found under: " + oldSnapshotDir);
966       LOG.error("Please rename the directory as " + HConstants.SNAPSHOT_DIR_NAME);
967     }
968 
969     // If the user has enabled the snapshot, we force the cleaners to be present
970     // otherwise we still need to check if cleaners are enabled or not and verify
971     // that there're no snapshot in the .snapshot folder.
972     if (snapshotEnabled) {
973       // Inject snapshot cleaners, if snapshot.enable is true
974       hfileCleaners.add(SnapshotHFileCleaner.class.getName());
975       hfileCleaners.add(HFileLinkCleaner.class.getName());
976       logCleaners.add(SnapshotLogCleaner.class.getName());
977 
978       // Set cleaners conf
979       conf.setStrings(HFileCleaner.MASTER_HFILE_CLEANER_PLUGINS,
980         hfileCleaners.toArray(new String[hfileCleaners.size()]));
981       conf.setStrings(HConstants.HBASE_MASTER_LOGCLEANER_PLUGINS,
982         logCleaners.toArray(new String[logCleaners.size()]));
983     } else {
984       // Verify if cleaners are present
985       snapshotEnabled = logCleaners.contains(SnapshotLogCleaner.class.getName()) &&
986         hfileCleaners.contains(SnapshotHFileCleaner.class.getName()) &&
987         hfileCleaners.contains(HFileLinkCleaner.class.getName());
988 
989       // Warn if the cleaners are enabled but the snapshot.enabled property is false/not set.
990       if (snapshotEnabled) {
991         LOG.warn("Snapshot log and hfile cleaners are present in the configuration, " +
992           "but the '" + HBASE_SNAPSHOT_ENABLED + "' property " +
993           (userDisabled ? "is set to 'false'." : "is not set."));
994       }
995     }
996 
997     // Mark snapshot feature as enabled if cleaners are present and user has not disabled it.
998     this.isSnapshotSupported = snapshotEnabled && !userDisabled;
999 
1000     // If cleaners are not enabled, verify that there're no snapshot in the .snapshot folder
1001     // otherwise we end up with snapshot data loss.
1002     if (!snapshotEnabled) {
1003       LOG.info("Snapshot feature is not enabled, missing log and hfile cleaners.");
1004       Path snapshotDir = SnapshotDescriptionUtils.getSnapshotsDir(mfs.getRootDir());
1005       if (fs.exists(snapshotDir)) {
1006         FileStatus[] snapshots = FSUtils.listStatus(fs, snapshotDir,
1007           new SnapshotDescriptionUtils.CompletedSnaphotDirectoriesFilter(fs));
1008         if (snapshots != null) {
1009           LOG.error("Snapshots are present, but cleaners are not enabled.");
1010           checkSnapshotSupport();
1011         }
1012       }
1013     }
1014   }
1015 
1016   @Override
1017   public void initialize(MasterServices master, MetricsMaster metricsMaster) throws KeeperException,
1018       IOException, UnsupportedOperationException {
1019     this.master = master;
1020     this.metricsMaster = metricsMaster;
1021 
1022     this.rootDir = master.getMasterFileSystem().getRootDir();
1023     checkSnapshotSupport(master.getConfiguration(), master.getMasterFileSystem());
1024 
1025     // get the configuration for the coordinator
1026     Configuration conf = master.getConfiguration();
1027     long wakeFrequency = conf.getInt(SNAPSHOT_WAKE_MILLIS_KEY, SNAPSHOT_WAKE_MILLIS_DEFAULT);
1028     long timeoutMillis = conf.getLong(SNAPSHOT_TIMEOUT_MILLIS_KEY, SNAPSHOT_TIMEOUT_MILLIS_DEFAULT);
1029     int opThreads = conf.getInt(SNAPSHOT_POOL_THREADS_KEY, SNAPSHOT_POOL_THREADS_DEFAULT);
1030 
1031     // setup the default procedure coordinator
1032     String name = master.getServerName().toString();
1033     ThreadPoolExecutor tpool = ProcedureCoordinator.defaultPool(name, opThreads);
1034     ProcedureCoordinatorRpcs comms = new ZKProcedureCoordinatorRpcs(
1035         master.getZooKeeper(), SnapshotManager.ONLINE_SNAPSHOT_CONTROLLER_DESCRIPTION, name);
1036 
1037     this.coordinator = new ProcedureCoordinator(comms, tpool, timeoutMillis, wakeFrequency);
1038     this.executorService = master.getExecutorService();
1039     resetTempDir();
1040   }
1041 
1042   @Override
1043   public String getProcedureSignature() {
1044     return ONLINE_SNAPSHOT_CONTROLLER_DESCRIPTION;
1045   }
1046 
1047   @Override
1048   public void execProcedure(ProcedureDescription desc) throws IOException {
1049     takeSnapshot(toSnapshotDescription(desc));
1050   }
1051 
1052   @Override
1053   public boolean isProcedureDone(ProcedureDescription desc) throws IOException {
1054     return isSnapshotDone(toSnapshotDescription(desc));
1055   }
1056 
1057   private SnapshotDescription toSnapshotDescription(ProcedureDescription desc)
1058       throws IOException {
1059     SnapshotDescription.Builder builder = SnapshotDescription.newBuilder();
1060     if (!desc.hasInstance()) {
1061       throw new IOException("Snapshot name is not defined: " + desc.toString());
1062     }
1063     String snapshotName = desc.getInstance();
1064     List<NameStringPair> props = desc.getConfigurationList();
1065     String table = null;
1066     for (NameStringPair prop : props) {
1067       if ("table".equalsIgnoreCase(prop.getName())) {
1068         table = prop.getValue();
1069       }
1070     }
1071     if (table == null) {
1072       throw new IOException("Snapshot table is not defined: " + desc.toString());
1073     }
1074     TableName tableName = TableName.valueOf(table);
1075     builder.setTable(tableName.getNameAsString());
1076     builder.setName(snapshotName);
1077     builder.setType(SnapshotDescription.Type.FLUSH);
1078     return builder.build();
1079   }
1080 }