/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.classification.InterfaceStability;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Tool used to copy a table to another one, which can be on a different cluster.
 * It is also configurable with a start and end time, as well as a specification
 * of the region server implementation if different from the local cluster.
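 * <p>
 * For example, to copy 'TestTable' to a peer cluster over a one-hour time
 * window (the same invocation shown in this tool's usage text):
 * <pre>
 * $ bin/hbase org.apache.hadoop.hbase.mapreduce.CopyTable \
 *     --starttime=1265875194289 --endtime=1265878794289 \
 *     --peer.adr=server1,server2,server3:2181:/hbase \
 *     --families=myOldCf:myNewCf,cf2,cf3 TestTable
 * </pre>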
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class CopyTable extends Configured implements Tool {
  private static final Log LOG = LogFactory.getLog(CopyTable.class);

  final static String NAME = "copytable";
  long startTime = 0;
  long endTime = HConstants.LATEST_TIMESTAMP;
  int batch = Integer.MAX_VALUE;
  int cacheRow = -1;
  int versions = -1;
  String tableName = null;
  String startRow = null;
  String stopRow = null;
  String dstTableName = null;
  String peerAddress = null;
  String families = null;
  boolean allCells = false;

  boolean bulkload = false;
  Path bulkloadDir = null;

  private final static String JOB_NAME_CONF_KEY = "mapreduce.job.name";

  // The following variables are introduced to preserve binary compatibility with 0.98.
  // Please see HBASE-12836 for further details.
  @Deprecated
  static long startTime_ = 0;
  @Deprecated
  static long endTime_ = 0;
  @Deprecated
  static int versions_ = -1;
  @Deprecated
  static String tableName_ = null;
  @Deprecated
  static String startRow_ = null;
  @Deprecated
  static String stopRow_ = null;
  @Deprecated
  static String newTableName_ = null;
  @Deprecated
  static String peerAddress_ = null;
  @Deprecated
  static String families_ = null;
  @Deprecated
  static boolean allCells_ = false;

  public CopyTable(Configuration conf) {
    super(conf);
  }

  /**
   * Sets up the actual job.
   *
   * @param conf The current configuration.
   * @param args The command line parameters.
   * @return The newly created job.
   * @throws IOException When setting up the job fails.
   * @deprecated Use {@link #createSubmittableJob(String[])} instead
   */
  @Deprecated
  public static Job createSubmittableJob(Configuration conf, String[] args)
      throws IOException {
    if (!deprecatedDoCommandLine(args)) {
      return null;
    }
    Job job = new Job(conf, NAME + "_" + tableName_);
    job.setJarByClass(CopyTable.class);
    Scan scan = new Scan();
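    // A full-table MapReduce scan should not churn the region server block
    // cache, so block caching is disabled for this scan.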
    scan.setCacheBlocks(false);
    if (startTime_ != 0) {
      scan.setTimeRange(startTime_,
          endTime_ == 0 ? HConstants.LATEST_TIMESTAMP : endTime_);
    }
    if (allCells_) {
      scan.setRaw(true);
    }
    if (versions_ >= 0) {
      scan.setMaxVersions(versions_);
    }
    if (startRow_ != null) {
      scan.setStartRow(Bytes.toBytes(startRow_));
    }
    if (stopRow_ != null) {
      scan.setStopRow(Bytes.toBytes(stopRow_));
    }
    if (families_ != null) {
      String[] fams = families_.split(",");
      Map<String, String> cfRenameMap = new HashMap<String, String>();
      for (String fam : fams) {
        String sourceCf;
        if (fam.contains(":")) {
          // fam looks like "sourceCfName:destCfName"
          String[] srcAndDest = fam.split(":", 2);
          sourceCf = srcAndDest[0];
          String destCf = srcAndDest[1];
          cfRenameMap.put(sourceCf, destCf);
        } else {
          // fam is just "sourceCf"
          sourceCf = fam;
        }
        scan.addFamily(Bytes.toBytes(sourceCf));
      }
      Import.configureCfRenaming(job.getConfiguration(), cfRenameMap);
    }
    TableMapReduceUtil.initTableMapperJob(tableName_, scan,
        Import.Importer.class, null, null, job);
    TableMapReduceUtil.initTableReducerJob(
        newTableName_ == null ? tableName_ : newTableName_, null, job,
        null, peerAddress_, null, null);
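    // initTableReducerJob wires up TableOutputFormat; with zero reduce tasks
    // the mappers write directly to the (possibly remote) destination table.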
    job.setNumReduceTasks(0);
    return job;
  }

  private static boolean deprecatedDoCommandLine(final String[] args) {
    // Process command-line args. TODO: Better cmd-line processing
    // (but hopefully something not as painful as cli options).
    if (args.length < 1) {
      printUsage(null);
      return false;
    }
    try {
      for (int i = 0; i < args.length; i++) {
        String cmd = args[i];
        if (cmd.equals("-h") || cmd.startsWith("--h")) {
          printUsage(null);
          return false;
        }
        final String startRowArgKey = "--startrow=";
        if (cmd.startsWith(startRowArgKey)) {
          startRow_ = cmd.substring(startRowArgKey.length());
          continue;
        }
        final String stopRowArgKey = "--stoprow=";
        if (cmd.startsWith(stopRowArgKey)) {
          stopRow_ = cmd.substring(stopRowArgKey.length());
          continue;
        }
        final String startTimeArgKey = "--starttime=";
        if (cmd.startsWith(startTimeArgKey)) {
          startTime_ = Long.parseLong(cmd.substring(startTimeArgKey.length()));
          continue;
        }
        final String endTimeArgKey = "--endtime=";
        if (cmd.startsWith(endTimeArgKey)) {
          endTime_ = Long.parseLong(cmd.substring(endTimeArgKey.length()));
          continue;
        }
        final String versionsArgKey = "--versions=";
        if (cmd.startsWith(versionsArgKey)) {
          versions_ = Integer.parseInt(cmd.substring(versionsArgKey.length()));
          continue;
        }
        final String newNameArgKey = "--new.name=";
        if (cmd.startsWith(newNameArgKey)) {
          newTableName_ = cmd.substring(newNameArgKey.length());
          continue;
        }
        final String peerAdrArgKey = "--peer.adr=";
        if (cmd.startsWith(peerAdrArgKey)) {
          peerAddress_ = cmd.substring(peerAdrArgKey.length());
          continue;
        }
        final String familiesArgKey = "--families=";
        if (cmd.startsWith(familiesArgKey)) {
          families_ = cmd.substring(familiesArgKey.length());
          continue;
        }
        if (cmd.startsWith("--all.cells")) {
          allCells_ = true;
          continue;
        }
        if (i == args.length - 1) {
          tableName_ = cmd;
        } else {
          printUsage("Invalid argument '" + cmd + "'");
          return false;
        }
      }
      if (newTableName_ == null && peerAddress_ == null) {
        printUsage("At least a new table name or a " +
            "peer address must be specified");
        return false;
      }
      if ((endTime_ != 0) && (startTime_ > endTime_)) {
        printUsage("Invalid time range filter: starttime=" + startTime_ + " > endtime="
            + endTime_);
        return false;
      }
    } catch (Exception e) {
      e.printStackTrace();
      printUsage("Can't start because " + e.getMessage());
      return false;
    }
    return true;
  }

  /**
   * Sets up the actual job.
   *
   * @param args The command line parameters.
   * @return The newly created job.
   * @throws IOException When setting up the job fails.
   */
  public Job createSubmittableJob(String[] args)
      throws IOException {
    if (!doCommandLine(args)) {
      return null;
    }

    Job job = Job.getInstance(getConf(), getConf().get(JOB_NAME_CONF_KEY, NAME + "_" + tableName));
    job.setJarByClass(CopyTable.class);
    Scan scan = new Scan();
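    // batch caps the number of cells returned per scanner next() call; it
    // defaults to Integer.MAX_VALUE, i.e. effectively unlimited.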
    scan.setBatch(batch);
    scan.setCacheBlocks(false);

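    // Row caching: prefer an explicit --cacheRow value, otherwise fall back to
    // the configured scanner caching, defaulting to 100 rows per RPC.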
    if (cacheRow > 0) {
      scan.setCaching(cacheRow);
    } else {
      scan.setCaching(getConf().getInt(HConstants.HBASE_CLIENT_SCANNER_CACHING, 100));
    }

    scan.setTimeRange(startTime, endTime);
    if (allCells) {
      scan.setRaw(true);
    }
    if (versions >= 0) {
      scan.setMaxVersions(versions);
    }

    if (startRow != null) {
      scan.setStartRow(Bytes.toBytes(startRow));
    }

    if (stopRow != null) {
      scan.setStopRow(Bytes.toBytes(stopRow));
    }

    if (families != null) {
      String[] fams = families.split(",");
      Map<String, String> cfRenameMap = new HashMap<String, String>();
      for (String fam : fams) {
        String sourceCf;
        if (fam.contains(":")) {
          // fam looks like "sourceCfName:destCfName"
          String[] srcAndDest = fam.split(":", 2);
          sourceCf = srcAndDest[0];
          String destCf = srcAndDest[1];
          cfRenameMap.put(sourceCf, destCf);
        } else {
          // fam is just "sourceCf"
          sourceCf = fam;
        }
        scan.addFamily(Bytes.toBytes(sourceCf));
      }
      Import.configureCfRenaming(job.getConfiguration(), cfRenameMap);
    }
    job.setNumReduceTasks(0);

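    // Two output modes: with --bulkload the mappers write HFiles that are bulk
    // loaded afterwards; otherwise they write directly to the destination
    // table (optionally on a peer cluster).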
    if (bulkload) {
      TableMapReduceUtil.initTableMapperJob(tableName, scan, Import.KeyValueImporter.class, null,
        null, job);

      // We need to split the inputs by destination tables so that output of Map can be bulk-loaded.
      TableInputFormat.configureSplitTable(job, TableName.valueOf(dstTableName));

      FileSystem fs = FileSystem.get(getConf());
      Random rand = new Random();
      Path root = new Path(fs.getWorkingDirectory(), "copytable");
      fs.mkdirs(root);
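      // Pick a random subdirectory under "copytable" that does not exist yet
      // as the staging area for the generated HFiles.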
      while (true) {
        bulkloadDir = new Path(root, "" + rand.nextLong());
        if (!fs.exists(bulkloadDir)) {
          break;
        }
      }

      System.out.println("HFiles will be stored at " + this.bulkloadDir);
      HFileOutputFormat2.setOutputPath(job, bulkloadDir);
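      // Configure the HFile writers from the destination table's descriptor so
      // per-family settings such as compression, bloom filter type and block
      // size match the table's schema.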
      HTable htable = new HTable(getConf(), TableName.valueOf(dstTableName));
      try {
        HFileOutputFormat2.configureIncrementalLoadMap(job, htable);
      } finally {
        htable.close();
      }
    } else {
      TableMapReduceUtil.initTableMapperJob(tableName, scan,
        Import.Importer.class, null, null, job);

      TableMapReduceUtil.initTableReducerJob(dstTableName, null, job, null, peerAddress, null,
        null);
    }

    return job;
  }

  /*
   * @param errorMsg Error message. Can be null.
   */
  private static void printUsage(final String errorMsg) {
    if (errorMsg != null && errorMsg.length() > 0) {
      System.err.println("ERROR: " + errorMsg);
    }
    System.err.println("Usage: CopyTable [general options] [--starttime=X] [--endtime=Y] " +
        "[--new.name=NEW] [--peer.adr=ADR] <tablename>");
    System.err.println();
    System.err.println("Options:");
    System.err.println(" rs.class     hbase.regionserver.class of the peer cluster");
    System.err.println("              specify if different from current cluster");
    System.err.println(" rs.impl      hbase.regionserver.impl of the peer cluster");
    System.err.println(" startrow     the start row");
    System.err.println(" stoprow      the stop row");
    System.err.println(" starttime    beginning of the time range (unixtime in millis)");
    System.err.println("              without endtime means from starttime to forever");
    System.err.println(" endtime      end of the time range. Ignored if no starttime specified.");
    System.err.println(" versions     number of cell versions to copy");
    System.err.println(" batch        number of cells per scanner batch");
    System.err.println(" cacheRow     number of rows for scanner caching");
    System.err.println(" new.name     new table's name");
    System.err.println(" peer.adr     Address of the peer cluster given in the format");
    System.err.println("              hbase.zookeeper.quorum:hbase.zookeeper.client.port:zookeeper.znode.parent");
    System.err.println(" families     comma-separated list of families to copy");
    System.err.println("              To copy from cf1 to cf2, give sourceCfName:destCfName.");
    System.err.println("              To keep the same name, just give \"cfName\"");
    System.err.println(" all.cells    also copy delete markers and deleted cells");
    System.err.println(" bulkload     Write input into HFiles and bulk load to the destination "
        + "table");
    System.err.println();
    System.err.println("Args:");
    System.err.println(" tablename    Name of the table to copy");
    System.err.println();
    System.err.println("Examples:");
    System.err.println(" To copy 'TestTable' to a cluster that uses replication for a 1 hour window:");
    System.err.println(" $ bin/hbase " +
        "org.apache.hadoop.hbase.mapreduce.CopyTable --starttime=1265875194289 --endtime=1265878794289 " +
        "--peer.adr=server1,server2,server3:2181:/hbase --families=myOldCf:myNewCf,cf2,cf3 TestTable ");
    System.err.println("For performance consider the following general options:\n"
        + "-Dhbase.client.scanner.caching=100\n"
        + "-Dmapred.map.tasks.speculative.execution=false");
  }

  private boolean doCommandLine(final String[] args) {
    // Process command-line args. TODO: Better cmd-line processing
    // (but hopefully something not as painful as cli options).
    if (args.length < 1) {
      printUsage(null);
      return false;
    }
    try {
      for (int i = 0; i < args.length; i++) {
        String cmd = args[i];
        if (cmd.equals("-h") || cmd.startsWith("--h")) {
          printUsage(null);
          return false;
        }

        final String startRowArgKey = "--startrow=";
        if (cmd.startsWith(startRowArgKey)) {
          startRow = cmd.substring(startRowArgKey.length());
          continue;
        }

        final String stopRowArgKey = "--stoprow=";
        if (cmd.startsWith(stopRowArgKey)) {
          stopRow = cmd.substring(stopRowArgKey.length());
          continue;
        }

        final String startTimeArgKey = "--starttime=";
        if (cmd.startsWith(startTimeArgKey)) {
          startTime = Long.parseLong(cmd.substring(startTimeArgKey.length()));
          continue;
        }

        final String endTimeArgKey = "--endtime=";
        if (cmd.startsWith(endTimeArgKey)) {
          endTime = Long.parseLong(cmd.substring(endTimeArgKey.length()));
          continue;
        }

        final String batchArgKey = "--batch=";
        if (cmd.startsWith(batchArgKey)) {
          batch = Integer.parseInt(cmd.substring(batchArgKey.length()));
          continue;
        }

        final String cacheRowArgKey = "--cacheRow=";
        if (cmd.startsWith(cacheRowArgKey)) {
          cacheRow = Integer.parseInt(cmd.substring(cacheRowArgKey.length()));
          continue;
        }

        final String versionsArgKey = "--versions=";
        if (cmd.startsWith(versionsArgKey)) {
          versions = Integer.parseInt(cmd.substring(versionsArgKey.length()));
          continue;
        }

        final String newNameArgKey = "--new.name=";
        if (cmd.startsWith(newNameArgKey)) {
          dstTableName = cmd.substring(newNameArgKey.length());
          continue;
        }

        final String peerAdrArgKey = "--peer.adr=";
        if (cmd.startsWith(peerAdrArgKey)) {
          peerAddress = cmd.substring(peerAdrArgKey.length());
          continue;
        }

        final String familiesArgKey = "--families=";
        if (cmd.startsWith(familiesArgKey)) {
          families = cmd.substring(familiesArgKey.length());
          continue;
        }

        if (cmd.startsWith("--all.cells")) {
          allCells = true;
          continue;
        }

        if (cmd.startsWith("--bulkload")) {
          bulkload = true;
          continue;
        }

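        // The table name must be the last argument; anything else in this
        // position is an unrecognized option.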
        if (i == args.length - 1) {
          tableName = cmd;
        } else {
          printUsage("Invalid argument '" + cmd + "'");
          return false;
        }
      }
      if (dstTableName == null && peerAddress == null) {
        printUsage("At least a new table name or a " +
            "peer address must be specified");
        return false;
      }
      if ((endTime != 0) && (startTime > endTime)) {
        printUsage("Invalid time range filter: starttime=" + startTime + " > endtime=" + endTime);
        return false;
      }

      if (bulkload && peerAddress != null) {
        printUsage("Remote bulkload is not supported!");
        return false;
      }

      // set dstTableName if necessary
      if (dstTableName == null) {
        dstTableName = tableName;
      }
    } catch (Exception e) {
      e.printStackTrace();
      printUsage("Can't start because " + e.getMessage());
      return false;
    }
    return true;
  }

  /**
   * Main entry point.
   *
   * @param args The command line parameters.
   * @throws Exception When running the job fails.
   */
  public static void main(String[] args) throws Exception {
    int ret = ToolRunner.run(new CopyTable(HBaseConfiguration.create()), args);
    System.exit(ret);
  }

  @Override
  public int run(String[] args) throws Exception {
    String[] otherArgs = new GenericOptionsParser(getConf(), args).getRemainingArgs();
    Job job = createSubmittableJob(otherArgs);
    if (job == null) return 1;
    if (!job.waitForCompletion(true)) {
      LOG.info("Map-reduce job failed!");
      if (bulkload) {
        LOG.info("Files are not bulkloaded!");
      }
      return 1;
    }
    int code = 0;
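    // In bulkload mode the MapReduce job only wrote HFiles; load them into the
    // destination table now and clean up the staging directory on success.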
    if (bulkload) {
      code = new LoadIncrementalHFiles(this.getConf()).run(new String[]{this.bulkloadDir.toString(),
          this.dstTableName});
      if (code == 0) {
        // bulkloadDir is deleted only if LoadIncrementalHFiles was successful, so that one can
        // rerun LoadIncrementalHFiles after a failure.
        FileSystem fs = FileSystem.get(this.getConf());
        if (!fs.delete(this.bulkloadDir, true)) {
          LOG.error("Deleting folder " + bulkloadDir + " failed!");
          code = 1;
        }
      }
    }
    return code;
  }
}