
/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.classification.InterfaceStability;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Tool used to copy a table to another one which can be on a different setup.
 * It is also configurable with a start and end time as well as a specification
 * of the region server implementation if different from the local cluster.
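 * <p>
 * Example invocation (mirroring the sample printed by printUsage below):
 * <pre>
 * $ bin/hbase org.apache.hadoop.hbase.mapreduce.CopyTable --starttime=1265875194289 \
 *     --endtime=1265878794289 --peer.adr=server1,server2,server3:2181:/hbase \
 *     --families=myOldCf:myNewCf,cf2,cf3 TestTable
 * </pre>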
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class CopyTable extends Configured implements Tool {
  private static final Log LOG = LogFactory.getLog(CopyTable.class);

  final static String NAME = "copytable";
  long startTime = 0;
  long endTime = 0;
  int versions = -1;
  String tableName = null;
  String startRow = null;
  String stopRow = null;
  String dstTableName = null;
  String peerAddress = null;
  String families = null;
  boolean allCells = false;

  boolean bulkload = false;
  Path bulkloadDir = null;

  private final static String JOB_NAME_CONF_KEY = "mapreduce.job.name";

  public CopyTable(Configuration conf) {
    super(conf);
  }

  /**
   * Sets up the actual job.
   *
   * @param args  The command line parameters.
   * @return The newly created job.
   * @throws IOException When setting up the job fails.
   */
  public Job createSubmittableJob(String[] args) throws IOException {
    if (!doCommandLine(args)) {
      return null;
    }

    Job job = Job.getInstance(getConf(), getConf().get(JOB_NAME_CONF_KEY, NAME + "_" + tableName));
    job.setJarByClass(CopyTable.class);
    Scan scan = new Scan();
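    // Scanning an entire table from MapReduce would churn the region servers'
    // block cache, so block caching is turned off for this scan.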
    scan.setCacheBlocks(false);
    if (startTime != 0) {
      scan.setTimeRange(startTime,
          endTime == 0 ? HConstants.LATEST_TIMESTAMP : endTime);
    }
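    // A raw scan also returns delete markers and cells that have been deleted
    // but not yet collected, so "--all.cells" carries them over to the copy.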
    if (allCells) {
      scan.setRaw(true);
    }
    if (versions >= 0) {
      scan.setMaxVersions(versions);
    }

    if (startRow != null) {
      scan.setStartRow(Bytes.toBytes(startRow));
    }

    if (stopRow != null) {
      scan.setStopRow(Bytes.toBytes(stopRow));
    }

    if (families != null) {
      String[] fams = families.split(",");
      Map<String,String> cfRenameMap = new HashMap<String,String>();
      for (String fam : fams) {
        String sourceCf;
        if (fam.contains(":")) {
          // fam looks like "sourceCfName:destCfName"
          String[] srcAndDest = fam.split(":", 2);
          sourceCf = srcAndDest[0];
          String destCf = srcAndDest[1];
          cfRenameMap.put(sourceCf, destCf);
        } else {
          // fam is just "sourceCf"
          sourceCf = fam;
        }
        scan.addFamily(Bytes.toBytes(sourceCf));
      }
      Import.configureCfRenaming(job.getConfiguration(), cfRenameMap);
    }
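    // This is a map-only copy: with zero reduce tasks the mappers write
    // directly to the configured output (TableOutputFormat or HFiles).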
    job.setNumReduceTasks(0);

    if (bulkload) {
      TableMapReduceUtil.initTableMapperJob(tableName, scan, Import.KeyValueImporter.class, null,
        null, job);

      // We need to split the inputs by destination tables so that output of Map
      // can be bulk-loaded.
      TableInputFormat.configureSplitTable(job, TableName.valueOf(dstTableName));

      FileSystem fs = FileSystem.get(getConf());
      Random rand = new Random();
      Path root = new Path(fs.getWorkingDirectory(), "copytable");
      fs.mkdirs(root);
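      // Stage the HFiles in a random, previously unused directory under
      // <working dir>/copytable so repeated or concurrent runs do not collide.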
      while (true) {
        bulkloadDir = new Path(root, "" + rand.nextLong());
        if (!fs.exists(bulkloadDir)) {
          break;
        }
      }

      System.out.println("HFiles will be stored at " + this.bulkloadDir);
      HFileOutputFormat2.setOutputPath(job, bulkloadDir);
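      // configureIncrementalLoadMap is the map-only variant of the incremental
      // load setup: it configures HFile output from the destination table so
      // the generated files can later be bulk-loaded into it.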
      HTable htable = new HTable(getConf(), TableName.valueOf(dstTableName));
      try {
        HFileOutputFormat2.configureIncrementalLoadMap(job, htable);
      } finally {
        htable.close();
      }
    } else {
      TableMapReduceUtil.initTableMapperJob(tableName, scan,
        Import.Importer.class, null, null, job);

      TableMapReduceUtil.initTableReducerJob(dstTableName, null, job, null, peerAddress, null,
        null);
    }

    return job;
  }

  /*
   * @param errorMsg Error message.  Can be null.
   */
  private static void printUsage(final String errorMsg) {
    if (errorMsg != null && errorMsg.length() > 0) {
      System.err.println("ERROR: " + errorMsg);
    }
    System.err.println("Usage: CopyTable [general options] [--starttime=X] [--endtime=Y] " +
        "[--new.name=NEW] [--peer.adr=ADR] <tablename>");
    System.err.println();
    System.err.println("Options:");
    System.err.println(" rs.class     hbase.regionserver.class of the peer cluster");
    System.err.println("              specify if different from current cluster");
    System.err.println(" rs.impl      hbase.regionserver.impl of the peer cluster");
    System.err.println(" startrow     the start row");
    System.err.println(" stoprow      the stop row");
    System.err.println(" starttime    beginning of the time range (unixtime in millis)");
    System.err.println("              without endtime means from starttime to forever");
    System.err.println(" endtime      end of the time range.  Ignored if no starttime specified.");
    System.err.println(" versions     number of cell versions to copy");
    System.err.println(" new.name     new table's name");
    System.err.println(" peer.adr     Address of the peer cluster given in the format");
    System.err.println("              hbase.zookeeper.quorum:hbase.zookeeper.client.port:zookeeper.znode.parent");
    System.err.println(" families     comma-separated list of families to copy");
    System.err.println("              To copy from cf1 to cf2, give sourceCfName:destCfName. ");
    System.err.println("              To keep the same name, just give \"cfName\"");
    System.err.println(" all.cells    also copy delete markers and deleted cells");
    System.err.println(" bulkload     Write input into HFiles and bulk load to the destination "
        + "table");
    System.err.println();
    System.err.println("Args:");
    System.err.println(" tablename    Name of the table to copy");
    System.err.println();
    System.err.println("Examples:");
    System.err.println(" To copy 'TestTable' to a cluster that uses replication for a 1 hour window:");
    System.err.println(" $ bin/hbase " +
        "org.apache.hadoop.hbase.mapreduce.CopyTable --starttime=1265875194289 --endtime=1265878794289 " +
        "--peer.adr=server1,server2,server3:2181:/hbase --families=myOldCf:myNewCf,cf2,cf3 TestTable ");
    System.err.println("For performance consider the following general options:\n"
        + "-Dhbase.client.scanner.caching=100\n"
        + "-Dmapred.map.tasks.speculative.execution=false");
  }

  private boolean doCommandLine(final String[] args) {
    // Process command-line args. TODO: Better cmd-line processing
    // (but hopefully something not as painful as cli options).
    if (args.length < 1) {
      printUsage(null);
      return false;
    }
    try {
      for (int i = 0; i < args.length; i++) {
        String cmd = args[i];
        if (cmd.equals("-h") || cmd.startsWith("--h")) {
          printUsage(null);
          return false;
        }

        final String startRowArgKey = "--startrow=";
        if (cmd.startsWith(startRowArgKey)) {
          startRow = cmd.substring(startRowArgKey.length());
          continue;
        }

        final String stopRowArgKey = "--stoprow=";
        if (cmd.startsWith(stopRowArgKey)) {
          stopRow = cmd.substring(stopRowArgKey.length());
          continue;
        }

        final String startTimeArgKey = "--starttime=";
        if (cmd.startsWith(startTimeArgKey)) {
          startTime = Long.parseLong(cmd.substring(startTimeArgKey.length()));
          continue;
        }

        final String endTimeArgKey = "--endtime=";
        if (cmd.startsWith(endTimeArgKey)) {
          endTime = Long.parseLong(cmd.substring(endTimeArgKey.length()));
          continue;
        }

        final String versionsArgKey = "--versions=";
        if (cmd.startsWith(versionsArgKey)) {
          versions = Integer.parseInt(cmd.substring(versionsArgKey.length()));
          continue;
        }

        final String newNameArgKey = "--new.name=";
        if (cmd.startsWith(newNameArgKey)) {
          dstTableName = cmd.substring(newNameArgKey.length());
          continue;
        }

        final String peerAdrArgKey = "--peer.adr=";
        if (cmd.startsWith(peerAdrArgKey)) {
          peerAddress = cmd.substring(peerAdrArgKey.length());
          continue;
        }

        final String familiesArgKey = "--families=";
        if (cmd.startsWith(familiesArgKey)) {
          families = cmd.substring(familiesArgKey.length());
          continue;
        }

        if (cmd.startsWith("--all.cells")) {
          allCells = true;
          continue;
        }

        if (cmd.startsWith("--bulkload")) {
          bulkload = true;
          continue;
        }

        if (i == args.length - 1) {
          tableName = cmd;
        } else {
          printUsage("Invalid argument '" + cmd + "'");
          return false;
        }
      }
      if (dstTableName == null && peerAddress == null) {
        printUsage("At least a new table name or a " +
            "peer address must be specified");
        return false;
      }
      if ((endTime != 0) && (startTime > endTime)) {
        printUsage("Invalid time range filter: starttime=" + startTime + " > endtime=" + endTime);
        return false;
      }

      if (bulkload && peerAddress != null) {
        printUsage("Remote bulkload is not supported!");
        return false;
      }

      // set dstTableName if necessary
      if (dstTableName == null) {
        dstTableName = tableName;
      }
    } catch (Exception e) {
      e.printStackTrace();
      printUsage("Can't start because " + e.getMessage());
      return false;
    }
    return true;
  }

  /**
   * Main entry point.
   *
   * @param args  The command line parameters.
   * @throws Exception When running the job fails.
   */
  public static void main(String[] args) throws Exception {
    int ret = ToolRunner.run(new CopyTable(HBaseConfiguration.create()), args);
    System.exit(ret);
  }

  @Override
  public int run(String[] args) throws Exception {
    String[] otherArgs = new GenericOptionsParser(getConf(), args).getRemainingArgs();
    Job job = createSubmittableJob(otherArgs);
    if (job == null) return 1;
    if (!job.waitForCompletion(true)) {
      LOG.info("Map-reduce job failed!");
      if (bulkload) {
        LOG.info("Files are not bulkloaded!");
      }
      return 1;
    }
    int code = 0;
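    // If HFiles were written, finish the copy by bulk-loading them into the
    // destination table on this cluster (remote bulkload is not supported).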
    if (bulkload) {
      code = new LoadIncrementalHFiles(this.getConf()).run(new String[]{this.bulkloadDir.toString(),
          this.dstTableName});
      if (code == 0) {
        // bulkloadDir is deleted only if LoadIncrementalHFiles was successful so that one can
        // rerun LoadIncrementalHFiles.
        FileSystem fs = FileSystem.get(this.getConf());
        if (!fs.delete(this.bulkloadDir, true)) {
          LOG.error("Deleting folder " + bulkloadDir + " failed!");
          code = 1;
        }
      }
    }
    return code;
  }
}