/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.  See the License for the specific language governing
 * permissions and limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.IncompatibleFilterException;
import org.apache.hadoop.hbase.filter.PrefixFilter;
import org.apache.hadoop.hbase.filter.RegexStringComparator;
import org.apache.hadoop.hbase.filter.RowFilter;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
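
/**
 * Export an HBase table. Writes the table content out as SequenceFiles in
 * HDFS, one {@link Result} per row, keyed by {@link ImmutableBytesWritable};
 * the companion {@link Import} tool reads the data back in.
 */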
@InterfaceAudience.Public
@InterfaceStability.Stable
public class Export {
  private static final Log LOG = LogFactory.getLog(Export.class);
  final static String NAME = "export";
  final static String RAW_SCAN = "hbase.mapreduce.include.deleted.rows";
  final static String EXPORT_BATCHING = "hbase.export.scanner.batch";
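
  /**
   * Sets up the actual map-only export job.
   *
   * @param conf  The current configuration.
   * @param args  The command line parameters.
   * @return The newly created job.
   * @throws IOException When setting up the job fails.
   */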
  public static Job createSubmittableJob(Configuration conf, String[] args)
  throws IOException {
    String tableName = args[0];
    Path outputDir = new Path(args[1]);
    Job job = new Job(conf, NAME + "_" + tableName);
    job.setJobName(NAME + "_" + tableName);
    job.setJarByClass(Export.class);

    Scan s = getConfiguredScanForJob(conf, args);
    IdentityTableMapper.initJob(tableName, s, IdentityTableMapper.class, job);

    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Result.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    return job;
  }

  private static Scan getConfiguredScanForJob(Configuration conf, String[] args) throws IOException {
    Scan s = new Scan();
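    // Optional positional arguments: max versions (args[2], default 1),
    // start time (args[3], default 0) and end time (args[4], default Long.MAX_VALUE).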
    int versions = args.length > 2 ? Integer.parseInt(args[2]) : 1;
    s.setMaxVersions(versions);

    long startTime = args.length > 3 ? Long.parseLong(args[3]) : 0L;
    long endTime = args.length > 4 ? Long.parseLong(args[4]) : Long.MAX_VALUE;
    s.setTimeRange(startTime, endTime);

    s.setCacheBlocks(false);

    boolean raw = Boolean.parseBoolean(conf.get(RAW_SCAN));
    if (raw) {
      s.setRaw(raw);
    }

    if (conf.get(TableInputFormat.SCAN_COLUMN_FAMILY) != null) {
      s.addFamily(Bytes.toBytes(conf.get(TableInputFormat.SCAN_COLUMN_FAMILY)));
    }

    Filter exportFilter = getExportFilter(args);
    if (exportFilter != null) {
      LOG.info("Setting Scan Filter for Export.");
      s.setFilter(exportFilter);
    }

    int batching = conf.getInt(EXPORT_BATCHING, -1);
    if (batching != -1) {
      try {
        s.setBatch(batching);
      } catch (IncompatibleFilterException e) {
        LOG.error("Batching could not be set", e);
      }
    }
    LOG.info("versions=" + versions + ", starttime=" + startTime +
      ", endtime=" + endTime + ", keepDeletedCells=" + raw);
    return s;
  }

  private static Filter getExportFilter(String[] args) {
    Filter exportFilter = null;
    String filterCriteria = (args.length > 5) ? args[5] : null;
    if (filterCriteria == null) return null;
    if (filterCriteria.startsWith("^")) {
      String regexPattern = filterCriteria.substring(1, filterCriteria.length());
      exportFilter = new RowFilter(CompareOp.EQUAL, new RegexStringComparator(regexPattern));
    } else {
      exportFilter = new PrefixFilter(Bytes.toBytes(filterCriteria));
    }
    return exportFilter;
  }
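
  /*
   * Prints usage information to stderr.
   * @param errorMsg Error message.  Can be null.
   */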
  private static void usage(final String errorMsg) {
    if (errorMsg != null && errorMsg.length() > 0) {
      System.err.println("ERROR: " + errorMsg);
    }
    System.err.println("Usage: Export [-D <property=value>]* <tablename> <outputdir> [<versions> " +
      "[<starttime> [<endtime>]] [^[regex pattern] or [Prefix] to filter]]\n");
    System.err.println("  Note: -D properties will be applied to the conf used. ");
    System.err.println("  For example: ");
    System.err.println("   -D mapred.output.compress=true");
    System.err.println("   -D mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec");
    System.err.println("   -D mapred.output.compression.type=BLOCK");
    System.err.println("  Additionally, the following SCAN properties can be specified");
    System.err.println("  to control/limit what is exported.");
    System.err.println("   -D " + TableInputFormat.SCAN_COLUMN_FAMILY + "=<familyName>");
    System.err.println("   -D " + RAW_SCAN + "=true");
    System.err.println("For performance consider the following properties:\n"
      + "   -Dhbase.client.scanner.caching=100\n"
      + "   -Dmapred.map.tasks.speculative.execution=false\n"
      + "   -Dmapred.reduce.tasks.speculative.execution=false");
    System.err.println("For tables with very wide rows consider setting the batch size as below:\n"
      + "   -D" + EXPORT_BATCHING + "=10");
  }
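
  // Illustrative invocation (a sketch only: the table name "MyTable", the output
  // path and the batch size are placeholder values, and the standard "hbase"
  // launcher script is assumed to be on the PATH):
  //
  //   hbase org.apache.hadoop.hbase.mapreduce.Export \
  //       -D hbase.export.scanner.batch=10 MyTable /export/MyTable 1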
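
  /**
   * Main entry point.
   *
   * @param args  The command line parameters.
   * @throws Exception When running the job fails.
   */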
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
      usage("Wrong number of arguments: " + otherArgs.length);
      System.exit(-1);
    }
    Job job = createSubmittableJob(conf, otherArgs);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}