1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.mapreduce;
20
21 import java.io.IOException;
22
23 import org.apache.commons.logging.Log;
24 import org.apache.commons.logging.LogFactory;
25 import org.apache.hadoop.classification.InterfaceAudience;
26 import org.apache.hadoop.classification.InterfaceStability;
27 import org.apache.hadoop.conf.Configuration;
28 import org.apache.hadoop.fs.Path;
29 import org.apache.hadoop.hbase.HBaseConfiguration;
30 import org.apache.hadoop.hbase.KeyValue;
31 import org.apache.hadoop.hbase.client.Result;
32 import org.apache.hadoop.hbase.client.Scan;
33 import org.apache.hadoop.hbase.filter.CompareFilter;
34 import org.apache.hadoop.hbase.filter.Filter;
35 import org.apache.hadoop.hbase.filter.PrefixFilter;
36 import org.apache.hadoop.hbase.filter.RegexStringComparator;
37 import org.apache.hadoop.hbase.filter.RowFilter;
38 import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
39 import org.apache.hadoop.hbase.util.Bytes;
40 import org.apache.hadoop.io.IntWritable;
41 import org.apache.hadoop.io.Text;
42 import org.apache.hadoop.mapreduce.Job;
43 import org.apache.hadoop.mapreduce.Reducer;
44 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
45 import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
46 import org.apache.hadoop.util.GenericOptionsParser;
47
48 import com.google.common.base.Preconditions;
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68 @InterfaceAudience.Public
69 @InterfaceStability.Stable
70 public class CellCounter {
71 private static final Log LOG =
72 LogFactory.getLog(CellCounter.class.getName());
73
74
75
76
77
78 static final String NAME = "CellCounter";
79
80
81
82
83 static class CellCounterMapper
84 extends TableMapper<Text, IntWritable> {
85
86
87
88 public static enum Counters {
89 ROWS
90 }
91
92
93
94
95
96
97
98
99
100
101
102
103 @Override
104 public void map(ImmutableBytesWritable row, Result values,
105 Context context)
106 throws IOException {
107 Preconditions.checkState(values != null,
108 "values passed to the map is null");
109 String currentFamilyName = null;
110 String currentQualifierName = null;
111 String currentRowKey = null;
112 Configuration config = context.getConfiguration();
113 String separator = config.get("ReportSeparator",":");
114 try {
115 context.getCounter(Counters.ROWS).increment(1);
116 context.write(new Text("Total ROWS"), new IntWritable(1));
117
118 for (KeyValue value : values.list()) {
119 currentRowKey = Bytes.toStringBinary(value.getRow());
120 String thisRowFamilyName = Bytes.toStringBinary(value.getFamily());
121 if (thisRowFamilyName != null &&
122 !thisRowFamilyName.equals(currentFamilyName)) {
123 currentFamilyName = thisRowFamilyName;
124 context.getCounter("CF", thisRowFamilyName).increment(1);
125 context.write(new Text("Total Families Across all Rows"),
126 new IntWritable(1));
127 context.write(new Text(thisRowFamilyName), new IntWritable(1));
128 }
129 String thisRowQualifierName = thisRowFamilyName + separator
130 + Bytes.toStringBinary(value.getQualifier());
131 if (thisRowQualifierName != null &&
132 !thisRowQualifierName.equals(currentQualifierName)) {
133 currentQualifierName = thisRowQualifierName;
134 context.getCounter("CFQL", thisRowQualifierName).increment(1);
135 context.write(new Text("Total Qualifiers across all Rows"),
136 new IntWritable(1));
137 context.write(new Text(thisRowQualifierName), new IntWritable(1));
138
139 context.getCounter("QL_VERSIONS", currentRowKey + separator +
140 thisRowQualifierName).increment(1);
141 context.write(new Text(currentRowKey + separator
142 + thisRowQualifierName + "_Versions"), new IntWritable(1));
143
144 } else {
145
146 currentQualifierName = thisRowQualifierName;
147 context.getCounter("QL_VERSIONS", currentRowKey + separator +
148 thisRowQualifierName).increment(1);
149 context.write(new Text(currentRowKey + separator
150 + thisRowQualifierName + "_Versions"), new IntWritable(1));
151 }
152 }
153 } catch (InterruptedException e) {
154 e.printStackTrace();
155 }
156 }
157 }
158
159 static class IntSumReducer<Key> extends Reducer<Key, IntWritable,
160 Key, IntWritable> {
161
162 private IntWritable result = new IntWritable();
163 public void reduce(Key key, Iterable<IntWritable> values,
164 Context context)
165 throws IOException, InterruptedException {
166 int sum = 0;
167 for (IntWritable val : values) {
168 sum += val.get();
169 }
170 result.set(sum);
171 context.write(key, result);
172 }
173 }
174
175
176
177
178
179
180
181
182
183 public static Job createSubmittableJob(Configuration conf, String[] args)
184 throws IOException {
185 String tableName = args[0];
186 Path outputDir = new Path(args[1]);
187 String reportSeparatorString = (args.length > 2) ? args[2]: ":";
188 conf.set("ReportSeparator", reportSeparatorString);
189 Job job = new Job(conf, NAME + "_" + tableName);
190 job.setJarByClass(CellCounter.class);
191 Scan scan = getConfiguredScanForJob(conf, args);
192 TableMapReduceUtil.initTableMapperJob(tableName, scan,
193 CellCounterMapper.class, ImmutableBytesWritable.class, Result.class, job);
194 job.setNumReduceTasks(1);
195 job.setMapOutputKeyClass(Text.class);
196 job.setMapOutputValueClass(IntWritable.class);
197 job.setOutputFormatClass(TextOutputFormat.class);
198 job.setOutputKeyClass(Text.class);
199 job.setOutputValueClass(IntWritable.class);
200 FileOutputFormat.setOutputPath(job, outputDir);
201 job.setReducerClass(IntSumReducer.class);
202 return job;
203 }
204
205 private static Scan getConfiguredScanForJob(Configuration conf, String[] args) throws IOException {
206 Scan s = new Scan();
207
208 s.setMaxVersions(Integer.MAX_VALUE);
209 s.setCacheBlocks(false);
210
211 if (conf.get(TableInputFormat.SCAN_COLUMN_FAMILY) != null) {
212 s.addFamily(Bytes.toBytes(conf.get(TableInputFormat.SCAN_COLUMN_FAMILY)));
213 }
214
215 Filter rowFilter = getRowFilter(args);
216 if (rowFilter!= null) {
217 LOG.info("Setting Row Filter for counter.");
218 s.setFilter(rowFilter);
219 }
220 return s;
221 }
222
223
224 private static Filter getRowFilter(String[] args) {
225 Filter rowFilter = null;
226 String filterCriteria = (args.length > 3) ? args[3]: null;
227 if (filterCriteria == null) return null;
228 if (filterCriteria.startsWith("^")) {
229 String regexPattern = filterCriteria.substring(1, filterCriteria.length());
230 rowFilter = new RowFilter(CompareFilter.CompareOp.EQUAL, new RegexStringComparator(regexPattern));
231 } else {
232 rowFilter = new PrefixFilter(Bytes.toBytes(filterCriteria));
233 }
234 return rowFilter;
235 }
236
237
238
239
240
241
242
243 public static void main(String[] args) throws Exception {
244 Configuration conf = HBaseConfiguration.create();
245 String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
246 if (otherArgs.length < 1) {
247 System.err.println("ERROR: Wrong number of parameters: " + args.length);
248 System.err.println("Usage: CellCounter <tablename> <outputDir> <reportSeparator> " +
249 "[^[regex pattern] or [Prefix] for row filter]] ");
250 System.err.println(" Note: -D properties will be applied to the conf used. ");
251 System.err.println(" Additionally, the following SCAN properties can be specified");
252 System.err.println(" to get fine grained control on what is counted..");
253 System.err.println(" -D " + TableInputFormat.SCAN_COLUMN_FAMILY + "=<familyName>");
254 System.err.println(" <reportSeparator> parameter can be used to override the default report separator " +
255 "string : used to separate the rowId/column family name and qualifier name.");
256 System.err.println(" [^[regex pattern] or [Prefix] parameter can be used to limit the cell counter count " +
257 "operation to a limited subset of rows from the table based on regex or prefix pattern.");
258 System.exit(-1);
259 }
260 Job job = createSubmittableJob(conf, otherArgs);
261 System.exit(job.waitForCompletion(true) ? 0 : 1);
262 }
263 }