/**
 * Copyright 2009 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

/**
 * A base for {@link TableInputFormat}s. Receives an {@link HTable} and a
 * {@link Scan} instance that defines the input columns etc. Subclasses may use
 * other TableRecordReader implementations.
 * <p>
 * An example of a subclass:
 * <pre>
 *   class ExampleTIF extends TableInputFormatBase implements JobConfigurable {
 *
 *     public void configure(JobConf job) {
 *       HTable exampleTable = new HTable(new HBaseConfiguration(job),
 *         Bytes.toBytes("exampleTable"));
 *       // mandatory
 *       setHTable(exampleTable);
 *       Scan scan = new Scan();
 *       // restrict the input to the wanted column families
 *       scan.addFamily(Bytes.toBytes("columnA"));
 *       scan.addFamily(Bytes.toBytes("columnB"));
 *       // mandatory
 *       setScan(scan);
 *     }
 *
 *     public void validateInput(JobConf job) throws IOException {
 *     }
 *   }
 * </pre>
 */
public abstract class TableInputFormatBase
extends InputFormat<ImmutableBytesWritable, Result> {

  final Log LOG = LogFactory.getLog(TableInputFormatBase.class);

  /** Holds the details for the internal scanner. */
  private Scan scan = null;
  /** The table to scan. */
  private HTable table = null;
  /** The reader scanning the table, can be a custom one. */
  private TableRecordReader tableRecordReader = null;

  /**
   * Builds a TableRecordReader. If no TableRecordReader was provided, uses
   * the default.
   *
   * @param split  The split to work with.
   * @param context  The current context.
   * @return The newly created record reader.
   * @throws IOException When creating the reader fails.
   * @see org.apache.hadoop.mapreduce.InputFormat#createRecordReader(
   *   org.apache.hadoop.mapreduce.InputSplit,
   *   org.apache.hadoop.mapreduce.TaskAttemptContext)
   */
  @Override
  public RecordReader<ImmutableBytesWritable, Result> createRecordReader(
      InputSplit split, TaskAttemptContext context)
  throws IOException {
    // fail fast with a clear message instead of a NullPointerException below
    if (table == null) {
      throw new IOException("No table was provided.");
    }
    TableSplit tSplit = (TableSplit) split;
    TableRecordReader trr = this.tableRecordReader;
    // if no table record reader was provided use default
    if (trr == null) {
      trr = new TableRecordReader();
    }
    Scan sc = new Scan(this.scan);
    sc.setStartRow(tSplit.getStartRow());
    sc.setStopRow(tSplit.getEndRow());
    trr.setScan(sc);
    trr.setHTable(table);
    trr.init();
    return trr;
  }

  /**
   * Calculates the splits that will serve as input for the map tasks. The
   * number of splits matches the number of regions in a table.
   *
   * @param context  The current job context.
   * @return The list of input splits.
   * @throws IOException When creating the list of splits fails.
   * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(
   *   org.apache.hadoop.mapreduce.JobContext)
   */
  @Override
  public List<InputSplit> getSplits(JobContext context) throws IOException {
    if (table == null) {
      throw new IOException("No table was provided.");
    }
    Pair<byte[][], byte[][]> keys = table.getStartEndKeys();
    if (keys == null || keys.getFirst() == null ||
        keys.getFirst().length == 0) {
      throw new IOException("Expecting at least one region.");
    }
    int count = 0;
    List<InputSplit> splits = new ArrayList<InputSplit>(keys.getFirst().length);
    for (int i = 0; i < keys.getFirst().length; i++) {
      if (!includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
        continue;
      }
      String regionLocation = table.getRegionLocation(keys.getFirst()[i]).
        getServerAddress().getHostname();
      byte[] startRow = scan.getStartRow();
      byte[] stopRow = scan.getStopRow();
      // determine if the given start and stop keys fall into the region
      if ((startRow.length == 0 || keys.getSecond()[i].length == 0 ||
           Bytes.compareTo(startRow, keys.getSecond()[i]) < 0) &&
          (stopRow.length == 0 ||
           Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
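        // the split covers the intersection of the region's key range and
        // the scan's [startRow, stopRow) range: take the larger of the two
        // start keys and the smaller of the two stop keys (an empty key
        // means "unbounded" on that side)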
        byte[] splitStart = startRow.length == 0 ||
          Bytes.compareTo(keys.getFirst()[i], startRow) >= 0 ?
            keys.getFirst()[i] : startRow;
        byte[] splitStop = (stopRow.length == 0 ||
          Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0) &&
          keys.getSecond()[i].length > 0 ?
            keys.getSecond()[i] : stopRow;
        InputSplit split = new TableSplit(table.getTableName(),
          splitStart, splitStop, regionLocation);
        splits.add(split);
        if (LOG.isDebugEnabled()) {
          LOG.debug("getSplits: split -> " + (count++) + " -> " + split);
        }
      }
    }
    return splits;
  }

  /**
   * Test if the given region is to be included in the InputSplit while
   * splitting the regions of a table.
   * <p>
   * This optimization is effective when there is a specific reason to exclude
   * an entire region from the M-R job (and hence from the InputSplit), given
   * the start and end keys of that region. It is useful when we need to
   * remember the last-processed top record and continuously revisit the
   * [last, current) interval for M-R processing. Besides reducing the number
   * of InputSplits, it also reduces the load on the region server, due to the
   * ordering of the keys.
   * <br>
   * Note: it is possible that <code>endKey.length() == 0</code> for the last
   * (most recent) region.
   * <br>
   * Override this method if you want to exclude regions from the M-R job in
   * bulk. By default no region is excluded (i.e. all regions are included);
   * a sketch of an override follows the method below.
   *
   * @param startKey Start key of the region
   * @param endKey End key of the region
   * @return true, if this region needs to be included as part of the input
   *   (default).
   */
  protected boolean includeRegionInSplit(final byte[] startKey,
      final byte[] endKey) {
    return true;
  }
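
  /*
   * A minimal sketch of such an override (hypothetical subclass code, not
   * part of this class; the cutoff value is an assumed example): resume an
   * incremental M-R job by skipping regions that end before a remembered
   * cutoff row.
   *
   *   protected boolean includeRegionInSplit(final byte[] startKey,
   *       final byte[] endKey) {
   *     byte[] cutoff = Bytes.toBytes("2009-01-01");
   *     // an empty end key marks the last region of the table, which always
   *     // extends past the cutoff; otherwise include the region only if it
   *     // ends after the cutoff
   *     return endKey.length == 0 || Bytes.compareTo(endKey, cutoff) > 0;
   *   }
   */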
  /**
   * Allows subclasses to get the {@link HTable}.
   */
  protected HTable getHTable() {
    return this.table;
  }

  /**
   * Allows subclasses to set the {@link HTable}.
   *
   * @param table  The table to get the data from.
   */
  protected void setHTable(HTable table) {
    this.table = table;
  }

  /**
   * Gets the scan defining the actual details like columns etc.
   *
   * @return The internal scan instance.
   */
  public Scan getScan() {
    if (this.scan == null) this.scan = new Scan();
    return scan;
  }

  /**
   * Sets the scan defining the actual details like columns etc.
   *
   * @param scan  The scan to set.
   */
  public void setScan(Scan scan) {
    this.scan = scan;
  }
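
  /*
   * A minimal usage sketch (hypothetical caller code; the column family name
   * and the start row are assumed example values):
   *
   *   Scan scan = new Scan();
   *   scan.addFamily(Bytes.toBytes("info"));
   *   scan.setStartRow(Bytes.toBytes("row-0000"));
   *   exampleTIF.setScan(scan);
   */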

  /**
   * Allows subclasses to set the {@link TableRecordReader}.
   *
   * @param tableRecordReader A different {@link TableRecordReader}
   *   implementation.
   */
  protected void setTableRecordReader(TableRecordReader tableRecordReader) {
    this.tableRecordReader = tableRecordReader;
  }
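
  /*
   * A minimal sketch of installing a customized reader (hypothetical subclass
   * code, assuming ExampleRecordReader is a TableRecordReader subclass defined
   * elsewhere, e.g. to log or transform each row before it reaches the
   * mapper):
   *
   *   public void configure(JobConf job) {
   *     // ... set up the table and scan as in the class comment above ...
   *     setTableRecordReader(new ExampleRecordReader());
   *   }
   */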

}