/**
 * Copyright 2009 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

/**
 * A base for {@link TableInputFormat}s. Receives an {@link HTable} and a
 * {@link Scan} instance that defines the input columns, etc. Subclasses may
 * use other TableRecordReader implementations.
 * <p>
 * An example of a subclass:
 * <pre>
 *   class ExampleTIF extends TableInputFormatBase implements JobConfigurable {
 *
 *     public void configure(JobConf job) {
 *       HTable exampleTable = new HTable(new HBaseConfiguration(job),
 *         Bytes.toBytes("exampleTable"));
 *       // mandatory
 *       setHTable(exampleTable);
 *       Scan scan = new Scan();
 *       scan.addFamily(Bytes.toBytes("columnA"));
 *       scan.addFamily(Bytes.toBytes("columnB"));
 *       // optional
 *       scan.setFilter(new PrefixFilter(Bytes.toBytes("keyPrefix")));
 *       // mandatory
 *       setScan(scan);
 *     }
 *
 *     public void validateInput(JobConf job) throws IOException {
 *     }
 *   }
 * </pre>
 */
public abstract class TableInputFormatBase
extends InputFormat<ImmutableBytesWritable, Result> {

  static final Log LOG = LogFactory.getLog(TableInputFormatBase.class);

  /** Holds the details for the internal scanner. */
  private Scan scan = null;
  /** The table to scan. */
  private HTable table = null;
  /** The reader scanning the table, can be a custom one. */
  private TableRecordReader tableRecordReader = null;

  /**
   * Builds a TableRecordReader. If no TableRecordReader was provided, uses
   * the default.
   *
   * @param split  The split to work with.
   * @param context  The current context.
   * @return The newly created record reader.
   * @throws IOException When creating the reader fails.
   * @see org.apache.hadoop.mapreduce.InputFormat#createRecordReader(
   *   org.apache.hadoop.mapreduce.InputSplit,
   *   org.apache.hadoop.mapreduce.TaskAttemptContext)
   */
  @Override
  public RecordReader<ImmutableBytesWritable, Result> createRecordReader(
      InputSplit split, TaskAttemptContext context)
  throws IOException {
    TableSplit tSplit = (TableSplit) split;
    TableRecordReader trr = this.tableRecordReader;
    // if no table record reader was provided use default
    if (trr == null) {
      trr = new TableRecordReader();
    }
    // copy the configured scan and narrow it to the key range of this split
    Scan sc = new Scan(this.scan);
    sc.setStartRow(tSplit.getStartRow());
    sc.setStopRow(tSplit.getEndRow());
    trr.setScan(sc);
    trr.setHTable(table);
    trr.init();
    return trr;
  }

  /**
   * Calculates the splits that will serve as input for the map tasks. There
   * is at most one split per region: regions that fall outside the scan's
   * key range, or that are excluded by
   * {@link #includeRegionInSplit(byte[], byte[])}, contribute no split.
   *
   * @param context  The current job context.
   * @return The list of input splits.
   * @throws IOException When creating the list of splits fails.
   * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(
   *   org.apache.hadoop.mapreduce.JobContext)
   */
  @Override
  public List<InputSplit> getSplits(JobContext context) throws IOException {
    if (table == null) {
      throw new IOException("No table was provided.");
    }
    Pair<byte[][], byte[][]> keys = table.getStartEndKeys();
    if (keys == null || keys.getFirst() == null ||
        keys.getFirst().length == 0) {
      throw new IOException("Expecting at least one region.");
    }
    int count = 0;
    List<InputSplit> splits = new ArrayList<InputSplit>(keys.getFirst().length);
    for (int i = 0; i < keys.getFirst().length; i++) {
      if (!includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
        continue;
      }
      String regionLocation = table.getRegionLocation(keys.getFirst()[i]).
        getServerAddress().getHostname();
      byte[] startRow = scan.getStartRow();
      byte[] stopRow = scan.getStopRow();
      // determine if the given start and stop keys fall into the region
      if ((startRow.length == 0 || keys.getSecond()[i].length == 0 ||
           Bytes.compareTo(startRow, keys.getSecond()[i]) < 0) &&
          (stopRow.length == 0 ||
           Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
        byte[] splitStart = startRow.length == 0 ||
          Bytes.compareTo(keys.getFirst()[i], startRow) >= 0 ?
            keys.getFirst()[i] : startRow;
        byte[] splitStop = (stopRow.length == 0 ||
          Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0) &&
          keys.getSecond()[i].length > 0 ?
            keys.getSecond()[i] : stopRow;
        InputSplit split = new TableSplit(table.getTableName(),
          splitStart, splitStop, regionLocation);
        splits.add(split);
        if (LOG.isDebugEnabled()) {
          LOG.debug("getSplits: split -> " + (count++) + " -> " + split);
        }
      }
    }
    return splits;
  }

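  /*
   * Worked example of the boundary logic above (illustrative values, not
   * from the source): given regions [b,d) and [d,f) and a scan with
   * startRow="c" and stopRow="e", the first region yields the split [c,d)
   * (the scan start lies inside the region, so it replaces the region's
   * start key) and the second yields [d,e) (the scan stop lies inside the
   * region, so it replaces the region's end key). Regions entirely outside
   * [c,e) fail the overlap test and produce no split.
   */
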
  /**
   * Test if the given region is to be included in the InputSplit while
   * splitting the regions of a table.
   * <p>
   * This optimization is effective when there is a specific reason to exclude
   * an entire region from the M-R job (and hence it does not contribute an
   * InputSplit), given the start and end keys of that region. It is useful
   * when we need to remember the last-processed top record and continuously
   * revisit the [last, current) interval for M-R processing. Besides reducing
   * the number of InputSplits, it also reduces the load on the region server,
   * because of the ordering of the keys.
   * <br>
   * Note: it is possible that <code>endKey.length() == 0</code> for the last
   * (most recent) region.
   * <br>
   * Override this method if you want to exclude regions from the M-R job in
   * bulk. By default no region is excluded, i.e. all regions are included.
   * A sketch of a possible override follows the method.
   *
   * @param startKey Start key of the region
   * @param endKey End key of the region
   * @return true, if this region needs to be included as part of the input
   *   (default).
   */
  protected boolean includeRegionInSplit(final byte[] startKey,
      final byte[] endKey) {
    return true;
  }
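
  /*
   * A hypothetical override (sketch only; 'lastProcessedKey' is an assumed
   * field that this class does not define): keep only regions that can still
   * contain rows at or after a remembered checkpoint.
   *
   *   @Override
   *   protected boolean includeRegionInSplit(final byte[] startKey,
   *       final byte[] endKey) {
   *     // the last (most recent) region has an empty end key and always
   *     // remains a candidate; otherwise keep regions that end after the
   *     // checkpoint
   *     return endKey.length == 0 ||
   *       Bytes.compareTo(endKey, lastProcessedKey) > 0;
   *   }
   */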

  /**
   * Allows subclasses to get the {@link HTable}.
   *
   * @return The table to scan.
   */
  protected HTable getHTable() {
    return this.table;
  }

  /**
   * Allows subclasses to set the {@link HTable}.
   *
   * @param table  The table to get the data from.
   */
  protected void setHTable(HTable table) {
    this.table = table;
  }

  /**
   * Gets the scan defining the actual details like columns etc.
   *
   * @return The internal scan instance.
   */
  public Scan getScan() {
    if (this.scan == null) {
      this.scan = new Scan();
    }
    return scan;
  }

  /**
   * Sets the scan defining the actual details like columns etc.
   *
   * @param scan  The scan to set.
   */
  public void setScan(Scan scan) {
    this.scan = scan;
  }

  /**
   * Allows subclasses to set the {@link TableRecordReader}.
   *
   * @param tableRecordReader A different {@link TableRecordReader}
   *   implementation.
   */
  protected void setTableRecordReader(TableRecordReader tableRecordReader) {
    this.tableRecordReader = tableRecordReader;
  }

}
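
/*
 * Illustrative usage (a sketch, not part of this file): a concrete subclass
 * such as TableInputFormat is normally wired into a job through
 * TableMapReduceUtil. The table name, column family, and mapper class below
 * are hypothetical.
 *
 *   Scan scan = new Scan();
 *   scan.addFamily(Bytes.toBytes("columnA"));
 *   Job job = new Job(new HBaseConfiguration(), "exampleJob");
 *   TableMapReduceUtil.initTableMapperJob("exampleTable", scan,
 *     ExampleMapper.class, ImmutableBytesWritable.class, Result.class, job);
 *   job.waitForCompletion(true);
 */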