/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.IOException;
import java.io.InterruptedIOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.UnknownHostException;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import javax.naming.NamingException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.classification.InterfaceStability;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Addressing;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.RegionSizeCalculator;
import org.apache.hadoop.hbase.util.Strings;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.net.DNS;
import org.apache.hadoop.util.StringUtils;

/**
 * A base for {@link TableInputFormat}s. Receives an {@link HTable} and a
 * {@link Scan} instance that defines the input columns etc. Subclasses may use
 * other TableRecordReader implementations.
 * <p>
 * An example of a subclass:
 * <pre>
 *   class ExampleTIF extends TableInputFormatBase implements JobConfigurable {
 *
 *     public void configure(JobConf job) {
 *       HTable exampleTable = new HTable(HBaseConfiguration.create(job),
 *         Bytes.toBytes("exampleTable"));
 *       // mandatory
 *       setHTable(exampleTable);
 *       byte[][] inputColumns = new byte[][] { Bytes.toBytes("columnA"),
 *         Bytes.toBytes("columnB") };
 *       Scan scan = new Scan();
 *       for (byte[] family : inputColumns) {
 *         scan.addFamily(family);
 *       }
 *       // optional
 *       Filter exampleFilter =
 *         new RowFilter(CompareOp.EQUAL, new RegexStringComparator("keyPrefix.*"));
 *       scan.setFilter(exampleFilter);
 *       // mandatory
 *       setScan(scan);
 *     }
 *
 *     public void validateInput(JobConf job) throws IOException {
 *     }
 *  }
 * </pre>
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public abstract class TableInputFormatBase
extends InputFormat<ImmutableBytesWritable, Result> {

  final Log LOG = LogFactory.getLog(TableInputFormatBase.class);

  /** Holds the details for the internal scanner. */
  private Scan scan = null;
  /** The table to scan. */
  private HTable table = null;
  /** The reader scanning the table, can be a custom one. */
  private TableRecordReader tableRecordReader = null;

  /** The reverse DNS lookup cache mapping: IPAddress => HostName */
  private HashMap<InetAddress, String> reverseDNSCacheMap =
    new HashMap<InetAddress, String>();

  /** The NameServer address */
  private String nameServer = null;

  /**
   * Builds a TableRecordReader. If no TableRecordReader was provided, uses
   * the default.
   *
   * @param split  The split to work with.
   * @param context  The current context.
   * @return The newly created record reader.
   * @throws IOException When creating the reader fails.
   * @see org.apache.hadoop.mapreduce.InputFormat#createRecordReader(
   *   org.apache.hadoop.mapreduce.InputSplit,
   *   org.apache.hadoop.mapreduce.TaskAttemptContext)
   */
  @Override
  public RecordReader<ImmutableBytesWritable, Result> createRecordReader(
      InputSplit split, TaskAttemptContext context)
  throws IOException {
    if (table == null) {
      throw new IOException("Cannot create a record reader because of a" +
          " previous error. Please look at the previous log lines from" +
          " the task's full log for more details.");
    }
    TableSplit tSplit = (TableSplit) split;
    LOG.info("Input split length: " + StringUtils.humanReadableInt(tSplit.getLength()) + " bytes.");
    TableRecordReader trr = this.tableRecordReader;
    // if no table record reader was provided use default
    if (trr == null) {
      trr = new TableRecordReader();
    }
    Scan sc = new Scan(this.scan);
    sc.setStartRow(tSplit.getStartRow());
    sc.setStopRow(tSplit.getEndRow());
    trr.setScan(sc);
    trr.setHTable(table);
    return trr;
  }

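  // Usage sketch (illustrative, not part of this class): a driver typically wires
  // a TableInputFormatBase subclass into a job via TableMapReduceUtil. ExampleMapper
  // is a hypothetical TableMapper implementation.
  //
  //   Scan scan = new Scan();
  //   scan.setCaching(500);        // larger scanner caching suits full-table MR scans
  //   scan.setCacheBlocks(false);  // avoid churning the region server block cache
  //   TableMapReduceUtil.initTableMapperJob("exampleTable", scan, ExampleMapper.class,
  //       ImmutableBytesWritable.class, Result.class, job);
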
  /**
   * Calculates the splits that will serve as input for the map tasks. The
   * number of splits matches the number of regions in a table.
   *
   * @param context  The current job context.
   * @return The list of input splits.
   * @throws IOException When creating the list of splits fails.
   * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(
   *   org.apache.hadoop.mapreduce.JobContext)
   */
  @Override
  public List<InputSplit> getSplits(JobContext context) throws IOException {
    if (table == null) {
      throw new IOException("No table was provided.");
    }
    // Get the name server address; the default value is null.
    this.nameServer =
      context.getConfiguration().get("hbase.nameserver.address", null);

    RegionSizeCalculator sizeCalculator = new RegionSizeCalculator(table);

    Pair<byte[][], byte[][]> keys = table.getStartEndKeys();
    if (keys == null || keys.getFirst() == null ||
        keys.getFirst().length == 0) {
      HRegionLocation regLoc = table.getRegionLocation(HConstants.EMPTY_BYTE_ARRAY, false);
      if (null == regLoc) {
        throw new IOException("Expecting at least one region.");
      }
      List<InputSplit> splits = new ArrayList<InputSplit>(1);
      long regionSize = sizeCalculator.getRegionSize(regLoc.getRegionInfo().getRegionName());
      TableSplit split = new TableSplit(table.getName(),
          HConstants.EMPTY_BYTE_ARRAY, HConstants.EMPTY_BYTE_ARRAY, regLoc
              .getHostnamePort().split(Addressing.HOSTNAME_PORT_SEPARATOR)[0], regionSize);
      splits.add(split);
      return splits;
    }
    List<InputSplit> splits = new ArrayList<InputSplit>(keys.getFirst().length);
    for (int i = 0; i < keys.getFirst().length; i++) {
      if (!includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
        continue;
      }
      HRegionLocation location = table.getRegionLocation(keys.getFirst()[i], false);
      // The below InetSocketAddress creation does a name resolution.
      InetSocketAddress isa = new InetSocketAddress(location.getHostname(), location.getPort());
      if (isa.isUnresolved()) {
        LOG.warn("Failed to resolve " + isa);
      }
      InetAddress regionAddress = isa.getAddress();
      String regionLocation;
      try {
        regionLocation = reverseDNS(regionAddress);
      } catch (NamingException e) {
        LOG.warn("Cannot resolve the host name for " + regionAddress + " because of " + e);
        regionLocation = location.getHostname();
      }

      byte[] startRow = scan.getStartRow();
      byte[] stopRow = scan.getStopRow();
      // determine if the given start and stop keys fall into the region
      if ((startRow.length == 0 || keys.getSecond()[i].length == 0 ||
          Bytes.compareTo(startRow, keys.getSecond()[i]) < 0) &&
          (stopRow.length == 0 ||
           Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
        byte[] splitStart = startRow.length == 0 ||
          Bytes.compareTo(keys.getFirst()[i], startRow) >= 0 ?
            keys.getFirst()[i] : startRow;
        byte[] splitStop = (stopRow.length == 0 ||
          Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0) &&
          keys.getSecond()[i].length > 0 ?
            keys.getSecond()[i] : stopRow;

        byte[] regionName = location.getRegionInfo().getRegionName();
        long regionSize = sizeCalculator.getRegionSize(regionName);
        TableSplit split = new TableSplit(table.getName(),
          splitStart, splitStop, regionLocation, regionSize);
        splits.add(split);
        if (LOG.isDebugEnabled()) {
          LOG.debug("getSplits: split -> " + i + " -> " + split);
        }
      }
    }
    return splits;
  }
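
  // Worked example (illustrative): for a table with regions [ , b), [b, d) and [d, )
  // and a scan over [c, e), the loop above yields two splits: [c, d) served by region
  // [b, d), and [d, e) served by region [d, ); one split per overlapping region,
  // while region [ , b) is skipped because it ends before the scan's start row.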

  public String reverseDNS(InetAddress ipAddress) throws NamingException, UnknownHostException {
    String hostName = this.reverseDNSCacheMap.get(ipAddress);
    if (hostName == null) {
      String ipAddressString = null;
      try {
        ipAddressString = DNS.reverseDns(ipAddress, null);
      } catch (Exception e) {
        // We can use InetAddress in case the jndi failed to pull up the reverse DNS entry from the
        // name service. Also, in case of ipv6, we need to use the InetAddress since resolving
        // reverse DNS using jndi doesn't work well with ipv6 addresses.
        ipAddressString = InetAddress.getByName(ipAddress.getHostAddress()).getHostName();
      }
      if (ipAddressString == null) throw new UnknownHostException("No host found for " + ipAddress);
      hostName = Strings.domainNamePointerToHostName(ipAddressString);
      this.reverseDNSCacheMap.put(ipAddress, hostName);
    }
    return hostName;
  }
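
  // Note: thanks to reverseDNSCacheMap above, each distinct region server address is
  // resolved at most once per TableInputFormatBase instance, so getSplits does not
  // repeat reverse lookups for tables whose regions share a small set of hosts.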

  /**
   * Tests whether the given region should be included in the InputSplits while
   * splitting the regions of a table.
   * <p>
   * This optimization is effective when there is a specific reason to exclude an
   * entire region from the M-R job, given its start and end keys: the region then
   * contributes no InputSplit. It is useful, for example, when a job must remember
   * the last-processed key and repeatedly revisit only the [last, current) interval;
   * besides reducing the number of InputSplits, it also reduces the load on the
   * region servers, due to the ordering of the keys.
   * <br>
   * Note that <code>endKey.length == 0</code> is possible for the last (most
   * recent) region.
   * <br>
   * Override this method to bulk-exclude regions from the M-R job (see the sketch
   * after this method). By default no region is excluded, i.e. all regions are
   * included.
   *
   * @param startKey Start key of the region
   * @param endKey End key of the region
   * @return true, if this region needs to be included as part of the input (default).
   */
  protected boolean includeRegionInSplit(final byte[] startKey, final byte[] endKey) {
    return true;
  }
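
  // A minimal override sketch (hypothetical, with a made-up cut-off key): include only
  // regions that may hold rows after the last key processed by a previous run. An empty
  // end key marks the last region and must always be kept.
  //
  //   @Override
  //   protected boolean includeRegionInSplit(final byte[] startKey, final byte[] endKey) {
  //     byte[] cutOff = Bytes.toBytes("lastProcessedKey");  // hypothetical marker
  //     return endKey.length == 0 || Bytes.compareTo(endKey, cutOff) > 0;
  //   }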

  /**
   * Allows subclasses to get the {@link HTable}.
   */
  protected HTable getHTable() {
    return this.table;
  }

  /**
   * Allows subclasses to set the {@link HTable}.
   *
   * @param table  The table to get the data from.
   */
  protected void setHTable(HTable table) {
    this.table = table;
  }

  /**
   * Gets the scan defining the actual details like columns etc.
   *
   * @return The internal scan instance.
   */
  public Scan getScan() {
    if (this.scan == null) this.scan = new Scan();
    return scan;
  }

  /**
   * Sets the scan defining the actual details like columns etc.
   *
   * @param scan  The scan to set.
   */
  public void setScan(Scan scan) {
    this.scan = scan;
  }
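
  // Sketch (illustrative): besides selecting columns, the Scan handed to setScan can
  // bound the key range; getSplits then intersects that range with each region's
  // boundaries. The family and row keys below are hypothetical.
  //
  //   Scan scan = new Scan();
  //   scan.addFamily(Bytes.toBytes("cf"));       // hypothetical column family
  //   scan.setStartRow(Bytes.toBytes("row-0"));  // hypothetical bounds
  //   scan.setStopRow(Bytes.toBytes("row-9"));
  //   setScan(scan);  // e.g. from a subclass's configure/initialize hook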

  /**
   * Allows subclasses to set the {@link TableRecordReader}.
   *
   * @param tableRecordReader A different {@link TableRecordReader}
   *   implementation.
   */
  protected void setTableRecordReader(TableRecordReader tableRecordReader) {
    this.tableRecordReader = tableRecordReader;
  }
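
  // Sketch (hypothetical subclass): a custom reader could extend TableRecordReader to
  // post-process each Result before the mapper sees it, then be injected here, e.g.:
  //
  //   setTableRecordReader(new TableRecordReader() {
  //     // override nextKeyValue()/getCurrentValue() as needed
  //   });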
}