1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.hadoop.hbase.mapreduce;
19
20 import java.io.IOException;
21 import java.util.ArrayList;
22 import java.util.List;
23
24 import org.apache.commons.logging.Log;
25 import org.apache.commons.logging.LogFactory;
26 import org.apache.hadoop.classification.InterfaceAudience;
27 import org.apache.hadoop.classification.InterfaceStability;
28 import org.apache.hadoop.hbase.client.HTable;
29 import org.apache.hadoop.hbase.client.Result;
30 import org.apache.hadoop.hbase.client.Scan;
31 import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
32 import org.apache.hadoop.hbase.util.Bytes;
33 import org.apache.hadoop.hbase.util.Pair;
34 import org.apache.hadoop.mapreduce.InputFormat;
35 import org.apache.hadoop.mapreduce.InputSplit;
36 import org.apache.hadoop.mapreduce.JobContext;
37 import org.apache.hadoop.mapreduce.RecordReader;
38 import org.apache.hadoop.mapreduce.TaskAttemptContext;
39
40 import java.util.Map;
41 import java.util.HashMap;
42 import java.util.Iterator;
43
44
45
46
47
48 @InterfaceAudience.Public
49 @InterfaceStability.Evolving
50 public abstract class MultiTableInputFormatBase extends
51 InputFormat<ImmutableBytesWritable, Result> {
52
53 final Log LOG = LogFactory.getLog(MultiTableInputFormatBase.class);
54
55
56 private List<Scan> scans;
57
58
59 private TableRecordReader tableRecordReader = null;
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74 @Override
75 public RecordReader<ImmutableBytesWritable, Result> createRecordReader(
76 InputSplit split, TaskAttemptContext context)
77 throws IOException, InterruptedException {
78 TableSplit tSplit = (TableSplit) split;
79
80 if (tSplit.getTableName() == null) {
81 throw new IOException("Cannot create a record reader because of a"
82 + " previous error. Please look at the previous logs lines from"
83 + " the task's full log for more details.");
84 }
85 HTable table =
86 new HTable(context.getConfiguration(), tSplit.getTableName());
87
88 TableRecordReader trr = this.tableRecordReader;
89
90 if (trr == null) {
91 trr = new TableRecordReader();
92 }
93 Scan sc = tSplit.getScan();
94 sc.setStartRow(tSplit.getStartRow());
95 sc.setStopRow(tSplit.getEndRow());
96 trr.setScan(sc);
97 trr.setHTable(table);
98 trr.initialize(split, context);
99 return trr;
100 }
101
102
103
104
105
106
107
108
109
110
111
112
113 @Override
114 public List<InputSplit> getSplits(JobContext context) throws IOException {
115 if (scans.isEmpty()) {
116 throw new IOException("No scans were provided.");
117 }
118
119 Map<String, List<Scan>> tableMaps = new HashMap<String, List<Scan>>();
120 for (Scan scan : scans) {
121 byte[] tableName = scan.getAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME);
122 if (tableName == null)
123 throw new IOException("A scan object did not have a table name");
124 String tableNameStr = Bytes.toString(tableName);
125
126 List<Scan> scanList = tableMaps.get(tableNameStr);
127
128 if (scanList == null) {
129 scanList = new ArrayList<Scan>();
130 tableMaps.put(tableNameStr, scanList);
131 }
132 scanList.add(scan);
133 }
134
135 List<InputSplit> splits = new ArrayList<InputSplit>();
136 Iterator iter = tableMaps.entrySet().iterator();
137 while (iter.hasNext()) {
138 Map.Entry<String, List<Scan>> entry = (Map.Entry<String, List<Scan>>) iter.next();
139 String tableNameStr = entry.getKey();
140 List<Scan> scanList = entry.getValue();
141 HTable table = new HTable(context.getConfiguration(), tableNameStr);
142 Pair<byte[][], byte[][]> keys = table.getStartEndKeys();
143 for (Scan scan : scanList) {
144 if (keys == null || keys.getFirst() == null ||
145 keys.getFirst().length == 0) {
146 throw new IOException("Expecting at least one region for table : "
147 + tableNameStr);
148 }
149 int count = 0;
150 byte[] startRow = scan.getStartRow();
151 byte[] stopRow = scan.getStopRow();
152 for (int i = 0; i < keys.getFirst().length; i++) {
153 if (!includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
154 continue;
155 }
156
157
158 if ((startRow.length == 0 || keys.getSecond()[i].length == 0 ||
159 Bytes.compareTo(startRow, keys.getSecond()[i]) < 0) &&
160 (stopRow.length == 0 || Bytes.compareTo(stopRow,
161 keys.getFirst()[i]) > 0)) {
162 byte[] splitStart = startRow.length == 0 ||
163 Bytes.compareTo(keys.getFirst()[i],
164 startRow) >= 0 ? keys.getFirst()[i] : startRow;
165 byte[] splitStop = (stopRow.length == 0 ||
166 Bytes.compareTo(keys.getSecond()[i],
167 stopRow) <= 0) && keys.getSecond()[i].length > 0 ?
168 keys.getSecond()[i] : stopRow;
169 String regionLocation = table.getRegionLocation(
170 keys.getFirst()[i], false).getHostname();
171 InputSplit split = new TableSplit(Bytes.toBytes(tableNameStr), scan,
172 splitStart, splitStop, regionLocation);
173 splits.add(split);
174 if (LOG.isDebugEnabled())
175 LOG.debug("getSplits: split -> " + (count++) + " -> " + split);
176 }
177 }
178 }
179 table.close();
180 }
181
182 return splits;
183 }
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207 protected boolean includeRegionInSplit(final byte[] startKey,
208 final byte[] endKey) {
209 return true;
210 }
211
212
213
214
215 protected List<Scan> getScans() {
216 return this.scans;
217 }
218
219
220
221
222
223
224 protected void setScans(List<Scan> scans) {
225 this.scans = scans;
226 }
227
228
229
230
231
232
233
234 protected void setTableRecordReader(TableRecordReader tableRecordReader) {
235 this.tableRecordReader = tableRecordReader;
236 }
237 }