/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import com.google.protobuf.HBaseZeroCopyByteString;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HDFSBlocksDistribution;
import org.apache.hadoop.hbase.HDFSBlocksDistribution.HostAndWeight;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.ClientSideRegionScanner;
import org.apache.hadoop.hbase.client.IsolationLevel;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos;
import org.apache.hadoop.hbase.protobuf.generated.MapReduceProtos;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
import org.apache.hadoop.hbase.snapshot.RestoreSnapshotHelper;
import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
import org.apache.hadoop.hbase.snapshot.SnapshotReferenceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.FSTableDescriptors;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;

import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.UUID;

/**
 * API-agnostic implementation for mapreduce over table snapshots.
 */
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class TableSnapshotInputFormatImpl {
  // TODO: Snapshot files are owned in fs by the hbase user. There is no
  // easy way to delegate access.

  private static final String SNAPSHOT_NAME_KEY = "hbase.TableSnapshotInputFormat.snapshot.name";
  private static final String TABLE_DIR_KEY = "hbase.TableSnapshotInputFormat.table.dir";

  /** See {@link #getBestLocations(Configuration, HDFSBlocksDistribution)} */
  private static final String LOCALITY_CUTOFF_MULTIPLIER =
    "hbase.tablesnapshotinputformat.locality.cutoff.multiplier";
  private static final float DEFAULT_LOCALITY_CUTOFF_MULTIPLIER = 0.8f;

  /**
   * Implementation class for InputSplit logic common between mapred and mapreduce.
   */
  public static class InputSplit implements Writable {
    private String regionName;
    private String[] locations;

    // constructor for mapreduce framework / Writable
    public InputSplit() { }

    public InputSplit(String regionName, List<String> locations) {
      this.regionName = regionName;
      if (locations == null || locations.isEmpty()) {
        this.locations = new String[0];
      } else {
        this.locations = locations.toArray(new String[locations.size()]);
      }
    }

    public long getLength() {
      // TODO: We can obtain the file sizes of the snapshot here.
      return 0;
    }

    public String[] getLocations() {
      return locations;
    }

    // TODO: We should have ProtobufSerialization in Hadoop, and directly use PB objects instead of
    // doing this wrapping with Writables.
    @Override
    public void write(DataOutput out) throws IOException {
      MapReduceProtos.TableSnapshotRegionSplit.Builder builder =
        MapReduceProtos.TableSnapshotRegionSplit.newBuilder()
          .setRegion(HBaseProtos.RegionSpecifier.newBuilder()
            .setType(HBaseProtos.RegionSpecifier.RegionSpecifierType.ENCODED_REGION_NAME)
            .setValue(HBaseZeroCopyByteString.wrap(Bytes.toBytes(regionName))).build());

      for (String location : locations) {
        builder.addLocations(location);
      }

      MapReduceProtos.TableSnapshotRegionSplit split = builder.build();

      ByteArrayOutputStream baos = new ByteArrayOutputStream();
      split.writeTo(baos);
      baos.close();
      byte[] buf = baos.toByteArray();
      out.writeInt(buf.length);
      out.write(buf);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
      int len = in.readInt();
      byte[] buf = new byte[len];
      in.readFully(buf);
      MapReduceProtos.TableSnapshotRegionSplit split =
        MapReduceProtos.TableSnapshotRegionSplit.PARSER.parseFrom(buf);
      this.regionName = Bytes.toString(split.getRegion().getValue().toByteArray());
      List<String> locationsList = split.getLocationsList();
      this.locations = locationsList.toArray(new String[locationsList.size()]);
    }
  }

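  /*
   * A minimal, hedged sketch of how the Writable wrapping above could be exercised in isolation
   * (not part of the original class): write() emits a length-prefixed protobuf payload and
   * readFields() reverses it. Hadoop's DataOutputBuffer/DataInputBuffer are used here purely for
   * illustration; the encoded region name and hostnames are hypothetical.
   *
   *   InputSplit split = new InputSplit("e4bb8c4cd318b04f9b6e1d0ef4b56a65",
   *       java.util.Arrays.asList("host1.example.com", "host2.example.com"));
   *   org.apache.hadoop.io.DataOutputBuffer out = new org.apache.hadoop.io.DataOutputBuffer();
   *   split.write(out);                        // length-prefixed PB bytes
   *   org.apache.hadoop.io.DataInputBuffer in = new org.apache.hadoop.io.DataInputBuffer();
   *   in.reset(out.getData(), out.getLength());
   *   InputSplit copy = new InputSplit();
   *   copy.readFields(in);                     // regionName and locations are restored
   */
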
  /**
   * Implementation class for RecordReader logic common between mapred and mapreduce.
   */
  public static class RecordReader {
    InputSplit split;
    private Scan scan;
    private Result result = null;
    private ImmutableBytesWritable row = null;
    private ClientSideRegionScanner scanner;

    public ClientSideRegionScanner getScanner() {
      return scanner;
    }

    public void initialize(InputSplit split, Configuration conf) throws IOException {
      this.split = split;
      String regionName = this.split.regionName;
      String snapshotName = getSnapshotName(conf);
      Path rootDir = new Path(conf.get(HConstants.HBASE_DIR));
      FileSystem fs = rootDir.getFileSystem(conf);

      // the user-specified root directory where the snapshot was restored
      Path tmpRootDir = new Path(conf.get(TABLE_DIR_KEY));

      Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshotName, rootDir);

      // load table descriptor
      HTableDescriptor htd = FSTableDescriptors.getTableDescriptorFromFs(fs, snapshotDir);

      // load region descriptor
      Path regionDir = new Path(snapshotDir, regionName);
      HRegionInfo hri = HRegionFileSystem.loadRegionInfoFileContent(fs, regionDir);

      // create scan
      // TODO: mapred does not support a Scan as the input API. Work around it for now.
      if (conf.get(TableInputFormat.SCAN) != null) {
        scan = TableMapReduceUtil.convertStringToScan(conf.get(TableInputFormat.SCAN));
      } else if (conf.get(org.apache.hadoop.hbase.mapred.TableInputFormat.COLUMN_LIST) != null) {
        String[] columns =
          conf.get(org.apache.hadoop.hbase.mapred.TableInputFormat.COLUMN_LIST).split(" ");
        scan = new Scan();
        for (String col : columns) {
          scan.addFamily(Bytes.toBytes(col));
        }
      } else {
        throw new IllegalArgumentException("A Scan is not configured for this job");
      }

      // the restored region is immutable, so READ_UNCOMMITTED is safe here;
      // otherwise we would have to set the thread read point
      scan.setIsolationLevel(IsolationLevel.READ_UNCOMMITTED);
      // disable caching of data blocks
      scan.setCacheBlocks(false);

      scanner = new ClientSideRegionScanner(conf, fs, tmpRootDir, htd, hri, scan, null);
    }

    public boolean nextKeyValue() throws IOException {
      result = scanner.next();
      if (result == null) {
        // we are done
        return false;
      }

      if (this.row == null) {
        this.row = new ImmutableBytesWritable();
      }
      this.row.set(result.getRow());
      return true;
    }

    public ImmutableBytesWritable getCurrentKey() {
      return row;
    }

    public Result getCurrentValue() {
      return result;
    }

    public long getPos() {
      return 0;
    }

    public float getProgress() {
      return 0; // TODO: use total bytes to estimate
    }

    public void close() {
      if (this.scanner != null) {
        this.scanner.close();
      }
    }
  }

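  /*
   * How the RecordReader above expects its Scan to arrive (a hedged illustration, not part of
   * the original class): initialize() first looks for a serialized Scan under
   * TableInputFormat.SCAN and otherwise falls back to the space-separated column-family list
   * used by the mapred API. The family names below ("cf1", "cf2") are hypothetical.
   *
   *   Configuration conf = HBaseConfiguration.create();
   *   // mapred-style fallback: one family per token (families only, no qualifiers)
   *   conf.set(org.apache.hadoop.hbase.mapred.TableInputFormat.COLUMN_LIST, "cf1 cf2");
   *   // ...or, for the mapreduce API, set a serialized Scan under TableInputFormat.SCAN
   *   // (e.g. via TableMapReduceUtil, if the serialization helper is accessible in this version).
   */
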
  public static List<InputSplit> getSplits(Configuration conf) throws IOException {
    String snapshotName = getSnapshotName(conf);

    Path rootDir = new Path(conf.get(HConstants.HBASE_DIR));
    FileSystem fs = rootDir.getFileSystem(conf);

    Path snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshotName, rootDir);
    HBaseProtos.SnapshotDescription snapshotDesc =
      SnapshotDescriptionUtils.readSnapshotInfo(fs, snapshotDir);

    Set<String> snapshotRegionNames =
      SnapshotReferenceUtil.getSnapshotRegionNames(fs, snapshotDir);
    if (snapshotRegionNames == null) {
      throw new IllegalArgumentException("Snapshot seems empty");
    }

    // load table descriptor
    HTableDescriptor htd = FSTableDescriptors.getTableDescriptorFromFs(fs, snapshotDir);

    // TODO: mapred does not support a Scan as the input API. Work around it for now.
    Scan scan = null;
    if (conf.get(TableInputFormat.SCAN) != null) {
      scan = TableMapReduceUtil.convertStringToScan(conf.get(TableInputFormat.SCAN));
    } else if (conf.get(org.apache.hadoop.hbase.mapred.TableInputFormat.COLUMN_LIST) != null) {
      String[] columns =
        conf.get(org.apache.hadoop.hbase.mapred.TableInputFormat.COLUMN_LIST).split(" ");
      scan = new Scan();
      for (String col : columns) {
        scan.addFamily(Bytes.toBytes(col));
      }
    } else {
      throw new IllegalArgumentException("Unable to create scan");
    }
    Path tableDir = new Path(conf.get(TABLE_DIR_KEY));

    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (String regionName : snapshotRegionNames) {
      // load region descriptor
      Path regionDir = new Path(snapshotDir, regionName);
      HRegionInfo hri = HRegionFileSystem.loadRegionInfoFileContent(fs, regionDir);

      if (CellUtil.overlappingKeys(scan.getStartRow(), scan.getStopRow(),
          hri.getStartKey(), hri.getEndKey())) {
        // compute HDFS locations from snapshot files (which will get the locations for
        // referred hfiles)
        List<String> hosts = getBestLocations(conf,
          HRegion.computeHDFSBlocksDistribution(conf, htd, hri, tableDir));

        int len = Math.min(3, hosts.size());
        hosts = hosts.subList(0, len);
        splits.add(new InputSplit(regionName, hosts));
      }
    }

    return splits;
  }

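  /*
   * A hedged sketch of how a caller (for example, one of the mapred/mapreduce wrappers around
   * this class) could drive the pieces above once setInput() has been called; this is for
   * illustration only and mirrors the methods defined in this file.
   *
   *   for (InputSplit split : getSplits(conf)) {
   *     RecordReader reader = new RecordReader();
   *     reader.initialize(split, conf);
   *     try {
   *       while (reader.nextKeyValue()) {
   *         ImmutableBytesWritable key = reader.getCurrentKey();
   *         Result value = reader.getCurrentValue();
   *         // process the row...
   *       }
   *     } finally {
   *       reader.close();
   *     }
   *   }
   */
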
  /**
   * This computes the locations to be passed from the InputSplit. MR/YARN schedulers do not take
   * weights into account, and will therefore treat every location passed from the input split as
   * equal. We do not want to blindly pass all the locations, since we are creating one split per
   * region, and the region's blocks are distributed throughout the cluster unless favored node
   * assignment is used. In the expected stable case, only one location will contain most of the
   * blocks as local. With favored node assignment, on the other hand, three nodes will contain
   * highly local blocks. Here we use a simple heuristic: we pass every host that has at least 80%
   * (hbase.tablesnapshotinputformat.locality.cutoff.multiplier) of the block weight of the host
   * with the best locality.
   */
  public static List<String> getBestLocations(
      Configuration conf, HDFSBlocksDistribution blockDistribution) {
    List<String> locations = new ArrayList<String>(3);

    HostAndWeight[] hostAndWeights = blockDistribution.getTopHostsWithWeights();

    if (hostAndWeights.length == 0) {
      return locations;
    }

    HostAndWeight topHost = hostAndWeights[0];
    locations.add(topHost.getHost());

    // Heuristic: keep every host whose weight is at least cutoffMultiplier times the top host's
    double cutoffMultiplier
      = conf.getFloat(LOCALITY_CUTOFF_MULTIPLIER, DEFAULT_LOCALITY_CUTOFF_MULTIPLIER);

    double filterWeight = topHost.getWeight() * cutoffMultiplier;

    for (int i = 1; i < hostAndWeights.length; i++) {
      if (hostAndWeights[i].getWeight() >= filterWeight) {
        locations.add(hostAndWeights[i].getHost());
      } else {
        break;
      }
    }

    return locations;
  }

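  /*
   * Worked example of the cutoff heuristic above (illustrative numbers only, not part of the
   * original class): with the default multiplier of 0.8 and host weights of 100, 90 and 75, the
   * filter weight is 100 * 0.8 = 80, so the first two hosts are returned and the third
   * (75 < 80) stops the loop.
   *
   *   Configuration conf = HBaseConfiguration.create();
   *   HDFSBlocksDistribution dist = new HDFSBlocksDistribution();
   *   dist.addHostsAndBlockWeight(new String[] { "host1" }, 100);
   *   dist.addHostsAndBlockWeight(new String[] { "host2" }, 90);
   *   dist.addHostsAndBlockWeight(new String[] { "host3" }, 75);
   *   List<String> best = getBestLocations(conf, dist);  // ["host1", "host2"]
   */
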
  private static String getSnapshotName(Configuration conf) {
    String snapshotName = conf.get(SNAPSHOT_NAME_KEY);
    if (snapshotName == null) {
      throw new IllegalArgumentException("Snapshot name must be provided");
    }
    return snapshotName;
  }

  /**
   * Configures the job to use TableSnapshotInputFormat to read from a snapshot.
   * @param conf the job configuration
   * @param snapshotName the name of the snapshot to read from
   * @param restoreDir a temporary directory to restore the snapshot into. The current user should
   * have write permissions to this directory, and it should not be a subdirectory of rootdir.
   * After the job is finished, restoreDir can be deleted.
   * @throws IOException if an error occurs
   */
  public static void setInput(Configuration conf, String snapshotName, Path restoreDir)
      throws IOException {
    conf.set(SNAPSHOT_NAME_KEY, snapshotName);

    Path rootDir = new Path(conf.get(HConstants.HBASE_DIR));
    FileSystem fs = rootDir.getFileSystem(conf);

    restoreDir = new Path(restoreDir, UUID.randomUUID().toString());

    // TODO: restore from record readers to parallelize.
    RestoreSnapshotHelper.copySnapshotForScanner(conf, fs, rootDir, restoreDir, snapshotName);

    conf.set(TABLE_DIR_KEY, restoreDir.toString());
  }
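
  /*
   * A hedged end-to-end usage sketch (not part of the original class): the snapshot is restored
   * into a caller-owned scratch directory outside the HBase root, after which getSplits() and
   * the RecordReader can read the restored files. The snapshot name and restore path below are
   * hypothetical.
   *
   *   Configuration conf = HBaseConfiguration.create();
   *   conf.set(TableInputFormat.SCAN, ...);  // or the mapred COLUMN_LIST, as noted above
   *   TableSnapshotInputFormatImpl.setInput(conf, "my_snapshot", new Path("/tmp/snapshot-restore"));
   *   List<InputSplit> splits = TableSnapshotInputFormatImpl.getSplits(conf);
   */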
}