View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.mapreduce;
20  
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Reducer;
32  
33  /**
34   * Combine Puts. Merges Put instances grouped by <code>K</code> into a single
35   * instance.
36   * @see TableMapReduceUtil
37   */
38  @InterfaceAudience.Public
39  @InterfaceStability.Evolving
40  public class PutCombiner<K> extends Reducer<K, Put, K, Put> {
41    private static final Log LOG = LogFactory.getLog(PutCombiner.class);
42  
43    @Override
44    protected void reduce(K row, Iterable<Put> vals, Context context)
45        throws IOException, InterruptedException {
46  
47      int cnt = 0;
48      // There's nothing to say <code>K row</code> is the same as the rowkey
49      // used to construct Puts (value) instances. Thus the map of put.getRow()
50      // to combined Put is necessary.
51      // TODO: would be better if we knew <code>K row</code> and Put rowkey were
52      // identical. Then this whole Put buffering business goes away.
53      // TODO: Could use HeapSize to create an upper bound on the memory size of
54      // the puts map and flush some portion of the content while looping. This
55      // flush could result in multiple Puts for a single rowkey. That is
56      // acceptable because Combiner is run as an optimization and it's not
57      // critical that all Puts are grouped perfectly.
58      Map<byte[], Put> puts = new TreeMap<byte[], Put>(Bytes.BYTES_COMPARATOR);
59      for (Put p : vals) {
60        cnt++;
61        if (!puts.containsKey(p.getRow())) {
62          puts.put(p.getRow(), p);
63        } else {
64          puts.get(p.getRow()).getFamilyMap().putAll(p.getFamilyMap());
65        }
66      }
67  
68      for (Put p : puts.values()) {
69        context.write(row, p);
70      }
71      LOG.info(String.format("Combined %d Put(s) into %d.", cnt, puts.size()));
72    }
73  }