1 /**
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19 package org.apache.hadoop.hbase.mapreduce;
20
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Reducer;
32
33 /**
34 * Combine Puts. Merges Put instances grouped by <code>K</code> into a single
35 * instance.
36 * @see TableMapReduceUtil
37 */
38 @InterfaceAudience.Public
39 @InterfaceStability.Evolving
40 public class PutCombiner<K> extends Reducer<K, Put, K, Put> {
41 private static final Log LOG = LogFactory.getLog(PutCombiner.class);
42
43 @Override
44 protected void reduce(K row, Iterable<Put> vals, Context context)
45 throws IOException, InterruptedException {
46
47 int cnt = 0;
48 // There's nothing to say <code>K row</code> is the same as the rowkey
49 // used to construct Puts (value) instances. Thus the map of put.getRow()
50 // to combined Put is necessary.
51 // TODO: would be better if we knew <code>K row</code> and Put rowkey were
52 // identical. Then this whole Put buffering business goes away.
53 // TODO: Could use HeapSize to create an upper bound on the memory size of
54 // the puts map and flush some portion of the content while looping. This
55 // flush could result in multiple Puts for a single rowkey. That is
56 // acceptable because Combiner is run as an optimization and it's not
57 // critical that all Puts are grouped perfectly.
58 Map<byte[], Put> puts = new TreeMap<byte[], Put>(Bytes.BYTES_COMPARATOR);
59 for (Put p : vals) {
60 cnt++;
61 if (!puts.containsKey(p.getRow())) {
62 puts.put(p.getRow(), p);
63 } else {
64 puts.get(p.getRow()).getFamilyMap().putAll(p.getFamilyMap());
65 }
66 }
67
68 for (Put p : puts.values()) {
69 context.write(row, p);
70 }
71 LOG.info(String.format("Combined %d Put(s) into %d.", cnt, puts.size()));
72 }
73 }