/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import static java.lang.String.format;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
import java.util.UUID;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.HBaseCommonTestingUtility;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.IntegrationTestingUtility;
import org.apache.hadoop.hbase.IntegrationTests;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.KeyValue.Type;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.experimental.categories.Category;

/**
 * Validate ImportTsv + LoadIncrementalHFiles on a distributed cluster.
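 * <p>
 * A minimal example invocation (a sketch; it assumes the standard
 * <code>bin/hbase CLASSNAME</code> launch pattern and accepts only Hadoop
 * generic options, see {@link #run(String[])}):
 * <pre>
 *   hbase org.apache.hadoop.hbase.mapreduce.IntegrationTestImportTsv
 * </pre>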
 */
@Category(IntegrationTests.class)
public class IntegrationTestImportTsv implements Configurable, Tool {

  private static final String NAME = IntegrationTestImportTsv.class.getSimpleName();
  protected static final Log LOG = LogFactory.getLog(IntegrationTestImportTsv.class);

  protected static final String simple_tsv =
      "row1\t1\tc1\tc2\n" +
      "row2\t1\tc1\tc2\n" +
      "row3\t1\tc1\tc2\n" +
      "row4\t1\tc1\tc2\n" +
      "row5\t1\tc1\tc2\n" +
      "row6\t1\tc1\tc2\n" +
      "row7\t1\tc1\tc2\n" +
      "row8\t1\tc1\tc2\n" +
      "row9\t1\tc1\tc2\n" +
      "row10\t1\tc1\tc2\n";

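  // Expected table contents after the bulk load: each line of simple_tsv
  // yields two KeyValues in family "d" (one per column, c1 and c2), keyed by
  // the first field and timestamped with the second.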
  protected static final Set<KeyValue> simple_expected =
      new TreeSet<KeyValue>(KeyValue.COMPARATOR) {
    private static final long serialVersionUID = 1L;
    {
      byte[] family = Bytes.toBytes("d");
      for (String line : simple_tsv.split("\n")) {
        String[] row = line.split("\t");
        byte[] key = Bytes.toBytes(row[0]);
        long ts = Long.parseLong(row[1]);
        byte[][] fields = { Bytes.toBytes(row[2]), Bytes.toBytes(row[3]) };
        add(new KeyValue(key, family, fields[0], ts, Type.Put, fields[0]));
        add(new KeyValue(key, family, fields[1], ts, Type.Put, fields[1]));
      }
    }
  };

  // this instance is initialized on first access when the test is run from
  // JUnit/Maven or by main when run from the CLI.
  protected static IntegrationTestingUtility util = null;

  public Configuration getConf() {
    return util.getConfiguration();
  }

  public void setConf(Configuration conf) {
    throw new IllegalArgumentException("setConf not supported");
  }

  @BeforeClass
  public static void provisionCluster() throws Exception {
    if (null == util) {
      util = new IntegrationTestingUtility();
    }
    util.initializeCluster(1);
  }

  @AfterClass
  public static void releaseCluster() throws Exception {
    util.restoreCluster();
    util = null;
  }

  /**
   * Load the HFiles under <code>hfiles</code> into <code>tableName</code> and
   * verify the resulting table contents match <code>simple_expected</code>.
   */
  protected void doLoadIncrementalHFiles(Path hfiles, String tableName)
      throws Exception {

    String[] args = { hfiles.toString(), tableName };
    LOG.info(format("Running LoadIncrementalHFiles with args: %s", Arrays.asList(args)));
    assertEquals("Loading HFiles failed.",
      0, ToolRunner.run(new LoadIncrementalHFiles(new Configuration(getConf())), args));

    HTable table = null;
    Scan scan = new Scan() {{
      setCacheBlocks(false);
      setCaching(1000);
    }};
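    // Scan results and simple_expected both iterate in KeyValue.COMPARATOR
    // order, so a single pairwise pass can compare them.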
    try {
      table = new HTable(getConf(), tableName);
      Iterator<Result> resultsIt = table.getScanner(scan).iterator();
      Iterator<KeyValue> expectedIt = simple_expected.iterator();
      while (resultsIt.hasNext() && expectedIt.hasNext()) {
        Result r = resultsIt.next();
        for (Cell actual : r.rawCells()) {
          assertTrue(
            "Ran out of expected values prematurely!",
            expectedIt.hasNext());
          KeyValue expected = expectedIt.next();
          assertTrue(
            format("Scan produced surprising result. expected: <%s>, actual: %s",
              expected, actual),
            KeyValue.COMPARATOR.compare(expected, actual) == 0);
        }
      }
      assertFalse("Did not consume all expected values.", expectedIt.hasNext());
      assertFalse("Did not consume all scan results.", resultsIt.hasNext());
    } finally {
      if (null != table) table.close();
    }
  }

  /**
   * Confirm the absence of the {@link TotalOrderPartitioner} partitions file.
   */
  protected static void validateDeletedPartitionsFile(Configuration conf) throws IOException {
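    // This check is only meaningful when running against a distributed cluster.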
    if (!conf.getBoolean(IntegrationTestingUtility.IS_DISTRIBUTED_CLUSTER, false))
      return;

    FileSystem fs = FileSystem.get(conf);
    Path partitionsFile = new Path(TotalOrderPartitioner.getPartitionFile(conf));
    assertFalse("Failed to clean up partitions file.", fs.exists(partitionsFile));
  }

  @Test
  public void testGenerateAndLoad() throws Exception {
    LOG.info("Running test testGenerateAndLoad.");
    String table = NAME + "-" + UUID.randomUUID();
    String cf = "d";
    Path hfiles = new Path(util.getDataTestDirOnTestFS(table), "hfiles");

    String[] args = {
        format("-D%s=%s", ImportTsv.BULK_OUTPUT_CONF_KEY, hfiles),
        format("-D%s=HBASE_ROW_KEY,HBASE_TS_KEY,%s:c1,%s:c2",
          ImportTsv.COLUMNS_CONF_KEY, cf, cf),
        // configure the test harness NOT to delete the HFiles after they're
        // generated; we need them for doLoadIncrementalHFiles.
        format("-D%s=false", TestImportTsv.DELETE_AFTER_LOAD_CONF),
        table
    };

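    // ImportTsv runs in bulk-output mode here (BULK_OUTPUT_CONF_KEY), writing
    // HFiles under 'hfiles' rather than writing directly into the table.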
    // run the job, complete the load.
    util.createTable(table, cf);
    Tool t = TestImportTsv.doMROnTableTest(util, cf, simple_tsv, args);
    doLoadIncrementalHFiles(hfiles, table);

    // validate post-conditions
    validateDeletedPartitionsFile(t.getConf());

    // clean up after ourselves.
    util.deleteTable(table);
    util.cleanupDataTestDirOnTestFS(table);
    LOG.info("testGenerateAndLoad completed successfully.");
  }

  //
  // helper classes used in the following test.
  //

  /**
   * A {@link FileOutputCommitter} that launches an ImportTsv job through
   * its {@link #commitJob(JobContext)} method.
   */
  private static class JobLaunchingOutputCommitter extends FileOutputCommitter {

    public JobLaunchingOutputCommitter(Path outputPath, TaskAttemptContext context)
        throws IOException {
      super(outputPath, context);
    }

    @Override
    public void commitJob(JobContext context) throws IOException {
      super.commitJob(context);

      // inherit the jar dependencies the parent job added to the distributed cache.
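      // The child job can then reference archives the parent has already
      // shipped, which is exactly the classpath scenario this test exercises.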
      Configuration conf = HBaseConfiguration.create(context.getConfiguration());
      conf.set("mapred.job.classpath.archives",
        context.getConfiguration().get("mapred.job.classpath.archives", ""));
      conf.set("mapreduce.job.cache.archives.visibilities",
        context.getConfiguration().get("mapreduce.job.cache.archives.visibilities", ""));

      // can't use the IntegrationTest's instance of util because it hasn't
      // been instantiated in the JVM running this method. Create our own.
      IntegrationTestingUtility util =
          new IntegrationTestingUtility(conf);

      // this is why we're here: launch a child job. The rest of this should
      // look a lot like TestImportTsv#testMROnTable.
      final String table = format("%s-%s-child", NAME, context.getJobID());
      final String cf = "FAM";

      String[] args = {
          "-D" + ImportTsv.COLUMNS_CONF_KEY + "=HBASE_ROW_KEY,FAM:A,FAM:B",
          "-D" + ImportTsv.SEPARATOR_CONF_KEY + "=\u001b",
          table
      };

      try {
        util.createTable(table, cf);
        LOG.info("testRunFromOutputCommitter: launching child job.");
        TestImportTsv.doMROnTableTest(util, cf, null, args, 1);
      } catch (Exception e) {
        throw new IOException("Underlying MapReduce job failed. Aborting commit.", e);
      } finally {
        util.deleteTable(table);
      }
    }
  }

  /**
   * An {@link OutputFormat} that exposes the <code>JobLaunchingOutputCommitter</code>.
   */
  public static class JobLaunchingOutputFormat extends FileOutputFormat<LongWritable, Text> {

    private OutputCommitter committer = null;

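    // The RecordWriter below discards all output; this format exists only to
    // expose the committer, whose commitJob launches the child ImportTsv job.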
    @Override
    public RecordWriter<LongWritable, Text> getRecordWriter(TaskAttemptContext job)
        throws IOException, InterruptedException {
      return new RecordWriter<LongWritable, Text>() {
        @Override
        public void write(LongWritable key, Text value) throws IOException,
            InterruptedException {
          /* do nothing */
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException,
            InterruptedException {
          /* do nothing */
        }
      };
    }

    @Override
    public synchronized OutputCommitter getOutputCommitter(TaskAttemptContext context)
        throws IOException {
      if (committer == null) {
        Path output = getOutputPath(context);
        LOG.debug("Using JobLaunchingOutputCommitter.");
        committer = new JobLaunchingOutputCommitter(output, context);
      }
      return committer;
    }
  }

  /**
   * Add classes necessary for integration-test jobs.
   */
  public static void addTestDependencyJars(Configuration conf) throws IOException {
    TableMapReduceUtil.addDependencyJars(conf,
      org.apache.hadoop.hbase.BaseConfigurable.class, // hbase-server
      HBaseTestingUtility.class,                      // hbase-server-test
      HBaseCommonTestingUtility.class,                // hbase-common-test
      com.google.common.collect.ListMultimap.class,   // Guava
      org.cloudera.htrace.Trace.class);               // HTrace
  }

  /**
   * {@link TableMapReduceUtil#addDependencyJars(Job)} is used when
   * configuring a mapreduce job to ensure dependencies of the job are shipped
   * to the cluster. Sometimes those dependencies are on the classpath, but not
   * packaged as a jar, for instance, when run at the end of another mapreduce
   * job. In that case, dependency jars have already been shipped to the cluster
   * and expanded in the parent job's run folder. This test validates that the
   * child job's classpath is constructed correctly in that scenario.
   */
  @Test
  public void testRunFromOutputCommitter() throws Exception {
    LOG.info("Running test testRunFromOutputCommitter.");

    FileSystem fs = FileSystem.get(getConf());
    Path inputPath = new Path(util.getDataTestDirOnTestFS("parent"), "input.txt");
    Path outputPath = new Path(util.getDataTestDirOnTestFS("parent"), "output");
    FSDataOutputStream fout = null;
    try {
      fout = fs.create(inputPath, true);
      fout.write(Bytes.toBytes("testRunFromOutputCommitter\n"));
      LOG.debug(format("Wrote test data to file: %s", inputPath));
    } finally {
      if (fout != null) {
        fout.close();
      }
    }

    // create a parent job that ships the HBase dependencies. This mirrors
    // the expected calling context.
    Job job = new Job(getConf(), NAME + ".testRunFromOutputCommitter - parent");
    job.setJarByClass(IntegrationTestImportTsv.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(JobLaunchingOutputFormat.class);
    TextInputFormat.addInputPath(job, inputPath);
    JobLaunchingOutputFormat.setOutputPath(job, outputPath);
    TableMapReduceUtil.addDependencyJars(job);
    addTestDependencyJars(job.getConfiguration());

    // Job launched by the OutputCommitter will fail if dependency jars are
    // not shipped properly.
    LOG.info("testRunFromOutputCommitter: launching parent job.");
    assertTrue(job.waitForCompletion(true));
    LOG.info("testRunFromOutputCommitter completed successfully.");
  }

  public int run(String[] args) throws Exception {
    if (args.length != 0) {
      System.err.println(format("%s [genericOptions]", NAME));
      System.err.println("  Runs ImportTsv integration tests against a distributed cluster.");
      System.err.println();
      GenericOptionsParser.printGenericCommandUsage(System.err);
      return 1;
    }

    // adding more test methods? Don't forget to add them here... or consider doing what
    // IntegrationTestsDriver does.
    provisionCluster();
    testGenerateAndLoad();
    testRunFromOutputCommitter();
    releaseCluster();

    return 0;
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    IntegrationTestingUtility.setUseDistributedCluster(conf);
    util = new IntegrationTestingUtility(conf);
    // not using ToolRunner to avoid unnecessary call to setConf()
    args = new GenericOptionsParser(conf, args).getRemainingArgs();
    int status = new IntegrationTestImportTsv().run(args);
    System.exit(status);
  }
}