/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.UUID;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.TableNotFoundException;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Utils.OutputFileUtils.OutputFilesFilter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.experimental.categories.Category;

@Category(LargeTests.class)
public class TestImportTsv implements Configurable {

  protected static final Log LOG = LogFactory.getLog(TestImportTsv.class);
  protected static final String NAME = TestImportTsv.class.getSimpleName();
  protected static HBaseTestingUtility util = new HBaseTestingUtility();

  /**
   * Delete the tmp directory after running doMROnTableTest. Boolean. Default is
   * true.
   */
  protected static final String DELETE_AFTER_LOAD_CONF = NAME + ".deleteAfterLoad";

  /**
   * Force use of combiner in doMROnTableTest. Boolean. Default is true.
   */
  protected static final String FORCE_COMBINER_CONF = NAME + ".forceCombiner";

  private final String FAMILY = "FAM";

  @Override
  public Configuration getConf() {
    return util.getConfiguration();
  }

  @Override
  public void setConf(Configuration conf) {
    throw new IllegalArgumentException("setConf not supported");
  }

  @BeforeClass
  public static void provisionCluster() throws Exception {
    util.startMiniCluster();
    util.startMiniMapReduceCluster();
  }

  @AfterClass
  public static void releaseCluster() throws Exception {
    util.shutdownMiniMapReduceCluster();
    util.shutdownMiniCluster();
  }

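  /**
   * Run ImportTsv against a pre-created table using the default test data and
   * the 0x1b separator, then validate the imported rows.
   */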
  @Test
  public void testMROnTable() throws Exception {
    String table = "test-" + UUID.randomUUID();

    // Prepare the arguments required for the test.
    String[] args = new String[] {
        "-D" + ImportTsv.COLUMNS_CONF_KEY + "=HBASE_ROW_KEY,FAM:A,FAM:B",
        "-D" + ImportTsv.SEPARATOR_CONF_KEY + "=\u001b",
        table
    };

    util.createTable(TableName.valueOf(table), FAMILY);
    doMROnTableTest(util, FAMILY, null, args, 1);
    util.deleteTable(table);
  }

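  /**
   * Import rows whose timestamp is taken from an HBASE_TS_KEY column in the
   * input, using a comma separator.
   */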
  @Test
  public void testMROnTableWithTimestamp() throws Exception {
    String table = "test-" + UUID.randomUUID();

    // Prepare the arguments required for the test.
    String[] args = new String[] {
        "-D" + ImportTsv.COLUMNS_CONF_KEY
            + "=HBASE_ROW_KEY,HBASE_TS_KEY,FAM:A,FAM:B",
        "-D" + ImportTsv.SEPARATOR_CONF_KEY + "=,",
        table
    };
    String data = "KEY,1234,VALUE1,VALUE2\n";

    util.createTable(TableName.valueOf(table), FAMILY);
    doMROnTableTest(util, FAMILY, data, args, 1);
    util.deleteTable(table);
  }

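  /**
   * Import via a custom mapper class specified on the command line; the
   * imported values are validated with a value multiplier of 3.
   */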
  @Test
  public void testMROnTableWithCustomMapper() throws Exception {
    String table = "test-" + UUID.randomUUID();

    // Prepare the arguments required for the test.
    String[] args = new String[] {
        "-D" + ImportTsv.MAPPER_CONF_KEY + "=org.apache.hadoop.hbase.mapreduce.TsvImporterCustomTestMapper",
        table
    };

    util.createTable(TableName.valueOf(table), FAMILY);
    doMROnTableTest(util, FAMILY, null, args, 3);
    util.deleteTable(table);
  }

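  /**
   * Write bulk-load HFiles for a table that does not exist beforehand.
   * Validation checks the generated HFiles rather than table contents.
   */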
  @Test
  public void testBulkOutputWithoutAnExistingTable() throws Exception {
    String table = "test-" + UUID.randomUUID();

    // Prepare the arguments required for the test.
    Path hfiles = new Path(util.getDataTestDirOnTestFS(table), "hfiles");
    String[] args = new String[] {
        "-D" + ImportTsv.COLUMNS_CONF_KEY + "=HBASE_ROW_KEY,FAM:A,FAM:B",
        "-D" + ImportTsv.SEPARATOR_CONF_KEY + "=\u001b",
        "-D" + ImportTsv.BULK_OUTPUT_CONF_KEY + "=" + hfiles.toString(),
        table
    };

    doMROnTableTest(util, FAMILY, null, args, 3);
    util.deleteTable(table);
  }

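  /**
   * Write bulk-load HFiles for a table that already exists. Validation checks
   * the generated HFiles rather than table contents.
   */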
  @Test
  public void testBulkOutputWithAnExistingTable() throws Exception {
    String table = "test-" + UUID.randomUUID();

    // Prepare the arguments required for the test.
    Path hfiles = new Path(util.getDataTestDirOnTestFS(table), "hfiles");
    String[] args = new String[] {
        "-D" + ImportTsv.COLUMNS_CONF_KEY + "=HBASE_ROW_KEY,FAM:A,FAM:B",
        "-D" + ImportTsv.SEPARATOR_CONF_KEY + "=\u001b",
        "-D" + ImportTsv.BULK_OUTPUT_CONF_KEY + "=" + hfiles.toString(),
        table
    };

    util.createTable(TableName.valueOf(table), FAMILY);
    doMROnTableTest(util, FAMILY, null, args, 3);
    util.deleteTable(table);
  }

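  /**
   * Same as the bulk-output case above, but with strict column family checking
   * disabled via NO_STRICT_COL_FAMILY=true.
   */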
  @Test
  public void testBulkOutputWithAnExistingTableNoStrictTrue() throws Exception {
    String table = "test-" + UUID.randomUUID();
    // Prepare the arguments required for the test.
    Path hfiles = new Path(util.getDataTestDirOnTestFS(table), "hfiles");
    String[] args = new String[] {
        "-D" + ImportTsv.COLUMNS_CONF_KEY + "=HBASE_ROW_KEY,FAM:A,FAM:B",
        "-D" + ImportTsv.SEPARATOR_CONF_KEY + "=\u001b",
        "-D" + ImportTsv.BULK_OUTPUT_CONF_KEY + "=" + hfiles.toString(),
        "-D" + ImportTsv.NO_STRICT_COL_FAMILY + "=true",
        table
    };
    util.createTable(TableName.valueOf(table), FAMILY);
    doMROnTableTest(util, FAMILY, null, args, 3);
    util.deleteTable(table);
  }

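  /**
   * Verify that requesting TsvImporterTextMapper wires up the expected mapper,
   * reducer, and map output value class on the job. The job is only
   * configured, never run.
   */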
  @Test
  public void testJobConfigurationsWithTsvImporterTextMapper() throws Exception {
    String table = "test-" + UUID.randomUUID();
    Path bulkOutputPath = new Path(util.getDataTestDirOnTestFS(table), "hfiles");
    String inputFile = "InputFile1.csv";
    // Prepare the arguments required for the test.
    String[] args = new String[] {
        "-D" + ImportTsv.MAPPER_CONF_KEY
            + "=org.apache.hadoop.hbase.mapreduce.TsvImporterTextMapper",
        "-D" + ImportTsv.COLUMNS_CONF_KEY
            + "=HBASE_ROW_KEY,FAM:A,FAM:B",
        "-D" + ImportTsv.SEPARATOR_CONF_KEY + "=,",
        "-D" + ImportTsv.BULK_OUTPUT_CONF_KEY + "=" + bulkOutputPath.toString(),
        table,
        inputFile
    };
    GenericOptionsParser opts = new GenericOptionsParser(util.getConfiguration(), args);
    args = opts.getRemainingArgs();
    Job job = ImportTsv.createSubmittableJob(util.getConfiguration(), args);
    assertEquals(TsvImporterTextMapper.class, job.getMapperClass());
    assertEquals(TextSortReducer.class, job.getReducerClass());
    assertEquals(Text.class, job.getMapOutputValueClass());
  }

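  /**
   * Run a full bulk-output import using TsvImporterTextMapper and validate the
   * generated HFiles.
   */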
  @Test
  public void testBulkOutputWithTsvImporterTextMapper() throws Exception {
    String table = "test-" + UUID.randomUUID();
    Path bulkOutputPath = new Path(util.getDataTestDirOnTestFS(table), "hfiles");
    // Prepare the arguments required for the test.
    String[] args = new String[] {
        "-D" + ImportTsv.MAPPER_CONF_KEY
            + "=org.apache.hadoop.hbase.mapreduce.TsvImporterTextMapper",
        "-D" + ImportTsv.COLUMNS_CONF_KEY
            + "=HBASE_ROW_KEY,FAM:A,FAM:B",
        "-D" + ImportTsv.SEPARATOR_CONF_KEY + "=\u001b",
        "-D" + ImportTsv.BULK_OUTPUT_CONF_KEY + "=" + bulkOutputPath.toString(),
        table
    };
    String data = "KEY\u001bVALUE4\u001bVALUE8\n";
    doMROnTableTest(util, FAMILY, data, args, 4);
  }

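  /**
   * With CREATE_TABLE set to "no", configuring a job against a missing table
   * must fail with TableNotFoundException.
   */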
  @Test(expected = TableNotFoundException.class)
  public void testWithoutAnExistingTableAndCreateTableSetToNo() throws Exception {
    String table = "test-" + UUID.randomUUID();
    String[] args = new String[] { table, "/inputFile" };

    Configuration conf = new Configuration(util.getConfiguration());
    conf.set(ImportTsv.COLUMNS_CONF_KEY, "HBASE_ROW_KEY,FAM:A");
    conf.set(ImportTsv.BULK_OUTPUT_CONF_KEY, "/output");
    conf.set(ImportTsv.CREATE_TABLE_CONF_KEY, "no");
    ImportTsv.createSubmittableJob(conf, args);
  }

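  /**
   * Without bulk output configured, configuring a job against a missing table
   * must fail with TableNotFoundException.
   */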
  @Test(expected = TableNotFoundException.class)
  public void testMRWithoutAnExistingTable() throws Exception {
    String table = "test-" + UUID.randomUUID();
    String[] args = new String[] { table, "/inputFile" };

    Configuration conf = new Configuration(util.getConfiguration());
    ImportTsv.createSubmittableJob(conf, args);
  }

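  /**
   * Convenience overload of
   * {@link #doMROnTableTest(HBaseTestingUtility, String, String, String[], int)}
   * with a valueMultiplier of 1.
   */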
  protected static Tool doMROnTableTest(HBaseTestingUtility util, String family,
      String data, String[] args) throws Exception {
    return doMROnTableTest(util, family, data, args, 1);
  }

  /**
   * Run an ImportTsv job and perform basic validation on the results.
   * Returns the ImportTsv <code>Tool</code> instance so that other tests can
   * inspect it for further validation as necessary. This method is static to
   * ensure non-reliance on instance's util/conf facilities.
   * @param args Any arguments to pass BEFORE inputFile path is appended.
   * @return The Tool instance used to run the test.
   */
  protected static Tool doMROnTableTest(HBaseTestingUtility util, String family,
      String data, String[] args, int valueMultiplier) throws Exception {
    String table = args[args.length - 1];
    Configuration conf = new Configuration(util.getConfiguration());

    // populate input file
    FileSystem fs = FileSystem.get(conf);
    Path inputPath = fs.makeQualified(new Path(util.getDataTestDirOnTestFS(table), "input.dat"));
    FSDataOutputStream op = fs.create(inputPath, true);
    if (data == null) {
      data = "KEY\u001bVALUE1\u001bVALUE2\n";
    }
    op.write(Bytes.toBytes(data));
    op.close();
    LOG.debug(String.format("Wrote test data to file: %s", inputPath));

    if (conf.getBoolean(FORCE_COMBINER_CONF, true)) {
      LOG.debug("Forcing combiner.");
      conf.setInt("mapreduce.map.combine.minspills", 1);
    }

    // run the import
    List<String> argv = new ArrayList<String>(Arrays.asList(args));
    argv.add(inputPath.toString());
    Tool tool = new ImportTsv();
    LOG.debug("Running ImportTsv with arguments: " + argv);
    assertEquals(0, ToolRunner.run(conf, tool, argv.toArray(new String[argv.size()])));

    // Perform basic validation. If the input args did not include
    // ImportTsv.BULK_OUTPUT_CONF_KEY then validate data in the table.
    // Otherwise, validate presence of hfiles.
    boolean createdHFiles = false;
    String outputPath = null;
    for (String arg : argv) {
      if (arg.contains(ImportTsv.BULK_OUTPUT_CONF_KEY)) {
        createdHFiles = true;
        // split '-Dfoo=bar' on '=' and keep 'bar'
        outputPath = arg.split("=")[1];
        break;
      }
    }

    if (createdHFiles) {
      validateHFiles(fs, outputPath, family);
    } else {
      validateTable(conf, TableName.valueOf(table), family, valueMultiplier);
    }

    if (conf.getBoolean(DELETE_AFTER_LOAD_CONF, true)) {
      LOG.debug("Deleting test subdirectory");
      util.cleanupDataTestDirOnTestFS(table);
    }
    return tool;
  }

  /**
   * Confirm ImportTsv by verifying the data that was imported into the online
   * table.
   */
  private static void validateTable(Configuration conf, TableName tableName,
      String family, int valueMultiplier) throws IOException {

    LOG.debug("Validating table.");
    Table table = new HTable(conf, tableName);
    boolean verified = false;
    long pause = conf.getLong("hbase.client.pause", 5 * 1000);
    int numRetries = conf.getInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 5);
    for (int i = 0; i < numRetries; i++) {
      try {
        Scan scan = new Scan();
        // Scan entire family.
        scan.addFamily(Bytes.toBytes(family));
        ResultScanner resScanner = table.getScanner(scan);
        for (Result res : resScanner) {
          assertEquals(2, res.size());
          List<Cell> kvs = res.listCells();
          assertTrue(CellUtil.matchingRow(kvs.get(0), Bytes.toBytes("KEY")));
          assertTrue(CellUtil.matchingRow(kvs.get(1), Bytes.toBytes("KEY")));
          assertTrue(CellUtil.matchingValue(kvs.get(0), Bytes.toBytes("VALUE" + valueMultiplier)));
          assertTrue(CellUtil.matchingValue(kvs.get(1), Bytes.toBytes("VALUE" + 2 * valueMultiplier)));
          // Only one result set is expected, so let it loop.
        }
        verified = true;
        break;
      } catch (NullPointerException e) {
        // If here, a cell was empty. Presume it's because updates came in
        // after the scanner had been opened. Wait a while and retry.
      }
      try {
        Thread.sleep(pause);
      } catch (InterruptedException e) {
        // continue
      }
    }
    table.close();
    assertTrue(verified);
  }

  /**
   * Confirm ImportTsv by checking the HFiles written to the filesystem.
   */
  private static void validateHFiles(FileSystem fs, String outputPath, String family)
      throws IOException {

    // validate number and content of output columns
    LOG.debug("Validating HFiles.");
    Set<String> configFamilies = new HashSet<String>();
    configFamilies.add(family);
    Set<String> foundFamilies = new HashSet<String>();
    for (FileStatus cfStatus : fs.listStatus(new Path(outputPath), new OutputFilesFilter())) {
      String[] elements = cfStatus.getPath().toString().split(Path.SEPARATOR);
      String cf = elements[elements.length - 1];
      foundFamilies.add(cf);
      assertTrue(
        String.format(
          "HFile output contains a column family (%s) not present in input families (%s)",
          cf, configFamilies),
        configFamilies.contains(cf));
      for (FileStatus hfile : fs.listStatus(cfStatus.getPath())) {
        assertTrue(
          String.format("HFile %s appears to contain no data.", hfile.getPath()),
          hfile.getLen() > 0);
      }
    }
  }
}