org.apache.blur.mapreduce.lib
Class BlurOutputFormat

java.lang.Object
  extended by org.apache.hadoop.mapreduce.OutputFormat<org.apache.hadoop.io.Text,BlurMutate>
      extended by org.apache.blur.mapreduce.lib.BlurOutputFormat

public class BlurOutputFormat
extends org.apache.hadoop.mapreduce.OutputFormat<org.apache.hadoop.io.Text,BlurMutate>

BlurOutputFormat is used to index data and deliver the indexes to the proper Blur table for searching. A typical usage of this class would be as follows.


Blur.Iface client = BlurClient.getClient("controller1:40010");

TableDescriptor tableDescriptor = client.describe(tableName);

Job job = new Job(jobConf, "blur index");
job.setJarByClass(BlurOutputFormatTest.class);
job.setMapperClass(CsvBlurMapper.class);
job.setInputFormatClass(TextInputFormat.class);

FileInputFormat.addInputPath(job, new Path(input));
CsvBlurMapper.addColumns(job, "cf1", "col");

BlurOutputFormat.setupJob(job, tableDescriptor);
BlurOutputFormat.setIndexLocally(job, true);
BlurOutputFormat.setOptimizeInFlight(job, false);

job.waitForCompletion(true);


Field Summary
static String BLUR_OUTPUT_DOCUMENT_BUFFER_STRATEGY
           
static String BLUR_OUTPUT_INDEXLOCALLY
           
static String BLUR_OUTPUT_MAX_DOCUMENT_BUFFER_HEAP_SIZE
           
static String BLUR_OUTPUT_MAX_DOCUMENT_BUFFER_SIZE
           
static String BLUR_OUTPUT_OPTIMIZEINFLIGHT
           
static String BLUR_OUTPUT_PATH
           
static String BLUR_OUTPUT_REDUCER_MULTIPLIER
           
static String BLUR_TABLE_DESCRIPTOR
           
 
Constructor Summary
BlurOutputFormat()
           
 
Method Summary
 void checkOutputSpecs(org.apache.hadoop.mapreduce.JobContext context)
           
static DocumentBufferStrategy getDocumentBufferStrategy(org.apache.hadoop.conf.Configuration configuration)
           
static GetCounter getGetCounter()
           
static int getMaxDocumentBufferHeapSize(org.apache.hadoop.conf.Configuration configuration)
           
static int getMaxDocumentBufferSize(org.apache.hadoop.conf.Configuration configuration)
           
 org.apache.hadoop.mapreduce.OutputCommitter getOutputCommitter(org.apache.hadoop.mapreduce.TaskAttemptContext context)
           
static org.apache.hadoop.fs.Path getOutputPath(org.apache.hadoop.conf.Configuration configuration)
           
static org.apache.hadoop.util.Progressable getProgressable()
           
 org.apache.hadoop.mapreduce.RecordWriter<org.apache.hadoop.io.Text,BlurMutate> getRecordWriter(org.apache.hadoop.mapreduce.TaskAttemptContext context)
           
static int getReducerMultiplier(org.apache.hadoop.conf.Configuration configuration)
           
static org.apache.blur.thrift.generated.TableDescriptor getTableDescriptor(org.apache.hadoop.conf.Configuration configuration)
           
static boolean isIndexLocally(org.apache.hadoop.conf.Configuration configuration)
           
static boolean isOptimizeInFlight(org.apache.hadoop.conf.Configuration configuration)
           
static void setDocumentBufferStrategy(org.apache.hadoop.conf.Configuration configuration, Class<? extends DocumentBufferStrategy> documentBufferStrategyClass)
           
static void setDocumentBufferStrategy(org.apache.hadoop.mapreduce.Job job, Class<? extends DocumentBufferStrategy> documentBufferStrategyClass)
           
static void setGetCounter(GetCounter getCounter)
           
static void setIndexLocally(org.apache.hadoop.conf.Configuration configuration, boolean b)
          Enabled by default, this will enable local indexing on the machine where the task is running.
static void setIndexLocally(org.apache.hadoop.mapreduce.Job job, boolean b)
          Enabled by default, this will enable local indexing on the machine where the task is running.
static void setMaxDocumentBufferHeapSize(org.apache.hadoop.conf.Configuration configuration, int maxDocumentBufferHeapSize)
           
static void setMaxDocumentBufferHeapSize(org.apache.hadoop.mapreduce.Job job, int maxDocumentBufferHeapSize)
           
static void setMaxDocumentBufferSize(org.apache.hadoop.conf.Configuration configuration, int maxDocumentBufferSize)
          Sets the maximum number of documents that the buffer will hold in memory before overflowing to disk.
static void setMaxDocumentBufferSize(org.apache.hadoop.mapreduce.Job job, int maxDocumentBufferSize)
          Sets the maximum number of documents that the buffer will hold in memory before overflowing to disk.
static void setOptimizeInFlight(org.apache.hadoop.conf.Configuration configuration, boolean b)
          Enabled by default, this will optimize the index while copying from the local index to the remote destination in HDFS.
static void setOptimizeInFlight(org.apache.hadoop.mapreduce.Job job, boolean b)
          Enabled by default, this will optimize the index while copying from the local index to the remote destination in HDFS.
static void setOutputPath(org.apache.hadoop.conf.Configuration configuration, org.apache.hadoop.fs.Path path)
           
static void setOutputPath(org.apache.hadoop.mapreduce.Job job, org.apache.hadoop.fs.Path path)
           
static void setProgressable(org.apache.hadoop.util.Progressable progressable)
           
static void setReducerMultiplier(org.apache.hadoop.mapreduce.Job job, int multiple)
          This will multiply the number of reducers for this job.
static void setTableDescriptor(org.apache.hadoop.conf.Configuration configuration, org.apache.blur.thrift.generated.TableDescriptor tableDescriptor)
          Sets the TableDescriptor for this job.
static void setTableDescriptor(org.apache.hadoop.mapreduce.Job job, org.apache.blur.thrift.generated.TableDescriptor tableDescriptor)
          Sets the TableDescriptor for this job.
static void setupJob(org.apache.hadoop.mapreduce.Job job, org.apache.blur.thrift.generated.TableDescriptor tableDescriptor)
          Sets up the output portion of the map reduce job.
 
Methods inherited from class java.lang.Object
equals, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

BLUR_OUTPUT_REDUCER_MULTIPLIER

public static final String BLUR_OUTPUT_REDUCER_MULTIPLIER
See Also:
Constant Field Values

BLUR_OUTPUT_OPTIMIZEINFLIGHT

public static final String BLUR_OUTPUT_OPTIMIZEINFLIGHT
See Also:
Constant Field Values

BLUR_OUTPUT_INDEXLOCALLY

public static final String BLUR_OUTPUT_INDEXLOCALLY
See Also:
Constant Field Values

BLUR_OUTPUT_MAX_DOCUMENT_BUFFER_SIZE

public static final String BLUR_OUTPUT_MAX_DOCUMENT_BUFFER_SIZE
See Also:
Constant Field Values

BLUR_OUTPUT_MAX_DOCUMENT_BUFFER_HEAP_SIZE

public static final String BLUR_OUTPUT_MAX_DOCUMENT_BUFFER_HEAP_SIZE
See Also:
Constant Field Values

BLUR_OUTPUT_DOCUMENT_BUFFER_STRATEGY

public static final String BLUR_OUTPUT_DOCUMENT_BUFFER_STRATEGY
See Also:
Constant Field Values

BLUR_TABLE_DESCRIPTOR

public static final String BLUR_TABLE_DESCRIPTOR
See Also:
Constant Field Values

BLUR_OUTPUT_PATH

public static final String BLUR_OUTPUT_PATH
See Also:
Constant Field Values
Constructor Detail

BlurOutputFormat

public BlurOutputFormat()
Method Detail

setProgressable

public static void setProgressable(org.apache.hadoop.util.Progressable progressable)

getProgressable

public static org.apache.hadoop.util.Progressable getProgressable()

setGetCounter

public static void setGetCounter(GetCounter getCounter)

getGetCounter

public static GetCounter getGetCounter()

checkOutputSpecs

public void checkOutputSpecs(org.apache.hadoop.mapreduce.JobContext context)
                      throws IOException,
                             InterruptedException
Specified by:
checkOutputSpecs in class org.apache.hadoop.mapreduce.OutputFormat<org.apache.hadoop.io.Text,BlurMutate>
Throws:
IOException
InterruptedException

getRecordWriter

public org.apache.hadoop.mapreduce.RecordWriter<org.apache.hadoop.io.Text,BlurMutate> getRecordWriter(org.apache.hadoop.mapreduce.TaskAttemptContext context)
                                                                                               throws IOException,
                                                                                                      InterruptedException
Specified by:
getRecordWriter in class org.apache.hadoop.mapreduce.OutputFormat<org.apache.hadoop.io.Text,BlurMutate>
Throws:
IOException
InterruptedException

getOutputCommitter

public org.apache.hadoop.mapreduce.OutputCommitter getOutputCommitter(org.apache.hadoop.mapreduce.TaskAttemptContext context)
                                                               throws IOException,
                                                                      InterruptedException
Specified by:
getOutputCommitter in class org.apache.hadoop.mapreduce.OutputFormat<org.apache.hadoop.io.Text,BlurMutate>
Throws:
IOException
InterruptedException

getTableDescriptor

public static org.apache.blur.thrift.generated.TableDescriptor getTableDescriptor(org.apache.hadoop.conf.Configuration configuration)
                                                                           throws IOException
Throws:
IOException

setReducerMultiplier

public static void setReducerMultiplier(org.apache.hadoop.mapreduce.Job job,
                                        int multiple)
                                 throws IOException
This will multiply the number of reducers for this job. For example if the table has 256 shards the normal number of reducers is 256. However if the reducer multiplier is set to 4 then the number of reducers will be 1024 and each shard will get 4 new segments instead of the normal 1.

Parameters:
job - the job to setup.
multiple - the multiple to use.
Throws:
IOException

getReducerMultiplier

public static int getReducerMultiplier(org.apache.hadoop.conf.Configuration configuration)

setTableDescriptor

public static void setTableDescriptor(org.apache.hadoop.mapreduce.Job job,
                                      org.apache.blur.thrift.generated.TableDescriptor tableDescriptor)
                               throws IOException
Sets the TableDescriptor for this job.

Parameters:
job - the job to setup.
tableDescriptor - the TableDescriptor.
Throws:
IOException

setTableDescriptor

public static void setTableDescriptor(org.apache.hadoop.conf.Configuration configuration,
                                      org.apache.blur.thrift.generated.TableDescriptor tableDescriptor)
                               throws IOException
Sets the TableDescriptor for this job.

Parameters:
configuration - the configuration to setup.
tableDescriptor - the TableDescriptor.
Throws:
IOException

setMaxDocumentBufferSize

public static void setMaxDocumentBufferSize(org.apache.hadoop.mapreduce.Job job,
                                            int maxDocumentBufferSize)
Sets the maximum number of documents that the buffer will hold in memory before overflowing to disk. By default this is 1000 which will probably be very low for most systems.

Parameters:
job - the job to setup.
maxDocumentBufferSize - the maxDocumentBufferSize.

setMaxDocumentBufferSize

public static void setMaxDocumentBufferSize(org.apache.hadoop.conf.Configuration configuration,
                                            int maxDocumentBufferSize)
Sets the maximum number of documents that the buffer will hold in memory before overflowing to disk. By default this is 1000 which will probably be very low for most systems.

Parameters:
configuration - the configuration to setup.
maxDocumentBufferSize - the maxDocumentBufferSize.

getMaxDocumentBufferSize

public static int getMaxDocumentBufferSize(org.apache.hadoop.conf.Configuration configuration)

getMaxDocumentBufferHeapSize

public static int getMaxDocumentBufferHeapSize(org.apache.hadoop.conf.Configuration configuration)

setMaxDocumentBufferHeapSize

public static void setMaxDocumentBufferHeapSize(org.apache.hadoop.conf.Configuration configuration,
                                                int maxDocumentBufferHeapSize)

setMaxDocumentBufferHeapSize

public static void setMaxDocumentBufferHeapSize(org.apache.hadoop.mapreduce.Job job,
                                                int maxDocumentBufferHeapSize)

getDocumentBufferStrategy

public static DocumentBufferStrategy getDocumentBufferStrategy(org.apache.hadoop.conf.Configuration configuration)

setDocumentBufferStrategy

public static void setDocumentBufferStrategy(org.apache.hadoop.mapreduce.Job job,
                                             Class<? extends DocumentBufferStrategy> documentBufferStrategyClass)

setDocumentBufferStrategy

public static void setDocumentBufferStrategy(org.apache.hadoop.conf.Configuration configuration,
                                             Class<? extends DocumentBufferStrategy> documentBufferStrategyClass)

setOutputPath

public static void setOutputPath(org.apache.hadoop.mapreduce.Job job,
                                 org.apache.hadoop.fs.Path path)

setOutputPath

public static void setOutputPath(org.apache.hadoop.conf.Configuration configuration,
                                 org.apache.hadoop.fs.Path path)

getOutputPath

public static org.apache.hadoop.fs.Path getOutputPath(org.apache.hadoop.conf.Configuration configuration)

setIndexLocally

public static void setIndexLocally(org.apache.hadoop.mapreduce.Job job,
                                   boolean b)
Enabled by default, this will enable local indexing on the machine where the task is running. Then when the RecordWriter closes the index is copied to the remote destination in HDFS.

Parameters:
job - the job to setup.
b - true to enable, false to disable.

setIndexLocally

public static void setIndexLocally(org.apache.hadoop.conf.Configuration configuration,
                                   boolean b)
Enabled by default, this will enable local indexing on the machine where the task is running. Then when the RecordWriter closes the index is copied to the remote destination in HDFS.

Parameters:
configuration - the configuration to setup.
b - true to enable, false to disable.

isIndexLocally

public static boolean isIndexLocally(org.apache.hadoop.conf.Configuration configuration)

setOptimizeInFlight

public static void setOptimizeInFlight(org.apache.hadoop.mapreduce.Job job,
                                       boolean b)
Enabled by default, this will optimize the index while copying from the local index to the remote destination in HDFS. Used in conjunction with the setIndexLocally.

Parameters:
job - the job to setup.
b - true to enable, false to disable.

setOptimizeInFlight

public static void setOptimizeInFlight(org.apache.hadoop.conf.Configuration configuration,
                                       boolean b)
Enabled by default, this will optimize the index while copying from the local index to the remote destination in HDFS. Used in conjunction with the setIndexLocally.

Parameters:
configuration - the configuration to setup.
b - true to enable, false to disable.

isOptimizeInFlight

public static boolean isOptimizeInFlight(org.apache.hadoop.conf.Configuration configuration)

setupJob

public static void setupJob(org.apache.hadoop.mapreduce.Job job,
                            org.apache.blur.thrift.generated.TableDescriptor tableDescriptor)
                     throws IOException
Sets up the output portion of the map reduce job. Note that this also affects the map side of a map-and-reduce job.

Parameters:
job - the job to setup.
tableDescriptor - the table descriptor to write the output of the indexing job.
Throws:
IOException


Copyright © 2012-2014 The Apache Software Foundation. All Rights Reserved.