org.apache.mahout.utils.nlp.collocations.llr
Class CollocDriver

java.lang.Object
  extended by org.apache.hadoop.conf.Configured
      extended by org.apache.mahout.utils.nlp.collocations.llr.CollocDriver
All Implemented Interfaces:
org.apache.hadoop.conf.Configurable, org.apache.hadoop.util.Tool

public class CollocDriver
extends org.apache.hadoop.conf.Configured
implements org.apache.hadoop.util.Tool

Driver for the LLR (log-likelihood ratio) collocation discovery MapReduce job.


Field Summary
static boolean DEFAULT_EMIT_UNIGRAMS
           
static int DEFAULT_MAX_NGRAM_SIZE
           
static java.lang.String DEFAULT_OUTPUT_DIRECTORY
           
static int DEFAULT_PASS1_NUM_REDUCE_TASKS
           
static java.lang.String EMIT_UNIGRAMS
           
static java.lang.String NGRAM_OUTPUT_DIRECTORY
           
static java.lang.String SUBGRAM_OUTPUT_DIRECTORY
           
 
Method Summary
static void computeNGramsPruneByLLR(long nGramTotal, java.lang.String output, boolean emitUnigrams, float minLLRValue, int reduceTasks)
          Pass 2: perform the LLR calculation.
static void generateAllGrams(java.lang.String input, java.lang.String output, int maxNGramSize, int minSupport, float minLLRValue, int reduceTasks)
          Generate all ngrams for the DictionaryVectorizer job
static long generateCollocations(java.lang.String input, java.lang.String output, boolean emitUnigrams, int maxNGramSize, int reduceTasks, int minSupport)
          Pass 1: generate collocations and ngrams.
static void main(java.lang.String[] args)
           
 int run(java.lang.String[] args)
           
 
Methods inherited from class org.apache.hadoop.conf.Configured
getConf, setConf
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 
Methods inherited from interface org.apache.hadoop.conf.Configurable
getConf, setConf
 

Field Detail

DEFAULT_OUTPUT_DIRECTORY

public static final java.lang.String DEFAULT_OUTPUT_DIRECTORY
See Also:
Constant Field Values

SUBGRAM_OUTPUT_DIRECTORY

public static final java.lang.String SUBGRAM_OUTPUT_DIRECTORY
See Also:
Constant Field Values

NGRAM_OUTPUT_DIRECTORY

public static final java.lang.String NGRAM_OUTPUT_DIRECTORY
See Also:
Constant Field Values

EMIT_UNIGRAMS

public static final java.lang.String EMIT_UNIGRAMS
See Also:
Constant Field Values

DEFAULT_EMIT_UNIGRAMS

public static final boolean DEFAULT_EMIT_UNIGRAMS
See Also:
Constant Field Values

DEFAULT_MAX_NGRAM_SIZE

public static final int DEFAULT_MAX_NGRAM_SIZE
See Also:
Constant Field Values

DEFAULT_PASS1_NUM_REDUCE_TASKS

public static final int DEFAULT_PASS1_NUM_REDUCE_TASKS
See Also:
Constant Field Values
Method Detail

main

public static void main(java.lang.String[] args)
                 throws java.lang.Exception
Throws:
java.lang.Exception

run

public int run(java.lang.String[] args)
        throws java.lang.Exception
Specified by:
run in interface org.apache.hadoop.util.Tool
Throws:
java.lang.Exception

generateAllGrams

public static void generateAllGrams(java.lang.String input,
                                    java.lang.String output,
                                    int maxNGramSize,
                                    int minSupport,
                                    float minLLRValue,
                                    int reduceTasks)
                             throws java.io.IOException
Generate all ngrams for the DictionaryVectorizer job

Parameters:
input - input path containing tokenized documents
output - output path where ngrams are generated including unigrams
maxNGramSize - maximum size of the ngrams to generate; the minimum allowed value is 2.
minSupport - minimum support to prune ngrams including unigrams
minLLRValue - minimum threshold to prune ngrams
reduceTasks - number of reducers used
Throws:
java.io.IOException

generateCollocations

public static long generateCollocations(java.lang.String input,
                                        java.lang.String output,
                                        boolean emitUnigrams,
                                        int maxNGramSize,
                                        int reduceTasks,
                                        int minSupport)
                                 throws java.io.IOException
Pass 1: generate collocations and ngrams.

Throws:
java.io.IOException

computeNGramsPruneByLLR

public static void computeNGramsPruneByLLR(long nGramTotal,
                                           java.lang.String output,
                                           boolean emitUnigrams,
                                           float minLLRValue,
                                           int reduceTasks)
                                    throws java.io.IOException
Pass 2: perform the LLR calculation.

Throws:
java.io.IOException


Copyright © 2008-2010 The Apache Software Foundation. All Rights Reserved.