org.apache.mahout.vectorizer.collocations.llr
Class CollocDriver

java.lang.Object
  extended by org.apache.hadoop.conf.Configured
      extended by org.apache.mahout.common.AbstractJob
          extended by org.apache.mahout.vectorizer.collocations.llr.CollocDriver
All Implemented Interfaces:
org.apache.hadoop.conf.Configurable, org.apache.hadoop.util.Tool

public final class CollocDriver
extends AbstractJob

Driver for the log-likelihood ratio (LLR) collocation discovery MapReduce job.


Field Summary
static boolean DEFAULT_EMIT_UNIGRAMS
           
static java.lang.String EMIT_UNIGRAMS
           
static java.lang.String NGRAM_OUTPUT_DIRECTORY
           
static java.lang.String SUBGRAM_OUTPUT_DIRECTORY
           
 
Method Summary
static void generateAllGrams(org.apache.hadoop.fs.Path input, org.apache.hadoop.fs.Path output, org.apache.hadoop.conf.Configuration baseConf, int maxNGramSize, int minSupport, float minLLRValue, int reduceTasks)
          Generates all ngrams for the org.apache.mahout.utils.vectors.text.DictionaryVectorizer job.
static void main(java.lang.String[] args)
           
 int run(java.lang.String[] args)
           
 
Methods inherited from class org.apache.mahout.common.AbstractJob
addFlag, addInputOption, addOption, addOption, addOption, addOption, addOutputOption, getInputPath, getOption, getOutputPath, hasOption, keyFor, maybePut, parseArguments, parseDirectories, prepareJob, shouldRunNextPhase
 
Methods inherited from class org.apache.hadoop.conf.Configured
getConf, setConf
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 
Methods inherited from interface org.apache.hadoop.conf.Configurable
getConf, setConf
 

Field Detail

SUBGRAM_OUTPUT_DIRECTORY

public static final java.lang.String SUBGRAM_OUTPUT_DIRECTORY
See Also:
Constant Field Values

NGRAM_OUTPUT_DIRECTORY

public static final java.lang.String NGRAM_OUTPUT_DIRECTORY
See Also:
Constant Field Values

EMIT_UNIGRAMS

public static final java.lang.String EMIT_UNIGRAMS
See Also:
Constant Field Values

DEFAULT_EMIT_UNIGRAMS

public static final boolean DEFAULT_EMIT_UNIGRAMS
See Also:
Constant Field Values
Method Detail

main

public static void main(java.lang.String[] args)
                 throws java.lang.Exception
Throws:
java.lang.Exception

run

public int run(java.lang.String[] args)
        throws java.lang.Exception
Throws:
java.lang.Exception

generateAllGrams

public static void generateAllGrams(org.apache.hadoop.fs.Path input,
                                    org.apache.hadoop.fs.Path output,
                                    org.apache.hadoop.conf.Configuration baseConf,
                                    int maxNGramSize,
                                    int minSupport,
                                    float minLLRValue,
                                    int reduceTasks)
                             throws java.io.IOException,
                                    java.lang.InterruptedException,
                                    java.lang.ClassNotFoundException
Generates all ngrams for the org.apache.mahout.utils.vectors.text.DictionaryVectorizer job.

Parameters:
input - input path containing tokenized documents
output - output path where the generated ngrams, including unigrams, are written
maxNGramSize - maximum size of the ngrams to generate; the minimum allowed value is 2
minSupport - minimum support; ngrams (including unigrams) with lower support are pruned
minLLRValue - minimum LLR value; ngrams scoring below this threshold are pruned
reduceTasks - number of reducers used
Throws:
java.io.IOException
java.lang.InterruptedException
java.lang.ClassNotFoundException


Copyright © 2008-2010 The Apache Software Foundation. All Rights Reserved.