org.apache.mahout.utils.nlp.collocations.llr
Class CollocDriver
java.lang.Object
org.apache.hadoop.conf.Configured
org.apache.mahout.utils.nlp.collocations.llr.CollocDriver
- All Implemented Interfaces:
- org.apache.hadoop.conf.Configurable, org.apache.hadoop.util.Tool
public class CollocDriver
- extends org.apache.hadoop.conf.Configured
- implements org.apache.hadoop.util.Tool
Driver for the LLR Collocation discovery MapReduce job.
Method Summary |
static void |
computeNGramsPruneByLLR(long nGramTotal,
java.lang.String output,
boolean emitUnigrams,
float minLLRValue,
int reduceTasks)
pass2: perform the LLR calculation |
static void |
generateAllGrams(java.lang.String input,
java.lang.String output,
int maxNGramSize,
int minSupport,
float minLLRValue,
int reduceTasks)
Generate all ngrams for the DictionaryVectorizer job |
static long |
generateCollocations(java.lang.String input,
java.lang.String output,
boolean emitUnigrams,
int maxNGramSize,
int reduceTasks,
int minSupport)
pass1: generate collocations, ngrams |
static void |
main(java.lang.String[] args)
|
int |
run(java.lang.String[] args)
|
Methods inherited from class org.apache.hadoop.conf.Configured |
getConf, setConf |
Methods inherited from class java.lang.Object |
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Methods inherited from interface org.apache.hadoop.conf.Configurable |
getConf, setConf |
DEFAULT_OUTPUT_DIRECTORY
public static final java.lang.String DEFAULT_OUTPUT_DIRECTORY
- See Also:
- Constant Field Values
SUBGRAM_OUTPUT_DIRECTORY
public static final java.lang.String SUBGRAM_OUTPUT_DIRECTORY
- See Also:
- Constant Field Values
NGRAM_OUTPUT_DIRECTORY
public static final java.lang.String NGRAM_OUTPUT_DIRECTORY
- See Also:
- Constant Field Values
EMIT_UNIGRAMS
public static final java.lang.String EMIT_UNIGRAMS
- See Also:
- Constant Field Values
DEFAULT_EMIT_UNIGRAMS
public static final boolean DEFAULT_EMIT_UNIGRAMS
- See Also:
- Constant Field Values
DEFAULT_MAX_NGRAM_SIZE
public static final int DEFAULT_MAX_NGRAM_SIZE
- See Also:
- Constant Field Values
DEFAULT_PASS1_NUM_REDUCE_TASKS
public static final int DEFAULT_PASS1_NUM_REDUCE_TASKS
- See Also:
- Constant Field Values
main
public static void main(java.lang.String[] args)
throws java.lang.Exception
- Throws:
java.lang.Exception
run
public int run(java.lang.String[] args)
throws java.lang.Exception
- Specified by:
run
in interface org.apache.hadoop.util.Tool
- Throws:
java.lang.Exception
generateAllGrams
public static void generateAllGrams(java.lang.String input,
java.lang.String output,
int maxNGramSize,
int minSupport,
float minLLRValue,
int reduceTasks)
throws java.io.IOException
- Generate all ngrams for the DictionaryVectorizer job
- Parameters:
input
- input path containing tokenized documents
output
- output path where ngrams are generated, including unigrams
maxNGramSize
- maximum ngram size; minimum value is 2
minSupport
- minimum support to prune ngrams, including unigrams
minLLRValue
- minimum LLR threshold to prune ngrams
reduceTasks
- number of reducers used
- Throws:
java.io.IOException
generateCollocations
public static long generateCollocations(java.lang.String input,
java.lang.String output,
boolean emitUnigrams,
int maxNGramSize,
int reduceTasks,
int minSupport)
throws java.io.IOException
- pass1: generate collocations, ngrams
- Throws:
java.io.IOException
computeNGramsPruneByLLR
public static void computeNGramsPruneByLLR(long nGramTotal,
java.lang.String output,
boolean emitUnigrams,
float minLLRValue,
int reduceTasks)
throws java.io.IOException
- pass2: perform the LLR calculation
- Throws:
java.io.IOException
Copyright © 2008-2010 The Apache Software Foundation. All Rights Reserved.