org.apache.mahout.clustering.dirichlet
Class DirichletDriver

java.lang.Object
  extended by org.apache.hadoop.conf.Configured
      extended by org.apache.mahout.common.AbstractJob
          extended by org.apache.mahout.clustering.dirichlet.DirichletDriver
All Implemented Interfaces:
org.apache.hadoop.conf.Configurable, org.apache.hadoop.util.Tool

public class DirichletDriver
extends AbstractJob


Field Summary
static java.lang.String ALPHA_0_KEY
           
static java.lang.String ALPHA_OPTION
           
static java.lang.String EMIT_MOST_LIKELY_KEY
           
static java.lang.String MODEL_DISTRIBUTION_CLASS_OPTION
           
static java.lang.String MODEL_DISTRIBUTION_KEY
           
static java.lang.String MODEL_PROTOTYPE_CLASS_OPTION
           
static java.lang.String MODEL_PROTOTYPE_KEY
           
static java.lang.String NUM_CLUSTERS_KEY
           
static java.lang.String PROTOTYPE_SIZE_KEY
           
static java.lang.String STATE_IN_KEY
           
static java.lang.String THRESHOLD_KEY
           
 
Constructor Summary
DirichletDriver()
           
 
Method Summary
static org.apache.hadoop.fs.Path buildClusters(org.apache.hadoop.conf.Configuration conf, org.apache.hadoop.fs.Path input, org.apache.hadoop.fs.Path output, ModelDistribution<VectorWritable> modelDistribution, int numClusters, int maxIterations, double alpha0, boolean runSequential)
          Iterate over the input vectors to produce cluster directories for each iteration
static void clusterData(org.apache.hadoop.conf.Configuration conf, org.apache.hadoop.fs.Path input, org.apache.hadoop.fs.Path stateIn, org.apache.hadoop.fs.Path output, boolean emitMostLikely, double threshold, boolean runSequential)
          Run the job using supplied arguments
static AbstractVectorModelDistribution createModelDistribution(java.lang.String modelFactory, java.lang.String modelPrototype, java.lang.String distanceMeasure, int prototypeSize)
          Create an instance of AbstractVectorModelDistribution from the given command line arguments
static void main(java.lang.String[] args)
           
static int readPrototypeSize(org.apache.hadoop.fs.Path input)
          Read the first input vector to determine the prototype size for the modelPrototype
static void run(org.apache.hadoop.conf.Configuration conf, org.apache.hadoop.fs.Path input, org.apache.hadoop.fs.Path output, ModelDistribution<VectorWritable> modelDistribution, int numModels, int maxIterations, double alpha0, boolean runClustering, boolean emitMostLikely, double threshold, boolean runSequential)
          Iterate over the input vectors to produce clusters and, if requested, use the results of the final iteration to cluster the input vectors.
static void run(org.apache.hadoop.fs.Path input, org.apache.hadoop.fs.Path output, ModelDistribution<VectorWritable> modelDistribution, int numClusters, int maxIterations, double alpha0, boolean runClustering, boolean emitMostLikely, double threshold, boolean runSequential)
          Convenience method that provides a default Configuration. Iterate over the input vectors to produce clusters and, if requested, use the results of the final iteration to cluster the input vectors.
 int run(java.lang.String[] args)
           
 
Methods inherited from class org.apache.mahout.common.AbstractJob
addFlag, addInputOption, addOption, addOption, addOption, addOption, addOutputOption, getInputPath, getOption, getOutputPath, hasOption, keyFor, maybePut, parseArguments, parseDirectories, prepareJob, shouldRunNextPhase
 
Methods inherited from class org.apache.hadoop.conf.Configured
getConf, setConf
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 
Methods inherited from interface org.apache.hadoop.conf.Configurable
getConf, setConf
 

Field Detail

STATE_IN_KEY

public static final java.lang.String STATE_IN_KEY
See Also:
Constant Field Values

MODEL_DISTRIBUTION_KEY

public static final java.lang.String MODEL_DISTRIBUTION_KEY
See Also:
Constant Field Values

MODEL_PROTOTYPE_KEY

public static final java.lang.String MODEL_PROTOTYPE_KEY
See Also:
Constant Field Values

PROTOTYPE_SIZE_KEY

public static final java.lang.String PROTOTYPE_SIZE_KEY
See Also:
Constant Field Values

NUM_CLUSTERS_KEY

public static final java.lang.String NUM_CLUSTERS_KEY
See Also:
Constant Field Values

ALPHA_0_KEY

public static final java.lang.String ALPHA_0_KEY
See Also:
Constant Field Values

EMIT_MOST_LIKELY_KEY

public static final java.lang.String EMIT_MOST_LIKELY_KEY
See Also:
Constant Field Values

THRESHOLD_KEY

public static final java.lang.String THRESHOLD_KEY
See Also:
Constant Field Values

MODEL_PROTOTYPE_CLASS_OPTION

public static final java.lang.String MODEL_PROTOTYPE_CLASS_OPTION
See Also:
Constant Field Values

MODEL_DISTRIBUTION_CLASS_OPTION

public static final java.lang.String MODEL_DISTRIBUTION_CLASS_OPTION
See Also:
Constant Field Values

ALPHA_OPTION

public static final java.lang.String ALPHA_OPTION
See Also:
Constant Field Values
Constructor Detail

DirichletDriver

public DirichletDriver()
Method Detail

main

public static void main(java.lang.String[] args)
                 throws java.lang.Exception
Throws:
java.lang.Exception

run

public int run(java.lang.String[] args)
        throws java.io.IOException,
               java.lang.ClassNotFoundException,
               java.lang.InstantiationException,
               java.lang.IllegalAccessException,
               java.lang.NoSuchMethodException,
               java.lang.reflect.InvocationTargetException,
               java.lang.InterruptedException
Throws:
java.io.IOException
java.lang.ClassNotFoundException
java.lang.InstantiationException
java.lang.IllegalAccessException
java.lang.NoSuchMethodException
java.lang.reflect.InvocationTargetException
java.lang.InterruptedException

run

public static void run(org.apache.hadoop.conf.Configuration conf,
                       org.apache.hadoop.fs.Path input,
                       org.apache.hadoop.fs.Path output,
                       ModelDistribution<VectorWritable> modelDistribution,
                       int numModels,
                       int maxIterations,
                       double alpha0,
                       boolean runClustering,
                       boolean emitMostLikely,
                       double threshold,
                       boolean runSequential)
                throws java.io.IOException,
                       java.lang.InstantiationException,
                       java.lang.ClassNotFoundException,
                       java.lang.InterruptedException,
                       java.lang.IllegalAccessException
Iterate over the input vectors to produce clusters and, if requested, use the results of the final iteration to cluster the input vectors.

Parameters:
conf - the Configuration to use
input - the directory Path for input points
output - the directory Path for output points
modelDistribution - the ModelDistribution<VectorWritable> to use
numModels - the number of models to iterate over
maxIterations - the maximum number of iterations
alpha0 - the alpha_0 value for the DirichletDistribution
runClustering - true if clustering of points is to be done after the iterations
emitMostLikely - a boolean; if true, emit only the most likely cluster for each point
threshold - a double threshold value; all clusters having a greater pdf are emitted (used when emitMostLikely = false)
runSequential - execute sequentially if true
Throws:
java.io.IOException
java.lang.InstantiationException
java.lang.ClassNotFoundException
java.lang.InterruptedException
java.lang.IllegalAccessException

run

public static void run(org.apache.hadoop.fs.Path input,
                       org.apache.hadoop.fs.Path output,
                       ModelDistribution<VectorWritable> modelDistribution,
                       int numClusters,
                       int maxIterations,
                       double alpha0,
                       boolean runClustering,
                       boolean emitMostLikely,
                       double threshold,
                       boolean runSequential)
                throws java.io.IOException,
                       java.lang.InstantiationException,
                       java.lang.IllegalAccessException,
                       java.lang.ClassNotFoundException,
                       java.lang.InterruptedException
Convenience method that provides a default Configuration. Iterate over the input vectors to produce clusters and, if requested, use the results of the final iteration to cluster the input vectors.

Parameters:
input - the directory Path for input points
output - the directory Path for output points
modelDistribution - the ModelDistribution<VectorWritable> to use
numClusters - the number of models to iterate over
maxIterations - the maximum number of iterations
alpha0 - the alpha_0 value for the DirichletDistribution
runClustering - true if clustering of points is to be done after the iterations
emitMostLikely - a boolean; if true, emit only the most likely cluster for each point
threshold - a double threshold value; all clusters having a greater pdf are emitted (used when emitMostLikely = false)
runSequential - execute sequentially if true
Throws:
java.io.IOException
java.lang.InstantiationException
java.lang.IllegalAccessException
java.lang.ClassNotFoundException
java.lang.InterruptedException

createModelDistribution

public static AbstractVectorModelDistribution createModelDistribution(java.lang.String modelFactory,
                                                                      java.lang.String modelPrototype,
                                                                      java.lang.String distanceMeasure,
                                                                      int prototypeSize)
                                                               throws java.lang.ClassNotFoundException,
                                                                      java.lang.InstantiationException,
                                                                      java.lang.IllegalAccessException,
                                                                      java.lang.NoSuchMethodException,
                                                                      java.lang.reflect.InvocationTargetException
Create an instance of AbstractVectorModelDistribution from the given command line arguments

Throws:
java.lang.ClassNotFoundException
java.lang.InstantiationException
java.lang.IllegalAccessException
java.lang.NoSuchMethodException
java.lang.reflect.InvocationTargetException

readPrototypeSize

public static int readPrototypeSize(org.apache.hadoop.fs.Path input)
                             throws java.io.IOException,
                                    java.lang.InstantiationException,
                                    java.lang.IllegalAccessException
Read the first input vector to determine the prototype size for the modelPrototype

Throws:
java.io.IOException
java.lang.InstantiationException
java.lang.IllegalAccessException

buildClusters

public static org.apache.hadoop.fs.Path buildClusters(org.apache.hadoop.conf.Configuration conf,
                                                      org.apache.hadoop.fs.Path input,
                                                      org.apache.hadoop.fs.Path output,
                                                      ModelDistribution<VectorWritable> modelDistribution,
                                                      int numClusters,
                                                      int maxIterations,
                                                      double alpha0,
                                                      boolean runSequential)
                                               throws java.io.IOException,
                                                      java.lang.InstantiationException,
                                                      java.lang.ClassNotFoundException,
                                                      java.lang.InterruptedException,
                                                      java.lang.IllegalAccessException
Iterate over the input vectors to produce cluster directories for each iteration

Parameters:
conf - the Configuration to use
input - the directory Path for input points
output - the directory Path for output points
modelDistribution - the ModelDistribution<VectorWritable> to use
numClusters - the number of models to iterate over
maxIterations - the maximum number of iterations
alpha0 - the alpha_0 value for the DirichletDistribution
runSequential - execute sequentially if true
Returns:
the Path of the final clusters directory
Throws:
java.io.IOException
java.lang.InstantiationException
java.lang.ClassNotFoundException
java.lang.InterruptedException
java.lang.IllegalAccessException

clusterData

public static void clusterData(org.apache.hadoop.conf.Configuration conf,
                               org.apache.hadoop.fs.Path input,
                               org.apache.hadoop.fs.Path stateIn,
                               org.apache.hadoop.fs.Path output,
                               boolean emitMostLikely,
                               double threshold,
                               boolean runSequential)
                        throws java.io.IOException,
                               java.lang.InterruptedException,
                               java.lang.ClassNotFoundException,
                               java.lang.InstantiationException,
                               java.lang.IllegalAccessException
Run the job using supplied arguments

Parameters:
conf - the Configuration to use
input - the directory pathname for input points
stateIn - the directory pathname for input state
output - the directory pathname for output points
emitMostLikely - a boolean; if true, emit only the most likely cluster for each point
threshold - a double threshold value; all clusters having a greater pdf are emitted (used when emitMostLikely = false)
runSequential - execute sequentially if true
Throws:
java.io.IOException
java.lang.InterruptedException
java.lang.ClassNotFoundException
java.lang.InstantiationException
java.lang.IllegalAccessException


Copyright © 2008-2010 The Apache Software Foundation. All Rights Reserved.