Gökhan, thank you. Your advice solved my problem.
On Sun, Sep 8, 2013 at 3:11 PM, Gokhan Capan <gkhn...@gmail.com> wrote: > Taner, > > It seems to have tf-idf vectors later, you need to create tf vectors > (DictionaryVectorizer.createTermFrequencyVectors) with logNormalize option > set to false, and normPower option set to -1.0f. This applies to > HighDFWordsPruner.pruneVectors, too. > > I believe that solves your problem. > > Best > > Gokhan > > > On Wed, Sep 4, 2013 at 4:54 PM, Taner Diler <taner.di...@gmail.com> wrote: > > > Actually, my real motivation was to visualize reuters vectors like > > DisplayKMeans example and then implement to web contents that I've > > collected and additionaly to discover what I can do with generated tfidf > > vectors. But TDIDF doesn't work and why? > > > > There is one main class that doesn't extend AbstractJob. Yes it has main > > method that executes all steps. And I'm trying to implement a sample that > > in mahout wiki and everywhere in net. In Eclipse, I've just added > > mahout-0.8-job.jar , you know it includes all depended package, and > > hadoop-core.1.2.0.jar. 
> > > > import java.io.IOException; > > import java.util.ArrayList; > > import java.util.Collections; > > import java.util.Comparator; > > import java.util.HashMap; > > import java.util.List; > > import java.util.Map; > > import java.util.Set; > > > > import org.apache.hadoop.conf.Configuration; > > import org.apache.hadoop.fs.FileSystem; > > import org.apache.hadoop.fs.Path; > > import org.apache.hadoop.io.IntWritable; > > import org.apache.hadoop.io.LongWritable; > > import org.apache.hadoop.io.SequenceFile; > > import org.apache.hadoop.io.Text; > > import org.apache.hadoop.mapred.SequenceFileAsBinaryInputFormat; > > import org.apache.lucene.analysis.Analyzer; > > import org.apache.lucene.analysis.standard.StandardAnalyzer; > > import org.apache.lucene.util.Version; > > import org.apache.mahout.clustering.Cluster; > > import org.apache.mahout.clustering.canopy.CanopyDriver; > > import org.apache.mahout.clustering.classify.WeightedVectorWritable; > > import org.apache.mahout.clustering.display.DisplayKMeans; > > import org.apache.mahout.clustering.kmeans.KMeansDriver; > > import org.apache.mahout.common.HadoopUtil; > > import org.apache.mahout.common.Pair; > > import org.apache.mahout.common.StringTuple; > > import org.apache.mahout.common.distance.EuclideanDistanceMeasure; > > import org.apache.mahout.common.distance.TanimotoDistanceMeasure; > > import org.apache.mahout.math.Vector.Element; > > import org.apache.mahout.math.VectorWritable; > > import org.apache.mahout.vectorizer.DictionaryVectorizer; > > import org.apache.mahout.vectorizer.DocumentProcessor; > > import org.apache.mahout.vectorizer.tfidf.TFIDFConverter; > > > > > > public class MahoutReutersKMeans { > > > > private static int minSupport = 2; > > private static int maxNGramSize = 2; > > private static float minLLRValue = 1; > > private static float normPower = 2; > > private static boolean logNormalize = true; > > private static int numReducers = 1; > > private static int chunkSizeInMegabytes = 
200; > > private static boolean sequentialAccess = true; > > private static boolean namedVectors = false; > > > > private static int minDf = 5; > > > > private static long maxDF = 95; > > > > /** > > * @param args > > * @throws IOException > > * @throws InterruptedException > > * @throws ClassNotFoundException > > */ > > public static void main(String[] args) throws IOException, > > ClassNotFoundException, InterruptedException { > > > > Configuration conf = new Configuration(); > > String HADOOP_HOME = System.getenv("HADOOP_PREFIX"); > > > > conf.addResource(new Path(HADOOP_HOME, "conf/core-site.xml")); > > conf.addResource(new Path(HADOOP_HOME, "conf/hdfs-site.xml")); > > conf.addResource(new Path(HADOOP_HOME, "conf/mapred-site.xml")); > > > > FileSystem fs = FileSystem.get(conf); > > > > Path inputDir = new Path("reuters-seqfiles"); > > String outputDir = "reuters-kmeans-try"; > > HadoopUtil.delete(conf, new Path(outputDir)); > > StandardAnalyzer analyzer = new > > StandardAnalyzer(Version.LUCENE_43); > > Path tokenizedPath = new > > Path(DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER); > > DocumentProcessor.tokenizeDocuments(inputDir, > > analyzer.getClass().asSubclass(Analyzer.class), tokenizedPath, conf); > > > > > > DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, > new > > Path(outputDir), > > DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER, conf, > > minSupport , maxNGramSize, minLLRValue, normPower , logNormalize, > > numReducers , chunkSizeInMegabytes , sequentialAccess, namedVectors); > > > > > > Pair<Long[], List<Path>> features = > TFIDFConverter.calculateDF(new > > Path(outputDir, > > DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER), new > > Path(outputDir), conf, chunkSizeInMegabytes); > > TFIDFConverter.processTfIdf(new Path(outputDir, > > DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER), new > > Path(outputDir), conf, features, minDf , maxDF , normPower, logNormalize, > > sequentialAccess, false, numReducers); > > > > > > 
// Path tfidfVectorsPath = new Path(outputDir, "tfidf-vectors"); > > // Path canopyCentroidsPath = new Path(outputDir, > > "canopy-centroids"); > > // Path clustersPath = new Path(outputDir, "clusters"); > > // > > // CanopyDriver.run(conf, tfidfVectorsPath, canopyCentroidsPath, > new > > EuclideanDistanceMeasure(), 250, 120, false, 0.01, false); > > // KMeansDriver.run(conf, tfidfVectorsPath, new > > Path(canopyCentroidsPath, "clusters-0-final"), clustersPath, new > > TanimotoDistanceMeasure(), 0.01, 20, true, 0.01, false); > > // > > // SequenceFile.Reader reader = new SequenceFile.Reader(fs, new > > Path("reuters-clusters/" + Cluster.CLUSTERED_POINTS_DIR + > "/part-m-00000"), > > conf); > > // > > // IntWritable key = new IntWritable(); > > // WeightedVectorWritable value = new WeightedVectorWritable(); > > // while (reader.next(key, value)) { > > // System.out.println(key.toString()+" belongs to cluster > > "+value.toString()); > > // } > > // reader.close(); > > > > } > > > > } > > > > > > > > > > On Wed, Sep 4, 2013 at 3:29 PM, Gokhan Capan <gkhn...@gmail.com> wrote: > > > > > Taner, > > > > > > A few questions: > > > > > > Is there a specific reason not to consider using seq2sparse directly? > > (You > > > can edit seq2sparse.props to avoid passing commandline arguments every > > time > > > you run it, if that is the case) > > > > > > Java code you attached seems to do the same thing with > > > SparseVectorFromSequenceFiles#run(String[]), which is also the method > > > called when you run seq2sparse. I'm gonna debug it anyway. > > > > > > And I would like to know how you run the java code. Does your main > class > > > extend AbstractJob to make it "runnable" using bin/mahout? And does it > > have > > > a main method that submits your job to your hadoop cluster? Are you > using > > > hadoop jar command to run it? 
> > > > > > Best > > > > > > Gokhan > > > > > > > > > On Wed, Sep 4, 2013 at 1:15 PM, Taner Diler <taner.di...@gmail.com> > > wrote: > > > > > > > Suneel, samples from generated seqfiles: > > > > > > > > df-count > > > > > > > > Key: -1: Value: 21578 > > > > Key: 0: Value: 43 > > > > Key: 1: Value: 2 > > > > Key: 2: Value: 2 > > > > Key: 3: Value: 2 > > > > ... > > > > > > > > tf-vectors > > > > > > > > Key class: class org.apache.hadoop.io.Text Value Class: class > > > > org.apache.mahout.math.VectorWritable > > > > Key: /reut2-000.sgm-0.txt: Value: > > > > > > > > > > > > > > {62:0.024521886354905213,222:0.024521886354905213,291:0.024521886354905213,1411:0.024521886354905213,1421:0.024521886354905213,1451:0.024521886 > > > > 354905213,1456:0.024521886354905213.... > > > > > > > > wordcount/ngrams > > > > > > > > Key class: class org.apache.hadoop.io.Text Value Class: class > > > > org.apache.hadoop.io.DoubleWritable > > > > Key: 0: Value: 166.0 > > > > Key: 0.003: Value: 2.0 > > > > Key: 0.006913: Value: 2.0 > > > > Key: 0.007050: Value: 2.0 > > > > > > > > wordcount/subgrams > > > > > > > > Key class: class org.apache.mahout.vectorizer.collocations.llr.Gram > > Value > > > > Class: class org.apache.mahout.vectorizer.collocations.llr.Gram > > > > Key: '0 0'[n]:12: Value: '0'[h]:166 > > > > Key: '0 25'[n]:2: Value: '0'[h]:166 > > > > Key: '0 92'[n]:107: Value: '0'[h]:166 > > > > > > > > frequency.file-0 > > > > > > > > Key class: class org.apache.hadoop.io.IntWritable Value Class: class > > > > org.apache.hadoop.io.LongWritable > > > > Key: 0: Value: 43 > > > > Key: 1: Value: 2 > > > > Key: 2: Value: 2 > > > > Key: 3: Value: 2 > > > > Key: 4: Value: 9 > > > > Key: 5: Value: 4 > > > > > > > > > > > > dictionary.file-0 > > > > > > > > Key class: class org.apache.hadoop.io.Text Value Class: class > > > > org.apache.hadoop.io.IntWritable > > > > Key: 0: Value: 0 > > > > Key: 0.003: Value: 1 > > > > Key: 0.006913: Value: 2 > > > > Key: 0.007050: Value: 3 > > > > Key: 
0.01: Value: 4 > > > > Key: 0.02: Value: 5 > > > > Key: 0.025: Value: 6 > > > > > > > > > > > > > > > > > > > > > > > > On Wed, Sep 4, 2013 at 12:45 PM, Taner Diler <taner.di...@gmail.com> > > > > wrote: > > > > > > > > > mahout seq2sparse -i reuters-seqfiles/ -o reuters-kmeans-try -chunk > > 200 > > > > > -wt tfidf -s 2 -md 5 -x 95 -ng 2 -ml 50 -n 2 -seq > > > > > > > > > > this command works well. > > > > > > > > > > Gokhan, I changed minLLR value to 1.0 in java but result is same > > empty > > > > > tfidf-vectors. > > > > > > > > > > > > > > > On Tue, Sep 3, 2013 at 10:47 AM, Taner Diler < > taner.di...@gmail.com > > > > >wrote: > > > > > > > > > >> Gokhan, I try it from commandline it works. I will send the > command > > to > > > > >> compare command line parameters to TFIDFConverter params. > > > > >> > > > > >> Suneel, I had checked the seqfiles. I didn't see any problem other > > > > >> generated seqfiles but I will checked and send samples from each > > > > seqfiles. > > > > >> > > > > >> > > > > >> On Sun, Sep 1, 2013 at 11:02 PM, Gokhan Capan <gkhn...@gmail.com> > > > > wrote: > > > > >> > > > > >>> Suneel is right indeed. I assumed that everything performed prior > > to > > > > >>> vector > > > > >>> generation is done correctly. > > > > >>> > > > > >>> By the way, if the suggestions do not work, could you try running > > > > >>> seq2sparse from commandline with the same arguments and see if > that > > > > works > > > > >>> well? > > > > >>> > > > > >>> On Sun, Sep 1, 2013 at 7:23 PM, Suneel Marthi < > > > suneel_mar...@yahoo.com > > > > >>> >wrote: > > > > >>> > > > > >>> > I would first check to see if the input 'seqfiles' for > > > TFIDFGenerator > > > > >>> have > > > > >>> > any meat in them. > > > > >>> > This could also happen if the input seqfiles are empty. 
> > > > >>> > > > > >>> > > > > >>> > > > > > >>> > > > > > >>> > ________________________________ > > > > >>> > From: Taner Diler <taner.di...@gmail.com> > > > > >>> > To: user@mahout.apache.org > > > > >>> > Sent: Sunday, September 1, 2013 2:24 AM > > > > >>> > Subject: TFIDFConverter generates empty tfidf-vectors > > > > >>> > > > > > >>> > > > > > >>> > Hi all, > > > > >>> > > > > > >>> > I try to run Reuters KMeans example in Java, but TFIDFComverter > > > > >>> generates > > > > >>> > tfidf-vectors as empty. How can I fix that? > > > > >>> > > > > > >>> > private static int minSupport = 2; > > > > >>> > private static int maxNGramSize = 2; > > > > >>> > private static float minLLRValue = 50; > > > > >>> > private static float normPower = 2; > > > > >>> > private static boolean logNormalize = true; > > > > >>> > private static int numReducers = 1; > > > > >>> > private static int chunkSizeInMegabytes = 200; > > > > >>> > private static boolean sequentialAccess = true; > > > > >>> > private static boolean namedVectors = false; > > > > >>> > private static int minDf = 5; > > > > >>> > private static long maxDF = 95; > > > > >>> > > > > > >>> > Path inputDir = new Path("reuters-seqfiles"); > > > > >>> > String outputDir = "reuters-kmeans-try"; > > > > >>> > HadoopUtil.delete(conf, new Path(outputDir)); > > > > >>> > StandardAnalyzer analyzer = new > > > > >>> > StandardAnalyzer(Version.LUCENE_43); > > > > >>> > Path tokenizedPath = new > > > > >>> > Path(DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER); > > > > >>> > DocumentProcessor.tokenizeDocuments(inputDir, > > > > >>> > analyzer.getClass().asSubclass(Analyzer.class), tokenizedPath, > > > conf); > > > > >>> > > > > > >>> > > > > > >>> > > > > > DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, > > > > >>> new > > > > >>> > Path(outputDir), > > > > >>> > > > DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER, > > > > >>> conf, > > > > >>> > minSupport , maxNGramSize, minLLRValue, normPower , > 
logNormalize, > > > > >>> > numReducers , chunkSizeInMegabytes , sequentialAccess, > > > namedVectors); > > > > >>> > > > > > >>> > > > > > >>> > Pair<Long[], List<Path>> features = > > > > >>> TFIDFConverter.calculateDF(new > > > > >>> > Path(outputDir, > > > > >>> > > > > DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER), > > > > >>> new > > > > >>> > Path(outputDir), conf, chunkSizeInMegabytes); > > > > >>> > TFIDFConverter.processTfIdf(new Path(outputDir, > > > > >>> > > > > DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER), > > > > >>> new > > > > >>> > Path(outputDir), conf, features, minDf , maxDF , normPower, > > > > >>> logNormalize, > > > > >>> > sequentialAccess, false, numReducers); > > > > >>> > > > > > >>> > > > > >> > > > > >> > > > > > > > > > > > > > > >