Gökhan, thank you. Your advice solved my problem.
On Sun, Sep 8, 2013 at 3:11 PM, Gokhan Capan <gkhn...@gmail.com> wrote: > Taner, > > It seems to have tf-idf vectors later, you need to create tf vectors > (DictionaryVectorizer.createTermFrequencyVectors) with logNormalize option > set to false, and normPower option set to -1.0f. This applies to > HighDFWordsPruner.pruneVectors, too. > > I believe that solves your problem. > > Best > > Gokhan > > > On Wed, Sep 4, 2013 at 4:54 PM, Taner Diler <taner.di...@gmail.com> wrote: > > > Actually, my real motivation was to visualize reuters vectors like > > DisplayKMeans example and then implement to web contents that I've > > collected and additionaly to discover what I can do with generated tfidf > > vectors. But TDIDF doesn't work and why? > > > > There is one main class that doesn't extend AbstractJob. Yes it has main > > method that executes all steps. And I'm trying to implement a sample that > > in mahout wiki and everywhere in net. In Eclipse, I've just added > > mahout-0.8-job.jar , you know it includes all depended package, and > > hadoop-core.1.2.0.jar. 
> > > > import java.io.IOException; > > import java.util.ArrayList; > > import java.util.Collections; > > import java.util.Comparator; > > import java.util.HashMap; > > import java.util.List; > > import java.util.Map; > > import java.util.Set; > > > > import org.apache.hadoop.conf.Configuration; > > import org.apache.hadoop.fs.FileSystem; > > import org.apache.hadoop.fs.Path; > > import org.apache.hadoop.io.IntWritable; > > import org.apache.hadoop.io.LongWritable; > > import org.apache.hadoop.io.SequenceFile; > > import org.apache.hadoop.io.Text; > > import org.apache.hadoop.mapred.SequenceFileAsBinaryInputFormat; > > import org.apache.lucene.analysis.Analyzer; > > import org.apache.lucene.analysis.standard.StandardAnalyzer; > > import org.apache.lucene.util.Version; > > import org.apache.mahout.clustering.Cluster; > > import org.apache.mahout.clustering.canopy.CanopyDriver; > > import org.apache.mahout.clustering.classify.WeightedVectorWritable; > > import org.apache.mahout.clustering.display.DisplayKMeans; > > import org.apache.mahout.clustering.kmeans.KMeansDriver; > > import org.apache.mahout.common.HadoopUtil; > > import org.apache.mahout.common.Pair; > > import org.apache.mahout.common.StringTuple; > > import org.apache.mahout.common.distance.EuclideanDistanceMeasure; > > import org.apache.mahout.common.distance.TanimotoDistanceMeasure; > > import org.apache.mahout.math.Vector.Element; > > import org.apache.mahout.math.VectorWritable; > > import org.apache.mahout.vectorizer.DictionaryVectorizer; > > import org.apache.mahout.vectorizer.DocumentProcessor; > > import org.apache.mahout.vectorizer.tfidf.TFIDFConverter; > > > > > > public class MahoutReutersKMeans { > > > > private static int minSupport = 2; > > private static int maxNGramSize = 2; > > private static float minLLRValue = 1; > > private static float normPower = 2; > > private static boolean logNormalize = true; > > private static int numReducers = 1; > > private static int chunkSizeInMegabytes = 
200; > > private static boolean sequentialAccess = true; > > private static boolean namedVectors = false; > > > > private static int minDf = 5; > > > > private static long maxDF = 95; > > > > /** > > * @param args > > * @throws IOException > > * @throws InterruptedException > > * @throws ClassNotFoundException > > */ > > public static void main(String[] args) throws IOException, > > ClassNotFoundException, InterruptedException { > > > > Configuration conf = new Configuration(); > > String HADOOP_HOME = System.getenv("HADOOP_PREFIX"); > > > > conf.addResource(new Path(HADOOP_HOME, "conf/core-site.xml")); > > conf.addResource(new Path(HADOOP_HOME, "conf/hdfs-site.xml")); > > conf.addResource(new Path(HADOOP_HOME, "conf/mapred-site.xml")); > > > > FileSystem fs = FileSystem.get(conf); > > > > Path inputDir = new Path("reuters-seqfiles"); > > String outputDir = "reuters-kmeans-try"; > > HadoopUtil.delete(conf, new Path(outputDir)); > > StandardAnalyzer analyzer = new > > StandardAnalyzer(Version.LUCENE_43); > > Path tokenizedPath = new > > Path(DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER); > > DocumentProcessor.tokenizeDocuments(inputDir, > > analyzer.getClass().asSubclass(Analyzer.class), tokenizedPath, conf); > > > > > > DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, > new > > Path(outputDir), > > DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER, conf, > > minSupport , maxNGramSize, minLLRValue, normPower , logNormalize, > > numReducers , chunkSizeInMegabytes , sequentialAccess, namedVectors); > > > > > > Pair<Long[], List<Path>> features = > TFIDFConverter.calculateDF(new > > Path(outputDir, > > DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER), new > > Path(outputDir), conf, chunkSizeInMegabytes); > > TFIDFConverter.processTfIdf(new Path(outputDir, > > DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER), new > > Path(outputDir), conf, features, minDf , maxDF , normPower, logNormalize, > > sequentialAccess, false, numReducers); > > > > > > 
// Path tfidfVectorsPath = new Path(outputDir, "tfidf-vectors"); > > // Path canopyCentroidsPath = new Path(outputDir, > > "canopy-centroids"); > > // Path clustersPath = new Path(outputDir, "clusters"); > > // > > // CanopyDriver.run(conf, tfidfVectorsPath, canopyCentroidsPath, > new > > EuclideanDistanceMeasure(), 250, 120, false, 0.01, false); > > // KMeansDriver.run(conf, tfidfVectorsPath, new > > Path(canopyCentroidsPath, "clusters-0-final"), clustersPath, new > > TanimotoDistanceMeasure(), 0.01, 20, true, 0.01, false); > > // > > // SequenceFile.Reader reader = new SequenceFile.Reader(fs, new > > Path("reuters-clusters/" + Cluster.CLUSTERED_POINTS_DIR + > "/part-m-00000"), > > conf); > > // > > // IntWritable key = new IntWritable(); > > // WeightedVectorWritable value = new WeightedVectorWritable(); > > // while (reader.next(key, value)) { > > // System.out.println(key.toString()+" belongs to cluster > > "+value.toString()); > > // } > > // reader.close(); > > > > } > > > > } > > > > > > > > > > On Wed, Sep 4, 2013 at 3:29 PM, Gokhan Capan <gkhn...@gmail.com> wrote: > > > > > Taner, > > > > > > A few questions: > > > > > > Is there a specific reason not to consider using seq2sparse directly? > > (You > > > can edit seq2sparse.props to avoid passing commandline arguments every > > time > > > you run it, if that is the case) > > > > > > Java code you attached seems to do the same thing with > > > SparseVectorFromSequenceFiles#run(String[]), which is also the method > > > called when you run seq2sparse. I'm gonna debug it anyway. > > > > > > And I would like to know how you run the java code. Does your main > class > > > extend AbstractJob to make it "runnable" using bin/mahout? And does it > > have > > > a main method that submits your job to your hadoop cluster? Are you > using > > > hadoop jar command to run it? 
> > > > > > Best > > > > > > Gokhan > > > > > > > > > On Wed, Sep 4, 2013 at 1:15 PM, Taner Diler <taner.di...@gmail.com> > > wrote: > > > > > > > Suneel, samples from generated seqfiles: > > > > > > > > df-count > > > > > > > > Key: -1: Value: 21578 > > > > Key: 0: Value: 43 > > > > Key: 1: Value: 2 > > > > Key: 2: Value: 2 > > > > Key: 3: Value: 2 > > > > ... > > > > > > > > tf-vectors > > > > > > > > Key class: class org.apache.hadoop.io.Text Value Class: class > > > > org.apache.mahout.math.VectorWritable > > > > Key: /reut2-000.sgm-0.txt: Value: > > > > > > > > > > > > > > {62:0.024521886354905213,222:0.024521886354905213,291:0.024521886354905213,1411:0.024521886354905213,1421:0.024521886354905213,1451:0.024521886 > > > > 354905213,1456:0.024521886354905213.... > > > > > > > > wordcount/ngrams > > > > > > > > Key class: class org.apache.hadoop.io.Text Value Class: class > > > > org.apache.hadoop.io.DoubleWritable > > > > Key: 0: Value: 166.0 > > > > Key: 0.003: Value: 2.0 > > > > Key: 0.006913: Value: 2.0 > > > > Key: 0.007050: Value: 2.0 > > > > > > > > wordcount/subgrams > > > > > > > > Key class: class org.apache.mahout.vectorizer.collocations.llr.Gram > > Value > > > > Class: class org.apache.mahout.vectorizer.collocations.llr.Gram > > > > Key: '0 0'[n]:12: Value: '0'[h]:166 > > > > Key: '0 25'[n]:2: Value: '0'[h]:166 > > > > Key: '0 92'[n]:107: Value: '0'[h]:166 > > > > > > > > frequency.file-0 > > > > > > > > Key class: class org.apache.hadoop.io.IntWritable Value Class: class > > > > org.apache.hadoop.io.LongWritable > > > > Key: 0: Value: 43 > > > > Key: 1: Value: 2 > > > > Key: 2: Value: 2 > > > > Key: 3: Value: 2 > > > > Key: 4: Value: 9 > > > > Key: 5: Value: 4 > > > > > > > > > > > > dictionary.file-0 > > > > > > > > Key class: class org.apache.hadoop.io.Text Value Class: class > > > > org.apache.hadoop.io.IntWritable > > > > Key: 0: Value: 0 > > > > Key: 0.003: Value: 1 > > > > Key: 0.006913: Value: 2 > > > > Key: 0.007050: Value: 3 > > > > Key: 
0.01: Value: 4 > > > > Key: 0.02: Value: 5 > > > > Key: 0.025: Value: 6 > > > > > > > > > > > > > > > > > > > > > > > > On Wed, Sep 4, 2013 at 12:45 PM, Taner Diler <taner.di...@gmail.com> > > > > wrote: > > > > > > > > > mahout seq2sparse -i reuters-seqfiles/ -o reuters-kmeans-try -chunk > > 200 > > > > > -wt tfidf -s 2 -md 5 -x 95 -ng 2 -ml 50 -n 2 -seq > > > > > > > > > > this command works well. > > > > > > > > > > Gokhan, I changed minLLR value to 1.0 in java but result is same > > empty > > > > > tfidf-vectors. > > > > > > > > > > > > > > > On Tue, Sep 3, 2013 at 10:47 AM, Taner Diler < > taner.di...@gmail.com > > > > >wrote: > > > > > > > > > >> Gokhan, I try it from commandline it works. I will send the > command > > to > > > > >> compare command line parameters to TFIDFConverter params. > > > > >> > > > > >> Suneel, I had checked the seqfiles. I didn't see any problem other > > > > >> generated seqfiles but I will checked and send samples from each > > > > seqfiles. > > > > >> > > > > >> > > > > >> On Sun, Sep 1, 2013 at 11:02 PM, Gokhan Capan <gkhn...@gmail.com> > > > > wrote: > > > > >> > > > > >>> Suneel is right indeed. I assumed that everything performed prior > > to > > > > >>> vector > > > > >>> generation is done correctly. > > > > >>> > > > > >>> By the way, if the suggestions do not work, could you try running > > > > >>> seq2sparse from commandline with the same arguments and see if > that > > > > works > > > > >>> well? > > > > >>> > > > > >>> On Sun, Sep 1, 2013 at 7:23 PM, Suneel Marthi < > > > suneel_mar...@yahoo.com > > > > >>> >wrote: > > > > >>> > > > > >>> > I would first check to see if the input 'seqfiles' for > > > TFIDFGenerator > > > > >>> have > > > > >>> > any meat in them. > > > > >>> > This could also happen if the input seqfiles are empty. 
> > > > >>> > > > > >>> > > > > >>> > > > > > >>> > > > > > >>> > ________________________________ > > > > >>> > From: Taner Diler <taner.di...@gmail.com> > > > > >>> > To: user@mahout.apache.org > > > > >>> > Sent: Sunday, September 1, 2013 2:24 AM > > > > >>> > Subject: TFIDFConverter generates empty tfidf-vectors > > > > >>> > > > > > >>> > > > > > >>> > Hi all, > > > > >>> > > > > > >>> > I try to run Reuters KMeans example in Java, but TFIDFComverter > > > > >>> generates > > > > >>> > tfidf-vectors as empty. How can I fix that? > > > > >>> > > > > > >>> > private static int minSupport = 2; > > > > >>> > private static int maxNGramSize = 2; > > > > >>> > private static float minLLRValue = 50; > > > > >>> > private static float normPower = 2; > > > > >>> > private static boolean logNormalize = true; > > > > >>> > private static int numReducers = 1; > > > > >>> > private static int chunkSizeInMegabytes = 200; > > > > >>> > private static boolean sequentialAccess = true; > > > > >>> > private static boolean namedVectors = false; > > > > >>> > private static int minDf = 5; > > > > >>> > private static long maxDF = 95; > > > > >>> > > > > > >>> > Path inputDir = new Path("reuters-seqfiles"); > > > > >>> > String outputDir = "reuters-kmeans-try"; > > > > >>> > HadoopUtil.delete(conf, new Path(outputDir)); > > > > >>> > StandardAnalyzer analyzer = new > > > > >>> > StandardAnalyzer(Version.LUCENE_43); > > > > >>> > Path tokenizedPath = new > > > > >>> > Path(DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER); > > > > >>> > DocumentProcessor.tokenizeDocuments(inputDir, > > > > >>> > analyzer.getClass().asSubclass(Analyzer.class), tokenizedPath, > > > conf); > > > > >>> > > > > > >>> > > > > > >>> > > > > > DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, > > > > >>> new > > > > >>> > Path(outputDir), > > > > >>> > > > DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER, > > > > >>> conf, > > > > >>> > minSupport , maxNGramSize, minLLRValue, normPower , > 
logNormalize, > > > > >>> > numReducers , chunkSizeInMegabytes , sequentialAccess, > > > namedVectors); > > > > >>> > > > > > >>> > > > > > >>> > Pair<Long[], List<Path>> features = > > > > >>> TFIDFConverter.calculateDF(new > > > > >>> > Path(outputDir, > > > > >>> > > > > DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER), > > > > >>> new > > > > >>> > Path(outputDir), conf, chunkSizeInMegabytes); > > > > >>> > TFIDFConverter.processTfIdf(new Path(outputDir, > > > > >>> > > > > DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER), > > > > >>> new > > > > >>> > Path(outputDir), conf, features, minDf , maxDF , normPower, > > > > >>> logNormalize, > > > > >>> > sequentialAccess, false, numReducers); > > > > >>> > > > > > >>> > > > > >> > > > > >> > > > > > > > > > > > > > > >