Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/text/TextParagraphSplittingJob.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/text/TextParagraphSplittingJob.java?rev=948935&r1=948934&r2=948935&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/text/TextParagraphSplittingJob.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/text/TextParagraphSplittingJob.java Thu May 27 18:02:20 2010 @@ -53,13 +53,13 @@ public class TextParagraphSplittingJob e public static class SplitMap extends Mapper<Text,Text,Text,Text> { @Override - public void map(Text key, Text text, Context context) throws IOException, InterruptedException { + protected void map(Text key, Text text, Context context) throws IOException, InterruptedException { Text outText = new Text(); int loc = 0; - while(loc >= 0 && loc < text.getLength()) { - int nextLoc = text.find("\n\n", loc+1); + while (loc >= 0 && loc < text.getLength()) { + int nextLoc = text.find("\n\n", loc + 1); if (nextLoc > 0) { - outText.set(text.getBytes(), loc, (nextLoc - loc)); + outText.set(text.getBytes(), loc, nextLoc - loc); context.write(key, outText); } loc = nextLoc;
Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java?rev=948935&r1=948934&r2=948935&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java Thu May 27 18:02:20 2010 @@ -41,7 +41,7 @@ import org.apache.hadoop.mapred.jobcontr import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class SequenceFileDumper { +public final class SequenceFileDumper { private static final Logger log = LoggerFactory.getLogger(SequenceFileDumper.class); Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java?rev=948935&r1=948934&r2=948935&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java Thu May 27 18:02:20 2010 @@ -19,7 +19,6 @@ package org.apache.mahout.utils.clusteri import java.io.File; import java.io.FileWriter; -import java.io.FilenameFilter; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.Writer; @@ -30,7 +29,6 @@ import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; -import java.util.SortedMap; import java.util.TreeMap; import org.apache.commons.cli2.CommandLine; @@ -48,7 +46,6 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.SequenceFile; -import 
org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java?rev=948935&r1=948934&r2=948935&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java Thu May 27 18:02:20 2010 @@ -45,13 +45,12 @@ import org.apache.lucene.analysis.Analyz import org.apache.mahout.common.CommandLineUtil; import org.apache.mahout.common.HadoopUtil; import org.apache.mahout.text.DefaultAnalyzer; -import org.apache.mahout.utils.vectors.text.DictionaryVectorizer; import org.apache.mahout.utils.vectors.text.DocumentProcessor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** Driver for LLR Collocation discovery mapreduce job */ -public class CollocDriver extends Configured implements Tool { +public final class CollocDriver extends Configured implements Tool { public static final String DEFAULT_OUTPUT_DIRECTORY = "output"; public static final String SUBGRAM_OUTPUT_DIRECTORY = "subgrams"; public static final String NGRAM_OUTPUT_DIRECTORY = "ngrams"; @@ -214,7 +213,7 @@ public class CollocDriver extends Config } /** - * Generate all ngrams for the {@link DictionaryVectorizer} job + * Generate all ngrams for the {@link org.apache.mahout.utils.vectors.text.DictionaryVectorizer} job * * @param input * input path containing tokenized documents Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java?rev=948935&r1=948934&r2=948935&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java Thu May 27 18:02:20 2010 @@ -96,12 +96,10 @@ public class CollocReducer extends MapRe if (keyType == Gram.Type.UNIGRAM) { // sum frequencies for unigrams. processUnigram(key, values, output, reporter); - } - else if (keyType == Gram.Type.HEAD || keyType == Gram.Type.TAIL) { + } else if (keyType == Gram.Type.HEAD || keyType == Gram.Type.TAIL) { // sum frequencies for subgrams, ngram and collect for each ngram. processSubgram(key, values, output, reporter); - } - else { + } else { reporter.incrCounter(Skipped.MALFORMED_TYPES, 1); } } @@ -153,27 +151,23 @@ public class CollocReducer extends MapRe // collect frequency for subgrams. if (subgram == null) { subgram = new Gram(value); - } - else { + } else { subgram.incrementFrequency(value.getFrequency()); } - } - else if (!value.equals(currentNgram)) { + } else if (!value.equals(currentNgram)) { // we've collected frequency for all subgrams and we've encountered a new ngram. // collect the old ngram if there was one and we have sufficient support and // create the new ngram. 
if (currentNgram != null) { if (currentNgram.getFrequency() < minSupport) { reporter.incrCounter(Skipped.LESS_THAN_MIN_SUPPORT, 1); - } - else { + } else { output.collect(currentNgram, subgram); } } currentNgram = new Gram(value); - } - else { + } else { currentNgram.incrementFrequency(value.getFrequency()); } } Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/Gram.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/Gram.java?rev=948935&r1=948934&r2=948935&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/Gram.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/Gram.java Thu May 27 18:02:20 2010 @@ -140,7 +140,7 @@ public class Gram extends BinaryComparab */ public String getString() { try { - return Text.decode(bytes, 1, length-1); + return Text.decode(bytes, 1, length - 1); } catch (CharacterCodingException e) { throw new IllegalStateException("Should not have happened " + e.toString()); } @@ -194,7 +194,7 @@ public class Gram extends BinaryComparab * @param keepData should the old data be kept */ private void setCapacity(int len, boolean keepData) { - len+=1; // extra byte to hold type + len++; // extra byte to hold type if (bytes == null || bytes.length < len) { byte[] newBytes = new byte[len]; if (bytes != null && keepData) { Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKey.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKey.java?rev=948935&r1=948934&r2=948935&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKey.java (original) +++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKey.java Thu May 27 18:02:20 2010 @@ -121,7 +121,7 @@ public class GramKey extends BinaryCompa public String getPrimaryString() { try { - return Text.decode(bytes, 1, primaryLength-1); + return Text.decode(bytes, 1, primaryLength - 1); } catch (CharacterCodingException e) { throw new IllegalStateException(e); } Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKeyGroupComparator.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKeyGroupComparator.java?rev=948935&r1=948934&r2=948935&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKeyGroupComparator.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKeyGroupComparator.java Thu May 27 18:02:20 2010 @@ -17,13 +17,15 @@ package org.apache.mahout.utils.nlp.collocations.llr; +import java.io.Serializable; + import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.WritableComparator; /** Group GramKeys based on their Gram, ignoring the secondary sort key, so that all keys with the same Gram are sent * to the same call of the reduce method, sorted in natural order (for GramKeys). 
*/ -public class GramKeyGroupComparator extends WritableComparator { +public class GramKeyGroupComparator extends WritableComparator implements Serializable { protected GramKeyGroupComparator() { super(GramKey.class, true); @@ -35,7 +37,8 @@ public class GramKeyGroupComparator exte GramKey gka = (GramKey) a; GramKey gkb = (GramKey) b; - return WritableComparator.compareBytes(gka.getBytes(), 0, gka.getPrimaryLength(), gkb.getBytes(), 0, gkb.getPrimaryLength()); + return WritableComparator.compareBytes(gka.getBytes(), 0, gka.getPrimaryLength(), + gkb.getBytes(), 0, gkb.getPrimaryLength()); } } Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducer.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducer.java?rev=948935&r1=948934&r2=948935&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducer.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducer.java Thu May 27 18:02:20 2010 @@ -108,8 +108,7 @@ public class LLRReducer extends MapReduc OutputCollector<Text,DoubleWritable> output, Reporter reporter) throws IOException { - int[] gramFreq = new int[2]; - gramFreq[0] = gramFreq[1] = -1; + int[] gramFreq = {-1, -1}; if (ngram.getType() == Gram.Type.UNIGRAM && emitUnigrams) { DoubleWritable dd = new DoubleWritable(ngram.getFrequency()); Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java?rev=948935&r1=948934&r2=948935&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java (original) +++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/RowIdJob.java Thu May 27 18:02:20 2010 @@ -56,9 +56,9 @@ public class RowIdJob extends AbstractJo VectorWritable v = new VectorWritable(); int i = 0; - for(FileStatus status : fs.listStatus(inputPath)) { + for (FileStatus status : fs.listStatus(inputPath)) { SequenceFile.Reader reader = new SequenceFile.Reader(fs, status.getPath(), conf); - while(reader.next(inputKey, v)) { + while (reader.next(inputKey, v)) { docId.set(i); indexWriter.append(docId, inputKey); matrixWriter.append(docId, v); Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterable.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterable.java?rev=948935&r1=948934&r2=948935&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterable.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterable.java Thu May 27 18:02:20 2010 @@ -56,7 +56,7 @@ public class SequenceFileVectorIterable } } - public class SeqFileIterator implements Iterator<Vector> { + public final class SeqFileIterator implements Iterator<Vector> { private final Writable key; private final Writable value; Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java?rev=948935&r1=948934&r2=948935&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java Thu May 27 18:02:20 2010 @@ -150,7 +150,7 @@ public final class VectorDumper { 
if (cmdLine.hasOption(outputOpt)) { writer.close(); } - System.err.println("Dumped " + i + " Vectors"); + System.out.println("Dumped " + i + " Vectors"); } } catch (OptionException e) { Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java?rev=948935&r1=948934&r2=948935&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java Thu May 27 18:02:20 2010 @@ -148,7 +148,7 @@ public class ARFFVectorIterable implemen return new ARFFIterator(); } - private class ARFFIterator implements Iterator<Vector> { + private final class ARFFIterator implements Iterator<Vector> { private String line; Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java?rev=948935&r1=948934&r2=948935&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java Thu May 27 18:02:20 2010 @@ -21,11 +21,8 @@ import java.io.IOException; import java.util.List; import org.apache.hadoop.conf.Configurable; -import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapred.FileInputFormat; import 
org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobClient; @@ -37,7 +34,8 @@ import org.apache.mahout.common.HadoopUt import org.apache.mahout.math.VectorWritable; /** - * This class groups a set of input vectors. The Sequence file input should have a {@link WritableComparable} + * This class groups a set of input vectors. The Sequence file input should have a + * {@link org.apache.hadoop.io.WritableComparable} * key containing document id and a {@link VectorWritable} value containing the term frequency vector. This * class also does normalization of the vector. * @@ -64,7 +62,7 @@ public final class PartialVectorMerger { * {@link org.apache.mahout.math.RandomAccessSparseVector} * * @param partialVectorPaths - * input directory of the vectors in {@link SequenceFile} format + * input directory of the vectors in {@link org.apache.hadoop.io.SequenceFile} format * @param output * output directory were the partial vectors have to be created * @param normPower Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java?rev=948935&r1=948934&r2=948935&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java Thu May 27 18:02:20 2010 @@ -29,7 +29,6 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; -import java.util.Map.Entry; import org.apache.commons.cli2.CommandLine; import org.apache.commons.cli2.Group; @@ -42,7 +41,6 @@ import org.apache.commons.cli2.commandli import org.apache.hadoop.fs.Path; import org.apache.lucene.document.FieldSelector; import 
org.apache.lucene.document.SetBasedFieldSelector; -import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermDocs; @@ -116,7 +114,7 @@ public class ClusterLabels { private String idField; - private Map<Integer, List<WeightedVectorWritable>> clusterIdToPoints = null; + private Map<Integer, List<WeightedVectorWritable>> clusterIdToPoints; private String output; @@ -124,8 +122,12 @@ public class ClusterLabels { private int maxLabels = DEFAULT_MAX_LABELS; - public ClusterLabels(Path seqFileDir, Path pointsDir, String indexDir, String contentField, int minNumIds, int maxLabels) - throws IOException { + public ClusterLabels(Path seqFileDir, + Path pointsDir, + String indexDir, + String contentField, + int minNumIds, + int maxLabels) throws IOException { this.seqFileDir = seqFileDir; this.pointsDir = pointsDir; this.indexDir = indexDir; @@ -149,7 +151,7 @@ public class ClusterLabels { writer = new OutputStreamWriter(System.out); } - for (Entry<Integer, List<WeightedVectorWritable>> integerListEntry : clusterIdToPoints.entrySet()) { + for (Map.Entry<Integer, List<WeightedVectorWritable>> integerListEntry : clusterIdToPoints.entrySet()) { List<WeightedVectorWritable> wvws = integerListEntry.getValue(); List<TermInfoClusterInOut> termInfos = getClusterLabels(integerListEntry.getKey(), wvws); if (termInfos != null) { @@ -173,14 +175,9 @@ public class ClusterLabels { /** * Get the list of labels, sorted by best score. 
- * - * @param integer - * @param wvws - * @return - * @throws CorruptIndexException - * @throws IOException */ - protected List<TermInfoClusterInOut> getClusterLabels(Integer integer, List<WeightedVectorWritable> wvws) throws IOException { + protected List<TermInfoClusterInOut> getClusterLabels(Integer integer, List<WeightedVectorWritable> wvws) + throws IOException { if (wvws.size() < minNumIds) { log.info("Skipping small cluster {} with size: {}", integer, wvws.size()); @@ -266,12 +263,14 @@ public class ClusterLabels { return clusteredTermInfo.subList(0, Math.min(clusteredTermInfo.size(), maxLabels)); } - private static OpenBitSet getClusterDocBitset(IndexReader reader, Set<String> idSet, String idField) throws IOException { + private static OpenBitSet getClusterDocBitset(IndexReader reader, Set<String> idSet, String idField) + throws IOException { int numDocs = reader.numDocs(); OpenBitSet bitset = new OpenBitSet(numDocs); - FieldSelector idFieldSelector = new SetBasedFieldSelector(Collections.singleton(idField), Collections.<String>emptySet()); + FieldSelector idFieldSelector = + new SetBasedFieldSelector(Collections.singleton(idField), Collections.<String>emptySet()); for (int i = 0; i < numDocs; i++) { String id = null; @@ -319,16 +318,16 @@ public class ClusterLabels { GroupBuilder gbuilder = new GroupBuilder(); Option indexOpt = obuilder.withLongName("dir").withRequired(true).withArgument( - abuilder.withName("dir").withMinimum(1).withMaximum(1).create()).withDescription("The Lucene index directory") - .withShortName("d").create(); + abuilder.withName("dir").withMinimum(1).withMaximum(1).create()) + .withDescription("The Lucene index directory").withShortName("d").create(); Option outputOpt = obuilder.withLongName("output").withRequired(false).withArgument( abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription( "The output file. 
If not specified, the result is printed on console.").withShortName("o").create(); Option fieldOpt = obuilder.withLongName("field").withRequired(true).withArgument( - abuilder.withName("field").withMinimum(1).withMaximum(1).create()).withDescription("The content field in the index") - .withShortName("f").create(); + abuilder.withName("field").withMinimum(1).withMaximum(1).create()) + .withDescription("The content field in the index").withShortName("f").create(); Option idFieldOpt = obuilder.withLongName("idField").withRequired(false).withArgument( abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).withDescription( @@ -341,7 +340,8 @@ public class ClusterLabels { Option pointsOpt = obuilder.withLongName("pointsDir").withRequired(true).withArgument( abuilder.withName("pointsDir").withMinimum(1).withMaximum(1).create()).withDescription( - "The directory containing points sequence files mapping input vectors to their cluster. ").withShortName("p").create(); + "The directory containing points sequence files mapping input vectors to their cluster. 
") + .withShortName("p").create(); Option minClusterSizeOpt = obuilder.withLongName("minClusterSize").withRequired(false).withArgument( abuilder.withName("minClusterSize").withMinimum(1).withMaximum(1).create()).withDescription( "The minimum number of points required in a cluster to print the labels for").withShortName("m").create(); @@ -350,9 +350,9 @@ public class ClusterLabels { "The maximum number of labels to print per cluster").withShortName("x").create(); Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create(); - Group group = gbuilder.withName("Options").withOption(indexOpt).withOption(idFieldOpt).withOption(outputOpt).withOption( - fieldOpt).withOption(seqOpt).withOption(pointsOpt).withOption(helpOpt).withOption(maxLabelsOpt).withOption( - minClusterSizeOpt).create(); + Group group = gbuilder.withName("Options").withOption(indexOpt).withOption(idFieldOpt).withOption(outputOpt) + .withOption(fieldOpt).withOption(seqOpt).withOption(pointsOpt).withOption(helpOpt) + .withOption(maxLabelsOpt).withOption(minClusterSizeOpt).create(); try { Parser parser = new Parser(); Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java?rev=948935&r1=948934&r2=948935&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java Thu May 27 18:02:20 2010 @@ -54,7 +54,7 @@ import org.apache.mahout.utils.vectors.i import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class Driver { +public final class Driver { private static final Logger log = LoggerFactory.getLogger(Driver.class); private Driver() { } Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java?rev=948935&r1=948934&r2=948935&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java Thu May 27 18:02:20 2010 @@ -32,17 +32,17 @@ import org.apache.mahout.math.Vector; * {@link Vector}. The Field used to create the Vector currently must have Term Vectors stored for it. */ public class LuceneIterable implements Iterable<Vector> { - + + public static final double NO_NORMALIZING = -1.0; + private final IndexReader indexReader; private final String field; - private final String idField; - private final FieldSelector idFieldSelector; + //private final String idField; + //private final FieldSelector idFieldSelector; private final VectorMapper mapper; private double normPower = NO_NORMALIZING; - - public static final double NO_NORMALIZING = -1.0; - + public LuceneIterable(IndexReader reader, String idField, String field, VectorMapper mapper) { this(reader, idField, field, mapper, NO_NORMALIZING); } @@ -70,9 +70,9 @@ public class LuceneIterable implements I if (normPower != NO_NORMALIZING && normPower < 0) { throw new IllegalArgumentException("normPower must either be -1 or >= 0"); } - idFieldSelector = new SetBasedFieldSelector(Collections.singleton(idField), Collections.<String>emptySet()); + //idFieldSelector = new SetBasedFieldSelector(Collections.singleton(idField), Collections.<String>emptySet()); this.indexReader = reader; - this.idField = idField; + //this.idField = idField; this.field = field; this.mapper = mapper; this.normPower = normPower; Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentProcessor.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentProcessor.java?rev=948935&r1=948934&r2=948935&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentProcessor.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentProcessor.java Thu May 27 18:02:20 2010 @@ -21,9 +21,7 @@ import java.io.IOException; import java.nio.charset.Charset; import org.apache.hadoop.conf.Configurable; -import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; @@ -38,7 +36,8 @@ import org.apache.mahout.utils.vectors.t /** * This class converts a set of input documents in the sequence file format of {@link StringTuple}s.The - * {@link SequenceFile} input should have a {@link Text} key containing the unique document identifier and a + * {@link org.apache.hadoop.io.SequenceFile} input should have a {@link Text} key + * containing the unique document identifier and a * {@link Text} value containing the whole document. The document should be stored in UTF-8 encoding which is * recognizable by hadoop. It uses the given {@link Analyzer} to process the document into * {@link org.apache.lucene.analysis.Token}s. 
@@ -60,10 +59,10 @@ public final class DocumentProcessor { /** * Convert the input documents into token array using the {@link StringTuple} The input documents has to be - * in the {@link SequenceFile} format + * in the {@link org.apache.hadoop.io.SequenceFile} format * * @param input - * input directory of the documents in {@link SequenceFile} format + * input directory of the documents in {@link org.apache.hadoop.io.SequenceFile} format * @param output * output directory were the {@link StringTuple} token array of each document has to be created * @param analyzerClass Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java?rev=948935&r1=948934&r2=948935&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java Thu May 27 18:02:20 2010 @@ -44,7 +44,7 @@ public class SequenceFileTokenizerMapper public void map(Text key, Text value, OutputCollector<Text,StringTuple> output, Reporter reporter) throws IOException { TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString())); - TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class); + TermAttribute termAtt = stream.addAttribute(TermAttribute.class); StringTuple document = new StringTuple(); while (stream.incrementToken()) { if (termAtt.termLength() > 0) { Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java?rev=948935&r1=948934&r2=948935&view=diff ============================================================================== --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java (original) +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java Thu May 27 18:02:20 2010 @@ -32,7 +32,6 @@ import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobClient; @@ -49,7 +48,8 @@ import org.apache.mahout.utils.vectors.t /** * This class converts a set of input vectors with term frequencies to TfIdf vectors. The Sequence file input - * should have a {@link WritableComparable} key containing and a {@link VectorWritable} value containing the + * should have a {@link org.apache.hadoop.io.WritableComparable} key containing and a + * {@link VectorWritable} value containing the * term frequency vector. This is conversion class uses multiple map/reduces to convert the vectors to TfIdf * format *
