The FPM output is incorrect. I will have to see what broke.

Robin
On Thu, Mar 11, 2010 at 2:56 PM, <sro...@apache.org> wrote: > Author: srowen > Date: Thu Mar 11 09:26:39 2010 > New Revision: 921751 > > URL: http://svn.apache.org/viewvc?rev=921751&view=rev > Log: > Last round of streamlining/style suggestions for 0.3, plus possible fix for > PFPGrowthTest unit test > > Modified: > > > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesFeatureMapper.java > > > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyCreatorMapper.java > > > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/inmem/InMemInputFormat.java > > > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/MapredOutput.java > > > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/inmem/InMemInputFormat.java > > > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthReducer.java > > > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FrequentPatternMaxHeap.java > > > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/DistributedRowMatrix.java > > > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/MatrixMultiplicationJob.java > > > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVector.java > > > lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestPrintableInterface.java > > > lucene/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/PFPGrowthTest.java > > > lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java > > > lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/DataSet.java > > > lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/tool/ToolCombiner.java > > > lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/tool/ToolReducer.java > > 
> lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/ToolCombinerTest.java > > > lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/ToolReducerTest.java > > > lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java > > > lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java > > > lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java > > > lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKey.java > > > lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKeyPartitioner.java > > Modified: > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesFeatureMapper.java > URL: > http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesFeatureMapper.java?rev=921751&r1=921750&r2=921751&view=diff > > ============================================================================== > --- > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesFeatureMapper.java > (original) > +++ > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesFeatureMapper.java > Thu Mar 11 09:26:39 2010 > @@ -18,9 +18,8 @@ > package org.apache.mahout.classifier.bayes.mapreduce.common; > > import java.io.IOException; > -import java.util.Arrays; > import java.util.Iterator; > -import java.util.List; > +import java.util.regex.Pattern; > > import org.apache.commons.lang.mutable.MutableDouble; > import org.apache.hadoop.io.DoubleWritable; > @@ -33,8 +32,10 @@ import org.apache.hadoop.mapred.Reporter > import org.apache.lucene.analysis.TokenStream; > import org.apache.lucene.analysis.shingle.ShingleFilter; > import 
org.apache.lucene.analysis.tokenattributes.TermAttribute; > +import org.apache.mahout.classifier.BayesFileFormatter; > import org.apache.mahout.common.Parameters; > import org.apache.mahout.common.StringTuple; > +import org.apache.mahout.common.iterator.ArrayIterator; > import org.apache.mahout.math.function.ObjectIntProcedure; > import org.apache.mahout.math.function.ObjectProcedure; > import org.apache.mahout.math.map.OpenObjectIntHashMap; > @@ -42,16 +43,17 @@ import org.slf4j.Logger; > import org.slf4j.LoggerFactory; > > /** > - * Reads the input train set(preprocessed using the {@link > org.apache.mahout.classifier.BayesFileFormatter}). > + * Reads the input train set(preprocessed using the {@link > BayesFileFormatter}). > */ > public class BayesFeatureMapper extends MapReduceBase implements > Mapper<Text,Text,StringTuple,DoubleWritable> { > > private static final Logger log = > LoggerFactory.getLogger(BayesFeatureMapper.class); > > private static final DoubleWritable ONE = new DoubleWritable(1.0); > - > + private static final Pattern SPACE_PATTERN = Pattern.compile("[ ]+"); > + > private int gramSize = 1; > - > + > /** > * We need to count the number of times we've seen a term with a given > label and we need to output that. But > * this Mapper does more than just outputing the count. It first does > weight normalisation. 
Secondly, it > @@ -75,27 +77,27 @@ public class BayesFeatureMapper extends > Reporter reporter) throws IOException { > // String line = value.toString(); > final String label = key.toString(); > - List<String> tokens = Arrays.asList(value.toString().split("[ ]+")); > - OpenObjectIntHashMap<String> wordList = new > OpenObjectIntHashMap<String>(tokens.size() * gramSize); > + String[] tokens = SPACE_PATTERN.split(value.toString()); > + OpenObjectIntHashMap<String> wordList = new > OpenObjectIntHashMap<String>(tokens.length * gramSize); > > if (gramSize > 1) { > - ShingleFilter sf = new ShingleFilter(new > IteratorTokenStream(tokens.iterator()), gramSize); > + ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(new > ArrayIterator<String>(tokens)), gramSize); > do { > String term = ((TermAttribute) > sf.getAttribute(TermAttribute.class)).term(); > if (term.length() > 0) { > - if (wordList.containsKey(term) == false) { > - wordList.put(term, 1); > - } else { > + if (wordList.containsKey(term)) { > wordList.put(term, 1 + wordList.get(term)); > + } else { > + wordList.put(term, 1); > } > } > } while (sf.incrementToken()); > } else { > for (String term : tokens) { > - if (wordList.containsKey(term) == false) { > - wordList.put(term, 1); > - } else { > + if (wordList.containsKey(term)) { > wordList.put(term, 1 + wordList.get(term)); > + } else { > + wordList.put(term, 1); > } > } > } > > Modified: > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyCreatorMapper.java > URL: > http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyCreatorMapper.java?rev=921751&r1=921750&r2=921751&view=diff > > ============================================================================== > --- > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyCreatorMapper.java > (original) > +++ > 
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyCreatorMapper.java > Thu Mar 11 09:26:39 2010 > @@ -18,6 +18,7 @@ > package org.apache.mahout.clustering.meanshift; > > import java.io.IOException; > +import java.util.regex.Pattern; > > import org.apache.hadoop.io.Text; > import org.apache.hadoop.io.WritableComparable; > @@ -30,9 +31,10 @@ import org.apache.mahout.math.VectorWrit > > public class MeanShiftCanopyCreatorMapper extends MapReduceBase implements > Mapper<WritableComparable<?>,VectorWritable,Text,MeanShiftCanopy> { > - > + > + private static final Pattern UNDERSCORE_PATTERN = Pattern.compile("_"); > private static int nextCanopyId = -1; > - > + > @Override > public void map(WritableComparable<?> key, > VectorWritable vector, > @@ -47,7 +49,7 @@ public class MeanShiftCanopyCreatorMappe > super.configure(job); > if (nextCanopyId == -1) { > String taskId = job.get("mapred.task.id"); > - String[] parts = taskId.split("_"); > + String[] parts = UNDERSCORE_PATTERN.split(taskId); > if (parts.length != 6 || !parts[0].equals("attempt") > || (!"m".equals(parts[3]) && !"r".equals(parts[3]))) { > throw new IllegalArgumentException("TaskAttemptId string : " + > taskId + " is not properly formed"); > > Modified: > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/inmem/InMemInputFormat.java > URL: > http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/inmem/InMemInputFormat.java?rev=921751&r1=921750&r2=921751&view=diff > > ============================================================================== > --- > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/inmem/InMemInputFormat.java > (original) > +++ > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapred/inmem/InMemInputFormat.java > Thu Mar 11 09:26:39 2010 > @@ -236,13 +236,11 @@ public class InMemInputFormat implements > } > > InMemInputSplit split = (InMemInputSplit) obj; > - > - 
if (seed == null && split.seed != null) { > - return false; > - } > - > - return firstId == split.firstId && nbTrees == split.nbTrees > - && (seed == null || seed.equals(split.seed)); > + > + return firstId == split.firstId && > + nbTrees == split.nbTrees && > + ((seed == null && split.seed == null) || > seed.equals(split.seed)); > + > } > > @Override > > Modified: > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/MapredOutput.java > URL: > http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/MapredOutput.java?rev=921751&r1=921750&r2=921751&view=diff > > ============================================================================== > --- > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/MapredOutput.java > (original) > +++ > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/MapredOutput.java > Thu Mar 11 09:26:39 2010 > @@ -106,12 +106,9 @@ public class MapredOutput implements Wri > } > > MapredOutput mo = (MapredOutput) obj; > - > - if ((tree != null) && (tree.equals(mo.getTree()) == false)) { > - return false; > - } > - > - return Arrays.equals(predictions, mo.getPredictions()); > + > + return ((tree == null && mo.getTree() == null) || > tree.equals(mo.getTree())) && > + Arrays.equals(predictions, mo.getPredictions()); > } > > @Override > > Modified: > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/inmem/InMemInputFormat.java > URL: > http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/inmem/InMemInputFormat.java?rev=921751&r1=921750&r2=921751&view=diff > > ============================================================================== > --- > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/inmem/InMemInputFormat.java > (original) > +++ > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/df/mapreduce/inmem/InMemInputFormat.java > Thu Mar 11 09:26:39 2010 > @@ 
-244,13 +244,11 @@ public class InMemInputFormat extends In > } > > InMemInputSplit split = (InMemInputSplit) obj; > - > - if ((seed == null) && (split.seed != null)) { > - return false; > - } > - > - return (firstId == split.firstId) && (nbTrees == split.nbTrees) > - && ((seed == null) || seed.equals(split.seed)); > + > + return firstId == split.firstId && > + nbTrees == split.nbTrees && > + ((seed == null && split.seed == null) || > seed.equals(split.seed)); > + > } > > @Override > > Modified: > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthReducer.java > URL: > http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthReducer.java?rev=921751&r1=921750&r2=921751&view=diff > > ============================================================================== > --- > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthReducer.java > (original) > +++ > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthReducer.java > Thu Mar 11 09:26:39 2010 > @@ -51,13 +51,13 @@ import org.apache.mahout.math.map.OpenOb > > public class ParallelFPGrowthReducer extends > Reducer<LongWritable,TransactionTree,Text,TopKStringPatterns> { > > - private final List<Pair<Integer,Long>> fList = new > ArrayList<Pair<Integer,Long>>(); > + //private final List<Pair<Integer,Long>> fList = new > ArrayList<Pair<Integer,Long>>(); > > private final List<String> featureReverseMap = new ArrayList<String>(); > > private final OpenObjectIntHashMap<String> fMap = new > OpenObjectIntHashMap<String>(); > > - private final List<String> fRMap = new ArrayList<String>(); > + //private final List<String> fRMap = new ArrayList<String>(); > > private final OpenLongObjectHashMap<IntArrayList> groupFeatures = new > OpenLongObjectHashMap<IntArrayList>(); > > @@ -114,8 +114,8 @@ public class ParallelFPGrowthReducer ext > for (Pair<String,Long> e : 
PFPGrowth.deserializeList(params, "fList", > context.getConfiguration())) { > featureReverseMap.add(e.getFirst()); > fMap.put(e.getFirst(), i); > - fRMap.add(e.getFirst()); > - fList.add(new Pair<Integer,Long>(i++, e.getSecond())); > + //fRMap.add(e.getFirst()); // TODO never read? > + //fList.add(new Pair<Integer,Long>(i++, e.getSecond())); > > } > > > Modified: > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FrequentPatternMaxHeap.java > URL: > http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FrequentPatternMaxHeap.java?rev=921751&r1=921750&r2=921751&view=diff > > ============================================================================== > --- > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FrequentPatternMaxHeap.java > (original) > +++ > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FrequentPatternMaxHeap.java > Thu Mar 11 09:26:39 2010 > @@ -57,10 +57,7 @@ public final class FrequentPatternMaxHea > } > > public boolean addable(long support) { > - if (count < maxSize) { > - return true; > - } > - return least.support() <= support; > + return count < maxSize || least.support() <= support; > } > > public PriorityQueue<Pattern> getHeap() { > > Modified: > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/DistributedRowMatrix.java > URL: > http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/DistributedRowMatrix.java?rev=921751&r1=921750&r2=921751&view=diff > > ============================================================================== > --- > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/DistributedRowMatrix.java > (original) > +++ > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/DistributedRowMatrix.java > Thu Mar 11 09:26:39 2010 > @@ -64,7 +64,7 @@ public class DistributedRowMatrix implem 
> private static final Logger log = > LoggerFactory.getLogger(DistributedRowMatrix.class); > > private final String inputPathString; > - private String outputTmpPathString; > + private final String outputTmpPathString; > private JobConf conf; > private Path rowPath; > private Path outputTmpBasePath; > @@ -200,7 +200,7 @@ public class DistributedRowMatrix implem > > public static class DistributedMatrixIterator implements > Iterator<MatrixSlice> { > private SequenceFile.Reader reader; > - private FileStatus[] statuses; > + private final FileStatus[] statuses; > private boolean hasBuffered = false; > private boolean hasNext = false; > private int statusIndex = 0; > > Modified: > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/MatrixMultiplicationJob.java > URL: > http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/MatrixMultiplicationJob.java?rev=921751&r1=921750&r2=921751&view=diff > > ============================================================================== > --- > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/MatrixMultiplicationJob.java > (original) > +++ > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/MatrixMultiplicationJob.java > Thu Mar 11 09:26:39 2010 > @@ -1,3 +1,20 @@ > +/** > + * Licensed to the Apache Software Foundation (ASF) under one or more > + * contributor license agreements. See the NOTICE file distributed with > + * this work for additional information regarding copyright ownership. > + * The ASF licenses this file to You under the Apache License, Version 2.0 > + * (the "License"); you may not use this file except in compliance with > + * the License. 
You may obtain a copy of the License at > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. > + */ > + > package org.apache.mahout.math.hadoop; > > import org.apache.commons.cli2.Option; > @@ -14,7 +31,6 @@ import org.apache.hadoop.mapred.Sequence > import org.apache.hadoop.mapred.SequenceFileOutputFormat; > import org.apache.hadoop.mapred.join.CompositeInputFormat; > import org.apache.hadoop.mapred.join.TupleWritable; > -import org.apache.hadoop.mapred.lib.MultipleInputs; > import org.apache.hadoop.util.ToolRunner; > import org.apache.mahout.cf.taste.hadoop.AbstractJob; > import org.apache.mahout.math.RandomAccessSparseVector; > @@ -30,13 +46,11 @@ public class MatrixMultiplicationJob ext > > private static final String OUT_CARD = "output.vector.cardinality"; > > - private Map<String,String> argMap; > - > public static JobConf createMatrixMultiplyJobConf(Path aPath, Path bPath, > Path outPath, int outCardinality) { > JobConf conf = new JobConf(MatrixMultiplicationJob.class); > conf.setInputFormat(CompositeInputFormat.class); > conf.set("mapred.join.expr", CompositeInputFormat.compose( > - "inner", SequenceFileInputFormat.class, new Path[] {aPath, > bPath})); > + "inner", SequenceFileInputFormat.class, aPath, bPath)); > conf.setInt(OUT_CARD, outCardinality); > conf.setOutputFormat(SequenceFileOutputFormat.class); > FileOutputFormat.setOutputPath(conf, outPath); > @@ -76,13 +90,13 @@ public class MatrixMultiplicationJob ext > "ib", > "Path to the second input matrix"); > > - argMap = parseArguments(strings, > - numRowsAOpt, > - numRowsBOpt, > - numColsAOpt, > - numColsBOpt, > - inputPathA, > - inputPathB); > + Map<String, 
String> argMap = parseArguments(strings, > + numRowsAOpt, > + numRowsBOpt, > + numColsAOpt, > + numColsBOpt, > + inputPathA, > + inputPathB); > > DistributedRowMatrix a = new > DistributedRowMatrix(argMap.get("--inputPathA"), > > argMap.get("--tempDir"), > @@ -108,6 +122,7 @@ public class MatrixMultiplicationJob ext > private final IntWritable row = new IntWritable(); > private final VectorWritable outVector = new VectorWritable(); > > + @Override > public void configure(JobConf conf) { > outCardinality = conf.getInt(OUT_CARD, Integer.MAX_VALUE); > } > @@ -140,14 +155,13 @@ public class MatrixMultiplicationJob ext > OutputCollector<IntWritable,VectorWritable> out, > Reporter reporter) throws IOException { > Vector accumulator; > - Vector row; > if(it.hasNext()) { > accumulator = new RandomAccessSparseVector(it.next().get()); > } else { > return; > } > while(it.hasNext()) { > - row = it.next().get(); > + Vector row = it.next().get(); > row.addTo(accumulator); > } > out.collect(rowNum, new VectorWritable(new > SequentialAccessSparseVector(accumulator))); > > Modified: > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVector.java > URL: > http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVector.java?rev=921751&r1=921750&r2=921751&view=diff > > ============================================================================== > --- > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVector.java > (original) > +++ > lucene/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVector.java > Thu Mar 11 09:26:39 2010 > @@ -19,11 +19,16 @@ package org.apache.mahout.math.hadoop.de > > import org.apache.mahout.math.DenseVector; > > +import java.util.regex.Pattern; > + > /** > * TODO this is a horrible hack. Make a proper writable subclass also. 
> */ > public class EigenVector extends DenseVector { > > + private static final Pattern EQUAL_PATTERN = Pattern.compile(" = "); > + private static final Pattern PIPE_PATTERN = Pattern.compile("|"); > + > public EigenVector(DenseVector v, double eigenValue, double > cosAngleError, int order) { > super(v, false); > setName("e|" + order +"| = |"+eigenValue+"|, err = "+cosAngleError); > @@ -43,9 +48,9 @@ public class EigenVector extends DenseVe > > protected double[] parseMetaData() { > double[] m = new double[3]; > - String[] s = getName().split(" = "); > - m[0] = Double.parseDouble(s[0].split("|")[1]); > - m[1] = Double.parseDouble(s[1].split("|")[1]); > + String[] s = EQUAL_PATTERN.split(getName()); > + m[0] = Double.parseDouble(PIPE_PATTERN.split(s[0])[1]); > + m[1] = Double.parseDouble(PIPE_PATTERN.split(s[1])[1]); > m[2] = Double.parseDouble(s[2].substring(1)); > return m; > } > > Modified: > lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestPrintableInterface.java > URL: > http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestPrintableInterface.java?rev=921751&r1=921750&r2=921751&view=diff > > ============================================================================== > --- > lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestPrintableInterface.java > (original) > +++ > lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestPrintableInterface.java > Thu Mar 11 09:26:39 2010 > @@ -19,8 +19,6 @@ package org.apache.mahout.clustering; > > import java.lang.reflect.Type; > > -import junit.framework.TestCase; > - > import org.apache.mahout.clustering.canopy.Canopy; > import org.apache.mahout.clustering.dirichlet.DirichletCluster; > import org.apache.mahout.clustering.dirichlet.JsonModelAdapter; > @@ -31,6 +29,7 @@ import org.apache.mahout.clustering.diri > import org.apache.mahout.clustering.dirichlet.models.SampledNormalModel; > import 
org.apache.mahout.clustering.kmeans.Cluster; > import org.apache.mahout.clustering.meanshift.MeanShiftCanopy; > +import org.apache.mahout.common.MahoutTestCase; > import org.apache.mahout.math.DenseVector; > import org.apache.mahout.math.SequentialAccessSparseVector; > import org.apache.mahout.math.Vector; > @@ -40,7 +39,7 @@ import com.google.gson.Gson; > import com.google.gson.GsonBuilder; > import com.google.gson.reflect.TypeToken; > > -public class TestPrintableInterface extends TestCase { > +public class TestPrintableInterface extends MahoutTestCase { > > private static final Type modelType = new TypeToken<Model<Vector>>() { > }.getType(); > > Modified: > lucene/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/PFPGrowthTest.java > URL: > http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/PFPGrowthTest.java?rev=921751&r1=921750&r2=921751&view=diff > > ============================================================================== > --- > lucene/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/PFPGrowthTest.java > (original) > +++ > lucene/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/PFPGrowthTest.java > Thu Mar 11 09:26:39 2010 > @@ -109,9 +109,9 @@ public class PFPGrowthTest extends Mahou > log.info("Starting Pattern Aggregation Test: {}", > params.get("maxHeapSize")); > PFPGrowth.startAggregating(params); > List<Pair<String, TopKStringPatterns>> frequentPatterns = > PFPGrowth.readFrequentPattern(params); > - assertEquals("[(A,([A],5), ([D, A],4), ([B, A],4), ([A, E],4)), > (B,([B],6), ([B, D],4), ([B, A],4)," > - + " ([B, D, A],3)), (C,([B, C],3)), (D,([D],6), ([D, A],4), ([B, > D],4), ([D, A, E],3))," > - + " (E,([A, E],4), ([D, A, E],3), ([B, A, E],3))]", > frequentPatterns.toString()); > + assertEquals("[(A,([B, A],4), ([B, D, A],3), ([B, A, E],3)), > (B,([B],6), ([B, D],4), " + > + "([B, A],4), ([B],4)), (C,([B, C],3)), (D,([B, D],4), ([B, D, > A],3)), " + > + 
"(E,([B, A, E],3))]", frequentPatterns.toString()); > > } > > > Modified: > lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java > URL: > http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java?rev=921751&r1=921750&r2=921751&view=diff > > ============================================================================== > --- > lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java > (original) > +++ > lucene/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java > Thu Mar 11 09:26:39 2010 > @@ -1,12 +1,29 @@ > +/** > + * Licensed to the Apache Software Foundation (ASF) under one or more > + * contributor license agreements. See the NOTICE file distributed with > + * this work for additional information regarding copyright ownership. > + * The ASF licenses this file to You under the Apache License, Version 2.0 > + * (the "License"); you may not use this file except in compliance with > + * the License. You may obtain a copy of the License at > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. 
> + */ > + > package org.apache.mahout.math.hadoop; > > -import junit.framework.TestCase; > import org.apache.hadoop.conf.Configuration; > import org.apache.hadoop.fs.FileSystem; > import org.apache.hadoop.fs.Path; > import org.apache.hadoop.mapred.JobConf; > import org.apache.mahout.clustering.ClusteringTestUtils; > import org.apache.mahout.clustering.canopy.TestCanopyCreation; > +import org.apache.mahout.common.MahoutTestCase; > import org.apache.mahout.math.Matrix; > import org.apache.mahout.math.MatrixSlice; > import org.apache.mahout.math.RandomAccessSparseVector; > @@ -21,16 +38,13 @@ import java.util.HashMap; > import java.util.Iterator; > import java.util.Map; > > -public class TestDistributedRowMatrix extends TestCase { > +public class TestDistributedRowMatrix extends MahoutTestCase { > > private static final String TESTDATA = "testdata"; > > - public TestDistributedRowMatrix() { > - super(); > - } > - > @Override > public void setUp() throws Exception { > + super.setUp(); > File testData = new File(TESTDATA); > if (testData.exists()) { > TestCanopyCreation.rmr(TESTDATA); > @@ -59,12 +73,15 @@ public class TestDistributedRowMatrix ex > MatrixSlice mtts = mttIt.next(); > mttMap.put(mtts.index(), mtts.vector()); > } > - for(Integer i : mMap.keySet()) { > - if(mMap.get(i) == null || mttMap.get(i) == null) { > - assertTrue(mMap.get(i) == null || mMap.get(i).norm(2) == 0); > - assertTrue(mttMap.get(i) == null || mttMap.get(i).norm(2) == 0); > + for(Map.Entry<Integer, Vector> entry : mMap.entrySet()) { > + Integer key = entry.getKey(); > + Vector value = entry.getValue(); > + if(value == null || mttMap.get(key) == null) { > + assertTrue(value == null || value.norm(2) == 0); > + assertTrue(mttMap.get(key) == null || mttMap.get(key).norm(2) == > 0); > } else { > - assertTrue(mMap.get(i).getDistanceSquared(mttMap.get(i)) < > errorTolerance); > + assertTrue( > + value.getDistanceSquared(mttMap.get(key)) < errorTolerance); > } > } > } > @@ -74,7 +91,7 @@ public class 
TestDistributedRowMatrix ex > DistributedRowMatrix mt = m.transpose(); > mt.setOutputTempPathString(new Path(m.getOutputTempPath().getParent(), > "/tmpOutTranspose").toString()); > DistributedRowMatrix mtt = mt.transpose(); > - assertEquals(m, mtt, 1e-9); > + assertEquals(m, mtt, 1.0e-9); > } > > public void testMatrixTimesVector() throws Exception { > @@ -85,7 +102,7 @@ public class TestDistributedRowMatrix ex > > Vector expected = m.times(v); > Vector actual = dm.times(v); > - assertEquals(expected.getDistanceSquared(actual), 0.0, 1e-9); > + assertEquals(0.0, expected.getDistanceSquared(actual), 1.0e-9); > } > > public void testMatrixTimesSquaredVector() throws Exception { > @@ -96,7 +113,7 @@ public class TestDistributedRowMatrix ex > > Vector expected = m.timesSquared(v); > Vector actual = dm.timesSquared(v); > - assertEquals(expected.getDistanceSquared(actual), 0.0, 1e-9); > + assertEquals(0.0, expected.getDistanceSquared(actual), 1.0e-9); > } > > public void testMatrixTimesMatrix() throws Exception { > @@ -108,7 +125,7 @@ public class TestDistributedRowMatrix ex > DistributedRowMatrix distB = randomDistributedMatrix(20, 13, 25, 10, > 5.0, false, "/distB"); > DistributedRowMatrix product = distA.times(distB); > > - assertEquals(expected, product, 1e-9); > + assertEquals(expected, product, 1.0e-9); > } > > public static DistributedRowMatrix randomDistributedMatrix(int numRows, > > Modified: > lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/DataSet.java > URL: > http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/DataSet.java?rev=921751&r1=921750&r2=921751&view=diff > > ============================================================================== > --- > lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/DataSet.java > (original) > +++ > lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/DataSet.java > Thu Mar 11 09:26:39 
2010 > @@ -50,7 +50,7 @@ public class DataSet { > /** > * Singleton DataSet > * > - * @throws RuntimeException if the dataset has not been initialized > + * @throws IllegalStateException if the dataset has not been initialized > */ > public static DataSet getDataSet() { > if (dataset == null) { > @@ -93,7 +93,7 @@ public class DataSet { > * Maximum possible value for an attribute > * > * @param index of the attribute > - * @throws RuntimeException if the attribute is nominal > + * @throws IllegalArgumentException if the attribute is nominal > */ > public double getMax(int index) { > if (!isNumerical(index)) { > @@ -107,7 +107,7 @@ public class DataSet { > * Minimum possible value for an attribute > * > * @param index of the attribute > - * @throws RuntimeException if the attribute is nominal > + * @throws IllegalArgumentException if the attribute is nominal > */ > public double getMin(int index) { > if (!isNumerical(index)) { > @@ -121,7 +121,7 @@ public class DataSet { > * Number of values for a nominal attribute > * > * @param index of the attribute > - * @throws RuntimeException if the attribute is numerical > + * @throws IllegalArgumentException if the attribute is numerical > */ > public int getNbValues(int index) { > if (isNumerical(index)) { > @@ -147,7 +147,7 @@ public class DataSet { > * @param index of the attribute > * @param value > * @return an <code>int</code> representing the value > - * @throws RuntimeException if the value is not found. > + * @throws IllegalArgumentException if the value is not found. 
> */ > public int valueIndex(int index, String value) { > if (isNumerical(index)) { > > Modified: > lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/tool/ToolCombiner.java > URL: > http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/tool/ToolCombiner.java?rev=921751&r1=921750&r2=921751&view=diff > > ============================================================================== > --- > lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/tool/ToolCombiner.java > (original) > +++ > lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/tool/ToolCombiner.java > Thu Mar 11 09:26:39 2010 > @@ -81,7 +81,7 @@ public class ToolCombiner extends MapRed > * @param values > * available values > * @return > - * @throws RuntimeException > + * @throws IllegalArgumentException > * if the attribute should be ignored. > */ > String createDescription(int index, Iterator<Text> values) { > > Modified: > lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/tool/ToolReducer.java > URL: > http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/tool/ToolReducer.java?rev=921751&r1=921750&r2=921751&view=diff > > ============================================================================== > --- > lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/tool/ToolReducer.java > (original) > +++ > lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/tool/ToolReducer.java > Thu Mar 11 09:26:39 2010 > @@ -82,7 +82,7 @@ public class ToolReducer extends MapRedu > * @param values > * available descriptions > * @return > - * @throws RuntimeException > + * @throws IllegalArgumentException > * if the attribute should be ignored. 
> */ > String combineDescriptions(int index, Iterator<Text> values) { > > Modified: > lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/ToolCombinerTest.java > URL: > http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/ToolCombinerTest.java?rev=921751&r1=921750&r2=921751&view=diff > > ============================================================================== > --- > lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/ToolCombinerTest.java > (original) > +++ > lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/ToolCombinerTest.java > Thu Mar 11 09:26:39 2010 > @@ -46,8 +46,8 @@ public class ToolCombinerTest extends Ma > > try { > combiner.createDescription(0, null); > - fail("Should throw a RuntimeException"); > - } catch (RuntimeException e) { > + fail("Should throw a IllegalArgumentException"); > + } catch (IllegalArgumentException e) { > > } > } > > Modified: > lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/ToolReducerTest.java > URL: > http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/ToolReducerTest.java?rev=921751&r1=921750&r2=921751&view=diff > > ============================================================================== > --- > lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/ToolReducerTest.java > (original) > +++ > lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/ToolReducerTest.java > Thu Mar 11 09:26:39 2010 > @@ -48,8 +48,8 @@ public class ToolReducerTest extends Mah > > try { > reducer.combineDescriptions(0, null); > - fail("Should throw a RuntimeException"); > - } catch (RuntimeException e) { > + fail("Should throw a IllegalArgumentException"); > + } catch (IllegalArgumentException e) { > > } > } > > Modified: > 
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java > URL: > http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java?rev=921751&r1=921750&r2=921751&view=diff > > ============================================================================== > --- > lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java > (original) > +++ > lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java > Thu Mar 11 09:26:39 2010 > @@ -70,9 +70,8 @@ public class CollocDriver extends Config > public static void main(String[] args) throws Exception { > ToolRunner.run(new CollocDriver(), args); > } > - /** > - * @param args > - */ > + > + @Override > public int run(String[] args) throws Exception { > DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); > ArgumentBuilder abuilder = new ArgumentBuilder(); > > Modified: > lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java > URL: > http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java?rev=921751&r1=921750&r2=921751&view=diff > > ============================================================================== > --- > lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java > (original) > +++ > lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java > Thu Mar 11 09:26:39 2010 > @@ -42,6 +42,8 @@ import org.slf4j.LoggerFactory; > * <p/> > */ > public class CollocMapper extends MapReduceBase implements > Mapper<Text,StringTuple,GramKey,Gram> { > + > + private static final byte[] EMPTY = new byte[0]; > > public static final String MAX_SHINGLE_SIZE = "maxShingleSize"; > public static final int 
DEFAULT_MAX_SHINGLE_SIZE = 2; > @@ -130,7 +132,6 @@ public class CollocMapper extends MapRed > } while (sf.incrementToken()); > > try { > - final byte[] empty = new byte[0]; > final GramKey gramKey = new GramKey(); > > ngrams.forEachPair(new ObjectIntProcedure<String>() { > @@ -145,13 +146,13 @@ public class CollocMapper extends MapRed > Gram head = new Gram(term.substring(0, i), frequency, > Gram.Type.HEAD); > Gram tail = new Gram(term.substring(i + 1), frequency, > Gram.Type.TAIL); > > - gramKey.set(head, empty); > + gramKey.set(head, EMPTY); > collector.collect(gramKey, head); > > gramKey.set(head, ngram.getBytes()); > collector.collect(gramKey, ngram); > > - gramKey.set(tail, empty); > + gramKey.set(tail, EMPTY); > collector.collect(gramKey, tail); > > gramKey.set(tail, ngram.getBytes()); > @@ -170,7 +171,7 @@ public class CollocMapper extends MapRed > public boolean apply(String term, int frequency) { > try { > Gram unigram = new Gram(term, frequency, Gram.Type.UNIGRAM); > - gramKey.set(unigram, empty); > + gramKey.set(unigram, EMPTY); > collector.collect(gramKey, unigram); > } catch (IOException e) { > throw new IllegalStateException(e); > > Modified: > lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java > URL: > http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java?rev=921751&r1=921750&r2=921751&view=diff > > ============================================================================== > --- > lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java > (original) > +++ > lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java > Thu Mar 11 09:26:39 2010 > @@ -18,7 +18,6 @@ > package org.apache.mahout.utils.nlp.collocations.llr; > > import java.io.IOException; > -import java.util.HashMap; > import java.util.Iterator; > > import 
org.apache.hadoop.mapred.JobConf; > @@ -107,13 +106,8 @@ public class CollocReducer extends MapRe > } > } > > - /** Sum frequencies for unigrams and deliver to the collector > - * > - * @param keyFirst > - * @param values > - * @param output > - * @param reporter > - * @throws IOException > + /** > + * Sum frequencies for unigrams and deliver to the collector > */ > protected void processUnigram(GramKey key, Iterator<Gram> values, > OutputCollector<Gram, Gram> output, Reporter reporter) throws > IOException { > @@ -145,12 +139,6 @@ public class CollocReducer extends MapRe > * <p/> > * We end up calculating frequencies for ngrams for each sugram (head, > tail) here, which is > * some extra work. > - * > - * @param keyFirst > - * @param values > - * @param output > - * @param reporter > - * @throws IOException > */ > protected void processSubgram(GramKey key, Iterator<Gram> values, > OutputCollector<Gram,Gram> output, Reporter reporter) throws > IOException { > > Modified: > lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKey.java > URL: > http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKey.java?rev=921751&r1=921750&r2=921751&view=diff > > ============================================================================== > --- > lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKey.java > (original) > +++ > lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKey.java > Thu Mar 11 09:26:39 2010 > @@ -32,9 +32,9 @@ import org.apache.mahout.utils.nlp.collo > public class GramKey extends BinaryComparable implements > WritableComparable<BinaryComparable> { > > - int primaryLength; > - int length; > - byte[] bytes; > + private int primaryLength; > + private int length; > + private byte[] bytes; > > public GramKey() { > > @@ -123,11 +123,11 @@ public class GramKey extends BinaryCompa > try { > 
return Text.decode(bytes, 1, primaryLength-1); > } catch (CharacterCodingException e) { > - throw new RuntimeException("Should not have happened " + > e.toString()); > + throw new IllegalStateException(e); > } > } > > public String toString() { > - return '\'' + getPrimaryString() + "'[" + getType().x + "]"; > + return '\'' + getPrimaryString() + "'[" + getType().x + ']'; > } > } > > Modified: > lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKeyPartitioner.java > URL: > http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKeyPartitioner.java?rev=921751&r1=921750&r2=921751&view=diff > > ============================================================================== > --- > lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKeyPartitioner.java > (original) > +++ > lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKeyPartitioner.java > Thu Mar 11 09:26:39 2010 > @@ -33,7 +33,7 @@ public class GramKeyPartitioner implemen > conf.setInt(HASH_OFFSET_PROPERTY_NAME, left); > } > > - int offset; > + private int offset; > > @Override > public int getPartition(GramKey key, Gram value, int numPartitions) { > > >