Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java?rev=1090881&r1=1090880&r2=1090881&view=diff ============================================================================== --- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java (original) +++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java Sun Apr 10 20:00:13 2011 @@ -54,7 +54,8 @@ import org.junit.Test; public final class TestCanopyCreation extends MahoutTestCase { - private static final double[][] RAW = { { 1, 1 }, { 2, 1 }, { 1, 2 }, { 2, 2 }, { 3, 3 }, { 4, 4 }, { 5, 4 }, { 4, 5 }, { 5, 5 } }; + private static final double[][] RAW = { { 1, 1 }, { 2, 1 }, { 1, 2 }, + { 2, 2 }, { 3, 3 }, { 4, 4 }, { 5, 4 }, { 4, 5 }, { 5, 5 } }; private List<Canopy> referenceManhattan; @@ -116,13 +117,18 @@ public final class TestCanopyCreation ex public void setUp() throws Exception { super.setUp(); fs = FileSystem.get(new Configuration()); - referenceManhattan = CanopyClusterer.createCanopies(getPoints(), manhattanDistanceMeasure, 3.1, 2.1); + referenceManhattan = CanopyClusterer.createCanopies(getPoints(), + manhattanDistanceMeasure, 3.1, 2.1); manhattanCentroids = CanopyClusterer.getCenters(referenceManhattan); - referenceEuclidean = CanopyClusterer.createCanopies(getPoints(), euclideanDistanceMeasure, 3.1, 2.1); + referenceEuclidean = CanopyClusterer.createCanopies(getPoints(), + euclideanDistanceMeasure, 3.1, 2.1); euclideanCentroids = CanopyClusterer.getCenters(referenceEuclidean); } - /** Story: User can cluster points using a ManhattanDistanceMeasure and a reference implementation */ + /** + * Story: User can cluster points using a ManhattanDistanceMeasure and a + * reference implementation + */ @Test public void testReferenceManhattan() throws Exception { // see setUp for cluster creation @@ -131,50 +137,60 @@ public final class TestCanopyCreation ex for (int canopyIx = 0; canopyIx < referenceManhattan.size(); canopyIx++) { Canopy testCanopy = referenceManhattan.get(canopyIx); int[] expectedNumPoints = { 4, 4, 3 }; - double[][] expectedCentroids = { { 1.5, 1.5 }, { 4.0, 4.0 }, { 4.666666666666667, 4.6666666666666667 } }; - assertEquals("canopy points " + canopyIx, expectedNumPoints[canopyIx], testCanopy.getNumPoints()); + double[][] expectedCentroids = { { 1.5, 1.5 }, { 4.0, 4.0 }, + { 4.666666666666667, 4.6666666666666667 } }; + assertEquals("canopy points " + canopyIx, expectedNumPoints[canopyIx], + testCanopy.getNumPoints()); double[] refCentroid = expectedCentroids[canopyIx]; Vector testCentroid = testCanopy.computeCentroid(); for (int pointIx = 0; pointIx < refCentroid.length; pointIx++) { - assertEquals("canopy centroid " + canopyIx + '[' + pointIx + ']', refCentroid[pointIx], testCentroid.get(pointIx), EPSILON); + assertEquals("canopy centroid " + canopyIx + '[' + pointIx + ']', + refCentroid[pointIx], testCentroid.get(pointIx), EPSILON); } } } - /** Story: User can cluster points using a EuclideanDistanceMeasure and a reference implementation */ + /** + * Story: User can cluster points using a EuclideanDistanceMeasure and a + * reference implementation + */ @Test public void testReferenceEuclidean() throws Exception { // see setUp for cluster creation printCanopies(referenceEuclidean); assertEquals("number of canopies", 3, referenceEuclidean.size()); int[] expectedNumPoints = { 5, 5, 3 }; - double[][] expectedCentroids = { { 1.8, 1.8 }, { 4.2, 4.2 }, { 4.666666666666667, 4.666666666666667 } }; + double[][] expectedCentroids = { { 1.8, 1.8 }, { 4.2, 4.2 }, + { 4.666666666666667, 4.666666666666667 } }; for (int canopyIx = 0; canopyIx < referenceEuclidean.size(); canopyIx++) { Canopy testCanopy = referenceEuclidean.get(canopyIx); - assertEquals("canopy points " + canopyIx, expectedNumPoints[canopyIx], testCanopy.getNumPoints()); + assertEquals("canopy points " + canopyIx, expectedNumPoints[canopyIx], + testCanopy.getNumPoints()); double[] refCentroid = expectedCentroids[canopyIx]; Vector testCentroid = testCanopy.computeCentroid(); for (int pointIx = 0; pointIx < refCentroid.length; pointIx++) { - assertEquals("canopy centroid " + canopyIx + '[' + pointIx + ']', refCentroid[pointIx], testCentroid.get(pointIx), EPSILON); + assertEquals("canopy centroid " + canopyIx + '[' + pointIx + ']', + refCentroid[pointIx], testCentroid.get(pointIx), EPSILON); } } } /** - * Story: User can produce initial canopy centers using a ManhattanDistanceMeasure and a - * CanopyMapper which clusters input points to produce an output set of canopy centroid points. + * Story: User can produce initial canopy centers using a + * ManhattanDistanceMeasure and a CanopyMapper which clusters input points to + * produce an output set of canopy centroid points. */ @Test public void testCanopyMapperManhattan() throws Exception { CanopyMapper mapper = new CanopyMapper(); Configuration conf = new Configuration(); - conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, manhattanDistanceMeasure.getClass().getName()); + conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, manhattanDistanceMeasure + .getClass().getName()); conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1)); conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1)); DummyRecordWriter<Text, VectorWritable> writer = new DummyRecordWriter<Text, VectorWritable>(); - Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context context = DummyRecordWriter.build(mapper, - conf, - writer); + Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context context = DummyRecordWriter + .build(mapper, conf, writer); mapper.setup(context); List<VectorWritable> points = getPointsWritable(); @@ -188,25 +204,28 @@ public final class TestCanopyCreation ex List<VectorWritable> data = writer.getValue(new Text("centroid")); assertEquals("Number of centroids", 3, data.size()); for (int i = 0; i < data.size(); i++) { - assertEquals("Centroid error", manhattanCentroids.get(i).asFormatString(), data.get(i).get().asFormatString()); + assertEquals("Centroid error", + manhattanCentroids.get(i).asFormatString(), data.get(i).get() + .asFormatString()); } } /** - * Story: User can produce initial canopy centers using a EuclideanDistanceMeasure and a - * CanopyMapper/Combiner which clusters input points to produce an output set of canopy centroid points. + * Story: User can produce initial canopy centers using a + * EuclideanDistanceMeasure and a CanopyMapper/Combiner which clusters input + * points to produce an output set of canopy centroid points. */ @Test public void testCanopyMapperEuclidean() throws Exception { CanopyMapper mapper = new CanopyMapper(); Configuration conf = new Configuration(); - conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, euclideanDistanceMeasure.getClass().getName()); + conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, euclideanDistanceMeasure + .getClass().getName()); conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1)); conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1)); DummyRecordWriter<Text, VectorWritable> writer = new DummyRecordWriter<Text, VectorWritable>(); - Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context context = DummyRecordWriter.build(mapper, - conf, - writer); + Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context context = DummyRecordWriter + .build(mapper, conf, writer); mapper.setup(context); List<VectorWritable> points = getPointsWritable(); @@ -220,27 +239,28 @@ public final class TestCanopyCreation ex List<VectorWritable> data = writer.getValue(new Text("centroid")); assertEquals("Number of centroids", 3, data.size()); for (int i = 0; i < data.size(); i++) { - assertEquals("Centroid error", euclideanCentroids.get(i).asFormatString(), data.get(i).get().asFormatString()); + assertEquals("Centroid error", + euclideanCentroids.get(i).asFormatString(), data.get(i).get() + .asFormatString()); } } /** - * Story: User can produce final canopy centers using a ManhattanDistanceMeasure and a CanopyReducer which - * clusters input centroid points to produce an output set of final canopy centroid points. + * Story: User can produce final canopy centers using a + * ManhattanDistanceMeasure and a CanopyReducer which clusters input centroid + * points to produce an output set of final canopy centroid points. */ @Test public void testCanopyReducerManhattan() throws Exception { CanopyReducer reducer = new CanopyReducer(); Configuration conf = new Configuration(); - conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, "org.apache.mahout.common.distance.ManhattanDistanceMeasure"); + conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, + "org.apache.mahout.common.distance.ManhattanDistanceMeasure"); conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1)); conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1)); DummyRecordWriter<Text, Canopy> writer = new DummyRecordWriter<Text, Canopy>(); - Reducer<Text, VectorWritable, Text, Canopy>.Context context = DummyRecordWriter.build(reducer, - conf, - writer, - Text.class, - VectorWritable.class); + Reducer<Text, VectorWritable, Text, Canopy>.Context context = DummyRecordWriter + .build(reducer, conf, writer, Text.class, VectorWritable.class); reducer.setup(context); List<VectorWritable> points = getPointsWritable(); @@ -250,29 +270,30 @@ public final class TestCanopyCreation ex int i = 0; for (Text key : keys) { List<Canopy> data = writer.getValue(key); - assertEquals(manhattanCentroids.get(i).asFormatString() + " is not equal to " - + data.get(0).computeCentroid().asFormatString(), manhattanCentroids.get(i), data.get(0).computeCentroid()); + assertEquals(manhattanCentroids.get(i).asFormatString() + + " is not equal to " + + data.get(0).computeCentroid().asFormatString(), manhattanCentroids + .get(i), data.get(0).computeCentroid()); i++; } } /** - * Story: User can produce final canopy centers using a EuclideanDistanceMeasure and a CanopyReducer which - * clusters input centroid points to produce an output set of final canopy centroid points. + * Story: User can produce final canopy centers using a + * EuclideanDistanceMeasure and a CanopyReducer which clusters input centroid + * points to produce an output set of final canopy centroid points. */ @Test public void testCanopyReducerEuclidean() throws Exception { CanopyReducer reducer = new CanopyReducer(); Configuration conf = new Configuration(); - conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, "org.apache.mahout.common.distance.EuclideanDistanceMeasure"); + conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, + "org.apache.mahout.common.distance.EuclideanDistanceMeasure"); conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1)); conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1)); DummyRecordWriter<Text, Canopy> writer = new DummyRecordWriter<Text, Canopy>(); - Reducer<Text, VectorWritable, Text, Canopy>.Context context = DummyRecordWriter.build(reducer, - conf, - writer, - Text.class, - VectorWritable.class); + Reducer<Text, VectorWritable, Text, Canopy>.Context context = DummyRecordWriter + .build(reducer, conf, writer, Text.class, VectorWritable.class); reducer.setup(context); List<VectorWritable> points = getPointsWritable(); @@ -282,25 +303,30 @@ public final class TestCanopyCreation ex int i = 0; for (Text key : keys) { List<Canopy> data = writer.getValue(key); - assertEquals(euclideanCentroids.get(i).asFormatString() + " is not equal to " - + data.get(0).computeCentroid().asFormatString(), euclideanCentroids.get(i), data.get(0).computeCentroid()); + assertEquals(euclideanCentroids.get(i).asFormatString() + + " is not equal to " + + data.get(0).computeCentroid().asFormatString(), euclideanCentroids + .get(i), data.get(0).computeCentroid()); i++; } } /** - * Story: User can produce final canopy centers using a Hadoop map/reduce job and a - * ManhattanDistanceMeasure. + * Story: User can produce final canopy centers using a Hadoop map/reduce job + * and a ManhattanDistanceMeasure. */ @Test public void testCanopyGenManhattanMR() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration config = new Configuration(); - ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, config); - ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file2"), fs, config); + ClusteringTestUtils.writePointsToFile(points, + getTestTempFilePath("testdata/file1"), fs, config); + ClusteringTestUtils.writePointsToFile(points, + getTestTempFilePath("testdata/file2"), fs, config); // now run the Canopy Driver Path output = getTestTempDirPath("output"); - CanopyDriver.run(config, getTestTempDirPath("testdata"), output, manhattanDistanceMeasure, 3.1, 2.1, false, false); + CanopyDriver.run(config, getTestTempDirPath("testdata"), output, + manhattanDistanceMeasure, 3.1, 2.1, false, false); // verify output from sequence file Path path = new Path(output, "clusters-0/part-r-00000"); @@ -314,25 +340,30 @@ public final class TestCanopyCreation ex assertEquals("1st y value", 1.5, canopy.getCenter().get(1), EPSILON); assertTrue("more to come", reader.next(key, canopy)); assertEquals("2nd key", "C-1", key.toString()); - assertEquals("2nd x value", 4.333333333333334, canopy.getCenter().get(0), EPSILON); - assertEquals("2nd y value", 4.333333333333334, canopy.getCenter().get(1), EPSILON); + assertEquals("2nd x value", 4.333333333333334, canopy.getCenter().get(0), + EPSILON); + assertEquals("2nd y value", 4.333333333333334, canopy.getCenter().get(1), + EPSILON); assertFalse("more to come", reader.next(key, canopy)); reader.close(); } /** - * Story: User can produce final canopy centers using a Hadoop map/reduce job and a - * EuclideanDistanceMeasure. + * Story: User can produce final canopy centers using a Hadoop map/reduce job + * and a EuclideanDistanceMeasure. */ @Test public void testCanopyGenEuclideanMR() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration config = new Configuration(); - ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, config); - ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file2"), fs, config); + ClusteringTestUtils.writePointsToFile(points, + getTestTempFilePath("testdata/file1"), fs, config); + ClusteringTestUtils.writePointsToFile(points, + getTestTempFilePath("testdata/file2"), fs, config); // now run the Canopy Driver Path output = getTestTempDirPath("output"); - CanopyDriver.run(config, getTestTempDirPath("testdata"), output, euclideanDistanceMeasure, 3.1, 2.1, false, false); + CanopyDriver.run(config, getTestTempDirPath("testdata"), output, + euclideanDistanceMeasure, 3.1, 2.1, false, false); // verify output from sequence file Path path = new Path(output, "clusters-0/part-r-00000"); @@ -346,18 +377,24 @@ public final class TestCanopyCreation ex assertEquals("1st y value", 1.8, value.getCenter().get(1), EPSILON); assertTrue("more to come", reader.next(key, value)); assertEquals("2nd key", "C-1", key.toString()); - assertEquals("2nd x value", 4.433333333333334, value.getCenter().get(0), EPSILON); - assertEquals("2nd y value", 4.433333333333334, value.getCenter().get(1), EPSILON); + assertEquals("2nd x value", 4.433333333333334, value.getCenter().get(0), + EPSILON); + assertEquals("2nd y value", 4.433333333333334, value.getCenter().get(1), + EPSILON); assertFalse("more to come", reader.next(key, value)); reader.close(); } - /** Story: User can cluster a subset of the points using a ClusterMapper and a ManhattanDistanceMeasure. */ + /** + * Story: User can cluster a subset of the points using a ClusterMapper and a + * ManhattanDistanceMeasure. + */ @Test public void testClusterMapperManhattan() throws Exception { ClusterMapper mapper = new ClusterMapper(); Configuration conf = new Configuration(); - conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, "org.apache.mahout.common.distance.ManhattanDistanceMeasure"); + conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, + "org.apache.mahout.common.distance.ManhattanDistanceMeasure"); conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1)); conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1)); DummyRecordWriter<IntWritable, WeightedVectorWritable> writer = new DummyRecordWriter<IntWritable, WeightedVectorWritable>(); @@ -368,7 +405,8 @@ public final class TestCanopyCreation ex Collection<Canopy> canopies = new ArrayList<Canopy>(); int nextCanopyId = 0; for (Vector centroid : manhattanCentroids) { - canopies.add(new Canopy(centroid, nextCanopyId++, manhattanDistanceMeasure)); + canopies.add(new Canopy(centroid, nextCanopyId++, + manhattanDistanceMeasure)); } mapper.config(canopies); List<VectorWritable> points = getPointsWritable(); @@ -378,22 +416,28 @@ public final class TestCanopyCreation ex } Map<IntWritable, List<WeightedVectorWritable>> data = writer.getData(); assertEquals("Number of map results", canopies.size(), data.size()); - for (Entry<IntWritable, List<WeightedVectorWritable>> stringListEntry : data.entrySet()) { + for (Entry<IntWritable, List<WeightedVectorWritable>> stringListEntry : data + .entrySet()) { IntWritable key = stringListEntry.getKey(); Canopy canopy = findCanopy(key.get(), canopies); List<WeightedVectorWritable> pts = stringListEntry.getValue(); for (WeightedVectorWritable ptDef : pts) { - assertTrue("Point not in canopy", mapper.canopyCovers(canopy, ptDef.getVector())); + assertTrue("Point not in canopy", mapper.canopyCovers(canopy, ptDef + .getVector())); } } } - /** Story: User can cluster a subset of the points using a ClusterMapper and a EuclideanDistanceMeasure. */ + /** + * Story: User can cluster a subset of the points using a ClusterMapper and a + * EuclideanDistanceMeasure. + */ @Test public void testClusterMapperEuclidean() throws Exception { ClusterMapper mapper = new ClusterMapper(); Configuration conf = new Configuration(); - conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, "org.apache.mahout.common.distance.EuclideanDistanceMeasure"); + conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, + "org.apache.mahout.common.distance.EuclideanDistanceMeasure"); conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1)); conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1)); DummyRecordWriter<IntWritable, WeightedVectorWritable> writer = new DummyRecordWriter<IntWritable, WeightedVectorWritable>(); @@ -404,7 +448,8 @@ public final class TestCanopyCreation ex Collection<Canopy> canopies = new ArrayList<Canopy>(); int nextCanopyId = 0; for (Vector centroid : euclideanCentroids) { - canopies.add(new Canopy(centroid, nextCanopyId++, euclideanDistanceMeasure)); + canopies.add(new Canopy(centroid, nextCanopyId++, + euclideanDistanceMeasure)); } mapper.config(canopies); List<VectorWritable> points = getPointsWritable(); @@ -414,12 +459,14 @@ public final class TestCanopyCreation ex } Map<IntWritable, List<WeightedVectorWritable>> data = writer.getData(); assertEquals("Number of map results", canopies.size(), data.size()); - for (Entry<IntWritable, List<WeightedVectorWritable>> stringListEntry : data.entrySet()) { + for (Entry<IntWritable, List<WeightedVectorWritable>> stringListEntry : data + .entrySet()) { IntWritable key = stringListEntry.getKey(); Canopy canopy = findCanopy(key.get(), canopies); List<WeightedVectorWritable> pts = stringListEntry.getValue(); for (WeightedVectorWritable ptDef : pts) { - assertTrue("Point not in canopy", mapper.canopyCovers(canopy, ptDef.getVector())); + assertTrue("Point not in canopy", mapper.canopyCovers(canopy, ptDef + .getVector())); } } } @@ -429,16 +476,20 @@ public final class TestCanopyCreation ex public void testClusteringManhattanSeq() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration config = new Configuration(); - ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, config); + ClusteringTestUtils.writePointsToFile(points, + getTestTempFilePath("testdata/file1"), fs, config); // now run the Canopy Driver in sequential mode Path output = getTestTempDirPath("output"); - CanopyDriver.run(config, getTestTempDirPath("testdata"), output, manhattanDistanceMeasure, 3.1, 2.1, true, true); + CanopyDriver.run(config, getTestTempDirPath("testdata"), output, + manhattanDistanceMeasure, 3.1, 2.1, true, true); // verify output from sequence file Path path = new Path(output, "clusters-0/part-r-00000"); int ix = 0; - for (Canopy value : new SequenceFileValueIterable<Canopy>(path, true, config)) { - assertEquals("Center [" + ix + ']', manhattanCentroids.get(ix), value.getCenter()); + for (Canopy value : new SequenceFileValueIterable<Canopy>(path, true, + config)) { + assertEquals("Center [" + ix + ']', manhattanCentroids.get(ix), value + .getCenter()); ix++; } @@ -452,14 +503,20 @@ public final class TestCanopyCreation ex public void testClusteringEuclideanSeq() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration config = new Configuration(); - ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, config); + ClusteringTestUtils.writePointsToFile(points, + getTestTempFilePath("testdata/file1"), fs, config); // now run the Canopy Driver in sequential mode Path output = getTestTempDirPath("output"); - String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), getTestTempDirPath("testdata").toString(), - optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(), optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), - EuclideanDistanceMeasure.class.getName(), optKey(DefaultOptionCreator.T1_OPTION), "3.1", - optKey(DefaultOptionCreator.T2_OPTION), "2.1", optKey(DefaultOptionCreator.CLUSTERING_OPTION), - optKey(DefaultOptionCreator.OVERWRITE_OPTION), optKey(DefaultOptionCreator.METHOD_OPTION), + String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), + getTestTempDirPath("testdata").toString(), + optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(), + optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), + EuclideanDistanceMeasure.class.getName(), + optKey(DefaultOptionCreator.T1_OPTION), "3.1", + optKey(DefaultOptionCreator.T2_OPTION), "2.1", + optKey(DefaultOptionCreator.CLUSTERING_OPTION), + optKey(DefaultOptionCreator.OVERWRITE_OPTION), + optKey(DefaultOptionCreator.METHOD_OPTION), DefaultOptionCreator.SEQUENTIAL_METHOD }; new CanopyDriver().run(args); @@ -467,8 +524,10 @@ public final class TestCanopyCreation ex Path path = new Path(output, "clusters-0/part-r-00000"); int ix = 0; - for (Canopy value : new SequenceFileValueIterable<Canopy>(path, true, config)) { - assertEquals("Center [" + ix + ']', euclideanCentroids.get(ix), value.getCenter()); + for (Canopy value : new SequenceFileValueIterable<Canopy>(path, true, + config)) { + assertEquals("Center [" + ix + ']', euclideanCentroids.get(ix), value + .getCenter()); ix++; } @@ -478,39 +537,48 @@ public final class TestCanopyCreation ex } /** - * Story: User can produce final point clustering using a Hadoop map/reduce job and a - * ManhattanDistanceMeasure. + * Story: User can produce final point clustering using a Hadoop map/reduce + * job and a ManhattanDistanceMeasure. */ @Test public void testClusteringManhattanMR() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration conf = new Configuration(); - ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, conf); - ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file2"), fs, conf); + ClusteringTestUtils.writePointsToFile(points, + getTestTempFilePath("testdata/file1"), fs, conf); + ClusteringTestUtils.writePointsToFile(points, + getTestTempFilePath("testdata/file2"), fs, conf); // now run the Job Path output = getTestTempDirPath("output"); - CanopyDriver.run(conf, getTestTempDirPath("testdata"), output, manhattanDistanceMeasure, 3.1, 2.1, true, false); + CanopyDriver.run(conf, getTestTempDirPath("testdata"), output, + manhattanDistanceMeasure, 3.1, 2.1, true, false); Path path = new Path(output, "clusteredPoints/part-m-00000"); long count = HadoopUtil.countRecords(path, conf); assertEquals("number of points", points.size(), count); } /** - * Story: User can produce final point clustering using a Hadoop map/reduce job and a - * EuclideanDistanceMeasure. + * Story: User can produce final point clustering using a Hadoop map/reduce + * job and a EuclideanDistanceMeasure. */ @Test public void testClusteringEuclideanMR() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration conf = new Configuration(); - ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, conf); - ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file2"), fs, conf); + ClusteringTestUtils.writePointsToFile(points, + getTestTempFilePath("testdata/file1"), fs, conf); + ClusteringTestUtils.writePointsToFile(points, + getTestTempFilePath("testdata/file2"), fs, conf); // now run the Job using the run() command. Others can use runJob(). Path output = getTestTempDirPath("output"); - String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), getTestTempDirPath("testdata").toString(), - optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(), optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), - EuclideanDistanceMeasure.class.getName(), optKey(DefaultOptionCreator.T1_OPTION), "3.1", - optKey(DefaultOptionCreator.T2_OPTION), "2.1", optKey(DefaultOptionCreator.CLUSTERING_OPTION), + String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), + getTestTempDirPath("testdata").toString(), + optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(), + optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), + EuclideanDistanceMeasure.class.getName(), + optKey(DefaultOptionCreator.T1_OPTION), "3.1", + optKey(DefaultOptionCreator.T2_OPTION), "2.1", + optKey(DefaultOptionCreator.CLUSTERING_OPTION), optKey(DefaultOptionCreator.OVERWRITE_OPTION) }; ToolRunner.run(new Configuration(), new CanopyDriver(), args); Path path = new Path(output, "clusteredPoints/part-m-00000"); @@ -518,17 +586,23 @@ public final class TestCanopyCreation ex assertEquals("number of points", points.size(), count); } - /** Story: Clustering algorithm must support arbitrary user defined distance measure */ + /** + * Story: Clustering algorithm must support arbitrary user defined distance + * measure + */ @Test public void testUserDefinedDistanceMeasure() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration conf = new Configuration(); - ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, conf); - ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file2"), fs, conf); + ClusteringTestUtils.writePointsToFile(points, + getTestTempFilePath("testdata/file1"), fs, conf); + ClusteringTestUtils.writePointsToFile(points, + getTestTempFilePath("testdata/file2"), fs, conf); // now run the Canopy Driver. User defined measure happens to be a Manhattan // subclass so results are same. Path output = getTestTempDirPath("output"); - CanopyDriver.run(conf, getTestTempDirPath("testdata"), output, new UserDefinedDistanceMeasure(), 3.1, 2.1, false, false); + CanopyDriver.run(conf, getTestTempDirPath("testdata"), output, + new UserDefinedDistanceMeasure(), 3.1, 2.1, false, false); // verify output from sequence file Configuration job = new Configuration(); @@ -545,9 +619,33 @@ public final class TestCanopyCreation ex assertTrue("more to come", reader.next(key, value)); assertEquals("2nd key", "C-1", key.toString()); - assertEquals("1st x value", 4.333333333333334, value.getCenter().get(0), EPSILON); - assertEquals("1st y value", 4.333333333333334, value.getCenter().get(1), EPSILON); + assertEquals("1st x value", 4.333333333333334, value.getCenter().get(0), + EPSILON); + assertEquals("1st y value", 4.333333333333334, value.getCenter().get(1), + EPSILON); assertFalse("more to come", reader.next(key, value)); reader.close(); } + + /** + * Story: User can set T3 and T4 values to be used by the reducer for its T1 + * and T2 thresholds + */ + @Test + public void testCanopyReducerT3T4Configuration() throws Exception { + CanopyReducer reducer = new CanopyReducer(); + Configuration conf = new Configuration(); + conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, + "org.apache.mahout.common.distance.ManhattanDistanceMeasure"); + conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1)); + conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1)); + conf.set(CanopyConfigKeys.T3_KEY, String.valueOf(1.1)); + conf.set(CanopyConfigKeys.T4_KEY, String.valueOf(0.1)); + DummyRecordWriter<Text, Canopy> writer = new DummyRecordWriter<Text, Canopy>(); + Reducer<Text, VectorWritable, Text, Canopy>.Context context = DummyRecordWriter + .build(reducer, conf, writer, Text.class, VectorWritable.class); + reducer.setup(context); + assertEquals(1.1, reducer.canopyClusterer.t1, EPSILON); + assertEquals(0.1, reducer.canopyClusterer.t2, EPSILON); + } }
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java?rev=1090881&r1=1090880&r2=1090881&view=diff ============================================================================== --- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java (original) +++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java Sun Apr 10 20:00:13 2011 @@ -56,12 +56,15 @@ public final class Job extends AbstractJ } /** - * Run the canopy clustering job on an input dataset using the given distance measure, t1 and t2 parameters. - * All output data will be written to the output directory, which will be initially deleted if it exists. - * The clustered points will reside in the path <output>/clustered-points. By default, the job expects the a - * file containing synthetic_control.data as obtained from - * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named - * "testdata", and writes output to a directory named "output". + * Run the canopy clustering job on an input dataset using the given distance + * measure, t1 and t2 parameters. All output data will be written to the + * output directory, which will be initially deleted if it exists. The + * clustered points will reside in the path <output>/clustered-points. By + * default, the job expects the a file containing synthetic_control.data as + * obtained from + * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series + * resides in a directory named "testdata", and writes output to a directory + * named "output". * * @param input * the String denoting the input directory path @@ -73,15 +76,24 @@ public final class Job extends AbstractJ * the canopy T1 threshold * @param t2 * the canopy T2 threshold + * @throws IOException + * @throws InterruptedException + * @throws ClassNotFoundException + * @throws InstantiationException + * @throws IllegalAccessException */ - private static void run(Path input, Path output, DistanceMeasure measure, double t1, double t2) - throws IOException, InterruptedException, ClassNotFoundException { - Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); - InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); - CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, output, measure, t1, t2, true, false); + private static void run(Path input, Path output, DistanceMeasure measure, + double t1, double t2) throws IOException, InterruptedException, + ClassNotFoundException, InstantiationException, IllegalAccessException { + Path directoryContainingConvertedInput = new Path(output, + DIRECTORY_CONTAINING_CONVERTED_INPUT); + InputDriver.runJob(input, directoryContainingConvertedInput, + "org.apache.mahout.math.RandomAccessSparseVector"); + CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, + output, measure, t1, t2, true, false); // run ClusterDumper - ClusterDumper clusterDumper = - new ClusterDumper(new Path(output, "clusters-0"), new Path(output, "clusteredPoints")); + ClusterDumper clusterDumper = new ClusterDumper(new Path(output, + "clusters-0"), new Path(output, "clusteredPoints")); clusterDumper.printClusters(null); } @@ -109,7 +121,8 @@ public final class Job extends AbstractJ double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION)); double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION)); ClassLoader ccl = Thread.currentThread().getContextClassLoader(); - DistanceMeasure measure = (DistanceMeasure) ((Class<?>) ccl.loadClass(measureClass)).newInstance(); + DistanceMeasure measure = (DistanceMeasure) ((Class<?>) ccl + .loadClass(measureClass)).newInstance(); run(input, output, measure, t1, t2); return 0; Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java?rev=1090881&r1=1090880&r2=1090881&view=diff ============================================================================== --- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java (original) +++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java Sun Apr 10 20:00:13 2011 @@ -45,6 +45,7 @@ public final class Job extends AbstractJ private static final Logger log = LoggerFactory.getLogger(Job.class); private static final String DIRECTORY_CONTAINING_CONVERTED_INPUT = "data"; + private static final String M_OPTION = FuzzyKMeansDriver.M_OPTION; private Job() { @@ -59,17 +60,14 @@ public final class Job extends AbstractJ Path output = new Path("output"); Configuration conf = new Configuration(); HadoopUtil.delete(conf, output); - new Job().run(conf, - new Path("testdata"), - output, - new EuclideanDistanceMeasure(), - 80, 55, 10, (float) 2, 0.5); + new Job().run(conf, new Path("testdata"), output, + new EuclideanDistanceMeasure(), 80, 55, 10, (float) 2, 0.5); } } @Override - public int run(String[] args) - throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException, InterruptedException { + public int run(String[] args) throws IOException, ClassNotFoundException, + InstantiationException, IllegalAccessException, InterruptedException { addInputOption(); addOutputOption(); addOption(DefaultOptionCreator.distanceMeasureOption().create()); @@ -78,7 +76,8 @@ public final class Job extends AbstractJ addOption(DefaultOptionCreator.overwriteOption().create()); addOption(DefaultOptionCreator.t1Option().create()); addOption(DefaultOptionCreator.t2Option().create()); - addOption(M_OPTION, M_OPTION, "coefficient normalization factor, must be greater than 1", true); + addOption(M_OPTION, M_OPTION, + "coefficient normalization factor, must be greater than 1", true); Map<String, String> argMap = parseArguments(args); if (argMap == null) { @@ -91,29 +90,36 @@ public final class Job extends AbstractJ if (measureClass == null) { measureClass = SquaredEuclideanDistanceMeasure.class.getName(); } - double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION)); - int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION)); + double convergenceDelta = Double + .parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION)); + int maxIterations = Integer + .parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION)); float fuzziness = Float.parseFloat(getOption(M_OPTION)); - addOption(new DefaultOptionBuilder().withLongName(M_OPTION).withRequired(true) - .withArgument(new ArgumentBuilder().withName(M_OPTION).withMinimum(1).withMaximum(1).create()) - .withDescription("coefficient normalization factor, must be greater than 1").withShortName(M_OPTION) - .create()); + addOption(new DefaultOptionBuilder().withLongName(M_OPTION).withRequired( + true).withArgument( + new ArgumentBuilder().withName(M_OPTION).withMinimum(1).withMaximum(1) + .create()).withDescription( + "coefficient normalization factor, must be greater than 1") + .withShortName(M_OPTION).create()); if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(getConf(), output); } ClassLoader ccl = Thread.currentThread().getContextClassLoader(); - DistanceMeasure measure = ccl.loadClass(measureClass).asSubclass(DistanceMeasure.class).newInstance(); + DistanceMeasure measure = ccl.loadClass(measureClass).asSubclass( + DistanceMeasure.class).newInstance(); double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION)); double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION)); - run(getConf(), input, output, measure, t1, t2, maxIterations, fuzziness, convergenceDelta); + run(getConf(), input, output, measure, t1, t2, maxIterations, fuzziness, + convergenceDelta); return 0; } /** * Return the path to the final iteration's clusters */ - private static Path finalClusterPath(Configuration conf, Path output, int maxIterations) throws IOException { + private static Path finalClusterPath(Configuration conf, Path output, + int maxIterations) throws IOException { FileSystem fs = FileSystem.get(conf); for (int i = maxIterations; i >= 0; i--) { Path clusters = new Path(output, "clusters-" + i); @@ -125,12 +131,16 @@ public final class Job extends AbstractJ } /** - * Run the kmeans clustering job on an input dataset using the given distance measure, t1, t2 and iteration - * parameters. All output data will be written to the output directory, which will be initially deleted if - * it exists. The clustered points will reside in the path <output>/clustered-points. By default, the job - * expects the a file containing synthetic_control.data as obtained from - * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named - * "testdata", and writes output to a directory named "output". + * Run the kmeans clustering job on an input dataset using the given distance + * measure, t1, t2 and iteration parameters. All output data will be written + * to the output directory, which will be initially deleted if it exists. The + * clustered points will reside in the path <output>/clustered-points. By + * default, the job expects the a file containing synthetic_control.data as + * obtained from + * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series + * resides in a directory named "testdata", and writes output to a directory + * named "output". + * * @param input * the String denoting the input directory path * @param output @@ -139,43 +149,37 @@ public final class Job extends AbstractJ * the canopy T1 threshold * @param t2 * the canopy T2 threshold - * @param maxIterations + * @param maxIterations * the int maximum number of iterations - * @param fuzziness + * @param fuzziness * the float "m" fuzziness coefficient * @param convergenceDelta * the double convergence criteria for iterations + * @throws InterruptedException + * @throws ClassNotFoundException + * @throws IllegalAccessException + * @throws InstantiationException */ - public void run(Configuration conf, - Path input, - Path output, - DistanceMeasure measure, - double t1, - double t2, - int maxIterations, - float fuzziness, - double convergenceDelta) - throws IOException, InterruptedException, ClassNotFoundException { - Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); + public void run(Configuration conf, Path input, Path output, + DistanceMeasure measure, double t1, double t2, int maxIterations, + float fuzziness, double convergenceDelta) throws IOException, + InterruptedException, ClassNotFoundException, InstantiationException, + IllegalAccessException { + Path directoryContainingConvertedInput = new Path(output, + DIRECTORY_CONTAINING_CONVERTED_INPUT); log.info("Preparing Input"); - InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); + InputDriver.runJob(input, directoryContainingConvertedInput, + "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running Canopy to get initial clusters"); - CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, output, measure, t1, t2, false, false); + CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, + output, measure, t1, t2, false, false); log.info("Running FuzzyKMeans"); - FuzzyKMeansDriver.run(directoryContainingConvertedInput, - new Path(output, Cluster.INITIAL_CLUSTERS_DIR), - output, - measure, - convergenceDelta, - maxIterations, - fuzziness, - true, - true, - 0.0, - false); + FuzzyKMeansDriver.run(directoryContainingConvertedInput, new Path(output, + Cluster.INITIAL_CLUSTERS_DIR), output, measure, convergenceDelta, + maxIterations, fuzziness, true, true, 0.0, false); // run ClusterDumper - ClusterDumper clusterDumper = - new ClusterDumper(finalClusterPath(conf, output, maxIterations), new Path(output, "clusteredPoints")); + ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, + output, maxIterations), new Path(output, "clusteredPoints")); clusterDumper.printClusters(null); } } Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java?rev=1090881&r1=1090880&r2=1090881&view=diff ============================================================================== --- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java (original) +++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java Sun Apr 10 20:00:13 2011 @@ -57,13 +57,14 @@ public final class Job extends AbstractJ Path output = new Path("output"); Configuration conf = new Configuration(); HadoopUtil.delete(conf, output); - new Job().run(conf, new Path("testdata"), output, new EuclideanDistanceMeasure(), 6, 0.5, 10); + new Job().run(conf, new Path("testdata"), output, + new EuclideanDistanceMeasure(), 6, 0.5, 10); } } @Override - public int run(String[] args) - throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException, InterruptedException { + public int run(String[] args) throws IOException, ClassNotFoundException, + InstantiationException, IllegalAccessException, InterruptedException { addInputOption(); addOutputOption(); addOption(DefaultOptionCreator.distanceMeasureOption().create()); @@ -85,8 +86,10 @@ public final class Job extends AbstractJ if (measureClass == null) { measureClass = SquaredEuclideanDistanceMeasure.class.getName(); } - double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION)); - int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION)); + double convergenceDelta = Double + .parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION)); + int maxIterations = Integer + .parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION)); if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(getConf(), output); } @@ -94,74 +97,76 @@ public final class Job extends AbstractJ Class<?> cl = ccl.loadClass(measureClass); DistanceMeasure measure = (DistanceMeasure) cl.newInstance(); if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) { - int k = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)); + int k = Integer + .parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)); run(getConf(), input, output, measure, k, convergenceDelta, maxIterations); } else { double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION)); double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION)); - run(getConf(), input, output, measure, t1, t2, convergenceDelta, maxIterations); + run(getConf(), input, output, measure, t1, t2, convergenceDelta, + maxIterations); } return 0; } - + /** - * Run the kmeans clustering job on an input dataset using the given the number of clusters k and iteration - * parameters. All output data will be written to the output directory, which will be initially deleted if - * it exists. The clustered points will reside in the path <output>/clustered-points. By default, the job - * expects a file containing equal length space delimited data that resides in a directory named + * Run the kmeans clustering job on an input dataset using the given the + * number of clusters k and iteration parameters. All output data will be + * written to the output directory, which will be initially deleted if it + * exists. The clustered points will reside in the path + * <output>/clustered-points. By default, the job expects a file containing + * equal length space delimited data that resides in a directory named * "testdata", and writes output to a directory named "output". - * @param conf the Configuration to use + * + * @param conf + * the Configuration to use * @param input * the String denoting the input directory path * @param output * the String denoting the output directory path * @param measure * the DistanceMeasure to use - * @param k + * @param k * the number of clusters in Kmeans * @param convergenceDelta * the double convergence criteria for iterations * @param maxIterations * the int maximum number of iterations */ - public void run(Configuration conf, - Path input, - Path output, - DistanceMeasure measure, - int k, - double convergenceDelta, - int maxIterations) - throws IOException, InterruptedException, ClassNotFoundException { - Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); + public void run(Configuration conf, Path input, Path output, + DistanceMeasure measure, int k, double convergenceDelta, int maxIterations) + throws IOException, InterruptedException, ClassNotFoundException { + Path directoryContainingConvertedInput = new Path(output, + DIRECTORY_CONTAINING_CONVERTED_INPUT); log.info("Preparing Input"); - InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); + InputDriver.runJob(input, directoryContainingConvertedInput, + "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running random seed to get initial clusters"); Path clusters = new Path(output, Cluster.INITIAL_CLUSTERS_DIR); - clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure); + clusters = RandomSeedGenerator.buildRandom(conf, + directoryContainingConvertedInput, clusters, k, measure); log.info("Running KMeans"); - KMeansDriver.run(conf, - directoryContainingConvertedInput, - clusters, - output, - measure, - convergenceDelta, - maxIterations, - true, - false); + KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, + measure, convergenceDelta, maxIterations, true, false); // run ClusterDumper - ClusterDumper clusterDumper = - new ClusterDumper(finalClusterPath(conf, output, maxIterations), new Path(output, "clusteredPoints")); + ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, + output, maxIterations), new Path(output, "clusteredPoints")); clusterDumper.printClusters(null); } /** - * Run the kmeans clustering job on an input dataset using the given distance measure, t1, t2 and iteration - * parameters. All output data will be written to the output directory, which will be initially deleted if - * it exists. The clustered points will reside in the path <output>/clustered-points. By default, the job - * expects the a file containing synthetic_control.data as obtained from - * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named - * "testdata", and writes output to a directory named "output". - * @param conf the Configuration to use + * Run the kmeans clustering job on an input dataset using the given distance + * measure, t1, t2 and iteration parameters. All output data will be written + * to the output directory, which will be initially deleted if it exists. The + * clustered points will reside in the path <output>/clustered-points. By + * default, the job expects the a file containing synthetic_control.data as + * obtained from + * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series + * resides in a directory named "testdata", and writes output to a directory + * named "output". + * + * @param conf + * the Configuration to use * @param input * the String denoting the input directory path * @param output @@ -176,41 +181,39 @@ public final class Job extends AbstractJ * the double convergence criteria for iterations * @param maxIterations * the int maximum number of iterations + * @throws IOException + * @throws InterruptedException + * @throws ClassNotFoundException + * @throws IllegalAccessException + * @throws InstantiationException */ - public void run(Configuration conf, - Path input, - Path output, - DistanceMeasure measure, - double t1, - double t2, - double convergenceDelta, - int maxIterations) - throws IOException, InterruptedException, ClassNotFoundException { - Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); + public void run(Configuration conf, Path input, Path output, + DistanceMeasure measure, double t1, double t2, double convergenceDelta, + int maxIterations) throws IOException, InterruptedException, + ClassNotFoundException, InstantiationException, IllegalAccessException { + Path directoryContainingConvertedInput = new Path(output, + DIRECTORY_CONTAINING_CONVERTED_INPUT); log.info("Preparing Input"); - InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); + InputDriver.runJob(input, directoryContainingConvertedInput, + "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running Canopy to get initial clusters"); - CanopyDriver.run(conf, directoryContainingConvertedInput, output, measure, t1, t2, false, false); + CanopyDriver.run(conf, directoryContainingConvertedInput, output, measure, + t1, t2, false, false); log.info("Running KMeans"); - KMeansDriver.run(conf, - directoryContainingConvertedInput, - new Path(output, Cluster.INITIAL_CLUSTERS_DIR), - output, - measure, - convergenceDelta, - maxIterations, - true, - false); + KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(output, + Cluster.INITIAL_CLUSTERS_DIR), output, measure, convergenceDelta, + maxIterations, true, false); // run ClusterDumper - ClusterDumper clusterDumper = - new ClusterDumper(finalClusterPath(conf, output, maxIterations), new Path(output, "clusteredPoints")); + ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, + output, maxIterations), new Path(output, "clusteredPoints")); clusterDumper.printClusters(null); } /** * Return the path to the final iteration's clusters */ - private static Path finalClusterPath(Configuration conf, Path output, int maxIterations) throws IOException { + private static Path finalClusterPath(Configuration conf, Path output, + int maxIterations) throws IOException { FileSystem fs = FileSystem.get(conf); for (int i = maxIterations; i >= 0; i--) { Path clusters = new Path(output, "clusters-" + i);
