ma...

jeastman Sun, 10 Apr 2011 13:00:39 -0700

Modified: 
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java?rev=1090881&r1=1090880&r2=1090881&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java Sun Apr 10 20:00:13 2011
@@ -54,7 +54,8 @@ import org.junit.Test;
 
 public final class TestCanopyCreation extends MahoutTestCase {
 
-  private static final double[][] RAW = { { 1, 1 }, { 2, 1 }, { 1, 2 }, { 2, 2 }, { 3, 3 }, { 4, 4 }, { 5, 4 }, { 4, 5 }, { 5, 5 } };
+  private static final double[][] RAW = { { 1, 1 }, { 2, 1 }, { 1, 2 },
+      { 2, 2 }, { 3, 3 }, { 4, 4 }, { 5, 4 }, { 4, 5 }, { 5, 5 } };
 
   private List<Canopy> referenceManhattan;
 
@@ -116,13 +117,18 @@ public final class TestCanopyCreation ex
   public void setUp() throws Exception {
     super.setUp();
     fs = FileSystem.get(new Configuration());
-    referenceManhattan = CanopyClusterer.createCanopies(getPoints(), manhattanDistanceMeasure, 3.1, 2.1);
+    referenceManhattan = CanopyClusterer.createCanopies(getPoints(),
+        manhattanDistanceMeasure, 3.1, 2.1);
     manhattanCentroids = CanopyClusterer.getCenters(referenceManhattan);
-    referenceEuclidean = CanopyClusterer.createCanopies(getPoints(), euclideanDistanceMeasure, 3.1, 2.1);
+    referenceEuclidean = CanopyClusterer.createCanopies(getPoints(),
+        euclideanDistanceMeasure, 3.1, 2.1);
     euclideanCentroids = CanopyClusterer.getCenters(referenceEuclidean);
   }
 
-  /** Story: User can cluster points using a ManhattanDistanceMeasure and a reference implementation */
+  /**
+   * Story: User can cluster points using a ManhattanDistanceMeasure and a
+   * reference implementation
+   */
   @Test
   public void testReferenceManhattan() throws Exception {
     // see setUp for cluster creation
@@ -131,50 +137,60 @@ public final class TestCanopyCreation ex
     for (int canopyIx = 0; canopyIx < referenceManhattan.size(); canopyIx++) {
       Canopy testCanopy = referenceManhattan.get(canopyIx);
       int[] expectedNumPoints = { 4, 4, 3 };
-      double[][] expectedCentroids = { { 1.5, 1.5 }, { 4.0, 4.0 }, { 
4.666666666666667, 4.6666666666666667 } };
-      assertEquals("canopy points " + canopyIx, expectedNumPoints[canopyIx], 
testCanopy.getNumPoints());
+      double[][] expectedCentroids = { { 1.5, 1.5 }, { 4.0, 4.0 },
+          { 4.666666666666667, 4.6666666666666667 } };
+      assertEquals("canopy points " + canopyIx, expectedNumPoints[canopyIx],
+          testCanopy.getNumPoints());
       double[] refCentroid = expectedCentroids[canopyIx];
       Vector testCentroid = testCanopy.computeCentroid();
       for (int pointIx = 0; pointIx < refCentroid.length; pointIx++) {
-        assertEquals("canopy centroid " + canopyIx + '[' + pointIx + ']', 
refCentroid[pointIx], testCentroid.get(pointIx), EPSILON);
+        assertEquals("canopy centroid " + canopyIx + '[' + pointIx + ']',
+            refCentroid[pointIx], testCentroid.get(pointIx), EPSILON);
       }
     }
   }
 
-  /** Story: User can cluster points using a EuclideanDistanceMeasure and a 
reference implementation */
+  /**
+   * Story: User can cluster points using a EuclideanDistanceMeasure and a
+   * reference implementation
+   */
   @Test
   public void testReferenceEuclidean() throws Exception {
     // see setUp for cluster creation
     printCanopies(referenceEuclidean);
     assertEquals("number of canopies", 3, referenceEuclidean.size());
     int[] expectedNumPoints = { 5, 5, 3 };
-    double[][] expectedCentroids = { { 1.8, 1.8 }, { 4.2, 4.2 }, { 
4.666666666666667, 4.666666666666667 } };
+    double[][] expectedCentroids = { { 1.8, 1.8 }, { 4.2, 4.2 },
+        { 4.666666666666667, 4.666666666666667 } };
     for (int canopyIx = 0; canopyIx < referenceEuclidean.size(); canopyIx++) {
       Canopy testCanopy = referenceEuclidean.get(canopyIx);
-      assertEquals("canopy points " + canopyIx, expectedNumPoints[canopyIx], 
testCanopy.getNumPoints());
+      assertEquals("canopy points " + canopyIx, expectedNumPoints[canopyIx],
+          testCanopy.getNumPoints());
       double[] refCentroid = expectedCentroids[canopyIx];
       Vector testCentroid = testCanopy.computeCentroid();
       for (int pointIx = 0; pointIx < refCentroid.length; pointIx++) {
-        assertEquals("canopy centroid " + canopyIx + '[' + pointIx + ']', 
refCentroid[pointIx], testCentroid.get(pointIx), EPSILON);
+        assertEquals("canopy centroid " + canopyIx + '[' + pointIx + ']',
+            refCentroid[pointIx], testCentroid.get(pointIx), EPSILON);
       }
     }
   }
 
   /**
-   * Story: User can produce initial canopy centers using a 
ManhattanDistanceMeasure and a
-   * CanopyMapper which clusters input points to produce an output set of 
canopy centroid points.
+   * Story: User can produce initial canopy centers using a
+   * ManhattanDistanceMeasure and a CanopyMapper which clusters input points to
+   * produce an output set of canopy centroid points.
    */
   @Test
   public void testCanopyMapperManhattan() throws Exception {
     CanopyMapper mapper = new CanopyMapper();
     Configuration conf = new Configuration();
-    conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, 
manhattanDistanceMeasure.getClass().getName());
+    conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, manhattanDistanceMeasure
+        .getClass().getName());
     conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
     conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
     DummyRecordWriter<Text, VectorWritable> writer = new DummyRecordWriter<Text, VectorWritable>();
-    Mapper<WritableComparable<?>, VectorWritable, Text, 
VectorWritable>.Context context = DummyRecordWriter.build(mapper,
-                                                                               
                                   conf,
-                                                                               
                                   writer);
+    Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context context = DummyRecordWriter
+        .build(mapper, conf, writer);
     mapper.setup(context);
 
     List<VectorWritable> points = getPointsWritable();
@@ -188,25 +204,28 @@ public final class TestCanopyCreation ex
     List<VectorWritable> data = writer.getValue(new Text("centroid"));
     assertEquals("Number of centroids", 3, data.size());
     for (int i = 0; i < data.size(); i++) {
-      assertEquals("Centroid error", 
manhattanCentroids.get(i).asFormatString(), data.get(i).get().asFormatString());
+      assertEquals("Centroid error",
+          manhattanCentroids.get(i).asFormatString(), data.get(i).get()
+              .asFormatString());
     }
   }
 
   /**
-   * Story: User can produce initial canopy centers using a 
EuclideanDistanceMeasure and a
-   * CanopyMapper/Combiner which clusters input points to produce an output 
set of canopy centroid points.
+   * Story: User can produce initial canopy centers using a
+   * EuclideanDistanceMeasure and a CanopyMapper/Combiner which clusters input
+   * points to produce an output set of canopy centroid points.
    */
   @Test
   public void testCanopyMapperEuclidean() throws Exception {
     CanopyMapper mapper = new CanopyMapper();
     Configuration conf = new Configuration();
-    conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, 
euclideanDistanceMeasure.getClass().getName());
+    conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, euclideanDistanceMeasure
+        .getClass().getName());
     conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
     conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
     DummyRecordWriter<Text, VectorWritable> writer = new DummyRecordWriter<Text, VectorWritable>();
-    Mapper<WritableComparable<?>, VectorWritable, Text, 
VectorWritable>.Context context = DummyRecordWriter.build(mapper,
-                                                                               
                                   conf,
-                                                                               
                                   writer);
+    Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context context = DummyRecordWriter
+        .build(mapper, conf, writer);
     mapper.setup(context);
 
     List<VectorWritable> points = getPointsWritable();
@@ -220,27 +239,28 @@ public final class TestCanopyCreation ex
     List<VectorWritable> data = writer.getValue(new Text("centroid"));
     assertEquals("Number of centroids", 3, data.size());
     for (int i = 0; i < data.size(); i++) {
-      assertEquals("Centroid error", 
euclideanCentroids.get(i).asFormatString(), data.get(i).get().asFormatString());
+      assertEquals("Centroid error",
+          euclideanCentroids.get(i).asFormatString(), data.get(i).get()
+              .asFormatString());
     }
   }
 
   /**
-   * Story: User can produce final canopy centers using a 
ManhattanDistanceMeasure and a CanopyReducer which
-   * clusters input centroid points to produce an output set of final canopy 
centroid points.
+   * Story: User can produce final canopy centers using a
+   * ManhattanDistanceMeasure and a CanopyReducer which clusters input centroid
+   * points to produce an output set of final canopy centroid points.
    */
   @Test
   public void testCanopyReducerManhattan() throws Exception {
     CanopyReducer reducer = new CanopyReducer();
     Configuration conf = new Configuration();
-    conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, 
"org.apache.mahout.common.distance.ManhattanDistanceMeasure");
+    conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
+        "org.apache.mahout.common.distance.ManhattanDistanceMeasure");
     conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
     conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
     DummyRecordWriter<Text, Canopy> writer = new DummyRecordWriter<Text, Canopy>();
-    Reducer<Text, VectorWritable, Text, Canopy>.Context context = 
DummyRecordWriter.build(reducer,
-                                                                               
           conf,
-                                                                               
           writer,
-                                                                               
           Text.class,
-                                                                               
           VectorWritable.class);
+    Reducer<Text, VectorWritable, Text, Canopy>.Context context = DummyRecordWriter
+        .build(reducer, conf, writer, Text.class, VectorWritable.class);
     reducer.setup(context);
 
     List<VectorWritable> points = getPointsWritable();
@@ -250,29 +270,30 @@ public final class TestCanopyCreation ex
     int i = 0;
     for (Text key : keys) {
       List<Canopy> data = writer.getValue(key);
-      assertEquals(manhattanCentroids.get(i).asFormatString() + " is not equal 
to "
-          + data.get(0).computeCentroid().asFormatString(), 
manhattanCentroids.get(i), data.get(0).computeCentroid());
+      assertEquals(manhattanCentroids.get(i).asFormatString()
+          + " is not equal to "
+          + data.get(0).computeCentroid().asFormatString(), manhattanCentroids
+          .get(i), data.get(0).computeCentroid());
       i++;
     }
   }
 
   /**
-   * Story: User can produce final canopy centers using a 
EuclideanDistanceMeasure and a CanopyReducer which
-   * clusters input centroid points to produce an output set of final canopy 
centroid points.
+   * Story: User can produce final canopy centers using a
+   * EuclideanDistanceMeasure and a CanopyReducer which clusters input centroid
+   * points to produce an output set of final canopy centroid points.
    */
   @Test
   public void testCanopyReducerEuclidean() throws Exception {
     CanopyReducer reducer = new CanopyReducer();
     Configuration conf = new Configuration();
-    conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, 
"org.apache.mahout.common.distance.EuclideanDistanceMeasure");
+    conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
+        "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
     conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
     conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
     DummyRecordWriter<Text, Canopy> writer = new DummyRecordWriter<Text, Canopy>();
-    Reducer<Text, VectorWritable, Text, Canopy>.Context context = 
DummyRecordWriter.build(reducer,
-                                                                               
           conf,
-                                                                               
           writer,
-                                                                               
           Text.class,
-                                                                               
           VectorWritable.class);
+    Reducer<Text, VectorWritable, Text, Canopy>.Context context = DummyRecordWriter
+        .build(reducer, conf, writer, Text.class, VectorWritable.class);
     reducer.setup(context);
 
     List<VectorWritable> points = getPointsWritable();
@@ -282,25 +303,30 @@ public final class TestCanopyCreation ex
     int i = 0;
     for (Text key : keys) {
       List<Canopy> data = writer.getValue(key);
-      assertEquals(euclideanCentroids.get(i).asFormatString() + " is not equal 
to "
-          + data.get(0).computeCentroid().asFormatString(), 
euclideanCentroids.get(i), data.get(0).computeCentroid());
+      assertEquals(euclideanCentroids.get(i).asFormatString()
+          + " is not equal to "
+          + data.get(0).computeCentroid().asFormatString(), euclideanCentroids
+          .get(i), data.get(0).computeCentroid());
       i++;
     }
   }
 
   /**
-   * Story: User can produce final canopy centers using a Hadoop map/reduce 
job and a
-   * ManhattanDistanceMeasure.
+   * Story: User can produce final canopy centers using a Hadoop map/reduce job
+   * and a ManhattanDistanceMeasure.
    */
   @Test
   public void testCanopyGenManhattanMR() throws Exception {
     List<VectorWritable> points = getPointsWritable();
     Configuration config = new Configuration();
-    ClusteringTestUtils.writePointsToFile(points, 
getTestTempFilePath("testdata/file1"), fs, config);
-    ClusteringTestUtils.writePointsToFile(points, 
getTestTempFilePath("testdata/file2"), fs, config);
+    ClusteringTestUtils.writePointsToFile(points,
+        getTestTempFilePath("testdata/file1"), fs, config);
+    ClusteringTestUtils.writePointsToFile(points,
+        getTestTempFilePath("testdata/file2"), fs, config);
     // now run the Canopy Driver
     Path output = getTestTempDirPath("output");
-    CanopyDriver.run(config, getTestTempDirPath("testdata"), output, 
manhattanDistanceMeasure, 3.1, 2.1, false, false);
+    CanopyDriver.run(config, getTestTempDirPath("testdata"), output,
+        manhattanDistanceMeasure, 3.1, 2.1, false, false);
 
     // verify output from sequence file
     Path path = new Path(output, "clusters-0/part-r-00000");
@@ -314,25 +340,30 @@ public final class TestCanopyCreation ex
     assertEquals("1st y value", 1.5, canopy.getCenter().get(1), EPSILON);
     assertTrue("more to come", reader.next(key, canopy));
     assertEquals("2nd key", "C-1", key.toString());
-    assertEquals("2nd x value", 4.333333333333334, canopy.getCenter().get(0), 
EPSILON);
-    assertEquals("2nd y value", 4.333333333333334, canopy.getCenter().get(1), 
EPSILON);
+    assertEquals("2nd x value", 4.333333333333334, canopy.getCenter().get(0),
+        EPSILON);
+    assertEquals("2nd y value", 4.333333333333334, canopy.getCenter().get(1),
+        EPSILON);
     assertFalse("more to come", reader.next(key, canopy));
     reader.close();
   }
 
   /**
-   * Story: User can produce final canopy centers using a Hadoop map/reduce 
job and a
-   * EuclideanDistanceMeasure.
+   * Story: User can produce final canopy centers using a Hadoop map/reduce job
+   * and a EuclideanDistanceMeasure.
    */
   @Test
   public void testCanopyGenEuclideanMR() throws Exception {
     List<VectorWritable> points = getPointsWritable();
     Configuration config = new Configuration();
-    ClusteringTestUtils.writePointsToFile(points, 
getTestTempFilePath("testdata/file1"), fs, config);
-    ClusteringTestUtils.writePointsToFile(points, 
getTestTempFilePath("testdata/file2"), fs, config);
+    ClusteringTestUtils.writePointsToFile(points,
+        getTestTempFilePath("testdata/file1"), fs, config);
+    ClusteringTestUtils.writePointsToFile(points,
+        getTestTempFilePath("testdata/file2"), fs, config);
     // now run the Canopy Driver
     Path output = getTestTempDirPath("output");
-    CanopyDriver.run(config, getTestTempDirPath("testdata"), output, 
euclideanDistanceMeasure, 3.1, 2.1, false, false);
+    CanopyDriver.run(config, getTestTempDirPath("testdata"), output,
+        euclideanDistanceMeasure, 3.1, 2.1, false, false);
 
     // verify output from sequence file
     Path path = new Path(output, "clusters-0/part-r-00000");
@@ -346,18 +377,24 @@ public final class TestCanopyCreation ex
     assertEquals("1st y value", 1.8, value.getCenter().get(1), EPSILON);
     assertTrue("more to come", reader.next(key, value));
     assertEquals("2nd key", "C-1", key.toString());
-    assertEquals("2nd x value", 4.433333333333334, value.getCenter().get(0), 
EPSILON);
-    assertEquals("2nd y value", 4.433333333333334, value.getCenter().get(1), 
EPSILON);
+    assertEquals("2nd x value", 4.433333333333334, value.getCenter().get(0),
+        EPSILON);
+    assertEquals("2nd y value", 4.433333333333334, value.getCenter().get(1),
+        EPSILON);
     assertFalse("more to come", reader.next(key, value));
     reader.close();
   }
 
-  /** Story: User can cluster a subset of the points using a ClusterMapper and 
a ManhattanDistanceMeasure. */
+  /**
+   * Story: User can cluster a subset of the points using a ClusterMapper and a
+   * ManhattanDistanceMeasure.
+   */
   @Test
   public void testClusterMapperManhattan() throws Exception {
     ClusterMapper mapper = new ClusterMapper();
     Configuration conf = new Configuration();
-    conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, 
"org.apache.mahout.common.distance.ManhattanDistanceMeasure");
+    conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
+        "org.apache.mahout.common.distance.ManhattanDistanceMeasure");
     conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
     conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
     DummyRecordWriter<IntWritable, WeightedVectorWritable> writer = new DummyRecordWriter<IntWritable, WeightedVectorWritable>();
@@ -368,7 +405,8 @@ public final class TestCanopyCreation ex
     Collection<Canopy> canopies = new ArrayList<Canopy>();
     int nextCanopyId = 0;
     for (Vector centroid : manhattanCentroids) {
-      canopies.add(new Canopy(centroid, nextCanopyId++, 
manhattanDistanceMeasure));
+      canopies.add(new Canopy(centroid, nextCanopyId++,
+          manhattanDistanceMeasure));
     }
     mapper.config(canopies);
     List<VectorWritable> points = getPointsWritable();
@@ -378,22 +416,28 @@ public final class TestCanopyCreation ex
     }
     Map<IntWritable, List<WeightedVectorWritable>> data = writer.getData();
     assertEquals("Number of map results", canopies.size(), data.size());
-    for (Entry<IntWritable, List<WeightedVectorWritable>> stringListEntry : 
data.entrySet()) {
+    for (Entry<IntWritable, List<WeightedVectorWritable>> stringListEntry : data
+        .entrySet()) {
       IntWritable key = stringListEntry.getKey();
       Canopy canopy = findCanopy(key.get(), canopies);
       List<WeightedVectorWritable> pts = stringListEntry.getValue();
       for (WeightedVectorWritable ptDef : pts) {
-        assertTrue("Point not in canopy", mapper.canopyCovers(canopy, 
ptDef.getVector()));
+        assertTrue("Point not in canopy", mapper.canopyCovers(canopy, ptDef
+            .getVector()));
       }
     }
   }
 
-  /** Story: User can cluster a subset of the points using a ClusterMapper and 
a EuclideanDistanceMeasure. */
+  /**
+   * Story: User can cluster a subset of the points using a ClusterMapper and a
+   * EuclideanDistanceMeasure.
+   */
   @Test
   public void testClusterMapperEuclidean() throws Exception {
     ClusterMapper mapper = new ClusterMapper();
     Configuration conf = new Configuration();
-    conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, 
"org.apache.mahout.common.distance.EuclideanDistanceMeasure");
+    conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
+        "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
     conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
     conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
     DummyRecordWriter<IntWritable, WeightedVectorWritable> writer = new DummyRecordWriter<IntWritable, WeightedVectorWritable>();
@@ -404,7 +448,8 @@ public final class TestCanopyCreation ex
     Collection<Canopy> canopies = new ArrayList<Canopy>();
     int nextCanopyId = 0;
     for (Vector centroid : euclideanCentroids) {
-      canopies.add(new Canopy(centroid, nextCanopyId++, 
euclideanDistanceMeasure));
+      canopies.add(new Canopy(centroid, nextCanopyId++,
+          euclideanDistanceMeasure));
     }
     mapper.config(canopies);
     List<VectorWritable> points = getPointsWritable();
@@ -414,12 +459,14 @@ public final class TestCanopyCreation ex
     }
     Map<IntWritable, List<WeightedVectorWritable>> data = writer.getData();
     assertEquals("Number of map results", canopies.size(), data.size());
-    for (Entry<IntWritable, List<WeightedVectorWritable>> stringListEntry : 
data.entrySet()) {
+    for (Entry<IntWritable, List<WeightedVectorWritable>> stringListEntry : data
+        .entrySet()) {
       IntWritable key = stringListEntry.getKey();
       Canopy canopy = findCanopy(key.get(), canopies);
       List<WeightedVectorWritable> pts = stringListEntry.getValue();
       for (WeightedVectorWritable ptDef : pts) {
-        assertTrue("Point not in canopy", mapper.canopyCovers(canopy, 
ptDef.getVector()));
+        assertTrue("Point not in canopy", mapper.canopyCovers(canopy, ptDef
+            .getVector()));
       }
     }
   }
@@ -429,16 +476,20 @@ public final class TestCanopyCreation ex
   public void testClusteringManhattanSeq() throws Exception {
     List<VectorWritable> points = getPointsWritable();
     Configuration config = new Configuration();
-    ClusteringTestUtils.writePointsToFile(points, 
getTestTempFilePath("testdata/file1"), fs, config);
+    ClusteringTestUtils.writePointsToFile(points,
+        getTestTempFilePath("testdata/file1"), fs, config);
     // now run the Canopy Driver in sequential mode
     Path output = getTestTempDirPath("output");
-    CanopyDriver.run(config, getTestTempDirPath("testdata"), output, 
manhattanDistanceMeasure, 3.1, 2.1, true, true);
+    CanopyDriver.run(config, getTestTempDirPath("testdata"), output,
+        manhattanDistanceMeasure, 3.1, 2.1, true, true);
 
     // verify output from sequence file
     Path path = new Path(output, "clusters-0/part-r-00000");
     int ix = 0;
-    for (Canopy value : new SequenceFileValueIterable<Canopy>(path, true, 
config)) {
-      assertEquals("Center [" + ix + ']', manhattanCentroids.get(ix), 
value.getCenter());
+    for (Canopy value : new SequenceFileValueIterable<Canopy>(path, true,
+        config)) {
+      assertEquals("Center [" + ix + ']', manhattanCentroids.get(ix), value
+          .getCenter());
       ix++;
     }
 
@@ -452,14 +503,20 @@ public final class TestCanopyCreation ex
   public void testClusteringEuclideanSeq() throws Exception {
     List<VectorWritable> points = getPointsWritable();
     Configuration config = new Configuration();
-    ClusteringTestUtils.writePointsToFile(points, 
getTestTempFilePath("testdata/file1"), fs, config);
+    ClusteringTestUtils.writePointsToFile(points,
+        getTestTempFilePath("testdata/file1"), fs, config);
     // now run the Canopy Driver in sequential mode
     Path output = getTestTempDirPath("output");
-    String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), 
getTestTempDirPath("testdata").toString(),
-        optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(), 
optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION),
-        EuclideanDistanceMeasure.class.getName(), 
optKey(DefaultOptionCreator.T1_OPTION), "3.1",
-        optKey(DefaultOptionCreator.T2_OPTION), "2.1", 
optKey(DefaultOptionCreator.CLUSTERING_OPTION),
-        optKey(DefaultOptionCreator.OVERWRITE_OPTION), 
optKey(DefaultOptionCreator.METHOD_OPTION),
+    String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION),
+        getTestTempDirPath("testdata").toString(),
+        optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(),
+        optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION),
+        EuclideanDistanceMeasure.class.getName(),
+        optKey(DefaultOptionCreator.T1_OPTION), "3.1",
+        optKey(DefaultOptionCreator.T2_OPTION), "2.1",
+        optKey(DefaultOptionCreator.CLUSTERING_OPTION),
+        optKey(DefaultOptionCreator.OVERWRITE_OPTION),
+        optKey(DefaultOptionCreator.METHOD_OPTION),
         DefaultOptionCreator.SEQUENTIAL_METHOD };
     new CanopyDriver().run(args);
 
@@ -467,8 +524,10 @@ public final class TestCanopyCreation ex
     Path path = new Path(output, "clusters-0/part-r-00000");
 
     int ix = 0;
-    for (Canopy value : new SequenceFileValueIterable<Canopy>(path, true, 
config)) {
-      assertEquals("Center [" + ix + ']', euclideanCentroids.get(ix), 
value.getCenter());
+    for (Canopy value : new SequenceFileValueIterable<Canopy>(path, true,
+        config)) {
+      assertEquals("Center [" + ix + ']', euclideanCentroids.get(ix), value
+          .getCenter());
       ix++;
     }
 
@@ -478,39 +537,48 @@ public final class TestCanopyCreation ex
   }
 
   /**
-   * Story: User can produce final point clustering using a Hadoop map/reduce 
job and a
-   * ManhattanDistanceMeasure.
+   * Story: User can produce final point clustering using a Hadoop map/reduce
+   * job and a ManhattanDistanceMeasure.
    */
   @Test
   public void testClusteringManhattanMR() throws Exception {
     List<VectorWritable> points = getPointsWritable();
     Configuration conf = new Configuration();
-    ClusteringTestUtils.writePointsToFile(points, 
getTestTempFilePath("testdata/file1"), fs, conf);
-    ClusteringTestUtils.writePointsToFile(points, 
getTestTempFilePath("testdata/file2"), fs, conf);
+    ClusteringTestUtils.writePointsToFile(points,
+        getTestTempFilePath("testdata/file1"), fs, conf);
+    ClusteringTestUtils.writePointsToFile(points,
+        getTestTempFilePath("testdata/file2"), fs, conf);
     // now run the Job
     Path output = getTestTempDirPath("output");
-    CanopyDriver.run(conf, getTestTempDirPath("testdata"), output, 
manhattanDistanceMeasure, 3.1, 2.1, true, false);
+    CanopyDriver.run(conf, getTestTempDirPath("testdata"), output,
+        manhattanDistanceMeasure, 3.1, 2.1, true, false);
     Path path = new Path(output, "clusteredPoints/part-m-00000");
     long count = HadoopUtil.countRecords(path, conf);
     assertEquals("number of points", points.size(), count);
   }
 
   /**
-   * Story: User can produce final point clustering using a Hadoop map/reduce 
job and a
-   * EuclideanDistanceMeasure.
+   * Story: User can produce final point clustering using a Hadoop map/reduce
+   * job and a EuclideanDistanceMeasure.
    */
   @Test
   public void testClusteringEuclideanMR() throws Exception {
     List<VectorWritable> points = getPointsWritable();
     Configuration conf = new Configuration();
-    ClusteringTestUtils.writePointsToFile(points, 
getTestTempFilePath("testdata/file1"), fs, conf);
-    ClusteringTestUtils.writePointsToFile(points, 
getTestTempFilePath("testdata/file2"), fs, conf);
+    ClusteringTestUtils.writePointsToFile(points,
+        getTestTempFilePath("testdata/file1"), fs, conf);
+    ClusteringTestUtils.writePointsToFile(points,
+        getTestTempFilePath("testdata/file2"), fs, conf);
     // now run the Job using the run() command. Others can use runJob().
     Path output = getTestTempDirPath("output");
-    String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), 
getTestTempDirPath("testdata").toString(),
-        optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(), 
optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION),
-        EuclideanDistanceMeasure.class.getName(), 
optKey(DefaultOptionCreator.T1_OPTION), "3.1",
-        optKey(DefaultOptionCreator.T2_OPTION), "2.1", 
optKey(DefaultOptionCreator.CLUSTERING_OPTION),
+    String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION),
+        getTestTempDirPath("testdata").toString(),
+        optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(),
+        optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION),
+        EuclideanDistanceMeasure.class.getName(),
+        optKey(DefaultOptionCreator.T1_OPTION), "3.1",
+        optKey(DefaultOptionCreator.T2_OPTION), "2.1",
+        optKey(DefaultOptionCreator.CLUSTERING_OPTION),
         optKey(DefaultOptionCreator.OVERWRITE_OPTION) };
     ToolRunner.run(new Configuration(), new CanopyDriver(), args);
     Path path = new Path(output, "clusteredPoints/part-m-00000");
@@ -518,17 +586,23 @@ public final class TestCanopyCreation ex
     assertEquals("number of points", points.size(), count);
   }
 
-  /** Story: Clustering algorithm must support arbitrary user defined distance 
measure */
+  /**
+   * Story: Clustering algorithm must support arbitrary user defined distance
+   * measure
+   */
   @Test
   public void testUserDefinedDistanceMeasure() throws Exception {
     List<VectorWritable> points = getPointsWritable();
     Configuration conf = new Configuration();
-    ClusteringTestUtils.writePointsToFile(points, 
getTestTempFilePath("testdata/file1"), fs, conf);
-    ClusteringTestUtils.writePointsToFile(points, 
getTestTempFilePath("testdata/file2"), fs, conf);
+    ClusteringTestUtils.writePointsToFile(points,
+        getTestTempFilePath("testdata/file1"), fs, conf);
+    ClusteringTestUtils.writePointsToFile(points,
+        getTestTempFilePath("testdata/file2"), fs, conf);
     // now run the Canopy Driver. User defined measure happens to be a Manhattan
     // subclass so results are same.
     Path output = getTestTempDirPath("output");
-    CanopyDriver.run(conf, getTestTempDirPath("testdata"), output, new 
UserDefinedDistanceMeasure(), 3.1, 2.1, false, false);
+    CanopyDriver.run(conf, getTestTempDirPath("testdata"), output,
+        new UserDefinedDistanceMeasure(), 3.1, 2.1, false, false);
 
     // verify output from sequence file
     Configuration job = new Configuration();
@@ -545,9 +619,33 @@ public final class TestCanopyCreation ex
     assertTrue("more to come", reader.next(key, value));
     assertEquals("2nd key", "C-1", key.toString());
 
-    assertEquals("1st x value", 4.333333333333334, value.getCenter().get(0), 
EPSILON);
-    assertEquals("1st y value", 4.333333333333334, value.getCenter().get(1), 
EPSILON);
+    assertEquals("1st x value", 4.333333333333334, value.getCenter().get(0),
+        EPSILON);
+    assertEquals("1st y value", 4.333333333333334, value.getCenter().get(1),
+        EPSILON);
     assertFalse("more to come", reader.next(key, value));
     reader.close();
   }
+
+  /**
+   * Story: User can set T3 and T4 values to be used by the reducer for its T1
+   * and T2 thresholds
+   */
+  @Test
+  public void testCanopyReducerT3T4Configuration() throws Exception {
+    CanopyReducer reducer = new CanopyReducer();
+    Configuration conf = new Configuration();
+    conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
+        "org.apache.mahout.common.distance.ManhattanDistanceMeasure");
+    conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
+    conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
+    conf.set(CanopyConfigKeys.T3_KEY, String.valueOf(1.1));
+    conf.set(CanopyConfigKeys.T4_KEY, String.valueOf(0.1));
+    DummyRecordWriter<Text, Canopy> writer = new DummyRecordWriter<Text, Canopy>();
+    Reducer<Text, VectorWritable, Text, Canopy>.Context context = DummyRecordWriter
+        .build(reducer, conf, writer, Text.class, VectorWritable.class);
+    reducer.setup(context);
+    assertEquals(1.1, reducer.canopyClusterer.t1, EPSILON);
+    assertEquals(0.1, reducer.canopyClusterer.t2, EPSILON);
+  }
 }


Modified: 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java?rev=1090881&r1=1090880&r2=1090881&view=diff
==============================================================================
--- 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
 (original)
+++ 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
 Sun Apr 10 20:00:13 2011
@@ -56,12 +56,15 @@ public final class Job extends AbstractJ
   }
 
   /**
-   * Run the canopy clustering job on an input dataset using the given 
distance measure, t1 and t2 parameters.
-   * All output data will be written to the output directory, which will be 
initially deleted if it exists.
-   * The clustered points will reside in the path <output>/clustered-points. 
By default, the job expects the a
-   * file containing synthetic_control.data as obtained from
-   * 
http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series 
resides in a directory named
-   * "testdata", and writes output to a directory named "output".
+   * Run the canopy clustering job on an input dataset using the given distance
+   * measure, t1 and t2 parameters. All output data will be written to the
+   * output directory, which will be initially deleted if it exists. The
+   * clustered points will reside in the path <output>/clustered-points. By
+   * default, the job expects the a file containing synthetic_control.data as
+   * obtained from
+   * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series
+   * resides in a directory named "testdata", and writes output to a directory
+   * named "output".
    * 
    * @param input
    *          the String denoting the input directory path
@@ -73,15 +76,24 @@ public final class Job extends AbstractJ
    *          the canopy T1 threshold
    * @param t2
    *          the canopy T2 threshold
+   * @throws IOException
+   * @throws InterruptedException
+   * @throws ClassNotFoundException
+   * @throws InstantiationException
+   * @throws IllegalAccessException
    */
-  private static void run(Path input, Path output, DistanceMeasure measure, 
double t1, double t2)
-    throws IOException, InterruptedException, ClassNotFoundException {
-    Path directoryContainingConvertedInput = new Path(output, 
DIRECTORY_CONTAINING_CONVERTED_INPUT);
-    InputDriver.runJob(input, directoryContainingConvertedInput, 
"org.apache.mahout.math.RandomAccessSparseVector");
-    CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, 
output, measure, t1, t2, true, false);
+  private static void run(Path input, Path output, DistanceMeasure measure,
+      double t1, double t2) throws IOException, InterruptedException,
+      ClassNotFoundException, InstantiationException, IllegalAccessException {
+    Path directoryContainingConvertedInput = new Path(output,
+        DIRECTORY_CONTAINING_CONVERTED_INPUT);
+    InputDriver.runJob(input, directoryContainingConvertedInput,
+        "org.apache.mahout.math.RandomAccessSparseVector");
+    CanopyDriver.run(new Configuration(), directoryContainingConvertedInput,
+        output, measure, t1, t2, true, false);
     // run ClusterDumper
-    ClusterDumper clusterDumper =
-        new ClusterDumper(new Path(output, "clusters-0"), new Path(output, 
"clusteredPoints"));
+    ClusterDumper clusterDumper = new ClusterDumper(new Path(output,
+        "clusters-0"), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(null);
   }
 
@@ -109,7 +121,8 @@ public final class Job extends AbstractJ
     double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
     double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
     ClassLoader ccl = Thread.currentThread().getContextClassLoader();
-    DistanceMeasure measure = (DistanceMeasure) ((Class<?>) 
ccl.loadClass(measureClass)).newInstance();
+    DistanceMeasure measure = (DistanceMeasure) ((Class<?>) ccl
+        .loadClass(measureClass)).newInstance();
 
     run(input, output, measure, t1, t2);
     return 0;

Modified: 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java?rev=1090881&r1=1090880&r2=1090881&view=diff
==============================================================================
--- 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
 (original)
+++ 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
 Sun Apr 10 20:00:13 2011
@@ -45,6 +45,7 @@ public final class Job extends AbstractJ
   private static final Logger log = LoggerFactory.getLogger(Job.class);
 
   private static final String DIRECTORY_CONTAINING_CONVERTED_INPUT = "data";
+
   private static final String M_OPTION = FuzzyKMeansDriver.M_OPTION;
 
   private Job() {
@@ -59,17 +60,14 @@ public final class Job extends AbstractJ
       Path output = new Path("output");
       Configuration conf = new Configuration();
       HadoopUtil.delete(conf, output);
-      new Job().run(conf,
-                    new Path("testdata"),
-                    output,
-                    new EuclideanDistanceMeasure(),
-                    80, 55, 10, (float) 2, 0.5);
+      new Job().run(conf, new Path("testdata"), output,
+          new EuclideanDistanceMeasure(), 80, 55, 10, (float) 2, 0.5);
     }
   }
 
   @Override
-  public int run(String[] args)
-    throws IOException, ClassNotFoundException, InstantiationException, 
IllegalAccessException, InterruptedException {
+  public int run(String[] args) throws IOException, ClassNotFoundException,
+      InstantiationException, IllegalAccessException, InterruptedException {
     addInputOption();
     addOutputOption();
     addOption(DefaultOptionCreator.distanceMeasureOption().create());
@@ -78,7 +76,8 @@ public final class Job extends AbstractJ
     addOption(DefaultOptionCreator.overwriteOption().create());
     addOption(DefaultOptionCreator.t1Option().create());
     addOption(DefaultOptionCreator.t2Option().create());
-    addOption(M_OPTION, M_OPTION, "coefficient normalization factor, must be 
greater than 1", true);
+    addOption(M_OPTION, M_OPTION,
+        "coefficient normalization factor, must be greater than 1", true);
 
     Map<String, String> argMap = parseArguments(args);
     if (argMap == null) {
@@ -91,29 +90,36 @@ public final class Job extends AbstractJ
     if (measureClass == null) {
       measureClass = SquaredEuclideanDistanceMeasure.class.getName();
     }
-    double convergenceDelta = 
Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
-    int maxIterations = 
Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
+    double convergenceDelta = Double
+        .parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
+    int maxIterations = Integer
+        .parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
     float fuzziness = Float.parseFloat(getOption(M_OPTION));
 
-    addOption(new 
DefaultOptionBuilder().withLongName(M_OPTION).withRequired(true)
-        .withArgument(new 
ArgumentBuilder().withName(M_OPTION).withMinimum(1).withMaximum(1).create())
-        .withDescription("coefficient normalization factor, must be greater 
than 1").withShortName(M_OPTION)
-        .create());
+    addOption(new DefaultOptionBuilder().withLongName(M_OPTION).withRequired(
+        true).withArgument(
+        new ArgumentBuilder().withName(M_OPTION).withMinimum(1).withMaximum(1)
+            .create()).withDescription(
+        "coefficient normalization factor, must be greater than 1")
+        .withShortName(M_OPTION).create());
     if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
       HadoopUtil.delete(getConf(), output);
     }
     ClassLoader ccl = Thread.currentThread().getContextClassLoader();
-    DistanceMeasure measure = 
ccl.loadClass(measureClass).asSubclass(DistanceMeasure.class).newInstance();
+    DistanceMeasure measure = ccl.loadClass(measureClass).asSubclass(
+        DistanceMeasure.class).newInstance();
     double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
     double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
-    run(getConf(), input, output, measure, t1, t2, maxIterations, fuzziness, 
convergenceDelta);
+    run(getConf(), input, output, measure, t1, t2, maxIterations, fuzziness,
+        convergenceDelta);
     return 0;
   }
 
   /**
    * Return the path to the final iteration's clusters
    */
-  private static Path finalClusterPath(Configuration conf, Path output, int 
maxIterations) throws IOException {
+  private static Path finalClusterPath(Configuration conf, Path output,
+      int maxIterations) throws IOException {
     FileSystem fs = FileSystem.get(conf);
     for (int i = maxIterations; i >= 0; i--) {
       Path clusters = new Path(output, "clusters-" + i);
@@ -125,12 +131,16 @@ public final class Job extends AbstractJ
   }
 
   /**
-   * Run the kmeans clustering job on an input dataset using the given 
distance measure, t1, t2 and iteration
-   * parameters. All output data will be written to the output directory, 
which will be initially deleted if
-   * it exists. The clustered points will reside in the path 
<output>/clustered-points. By default, the job
-   * expects the a file containing synthetic_control.data as obtained from
-   * 
http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series 
resides in a directory named
-   * "testdata", and writes output to a directory named "output".
+   * Run the kmeans clustering job on an input dataset using the given distance
+   * measure, t1, t2 and iteration parameters. All output data will be written
+   * to the output directory, which will be initially deleted if it exists. The
+   * clustered points will reside in the path <output>/clustered-points. By
+   * default, the job expects the a file containing synthetic_control.data as
+   * obtained from
+   * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series
+   * resides in a directory named "testdata", and writes output to a directory
+   * named "output".
+   * 
    * @param input
    *          the String denoting the input directory path
    * @param output
@@ -139,43 +149,37 @@ public final class Job extends AbstractJ
    *          the canopy T1 threshold
    * @param t2
    *          the canopy T2 threshold
-   * @param maxIterations 
+   * @param maxIterations
    *          the int maximum number of iterations
-   * @param fuzziness 
+   * @param fuzziness
    *          the float "m" fuzziness coefficient
    * @param convergenceDelta
    *          the double convergence criteria for iterations
+   * @throws InterruptedException 
+   * @throws ClassNotFoundException 
+   * @throws IllegalAccessException
+   * @throws InstantiationException
    */
-  public void run(Configuration conf,
-                  Path input,
-                  Path output,
-                  DistanceMeasure measure,
-                  double t1,
-                  double t2,
-                  int maxIterations,
-                  float fuzziness,
-                  double convergenceDelta)
-    throws IOException, InterruptedException, ClassNotFoundException {
-    Path directoryContainingConvertedInput = new Path(output, 
DIRECTORY_CONTAINING_CONVERTED_INPUT);
+  public void run(Configuration conf, Path input, Path output,
+      DistanceMeasure measure, double t1, double t2, int maxIterations,
+      float fuzziness, double convergenceDelta) throws IOException,
+      InterruptedException, ClassNotFoundException, InstantiationException,
+      IllegalAccessException {
+    Path directoryContainingConvertedInput = new Path(output,
+        DIRECTORY_CONTAINING_CONVERTED_INPUT);
     log.info("Preparing Input");
-    InputDriver.runJob(input, directoryContainingConvertedInput, 
"org.apache.mahout.math.RandomAccessSparseVector");
+    InputDriver.runJob(input, directoryContainingConvertedInput,
+        "org.apache.mahout.math.RandomAccessSparseVector");
     log.info("Running Canopy to get initial clusters");
-    CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, 
output, measure, t1, t2, false, false);
+    CanopyDriver.run(new Configuration(), directoryContainingConvertedInput,
+        output, measure, t1, t2, false, false);
     log.info("Running FuzzyKMeans");
-    FuzzyKMeansDriver.run(directoryContainingConvertedInput,
-                          new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
-                          output,
-                          measure,
-                          convergenceDelta,
-                          maxIterations,
-                          fuzziness,
-                          true,
-                          true,
-                          0.0,
-                          false);
+    FuzzyKMeansDriver.run(directoryContainingConvertedInput, new Path(output,
+        Cluster.INITIAL_CLUSTERS_DIR), output, measure, convergenceDelta,
+        maxIterations, fuzziness, true, true, 0.0, false);
     // run ClusterDumper
-    ClusterDumper clusterDumper =
-        new ClusterDumper(finalClusterPath(conf, output, maxIterations), new 
Path(output, "clusteredPoints"));
+    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
+        output, maxIterations), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(null);
   }
 }

Modified: 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java?rev=1090881&r1=1090880&r2=1090881&view=diff
==============================================================================
--- 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
 (original)
+++ 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
 Sun Apr 10 20:00:13 2011
@@ -57,13 +57,14 @@ public final class Job extends AbstractJ
       Path output = new Path("output");
       Configuration conf = new Configuration();
       HadoopUtil.delete(conf, output);
-      new Job().run(conf, new Path("testdata"), output, new 
EuclideanDistanceMeasure(), 6, 0.5, 10);
+      new Job().run(conf, new Path("testdata"), output,
+          new EuclideanDistanceMeasure(), 6, 0.5, 10);
     }
   }
 
   @Override
-  public int run(String[] args)
-    throws IOException, ClassNotFoundException, InstantiationException, 
IllegalAccessException, InterruptedException {
+  public int run(String[] args) throws IOException, ClassNotFoundException,
+      InstantiationException, IllegalAccessException, InterruptedException {
     addInputOption();
     addOutputOption();
     addOption(DefaultOptionCreator.distanceMeasureOption().create());
@@ -85,8 +86,10 @@ public final class Job extends AbstractJ
     if (measureClass == null) {
       measureClass = SquaredEuclideanDistanceMeasure.class.getName();
     }
-    double convergenceDelta = 
Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
-    int maxIterations = 
Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
+    double convergenceDelta = Double
+        .parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
+    int maxIterations = Integer
+        .parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
     if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
       HadoopUtil.delete(getConf(), output);
     }
@@ -94,74 +97,76 @@ public final class Job extends AbstractJ
     Class<?> cl = ccl.loadClass(measureClass);
     DistanceMeasure measure = (DistanceMeasure) cl.newInstance();
     if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
-      int k = 
Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
+      int k = Integer
+          .parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
       run(getConf(), input, output, measure, k, convergenceDelta, maxIterations);
     } else {
       double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
       double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
-      run(getConf(), input, output, measure, t1, t2, convergenceDelta, 
maxIterations);
+      run(getConf(), input, output, measure, t1, t2, convergenceDelta,
+          maxIterations);
     }
     return 0;
   }
-  
+
   /**
-   * Run the kmeans clustering job on an input dataset using the given the 
number of clusters k and iteration
-   * parameters. All output data will be written to the output directory, 
which will be initially deleted if
-   * it exists. The clustered points will reside in the path 
<output>/clustered-points. By default, the job
-   * expects a file containing equal length space delimited data that resides 
in a directory named
+   * Run the kmeans clustering job on an input dataset using the given the
+   * number of clusters k and iteration parameters. All output data will be
+   * written to the output directory, which will be initially deleted if it
+   * exists. The clustered points will reside in the path
+   * <output>/clustered-points. By default, the job expects a file containing
+   * equal length space delimited data that resides in a directory named
    * "testdata", and writes output to a directory named "output".
-   * @param conf the Configuration to use
+   * 
+   * @param conf
+   *          the Configuration to use
    * @param input
    *          the String denoting the input directory path
    * @param output
    *          the String denoting the output directory path
    * @param measure
    *          the DistanceMeasure to use
-   * @param k 
+   * @param k
    *          the number of clusters in Kmeans
    * @param convergenceDelta
    *          the double convergence criteria for iterations
    * @param maxIterations
    *          the int maximum number of iterations
    */
-  public void run(Configuration conf,
-                  Path input,
-                  Path output,
-                  DistanceMeasure measure,
-                  int k,
-                  double convergenceDelta,
-                  int maxIterations)
-    throws IOException, InterruptedException, ClassNotFoundException {
-    Path directoryContainingConvertedInput = new Path(output, 
DIRECTORY_CONTAINING_CONVERTED_INPUT);
+  public void run(Configuration conf, Path input, Path output,
+      DistanceMeasure measure, int k, double convergenceDelta, int maxIterations)
+      throws IOException, InterruptedException, ClassNotFoundException {
+    Path directoryContainingConvertedInput = new Path(output,
+        DIRECTORY_CONTAINING_CONVERTED_INPUT);
     log.info("Preparing Input");
-    InputDriver.runJob(input, directoryContainingConvertedInput, 
"org.apache.mahout.math.RandomAccessSparseVector");
+    InputDriver.runJob(input, directoryContainingConvertedInput,
+        "org.apache.mahout.math.RandomAccessSparseVector");
     log.info("Running random seed to get initial clusters");
     Path clusters = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
-    clusters = RandomSeedGenerator.buildRandom(conf, 
directoryContainingConvertedInput, clusters, k, measure);
+    clusters = RandomSeedGenerator.buildRandom(conf,
+        directoryContainingConvertedInput, clusters, k, measure);
     log.info("Running KMeans");
-    KMeansDriver.run(conf,
-                     directoryContainingConvertedInput,
-                     clusters,
-                     output,
-                     measure,
-                     convergenceDelta,
-                     maxIterations,
-                     true,
-                     false);
+    KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output,
+        measure, convergenceDelta, maxIterations, true, false);
     // run ClusterDumper
-    ClusterDumper clusterDumper =
-        new ClusterDumper(finalClusterPath(conf, output, maxIterations), new 
Path(output, "clusteredPoints"));
+    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
+        output, maxIterations), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(null);
   }
 
   /**
-   * Run the kmeans clustering job on an input dataset using the given 
distance measure, t1, t2 and iteration
-   * parameters. All output data will be written to the output directory, 
which will be initially deleted if
-   * it exists. The clustered points will reside in the path 
<output>/clustered-points. By default, the job
-   * expects the a file containing synthetic_control.data as obtained from
-   * 
http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series 
resides in a directory named
-   * "testdata", and writes output to a directory named "output".
-   * @param conf the Configuration to use
+   * Run the kmeans clustering job on an input dataset using the given distance
+   * measure, t1, t2 and iteration parameters. All output data will be written
+   * to the output directory, which will be initially deleted if it exists. The
+   * clustered points will reside in the path <output>/clustered-points. By
+   * default, the job expects the a file containing synthetic_control.data as
+   * obtained from
+   * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series
+   * resides in a directory named "testdata", and writes output to a directory
+   * named "output".
+   * 
+   * @param conf
+   *          the Configuration to use
    * @param input
    *          the String denoting the input directory path
    * @param output
@@ -176,41 +181,39 @@ public final class Job extends AbstractJ
    *          the double convergence criteria for iterations
    * @param maxIterations
    *          the int maximum number of iterations
+   * @throws IOException 
+   * @throws InterruptedException 
+   * @throws ClassNotFoundException 
+   * @throws IllegalAccessException
+   * @throws InstantiationException
    */
-  public void run(Configuration conf,
-                  Path input,
-                  Path output,
-                  DistanceMeasure measure,
-                  double t1,
-                  double t2,
-                  double convergenceDelta,
-                  int maxIterations)
-    throws IOException, InterruptedException, ClassNotFoundException {
-    Path directoryContainingConvertedInput = new Path(output, 
DIRECTORY_CONTAINING_CONVERTED_INPUT);
+  public void run(Configuration conf, Path input, Path output,
+      DistanceMeasure measure, double t1, double t2, double convergenceDelta,
+      int maxIterations) throws IOException, InterruptedException,
+      ClassNotFoundException, InstantiationException, IllegalAccessException {
+    Path directoryContainingConvertedInput = new Path(output,
+        DIRECTORY_CONTAINING_CONVERTED_INPUT);
     log.info("Preparing Input");
-    InputDriver.runJob(input, directoryContainingConvertedInput, 
"org.apache.mahout.math.RandomAccessSparseVector");
+    InputDriver.runJob(input, directoryContainingConvertedInput,
+        "org.apache.mahout.math.RandomAccessSparseVector");
     log.info("Running Canopy to get initial clusters");
-    CanopyDriver.run(conf, directoryContainingConvertedInput, output, measure, 
t1, t2, false, false);
+    CanopyDriver.run(conf, directoryContainingConvertedInput, output, measure,
+        t1, t2, false, false);
     log.info("Running KMeans");
-    KMeansDriver.run(conf,
-                     directoryContainingConvertedInput,
-                     new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
-                     output,
-                     measure,
-                     convergenceDelta,
-                     maxIterations,
-                     true,
-                     false);
+    KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(output,
+        Cluster.INITIAL_CLUSTERS_DIR), output, measure, convergenceDelta,
+        maxIterations, true, false);
     // run ClusterDumper
-    ClusterDumper clusterDumper =
-        new ClusterDumper(finalClusterPath(conf, output, maxIterations), new 
Path(output, "clusteredPoints"));
+    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
+        output, maxIterations), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(null);
   }
 
   /**
    * Return the path to the final iteration's clusters
    */
-  private static Path finalClusterPath(Configuration conf, Path output, int 
maxIterations) throws IOException {
+  private static Path finalClusterPath(Configuration conf, Path output,
+      int maxIterations) throws IOException {
     FileSystem fs = FileSystem.get(conf);
     for (int i = maxIterations; i >= 0; i--) {
       Path clusters = new Path(output, "clusters-" + i);

svn commit: r1090881 [2/2] - in /mahout/trunk: core/src/main/java/org/apache/mahout/clustering/canopy/ core/src/main/java/org/apache/mahout/common/commandline/ core/src/test/java/org/apache/mahout/clustering/canopy/ examples/src/main/java/org/apache/ma...

Reply via email to