Author: gsingers
Date: Thu Jul 14 18:18:49 2011
New Revision: 1146835
URL: http://svn.apache.org/viewvc?rev=1146835&view=rev
Log:
MAHOUT-761: kmeans can emit its distance in the clustering step
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedVectorWritable.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedVectorWritable.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedVectorWritable.java?rev=1146835&r1=1146834&r2=1146835&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedVectorWritable.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedVectorWritable.java
Thu Jul 14 18:18:49 2011
@@ -25,10 +25,10 @@ import org.apache.hadoop.io.Writable;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
-public final class WeightedVectorWritable implements Writable {
+public class WeightedVectorWritable implements Writable {
- private final VectorWritable vectorWritable = new VectorWritable();
- private double weight;
+ protected VectorWritable vectorWritable = new VectorWritable();
+ protected double weight;
public WeightedVectorWritable() {
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java?rev=1146835&r1=1146834&r2=1146835&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java
Thu Jul 14 18:18:49 2011
@@ -26,6 +26,7 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.clustering.WeightedPropertyVectorWritable;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.math.VectorWritable;
@@ -38,7 +39,7 @@ import org.apache.mahout.math.VectorWrit
* @see KMeansDriver for more information on how to invoke this process
*/
public class KMeansClusterMapper
- extends
Mapper<WritableComparable<?>,VectorWritable,IntWritable,WeightedVectorWritable>
{
+ extends
Mapper<WritableComparable<?>,VectorWritable,IntWritable,WeightedPropertyVectorWritable>
{
private final Collection<Cluster> clusters = Lists.newArrayList();
private KMeansClusterer clusterer;
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java?rev=1146835&r1=1146834&r2=1146835&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java
Thu Jul 14 18:18:49 2011
@@ -17,15 +17,20 @@
package org.apache.mahout.clustering.kmeans;
import java.io.IOException;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
import com.google.common.collect.Lists;
+import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile.Writer;
+import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.clustering.AbstractCluster;
import org.apache.mahout.clustering.ClusterObservations;
+import org.apache.mahout.clustering.WeightedPropertyVectorWritable;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.math.Vector;
@@ -117,7 +122,7 @@ public class KMeansClusterer {
public void outputPointWithClusterInfo(Vector vector,
Iterable<Cluster> clusters,
-
Mapper<?,?,IntWritable,WeightedVectorWritable>.Context context)
+
Mapper<?,?,IntWritable,WeightedPropertyVectorWritable>.Context context)
throws IOException, InterruptedException {
AbstractCluster nearestCluster = null;
double nearestDistance = Double.MAX_VALUE;
@@ -129,7 +134,9 @@ public class KMeansClusterer {
nearestDistance = distance;
}
}
- context.write(new IntWritable(nearestCluster.getId()), new
WeightedVectorWritable(1, vector));
+ Map<Text, Text> props = new HashMap<Text, Text>();
+ props.put(new Text("distance"), new Text(String.valueOf(nearestDistance)));
+ context.write(new IntWritable(nearestCluster.getId()), new
WeightedPropertyVectorWritable(1, vector, props));
}
/**
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java?rev=1146835&r1=1146834&r2=1146835&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
Thu Jul 14 18:18:49 2011
@@ -36,6 +36,7 @@ import org.apache.hadoop.mapreduce.lib.o
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.clustering.AbstractCluster;
import org.apache.mahout.clustering.ClusterObservations;
+import org.apache.mahout.clustering.WeightedPropertyVectorWritable;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
@@ -474,7 +475,7 @@ public class KMeansDriver extends Abstra
job.setInputFormatClass(SequenceFileInputFormat.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
job.setOutputKeyClass(IntWritable.class);
- job.setOutputValueClass(WeightedVectorWritable.class);
+ job.setOutputValueClass(WeightedPropertyVectorWritable.class);
FileInputFormat.setInputPaths(job, input);
HadoopUtil.delete(conf, output);
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java?rev=1146835&r1=1146834&r2=1146835&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
Thu Jul 14 18:18:49 2011
@@ -17,19 +17,6 @@
package org.apache.mahout.utils.clustering;
-import java.io.File;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.TreeMap;
-
import com.google.common.base.Charsets;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
@@ -38,8 +25,10 @@ import org.apache.commons.lang.StringUti
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
import org.apache.mahout.clustering.AbstractCluster;
import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.WeightedPropertyVectorWritable;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.Pair;
@@ -52,6 +41,19 @@ import org.apache.mahout.utils.vectors.V
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+
public final class ClusterDumper extends AbstractJob {
public static final String OUTPUT_OPTION = "output";
@@ -94,8 +96,8 @@ public final class ClusterDumper extends
addOption(SUBSTRING_OPTION, "b", "The number of chars of the
asFormatString() to print");
addOption(NUM_WORDS_OPTION, "n", "The number of top terms to print");
addOption(POINTS_DIR_OPTION, "p",
- "The directory containing points sequence files mapping input vectors
to their cluster. "
- + "If specified, then the program will output the points
associated with a cluster");
+ "The directory containing points sequence files mapping input
vectors to their cluster. "
+ + "If specified, then the program will output the points
associated with a cluster");
addOption(DICTIONARY_OPTION, "d", "The dictionary file");
addOption(DICTIONARY_TYPE_OPTION, "dt", "The dictionary file type
(text|sequencefile)", "text");
if (parseArguments(args) == null) {
@@ -147,7 +149,7 @@ public final class ClusterDumper extends
}
try {
for (Cluster value :
- new SequenceFileDirValueIterable<Cluster>(new Path(seqFileDir,
"part-*"), PathType.GLOB, conf)) {
+ new SequenceFileDirValueIterable<Cluster>(new Path(seqFileDir,
"part-*"), PathType.GLOB, conf)) {
String fmtStr = value.asFormatString(dictionary);
if (subString > 0 && fmtStr.length() > subString) {
writer.write(':');
@@ -167,11 +169,24 @@ public final class ClusterDumper extends
List<WeightedVectorWritable> points =
clusterIdToPoints.get(value.getId());
if (points != null) {
- writer.write("\tWeight: Point:\n\t");
- for (Iterator<WeightedVectorWritable> iterator = points.iterator();
iterator.hasNext();) {
+ writer.write("\tWeight : [props - optional]: Point:\n\t");
+ for (Iterator<WeightedVectorWritable> iterator = points.iterator();
iterator.hasNext(); ) {
WeightedVectorWritable point = iterator.next();
writer.write(String.valueOf(point.getWeight()));
+ if (point instanceof WeightedPropertyVectorWritable) {
+ WeightedPropertyVectorWritable tmp =
(WeightedPropertyVectorWritable) point;
+ Map<Text, Text> map = tmp.getProperties();
+ writer.write(" : [");
+ for (Map.Entry<Text, Text> entry : map.entrySet()) {
+ writer.write(entry.getKey().toString());
+ writer.write("=");
+ writer.write(entry.getValue().toString());
+ }
+ writer.write("]");
+ }
+
writer.write(": ");
+
writer.write(AbstractCluster.formatVector(point.getVector(),
dictionary));
if (iterator.hasNext()) {
writer.write("\n\t");
@@ -236,9 +251,9 @@ public final class ClusterDumper extends
public static Map<Integer, List<WeightedVectorWritable>> readPoints(Path
pointsPathDir, Configuration conf) {
Map<Integer, List<WeightedVectorWritable>> result = new TreeMap<Integer,
List<WeightedVectorWritable>>();
- for (Pair<IntWritable,WeightedVectorWritable> record :
- new SequenceFileDirIterable<IntWritable,WeightedVectorWritable>(
- pointsPathDir, PathType.LIST, PathFilters.logsCRCFilter(), conf))
{
+ for (Pair<IntWritable, WeightedVectorWritable> record :
+ new SequenceFileDirIterable<IntWritable, WeightedVectorWritable>(
+ pointsPathDir, PathType.LIST, PathFilters.logsCRCFilter(),
conf)) {
// value is the cluster id as an int, key is the name/id of the
// vector, but that doesn't matter because we only care about printing
// it
@@ -257,6 +272,7 @@ public final class ClusterDumper extends
private static class TermIndexWeight {
private final int index;
private final double weight;
+
TermIndexWeight(int index, double weight) {
this.index = index;
this.weight = weight;