Author: ssc
Date: Fri Sep 9 11:49:49 2011
New Revision: 1167115
URL: http://svn.apache.org/viewvc?rev=1167115&view=rev
Log:
MAHOUT-767 Improve RowSimilarityJob performance, threshold integration
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/MostSimilarItemPairsMapper.java
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/MostSimilarItemPairsReducer.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java?rev=1167115&r1=1167114&r2=1167115&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
Fri Sep 9 11:49:49 2011
@@ -62,9 +62,9 @@ import java.util.regex.Pattern;
* <p>Command line arguments specific to this class are:</p>
*
* <ol>
- * <li>-Dmapred.input.dir=(path): Directory containing one or more text files
with the preference data</li>
- * <li>-Dmapred.output.dir=(path): output path where recommender output should
go</li>
- * <li>--similarityClassname (classname): Name of distributed similarity class
to instantiate or a predefined similarity
+ * <li>--input (path): Directory containing one or more text files with the
preference data</li>
+ * <li>--output (path): output path where recommender output should go</li>
+ * <li>--similarityClassname (classname): Name of vector similarity class to
instantiate or a predefined similarity
* from {@link
org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasure}</li>
* <li>--usersFile (path): only compute recommendations for user IDs contained
in this file (optional)</li>
* <li>--itemsFile (path): only include item IDs from this file in the
recommendations (optional)</li>
@@ -72,10 +72,12 @@ import java.util.regex.Pattern;
* recommendations for that user (optional)</li>
* <li>--numRecommendations (integer): Number of recommendations to compute
per user (10)</li>
* <li>--booleanData (boolean): Treat input data as having no pref values
(false)</li>
- * <li>--maxPrefsPerUser (integer): Maximum number of preferences considered
per user in
- * final recommendation phase (10)</li>
+ * <li>--maxPrefsPerUser (integer): Maximum number of preferences considered
per user in final recommendation phase (10)</li>
* <li>--maxSimilaritiesPerItem (integer): Maximum number of similarities
considered per item (100)</li>
- * <li>--maxCooccurrencesPerItem (integer): Maximum number of cooccurrences
considered per item (100)</li>
+ * <li>--minPrefsPerUser (integer): ignore users with fewer preferences than
this in the similarity computation (1)</li>
+ * <li>--maxPrefsPerUserInItemSimilarity (integer): max number of preferences
to consider per user in the item similarity computation phase,
+ * users with more preferences will be sampled down (1000)</li>
+ * <li>--threshold (double): discard item pairs with a similarity value below
this (optional)</li>
* </ol>
*
* <p>General command line options are documented in {@link AbstractJob}.</p>
@@ -115,6 +117,7 @@ public final class RecommenderJob extend
DEFAULT_MAX_PREFS_PER_USER + ")",
String.valueOf(DEFAULT_MAX_PREFS_PER_USER));
addOption("similarityClassname", "s", "Name of distributed similarity
measures class to instantiate, " +
"alternatively use one of the predefined similarities (" +
VectorSimilarityMeasures.list() + ')');
+ addOption("threshold", "tr", "discard item pairs with a similarity value
below this", false);
Map<String,String> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
@@ -132,6 +135,9 @@ public final class RecommenderJob extend
int maxPrefsPerUserInItemSimilarity =
Integer.parseInt(parsedArgs.get("--maxPrefsPerUserInItemSimilarity"));
int maxSimilaritiesPerItem =
Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
String similarityClassname = parsedArgs.get("--similarityClassname");
+ double threshold = parsedArgs.containsKey("--threshold") ?
+ Double.parseDouble(parsedArgs.get("--threshold")) :
RowSimilarityJob.NO_THRESHOLD;
+
Path prepPath = getTempPath("preparePreferenceMatrix");
Path similarityMatrixPath = getTempPath("similarityMatrix");
@@ -172,7 +178,9 @@ public final class RecommenderJob extend
"--output", similarityMatrixPath.toString(),
"--numberOfColumns", String.valueOf(numberOfUsers),
"--similarityClassname", similarityClassname,
- "--maxSimilaritiesPerRow", String.valueOf(maxSimilaritiesPerItem + 1),
+ "--maxSimilaritiesPerRow", String.valueOf(maxSimilaritiesPerItem),
+ "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE),
+ "--threshold", String.valueOf(threshold),
"--tempDir", getTempPath().toString() });
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java?rev=1167115&r1=1167114&r2=1167115&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
Fri Sep 9 11:49:49 2011
@@ -17,23 +17,33 @@
package org.apache.mahout.cf.taste.hadoop.similarity.item;
+import java.io.IOException;
+import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
+import com.google.common.base.Preconditions;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.cf.taste.common.TopK;
import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable;
import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
import
org.apache.mahout.cf.taste.hadoop.preparation.PreparePreferenceMatrixJob;
import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.hadoop.similarity.cooccurrence.RowSimilarityJob;
import
org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasures;
+import org.apache.mahout.math.map.OpenIntLongHashMap;
/**
* <p>Distributed precomputation of the item-item-similarities for Itembased
Collaborative Filtering</p>
@@ -95,6 +105,7 @@ public final class ItemSimilarityJob ext
addOption("minPrefsPerUser", "mp", "ignore users with less preferences
than this "
+ "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')',
String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
addOption("booleanData", "b", "Treat input as without pref values",
String.valueOf(Boolean.FALSE));
+ addOption("threshold", "tr", "discard item pairs with a similarity value
below this", false);
Map<String,String> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
@@ -107,50 +118,102 @@ public final class ItemSimilarityJob ext
int minPrefsPerUser =
Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));
+ double threshold = parsedArgs.containsKey("--threshold") ?
+ Double.parseDouble(parsedArgs.get("--threshold")) :
RowSimilarityJob.NO_THRESHOLD;
+
Path similarityMatrixPath = getTempPath("similarityMatrix");
Path prepPath = getTempPath("prepareRatingMatrix");
AtomicInteger currentPhase = new AtomicInteger();
- ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(), new String[]{
- "--input", getInputPath().toString(),
- "--output", prepPath.toString(),
- "--maxPrefsPerUser", String.valueOf(maxPrefsPerUser),
- "--minPrefsPerUser", String.valueOf(minPrefsPerUser),
- "--booleanData", String.valueOf(booleanData),
- "--tempDir", getTempPath().toString()});
-
- int numberOfUsers = TasteHadoopUtils.readInt(new Path(prepPath,
PreparePreferenceMatrixJob.NUM_USERS), getConf());
-
- /* Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor
this call to something like
- * new DistributedRowMatrix(...).rowSimilarity(...) */
- ToolRunner.run(getConf(), new RowSimilarityJob(), new String[] {
- "--input", new Path(prepPath,
PreparePreferenceMatrixJob.RATING_MATRIX).toString(),
- "--output", similarityMatrixPath.toString(),
- "--numberOfColumns", String.valueOf(numberOfUsers),
- "--similarityClassname", similarityClassName,
- "--maxSimilaritiesPerRow", String.valueOf(maxSimilarItemsPerItem + 1),
- "--tempDir", getTempPath().toString() });
+ if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+ ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(), new String[]{
+ "--input", getInputPath().toString(),
+ "--output", prepPath.toString(),
+ "--maxPrefsPerUser", String.valueOf(maxPrefsPerUser),
+ "--minPrefsPerUser", String.valueOf(minPrefsPerUser),
+ "--booleanData", String.valueOf(booleanData),
+ "--tempDir", getTempPath().toString() });
+ }
if (shouldRunNextPhase(parsedArgs, currentPhase)) {
- Job mostSimilarItems = prepareJob(similarityMatrixPath,
- getOutputPath(),
- SequenceFileInputFormat.class,
- MostSimilarItemPairsMapper.class,
- EntityEntityWritable.class,
- DoubleWritable.class,
- MostSimilarItemPairsReducer.class,
- EntityEntityWritable.class,
- DoubleWritable.class,
- TextOutputFormat.class);
+ int numberOfUsers = TasteHadoopUtils.readInt(new Path(prepPath,
PreparePreferenceMatrixJob.NUM_USERS),
+ getConf());
+
+ ToolRunner.run(getConf(), new RowSimilarityJob(), new String[] {
+ "--input", new Path(prepPath,
PreparePreferenceMatrixJob.RATING_MATRIX).toString(),
+ "--output", similarityMatrixPath.toString(),
+ "--numberOfColumns", String.valueOf(numberOfUsers),
+ "--similarityClassname", similarityClassName,
+ "--maxSimilaritiesPerRow", String.valueOf(maxSimilarItemsPerItem),
+ "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE),
+ "--threshold", String.valueOf(threshold),
+ "--tempDir", getTempPath().toString() });
+ }
+
+ if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+ Job mostSimilarItems = prepareJob(similarityMatrixPath, getOutputPath(),
SequenceFileInputFormat.class,
+ MostSimilarItemPairsMapper.class, EntityEntityWritable.class,
DoubleWritable.class,
+ MostSimilarItemPairsReducer.class, EntityEntityWritable.class,
DoubleWritable.class, TextOutputFormat.class);
Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR,
new Path(prepPath,
PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM,
maxSimilarItemsPerItem);
- mostSimilarItems.setCombinerClass(MostSimilarItemPairsReducer.class);
mostSimilarItems.waitForCompletion(true);
}
return 0;
}
+
+ public static class MostSimilarItemPairsMapper
+ extends
Mapper<IntWritable,VectorWritable,EntityEntityWritable,DoubleWritable> {
+
+ private OpenIntLongHashMap indexItemIDMap;
+ private int maxSimilarItemsPerItem;
+
+ @Override
+ protected void setup(Context ctx) {
+ Configuration conf = ctx.getConfiguration();
+ maxSimilarItemsPerItem =
conf.getInt(ItemSimilarityJob.MAX_SIMILARITIES_PER_ITEM, -1);
+ indexItemIDMap =
TasteHadoopUtils.readItemIDIndexMap(conf.get(ItemSimilarityJob.ITEM_ID_INDEX_PATH_STR),
conf);
+
+ Preconditions.checkArgument(maxSimilarItemsPerItem > 0,
"maxSimilarItemsPerItem was not correctly set!");
+ }
+
+ @Override
+ protected void map(IntWritable itemIDIndexWritable, VectorWritable
similarityVector, Context ctx)
+ throws IOException, InterruptedException {
+
+ int itemIDIndex = itemIDIndexWritable.get();
+
+ TopK<SimilarItem> topKMostSimilarItems =
+ new TopK<SimilarItem>(maxSimilarItemsPerItem,
SimilarItem.COMPARE_BY_SIMILARITY);
+
+ Iterator<Vector.Element> similarityVectorIterator =
similarityVector.get().iterateNonZero();
+
+ while (similarityVectorIterator.hasNext()) {
+ Vector.Element element = similarityVectorIterator.next();
+ topKMostSimilarItems.offer(new
SimilarItem(indexItemIDMap.get(element.index()), element.get()));
+ }
+
+ long itemID = indexItemIDMap.get(itemIDIndex);
+ for (SimilarItem similarItem : topKMostSimilarItems.retrieve()) {
+ long otherItemID = similarItem.getItemID();
+ if (itemID < otherItemID) {
+ ctx.write(new EntityEntityWritable(itemID, otherItemID), new
DoubleWritable(similarItem.getSimilarity()));
+ } else {
+ ctx.write(new EntityEntityWritable(otherItemID, itemID), new
DoubleWritable(similarItem.getSimilarity()));
+ }
+ }
+ }
+ }
+
+ static class MostSimilarItemPairsReducer
+ extends
Reducer<EntityEntityWritable,DoubleWritable,EntityEntityWritable,DoubleWritable>
{
+ @Override
+ protected void reduce(EntityEntityWritable pair, Iterable<DoubleWritable>
values, Context ctx)
+ throws IOException, InterruptedException {
+ ctx.write(pair, values.iterator().next());
+ }
+ }
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java?rev=1167115&r1=1167114&r2=1167115&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java
Fri Sep 9 11:49:49 2011
@@ -35,7 +35,6 @@ import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import
org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasures;
import
org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasure;
-import org.apache.mahout.math.map.OpenIntDoubleHashMap;
import org.apache.mahout.math.map.OpenIntIntHashMap;
import java.io.IOException;
@@ -47,18 +46,19 @@ import java.util.concurrent.atomic.Atomi
public class RowSimilarityJob extends AbstractJob {
+ public static final double NO_THRESHOLD = Double.MIN_VALUE;
+
static final String SIMILARITY_CLASSNAME = RowSimilarityJob.class +
".distributedSimilarityClassname";
static final String NUMBER_OF_COLUMNS = RowSimilarityJob.class +
".numberOfColumns";
static final String MAX_SIMILARITIES_PER_ROW = RowSimilarityJob.class +
".maxSimilaritiesPerRow";
static final String EXCLUDE_SELF_SIMILARITY = RowSimilarityJob.class +
".excludeSelfSimilarity";
- static final String THRESHOLD = RowSimilarityJob.class + ".threshold";
+ static final String THRESHOLD = RowSimilarityJob.class + ".threshold";
static final String NORMS_PATH = RowSimilarityJob.class + ".normsPath";
static final String MAXVALUES_PATH = RowSimilarityJob.class +
".maxWeightsPath";
- static final String NUM_NON_ZERO_ENTRIES_PATH = RowSimilarityJob.class +
".nonZeroEntriesPath";
+ static final String NUM_NON_ZERO_ENTRIES_PATH = RowSimilarityJob.class +
".nonZeroEntriesPath";
private static final int DEFAULT_MAX_SIMILARITIES_PER_ROW = 100;
- private static final double NO_THRESHOLD = Double.MIN_VALUE;
private static final int NORM_VECTOR_MARKER = Integer.MIN_VALUE;
private static final int MAXVALUE_VECTOR_MARKER = Integer.MIN_VALUE + 1;
@@ -70,6 +70,7 @@ public class RowSimilarityJob extends Ab
ToolRunner.run(new RowSimilarityJob(), args);
}
+
@Override
public int run(String[] args) throws Exception {
@@ -81,7 +82,7 @@ public class RowSimilarityJob extends Ab
addOption("maxSimilaritiesPerRow", "m", "Number of maximum similarities
per row (default: "
+ DEFAULT_MAX_SIMILARITIES_PER_ROW + ')',
String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW));
addOption("excludeSelfSimilarity", "ess", "compute similarity of rows to
themselves?", String.valueOf(false));
- addOption("threshold", "tr", "drop row pairs with a similarity value below
this");
+ addOption("threshold", "tr", "discard row pairs with a similarity value
below this", false);
Map<String,String> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
Modified:
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java?rev=1167115&r1=1167114&r2=1167115&view=diff
==============================================================================
---
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java
(original)
+++
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java
Fri Sep 9 11:49:49 2011
@@ -47,7 +47,7 @@ import org.junit.Test;
public final class ItemSimilarityJobTest extends TasteTestCase {
/**
- * Tests {@link MostSimilarItemPairsMapper}
+ * Tests {@link ItemSimilarityJob.MostSimilarItemPairsMapper}
*/
@Test
public void testMostSimilarItemsPairsMapper() throws Exception {
@@ -66,10 +66,9 @@ public final class ItemSimilarityJobTest
Vector vector = new RandomAccessSparseVector(Integer.MAX_VALUE);
vector.set(12, 0.2);
- vector.set(34, 1.0);
vector.set(56, 0.9);
- MostSimilarItemPairsMapper mapper = new MostSimilarItemPairsMapper();
+ ItemSimilarityJob.MostSimilarItemPairsMapper mapper = new
ItemSimilarityJob.MostSimilarItemPairsMapper();
setField(mapper, "indexItemIDMap", indexItemIDMap);
setField(mapper, "maxSimilarItemsPerItem", 1);
@@ -79,7 +78,7 @@ public final class ItemSimilarityJobTest
}
/**
- * Tests {@link MostSimilarItemPairsReducer}
+ * Tests {@link ItemSimilarityJob.MostSimilarItemPairsReducer}
*/
@Test
public void testMostSimilarItemPairsReducer() throws Exception {
@@ -90,7 +89,7 @@ public final class ItemSimilarityJobTest
EasyMock.replay(context);
- new MostSimilarItemPairsReducer().reduce(new EntityEntityWritable(123L,
456L),
+ new ItemSimilarityJob.MostSimilarItemPairsReducer().reduce(new
EntityEntityWritable(123L, 456L),
Arrays.asList(new DoubleWritable(0.5), new DoubleWritable(0.5)),
context);
EasyMock.verify(context);