Author: tdunning
Date: Fri Apr 22 04:58:14 2011
New Revision: 1095864
URL: http://svn.apache.org/viewvc?rev=1095864&view=rev
Log:
MAHOUT-675 - Add better handling of empty term vectors in Lucene conversion to
vectors.
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java?rev=1095864&r1=1095863&r2=1095864&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
Fri Apr 22 04:58:14 2011
@@ -35,11 +35,16 @@ public final class LuceneIterable implem
private final String idField;
private final VectorMapper mapper;
private final double normPower;
+ private final double maxPercentErrorDocs;
public LuceneIterable(IndexReader reader, String idField, String field,
VectorMapper mapper) {
this(reader, idField, field, mapper, NO_NORMALIZING);
}
+ public LuceneIterable(IndexReader indexReader, String idField, String field,
VectorMapper mapper, double normPower) {
+ this(indexReader, idField, field, mapper, normPower, 0);
+ }
+
/**
* Produce a LuceneIterable that can create the Vector plus normalize it.
*
@@ -49,18 +54,19 @@ public final class LuceneIterable implem
* @param mapper {@link VectorMapper} for creating {@link Vector}s from
Lucene's TermVectors.
* @param normPower the normalization value. Must be nonnegative, or {@link
#NO_NORMALIZING}
*/
- public LuceneIterable(IndexReader indexReader, String idField, String field,
VectorMapper mapper, double normPower) {
+ public LuceneIterable(IndexReader indexReader, String idField, String field,
VectorMapper mapper, double normPower, double maxPercentErrorDocs) {
this.indexReader = indexReader;
this.idField = idField;
this.field = field;
this.mapper = mapper;
this.normPower = normPower;
+ this.maxPercentErrorDocs = maxPercentErrorDocs;
}
@Override
public Iterator<Vector> iterator() {
try {
- return new LuceneIterator(indexReader, idField, field, mapper,
normPower);
+ return new LuceneIterator(indexReader, idField, field, mapper,
normPower, maxPercentErrorDocs);
} catch (IOException e) {
throw new IllegalStateException(e);
}
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java?rev=1095864&r1=1095863&r2=1095864&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
Fri Apr 22 04:58:14 2011
@@ -30,6 +30,7 @@ import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermFreqVector;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;
+import org.apache.mahout.utils.Bump125;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -48,8 +49,12 @@ public final class LuceneIterator extend
private final VectorMapper mapper;
private final double normPower;
private final TermDocs termDocs;
- private int numErrorDocs;
- private int maxErrorDocs;
+
+ private int numErrorDocs = 0;
+ private int maxErrorDocs = 0;
+ private Bump125 bump = new Bump125();
+ private long nextLogRecord = bump.increment();
+ private int skippedErrorMessages = 0;
/**
* Produce a LuceneIterable that can create the Vector plus normalize it.
@@ -65,7 +70,7 @@ public final class LuceneIterator extend
String field,
VectorMapper mapper,
double normPower) throws IOException {
- this(indexReader, idField, field, mapper, normPower, 1.0);
+ this(indexReader, idField, field, mapper, normPower, 0.0);
}
/**
@@ -91,7 +96,6 @@ public final class LuceneIterator extend
// term docs(null) is a better way of iterating all the docs in Lucene
this.termDocs = indexReader.termDocs(null);
this.maxErrorDocs = (int) (maxPercentErrorDocs * indexReader.numDocs());
- this.numErrorDocs = 0;
}
@Override
@@ -104,11 +108,22 @@ public final class LuceneIterator extend
int doc = termDocs.doc();
TermFreqVector termFreqVector = indexReader.getTermFreqVector(doc,
field);
if (termFreqVector == null) {
- if (++numErrorDocs >= maxErrorDocs) {
+ numErrorDocs++;
+ if (numErrorDocs >= maxErrorDocs) {
log.error("There are too many documents that do not have a term
vector for {}", field);
throw new IllegalStateException("There are too many documents that
do not have a term vector for " + field);
}
- log.warn("{} does not have a term vector for {}",
indexReader.document(doc).get(idField), field);
+ if (numErrorDocs >= nextLogRecord) {
+ if (skippedErrorMessages == 0) {
+ log.warn("{} does not have a term vector for {}",
indexReader.document(doc).get(idField), field);
+ } else {
+ log.warn("{} documents do not have a term vector for {}",
numErrorDocs, field);
+ }
+ nextLogRecord = bump.increment();
+ skippedErrorMessages = 0;
+ } else {
+ skippedErrorMessages++;
+ }
computeNext();
}
Modified:
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java?rev=1095864&r1=1095863&r2=1095864&view=diff
==============================================================================
---
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
(original)
+++
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
Fri Apr 22 04:58:14 2011
@@ -97,16 +97,81 @@ public final class LuceneIterableTest ex
iterator.next();
}
+ @Test
+ public void testIterable_someNoiseTermVectors() throws IOException {
+ //get real vectors (documents indexed with term vectors)
+ RAMDirectory directory = createTestIndex(Field.TermVector.YES, new
RAMDirectory(), true, 0);
+ //get noise vectors (documents indexed without term vectors)
+ createTestIndex(Field.TermVector.NO, directory, false, 5);
+
+ IndexReader reader = IndexReader.open(directory, true);
+ Weight weight = new TFIDF();
+ TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
+ VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
+
+ boolean exceptionThrown;
+ //0 percent tolerance
+ LuceneIterable iterable = new LuceneIterable(reader, "id", "content",
mapper);
+ try {
+ Iterator<Vector> iterator = iterable.iterator();
+ while (iterator.hasNext()) {
+ iterator.next();
+ }
+ exceptionThrown = false;
+ }
+ catch(IllegalStateException ise) {
+ exceptionThrown = true;
+ }
+ assertTrue(exceptionThrown);
+
+ //100 percent tolerance
+ iterable = new LuceneIterable(reader, "id", "content", mapper, -1, 1.0);
+ try {
+ Iterator<Vector> iterator = iterable.iterator();
+ while (iterator.hasNext()) {
+ iterator.next();
+ }
+ exceptionThrown = false;
+ }
+ catch(IllegalStateException ise) {
+ exceptionThrown = true;
+ }
+ assertFalse(exceptionThrown);
+
+ //50 percent tolerance
+ iterable = new LuceneIterable(reader, "id", "content", mapper, -1, 0.5);
+ Iterator<Vector> iterator = iterable.iterator();
+ iterator.next();
+ iterator.next();
+ iterator.next();
+ iterator.next();
+ iterator.next();
+
+ try {
+ while (iterator.hasNext()) {
+ iterator.next();
+ }
+ exceptionThrown = false;
+ }
+ catch(IllegalStateException ise) {
+ exceptionThrown = true;
+ }
+ assertTrue(exceptionThrown);
+ }
+
private static RAMDirectory createTestIndex(Field.TermVector termVector)
throws IOException {
- RAMDirectory directory = new RAMDirectory();
+ return createTestIndex(termVector, new RAMDirectory(), true, 0);
+ }
+
+ private static RAMDirectory createTestIndex(Field.TermVector termVector,
RAMDirectory directory, boolean createNew, int startingId) throws IOException {
IndexWriter writer = new IndexWriter(
directory,
new StandardAnalyzer(Version.LUCENE_30),
- true,
+ createNew,
IndexWriter.MaxFieldLength.UNLIMITED);
for (int i = 0; i < LuceneIterableTest.DOCS.length; i++) {
Document doc = new Document();
- Fieldable id = new Field("id", "doc_" + i, Field.Store.YES,
Field.Index.NOT_ANALYZED_NO_NORMS);
+ Fieldable id = new Field("id", "doc_" + (i + startingId),
Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
doc.add(id);
//Store both position and offset information
Fieldable text = new Field("content", DOCS[i], Field.Store.NO,
Field.Index.ANALYZED, termVector);