Author: tdunning
Date: Fri Apr 22 04:58:14 2011
New Revision: 1095864

URL: http://svn.apache.org/viewvc?rev=1095864&view=rev
Log:
MAHOUT-675 - Add better handling of empty term vectors in lucene conversion to 
vectors.

Modified:
    
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
    
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
    
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java

Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java?rev=1095864&r1=1095863&r2=1095864&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
 (original)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
 Fri Apr 22 04:58:14 2011
@@ -35,11 +35,16 @@ public final class LuceneIterable implem
   private final String idField;
   private final VectorMapper mapper;
   private final double normPower;
+  private final double maxPercentErrorDocs;
 
   public LuceneIterable(IndexReader reader, String idField, String field, 
VectorMapper mapper) {
     this(reader, idField, field, mapper, NO_NORMALIZING);
   }
   
+  public LuceneIterable(IndexReader indexReader, String idField, String field, 
VectorMapper mapper, double normPower) {
+    this(indexReader, idField, field, mapper, normPower, 0);
+  }
+  
   /**
    * Produce a LuceneIterable that can create the Vector plus normalize it.
    * 
@@ -49,18 +54,19 @@ public final class LuceneIterable implem
    * @param mapper {@link VectorMapper} for creating {@link Vector}s from 
Lucene's TermVectors.
    * @param normPower the normalization value. Must be nonnegative, or {@link 
#NO_NORMALIZING}
    */
-  public LuceneIterable(IndexReader indexReader, String idField, String field, 
VectorMapper mapper, double normPower) {
+  public LuceneIterable(IndexReader indexReader, String idField, String field, 
VectorMapper mapper, double normPower, double maxPercentErrorDocs) {
     this.indexReader = indexReader;
     this.idField = idField;
     this.field = field;
     this.mapper = mapper;
     this.normPower = normPower;
+    this.maxPercentErrorDocs = maxPercentErrorDocs;
   }
   
   @Override
   public Iterator<Vector> iterator() {
     try {
-      return new LuceneIterator(indexReader, idField, field, mapper, 
normPower);
+      return new LuceneIterator(indexReader, idField, field, mapper, 
normPower, maxPercentErrorDocs);
     } catch (IOException e) {
       throw new IllegalStateException(e);
     }

Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java?rev=1095864&r1=1095863&r2=1095864&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
 (original)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
 Fri Apr 22 04:58:14 2011
@@ -30,6 +30,7 @@ import org.apache.lucene.index.TermDocs;
 import org.apache.lucene.index.TermFreqVector;
 import org.apache.mahout.math.NamedVector;
 import org.apache.mahout.math.Vector;
+import org.apache.mahout.utils.Bump125;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -48,8 +49,12 @@ public final class LuceneIterator extend
   private final VectorMapper mapper;
   private final double normPower;
   private final TermDocs termDocs;
-  private int numErrorDocs;
-  private int maxErrorDocs;
+
+  private int numErrorDocs = 0;
+  private int maxErrorDocs = 0;
+  private Bump125 bump = new Bump125();
+  private long nextLogRecord = bump.increment();
+  private int skippedErrorMessages = 0;
 
   /**
    * Produce a LuceneIterable that can create the Vector plus normalize it.
@@ -65,7 +70,7 @@ public final class LuceneIterator extend
                         String field,
                         VectorMapper mapper,
                         double normPower) throws IOException {
-    this(indexReader, idField, field, mapper, normPower, 1.0);
+    this(indexReader, idField, field, mapper, normPower, 0.0);
   }
 
   /**
@@ -91,7 +96,6 @@ public final class LuceneIterator extend
     // term docs(null) is a better way of iterating all the docs in Lucene
     this.termDocs = indexReader.termDocs(null);
     this.maxErrorDocs = (int) (maxPercentErrorDocs * indexReader.numDocs());
-    this.numErrorDocs = 0;
   }
 
   @Override
@@ -104,11 +108,22 @@ public final class LuceneIterator extend
       int doc = termDocs.doc();
       TermFreqVector termFreqVector = indexReader.getTermFreqVector(doc, 
field);
       if (termFreqVector == null) {
-        if (++numErrorDocs >= maxErrorDocs) {
+        numErrorDocs++;
+        if (numErrorDocs >= maxErrorDocs) {
           log.error("There are too many documents that do not have a term 
vector for {}", field);
           throw new IllegalStateException("There are too many documents that 
do not have a term vector for " + field);
         }
-        log.warn("{} does not have a term vector for {}", 
indexReader.document(doc).get(idField), field);
+        if (numErrorDocs >= nextLogRecord) {
+          if (skippedErrorMessages == 0) {
+            log.warn("{} does not have a term vector for {}", 
indexReader.document(doc).get(idField), field);
+          } else {
+            log.warn("{} documents do not have a term vector for {}", 
numErrorDocs, field);
+          }
+          nextLogRecord = bump.increment();
+          skippedErrorMessages = 0;
+        } else {
+          skippedErrorMessages++;
+        }
         computeNext();
       }
 

Modified: 
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java?rev=1095864&r1=1095863&r2=1095864&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
 (original)
+++ 
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
 Fri Apr 22 04:58:14 2011
@@ -97,16 +97,81 @@ public final class LuceneIterableTest ex
     iterator.next();
   }
 
+  @Test
+  public void testIterable_someNoiseTermVectors() throws IOException {
+    //index the documents that have term vectors (the real vectors)
+    RAMDirectory directory = createTestIndex(Field.TermVector.YES, new 
RAMDirectory(), true, 0);
+    //append documents without term vectors (the noise)
+    createTestIndex(Field.TermVector.NO, directory, false, 5);
+      
+    IndexReader reader = IndexReader.open(directory, true);
+    Weight weight = new TFIDF();
+    TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
+    VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
+    
+    boolean exceptionThrown;
+    //0 percent tolerance
+    LuceneIterable iterable = new LuceneIterable(reader, "id", "content", 
mapper);
+    try {
+        Iterator<Vector> iterator = iterable.iterator();
+        while (iterator.hasNext()) {
+            iterator.next();
+        }
+        exceptionThrown = false;
+    }
+    catch(IllegalStateException ise) {
+        exceptionThrown = true;
+    }
+    assertTrue(exceptionThrown);
+    
+    //100 percent tolerance
+    iterable = new LuceneIterable(reader, "id", "content", mapper, -1, 1.0);
+    try {
+        Iterator<Vector> iterator = iterable.iterator();
+        while (iterator.hasNext()) {
+            iterator.next();
+        }
+        exceptionThrown = false;
+    }
+    catch(IllegalStateException ise) {
+        exceptionThrown = true;
+    }
+    assertFalse(exceptionThrown);
+    
+    //50 percent tolerance
+    iterable = new LuceneIterable(reader, "id", "content", mapper, -1, 0.5);
+    Iterator<Vector> iterator = iterable.iterator();
+    iterator.next();
+    iterator.next();
+    iterator.next();
+    iterator.next();
+    iterator.next();
+
+    try {
+        while (iterator.hasNext()) {
+            iterator.next();
+        }
+        exceptionThrown = false;
+    }
+    catch(IllegalStateException ise) {
+        exceptionThrown = true;
+    }
+    assertTrue(exceptionThrown);
+  }
+  
   private static RAMDirectory createTestIndex(Field.TermVector termVector) 
throws IOException {
-    RAMDirectory directory = new RAMDirectory();
+      return createTestIndex(termVector, new RAMDirectory(), true, 0);
+  }
+  
+  private static RAMDirectory createTestIndex(Field.TermVector termVector, 
RAMDirectory directory, boolean createNew, int startingId) throws IOException {
     IndexWriter writer = new IndexWriter(
         directory,
         new StandardAnalyzer(Version.LUCENE_30),
-        true,
+        createNew,
         IndexWriter.MaxFieldLength.UNLIMITED);
     for (int i = 0; i < LuceneIterableTest.DOCS.length; i++) {
       Document doc = new Document();
-      Fieldable id = new Field("id", "doc_" + i, Field.Store.YES, 
Field.Index.NOT_ANALYZED_NO_NORMS);
+      Fieldable id = new Field("id", "doc_" + (i + startingId), 
Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
       doc.add(id);
       //Store both position and offset information
       Fieldable text = new Field("content", DOCS[i], Field.Store.NO, 
Field.Index.ANALYZED, termVector);


Reply via email to