Re: svn commit: r1095864 - in /mahout/trunk/utils/src: main/java/org/apache/mahout/utils/vectors/lucene/ test/java/org/apache/mahout/utils/vectors/lucene/

Ted Dunning Thu, 21 Apr 2011 23:10:54 -0700

Yes. I forgot to add the new Bump125 class to the commit.

On Thu, Apr 21, 2011 at 10:21 PM, Dmitriy Lyubimov <[email protected]> wrote:


> Hm. I am getting this after this commit. Forgotten class?
>
> [INFO] Compilation failure
>
>
> \projects\mahout\utils\src\main\java\org\apache\mahout\utils\vectors\lucene\LuceneIterator.java:[33,30] cannot find symbol
> symbol  : class Bump125
> location: package org.apache.mahout.utils
>
> \projects\mahout\utils\src\main\java\org\apache\mahout\utils\vectors\lucene\LuceneIterator.java:[55,10] cannot find symbol
> symbol  : class Bump125
> location: class org.apache.mahout.utils.vectors.lucene.LuceneIterator
>
> \projects\mahout\utils\src\main\java\org\apache\mahout\utils\vectors\lucene\LuceneIterator.java:[55,29] cannot find symbol
> symbol  : class Bump125
> location: class org.apache.mahout.utils.vectors.lucene.LuceneIterator
>
> [INFO]
> ------------------------------------------------------------------------
> [INFO] For more information, run Maven with the -e switch
> [INFO]
> ------------------------------------------------------------------------
>
> On Thu, Apr 21, 2011 at 9:58 PM,  <[email protected]> wrote:
> > Author: tdunning
> > Date: Fri Apr 22 04:58:14 2011
> > New Revision: 1095864
> >
> > URL: http://svn.apache.org/viewvc?rev=1095864&view=rev
> > Log:
> > MAHOUT-675 - Add better handling of empty term vectors in lucene
> conversion to vectors.
> >
> > Modified:
> >
>  
> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
> >
>  
> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
> >
>  
> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
> >
> > Modified:
> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
> > URL:
> http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java?rev=1095864&r1=1095863&r2=1095864&view=diff
> >
> ==============================================================================
> > ---
> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
> (original)
> > +++
> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
> Fri Apr 22 04:58:14 2011
> > @@ -35,11 +35,16 @@ public final class LuceneIterable implem
> >   private final String idField;
> >   private final VectorMapper mapper;
> >   private final double normPower;
> > +  private final double maxPercentErrorDocs;
> >
> >   public LuceneIterable(IndexReader reader, String idField, String field,
> VectorMapper mapper) {
> >     this(reader, idField, field, mapper, NO_NORMALIZING);
> >   }
> >
> > +  public LuceneIterable(IndexReader indexReader, String idField, String
> field, VectorMapper mapper, double normPower) {
> > +    this(indexReader, idField, field, mapper, normPower, 0);
> > +  }
> > +
> >   /**
> >    * Produce a LuceneIterable that can create the Vector plus normalize
> it.
> >    *
> > @@ -49,18 +54,19 @@ public final class LuceneIterable implem
> >    * @param mapper {@link VectorMapper} for creating {@link Vector}s from
> Lucene's TermVectors.
> >    * @param normPower the normalization value. Must be nonnegative, or
> {@link #NO_NORMALIZING}
> >    */
> > -  public LuceneIterable(IndexReader indexReader, String idField, String
> field, VectorMapper mapper, double normPower) {
> > +  public LuceneIterable(IndexReader indexReader, String idField, String
> field, VectorMapper mapper, double normPower, double maxPercentErrorDocs) {
> >     this.indexReader = indexReader;
> >     this.idField = idField;
> >     this.field = field;
> >     this.mapper = mapper;
> >     this.normPower = normPower;
> > +    this.maxPercentErrorDocs = maxPercentErrorDocs;
> >   }
> >
> >   @Override
> >   public Iterator<Vector> iterator() {
> >     try {
> > -      return new LuceneIterator(indexReader, idField, field, mapper,
> normPower);
> > +      return new LuceneIterator(indexReader, idField, field, mapper,
> normPower, maxPercentErrorDocs);
> >     } catch (IOException e) {
> >       throw new IllegalStateException(e);
> >     }
> >
> > Modified:
> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
> > URL:
> http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java?rev=1095864&r1=1095863&r2=1095864&view=diff
> >
> ==============================================================================
> > ---
> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
> (original)
> > +++
> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
> Fri Apr 22 04:58:14 2011
> > @@ -30,6 +30,7 @@ import org.apache.lucene.index.TermDocs;
> >  import org.apache.lucene.index.TermFreqVector;
> >  import org.apache.mahout.math.NamedVector;
> >  import org.apache.mahout.math.Vector;
> > +import org.apache.mahout.utils.Bump125;
> >  import org.slf4j.Logger;
> >  import org.slf4j.LoggerFactory;
> >
> > @@ -48,8 +49,12 @@ public final class LuceneIterator extend
> >   private final VectorMapper mapper;
> >   private final double normPower;
> >   private final TermDocs termDocs;
> > -  private int numErrorDocs;
> > -  private int maxErrorDocs;
> > +
> > +  private int numErrorDocs = 0;
> > +  private int maxErrorDocs = 0;
> > +  private Bump125 bump = new Bump125();
> > +  private long nextLogRecord = bump.increment();
> > +  private int skippedErrorMessages = 0;
> >
> >   /**
> >    * Produce a LuceneIterable that can create the Vector plus normalize
> it.
> > @@ -65,7 +70,7 @@ public final class LuceneIterator extend
> >                         String field,
> >                         VectorMapper mapper,
> >                         double normPower) throws IOException {
> > -    this(indexReader, idField, field, mapper, normPower, 1.0);
> > +    this(indexReader, idField, field, mapper, normPower, 0.0);
> >   }
> >
> >   /**
> > @@ -91,7 +96,6 @@ public final class LuceneIterator extend
> >     // term docs(null) is a better way of iterating all the docs in
> Lucene
> >     this.termDocs = indexReader.termDocs(null);
> >     this.maxErrorDocs = (int) (maxPercentErrorDocs *
> indexReader.numDocs());
> > -    this.numErrorDocs = 0;
> >   }
> >
> >   @Override
> > @@ -104,11 +108,22 @@ public final class LuceneIterator extend
> >       int doc = termDocs.doc();
> >       TermFreqVector termFreqVector = indexReader.getTermFreqVector(doc,
> field);
> >       if (termFreqVector == null) {
> > -        if (++numErrorDocs >= maxErrorDocs) {
> > +        numErrorDocs++;
> > +        if (numErrorDocs >= maxErrorDocs) {
> >           log.error("There are too many documents that do not have a term
> vector for {}", field);
> >           throw new IllegalStateException("There are too many documents
> that do not have a term vector for " + field);
> >         }
> > -        log.warn("{} does not have a term vector for {}",
> indexReader.document(doc).get(idField), field);
> > +        if (numErrorDocs >= nextLogRecord) {
> > +          if (skippedErrorMessages == 0) {
> > +            log.warn("{} does not have a term vector for {}",
> indexReader.document(doc).get(idField), field);
> > +          } else {
> > +            log.warn("{} documents do not have a term vector for {}",
> numErrorDocs, field);
> > +          }
> > +          nextLogRecord = bump.increment();
> > +          skippedErrorMessages = 0;
> > +        } else {
> > +          skippedErrorMessages++;
> > +        }
> >         computeNext();
> >       }
> >
> >
> > Modified:
> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
> > URL:
> http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java?rev=1095864&r1=1095863&r2=1095864&view=diff
> >
> ==============================================================================
> > ---
> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
> (original)
> > +++
> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
> Fri Apr 22 04:58:14 2011
> > @@ -97,16 +97,81 @@ public final class LuceneIterableTest ex
> >     iterator.next();
> >   }
> >
> > +  @Test
> > +  public void testIterable_someNoiseTermVectors() throws IOException {
> > +    //get noise vectors
> > +    RAMDirectory directory = createTestIndex(Field.TermVector.YES, new
> RAMDirectory(), true, 0);
> > +    //get real vectors
> > +    createTestIndex(Field.TermVector.NO, directory, false, 5);
> > +
> > +    IndexReader reader = IndexReader.open(directory, true);
> > +    Weight weight = new TFIDF();
> > +    TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
> > +    VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
> > +
> > +    boolean exceptionThrown;
> > +    //0 percent tolerance
> > +    LuceneIterable iterable = new LuceneIterable(reader, "id",
> "content", mapper);
> > +    try {
> > +        Iterator<Vector> iterator = iterable.iterator();
> > +        while (iterator.hasNext()) {
> > +            iterator.next();
> > +        }
> > +        exceptionThrown = false;
> > +    }
> > +    catch(IllegalStateException ise) {
> > +        exceptionThrown = true;
> > +    }
> > +    assertTrue(exceptionThrown);
> > +
> > +    //100 percent tolerance
> > +    iterable = new LuceneIterable(reader, "id", "content", mapper, -1,
> 1.0);
> > +    try {
> > +        Iterator<Vector> iterator = iterable.iterator();
> > +        while (iterator.hasNext()) {
> > +            iterator.next();
> > +        }
> > +        exceptionThrown = false;
> > +    }
> > +    catch(IllegalStateException ise) {
> > +        exceptionThrown = true;
> > +    }
> > +    assertFalse(exceptionThrown);
> > +
> > +    //50 percent tolerance
> > +    iterable = new LuceneIterable(reader, "id", "content", mapper, -1,
> 0.5);
> > +    Iterator<Vector> iterator = iterable.iterator();
> > +    iterator.next();
> > +    iterator.next();
> > +    iterator.next();
> > +    iterator.next();
> > +    iterator.next();
> > +
> > +    try {
> > +        while (iterator.hasNext()) {
> > +            iterator.next();
> > +        }
> > +        exceptionThrown = false;
> > +    }
> > +    catch(IllegalStateException ise) {
> > +        exceptionThrown = true;
> > +    }
> > +    assertTrue(exceptionThrown);
> > +  }
> > +
> >   private static RAMDirectory createTestIndex(Field.TermVector
> termVector) throws IOException {
> > -    RAMDirectory directory = new RAMDirectory();
> > +      return createTestIndex(termVector, new RAMDirectory(), true, 0);
> > +  }
> > +
> > +  private static RAMDirectory createTestIndex(Field.TermVector
> termVector, RAMDirectory directory, boolean createNew, int startingId)
> throws IOException {
> >     IndexWriter writer = new IndexWriter(
> >         directory,
> >         new StandardAnalyzer(Version.LUCENE_30),
> > -        true,
> > +        createNew,
> >         IndexWriter.MaxFieldLength.UNLIMITED);
> >     for (int i = 0; i < LuceneIterableTest.DOCS.length; i++) {
> >       Document doc = new Document();
> > -      Fieldable id = new Field("id", "doc_" + i, Field.Store.YES,
> Field.Index.NOT_ANALYZED_NO_NORMS);
> > +      Fieldable id = new Field("id", "doc_" + (i + startingId),
> Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
> >       doc.add(id);
> >       //Store both position and offset information
> >       Fieldable text = new Field("content", DOCS[i], Field.Store.NO,
> Field.Index.ANALYZED, termVector);
> >
> >
> >
>

Re: svn commit: r1095864 - in /mahout/trunk/utils/src: main/java/org/apache/mahout/utils/vectors/lucene/ test/java/org/apache/mahout/utils/vectors/lucene/

Reply via email to