Yes. I forgot to add the new Bump125 class to the commit. On Thu, Apr 21, 2011 at 10:21 PM, Dmitriy Lyubimov <[email protected]> wrote:
> Hm. I am getting this after this commit. Forgotten class? > > [INFO] Compilation failure > > > \projects\mahout\utils\src\main\java\org\apache\mahout\utils\vectors\lucene\Luce > neIterator.java:[33,30] cannot find symbol > symbol : class Bump125 > location: package org.apache.mahout.utils > > \projects\mahout\utils\src\main\java\org\apache\mahout\utils\vectors\lucene\Luce > neIterator.java:[55,10] cannot find symbol > symbol : class Bump125 > location: class org.apache.mahout.utils.vectors.lucene.LuceneIterator > > \projects\mahout\utils\src\main\java\org\apache\mahout\utils\vectors\lucene\Luce > neIterator.java:[55,29] cannot find symbol > symbol : class Bump125 > location: class org.apache.mahout.utils.vectors.lucene.LuceneIterator > > [INFO] > ------------------------------------------------------------------------ > [INFO] For more information, run Maven with the -e switch > [INFO] > ------------------------------------------------------------------------ > > On Thu, Apr 21, 2011 at 9:58 PM, <[email protected]> wrote: > > Author: tdunning > > Date: Fri Apr 22 04:58:14 2011 > > New Revision: 1095864 > > > > URL: http://svn.apache.org/viewvc?rev=1095864&view=rev > > Log: > > MAHOUT-675 - Add better handling of empty term vectors in lucene > conversion to vectors. 
> > > > Modified: > > > > mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java > > > > mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java > > > > mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java > > > > Modified: > mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java > > URL: > http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java?rev=1095864&r1=1095863&r2=1095864&view=diff > > > ============================================================================== > > --- > mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java > (original) > > +++ > mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java > Fri Apr 22 04:58:14 2011 > > @@ -35,11 +35,16 @@ public final class LuceneIterable implem > > private final String idField; > > private final VectorMapper mapper; > > private final double normPower; > > + private final double maxPercentErrorDocs; > > > > public LuceneIterable(IndexReader reader, String idField, String field, > VectorMapper mapper) { > > this(reader, idField, field, mapper, NO_NORMALIZING); > > } > > > > + public LuceneIterable(IndexReader indexReader, String idField, String > field, VectorMapper mapper, double normPower) { > > + this(indexReader, idField, field, mapper, normPower, 0); > > + } > > + > > /** > > * Produce a LuceneIterable that can create the Vector plus normalize > it. > > * > > @@ -49,18 +54,19 @@ public final class LuceneIterable implem > > * @param mapper {@link VectorMapper} for creating {@link Vector}s from > Lucene's TermVectors. > > * @param normPower the normalization value. 
Must be nonnegative, or > {@link #NO_NORMALIZING} > > */ > > - public LuceneIterable(IndexReader indexReader, String idField, String > field, VectorMapper mapper, double normPower) { > > + public LuceneIterable(IndexReader indexReader, String idField, String > field, VectorMapper mapper, double normPower, double maxPercentErrorDocs) { > > this.indexReader = indexReader; > > this.idField = idField; > > this.field = field; > > this.mapper = mapper; > > this.normPower = normPower; > > + this.maxPercentErrorDocs = maxPercentErrorDocs; > > } > > > > @Override > > public Iterator<Vector> iterator() { > > try { > > - return new LuceneIterator(indexReader, idField, field, mapper, > normPower); > > + return new LuceneIterator(indexReader, idField, field, mapper, > normPower, maxPercentErrorDocs); > > } catch (IOException e) { > > throw new IllegalStateException(e); > > } > > > > Modified: > mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java > > URL: > http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java?rev=1095864&r1=1095863&r2=1095864&view=diff > > > ============================================================================== > > --- > mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java > (original) > > +++ > mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java > Fri Apr 22 04:58:14 2011 > > @@ -30,6 +30,7 @@ import org.apache.lucene.index.TermDocs; > > import org.apache.lucene.index.TermFreqVector; > > import org.apache.mahout.math.NamedVector; > > import org.apache.mahout.math.Vector; > > +import org.apache.mahout.utils.Bump125; > > import org.slf4j.Logger; > > import org.slf4j.LoggerFactory; > > > > @@ -48,8 +49,12 @@ public final class LuceneIterator extend > > private final VectorMapper mapper; > > private final double normPower; > > private final TermDocs termDocs; > > - private 
int numErrorDocs; > > - private int maxErrorDocs; > > + > > + private int numErrorDocs = 0; > > + private int maxErrorDocs = 0; > > + private Bump125 bump = new Bump125(); > > + private long nextLogRecord = bump.increment(); > > + private int skippedErrorMessages = 0; > > > > /** > > * Produce a LuceneIterable that can create the Vector plus normalize > it. > > @@ -65,7 +70,7 @@ public final class LuceneIterator extend > > String field, > > VectorMapper mapper, > > double normPower) throws IOException { > > - this(indexReader, idField, field, mapper, normPower, 1.0); > > + this(indexReader, idField, field, mapper, normPower, 0.0); > > } > > > > /** > > @@ -91,7 +96,6 @@ public final class LuceneIterator extend > > // term docs(null) is a better way of iterating all the docs in > Lucene > > this.termDocs = indexReader.termDocs(null); > > this.maxErrorDocs = (int) (maxPercentErrorDocs * > indexReader.numDocs()); > > - this.numErrorDocs = 0; > > } > > > > @Override > > @@ -104,11 +108,22 @@ public final class LuceneIterator extend > > int doc = termDocs.doc(); > > TermFreqVector termFreqVector = indexReader.getTermFreqVector(doc, > field); > > if (termFreqVector == null) { > > - if (++numErrorDocs >= maxErrorDocs) { > > + numErrorDocs++; > > + if (numErrorDocs >= maxErrorDocs) { > > log.error("There are too many documents that do not have a term > vector for {}", field); > > throw new IllegalStateException("There are too many documents > that do not have a term vector for " + field); > > } > > - log.warn("{} does not have a term vector for {}", > indexReader.document(doc).get(idField), field); > > + if (numErrorDocs >= nextLogRecord) { > > + if (skippedErrorMessages == 0) { > > + log.warn("{} does not have a term vector for {}", > indexReader.document(doc).get(idField), field); > > + } else { > > + log.warn("{} documents do not have a term vector for {}", > numErrorDocs, field); > > + } > > + nextLogRecord = bump.increment(); > > + skippedErrorMessages = 0; > > + } 
else { > > + skippedErrorMessages++; > > + } > > computeNext(); > > } > > > > > > Modified: > mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java > > URL: > http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java?rev=1095864&r1=1095863&r2=1095864&view=diff > > > ============================================================================== > > --- > mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java > (original) > > +++ > mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java > Fri Apr 22 04:58:14 2011 > > @@ -97,16 +97,81 @@ public final class LuceneIterableTest ex > > iterator.next(); > > } > > > > + @Test > > + public void testIterable_someNoiseTermVectors() throws IOException { > > + //get noise vectors > > + RAMDirectory directory = createTestIndex(Field.TermVector.YES, new > RAMDirectory(), true, 0); > > + //get real vectors > > + createTestIndex(Field.TermVector.NO, directory, false, 5); > > + > > + IndexReader reader = IndexReader.open(directory, true); > > + Weight weight = new TFIDF(); > > + TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100); > > + VectorMapper mapper = new TFDFMapper(reader, weight, termInfo); > > + > > + boolean exceptionThrown; > > + //0 percent tolerance > > + LuceneIterable iterable = new LuceneIterable(reader, "id", > "content", mapper); > > + try { > > + Iterator<Vector> iterator = iterable.iterator(); > > + while (iterator.hasNext()) { > > + iterator.next(); > > + } > > + exceptionThrown = false; > > + } > > + catch(IllegalStateException ise) { > > + exceptionThrown = true; > > + } > > + assertTrue(exceptionThrown); > > + > > + //100 percent tolerance > > + iterable = new LuceneIterable(reader, "id", "content", mapper, -1, > 1.0); > > + try { > > + Iterator<Vector> iterator = iterable.iterator(); > > + while 
(iterator.hasNext()) { > > + iterator.next(); > > + } > > + exceptionThrown = false; > > + } > > + catch(IllegalStateException ise) { > > + exceptionThrown = true; > > + } > > + assertFalse(exceptionThrown); > > + > > + //50 percent tolerance > > + iterable = new LuceneIterable(reader, "id", "content", mapper, -1, > 0.5); > > + Iterator<Vector> iterator = iterable.iterator(); > > + iterator.next(); > > + iterator.next(); > > + iterator.next(); > > + iterator.next(); > > + iterator.next(); > > + > > + try { > > + while (iterator.hasNext()) { > > + iterator.next(); > > + } > > + exceptionThrown = false; > > + } > > + catch(IllegalStateException ise) { > > + exceptionThrown = true; > > + } > > + assertTrue(exceptionThrown); > > + } > > + > > private static RAMDirectory createTestIndex(Field.TermVector > termVector) throws IOException { > > - RAMDirectory directory = new RAMDirectory(); > > + return createTestIndex(termVector, new RAMDirectory(), true, 0); > > + } > > + > > + private static RAMDirectory createTestIndex(Field.TermVector > termVector, RAMDirectory directory, boolean createNew, int startingId) > throws IOException { > > IndexWriter writer = new IndexWriter( > > directory, > > new StandardAnalyzer(Version.LUCENE_30), > > - true, > > + createNew, > > IndexWriter.MaxFieldLength.UNLIMITED); > > for (int i = 0; i < LuceneIterableTest.DOCS.length; i++) { > > Document doc = new Document(); > > - Fieldable id = new Field("id", "doc_" + i, Field.Store.YES, > Field.Index.NOT_ANALYZED_NO_NORMS); > > + Fieldable id = new Field("id", "doc_" + (i + startingId), > Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); > > doc.add(id); > > //Store both position and offset information > > Fieldable text = new Field("content", DOCS[i], Field.Store.NO, > Field.Index.ANALYZED, termVector); > > > > > > >
