Hm. I am getting the compilation failure below after this commit. Was a class (Bump125) forgotten, i.e. not added to the commit? [INFO] Compilation failure
\projects\mahout\utils\src\main\java\org\apache\mahout\utils\vectors\lucene\LuceneIterator.java:[33,30] cannot find symbol symbol : class Bump125 location: package org.apache.mahout.utils \projects\mahout\utils\src\main\java\org\apache\mahout\utils\vectors\lucene\LuceneIterator.java:[55,10] cannot find symbol symbol : class Bump125 location: class org.apache.mahout.utils.vectors.lucene.LuceneIterator \projects\mahout\utils\src\main\java\org\apache\mahout\utils\vectors\lucene\LuceneIterator.java:[55,29] cannot find symbol symbol : class Bump125 location: class org.apache.mahout.utils.vectors.lucene.LuceneIterator [INFO] ------------------------------------------------------------------------ [INFO] For more information, run Maven with the -e switch [INFO] ------------------------------------------------------------------------ On Thu, Apr 21, 2011 at 9:58 PM, <[email protected]> wrote: > Author: tdunning > Date: Fri Apr 22 04:58:14 2011 > New Revision: 1095864 > > URL: http://svn.apache.org/viewvc?rev=1095864&view=rev > Log: > MAHOUT-675 - Add better handling of empty term vectors in lucene conversion > to vectors. 
> > Modified: > > mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java > > mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java > > mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java > > Modified: > mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java > URL: > http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java?rev=1095864&r1=1095863&r2=1095864&view=diff > ============================================================================== > --- > mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java > (original) > +++ > mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java > Fri Apr 22 04:58:14 2011 > @@ -35,11 +35,16 @@ public final class LuceneIterable implem > private final String idField; > private final VectorMapper mapper; > private final double normPower; > + private final double maxPercentErrorDocs; > > public LuceneIterable(IndexReader reader, String idField, String field, > VectorMapper mapper) { > this(reader, idField, field, mapper, NO_NORMALIZING); > } > > + public LuceneIterable(IndexReader indexReader, String idField, String > field, VectorMapper mapper, double normPower) { > + this(indexReader, idField, field, mapper, normPower, 0); > + } > + > /** > * Produce a LuceneIterable that can create the Vector plus normalize it. > * > @@ -49,18 +54,19 @@ public final class LuceneIterable implem > * @param mapper {@link VectorMapper} for creating {@link Vector}s from > Lucene's TermVectors. > * @param normPower the normalization value. 
Must be nonnegative, or {@link > #NO_NORMALIZING} > */ > - public LuceneIterable(IndexReader indexReader, String idField, String > field, VectorMapper mapper, double normPower) { > + public LuceneIterable(IndexReader indexReader, String idField, String > field, VectorMapper mapper, double normPower, double maxPercentErrorDocs) { > this.indexReader = indexReader; > this.idField = idField; > this.field = field; > this.mapper = mapper; > this.normPower = normPower; > + this.maxPercentErrorDocs = maxPercentErrorDocs; > } > > @Override > public Iterator<Vector> iterator() { > try { > - return new LuceneIterator(indexReader, idField, field, mapper, > normPower); > + return new LuceneIterator(indexReader, idField, field, mapper, > normPower, maxPercentErrorDocs); > } catch (IOException e) { > throw new IllegalStateException(e); > } > > Modified: > mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java > URL: > http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java?rev=1095864&r1=1095863&r2=1095864&view=diff > ============================================================================== > --- > mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java > (original) > +++ > mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java > Fri Apr 22 04:58:14 2011 > @@ -30,6 +30,7 @@ import org.apache.lucene.index.TermDocs; > import org.apache.lucene.index.TermFreqVector; > import org.apache.mahout.math.NamedVector; > import org.apache.mahout.math.Vector; > +import org.apache.mahout.utils.Bump125; > import org.slf4j.Logger; > import org.slf4j.LoggerFactory; > > @@ -48,8 +49,12 @@ public final class LuceneIterator extend > private final VectorMapper mapper; > private final double normPower; > private final TermDocs termDocs; > - private int numErrorDocs; > - private int maxErrorDocs; > + > + private int 
numErrorDocs = 0; > + private int maxErrorDocs = 0; > + private Bump125 bump = new Bump125(); > + private long nextLogRecord = bump.increment(); > + private int skippedErrorMessages = 0; > > /** > * Produce a LuceneIterable that can create the Vector plus normalize it. > @@ -65,7 +70,7 @@ public final class LuceneIterator extend > String field, > VectorMapper mapper, > double normPower) throws IOException { > - this(indexReader, idField, field, mapper, normPower, 1.0); > + this(indexReader, idField, field, mapper, normPower, 0.0); > } > > /** > @@ -91,7 +96,6 @@ public final class LuceneIterator extend > // term docs(null) is a better way of iterating all the docs in Lucene > this.termDocs = indexReader.termDocs(null); > this.maxErrorDocs = (int) (maxPercentErrorDocs * indexReader.numDocs()); > - this.numErrorDocs = 0; > } > > @Override > @@ -104,11 +108,22 @@ public final class LuceneIterator extend > int doc = termDocs.doc(); > TermFreqVector termFreqVector = indexReader.getTermFreqVector(doc, > field); > if (termFreqVector == null) { > - if (++numErrorDocs >= maxErrorDocs) { > + numErrorDocs++; > + if (numErrorDocs >= maxErrorDocs) { > log.error("There are too many documents that do not have a term > vector for {}", field); > throw new IllegalStateException("There are too many documents that > do not have a term vector for " + field); > } > - log.warn("{} does not have a term vector for {}", > indexReader.document(doc).get(idField), field); > + if (numErrorDocs >= nextLogRecord) { > + if (skippedErrorMessages == 0) { > + log.warn("{} does not have a term vector for {}", > indexReader.document(doc).get(idField), field); > + } else { > + log.warn("{} documents do not have a term vector for {}", > numErrorDocs, field); > + } > + nextLogRecord = bump.increment(); > + skippedErrorMessages = 0; > + } else { > + skippedErrorMessages++; > + } > computeNext(); > } > > > Modified: > 
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java > URL: > http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java?rev=1095864&r1=1095863&r2=1095864&view=diff > ============================================================================== > --- > mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java > (original) > +++ > mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java > Fri Apr 22 04:58:14 2011 > @@ -97,16 +97,81 @@ public final class LuceneIterableTest ex > iterator.next(); > } > > + @Test > + public void testIterable_someNoiseTermVectors() throws IOException { > + //get noise vectors > + RAMDirectory directory = createTestIndex(Field.TermVector.YES, new > RAMDirectory(), true, 0); > + //get real vectors > + createTestIndex(Field.TermVector.NO, directory, false, 5); > + > + IndexReader reader = IndexReader.open(directory, true); > + Weight weight = new TFIDF(); > + TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100); > + VectorMapper mapper = new TFDFMapper(reader, weight, termInfo); > + > + boolean exceptionThrown; > + //0 percent tolerance > + LuceneIterable iterable = new LuceneIterable(reader, "id", "content", > mapper); > + try { > + Iterator<Vector> iterator = iterable.iterator(); > + while (iterator.hasNext()) { > + iterator.next(); > + } > + exceptionThrown = false; > + } > + catch(IllegalStateException ise) { > + exceptionThrown = true; > + } > + assertTrue(exceptionThrown); > + > + //100 percent tolerance > + iterable = new LuceneIterable(reader, "id", "content", mapper, -1, 1.0); > + try { > + Iterator<Vector> iterator = iterable.iterator(); > + while (iterator.hasNext()) { > + iterator.next(); > + } > + exceptionThrown = false; > + } > + catch(IllegalStateException ise) { > + exceptionThrown = true; > + } > + 
assertFalse(exceptionThrown); > + > + //50 percent tolerance > + iterable = new LuceneIterable(reader, "id", "content", mapper, -1, 0.5); > + Iterator<Vector> iterator = iterable.iterator(); > + iterator.next(); > + iterator.next(); > + iterator.next(); > + iterator.next(); > + iterator.next(); > + > + try { > + while (iterator.hasNext()) { > + iterator.next(); > + } > + exceptionThrown = false; > + } > + catch(IllegalStateException ise) { > + exceptionThrown = true; > + } > + assertTrue(exceptionThrown); > + } > + > private static RAMDirectory createTestIndex(Field.TermVector termVector) > throws IOException { > - RAMDirectory directory = new RAMDirectory(); > + return createTestIndex(termVector, new RAMDirectory(), true, 0); > + } > + > + private static RAMDirectory createTestIndex(Field.TermVector termVector, > RAMDirectory directory, boolean createNew, int startingId) throws IOException > { > IndexWriter writer = new IndexWriter( > directory, > new StandardAnalyzer(Version.LUCENE_30), > - true, > + createNew, > IndexWriter.MaxFieldLength.UNLIMITED); > for (int i = 0; i < LuceneIterableTest.DOCS.length; i++) { > Document doc = new Document(); > - Fieldable id = new Field("id", "doc_" + i, Field.Store.YES, > Field.Index.NOT_ANALYZED_NO_NORMS); > + Fieldable id = new Field("id", "doc_" + (i + startingId), > Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); > doc.add(id); > //Store both position and offset information > Fieldable text = new Field("content", DOCS[i], Field.Store.NO, > Field.Index.ANALYZED, termVector); > > >
