Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/dirichlet/TestL1ModelClustering.java URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/dirichlet/TestL1ModelClustering.java?rev=1440334&r1=1440333&r2=1440334&view=diff ============================================================================== --- mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/dirichlet/TestL1ModelClustering.java (original) +++ mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/dirichlet/TestL1ModelClustering.java Wed Jan 30 10:27:17 2013 @@ -27,8 +27,6 @@ import org.apache.hadoop.conf.Configurat import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.document.Fieldable; -import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Version; @@ -50,14 +48,17 @@ import org.apache.mahout.utils.MahoutTes import org.apache.mahout.utils.vectors.TermInfo; import org.apache.mahout.utils.vectors.lucene.CachedTermInfo; import org.apache.mahout.utils.vectors.lucene.LuceneIterable; -import org.apache.mahout.utils.vectors.lucene.TFDFMapper; -import org.apache.mahout.utils.vectors.lucene.VectorMapper; import org.apache.mahout.vectorizer.TFIDF; import org.apache.mahout.vectorizer.Weight; import org.junit.Test; import com.google.common.collect.Lists; -import com.google.common.io.Closeables; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriterConfig; public final class TestL1ModelClustering extends MahoutTestCase { @@ -134,26 +135,33 @@ public final class TestL1ModelClustering System.out.println(); sampleData = Lists.newArrayList(); RAMDirectory directory = new RAMDirectory(); - IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(Version.LUCENE_34), true, - IndexWriter.MaxFieldLength.UNLIMITED); + IndexWriter writer = new IndexWriter( directory, new IndexWriterConfig(Version.LUCENE_41,new StandardAnalyzer(Version.LUCENE_41))); + + FieldType customType = new FieldType(TextField.TYPE_NOT_STORED); + customType.setStoreTermVectors(true); + try { for (int i = 0; i < docs2.length; i++) { Document doc = new Document(); - Fieldable id = new Field("id", "doc_" + i, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); + Field id = new Field("id", "doc_" + i, StringField.TYPE_STORED); doc.add(id); // Store both position and offset information - Fieldable text = new Field("content", docs2[i], Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES); + Field text = new Field("content", docs2[i], customType); doc.add(text); writer.addDocument(doc); + writer.commit(); } } finally { - Closeables.closeQuietly(writer); + writer.close(); } - IndexReader reader = IndexReader.open(directory, true); + + IndexReader reader = DirectoryReader.open(directory); + System.out.println("Number of documents: \t"+reader.numDocs()); + + Weight weight = new TFIDF(); TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100); - VectorMapper mapper = new TFDFMapper(reader, weight, termInfo); - Iterable<Vector> iterable = new LuceneIterable(reader, "id", "content", mapper); + Iterable<Vector> iterable = new LuceneIterable(reader, "id", "content", termInfo,weight); int i = 0; for (Vector vector : iterable) {
Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java?rev=1440334&r1=1440333&r2=1440334&view=diff ============================================================================== --- mahout/trunk/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java (original) +++ mahout/trunk/integration/src/test/java/org/apache/mahout/text/MailArchivesClusteringAnalyzerTest.java Wed Jan 30 10:27:17 2013 @@ -53,12 +53,15 @@ public class MailArchivesClusteringAnaly }; TokenStream tokenStream = analyzer.tokenStream("test", reader); - assertNotNull(tokenStream); + assertNotNull(tokenStream); + tokenStream.reset(); CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class); int e = 0; while (tokenStream.incrementToken() && e < expectedTokens.length) { assertEquals(expectedTokens[e++], termAtt.toString()); } assertEquals(e, expectedTokens.length); + tokenStream.end(); + tokenStream.close(); } } Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java?rev=1440334&r1=1440333&r2=1440334&view=diff ============================================================================== --- mahout/trunk/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java (original) +++ mahout/trunk/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java Wed Jan 30 10:27:17 2013 @@ -33,7 +33,7 @@ import org.apache.hadoop.util.bloom.Key; import org.apache.hadoop.util.hash.Hash; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.analysis.shingle.ShingleFilter; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.util.Version; @@ -79,40 +79,52 @@ public final class BloomTokenFilterTest @Test public void testAnalyzer() throws IOException { Reader reader = new StringReader(input); - Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_31); + Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_41); TokenStream ts = analyzer.tokenStream(null, reader); + ts.reset(); validateTokens(allTokens, ts); + ts.end(); + ts.close(); } /** filtered analyzer */ @Test public void testNonKeepdAnalyzer() throws IOException { Reader reader = new StringReader(input); - Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_31); + Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_41); TokenStream ts = analyzer.tokenStream(null, reader); + ts.reset(); TokenStream f = new BloomTokenFilter(getFilter(filterTokens), false /* toss matching tokens */, ts); validateTokens(expectedNonKeepTokens, f); + ts.end(); + ts.close(); } /** keep analyzer */ @Test public void testKeepAnalyzer() throws IOException { Reader reader = new StringReader(input); - Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_31); + Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_41); TokenStream ts = analyzer.tokenStream(null, reader); + ts.reset(); TokenStream f = new BloomTokenFilter(getFilter(filterTokens), true /* keep matching tokens */, ts); validateTokens(expectedKeepTokens, f); + ts.end(); + ts.close(); } /** shingles, keep those matching whitelist */ @Test public void testShingleFilteredAnalyzer() throws IOException { Reader reader = new StringReader(input); - Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_31); + Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_41); TokenStream ts = analyzer.tokenStream(null, reader); + ts.reset(); ShingleFilter sf = new ShingleFilter(ts, 3); TokenStream f = new BloomTokenFilter(getFilter(shingleKeepTokens), true, sf); validateTokens(expectedShingleTokens, f); + ts.end(); + ts.close(); } private static void setKey(Key k, String s) throws IOException { Added: mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java?rev=1440334&view=auto ============================================================================== --- mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java (added) +++ mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfoTest.java Wed Jan 30 10:27:17 2013 @@ -0,0 +1,101 @@ +package org.apache.mahout.utils.vectors.lucene; + + +import com.google.common.io.Closeables; +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.Version; +import org.apache.mahout.utils.MahoutTestCase; +import org.apache.mahout.utils.vectors.TermEntry; +import org.junit.Test; + +import java.io.IOException; +import java.util.Iterator; + +/** + * + * + **/ +public class CachedTermInfoTest extends MahoutTestCase { + private RAMDirectory directory; + private static final String[] DOCS = { + "a a b b c c", + "a b a b a b a b", + "a b a", + "a", + "b", + "a", + "a" + }; + + private static final String[] DOCS2 = { + "d d d d", + "e e e e", + "d e d e", + "d", + "e", + "d", + "e" + }; + + @Override + public void setUp() throws Exception { + super.setUp(); + directory = new RAMDirectory(); + directory = createTestIndex(Field.TermVector.NO, directory, true, 0); + } + + @Test + public void test() throws Exception { + IndexReader reader = DirectoryReader.open(directory); + CachedTermInfo cti = new CachedTermInfo(reader, "content", 0, 100); + assertEquals(3, cti.totalTerms("content")); + assertNotNull(cti.getTermEntry("content", "a")); + assertNull(cti.getTermEntry("content", "e")); + //minDf + cti = new CachedTermInfo(reader, "content", 3, 100); + assertEquals(2, cti.totalTerms("content")); + assertNotNull(cti.getTermEntry("content", "a")); + assertNull(cti.getTermEntry("content", "c")); + //maxDFPercent, a is in 6 of 7 docs: numDocs * maxDfPercent / 100 < 6 to exclude, 85% should suffice to exclude a + cti = new CachedTermInfo(reader, "content", 0, 85); + assertEquals(2, cti.totalTerms("content")); + assertNotNull(cti.getTermEntry("content", "b")); + assertNotNull(cti.getTermEntry("content", "c")); + assertNull(cti.getTermEntry("content", "a")); + + + } + + static RAMDirectory createTestIndex(Field.TermVector termVector, + RAMDirectory directory, + boolean createNew, + int startingId) throws IOException { + IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_41, new WhitespaceAnalyzer(Version.LUCENE_41))); + + try { + for (int i = 0; i < DOCS.length; i++) { + Document doc = new Document(); + Field id = new StringField("id", "doc_" + (i + startingId), Field.Store.YES); + doc.add(id); + //Store both position and offset information + //Says it is deprecated, but doesn't seem to offer an alternative that supports term vectors... + Field text = new Field("content", DOCS[i], Field.Store.NO, Field.Index.ANALYZED, termVector); + doc.add(text); + Field text2 = new Field("content2", DOCS2[i], Field.Store.NO, Field.Index.ANALYZED, termVector); + doc.add(text2); + writer.addDocument(doc); + } + } finally { + Closeables.closeQuietly(writer); + } + return directory; + } +} Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java?rev=1440334&r1=1440333&r2=1440334&view=diff ============================================================================== --- mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java (original) +++ mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java Wed Jan 30 10:27:17 2013 @@ -18,12 +18,16 @@ package org.apache.mahout.utils.vectors.lucene; import com.google.common.io.Closeables; +import java.io.IOException; +import java.util.Iterator; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.document.Fieldable; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Version; import org.apache.mahout.math.NamedVector; @@ -34,9 +38,6 @@ import org.apache.mahout.vectorizer.TFID import org.apache.mahout.vectorizer.Weight; import org.junit.Test; -import java.io.IOException; -import java.util.Iterator; - public final class LuceneIterableTest extends MahoutTestCase { private static final String [] DOCS = { @@ -57,11 +58,10 @@ public final class LuceneIterableTest ex @Test public void testIterable() throws Exception { - IndexReader reader = IndexReader.open(directory, true); + IndexReader reader = DirectoryReader.open(directory); Weight weight = new TFIDF(); TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100); - VectorMapper mapper = new TFDFMapper(reader, weight, termInfo); - LuceneIterable iterable = new LuceneIterable(reader, "id", "content", mapper); + LuceneIterable iterable = new LuceneIterable(reader, "id", "content", termInfo,weight); //TODO: do something more meaningful here for (Vector vector : iterable) { @@ -71,7 +71,7 @@ public final class LuceneIterableTest ex assertTrue(((NamedVector) vector).getName().startsWith("doc_")); } - iterable = new LuceneIterable(reader, "id", "content", mapper, 3); + iterable = new LuceneIterable(reader, "id", "content", termInfo,weight, 3); //TODO: do something more meaningful here for (Vector vector : iterable) { @@ -86,12 +86,12 @@ public final class LuceneIterableTest ex @Test(expected = IllegalStateException.class) public void testIterable_noTermVectors() throws IOException { RAMDirectory directory = createTestIndex(Field.TermVector.NO); - - IndexReader reader = IndexReader.open(directory, true); + IndexReader reader = DirectoryReader.open(directory); + + Weight weight = new TFIDF(); TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100); - VectorMapper mapper = new TFDFMapper(reader, weight, termInfo); - LuceneIterable iterable = new LuceneIterable(reader, "id", "content", mapper); + LuceneIterable iterable = new LuceneIterable(reader, "id", "content", termInfo,weight); Iterator<Vector> iterator = iterable.iterator(); iterator.hasNext(); @@ -104,15 +104,14 @@ public final class LuceneIterableTest ex RAMDirectory directory = createTestIndex(Field.TermVector.YES, new RAMDirectory(), true, 0); //get real vectors createTestIndex(Field.TermVector.NO, directory, false, 5); - - IndexReader reader = IndexReader.open(directory, true); + IndexReader reader = DirectoryReader.open(directory); + Weight weight = new TFIDF(); TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100); - VectorMapper mapper = new TFDFMapper(reader, weight, termInfo); boolean exceptionThrown; //0 percent tolerance - LuceneIterable iterable = new LuceneIterable(reader, "id", "content", mapper); + LuceneIterable iterable = new LuceneIterable(reader, "id", "content", termInfo,weight); try { for (Object a : iterable) { } @@ -124,7 +123,7 @@ public final class LuceneIterableTest ex assertTrue(exceptionThrown); //100 percent tolerance - iterable = new LuceneIterable(reader, "id", "content", mapper, -1, 1.0); + iterable = new LuceneIterable(reader, "id", "content", termInfo,weight, -1, 1.0); try { for (Object a : iterable) { } @@ -136,7 +135,7 @@ public final class LuceneIterableTest ex assertFalse(exceptionThrown); //50 percent tolerance - iterable = new LuceneIterable(reader, "id", "content", mapper, -1, 0.5); + iterable = new LuceneIterable(reader, "id", "content", termInfo,weight, -1, 0.5); Iterator<Vector> iterator = iterable.iterator(); iterator.next(); iterator.next(); @@ -156,28 +155,27 @@ public final class LuceneIterableTest ex assertTrue(exceptionThrown); } - private static RAMDirectory createTestIndex(Field.TermVector termVector) throws IOException { + static RAMDirectory createTestIndex(Field.TermVector termVector) throws IOException { return createTestIndex(termVector, new RAMDirectory(), true, 0); } - private static RAMDirectory createTestIndex(Field.TermVector termVector, + static RAMDirectory createTestIndex(Field.TermVector termVector, RAMDirectory directory, boolean createNew, int startingId) throws IOException { - IndexWriter writer = new IndexWriter( - directory, - new StandardAnalyzer(Version.LUCENE_31), - createNew, - IndexWriter.MaxFieldLength.UNLIMITED); + IndexWriter writer = new IndexWriter( directory, new IndexWriterConfig(Version.LUCENE_41,new StandardAnalyzer(Version.LUCENE_41))); + try { for (int i = 0; i < DOCS.length; i++) { Document doc = new Document(); - Fieldable id = new Field("id", "doc_" + (i + startingId), Field.Store.YES, - Field.Index.NOT_ANALYZED_NO_NORMS); + Field id = new StringField("id", "doc_" + (i + startingId), Field.Store.YES); doc.add(id); //Store both position and offset information - Fieldable text = new Field("content", DOCS[i], Field.Store.NO, Field.Index.ANALYZED, termVector); + //Says it is deprecated, but doesn't seem to offer an alternative that supports term vectors... + Field text = new Field("content", DOCS[i], Field.Store.NO, Field.Index.ANALYZED, termVector); doc.add(text); + Field text2 = new Field("content2", DOCS[i], Field.Store.NO, Field.Index.ANALYZED, termVector); + doc.add(text2); writer.addDocument(doc); } } finally { Modified: mahout/trunk/pom.xml URL: http://svn.apache.org/viewvc/mahout/trunk/pom.xml?rev=1440334&r1=1440333&r2=1440334&view=diff ============================================================================== --- mahout/trunk/pom.xml (original) +++ mahout/trunk/pom.xml Wed Jan 30 10:27:17 2013 @@ -98,7 +98,7 @@ <maven.clover.multiproject>true</maven.clover.multiproject> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <hadoop.version>1.1.1</hadoop.version> - <lucene.version>3.6.0</lucene.version> + <lucene.version>4.1.0</lucene.version> </properties> <issueManagement> <system>Jira</system> @@ -167,7 +167,7 @@ <!-- 3rd party --> <dependency> <groupId>org.apache.lucene</groupId> - <artifactId>lucene-analyzers</artifactId> + <artifactId>lucene-analyzers-common</artifactId> <version>${lucene.version}</version> </dependency> <dependency>
