[ https://issues.apache.org/jira/browse/LUCENE-5905?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Trejkaz updated LUCENE-5905:
----------------------------
    Affects Version/s: 5.2.1
          Description:

A document with the word 秋葉原 in the body, when analysed by the JapaneseAnalyzer (AKA Kuromoji), cannot be found when searching for the same text as a phrase query.

Reproduction programs are provided below. Each program prints the term docs and positions and then the result of parsing the phrase query. As the output shows, at analysis time there is a single Japanese term, "秋葉原". At query parsing time there are *three* such terms: "秋葉" and "秋葉原" at position 0, and "原" at position 1. Because all terms must be present for a phrase query to match, the query never matches, which is quite a serious issue for us.

*Any workarounds, no matter how hacky, would be extremely helpful at this point.*

My guess is that this is a quirk in the analyser. If it happened with StandardAnalyzer, surely someone would have discovered it before I did.

Lucene 5.2.1 reproduction:

{code:java}
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;

public class LuceneMissingTerms {
    public static void main(String[] args) throws Exception {
        try (Directory directory = new RAMDirectory()) {
            Analyzer analyser = new JapaneseAnalyzer();

            try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(analyser))) {
                Document document = new Document();
                document.add(new TextField("content", "blah blah commercial blah blah \u79CB\u8449\u539F blah blah", Field.Store.NO));
                writer.addDocument(document);
            }

            try (IndexReader multiReader = DirectoryReader.open(directory)) {
                for (LeafReaderContext leaf : multiReader.leaves()) {
                    LeafReader reader = leaf.reader();
                    Terms terms = MultiFields.getFields(reader).terms("content");
                    TermsEnum termsEnum = terms.iterator();
                    BytesRef text;
                    //noinspection NestedAssignment
                    while ((text = termsEnum.next()) != null) {
                        System.out.println("term: " + text.utf8ToString());
                        Bits liveDocs = reader.getLiveDocs();
                        PostingsEnum postingsEnum = termsEnum.postings(liveDocs, null, PostingsEnum.POSITIONS);
                        int doc;
                        //noinspection NestedAssignment
                        while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                            System.out.println(" doc: " + doc);
                            int freq = postingsEnum.freq();
                            for (int i = 0; i < freq; i++) {
                                int pos = postingsEnum.nextPosition();
                                System.out.println(" pos: " + pos);
                            }
                        }
                    }
                }

                StandardQueryParser queryParser = new StandardQueryParser(analyser);
                queryParser.setDefaultOperator(StandardQueryConfigHandler.Operator.AND);

                // quoted to work around strange behaviour of StandardQueryParser treating this as a boolean query.
                Query query = queryParser.parse("\"\u79CB\u8449\u539F\"", "content");
                System.out.println(query);

                TopDocs topDocs = new IndexSearcher(multiReader).search(query, 10);
                System.out.println(topDocs.totalHits);
            }
        }
    }
}
{code}

Lucene 4.9 reproduction:

{code:java}
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;

public class LuceneMissingTerms {
    public static void main(String[] args) throws Exception {
        try (Directory directory = new RAMDirectory()) {
            Analyzer analyser = new JapaneseAnalyzer(Version.LUCENE_4_9);

            try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_4_9, analyser))) {
                Document document = new Document();
                document.add(new TextField("content", "blah blah commercial blah blah \u79CB\u8449\u539F blah blah", Field.Store.NO));
                writer.addDocument(document);
            }

            try (IndexReader multiReader = DirectoryReader.open(directory)) {
                for (AtomicReaderContext atomicReaderContext : multiReader.leaves()) {
                    AtomicReader reader = atomicReaderContext.reader();
                    Terms terms = MultiFields.getFields(reader).terms("content");
                    TermsEnum termsEnum = terms.iterator(null);
                    BytesRef text;
                    //noinspection NestedAssignment
                    while ((text = termsEnum.next()) != null) {
                        System.out.println("term: " + text.utf8ToString());
                        Bits liveDocs = reader.getLiveDocs();
                        DocsAndPositionsEnum docsAndPositionsEnum = termsEnum.docsAndPositions(liveDocs, null);
                        int doc;
                        //noinspection NestedAssignment
                        while ((doc = docsAndPositionsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                            System.out.println(" doc: " + doc);
                            int freq = docsAndPositionsEnum.freq();
                            for (int i = 0; i < freq; i++) {
                                int pos = docsAndPositionsEnum.nextPosition();
                                System.out.println(" pos: " + pos);
                            }
                        }
                    }
                }

                StandardQueryParser queryParser = new StandardQueryParser(analyser);
                queryParser.setDefaultOperator(StandardQueryConfigHandler.Operator.AND);

                // quoted to work around strange behaviour of StandardQueryParser treating this as a boolean query.
                Query query = queryParser.parse("\"\u79CB\u8449\u539F\"", "content");
                System.out.println(query);

                TopDocs topDocs = new IndexSearcher(multiReader).search(query, 10);
                System.out.println(topDocs.totalHits);
            }
        }
    }
}
{code}

Lucene 3.6.2 reproduction:

{code:java}
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.queryParser.standard.StandardQueryParser;
import org.apache.lucene.queryParser.standard.config.StandardQueryConfigHandler;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;

import static org.hamcrest.Matchers.*;
import static org.junit.Assert.*;

public class TestJapaneseAnalysis {
    @Test
    public void testJapaneseAnalysis() throws Exception {
        try (Directory directory = new RAMDirectory()) {
            Analyzer analyser = new JapaneseAnalyzer(Version.LUCENE_36);

            try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_36, analyser))) {
                Document document = new Document();
                document.add(new Field("content", "blah blah commercial blah blah \u79CB\u8449\u539F blah blah", Field.Store.NO, Field.Index.ANALYZED));
                writer.addDocument(document);
            }

            try (IndexReader reader = IndexReader.open(directory);
                 TermEnum terms = reader.terms(new Term("content", ""));
                 TermPositions termPositions = reader.termPositions()) {
                do {
                    Term term = terms.term();
                    if (term.field() != "content") {
                        break;
                    }
                    System.out.println(term);
                    termPositions.seek(terms);
                    while (termPositions.next()) {
                        System.out.println(" " + termPositions.doc());
                        int freq = termPositions.freq();
                        for (int i = 0; i < freq; i++) {
                            System.out.println(" " + termPositions.nextPosition());
                        }
                    }
                } while (terms.next());

                StandardQueryParser queryParser = new StandardQueryParser(analyser);
                queryParser.setDefaultOperator(StandardQueryConfigHandler.Operator.AND);

                // quoted to work around strange behaviour of StandardQueryParser treating this as a boolean query.
                Query query = queryParser.parse("\"\u79CB\u8449\u539F\"", "content");
                System.out.println(query);

                TopDocs topDocs = new IndexSearcher(reader).search(query, 10);
                assertThat(topDocs.totalHits, is(1));
            }
        }
    }
}
{code}
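A shorter way to observe the mismatch described above is to run the analyser directly over both inputs and print the terms and positions, skipping the index and query parser entirely. The following is only a minimal sketch, assuming Lucene 5.2.1 with the kuromoji analysis module on the classpath; the class and helper names (ShowKuromojiTokens, dump) are illustrative and not part of the reproductions above.

{code:java}
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class ShowKuromojiTokens {
    // Prints each token produced by the analyser for the given text, with its position.
    private static void dump(Analyzer analyzer, String label, String text) throws Exception {
        System.out.println(label + ":");
        try (TokenStream stream = analyzer.tokenStream("content", text)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            PositionIncrementAttribute posInc = stream.addAttribute(PositionIncrementAttribute.class);
            stream.reset();
            int position = -1;
            while (stream.incrementToken()) {
                position += posInc.getPositionIncrement();
                System.out.println("  pos " + position + ": " + term.toString());
            }
            stream.end();
        }
    }

    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new JapaneseAnalyzer();
        // What the indexing side sees (the document text used in the reproductions above).
        dump(analyzer, "document text", "blah blah commercial blah blah \u79CB\u8449\u539F blah blah");
        // What the query parser sees when it analyses the quoted phrase.
        dump(analyzer, "query text", "\u79CB\u8449\u539F");
    }
}
{code}

If the behaviour reported above is accurate, the first dump should show a single token \u79CB\u8449\u539F while the second should show \u79CB\u8449 and \u79CB\u8449\u539F at position 0 and \u539F at position 1.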
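Regarding the request for workarounds: one hack that may be worth trying is to take decompounding out of the picture by using the tokenizer's NORMAL mode on both the indexing and the query side, so that \u79CB\u8449\u539F should come out as the same single token in both places. This is a sketch only and has not been verified against this exact case; it assumes the JapaneseAnalyzer constructor taking a JapaneseTokenizer.Mode plus the default stop set and stop tags, it requires reindexing, and it gives up the recall benefits of SEARCH-mode decompounding.

{code:java}
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;

public class NormalModeAnalyzerFactory {
    // Builds a JapaneseAnalyzer that tokenises in NORMAL mode (no decompounding),
    // keeping the default stop words and stop tags. The same configuration must be
    // used at index time and at query time, and existing indexes must be rebuilt.
    public static Analyzer create() {
        return new JapaneseAnalyzer(null,                          // no user dictionary
                JapaneseTokenizer.Mode.NORMAL,                     // disable search-mode decompounding
                JapaneseAnalyzer.getDefaultStopSet(),
                JapaneseAnalyzer.getDefaultStopTags());
    }
}
{code}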

> Different behaviour of JapaneseAnalyzer at indexing time vs.
> at search time
> ---------------------------------------------------------------------------
>
>                 Key: LUCENE-5905
>                 URL: https://issues.apache.org/jira/browse/LUCENE-5905
>             Project: Lucene - Core
>          Issue Type: Bug
>          Components: modules/analysis
>    Affects Versions: 3.6.2, 4.9, 5.2.1
>         Environment: Java 8u5
>            Reporter: Trejkaz
>