[ https://issues.apache.org/jira/browse/LUCENE-5905?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Trejkaz updated LUCENE-5905: ---------------------------- Summary: Different behaviour of JapaneseAnalyzer at indexing time vs. at search time results in no matches for some words. (was: Different behaviour of JapaneseAnalyzer at indexing time vs. at search time) > Different behaviour of JapaneseAnalyzer at indexing time vs. at search time > results in no matches for some words. > ----------------------------------------------------------------------------------------------------------------- > > Key: LUCENE-5905 > URL: https://issues.apache.org/jira/browse/LUCENE-5905 > Project: Lucene - Core > Issue Type: Bug > Components: modules/analysis > Affects Versions: 3.6.2, 4.9, 5.2.1 > Environment: Java 8u5 > Reporter: Trejkaz > > A document with the word 秋葉原 in the body, when analysed by the > JapaneseAnalyzer (AKA Kuromoji), cannot be found when searching for the same > text as a phrase query. > Two programs are provided to reproduce the issue. Both programs print out the > term docs and positions and then the result of parsing the phrase query. > As shown by the output, at analysis time, there is a lone Japanese term > "秋葉原". At query parsing time, there are *three* such terms - "秋葉" and "秋葉原" > at position 0 and "原" at position 1. Because all terms must be present for a > phrase query to be a match, the query never matches, which is quite a serious > issue for us. > *Any workarounds, no matter how hacky, would be extremely helpful at this > point.* > My guess is that this is a quirk with the analyser. If it happened with > StandardAnalyzer, surely someone would have discovered it before I did. 
> Lucene 5.2.1 reproduction: > {code:java} > import org.apache.lucene.analysis.Analyzer; > import org.apache.lucene.analysis.ja.JapaneseAnalyzer; > import org.apache.lucene.document.Document; > import org.apache.lucene.document.Field; > import org.apache.lucene.document.TextField; > import org.apache.lucene.index.DirectoryReader; > import org.apache.lucene.index.IndexReader; > import org.apache.lucene.index.IndexWriter; > import org.apache.lucene.index.IndexWriterConfig; > import org.apache.lucene.index.LeafReader; > import org.apache.lucene.index.LeafReaderContext; > import org.apache.lucene.index.MultiFields; > import org.apache.lucene.index.PostingsEnum; > import org.apache.lucene.index.Terms; > import org.apache.lucene.index.TermsEnum; > import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser; > import > org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler; > import org.apache.lucene.search.DocIdSetIterator; > import org.apache.lucene.search.IndexSearcher; > import org.apache.lucene.search.Query; > import org.apache.lucene.search.TopDocs; > import org.apache.lucene.store.Directory; > import org.apache.lucene.store.RAMDirectory; > import org.apache.lucene.util.Bits; > import org.apache.lucene.util.BytesRef; > public class LuceneMissingTerms { > public static void main(String[] args) throws Exception { > try (Directory directory = new RAMDirectory()) { > Analyzer analyser = new JapaneseAnalyzer(); > try (IndexWriter writer = new IndexWriter(directory, new > IndexWriterConfig(analyser))) { > Document document = new Document(); > document.add(new TextField("content", "blah blah commercial > blah blah \u79CB\u8449\u539F blah blah", Field.Store.NO)); > writer.addDocument(document); > } > try (IndexReader multiReader = DirectoryReader.open(directory)) { > for (LeafReaderContext leaf : multiReader.leaves()) { > LeafReader reader = leaf.reader(); > Terms terms = > MultiFields.getFields(reader).terms("content"); > TermsEnum 
termsEnum = terms.iterator(); > BytesRef text; > //noinspection NestedAssignment > while ((text = termsEnum.next()) != null) { > System.out.println("term: " + text.utf8ToString()); > Bits liveDocs = reader.getLiveDocs(); > PostingsEnum postingsEnum = > termsEnum.postings(liveDocs, null, PostingsEnum.POSITIONS); > int doc; > //noinspection NestedAssignment > while ((doc = postingsEnum.nextDoc()) != > DocIdSetIterator.NO_MORE_DOCS) { > System.out.println(" doc: " + doc); > int freq = postingsEnum.freq(); > for (int i = 0; i < freq; i++) { > int pos = postingsEnum.nextPosition(); > System.out.println(" pos: " + pos); > } > } > } > } > StandardQueryParser queryParser = new > StandardQueryParser(analyser); > > queryParser.setDefaultOperator(StandardQueryConfigHandler.Operator.AND); > // quoted to work around strange behaviour of > StandardQueryParser treating this as a boolean query. > Query query = queryParser.parse("\"\u79CB\u8449\u539F\"", > "content"); > System.out.println(query); > TopDocs topDocs = new > IndexSearcher(multiReader).search(query, 10); > System.out.println(topDocs.totalHits); > } > } > } > } > {code} > Lucene 4.9 reproduction: > {code:java} > import org.apache.lucene.analysis.Analyzer; > import org.apache.lucene.analysis.ja.JapaneseAnalyzer; > import org.apache.lucene.document.Document; > import org.apache.lucene.document.Field; > import org.apache.lucene.document.TextField; > import org.apache.lucene.index.AtomicReader; > import org.apache.lucene.index.AtomicReaderContext; > import org.apache.lucene.index.DirectoryReader; > import org.apache.lucene.index.DocsAndPositionsEnum; > import org.apache.lucene.index.IndexReader; > import org.apache.lucene.index.IndexWriter; > import org.apache.lucene.index.IndexWriterConfig; > import org.apache.lucene.index.MultiFields; > import org.apache.lucene.index.Terms; > import org.apache.lucene.index.TermsEnum; > import > org.apache.lucene.queryparser.flexible.standard.StandardQueryParser; > import > 
org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler; > import org.apache.lucene.search.DocIdSetIterator; > import org.apache.lucene.search.IndexSearcher; > import org.apache.lucene.search.Query; > import org.apache.lucene.search.TopDocs; > import org.apache.lucene.store.Directory; > import org.apache.lucene.store.RAMDirectory; > import org.apache.lucene.util.Bits; > import org.apache.lucene.util.BytesRef; > import org.apache.lucene.util.Version; > public class LuceneMissingTerms { > public static void main(String[] args) throws Exception { > try (Directory directory = new RAMDirectory()) { > Analyzer analyser = new JapaneseAnalyzer(Version.LUCENE_4_9); > try (IndexWriter writer = new IndexWriter(directory, > new IndexWriterConfig(Version.LUCENE_4_9, analyser))) { > Document document = new Document(); > document.add(new TextField("content", "blah blah > commercial blah blah \u79CB\u8449\u539F blah blah", Field.Store.NO)); > writer.addDocument(document); > } > try (IndexReader multiReader = > DirectoryReader.open(directory)) { > for (AtomicReaderContext atomicReaderContext : > multiReader.leaves()) { > AtomicReader reader = atomicReaderContext.reader(); > Terms terms = > MultiFields.getFields(reader).terms("content"); > TermsEnum termsEnum = terms.iterator(null); > BytesRef text; > //noinspection NestedAssignment > while ((text = termsEnum.next()) != null) { > System.out.println("term: " + > text.utf8ToString()); > Bits liveDocs = reader.getLiveDocs(); > DocsAndPositionsEnum docsAndPositionsEnum > = termsEnum.docsAndPositions(liveDocs, null); > int doc; > //noinspection NestedAssignment > while ((doc = > docsAndPositionsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { > System.out.println(" doc: " + doc); > int freq = docsAndPositionsEnum.freq(); > for (int i = 0; i < freq; i++) { > int pos = > docsAndPositionsEnum.nextPosition(); > System.out.println(" pos: " + pos); > } > } > } > } > StandardQueryParser queryParser = new > 
StandardQueryParser(analyser); > queryParser.setDefaultOperator(StandardQueryConfigHandler.Operator.AND); > // quoted to work around strange behaviour of > StandardQueryParser treating this as a boolean query. > Query query = > queryParser.parse("\"\u79CB\u8449\u539F\"", "content"); > System.out.println(query); > TopDocs topDocs = new > IndexSearcher(multiReader).search(query, 10); > System.out.println(topDocs.totalHits); > } > } > } > } > {code} > Lucene 3.6.2 reproduction: > {code:java} > import org.apache.lucene.analysis.Analyzer; > import org.apache.lucene.analysis.ja.JapaneseAnalyzer; > import org.apache.lucene.document.Document; > import org.apache.lucene.document.Field; > import org.apache.lucene.index.IndexReader; > import org.apache.lucene.index.IndexWriter; > import org.apache.lucene.index.IndexWriterConfig; > import org.apache.lucene.index.Term; > import org.apache.lucene.index.TermEnum; > import org.apache.lucene.index.TermPositions; > import org.apache.lucene.queryParser.standard.StandardQueryParser; > import > org.apache.lucene.queryParser.standard.config.StandardQueryConfigHandler; > import org.apache.lucene.search.IndexSearcher; > import org.apache.lucene.search.Query; > import org.apache.lucene.search.TopDocs; > import org.apache.lucene.store.Directory; > import org.apache.lucene.store.RAMDirectory; > import org.apache.lucene.util.Version; > import org.junit.Test; > import static org.hamcrest.Matchers.*; > import static org.junit.Assert.*; > public class TestJapaneseAnalysis { > @Test > public void testJapaneseAnalysis() throws Exception { > try (Directory directory = new RAMDirectory()) { > Analyzer analyser = new JapaneseAnalyzer(Version.LUCENE_36); > try (IndexWriter writer = new IndexWriter(directory, > new IndexWriterConfig(Version.LUCENE_36, analyser))) { > Document document = new Document(); > document.add(new Field("content", "blah blah > commercial blah blah \u79CB\u8449\u539F blah blah", Field.Store.NO, > Field.Index.ANALYZED)); > 
writer.addDocument(document); > } > try (IndexReader reader = IndexReader.open(directory); > TermEnum terms = reader.terms(new Term("content", "")); > TermPositions termPositions = reader.termPositions()) { > do { > Term term = terms.term(); > if (term.field() != "content") { > break; > } > System.out.println(term); > termPositions.seek(terms); > while (termPositions.next()) { > System.out.println(" " + termPositions.doc()); > int freq = termPositions.freq(); > for (int i = 0; i < freq; i++) { > System.out.println(" " + > termPositions.nextPosition()); > } > } > } > while (terms.next()); > StandardQueryParser queryParser = new > StandardQueryParser(analyser); > queryParser.setDefaultOperator(StandardQueryConfigHandler.Operator.AND); > // quoted to work around strange behaviour of > StandardQueryParser treating this as a boolean query. > Query query = > queryParser.parse("\"\u79CB\u8449\u539F\"", "content"); > System.out.println(query); > TopDocs topDocs = new > IndexSearcher(reader).search(query, 10); > assertThat(topDocs.totalHits, is(1)); > } > } > } > } > {code} -- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscribe@lucene.apache.org For additional commands, e-mail: dev-help@lucene.apache.org