Lucene 4.9 gives much the same result.
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
import
org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
public class LuceneMissingTerms {
public static void main(String[] args) throws Exception {
try (Directory directory = new RAMDirectory()) {
Analyzer analyser = new JapaneseAnalyzer(Version.LUCENE_4_9);
try (IndexWriter writer = new IndexWriter(directory,
new IndexWriterConfig(Version.LUCENE_4_9, analyser))) {
Document document = new Document();
document.add(new TextField("content", "blah blah
commercial blah blah \u79CB\u8449\u539F blah blah", Field.Store.NO));
writer.addDocument(document);
}
try (IndexReader multiReader =
DirectoryReader.open(directory)) {
for (AtomicReaderContext atomicReaderContext :
multiReader.leaves()) {
AtomicReader reader = atomicReaderContext.reader();
Terms terms =
MultiFields.getFields(reader).terms("content");
TermsEnum termsEnum = terms.iterator(null);
BytesRef text;
//noinspection NestedAssignment
while ((text = termsEnum.next()) != null) {
System.out.println("term: " + text.utf8ToString());
Bits liveDocs = reader.getLiveDocs();
DocsAndPositionsEnum docsAndPositionsEnum
= termsEnum.docsAndPositions(liveDocs, null);
int doc;
//noinspection NestedAssignment
while ((doc =
docsAndPositionsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
System.out.println(" doc: " + doc);
int freq = docsAndPositionsEnum.freq();
for (int i = 0; i < freq; i++) {
int pos =
docsAndPositionsEnum.nextPosition();
System.out.println(" pos: " + pos);
}
}
}
}
StandardQueryParser queryParser = new
StandardQueryParser(analyser);
queryParser.setDefaultOperator(StandardQueryConfigHandler.Operator.AND);
// quoted to work around strange behaviour of
StandardQueryParser treating this as a boolean query.
Query query =
queryParser.parse("\"\u79CB\u8449\u539F\"", "content");
System.out.println(query);
TopDocs topDocs = new
IndexSearcher(multiReader).search(query, 10);
System.out.println(topDocs.totalHits);
}
}
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]