Understanding lucene unique fields

Joan LLuís Planas Papió Tue, 26 Nov 2019 05:00:02 -0800

Hello,

I'm trying to understand why I'm getting duplicated results in the attached
Java code.
While debugging the code, it seems that the duplicates come from different
segments. Increasing the RAMBufferSizeMB to a level where all the docs end up
in the same segment seems to return only unique numbers.


Is there any way to get unique documents in a bulk query without having to
cache them in an in-memory structure?

Thanks in advance!!

package com.king.lucifer.lucene;

import java.io.IOException;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.MMapDirectory;

/**
 * Indexes a range of consecutive numeric ids (one document per id, stored in a single
 * string field), then runs a match-all query and prints how many distinct ids were
 * collected.
 *
 * <p>The expected output is {@link #NUMBER_OF_UNIQUE_IDS}: every id is written with
 * {@code updateDocument}, so an id that already exists in the index is replaced rather
 * than duplicated.
 */
public class NumbersExample {

    /** Name of the only field in each document; also used as the update key. */
    private static final String ID_FIELD_NAME = "id";

    /** First id that is indexed. */
    private static final long BASE_ID = 9906843115L;

    /** How many consecutive ids, starting at {@link #BASE_ID}, are indexed. */
    private static final int NUMBER_OF_UNIQUE_IDS = 1_000_000;

    public static void main(String[] args) throws IOException {
        new NumbersExample().run();
    }

    /**
     * Builds (or appends to) the index in the {@code luceneTest} directory, collects the
     * stored id of every matching document and prints the number of distinct ids.
     *
     * @throws IOException if the index cannot be opened, written or searched
     */
    private void run() throws IOException {
        IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

        // Resources are closed in reverse order: searcher manager, writer, directory.
        try (FSDirectory dir = MMapDirectory.open(Paths.get("luceneTest"));
             IndexWriter indexWriter = new IndexWriter(dir, config);
             SearcherManager searcherManager = new SearcherManager(indexWriter, null)) {

            for (long offset = 0; offset < NUMBER_OF_UNIQUE_IDS; offset++) {
                String finalId = String.valueOf(BASE_ID + offset);
                Document document = new Document();
                document.add(new StringField(ID_FIELD_NAME, finalId, Field.Store.YES));
                // Replaces any previously indexed document carrying the same id term.
                indexWriter.updateDocument(new Term(ID_FIELD_NAME, finalId), document);
            }
            // Make the documents written above visible to the next acquired searcher.
            searcherManager.maybeRefreshBlocking();

            IndexSearcher indexSearcher = searcherManager.acquire();
            try {
                Query luceneQuery = new MatchAllDocsQuery();
                Set<String> ids = new HashSet<>();
                indexSearcher.search(luceneQuery, new SimpleCollector() {

                    /** Doc id offset of the segment currently being collected. */
                    private int docBase;

                    @Override
                    protected void doSetNextReader(LeafReaderContext context) throws IOException {
                        docBase = context.docBase;
                    }

                    @Override
                    public void collect(final int doc) throws IOException {
                        // `doc` is relative to the current segment, while IndexSearcher.doc
                        // expects an index-wide id. Without adding docBase, documents of the
                        // first segment are read again for every later segment, which shows
                        // up as duplicated (and missing) ids.
                        ids.add(indexSearcher.doc(docBase + doc).getField(ID_FIELD_NAME).stringValue());
                    }

                    @Override
                    public ScoreMode scoreMode() {
                        // Every hit must be visited and scores are never read.
                        return ScoreMode.COMPLETE_NO_SCORES;
                    }
                });

                System.out.println(ids.size());
            } finally {
                // Every acquire() must be paired with a release().
                searcherManager.release(indexSearcher);
            }
        }
    }
}

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Understanding lucene unique fields

Reply via email to