Hello,

I'm trying to understand why i'm getting duplicated results in the attached
java code.
Debugging the code it seems that they come from differents segments.
Increasing the RAMBufferSizeMB to a level that all the docs are in the same
segment seems to return only unique numbers.

There is any way to get unique documents in a bulk query without having to
cache then in a memory structure?

Thanks in advance!!
package com.king.lucifer.lucene;

import java.io.IOException;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.MMapDirectory;

public class NumbersExample {

    public static void main(String[] args) throws IOException {
        new NumbersExample().run();
    }

    private void run() throws IOException {
        FSDirectory dir = MMapDirectory.open(Paths.get("luceneTest"));
        IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        IndexWriter indexWriter = new IndexWriter(dir, config);
        SearcherManager searcherManager = new SearcherManager(indexWriter, null);

        long baseId = 9906843115L;
        int numberOfUniqueIds = 1_000_000;
        String ID_FIELD_NAME = "id";

        for (long id = 0; id < numberOfUniqueIds; id++) {
            String finalId = String.valueOf(baseId + id);
            Term term = new Term(ID_FIELD_NAME, finalId);
            Document document = new Document();
            document.add(new StringField(ID_FIELD_NAME, finalId, Field.Store.YES));
            indexWriter.updateDocument(term, document);
        }
        searcherManager.maybeRefreshBlocking();

        IndexSearcher indexSearcher = searcherManager.acquire();
        Query luceneQuery = new MatchAllDocsQuery();
        Set<String> ids = new HashSet<>();
        indexSearcher.search(luceneQuery, new SimpleCollector() {

            @Override
            public void collect(final int doc) throws IOException {
                ids.add(indexSearcher.doc(doc).getField(ID_FIELD_NAME).stringValue());
            }

            @Override
            public ScoreMode scoreMode() {
                return ScoreMode.TOP_SCORES;
            }
        });

        System.out.println(ids.size());
    }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to