On Sun, Mar 24, 2013 at 10:46 AM, Wei Wang <[email protected]> wrote:
Hi,
> For example, assume we have fields F1 and F2, we would like to find
> all documents with condition F1+F2 > 5.0. This filter may be combined
> with other filters to form a BooleanFilter.
>
> The question is, is there any way to construct an efficient filter to do this?
I don't know - but the API looked interesting, so I gave it a try (see
below). I had never worked with search filters before writing that
code, so please proceed with caution, as I am not sure of many things
(iteration of all documents, treatment of deleted documents, what is
that "acceptDocs" variable, what threading constraints to respect...).
---
// add your package declaration
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.FieldCache.Ints;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.Version;
import org.junit.Before;
import org.junit.Test;
/**
 * Demonstrates a custom Lucene {@link Filter} that matches documents whose
 * two numeric fields satisfy {@code alpha + omega > SUM_THRESHOLD}.
 *
 * <p>NOTE(review): relies on Lucene 4.2 APIs ({@code AtomicReader},
 * {@code FieldCache}); confirm against the project's Lucene version.
 */
public class FilterTest {

    private static final Version VERSION = Version.LUCENE_42;
    private static final String FIELD_ID = "id";
    private static final String FIELD_ALPHA = "alpha";
    private static final String FIELD_OMEGA = "omega";
    private static final int SUM_THRESHOLD = 5;
    private static final int[] VALUES_ALPHA = new int[] { 1, 2, 3, 4, 5 };
    private static final int[] VALUES_OMEGA = new int[] { 5, 0, 5, 0, 5 };
    // Docs 0, 2, 4 are the ones with alpha + omega > 5 given the arrays above.
    private static final Set<Integer> EXPECTED_MATCHED_DOCUMENT_IDS =
            new HashSet<Integer>(Arrays.asList(0, 2, 4));

    private Directory directory;

    /**
     * Builds an in-memory index with one document per (alpha, omega) pair,
     * storing the array index as the document's "id" field.
     */
    @Before
    public void setUp() throws IOException {
        directory = new RAMDirectory();
        Analyzer analyzer = new StandardAnalyzer(VERSION);
        IndexWriterConfig config = new IndexWriterConfig(VERSION, analyzer);
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        IndexWriter writer = new IndexWriter(directory, config);
        try {
            for (int ii = 0; ii < VALUES_ALPHA.length; ii++) {
                Document doc = new Document();
                doc.add(new IntField(FIELD_ID, ii, IntField.Store.YES));
                doc.add(new IntField(FIELD_ALPHA, VALUES_ALPHA[ii], IntField.Store.YES));
                doc.add(new IntField(FIELD_OMEGA, VALUES_OMEGA[ii], IntField.Store.YES));
                writer.addDocument(doc);
            }
        } finally {
            // BUGFIX: original leaked the writer if addDocument threw.
            writer.close();
        }
    }

    /**
     * Verifies that {@link SumFilter} keeps exactly the documents with
     * {@code alpha + omega > SUM_THRESHOLD}.
     */
    @Test
    public void testSumFilter() throws IOException {
        IndexReader reader = DirectoryReader.open(directory);
        try {
            // BUGFIX: search() moved inside the try — the original leaked the
            // reader if the search itself threw.
            IndexSearcher searcher = new IndexSearcher(reader);
            TopDocs results = searcher.search(new MatchAllDocsQuery(),
                    new SumFilter(SUM_THRESHOLD), VALUES_ALPHA.length);
            assertEquals(EXPECTED_MATCHED_DOCUMENT_IDS.size(), results.totalHits);
            for (int ii = 0; ii < results.scoreDocs.length; ii++) {
                int docId = results.scoreDocs[ii].doc;
                Document doc = reader.document(docId);
                int idValue = doc.getField(FIELD_ID).numericValue().intValue();
                int alphaValue = doc.getField(FIELD_ALPHA).numericValue().intValue();
                int omegaValue = doc.getField(FIELD_OMEGA).numericValue().intValue();
                assertTrue(EXPECTED_MATCHED_DOCUMENT_IDS.contains(idValue));
                assertTrue(alphaValue + omegaValue > SUM_THRESHOLD);
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Filter accepting documents whose alpha + omega field values exceed
     * {@code minValue}. Values are read through the {@link FieldCache}.
     *
     * <p>Static: it needs no enclosing-instance state (only static constants).
     */
    private static class SumFilter extends Filter {

        private final int minValue;

        public SumFilter(int minValue) {
            this.minValue = minValue;
        }

        @Override
        public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs)
                throws IOException {
            AtomicReader reader = context.reader();
            Ints alphaCache = FieldCache.DEFAULT.getInts(reader, FIELD_ALPHA, false);
            Ints omegaCache = FieldCache.DEFAULT.getInts(reader, FIELD_OMEGA, false);
            SimpleDocIdSet docIdSet = new SimpleDocIdSet();
            int maxDoc = reader.maxDoc();
            for (int docId = 0; docId < maxDoc; docId++) {
                // BUGFIX: honor acceptDocs so deleted / previously-filtered
                // documents are skipped. A null Bits means "all docs are live".
                if (acceptDocs != null && !acceptDocs.get(docId)) {
                    continue;
                }
                if (alphaCache.get(docId) + omegaCache.get(docId) > minValue) {
                    docIdSet.add(docId);
                }
            }
            return docIdSet;
        }
    }

    /**
     * Minimal {@link DocIdSet} backed by a sorted set of matching doc ids.
     * Iteration order is ascending, as the DocIdSetIterator contract requires.
     */
    private static class SimpleDocIdSet extends DocIdSet {

        private final TreeSet<Integer> sortedDocIds = new TreeSet<Integer>();

        public void add(int docId) {
            sortedDocIds.add(docId);
        }

        @Override
        public DocIdSetIterator iterator() throws IOException {
            return new DocIdSetIterator() {

                private final Iterator<Integer> it = sortedDocIds.iterator();
                // -1 until nextDoc()/advance() is first called, per contract.
                private int currentDocId = -1;

                @Override
                public int docID() {
                    // BUGFIX: the original returned NO_MORE_DOCS as soon as the
                    // backing iterator was exhausted, even while currentDocId
                    // still held the last valid matched doc. nextDoc() now pins
                    // currentDocId to NO_MORE_DOCS on exhaustion, so simply
                    // returning it satisfies the contract.
                    return currentDocId;
                }

                @Override
                public int nextDoc() throws IOException {
                    currentDocId = it.hasNext() ? it.next() : NO_MORE_DOCS;
                    return currentDocId;
                }

                @Override
                public int advance(int target) throws IOException {
                    // Contract: advance to the first doc >= target; always
                    // moves at least one position (behavior for
                    // target <= docID() is undefined by the API).
                    do {
                        nextDoc();
                    } while (currentDocId < target);
                    return currentDocId;
                }

                // NOTE(review): some 4.x releases declare an abstract cost()
                // on DocIdSetIterator; deliberately no @Override so this
                // compiles whether or not the base class defines it.
                public long cost() {
                    return sortedDocIds.size();
                }
            };
        }
    }
}
---
Regards,
Yep.
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]