This is an automated email from the ASF dual-hosted git repository. Jackie-Jiang pushed a commit to branch hotfix_18588 in repository https://gitbox.apache.org/repos/asf/pinot.git
commit 9ccdea3d6193dcd70d9799547e5b9666cf32cea1 Author: Chaitanya Deepthi <[email protected]> AuthorDate: Mon May 25 20:31:55 2026 -0700 Fix UnsupportedOperationException in SVScanDocIdIterator for RAW forward index + separate dictionary (#18579) * Fix UnsupportedOperationException in SVScanDocIdIterator when forward index is RAW + dictionary exists When a column has encodingType=RAW but a separate dictionary is built for secondary indexes (INVERTED, FST, IFST, RANGE), dict-based predicate evaluators created by FilterPlanNode (e.g. DictIdBasedRegexpLikePredicateEvaluator, IFSTBasedRegexpPredicateEvaluator) only implement applySV(int dictId). The previous fallback in SVScanDocIdIterator.getValueMatcher() selected typed raw matchers (e.g. StringMatcher) which call applySV(<rawType>) on the predicate evaluator — those overloads in BaseDictionaryBasedPredicateEvaluator throw UnsupportedOperationException, crashing queries such as `regexp_like(col, 'pattern', 'i')` or `LIKE 'pattern'` (which is internally case-insensitive) on external/iceberg-backed tables with this layout. Add a dict-lookup matcher per stored type that reads the raw value from the forward index, translates it to a dict id via the separate dictionary, and applies the predicate with applySV(int dictId). Selected only when (a) the forward index reports isDictionaryEncoded() == false, (b) the data source still exposes a non-null Dictionary, and (c) the predicate evaluator is a BaseDictionaryBasedPredicateEvaluator. Existing DictIdMatcher and typed raw matcher paths are unchanged. Co-Authored-By: Claude Opus 4.7 (1M context) <[email protected]> * Add SVScanDocIdIteratorTest for DictLookupMatcher path Covers the RAW forward index + separate dictionary + dict-based predicate evaluator configuration that previously triggered UnsupportedOperationException. The test stand-in evaluator inherits the default applySV(String) that throws, so any regression in the matcher-selection logic surfaces as a test failure rather than silent data corruption. Two cases: - Multiple matching dict ids return the correct doc ids in order. - A raw value absent from the dictionary (indexOf returns -1) is treated as no-match without invoking applySV on the evaluator. Co-Authored-By: Claude Opus 4.7 (1M context) <[email protected]> --------- Co-authored-by: Claude Opus 4.7 (1M context) <[email protected]> (cherry picked from commit acf9d4ff5d32491d0a2d19055424e270999ae14a) --- .../dociditerators/SVScanDocIdIterator.java | 115 ++++++++++++++++-- .../dociditerators/SVScanDocIdIteratorTest.java | 130 +++++++++++++++++++++ 2 files changed, 237 insertions(+), 8 deletions(-) diff --git a/pinot-core/src/main/java/org/apache/pinot/core/operator/dociditerators/SVScanDocIdIterator.java b/pinot-core/src/main/java/org/apache/pinot/core/operator/dociditerators/SVScanDocIdIterator.java index da718f279c9..6ea08e74e64 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/operator/dociditerators/SVScanDocIdIterator.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/operator/dociditerators/SVScanDocIdIterator.java @@ -18,14 +18,19 @@ */ package org.apache.pinot.core.operator.dociditerators; +import java.math.BigDecimal; import java.util.Map; import java.util.OptionalInt; +import javax.annotation.Nullable; import org.apache.pinot.core.common.BlockDocIdIterator; +import org.apache.pinot.core.operator.filter.predicate.BaseDictionaryBasedPredicateEvaluator; import org.apache.pinot.core.operator.filter.predicate.PredicateEvaluator; import org.apache.pinot.segment.spi.Constants; import org.apache.pinot.segment.spi.datasource.DataSource; +import org.apache.pinot.segment.spi.index.reader.Dictionary; import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; import org.apache.pinot.segment.spi.index.reader.ForwardIndexReaderContext; +import org.apache.pinot.spi.utils.ByteArray; import org.roaringbitmap.BatchIterator; import org.roaringbitmap.RoaringBitmapWriter; import org.roaringbitmap.buffer.MutableRoaringBitmap; @@ -40,6 +45,11 @@ public final class SVScanDocIdIterator implements ScanBasedDocIdIterator { private final PredicateEvaluator _predicateEvaluator; private final ForwardIndexReader _reader; private final ForwardIndexReaderContext _readerContext; + // Dictionary may exist even when the forward index is RAW (built only for secondary indexes such as INVERTED / + // FST / IFST / RANGE). When set together with a dictionary-based predicate evaluator, scans translate raw values + // to dict ids via this dictionary instead of calling applySV(<rawType>) on the evaluator (which would throw). + @Nullable + private final Dictionary _dictionary; private final int _numDocs; private final ValueMatcher _valueMatcher; private final int[] _batch; @@ -56,6 +66,7 @@ public final class SVScanDocIdIterator implements ScanBasedDocIdIterator { _predicateEvaluator = predicateEvaluator; _reader = dataSource.getForwardIndex(); _readerContext = _reader.createContext(queryOptions); + _dictionary = dataSource.getDictionary(); _numDocs = numDocs; _valueMatcher = getValueMatcher(); _cardinality = dataSource.getDataSourceMetadata().getCardinality(); @@ -63,10 +74,17 @@ public final class SVScanDocIdIterator implements ScanBasedDocIdIterator { // for testing public SVScanDocIdIterator(PredicateEvaluator predicateEvaluator, ForwardIndexReader reader, int numDocs) { + this(predicateEvaluator, reader, numDocs, null); + } + + // for testing + public SVScanDocIdIterator(PredicateEvaluator predicateEvaluator, ForwardIndexReader reader, int numDocs, + @Nullable Dictionary dictionary) { _batch = new int[BlockDocIdIterator.OPTIMAL_ITERATOR_BATCH_SIZE]; _predicateEvaluator = predicateEvaluator; _reader = reader; _readerContext = reader.createContext(); + _dictionary = dictionary; _numDocs = numDocs; _valueMatcher = getValueMatcher(); _cardinality = -1; @@ -166,26 +184,48 @@ public final class SVScanDocIdIterator implements ScanBasedDocIdIterator { private ValueMatcher getValueMatcher() { if (_reader.isDictionaryEncoded()) { return new DictIdMatcher(); - } else { + } + // RAW forward index, but a separate dictionary exists and the predicate evaluator is dict-based. Translate raw + // values to dict ids on read so applySV(int dictId) can be used. This is needed because dict-based evaluators + // (e.g. DictIdBasedRegexpLikePredicateEvaluator, IFSTBasedRegexpPredicateEvaluator) only implement applySV(int). + if (_dictionary != null && _predicateEvaluator instanceof BaseDictionaryBasedPredicateEvaluator) { switch (_reader.getStoredType()) { case INT: - return new IntMatcher(); + return new IntDictLookupMatcher(); case LONG: - return new LongMatcher(); + return new LongDictLookupMatcher(); case FLOAT: - return new FloatMatcher(); + return new FloatDictLookupMatcher(); case DOUBLE: - return new DoubleMatcher(); + return new DoubleDictLookupMatcher(); case BIG_DECIMAL: - return new BigDecimalMatcher(); + return new BigDecimalDictLookupMatcher(); case STRING: - return new StringMatcher(); + return new StringDictLookupMatcher(); case BYTES: - return new BytesMatcher(); + return new BytesDictLookupMatcher(); default: throw new UnsupportedOperationException(); } } + switch (_reader.getStoredType()) { + case INT: + return new IntMatcher(); + case LONG: + return new LongMatcher(); + case FLOAT: + return new FloatMatcher(); + case DOUBLE: + return new DoubleMatcher(); + case BIG_DECIMAL: + return new BigDecimalMatcher(); + case STRING: + return new StringMatcher(); + case BYTES: + return new BytesMatcher(); + default: + throw new UnsupportedOperationException(); + } } private interface ValueMatcher { @@ -317,6 +357,65 @@ public final class SVScanDocIdIterator implements ScanBasedDocIdIterator { } } + // ----- Dict-lookup matchers: forward index is RAW but a separate dictionary exists. ----- + + private class IntDictLookupMatcher implements ValueMatcher { + @Override + public boolean doesValueMatch(int docId) { + int dictId = _dictionary.indexOf(_reader.getInt(docId, _readerContext)); + return dictId >= 0 && _predicateEvaluator.applySV(dictId); + } + } + + private class LongDictLookupMatcher implements ValueMatcher { + @Override + public boolean doesValueMatch(int docId) { + int dictId = _dictionary.indexOf(_reader.getLong(docId, _readerContext)); + return dictId >= 0 && _predicateEvaluator.applySV(dictId); + } + } + + private class FloatDictLookupMatcher implements ValueMatcher { + @Override + public boolean doesValueMatch(int docId) { + int dictId = _dictionary.indexOf(_reader.getFloat(docId, _readerContext)); + return dictId >= 0 && _predicateEvaluator.applySV(dictId); + } + } + + private class DoubleDictLookupMatcher implements ValueMatcher { + @Override + public boolean doesValueMatch(int docId) { + int dictId = _dictionary.indexOf(_reader.getDouble(docId, _readerContext)); + return dictId >= 0 && _predicateEvaluator.applySV(dictId); + } + } + + private class BigDecimalDictLookupMatcher implements ValueMatcher { + @Override + public boolean doesValueMatch(int docId) { + BigDecimal value = _reader.getBigDecimal(docId, _readerContext); + int dictId = _dictionary.indexOf(value); + return dictId >= 0 && _predicateEvaluator.applySV(dictId); + } + } + + private class StringDictLookupMatcher implements ValueMatcher { + @Override + public boolean doesValueMatch(int docId) { + int dictId = _dictionary.indexOf(_reader.getString(docId, _readerContext)); + return dictId >= 0 && _predicateEvaluator.applySV(dictId); + } + } + + private class BytesDictLookupMatcher implements ValueMatcher { + @Override + public boolean doesValueMatch(int docId) { + int dictId = _dictionary.indexOf(new ByteArray(_reader.getBytes(docId, _readerContext))); + return dictId >= 0 && _predicateEvaluator.applySV(dictId); + } + } + @Override public void close() { if (_readerContext != null) { diff --git a/pinot-core/src/test/java/org/apache/pinot/core/operator/dociditerators/SVScanDocIdIteratorTest.java b/pinot-core/src/test/java/org/apache/pinot/core/operator/dociditerators/SVScanDocIdIteratorTest.java new file mode 100644 index 00000000000..a2e499ee3da --- /dev/null +++ b/pinot-core/src/test/java/org/apache/pinot/core/operator/dociditerators/SVScanDocIdIteratorTest.java @@ -0,0 +1,130 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.operator.dociditerators; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import org.apache.pinot.common.request.context.predicate.Predicate; +import org.apache.pinot.core.operator.filter.predicate.BaseDictionaryBasedPredicateEvaluator; +import org.apache.pinot.segment.spi.Constants; +import org.apache.pinot.segment.spi.index.reader.Dictionary; +import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; +import org.apache.pinot.spi.data.FieldSpec.DataType; +import org.mockito.Mockito; +import org.testng.annotations.Test; + +import static org.testng.Assert.assertEquals; + + +public class SVScanDocIdIteratorTest { + + /// RAW forward index + separate dictionary + dictionary-based predicate evaluator + /// (the layout used by external/iceberg-backed tables with IFST/FST/inverted-needing-dict). + /// Without the DictLookupMatcher path, the scan would select `StringMatcher` and call + /// `applySV(String)` on the dict-based evaluator, throwing `UnsupportedOperationException`. + @Test + public void rawForwardWithDictAndDictBasedEvaluatorRoutesThroughDictLookup() { + String[] values = {"apple", "banana", "cherry", "apple", "date", "banana"}; + Map<String, Integer> dictEntries = new HashMap<>(); + dictEntries.put("apple", 0); + dictEntries.put("banana", 1); + dictEntries.put("cherry", 2); + dictEntries.put("date", 3); + + ForwardIndexReader reader = mockRawStringReader(values); + Dictionary dictionary = mockStringDictionary(dictEntries); + // Matches dict ids {0, 2} → values "apple" and "cherry". + BaseDictionaryBasedPredicateEvaluator evaluator = new TestDictPredicateEvaluator(dictionary, 0, 2); + + SVScanDocIdIterator iterator = new SVScanDocIdIterator(evaluator, reader, values.length, dictionary); + + assertEquals(iterator.next(), 0); // "apple" + assertEquals(iterator.next(), 2); // "cherry" + assertEquals(iterator.next(), 3); // "apple" + assertEquals(iterator.next(), Constants.EOF); + } + + /// A raw value absent from the dictionary (`dict.indexOf(value)` returns -1) must be treated as a no-match + /// without invoking `applySV` on the evaluator. + @Test + public void unknownValueDoesNotMatch() { + String[] values = {"apple", "ghost", "cherry"}; + Map<String, Integer> dictEntries = new HashMap<>(); + dictEntries.put("apple", 0); + dictEntries.put("cherry", 2); + // "ghost" is not present → dict.indexOf("ghost") == -1. + + ForwardIndexReader reader = mockRawStringReader(values); + Dictionary dictionary = mockStringDictionary(dictEntries); + // Matches every known dict id; the only way "ghost" can match is if -1 leaks into applySV. + BaseDictionaryBasedPredicateEvaluator evaluator = new TestDictPredicateEvaluator(dictionary, 0, 2); + + SVScanDocIdIterator iterator = new SVScanDocIdIterator(evaluator, reader, values.length, dictionary); + + assertEquals(iterator.next(), 0); + assertEquals(iterator.next(), 2); + assertEquals(iterator.next(), Constants.EOF); + } + + @SuppressWarnings({"rawtypes", "unchecked"}) + private static ForwardIndexReader mockRawStringReader(String[] values) { + ForwardIndexReader reader = Mockito.mock(ForwardIndexReader.class); + Mockito.when(reader.isDictionaryEncoded()).thenReturn(false); + Mockito.when(reader.getStoredType()).thenReturn(DataType.STRING); + Mockito.when(reader.createContext()).thenReturn(null); + Mockito.when(reader.getString(Mockito.anyInt(), Mockito.any())).thenAnswer(invocation -> { + int docId = invocation.getArgument(0); + return values[docId]; + }); + return reader; + } + + private static Dictionary mockStringDictionary(Map<String, Integer> entries) { + Dictionary dictionary = Mockito.mock(Dictionary.class); + Mockito.when(dictionary.length()).thenReturn(entries.size()); + Mockito.when(dictionary.indexOf(Mockito.anyString())).thenAnswer(invocation -> { + String value = invocation.getArgument(0); + Integer id = entries.get(value); + return id != null ? id : -1; + }); + return dictionary; + } + + /// Minimal stand-in for IFST/FST/DictId-based regex evaluators: matches a fixed set of dict ids via + /// `applySV(int dictId)`. Inherits the default `applySV(String)` that throws `UnsupportedOperationException`, + /// so the test fails loudly if the scan picks the wrong matcher. + private static class TestDictPredicateEvaluator extends BaseDictionaryBasedPredicateEvaluator { + private final Set<Integer> _matchingDictIds; + + TestDictPredicateEvaluator(Dictionary dictionary, int... matchingDictIds) { + super(Mockito.mock(Predicate.class), dictionary); + _matchingDictIds = new HashSet<>(); + for (int id : matchingDictIds) { + _matchingDictIds.add(id); + } + } + + @Override + public boolean applySV(int dictId) { + return _matchingDictIds.contains(dictId); + } + } +} --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
