This is an automated email from the ASF dual-hosted git repository.
xiangfu0 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new acf9d4ff5d3 Fix UnsupportedOperationException in SVScanDocIdIterator
for RAW forward index + separate dictionary (#18579)
acf9d4ff5d3 is described below
commit acf9d4ff5d32491d0a2d19055424e270999ae14a
Author: Chaitanya Deepthi <[email protected]>
AuthorDate: Mon May 25 20:31:55 2026 -0700
Fix UnsupportedOperationException in SVScanDocIdIterator for RAW forward
index + separate dictionary (#18579)
* Fix UnsupportedOperationException in SVScanDocIdIterator when forward
index is RAW + dictionary exists
When a column has encodingType=RAW but a separate dictionary is built for
secondary indexes
(INVERTED, FST, IFST, RANGE), dict-based predicate evaluators created by
FilterPlanNode
(e.g. DictIdBasedRegexpLikePredicateEvaluator,
IFSTBasedRegexpPredicateEvaluator) only
implement applySV(int dictId). The previous fallback in
SVScanDocIdIterator.getValueMatcher()
selected typed raw matchers (e.g. StringMatcher) which call
applySV(<rawType>) on the
predicate evaluator — those overloads in
BaseDictionaryBasedPredicateEvaluator throw
UnsupportedOperationException, crashing queries such as
`regexp_like(col, 'pattern', 'i')` or `LIKE 'pattern'` (which is internally
case-insensitive)
on external/iceberg-backed tables with this layout.
Add a dict-lookup matcher per stored type that reads the raw value from the
forward index,
translates it to a dict id via the separate dictionary, and applies the
predicate with
applySV(int dictId). Selected only when (a) the forward index reports
isDictionaryEncoded() == false, (b) the data source still exposes a
non-null Dictionary,
and (c) the predicate evaluator is a BaseDictionaryBasedPredicateEvaluator.
Existing
DictIdMatcher and typed raw matcher paths are unchanged.
Co-Authored-By: Claude Opus 4.7 (1M context) <[email protected]>
* Add SVScanDocIdIteratorTest for DictLookupMatcher path
Covers the RAW forward index + separate dictionary + dict-based predicate
evaluator
configuration that previously triggered UnsupportedOperationException. The
test stand-in
evaluator inherits the default applySV(String) that throws, so any
regression in the
matcher-selection logic surfaces as a test failure rather than silent data
corruption.
Two cases:
- Multiple matching dict ids return the correct doc ids in order.
- A raw value absent from the dictionary (indexOf returns -1) is treated as
no-match
without invoking applySV on the evaluator.
Co-Authored-By: Claude Opus 4.7 (1M context) <[email protected]>
---------
Co-authored-by: Claude Opus 4.7 (1M context) <[email protected]>
---
.../dociditerators/SVScanDocIdIterator.java | 115 ++++++++++++++++--
.../dociditerators/SVScanDocIdIteratorTest.java | 130 +++++++++++++++++++++
2 files changed, 237 insertions(+), 8 deletions(-)
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/operator/dociditerators/SVScanDocIdIterator.java
b/pinot-core/src/main/java/org/apache/pinot/core/operator/dociditerators/SVScanDocIdIterator.java
index da718f279c9..6ea08e74e64 100644
---
a/pinot-core/src/main/java/org/apache/pinot/core/operator/dociditerators/SVScanDocIdIterator.java
+++
b/pinot-core/src/main/java/org/apache/pinot/core/operator/dociditerators/SVScanDocIdIterator.java
@@ -18,14 +18,19 @@
*/
package org.apache.pinot.core.operator.dociditerators;
+import java.math.BigDecimal;
import java.util.Map;
import java.util.OptionalInt;
+import javax.annotation.Nullable;
import org.apache.pinot.core.common.BlockDocIdIterator;
+import
org.apache.pinot.core.operator.filter.predicate.BaseDictionaryBasedPredicateEvaluator;
import org.apache.pinot.core.operator.filter.predicate.PredicateEvaluator;
import org.apache.pinot.segment.spi.Constants;
import org.apache.pinot.segment.spi.datasource.DataSource;
+import org.apache.pinot.segment.spi.index.reader.Dictionary;
import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader;
import org.apache.pinot.segment.spi.index.reader.ForwardIndexReaderContext;
+import org.apache.pinot.spi.utils.ByteArray;
import org.roaringbitmap.BatchIterator;
import org.roaringbitmap.RoaringBitmapWriter;
import org.roaringbitmap.buffer.MutableRoaringBitmap;
@@ -40,6 +45,11 @@ public final class SVScanDocIdIterator implements
ScanBasedDocIdIterator {
private final PredicateEvaluator _predicateEvaluator;
private final ForwardIndexReader _reader;
private final ForwardIndexReaderContext _readerContext;
+ // Dictionary may exist even when the forward index is RAW (built only for
secondary indexes such as INVERTED /
+ // FST / IFST / RANGE). When set together with a dictionary-based predicate
evaluator, scans translate raw values
+ // to dict ids via this dictionary instead of calling applySV(<rawType>) on
the evaluator (which would throw).
+ @Nullable
+ private final Dictionary _dictionary;
private final int _numDocs;
private final ValueMatcher _valueMatcher;
private final int[] _batch;
@@ -56,6 +66,7 @@ public final class SVScanDocIdIterator implements
ScanBasedDocIdIterator {
_predicateEvaluator = predicateEvaluator;
_reader = dataSource.getForwardIndex();
_readerContext = _reader.createContext(queryOptions);
+ _dictionary = dataSource.getDictionary();
_numDocs = numDocs;
_valueMatcher = getValueMatcher();
_cardinality = dataSource.getDataSourceMetadata().getCardinality();
@@ -63,10 +74,17 @@ public final class SVScanDocIdIterator implements
ScanBasedDocIdIterator {
// for testing
public SVScanDocIdIterator(PredicateEvaluator predicateEvaluator,
ForwardIndexReader reader, int numDocs) {
+ this(predicateEvaluator, reader, numDocs, null);
+ }
+
+ // for testing
+ public SVScanDocIdIterator(PredicateEvaluator predicateEvaluator,
ForwardIndexReader reader, int numDocs,
+ @Nullable Dictionary dictionary) {
_batch = new int[BlockDocIdIterator.OPTIMAL_ITERATOR_BATCH_SIZE];
_predicateEvaluator = predicateEvaluator;
_reader = reader;
_readerContext = reader.createContext();
+ _dictionary = dictionary;
_numDocs = numDocs;
_valueMatcher = getValueMatcher();
_cardinality = -1;
@@ -166,26 +184,48 @@ public final class SVScanDocIdIterator implements
ScanBasedDocIdIterator {
private ValueMatcher getValueMatcher() {
if (_reader.isDictionaryEncoded()) {
return new DictIdMatcher();
- } else {
+ }
+ // RAW forward index, but a separate dictionary exists and the predicate
evaluator is dict-based. Translate raw
+ // values to dict ids on read so applySV(int dictId) can be used. This is
needed because dict-based evaluators
+ // (e.g. DictIdBasedRegexpLikePredicateEvaluator,
IFSTBasedRegexpPredicateEvaluator) only implement applySV(int).
+ if (_dictionary != null && _predicateEvaluator instanceof
BaseDictionaryBasedPredicateEvaluator) {
switch (_reader.getStoredType()) {
case INT:
- return new IntMatcher();
+ return new IntDictLookupMatcher();
case LONG:
- return new LongMatcher();
+ return new LongDictLookupMatcher();
case FLOAT:
- return new FloatMatcher();
+ return new FloatDictLookupMatcher();
case DOUBLE:
- return new DoubleMatcher();
+ return new DoubleDictLookupMatcher();
case BIG_DECIMAL:
- return new BigDecimalMatcher();
+ return new BigDecimalDictLookupMatcher();
case STRING:
- return new StringMatcher();
+ return new StringDictLookupMatcher();
case BYTES:
- return new BytesMatcher();
+ return new BytesDictLookupMatcher();
default:
throw new UnsupportedOperationException();
}
}
+ switch (_reader.getStoredType()) {
+ case INT:
+ return new IntMatcher();
+ case LONG:
+ return new LongMatcher();
+ case FLOAT:
+ return new FloatMatcher();
+ case DOUBLE:
+ return new DoubleMatcher();
+ case BIG_DECIMAL:
+ return new BigDecimalMatcher();
+ case STRING:
+ return new StringMatcher();
+ case BYTES:
+ return new BytesMatcher();
+ default:
+ throw new UnsupportedOperationException();
+ }
}
private interface ValueMatcher {
@@ -317,6 +357,65 @@ public final class SVScanDocIdIterator implements
ScanBasedDocIdIterator {
}
}
+ // ----- Dict-lookup matchers: forward index is RAW but a separate
dictionary exists. -----
+
+ private class IntDictLookupMatcher implements ValueMatcher {
+ @Override
+ public boolean doesValueMatch(int docId) {
+ int dictId = _dictionary.indexOf(_reader.getInt(docId, _readerContext));
+ return dictId >= 0 && _predicateEvaluator.applySV(dictId);
+ }
+ }
+
+ private class LongDictLookupMatcher implements ValueMatcher {
+ @Override
+ public boolean doesValueMatch(int docId) {
+ int dictId = _dictionary.indexOf(_reader.getLong(docId, _readerContext));
+ return dictId >= 0 && _predicateEvaluator.applySV(dictId);
+ }
+ }
+
+ private class FloatDictLookupMatcher implements ValueMatcher {
+ @Override
+ public boolean doesValueMatch(int docId) {
+ int dictId = _dictionary.indexOf(_reader.getFloat(docId,
_readerContext));
+ return dictId >= 0 && _predicateEvaluator.applySV(dictId);
+ }
+ }
+
+ private class DoubleDictLookupMatcher implements ValueMatcher {
+ @Override
+ public boolean doesValueMatch(int docId) {
+ int dictId = _dictionary.indexOf(_reader.getDouble(docId,
_readerContext));
+ return dictId >= 0 && _predicateEvaluator.applySV(dictId);
+ }
+ }
+
+ private class BigDecimalDictLookupMatcher implements ValueMatcher {
+ @Override
+ public boolean doesValueMatch(int docId) {
+ BigDecimal value = _reader.getBigDecimal(docId, _readerContext);
+ int dictId = _dictionary.indexOf(value);
+ return dictId >= 0 && _predicateEvaluator.applySV(dictId);
+ }
+ }
+
+ private class StringDictLookupMatcher implements ValueMatcher {
+ @Override
+ public boolean doesValueMatch(int docId) {
+ int dictId = _dictionary.indexOf(_reader.getString(docId,
_readerContext));
+ return dictId >= 0 && _predicateEvaluator.applySV(dictId);
+ }
+ }
+
+ private class BytesDictLookupMatcher implements ValueMatcher {
+ @Override
+ public boolean doesValueMatch(int docId) {
+ int dictId = _dictionary.indexOf(new ByteArray(_reader.getBytes(docId,
_readerContext)));
+ return dictId >= 0 && _predicateEvaluator.applySV(dictId);
+ }
+ }
+
@Override
public void close() {
if (_readerContext != null) {
diff --git
a/pinot-core/src/test/java/org/apache/pinot/core/operator/dociditerators/SVScanDocIdIteratorTest.java
b/pinot-core/src/test/java/org/apache/pinot/core/operator/dociditerators/SVScanDocIdIteratorTest.java
new file mode 100644
index 00000000000..a2e499ee3da
--- /dev/null
+++
b/pinot-core/src/test/java/org/apache/pinot/core/operator/dociditerators/SVScanDocIdIteratorTest.java
@@ -0,0 +1,130 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.core.operator.dociditerators;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import org.apache.pinot.common.request.context.predicate.Predicate;
+import
org.apache.pinot.core.operator.filter.predicate.BaseDictionaryBasedPredicateEvaluator;
+import org.apache.pinot.segment.spi.Constants;
+import org.apache.pinot.segment.spi.index.reader.Dictionary;
+import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader;
+import org.apache.pinot.spi.data.FieldSpec.DataType;
+import org.mockito.Mockito;
+import org.testng.annotations.Test;
+
+import static org.testng.Assert.assertEquals;
+
+
+public class SVScanDocIdIteratorTest {
+
+ /// RAW forward index + separate dictionary + dictionary-based predicate
evaluator
+ /// (the layout used by external/iceberg-backed tables with
IFST/FST/inverted-needing-dict).
+ /// Without the DictLookupMatcher path, the scan would select
`StringMatcher` and call
+ /// `applySV(String)` on the dict-based evaluator, throwing
`UnsupportedOperationException`.
+ @Test
+ public void rawForwardWithDictAndDictBasedEvaluatorRoutesThroughDictLookup()
{
+ String[] values = {"apple", "banana", "cherry", "apple", "date", "banana"};
+ Map<String, Integer> dictEntries = new HashMap<>();
+ dictEntries.put("apple", 0);
+ dictEntries.put("banana", 1);
+ dictEntries.put("cherry", 2);
+ dictEntries.put("date", 3);
+
+ ForwardIndexReader reader = mockRawStringReader(values);
+ Dictionary dictionary = mockStringDictionary(dictEntries);
+ // Matches dict ids {0, 2} → values "apple" and "cherry".
+ BaseDictionaryBasedPredicateEvaluator evaluator = new
TestDictPredicateEvaluator(dictionary, 0, 2);
+
+ SVScanDocIdIterator iterator = new SVScanDocIdIterator(evaluator, reader,
values.length, dictionary);
+
+ assertEquals(iterator.next(), 0); // "apple"
+ assertEquals(iterator.next(), 2); // "cherry"
+ assertEquals(iterator.next(), 3); // "apple"
+ assertEquals(iterator.next(), Constants.EOF);
+ }
+
+ /// A raw value absent from the dictionary (`dict.indexOf(value)` returns
-1) must be treated as a no-match
+ /// without invoking `applySV` on the evaluator.
+ @Test
+ public void unknownValueDoesNotMatch() {
+ String[] values = {"apple", "ghost", "cherry"};
+ Map<String, Integer> dictEntries = new HashMap<>();
+ dictEntries.put("apple", 0);
+ dictEntries.put("cherry", 2);
+ // "ghost" is not present → dict.indexOf("ghost") == -1.
+
+ ForwardIndexReader reader = mockRawStringReader(values);
+ Dictionary dictionary = mockStringDictionary(dictEntries);
+ // Matches every known dict id; the only way "ghost" can match is if -1
leaks into applySV.
+ BaseDictionaryBasedPredicateEvaluator evaluator = new
TestDictPredicateEvaluator(dictionary, 0, 2);
+
+ SVScanDocIdIterator iterator = new SVScanDocIdIterator(evaluator, reader,
values.length, dictionary);
+
+ assertEquals(iterator.next(), 0);
+ assertEquals(iterator.next(), 2);
+ assertEquals(iterator.next(), Constants.EOF);
+ }
+
+ @SuppressWarnings({"rawtypes", "unchecked"})
+ private static ForwardIndexReader mockRawStringReader(String[] values) {
+ ForwardIndexReader reader = Mockito.mock(ForwardIndexReader.class);
+ Mockito.when(reader.isDictionaryEncoded()).thenReturn(false);
+ Mockito.when(reader.getStoredType()).thenReturn(DataType.STRING);
+ Mockito.when(reader.createContext()).thenReturn(null);
+ Mockito.when(reader.getString(Mockito.anyInt(),
Mockito.any())).thenAnswer(invocation -> {
+ int docId = invocation.getArgument(0);
+ return values[docId];
+ });
+ return reader;
+ }
+
+ private static Dictionary mockStringDictionary(Map<String, Integer> entries)
{
+ Dictionary dictionary = Mockito.mock(Dictionary.class);
+ Mockito.when(dictionary.length()).thenReturn(entries.size());
+
Mockito.when(dictionary.indexOf(Mockito.anyString())).thenAnswer(invocation -> {
+ String value = invocation.getArgument(0);
+ Integer id = entries.get(value);
+ return id != null ? id : -1;
+ });
+ return dictionary;
+ }
+
+ /// Minimal stand-in for IFST/FST/DictId-based regex evaluators: matches a
fixed set of dict ids via
+ /// `applySV(int dictId)`. Inherits the default `applySV(String)` that
throws `UnsupportedOperationException`,
+ /// so the test fails loudly if the scan picks the wrong matcher.
+ private static class TestDictPredicateEvaluator extends
BaseDictionaryBasedPredicateEvaluator {
+ private final Set<Integer> _matchingDictIds;
+
+ TestDictPredicateEvaluator(Dictionary dictionary, int... matchingDictIds) {
+ super(Mockito.mock(Predicate.class), dictionary);
+ _matchingDictIds = new HashSet<>();
+ for (int id : matchingDictIds) {
+ _matchingDictIds.add(id);
+ }
+ }
+
+ @Override
+ public boolean applySV(int dictId) {
+ return _matchingDictIds.contains(dictId);
+ }
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]