(pinot) branch master updated: Fix UnsupportedOperationException in SVScanDocIdIterator for RAW forward index + separate dictionary (#18579)

xiangfu Mon, 25 May 2026 20:32:10 -0700

This is an automated email from the ASF dual-hosted git repository.

xiangfu0 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git



The following commit(s) were added to refs/heads/master by this push:
     new acf9d4ff5d3 Fix UnsupportedOperationException in SVScanDocIdIterator 
for RAW forward index + separate dictionary (#18579)
acf9d4ff5d3 is described below

commit acf9d4ff5d32491d0a2d19055424e270999ae14a
Author: Chaitanya Deepthi <[email protected]>
AuthorDate: Mon May 25 20:31:55 2026 -0700

    Fix UnsupportedOperationException in SVScanDocIdIterator for RAW forward 
index + separate dictionary (#18579)
    
    * Fix UnsupportedOperationException in SVScanDocIdIterator when forward 
index is RAW + dictionary exists
    
    When a column has encodingType=RAW but a separate dictionary is built for 
secondary indexes
    (INVERTED, FST, IFST, RANGE), dict-based predicate evaluators created by 
FilterPlanNode
    (e.g. DictIdBasedRegexpLikePredicateEvaluator, 
IFSTBasedRegexpPredicateEvaluator) only
    implement applySV(int dictId). The previous fallback in 
SVScanDocIdIterator.getValueMatcher()
    selected typed raw matchers (e.g. StringMatcher) which call 
applySV(<rawType>) on the
    predicate evaluator — those overloads in 
BaseDictionaryBasedPredicateEvaluator throw
    UnsupportedOperationException, crashing queries such as
    `regexp_like(col, 'pattern', 'i')` or `LIKE 'pattern'` (which is internally 
case-insensitive)
    on external/iceberg-backed tables with this layout.
    
    Add a dict-lookup matcher per stored type that reads the raw value from the 
forward index,
    translates it to a dict id via the separate dictionary, and applies the 
predicate with
    applySV(int dictId). Selected only when (a) the forward index reports
    isDictionaryEncoded() == false, (b) the data source still exposes a 
non-null Dictionary,
    and (c) the predicate evaluator is a BaseDictionaryBasedPredicateEvaluator. 
Existing
    DictIdMatcher and typed raw matcher paths are unchanged.
    
    Co-Authored-By: Claude Opus 4.7 (1M context) <[email protected]>
    
    * Add SVScanDocIdIteratorTest for DictLookupMatcher path
    
    Covers the RAW forward index + separate dictionary + dict-based predicate 
evaluator
    configuration that previously triggered UnsupportedOperationException. The 
test stand-in
    evaluator inherits the default applySV(String) that throws, so any 
regression in the
    matcher-selection logic surfaces as a test failure rather than silent data 
corruption.
    
    Two cases:
    - Multiple matching dict ids return the correct doc ids in order.
    - A raw value absent from the dictionary (indexOf returns -1) is treated as 
no-match
      without invoking applySV on the evaluator.
    
    Co-Authored-By: Claude Opus 4.7 (1M context) <[email protected]>
    
    ---------
    
    Co-authored-by: Claude Opus 4.7 (1M context) <[email protected]>
---
 .../dociditerators/SVScanDocIdIterator.java        | 115 ++++++++++++++++--
 .../dociditerators/SVScanDocIdIteratorTest.java    | 130 +++++++++++++++++++++
 2 files changed, 237 insertions(+), 8 deletions(-)

diff --git 
a/pinot-core/src/main/java/org/apache/pinot/core/operator/dociditerators/SVScanDocIdIterator.java
 
b/pinot-core/src/main/java/org/apache/pinot/core/operator/dociditerators/SVScanDocIdIterator.java
index da718f279c9..6ea08e74e64 100644
--- 
a/pinot-core/src/main/java/org/apache/pinot/core/operator/dociditerators/SVScanDocIdIterator.java
+++ 
b/pinot-core/src/main/java/org/apache/pinot/core/operator/dociditerators/SVScanDocIdIterator.java
@@ -18,14 +18,19 @@
  */
 package org.apache.pinot.core.operator.dociditerators;
 
+import java.math.BigDecimal;
 import java.util.Map;
 import java.util.OptionalInt;
+import javax.annotation.Nullable;
 import org.apache.pinot.core.common.BlockDocIdIterator;
+import 
org.apache.pinot.core.operator.filter.predicate.BaseDictionaryBasedPredicateEvaluator;
 import org.apache.pinot.core.operator.filter.predicate.PredicateEvaluator;
 import org.apache.pinot.segment.spi.Constants;
 import org.apache.pinot.segment.spi.datasource.DataSource;
+import org.apache.pinot.segment.spi.index.reader.Dictionary;
 import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader;
 import org.apache.pinot.segment.spi.index.reader.ForwardIndexReaderContext;
+import org.apache.pinot.spi.utils.ByteArray;
 import org.roaringbitmap.BatchIterator;
 import org.roaringbitmap.RoaringBitmapWriter;
 import org.roaringbitmap.buffer.MutableRoaringBitmap;
@@ -40,6 +45,11 @@ public final class SVScanDocIdIterator implements 
ScanBasedDocIdIterator {
   private final PredicateEvaluator _predicateEvaluator;
   private final ForwardIndexReader _reader;
   private final ForwardIndexReaderContext _readerContext;
+  // Dictionary may exist even when the forward index is RAW (built only for 
secondary indexes such as INVERTED /
+  // FST / IFST / RANGE). When set together with a dictionary-based predicate 
evaluator, scans translate raw values
+  // to dict ids via this dictionary instead of calling applySV(<rawType>) on 
the evaluator (which would throw).
+  @Nullable
+  private final Dictionary _dictionary;
   private final int _numDocs;
   private final ValueMatcher _valueMatcher;
   private final int[] _batch;
@@ -56,6 +66,7 @@ public final class SVScanDocIdIterator implements 
ScanBasedDocIdIterator {
     _predicateEvaluator = predicateEvaluator;
     _reader = dataSource.getForwardIndex();
     _readerContext = _reader.createContext(queryOptions);
+    _dictionary = dataSource.getDictionary();
     _numDocs = numDocs;
     _valueMatcher = getValueMatcher();
     _cardinality = dataSource.getDataSourceMetadata().getCardinality();
@@ -63,10 +74,17 @@ public final class SVScanDocIdIterator implements 
ScanBasedDocIdIterator {
 
   // for testing
   public SVScanDocIdIterator(PredicateEvaluator predicateEvaluator, 
ForwardIndexReader reader, int numDocs) {
+    this(predicateEvaluator, reader, numDocs, null);
+  }
+
+  // for testing
+  public SVScanDocIdIterator(PredicateEvaluator predicateEvaluator, 
ForwardIndexReader reader, int numDocs,
+      @Nullable Dictionary dictionary) {
     _batch = new int[BlockDocIdIterator.OPTIMAL_ITERATOR_BATCH_SIZE];
     _predicateEvaluator = predicateEvaluator;
     _reader = reader;
     _readerContext = reader.createContext();
+    _dictionary = dictionary;
     _numDocs = numDocs;
     _valueMatcher = getValueMatcher();
     _cardinality = -1;
@@ -166,26 +184,48 @@ public final class SVScanDocIdIterator implements 
ScanBasedDocIdIterator {
   private ValueMatcher getValueMatcher() {
     if (_reader.isDictionaryEncoded()) {
       return new DictIdMatcher();
-    } else {
+    }
+    // RAW forward index, but a separate dictionary exists and the predicate 
evaluator is dict-based. Translate raw
+    // values to dict ids on read so applySV(int dictId) can be used. This is 
needed because dict-based evaluators
+    // (e.g. DictIdBasedRegexpLikePredicateEvaluator, 
IFSTBasedRegexpPredicateEvaluator) only implement applySV(int).
+    if (_dictionary != null && _predicateEvaluator instanceof 
BaseDictionaryBasedPredicateEvaluator) {
       switch (_reader.getStoredType()) {
         case INT:
-          return new IntMatcher();
+          return new IntDictLookupMatcher();
         case LONG:
-          return new LongMatcher();
+          return new LongDictLookupMatcher();
         case FLOAT:
-          return new FloatMatcher();
+          return new FloatDictLookupMatcher();
         case DOUBLE:
-          return new DoubleMatcher();
+          return new DoubleDictLookupMatcher();
         case BIG_DECIMAL:
-          return new BigDecimalMatcher();
+          return new BigDecimalDictLookupMatcher();
         case STRING:
-          return new StringMatcher();
+          return new StringDictLookupMatcher();
         case BYTES:
-          return new BytesMatcher();
+          return new BytesDictLookupMatcher();
         default:
           throw new UnsupportedOperationException();
       }
     }
+    switch (_reader.getStoredType()) {
+      case INT:
+        return new IntMatcher();
+      case LONG:
+        return new LongMatcher();
+      case FLOAT:
+        return new FloatMatcher();
+      case DOUBLE:
+        return new DoubleMatcher();
+      case BIG_DECIMAL:
+        return new BigDecimalMatcher();
+      case STRING:
+        return new StringMatcher();
+      case BYTES:
+        return new BytesMatcher();
+      default:
+        throw new UnsupportedOperationException();
+    }
   }
 
   private interface ValueMatcher {
@@ -317,6 +357,65 @@ public final class SVScanDocIdIterator implements 
ScanBasedDocIdIterator {
     }
   }
 
+  // ----- Dict-lookup matchers: forward index is RAW but a separate 
dictionary exists. -----
+
+  private class IntDictLookupMatcher implements ValueMatcher {
+    @Override
+    public boolean doesValueMatch(int docId) {
+      int dictId = _dictionary.indexOf(_reader.getInt(docId, _readerContext));
+      return dictId >= 0 && _predicateEvaluator.applySV(dictId);
+    }
+  }
+
+  private class LongDictLookupMatcher implements ValueMatcher {
+    @Override
+    public boolean doesValueMatch(int docId) {
+      int dictId = _dictionary.indexOf(_reader.getLong(docId, _readerContext));
+      return dictId >= 0 && _predicateEvaluator.applySV(dictId);
+    }
+  }
+
+  private class FloatDictLookupMatcher implements ValueMatcher {
+    @Override
+    public boolean doesValueMatch(int docId) {
+      int dictId = _dictionary.indexOf(_reader.getFloat(docId, 
_readerContext));
+      return dictId >= 0 && _predicateEvaluator.applySV(dictId);
+    }
+  }
+
+  private class DoubleDictLookupMatcher implements ValueMatcher {
+    @Override
+    public boolean doesValueMatch(int docId) {
+      int dictId = _dictionary.indexOf(_reader.getDouble(docId, 
_readerContext));
+      return dictId >= 0 && _predicateEvaluator.applySV(dictId);
+    }
+  }
+
+  private class BigDecimalDictLookupMatcher implements ValueMatcher {
+    @Override
+    public boolean doesValueMatch(int docId) {
+      BigDecimal value = _reader.getBigDecimal(docId, _readerContext);
+      int dictId = _dictionary.indexOf(value);
+      return dictId >= 0 && _predicateEvaluator.applySV(dictId);
+    }
+  }
+
+  private class StringDictLookupMatcher implements ValueMatcher {
+    @Override
+    public boolean doesValueMatch(int docId) {
+      int dictId = _dictionary.indexOf(_reader.getString(docId, 
_readerContext));
+      return dictId >= 0 && _predicateEvaluator.applySV(dictId);
+    }
+  }
+
+  private class BytesDictLookupMatcher implements ValueMatcher {
+    @Override
+    public boolean doesValueMatch(int docId) {
+      int dictId = _dictionary.indexOf(new ByteArray(_reader.getBytes(docId, 
_readerContext)));
+      return dictId >= 0 && _predicateEvaluator.applySV(dictId);
+    }
+  }
+
   @Override
   public void close() {
     if (_readerContext != null) {
diff --git 
a/pinot-core/src/test/java/org/apache/pinot/core/operator/dociditerators/SVScanDocIdIteratorTest.java
 
b/pinot-core/src/test/java/org/apache/pinot/core/operator/dociditerators/SVScanDocIdIteratorTest.java
new file mode 100644
index 00000000000..a2e499ee3da
--- /dev/null
+++ 
b/pinot-core/src/test/java/org/apache/pinot/core/operator/dociditerators/SVScanDocIdIteratorTest.java
@@ -0,0 +1,130 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.core.operator.dociditerators;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import org.apache.pinot.common.request.context.predicate.Predicate;
+import 
org.apache.pinot.core.operator.filter.predicate.BaseDictionaryBasedPredicateEvaluator;
+import org.apache.pinot.segment.spi.Constants;
+import org.apache.pinot.segment.spi.index.reader.Dictionary;
+import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader;
+import org.apache.pinot.spi.data.FieldSpec.DataType;
+import org.mockito.Mockito;
+import org.testng.annotations.Test;
+
+import static org.testng.Assert.assertEquals;
+
+
+public class SVScanDocIdIteratorTest {
+
+  /// RAW forward index + separate dictionary + dictionary-based predicate 
evaluator
+  /// (the layout used by external/iceberg-backed tables with 
IFST/FST/inverted-needing-dict).
+  /// Without the DictLookupMatcher path, the scan would select 
`StringMatcher` and call
+  /// `applySV(String)` on the dict-based evaluator, throwing 
`UnsupportedOperationException`.
+  @Test
+  public void rawForwardWithDictAndDictBasedEvaluatorRoutesThroughDictLookup() 
{
+    String[] values = {"apple", "banana", "cherry", "apple", "date", "banana"};
+    Map<String, Integer> dictEntries = new HashMap<>();
+    dictEntries.put("apple", 0);
+    dictEntries.put("banana", 1);
+    dictEntries.put("cherry", 2);
+    dictEntries.put("date", 3);
+
+    ForwardIndexReader reader = mockRawStringReader(values);
+    Dictionary dictionary = mockStringDictionary(dictEntries);
+    // Matches dict ids {0, 2} → values "apple" and "cherry".
+    BaseDictionaryBasedPredicateEvaluator evaluator = new 
TestDictPredicateEvaluator(dictionary, 0, 2);
+
+    SVScanDocIdIterator iterator = new SVScanDocIdIterator(evaluator, reader, 
values.length, dictionary);
+
+    assertEquals(iterator.next(), 0);   // "apple"
+    assertEquals(iterator.next(), 2);   // "cherry"
+    assertEquals(iterator.next(), 3);   // "apple"
+    assertEquals(iterator.next(), Constants.EOF);
+  }
+
+  /// A raw value absent from the dictionary (`dict.indexOf(value)` returns 
-1) must be treated as a no-match
+  /// without invoking `applySV` on the evaluator.
+  @Test
+  public void unknownValueDoesNotMatch() {
+    String[] values = {"apple", "ghost", "cherry"};
+    Map<String, Integer> dictEntries = new HashMap<>();
+    dictEntries.put("apple", 0);
+    dictEntries.put("cherry", 2);
+    // "ghost" is not present → dict.indexOf("ghost") == -1.
+
+    ForwardIndexReader reader = mockRawStringReader(values);
+    Dictionary dictionary = mockStringDictionary(dictEntries);
+    // Matches every known dict id; the only way "ghost" can match is if -1 
leaks into applySV.
+    BaseDictionaryBasedPredicateEvaluator evaluator = new 
TestDictPredicateEvaluator(dictionary, 0, 2);
+
+    SVScanDocIdIterator iterator = new SVScanDocIdIterator(evaluator, reader, 
values.length, dictionary);
+
+    assertEquals(iterator.next(), 0);
+    assertEquals(iterator.next(), 2);
+    assertEquals(iterator.next(), Constants.EOF);
+  }
+
+  @SuppressWarnings({"rawtypes", "unchecked"})
+  private static ForwardIndexReader mockRawStringReader(String[] values) {
+    ForwardIndexReader reader = Mockito.mock(ForwardIndexReader.class);
+    Mockito.when(reader.isDictionaryEncoded()).thenReturn(false);
+    Mockito.when(reader.getStoredType()).thenReturn(DataType.STRING);
+    Mockito.when(reader.createContext()).thenReturn(null);
+    Mockito.when(reader.getString(Mockito.anyInt(), 
Mockito.any())).thenAnswer(invocation -> {
+      int docId = invocation.getArgument(0);
+      return values[docId];
+    });
+    return reader;
+  }
+
+  private static Dictionary mockStringDictionary(Map<String, Integer> entries) 
{
+    Dictionary dictionary = Mockito.mock(Dictionary.class);
+    Mockito.when(dictionary.length()).thenReturn(entries.size());
+    
Mockito.when(dictionary.indexOf(Mockito.anyString())).thenAnswer(invocation -> {
+      String value = invocation.getArgument(0);
+      Integer id = entries.get(value);
+      return id != null ? id : -1;
+    });
+    return dictionary;
+  }
+
+  /// Minimal stand-in for IFST/FST/DictId-based regex evaluators: matches a 
fixed set of dict ids via
+  /// `applySV(int dictId)`. Inherits the default `applySV(String)` that 
throws `UnsupportedOperationException`,
+  /// so the test fails loudly if the scan picks the wrong matcher.
+  private static class TestDictPredicateEvaluator extends 
BaseDictionaryBasedPredicateEvaluator {
+    private final Set<Integer> _matchingDictIds;
+
+    TestDictPredicateEvaluator(Dictionary dictionary, int... matchingDictIds) {
+      super(Mockito.mock(Predicate.class), dictionary);
+      _matchingDictIds = new HashSet<>();
+      for (int id : matchingDictIds) {
+        _matchingDictIds.add(id);
+      }
+    }
+
+    @Override
+    public boolean applySV(int dictId) {
+      return _matchingDictIds.contains(dictId);
+    }
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(pinot) branch master updated: Fix UnsupportedOperationException in SVScanDocIdIterator for RAW forward index + separate dictionary (#18579)

Reply via email to