This is an automated email from the ASF dual-hosted git repository.
Jackie-Jiang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 15a4da0267e Fall back to raw-value REGEXP_LIKE evaluator when no
dict-consuming index is available (#18599)
15a4da0267e is described below
commit 15a4da0267e683d082d4bfb66ba89cf7d2c065e2
Author: Chaitanya Deepthi <[email protected]>
AuthorDate: Thu May 28 11:39:54 2026 -0700
Fall back to raw-value REGEXP_LIKE evaluator when no dict-consuming index
is available (#18599)
---
.../predicate/PredicateEvaluatorProvider.java | 32 +++--
.../org/apache/pinot/core/plan/FilterPlanNode.java | 28 +----
.../apache/pinot/core/plan/FilterPlanNodeTest.java | 138 +++++++++++++++++++++
3 files changed, 166 insertions(+), 32 deletions(-)
diff --git a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/PredicateEvaluatorProvider.java b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/PredicateEvaluatorProvider.java
index 9a4662d99bb..822a61b05c9 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/PredicateEvaluatorProvider.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/PredicateEvaluatorProvider.java
@@ -44,26 +44,28 @@ public class PredicateEvaluatorProvider {
/// Builds a [PredicateEvaluator] for a leaf filter on the column backed by
`dataSource`. The dictionary is derived
/// via [#getDictionaryUsableForFiltering], which keeps it only when a
dict-consuming filter operator (inverted /
/// exact range) will actually run for this predicate type on the column's
forward-index encoding. The data type is
- /// taken from the data-source metadata.
+ /// taken from the data-source metadata. For REGEXP_LIKE, the FST/IFST text
index (when present) is consulted here
+ /// — the upgrade happens only when the dictionary is usable, so no
evaluator is built and discarded.
public static PredicateEvaluator getPredicateEvaluator(Predicate predicate,
DataSource dataSource,
QueryContext queryContext) {
Dictionary dictionary = getDictionaryUsableForFiltering(dataSource,
queryContext, predicate);
DataType dataType = dataSource.getDataSourceMetadata().getDataType();
- return buildEvaluator(predicate, dictionary, dataType, queryContext);
+ return buildEvaluator(predicate, dictionary, dataType, queryContext,
dataSource);
}
/// Builds a [PredicateEvaluator] when the value source and `dictionary` are
already in sync by construction: when
/// `dictionary` is non-null the source produces dict ids decodable by that
dictionary; when `dictionary` is null
/// the source produces raw values. No gating logic runs — the dictionary
(if any) is taken as-is, so the caller is
- /// responsible for the match.
+ /// responsible for the match. FST/IFST evaluators are not considered here
since this overload has no `DataSource`
+ /// to read text indexes from.
// TODO: Always pass in query context
public static PredicateEvaluator getPredicateEvaluator(Predicate predicate,
@Nullable Dictionary dictionary,
DataType dataType, @Nullable QueryContext queryContext) {
- return buildEvaluator(predicate, dictionary, dataType, queryContext);
+ return buildEvaluator(predicate, dictionary, dataType, queryContext, null);
}
private static PredicateEvaluator buildEvaluator(Predicate predicate,
@Nullable Dictionary dictionary,
- DataType dataType, @Nullable QueryContext queryContext) {
+ DataType dataType, @Nullable QueryContext queryContext, @Nullable
DataSource dataSource) {
try {
if (dictionary != null) {
// dictionary based predicate evaluators
@@ -83,9 +85,23 @@ public class PredicateEvaluatorProvider {
case RANGE:
return
RangePredicateEvaluatorFactory.newDictionaryBasedEvaluator((RangePredicate)
predicate, dictionary,
dataType);
- case REGEXP_LIKE:
- return
RegexpLikePredicateEvaluatorFactory.newDictionaryBasedEvaluator((RegexpLikePredicate)
predicate,
- dictionary, dataType, queryContext);
+ case REGEXP_LIKE: {
+ // Prefer FST/IFST text index when present on the data source;
otherwise fall back to the generic
+ // dict-based evaluator (dict-id scan or eager dict iteration).
+ RegexpLikePredicate regexpLike = (RegexpLikePredicate) predicate;
+ if (dataSource != null) {
+ if (regexpLike.isCaseInsensitive() && dataSource.getIFSTIndex()
!= null) {
+ return
IFSTBasedRegexpPredicateEvaluatorFactory.newIFSTBasedEvaluator(regexpLike,
+ dataSource.getIFSTIndex(), dictionary);
+ }
+ if (!regexpLike.isCaseInsensitive() && dataSource.getFSTIndex()
!= null) {
+ return
FSTBasedRegexpPredicateEvaluatorFactory.newFSTBasedEvaluator(regexpLike,
+ dataSource.getFSTIndex(), dictionary);
+ }
+ }
+ return
RegexpLikePredicateEvaluatorFactory.newDictionaryBasedEvaluator(regexpLike,
dictionary, dataType,
+ queryContext);
+ }
default:
throw new UnsupportedOperationException("Unsupported predicate
type: " + predicate.getType());
}
diff --git a/pinot-core/src/main/java/org/apache/pinot/core/plan/FilterPlanNode.java b/pinot-core/src/main/java/org/apache/pinot/core/plan/FilterPlanNode.java
index 9074ff6323f..b69d77bcd2f 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/plan/FilterPlanNode.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/plan/FilterPlanNode.java
@@ -30,7 +30,6 @@ import org.apache.pinot.common.request.context.FilterContext;
import org.apache.pinot.common.request.context.FunctionContext;
import org.apache.pinot.common.request.context.predicate.JsonMatchPredicate;
import org.apache.pinot.common.request.context.predicate.Predicate;
-import org.apache.pinot.common.request.context.predicate.RegexpLikePredicate;
import org.apache.pinot.common.request.context.predicate.TextMatchPredicate;
import
org.apache.pinot.common.request.context.predicate.VectorSimilarityPredicate;
import
org.apache.pinot.common.request.context.predicate.VectorSimilarityRadiusPredicate;
@@ -54,8 +53,6 @@ import org.apache.pinot.core.operator.filter.VectorSearchMode;
import org.apache.pinot.core.operator.filter.VectorSearchParams;
import org.apache.pinot.core.operator.filter.VectorSearchStrategy;
import org.apache.pinot.core.operator.filter.VectorSimilarityFilterOperator;
-import
org.apache.pinot.core.operator.filter.predicate.FSTBasedRegexpPredicateEvaluatorFactory;
-import
org.apache.pinot.core.operator.filter.predicate.IFSTBasedRegexpPredicateEvaluatorFactory;
import org.apache.pinot.core.operator.filter.predicate.PredicateEvaluator;
import
org.apache.pinot.core.operator.filter.predicate.PredicateEvaluatorProvider;
import org.apache.pinot.core.operator.transform.function.ItemTransformFunction;
@@ -306,27 +303,10 @@ public class FilterPlanNode implements PlanNode {
return new TextMatchFilterOperator(textIndexReader,
(TextMatchPredicate) predicate, numDocs);
}
case REGEXP_LIKE:
- // Check if case-insensitive flag is present
- RegexpLikePredicate regexpLikePredicate = (RegexpLikePredicate)
predicate;
- boolean caseInsensitive =
regexpLikePredicate.isCaseInsensitive();
- if (caseInsensitive) {
- if (dataSource.getIFSTIndex() != null) {
- predicateEvaluator =
-
IFSTBasedRegexpPredicateEvaluatorFactory.newIFSTBasedEvaluator(regexpLikePredicate,
- dataSource.getIFSTIndex(),
dataSource.getDictionary());
- } else {
- predicateEvaluator =
-
PredicateEvaluatorProvider.getPredicateEvaluator(predicate, dataSource,
_queryContext);
- }
- } else {
- if (dataSource.getFSTIndex() != null) {
- predicateEvaluator =
FSTBasedRegexpPredicateEvaluatorFactory.newFSTBasedEvaluator(regexpLikePredicate,
- dataSource.getFSTIndex(), dataSource.getDictionary());
- } else {
- predicateEvaluator =
-
PredicateEvaluatorProvider.getPredicateEvaluator(predicate, dataSource,
_queryContext);
- }
- }
+ // PredicateEvaluatorProvider handles FST/IFST upgrade
internally when the dictionary is usable for
+ // filtering and a matching text index exists on the data source.
+ predicateEvaluator =
+ PredicateEvaluatorProvider.getPredicateEvaluator(predicate,
dataSource, _queryContext);
_predicateEvaluators.add(Pair.of(predicate, predicateEvaluator));
return FilterOperatorUtils.getLeafFilterOperator(_queryContext,
predicateEvaluator, dataSource, numDocs);
case JSON_MATCH:
diff --git a/pinot-core/src/test/java/org/apache/pinot/core/plan/FilterPlanNodeTest.java b/pinot-core/src/test/java/org/apache/pinot/core/plan/FilterPlanNodeTest.java
index 689db9e2fef..3faced128bb 100644
--- a/pinot-core/src/test/java/org/apache/pinot/core/plan/FilterPlanNodeTest.java
+++ b/pinot-core/src/test/java/org/apache/pinot/core/plan/FilterPlanNodeTest.java
@@ -20,24 +20,44 @@ package org.apache.pinot.core.plan;
import java.lang.reflect.Method;
import java.util.concurrent.atomic.AtomicInteger;
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.pinot.common.request.context.ExpressionContext;
+import org.apache.pinot.common.request.context.FilterContext;
+import org.apache.pinot.common.request.context.predicate.Predicate;
+import org.apache.pinot.common.request.context.predicate.RegexpLikePredicate;
import org.apache.pinot.core.common.BlockDocIdIterator;
import org.apache.pinot.core.common.BlockDocIdSet;
import org.apache.pinot.core.operator.blocks.FilterBlock;
import org.apache.pinot.core.operator.filter.BaseFilterOperator;
+import
org.apache.pinot.core.operator.filter.predicate.BaseDictIdBasedRegexpLikePredicateEvaluator;
+import org.apache.pinot.core.operator.filter.predicate.PredicateEvaluator;
import org.apache.pinot.core.query.request.context.QueryContext;
import org.apache.pinot.segment.local.upsert.UpsertUtils;
import org.apache.pinot.segment.spi.Constants;
import org.apache.pinot.segment.spi.IndexSegment;
import org.apache.pinot.segment.spi.SegmentContext;
import org.apache.pinot.segment.spi.SegmentMetadata;
+import org.apache.pinot.segment.spi.datasource.DataSource;
+import org.apache.pinot.segment.spi.datasource.DataSourceMetadata;
import org.apache.pinot.segment.spi.index.creator.VectorIndexConfig;
import
org.apache.pinot.segment.spi.index.mutable.ThreadSafeMutableRoaringBitmap;
+import org.apache.pinot.segment.spi.index.reader.Dictionary;
+import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader;
+import org.apache.pinot.segment.spi.index.reader.InvertedIndexReader;
+import org.apache.pinot.segment.spi.index.reader.TextIndexReader;
+import org.apache.pinot.spi.config.table.FieldConfig;
+import org.apache.pinot.spi.data.DimensionFieldSpec;
+import org.apache.pinot.spi.data.FieldSpec.DataType;
+import org.mockito.Mockito;
import org.mockito.stubbing.Answer;
+import org.roaringbitmap.buffer.ImmutableRoaringBitmap;
import org.testng.annotations.Test;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertFalse;
+import static org.testng.Assert.assertTrue;
public class FilterPlanNodeTest {
@@ -133,4 +153,122 @@ public class FilterPlanNodeTest {
}
return numDocsFiltered;
}
+
+ @Test
+ public void regexpLikeUsesIFSTEvaluatorWhenIFSTAndInvertedAvailable()
+ throws Exception {
+ PredicateEvaluator evaluator = runRegexpLikeAndGetEvaluator(
+ true, true, false, true, false, true);
+ assertTrue(evaluator.isDictionaryBased());
+ assertTrue(evaluator instanceof
BaseDictIdBasedRegexpLikePredicateEvaluator);
+ }
+
+ @Test
+ public void regexpLikeFallsBackToRawWhenIFSTPresentButNoDictConsumer()
+ throws Exception {
+ PredicateEvaluator evaluator = runRegexpLikeAndGetEvaluator(
+ true, true, false, true, false, false);
+ assertFalse(evaluator.isDictionaryBased());
+ }
+
+ @Test
+ public void regexpLikeUsesFSTEvaluatorWhenFSTAndInvertedAvailable()
+ throws Exception {
+ PredicateEvaluator evaluator = runRegexpLikeAndGetEvaluator(
+ false, false, true, true, false, true);
+ assertTrue(evaluator.isDictionaryBased());
+ assertTrue(evaluator instanceof
BaseDictIdBasedRegexpLikePredicateEvaluator);
+ }
+
+ @Test
+ public void regexpLikeFallsBackToRawWhenFSTPresentButNoDictConsumer()
+ throws Exception {
+ PredicateEvaluator evaluator = runRegexpLikeAndGetEvaluator(
+ false, false, true, true, false, false);
+ assertFalse(evaluator.isDictionaryBased());
+ }
+
+ @Test
+ public void regexpLikeUsesIFSTEvaluatorWhenIFSTAndDictEncodedForward()
+ throws Exception {
+ PredicateEvaluator evaluator = runRegexpLikeAndGetEvaluator(
+ true, true, false, true, true, false);
+ assertTrue(evaluator.isDictionaryBased());
+ }
+
+ private PredicateEvaluator runRegexpLikeAndGetEvaluator(boolean
caseInsensitive, boolean hasIFST, boolean hasFST,
+ boolean hasDictionary, boolean forwardDictEncoded, boolean hasInverted)
+ throws Exception {
+ String column = "col";
+ DataSource dataSource =
+ mockStringDataSource(column, hasIFST, hasFST, hasDictionary,
forwardDictEncoded, hasInverted);
+ RegexpLikePredicate predicate = caseInsensitive
+ ? new RegexpLikePredicate(ExpressionContext.forIdentifier(column),
"pat", "i")
+ : new RegexpLikePredicate(ExpressionContext.forIdentifier(column),
"pat");
+ FilterContext filterContext = FilterContext.forPredicate(predicate);
+
+ IndexSegment segment = mock(IndexSegment.class);
+ SegmentMetadata segmentMetadata = mock(SegmentMetadata.class);
+ when(segmentMetadata.getTotalDocs()).thenReturn(1);
+ when(segment.getSegmentMetadata()).thenReturn(segmentMetadata);
+ when(segment.getDataSource(Mockito.eq(column),
Mockito.any())).thenReturn(dataSource);
+
+ QueryContext queryContext = mock(QueryContext.class);
+ when(queryContext.getFilter()).thenReturn(filterContext);
+ when(queryContext.isIndexUseAllowed(Mockito.any(DataSource.class),
Mockito.any(FieldConfig.IndexType.class)))
+ .thenReturn(true);
+
+ SegmentContext segmentContext = new SegmentContext(segment);
+
+ FilterPlanNode planNode = new FilterPlanNode(segmentContext, queryContext);
+ try {
+ planNode.run();
+ } catch (Exception ignored) {
+ }
+
+ Pair<Predicate, PredicateEvaluator> pair =
planNode.getPredicateEvaluators().get(0);
+ return pair.getRight();
+ }
+
+ @SuppressWarnings({"rawtypes", "unchecked"})
+ private static DataSource mockStringDataSource(String column, boolean
hasIFST, boolean hasFST,
+ boolean hasDictionary, boolean forwardDictEncoded, boolean hasInverted) {
+ DataSource dataSource = Mockito.mock(DataSource.class);
+ DataSourceMetadata metadata = Mockito.mock(DataSourceMetadata.class);
+ when(metadata.getDataType()).thenReturn(DataType.STRING);
+ when(metadata.isSorted()).thenReturn(false);
+ when(metadata.getFieldSpec()).thenReturn(new DimensionFieldSpec(column,
DataType.STRING, true));
+ when(dataSource.getDataSourceMetadata()).thenReturn(metadata);
+ when(dataSource.getColumnName()).thenReturn(column);
+
+ ForwardIndexReader forwardIndex = Mockito.mock(ForwardIndexReader.class);
+ when(forwardIndex.isDictionaryEncoded()).thenReturn(forwardDictEncoded);
+ when(forwardIndex.getStoredType()).thenReturn(DataType.STRING);
+ when(dataSource.getForwardIndex()).thenReturn(forwardIndex);
+
+ if (hasDictionary) {
+ Dictionary dictionary = Mockito.mock(Dictionary.class);
+ when(dictionary.length()).thenReturn(0);
+ when(dataSource.getDictionary()).thenReturn(dictionary);
+ } else {
+ when(dataSource.getDictionary()).thenReturn(null);
+ }
+
+ InvertedIndexReader invertedReader = hasInverted ?
Mockito.mock(InvertedIndexReader.class) : null;
+ TextIndexReader ifstReader = hasIFST ? mockTextIndexReader() : null;
+ TextIndexReader fstReader = hasFST ? mockTextIndexReader() : null;
+ when(dataSource.getInvertedIndex()).thenReturn(invertedReader);
+ when(dataSource.getRangeIndex()).thenReturn(null);
+ when(dataSource.getIFSTIndex()).thenReturn(ifstReader);
+ when(dataSource.getFSTIndex()).thenReturn(fstReader);
+
+ return dataSource;
+ }
+
+ private static TextIndexReader mockTextIndexReader() {
+ TextIndexReader reader = Mockito.mock(TextIndexReader.class);
+ ImmutableRoaringBitmap emptyBitmap = ImmutableRoaringBitmap.bitmapOf();
+ when(reader.getDictIds(Mockito.anyString())).thenReturn(emptyBitmap);
+ return reader;
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]