Re: [PR] Refine index-based DISTINCT operators (JSON / inverted) [pinot]

via GitHub Tue, 26 May 2026 14:14:22 -0700


Copilot commented on code in PR #18588:
URL: https://github.com/apache/pinot/pull/18588#discussion_r3306883749



##########
pinot-core/src/main/java/org/apache/pinot/core/operator/query/JsonIndexDistinctOperator.java:
##########
@@ -50,516 +47,396 @@
 import org.apache.pinot.core.query.distinct.table.StringDistinctTable;
 import org.apache.pinot.core.query.request.context.QueryContext;
 import org.apache.pinot.segment.spi.IndexSegment;
-import org.apache.pinot.segment.spi.SegmentContext;
 import org.apache.pinot.segment.spi.datasource.DataSource;
 import org.apache.pinot.segment.spi.index.IndexService;
 import org.apache.pinot.segment.spi.index.IndexType;
 import org.apache.pinot.segment.spi.index.reader.JsonIndexReader;
-import org.apache.pinot.spi.data.FieldSpec;
+import org.apache.pinot.spi.data.FieldSpec.DataType;
 import org.apache.pinot.spi.query.QueryThreadContext;
-import org.apache.pinot.sql.parsers.CalciteSqlParser;
+import org.apache.pinot.spi.utils.JsonUtils;
 import org.roaringbitmap.RoaringBitmap;
 import org.roaringbitmap.buffer.ImmutableRoaringBitmap;
 
 
-/**
- * Distinct operator for the scalar {@code jsonExtractIndex(column, path, 
type[, defaultValue])} form.
- *
- * <p>Execution flow:
- * 1. Push a same-path {@code JSON_MATCH} predicate into the JSON-index lookup 
when it cannot match missing paths.
- * 2. Convert matching flattened doc ids back to segment doc ids.
- * 3. Apply any remaining row-level filter and materialize DISTINCT results, 
including missing-path handling.
- */
+/// Distinct operator for `jsonExtractIndex(column, path, type[, 
defaultValue[, filterJsonExpression]])`.
+///
+/// Supports both SV (e.g. `STRING`) and MV (e.g. `STRING_ARRAY`) result types 
— DISTINCT collapses MV array elements
+/// to scalar rows, matching the scan-based `SELECT DISTINCT mvCol` 
convention. The 4-arg default is a single value
+/// for SV; for MV it's a JSON array whose elements are each added to the 
distinct set when no doc matches the path.
+///
+/// Execution flow:
+/// 1. Pass the optional 5-arg `filterJsonExpression` directly to the 
JSON-index lookup (matches
+///    `JsonExtractIndexTransformFunction`'s convention).
+/// 2. Convert matching flattened doc ids back to segment doc ids.
+/// 3. Apply any remaining row-level WHERE filter and materialize DISTINCT 
results, including missing-path handling.
 public class JsonIndexDistinctOperator extends 
BaseOperator<DistinctResultsBlock> {
   private static final String EXPLAIN_NAME = "DISTINCT_JSON_INDEX";
   private static final String FUNCTION_NAME = "jsonExtractIndex";
 
+  /// Returns true if the expression is a `jsonExtractIndex` function call. 
All other validation (argument count/types,
+  /// column existence, JSON index presence, path support) happens inside the 
operator's constructor and matches what
+  /// the scan-based fallback (`JsonExtractIndexTransformFunction`) would 
surface during its own `init`.
+  public static boolean canUseJsonIndexDistinct(ExpressionContext expr) {
+    return expr.getType() == ExpressionContext.Type.FUNCTION && 
FUNCTION_NAME.equalsIgnoreCase(
+        expr.getFunction().getFunctionName());
+  }
+
   private final IndexSegment _indexSegment;
-  private final SegmentContext _segmentContext;
+  private final int _totalDocs;
   private final QueryContext _queryContext;
   private final BaseFilterOperator _filterOperator;
+  private final ExpressionContext _expression;
+  private final boolean _skipMissingPath;
+  private final JsonIndexReader _jsonIndexReader;
+  private final String _jsonPathString;
+  private final DataType _dataType;
+  @Nullable
+  private final String[] _defaultValueLiterals;
+  @Nullable
+  private final String _filterJsonExpression;
+  private final DataSchema _dataSchema;
+  @Nullable
+  private final OrderByExpressionContext _orderByExpression;
 
-  private int _numEntriesExamined = 0;
+  private int _numDocsScanned = 0;
   private long _numEntriesScannedInFilter = 0;
+  private int _numEntriesExaminedPostFilter = 0;
 
-  public JsonIndexDistinctOperator(IndexSegment indexSegment, SegmentContext 
segmentContext,
-      QueryContext queryContext, BaseFilterOperator filterOperator) {
+  public JsonIndexDistinctOperator(IndexSegment indexSegment, QueryContext 
queryContext,
+      BaseFilterOperator filterOperator) {
     _indexSegment = indexSegment;
-    _segmentContext = segmentContext;
+    _totalDocs = indexSegment.getSegmentMetadata().getTotalDocs();
     _queryContext = queryContext;
     _filterOperator = filterOperator;
-  }
-
-  @Override
-  protected DistinctResultsBlock getNextBlock() {
-    List<ExpressionContext> expressions = _queryContext.getSelectExpressions();
+    List<ExpressionContext> expressions = queryContext.getSelectExpressions();
     if (expressions.size() != 1) {
       throw new IllegalStateException("JsonIndexDistinctOperator supports 
single expression only");
     }
+    _expression = expressions.get(0);
+    _skipMissingPath = 
QueryOptionsUtils.isJsonIndexDistinctSkipMissingPath(queryContext.getQueryOptions());
 
-    ExpressionContext expr = expressions.get(0);
-    ParsedJsonExtractIndex parsed = parseJsonExtractIndex(expr);
-    if (parsed == null) {
-      throw new IllegalStateException("Expected 3/4-arg scalar 
jsonExtractIndex expression");
-    }
+    // Mirrors the arguments handling logic in 
`JsonExtractIndexTransformFunction`
 
-    DataSource dataSource = _indexSegment.getDataSource(parsed._columnName, 
_queryContext.getSchema());
-    JsonIndexReader jsonIndexReader = getJsonIndexReader(dataSource);
-    if (jsonIndexReader == null) {
-      throw new IllegalStateException("Column " + parsed._columnName + " has 
no JSON index");
+    List<ExpressionContext> arguments = 
_expression.getFunction().getArguments();
+    int numArguments = arguments.size();
+    // Check that there are exactly 3 or 4 or 5 arguments
+    if (numArguments < 3 || numArguments > 5) {
+      throw new IllegalArgumentException(
+          "Expected 3/4/5 arguments for jsonExtractIndex(jsonFieldName, 
'jsonPath', 'resultsType',"
+              + " ['defaultValue'], ['jsonFilterExpression'])");
     }
 
-    String pushedDownFilterJson = extractSamePathJsonMatchFilter(parsed, 
_queryContext.getFilter());
-    boolean filterFullyPushedDown = pushedDownFilterJson != null
-        && isOnlySamePathJsonMatchFilter(parsed, _queryContext.getFilter())
-        && !jsonMatchFilterCanMatchMissingPath(pushedDownFilterJson);
-
-    // Fast path: when the filter is fully pushed down into the JSON index, we 
only need the distinct value strings.
-    // This avoids reading posting lists, building per-value bitmaps, and 
converting flattened doc IDs.
-    if (filterFullyPushedDown) {
-      Set<String> distinctValues = jsonIndexReader.getMatchingDistinctValues(
-          parsed._jsonPathString, pushedDownFilterJson);
-      return buildDistinctResultsFromValues(expr, parsed, distinctValues);
+    ExpressionContext firstArgument = arguments.get(0);
+    if (firstArgument.getType() == ExpressionContext.Type.IDENTIFIER) {
+      DataSource dataSource = 
indexSegment.getDataSource(firstArgument.getIdentifier());
+      _jsonIndexReader = getJsonIndexReader(dataSource);
+      if (_jsonIndexReader == null) {
+        throw new IllegalStateException("jsonExtractIndex can only be applied 
on a column with JSON index");
+      }
+    } else {
+      throw new IllegalArgumentException("jsonExtractIndex can only be applied 
to a raw column");
     }
 
-    // Evaluate the filter first so we can skip the (potentially expensive) 
index map when no docs match.
-    RoaringBitmap filteredDocIds = buildFilteredDocIds();
-    if (filteredDocIds != null && filteredDocIds.isEmpty()) {
-      ColumnDataType earlyColumnDataType = 
ColumnDataType.fromDataTypeSV(parsed._dataType);
-      DataSchema earlyDataSchema = new DataSchema(
-          new String[]{expr.toString()},
-          new ColumnDataType[]{earlyColumnDataType});
-      OrderByExpressionContext earlyOrderBy = 
_queryContext.getOrderByExpressions() != null
-          ? _queryContext.getOrderByExpressions().get(0) : null;
-      return new DistinctResultsBlock(
-          createDistinctTable(earlyDataSchema, parsed._dataType, 
earlyOrderBy), _queryContext);
+    ExpressionContext secondArgument = arguments.get(1);
+    if (secondArgument.getType() != ExpressionContext.Type.LITERAL) {
+      throw new IllegalArgumentException("JSON path argument must be a 
literal");
+    }
+    _jsonPathString = secondArgument.getLiteral().getStringValue();
+    try {
+      JsonPathCache.INSTANCE.getOrCompute(_jsonPathString);
+    } catch (Exception e) {
+      throw new IllegalArgumentException("JSON path argument is not a valid 
JSON path");
     }

Review Comment:
   Now that `canUseJsonIndexDistinct` is a function-name-only gate, this 
constructor no longer checks `_jsonIndexReader.isPathIndexed(_jsonPathString)`. 
For JSON index readers with selective path support, `DistinctPlanNode` now 
routes non-indexed paths into this operator instead of falling back, so the 
operator can call `getMatchingFlattenedDocsMap` for a path the reader explicitly 
reports as unsupported. Please restore the indexed-path check here (or make the 
planner fall back when `isPathIndexed` returns false).
   



##########
pinot-core/src/main/java/org/apache/pinot/core/operator/query/JsonIndexDistinctOperator.java:
##########
@@ -50,516 +47,396 @@
 import org.apache.pinot.core.query.distinct.table.StringDistinctTable;
 import org.apache.pinot.core.query.request.context.QueryContext;
 import org.apache.pinot.segment.spi.IndexSegment;
-import org.apache.pinot.segment.spi.SegmentContext;
 import org.apache.pinot.segment.spi.datasource.DataSource;
 import org.apache.pinot.segment.spi.index.IndexService;
 import org.apache.pinot.segment.spi.index.IndexType;
 import org.apache.pinot.segment.spi.index.reader.JsonIndexReader;
-import org.apache.pinot.spi.data.FieldSpec;
+import org.apache.pinot.spi.data.FieldSpec.DataType;
 import org.apache.pinot.spi.query.QueryThreadContext;
-import org.apache.pinot.sql.parsers.CalciteSqlParser;
+import org.apache.pinot.spi.utils.JsonUtils;
 import org.roaringbitmap.RoaringBitmap;
 import org.roaringbitmap.buffer.ImmutableRoaringBitmap;
 
 
-/**
- * Distinct operator for the scalar {@code jsonExtractIndex(column, path, 
type[, defaultValue])} form.
- *
- * <p>Execution flow:
- * 1. Push a same-path {@code JSON_MATCH} predicate into the JSON-index lookup 
when it cannot match missing paths.
- * 2. Convert matching flattened doc ids back to segment doc ids.
- * 3. Apply any remaining row-level filter and materialize DISTINCT results, 
including missing-path handling.
- */
+/// Distinct operator for `jsonExtractIndex(column, path, type[, 
defaultValue[, filterJsonExpression]])`.
+///
+/// Supports both SV (e.g. `STRING`) and MV (e.g. `STRING_ARRAY`) result types 
— DISTINCT collapses MV array elements
+/// to scalar rows, matching the scan-based `SELECT DISTINCT mvCol` 
convention. The 4-arg default is a single value
+/// for SV; for MV it's a JSON array whose elements are each added to the 
distinct set when no doc matches the path.
+///
+/// Execution flow:
+/// 1. Pass the optional 5-arg `filterJsonExpression` directly to the 
JSON-index lookup (matches
+///    `JsonExtractIndexTransformFunction`'s convention).
+/// 2. Convert matching flattened doc ids back to segment doc ids.
+/// 3. Apply any remaining row-level WHERE filter and materialize DISTINCT 
results, including missing-path handling.
 public class JsonIndexDistinctOperator extends 
BaseOperator<DistinctResultsBlock> {
   private static final String EXPLAIN_NAME = "DISTINCT_JSON_INDEX";
   private static final String FUNCTION_NAME = "jsonExtractIndex";
 
+  /// Returns true if the expression is a `jsonExtractIndex` function call. 
All other validation (argument count/types,
+  /// column existence, JSON index presence, path support) happens inside the 
operator's constructor and matches what
+  /// the scan-based fallback (`JsonExtractIndexTransformFunction`) would 
surface during its own `init`.
+  public static boolean canUseJsonIndexDistinct(ExpressionContext expr) {
+    return expr.getType() == ExpressionContext.Type.FUNCTION && 
FUNCTION_NAME.equalsIgnoreCase(
+        expr.getFunction().getFunctionName());
+  }
+
   private final IndexSegment _indexSegment;
-  private final SegmentContext _segmentContext;
+  private final int _totalDocs;
   private final QueryContext _queryContext;
   private final BaseFilterOperator _filterOperator;
+  private final ExpressionContext _expression;
+  private final boolean _skipMissingPath;
+  private final JsonIndexReader _jsonIndexReader;
+  private final String _jsonPathString;
+  private final DataType _dataType;
+  @Nullable
+  private final String[] _defaultValueLiterals;
+  @Nullable
+  private final String _filterJsonExpression;
+  private final DataSchema _dataSchema;
+  @Nullable
+  private final OrderByExpressionContext _orderByExpression;
 
-  private int _numEntriesExamined = 0;
+  private int _numDocsScanned = 0;
   private long _numEntriesScannedInFilter = 0;
+  private int _numEntriesExaminedPostFilter = 0;
 
-  public JsonIndexDistinctOperator(IndexSegment indexSegment, SegmentContext 
segmentContext,
-      QueryContext queryContext, BaseFilterOperator filterOperator) {
+  public JsonIndexDistinctOperator(IndexSegment indexSegment, QueryContext 
queryContext,
+      BaseFilterOperator filterOperator) {
     _indexSegment = indexSegment;
-    _segmentContext = segmentContext;
+    _totalDocs = indexSegment.getSegmentMetadata().getTotalDocs();
     _queryContext = queryContext;
     _filterOperator = filterOperator;
-  }
-
-  @Override
-  protected DistinctResultsBlock getNextBlock() {
-    List<ExpressionContext> expressions = _queryContext.getSelectExpressions();
+    List<ExpressionContext> expressions = queryContext.getSelectExpressions();
     if (expressions.size() != 1) {
       throw new IllegalStateException("JsonIndexDistinctOperator supports 
single expression only");
     }
+    _expression = expressions.get(0);
+    _skipMissingPath = 
QueryOptionsUtils.isJsonIndexDistinctSkipMissingPath(queryContext.getQueryOptions());
 
-    ExpressionContext expr = expressions.get(0);
-    ParsedJsonExtractIndex parsed = parseJsonExtractIndex(expr);
-    if (parsed == null) {
-      throw new IllegalStateException("Expected 3/4-arg scalar 
jsonExtractIndex expression");
-    }
+    // Mirrors the arguments handling logic in 
`JsonExtractIndexTransformFunction`
 
-    DataSource dataSource = _indexSegment.getDataSource(parsed._columnName, 
_queryContext.getSchema());
-    JsonIndexReader jsonIndexReader = getJsonIndexReader(dataSource);
-    if (jsonIndexReader == null) {
-      throw new IllegalStateException("Column " + parsed._columnName + " has 
no JSON index");
+    List<ExpressionContext> arguments = 
_expression.getFunction().getArguments();
+    int numArguments = arguments.size();
+    // Check that there are exactly 3 or 4 or 5 arguments
+    if (numArguments < 3 || numArguments > 5) {
+      throw new IllegalArgumentException(
+          "Expected 3/4/5 arguments for jsonExtractIndex(jsonFieldName, 
'jsonPath', 'resultsType',"
+              + " ['defaultValue'], ['jsonFilterExpression'])");
     }
 
-    String pushedDownFilterJson = extractSamePathJsonMatchFilter(parsed, 
_queryContext.getFilter());
-    boolean filterFullyPushedDown = pushedDownFilterJson != null
-        && isOnlySamePathJsonMatchFilter(parsed, _queryContext.getFilter())
-        && !jsonMatchFilterCanMatchMissingPath(pushedDownFilterJson);
-
-    // Fast path: when the filter is fully pushed down into the JSON index, we 
only need the distinct value strings.
-    // This avoids reading posting lists, building per-value bitmaps, and 
converting flattened doc IDs.
-    if (filterFullyPushedDown) {
-      Set<String> distinctValues = jsonIndexReader.getMatchingDistinctValues(
-          parsed._jsonPathString, pushedDownFilterJson);
-      return buildDistinctResultsFromValues(expr, parsed, distinctValues);
+    ExpressionContext firstArgument = arguments.get(0);
+    if (firstArgument.getType() == ExpressionContext.Type.IDENTIFIER) {
+      DataSource dataSource = 
indexSegment.getDataSource(firstArgument.getIdentifier());
+      _jsonIndexReader = getJsonIndexReader(dataSource);
+      if (_jsonIndexReader == null) {
+        throw new IllegalStateException("jsonExtractIndex can only be applied 
on a column with JSON index");
+      }
+    } else {
+      throw new IllegalArgumentException("jsonExtractIndex can only be applied 
to a raw column");
     }
 
-    // Evaluate the filter first so we can skip the (potentially expensive) 
index map when no docs match.
-    RoaringBitmap filteredDocIds = buildFilteredDocIds();
-    if (filteredDocIds != null && filteredDocIds.isEmpty()) {
-      ColumnDataType earlyColumnDataType = 
ColumnDataType.fromDataTypeSV(parsed._dataType);
-      DataSchema earlyDataSchema = new DataSchema(
-          new String[]{expr.toString()},
-          new ColumnDataType[]{earlyColumnDataType});
-      OrderByExpressionContext earlyOrderBy = 
_queryContext.getOrderByExpressions() != null
-          ? _queryContext.getOrderByExpressions().get(0) : null;
-      return new DistinctResultsBlock(
-          createDistinctTable(earlyDataSchema, parsed._dataType, 
earlyOrderBy), _queryContext);
+    ExpressionContext secondArgument = arguments.get(1);
+    if (secondArgument.getType() != ExpressionContext.Type.LITERAL) {
+      throw new IllegalArgumentException("JSON path argument must be a 
literal");
+    }
+    _jsonPathString = secondArgument.getLiteral().getStringValue();
+    try {
+      JsonPathCache.INSTANCE.getOrCompute(_jsonPathString);
+    } catch (Exception e) {
+      throw new IllegalArgumentException("JSON path argument is not a valid 
JSON path");
     }
 
-    // All other WHERE filters remain row-level and are applied after 
converting flattened doc IDs to real doc IDs.
-    Map<String, RoaringBitmap> valueToMatchingDocs =
-        jsonIndexReader.getMatchingFlattenedDocsMap(parsed._jsonPathString, 
pushedDownFilterJson);
+    ExpressionContext thirdArgument = arguments.get(2);
+    if (thirdArgument.getType() != ExpressionContext.Type.LITERAL) {
+      throw new IllegalArgumentException("Result type argument must be a 
literal");
+    }
+    String resultsType = 
thirdArgument.getLiteral().getStringValue().toUpperCase();
+    boolean isSingleValue = !resultsType.endsWith("_ARRAY");
+    if (isSingleValue && _jsonPathString.contains("[*]")) {
+      throw new IllegalArgumentException(
+          "[*] syntax in json path is unsupported for singleValue field 
json_extract_index");
+    }
+    String dataTypeName = isSingleValue ? resultsType : 
resultsType.substring(0, resultsType.length() - 6);
+    try {
+      _dataType = DataType.valueOf(dataTypeName);
+    } catch (IllegalArgumentException e) {
+      throw new IllegalArgumentException("Unknown jsonExtractIndex result 
type: " + resultsType);
+    }
+    switch (_dataType) {
+      case INT:
+      case LONG:
+      case FLOAT:
+      case DOUBLE:
+      case BIG_DECIMAL:
+      case STRING:
+        break;
+      default:
+        throw new IllegalArgumentException("Unsupported jsonExtractIndex 
result type for distinct: " + _dataType);
+    }
 
-    // Always single-value (MV _ARRAY is rejected in parseJsonExtractIndex)
-    jsonIndexReader.convertFlattenedDocIdsToDocIds(valueToMatchingDocs);
-    return buildDistinctResultsBlock(expr, parsed, valueToMatchingDocs, 
filteredDocIds,
-        filteredDocIds == null);
-  }
+    // With _skipMissingPath, the 4-arg default is never used at runtime 
(handleMissingDocs is bypassed), so don't
+    // parse or validate it — accept any literal shape and ignore it.
+    if (numArguments >= 4 && !_skipMissingPath) {
+      ExpressionContext fourthArgument = arguments.get(3);
+      if (fourthArgument.getType() != ExpressionContext.Type.LITERAL) {
+        throw new IllegalArgumentException("Default value must be a literal");
+      }
+      String defaultLiteral = fourthArgument.getLiteral().getStringValue();
+      if (isSingleValue) {
+        try {
+          _dataType.convert(defaultLiteral);
+        } catch (Exception e) {
+          throw new IllegalArgumentException("Default value '" + 
defaultLiteral + "' is not a valid " + _dataType);
+        }
+        _defaultValueLiterals = new String[]{defaultLiteral};
+      } else {
+        try {
+          JsonNode mvArray = JsonUtils.stringToJsonNode(defaultLiteral);
+          if (!mvArray.isArray()) {
+            throw new IllegalArgumentException("Default value must be a valid 
JSON array");
+          }
+          String[] literals = new String[mvArray.size()];
+          for (int i = 0; i < mvArray.size(); i++) {
+            literals[i] = mvArray.get(i).asText();
+            try {
+              _dataType.convert(literals[i]);
+            } catch (Exception e) {
+              throw new IllegalArgumentException("Default value '" + 
literals[i] + "' is not a valid " + _dataType);
+            }
+          }
+          _defaultValueLiterals = literals;
+        } catch (IOException e) {
+          throw new IllegalArgumentException("Default value must be a valid 
JSON array");
+        }
+      }
+    } else {
+      _defaultValueLiterals = null;
+    }
 
-  private DistinctResultsBlock 
buildDistinctResultsFromValues(ExpressionContext expr, ParsedJsonExtractIndex 
parsed,
-      Set<String> distinctValues) {
-    ColumnDataType columnDataType = 
ColumnDataType.fromDataTypeSV(parsed._dataType);
-    DataSchema dataSchema = new DataSchema(
-        new String[]{expr.toString()},
-        new ColumnDataType[]{columnDataType});
-    OrderByExpressionContext orderByExpression = 
_queryContext.getOrderByExpressions() != null
-        ? _queryContext.getOrderByExpressions().get(0) : null;
-    DistinctTable distinctTable = createDistinctTable(dataSchema, 
parsed._dataType, orderByExpression);
-    int limit = _queryContext.getLimit();
+    if (numArguments == 5) {
+      ExpressionContext fifthArgument = arguments.get(4);
+      if (fifthArgument.getType() != ExpressionContext.Type.LITERAL) {
+        throw new IllegalArgumentException("JSON path filter argument must be 
a literal");
+      }
+      _filterJsonExpression = fifthArgument.getLiteral().getStringValue();
+    } else {
+      _filterJsonExpression = null;
+    }
 
-    for (String value : distinctValues) {
-      _numEntriesExamined++;
-      
QueryThreadContext.checkTerminationAndSampleUsagePeriodically(_numEntriesExamined,
 EXPLAIN_NAME);
+    _dataSchema = new DataSchema(new String[]{_expression.toString()},
+        new ColumnDataType[]{ColumnDataType.fromDataTypeSV(_dataType)});
+    List<OrderByExpressionContext> orderByExpressions = 
queryContext.getOrderByExpressions();
+    _orderByExpression = orderByExpressions != null ? 
orderByExpressions.get(0) : null;
+  }
 
-      boolean done = addValueToDistinctTable(distinctTable, value, 
parsed._dataType, orderByExpression);
-      if (done) {
-        break;
-      }
-      if (orderByExpression == null && distinctTable.hasLimit() && 
distinctTable.size() >= limit) {
-        break;
+  @Nullable
+  private static JsonIndexReader getJsonIndexReader(DataSource dataSource) {
+    JsonIndexReader reader = dataSource.getJsonIndex();
+    // TODO: rework
+    if (reader == null) {
+      Optional<IndexType<?, ?, ?>> compositeIndex = 
IndexService.getInstance().getOptional("composite_json_index");
+      if (compositeIndex.isPresent()) {
+        reader = (JsonIndexReader) dataSource.getIndex(compositeIndex.get());
       }
     }
+    return reader;
+  }
 
-    return new DistinctResultsBlock(distinctTable, _queryContext);
+  @Override
+  protected DistinctResultsBlock getNextBlock() {
+    // Evaluate the filter first so we can skip the (potentially expensive) 
index map when no docs match.
+    BaseFilterOperator.FilteredDocIds filteredDocIds = 
_filterOperator.getFilteredDocIds();
+    ImmutableRoaringBitmap docIds = filteredDocIds.getDocIds();
+    _numDocsScanned = docIds != null ? docIds.getCardinality() : _totalDocs;
+    _numEntriesScannedInFilter = filteredDocIds.getNumEntriesScannedInFilter();
+    if (_numDocsScanned == 0) {
+      return new DistinctResultsBlock(createDistinctTable(), _queryContext);
+    }
+
+    // The 5-arg form's filter literal is pushed into the JSON index; 
WHERE-clause filters remain row-level and are
+    // applied after converting flattened doc IDs to real doc IDs.
+    Map<String, RoaringBitmap> valueToMatchingDocs =
+        _jsonIndexReader.getMatchingFlattenedDocsMap(_jsonPathString, 
_filterJsonExpression);
+    _jsonIndexReader.convertFlattenedDocIdsToDocIds(valueToMatchingDocs);

Review Comment:
   The constructor now accepts `_ARRAY` result types, but `getNextBlock` always 
converts the value posting lists from flattened doc IDs to real doc IDs. 
`JsonExtractIndexTransformFunction` deliberately skips that conversion for MV 
results because MV extraction needs the flattened posting lists; after 
conversion, a value that occurs multiple times in one document collapses to a 
single real doc id, and intersections/default handling no longer match the scan 
path. Gate this conversion on the single-value case, or keep rejecting `_ARRAY` 
types until the MV path can preserve flattened IDs.
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Refine index-based DISTINCT operators (JSON / inverted) [pinot]

Reply via email to