This is an automated email from the ASF dual-hosted git repository.
xiangfu0 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 11545db16de Fix GROUP BY on column with RAW forward index + dictionary
(#18500)
11545db16de is described below
commit 11545db16de09215d43cd76268200cab12b4f8f0
Author: Xiang Fu <[email protected]>
AuthorDate: Thu May 14 13:59:07 2026 -0700
Fix GROUP BY on column with RAW forward index + dictionary (#18500)
A column configured with EncodingType.RAW + dictionaryIndex has a
dictionary file on disk but a non-dict-encoded forward index. Inside
NoDictionaryMultiColumnGroupKeyGenerator, the per-column branch gated on
ColumnContext.getDictionary() != null and then called
BlockValSet.getDictionaryIdsSV() — which routes to
ForwardIndexReader.readDictIds and throws UnsupportedOperationException
on a RAW forward index.
DefaultGroupByExecutor already routes such columns to the no-dict
GroupByExecutor by checking forwardIndex.isDictionaryEncoded() in
addition to the dict presence. NoDictionaryMultiColumnGroupKeyGenerator
must apply the same check per column when choosing between the dict-id
path and the on-the-fly-dictionary path, so a single RAW+dict column
in a multi-column GROUP BY does not crash the query.
The dictionary itself remains exposed on ColumnContext and
ProjectionBlockValSet for callers that legitimately need value↔id
lookups on a RAW+dict column (e.g. inverted-index based filters that
already hold dict IDs); the high-level operator alone decides whether
the dict-id read path is usable.
Add a multi-column GROUP BY regression test in
RawForwardIndexWithDictionaryTest covering the exact
NoDictionaryMultiColumnGroupKeyGenerator path from the original stack
trace.
---
.../NoDictionaryMultiColumnGroupKeyGenerator.java | 23 ++++++++++++++++++++
.../custom/RawForwardIndexWithDictionaryTest.java | 25 ++++++++++++++++++++++
2 files changed, 48 insertions(+)
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/groupby/NoDictionaryMultiColumnGroupKeyGenerator.java
b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/groupby/NoDictionaryMultiColumnGroupKeyGenerator.java
index 26857e2dd25..8f588b49d9e 100644
---
a/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/groupby/NoDictionaryMultiColumnGroupKeyGenerator.java
+++
b/pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/groupby/NoDictionaryMultiColumnGroupKeyGenerator.java
@@ -32,7 +32,9 @@ import org.apache.pinot.core.operator.ColumnContext;
import org.apache.pinot.core.operator.blocks.ValueBlock;
import org.apache.pinot.core.query.aggregation.groupby.utils.ValueToIdMap;
import
org.apache.pinot.core.query.aggregation.groupby.utils.ValueToIdMapFactory;
+import org.apache.pinot.segment.spi.datasource.DataSource;
import org.apache.pinot.segment.spi.index.reader.Dictionary;
+import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader;
import org.apache.pinot.spi.data.FieldSpec.DataType;
import org.apache.pinot.spi.utils.ByteArray;
import org.apache.pinot.spi.utils.FixedIntArray;
@@ -78,7 +80,14 @@ public class NoDictionaryMultiColumnGroupKeyGenerator
implements GroupKeyGenerat
ExpressionContext groupByExpression = groupByExpressions[i];
ColumnContext columnContext =
projectOperator.getResultColumnContext(groupByExpression);
_storedTypes[i] = columnContext.getDataType().getStoredType();
+ // Only take the dict-id path when the column has a dictionary AND its
forward index is dict-encoded.
+ // A column can have a dictionary alongside a RAW forward index (e.g.
dict + inverted/range), in which case
+ // BlockValSet#getDictionaryIdsSV would route to
ForwardIndexReader#readDictIds and throw on the raw forward
+ // index. Fall back to an on-the-fly dictionary on raw values instead.
Dictionary dictionary = _nullHandlingEnabled ? null :
columnContext.getDictionary();
+ if (dictionary != null && !hasDictEncodedForwardIndex(columnContext)) {
+ dictionary = null;
+ }
if (dictionary != null) {
_dictionaries[i] = dictionary;
} else {
@@ -428,6 +437,20 @@ public class NoDictionaryMultiColumnGroupKeyGenerator
implements GroupKeyGenerat
return new GroupKeyIterator();
}
+ /**
+ * Returns {@code true} if the column has a dict-encoded forward index, i.e.
{@link BlockValSet#getDictionaryIdsSV}
+ * is callable. A column referenced by a transform (rather than directly)
has no underlying {@link DataSource}; in
+ * that case the transform builds its own dictionary on the fly, so the
dict-id path is always usable.
+ */
+ private static boolean hasDictEncodedForwardIndex(ColumnContext
columnContext) {
+ DataSource dataSource = columnContext.getDataSource();
+ if (dataSource == null) {
+ return true;
+ }
+ ForwardIndexReader<?> forwardIndex = dataSource.getForwardIndex();
+ return forwardIndex == null || forwardIndex.isDictionaryEncoded();
+ }
+
/**
* Helper method to get or create group-id for a group key.
*
diff --git
a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/custom/RawForwardIndexWithDictionaryTest.java
b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/custom/RawForwardIndexWithDictionaryTest.java
index eebde67c7f8..2fa08e27adf 100644
---
a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/custom/RawForwardIndexWithDictionaryTest.java
+++
b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/custom/RawForwardIndexWithDictionaryTest.java
@@ -517,6 +517,31 @@ public class RawForwardIndexWithDictionaryTest extends
CustomDataQueryClusterInt
assertEquals(rawRows, dictRows, "DISTINCT rows must match between
dictionary-only and raw+dictionary columns");
}
+ /**
+ * Multi-column GROUP BY that mixes a dict-encoded column with a
RAW+dictionary column. This forces the executor
+ * onto the {@code NoDictionaryMultiColumnGroupKeyGenerator} path. The
per-column branch there must check the
+ * forward-index encoding in addition to {@code ColumnContext#getDictionary}
— otherwise it keeps the dictionary
+ * for any column that has a dict file and calls {@code
BlockValSet#getDictionaryIdsSV()} on it, which routes to
+ * {@code ForwardIndexReader#readDictIds} and throws {@code
UnsupportedOperationException} on a RAW forward index.
+ */
+ @Test(dataProvider = "useBothQueryEngines")
+ public void
testMultiColumnGroupByWithRawDictColumnReturnsSameResults(boolean
useMultiStageQueryEngine)
+ throws Exception {
+ setUseMultiStageQueryEngine(useMultiStageQueryEngine);
+ JsonNode dictRows = postQuery(
+ String.format("SELECT %s, %s, COUNT(*) FROM %s GROUP BY %s, %s ORDER
BY %s, %s",
+ DICT_DIMENSION, DICT_INT_DIMENSION, getTableName(),
+ DICT_DIMENSION, DICT_INT_DIMENSION, DICT_DIMENSION,
DICT_INT_DIMENSION))
+ .get("resultTable").get("rows");
+ JsonNode rawRows = postQuery(
+ String.format("SELECT %s, %s, COUNT(*) FROM %s GROUP BY %s, %s ORDER
BY %s, %s",
+ RAW_DICT_DIMENSION, RAW_DICT_INT_DIMENSION, getTableName(),
+ RAW_DICT_DIMENSION, RAW_DICT_INT_DIMENSION, RAW_DICT_DIMENSION,
RAW_DICT_INT_DIMENSION))
+ .get("resultTable").get("rows");
+ assertEquals(rawRows, dictRows,
+ "Multi-column GROUP BY rows must match between dictionary-only and
raw+dictionary columns");
+ }
+
@Test(dataProvider = "useBothQueryEngines")
public void
testAggregationWithGroupByOnRawDictColumnReturnsSameResults(boolean
useMultiStageQueryEngine)
throws Exception {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]