Jackie-Jiang commented on code in PR #17820: URL: https://github.com/apache/pinot/pull/17820#discussion_r2897880221
########## pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/InvertedIndexDistinctOperatorIntegrationTest.java: ########## @@ -0,0 +1,138 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.integration.tests; + +import com.fasterxml.jackson.databind.JsonNode; +import java.io.File; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import org.apache.pinot.spi.config.table.TableConfig; +import org.apache.pinot.spi.data.Schema; +import org.apache.pinot.spi.utils.CommonConstants.Broker.Request.QueryOptionKey; +import org.apache.pinot.util.TestUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + + +/** + * Integration tests for InvertedIndexDistinctOperator. + * <p> + * InvertedIndexDistinctOperator is disabled by default and must be enabled via query option + * {@code useInvertedIndexDistinct=true}. + * </p> + * Tests both Single-Stage Engine (SSE) and Multi-Stage Engine (MSE). + */ +public class InvertedIndexDistinctOperatorIntegrationTest extends BaseClusterIntegrationTestSet { Review Comment: Can we integrate this into an existing integration test? Setting up the cluster is quite slow ########## pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/InvertedIndexDistinctOperatorIntegrationTest.java: ########## @@ -0,0 +1,138 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.integration.tests; + +import com.fasterxml.jackson.databind.JsonNode; +import java.io.File; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import org.apache.pinot.spi.config.table.TableConfig; +import org.apache.pinot.spi.data.Schema; +import org.apache.pinot.spi.utils.CommonConstants.Broker.Request.QueryOptionKey; +import org.apache.pinot.util.TestUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + + +/** + * Integration tests for InvertedIndexDistinctOperator. + * <p> + * InvertedIndexDistinctOperator is disabled by default and must be enabled via query option + * {@code useInvertedIndexDistinct=true}. + * </p> + * Tests both Single-Stage Engine (SSE) and Multi-Stage Engine (MSE). + */ +public class InvertedIndexDistinctOperatorIntegrationTest extends BaseClusterIntegrationTestSet { + + @BeforeClass + public void setUp() + throws Exception { + TestUtils.ensureDirectoriesExistAndEmpty(_tempDir, _segmentDir, _tarDir); + + startZk(); + startController(); + startBroker(); + startServer(); + + // Create and upload schema and table config (uses default On_Time data with inverted index on Origin) + Schema schema = createSchema(); + addSchema(schema); + TableConfig tableConfig = createOfflineTableConfig(); + addTableConfig(tableConfig); + + List<File> avroFiles = unpackAvroData(_tempDir); + ClusterIntegrationTestUtils.buildSegmentsFromAvro(avroFiles, tableConfig, schema, 0, _segmentDir, _tarDir); + uploadSegments(getTableName(), _tarDir); + + setUpH2Connection(avroFiles); + waitForAllDocsLoaded(600_000L); + } + + /** + * Verifies that without useInvertedIndexDistinct (operator disabled by default), the query uses + * default DistinctOperator and returns correct results. + */ + @Test(dataProvider = "useBothQueryEngines") + public void testInvertedIndexDistinctOperatorDisabledByDefault(boolean useMultiStageQueryEngine) + throws Exception { + setUseMultiStageQueryEngine(useMultiStageQueryEngine); + + // No query option = operator disabled, uses default DistinctOperator + String query = "SELECT DISTINCT Origin FROM mytable ORDER BY Origin"; + JsonNode response = postQuery(query); + Assert.assertEquals(response.get("exceptions").size(), 0); + Set<String> values = extractDistinctValues(response); + Assert.assertFalse(values.isEmpty(), "Baseline (operator disabled) should return distinct values"); + } + + @Test(dataProvider = "useBothQueryEngines") + public void testInvertedIndexDistinctOperator(boolean useMultiStageQueryEngine) + throws Exception { + setUseMultiStageQueryEngine(useMultiStageQueryEngine); + + // Baseline: SELECT DISTINCT Origin without the optimization + String query = "SELECT DISTINCT Origin FROM mytable ORDER BY Origin"; + JsonNode baselineResponse = postQuery(query); + Assert.assertEquals(baselineResponse.get("exceptions").size(), 0); + Set<String> baselineValues = extractDistinctValues(baselineResponse); + + // With useInvertedIndexDistinct=true - should use InvertedIndexDistinctOperator + String queryWithOption = "SELECT DISTINCT Origin FROM mytable ORDER BY Origin"; + JsonNode optimizedResponse = + postQueryWithOptions(queryWithOption, QueryOptionKey.USE_INVERTED_INDEX_DISTINCT + "=true"); + Assert.assertEquals(optimizedResponse.get("exceptions").size(), 0); + Set<String> optimizedValues = extractDistinctValues(optimizedResponse); + + Assert.assertEquals(optimizedValues, baselineValues, Review Comment: We should also verify execution stats, e.g. docs scanned to ensure we are using the new operator ########## pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java: ########## @@ -654,6 +654,8 @@ public static class QueryOptionKey { public static final String SKIP_UPSERT_VIEW = "skipUpsertView"; public static final String UPSERT_VIEW_FRESHNESS_MS = "upsertViewFreshnessMs"; public static final String USE_STAR_TREE = "useStarTree"; + public static final String USE_JSON_INDEX_DISTINCT = "useJsonIndexDistinct"; + public static final String USE_INVERTED_INDEX_DISTINCT = "useInvertedIndexDistinct"; Review Comment: Consider merging them, e.g. `useIndexBasedDistinctOperator` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
