This is an automated email from the ASF dual-hosted git repository.

liuneng pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push: new 65dd411e1 [CH] Support shuffle function (#5432) 65dd411e1 is described below commit 65dd411e1cea7ef164295bdd1f694e34257ba070 Author: exmy <xumov...@gmail.com> AuthorDate: Thu Apr 18 20:42:12 2024 +0800 [CH] Support shuffle function (#5432) What changes were proposed in this pull request? How was this patch tested? Pass CI --- .../GlutenClickHouseTPCHSaltNullParquetSuite.scala | 16 ++++++++++++++++ cpp-ch/local-engine/Parser/SerializedPlanParser.h | 1 + .../apache/gluten/expression/ExpressionMappings.scala | 1 + .../gluten/utils/clickhouse/ClickHouseTestSettings.scala | 4 ++++ .../apache/gluten/utils/velox/VeloxTestSettings.scala | 4 ++++ .../gluten/utils/clickhouse/ClickHouseTestSettings.scala | 4 ++++ .../apache/gluten/utils/velox/VeloxTestSettings.scala | 4 ++++ .../gluten/utils/clickhouse/ClickHouseTestSettings.scala | 4 ++++ .../apache/gluten/utils/velox/VeloxTestSettings.scala | 4 ++++ .../gluten/utils/clickhouse/ClickHouseTestSettings.scala | 4 ++++ .../apache/gluten/utils/velox/VeloxTestSettings.scala | 4 ++++ .../org/apache/gluten/expression/ExpressionNames.scala | 1 + 12 files changed, 51 insertions(+) diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHSaltNullParquetSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHSaltNullParquetSuite.scala index 27cb39584..615cceae9 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHSaltNullParquetSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTPCHSaltNullParquetSuite.scala @@ -688,6 +688,22 @@ class GlutenClickHouseTPCHSaltNullParquetSuite extends GlutenClickHouseTPCHAbstr } } + test("test shuffle function") { + withSQLConf( + SQLConf.OPTIMIZER_EXCLUDED_RULES.key -> (ConstantFolding.ruleName + "," + NullPropagation.ruleName)) { + runQueryAndCompare( + "select 
shuffle(split(n_comment, ' ')) from nation", + compareResult = false + )(checkGlutenOperatorMatch[ProjectExecTransformer]) + + runQueryAndCompare( + "select shuffle(array(1,2,3,4,5)), shuffle(array(1,3,null,3,4)), shuffle(null)", + compareResult = false, + noFallBack = false + )(checkGlutenOperatorMatch[ProjectExecTransformer]) + } + } + test("test 'function regexp_extract_all'") { runQueryAndCompare( "select l_orderkey, regexp_extract_all(l_comment, '([a-z])', 1) " + diff --git a/cpp-ch/local-engine/Parser/SerializedPlanParser.h b/cpp-ch/local-engine/Parser/SerializedPlanParser.h index b9a72cf05..019d49b02 100644 --- a/cpp-ch/local-engine/Parser/SerializedPlanParser.h +++ b/cpp-ch/local-engine/Parser/SerializedPlanParser.h @@ -181,6 +181,7 @@ static const std::map<std::string, std::string> SCALAR_FUNCTIONS // array functions {"array", "array"}, + {"shuffle", "arrayShuffle"}, {"range", "range"}, /// dummy mapping // map functions diff --git a/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala b/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala index 618798b15..e34ea8840 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala @@ -229,6 +229,7 @@ object ExpressionMappings { Sig[ArrayRepeat](ARRAY_REPEAT), Sig[ArrayRemove](ARRAY_REMOVE), Sig[ArrayFilter](FILTER), + Sig[Shuffle](SHUFFLE), // Map functions Sig[CreateMap](CREATE_MAP), Sig[GetMapValue](GET_MAP_VALUE), diff --git a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index 9b987b0e4..a51157e62 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ 
b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -167,6 +167,9 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("transform values function - test empty") .exclude("SPARK-14393: values generated by non-deterministic functions shouldn't change after coalesce or union") .exclude("SPARK-24734: Fix containsNull of Concat for array type") + .exclude("shuffle function - array for primitive type not containing null") + .exclude("shuffle function - array for primitive type containing null") + .exclude("shuffle function - array for non-primitive type") enableSuite[GlutenDataFrameHintSuite] enableSuite[GlutenDataFrameImplicitsSuite] enableSuite[GlutenDataFrameJoinSuite].exclude( @@ -686,6 +689,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("SPARK-36755: ArraysOverlap hould handle duplicated Double.NaN and Float.Nan") .exclude( "SPARK-36740: ArrayMin/ArrayMax/SortArray should handle NaN greater then non-NaN value") + .excludeGlutenTest("Shuffle") enableSuite[GlutenComplexTypeSuite] .exclude("SPARK-33386: GetArrayItem ArrayIndexOutOfBoundsException") .exclude("SPARK-33460: GetMapValue NoSuchElementException") diff --git a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 189a09d35..7e0ccb17c 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -195,6 +195,7 @@ class VeloxTestSettings extends BackendTestSettings { enableSuite[GlutenCollectionExpressionsSuite] // Rewrite in Gluten to replace Seq with Array .exclude("Shuffle") + .excludeGlutenTest("Shuffle") // TODO: ArrayDistinct should handle duplicated Double.NaN .excludeByPrefix("SPARK-36741") // TODO: ArrayIntersect should handle duplicated Double.NaN @@ 
-273,6 +274,9 @@ class VeloxTestSettings extends BackendTestSettings { // blocked by Velox-5768 .exclude("aggregate function - array for primitive type containing null") .exclude("aggregate function - array for non-primitive type") + .exclude("shuffle function - array for primitive type not containing null") + .exclude("shuffle function - array for primitive type containing null") + .exclude("shuffle function - array for non-primitive type") enableSuite[GlutenDataFrameTungstenSuite] enableSuite[GlutenDataFrameSetOperationsSuite] // Result depends on the implementation for nondeterministic expression rand. diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index 05e00ca5d..144d103c0 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -185,6 +185,9 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("transform values function - test empty") .exclude("SPARK-14393: values generated by non-deterministic functions shouldn't change after coalesce or union") .exclude("SPARK-24734: Fix containsNull of Concat for array type") + .exclude("shuffle function - array for primitive type not containing null") + .exclude("shuffle function - array for primitive type containing null") + .exclude("shuffle function - array for non-primitive type") enableSuite[GlutenDataFrameHintSuite] enableSuite[GlutenDataFrameImplicitsSuite] enableSuite[GlutenDataFrameJoinSuite].exclude( @@ -727,6 +730,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude( "SPARK-36740: ArrayMin/ArrayMax/SortArray should handle NaN greater then non-NaN value") .exclude("SPARK-39184: Avoid ArrayIndexOutOfBoundsException when crossing DST boundary") + 
.excludeGlutenTest("Shuffle") enableSuite[GlutenComplexTypeSuite] .exclude("SPARK-33386: GetArrayItem ArrayIndexOutOfBoundsException") .exclude("SPARK-33460: GetMapValue NoSuchElementException") diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 518908c9c..00eb455c0 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -114,6 +114,7 @@ class VeloxTestSettings extends BackendTestSettings { enableSuite[GlutenCollectionExpressionsSuite] // Rewrite in Gluten to replace Seq with Array .exclude("Shuffle") + .excludeGlutenTest("Shuffle") // TODO: ArrayDistinct should handle duplicated Double.NaN .excludeByPrefix("SPARK-36741") // TODO: ArrayIntersect should handle duplicated Double.NaN @@ -938,6 +939,9 @@ class VeloxTestSettings extends BackendTestSettings { // blocked by Velox-5768 .exclude("aggregate function - array for primitive type containing null") .exclude("aggregate function - array for non-primitive type") + .exclude("shuffle function - array for primitive type not containing null") + .exclude("shuffle function - array for primitive type containing null") + .exclude("shuffle function - array for non-primitive type") enableSuite[GlutenDataFrameHintSuite] enableSuite[GlutenDataFrameImplicitsSuite] enableSuite[GlutenDataFrameJoinSuite] diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index d90722301..679893bb6 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ 
-187,6 +187,9 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("transform values function - test empty") .exclude("SPARK-14393: values generated by non-deterministic functions shouldn't change after coalesce or union") .exclude("SPARK-24734: Fix containsNull of Concat for array type") + .exclude("shuffle function - array for primitive type not containing null") + .exclude("shuffle function - array for primitive type containing null") + .exclude("shuffle function - array for non-primitive type") enableSuite[GlutenDataFrameHintSuite] enableSuite[GlutenDataFrameImplicitsSuite] enableSuite[GlutenDataFrameJoinSuite].exclude( @@ -567,6 +570,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude( "SPARK-36740: ArrayMin/ArrayMax/SortArray should handle NaN greater then non-NaN value") .exclude("SPARK-39184: Avoid ArrayIndexOutOfBoundsException when crossing DST boundary") + .excludeGlutenTest("Shuffle") enableSuite[GlutenComplexTypeSuite] .exclude("SPARK-33386: GetArrayItem ArrayIndexOutOfBoundsException") .exclude("SPARK-33460: GetMapValue NoSuchElementException") diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 9d297e4ea..4f8afe579 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -95,6 +95,7 @@ class VeloxTestSettings extends BackendTestSettings { enableSuite[GlutenCollectionExpressionsSuite] // Rewrite in Gluten to replace Seq with Array .exclude("Shuffle") + .excludeGlutenTest("Shuffle") // TODO: ArrayDistinct should handle duplicated Double.NaN .excludeByPrefix("SPARK-36741") // TODO: ArrayIntersect should handle duplicated Double.NaN @@ -943,6 +944,9 @@ class VeloxTestSettings extends BackendTestSettings { // blocked by Velox-5768 
.exclude("aggregate function - array for primitive type containing null") .exclude("aggregate function - array for non-primitive type") + .exclude("shuffle function - array for primitive type not containing null") + .exclude("shuffle function - array for primitive type containing null") + .exclude("shuffle function - array for non-primitive type") enableSuite[GlutenDataFrameHintSuite] enableSuite[GlutenDataFrameImplicitsSuite] enableSuite[GlutenDataFrameJoinSuite] diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index d90722301..679893bb6 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -187,6 +187,9 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("transform values function - test empty") .exclude("SPARK-14393: values generated by non-deterministic functions shouldn't change after coalesce or union") .exclude("SPARK-24734: Fix containsNull of Concat for array type") + .exclude("shuffle function - array for primitive type not containing null") + .exclude("shuffle function - array for primitive type containing null") + .exclude("shuffle function - array for non-primitive type") enableSuite[GlutenDataFrameHintSuite] enableSuite[GlutenDataFrameImplicitsSuite] enableSuite[GlutenDataFrameJoinSuite].exclude( @@ -567,6 +570,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude( "SPARK-36740: ArrayMin/ArrayMax/SortArray should handle NaN greater then non-NaN value") .exclude("SPARK-39184: Avoid ArrayIndexOutOfBoundsException when crossing DST boundary") + .excludeGlutenTest("Shuffle") enableSuite[GlutenComplexTypeSuite] .exclude("SPARK-33386: GetArrayItem ArrayIndexOutOfBoundsException") .exclude("SPARK-33460: 
GetMapValue NoSuchElementException") diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 1c74bd247..6f6c6d05a 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -96,6 +96,7 @@ class VeloxTestSettings extends BackendTestSettings { enableSuite[GlutenCollectionExpressionsSuite] // Rewrite in Gluten to replace Seq with Array .exclude("Shuffle") + .excludeGlutenTest("Shuffle") // TODO: ArrayDistinct should handle duplicated Double.NaN .excludeByPrefix("SPARK-36741") // TODO: ArrayIntersect should handle duplicated Double.NaN @@ -959,6 +960,9 @@ class VeloxTestSettings extends BackendTestSettings { // blocked by Velox-5768 .exclude("aggregate function - array for primitive type containing null") .exclude("aggregate function - array for non-primitive type") + .exclude("shuffle function - array for primitive type not containing null") + .exclude("shuffle function - array for primitive type containing null") + .exclude("shuffle function - array for non-primitive type") enableSuite[GlutenDataFrameHintSuite] enableSuite[GlutenDataFrameImplicitsSuite] enableSuite[GlutenDataFrameJoinSuite] diff --git a/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala b/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala index ca8b098aa..f097d5362 100644 --- a/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala +++ b/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala @@ -245,6 +245,7 @@ object ExpressionNames { final val ARRAY_REPEAT = "array_repeat" final val ARRAY_REMOVE = "array_remove" final val FILTER = "filter" + final val SHUFFLE = "shuffle" // Map functions final val CREATE_MAP = "map" 
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@gluten.apache.org
For additional commands, e-mail: commits-h...@gluten.apache.org