This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
new 89ebbd735 perf: Implement more microbenchmarks for cast expressions (#3031)
89ebbd735 is described below
commit 89ebbd735fa722afb1b7ec666666da02287ed14a
Author: Andy Grove <[email protected]>
AuthorDate: Tue Jan 6 12:08:17 2026 -0700
perf: Implement more microbenchmarks for cast expressions (#3031)
---
.../spark/sql/benchmark/CometCastBenchmark.scala | 96 --------------
.../sql/benchmark/CometCastBooleanBenchmark.scala | 124 +++++++++++++++++
.../CometCastNumericToNumericBenchmark.scala | 147 +++++++++++++++++++++
.../CometCastNumericToStringBenchmark.scala | 104 +++++++++++++++
.../CometCastNumericToTemporalBenchmark.scala | 101 ++++++++++++++
.../CometCastStringToNumericBenchmark.scala | 7 +-
.../CometCastStringToTemporalBenchmark.scala | 10 +-
.../CometCastTemporalToNumericBenchmark.scala | 105 +++++++++++++++
.../CometCastTemporalToStringBenchmark.scala | 98 ++++++++++++++
.../CometCastTemporalToTemporalBenchmark.scala | 100 ++++++++++++++
10 files changed, 790 insertions(+), 102 deletions(-)
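Each new suite documents its own invocation in its scaladoc. As a quick, hedged orientation (the make target below is copied from those scaladoc comments, not re-verified here), a single suite is run with, for example:

    SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastBooleanBenchmark

Results are then written to "spark/benchmarks/CometCastBooleanBenchmark-**results.txt".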
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala
deleted file mode 100644
index 975abd632..000000000
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBenchmark.scala
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.spark.sql.benchmark
-
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.{DataType, LongType}
-
-import org.apache.comet.expressions.{CometCast, CometEvalMode}
-import org.apache.comet.serde.{Compatible, Incompatible, Unsupported}
-
-/**
- * Benchmark to measure Comet execution performance. To run this benchmark:
- * {{{
- * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastBenchmark
- * }}}
- *
- * Results will be written to "spark/benchmarks/CometCastBenchmark-**results.txt".
- */
-
-object CometCastBenchmark extends CometBenchmarkBase {
-
- override def getSparkSession: SparkSession = {
- val session = super.getSparkSession
- session.conf.set("parquet.enable.dictionary", "false")
- session.conf.set("spark.sql.shuffle.partitions", "2")
- session
- }
-
- def castExprSQL(toDataType: DataType, input: String): String = {
- s"CAST ($input AS ${toDataType.sql})"
- }
-
- override def runCometBenchmark(args: Array[String]): Unit = {
-
-    // TODO : Create all possible input datatypes. We only have Long inputs for now
- CometCast.supportedTypes.foreach { toDataType =>
- Seq(false, true).foreach { ansiMode =>
- CometCast.isSupported(
- LongType,
- toDataType,
- None,
- if (ansiMode) CometEvalMode.ANSI else CometEvalMode.LEGACY) match {
- case Compatible(notes) =>
- runBenchmarkWithTable(
-          s"Running benchmark cast operation from : $LongType to : $toDataType",
- 1024 * 1024 * 10) { v =>
- castBenchmark(v, LongType, toDataType, isAnsiMode = ansiMode)
- }
- case Incompatible(notes) => None
- case Unsupported(notes) => None
- }
- }
- }
- }
-
- def castBenchmark(
- values: Int,
- fromDataType: DataType,
- toDataType: DataType,
- isAnsiMode: Boolean): Unit = {
-
- withTempPath { dir =>
- withTempTable("parquetV1Table") {
- prepareTable(dir, spark.sql(s"SELECT value FROM $tbl"))
-
- val functionSQL = castExprSQL(toDataType, "value")
- val query = s"SELECT $functionSQL FROM parquetV1Table"
- val name =
-          s"Cast function to : ${toDataType} , ansi mode enabled : ${isAnsiMode}"
-
- val extraConfigs = Map(SQLConf.ANSI_ENABLED.key -> isAnsiMode.toString)
-
- runExpressionBenchmark(name, values, query, extraConfigs)
- }
- }
- }
-
-}
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala
new file mode 100644
index 000000000..57b8e88a7
--- /dev/null
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastBooleanBenchmark.scala
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.benchmark
+
+case class CastBooleanConfig(
+ name: String,
+ query: String,
+ extraCometConfigs: Map[String, String] = Map.empty)
+
+/**
+ * Benchmark to measure performance of Comet cast operations involving Boolean type. To run this
+ * benchmark:
+ * {{{
+ * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastBooleanBenchmark
+ * }}}
+ * Results will be written to "spark/benchmarks/CometCastBooleanBenchmark-**results.txt".
+ */
+object CometCastBooleanBenchmark extends CometBenchmarkBase {
+
+ private val castFunctions = Seq("CAST", "TRY_CAST")
+
+ // Boolean to String
+ private val boolToStringConfigs = for {
+ castFunc <- castFunctions
+ } yield CastBooleanConfig(
+ s"$castFunc Boolean to String",
+ s"SELECT $castFunc(c_bool AS STRING) FROM parquetV1Table")
+
+ // Boolean to numeric types
+ private val boolToNumericTypes =
+ Seq("BYTE", "SHORT", "INT", "LONG", "FLOAT", "DOUBLE", "DECIMAL(10,2)")
+ private val boolToNumericConfigs = for {
+ castFunc <- castFunctions
+ targetType <- boolToNumericTypes
+ } yield CastBooleanConfig(
+ s"$castFunc Boolean to $targetType",
+ s"SELECT $castFunc(c_bool AS $targetType) FROM parquetV1Table")
+
+ // Numeric to Boolean
+ private val numericTypes = Seq(
+ ("BYTE", "c_byte"),
+ ("SHORT", "c_short"),
+ ("INT", "c_int"),
+ ("LONG", "c_long"),
+ ("FLOAT", "c_float"),
+ ("DOUBLE", "c_double"),
+ ("DECIMAL(10,2)", "c_decimal"))
+
+ private val numericToBoolConfigs = for {
+ castFunc <- castFunctions
+ (sourceType, colName) <- numericTypes
+ } yield CastBooleanConfig(
+ s"$castFunc $sourceType to Boolean",
+ s"SELECT $castFunc($colName AS BOOLEAN) FROM parquetV1Table")
+
+ override def runCometBenchmark(mainArgs: Array[String]): Unit = {
+ val values = 1024 * 1024 // 1M rows
+
+ // Generate boolean data for boolean-to-other casts
+ runBenchmarkWithTable("Boolean to other types casts", values) { v =>
+ withTempPath { dir =>
+ withTempTable("parquetV1Table") {
+ // Data distribution: 1% NULL, 50/50 true/false
+ prepareTable(
+ dir,
+ spark.sql(s"""
+ SELECT CASE
+ WHEN value % 100 = 0 THEN NULL
+ ELSE (value % 2 = 0)
+ END AS c_bool
+ FROM $tbl
+ """))
+
+ (boolToStringConfigs ++ boolToNumericConfigs).foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+ }
+ }
+ }
+ }
+
+ // Generate numeric data for numeric-to-boolean casts
+ runBenchmarkWithTable("Numeric to Boolean casts", values) { v =>
+ withTempPath { dir =>
+ withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL per column, values in {-1, 0, 1} (~33% each)
+ prepareTable(
+ dir,
+ spark.sql(s"""
+ SELECT
+              CASE WHEN value % 100 = 0 THEN NULL ELSE CAST((value % 3) - 1 AS BYTE) END AS c_byte,
+              CASE WHEN value % 100 = 1 THEN NULL ELSE CAST((value % 3) - 1 AS SHORT) END AS c_short,
+              CASE WHEN value % 100 = 2 THEN NULL ELSE CAST((value % 3) - 1 AS INT) END AS c_int,
+              CASE WHEN value % 100 = 3 THEN NULL ELSE CAST((value % 3) - 1 AS LONG) END AS c_long,
+              CASE WHEN value % 100 = 4 THEN NULL ELSE CAST((value % 3) - 1 AS FLOAT) END AS c_float,
+              CASE WHEN value % 100 = 5 THEN NULL ELSE CAST((value % 3) - 1 AS DOUBLE) END AS c_double,
+              CASE WHEN value % 100 = 6 THEN NULL ELSE CAST((value % 3) - 1 AS DECIMAL(10,2)) END AS c_decimal
+ FROM $tbl
+ """))
+
+ numericToBoolConfigs.foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala
new file mode 100644
index 000000000..a9ea19a0e
--- /dev/null
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToNumericBenchmark.scala
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.benchmark
+
+case class CastNumericToNumericConfig(
+ name: String,
+ query: String,
+ extraCometConfigs: Map[String, String] = Map.empty)
+
+/**
+ * Benchmark to measure performance of Comet cast between numeric types. To run this benchmark:
+ * {{{
+ * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToNumericBenchmark
+ * }}}
+ * Results will be written to "spark/benchmarks/CometCastNumericToNumericBenchmark-**results.txt".
+ */
+object CometCastNumericToNumericBenchmark extends CometBenchmarkBase {
+
+ private val castFunctions = Seq("CAST", "TRY_CAST")
+
+ // Integer widening conversions
+ private val integerWideningPairs = Seq(
+ ("BYTE", "c_byte", "SHORT"),
+ ("BYTE", "c_byte", "INT"),
+ ("BYTE", "c_byte", "LONG"),
+ ("SHORT", "c_short", "INT"),
+ ("SHORT", "c_short", "LONG"),
+ ("INT", "c_int", "LONG"))
+
+ // Integer narrowing conversions
+ private val integerNarrowingPairs = Seq(
+ ("LONG", "c_long", "INT"),
+ ("LONG", "c_long", "SHORT"),
+ ("LONG", "c_long", "BYTE"),
+ ("INT", "c_int", "SHORT"),
+ ("INT", "c_int", "BYTE"),
+ ("SHORT", "c_short", "BYTE"))
+
+ // Floating point conversions
+  private val floatPairs = Seq(("FLOAT", "c_float", "DOUBLE"), ("DOUBLE", "c_double", "FLOAT"))
+
+ // Integer to floating point conversions
+ private val intToFloatPairs = Seq(
+ ("BYTE", "c_byte", "FLOAT"),
+ ("SHORT", "c_short", "FLOAT"),
+ ("INT", "c_int", "FLOAT"),
+ ("LONG", "c_long", "FLOAT"),
+ ("INT", "c_int", "DOUBLE"),
+ ("LONG", "c_long", "DOUBLE"))
+
+ // Floating point to integer conversions
+ private val floatToIntPairs = Seq(
+ ("FLOAT", "c_float", "INT"),
+ ("FLOAT", "c_float", "LONG"),
+ ("DOUBLE", "c_double", "INT"),
+ ("DOUBLE", "c_double", "LONG"))
+
+ // Decimal conversions
+ private val decimalPairs = Seq(
+ ("INT", "c_int", "DECIMAL(10,2)"),
+ ("LONG", "c_long", "DECIMAL(20,4)"),
+ ("DOUBLE", "c_double", "DECIMAL(15,5)"),
+ ("DECIMAL(10,2)", "c_decimal", "INT"),
+ ("DECIMAL(10,2)", "c_decimal", "LONG"),
+ ("DECIMAL(10,2)", "c_decimal", "DOUBLE"))
+
+ private def generateConfigs(
+      pairs: Seq[(String, String, String)]): Seq[CastNumericToNumericConfig] = {
+ for {
+ castFunc <- castFunctions
+ (sourceType, colName, targetType) <- pairs
+ } yield CastNumericToNumericConfig(
+ s"$castFunc $sourceType to $targetType",
+ s"SELECT $castFunc($colName AS $targetType) FROM parquetV1Table")
+ }
+
+ override def runCometBenchmark(mainArgs: Array[String]): Unit = {
+ val values = 1024 * 1024 // 1M rows
+
+ // Generate input data once with all numeric types
+ runBenchmarkWithTable("Numeric to Numeric casts", values) { v =>
+ withTempPath { dir =>
+ withTempTable("parquetV1Table") {
+ // Data distribution: 1% NULL per column
+ // - c_byte: full range -64 to 63
+ // - c_short: full range -16384 to 16383
+ // - c_int: centered around 0 (-2.5M to +2.5M)
+ // - c_long: large positive values (0 to ~5 billion)
+          // - c_float/c_double: 4% special values (NaN/Infinity), rest centered around 0
+ // - c_decimal: values from -25000.00 to +25000.00
+ prepareTable(
+ dir,
+ spark.sql(s"""
+ SELECT
+              CASE WHEN value % 100 = 0 THEN NULL ELSE CAST((value % 128) - 64 AS BYTE) END AS c_byte,
+              CASE WHEN value % 100 = 1 THEN NULL ELSE CAST((value % 32768) - 16384 AS SHORT) END AS c_short,
+              CASE WHEN value % 100 = 2 THEN NULL ELSE CAST(value - 2500000 AS INT) END AS c_int,
+              CASE WHEN value % 100 = 3 THEN NULL ELSE CAST(value * 1000 AS LONG) END AS c_long,
+ CASE
+ WHEN value % 100 = 4 THEN NULL
+ WHEN value % 100 = 5 THEN CAST('NaN' AS FLOAT)
+ WHEN value % 100 = 6 THEN CAST('Infinity' AS FLOAT)
+ WHEN value % 100 = 7 THEN CAST('-Infinity' AS FLOAT)
+ ELSE CAST((value - 2500000) / 100.0 AS FLOAT)
+ END AS c_float,
+ CASE
+ WHEN value % 100 = 8 THEN NULL
+ WHEN value % 100 = 9 THEN CAST('NaN' AS DOUBLE)
+ WHEN value % 100 = 10 THEN CAST('Infinity' AS DOUBLE)
+ WHEN value % 100 = 11 THEN CAST('-Infinity' AS DOUBLE)
+ ELSE CAST((value - 2500000) / 100.0 AS DOUBLE)
+ END AS c_double,
+              CASE WHEN value % 100 = 12 THEN NULL ELSE CAST((value - 2500000) / 100.0 AS DECIMAL(10,2)) END AS c_decimal
+ FROM $tbl
+ """))
+
+ // Run all benchmark categories
+ (generateConfigs(integerWideningPairs) ++
+ generateConfigs(integerNarrowingPairs) ++
+ generateConfigs(floatPairs) ++
+ generateConfigs(intToFloatPairs) ++
+ generateConfigs(floatToIntPairs) ++
+ generateConfigs(decimalPairs)).foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala
new file mode 100644
index 000000000..1fd2138c5
--- /dev/null
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToStringBenchmark.scala
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.benchmark
+
+case class CastNumericToStringConfig(
+ name: String,
+ query: String,
+ extraCometConfigs: Map[String, String] = Map.empty)
+
+/**
+ * Benchmark to measure performance of Comet cast from numeric types to String. To run this
+ * benchmark:
+ * {{{
+ * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToStringBenchmark
+ * }}}
+ * Results will be written to "spark/benchmarks/CometCastNumericToStringBenchmark-**results.txt".
+ */
+object CometCastNumericToStringBenchmark extends CometBenchmarkBase {
+
+ private val castFunctions = Seq("CAST", "TRY_CAST")
+ private val sourceTypes =
+ Seq(
+ ("BOOLEAN", "c_bool"),
+ ("BYTE", "c_byte"),
+ ("SHORT", "c_short"),
+ ("INT", "c_int"),
+ ("LONG", "c_long"),
+ ("FLOAT", "c_float"),
+ ("DOUBLE", "c_double"),
+ ("DECIMAL(10,2)", "c_decimal"))
+
+ private val castConfigs = for {
+ castFunc <- castFunctions
+ (sourceType, colName) <- sourceTypes
+ } yield CastNumericToStringConfig(
+ s"$castFunc $sourceType to String",
+ s"SELECT $castFunc($colName AS STRING) FROM parquetV1Table")
+
+ override def runCometBenchmark(mainArgs: Array[String]): Unit = {
+ val values = 1024 * 1024 // 1M rows
+
+ // Generate input data once with all numeric types
+ runBenchmarkWithTable("Numeric to String casts", values) { v =>
+ withTempPath { dir =>
+ withTempTable("parquetV1Table") {
+ // Data distribution: 1% NULL per column
+ // - c_bool: 50/50 true/false
+ // - c_byte: full range -64 to 63
+ // - c_short: full range -16384 to 16383
+ // - c_int/c_long: large values centered around 0
+          // - c_float/c_double: 3% special values (NaN/Infinity), rest centered around 0
+ // - c_decimal: values from -25000.00 to +25000.00
+ prepareTable(
+ dir,
+ spark.sql(s"""
+ SELECT
+              CASE WHEN value % 100 = 0 THEN NULL ELSE (value % 2 = 0) END AS c_bool,
+              CASE WHEN value % 100 = 1 THEN NULL ELSE CAST((value % 128) - 64 AS BYTE) END AS c_byte,
+              CASE WHEN value % 100 = 2 THEN NULL ELSE CAST((value % 32768) - 16384 AS SHORT) END AS c_short,
+              CASE WHEN value % 100 = 3 THEN NULL ELSE CAST(value - 2500000 AS INT) END AS c_int,
+              CASE WHEN value % 100 = 4 THEN NULL ELSE CAST(value * 1000000 AS LONG) END AS c_long,
+ CASE
+ WHEN value % 100 = 5 THEN NULL
+ WHEN value % 100 = 6 THEN CAST('NaN' AS FLOAT)
+ WHEN value % 100 = 7 THEN CAST('Infinity' AS FLOAT)
+ WHEN value % 100 = 8 THEN CAST('-Infinity' AS FLOAT)
+ ELSE CAST((value - 2500000) / 1000.0 AS FLOAT)
+ END AS c_float,
+ CASE
+ WHEN value % 100 = 9 THEN NULL
+ WHEN value % 100 = 10 THEN CAST('NaN' AS DOUBLE)
+ WHEN value % 100 = 11 THEN CAST('Infinity' AS DOUBLE)
+ WHEN value % 100 = 12 THEN CAST('-Infinity' AS DOUBLE)
+ ELSE CAST((value - 2500000) / 100.0 AS DOUBLE)
+ END AS c_double,
+              CASE WHEN value % 100 = 13 THEN NULL ELSE CAST((value - 2500000) / 100.0 AS DECIMAL(10,2)) END AS c_decimal
+ FROM $tbl
+ """))
+
+ castConfigs.foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala
new file mode 100644
index 000000000..ec2d9ab12
--- /dev/null
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastNumericToTemporalBenchmark.scala
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.benchmark
+
+case class CastNumericToTemporalConfig(
+ name: String,
+ query: String,
+ extraCometConfigs: Map[String, String] = Map.empty)
+
+/**
+ * Benchmark to measure performance of Comet cast from numeric types to temporal types. To run
+ * this benchmark:
+ * {{{
+ * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastNumericToTemporalBenchmark
+ * }}}
+ * }}}
+ * Results will be written to
+ * "spark/benchmarks/CometCastNumericToTemporalBenchmark-**results.txt".
+ */
+object CometCastNumericToTemporalBenchmark extends CometBenchmarkBase {
+
+ private val castFunctions = Seq("CAST", "TRY_CAST")
+
+ // INT to DATE (days since epoch)
+ private val intToDateConfigs = for {
+ castFunc <- castFunctions
+ } yield CastNumericToTemporalConfig(
+ s"$castFunc Int to Date",
+ s"SELECT $castFunc(c_int AS DATE) FROM parquetV1Table")
+
+ // LONG to TIMESTAMP (microseconds since epoch)
+ private val longToTimestampConfigs = for {
+ castFunc <- castFunctions
+ } yield CastNumericToTemporalConfig(
+ s"$castFunc Long to Timestamp",
+ s"SELECT $castFunc(c_long AS TIMESTAMP) FROM parquetV1Table")
+
+ override def runCometBenchmark(mainArgs: Array[String]): Unit = {
+ val values = 1024 * 1024 // 1M rows
+
+ // Generate data once for INT to DATE conversions
+ runBenchmarkWithTable("Int to Date casts", values) { v =>
+ withTempPath { dir =>
+ withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, days since epoch spanning ~100 years (1920-2020)
+ prepareTable(
+ dir,
+ spark.sql(s"""
+ SELECT CASE
+ WHEN value % 100 = 0 THEN NULL
+ ELSE CAST((value % 36500) - 18000 AS INT)
+ END AS c_int
+ FROM $tbl
+ """))
+
+ intToDateConfigs.foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+ }
+ }
+ }
+ }
+
+ // Generate data once for LONG to TIMESTAMP conversions
+ runBenchmarkWithTable("Long to Timestamp casts", values) { v =>
+ withTempPath { dir =>
+ withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, microseconds since epoch spanning ~1 year from 2020-01-01
+ prepareTable(
+ dir,
+ spark.sql(s"""
+ SELECT CASE
+ WHEN value % 100 = 0 THEN NULL
+ ELSE 1577836800000000 + (value % 31536000000000)
+ END AS c_long
+ FROM $tbl
+ """))
+
+ longToTimestampConfigs.foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToNumericBenchmark.scala
index 7f210fc73..c71eadad8 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToNumericBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToNumericBenchmark.scala
@@ -68,8 +68,11 @@ object CometCastStringToNumericBenchmark extends CometBenchmarkBase {
runBenchmarkWithTable("String to numeric casts", values) { v =>
withTempPath { dir =>
withTempTable("parquetV1Table") {
- // Generate numeric strings with both integer and decimal values
-        // Also include some special values: nulls (~2%), NaN (~2%), Infinity (~2%)
+ // Data distribution:
+ // - 2% NULL, 2% 'NaN', 2% 'Infinity', 2% '-Infinity'
+ // - 12% small integers (0-98)
+ // - 40% medium integers (0-999,998)
+ // - 40% decimals centered around 0 (approx -5000.00 to +5000.00)
prepareTable(
dir,
spark.sql(s"""
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala
index 39337be5c..77cc009ae 100644
--- a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastStringToTemporalBenchmark.scala
@@ -24,14 +24,14 @@ case class CastStringToTemporalConfig(
query: String,
extraCometConfigs: Map[String, String] = Map.empty)
-// spotless:off
/**
 * Benchmark to measure performance of Comet cast from String to temporal types. To run this
 * benchmark:
- * `SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastStringToTemporalBenchmark`
+ * {{{
+ * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastStringToTemporalBenchmark
+ * }}}
 * Results will be written to "spark/benchmarks/CometCastStringToTemporalBenchmark-**results.txt".
*/
-// spotless:on
object CometCastStringToTemporalBenchmark extends CometBenchmarkBase {
// Configuration for String to temporal cast benchmarks
@@ -52,12 +52,13 @@ object CometCastStringToTemporalBenchmark extends CometBenchmarkBase {
"SELECT TRY_CAST(c1 AS TIMESTAMP) FROM parquetV1Table"))
override def runCometBenchmark(mainArgs: Array[String]): Unit = {
- val values = 1024 * 1024 * 10 // 10M rows
+ val values = 1024 * 1024 // 1M rows
// Generate date data once with ~10% invalid values
runBenchmarkWithTable("date data generation", values) { v =>
withTempPath { dateDir =>
withTempTable("parquetV1Table") {
+          // Data distribution: 10% invalid strings, 90% valid date strings spanning ~10 years
prepareTable(
dateDir,
spark.sql(s"""
@@ -80,6 +81,7 @@ object CometCastStringToTemporalBenchmark extends CometBenchmarkBase {
runBenchmarkWithTable("timestamp data generation", values) { v =>
withTempPath { timestampDir =>
withTempTable("parquetV1Table") {
+          // Data distribution: 10% invalid strings, 90% valid timestamp strings (1970 epoch range)
prepareTable(
timestampDir,
spark.sql(s"""
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala
new file mode 100644
index 000000000..1468cbe08
--- /dev/null
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToNumericBenchmark.scala
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.benchmark
+
+case class CastTemporalToNumericConfig(
+ name: String,
+ query: String,
+ extraCometConfigs: Map[String, String] = Map.empty)
+
+/**
+ * Benchmark to measure performance of Comet cast from temporal types to numeric types. To run
+ * this benchmark:
+ * {{{
+ * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastTemporalToNumericBenchmark
+ * }}}
+ * }}}
+ * Results will be written to
+ * "spark/benchmarks/CometCastTemporalToNumericBenchmark-**results.txt".
+ */
+object CometCastTemporalToNumericBenchmark extends CometBenchmarkBase {
+
+ private val castFunctions = Seq("CAST", "TRY_CAST")
+
+ // DATE to numeric types
+ private val dateToNumericTypes = Seq("BYTE", "SHORT", "INT", "LONG")
+ private val dateToNumericConfigs = for {
+ castFunc <- castFunctions
+ targetType <- dateToNumericTypes
+ } yield CastTemporalToNumericConfig(
+ s"$castFunc Date to $targetType",
+ s"SELECT $castFunc(c_date AS $targetType) FROM parquetV1Table")
+
+ // TIMESTAMP to numeric types
+ private val timestampToNumericTypes = Seq("BYTE", "SHORT", "INT", "LONG")
+ private val timestampToNumericConfigs = for {
+ castFunc <- castFunctions
+ targetType <- timestampToNumericTypes
+ } yield CastTemporalToNumericConfig(
+ s"$castFunc Timestamp to $targetType",
+ s"SELECT $castFunc(c_timestamp AS $targetType) FROM parquetV1Table")
+
+ override def runCometBenchmark(mainArgs: Array[String]): Unit = {
+ val values = 1024 * 1024 // 1M rows
+
+ // Generate DATE data once for all date-to-numeric benchmarks
+ runBenchmarkWithTable("Date to Numeric casts", values) { v =>
+ withTempPath { dir =>
+ withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, dates spanning ~10 years from 2020-01-01
+ prepareTable(
+ dir,
+ spark.sql(s"""
+ SELECT CASE
+ WHEN value % 100 = 0 THEN NULL
+ ELSE DATE_ADD('2020-01-01', CAST(value % 3650 AS INT))
+ END AS c_date
+ FROM $tbl
+ """))
+
+ dateToNumericConfigs.foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+ }
+ }
+ }
+ }
+
+ // Generate TIMESTAMP data once for all timestamp-to-numeric benchmarks
+ runBenchmarkWithTable("Timestamp to Numeric casts", values) { v =>
+ withTempPath { dir =>
+ withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, timestamps spanning ~1 year from 2020-01-01
+ prepareTable(
+ dir,
+ spark.sql(s"""
+ SELECT CASE
+ WHEN value % 100 = 0 THEN NULL
+              ELSE TIMESTAMP_MICROS(1577836800000000 + value % 31536000000000)
+ END AS c_timestamp
+ FROM $tbl
+ """))
+
+ timestampToNumericConfigs.foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala
new file mode 100644
index 000000000..1ef3e7711
--- /dev/null
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToStringBenchmark.scala
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.benchmark
+
+case class CastTemporalToStringConfig(
+ name: String,
+ query: String,
+ extraCometConfigs: Map[String, String] = Map.empty)
+
+/**
+ * Benchmark to measure performance of Comet cast from temporal types to String. To run this
+ * benchmark:
+ * {{{
+ * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastTemporalToStringBenchmark
+ * }}}
+ * Results will be written to "spark/benchmarks/CometCastTemporalToStringBenchmark-**results.txt".
+ */
+object CometCastTemporalToStringBenchmark extends CometBenchmarkBase {
+
+ private val castFunctions = Seq("CAST", "TRY_CAST")
+
+ private val dateCastConfigs = for {
+ castFunc <- castFunctions
+ } yield CastTemporalToStringConfig(
+ s"$castFunc Date to String",
+ s"SELECT $castFunc(c_date AS STRING) FROM parquetV1Table")
+
+ private val timestampCastConfigs = for {
+ castFunc <- castFunctions
+ } yield CastTemporalToStringConfig(
+ s"$castFunc Timestamp to String",
+ s"SELECT $castFunc(c_timestamp AS STRING) FROM parquetV1Table")
+
+ override def runCometBenchmark(mainArgs: Array[String]): Unit = {
+ val values = 1024 * 1024 // 1M rows
+
+ // Generate temporal data once for date benchmarks
+ runBenchmarkWithTable("Date to String casts", values) { v =>
+ withTempPath { dir =>
+ withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, dates spanning ~10 years from 2020-01-01
+ prepareTable(
+ dir,
+ spark.sql(s"""
+ SELECT CASE
+ WHEN value % 100 = 0 THEN NULL
+ ELSE DATE_ADD('2020-01-01', CAST(value % 3650 AS INT))
+ END AS c_date
+ FROM $tbl
+ """))
+
+ dateCastConfigs.foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+ }
+ }
+ }
+ }
+
+ // Generate temporal data once for timestamp benchmarks
+ runBenchmarkWithTable("Timestamp to String casts", values) { v =>
+ withTempPath { dir =>
+ withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, timestamps spanning ~1 year from 2020-01-01
+ prepareTable(
+ dir,
+ spark.sql(s"""
+ SELECT CASE
+ WHEN value % 100 = 0 THEN NULL
+              ELSE TIMESTAMP_MICROS(1577836800000000 + value % 31536000000000)
+ END AS c_timestamp
+ FROM $tbl
+ """))
+
+ timestampCastConfigs.foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala
new file mode 100644
index 000000000..f2e257248
--- /dev/null
+++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometCastTemporalToTemporalBenchmark.scala
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.benchmark
+
+case class CastTemporalToTemporalConfig(
+ name: String,
+ query: String,
+ extraCometConfigs: Map[String, String] = Map.empty)
+
+/**
+ * Benchmark to measure performance of Comet cast between temporal types. To run this benchmark:
+ * {{{
+ * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometCastTemporalToTemporalBenchmark
+ * }}}
+ * }}}
+ * Results will be written to
+ * "spark/benchmarks/CometCastTemporalToTemporalBenchmark-**results.txt".
+ */
+object CometCastTemporalToTemporalBenchmark extends CometBenchmarkBase {
+
+ private val castFunctions = Seq("CAST", "TRY_CAST")
+
+ // Date to Timestamp
+ private val dateToTimestampConfigs = for {
+ castFunc <- castFunctions
+ } yield CastTemporalToTemporalConfig(
+ s"$castFunc Date to Timestamp",
+ s"SELECT $castFunc(c_date AS TIMESTAMP) FROM parquetV1Table")
+
+ // Timestamp to Date
+ private val timestampToDateConfigs = for {
+ castFunc <- castFunctions
+ } yield CastTemporalToTemporalConfig(
+ s"$castFunc Timestamp to Date",
+ s"SELECT $castFunc(c_timestamp AS DATE) FROM parquetV1Table")
+
+ override def runCometBenchmark(mainArgs: Array[String]): Unit = {
+ val values = 1024 * 1024 // 1M rows
+
+ // Generate DATE data for Date -> Timestamp benchmarks
+ runBenchmarkWithTable("Date to Timestamp casts", values) { v =>
+ withTempPath { dir =>
+ withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, dates spanning ~10 years from 2020-01-01
+ prepareTable(
+ dir,
+ spark.sql(s"""
+ SELECT CASE
+ WHEN value % 100 = 0 THEN NULL
+ ELSE DATE_ADD('2020-01-01', CAST(value % 3650 AS INT))
+ END AS c_date
+ FROM $tbl
+ """))
+
+ dateToTimestampConfigs.foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+ }
+ }
+ }
+ }
+
+ // Generate TIMESTAMP data for Timestamp -> Date benchmarks
+ runBenchmarkWithTable("Timestamp to Date casts", values) { v =>
+ withTempPath { dir =>
+ withTempTable("parquetV1Table") {
+          // Data distribution: 1% NULL, timestamps spanning ~1 year from 2020-01-01
+ prepareTable(
+ dir,
+ spark.sql(s"""
+ SELECT CASE
+ WHEN value % 100 = 0 THEN NULL
+              ELSE TIMESTAMP_MICROS(1577836800000000 + value % 31536000000000)
+ END AS c_timestamp
+ FROM $tbl
+ """))
+
+ timestampToDateConfigs.foreach { config =>
+            runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs)
+ }
+ }
+ }
+ }
+ }
+}
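All nine suites in this commit share one shape: a per-suite config case class, a for-comprehension crossing CAST/TRY_CAST with source and target types, one-time data generation into a temporary Parquet table, and one runExpressionBenchmark call per config. As a rough sketch of that pattern (a hypothetical CometCastStringToBooleanBenchmark that is not part of this commit, using only the CometBenchmarkBase helpers visible in the diff above: runBenchmarkWithTable, withTempPath, withTempTable, prepareTable, runExpressionBenchmark, spark, tbl), an additional suite would look like:

package org.apache.spark.sql.benchmark

// Hypothetical illustration only; not part of this commit.
object CometCastStringToBooleanBenchmark extends CometBenchmarkBase {

  private val castFunctions = Seq("CAST", "TRY_CAST")

  // One (name, query) pair per cast function, mirroring the config pattern above
  private val configs = for {
    castFunc <- castFunctions
  } yield (
    s"$castFunc String to Boolean",
    s"SELECT $castFunc(c_str AS BOOLEAN) FROM parquetV1Table")

  override def runCometBenchmark(mainArgs: Array[String]): Unit = {
    val values = 1024 * 1024 // 1M rows, matching the new suites

    runBenchmarkWithTable("String to Boolean casts", values) { v =>
      withTempPath { dir =>
        withTempTable("parquetV1Table") {
          // Data distribution: 1% NULL, remainder alternating 'true'/'false'
          prepareTable(
            dir,
            spark.sql(s"""
              SELECT CASE
                WHEN value % 100 = 0 THEN NULL
                WHEN value % 2 = 0 THEN 'true'
                ELSE 'false'
              END AS c_str
              FROM $tbl
            """))

          configs.foreach { case (name, query) =>
            runExpressionBenchmark(name, v, query, Map.empty[String, String])
          }
        }
      }
    }
  }
}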
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]