(datafusion) branch main updated: fix: dictionary encoded column to partition column casting bug (#15652)

2025-04-09 Thread alamb
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
 new 94ed08a364 fix: dictionary encoded column to partition column casting bug (#15652)
94ed08a364 is described below

commit 94ed08a3645758a77b9d62a99f801a103532311e
Author: haruband 
AuthorDate: Thu Apr 10 00:37:23 2025 +0900

fix: dictionary encoded column to partition column casting bug (#15652)

* Fix partition values bugs from dictionary encoded column

* Add some sqllogictests
---
 datafusion/datasource/src/write/demux.rs  | 10 --
 datafusion/sqllogictest/test_files/dictionary.slt |  7 +++
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/datafusion/datasource/src/write/demux.rs b/datafusion/datasource/src/write/demux.rs
index fc2e5daf92..49c3a64d24 100644
--- a/datafusion/datasource/src/write/demux.rs
+++ b/datafusion/datasource/src/write/demux.rs
@@ -28,8 +28,8 @@ use datafusion_common::error::Result;
 use datafusion_physical_plan::SendableRecordBatchStream;
 
 use arrow::array::{
-    builder::UInt64Builder, cast::AsArray, downcast_dictionary_array, RecordBatch,
-    StringArray, StructArray,
+    builder::UInt64Builder, cast::AsArray, downcast_dictionary_array, ArrayAccessor,
+    RecordBatch, StringArray, StructArray,
 };
 use arrow::datatypes::{DataType, Schema};
 use datafusion_common::cast::{
@@ -482,10 +482,8 @@ fn compute_partition_keys_by_row<'a>(
                         .ok_or(exec_datafusion_err!("it is not yet supported to write to hive partitions with datatype {}",
                             dtype))?;
 
-                    for val in array.values() {
-                        partition_values.push(
-                            Cow::from(val.ok_or(exec_datafusion_err!("Cannot partition by null value for column {}", col))?),
-                        );
+                    for i in 0..rb.num_rows() {
+                        partition_values.push(Cow::from(array.value(i)));
                     }
 },
 _ => unreachable!(),
diff --git a/datafusion/sqllogictest/test_files/dictionary.slt b/datafusion/sqllogictest/test_files/dictionary.slt
index 778b3537d1..1769f42c2d 100644
--- a/datafusion/sqllogictest/test_files/dictionary.slt
+++ b/datafusion/sqllogictest/test_files/dictionary.slt
@@ -450,3 +450,10 @@ query I
 select dense_rank() over (order by arrow_cast('abc', 'Dictionary(UInt16, Utf8)'));
 
 1
+
+# Test dictionary encoded column to partition column casting
+statement ok
+CREATE TABLE test0 AS VALUES ('foo',1), ('bar',2), ('foo',3);
+
+statement ok
+COPY (SELECT arrow_cast(column1, 'Dictionary(Int32, Utf8)') AS column1, column2 FROM test0) TO 'test_files/scratch/copy/part_dict_test' STORED AS PARQUET PARTITIONED BY (column1);
\ No newline at end of file
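
Why the original loop was wrong: for a dictionary-encoded column,
`DictionaryArray::values()` iterates the deduplicated dictionary entries, not
one value per row, so the demuxer could produce fewer partition values than
rows. A minimal standalone sketch of the distinction (arrow-rs, not the
DataFusion source; written against a recent arrow crate, so treat the exact
API shape as an assumption):

```rust
use arrow::array::{Array, ArrayAccessor, DictionaryArray, StringArray};
use arrow::datatypes::Int32Type;

fn main() {
    // Three rows, two distinct values, stored dictionary-encoded.
    let dict: DictionaryArray<Int32Type> =
        vec!["foo", "bar", "foo"].into_iter().collect();

    // `values()` exposes the dictionary itself: 2 distinct entries,
    // not the 3 per-row values that partitioning needs.
    let entries = dict.values().as_any().downcast_ref::<StringArray>().unwrap();
    assert_eq!(entries.len(), 2);
    assert_eq!(dict.len(), 3);

    // Row-wise access through the typed view (this is why the fix imports
    // `ArrayAccessor`) yields one value per row, matching the new
    // `for i in 0..rb.num_rows()` loop in demux.rs.
    let typed = dict.downcast_dict::<StringArray>().unwrap();
    let per_row: Vec<&str> = (0..dict.len()).map(|i| typed.value(i)).collect();
    assert_eq!(per_row, ["foo", "bar", "foo"]);
}
```

This matches the new sqllogictest above: the three rows partition into a
`column1=foo` directory with two rows and a `column1=bar` directory with one.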





(datafusion-site) branch site/tpch_data_generator updated (4f3ae9b -> c81497d)

2025-04-09 Thread alamb
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a change to branch site/tpch_data_generator
in repository https://gitbox.apache.org/repos/asf/datafusion-site.git


from 4f3ae9b  Update content/blog/2025-04-10-fastest-tpch-generator.md
 add c81497d  Update content/blog/2025-04-10-fastest-tpch-generator.md

No new revisions were added by this update.

Summary of changes:
 content/blog/2025-04-10-fastest-tpch-generator.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)





(datafusion-site) branch asf-staging updated: Commit build products

2025-04-09 Thread github-bot
This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch asf-staging
in repository https://gitbox.apache.org/repos/asf/datafusion-site.git


The following commit(s) were added to refs/heads/asf-staging by this push:
 new 603b6b8  Commit build products
603b6b8 is described below

commit 603b6b819836b798d13260b8e26a1dd8a2cd4283
Author: Build Pelican (action) 
AuthorDate: Wed Apr 9 22:59:43 2025 +

Commit build products
---
 blog/2025/04/10/fastest-tpch-generator/index.html  | 500 +
 .../andrew-lamb-achraf-b-and-sean-smith.html   | 117 +
 blog/category/blog.html|  48 ++
 blog/feed.xml  |  31 +-
 blog/feeds/all-en.atom.xml | 470 ++-
 .../andrew-lamb-achraf-b-and-sean-smith.atom.xml   | 470 +++
 .../andrew-lamb-achraf-b-and-sean-smith.rss.xml|  31 ++
 blog/feeds/blog.atom.xml   | 470 ++-
 blog/images/fastest-tpch-generator/lamb-theory.png | Bin 0 -> 300479 bytes
 .../fastest-tpch-generator/parquet-performance.png | Bin 0 -> 61946 bytes
 .../fastest-tpch-generator/tbl-performance.png | Bin 0 -> 49477 bytes
 blog/index.html|  48 ++
 12 files changed, 2182 insertions(+), 3 deletions(-)

diff --git a/blog/2025/04/10/fastest-tpch-generator/index.html b/blog/2025/04/10/fastest-tpch-generator/index.html
new file mode 100644
index 000..e4aac31
--- /dev/null
+++ b/blog/2025/04/10/fastest-tpch-generator/index.html
@@ -0,0 +1,500 @@
+[Pelican-generated HTML page: markup, styles, scripts, and site navigation are
+elided here; the recoverable post content follows.]
+
+tpchgen-rs World’s fastest open source TPC-H data generator, written in Rust
+Posted on: Thu 10 April 2025 by Andrew Lamb, Achraf B, and Sean Smith
+
+3 members of the Apache DataFusion (https://datafusion.apache.org/) community
+used Rust and open source development to build tpchgen-rs
+(https://github.com/clflushopt/tpchgen-rs), a fully open TPC-H data generator
+over 20x faster than any other implementation we know of.
+It is now possible to create the TPC-H SF=100 dataset in 72.23 seconds (1.4 GB/s
+😎) on a Macbook Air M3 with 16GB of memory, compared to the classic dbgen,
+which takes 30 minutes [1] (0.05 GB/s). On the same machine, it takes less than
+2 minutes to create all 3.6 GB of SF=100 in Apache Parquet
+(https://parquet.apache.org/) format, which takes 44 minutes using DuckDB
+(https://duckdb.org). It is finally convenient and efficient to run TPC-H
+queries locally when testing analytical engines such as DataFusion.
+
+Figure 1: Time to create the TPC-H dataset for Scale Factors (see below) 1, 10,
+100 and 1000 as 8 individual SNAPPY-compressed Parquet files using a 22-core GCP
+VM with 88 GB of memory. For Scale Factor (SF) 100, tpchgen takes 1 minute and
+14 seconds and DuckDB takes 17 minutes and 48 seconds. For SF=1000, tpchgen
+takes 10 minutes and 26 seconds and uses about 5 GB of RAM at peak, and we could
+not measure DuckDB's time as it requires 647 GB of RAM
+(https://duckdb.org/docs/stable/extensions/tpch.html#resource-usage-of-the-data-generator),
+more than the 88 GB that was available on our test machine. The testing
+methodology is in the documentation
+(https://github.com/clflushopt/tpchgen-rs/blob/main/benchmarks/BENCHMARKS.md).
+
+This blog explains what TPC-H is, how we ported the vintage C data generator to
+Rust (yes, RWIR:
+https://www.reddit.com/r/rust/comments/4ri2gn/riir_rewrite_it_in_rust/) and
+optimized its performance over the course of a few weeks of part-time work. We
+began this project so we can easily generate TPC-H data in Apache DataFusion
+and GlareDB (https://glaredb.com/).
+
+Try it for yourself
+The tool is entirely open source under the Apache 2.0 license
+(https://www.apache.org/licenses/LICENSE-2.0). Visit the tpchgen-rs repository
+or try it for yourself by running the following commands after installing Rust
+(https://www.rust-lang.org/tools/install):
+
+$ cargo install tpchgen-cli
+
+# create SF=1 in classic TBL format
+$ tpchgen-cli -s 1
+
+# create SF=10 in Parquet
+$ tpchgen-cli -s 10 --format=parquet
+
+What is TPC-H / dbgen?
+The popular TPC-H benchmark (https://www.tpc.org/tpch/, often referred to as TPCH)
(datafusion-site) branch site/tpch_data_generator updated (c81497d -> d782afd)

2025-04-09 Thread alamb
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a change to branch site/tpch_data_generator
in repository https://gitbox.apache.org/repos/asf/datafusion-site.git


from c81497d  Update content/blog/2025-04-10-fastest-tpch-generator.md
 add d782afd  Update content/blog/2025-04-10-fastest-tpch-generator.md

No new revisions were added by this update.

Summary of changes:
 content/blog/2025-04-10-fastest-tpch-generator.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)





(datafusion-site) branch asf-staging updated: Commit build products

2025-04-09 Thread github-bot
This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch asf-staging
in repository https://gitbox.apache.org/repos/asf/datafusion-site.git


The following commit(s) were added to refs/heads/asf-staging by this push:
 new 0db2e2c  Commit build products
0db2e2c is described below

commit 0db2e2cb91d8fa5bfe59ae08873e59c94f4d11de
Author: Build Pelican (action) 
AuthorDate: Wed Apr 9 22:59:50 2025 +

Commit build products
---
 blog/2025/04/10/fastest-tpch-generator/index.html   |  3 ++-
 blog/author/andrew-lamb-achraf-b-and-sean-smith.html|  7 +++
 blog/category/blog.html |  7 +++
 blog/feed.xml   |  7 +++
 blog/feeds/all-en.atom.xml  | 10 +-
 blog/feeds/andrew-lamb-achraf-b-and-sean-smith.atom.xml | 10 +-
 blog/feeds/andrew-lamb-achraf-b-and-sean-smith.rss.xml  |  7 +++
 blog/feeds/blog.atom.xml| 10 +-
 blog/index.html |  7 +++
 9 files changed, 32 insertions(+), 36 deletions(-)

diff --git a/blog/2025/04/10/fastest-tpch-generator/index.html b/blog/2025/04/10/fastest-tpch-generator/index.html
index e4aac31..62a9864 100644
--- a/blog/2025/04/10/fastest-tpch-generator/index.html
+++ b/blog/2025/04/10/fastest-tpch-generator/index.html
@@ -69,7 +69,8 @@ th, td {
   padding: 3px;
 }
 
-3 members of the Apache DataFusion community used Rust and open source
+TLDR: TPC-H SF=100 in 1min using tpchgen-rs vs 30min+ with dbgen
+3 members of the Apache DataFusion community used Rust and open source
 development to build tpchgen-rs, a fully open TPC-H data generator over 20x
 faster than any other implementation we know of.
 It is now possible to create the TPC-H SF=100 dataset in 72.23 seconds (1.4 GB/s
diff --git a/blog/author/andrew-lamb-achraf-b-and-sean-smith.html b/blog/author/andrew-lamb-achraf-b-and-sean-smith.html
index 6f07c46..a308e6a 100644
--- a/blog/author/andrew-lamb-achraf-b-and-sean-smith.html
+++ b/blog/author/andrew-lamb-achraf-b-and-sean-smith.html
@@ -81,10 +81,9 @@ th, td {
   padding: 3px;
 }
 
-3 members of the Apache DataFusion community used Rust and open source
-development to build tpchgen-rs, a fully open TPC-H data generator over 20x
-faster than any other implementation we know of.
-It is now …
+TLDR: TPC-H SF=100 in 1min using tpchgen-rs vs 30min+ with dbgen
+3 members of the Apache DataFusion community used Rust and open source
+development to build tpchgen-rs, a fully open TPC-H data generator over …
 
 
 Continue Reading
diff --git a/blog/category/blog.html b/blog/category/blog.html
index 7e5dc70..17e20f8 100644
--- a/blog/category/blog.html
+++ b/blog/category/blog.html
@@ -81,10 +81,9 @@ th, td {
   padding: 3px;
 }
 
-3 members of the Apache DataFusion community used Rust and open source
-development to build tpchgen-rs, a fully open TPC-H data generator over 20x
-faster than any other implementation we know of.
-It is now …
+TLDR: TPC-H SF=100 in 1min using tpchgen-rs vs 30min+ with dbgen
+3 members of the Apache DataFusion community used Rust and open source
+development to build tpchgen-rs, a fully open TPC-H data generator over …
 
 
 Continue Reading
diff --git a/blog/feed.xml b/blog/feed.xml
index 0947603..01fe8ab 100644
--- a/blog/feed.xml
+++ b/blog/feed.xml
@@ -25,10 +25,9 @@ th, td {
   padding: 3px;
 }
 
-3 members of the Apache DataFusion community used Rust and open source
-development to build tpchgen-rs, a fully open TPC-H data generator over 20x
-faster than any other implementation we know of.
-It is now …
+TLDR: TPC-H SF=100 in 1min using tpchgen-rs vs 30min+ with dbgen
+3 members of the Apache DataFusion community used Rust and open source
+development to build tpchgen-rs, a fully open TPC-H data generator over …
 [escaped feed entry metadata elided: author Andrew Lamb, Achraf B, and Sean
 Smith; Thu, 10 Apr 2025 00:00:00;
 tag:datafusion.apache.org,2025-04-10:/blog/2025/04/10/fastest-tpch-generator;
 the message truncates here]

(datafusion-comet) branch main updated: feat: add MAP type support for first level (#1603)

2025-04-09 Thread comphead
This is an automated email from the ASF dual-hosted git repository.

comphead pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git


The following commit(s) were added to refs/heads/main by this push:
 new ba53a7f2f feat: add MAP type support for first level (#1603)
ba53a7f2f is described below

commit ba53a7f2f0a73b4120197f709e5d71b1d46dbf7a
Author: Oleks V 
AuthorDate: Wed Apr 9 13:35:20 2025 -0700

feat: add MAP type support for first level (#1603)

* feat: add MAP type support for first level
---
 .../src/main/scala/org/apache/spark/sql/comet/util/Utils.scala |  3 ++-
 .../src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala |  2 ++
 .../scala/org/apache/spark/sql/comet/CometNativeScanExec.scala |  2 +-
 .../main/scala/org/apache/spark/sql/comet/CometScanExec.scala  |  2 +-
 .../scala/org/apache/comet/exec/CometNativeReaderSuite.scala   |  9 +
 .../test/scala/org/apache/comet/parquet/ParquetReadSuite.scala | 10 +-
 6 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/common/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala b/common/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala
index 46e474170..3041ea2c8 100644
--- a/common/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala
+++ b/common/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala
@@ -277,7 +277,8 @@ object Utils {
       case v @ (_: BitVector | _: TinyIntVector | _: SmallIntVector | _: IntVector |
           _: BigIntVector | _: Float4Vector | _: Float8Vector | _: VarCharVector |
           _: DecimalVector | _: DateDayVector | _: TimeStampMicroTZVector | _: VarBinaryVector |
-          _: FixedSizeBinaryVector | _: TimeStampMicroVector | _: StructVector | _: ListVector) =>
+          _: FixedSizeBinaryVector | _: TimeStampMicroVector | _: StructVector | _: ListVector |
+          _: MapVector) =>
         v.asInstanceOf[FieldVector]
       case _ =>
         throw new SparkException(s"Unsupported Arrow Vector for $reason: ${valueVector.getClass}")
diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
index 53fbb6d44..bf779a6b5 100644
--- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
+++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
@@ -70,6 +70,8 @@ object QueryPlanSerde extends Logging with CometExprShim {
       s.fields.map(_.dataType).forall(supportedDataType(_, allowComplex))
     case a: ArrayType if allowComplex =>
       supportedDataType(a.elementType, allowComplex)
+    case m: MapType if allowComplex =>
+      supportedDataType(m.keyType, allowComplex) && supportedDataType(m.valueType, allowComplex)
     case dt =>
       emitWarning(s"unsupported Spark data type: $dt")
       false
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometNativeScanExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometNativeScanExec.scala
index fd71e92fa..3ddc134c7 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/CometNativeScanExec.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometNativeScanExec.scala
@@ -230,10 +230,10 @@ object CometNativeScanExec extends DataTypeSupport {
   }
 
   override def isAdditionallySupported(dt: DataType): Boolean = {
-    // TODO add map
     dt match {
       case s: StructType => s.fields.map(_.dataType).forall(isTypeSupported)
       case a: ArrayType => isTypeSupported(a.elementType)
+      case m: MapType => isTypeSupported(m.keyType) && isTypeSupported(m.valueType)
       case _ => false
     }
   }
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometScanExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometScanExec.scala
index 87f48f841..43a925fd8 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/CometScanExec.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometScanExec.scala
@@ -480,10 +480,10 @@ object CometScanExec extends DataTypeSupport {
 
   override def isAdditionallySupported(dt: DataType): Boolean = {
     if (CometConf.COMET_NATIVE_SCAN_IMPL.get() == CometConf.SCAN_NATIVE_ICEBERG_COMPAT) {
-      // TODO add map
       dt match {
         case s: StructType => s.fields.map(_.dataType).forall(isTypeSupported)
         case a: ArrayType => isTypeSupported(a.elementType)
+        case m: MapType => isTypeSupported(m.keyType) && isTypeSupported(m.valueType)
         case _ => false
       }
     } else {
diff --git a/spark/src/test/scala/org/apache/comet/exec/CometNativeReaderSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometNativeReaderSuite.scala
index 3593ae220..9696ef1bb 100644
--- a/spark/src/test/scala/org/apache/comet/exec/CometNativeReaderSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/exec/CometNativeReaderSuite.scala
@@ -134,4 +134,13 @@ class CometNativeReaderSuite extends CometTestBase with Adap
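
The gist of the change: Arrow `MapVector`s are now passed through as
`FieldVector`s, and Spark's `MapType` is considered supported when both its key
and value types are. For readers unfamiliar with Arrow's map layout, here is a
small standalone sketch of a first-level map column, written against arrow-rs
(Rust rather than Comet's Scala, and not Comet code):

```rust
use arrow::array::{Array, Int32Array, MapArray, StringArray};
use arrow::error::ArrowError;

fn main() -> Result<(), ArrowError> {
    // A map column with two rows:
    //   row 0: {"a": 1, "b": 2}
    //   row 1: {"c": 3}
    // entry_offsets marks where each row's entries start and end.
    let keys = ["a", "b", "c"];
    let values = Int32Array::from(vec![1, 2, 3]);
    let entry_offsets = [0, 2, 3];
    let map = MapArray::new_from_strings(keys.into_iter(), &values, &entry_offsets)?;

    assert_eq!(map.len(), 2);
    // Each row of a map is a struct array of (key, value) entries.
    let row0 = map.value(0);
    let row0_keys = row0.column(0).as_any().downcast_ref::<StringArray>().unwrap();
    assert_eq!(row0_keys.value(1), "b");
    Ok(())
}
```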

(datafusion-comet) branch asf-site updated: Publish built docs triggered by e2383921f714c857c31bbbc1a2f427bb0608b46c

2025-04-09 Thread github-bot
This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git


The following commit(s) were added to refs/heads/asf-site by this push:
 new 66a5bca66 Publish built docs triggered by e2383921f714c857c31bbbc1a2f427bb0608b46c
66a5bca66 is described below

commit 66a5bca66b9f9eebe97186889a012e025a490a46
Author: github-actions[bot] 
AuthorDate: Wed Apr 9 22:58:01 2025 +

Publish built docs triggered by e2383921f714c857c31bbbc1a2f427bb0608b46c
---
 _sources/contributor-guide/benchmarking.md.txt |   4 +
 .../contributor-guide/benchmarking_aws_ec2.md.txt  | 223 
 contributor-guide/benchmarking.html|   4 +
 contributor-guide/benchmarking_aws_ec2.html| 567 +
 objects.inv| Bin 786 -> 807 bytes
 searchindex.js |   2 +-
 6 files changed, 799 insertions(+), 1 deletion(-)

diff --git a/_sources/contributor-guide/benchmarking.md.txt b/_sources/contributor-guide/benchmarking.md.txt
index 1193ada62..15934d7f5 100644
--- a/_sources/contributor-guide/benchmarking.md.txt
+++ b/_sources/contributor-guide/benchmarking.md.txt
@@ -22,6 +22,10 @@ under the License.
 To track progress on performance, we regularly run benchmarks derived from TPC-H and TPC-DS. Data generation and
 benchmarking documentation and scripts are available in the [DataFusion Benchmarks](https://github.com/apache/datafusion-benchmarks) GitHub repository.
 
+Available benchmarking guides:
+
+- [Benchmarking on AWS EC2](benchmarking_aws_ec2)
+
 We also have many micro benchmarks that can be run from an IDE located [here](https://github.com/apache/datafusion-comet/tree/main/spark/src/test/scala/org/apache/spark/sql/benchmark).
 
 ## Current Benchmark Results
diff --git a/_sources/contributor-guide/benchmarking_aws_ec2.md.txt b/_sources/contributor-guide/benchmarking_aws_ec2.md.txt
new file mode 100644
index 0..0ec33bf7e
--- /dev/null
+++ b/_sources/contributor-guide/benchmarking_aws_ec2.md.txt
@@ -0,0 +1,223 @@
+
+
+# Comet Benchmarking in AWS
+
+This guide is for setting up benchmarks on AWS EC2 with a single node with Parquet files located in S3.
+
+## Data Generation
+
+- Create an EC2 instance with an EBS volume sized for approximately 2x the size of
+  the dataset to be generated (200 GB for scale factor 100, 2 TB for scale factor 1000, and so on)
+- Create an S3 bucket to store the Parquet files
+
+Install prerequisites:
+
+```shell
+sudo yum install -y docker git python3-pip
+
+sudo systemctl start docker
+sudo systemctl enable docker
+sudo usermod -aG docker ec2-user
+newgrp docker
+
+docker pull ghcr.io/scalytics/tpch-docker:main
+
+pip3 install datafusion
+```
+
+Run the data generation script:
+
+```shell
+git clone https://github.com/apache/datafusion-benchmarks.git
+cd datafusion-benchmarks/tpch
+nohup python3 tpchgen.py generate --scale-factor 100 --partitions 16 &
+```
+
+Check on progress with the following commands:
+
+```shell
+docker ps
+du -h -d 1 data
+```
+
+Fix ownership in the generated files:
+
+```shell
+sudo chown -R ec2-user:docker data
+```
+
+Convert to Parquet:
+
+```shell
+nohup python3 tpchgen.py convert --scale-factor 100 --partitions 16 &
+```
+
+Delete the CSV files:
+
+```shell
+cd data
+rm *.tbl.*
+```
+
+Copy the Parquet files to S3:
+
+```shell
+aws s3 cp . s3://your-bucket-name/top-level-folder/ --recursive
+```
+
+## Install Spark
+
+Install Java
+
+```shell
+sudo yum install -y java-17-amazon-corretto-headless java-17-amazon-corretto-devel
+```
+
+Set JAVA_HOME
+
+```shell
+export JAVA_HOME=/usr/lib/jvm/java-17-amazon-corretto
+```
+
+Install Spark
+
+```shell
+wget https://archive.apache.org/dist/spark/spark-3.5.4/spark-3.5.4-bin-hadoop3.tgz
+tar xzf spark-3.5.4-bin-hadoop3.tgz
+sudo mv spark-3.5.4-bin-hadoop3 /opt
+export SPARK_HOME=/opt/spark-3.5.4-bin-hadoop3/
+mkdir /tmp/spark-events
+```
+
+Set `SPARK_MASTER` env var (IP address will need to be edited):
+
+```shell
+export SPARK_MASTER=spark://172.31.34.87:7077
+```
+
+Set `SPARK_LOCAL_DIRS` to point to EBS volume
+
+```shell
+sudo mkdir /mnt/tmp
+sudo chmod 777 /mnt/tmp
+mv $SPARK_HOME/conf/spark-env.sh.template $SPARK_HOME/conf/spark-env.sh
+```
+
+Add the following entry to `spark-env.sh`:
+
+```shell
+SPARK_LOCAL_DIRS=/mnt/tmp
+```
+
+Start Spark in standalone mode:
+
+```shell
+$SPARK_HOME/sbin/start-master.sh
+$SPARK_HOME/sbin/start-worker.sh $SPARK_MASTER
+```
+
+Install Hadoop jar files:
+
+```shell
+wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar -P $SPARK_HOME/jars
+wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.1026/aws-java-sdk-bundle-1.11.1026.jar -P $SPARK_HOME/jars
+```
+
+Add credentials to `~/.aws/credentials`:
+
+```shell
+[default]
+aws_access_key_id=your-access-key
+aws_secret_access

(datafusion-site) branch asf-staging updated: Commit build products

2025-04-09 Thread github-bot
This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch asf-staging
in repository https://gitbox.apache.org/repos/asf/datafusion-site.git


The following commit(s) were added to refs/heads/asf-staging by this push:
 new 0a9cfe7  Commit build products
0a9cfe7 is described below

commit 0a9cfe7c5614349d462927f94e3eef6ca81ae233
Author: Build Pelican (action) 
AuthorDate: Wed Apr 9 23:04:58 2025 +

Commit build products
---
 blog/2025/04/10/fastest-tpch-generator/index.html   |  6 +++---
 blog/author/andrew-lamb-achraf-b-and-sean-smith.html|  6 +++---
 blog/category/blog.html |  6 +++---
 blog/feed.xml   |  6 +++---
 blog/feeds/all-en.atom.xml  | 12 ++--
 blog/feeds/andrew-lamb-achraf-b-and-sean-smith.atom.xml | 12 ++--
 blog/feeds/andrew-lamb-achraf-b-and-sean-smith.rss.xml  |  6 +++---
 blog/feeds/blog.atom.xml| 12 ++--
 blog/index.html |  6 +++---
 9 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/blog/2025/04/10/fastest-tpch-generator/index.html b/blog/2025/04/10/fastest-tpch-generator/index.html
index 62a9864..3e09c2d 100644
--- a/blog/2025/04/10/fastest-tpch-generator/index.html
+++ b/blog/2025/04/10/fastest-tpch-generator/index.html
@@ -57,7 +57,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-{% endcomment %}
+{% endcomment %}x
 -->
 
 /* Table borders */
@@ -69,8 +69,8 @@ th, td {
   padding: 3px;
 }
 
-TLDR: TPC-H SF=100 in 1min using tpchgen-rs vs 30min+ with dbgen
-3 members of the Apache DataFusion community used Rust and open source
+TLDR: TPC-H SF=100 in 1min using tpchgen-rs vs 30min+ with dbgen.
+3 members of the Apache DataFusion community used Rust and open source
 development to build tpchgen-rs, a fully open TPC-H data generator over 20x
 faster than any other implementation we know of.
 It is now possible to create the TPC-H SF=100 dataset in 72.23 seconds (1.4 GB/s
diff --git a/blog/author/andrew-lamb-achraf-b-and-sean-smith.html b/blog/author/andrew-lamb-achraf-b-and-sean-smith.html
index a308e6a..7b48354 100644
--- a/blog/author/andrew-lamb-achraf-b-and-sean-smith.html
+++ b/blog/author/andrew-lamb-achraf-b-and-sean-smith.html
@@ -69,7 +69,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-{% endcomment %}
+{% endcomment %}x
 -->
 
 /* Table borders */
@@ -81,8 +81,8 @@ th, td {
   padding: 3px;
 }
 
-TLDR: TPC-H SF=100 in 1min using tpchgen-rs vs 30min+ with dbgen
-3 members of the Apache DataFusion community used Rust and open source
+TLDR: TPC-H SF=100 in 1min using tpchgen-rs vs 30min+ with dbgen.
+3 members of the Apache DataFusion community used Rust and open source
 development to build tpchgen-rs, a fully open TPC-H data generator over …
 
 
diff --git a/blog/category/blog.html b/blog/category/blog.html
index 17e20f8..bcf7e8c 100644
--- a/blog/category/blog.html
+++ b/blog/category/blog.html
@@ -69,7 +69,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-{% endcomment %}
+{% endcomment %}x
 -->
 
 /* Table borders */
@@ -81,8 +81,8 @@ th, td {
   padding: 3px;
 }
 
-TLDR: TPC-H SF=100 in 1min using tpchgen-rs vs 30min+ with dbgen
-3 members of the Apache DataFusion community used Rust and open source
+TLDR: TPC-H SF=100 in 1min using tpchgen-rs vs 30min+ with dbgen.
+3 members of the Apache DataFusion community used Rust and open source
 development to build tpchgen-rs, a fully open TPC-H data generator over …
 
 
diff --git a/blog/feed.xml b/blog/feed.xml
index 01fe8ab..430f736 100644
--- a/blog/feed.xml
+++ b/blog/feed.xml
@@ -13,7 +13,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.

(datafusion-site) branch site/tpch_data_generator updated (495bad4 -> 623404b)

2025-04-09 Thread alamb
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a change to branch site/tpch_data_generator
in repository https://gitbox.apache.org/repos/asf/datafusion-site.git


from 495bad4  tweak
 add 623404b  remove errant asterisks

No new revisions were added by this update.

Summary of changes:
 content/blog/2025-04-10-fastest-tpch-generator.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)





(datafusion-site) branch site/tpch_data_generator updated (d782afd -> 495bad4)

2025-04-09 Thread alamb
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a change to branch site/tpch_data_generator
in repository https://gitbox.apache.org/repos/asf/datafusion-site.git


from d782afd  Update content/blog/2025-04-10-fastest-tpch-generator.md
 add 495bad4  tweak

No new revisions were added by this update.

Summary of changes:
 content/blog/2025-04-10-fastest-tpch-generator.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)





(datafusion-site) branch asf-staging updated: Commit build products

2025-04-09 Thread github-bot
This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch asf-staging
in repository https://gitbox.apache.org/repos/asf/datafusion-site.git


The following commit(s) were added to refs/heads/asf-staging by this push:
 new 960619c  Commit build products
960619c is described below

commit 960619c802c5e9ac12a6f778e0c4995c5fdfd529
Author: Build Pelican (action) 
AuthorDate: Wed Apr 9 23:05:30 2025 +

Commit build products
---
 blog/2025/04/10/fastest-tpch-generator/index.html   | 2 +-
 blog/feeds/all-en.atom.xml  | 2 +-
 blog/feeds/andrew-lamb-achraf-b-and-sean-smith.atom.xml | 2 +-
 blog/feeds/blog.atom.xml| 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/blog/2025/04/10/fastest-tpch-generator/index.html b/blog/2025/04/10/fastest-tpch-generator/index.html
index 3e09c2d..e03c029 100644
--- a/blog/2025/04/10/fastest-tpch-generator/index.html
+++ b/blog/2025/04/10/fastest-tpch-generator/index.html
@@ -104,7 +104,7 @@ $ tpchgen-cli -s 10 --format=parquet
 
 What is TPC-H / dbgen?
 The popular TPC-H benchmark (https://www.tpc.org/tpch/, often referred to as TPCH) helps evaluate the
-performance of database systems on OLAP queries, the kind used to build BI
+performance of database systems on OLAP queries, the kind used to build BI
 dashboards.
 [the -/+ pair differs only in the OLAP link markup, which this plain-text digest strips]
 TPC-H has become a de facto standard for analytic systems. While there are well
 known limitations (https://www.vldb.org/pvldb/vol9/p204-leis.pdf) as the data and queries do not well represent many real world
diff --git a/blog/feeds/all-en.atom.xml b/blog/feeds/all-en.atom.xml
index 674243c..bde3316 100644
--- a/blog/feeds/all-en.atom.xml
+++ b/blog/feeds/all-en.atom.xml
@@ -88,7 +88,7 @@ $ tpchgen-cli -s 10 --format=parquet
 
 What is TPC-H / dbgen?
 The popular TPC-H benchmark (often referred to as TPCH) helps evaluate the
-performance of database systems on OLAP queries, the kind used to build BI
+performance of database systems on OLAP queries, the kind used to build BI
 dashboards.
 [escaped feed markup elided; same link-markup-only fix as in index.html]
diff --git a/blog/feeds/andrew-lamb-achraf-b-and-sean-smith.atom.xml b/blog/feeds/andrew-lamb-achraf-b-and-sean-smith.atom.xml
index 08be951..21b8660 100644
--- a/blog/feeds/andrew-lamb-achraf-b-and-sean-smith.atom.xml
+++ b/blog/feeds/andrew-lamb-achraf-b-and-sean-smith.atom.xml
@@ -88,7 +88,7 @@ $ tpchgen-cli -s 10 --format=parquet
 [same one-line fix in the escaped "What is TPC-H / dbgen?" paragraph]
diff --git a/blog/feeds/blog.atom.xml b/blog/feeds/blog.atom.xml
index 7ae5546..c3fdf9f 100644
--- a/blog/feeds/blog.atom.xml
+++ b/blog/feeds/blog.atom.xml
@@ -88,7 +88,7 @@ $ tpchgen-cli -s 10 --format=parquet
 [same one-line fix in the escaped "What is TPC-H / dbgen?" paragraph]


(datafusion-ballista) branch main updated: chore: Remove some arrow references (#1232)

2025-04-09 Thread agrove
This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-ballista.git


The following commit(s) were added to refs/heads/main by this push:
 new 13eb8112 chore: Remove some arrow references (#1232)
13eb8112 is described below

commit 13eb811256bb251492d66255170f58d5583265be
Author: Andy Grove 
AuthorDate: Wed Apr 9 10:03:54 2025 -0600

chore: Remove some arrow references (#1232)
---
 .devcontainer/devcontainer.json  |2 +-
 .gitattributes   |6 -
 .github/workflows/comment_bot.yml|   72 -
 .github/workflows/docker.yml |   24 +-
 LICENSE.txt  | 2008 --
 ballista-cli/README.md   |2 +-
 ballista/client/README.md|2 +-
 ballista/client/tests/context_unsupported.rs |2 +-
 ballista/core/proto/ballista.proto   |2 +-
 ballista/core/proto/datafusion.proto |2 +-
 benchmarks/README.md |2 +-
 benchmarks/spark/README.md   |2 +-
 ci/scripts/rust_build.sh |   45 -
 dev/build-ballista-docker.sh |   10 +-
 dev/release/download-python-wheels.py|2 +-
 dev/release/generate-changelog.py|2 +-
 rustfmt.toml |4 -
 17 files changed, 27 insertions(+), 2162 deletions(-)

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 753774ee..38472603 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -1,7 +1,7 @@
 // For format details, see https://aka.ms/devcontainer.json. For config options, see the
 // README at: https://github.com/devcontainers/templates/tree/main/src/rust
 {
-   "name": "arrow-ballista",
+   "name": "datafusion-ballista",
    // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
    "image": "mcr.microsoft.com/devcontainers/rust:latest",
    "features": {
diff --git a/.gitattributes b/.gitattributes
deleted file mode 100644
index fac7bf85..
--- a/.gitattributes
+++ /dev/null
@@ -1,6 +0,0 @@
-r/R/RcppExports.R linguist-generated=true
-r/R/arrowExports.R linguist-generated=true
-r/src/RcppExports.cpp linguist-generated=true
-r/src/arrowExports.cpp linguist-generated=true
-r/man/*.Rd linguist-generated=true
-
diff --git a/.github/workflows/comment_bot.yml b/.github/workflows/comment_bot.yml
deleted file mode 100644
index 81b15939..
--- a/.github/workflows/comment_bot.yml
+++ /dev/null
@@ -1,72 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-name: Comment Bot
-
-on:
-  # TODO(kszucs): support pull_request_review_comment
-  issue_comment:
-    types:
-      - created
-      - edited
-
-jobs:
-  crossbow:
-    name: Listen!
-    if: startsWith(github.event.comment.body, '@github-actions crossbow')
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout Arrow
-        uses: actions/checkout@v4
-        with:
-          repository: apache/arrow
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.10"
-      - name: Install Archery and Crossbow dependencies
-        run: pip install -e dev/archery[bot]
-      - name: Handle Github comment event
-        env:
-          ARROW_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          CROSSBOW_GITHUB_TOKEN: ${{ secrets.CROSSBOW_GITHUB_TOKEN }}
-        run: |
-          archery trigger-bot \
-            --event-name ${{ github.event_name }} \
-            --event-payload ${{ github.event_path }}
-
-  rebase:
-    name: "Rebase"
-    if: startsWith(github.event.comment.body, '@github-actions rebase')
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - uses: r-lib/actions/pr-fetch@master
-        with:
-          repo-token: ${{ secrets.GITHUB_TOKEN }}
-      - name: Rebase on ${{ github.repository }} main
-        run: |
-          set -ex
-          git config user.name "$(git log -1 --pretty=format:%an)"
-          git config u

(datafusion) branch main updated: chore(deps): bump blake3 from 1.8.0 to 1.8.1 (#15650)

2025-04-09 Thread comphead
This is an automated email from the ASF dual-hosted git repository.

comphead pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
 new d2a1077e9f chore(deps): bump blake3 from 1.8.0 to 1.8.1 (#15650)
d2a1077e9f is described below

commit d2a1077e9f9627fb089989713c9bbbda8a956e46
Author: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
AuthorDate: Wed Apr 9 07:39:41 2025 -0700

chore(deps): bump blake3 from 1.8.0 to 1.8.1 (#15650)

Bumps [blake3](https://github.com/BLAKE3-team/BLAKE3) from 1.8.0 to 1.8.1.
- [Release notes](https://github.com/BLAKE3-team/BLAKE3/releases)
- [Commits](https://github.com/BLAKE3-team/BLAKE3/compare/1.8.0...1.8.1)

---
updated-dependencies:
- dependency-name: blake3
  dependency-version: 1.8.1
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] 
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 Cargo.lock | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 0bf2432de0..ac430e1457 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1119,9 +1119,9 @@ dependencies = [
 
 [[package]]
 name = "blake3"
-version = "1.8.0"
+version = "1.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index";
-checksum = "34a796731680be7931955498a16a10b2270c7762963d5d570fdbfe02dcbf314f"
+checksum = "389a099b34312839e16420d499a9cad9650541715937ffbdd40d36f49e77eeb3"
 dependencies = [
  "arrayref",
  "arrayvec",





(datafusion) branch dependabot/cargo/main/blake3-1.8.1 deleted (was e06fcc576c)

2025-04-09 Thread github-bot
This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a change to branch dependabot/cargo/main/blake3-1.8.1
in repository https://gitbox.apache.org/repos/asf/datafusion.git


 was e06fcc576c chore(deps): bump blake3 from 1.8.0 to 1.8.1

The revisions that were on this branch are still contained in
other references; therefore, this change does not discard any commits
from the repository.





(datafusion) branch dependabot/cargo/main/blake3-1.8.1 created (now e06fcc576c)

2025-04-09 Thread github-bot
This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a change to branch dependabot/cargo/main/blake3-1.8.1
in repository https://gitbox.apache.org/repos/asf/datafusion.git


  at e06fcc576c chore(deps): bump blake3 from 1.8.0 to 1.8.1

No new revisions were added by this update.





(datafusion) branch main updated: Ignore false positive warning (#15635)

2025-04-09 Thread alamb
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
 new 68475eb981 Ignore false positive warning (#15635)
68475eb981 is described below

commit 68475eb9817ec791f8a934642ddea965b04e8b0e
Author: Jannik Steinmann 
AuthorDate: Wed Apr 9 17:17:28 2025 +0200

Ignore false positive warning (#15635)

Signed-off-by: Jannik Steinmann 
---
 datafusion/physical-optimizer/src/aggregate_statistics.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/datafusion/physical-optimizer/src/aggregate_statistics.rs b/datafusion/physical-optimizer/src/aggregate_statistics.rs
index 0d3d83c583..28ee10eb65 100644
--- a/datafusion/physical-optimizer/src/aggregate_statistics.rs
+++ b/datafusion/physical-optimizer/src/aggregate_statistics.rs
@@ -42,6 +42,7 @@ impl AggregateStatistics {
 
 impl PhysicalOptimizerRule for AggregateStatistics {
     #[cfg_attr(feature = "recursive_protection", recursive::recursive)]
+    #[allow(clippy::only_used_in_recursion)] // See https://github.com/rust-lang/rust-clippy/issues/14566
     fn optimize(
         &self,
         plan: Arc<dyn ExecutionPlan>,
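
A minimal, self-contained sketch of the false-positive shape this commit
silences (illustrative names, not the DataFusion types): clippy's
`only_used_in_recursion` lint fires when a parameter is used only by the
recursive call, but here the `&self` parameter is fixed by the trait signature
and cannot be removed.

```rust
struct Optimizer;

impl Optimizer {
    // Without the allow, clippy suggests dropping `self` because it is only
    // used by the recursive call; a trait-defined signature (like
    // PhysicalOptimizerRule::optimize) cannot drop it, hence the false positive.
    #[allow(clippy::only_used_in_recursion)]
    fn optimize(&self, depth: usize) -> usize {
        if depth == 0 {
            0
        } else {
            self.optimize(depth - 1) + 1 // `self` appears only here
        }
    }
}

fn main() {
    assert_eq!(Optimizer.optimize(3), 3);
}
```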





(datafusion) branch main updated: chore: avoid erroneuous warning for FFI table operation (only not default value) (#15579)

2025-04-09 Thread alamb
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
 new a337ca75d7 chore: avoid erroneuous warning for FFI table operation (only not default value) (#15579)
a337ca75d7 is described below

commit a337ca75d758f2b92905045fc89e971a955851d4
Author: Chen Chongchen 
AuthorDate: Wed Apr 9 22:58:13 2025 +0800

chore: avoid erroneuous warning for FFI table operation (only not default value) (#15579)

* warning only not default value

* fmt
---
 datafusion/common/src/config.rs | 47 +
 1 file changed, 43 insertions(+), 4 deletions(-)

diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
index b0f17630c9..449ce22693 100644
--- a/datafusion/common/src/config.rs
+++ b/datafusion/common/src/config.rs
@@ -149,9 +149,17 @@ macro_rules! config_namespace {
                         // $(#[allow(deprecated)])?
                         {
                             $(let value = $transform(value);)? // Apply transformation if specified
-                            $(log::warn!($warn);)? // Log warning if specified
                             #[allow(deprecated)]
-                            self.$field_name.set(rem, value.as_ref())
+                            let ret = self.$field_name.set(rem, value.as_ref());
+
+                            $(if !$warn.is_empty() {
+                                let default: $field_type = $default;
+                                #[allow(deprecated)]
+                                if default != self.$field_name {
+                                    log::warn!($warn);
+                                }
+                            })? // Log warning if specified, and the value is not the default
+                            ret
                         }
                     },
                 )*
@@ -1999,8 +2007,8 @@ mod tests {
     use std::collections::HashMap;
 
     use crate::config::{
-        ConfigEntry, ConfigExtension, ConfigFileType, ExtensionOptions, Extensions,
-        TableOptions,
+        ConfigEntry, ConfigExtension, ConfigField, ConfigFileType, ExtensionOptions,
+        Extensions, TableOptions,
     };
 
 #[derive(Default, Debug, Clone)]
@@ -2085,6 +2093,37 @@ mod tests {
         assert_eq!(table_config.csv.escape.unwrap() as char, '\'');
     }
 
+    #[test]
+    fn warning_only_not_default() {
+        use std::sync::atomic::AtomicUsize;
+        static COUNT: AtomicUsize = AtomicUsize::new(0);
+        use log::{Level, LevelFilter, Metadata, Record};
+        struct SimpleLogger;
+        impl log::Log for SimpleLogger {
+            fn enabled(&self, metadata: &Metadata) -> bool {
+                metadata.level() <= Level::Info
+            }
+
+            fn log(&self, record: &Record) {
+                if self.enabled(record.metadata()) {
+                    COUNT.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+                }
+            }
+            fn flush(&self) {}
+        }
+        log::set_logger(&SimpleLogger).unwrap();
+        log::set_max_level(LevelFilter::Info);
+        let mut sql_parser_options = crate::config::SqlParserOptions::default();
+        sql_parser_options
+            .set("enable_options_value_normalization", "false")
+            .unwrap();
+        assert_eq!(COUNT.load(std::sync::atomic::Ordering::Relaxed), 0);
+        sql_parser_options
+            .set("enable_options_value_normalization", "true")
+            .unwrap();
+        assert_eq!(COUNT.load(std::sync::atomic::Ordering::Relaxed), 1);
+    }
+
 #[cfg(feature = "parquet")]
 #[test]
 fn parquet_table_options() {
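
The net behavior of the macro change, distilled into a standalone sketch
(illustrative names; the real code is generated by `config_namespace!`): apply
the new value first, then emit the deprecation warning only when the resulting
value differs from the field's default, so that mechanically re-applying
default settings (as the FFI table path does) stays quiet.

```rust
fn set_with_warning(field: &mut bool, default: bool, new_value: bool, warn: &str) {
    *field = new_value;
    // Warn only for a non-default value; round-tripping a config through
    // FFI re-sets every field and should not produce spurious warnings.
    if *field != default {
        eprintln!("warning: {warn}");
    }
}

fn main() {
    let mut normalize = false; // default: false
    set_with_warning(&mut normalize, false, false, "deprecated option"); // silent
    set_with_warning(&mut normalize, false, true, "deprecated option"); // warns once
}
```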





(datafusion-comet) branch main updated: chore: Parquet fuzz testing (#1623)

2025-04-09 Thread agrove
This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git


The following commit(s) were added to refs/heads/main by this push:
 new 4740e94eb chore: Parquet fuzz testing (#1623)
4740e94eb is described below

commit 4740e94eb921822f1cd22fa008dd49d8c7a18b52
Author: Andy Grove 
AuthorDate: Wed Apr 9 11:40:12 2025 -0600

chore: Parquet fuzz testing (#1623)
---
 .../apache/comet/testing/ParquetGenerator.scala| 18 +++--
 .../org/apache/comet/CometFuzzTestSuite.scala  | 80 +-
 2 files changed, 90 insertions(+), 8 deletions(-)

diff --git a/spark/src/main/scala/org/apache/comet/testing/ParquetGenerator.scala b/spark/src/main/scala/org/apache/comet/testing/ParquetGenerator.scala
index 9b91d5d93..520af27ea 100644
--- a/spark/src/main/scala/org/apache/comet/testing/ParquetGenerator.scala
+++ b/spark/src/main/scala/org/apache/comet/testing/ParquetGenerator.scala
@@ -34,11 +34,14 @@ import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes, DecimalType,
 object ParquetGenerator {
 
   /**
-   * Arbitrary date to use as base for generating temporal columns. Random integers will be added
-   * to or subtracted from this value.
+   * Date to use as base for generating temporal columns. Random integers will be added to or
+   * subtracted from this value.
+   *
+   * Date was chosen to trigger generating a timestamp that's larger than a 64-bit nanosecond
+   * timestamp can represent so that we test support for INT96 timestamps.
    */
-  private val baseDate =
-    new SimpleDateFormat("yyyy-MM-DD hh:mm:ss").parse("2024-05-25 12:34:56").getTime
+  val defaultBaseDate: Long =
+    new SimpleDateFormat("yyyy-MM-DD hh:mm:ss").parse("-05-25 12:34:56").getTime
 
   private val primitiveTypes = Seq(
     DataTypes.BooleanType,
@@ -217,13 +220,13 @@ object ParquetGenerator {
           null
       }
       case DataTypes.DateType =>
-        Range(0, numRows).map(_ => new java.sql.Date(baseDate + r.nextInt()))
+        Range(0, numRows).map(_ => new java.sql.Date(options.baseDate + r.nextInt()))
       case DataTypes.TimestampType =>
-        Range(0, numRows).map(_ => new Timestamp(baseDate + r.nextInt()))
+        Range(0, numRows).map(_ => new Timestamp(options.baseDate + r.nextInt()))
       case DataTypes.TimestampNTZType =>
         Range(0, numRows).map(_ =>
           LocalDateTime.ofInstant(
-            Instant.ofEpochMilli(baseDate + r.nextInt()),
+            Instant.ofEpochMilli(options.baseDate + r.nextInt()),
             ZoneId.systemDefault()))
       case _ => throw new IllegalStateException(s"Cannot generate data for $dataType yet")
     }
@@ -234,6 +237,7 @@ object ParquetGenerator {
 case class DataGenOptions(
 allowNull: Boolean = true,
 generateNegativeZero: Boolean = true,
+baseDate: Long = ParquetGenerator.defaultBaseDate,
 generateArray: Boolean = false,
 generateStruct: Boolean = false,
 generateMap: Boolean = false)
diff --git a/spark/src/test/scala/org/apache/comet/CometFuzzTestSuite.scala b/spark/src/test/scala/org/apache/comet/CometFuzzTestSuite.scala
index 1d12b4bfb..195308526 100644
--- a/spark/src/test/scala/org/apache/comet/CometFuzzTestSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/CometFuzzTestSuite.scala
@@ -20,6 +20,7 @@
 package org.apache.comet
 
 import java.io.File
+import java.text.SimpleDateFormat
 
 import scala.util.Random
 
@@ -32,6 +33,8 @@ import org.apache.spark.sql.comet.{CometNativeScanExec, CometScanExec}
 import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.internal.SQLConf.ParquetOutputTimestampType
+import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes, StructType}
 
 import org.apache.comet.testing.{DataGenOptions, ParquetGenerator}
 
@@ -57,7 +60,13 @@ class CometFuzzTestSuite extends CometTestBase with AdaptiveSparkPlanHelper {
       CometConf.COMET_ENABLED.key -> "false",
       SQLConf.SESSION_LOCAL_TIMEZONE.key -> defaultTimezone) {
       val options =
-        DataGenOptions(generateArray = true, generateStruct = true, generateNegativeZero = false)
+        DataGenOptions(
+          generateArray = true,
+          generateStruct = true,
+          generateNegativeZero = false,
+          // override base date due to known issues with experimental scans
+          baseDate =
+            new SimpleDateFormat("yyyy-MM-DD hh:mm:ss").parse("2024-05-25 12:34:56").getTime)
       ParquetGenerator.makeParquetFile(random, spark, filename, 1000, options)
     }
   }
@@ -166,6 +175,75 @@ class CometFuzzTestSuite extends CometTestBase with AdaptiveSparkPlanHelper {
     }
   }
 
+  test("Parquet temporal types written as INT96") {
+
+    // there are known issues with INT96 support in the new experi
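
Background for "larger than a 64-bit nanosecond timestamp can represent":
signed 64-bit nanoseconds span only about ±292 years around the Unix epoch
(roughly 1677-09-21 through 2262-04-11), so a base date outside that window
cannot be stored as INT64 nanoseconds and exercises the INT96 path instead.
A quick standalone check of the arithmetic (plain Rust, not project code):

```rust
fn main() {
    // i64::MAX nanoseconds converted to years around 1970.
    let max_seconds = i64::MAX / 1_000_000_000;
    let years = max_seconds as f64 / (365.25 * 24.0 * 3600.0);
    println!("±{years:.0} years around 1970"); // prints "±292 years around 1970"
}
```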

(datafusion) branch main updated: Rename protobuf Java package (#15658)

2025-04-09 Thread alamb
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
 new 5ab5a03724 Rename protobuf Java package (#15658)
5ab5a03724 is described below

commit 5ab5a03724b3afa009ba480a022145875972d08c
Author: Andy Grove 
AuthorDate: Wed Apr 9 15:02:06 2025 -0600

Rename protobuf Java package (#15658)
---
 datafusion/proto/proto/datafusion.proto | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto
index 2e028eb291..908b95ab56 100644
--- a/datafusion/proto/proto/datafusion.proto
+++ b/datafusion/proto/proto/datafusion.proto
@@ -21,7 +21,7 @@ syntax = "proto3";
 package datafusion;
 
 option java_multiple_files = true;
-option java_package = "org.apache.arrow.datafusion.protobuf";
+option java_package = "org.apache.datafusion.protobuf";
 option java_outer_classname = "DatafusionProto";
 
 import "datafusion/proto-common/proto/datafusion_common.proto";





(datafusion) branch dependabot/cargo/main/mimalloc-0.1.46 created (now d2d924ca3a)

2025-04-09 Thread github-bot
This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a change to branch dependabot/cargo/main/mimalloc-0.1.46
in repository https://gitbox.apache.org/repos/asf/datafusion.git


  at d2d924ca3a chore(deps): bump mimalloc from 0.1.44 to 0.1.46

No new revisions were added by this update.





(datafusion-testing) branch main updated: Updates for changes in DataFusion PR #15462 (#9)

2025-04-09 Thread alamb
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-testing.git


The following commit(s) were added to refs/heads/main by this push:
 new e9f9e22  Updates for changes in DataFusion PR #15462 (#9)
e9f9e22 is described below

commit e9f9e22ccf09145a7368f80fd6a871f11e2b4481
Author: Bruce Ritchie 
AuthorDate: Wed Apr 9 06:31:33 2025 -0400

Updates for changes in DataFusion PR #15462 (#9)
---
 data/sqlite/random/aggregates/slt_good_11.slt | 4 ++--
 data/sqlite/random/groupby/slt_good_12.slt| 8 
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/data/sqlite/random/aggregates/slt_good_11.slt b/data/sqlite/random/aggregates/slt_good_11.slt
index b9dd3ff..4f26801 100644
--- a/data/sqlite/random/aggregates/slt_good_11.slt
+++ b/data/sqlite/random/aggregates/slt_good_11.slt
@@ -6356,9 +6356,9 @@ SELECT DISTINCT + ( - COUNT ( * ) ) * + COUNT ( * ) AS col0 FROM tab2
 
 -9
 
-# Datafusion - Datafusion expected results:
-query error DataFusion error: Arrow error: Divide by zero error
+query I rowsort label-1028
 SELECT - 69 FROM tab0 WHERE + - col1 NOT IN ( - + col1, CAST ( + col0 AS INTEGER ), + + ( + + col1 ) / + - 0 * + col0 )
+
 
 query I rowsort
 SELECT ALL - col1 * - col0 + - ( 88 ) AS col1 FROM tab1
diff --git a/data/sqlite/random/groupby/slt_good_12.slt b/data/sqlite/random/groupby/slt_good_12.slt
index 98bde5d..58bbf38 100644
--- a/data/sqlite/random/groupby/slt_good_12.slt
+++ b/data/sqlite/random/groupby/slt_good_12.slt
@@ -12137,9 +12137,9 @@ SELECT - col1 + - col1 AS col1 FROM tab2 WHERE NULL IS NULL GROUP BY col1
 -122
 -82
 
-# Datafusion - Datafusion expected results:
-query error DataFusion error: Arrow error: Divide by zero error
+query I rowsort
 SELECT DISTINCT col1 AS col1 FROM tab0 WHERE NOT + CASE + 87 WHEN col1 THEN col1 ELSE NULL END NOT BETWEEN - col1 / + col1 AND NULL GROUP BY col1 HAVING NOT col1 * 75 IS NOT NULL
+
 
 query I rowsort
 SELECT ( - ( - col0 ) ) FROM tab0 GROUP BY col0 HAVING NULL IN ( 93 )
@@ -26969,9 +26969,9 @@ SELECT DISTINCT - + col0 AS col2, col0 / + - col0 FROM tab1 AS cor0 WHERE col2 *
 -82
 -1
 
-# Datafusion - Datafusion expected results:
-query error DataFusion error: Arrow error: Divide by zero error
+query I rowsort
 SELECT - col0 * 87 * - col1 FROM tab0 WHERE NOT col1 BETWEEN 67 + 65 AND - 41 / - col1 + + + col0 * + col1 * col2 / - col0 GROUP BY col1, col0 HAVING NOT col0 IS NOT NULL
+
 
 query I rowsort
 SELECT DISTINCT ( - - col1 ) + + 43 FROM tab2 GROUP BY col1, col1, col1





(datafusion) branch main updated: Improve binary_op benchmark (#15632)

2025-04-09 Thread alamb
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
 new 1385140fb2 Improve binary_op benchmark (#15632)
1385140fb2 is described below

commit 1385140fb2cdd8b474e3ec768e21aed4d34eccd7
Author: Andrew Lamb 
AuthorDate: Wed Apr 9 06:35:12 2025 -0400

Improve binary_op benchmark (#15632)
---
 datafusion/physical-expr/benches/binary_op.rs | 80 ++-
 1 file changed, 4 insertions(+), 76 deletions(-)

diff --git a/datafusion/physical-expr/benches/binary_op.rs b/datafusion/physical-expr/benches/binary_op.rs
index 7ac5c04852..59a602df05 100644
--- a/datafusion/physical-expr/benches/binary_op.rs
+++ b/datafusion/physical-expr/benches/binary_op.rs
@@ -17,7 +17,6 @@
 
 use arrow::{
     array::BooleanArray,
-    compute::{bool_and, bool_or},
     datatypes::{DataType, Field, Schema},
 };
 use arrow::{array::StringArray, record_batch::RecordBatch};
@@ -28,7 +27,7 @@ use datafusion_physical_expr::{
     planner::logical2physical,
     PhysicalExpr,
 };
-use std::sync::{Arc, LazyLock};
+use std::sync::Arc;
 
 /// Generates BooleanArrays with different true/false distributions for benchmarking.
 ///
@@ -130,75 +129,6 @@ fn generate_boolean_cases(
 cases
 }
 
-/// Benchmarks boolean operations `false_count/bool_or` and 
`true_count/bool_and` on [`BooleanArray`]
-/// You can run this benchmark with:
-/// ```sh
-/// # test true_count/false_count
-/// TEST_BOOL_COUNT=1 cargo bench --bench binary_op -- boolean_ops
-/// # test bool_or/bool_and
-/// cargo bench --bench binary_op -- boolean_ops
-/// ```
-fn benchmark_boolean_ops(c: &mut Criterion) {
-let len = 1_000_000; // Use one million elements for clear performance 
differentiation
-static TEST_BOOL_COUNT: LazyLock<bool> =
-LazyLock::new(|| match std::env::var("TEST_BOOL_COUNT") {
-Ok(_) => {
-println!("TEST_BOOL_COUNT=ON");
-true
-}
-Err(_) => {
-println!("TEST_BOOL_COUNT=OFF");
-false
-}
-});
-
-// Determine the test function to be executed based on the ENV 
`TEST_BOOL_COUNT`
-fn test_func<const TEST_ALL_FALSE: bool>(array: &BooleanArray) -> bool {
-// Use false_count for all false and true_count for all true
-if *TEST_BOOL_COUNT {
-if TEST_ALL_FALSE {
-array.false_count() == array.len()
-} else {
-array.true_count() == array.len()
-}
-}
-// Use bool_or for all false and bool_and for all true
-else if TEST_ALL_FALSE {
-match bool_or(array) {
-Some(v) => !v,
-None => false,
-}
-} else {
-bool_and(array).unwrap_or(false)
-}
-}
-
-// Test cases for false_count and bool_or
-{
-let test_cases = generate_boolean_cases::<true>(len);
-for (scenario, array) in test_cases {
-let arr_ref = Arc::new(array);
-
-// Benchmark test_func across different scenarios
-c.bench_function(&format!("boolean_ops/or/{}", scenario), |b| {
-b.iter(|| test_func::<true>(black_box(&arr_ref)))
-});
-}
-}
-// Test cases for true_count and bool_and
-{
-let test_cases = generate_boolean_cases::<false>(len);
-for (scenario, array) in test_cases {
-let arr_ref = Arc::new(array);
-
-// Benchmark test_func across different scenarios
-c.bench_function(&format!("boolean_ops/and/{}", scenario), |b| {
-b.iter(|| test_func::<false>(black_box(&arr_ref)))
-});
-}
-}
-}
-
 /// Benchmarks AND/OR operator short-circuiting by evaluating complex regex 
conditions.
 ///
 /// Creates 6 test scenarios per operator:
@@ -257,12 +187,14 @@ fn benchmark_binary_op_in_short_circuit(c: &mut 
Criterion) {
 );
 
 // Create physical binary expressions
+// a AND ((b ~ regex) AND (c ~ regex))
 let expr_and = BinaryExpr::new(
 Arc::new(Column::new("a", 0)),
 Operator::And,
 logical2physical(&right_condition_and, &schema),
 );
 
+// a OR ((b ~ regex) OR (c ~ regex))
 let expr_or = BinaryExpr::new(
 Arc::new(Column::new("a", 0)),
 Operator::Or,
@@ -364,10 +296,6 @@ fn create_record_batch(
 Ok(rbs)
 }
 
-criterion_group!(
-benches,
-benchmark_boolean_ops,
-benchmark_binary_op_in_short_circuit
-);
+criterion_group!(benches, benchmark_binary_op_in_short_circuit);
 
 criterion_main!(benches);

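For context on what the retained benchmark exercises, here is a minimal sketch of the short-circuit idea outside Criterion: with `a AND <expr>` and an all-false `a`, the right operand never needs to be evaluated. This is illustrative only; the crate paths and constructors are assumed from the imports visible in the diff above, not taken from the benchmark itself.

```rust
use std::sync::Arc;

use arrow::array::BooleanArray;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;
use datafusion_common::{Result, ScalarValue};
use datafusion_expr::Operator;
use datafusion_physical_expr::expressions::{BinaryExpr, Column, Literal};
use datafusion_physical_expr::PhysicalExpr;

fn main() -> Result<()> {
    // One all-false boolean column: the cheap left-hand side of the AND
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Boolean, false)]));
    let batch = RecordBatch::try_new(
        Arc::clone(&schema),
        vec![Arc::new(BooleanArray::from(vec![false; 1024]))],
    )?;

    // a AND true: with `a` all false, an implementation that inspects the
    // left side first never needs to evaluate the right operand
    let expr = BinaryExpr::new(
        Arc::new(Column::new("a", 0)),
        Operator::And,
        Arc::new(Literal::new(ScalarValue::from(true))),
    );
    let value = expr.evaluate(&batch)?;
    println!("{value:?}");
    Ok(())
}
```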




(datafusion) branch main updated: Minor: refine comments for statistics computation (#15647)

2025-04-09 Thread alamb
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
 new b86619e84b Minor: refine comments for statistics computation (#15647)
b86619e84b is described below

commit b86619e84b24271f28c9cb4bf3751b6f3686eb7b
Author: xudong.w 
AuthorDate: Wed Apr 9 18:33:59 2025 +0800

Minor: refine comments for statistics computation (#15647)
---
 datafusion/core/src/datasource/listing/table.rs |  8 ++--
 datafusion/datasource/src/statistics.rs | 15 ---
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/datafusion/core/src/datasource/listing/table.rs 
b/datafusion/core/src/datasource/listing/table.rs
index c05b7835ed..5848506da2 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -716,9 +716,13 @@ impl ListingOptions {
 #[derive(Debug)]
 pub struct ListingTable {
table_paths: Vec<ListingTableUrl>,
-/// File fields only
+/// `file_schema` contains only the columns physically stored in the data 
files themselves.
+/// - Represents the actual fields found in files like Parquet, CSV, 
etc.
+/// - Used when reading the raw data from files
 file_schema: SchemaRef,
-/// File fields + partition columns
+/// `table_schema` combines `file_schema` + partition columns
+/// - Partition columns are derived from directory paths (not stored 
in files)
+/// - These are columns like "year=2022/month=01" in paths like 
`/data/year=2022/month=01/file.parquet`
 table_schema: SchemaRef,
 options: ListingOptions,
definition: Option<String>,
diff --git a/datafusion/datasource/src/statistics.rs 
b/datafusion/datasource/src/statistics.rs
index 040bf754dd..e1a91c0533 100644
--- a/datafusion/datasource/src/statistics.rs
+++ b/datafusion/datasource/src/statistics.rs
@@ -506,7 +506,7 @@ pub fn compute_file_group_statistics(
 ///
 /// # Parameters
 /// * `file_groups` - Vector of file groups to process
-/// * `file_schema` - Schema of the files
+/// * `table_schema` - Schema of the table
 /// * `collect_stats` - Whether to collect statistics
 /// * `inexact_stats` - Whether to mark the resulting statistics as inexact
 ///
@@ -516,7 +516,7 @@ pub fn compute_file_group_statistics(
 /// * The summary statistics across all file groups, aka all files summary 
statistics
 pub fn compute_all_files_statistics(
file_groups: Vec<FileGroup>,
-file_schema: SchemaRef,
+table_schema: SchemaRef,
 collect_stats: bool,
 inexact_stats: bool,
) -> Result<(Vec<FileGroup>, Statistics)> {
@@ -526,16 +526,17 @@ pub fn compute_all_files_statistics(
 for file_group in file_groups {
 file_groups_with_stats.push(compute_file_group_statistics(
 file_group,
-Arc::clone(&file_schema),
+Arc::clone(&table_schema),
 collect_stats,
 )?);
 }
 
 // Then summary statistics across all file groups
-let mut statistics =
-compute_summary_statistics(&file_groups_with_stats, &file_schema, 
|file_group| {
-file_group.statistics()
-});
+let mut statistics = compute_summary_statistics(
+&file_groups_with_stats,
+&table_schema,
+|file_group| file_group.statistics(),
+);
 
 if inexact_stats {
 statistics = statistics.to_inexact()

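The distinction the new comments draw can be made concrete with a small, self-contained sketch (field names here are illustrative, not DataFusion's): the table schema is simply the file schema extended with partition columns recovered from the directory layout.

```rust
use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Schema};

fn main() {
    // Columns physically stored in the Parquet/CSV files themselves
    let file_schema = Schema::new(vec![
        Field::new("id", DataType::Int64, false),
        Field::new("value", DataType::Float64, true),
    ]);

    // Columns derived from directory paths such as
    // `/data/year=2022/month=01/file.parquet` (not stored in the files)
    let partition_cols = vec![
        Field::new("year", DataType::Utf8, false),
        Field::new("month", DataType::Utf8, false),
    ];

    // table_schema = file_schema + partition columns
    let table_schema = Arc::new(Schema::new(
        file_schema
            .fields()
            .iter()
            .map(|f| f.as_ref().clone())
            .chain(partition_cols)
            .collect::<Vec<_>>(),
    ));

    println!("{table_schema:#?}");
}
```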




(datafusion-comet) branch main updated: chore: Refactor QueryPlanSerde to use idiomatic Scala and reduce verbosity (#1609)

2025-04-09 Thread agrove
This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git


The following commit(s) were added to refs/heads/main by this push:
 new 168715106 chore: Refactor QueryPlanSerde to use idiomatic Scala and 
reduce verbosity (#1609)
168715106 is described below

commit 168715106b4587cc509c4e022f72a70c50b9bf27
Author: Andy Grove 
AuthorDate: Mon Apr 7 11:09:20 2025 -0600

chore: Refactor QueryPlanSerde to use idiomatic Scala and reduce verbosity 
(#1609)

* start refactor

* more refactoring

* address feedback
---
 .../apache/comet/CometSparkSessionExtensions.scala | 315 -
 1 file changed, 118 insertions(+), 197 deletions(-)

diff --git 
a/spark/src/main/scala/org/apache/comet/CometSparkSessionExtensions.scala 
b/spark/src/main/scala/org/apache/comet/CometSparkSessionExtensions.scala
index daa1f19ea..183d04ee7 100644
--- a/spark/src/main/scala/org/apache/comet/CometSparkSessionExtensions.scala
+++ b/spark/src/main/scala/org/apache/comet/CometSparkSessionExtensions.scala
@@ -352,7 +352,7 @@ class CometSparkSessionExtensions
  */
 // spotless:on
 private def transform(plan: SparkPlan): SparkPlan = {
-  def transform1(op: SparkPlan): Option[Operator] = {
+  def operator2Proto(op: SparkPlan): Option[Operator] = {
 if (op.children.forall(_.isInstanceOf[CometNativeExec])) {
   QueryPlanSerde.operator2Proto(
 op,
@@ -366,6 +366,14 @@ class CometSparkSessionExtensions
 }
   }
 
+  /**
+   * Convert operator to proto and then apply a transformation to wrap the 
proto in a new
+   * plan.
+   */
+  def newPlanWithProto(op: SparkPlan, fun: Operator => SparkPlan): 
SparkPlan = {
+operator2Proto(op).map(fun).getOrElse(op)
+  }
+
   plan.transformUp {
 // Fully native scan for V1
 case scan: CometScanExec
@@ -384,97 +392,54 @@ class CometSparkSessionExtensions
   CometScanWrapper(nativeOp.get, cometOp)
 
 case op: ProjectExec =>
-  val newOp = transform1(op)
-  newOp match {
-case Some(nativeOp) =>
-  CometProjectExec(
-nativeOp,
-op,
-op.output,
-op.projectList,
-op.child,
-SerializedPlan(None))
-case None =>
-  op
-  }
+  newPlanWithProto(
+op,
+CometProjectExec(_, op, op.output, op.projectList, op.child, 
SerializedPlan(None)))
 
 case op: FilterExec =>
-  val newOp = transform1(op)
-  newOp match {
-case Some(nativeOp) =>
-  CometFilterExec(
-nativeOp,
-op,
-op.output,
-op.condition,
-op.child,
-SerializedPlan(None))
-case None =>
-  op
-  }
+  newPlanWithProto(
+op,
+CometFilterExec(_, op, op.output, op.condition, op.child, 
SerializedPlan(None)))
 
 case op: SortExec =>
-  val newOp = transform1(op)
-  newOp match {
-case Some(nativeOp) =>
-  CometSortExec(
-nativeOp,
-op,
-op.output,
-op.outputOrdering,
-op.sortOrder,
-op.child,
-SerializedPlan(None))
-case None =>
-  op
-  }
+  newPlanWithProto(
+op,
+CometSortExec(
+  _,
+  op,
+  op.output,
+  op.outputOrdering,
+  op.sortOrder,
+  op.child,
+  SerializedPlan(None)))
 
 case op: LocalLimitExec =>
-  val newOp = transform1(op)
-  newOp match {
-case Some(nativeOp) =>
-  CometLocalLimitExec(nativeOp, op, op.limit, op.child, 
SerializedPlan(None))
-case None =>
-  op
-  }
+  newPlanWithProto(
+op,
+CometLocalLimitExec(_, op, op.limit, op.child, 
SerializedPlan(None)))
 
 case op: GlobalLimitExec if op.offset == 0 =>
-  val newOp = transform1(op)
-  newOp match {
-case Some(nativeOp) =>
-  CometGlobalLimitExec(nativeOp, op, op.limit, op.child, 
SerializedPlan(None))
-case None =>
-  op
-  }
+  newPlanWithProto(
+op,
+CometGlobalLimitExec(_, op, op.limit, op.child, 
SerializedPlan(None)))
 
 case op: CollectLimitExec
 if isCometNative(op.child) && 
CometConf.COMET_EXEC_COLLECT_LIMIT_ENABLED.get(conf)
   && isCometShuffleEnabled(conf)
   && op.offset == 0 =>
-  QueryPlanSerde.operator2Proto(op) match {
- 

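The heart of the refactor is replacing each repeated `match` on the optional proto with a single `map(...).getOrElse(op)` helper. A sketch of that shape, written here in Rust for illustration (the names are hypothetical stand-ins for `SparkPlan`/`Operator`, not Comet code):

```rust
/// Convert `op` to proto and wrap the proto in a new plan on success;
/// fall back to the original operator otherwise (mirrors the Scala
/// `operator2Proto(op).map(fun).getOrElse(op)` pattern).
fn new_plan_with_proto<Op, Proto>(
    op: Op,
    to_proto: impl Fn(&Op) -> Option<Proto>,
    wrap: impl FnOnce(Proto) -> Op,
) -> Op {
    to_proto(&op).map(wrap).unwrap_or(op)
}

fn main() {
    // Hypothetical stand-ins for the Spark plan types
    let plan = new_plan_with_proto(
        String::from("ProjectExec"),
        |op| Some(format!("proto({op})")), // conversion succeeded
        |proto| format!("CometProjectExec[{proto}]"), // wrap proto in new plan
    );
    assert_eq!(plan, "CometProjectExec[proto(ProjectExec)]");
}
```
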
(datafusion) branch main updated: perf: Introduce sort prefix computation for early TopK exit optimization on partially sorted input (10x speedup on top10 bench) (#15563)

2025-04-09 Thread berkay
This is an automated email from the ASF dual-hosted git repository.

berkay pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
 new 7a4577e963 perf: Introduce sort prefix computation for early TopK exit 
optimization on partially sorted input (10x speedup on top10 bench) (#15563)
7a4577e963 is described below

commit 7a4577e963fec6a9af6028fd932b003352141392
Author: Geoffrey Claude 
AuthorDate: Wed Apr 9 12:35:40 2025 +0200

perf: Introduce sort prefix computation for early TopK exit optimization on 
partially sorted input (10x speedup on top10 bench) (#15563)

* perf: Introduce sort prefix computation for early TopK exit optimization 
on partially sorted input

* perf: Use same `common_sort_prefix` nomenclature everywhere

* perf: Remove redundant argument to `sort_partially_satisfied`

* perf: Clarify that the `common_sort_prefix` is normalized

* perf: Update the topk tests for normalized projections

* perf: Rename `worst` to `max` to keep naming consistent with heap 
nomenclature

* perf: Add `NULLS FIRST` and `NULLS LAST` TopK sql logic tests

* perf: Rename sqllogic topk test columns and reduce batch size

* perf: Update TopK header doc with "Partial Sort Optimization" section

* fix: Reset `SortExec`'s `EmissionType` to `Final` on partially sorted 
input

- Without a fetch, the entire input data must be sorted before emitting 
results
- With a fetch, we can optimize for an early exit, but the results will 
still be emitted once all the necessary input data has been processed
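
A simplified, standalone sketch of the common-sort-prefix idea described above (not the DataFusion implementation, which works over normalized `LexRequirement`s): count how many leading sort keys the input ordering already provides; with a non-zero prefix, TopK can stop consuming input once a row's prefix columns exceed the current heap maximum.

```rust
/// Length of the longest prefix of `required` sort keys that the
/// input's `existing` ordering already satisfies.
fn common_sort_prefix_length(required: &[&str], existing: &[&str]) -> usize {
    required
        .iter()
        .zip(existing)
        .take_while(|(req, have)| req == have)
        .count()
}

fn main() {
    // Query asks for ORDER BY a ASC, b ASC ... LIMIT k; input is sorted by a
    let required = ["a ASC", "b ASC"];
    let existing = ["a ASC"];
    // Prefix length 1: once a row's `a` exceeds the heap's max, TopK can exit
    assert_eq!(common_sort_prefix_length(&required, &existing), 1);
}
```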
---
 .../tests/physical_optimizer/enforce_sorting.rs|   2 +-
 .../src/equivalence/properties/mod.rs  |  59 +++--
 datafusion/physical-plan/src/sorts/sort.rs |  45 +++-
 datafusion/physical-plan/src/topk/mod.rs   | 267 +++--
 datafusion/sqllogictest/test_files/topk.slt| 162 +
 datafusion/sqllogictest/test_files/window.slt  |   2 +-
 6 files changed, 489 insertions(+), 48 deletions(-)

diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs 
b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs
index 4d2c875d3f..d4b84a52f4 100644
--- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs
+++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs
@@ -1652,7 +1652,7 @@ async fn test_remove_unnecessary_sort7() -> Result<()> {
) as Arc<dyn ExecutionPlan>;
 
 let expected_input = [
-"SortExec: TopK(fetch=2), expr=[non_nullable_col@1 ASC], 
preserve_partitioning=[false]",
+"SortExec: TopK(fetch=2), expr=[non_nullable_col@1 ASC], 
preserve_partitioning=[false], sort_prefix=[non_nullable_col@1 ASC]",
 "  SortExec: expr=[non_nullable_col@1 ASC, nullable_col@0 ASC], 
preserve_partitioning=[false]",
 "DataSourceExec: partitions=1, partition_sizes=[0]",
 ];
diff --git a/datafusion/physical-expr/src/equivalence/properties/mod.rs 
b/datafusion/physical-expr/src/equivalence/properties/mod.rs
index 9cf9897001..5b34a02a91 100644
--- a/datafusion/physical-expr/src/equivalence/properties/mod.rs
+++ b/datafusion/physical-expr/src/equivalence/properties/mod.rs
@@ -546,22 +546,26 @@ impl EquivalenceProperties {
 self.ordering_satisfy_requirement(&sort_requirements)
 }
 
-/// Checks whether the given sort requirements are satisfied by any of the
-/// existing orderings.
-pub fn ordering_satisfy_requirement(&self, reqs: &LexRequirement) -> bool {
-let mut eq_properties = self.clone();
-// First, standardize the given requirement:
-let normalized_reqs = eq_properties.normalize_sort_requirements(reqs);
-
+/// Returns the number of consecutive requirements (starting from the left)
+/// that are satisfied by the plan ordering.
+fn compute_common_sort_prefix_length(
+&self,
+normalized_reqs: &LexRequirement,
+) -> usize {
 // Check whether given ordering is satisfied by constraints first
-if self.satisfied_by_constraints(&normalized_reqs) {
-return true;
+if self.satisfied_by_constraints(normalized_reqs) {
+// If the constraints satisfy all requirements, return the full 
normalized requirements length
+return normalized_reqs.len();
 }
 
-for normalized_req in normalized_reqs {
+let mut eq_properties = self.clone();
+
+for (i, normalized_req) in normalized_reqs.iter().enumerate() {
 // Check whether given ordering is satisfied
-if !eq_properties.ordering_satisfy_single(&normalized_req) {
-return false;
+if !eq_properties.ordering_satisfy_single(normalized_req) {
+// As soon as one requirement is not satisfied, return
+// how many we've satisfied so far
+