(datafusion) branch main updated: fix: dictionary encoded column to partition column casting bug (#15652)
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git

The following commit(s) were added to refs/heads/main by this push:
     new 94ed08a364 fix: dictionary encoded column to partition column casting bug (#15652)
94ed08a364 is described below

commit 94ed08a3645758a77b9d62a99f801a103532311e
Author: haruband
AuthorDate: Thu Apr 10 00:37:23 2025 +0900

    fix: dictionary encoded column to partition column casting bug (#15652)

    * Fix partition values bugs from dictionary encoded column

    * Add some sqllogictests
---
 datafusion/datasource/src/write/demux.rs          | 10 ++++------
 datafusion/sqllogictest/test_files/dictionary.slt |  7 +++++++
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/datafusion/datasource/src/write/demux.rs b/datafusion/datasource/src/write/demux.rs
index fc2e5daf92..49c3a64d24 100644
--- a/datafusion/datasource/src/write/demux.rs
+++ b/datafusion/datasource/src/write/demux.rs
@@ -28,8 +28,8 @@ use datafusion_common::error::Result;
 use datafusion_physical_plan::SendableRecordBatchStream;
 
 use arrow::array::{
-    builder::UInt64Builder, cast::AsArray, downcast_dictionary_array, RecordBatch,
-    StringArray, StructArray,
+    builder::UInt64Builder, cast::AsArray, downcast_dictionary_array, ArrayAccessor,
+    RecordBatch, StringArray, StructArray,
 };
 use arrow::datatypes::{DataType, Schema};
 use datafusion_common::cast::{
@@ -482,10 +482,8 @@ fn compute_partition_keys_by_row<'a>(
                 .ok_or(exec_datafusion_err!("it is not yet supported to write to hive partitions with datatype {}", dtype))?;
 
-                for val in array.values() {
-                    partition_values.push(
-                        Cow::from(val.ok_or(exec_datafusion_err!("Cannot partition by null value for column {}", col))?),
-                    );
+                for i in 0..rb.num_rows() {
+                    partition_values.push(Cow::from(array.value(i)));
                 }
             },
             _ => unreachable!(),

diff --git a/datafusion/sqllogictest/test_files/dictionary.slt b/datafusion/sqllogictest/test_files/dictionary.slt
index 778b3537d1..1769f42c2d 100644
--- a/datafusion/sqllogictest/test_files/dictionary.slt
+++ b/datafusion/sqllogictest/test_files/dictionary.slt
@@ -450,3 +450,10 @@ query I
 select dense_rank() over (order by arrow_cast('abc', 'Dictionary(UInt16, Utf8)'));
 ----
 1
+
+# Test dictionary encoded column to partition column casting
+statement ok
+CREATE TABLE test0 AS VALUES ('foo',1), ('bar',2), ('foo',3);
+
+statement ok
+COPY (SELECT arrow_cast(column1, 'Dictionary(Int32, Utf8)') AS column1, column2 FROM test0) TO 'test_files/scratch/copy/part_dict_test' STORED AS PARQUET PARTITIONED BY (column1);
\ No newline at end of file

-
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org
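The demux fix above is easiest to see with a toy example: a dictionary array stores one entry per *distinct* value, so iterating `array.values()` yields fewer entries than the batch has rows. A minimal sketch using the arrow crate directly (not the DataFusion code itself; `main`, the literals, and the variable names are illustrative only):

```rust
use arrow::array::{Array, DictionaryArray, StringArray};
use arrow::datatypes::Int32Type;

fn main() {
    // Rows "foo", "bar", "foo": three rows, but only two distinct values.
    let dict: DictionaryArray<Int32Type> =
        vec!["foo", "bar", "foo"].into_iter().collect();
    let values = dict
        .values()
        .as_any()
        .downcast_ref::<StringArray>()
        .unwrap();

    // Buggy shape: one partition key per distinct value (2 keys for 3 rows).
    let per_value: Vec<&str> = (0..values.len()).map(|i| values.value(i)).collect();
    assert_eq!(per_value.len(), 2);

    // Fixed shape: resolve the dictionary key for every row (3 keys for 3 rows).
    let per_row: Vec<&str> = (0..dict.len())
        .map(|i| values.value(dict.keys().value(i) as usize))
        .collect();
    assert_eq!(per_row, vec!["foo", "bar", "foo"]);
}
```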
(datafusion-site) branch site/tpch_data_generator updated (4f3ae9b -> c81497d)
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a change to branch site/tpch_data_generator
in repository https://gitbox.apache.org/repos/asf/datafusion-site.git

    from 4f3ae9b  Update content/blog/2025-04-10-fastest-tpch-generator.md
     add c81497d  Update content/blog/2025-04-10-fastest-tpch-generator.md

No new revisions were added by this update.

Summary of changes:
 content/blog/2025-04-10-fastest-tpch-generator.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

-
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org
(datafusion-site) branch asf-staging updated: Commit build products
This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch asf-staging
in repository https://gitbox.apache.org/repos/asf/datafusion-site.git

The following commit(s) were added to refs/heads/asf-staging by this push:
     new 603b6b8  Commit build products
603b6b8 is described below

commit 603b6b819836b798d13260b8e26a1dd8a2cd4283
Author: Build Pelican (action)
AuthorDate: Wed Apr 9 22:59:43 2025 +0000

    Commit build products
---
 blog/2025/04/10/fastest-tpch-generator/index.html  | 500 +
 .../andrew-lamb-achraf-b-and-sean-smith.html       | 117 +
 blog/category/blog.html                            |  48 ++
 blog/feed.xml                                      |  31 +-
 blog/feeds/all-en.atom.xml                         | 470 ++-
 .../andrew-lamb-achraf-b-and-sean-smith.atom.xml   | 470 +++
 .../andrew-lamb-achraf-b-and-sean-smith.rss.xml    |  31 ++
 blog/feeds/blog.atom.xml                           | 470 ++-
 blog/images/fastest-tpch-generator/lamb-theory.png | Bin 0 -> 300479 bytes
 .../fastest-tpch-generator/parquet-performance.png | Bin 0 -> 61946 bytes
 .../fastest-tpch-generator/tbl-performance.png     | Bin 0 -> 49477 bytes
 blog/index.html                                    |  48 ++
 12 files changed, 2182 insertions(+), 3 deletions(-)

diff --git a/blog/2025/04/10/fastest-tpch-generator/index.html b/blog/2025/04/10/fastest-tpch-generator/index.html
new file mode 100644
index 0000000..e4aac31
--- /dev/null
+++ b/blog/2025/04/10/fastest-tpch-generator/index.html
@@ -0,0 +1,500 @@
+
+
+
+
+
+
+tpchgen-rs World’s fastest open source TPC-H data generator, written in Rust - Apache DataFusion Blog
+
+
+
+
+
+hljs.highlightAll();
+
+
+
+
+
+ Apache DataFusion Blog
+
+
+
+
+
+
+
+About
+
+
+RSS
+
+
+
+
+
+
+
+
+
+
+
+
+ tpchgen-rs World’s fastest open source TPC-H data generator, written in Rust
+
+ Posted on: Thu 10 April 2025 by Andrew Lamb, Achraf B, and Sean Smith
+
+
+/* Table borders */
+table, th, td {
+  border: 1px solid black;
+  border-collapse: collapse;
+}
+th, td {
+  padding: 3px;
+}
+
+3 members of the https://datafusion.apache.org/";>Apache DataFusion community used Rust and open source
+development to build https://github.com/clflushopt/tpchgen-rs";>tpchgen-rs, a fully open TPC-H data generator over 20x
+faster than any other implementation we know of.
+It is now possible to create the TPC-H SF=100 dataset in 72.23 seconds (1.4 GB/s
+😎) on a Macbook Air M3 with 16GB of memory, compared to the classic dbgen
+which takes 30 minutes1 (0.05GB/sec). On the same machine, it takes less than
+2 minutes to create all 3.6 GB of SF=100 in https://parquet.apache.org/";>Apache Parquet format, which takes 44 minutes using https://duckdb.org";>DuckDB.
+It is finally convenient and efficient to run TPC-H queries locally when testing
+analytical engines such as DataFusion.
+
+Figure 1: Time to create TPC-H dataset for Scale Factor (see below) 1, 10,
+100 and 1000 as 8 individual SNAPPY compressed parquet files using a 22 core GCP
+VM with 88GB of memory. For Scale Factor(SF) 100 tpchgen takes 1 minute and 14 seconds and
+https://duckdb.org";>DuckDB takes 17 minutes and 48 seconds. For SF=1000, tpchgen takes 10
+minutes and 26 seconds and uses about 5 GB of RAM at peak, and we could not measure
+DuckDB’s time as it https://duckdb.org/docs/stable/extensions/tpch.html#resource-usage-of-the-data-generator";>requires 647 GB of RAM, more than the 88 GB that was
+available on our test machine. The testing methodology is in the
+https://github.com/clflushopt/tpchgen-rs/blob/main/benchmarks/BENCHMARKS.md";>documentation.
+This blog explains what TPC-H is, how we ported the vintage C data generator to
+Rust (yes, https://www.reddit.com/r/rust/comments/4ri2gn/riir_rewrite_it_in_rust/";>RWIR) and optimized its performance over the course of a few weeks
+of part-time work. We began this project so we can easily generate TPC-H data in
+https://datafusion.apache.org/";>Apache DataFusion and https://glaredb.com/";>GlareDB.
+Try it for yourself
+The tool is entirely open source under the https://www.apache.org/licenses/LICENSE-2.0";>Apache 2.0 license. Visit the https://github.com/clflushopt/tpchgen-rs";>tpchgen-rs repository or try it for yourself by running the following commands after https://www.rust-lang.org/tools/install";>installing Rust:
+$ cargo install tpchgen-cli
+
+# create SF=1 in classic TBL format
+$ tpchgen-cli -s 1
+
+# create SF=10 in Parquet
+$ tpchgen-cli -s 10 --format=parquet
+
+What is TPC-H / dbgen?
+The popular https://www.tpc.org/tpch/";>TPC-H benchmark (often referred to as TPCH)
(datafusion-site) branch site/tpch_data_generator updated (c81497d -> d782afd)
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a change to branch site/tpch_data_generator
in repository https://gitbox.apache.org/repos/asf/datafusion-site.git

    from c81497d  Update content/blog/2025-04-10-fastest-tpch-generator.md
     add d782afd  Update content/blog/2025-04-10-fastest-tpch-generator.md

No new revisions were added by this update.

Summary of changes:
 content/blog/2025-04-10-fastest-tpch-generator.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

-
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org
(datafusion-site) branch asf-staging updated: Commit build products
This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch asf-staging
in repository https://gitbox.apache.org/repos/asf/datafusion-site.git

The following commit(s) were added to refs/heads/asf-staging by this push:
     new 0db2e2c  Commit build products
0db2e2c is described below

commit 0db2e2cb91d8fa5bfe59ae08873e59c94f4d11de
Author: Build Pelican (action)
AuthorDate: Wed Apr 9 22:59:50 2025 +0000

    Commit build products
---
 blog/2025/04/10/fastest-tpch-generator/index.html       |  3 ++-
 blog/author/andrew-lamb-achraf-b-and-sean-smith.html    |  7 +++----
 blog/category/blog.html                                 |  7 +++----
 blog/feed.xml                                           |  7 +++----
 blog/feeds/all-en.atom.xml                              | 10 +++++-----
 blog/feeds/andrew-lamb-achraf-b-and-sean-smith.atom.xml | 10 +++++-----
 blog/feeds/andrew-lamb-achraf-b-and-sean-smith.rss.xml  |  7 +++----
 blog/feeds/blog.atom.xml                                | 10 +++++-----
 blog/index.html                                         |  7 +++----
 9 files changed, 32 insertions(+), 36 deletions(-)

diff --git a/blog/2025/04/10/fastest-tpch-generator/index.html b/blog/2025/04/10/fastest-tpch-generator/index.html
index e4aac31..62a9864 100644
--- a/blog/2025/04/10/fastest-tpch-generator/index.html
+++ b/blog/2025/04/10/fastest-tpch-generator/index.html
@@ -69,7 +69,8 @@ th, td {
   padding: 3px;
 }
 
-3 members of the https://datafusion.apache.org/";>Apache DataFusion community used Rust and open source
+TLDR: TPC-H SF=100 in 1min using tpchgen-rs vs 30min+ with dbgen
+3 members of the https://datafusion.apache.org/";>Apache DataFusion community used Rust and open source
 development to build https://github.com/clflushopt/tpchgen-rs";>tpchgen-rs, a fully open TPC-H data generator over 20x
 faster than any other implementation we know of.
 It is now possible to create the TPC-H SF=100 dataset in 72.23 seconds (1.4 GB/s

diff --git a/blog/author/andrew-lamb-achraf-b-and-sean-smith.html b/blog/author/andrew-lamb-achraf-b-and-sean-smith.html
index 6f07c46..a308e6a 100644
--- a/blog/author/andrew-lamb-achraf-b-and-sean-smith.html
+++ b/blog/author/andrew-lamb-achraf-b-and-sean-smith.html
@@ -81,10 +81,9 @@ th, td {
   padding: 3px;
 }
 
-3 members of the https://datafusion.apache.org/";>Apache DataFusion community used Rust and open source
-development to build https://github.com/clflushopt/tpchgen-rs";>tpchgen-rs, a fully open TPC-H data generator over 20x
-faster than any other implementation we know of.
-It is now …
+TLDR: TPC-H SF=100 in 1min using tpchgen-rs vs 30min+ with dbgen
+3 members of the https://datafusion.apache.org/";>Apache DataFusion community used Rust and open source
+development to build https://github.com/clflushopt/tpchgen-rs";>tpchgen-rs, a fully open TPC-H data generator over …
 
 Continue Reading

diff --git a/blog/category/blog.html b/blog/category/blog.html
index 7e5dc70..17e20f8 100644
--- a/blog/category/blog.html
+++ b/blog/category/blog.html
@@ -81,10 +81,9 @@ th, td {
   padding: 3px;
 }
 
-3 members of the https://datafusion.apache.org/";>Apache DataFusion community used Rust and open source
-development to build https://github.com/clflushopt/tpchgen-rs";>tpchgen-rs, a fully open TPC-H data generator over 20x
-faster than any other implementation we know of.
-It is now …
+TLDR: TPC-H SF=100 in 1min using tpchgen-rs vs 30min+ with dbgen
+3 members of the https://datafusion.apache.org/";>Apache DataFusion community used Rust and open source
+development to build https://github.com/clflushopt/tpchgen-rs";>tpchgen-rs, a fully open TPC-H data generator over …
 
 Continue Reading

diff --git a/blog/feed.xml b/blog/feed.xml
index 0947603..01fe8ab 100644
--- a/blog/feed.xml
+++ b/blog/feed.xml
@@ -25,10 +25,9 @@ th, td {
   padding: 3px;
 }
 
-3 members of the Apache DataFusion community used Rust and open source
-development to build tpchgen-rs;, a fully open TPC-H data generator over 20x
-faster than any other implementation we know of.
-It is now …
 http://purl.org/dc/elements/1.1/";>Andrew Lamb, Achraf B, and Sean SmithThu, 10 Apr 2025 00:00:00 +0000tag:datafusion.apache.org,2025-04-10:/blog/2025/04/10/fastest-tpch-generatorblogApache DataFusion Python 46.0.0 Releasedhttps://datafusion.apache.org/blog/2025/03/30/datafusion-python-46.0.0
+TLDR: TPC-H SF=100 in 1min using tpchgen-rs vs 30min+ with dbgen: https://datafusion.apache.org/">Apache DataFusion community used Rust and open source
+development to build tpchgen-rs;, a fully open TPC-H data generator over …http://purl.org/dc/elements/1.1/";>Andrew Lamb, Achraf B, an
(datafusion-comet) branch main updated: feat: add MAP type support for first level (#1603)
This is an automated email from the ASF dual-hosted git repository.

comphead pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git

The following commit(s) were added to refs/heads/main by this push:
     new ba53a7f2f  feat: add MAP type support for first level (#1603)
ba53a7f2f is described below

commit ba53a7f2f0a73b4120197f709e5d71b1d46dbf7a
Author: Oleks V
AuthorDate: Wed Apr 9 13:35:20 2025 -0700

    feat: add MAP type support for first level (#1603)

    * feat: add MAP type support for first level
---
 .../src/main/scala/org/apache/spark/sql/comet/util/Utils.scala |  3 ++-
 .../src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala |  2 ++
 .../scala/org/apache/spark/sql/comet/CometNativeScanExec.scala |  2 +-
 .../main/scala/org/apache/spark/sql/comet/CometScanExec.scala  |  2 +-
 .../scala/org/apache/comet/exec/CometNativeReaderSuite.scala   |  9 +++++++++
 .../test/scala/org/apache/comet/parquet/ParquetReadSuite.scala | 10 +++++-----
 6 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/common/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala b/common/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala
index 46e474170..3041ea2c8 100644
--- a/common/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala
+++ b/common/src/main/scala/org/apache/spark/sql/comet/util/Utils.scala
@@ -277,7 +277,8 @@ object Utils {
     case v @ (_: BitVector | _: TinyIntVector | _: SmallIntVector | _: IntVector |
         _: BigIntVector | _: Float4Vector | _: Float8Vector | _: VarCharVector |
         _: DecimalVector | _: DateDayVector | _: TimeStampMicroTZVector | _: VarBinaryVector |
-        _: FixedSizeBinaryVector | _: TimeStampMicroVector | _: StructVector | _: ListVector) =>
+        _: FixedSizeBinaryVector | _: TimeStampMicroVector | _: StructVector | _: ListVector |
+        _: MapVector) =>
       v.asInstanceOf[FieldVector]
     case _ =>
       throw new SparkException(s"Unsupported Arrow Vector for $reason: ${valueVector.getClass}")

diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
index 53fbb6d44..bf779a6b5 100644
--- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
+++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
@@ -70,6 +70,8 @@ object QueryPlanSerde extends Logging with CometExprShim {
       s.fields.map(_.dataType).forall(supportedDataType(_, allowComplex))
     case a: ArrayType if allowComplex =>
       supportedDataType(a.elementType, allowComplex)
+    case m: MapType if allowComplex =>
+      supportedDataType(m.keyType, allowComplex) && supportedDataType(m.valueType, allowComplex)
     case dt =>
       emitWarning(s"unsupported Spark data type: $dt")
       false

diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometNativeScanExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometNativeScanExec.scala
index fd71e92fa..3ddc134c7 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/CometNativeScanExec.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometNativeScanExec.scala
@@ -230,10 +230,10 @@ object CometNativeScanExec extends DataTypeSupport {
   }
 
   override def isAdditionallySupported(dt: DataType): Boolean = {
-    // TODO add map
     dt match {
       case s: StructType => s.fields.map(_.dataType).forall(isTypeSupported)
       case a: ArrayType => isTypeSupported(a.elementType)
+      case m: MapType => isTypeSupported(m.keyType) && isTypeSupported(m.valueType)
       case _ => false
     }
   }

diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/CometScanExec.scala b/spark/src/main/scala/org/apache/spark/sql/comet/CometScanExec.scala
index 87f48f841..43a925fd8 100644
--- a/spark/src/main/scala/org/apache/spark/sql/comet/CometScanExec.scala
+++ b/spark/src/main/scala/org/apache/spark/sql/comet/CometScanExec.scala
@@ -480,10 +480,10 @@ object CometScanExec extends DataTypeSupport {
 
   override def isAdditionallySupported(dt: DataType): Boolean = {
     if (CometConf.COMET_NATIVE_SCAN_IMPL.get() == CometConf.SCAN_NATIVE_ICEBERG_COMPAT) {
-      // TODO add map
       dt match {
         case s: StructType => s.fields.map(_.dataType).forall(isTypeSupported)
         case a: ArrayType => isTypeSupported(a.elementType)
+        case m: MapType => isTypeSupported(m.keyType) && isTypeSupported(m.valueType)
         case _ => false
       }
     } else {

diff --git a/spark/src/test/scala/org/apache/comet/exec/CometNativeReaderSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometNativeReaderSuite.scala
index 3593ae220..9696ef1bb 100644
--- a/spark/src/test/scala/org/apache/comet/exec/CometNativeReaderSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/exec/CometNativeReaderSuite.scala
@@ -134,4 +134,13 @@ class CometNativeReaderSuite extends CometTestBase with Adap
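For readers following along on the Rust side of the stack, the recursive type-support check this commit extends to maps looks roughly like the following Rust analogue over arrow-rs `DataType` (a hedged sketch, not Comet's actual Scala code; the primitive allow-list here is illustrative only):

```rust
use arrow::datatypes::DataType;

// A Map is supported iff both its key type and value type are supported,
// mirroring the `case m: MapType => ...` arm added in the Scala diff above.
fn is_type_supported(dt: &DataType) -> bool {
    match dt {
        DataType::Struct(fields) => fields.iter().all(|f| is_type_supported(f.data_type())),
        DataType::List(field) => is_type_supported(field.data_type()),
        DataType::Map(entries, _sorted) => match entries.data_type() {
            // Map entries are physically a struct of (key, value) children.
            DataType::Struct(kv) => kv.iter().all(|f| is_type_supported(f.data_type())),
            _ => false,
        },
        // Illustrative primitive allow-list.
        DataType::Boolean | DataType::Int32 | DataType::Int64 | DataType::Utf8 => true,
        _ => false,
    }
}
```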
(datafusion-comet) branch asf-site updated: Publish built docs triggered by e2383921f714c857c31bbbc1a2f427bb0608b46c
This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git

The following commit(s) were added to refs/heads/asf-site by this push:
     new 66a5bca66  Publish built docs triggered by e2383921f714c857c31bbbc1a2f427bb0608b46c
66a5bca66 is described below

commit 66a5bca66b9f9eebe97186889a012e025a490a46
Author: github-actions[bot]
AuthorDate: Wed Apr 9 22:58:01 2025 +0000

    Publish built docs triggered by e2383921f714c857c31bbbc1a2f427bb0608b46c
---
 _sources/contributor-guide/benchmarking.md.txt     |   4 +
 .../contributor-guide/benchmarking_aws_ec2.md.txt  | 223 ++++++++
 contributor-guide/benchmarking.html                |   4 +
 contributor-guide/benchmarking_aws_ec2.html        | 567 +++++++++++
 objects.inv                                        | Bin 786 -> 807 bytes
 searchindex.js                                     |   2 +-
 6 files changed, 799 insertions(+), 1 deletion(-)

diff --git a/_sources/contributor-guide/benchmarking.md.txt b/_sources/contributor-guide/benchmarking.md.txt
index 1193ada62..15934d7f5 100644
--- a/_sources/contributor-guide/benchmarking.md.txt
+++ b/_sources/contributor-guide/benchmarking.md.txt
@@ -22,6 +22,10 @@ under the License.
 To track progress on performance, we regularly run benchmarks derived from TPC-H and
 TPC-DS. Data generation and benchmarking documentation and scripts are available in the
 [DataFusion Benchmarks](https://github.com/apache/datafusion-benchmarks) GitHub repository.
 
+Available benchmarking guides:
+
+- [Benchmarking on AWS EC2](benchmarking_aws_ec2)
+
 We also have many micro benchmarks that can be run from an IDE located [here](https://github.com/apache/datafusion-comet/tree/main/spark/src/test/scala/org/apache/spark/sql/benchmark).
 
 ## Current Benchmark Results

diff --git a/_sources/contributor-guide/benchmarking_aws_ec2.md.txt b/_sources/contributor-guide/benchmarking_aws_ec2.md.txt
new file mode 100644
index 000000000..0ec33bf7e
--- /dev/null
+++ b/_sources/contributor-guide/benchmarking_aws_ec2.md.txt
@@ -0,0 +1,223 @@
+
+
+# Comet Benchmarking in AWS
+
+This guide is for setting up benchmarks on AWS EC2 with a single node with Parquet files located in S3.
+
+## Data Generation
+
+- Create an EC2 instance with an EBS volume sized for approximately 2x the size of
+  the dataset to be generated (200 GB for scale factor 100, 2 TB for scale factor 1000, and so on)
+- Create an S3 bucket to store the Parquet files
+
+Install prerequisites:
+
+```shell
+sudo yum install -y docker git python3-pip
+
+sudo systemctl start docker
+sudo systemctl enable docker
+sudo usermod -aG docker ec2-user
+newgrp docker
+
+docker pull ghcr.io/scalytics/tpch-docker:main
+
+pip3 install datafusion
+```
+
+Run the data generation script:
+
+```shell
+git clone https://github.com/apache/datafusion-benchmarks.git
+cd datafusion-benchmarks/tpch
+nohup python3 tpchgen.py generate --scale-factor 100 --partitions 16 &
+```
+
+Check on progress with the following commands:
+
+```shell
+docker ps
+du -h -d 1 data
+```
+
+Fix ownership in the generated files:
+
+```shell
+sudo chown -R ec2-user:docker data
+```
+
+Convert to Parquet:
+
+```shell
+nohup python3 tpchgen.py convert --scale-factor 100 --partitions 16 &
+```
+
+Delete the CSV files:
+
+```shell
+cd data
+rm *.tbl.*
+```
+
+Copy the Parquet files to S3:
+
+```shell
+aws s3 cp . s3://your-bucket-name/top-level-folder/ --recursive
+```
+
+## Install Spark
+
+Install Java
+
+```shell
+sudo yum install -y java-17-amazon-corretto-headless java-17-amazon-corretto-devel
+```
+
+Set JAVA_HOME
+
+```shell
+export JAVA_HOME=/usr/lib/jvm/java-17-amazon-corretto
+```
+
+Install Spark
+
+```shell
+wget https://archive.apache.org/dist/spark/spark-3.5.4/spark-3.5.4-bin-hadoop3.tgz
+tar xzf spark-3.5.4-bin-hadoop3.tgz
+sudo mv spark-3.5.4-bin-hadoop3 /opt
+export SPARK_HOME=/opt/spark-3.5.4-bin-hadoop3/
+mkdir /tmp/spark-events
+```
+
+Set `SPARK_MASTER` env var (IP address will need to be edited):
+
+```shell
+export SPARK_MASTER=spark://172.31.34.87:7077
+```
+
+Set `SPARK_LOCAL_DIRS` to point to EBS volume
+
+```shell
+sudo mkdir /mnt/tmp
+sudo chmod 777 /mnt/tmp
+mv $SPARK_HOME/conf/spark-env.sh.template $SPARK_HOME/conf/spark-env.sh
+```
+
+Add the following entry to `spark-env.sh`:
+
+```shell
+SPARK_LOCAL_DIRS=/mnt/tmp
+```
+
+Start Spark in standalone mode:
+
+```shell
+$SPARK_HOME/sbin/start-master.sh
+$SPARK_HOME/sbin/start-worker.sh $SPARK_MASTER
+```
+
+Install Hadoop jar files:
+
+```shell
+wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar -P $SPARK_HOME/jars
+wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.1026/aws-java-sdk-bundle-1.11.1026.jar -P $SPARK_HOME/jars
+```
+
+Add credentials to `~/.aws/credentials`:
+
+```shell
+[default]
+aws_access_key_id=your-access-key
+aws_secret_access
(datafusion-site) branch asf-staging updated: Commit build products
This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch asf-staging
in repository https://gitbox.apache.org/repos/asf/datafusion-site.git

The following commit(s) were added to refs/heads/asf-staging by this push:
     new 0a9cfe7  Commit build products
0a9cfe7 is described below

commit 0a9cfe7c5614349d462927f94e3eef6ca81ae233
Author: Build Pelican (action)
AuthorDate: Wed Apr 9 23:04:58 2025 +0000

    Commit build products
---
 blog/2025/04/10/fastest-tpch-generator/index.html       |  6 +++---
 blog/author/andrew-lamb-achraf-b-and-sean-smith.html    |  6 +++---
 blog/category/blog.html                                 |  6 +++---
 blog/feed.xml                                           |  6 +++---
 blog/feeds/all-en.atom.xml                              | 12 ++++++------
 blog/feeds/andrew-lamb-achraf-b-and-sean-smith.atom.xml | 12 ++++++------
 blog/feeds/andrew-lamb-achraf-b-and-sean-smith.rss.xml  |  6 +++---
 blog/feeds/blog.atom.xml                                | 12 ++++++------
 blog/index.html                                         |  6 +++---
 9 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/blog/2025/04/10/fastest-tpch-generator/index.html b/blog/2025/04/10/fastest-tpch-generator/index.html
index 62a9864..3e09c2d 100644
--- a/blog/2025/04/10/fastest-tpch-generator/index.html
+++ b/blog/2025/04/10/fastest-tpch-generator/index.html
@@ -57,7 +57,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-{% endcomment %}
+{% endcomment %}x -->
 
 /* Table borders */
@@ -69,8 +69,8 @@ th, td {
   padding: 3px;
 }
 
-TLDR: TPC-H SF=100 in 1min using tpchgen-rs vs 30min+ with dbgen
-3 members of the https://datafusion.apache.org/";>Apache DataFusion community used Rust and open source
+TLDR: TPC-H SF=100 in 1min using tpchgen-rs vs 30min+ with dbgen.
+3 members of the https://datafusion.apache.org/";>Apache DataFusion community used Rust and open source
 development to build https://github.com/clflushopt/tpchgen-rs";>tpchgen-rs, a fully open TPC-H data generator over 20x
 faster than any other implementation we know of.
 It is now possible to create the TPC-H SF=100 dataset in 72.23 seconds (1.4 GB/s

diff --git a/blog/author/andrew-lamb-achraf-b-and-sean-smith.html b/blog/author/andrew-lamb-achraf-b-and-sean-smith.html
index a308e6a..7b48354 100644
--- a/blog/author/andrew-lamb-achraf-b-and-sean-smith.html
+++ b/blog/author/andrew-lamb-achraf-b-and-sean-smith.html
@@ -69,7 +69,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-{% endcomment %}
+{% endcomment %}x -->
 
 /* Table borders */
@@ -81,8 +81,8 @@ th, td {
   padding: 3px;
 }
 
-TLDR: TPC-H SF=100 in 1min using tpchgen-rs vs 30min+ with dbgen
-3 members of the https://datafusion.apache.org/";>Apache DataFusion community used Rust and open source
+TLDR: TPC-H SF=100 in 1min using tpchgen-rs vs 30min+ with dbgen.
+3 members of the https://datafusion.apache.org/";>Apache DataFusion community used Rust and open source
 development to build https://github.com/clflushopt/tpchgen-rs";>tpchgen-rs, a fully open TPC-H data generator over …

diff --git a/blog/category/blog.html b/blog/category/blog.html
index 17e20f8..bcf7e8c 100644
--- a/blog/category/blog.html
+++ b/blog/category/blog.html
@@ -69,7 +69,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-{% endcomment %}
+{% endcomment %}x -->
 
 /* Table borders */
@@ -81,8 +81,8 @@ th, td {
   padding: 3px;
 }
 
-TLDR: TPC-H SF=100 in 1min using tpchgen-rs vs 30min+ with dbgen
-3 members of the https://datafusion.apache.org/";>Apache DataFusion community used Rust and open source
+TLDR: TPC-H SF=100 in 1min using tpchgen-rs vs 30min+ with dbgen.
+3 members of the https://datafusion.apache.org/";>Apache DataFusion community used Rust and open source
 development to build https://github.com/clflushopt/tpchgen-rs";>tpchgen-rs, a fully open TPC-H data generator over …
 
 Continue Reading

diff --git a/blog/feed.xml b/blog/feed.xml
index 01fe8ab..430f736 100644
--- a/blog/feed.xml
+++ b/blog/feed.xml
@@ -13,7 +13,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
(datafusion-site) branch site/tpch_data_generator updated (495bad4 -> 623404b)
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a change to branch site/tpch_data_generator
in repository https://gitbox.apache.org/repos/asf/datafusion-site.git

    from 495bad4  tweak
     add 623404b  remove errant asterisks

No new revisions were added by this update.

Summary of changes:
 content/blog/2025-04-10-fastest-tpch-generator.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

-
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org
(datafusion-site) branch site/tpch_data_generator updated (d782afd -> 495bad4)
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a change to branch site/tpch_data_generator
in repository https://gitbox.apache.org/repos/asf/datafusion-site.git

    from d782afd  Update content/blog/2025-04-10-fastest-tpch-generator.md
     add 495bad4  tweak

No new revisions were added by this update.

Summary of changes:
 content/blog/2025-04-10-fastest-tpch-generator.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

-
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org
(datafusion-site) branch asf-staging updated: Commit build products
This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch asf-staging
in repository https://gitbox.apache.org/repos/asf/datafusion-site.git

The following commit(s) were added to refs/heads/asf-staging by this push:
     new 960619c  Commit build products
960619c is described below

commit 960619c802c5e9ac12a6f778e0c4995c5fdfd529
Author: Build Pelican (action)
AuthorDate: Wed Apr 9 23:05:30 2025 +0000

    Commit build products
---
 blog/2025/04/10/fastest-tpch-generator/index.html       | 2 +-
 blog/feeds/all-en.atom.xml                              | 2 +-
 blog/feeds/andrew-lamb-achraf-b-and-sean-smith.atom.xml | 2 +-
 blog/feeds/blog.atom.xml                                | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/blog/2025/04/10/fastest-tpch-generator/index.html b/blog/2025/04/10/fastest-tpch-generator/index.html
index 3e09c2d..e03c029 100644
--- a/blog/2025/04/10/fastest-tpch-generator/index.html
+++ b/blog/2025/04/10/fastest-tpch-generator/index.html
@@ -104,7 +104,7 @@ $ tpchgen-cli -s 10 --format=parquet
 
 What is TPC-H / dbgen?
 The popular https://www.tpc.org/tpch/";>TPC-H benchmark (often referred to as TPCH) helps evaluate the
-performance of database systems on https://en.wikipedia.org/wiki/Online_analytical_processing";>OLAP queries, the kind used to build BI
+performance of database systems on https://en.wikipedia.org/wiki/Online_analytical_processing";>OLAP queries, the kind used to build BI
 dashboards.
 TPC-H has become a de facto standard for analytic systems. While there are
 https://www.vldb.org/pvldb/vol9/p204-leis.pdf";>well known limitations as the data and queries do not well represent many real world

diff --git a/blog/feeds/all-en.atom.xml b/blog/feeds/all-en.atom.xml
index 674243c..bde3316 100644
--- a/blog/feeds/all-en.atom.xml
+++ b/blog/feeds/all-en.atom.xml
@@ -88,7 +88,7 @@ $ tpchgen-cli -s 10 --format=parquet
 
 What is TPC-H / dbgen?
 The popular TPC-H; benchmark (often referred to as TPCH) helps evaluate the
-performance of database systems on OLAP; queries, the kind used to build BI
+performance of database systems on OLAP; queries, the kind used to build BI
 dashboards.
 TPC-H has become a de facto standard for analytic systems. While there are
 well known limitations as the data and queries do not well represent many real world

diff --git a/blog/feeds/andrew-lamb-achraf-b-and-sean-smith.atom.xml b/blog/feeds/andrew-lamb-achraf-b-and-sean-smith.atom.xml
index 08be951..21b8660 100644
--- a/blog/feeds/andrew-lamb-achraf-b-and-sean-smith.atom.xml
+++ b/blog/feeds/andrew-lamb-achraf-b-and-sean-smith.atom.xml
@@ -88,7 +88,7 @@ $ tpchgen-cli -s 10 --format=parquet
 
 What is TPC-H / dbgen?
 The popular TPC-H; benchmark (often referred to as TPCH) helps evaluate the
-performance of database systems on OLAP; queries, the kind used to build BI
+performance of database systems on OLAP; queries, the kind used to build BI
 dashboards.
 TPC-H has become a de facto standard for analytic systems. While there are
 well known limitations as the data and queries do not well represent many real world

diff --git a/blog/feeds/blog.atom.xml b/blog/feeds/blog.atom.xml
index 7ae5546..c3fdf9f 100644
--- a/blog/feeds/blog.atom.xml
+++ b/blog/feeds/blog.atom.xml
@@ -88,7 +88,7 @@ $ tpchgen-cli -s 10 --format=parquet
 
 What is TPC-H / dbgen?
 The popular TPC-H; benchmark (often referred to as TPCH) helps evaluate the
-performance of database systems on OLAP; queries, the kind used to build BI
+performance of database systems on OLAP; queries, the kind used to build BI
 dashboards.
 TPC-H has become a de facto standard for analytic systems. While there are
 well known limitations as the data and queries do not well represent many real world

-
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org
(datafusion-ballista) branch main updated: chore: Remove some arrow references (#1232)
This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-ballista.git

The following commit(s) were added to refs/heads/main by this push:
     new 13eb8112  chore: Remove some arrow references (#1232)
13eb8112 is described below

commit 13eb811256bb251492d66255170f58d5583265be
Author: Andy Grove
AuthorDate: Wed Apr 9 10:03:54 2025 -0600

    chore: Remove some arrow references (#1232)
---
 .devcontainer/devcontainer.json              |    2 +-
 .gitattributes                               |    6 -
 .github/workflows/comment_bot.yml            |   72 -
 .github/workflows/docker.yml                 |   24 +-
 LICENSE.txt                                  | 2008 --
 ballista-cli/README.md                       |    2 +-
 ballista/client/README.md                    |    2 +-
 ballista/client/tests/context_unsupported.rs |    2 +-
 ballista/core/proto/ballista.proto           |    2 +-
 ballista/core/proto/datafusion.proto         |    2 +-
 benchmarks/README.md                         |    2 +-
 benchmarks/spark/README.md                   |    2 +-
 ci/scripts/rust_build.sh                     |   45 -
 dev/build-ballista-docker.sh                 |   10 +-
 dev/release/download-python-wheels.py        |    2 +-
 dev/release/generate-changelog.py            |    2 +-
 rustfmt.toml                                 |    4 -
 17 files changed, 27 insertions(+), 2162 deletions(-)

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 753774ee..38472603 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -1,7 +1,7 @@
 // For format details, see https://aka.ms/devcontainer.json. For config options, see the
 // README at: https://github.com/devcontainers/templates/tree/main/src/rust
 {
-  "name": "arrow-ballista",
+  "name": "datafusion-ballista",
   // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
   "image": "mcr.microsoft.com/devcontainers/rust:latest",
   "features": {

diff --git a/.gitattributes b/.gitattributes
deleted file mode 100644
index fac7bf85..00000000
--- a/.gitattributes
+++ /dev/null
@@ -1,6 +0,0 @@
-r/R/RcppExports.R linguist-generated=true
-r/R/arrowExports.R linguist-generated=true
-r/src/RcppExports.cpp linguist-generated=true
-r/src/arrowExports.cpp linguist-generated=true
-r/man/*.Rd linguist-generated=true
-

diff --git a/.github/workflows/comment_bot.yml b/.github/workflows/comment_bot.yml
deleted file mode 100644
index 81b15939..00000000
--- a/.github/workflows/comment_bot.yml
+++ /dev/null
@@ -1,72 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-name: Comment Bot
-
-on:
-  # TODO(kszucs): support pull_request_review_comment
-  issue_comment:
-    types:
-      - created
-      - edited
-
-jobs:
-  crossbow:
-    name: Listen!
-    if: startsWith(github.event.comment.body, '@github-actions crossbow')
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout Arrow
-        uses: actions/checkout@v4
-        with:
-          repository: apache/arrow
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.10"
-      - name: Install Archery and Crossbow dependencies
-        run: pip install -e dev/archery[bot]
-      - name: Handle Github comment event
-        env:
-          ARROW_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          CROSSBOW_GITHUB_TOKEN: ${{ secrets.CROSSBOW_GITHUB_TOKEN }}
-        run: |
-          archery trigger-bot \
-            --event-name ${{ github.event_name }} \
-            --event-payload ${{ github.event_path }}
-
-  rebase:
-    name: "Rebase"
-    if: startsWith(github.event.comment.body, '@github-actions rebase')
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - uses: r-lib/actions/pr-fetch@master
-        with:
-          repo-token: ${{ secrets.GITHUB_TOKEN }}
-      - name: Rebase on ${{ github.repository }} main
-        run: |
-          set -ex
-          git config user.name "$(git log -1 --pretty=format:%an)"
-          git config u
(datafusion) branch main updated: chore(deps): bump blake3 from 1.8.0 to 1.8.1 (#15650)
This is an automated email from the ASF dual-hosted git repository.

comphead pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git

The following commit(s) were added to refs/heads/main by this push:
     new d2a1077e9f chore(deps): bump blake3 from 1.8.0 to 1.8.1 (#15650)
d2a1077e9f is described below

commit d2a1077e9f9627fb089989713c9bbbda8a956e46
Author: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
AuthorDate: Wed Apr 9 07:39:41 2025 -0700

    chore(deps): bump blake3 from 1.8.0 to 1.8.1 (#15650)

    Bumps [blake3](https://github.com/BLAKE3-team/BLAKE3) from 1.8.0 to 1.8.1.
    - [Release notes](https://github.com/BLAKE3-team/BLAKE3/releases)
    - [Commits](https://github.com/BLAKE3-team/BLAKE3/compare/1.8.0...1.8.1)

    ---
    updated-dependencies:
    - dependency-name: blake3
      dependency-version: 1.8.1
      dependency-type: direct:production
      update-type: version-update:semver-patch
    ...

    Signed-off-by: dependabot[bot]
    Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 Cargo.lock | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 0bf2432de0..ac430e1457 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1119,9 +1119,9 @@ dependencies = [
 
 [[package]]
 name = "blake3"
-version = "1.8.0"
+version = "1.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "34a796731680be7931955498a16a10b2270c7762963d5d570fdbfe02dcbf314f"
+checksum = "389a099b34312839e16420d499a9cad9650541715937ffbdd40d36f49e77eeb3"
 dependencies = [
  "arrayref",
  "arrayvec",

-
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org
(datafusion) branch dependabot/cargo/main/blake3-1.8.1 deleted (was e06fcc576c)
This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a change to branch dependabot/cargo/main/blake3-1.8.1
in repository https://gitbox.apache.org/repos/asf/datafusion.git

     was e06fcc576c chore(deps): bump blake3 from 1.8.0 to 1.8.1

The revisions that were on this branch are still contained in
other references; therefore, this change does not discard any commits
from the repository.

-
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org
(datafusion) branch dependabot/cargo/main/blake3-1.8.1 created (now e06fcc576c)
This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a change to branch dependabot/cargo/main/blake3-1.8.1
in repository https://gitbox.apache.org/repos/asf/datafusion.git

      at e06fcc576c chore(deps): bump blake3 from 1.8.0 to 1.8.1

No new revisions were added by this update.

-
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org
(datafusion) branch main updated: Ignore false positive warning (#15635)
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git

The following commit(s) were added to refs/heads/main by this push:
     new 68475eb981 Ignore false positive warning (#15635)
68475eb981 is described below

commit 68475eb9817ec791f8a934642ddea965b04e8b0e
Author: Jannik Steinmann
AuthorDate: Wed Apr 9 17:17:28 2025 +0200

    Ignore false positive warning (#15635)

    Signed-off-by: Jannik Steinmann
---
 datafusion/physical-optimizer/src/aggregate_statistics.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/datafusion/physical-optimizer/src/aggregate_statistics.rs b/datafusion/physical-optimizer/src/aggregate_statistics.rs
index 0d3d83c583..28ee10eb65 100644
--- a/datafusion/physical-optimizer/src/aggregate_statistics.rs
+++ b/datafusion/physical-optimizer/src/aggregate_statistics.rs
@@ -42,6 +42,7 @@ impl AggregateStatistics {
 
 impl PhysicalOptimizerRule for AggregateStatistics {
     #[cfg_attr(feature = "recursive_protection", recursive::recursive)]
+    #[allow(clippy::only_used_in_recursion)] // See https://github.com/rust-lang/rust-clippy/issues/14566
     fn optimize(
         &self,
         plan: Arc<dyn ExecutionPlan>,

-
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org
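For context, `only_used_in_recursion` fires when a parameter's only use is being forwarded to the recursive call. A self-contained sketch of the lint pattern (illustrative code, not from DataFusion; in the commit above the lint is a false positive because the trait method's signature cannot be changed):

```rust
// Clippy flags `depth_limit` because it is only ever passed along unchanged
// to the recursive call; the `#[allow]` suppresses that, mirroring the commit.
#[allow(clippy::only_used_in_recursion)]
fn count_nodes(children: &[Vec<usize>], node: usize, depth_limit: usize) -> usize {
    1 + children[node]
        .iter()
        .map(|&c| count_nodes(children, c, depth_limit))
        .sum::<usize>()
}

fn main() {
    // A tiny tree: node 0 has children 1 and 2.
    let children = vec![vec![1, 2], vec![], vec![]];
    assert_eq!(count_nodes(&children, 0, 10), 3);
}
```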
(datafusion) branch main updated: chore: avoid erroneous warning for FFI table operation (only not default value) (#15579)
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git

The following commit(s) were added to refs/heads/main by this push:
     new a337ca75d7 chore: avoid erroneous warning for FFI table operation (only not default value) (#15579)
a337ca75d7 is described below

commit a337ca75d758f2b92905045fc89e971a955851d4
Author: Chen Chongchen
AuthorDate: Wed Apr 9 22:58:13 2025 +0800

    chore: avoid erroneous warning for FFI table operation (only not default value) (#15579)

    * warning only not default value

    * fmt
---
 datafusion/common/src/config.rs | 47 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 43 insertions(+), 4 deletions(-)

diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
index b0f17630c9..449ce22693 100644
--- a/datafusion/common/src/config.rs
+++ b/datafusion/common/src/config.rs
@@ -149,9 +149,17 @@ macro_rules! config_namespace {
                 // $(#[allow(deprecated)])?
                 {
                     $(let value = $transform(value);)? // Apply transformation if specified
-                    $(log::warn!($warn);)? // Log warning if specified
                     #[allow(deprecated)]
-                    self.$field_name.set(rem, value.as_ref())
+                    let ret = self.$field_name.set(rem, value.as_ref());
+
+                    $(if !$warn.is_empty() {
+                        let default: $field_type = $default;
+                        #[allow(deprecated)]
+                        if default != self.$field_name {
+                            log::warn!($warn);
+                        }
+                    })? // Log warning if specified, and the value is not the default
+                    ret
                 }
             },
         )*
@@ -1999,8 +2007,8 @@ mod tests {
     use std::collections::HashMap;
 
     use crate::config::{
-        ConfigEntry, ConfigExtension, ConfigFileType, ExtensionOptions, Extensions,
-        TableOptions,
+        ConfigEntry, ConfigExtension, ConfigField, ConfigFileType, ExtensionOptions,
+        Extensions, TableOptions,
     };
 
     #[derive(Default, Debug, Clone)]
@@ -2085,6 +2093,37 @@ mod tests {
         assert_eq!(table_config.csv.escape.unwrap() as char, '\'');
     }
 
+    #[test]
+    fn warning_only_not_default() {
+        use std::sync::atomic::AtomicUsize;
+        static COUNT: AtomicUsize = AtomicUsize::new(0);
+        use log::{Level, LevelFilter, Metadata, Record};
+        struct SimpleLogger;
+        impl log::Log for SimpleLogger {
+            fn enabled(&self, metadata: &Metadata) -> bool {
+                metadata.level() <= Level::Info
+            }
+
+            fn log(&self, record: &Record) {
+                if self.enabled(record.metadata()) {
+                    COUNT.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+                }
+            }
+            fn flush(&self) {}
+        }
+        log::set_logger(&SimpleLogger).unwrap();
+        log::set_max_level(LevelFilter::Info);
+        let mut sql_parser_options = crate::config::SqlParserOptions::default();
+        sql_parser_options
+            .set("enable_options_value_normalization", "false")
+            .unwrap();
+        assert_eq!(COUNT.load(std::sync::atomic::Ordering::Relaxed), 0);
+        sql_parser_options
+            .set("enable_options_value_normalization", "true")
+            .unwrap();
+        assert_eq!(COUNT.load(std::sync::atomic::Ordering::Relaxed), 1);
+    }
+
     #[cfg(feature = "parquet")]
     #[test]
     fn parquet_table_options() {

-
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org
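Stripped of the macro machinery, the new rule is simply "warn only when the value being set differs from the default". A minimal sketch of that behavior (not the actual `config_namespace!` macro; the function and variable names are illustrative, and `eprintln!` stands in for `log::warn!`):

```rust
// Emit the warning only when a non-default value is actually in use,
// so merely round-tripping the default (as FFI table providers do)
// stays silent.
fn set_deprecated_flag(field: &mut bool, new_value: bool, default: bool, warning: &str) {
    if new_value != default {
        eprintln!("warning: {warning}");
    }
    *field = new_value;
}

fn main() {
    let mut normalize = false; // field whose default is `false`
    set_deprecated_flag(&mut normalize, false, false, "option is deprecated"); // silent
    set_deprecated_flag(&mut normalize, true, false, "option is deprecated"); // warns once
}
```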
(datafusion-comet) branch main updated: chore: Parquet fuzz testing (#1623)
This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git

The following commit(s) were added to refs/heads/main by this push:
     new 4740e94eb chore: Parquet fuzz testing (#1623)
4740e94eb is described below

commit 4740e94eb921822f1cd22fa008dd49d8c7a18b52
Author: Andy Grove
AuthorDate: Wed Apr 9 11:40:12 2025 -0600

    chore: Parquet fuzz testing (#1623)
---
 .../apache/comet/testing/ParquetGenerator.scala    | 18 ++++++++++--------
 .../org/apache/comet/CometFuzzTestSuite.scala      | 80 +++++++++++++++++++++-
 2 files changed, 90 insertions(+), 8 deletions(-)

diff --git a/spark/src/main/scala/org/apache/comet/testing/ParquetGenerator.scala b/spark/src/main/scala/org/apache/comet/testing/ParquetGenerator.scala
index 9b91d5d93..520af27ea 100644
--- a/spark/src/main/scala/org/apache/comet/testing/ParquetGenerator.scala
+++ b/spark/src/main/scala/org/apache/comet/testing/ParquetGenerator.scala
@@ -34,11 +34,14 @@ import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes, DecimalType,
 object ParquetGenerator {
 
   /**
-   * Arbitrary date to use as base for generating temporal columns. Random integers will be added
-   * to or subtracted from this value.
+   * Date to use as base for generating temporal columns. Random integers will be added to or
+   * subtracted from this value.
+   *
+   * Date was chosen to trigger generating a timestamp that's larger than a 64-bit nanosecond
+   * timestamp can represent so that we test support for INT96 timestamps.
    */
-  private val baseDate =
-    new SimpleDateFormat("yyyy-MM-DD hh:mm:ss").parse("2024-05-25 12:34:56").getTime
+  val defaultBaseDate: Long =
+    new SimpleDateFormat("yyyy-MM-DD hh:mm:ss").parse("9999-05-25 12:34:56").getTime
 
   private val primitiveTypes = Seq(
     DataTypes.BooleanType,
@@ -217,13 +220,13 @@ object ParquetGenerator {
         null
       }
     case DataTypes.DateType =>
-      Range(0, numRows).map(_ => new java.sql.Date(baseDate + r.nextInt()))
+      Range(0, numRows).map(_ => new java.sql.Date(options.baseDate + r.nextInt()))
     case DataTypes.TimestampType =>
-      Range(0, numRows).map(_ => new Timestamp(baseDate + r.nextInt()))
+      Range(0, numRows).map(_ => new Timestamp(options.baseDate + r.nextInt()))
     case DataTypes.TimestampNTZType =>
       Range(0, numRows).map(_ =>
         LocalDateTime.ofInstant(
-          Instant.ofEpochMilli(baseDate + r.nextInt()),
+          Instant.ofEpochMilli(options.baseDate + r.nextInt()),
           ZoneId.systemDefault()))
     case _ => throw new IllegalStateException(s"Cannot generate data for $dataType yet")
   }
@@ -234,6 +237,7 @@ object ParquetGenerator {
 case class DataGenOptions(
     allowNull: Boolean = true,
     generateNegativeZero: Boolean = true,
+    baseDate: Long = ParquetGenerator.defaultBaseDate,
     generateArray: Boolean = false,
     generateStruct: Boolean = false,
     generateMap: Boolean = false)

diff --git a/spark/src/test/scala/org/apache/comet/CometFuzzTestSuite.scala b/spark/src/test/scala/org/apache/comet/CometFuzzTestSuite.scala
index 1d12b4bfb..195308526 100644
--- a/spark/src/test/scala/org/apache/comet/CometFuzzTestSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/CometFuzzTestSuite.scala
@@ -20,6 +20,7 @@ package org.apache.comet
 
 import java.io.File
+import java.text.SimpleDateFormat
 
 import scala.util.Random
 
@@ -32,6 +33,8 @@ import org.apache.spark.sql.comet.{CometNativeScanExec, CometScanExec}
 import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.internal.SQLConf.ParquetOutputTimestampType
+import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes, StructType}
 
 import org.apache.comet.testing.{DataGenOptions, ParquetGenerator}
 
@@ -57,7 +60,13 @@ class CometFuzzTestSuite extends CometTestBase with AdaptiveSparkPlanHelper {
       CometConf.COMET_ENABLED.key -> "false",
       SQLConf.SESSION_LOCAL_TIMEZONE.key -> defaultTimezone) {
       val options =
-        DataGenOptions(generateArray = true, generateStruct = true, generateNegativeZero = false)
+        DataGenOptions(
+          generateArray = true,
+          generateStruct = true,
+          generateNegativeZero = false,
+          // override base date due to known issues with experimental scans
+          baseDate =
+            new SimpleDateFormat("yyyy-MM-DD hh:mm:ss").parse("2024-05-25 12:34:56").getTime)
       ParquetGenerator.makeParquetFile(random, spark, filename, 1000, options)
     }
   }
@@ -166,6 +175,75 @@ class CometFuzzTestSuite extends CometTestBase with AdaptiveSparkPlanHelper {
     }
   }
 
+  test("Parquet temporal types written as INT96") {
+
+    // there are known issues with INT96 support in the new experi
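The INT96 angle is easy to verify with quick arithmetic: a 64-bit signed count of nanoseconds since the Unix epoch runs out in the year 2262, so any far-future base date like the one `defaultBaseDate` uses cannot be represented as a 64-bit nanosecond timestamp and needs a wider encoding such as Parquet INT96. A back-of-the-envelope check in Rust (standalone sketch, not project code):

```rust
fn main() {
    // Total seconds representable by an i64 nanosecond counter.
    let max_seconds = i64::MAX / 1_000_000_000;
    // Convert to years using an average Gregorian year length.
    let years_from_epoch = max_seconds as f64 / (365.25 * 24.0 * 3600.0);
    // Prints roughly 2262, the last year an i64-nanosecond timestamp can hold.
    println!("i64 nanoseconds last until ~{:.0}", 1970.0 + years_from_epoch);
}
```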
(datafusion) branch main updated: Rename protobuf Java package (#15658)
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git

The following commit(s) were added to refs/heads/main by this push:
     new 5ab5a03724 Rename protobuf Java package (#15658)
5ab5a03724 is described below

commit 5ab5a03724b3afa009ba480a022145875972d08c
Author: Andy Grove
AuthorDate: Wed Apr 9 15:02:06 2025 -0600

    Rename protobuf Java package (#15658)
---
 datafusion/proto/proto/datafusion.proto | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto
index 2e028eb291..908b95ab56 100644
--- a/datafusion/proto/proto/datafusion.proto
+++ b/datafusion/proto/proto/datafusion.proto
@@ -21,7 +21,7 @@ syntax = "proto3";
 package datafusion;
 
 option java_multiple_files = true;
-option java_package = "org.apache.arrow.datafusion.protobuf";
+option java_package = "org.apache.datafusion.protobuf";
 option java_outer_classname = "DatafusionProto";
 
 import "datafusion/proto-common/proto/datafusion_common.proto";

-
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org
(datafusion) branch dependabot/cargo/main/mimalloc-0.1.46 created (now d2d924ca3a)
This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a change to branch dependabot/cargo/main/mimalloc-0.1.46
in repository https://gitbox.apache.org/repos/asf/datafusion.git

      at d2d924ca3a chore(deps): bump mimalloc from 0.1.44 to 0.1.46

No new revisions were added by this update.

-
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org
(datafusion-testing) branch main updated: Updates for changes in DataFusion PR #15462 (#9)
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-testing.git

The following commit(s) were added to refs/heads/main by this push:
     new e9f9e22  Updates for changes in DataFusion PR #15462 (#9)
e9f9e22 is described below

commit e9f9e22ccf09145a7368f80fd6a871f11e2b4481
Author: Bruce Ritchie
AuthorDate: Wed Apr 9 06:31:33 2025 -0400

    Updates for changes in DataFusion PR #15462 (#9)
---
 data/sqlite/random/aggregates/slt_good_11.slt | 4 ++--
 data/sqlite/random/groupby/slt_good_12.slt    | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/data/sqlite/random/aggregates/slt_good_11.slt b/data/sqlite/random/aggregates/slt_good_11.slt
index b9dd3ff..4f26801 100644
--- a/data/sqlite/random/aggregates/slt_good_11.slt
+++ b/data/sqlite/random/aggregates/slt_good_11.slt
@@ -6356,9 +6356,9 @@ SELECT DISTINCT + ( - COUNT ( * ) ) * + COUNT ( * ) AS col0 FROM tab2
 ----
 -9
 
-# Datafusion - Datafusion expected results:
-query error DataFusion error: Arrow error: Divide by zero error
+query I rowsort label-1028
 SELECT - 69 FROM tab0 WHERE + - col1 NOT IN ( - + col1, CAST ( + col0 AS INTEGER ), + + ( + + col1 ) / + - 0 * + col0 )
+----
 
 query I rowsort
 SELECT ALL - col1 * - col0 + - ( 88 ) AS col1 FROM tab1

diff --git a/data/sqlite/random/groupby/slt_good_12.slt b/data/sqlite/random/groupby/slt_good_12.slt
index 98bde5d..58bbf38 100644
--- a/data/sqlite/random/groupby/slt_good_12.slt
+++ b/data/sqlite/random/groupby/slt_good_12.slt
@@ -12137,9 +12137,9 @@ SELECT - col1 + - col1 AS col1 FROM tab2 WHERE NULL IS NULL GROUP BY col1
 ----
 -122
 -82
 
-# Datafusion - Datafusion expected results:
-query error DataFusion error: Arrow error: Divide by zero error
+query I rowsort
 SELECT DISTINCT col1 AS col1 FROM tab0 WHERE NOT + CASE + 87 WHEN col1 THEN col1 ELSE NULL END NOT BETWEEN - col1 / + col1 AND NULL GROUP BY col1 HAVING NOT col1 * 75 IS NOT NULL
+----
 
 query I rowsort
 SELECT ( - ( - col0 ) ) FROM tab0 GROUP BY col0 HAVING NULL IN ( 93 )
@@ -26969,9 +26969,9 @@ SELECT DISTINCT - + col0 AS col2, col0 / + - col0 FROM tab1 AS cor0 WHERE col2 *
 ----
 -82
 -1
 
-# Datafusion - Datafusion expected results:
-query error DataFusion error: Arrow error: Divide by zero error
+query I rowsort
 SELECT - col0 * 87 * - col1 FROM tab0 WHERE NOT col1 BETWEEN 67 + 65 AND - 41 / - col1 + + + col0 * + col1 * col2 / - col0 GROUP BY col1, col0 HAVING NOT col0 IS NOT NULL
+----
 
 query I rowsort
 SELECT DISTINCT ( - - col1 ) + + 43 FROM tab2 GROUP BY col1, col1, col1

-
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org
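The "Divide by zero" message these .slt expectations used to assert originates in arrow's checked arithmetic kernels, which DataFusion surfaced as "Arrow error: Divide by zero error". A standalone sketch of where the string comes from (assuming a recent arrow crate; this is not part of the test suite itself):

```rust
use arrow::array::Int32Array;
use arrow::compute::kernels::numeric::div;

fn main() {
    let a = Int32Array::from(vec![1]);
    let b = Int32Array::from(vec![0]);
    // Integer division by zero is a checked error in arrow's numeric kernels.
    let err = div(&a, &b).unwrap_err();
    assert!(err.to_string().contains("Divide by zero"));
}
```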
(datafusion) branch main updated: Improve binary_op benchmark (#15632)
This is an automated email from the ASF dual-hosted git repository. alamb pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/datafusion.git The following commit(s) were added to refs/heads/main by this push: new 1385140fb2 Improve binary_op benchmark (#15632) 1385140fb2 is described below commit 1385140fb2cdd8b474e3ec768e21aed4d34eccd7 Author: Andrew Lamb AuthorDate: Wed Apr 9 06:35:12 2025 -0400 Improve binary_op benchmark (#15632) --- datafusion/physical-expr/benches/binary_op.rs | 80 ++- 1 file changed, 4 insertions(+), 76 deletions(-) diff --git a/datafusion/physical-expr/benches/binary_op.rs b/datafusion/physical-expr/benches/binary_op.rs index 7ac5c04852..59a602df05 100644 --- a/datafusion/physical-expr/benches/binary_op.rs +++ b/datafusion/physical-expr/benches/binary_op.rs @@ -17,7 +17,6 @@ use arrow::{ array::BooleanArray, -compute::{bool_and, bool_or}, datatypes::{DataType, Field, Schema}, }; use arrow::{array::StringArray, record_batch::RecordBatch}; @@ -28,7 +27,7 @@ use datafusion_physical_expr::{ planner::logical2physical, PhysicalExpr, }; -use std::sync::{Arc, LazyLock}; +use std::sync::Arc; /// Generates BooleanArrays with different true/false distributions for benchmarking. /// @@ -130,75 +129,6 @@ fn generate_boolean_cases( cases } -/// Benchmarks boolean operations `false_count/bool_or` and `true_count/bool_and` on [`BooleanArray`] -/// You can run this benchmark with: -/// ```sh -/// # test true_count/false_count -/// TEST_BOOL_COUNT=1 cargo bench --bench binary_op -- boolean_ops -/// # test bool_or/bool_and -/// cargo bench --bench binary_op -- boolean_ops -/// ``` -fn benchmark_boolean_ops(c: &mut Criterion) { -let len = 1_000_000; // Use one million elements for clear performance differentiation -static TEST_BOOL_COUNT: LazyLock = -LazyLock::new(|| match std::env::var("TEST_BOOL_COUNT") { -Ok(_) => { -println!("TEST_BOOL_COUNT=ON"); -true -} -Err(_) => { -println!("TEST_BOOL_COUNT=OFF"); -false -} -}); - -// Determine the test function to be executed based on the ENV `TEST_BOOL_COUNT` -fn test_func(array: &BooleanArray) -> bool { -// Use false_count for all false and true_count for all true -if *TEST_BOOL_COUNT { -if TEST_ALL_FALSE { -array.false_count() == array.len() -} else { -array.true_count() == array.len() -} -} -// Use bool_or for all false and bool_and for all true -else if TEST_ALL_FALSE { -match bool_or(array) { -Some(v) => !v, -None => false, -} -} else { -bool_and(array).unwrap_or(false) -} -} - -// Test cases for false_count and bool_or -{ -let test_cases = generate_boolean_cases::(len); -for (scenario, array) in test_cases { -let arr_ref = Arc::new(array); - -// Benchmark test_func across different scenarios -c.bench_function(&format!("boolean_ops/or/{}", scenario), |b| { -b.iter(|| test_func::(black_box(&arr_ref))) -}); -} -} -// Test cases for true_count and bool_and -{ -let test_cases = generate_boolean_cases::(len); -for (scenario, array) in test_cases { -let arr_ref = Arc::new(array); - -// Benchmark test_func across different scenarios -c.bench_function(&format!("boolean_ops/and/{}", scenario), |b| { -b.iter(|| test_func::(black_box(&arr_ref))) -}); -} -} -} - /// Benchmarks AND/OR operator short-circuiting by evaluating complex regex conditions. 
/// /// Creates 6 test scenarios per operator: @@ -257,12 +187,14 @@ fn benchmark_binary_op_in_short_circuit(c: &mut Criterion) { ); // Create physical binary expressions +// a AND ((b ~ regex) AND (c ~ regex)) let expr_and = BinaryExpr::new( Arc::new(Column::new("a", 0)), Operator::And, logical2physical(&right_condition_and, &schema), ); +// a OR ((b ~ regex) OR (c ~ regex)) let expr_or = BinaryExpr::new( Arc::new(Column::new("a", 0)), Operator::Or, @@ -364,10 +296,6 @@ fn create_record_batch( Ok(rbs) } -criterion_group!( -benches, -benchmark_boolean_ops, -benchmark_binary_op_in_short_circuit -); +criterion_group!(benches, benchmark_binary_op_in_short_circuit); criterion_main!(benches); - To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org For additional commands, e-mail: commits-h...@datafusion.apache.org
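Criterion discovers benchmarks only through the `criterion_group!`/`criterion_main!` macros, so deleting `benchmark_boolean_ops` is a matter of removing its function and its entry in the group, exactly as the diff does. A minimal sketch of that wiring, with an illustrative benchmark body rather than the removed code:

```rust
use arrow::array::BooleanArray;
use criterion::{black_box, criterion_group, criterion_main, Criterion};

// Illustrative stand-in for the removed benchmark: time `true_count` over an
// all-true array of one million elements.
fn benchmark_true_count(c: &mut Criterion) {
    let array = BooleanArray::from(vec![true; 1_000_000]);
    c.bench_function("boolean_ops/true_count/all_true", |b| {
        b.iter(|| black_box(&array).true_count() == array.len())
    });
}

// Adding or removing a function from this group is the whole registration
// story, which is why the final hunk collapses the group to a single line.
criterion_group!(benches, benchmark_true_count);
criterion_main!(benches);
```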
(datafusion) branch main updated: Minor: refine comments for statistics computation (#15647)
This is an automated email from the ASF dual-hosted git repository. alamb pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/datafusion.git The following commit(s) were added to refs/heads/main by this push: new b86619e84b Minor: refine comments for statistics compution (#15647) b86619e84b is described below commit b86619e84b24271f28c9cb4bf3751b6f3686eb7b Author: xudong.w AuthorDate: Wed Apr 9 18:33:59 2025 +0800 Minor: refine comments for statistics compution (#15647) --- datafusion/core/src/datasource/listing/table.rs | 8 ++-- datafusion/datasource/src/statistics.rs | 15 --- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index c05b7835ed..5848506da2 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -716,9 +716,13 @@ impl ListingOptions { #[derive(Debug)] pub struct ListingTable { table_paths: Vec, -/// File fields only +/// `file_schema` contains only the columns physically stored in the data files themselves. +/// - Represents the actual fields found in files like Parquet, CSV, etc. +/// - Used when reading the raw data from files file_schema: SchemaRef, -/// File fields + partition columns +/// `table_schema` combines `file_schema` + partition columns +/// - Partition columns are derived from directory paths (not stored in files) +/// - These are columns like "year=2022/month=01" in paths like `/data/year=2022/month=01/file.parquet` table_schema: SchemaRef, options: ListingOptions, definition: Option, diff --git a/datafusion/datasource/src/statistics.rs b/datafusion/datasource/src/statistics.rs index 040bf754dd..e1a91c0533 100644 --- a/datafusion/datasource/src/statistics.rs +++ b/datafusion/datasource/src/statistics.rs @@ -506,7 +506,7 @@ pub fn compute_file_group_statistics( /// /// # Parameters /// * `file_groups` - Vector of file groups to process -/// * `file_schema` - Schema of the files +/// * `table_schema` - Schema of the table /// * `collect_stats` - Whether to collect statistics /// * `inexact_stats` - Whether to mark the resulting statistics as inexact /// @@ -516,7 +516,7 @@ pub fn compute_file_group_statistics( /// * The summary statistics across all file groups, aka all files summary statistics pub fn compute_all_files_statistics( file_groups: Vec, -file_schema: SchemaRef, +table_schema: SchemaRef, collect_stats: bool, inexact_stats: bool, ) -> Result<(Vec, Statistics)> { @@ -526,16 +526,17 @@ pub fn compute_all_files_statistics( for file_group in file_groups { file_groups_with_stats.push(compute_file_group_statistics( file_group, -Arc::clone(&file_schema), +Arc::clone(&table_schema), collect_stats, )?); } // Then summary statistics across all file groups -let mut statistics = -compute_summary_statistics(&file_groups_with_stats, &file_schema, |file_group| { -file_group.statistics() -}); +let mut statistics = compute_summary_statistics( +&file_groups_with_stats, +&table_schema, +|file_group| file_group.statistics(), +); if inexact_stats { statistics = statistics.to_inexact() - To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org For additional commands, e-mail: commits-h...@datafusion.apache.org
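The comment rewrite above pins down the distinction the code relies on: `file_schema` holds only the fields physically present in the files, while `table_schema` appends the partition columns parsed from directory names. A minimal sketch of that relationship (the helper name and columns are hypothetical, not the `ListingTable` API):

```rust
use std::sync::Arc;
use arrow::datatypes::{DataType, Field, Schema, SchemaRef};

// Hypothetical helper: append Hive-style partition columns to the fields
// physically stored in the data files.
fn build_table_schema(
    file_schema: &Schema,
    partition_cols: &[(&str, DataType)],
) -> SchemaRef {
    let mut fields: Vec<Field> = file_schema
        .fields()
        .iter()
        .map(|f| f.as_ref().clone())
        .collect();
    for (name, dt) in partition_cols {
        // Partition values come from directory names, so they are never null.
        fields.push(Field::new(*name, dt.clone(), false));
    }
    Arc::new(Schema::new(fields))
}

fn main() {
    // Only "value" lives inside e.g. /data/year=2022/month=01/file.parquet ...
    let file_schema = Schema::new(vec![Field::new("value", DataType::Float64, true)]);
    // ... while "year" and "month" are parsed from the path.
    let table_schema = build_table_schema(
        &file_schema,
        &[("year", DataType::Utf8), ("month", DataType::Utf8)],
    );
    assert_eq!(table_schema.fields().len(), 3);
}
```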
(datafusion-comet) branch main updated: chore: Refactor QueryPlanSerde to use idiomatic Scala and reduce verbosity (#1609)
This is an automated email from the ASF dual-hosted git repository. agrove pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git The following commit(s) were added to refs/heads/main by this push: new 168715106 chore: Refactor QueryPlanSerde to use idiomatic Scala and reduce verbosity (#1609) 168715106 is described below commit 168715106b4587cc509c4e022f72a70c50b9bf27 Author: Andy Grove AuthorDate: Mon Apr 7 11:09:20 2025 -0600 chore: Refactor QueryPlanSerde to use idiomatic Scala and reduce verbosity (#1609) * start refactor * more refactoring * address feedback --- .../apache/comet/CometSparkSessionExtensions.scala | 315 - 1 file changed, 118 insertions(+), 197 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/CometSparkSessionExtensions.scala b/spark/src/main/scala/org/apache/comet/CometSparkSessionExtensions.scala index daa1f19ea..183d04ee7 100644 --- a/spark/src/main/scala/org/apache/comet/CometSparkSessionExtensions.scala +++ b/spark/src/main/scala/org/apache/comet/CometSparkSessionExtensions.scala @@ -352,7 +352,7 @@ class CometSparkSessionExtensions */ // spotless:on private def transform(plan: SparkPlan): SparkPlan = { - def transform1(op: SparkPlan): Option[Operator] = { + def operator2Proto(op: SparkPlan): Option[Operator] = { if (op.children.forall(_.isInstanceOf[CometNativeExec])) { QueryPlanSerde.operator2Proto( op, @@ -366,6 +366,14 @@ class CometSparkSessionExtensions } } + /** + * Convert operator to proto and then apply a transformation to wrap the proto in a new + * plan. + */ + def newPlanWithProto(op: SparkPlan, fun: Operator => SparkPlan): SparkPlan = { +operator2Proto(op).map(fun).getOrElse(op) + } + plan.transformUp { // Fully native scan for V1 case scan: CometScanExec @@ -384,97 +392,54 @@ class CometSparkSessionExtensions CometScanWrapper(nativeOp.get, cometOp) case op: ProjectExec => - val newOp = transform1(op) - newOp match { -case Some(nativeOp) => - CometProjectExec( -nativeOp, -op, -op.output, -op.projectList, -op.child, -SerializedPlan(None)) -case None => - op - } + newPlanWithProto( +op, +CometProjectExec(_, op, op.output, op.projectList, op.child, SerializedPlan(None))) case op: FilterExec => - val newOp = transform1(op) - newOp match { -case Some(nativeOp) => - CometFilterExec( -nativeOp, -op, -op.output, -op.condition, -op.child, -SerializedPlan(None)) -case None => - op - } + newPlanWithProto( +op, +CometFilterExec(_, op, op.output, op.condition, op.child, SerializedPlan(None))) case op: SortExec => - val newOp = transform1(op) - newOp match { -case Some(nativeOp) => - CometSortExec( -nativeOp, -op, -op.output, -op.outputOrdering, -op.sortOrder, -op.child, -SerializedPlan(None)) -case None => - op - } + newPlanWithProto( +op, +CometSortExec( + _, + op, + op.output, + op.outputOrdering, + op.sortOrder, + op.child, + SerializedPlan(None))) case op: LocalLimitExec => - val newOp = transform1(op) - newOp match { -case Some(nativeOp) => - CometLocalLimitExec(nativeOp, op, op.limit, op.child, SerializedPlan(None)) -case None => - op - } + newPlanWithProto( +op, +CometLocalLimitExec(_, op, op.limit, op.child, SerializedPlan(None))) case op: GlobalLimitExec if op.offset == 0 => - val newOp = transform1(op) - newOp match { -case Some(nativeOp) => - CometGlobalLimitExec(nativeOp, op, op.limit, op.child, SerializedPlan(None)) -case None => - op - } + newPlanWithProto( +op, +CometGlobalLimitExec(_, op, op.limit, op.child, SerializedPlan(None))) case op: CollectLimitExec if isCometNative(op.child) && 
CometConf.COMET_EXEC_COLLECT_LIMIT_ENABLED.get(conf) && isCometShuffleEnabled(conf) && op.offset == 0 => - QueryPlanSerde.operator2Proto(op) match { -
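The heart of this refactor is the `newPlanWithProto` helper: instead of repeating a `match` block per operator, each case becomes a single call that converts the operator to proto and wraps it on success, falling back to the original plan otherwise. A rough rendering of the same idiom in Rust (all types are simplified stand-ins, not Comet's classes):

```rust
#[derive(Debug)]
struct Plan(&'static str);
struct Proto(&'static str);

// Stand-in for QueryPlanSerde.operator2Proto: succeeds only for operators the
// native engine can serialize.
fn operator_to_proto(op: &Plan) -> Option<Proto> {
    (op.0 == "Project").then(|| Proto("ProjectOperator"))
}

// The helper's shape: convert to proto, wrap on success, otherwise keep the
// original plan. One call replaces a match block per operator case.
fn new_plan_with_proto(op: Plan, wrap: impl FnOnce(Proto, &Plan) -> Plan) -> Plan {
    match operator_to_proto(&op) {
        Some(proto) => wrap(proto, &op),
        None => op,
    }
}

fn main() {
    let wrapped = new_plan_with_proto(Plan("Project"), |_proto, _op| Plan("CometProject"));
    let kept = new_plan_with_proto(Plan("Sort"), |_proto, _op| Plan("CometSort"));
    println!("{wrapped:?} / {kept:?}"); // Plan("CometProject") / Plan("Sort")
}
```

Scala can express this as `operator2Proto(op).map(fun).getOrElse(op)`, as the commit does; the Rust sketch uses a `match` because `op` would otherwise have to move into both the closure and the fallback.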
(datafusion) branch main updated: perf: Introduce sort prefix computation for early TopK exit optimization on partially sorted input (10x speedup on top10 bench) (#15563)
This is an automated email from the ASF dual-hosted git repository. berkay pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/datafusion.git The following commit(s) were added to refs/heads/main by this push: new 7a4577e963 perf: Introduce sort prefix computation for early TopK exit optimization on partially sorted input (10x speedup on top10 bench) (#15563) 7a4577e963 is described below commit 7a4577e963fec6a9af6028fd932b003352141392 Author: Geoffrey Claude AuthorDate: Wed Apr 9 12:35:40 2025 +0200 perf: Introduce sort prefix computation for early TopK exit optimization on partially sorted input (10x speedup on top10 bench) (#15563) * perf: Introduce sort prefix computation for early TopK exit optimization on partially sorted input * perf: Use same `common_sort_prefix` nomenclature everywhere * perf: Remove redundant argument to `sort_partially_satisfied` * perf: Clarify that the `common_sort_prefix` is normalized * perf: Update the topk tests for normalized projections * perf: Rename `worst` to `max` to keep naming consistent with heap nomenclature * perf: Add `NULLS FIRST` and `NULLS LAST` TopK sql logic tests * perf: Rename sqllogic topk test columns and reduce batch size * perf: Update TopK header doc with "Partial Sort Optimization" section * fix: Reset `SortExec`'s `EmissionType` to `Final` on partially sorted input - Without a fetch, the entire input data must be sorted before emitting results - With a fetch, we can optimize for an early exit, but the results will still be emitted once all the necessary input data has been processed --- .../tests/physical_optimizer/enforce_sorting.rs| 2 +- .../src/equivalence/properties/mod.rs | 59 +++-- datafusion/physical-plan/src/sorts/sort.rs | 45 +++- datafusion/physical-plan/src/topk/mod.rs | 267 +++-- datafusion/sqllogictest/test_files/topk.slt| 162 + datafusion/sqllogictest/test_files/window.slt | 2 +- 6 files changed, 489 insertions(+), 48 deletions(-) diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs index 4d2c875d3f..d4b84a52f4 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs @@ -1652,7 +1652,7 @@ async fn test_remove_unnecessary_sort7() -> Result<()> { ) as Arc; let expected_input = [ -"SortExec: TopK(fetch=2), expr=[non_nullable_col@1 ASC], preserve_partitioning=[false]", +"SortExec: TopK(fetch=2), expr=[non_nullable_col@1 ASC], preserve_partitioning=[false], sort_prefix=[non_nullable_col@1 ASC]", " SortExec: expr=[non_nullable_col@1 ASC, nullable_col@0 ASC], preserve_partitioning=[false]", "DataSourceExec: partitions=1, partition_sizes=[0]", ]; diff --git a/datafusion/physical-expr/src/equivalence/properties/mod.rs b/datafusion/physical-expr/src/equivalence/properties/mod.rs index 9cf9897001..5b34a02a91 100644 --- a/datafusion/physical-expr/src/equivalence/properties/mod.rs +++ b/datafusion/physical-expr/src/equivalence/properties/mod.rs @@ -546,22 +546,26 @@ impl EquivalenceProperties { self.ordering_satisfy_requirement(&sort_requirements) } -/// Checks whether the given sort requirements are satisfied by any of the -/// existing orderings. 
-pub fn ordering_satisfy_requirement(&self, reqs: &LexRequirement) -> bool { -let mut eq_properties = self.clone(); -// First, standardize the given requirement: -let normalized_reqs = eq_properties.normalize_sort_requirements(reqs); - +/// Returns the number of consecutive requirements (starting from the left) +/// that are satisfied by the plan ordering. +fn compute_common_sort_prefix_length( +&self, +normalized_reqs: &LexRequirement, +) -> usize { // Check whether given ordering is satisfied by constraints first -if self.satisfied_by_constraints(&normalized_reqs) { -return true; +if self.satisfied_by_constraints(normalized_reqs) { +// If the constraints satisfy all requirements, return the full normalized requirements length +return normalized_reqs.len(); } -for normalized_req in normalized_reqs { +let mut eq_properties = self.clone(); + +for (i, normalized_req) in normalized_reqs.iter().enumerate() { // Check whether given ordering is satisfied -if !eq_properties.ordering_satisfy_single(&normalized_req) { -return false; +if !eq_properties.ordering_satisfy_single(normalized_req) { +// As soon as one requirement is not satisfied, return +// how many we've satisfied so far +
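The new `compute_common_sort_prefix_length` captures the optimization's key question: how many leading sort requirements does the input's existing ordering already satisfy? A stripped-down sketch of the idea, using plain column/direction pairs instead of the real `LexRequirement` and equivalence machinery:

```rust
// Simplified stand-in for a sort requirement: column name plus direction.
#[derive(PartialEq, Clone, Copy, Debug)]
struct SortKey {
    column: &'static str,
    descending: bool,
}

// Count the leading requirements already satisfied by the existing ordering.
// The real implementation consults equivalence classes and constraints; plain
// equality is enough to show the shape of the computation.
fn common_sort_prefix_length(required: &[SortKey], existing: &[SortKey]) -> usize {
    required
        .iter()
        .zip(existing.iter())
        .take_while(|(req, have)| req == have)
        .count()
}

fn main() {
    let required = [
        SortKey { column: "a", descending: false },
        SortKey { column: "b", descending: false },
    ];
    let existing = [SortKey { column: "a", descending: false }];
    // The input already arrives sorted on the first of two requested keys, so
    // TopK can flush and exit early whenever a new "a" prefix group begins.
    assert_eq!(common_sort_prefix_length(&required, &existing), 1);
}
```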