Re: [PR] feat: Add Hash Join benchmarks [datafusion]

via GitHub Thu, 18 Sep 2025 22:21:13 -0700


2010YOUY01 commented on code in PR #17636:
URL: https://github.com/apache/datafusion/pull/17636#discussion_r2361777811



##########
benchmarks/src/hj.rs:
##########
@@ -0,0 +1,258 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::util::{BenchmarkRun, CommonOpt, QueryResult};
+use datafusion::physical_plan::execute_stream;
+use datafusion::{error::Result, prelude::SessionContext};
+use datafusion_common::instant::Instant;
+use datafusion_common::{exec_datafusion_err, exec_err, DataFusionError};
+use structopt::StructOpt;
+
+use futures::StreamExt;
+
+// TODO: Add existence joins
+
+/// Run the Hash Join benchmark
+///
+/// This micro-benchmark focuses on the performance characteristics of Hash 
Joins.
+/// It uses simple equality predicates to ensure a hash join is selected.
+/// Where we vary selectivity, we do so with additional cheap predicates that
+/// do not change the join key (so the physical operator remains HashJoin).
+#[derive(Debug, StructOpt, Clone)]
+#[structopt(verbatim_doc_comment)]
+pub struct RunOpt {
+    /// Query number (between 1 and 12). If not specified, runs all queries
+    #[structopt(short, long)]
+    query: Option<usize>,
+
+    /// Common options (iterations, batch size, target_partitions, etc.)
+    #[structopt(flatten)]
+    common: CommonOpt,
+
+    /// If present, write results json here
+    #[structopt(parse(from_os_str), short = "o", long = "output")]
+    output_path: Option<std::path::PathBuf>,
+}
+
+/// Inline SQL queries for Hash Join benchmarks
+///
+/// Each query's comment includes:
+///   - Left row count × Right row count
+///   - Join predicate selectivity (approximate output fraction).
+const HASH_QUERIES: &[&str] = &[
+    // Q1: INNER 10K x 10K | LOW ~0.1%
+    // equality on key + cheap filter to downselect
+    r#"
+        SELECT t1.value, t2.value
+        FROM range(10000) AS t1
+        JOIN range(10000) AS t2
+          ON t1.value = t2.value
+        WHERE t1.value % 1000 = 0

Review Comment:
   I think it's equivalent to do `generate_series(0, 10000, 1000)` for t1? This 
way we can eliminate the cost of `FilterExec` and make those queries more 
focused on the join executor.



##########
benchmarks/src/hj.rs:
##########
@@ -0,0 +1,258 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::util::{BenchmarkRun, CommonOpt, QueryResult};
+use datafusion::physical_plan::execute_stream;
+use datafusion::{error::Result, prelude::SessionContext};
+use datafusion_common::instant::Instant;
+use datafusion_common::{exec_datafusion_err, exec_err, DataFusionError};
+use structopt::StructOpt;
+
+use futures::StreamExt;
+
+// TODO: Add existence joins
+
+/// Run the Hash Join benchmark
+///
+/// This micro-benchmark focuses on the performance characteristics of Hash 
Joins.
+/// It uses simple equality predicates to ensure a hash join is selected.
+/// Where we vary selectivity, we do so with additional cheap predicates that
+/// do not change the join key (so the physical operator remains HashJoin).
+#[derive(Debug, StructOpt, Clone)]
+#[structopt(verbatim_doc_comment)]
+pub struct RunOpt {
+    /// Query number (between 1 and 12). If not specified, runs all queries
+    #[structopt(short, long)]
+    query: Option<usize>,
+
+    /// Common options (iterations, batch size, target_partitions, etc.)
+    #[structopt(flatten)]
+    common: CommonOpt,
+
+    /// If present, write results json here
+    #[structopt(parse(from_os_str), short = "o", long = "output")]
+    output_path: Option<std::path::PathBuf>,
+}
+
+/// Inline SQL queries for Hash Join benchmarks
+///
+/// Each query's comment includes:
+///   - Left row count × Right row count
+///   - Join predicate selectivity (approximate output fraction).
+const HASH_QUERIES: &[&str] = &[
+    // Q1: INNER 10K x 10K | LOW ~0.1%
+    // equality on key + cheap filter to downselect
+    r#"
+        SELECT t1.value, t2.value
+        FROM range(10000) AS t1

Review Comment:
   The scope of this micro-benchmark is indeed limited, but I think it’s better 
to keep the focus on the join executor and eliminate the cost of Parquet 
reading. Perhaps we could add some join-focused queries by extending the TPCH 
or JOB benchmark for realistic scenarios.



##########
benchmarks/src/hj.rs:
##########
@@ -0,0 +1,258 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::util::{BenchmarkRun, CommonOpt, QueryResult};
+use datafusion::physical_plan::execute_stream;
+use datafusion::{error::Result, prelude::SessionContext};
+use datafusion_common::instant::Instant;
+use datafusion_common::{exec_datafusion_err, exec_err, DataFusionError};
+use structopt::StructOpt;
+
+use futures::StreamExt;
+
+// TODO: Add existence joins
+
+/// Run the Hash Join benchmark
+///
+/// This micro-benchmark focuses on the performance characteristics of Hash 
Joins.
+/// It uses simple equality predicates to ensure a hash join is selected.
+/// Where we vary selectivity, we do so with additional cheap predicates that
+/// do not change the join key (so the physical operator remains HashJoin).
+#[derive(Debug, StructOpt, Clone)]
+#[structopt(verbatim_doc_comment)]
+pub struct RunOpt {
+    /// Query number (between 1 and 12). If not specified, runs all queries
+    #[structopt(short, long)]
+    query: Option<usize>,
+
+    /// Common options (iterations, batch size, target_partitions, etc.)
+    #[structopt(flatten)]
+    common: CommonOpt,
+
+    /// If present, write results json here
+    #[structopt(parse(from_os_str), short = "o", long = "output")]
+    output_path: Option<std::path::PathBuf>,
+}
+
+/// Inline SQL queries for Hash Join benchmarks
+///
+/// Each query's comment includes:
+///   - Left row count × Right row count
+///   - Join predicate selectivity (approximate output fraction).
+const HASH_QUERIES: &[&str] = &[
+    // Q1: INNER 10K x 10K | LOW ~0.1%
+    // equality on key + cheap filter to downselect
+    r#"
+        SELECT t1.value, t2.value
+        FROM range(10000) AS t1
+        JOIN range(10000) AS t2
+          ON t1.value = t2.value
+        WHERE t1.value % 1000 = 0
+    "#,
+    // Q2: INNER 10K x 10K | MEDIUM ~20%
+    r#"
+        SELECT t1.value, t2.value
+        FROM range(10000) AS t1
+        JOIN range(10000) AS t2
+          ON t1.value = t2.value
+        WHERE t1.value % 5 = 0
+    "#,
+    // Q3: INNER 10K x 10K | HIGH ~90%
+    r#"
+        SELECT t1.value, t2.value
+        FROM range(10000) AS t1
+        JOIN range(10000) AS t2
+          ON t1.value = t2.value
+        WHERE t1.value % 10 <> 0
+    "#,
+    // Q4: INNER 30K x 30K | MEDIUM ~20%
+    r#"
+        SELECT t1.value, t2.value
+        FROM range(30000) AS t1
+        JOIN range(30000) AS t2
+          ON t1.value = t2.value
+        WHERE t1.value % 5 = 0
+    "#,
+    // Q5: INNER 10K x 200K | LOW ~0.1% (small to large)
+    r#"
+        SELECT t1.value, t2.value
+        FROM range(10000) AS t1
+        JOIN range(200000) AS t2
+          ON t1.value = t2.value
+        WHERE t1.value % 1000 = 0
+    "#,
+    // Q6: INNER 200K x 10K | LOW ~0.1% (large to small)
+    r#"
+        SELECT t1.value, t2.value
+        FROM range(200000) AS t1
+        JOIN range(10000) AS t2
+          ON t1.value = t2.value
+        WHERE t1.value % 1000 = 0
+    "#,
+    // Q7: RIGHT OUTER 10K x 200K | LOW ~0.1%
+    // Outer join still uses HashJoin for equi-keys; the extra filter reduces 
matches
+    r#"
+        SELECT t1.value AS l, t2.value AS r
+        FROM range(10000) AS t1
+        RIGHT JOIN range(200000) AS t2
+          ON t1.value = t2.value
+        WHERE t2.value % 1000 = 0
+    "#,
+    // Q8: LEFT OUTER 200K x 10K | LOW ~0.1%
+    r#"
+        SELECT t1.value AS l, t2.value AS r
+        FROM range(200000) AS t1
+        LEFT JOIN range(10000) AS t2
+          ON t1.value = t2.value
+        WHERE t1.value % 1000 = 0
+    "#,
+    // Q9: FULL OUTER 30K x 30K | LOW ~0.1%
+    r#"
+        SELECT t1.value AS l, t2.value AS r
+        FROM range(30000) AS t1
+        FULL JOIN range(30000) AS t2
+          ON t1.value = t2.value
+        WHERE COALESCE(t1.value, t2.value) % 1000 = 0
+    "#,
+    // Q10: FULL OUTER 30K x 30K | HIGH ~90%
+    r#"
+        SELECT t1.value AS l, t2.value AS r
+        FROM range(30000) AS t1
+        FULL JOIN range(30000) AS t2
+          ON t1.value = t2.value
+        WHERE COALESCE(t1.value, t2.value) % 10 <> 0
+    "#,
+    // Q11: INNER 30K x 30K | MEDIUM ~50% | cheap predicate on parity

Review Comment:
   This query is interesting. The current hash table implementation may not 
perform well when there are many duplicate join keys, due to its linear probing 
design. How did the benchmark results look for this query?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] feat: Add Hash Join benchmarks [datafusion]

Reply via email to