jonathanc-n commented on code in PR #17636:
URL: https://github.com/apache/datafusion/pull/17636#discussion_r2365939109


##########
benchmarks/src/hj.rs:
##########
@@ -0,0 +1,258 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::util::{BenchmarkRun, CommonOpt, QueryResult};
+use datafusion::physical_plan::execute_stream;
+use datafusion::{error::Result, prelude::SessionContext};
+use datafusion_common::instant::Instant;
+use datafusion_common::{exec_datafusion_err, exec_err, DataFusionError};
+use structopt::StructOpt;
+
+use futures::StreamExt;
+
+// TODO: Add existence joins
+
+/// Run the Hash Join benchmark
+///
+/// This micro-benchmark focuses on the performance characteristics of Hash 
Joins.
+/// It uses simple equality predicates to ensure a hash join is selected.
+/// Where we vary selectivity, we do so with additional cheap predicates that
+/// do not change the join key (so the physical operator remains HashJoin).
+#[derive(Debug, StructOpt, Clone)]
+#[structopt(verbatim_doc_comment)]
+pub struct RunOpt {
+    /// Query number (between 1 and 12). If not specified, runs all queries
+    #[structopt(short, long)]
+    query: Option<usize>,
+
+    /// Common options (iterations, batch size, target_partitions, etc.)
+    #[structopt(flatten)]
+    common: CommonOpt,
+
+    /// If present, write results json here
+    #[structopt(parse(from_os_str), short = "o", long = "output")]
+    output_path: Option<std::path::PathBuf>,
+}
+
+/// Inline SQL queries for Hash Join benchmarks
+///
+/// Each query's comment includes:
+///   - Left row count × Right row count
+///   - Join predicate selectivity (approximate output fraction).
+const HASH_QUERIES: &[&str] = &[
+    // Q1: INNER 10K x 10K | LOW ~0.1%
+    // equality on key + cheap filter to downselect
+    r#"
+        SELECT t1.value, t2.value
+        FROM range(10000) AS t1
+        JOIN range(10000) AS t2
+          ON t1.value = t2.value
+        WHERE t1.value % 1000 = 0
+    "#,
+    // Q2: INNER 10K x 10K | MEDIUM ~20%
+    r#"
+        SELECT t1.value, t2.value
+        FROM range(10000) AS t1
+        JOIN range(10000) AS t2
+          ON t1.value = t2.value
+        WHERE t1.value % 5 = 0
+    "#,
+    // Q3: INNER 10K x 10K | HIGH ~90%
+    r#"
+        SELECT t1.value, t2.value
+        FROM range(10000) AS t1
+        JOIN range(10000) AS t2
+          ON t1.value = t2.value
+        WHERE t1.value % 10 <> 0
+    "#,
+    // Q4: INNER 30K x 30K | MEDIUM ~20%
+    r#"
+        SELECT t1.value, t2.value
+        FROM range(30000) AS t1
+        JOIN range(30000) AS t2
+          ON t1.value = t2.value
+        WHERE t1.value % 5 = 0
+    "#,
+    // Q5: INNER 10K x 200K | LOW ~0.1% (small to large)
+    r#"
+        SELECT t1.value, t2.value
+        FROM range(10000) AS t1
+        JOIN range(200000) AS t2
+          ON t1.value = t2.value
+        WHERE t1.value % 1000 = 0
+    "#,
+    // Q6: INNER 200K x 10K | LOW ~0.1% (large to small)
+    r#"
+        SELECT t1.value, t2.value
+        FROM range(200000) AS t1
+        JOIN range(10000) AS t2
+          ON t1.value = t2.value
+        WHERE t1.value % 1000 = 0
+    "#,
+    // Q7: RIGHT OUTER 10K x 200K | LOW ~0.1%
+    // Outer join still uses HashJoin for equi-keys; the extra filter reduces 
matches
+    r#"
+        SELECT t1.value AS l, t2.value AS r
+        FROM range(10000) AS t1
+        RIGHT JOIN range(200000) AS t2
+          ON t1.value = t2.value
+        WHERE t2.value % 1000 = 0
+    "#,
+    // Q8: LEFT OUTER 200K x 10K | LOW ~0.1%
+    r#"
+        SELECT t1.value AS l, t2.value AS r
+        FROM range(200000) AS t1
+        LEFT JOIN range(10000) AS t2
+          ON t1.value = t2.value
+        WHERE t1.value % 1000 = 0
+    "#,
+    // Q9: FULL OUTER 30K x 30K | LOW ~0.1%
+    r#"
+        SELECT t1.value AS l, t2.value AS r
+        FROM range(30000) AS t1
+        FULL JOIN range(30000) AS t2
+          ON t1.value = t2.value
+        WHERE COALESCE(t1.value, t2.value) % 1000 = 0
+    "#,
+    // Q10: FULL OUTER 30K x 30K | HIGH ~90%
+    r#"
+        SELECT t1.value AS l, t2.value AS r
+        FROM range(30000) AS t1
+        FULL JOIN range(30000) AS t2
+          ON t1.value = t2.value
+        WHERE COALESCE(t1.value, t2.value) % 10 <> 0
+    "#,
+    // Q11: INNER 30K x 30K | MEDIUM ~50% | cheap predicate on parity

Review Comment:
   ```
   
   Query 1 iteration 0 returned 10 rows in 1.743042ms
   Query 1 iteration 1 returned 10 rows in 543.083µs
   Query 1 iteration 2 returned 10 rows in 358.542µs
   Query 1 iteration 3 returned 10 rows in 307.291µs
   Query 1 iteration 4 returned 10 rows in 596.667µs
   Query 2 iteration 0 returned 2000 rows in 659.75µs
   Query 2 iteration 1 returned 2000 rows in 820.791µs
   Query 2 iteration 2 returned 2000 rows in 681.041µs
   Query 2 iteration 3 returned 2000 rows in 656.709µs
   Query 2 iteration 4 returned 2000 rows in 522.916µs
   Query 3 iteration 0 returned 9000 rows in 538.542µs
   Query 3 iteration 1 returned 9000 rows in 594.292µs
   Query 3 iteration 2 returned 9000 rows in 408.583µs
   Query 3 iteration 3 returned 9000 rows in 494.708µs
   Query 3 iteration 4 returned 9000 rows in 508.792µs
   Query 4 iteration 0 returned 6000 rows in 439.125µs
   Query 4 iteration 1 returned 6000 rows in 483.375µs
   Query 4 iteration 2 returned 6000 rows in 429.625µs
   Query 4 iteration 3 returned 6000 rows in 526.417µs
   Query 4 iteration 4 returned 6000 rows in 393.459µs
   Query 5 iteration 0 returned 10 rows in 730.625µs
   Query 5 iteration 1 returned 10 rows in 706.291µs
   Query 5 iteration 2 returned 10 rows in 626.958µs
   Query 5 iteration 3 returned 10 rows in 670.959µs
   Query 5 iteration 4 returned 10 rows in 689.25µs
   Query 6 iteration 0 returned 10 rows in 606.208µs
   Query 6 iteration 1 returned 10 rows in 650.625µs
   Query 6 iteration 2 returned 10 rows in 616.709µs
   Query 6 iteration 3 returned 10 rows in 597.917µs
   Query 6 iteration 4 returned 10 rows in 604.75µs
   Query 7 iteration 0 returned 200 rows in 653.042µs
   Query 7 iteration 1 returned 200 rows in 607.458µs
   Query 7 iteration 2 returned 200 rows in 617.5µs
   Query 7 iteration 3 returned 200 rows in 607µs
   Query 7 iteration 4 returned 200 rows in 623.208µs
   Query 8 iteration 0 returned 200 rows in 629.792µs
   Query 8 iteration 1 returned 200 rows in 610.208µs
   Query 8 iteration 2 returned 200 rows in 542.083µs
   Query 8 iteration 3 returned 200 rows in 541.959µs
   Query 8 iteration 4 returned 200 rows in 538.334µs
   Query 9 iteration 0 returned 30 rows in 702.541µs
   Query 9 iteration 1 returned 30 rows in 701.459µs
   Query 9 iteration 2 returned 30 rows in 730.25µs
   Query 9 iteration 3 returned 30 rows in 696.333µs
   Query 9 iteration 4 returned 30 rows in 585.458µs
   Query 10 iteration 0 returned 27000 rows in 660.209µs
   Query 10 iteration 1 returned 27000 rows in 593.167µs
   Query 10 iteration 2 returned 27000 rows in 607.208µs
   Query 10 iteration 3 returned 27000 rows in 584.541µs
   Query 10 iteration 4 returned 27000 rows in 607.25µs
   Query 11 iteration 0 returned 450000000 rows in 716.183375ms
   Query 11 iteration 1 returned 450000000 rows in 721.864041ms
   Query 11 iteration 2 returned 450000000 rows in 719.099666ms
   Query 11 iteration 3 returned 450000000 rows in 722.7035ms
   Query 11 iteration 4 returned 450000000 rows in 724.213ms
   Query 12 iteration 0 returned 450000000 rows in 1.009563667s
   Query 12 iteration 1 returned 450000000 rows in 1.007581917s
   Query 12 iteration 2 returned 450000000 rows in 1.011669666s
   Query 12 iteration 3 returned 450000000 rows in 992.134792ms
   Query 12 iteration 4 returned 450000000 rows in 1.000713083s
   Query 13 iteration 0 returned 9999 rows in 633.083µs
   Query 13 iteration 1 returned 9999 rows in 540.959µs
   Query 13 iteration 2 returned 9999 rows in 478.792µs
   Query 13 iteration 3 returned 9999 rows in 538.375µs
   Query 13 iteration 4 returned 9999 rows in 575.917µs
   Query 14 iteration 0 returned 54000 rows in 733.667µs
   Query 14 iteration 1 returned 54000 rows in 664.5µs
   Query 14 iteration 2 returned 54000 rows in 695.042µs
   Query 14 iteration 3 returned 54000 rows in 572.167µs
   Query 14 iteration 4 returned 54000 rows in 612.542µs
   
   ```
   
   These were the results, 11 and 12 were much slower than the others



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to