This is an automated email from the ASF dual-hosted git repository.
comphead pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
new 83dd4e5eb chore: Support running specific benchmark query (#2491)
83dd4e5eb is described below
commit 83dd4e5ebddd83626af2cef4986725a313e0ec87
Author: Oleks V <[email protected]>
AuthorDate: Mon Sep 29 11:28:19 2025 -0700
chore: Support running specific benchmark query (#2491)
---
dev/benchmarks/tpcbench.py | 19 +++++++++++++------
1 file changed, 13 insertions(+), 6 deletions(-)
diff --git a/dev/benchmarks/tpcbench.py b/dev/benchmarks/tpcbench.py
index 2a6b5708b..1b0d2e362 100644
--- a/dev/benchmarks/tpcbench.py
+++ b/dev/benchmarks/tpcbench.py
@@ -21,7 +21,7 @@ import json
from pyspark.sql import SparkSession
import time
-def main(benchmark: str, data_path: str, query_path: str, iterations: int, output: str, name: str):
+def main(benchmark: str, data_path: str, query_path: str, iterations: int, output: str, name: str, query_num: int = None):
# Initialize a SparkSession
spark = SparkSession.builder \
@@ -59,9 +59,17 @@ def main(benchmark: str, data_path: str, query_path: str, iterations: int, outpu
for iteration in range(0, iterations):
print(f"Starting iteration {iteration} of {iterations}")
- iter_start_time = time.time()
- for query in range(1, num_queries+1):
+ # Determine which queries to run
+ if query_num is not None:
+ # Validate query number
+ if query_num < 1 or query_num > num_queries:
+            raise ValueError(f"Query number {query_num} is out of range. Valid range is 1-{num_queries} for {benchmark}")
+ queries_to_run = [query_num]
+ else:
+ queries_to_run = range(1, num_queries+1)
+
+ for query in queries_to_run:
spark.sparkContext.setJobDescription(f"{benchmark} q{query}")
# read text file
@@ -105,8 +113,6 @@ def main(benchmark: str, data_path: str, query_path: str, iterations: int, outpu
# Stop the SparkSession
spark.stop()
- #print(str)
-
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="DataFusion benchmark derived from TPC-H / TPC-DS")
parser.add_argument("--benchmark", required=True, help="Benchmark to run (tpch or tpcds)")
@@ -115,6 +121,7 @@ if __name__ == "__main__":
parser.add_argument("--iterations", required=False, default="1", help="How many iterations to run")
parser.add_argument("--output", required=True, help="Path to write output")
parser.add_argument("--name", required=True, help="Prefix for result file e.g. spark/comet/gluten")
+    parser.add_argument("--query", required=False, type=int, help="Specific query number to run (1-based). If not specified, all queries will be run.")
args = parser.parse_args()
-    main(args.benchmark, args.data, args.queries, int(args.iterations), args.output, args.name)
+    main(args.benchmark, args.data, args.queries, int(args.iterations), args.output, args.name, args.query)
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]