[arrow-datafusion] branch main updated: Add parquet filter and sort to bench.sh (#6172)

alamb Wed, 03 May 2023 10:27:28 -0700

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git



The following commit(s) were added to refs/heads/main by this push:
     new 424e4d8ab4 Add parquet filter and sort to bench.sh (#6172)
424e4d8ab4 is described below

commit 424e4d8ab4537664e477170b7c625b950ce3a893
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed May 3 13:27:14 2023 -0400

    Add parquet filter and sort to bench.sh (#6172)
---
 benchmarks/bench.sh | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
index 24286014a0..dee6896aec 100755
--- a/benchmarks/bench.sh
+++ b/benchmarks/bench.sh
@@ -32,7 +32,7 @@ COMMAND=
 BENCHMARK=all
 DATAFUSION_DIR=${DATAFUSION_DIR:-$SCRIPT_DIR/..}
 DATA_DIR=${DATA_DIR:-$SCRIPT_DIR/data}
-#CARGO_COMMAND=$CARGO_COMMAND:"cargo run --release"}
+#CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --release"}
 CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --profile release-nonlto"}  # TEMP: for faster iterations
 
 usage() {
@@ -66,6 +66,8 @@ compare:      Comares results from benchmark runs
 all(default): Data/Run/Compare for all benchmarks
 tpch:         TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table
 tpch_mem:     TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), query from memory
+parquet:      Benchmark of parquet reader's filtering speed
+sort:         Benchmark of sorting speed
 
 **********
 * Supported Configuration (Environment Variables)
@@ -162,6 +164,8 @@ main() {
                 all)
                     run_tpch
                     run_tpch_mem
+                    run_parquet
+                    run_sort
                     ;;
                 tpch)
                     run_tpch
@@ -169,6 +173,12 @@ main() {
                 tpch_mem)
                     run_tpch_mem
                     ;;
+                parquet)
+                    run_parquet
+                    ;;
+                sort)
+                    run_sort
+                    ;;
                 *)
                     echo "Error: unknown benchmark '$BENCHMARK' for run"
                     usage
@@ -247,6 +257,22 @@ run_tpch_mem() {
     $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${DATA_DIR}" -m --format parquet -o ${RESULTS_FILE}
 }
 
+# Runs the parquet filter benchmark
+run_parquet() {
+    RESULTS_FILE="${RESULTS_DIR}/parquet.json"
+    echo "RESULTS_FILE: ${RESULTS_FILE}"
+    echo "Running parquet filter benchmark..."
+    $CARGO_COMMAND --bin parquet -- filter --path "${DATA_DIR}" --scale-factor 1.0 --iterations 5 -o ${RESULTS_FILE}
+}
+
+# Runs the sort benchmark
+run_sort() {
+    RESULTS_FILE="${RESULTS_DIR}/sort.json"
+    echo "RESULTS_FILE: ${RESULTS_FILE}"
+    echo "Running sort benchmark..."
+    $CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --scale-factor 1.0 --iterations 5 -o ${RESULTS_FILE}
+}
+
 compare_benchmarks() {
     BASE_RESULTS_DIR="${SCRIPT_DIR}/results"
     BRANCH1="${ARG2}"

[arrow-datafusion] branch main updated: Add parquet filter and sort to bench.sh (#6172)

Reply via email to