This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 424e4d8ab4 Add parquet filter and sort to bench.sh (#6172)
424e4d8ab4 is described below
commit 424e4d8ab4537664e477170b7c625b950ce3a893
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed May 3 13:27:14 2023 -0400
Add parquet filter and sort to bench.sh (#6172)
---
benchmarks/bench.sh | 28 +++++++++++++++++++++++++++-
1 file changed, 27 insertions(+), 1 deletion(-)
diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
index 24286014a0..dee6896aec 100755
--- a/benchmarks/bench.sh
+++ b/benchmarks/bench.sh
@@ -32,7 +32,7 @@ COMMAND=
BENCHMARK=all
DATAFUSION_DIR=${DATAFUSION_DIR:-$SCRIPT_DIR/..}
DATA_DIR=${DATA_DIR:-$SCRIPT_DIR/data}
-#CARGO_COMMAND=$CARGO_COMMAND:"cargo run --release"}
+#CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --release"}
CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --profile release-nonlto"} # TEMP: for faster iterations
usage() {
@@ -66,6 +66,8 @@ compare: Comares results from benchmark runs
all(default): Data/Run/Compare for all benchmarks
tpch: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table
tpch_mem: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), query from memory
+parquet: Benchmark of parquet reader's filtering speed
+sort: Benchmark of sorting speed
**********
* Supported Configuration (Environment Variables)
@@ -162,6 +164,8 @@ main() {
all)
run_tpch
run_tpch_mem
+ run_parquet
+ run_sort
;;
tpch)
run_tpch
@@ -169,6 +173,12 @@ main() {
tpch_mem)
run_tpch_mem
;;
+ parquet)
+ run_parquet
+ ;;
+ sort)
+ run_sort
+ ;;
*)
echo "Error: unknown benchmark '$BENCHMARK' for run"
usage
@@ -247,6 +257,22 @@ run_tpch_mem() {
$CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${DATA_DIR}" -m --format parquet -o ${RESULTS_FILE}
}
+# Runs the parquet filter benchmark
+run_parquet() {
+ RESULTS_FILE="${RESULTS_DIR}/parquet.json"
+ echo "RESULTS_FILE: ${RESULTS_FILE}"
+ echo "Running parquet filter benchmark..."
+ $CARGO_COMMAND --bin parquet -- filter --path "${DATA_DIR}" --scale-factor 1.0 --iterations 5 -o ${RESULTS_FILE}
+}
+
+# Runs the sort benchmark
+run_sort() {
+ RESULTS_FILE="${RESULTS_DIR}/sort.json"
+ echo "RESULTS_FILE: ${RESULTS_FILE}"
+ echo "Running sort benchmark..."
+ $CARGO_COMMAND --bin parquet -- sort --path "${DATA_DIR}" --scale-factor 1.0 --iterations 5 -o ${RESULTS_FILE}
+}
+
compare_benchmarks() {
BASE_RESULTS_DIR="${SCRIPT_DIR}/results"
BRANCH1="${ARG2}"