This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-spatialbench.git
The following commit(s) were added to refs/heads/main by this push:
new f788c0f Update DuckDB to 1.5.0+, Run each benchmark query in its own
job for OOM resilience (#93)
f788c0f is described below
commit f788c0fc86ff0ff9f828748208ec335f8de70b22
Author: Jia Yu <[email protected]>
AuthorDate: Tue Mar 24 00:58:52 2026 -0700
Update DuckDB to 1.5.0+, Run each benchmark query in its own job for OOM
resilience (#93)
* Run each benchmark query in its own job for OOM resilience
- Remove DuckDB <1.5.0 version cap so nightlies resolve to 1.5.x+
- Default DuckDB and SedonaDB to stable releases (nightly opt-in)
- Default scale factors to SF1,SF10 with 600s per-query timeout
- Run each query as a separate GitHub Actions job so OOM on one
query does not block remaining queries
- Cache pip dependencies across per-query jobs for faster setup
- Summarize script merges per-query result files, marks missing
engines/queries as not_started (OOM), and shows partial results
* Show completed query count in Performance Summary table
---
.github/workflows/benchmark.yml | 270 ++++++++++++++++++++++------------------
benchmark/run_benchmark.py | 138 ++++++++++++--------
benchmark/summarize_results.py | 107 ++++++++++++++--
3 files changed, 335 insertions(+), 180 deletions(-)
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 9294d55..83b481e 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -21,24 +21,15 @@ on:
workflow_dispatch:
inputs:
scale_factor:
- description: 'Scale factor for benchmark'
+ description: 'Scale factors to benchmark (comma-separated, e.g.
"1,10")'
required: false
- default: '1'
- type: choice
- options:
- - '0.1'
- - '1'
- - '10'
+ default: '1,10'
+ type: string
engines:
description: 'Engines to benchmark (comma-separated)'
required: false
default: 'duckdb,geopandas,sedonadb,spatial_polars'
type: string
- timeout:
- description: 'Query timeout in seconds (default: 60, increase for full
benchmark)'
- required: false
- default: '60'
- type: string
sedonadb_version:
description: 'SedonaDB version (e.g., 1.0.0, leave empty for latest)'
required: false
@@ -71,12 +62,12 @@ on:
sedonadb_nightly:
description: 'Use SedonaDB nightly build from Gemfury (ignores version
if true)'
required: false
- default: true
+ default: false
type: boolean
duckdb_nightly:
description: 'Use DuckDB pre-release/nightly build (ignores version if
true)'
required: false
- default: true
+ default: false
type: boolean
concurrency:
@@ -85,27 +76,47 @@ concurrency:
env:
CARGO_TERM_COLOR: always
- SCALE_FACTOR: ${{ github.event.inputs.scale_factor || '1' }}
BENCHMARK_ENGINES: ${{ github.event.inputs.engines ||
'duckdb,geopandas,sedonadb,spatial_polars' }}
- QUERY_TIMEOUT: ${{ github.event.inputs.timeout || '60' }}
BENCHMARK_RUNS: ${{ github.event.inputs.runs || '3' }}
# Package versions (empty = latest, can be overridden via workflow_dispatch)
SEDONADB_VERSION: ${{ github.event.inputs.sedonadb_version }}
DUCKDB_VERSION: ${{ github.event.inputs.duckdb_version }}
GEOPANDAS_VERSION: ${{ github.event.inputs.geopandas_version }}
SPATIAL_POLARS_VERSION: ${{ github.event.inputs.spatial_polars_version }}
- # Nightly build options (default: true)
- SEDONADB_NIGHTLY: ${{ github.event.inputs.sedonadb_nightly || 'true' }}
- DUCKDB_NIGHTLY: ${{ github.event.inputs.duckdb_nightly || 'true' }}
+ # Nightly build options (default: false, use stable releases)
+ SEDONADB_NIGHTLY: ${{ github.event.inputs.sedonadb_nightly || 'false' }}
+ DUCKDB_NIGHTLY: ${{ github.event.inputs.duckdb_nightly || 'false' }}
+ QUERY_TIMEOUT: ${{ github.event.inputs.timeout || '600' }}
# Hugging Face dataset for benchmark data
HF_DATASET: apache-sedona/spatialbench
HF_DATA_VERSION: v0.1.0
jobs:
+ # Parse scale factors into a JSON array for matrix strategy
+ parse-scale-factors:
+ name: Parse Scale Factors
+ runs-on: ubuntu-latest
+ outputs:
+ matrix: ${{ steps.parse.outputs.matrix }}
+ steps:
+ - name: Parse scale factor input
+ id: parse
+ run: |
+ # Default: "1,10" for automatic runs, or user-provided for
workflow_dispatch
+ INPUT="${{ github.event.inputs.scale_factor || '1,10' }}"
+ # Convert comma-separated string to JSON array: "1,10" -> ["1","10"]
+ MATRIX=$(echo "$INPUT" | tr ',' '\n' | sed
's/^[[:space:]]*//;s/[[:space:]]*$//' | jq -R . | jq -s -c .)
+ echo "matrix=$MATRIX"
+ echo "matrix=$MATRIX" >> $GITHUB_OUTPUT
+
# Download benchmark data from Hugging Face
download-data:
- name: Download Data (SF${{ github.event.inputs.scale_factor || '1' }})
+ name: Download Data (SF${{ matrix.scale_factor }})
+ needs: parse-scale-factors
runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ scale_factor: ${{ fromJson(needs.parse-scale-factors.outputs.matrix) }}
steps:
- uses: actions/checkout@v6
@@ -113,8 +124,8 @@ jobs:
id: cache-data
uses: actions/cache@v5
with:
- path: benchmark-data-sf${{ env.SCALE_FACTOR }}
- key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{
env.SCALE_FACTOR }}
+ path: benchmark-data-sf${{ matrix.scale_factor }}
+ key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{
matrix.scale_factor }}
- name: Setup Python
if: steps.cache-data.outputs.cache-hit != 'true'
@@ -128,9 +139,11 @@ jobs:
- name: Download benchmark data from Hugging Face
if: steps.cache-data.outputs.cache-hit != 'true'
+ env:
+ SCALE_FACTOR: ${{ matrix.scale_factor }}
run: |
# Map scale factor to HF folder name
- SF="${{ env.SCALE_FACTOR }}"
+ SF="${{ matrix.scale_factor }}"
if [ "$SF" = "0.1" ]; then
HF_SF="sf0.1"
else
@@ -155,66 +168,74 @@ jobs:
"
# Move data to expected location
- mkdir -p benchmark-data-sf${{ env.SCALE_FACTOR }}
+ mkdir -p benchmark-data-sf${{ matrix.scale_factor }}
- SF="${{ env.SCALE_FACTOR }}"
+ SF="${{ matrix.scale_factor }}"
if [ "$SF" = "0.1" ]; then
HF_SF="sf0.1"
else
HF_SF="sf${SF}"
fi
- cp -r hf-data/${{ env.HF_DATA_VERSION }}/${HF_SF}/*
benchmark-data-sf${{ env.SCALE_FACTOR }}/
+ cp -r hf-data/${{ env.HF_DATA_VERSION }}/${HF_SF}/*
benchmark-data-sf${{ matrix.scale_factor }}/
echo "Downloaded data structure:"
- find benchmark-data-sf${{ env.SCALE_FACTOR }} -type f -name
"*.parquet" | head -20
+ find benchmark-data-sf${{ matrix.scale_factor }} -type f -name
"*.parquet" | head -20
echo ""
echo "Directory contents:"
- ls -la benchmark-data-sf${{ env.SCALE_FACTOR }}/
+ ls -la benchmark-data-sf${{ matrix.scale_factor }}/
echo ""
echo "Total size:"
- du -sh benchmark-data-sf${{ env.SCALE_FACTOR }}/
+ du -sh benchmark-data-sf${{ matrix.scale_factor }}/
- name: Show cached data info
if: steps.cache-data.outputs.cache-hit == 'true'
run: |
echo "Using cached benchmark data"
echo "Directory contents:"
- ls -la benchmark-data-sf${{ env.SCALE_FACTOR }}/
+ ls -la benchmark-data-sf${{ matrix.scale_factor }}/
echo ""
echo "Total size:"
- du -sh benchmark-data-sf${{ env.SCALE_FACTOR }}/
+ du -sh benchmark-data-sf${{ matrix.scale_factor }}/
+
+ # ── Per-query benchmark jobs ──
+ # Each query runs in its own job (separate runner) so that if one query
+ # OOMs and kills the runner, the remaining queries still execute.
+ # max-parallel: 2 (one job per scale factor) keeps queries running
+ # sequentially per scale factor to avoid overloading the CI and to keep
+ # results orderly.
benchmark-duckdb:
- name: Benchmark DuckDB (SF${{ github.event.inputs.scale_factor || '1' }})
- needs: download-data
+ name: DuckDB ${{ matrix.query }} (SF${{ matrix.scale_factor }})
+ needs: [parse-scale-factors, download-data]
runs-on: ubuntu-latest
if: contains(github.event.inputs.engines ||
'duckdb,geopandas,sedonadb,spatial_polars', 'duckdb')
+ strategy:
+ matrix:
+ scale_factor: ${{ fromJson(needs.parse-scale-factors.outputs.matrix) }}
+ query: [q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12]
+ fail-fast: false
+ max-parallel: 2 # 1 per scale factor
steps:
- uses: actions/checkout@v6
- name: Restore benchmark data from cache
uses: actions/cache/restore@v5
with:
- path: benchmark-data-sf${{ env.SCALE_FACTOR }}
- key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{
env.SCALE_FACTOR }}
+ path: benchmark-data-sf${{ matrix.scale_factor }}
+ key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{
matrix.scale_factor }}
fail-on-cache-miss: true
- name: Setup Python
uses: actions/setup-python@v6
with:
python-version: '3.11'
+ cache: 'pip'
+ cache-dependency-path: .github/workflows/benchmark.yml
- name: Install dependencies
run: |
- echo "=== DuckDB Installation Parameters ==="
- echo "DUCKDB_NIGHTLY: ${{ env.DUCKDB_NIGHTLY }}"
- echo "DUCKDB_VERSION: ${{ env.DUCKDB_VERSION }}"
- echo "======================================"
if [ "${{ env.DUCKDB_NIGHTLY }}" = "true" ]; then
- # Use --pre to install pre-release dev builds (e.g., 1.4.4.dev48)
- # Constraint <1.5.0 ensures we get 1.4.x branch dev builds
- pip install "duckdb<1.5.0" --pre pyarrow pandas
+ pip install duckdb --pre pyarrow pandas
elif [ -n "${{ env.DUCKDB_VERSION }}" ]; then
pip install "duckdb==${{ env.DUCKDB_VERSION }}" pyarrow pandas
else
@@ -222,47 +243,57 @@ jobs:
fi
echo "Installed DuckDB version: $(python -c 'import duckdb;
print(duckdb.__version__)')"
- - name: Pre-install DuckDB spatial extension
+ - name: Install DuckDB spatial extension
run: |
- # Dev builds don't have spatial extension in core_nightly, so always
use default repo
- python -c "import duckdb; con = duckdb.connect();
con.execute('INSTALL spatial'); print('DuckDB spatial extension installed')"
+ # INSTALL is a no-op on DuckDB 1.5 stable (spatial bundled natively)
but required for nightly builds.
+ python -c "import duckdb; con = duckdb.connect();
con.execute('INSTALL spatial'); con.execute('LOAD spatial'); print('DuckDB
spatial extension installed and loaded')"
- - name: Run DuckDB benchmark
+ - name: Run DuckDB ${{ matrix.query }}
run: |
python benchmark/run_benchmark.py \
- --data-dir benchmark-data-sf${{ env.SCALE_FACTOR }} \
+ --data-dir benchmark-data-sf${{ matrix.scale_factor }} \
--engines duckdb \
- --timeout ${{ env.QUERY_TIMEOUT }} \
+ --queries ${{ matrix.query }} \
--runs ${{ env.BENCHMARK_RUNS }} \
- --scale-factor ${{ env.SCALE_FACTOR }} \
- --output duckdb_results.json
+ --timeout ${{ env.QUERY_TIMEOUT }} \
+ --scale-factor ${{ matrix.scale_factor }} \
+ --output duckdb_${{ matrix.query }}_results.json
- name: Upload results
+ if: always() && hashFiles(format('duckdb_{0}_results.json',
matrix.query)) != ''
uses: actions/upload-artifact@v6
with:
- name: duckdb-results-sf${{ env.SCALE_FACTOR }}
- path: duckdb_results.json
+ name: duckdb-${{ matrix.query }}-results-sf${{ matrix.scale_factor }}
+ path: duckdb_${{ matrix.query }}_results.json
retention-days: 30
benchmark-geopandas:
- name: Benchmark GeoPandas (SF${{ github.event.inputs.scale_factor || '1'
}})
- needs: download-data
+ name: GeoPandas ${{ matrix.query }} (SF${{ matrix.scale_factor }})
+ needs: [parse-scale-factors, download-data]
runs-on: ubuntu-latest
if: contains(github.event.inputs.engines ||
'duckdb,geopandas,sedonadb,spatial_polars', 'geopandas')
+ strategy:
+ matrix:
+ scale_factor: ${{ fromJson(needs.parse-scale-factors.outputs.matrix) }}
+ query: [q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12]
+ fail-fast: false
+ max-parallel: 2
steps:
- uses: actions/checkout@v6
- name: Restore benchmark data from cache
uses: actions/cache/restore@v5
with:
- path: benchmark-data-sf${{ env.SCALE_FACTOR }}
- key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{
env.SCALE_FACTOR }}
+ path: benchmark-data-sf${{ matrix.scale_factor }}
+ key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{
matrix.scale_factor }}
fail-on-cache-miss: true
- name: Setup Python
uses: actions/setup-python@v6
with:
python-version: '3.11'
+ cache: 'pip'
+ cache-dependency-path: .github/workflows/benchmark.yml
- name: Install dependencies
run: |
@@ -273,51 +304,56 @@ jobs:
fi
echo "Installed GeoPandas version: $(python -c 'from
importlib.metadata import version; print(version("geopandas"))')"
- - name: Run GeoPandas benchmark
+ - name: Run GeoPandas ${{ matrix.query }}
run: |
python benchmark/run_benchmark.py \
- --data-dir benchmark-data-sf${{ env.SCALE_FACTOR }} \
+ --data-dir benchmark-data-sf${{ matrix.scale_factor }} \
--engines geopandas \
- --timeout ${{ env.QUERY_TIMEOUT }} \
+ --queries ${{ matrix.query }} \
--runs ${{ env.BENCHMARK_RUNS }} \
- --scale-factor ${{ env.SCALE_FACTOR }} \
- --output geopandas_results.json
+ --timeout ${{ env.QUERY_TIMEOUT }} \
+ --scale-factor ${{ matrix.scale_factor }} \
+ --output geopandas_${{ matrix.query }}_results.json
- name: Upload results
+ if: always() && hashFiles(format('geopandas_{0}_results.json',
matrix.query)) != ''
uses: actions/upload-artifact@v6
with:
- name: geopandas-results-sf${{ env.SCALE_FACTOR }}
- path: geopandas_results.json
+ name: geopandas-${{ matrix.query }}-results-sf${{
matrix.scale_factor }}
+ path: geopandas_${{ matrix.query }}_results.json
retention-days: 30
benchmark-sedonadb:
- name: Benchmark SedonaDB (SF${{ github.event.inputs.scale_factor || '1' }})
- needs: download-data
+ name: SedonaDB ${{ matrix.query }} (SF${{ matrix.scale_factor }})
+ needs: [parse-scale-factors, download-data]
runs-on: ubuntu-latest
if: contains(github.event.inputs.engines ||
'duckdb,geopandas,sedonadb,spatial_polars', 'sedonadb')
+ strategy:
+ matrix:
+ scale_factor: ${{ fromJson(needs.parse-scale-factors.outputs.matrix) }}
+ query: [q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12]
+ fail-fast: false
+ max-parallel: 2
steps:
- uses: actions/checkout@v6
- name: Restore benchmark data from cache
uses: actions/cache/restore@v5
with:
- path: benchmark-data-sf${{ env.SCALE_FACTOR }}
- key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{
env.SCALE_FACTOR }}
+ path: benchmark-data-sf${{ matrix.scale_factor }}
+ key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{
matrix.scale_factor }}
fail-on-cache-miss: true
- name: Setup Python
uses: actions/setup-python@v6
with:
python-version: '3.11'
+ cache: 'pip'
+ cache-dependency-path: .github/workflows/benchmark.yml
- name: Install dependencies
run: |
- echo "=== SedonaDB Installation Parameters ==="
- echo "SEDONADB_NIGHTLY: ${{ env.SEDONADB_NIGHTLY }}"
- echo "SEDONADB_VERSION: ${{ env.SEDONADB_VERSION }}"
- echo "========================================"
if [ "${{ env.SEDONADB_NIGHTLY }}" = "true" ]; then
- # Use Gemfury as primary index and --pre to install nightly alpha
builds (e.g., 0.3.0a69)
pip install "sedonadb[geopandas]" pandas pyarrow pyproj \
--pre \
--index-url https://repo.fury.io/sedona-nightlies/ \
@@ -329,42 +365,52 @@ jobs:
fi
echo "Installed SedonaDB version: $(python -c 'from
importlib.metadata import version; print(version("sedonadb"))')"
- - name: Run SedonaDB benchmark
+ - name: Run SedonaDB ${{ matrix.query }}
run: |
python benchmark/run_benchmark.py \
- --data-dir benchmark-data-sf${{ env.SCALE_FACTOR }} \
+ --data-dir benchmark-data-sf${{ matrix.scale_factor }} \
--engines sedonadb \
- --timeout ${{ env.QUERY_TIMEOUT }} \
+ --queries ${{ matrix.query }} \
--runs ${{ env.BENCHMARK_RUNS }} \
- --scale-factor ${{ env.SCALE_FACTOR }} \
- --output sedonadb_results.json
+ --timeout ${{ env.QUERY_TIMEOUT }} \
+ --scale-factor ${{ matrix.scale_factor }} \
+ --output sedonadb_${{ matrix.query }}_results.json
- name: Upload results
+ if: always() && hashFiles(format('sedonadb_{0}_results.json',
matrix.query)) != ''
uses: actions/upload-artifact@v6
with:
- name: sedonadb-results-sf${{ env.SCALE_FACTOR }}
- path: sedonadb_results.json
+ name: sedonadb-${{ matrix.query }}-results-sf${{ matrix.scale_factor
}}
+ path: sedonadb_${{ matrix.query }}_results.json
retention-days: 30
benchmark-spatial-polars:
- name: Benchmark Spatial Polars (SF${{ github.event.inputs.scale_factor ||
'1' }})
- needs: download-data
+ name: Spatial Polars ${{ matrix.query }} (SF${{ matrix.scale_factor }})
+ needs: [parse-scale-factors, download-data]
runs-on: ubuntu-latest
if: contains(github.event.inputs.engines ||
'duckdb,geopandas,sedonadb,spatial_polars', 'spatial_polars')
+ strategy:
+ matrix:
+ scale_factor: ${{ fromJson(needs.parse-scale-factors.outputs.matrix) }}
+ query: [q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12]
+ fail-fast: false
+ max-parallel: 2
steps:
- uses: actions/checkout@v6
- name: Restore benchmark data from cache
uses: actions/cache/restore@v5
with:
- path: benchmark-data-sf${{ env.SCALE_FACTOR }}
- key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{
env.SCALE_FACTOR }}
+ path: benchmark-data-sf${{ matrix.scale_factor }}
+ key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{
matrix.scale_factor }}
fail-on-cache-miss: true
- name: Setup Python
uses: actions/setup-python@v6
with:
python-version: '3.11'
+ cache: 'pip'
+ cache-dependency-path: .github/workflows/benchmark.yml
- name: Install dependencies
run: |
@@ -375,61 +421,42 @@ jobs:
fi
echo "Installed Spatial Polars version: $(python -c 'from
importlib.metadata import version; print(version("spatial-polars"))')"
- - name: Run Spatial Polars benchmark
+ - name: Run Spatial Polars ${{ matrix.query }}
run: |
python benchmark/run_benchmark.py \
- --data-dir benchmark-data-sf${{ env.SCALE_FACTOR }} \
+ --data-dir benchmark-data-sf${{ matrix.scale_factor }} \
--engines spatial_polars \
- --timeout ${{ env.QUERY_TIMEOUT }} \
+ --queries ${{ matrix.query }} \
--runs ${{ env.BENCHMARK_RUNS }} \
- --scale-factor ${{ env.SCALE_FACTOR }} \
- --output spatial_polars_results.json
+ --timeout ${{ env.QUERY_TIMEOUT }} \
+ --scale-factor ${{ matrix.scale_factor }} \
+ --output spatial_polars_${{ matrix.query }}_results.json
- name: Upload results
+ if: always() && hashFiles(format('spatial_polars_{0}_results.json',
matrix.query)) != ''
uses: actions/upload-artifact@v6
with:
- name: spatial_polars-results-sf${{ env.SCALE_FACTOR }}
- path: spatial_polars_results.json
+ name: spatial_polars-${{ matrix.query }}-results-sf${{
matrix.scale_factor }}
+ path: spatial_polars_${{ matrix.query }}_results.json
retention-days: 30
summarize-results:
- name: Summarize Results (SF${{ github.event.inputs.scale_factor || '1' }})
- needs: [benchmark-duckdb, benchmark-geopandas, benchmark-sedonadb,
benchmark-spatial-polars]
- if: always() && (needs.benchmark-duckdb.result == 'success' ||
needs.benchmark-geopandas.result == 'success' ||
needs.benchmark-sedonadb.result == 'success' ||
needs.benchmark-spatial-polars.result == 'success')
+ name: Summarize Results (SF${{ matrix.scale_factor }})
+ needs: [parse-scale-factors, benchmark-duckdb, benchmark-geopandas,
benchmark-sedonadb, benchmark-spatial-polars]
+ if: always() && (needs.benchmark-duckdb.result != 'cancelled' ||
needs.benchmark-geopandas.result != 'cancelled' ||
needs.benchmark-sedonadb.result != 'cancelled' ||
needs.benchmark-spatial-polars.result != 'cancelled')
runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ scale_factor: ${{ fromJson(needs.parse-scale-factors.outputs.matrix) }}
steps:
- uses: actions/checkout@v6
- - name: Download DuckDB results
- if: needs.benchmark-duckdb.result == 'success'
- uses: actions/download-artifact@v7
- with:
- name: duckdb-results-sf${{ env.SCALE_FACTOR }}
- path: results
- continue-on-error: true
-
- - name: Download GeoPandas results
- if: needs.benchmark-geopandas.result == 'success'
- uses: actions/download-artifact@v7
- with:
- name: geopandas-results-sf${{ env.SCALE_FACTOR }}
- path: results
- continue-on-error: true
-
- - name: Download SedonaDB results
- if: needs.benchmark-sedonadb.result == 'success'
- uses: actions/download-artifact@v7
- with:
- name: sedonadb-results-sf${{ env.SCALE_FACTOR }}
- path: results
- continue-on-error: true
-
- - name: Download Spatial Polars results
- if: needs.benchmark-spatial-polars.result == 'success'
+ - name: Download all results for this scale factor
uses: actions/download-artifact@v7
with:
- name: spatial_polars-results-sf${{ env.SCALE_FACTOR }}
+ pattern: '*-results-sf${{ matrix.scale_factor }}'
path: results
+ merge-multiple: true
continue-on-error: true
- name: Setup Python
@@ -443,6 +470,7 @@ jobs:
--results-dir results \
--timeout ${{ env.QUERY_TIMEOUT }} \
--runs ${{ env.BENCHMARK_RUNS }} \
+ --engines ${{ env.BENCHMARK_ENGINES }} \
--output benchmark_summary.md
- name: Display summary
@@ -454,7 +482,7 @@ jobs:
- name: Upload combined results
uses: actions/upload-artifact@v6
with:
- name: benchmark-summary-sf${{ env.SCALE_FACTOR }}
+ name: benchmark-summary-sf${{ matrix.scale_factor }}
path: |
results/
benchmark_summary.md
diff --git a/benchmark/run_benchmark.py b/benchmark/run_benchmark.py
index e05e237..ce438ae 100755
--- a/benchmark/run_benchmark.py
+++ b/benchmark/run_benchmark.py
@@ -428,6 +428,7 @@ def run_benchmark(
timeout: int,
scale_factor: float,
runs: int = 3,
+ output_file: str | None = None,
) -> BenchmarkSuite:
"""Generic benchmark runner for any engine.
@@ -438,6 +439,9 @@ def run_benchmark(
If runs > 1 and the first run succeeds, additional runs are performed
and the average time is reported for fair comparison.
+
+ If output_file is provided, results are saved incrementally after each
+ query so that partial results survive if the runner crashes mid-way.
"""
from importlib.metadata import version as pkg_version
@@ -483,60 +487,94 @@ def run_benchmark(
all_queries = config["queries_getter"]()
engine_class = config["class"]
- for query_name, query_sql in all_queries.items():
- if queries and query_name not in queries:
- continue
+ # Determine which queries will be run
+ query_items = [
+ (qname, qsql) for qname, qsql in all_queries.items()
+ if not queries or qname in queries
+ ]
- print(f" Running {query_name}...", end=" ", flush=True)
+ # Pre-populate all queries as "not_started" so even a total crash
+ # (e.g. OOM killing the runner) leaves a file showing what was attempted
+ for query_name, _ in query_items:
+ suite.results.append(BenchmarkResult(
+ query=query_name,
+ engine=engine,
+ time_seconds=None,
+ row_count=None,
+ status="not_started",
+ error_message=None,
+ ))
+ if output_file:
+ save_results([suite], output_file)
- # First run
- result = run_query_isolated(
- engine_class=engine_class,
- engine_name=engine,
- data_paths=data_paths,
- query_name=query_name,
- query_sql=query_sql,
- timeout=timeout,
- )
+ # Install a SIGTERM handler so we flush results if the runner is shutting
down
+ def _sigterm_handler(signum, frame):
+ print(f"\nReceived signal {signum}, saving partial results...",
flush=True)
+ if output_file:
+ save_results([suite], output_file)
+ sys.exit(128 + signum)
- # If first run succeeded and we want multiple runs, do additional runs
- if result.status == "success" and runs > 1:
- run_times = [result.time_seconds]
-
- for run_num in range(2, runs + 1):
- additional_result = run_query_isolated(
- engine_class=engine_class,
- engine_name=engine,
- data_paths=data_paths,
- query_name=query_name,
- query_sql=query_sql,
- timeout=timeout,
- )
- if additional_result.status == "success":
- run_times.append(additional_result.time_seconds)
- else:
- # If any subsequent run fails, just use successful runs
- break
-
- # Calculate average of all successful runs
- avg_time = round(sum(run_times) / len(run_times), 2)
- result = BenchmarkResult(
- query=query_name,
- engine=engine,
- time_seconds=avg_time,
- row_count=result.row_count,
- status="success",
- error_message=None,
+ prev_handler = signal.signal(signal.SIGTERM, _sigterm_handler)
+
+ try:
+ for idx, (query_name, query_sql) in enumerate(query_items):
+ print(f" Running {query_name}...", end=" ", flush=True)
+
+ # First run
+ result = run_query_isolated(
+ engine_class=engine_class,
+ engine_name=engine,
+ data_paths=data_paths,
+ query_name=query_name,
+ query_sql=query_sql,
+ timeout=timeout,
)
- print(f"{avg_time}s avg ({len(run_times)} runs, {result.row_count}
rows)")
- elif result.status == "success":
- print(f"{result.time_seconds}s ({result.row_count} rows)")
- else:
- print(f"{result.status.upper()}: {result.error_message}")
- suite.results.append(result)
- if result.status == "success":
- suite.total_time += result.time_seconds
+ # If first run succeeded and we want multiple runs, do additional
runs
+ if result.status == "success" and runs > 1:
+ run_times = [result.time_seconds]
+
+ for run_num in range(2, runs + 1):
+ additional_result = run_query_isolated(
+ engine_class=engine_class,
+ engine_name=engine,
+ data_paths=data_paths,
+ query_name=query_name,
+ query_sql=query_sql,
+ timeout=timeout,
+ )
+ if additional_result.status == "success":
+ run_times.append(additional_result.time_seconds)
+ else:
+ # If any subsequent run fails, just use successful runs
+ break
+
+ # Calculate average of all successful runs
+ avg_time = round(sum(run_times) / len(run_times), 2)
+ result = BenchmarkResult(
+ query=query_name,
+ engine=engine,
+ time_seconds=avg_time,
+ row_count=result.row_count,
+ status="success",
+ error_message=None,
+ )
+ print(f"{avg_time}s avg ({len(run_times)} runs,
{result.row_count} rows)")
+ elif result.status == "success":
+ print(f"{result.time_seconds}s ({result.row_count} rows)")
+ else:
+ print(f"{result.status.upper()}: {result.error_message}")
+
+ # Replace the pre-populated "not_started" entry with the actual
result
+ suite.results[idx] = result
+ if result.status == "success":
+ suite.total_time += result.time_seconds
+
+ # Save partial results after each query so they survive crashes
+ if output_file:
+ save_results([suite], output_file)
+ finally:
+ signal.signal(signal.SIGTERM, prev_handler)
return suite
@@ -629,7 +667,7 @@ def main():
print(f" {table}: {path}")
results = [
- run_benchmark(engine, data_paths, queries, args.timeout,
args.scale_factor, args.runs)
+ run_benchmark(engine, data_paths, queries, args.timeout,
args.scale_factor, args.runs, args.output)
for engine in engines
]
diff --git a/benchmark/summarize_results.py b/benchmark/summarize_results.py
index a52fcc8..74b7a3f 100755
--- a/benchmark/summarize_results.py
+++ b/benchmark/summarize_results.py
@@ -26,8 +26,20 @@ from datetime import datetime, timezone
from pathlib import Path
-def load_results(results_dir: str) -> dict:
- """Load all JSON result files from a directory."""
+def load_results(results_dir: str, expected_engines: list[str] | None = None)
-> dict:
+ """Load all JSON result files from a directory.
+
+ Supports two layouts:
+ 1. One file per engine (e.g., duckdb_results.json with all queries)
+ 2. One file per query (e.g., duckdb_q1_results.json with a single query)
+
+ Per-query files are merged into a single suite per engine. If multiple
files
+ contain results for the same engine, their query results are combined.
+
+ If expected_engines is provided, engines that were expected to run but have
+ no results file will be included with all queries marked as 'not_started'.
+ This handles the case where a runner was OOM-killed before uploading
results.
+ """
results = {}
results_path = Path(results_dir)
@@ -36,7 +48,55 @@ def load_results(results_dir: str) -> dict:
data = json.load(f)
for suite in data.get("results", []):
engine = suite["engine"]
- results[engine] = suite
+ if engine not in results:
+ results[engine] = suite
+ else:
+ # Merge query results from multiple files for the same
engine
+ existing_queries = {r["query"] for r in
results[engine].get("results", [])}
+ for r in suite.get("results", []):
+ if r["query"] not in existing_queries:
+ results[engine]["results"].append(r)
+ existing_queries.add(r["query"])
+ elif r.get("status") != "not_started":
+ # Replace not_started placeholder with actual
result
+ results[engine]["results"] = [
+ r if existing["query"] == r["query"] else
existing
+ for existing in results[engine]["results"]
+ ]
+
+ # For expected engines with no results, create placeholder entries
+ if expected_engines:
+ # Determine the full query list from engines that did report results
+ all_queries = set()
+ scale_factor = None
+ for engine_data in results.values():
+ if scale_factor is None:
+ scale_factor = engine_data.get("scale_factor", 1)
+ for r in engine_data.get("results", []):
+ all_queries.add(r["query"])
+
+ # Default to q1-q12 if no engine reported any results
+ if not all_queries:
+ all_queries = {f"q{i}" for i in range(1, 13)}
+
+ for engine in expected_engines:
+ if engine not in results:
+ results[engine] = {
+ "engine": engine,
+ "version": "unknown",
+ "scale_factor": scale_factor or 1,
+ "timestamp": datetime.now(timezone.utc).isoformat(),
+ "results": [
+ {
+ "query": q,
+ "status": "not_started",
+ "time_seconds": None,
+ "row_count": None,
+ "error_message": "Runner was killed before
completing this query (likely OOM)",
+ }
+ for q in sorted(all_queries, key=lambda x: int(x[1:]))
+ ],
+ }
return results
@@ -151,29 +211,38 @@ def generate_markdown_summary(results: dict, output_file:
str, query_timeout: in
row += " ⏱️ TIMEOUT |"
elif status == "error":
row += " ❌ ERROR |"
+ elif status == "not_started":
+ row += " 💀 OOM |"
else:
row += " — |"
lines.append(row)
- # Win count summary
+ # Win count and completion summary
win_counts = {engine: 0 for engine in engines}
+ completed_counts = {engine: 0 for engine in engines}
+ total_queries = len(all_queries)
for query in all_queries:
winner = get_winner(query, data, engines)
if winner:
win_counts[winner] += 1
+ for engine in engines:
+ result = data.get(engine, {}).get(query, {})
+ if result.get("status") == "success":
+ completed_counts[engine] += 1
lines.extend([
"",
"## 🥇 Performance Summary",
"",
- "| Engine | Wins |",
- "|--------|:----:|",
+ "| Engine | Completed | Wins |",
+ "|--------|:---------:|:----:|",
])
for engine in sorted(engines, key=lambda e: win_counts[e], reverse=True):
icon_name = engine_icons.get(engine, engine.title())
wins = win_counts[engine]
- lines.append(f"| {icon_name} | {wins} |")
+ completed = completed_counts[engine]
+ lines.append(f"| {icon_name} | {completed}/{total_queries} | {wins} |")
# Detailed results section (collapsible)
lines.extend([
@@ -203,6 +272,7 @@ def generate_markdown_summary(results: dict, output_file:
str, query_timeout: in
"success": "✅",
"error": "❌",
"timeout": "⏱️",
+ "not_started": "💀",
}.get(status, "❓")
lines.append(f"| {query.upper()} | {time_str} | {status_emoji} |
{row_str} |")
@@ -219,14 +289,24 @@ def generate_markdown_summary(results: dict, output_file:
str, query_timeout: in
for engine in engines:
engine_errors = []
+ not_started_queries = []
for query in all_queries:
result = data.get(engine, {}).get(query, {})
- if result.get("status") in ("error", "timeout"):
+ status = result.get("status")
+ if status in ("error", "timeout"):
error_msg = result.get("error_message", "No details available")
# Truncate long error messages
if len(error_msg) > 200:
error_msg = error_msg[:200] + "..."
engine_errors.append(f"- **{query.upper()}**: `{error_msg}`")
+ elif status == "not_started":
+ not_started_queries.append(query.upper())
+
+ if not_started_queries:
+ engine_errors.append(
+ f"- **{', '.join(not_started_queries)}**: "
+ f"`Could not complete these queries, likely due to OOM (runner
was killed)`"
+ )
if engine_errors:
has_errors = True
@@ -248,6 +328,7 @@ def generate_markdown_summary(results: dict, output_file:
str, query_timeout: in
"| **bold** | Fastest for this query |",
"| ⏱️ TIMEOUT | Query exceeded timeout |",
"| ❌ ERROR | Query failed |",
+ "| 💀 OOM | Could not run, likely due to out-of-memory (runner killed)
|",
"",
f"*Generated by
[SpatialBench](https://github.com/apache/sedona-spatialbench) on
{datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}*",
])
@@ -289,10 +370,18 @@ def main():
default=3,
help="Number of runs per query (for reporting)",
)
+ parser.add_argument(
+ "--engines",
+ type=str,
+ default=None,
+ help="Comma-separated list of expected engines (e.g.,
'duckdb,geopandas,sedonadb,spatial_polars'). "
+ "Engines that were expected but have no results will be shown as
OOM/runner-killed.",
+ )
args = parser.parse_args()
- results = load_results(args.results_dir)
+ expected_engines = [e.strip() for e in args.engines.split(",")] if
args.engines else None
+ results = load_results(args.results_dir, expected_engines=expected_engines)
if not results:
print(f"No results found in {args.results_dir}")