This is an automated email from the ASF dual-hosted git repository.
comphead pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-benchmarks.git
The following commit(s) were added to refs/heads/main by this push:
new af27848 feat: add TPC-DS data (#24)
af27848 is described below
commit af27848bd345df60c06f5f9fe9bbb2a547bdf113
Author: Oleks V <[email protected]>
AuthorDate: Wed Nov 26 17:22:53 2025 -0800
feat: add TPC-DS data (#24)
* chore: support specific query
* Adding TPCDS SF1 example data along with updated `gen.sh`
---------
Co-authored-by: ovoievodin <[email protected]>
---
tpcds/Dockerfile | 23 ++++++++++++++++-------
tpcds/data/sf1/call_center.parquet | Bin 0 -> 20680 bytes
tpcds/data/sf1/catalog_page.parquet | Bin 0 -> 451245 bytes
tpcds/data/sf1/catalog_returns.parquet | Bin 0 -> 9877124 bytes
tpcds/data/sf1/catalog_sales.parquet | Bin 0 -> 85430135 bytes
tpcds/data/sf1/customer.parquet | Bin 0 -> 4574173 bytes
tpcds/data/sf1/customer_address.parquet | Bin 0 -> 895931 bytes
tpcds/data/sf1/customer_demographics.parquet | Bin 0 -> 3747969 bytes
tpcds/data/sf1/date_dim.parquet | Bin 0 -> 1411299 bytes
tpcds/data/sf1/household_demographics.parquet | Bin 0 -> 25497 bytes
tpcds/data/sf1/income_band.parquet | Bin 0 -> 2698 bytes
tpcds/data/sf1/inventory.parquet | Bin 0 -> 15534351 bytes
tpcds/data/sf1/item.parquet | Bin 0 -> 1190446 bytes
tpcds/data/sf1/promotion.parquet | Bin 0 -> 24047 bytes
tpcds/data/sf1/reason.parquet | Bin 0 -> 3370 bytes
tpcds/data/sf1/ship_mode.parquet | Bin 0 -> 4921 bytes
tpcds/data/sf1/store.parquet | Bin 0 -> 19512 bytes
tpcds/data/sf1/store_returns.parquet | Bin 0 -> 13787027 bytes
tpcds/data/sf1/store_sales.parquet | Bin 0 -> 104062329 bytes
tpcds/data/sf1/time_dim.parquet | Bin 0 -> 837756 bytes
tpcds/data/sf1/warehouse.parquet | Bin 0 -> 9351 bytes
tpcds/data/sf1/web_page.parquet | Bin 0 -> 10334 bytes
tpcds/data/sf1/web_returns.parquet | Bin 0 -> 5222963 bytes
tpcds/data/sf1/web_sales.parquet | Bin 0 -> 40606746 bytes
tpcds/data/sf1/web_site.parquet | Bin 0 -> 20192 bytes
tpcds/gen.sh | 16 +++++++++++-----
26 files changed, 27 insertions(+), 12 deletions(-)
diff --git a/tpcds/Dockerfile b/tpcds/Dockerfile
index ad1b00e..7b8e862 100644
--- a/tpcds/Dockerfile
+++ b/tpcds/Dockerfile
@@ -1,20 +1,29 @@
-FROM rust:1-slim-buster
+FROM rust:1-slim-bullseye
RUN apt update && apt install -y zip gcc make flex bison byacc git
# TPC-DS generator
COPY tpc-ds-tool.zip .
RUN unzip tpc-ds-tool.zip
-WORKDIR /DSGen-software-code-3.2.0rc1/tools
+
+# Find the actual extracted directory and create a consistent symlink
+RUN ls -la && \
+ EXTRACTED_DIR=$(find . -maxdepth 1 -name "DSGen-software-code*" -type d | head -1) && \
+ echo "Found directory: $EXTRACTED_DIR" && \
+ ln -sf "$EXTRACTED_DIR" /dsgen-tools
+
+WORKDIR /dsgen-tools/tools
# Fix bad UTF-8 char
RUN iconv -f ISO-8859-14 -t UTF-8 tpcds.dst > tpcds.dst2
RUN mv tpcds.dst2 tpcds.dst
-# compile
-RUN make
+# compile with flags to handle multiple definitions
+RUN make CC="gcc -fcommon"
-# tpctools crate
-RUN cargo install tpctools
+# Copy and make gen.sh executable
+COPY gen.sh /usr/local/bin/gen.sh
+RUN chmod +x /usr/local/bin/gen.sh
-ADD gen.sh .
\ No newline at end of file
+# Set working directory to root for script execution
+WORKDIR /
\ No newline at end of file
diff --git a/tpcds/data/sf1/call_center.parquet b/tpcds/data/sf1/call_center.parquet
new file mode 100644
index 0000000..efeded2
Binary files /dev/null and b/tpcds/data/sf1/call_center.parquet differ
diff --git a/tpcds/data/sf1/catalog_page.parquet b/tpcds/data/sf1/catalog_page.parquet
new file mode 100644
index 0000000..c40f4a3
Binary files /dev/null and b/tpcds/data/sf1/catalog_page.parquet differ
diff --git a/tpcds/data/sf1/catalog_returns.parquet b/tpcds/data/sf1/catalog_returns.parquet
new file mode 100644
index 0000000..6d7f2fb
Binary files /dev/null and b/tpcds/data/sf1/catalog_returns.parquet differ
diff --git a/tpcds/data/sf1/catalog_sales.parquet b/tpcds/data/sf1/catalog_sales.parquet
new file mode 100644
index 0000000..2abdfac
Binary files /dev/null and b/tpcds/data/sf1/catalog_sales.parquet differ
diff --git a/tpcds/data/sf1/customer.parquet b/tpcds/data/sf1/customer.parquet
new file mode 100644
index 0000000..0301da3
Binary files /dev/null and b/tpcds/data/sf1/customer.parquet differ
diff --git a/tpcds/data/sf1/customer_address.parquet b/tpcds/data/sf1/customer_address.parquet
new file mode 100644
index 0000000..f99c948
Binary files /dev/null and b/tpcds/data/sf1/customer_address.parquet differ
diff --git a/tpcds/data/sf1/customer_demographics.parquet b/tpcds/data/sf1/customer_demographics.parquet
new file mode 100644
index 0000000..f21868e
Binary files /dev/null and b/tpcds/data/sf1/customer_demographics.parquet differ
diff --git a/tpcds/data/sf1/date_dim.parquet b/tpcds/data/sf1/date_dim.parquet
new file mode 100644
index 0000000..7ae176f
Binary files /dev/null and b/tpcds/data/sf1/date_dim.parquet differ
diff --git a/tpcds/data/sf1/household_demographics.parquet b/tpcds/data/sf1/household_demographics.parquet
new file mode 100644
index 0000000..d8046c8
Binary files /dev/null and b/tpcds/data/sf1/household_demographics.parquet differ
diff --git a/tpcds/data/sf1/income_band.parquet b/tpcds/data/sf1/income_band.parquet
new file mode 100644
index 0000000..b3b6e9e
Binary files /dev/null and b/tpcds/data/sf1/income_band.parquet differ
diff --git a/tpcds/data/sf1/inventory.parquet b/tpcds/data/sf1/inventory.parquet
new file mode 100644
index 0000000..d898890
Binary files /dev/null and b/tpcds/data/sf1/inventory.parquet differ
diff --git a/tpcds/data/sf1/item.parquet b/tpcds/data/sf1/item.parquet
new file mode 100644
index 0000000..76dbb94
Binary files /dev/null and b/tpcds/data/sf1/item.parquet differ
diff --git a/tpcds/data/sf1/promotion.parquet b/tpcds/data/sf1/promotion.parquet
new file mode 100644
index 0000000..83f8260
Binary files /dev/null and b/tpcds/data/sf1/promotion.parquet differ
diff --git a/tpcds/data/sf1/reason.parquet b/tpcds/data/sf1/reason.parquet
new file mode 100644
index 0000000..4155a60
Binary files /dev/null and b/tpcds/data/sf1/reason.parquet differ
diff --git a/tpcds/data/sf1/ship_mode.parquet b/tpcds/data/sf1/ship_mode.parquet
new file mode 100644
index 0000000..e4e6c94
Binary files /dev/null and b/tpcds/data/sf1/ship_mode.parquet differ
diff --git a/tpcds/data/sf1/store.parquet b/tpcds/data/sf1/store.parquet
new file mode 100644
index 0000000..4e9a7ae
Binary files /dev/null and b/tpcds/data/sf1/store.parquet differ
diff --git a/tpcds/data/sf1/store_returns.parquet b/tpcds/data/sf1/store_returns.parquet
new file mode 100644
index 0000000..e09a8c9
Binary files /dev/null and b/tpcds/data/sf1/store_returns.parquet differ
diff --git a/tpcds/data/sf1/store_sales.parquet b/tpcds/data/sf1/store_sales.parquet
new file mode 100644
index 0000000..036a85b
Binary files /dev/null and b/tpcds/data/sf1/store_sales.parquet differ
diff --git a/tpcds/data/sf1/time_dim.parquet b/tpcds/data/sf1/time_dim.parquet
new file mode 100644
index 0000000..12d2332
Binary files /dev/null and b/tpcds/data/sf1/time_dim.parquet differ
diff --git a/tpcds/data/sf1/warehouse.parquet b/tpcds/data/sf1/warehouse.parquet
new file mode 100644
index 0000000..ad85144
Binary files /dev/null and b/tpcds/data/sf1/warehouse.parquet differ
diff --git a/tpcds/data/sf1/web_page.parquet b/tpcds/data/sf1/web_page.parquet
new file mode 100644
index 0000000..dff04fc
Binary files /dev/null and b/tpcds/data/sf1/web_page.parquet differ
diff --git a/tpcds/data/sf1/web_returns.parquet b/tpcds/data/sf1/web_returns.parquet
new file mode 100644
index 0000000..99eadf1
Binary files /dev/null and b/tpcds/data/sf1/web_returns.parquet differ
diff --git a/tpcds/data/sf1/web_sales.parquet b/tpcds/data/sf1/web_sales.parquet
new file mode 100644
index 0000000..41b642f
Binary files /dev/null and b/tpcds/data/sf1/web_sales.parquet differ
diff --git a/tpcds/data/sf1/web_site.parquet b/tpcds/data/sf1/web_site.parquet
new file mode 100644
index 0000000..8cf6c8a
Binary files /dev/null and b/tpcds/data/sf1/web_site.parquet differ
diff --git a/tpcds/gen.sh b/tpcds/gen.sh
index c18b05b..c928e3b 100755
--- a/tpcds/gen.sh
+++ b/tpcds/gen.sh
@@ -1,6 +1,12 @@
#!/bin/bash
-tpctools generate --benchmark tpcds \
- --scale 100 \
- --partitions 12 \
- --generator-path /DSGen-software-code-3.2.0rc1/tools \
- --output /data
\ No newline at end of file
+cd /dsgen-tools/tools
+
+for i in $(seq 1 12); do
+ mkdir -p /data/part_$i
+ ./dsdgen -scale 100 \
+ -dir /data/part_$i \
+ -parallel 12 \
+ -child $i \
+ -terminate n &
+done
+wait
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]