This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-python.git
The following commit(s) were added to refs/heads/main by this push:
new 0d42e77 Use int64 for TPC-H keys and set input schema to not nullable (#714)
0d42e77 is described below
commit 0d42e77b469708f10ec69ff51366a529d3454efb
Author: Tim Saucer <[email protected]>
AuthorDate: Thu May 23 18:09:39 2024 -0400
Use int64 for TPC-H keys and set input schema to not nullable (#714)
---
examples/tpch/convert_data_to_parquet.py | 30 ++++++++++++++++--------------
1 file changed, 16 insertions(+), 14 deletions(-)
diff --git a/examples/tpch/convert_data_to_parquet.py b/examples/tpch/convert_data_to_parquet.py
index 5da60bc..d81ec29 100644
--- a/examples/tpch/convert_data_to_parquet.py
+++ b/examples/tpch/convert_data_to_parquet.py
@@ -31,10 +31,10 @@ ctx = datafusion.SessionContext()
all_schemas = {}
all_schemas["customer"] = [
- ("C_CUSTKEY", pyarrow.int32()),
+ ("C_CUSTKEY", pyarrow.int64()),
("C_NAME", pyarrow.string()),
("C_ADDRESS", pyarrow.string()),
- ("C_NATIONKEY", pyarrow.int32()),
+ ("C_NATIONKEY", pyarrow.int64()),
("C_PHONE", pyarrow.string()),
("C_ACCTBAL", pyarrow.decimal128(15, 2)),
("C_MKTSEGMENT", pyarrow.string()),
@@ -42,9 +42,9 @@ all_schemas["customer"] = [
]
all_schemas["lineitem"] = [
- ("L_ORDERKEY", pyarrow.int32()),
- ("L_PARTKEY", pyarrow.int32()),
- ("L_SUPPKEY", pyarrow.int32()),
+ ("L_ORDERKEY", pyarrow.int64()),
+ ("L_PARTKEY", pyarrow.int64()),
+ ("L_SUPPKEY", pyarrow.int64()),
("L_LINENUMBER", pyarrow.int32()),
("L_QUANTITY", pyarrow.decimal128(15, 2)),
("L_EXTENDEDPRICE", pyarrow.decimal128(15, 2)),
@@ -61,15 +61,15 @@ all_schemas["lineitem"] = [
]
all_schemas["nation"] = [
- ("N_NATIONKEY", pyarrow.int32()),
+ ("N_NATIONKEY", pyarrow.int64()),
("N_NAME", pyarrow.string()),
- ("N_REGIONKEY", pyarrow.int32()),
+ ("N_REGIONKEY", pyarrow.int64()),
("N_COMMENT", pyarrow.string()),
]
all_schemas["orders"] = [
- ("O_ORDERKEY", pyarrow.int32()),
- ("O_CUSTKEY", pyarrow.int32()),
+ ("O_ORDERKEY", pyarrow.int64()),
+ ("O_CUSTKEY", pyarrow.int64()),
("O_ORDERSTATUS", pyarrow.string()),
("O_TOTALPRICE", pyarrow.decimal128(15, 2)),
("O_ORDERDATE", pyarrow.date32()),
@@ -80,7 +80,7 @@ all_schemas["orders"] = [
]
all_schemas["part"] = [
- ("P_PARTKEY", pyarrow.int32()),
+ ("P_PARTKEY", pyarrow.int64()),
("P_NAME", pyarrow.string()),
("P_MFGR", pyarrow.string()),
("P_BRAND", pyarrow.string()),
@@ -92,21 +92,21 @@ all_schemas["part"] = [
]
all_schemas["partsupp"] = [
- ("PS_PARTKEY", pyarrow.int32()),
- ("PS_SUPPKEY", pyarrow.int32()),
+ ("PS_PARTKEY", pyarrow.int64()),
+ ("PS_SUPPKEY", pyarrow.int64()),
("PS_AVAILQTY", pyarrow.int32()),
("PS_SUPPLYCOST", pyarrow.decimal128(15, 2)),
("PS_COMMENT", pyarrow.string()),
]
all_schemas["region"] = [
- ("r_REGIONKEY", pyarrow.int32()),
+ ("r_REGIONKEY", pyarrow.int64()),
("r_NAME", pyarrow.string()),
("r_COMMENT", pyarrow.string()),
]
all_schemas["supplier"] = [
- ("S_SUPPKEY", pyarrow.int32()),
+ ("S_SUPPKEY", pyarrow.int64()),
("S_NAME", pyarrow.string()),
("S_ADDRESS", pyarrow.string()),
("S_NATIONKEY", pyarrow.int32()),
@@ -125,6 +125,8 @@ for filename, curr_schema in all_schemas.items():
# in to handle the trailing | in the file
output_cols = [r[0] for r in curr_schema]
+ curr_schema = [ pyarrow.field(r[0], r[1], nullable=False) for r in curr_schema]
+
# Trailing | requires extra field for in processing
curr_schema.append(("some_null", pyarrow.null()))
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]