This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new f8256bd625 GH-36537: [Python] Ensure dataset writer follows default Parquet version of 2.6 (#36538)
f8256bd625 is described below
commit f8256bd625ac0b06238011fc13ea0249956e3859
Author: Joris Van den Bossche <[email protected]>
AuthorDate: Fri Jul 7 15:45:06 2023 +0200
GH-36537: [Python] Ensure dataset writer follows default Parquet version of
2.6 (#36538)
### Rationale for this change
When bumping the default Parquet write version from 1.0 to 2.4 and then to
2.6, we forgot to also bump that default in the parquet.dataset writer
(`ParquetFileWriteOptions`).
This PR bumps that directly from 1.0 to 2.6 to follow the default of the
pyarrow.parquet module.
### Are these changes tested?
Yes
### Are there any user-facing changes?
Different default version can give different types in the parquet file.
* Closes: #36537
Authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
---
python/pyarrow/_dataset_parquet.pyx | 8 +++++++-
python/pyarrow/parquet/core.py | 2 ++
python/pyarrow/tests/test_dataset.py | 18 ++++++++++++++++--
3 files changed, 25 insertions(+), 3 deletions(-)
diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx
index ad914c77bf..bc4786b9cd 100644
--- a/python/pyarrow/_dataset_parquet.pyx
+++ b/python/pyarrow/_dataset_parquet.pyx
@@ -531,6 +531,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
"use_deprecated_int96_timestamps",
"coerce_timestamps",
"allow_truncated_timestamps",
+ "use_compliant_nested_type",
}
setters = set()
@@ -586,7 +587,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
self._properties = dict(
use_dictionary=True,
compression="snappy",
- version="1.0",
+ version="2.6",
write_statistics=None,
data_page_size=None,
compression_level=None,
@@ -601,6 +602,11 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
self._set_properties()
self._set_arrow_properties()
+ def __repr__(self):
+ return "<pyarrow.dataset.ParquetFileWriteOptions {0}>".format(
+ " ".join([f"{key}={value}" for key, value in
self._properties.items()])
+ )
+
cdef set _PARQUET_READ_OPTIONS = {
'dictionary_columns', 'coerce_int96_timestamp_unit'
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index e7945499be..c93a346cbb 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -3078,6 +3078,8 @@ def write_table(table, where, row_group_size=None,
version='2.6',
dictionary_pagesize_limit=None,
store_schema=True,
**kwargs):
+ # Implementor's note: when adding keywords here / updating defaults, also
+ # update it in write_to_dataset and _dataset_parquet.pyx ParquetFileWriteOptions
row_group_size = kwargs.pop('chunk_size', row_group_size)
use_int96 = use_deprecated_int96_timestamps
try:
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index d939af662f..e8026a2af6 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -4564,7 +4564,9 @@ def test_write_table_partitioned_dict(tempdir):
@pytest.mark.parquet
def test_write_dataset_parquet(tempdir):
table = pa.table([
- pa.array(range(20)), pa.array(np.random.randn(20)),
+ pa.array(range(20), type="uint32"),
+ pa.array(np.arange("2012-01-01", 20, dtype="datetime64[D]").astype(
+ "datetime64[ns]")),
pa.array(np.repeat(['a', 'b'], 10))
], names=["f1", "f2", "part"])
@@ -4576,7 +4578,7 @@ def test_write_dataset_parquet(tempdir):
file_paths = list(base_dir.rglob("*"))
expected_paths = [base_dir / "part-0.parquet"]
assert set(file_paths) == set(expected_paths)
- # check Table roundtrip
+ # check Table roundtrip with default version
result = ds.dataset(base_dir, format="parquet").to_table()
assert result.equals(table)
@@ -4584,12 +4586,24 @@ def test_write_dataset_parquet(tempdir):
for version in ["1.0", "2.4", "2.6"]:
format = ds.ParquetFileFormat()
opts = format.make_write_options(version=version)
+ assert "<pyarrow.dataset.ParquetFileWriteOptions" in repr(opts)
base_dir = tempdir / 'parquet_dataset_version{0}'.format(version)
ds.write_dataset(table, base_dir, format=format, file_options=opts)
meta = pq.read_metadata(base_dir / "part-0.parquet")
expected_version = "1.0" if version == "1.0" else "2.6"
assert meta.format_version == expected_version
+ # ensure version is actually honored based on supported datatypes
+ result = ds.dataset(base_dir, format="parquet").to_table()
+ schema = table.schema
+ if version == "1.0":
+ # uint32 is written as int64
+ schema = schema.set(0, schema.field(0).with_type(pa.int64()))
+ if version in ("1.0", "2.4"):
+ schema = schema.set(1, schema.field(1).with_type(pa.timestamp("us")))
+ expected = table.cast(schema)
+ assert result.equals(expected)
+
def test_write_dataset_csv(tempdir):
table = pa.table([