This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new 75ef45d1 Core: Fix bin packing when target file size is smaller than a row (#2844)
75ef45d1 is described below
commit 75ef45d1d4460dc17812859a7fee4619442765ed
Author: Soham <[email protected]>
AuthorDate: Mon Dec 22 23:51:46 2025 +0530
Core: Fix bin packing when target file size is smaller than a row (#2844)
## What does this change do?
When `write.target-file-size-bytes` is smaller than the average size of a
single row, bin packing computed a chunk size of 0 rows and PyArrow raised
a `ValueError`. This change clamps the chunk size to at least 1 row, so
writes still succeed (one row per batch/file when needed).
## Why is this needed?
Fixes a crash when users set a small target file size and attempt to
write large records.
## How was this tested?
- make lint
- uv run python -m pytest tests/io/test_pyarrow.py -k "bin_pack_arrow_table" -v
- make test (timed out at ~42%)
Closes #2795
Co-authored-by: Soham <[email protected]>
---
pyiceberg/io/pyarrow.py | 2 +-
tests/io/test_pyarrow.py | 6 ++++++
2 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index 1077f41f..efeb72cb 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -2681,7 +2681,7 @@ def bin_pack_arrow_table(tbl: pa.Table, target_file_size:
int) -> Iterator[list[
from pyiceberg.utils.bin_packing import PackingIterator
avg_row_size_bytes = tbl.nbytes / tbl.num_rows
- target_rows_per_file = target_file_size // avg_row_size_bytes
+ target_rows_per_file = max(1, int(target_file_size / avg_row_size_bytes))
batches = tbl.to_batches(max_chunksize=target_rows_per_file)
bin_packed_record_batches = PackingIterator(
items=batches,
diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py
index 869e60f4..ea2928ca 100644
--- a/tests/io/test_pyarrow.py
+++ b/tests/io/test_pyarrow.py
@@ -2248,6 +2248,12 @@ def test_bin_pack_arrow_table(arrow_table_with_null:
pa.Table) -> None:
assert len(list(bin_packed)) == 5
+def
test_bin_pack_arrow_table_target_size_smaller_than_row(arrow_table_with_null:
pa.Table) -> None:
+ bin_packed = list(bin_pack_arrow_table(arrow_table_with_null,
target_file_size=1))
+ assert len(bin_packed) == arrow_table_with_null.num_rows
+ assert sum(batch.num_rows for bin_ in bin_packed for batch in bin_) ==
arrow_table_with_null.num_rows
+
+
def test_schema_mismatch_type(table_schema_simple: Schema) -> None:
other_schema = pa.schema(
(