This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git


The following commit(s) were added to refs/heads/main by this push:
     new 75ef45d1 Core: Fix bin packing when target file size is smaller than a row (#2844)
75ef45d1 is described below

commit 75ef45d1d4460dc17812859a7fee4619442765ed
Author: Soham <[email protected]>
AuthorDate: Mon Dec 22 23:51:46 2025 +0530

    Core: Fix bin packing when target file size is smaller than a row (#2844)
    
    ## What does this change do?
    When `write.target-file-size-bytes` is smaller than a single row, bin
    packing computed a 0-row chunk size and PyArrow raised a ValueError.
    This change clamps the chunk size to at least 1, so writes still succeed
    (one row per batch/file when needed).
    
    ## Why is this needed?
    Fixes a crash when users set a small target file size and attempt to
    write large records.
    
    ## How was this tested?
    - make lint
    - uv run python -m pytest tests/io/test_pyarrow.py -k "bin_pack_arrow_table" -v
    - make test (timed out at ~42%)
    
    Closes #2795
    
    Co-authored-by: Soham <[email protected]>
---
 pyiceberg/io/pyarrow.py  | 2 +-
 tests/io/test_pyarrow.py | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index 1077f41f..efeb72cb 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -2681,7 +2681,7 @@ def bin_pack_arrow_table(tbl: pa.Table, target_file_size: int) -> Iterator[list[
     from pyiceberg.utils.bin_packing import PackingIterator
 
     avg_row_size_bytes = tbl.nbytes / tbl.num_rows
-    target_rows_per_file = target_file_size // avg_row_size_bytes
+    target_rows_per_file = max(1, int(target_file_size / avg_row_size_bytes))
     batches = tbl.to_batches(max_chunksize=target_rows_per_file)
     bin_packed_record_batches = PackingIterator(
         items=batches,
diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py
index 869e60f4..ea2928ca 100644
--- a/tests/io/test_pyarrow.py
+++ b/tests/io/test_pyarrow.py
@@ -2248,6 +2248,12 @@ def test_bin_pack_arrow_table(arrow_table_with_null: pa.Table) -> None:
     assert len(list(bin_packed)) == 5
 
 
+def 
test_bin_pack_arrow_table_target_size_smaller_than_row(arrow_table_with_null: 
pa.Table) -> None:
+    bin_packed = list(bin_pack_arrow_table(arrow_table_with_null, 
target_file_size=1))
+    assert len(bin_packed) == arrow_table_with_null.num_rows
+    assert sum(batch.num_rows for bin_ in bin_packed for batch in bin_) == 
arrow_table_with_null.num_rows
+
+
 def test_schema_mismatch_type(table_schema_simple: Schema) -> None:
     other_schema = pa.schema(
         (

Reply via email to