This is an automated email from the ASF dual-hosted git repository.

kevinjqliu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git


The following commit(s) were added to refs/heads/main by this push:
     new bad9cda1 fix: Cast smaller integer types to int32/int64 on write for Spark compatibility (#2799)
bad9cda1 is described below

commit bad9cda1027068cf9720e3bb9f0b24d4279b13e8
Author: Somasundaram Sekar <[email protected]>
AuthorDate: Sun Jan 25 19:36:38 2026 +0100

    fix: Cast smaller integer types to int32/int64 on write for Spark compatibility (#2799)
    
    ## Summary
    - Fixes #2791: Writing smaller integer types (uint8, int8, int16,
    uint16) to Iceberg IntegerType columns now correctly casts to
    int32/int64
    - PyIceberg was preserving original Arrow types in Parquet files,
    causing Spark to fail with `Unsupported logical type: UINT_8`
    - Added integer type widening logic in
    `ArrowProjectionVisitor._cast_if_needed()` following the same pattern as
    existing timestamp handling
    - Only widening conversions are allowed (e.g., uint8 → int32, int32 →
    int64); narrowing conversions continue to be rejected via `promote()`
    
    ## Test plan
    - [x] All 3041 unit tests pass
    - [x] Lint passes
    - [x] New parameterized test covers: uint8, int8, int16, uint16 → int32
    and uint32, int32 → int64
    - [x] Existing `test_projection_filter_add_column_demote` still works
    (narrowing rejection)
    - [x] Manual verification: uint8 data written to IntegerType column
    produces int32 in Parquet file
    
    Closes #2791
    
    Co-authored-by: Somasundaram Sekar <[email protected]>
---
 pyiceberg/io/pyarrow.py  |  9 +++++++++
 tests/io/test_pyarrow.py | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)

diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index 55ecc7ac..d07510d4 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -1910,6 +1910,15 @@ class ArrowProjectionVisitor(SchemaWithPartnerVisitor[pa.Array, pa.Array | None]
                         elif target_type.unit == "us" and values.type.unit in {"s", "ms", "us"}:
                             return values.cast(target_type)
                     raise ValueError(f"Unsupported schema projection from {values.type} to {target_type}")
+                elif isinstance(field.field_type, (IntegerType, LongType)):
+                    # Cast smaller integer types to target type for cross-platform compatibility
+                    # Only allow widening conversions (smaller bit width to larger)
+                    # Narrowing conversions fall through to promote() handling below
+                    if pa.types.is_integer(values.type):
+                        source_width = values.type.bit_width
+                        target_width = target_type.bit_width
+                        if source_width < target_width:
+                            return values.cast(target_type)
 
             if field.field_type != file_field.field_type:
                 target_schema = schema_to_pyarrow(
diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py
index f1ed109d..e815ea9d 100644
--- a/tests/io/test_pyarrow.py
+++ b/tests/io/test_pyarrow.py
@@ -2752,6 +2752,38 @@ def test__to_requested_schema_timestamps_without_downcast_raises_exception(
     assert "Unsupported schema projection from timestamp[ns] to timestamp[us]" in str(exc_info.value)
 
 
[email protected](
+    "arrow_type,iceberg_type,expected_arrow_type",
+    [
+        (pa.uint8(), IntegerType(), pa.int32()),
+        (pa.int8(), IntegerType(), pa.int32()),
+        (pa.int16(), IntegerType(), pa.int32()),
+        (pa.uint16(), IntegerType(), pa.int32()),
+        (pa.uint32(), LongType(), pa.int64()),
+        (pa.int32(), LongType(), pa.int64()),
+    ],
+)
+def test__to_requested_schema_integer_promotion(
+    arrow_type: pa.DataType,
+    iceberg_type: PrimitiveType,
+    expected_arrow_type: pa.DataType,
+) -> None:
+    """Test that smaller integer types are cast to target Iceberg type during write."""
+    requested_schema = Schema(NestedField(1, "col", iceberg_type, required=False))
+    file_schema = requested_schema
+
+    arrow_schema = pa.schema([pa.field("col", arrow_type)])
+    data = pa.array([1, 2, 3, None], type=arrow_type)
+    batch = pa.RecordBatch.from_arrays([data], schema=arrow_schema)
+
+    result = _to_requested_schema(
+        requested_schema, file_schema, batch, downcast_ns_timestamp_to_us=False, include_field_ids=False
+    )
+
+    assert result.schema[0].type == expected_arrow_type
+    assert result.column(0).to_pylist() == [1, 2, 3, None]
+
+
 def test_pyarrow_file_io_fs_by_scheme_cache() -> None:
     # It's better to set up multi-region minio servers for an integration test once `endpoint_url` argument becomes available for `resolve_s3_region`
     # Refer to: https://github.com/apache/arrow/issues/43713

Reply via email to