syun64 commented on code in PR #902:
URL: https://github.com/apache/iceberg-python/pull/902#discussion_r1669524329
##########
pyiceberg/io/pyarrow.py:
##########
@@ -1268,14 +1265,8 @@ def __init__(self, file_schema: Schema):
def _cast_if_needed(self, field: NestedField, values: pa.Array) ->
pa.Array:
file_field = self.file_schema.find_field(field.field_id)
- if field.field_type.is_primitive:
- if field.field_type != file_field.field_type:
- return
values.cast(schema_to_pyarrow(promote(file_field.field_type, field.field_type),
include_field_ids=False))
- elif (target_type := schema_to_pyarrow(field.field_type,
include_field_ids=False)) != values.type:
Review Comment:
Removing this casting logic causes errors when writing the Parquet
files:
```
> for data_file in
_dataframe_to_data_files(table_metadata=txn.table_metadata, df=pa_table,
io=txn._table.io):
tests/integration/test_writes/test_writes.py:732:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _ _ _ _ _ _ _ _ _ _ _
pyiceberg/table/__init__.py:2944: in _dataframe_to_data_files
yield from write_file(
/usr/local/python/3.10.13/lib/python3.10/concurrent/futures/_base.py:621: in
result_iterator
yield _result_or_cancel(fs.pop())
/usr/local/python/3.10.13/lib/python3.10/concurrent/futures/_base.py:319: in
_result_or_cancel
return fut.result(timeout)
/usr/local/python/3.10.13/lib/python3.10/concurrent/futures/_base.py:458: in
result
return self.__get_result()
/usr/local/python/3.10.13/lib/python3.10/concurrent/futures/_base.py:403: in
__get_result
raise self._exception
/usr/local/python/3.10.13/lib/python3.10/concurrent/futures/thread.py:58: in
run
result = self.fn(*self.args, **self.kwargs)
pyiceberg/io/pyarrow.py:1915: in write_parquet
writer.write(arrow_table, row_group_size=row_group_size)
/home/codespace/.cache/pypoetry/virtualenvs/pyiceberg-FsHa-ZgB-py3.10/lib/python3.10/site-packages/pyarrow/parquet/core.py:1052:
in write
self.write_table(table_or_batch, row_group_size)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <pyarrow.parquet.core.ParquetWriter object at 0x7f79a96cc820>, table
= pyarrow.Table
foo: string
----
foo: [["a",null,"z"]], row_group_size = 134217728
def write_table(self, table, row_group_size=None):
"""
Write Table to the Parquet file.
Parameters
----------
table : Table
row_group_size : int, default None
Maximum number of rows in each written row group. If None,
the row group size will be the minimum of the Table size
and 1024 * 1024. If set larger than 64Mi then 64Mi will
be used instead.
"""
if self.schema_changed:
table = _sanitize_table(table, self.schema, self.flavor)
assert self.is_open
if not table.schema.equals(self.schema, check_metadata=False):
msg = ('Table schema does not match schema used to create file: '
'\ntable:\n{!s} vs. \nfile:\n{!s}'
.format(table.schema, self.schema))
> raise ValueError(msg)
E ValueError: Table schema does not match schema used to create
file:
E table:
E foo: string vs.
E file:
E foo: large_string
E -- field metadata --
E PARQUET:field_id: '1'
/home/codespace/.cache/pypoetry/virtualenvs/pyiceberg-FsHa-ZgB-py3.10/lib/python3.10/site-packages/pyarrow/parquet/core.py:1094:
ValueError
===============================================================================
short test summary info
================================================================================
FAILED
tests/integration/test_writes/test_writes.py::test_create_table_transaction[session_catalog_hive-1]
- ValueError: Table schema does not match schema used to create file:
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]