kevinjqliu commented on code in PR #1983:
URL: https://github.com/apache/iceberg-python/pull/1983#discussion_r2081953874
##########
pyiceberg/io/pyarrow.py:
##########
@@ -636,7 +636,13 @@ def visit_fixed(self, fixed_type: FixedType) ->
pa.DataType:
return pa.binary(len(fixed_type))
def visit_decimal(self, decimal_type: DecimalType) -> pa.DataType:
- return pa.decimal128(decimal_type.precision, decimal_type.scale)
+ return (
+ pa.decimal32(decimal_type.precision, decimal_type.scale)
+ if decimal_type.precision <= 9
+ else pa.decimal64(decimal_type.precision, decimal_type.scale)
+ if decimal_type.precision <= 18
+ else pa.decimal128(decimal_type.precision, decimal_type.scale)
Review Comment:
"""
Scale is fixed, precision must be 38 or less
"""
from https://iceberg.apache.org/spec/#primitive-types

##########
pyiceberg/io/pyarrow.py:
##########
@@ -2442,7 +2448,9 @@ def write_parquet(task: WriteTask) -> DataFile:
)
fo = io.new_output(file_path)
with fo.create(overwrite=True) as fos:
- with pq.ParquetWriter(fos, schema=arrow_table.schema,
**parquet_writer_kwargs) as writer:
+ with pq.ParquetWriter(
+ fos, schema=arrow_table.schema, store_decimal_as_integer=True,
**parquet_writer_kwargs
Review Comment:
"""
By default, this is DISABLED and all decimal types annotate
fixed_len_byte_array. When enabled, the writer will use the following physical
types to store decimals:
- int32: for 1 <= precision <= 9.
- int64: for 10 <= precision <= 18.
- fixed_len_byte_array: for precision > 18.
"""
from https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html

##########
pyiceberg/io/pyarrow.py:
##########
@@ -636,7 +636,13 @@ def visit_fixed(self, fixed_type: FixedType) ->
pa.DataType:
return pa.binary(len(fixed_type))
def visit_decimal(self, decimal_type: DecimalType) -> pa.DataType:
- return pa.decimal128(decimal_type.precision, decimal_type.scale)
+ return (
+ pa.decimal32(decimal_type.precision, decimal_type.scale)
+ if decimal_type.precision <= 9
+ else pa.decimal64(decimal_type.precision, decimal_type.scale)
+ if decimal_type.precision <= 18
+ else pa.decimal128(decimal_type.precision, decimal_type.scale)
Review Comment:
`pyarrow.decimal128` supports precision up to 38, which is also the maximum precision Iceberg allows for decimals:
https://arrow.apache.org/docs/python/generated/pyarrow.decimal128.html#pyarrow-decimal128
##########
pyiceberg/io/pyarrow.py:
##########
@@ -2442,7 +2448,9 @@ def write_parquet(task: WriteTask) -> DataFile:
)
fo = io.new_output(file_path)
with fo.create(overwrite=True) as fos:
- with pq.ParquetWriter(fos, schema=arrow_table.schema,
**parquet_writer_kwargs) as writer:
+ with pq.ParquetWriter(
+ fos, schema=arrow_table.schema, store_decimal_as_integer=True,
**parquet_writer_kwargs
Review Comment:
This matches the Parquet physical-type mapping for decimals defined in the Iceberg spec:
https://iceberg.apache.org/spec/#parquet

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]