[ 
https://issues.apache.org/jira/browse/ARROW-12054?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17307041#comment-17307041
 ] 

Taras Kuzyo commented on ARROW-12054:
-------------------------------------

Actually, it is even weirder:

 
{code:java}
import decimal
import pyarrow as pa
import pyarrow.parquet as pq

def pad(b):
    """Sign-extend a little-endian two's-complement byte string to 16 bytes.

    The last byte of ``b`` is treated as the most significant one. If its
    high bit is clear, the value is extended with ``0x00`` bytes; otherwise
    it is extended with ``0xff`` bytes (two's-complement rules). The fill
    bytes are appended on the right via ``bytes.ljust``, i.e. on the
    most-significant side in little-endian order.

    Raises IndexError if ``b`` is empty.
    """
    fill = b'\xff' if b[-1] & 128 else b'\x00'
    return b.ljust(16, fill)

def to_pyarrow_bytes(b):
    """Turn a big-endian value (parquet's repr) into arrow's 16-byte form.

    The byte order is reversed to little-endian (arrow's repr) and the
    result is then handed to ``pad`` to be extended to 16 bytes.
    """
    little_endian = b[::-1]
    return pad(little_endian)

def decode_stats_decimal(b):
    """Decode one raw statistics value as a ``decimal128(12, 4)`` scalar.

    ``b`` is converted with ``to_pyarrow_bytes``, wrapped in an arrow
    buffer, and viewed as a one-element ``decimal128(12, 4)`` array; the
    Python value of that single element is returned.
    """
    data = pa.py_buffer(to_pyarrow_bytes(b))
    # Buffer list is [None, data]; None presumably means "no validity
    # bitmap" for a primitive array -- confirm against the pyarrow docs.
    single = pa.Array.from_buffers(pa.decimal128(12, 4), 1, [None, data], 0)
    first = single[0]
    return first.as_py()

# Build four decimal128(12, 4) columns, each derived from range(10, 30).
context = decimal.Context(prec=12)
numbers = [context.create_decimal(x / 10) for x in range(10, 30)]
a = pa.array(numbers, pa.decimal128(12, 4))
numbers = [context.create_decimal(x - 10) for x in range(10, 30)]
b = pa.array(numbers, pa.decimal128(12, 4))
numbers = [context.create_decimal(x % 20) for x in range(10, 30)]
c = pa.array(numbers, pa.decimal128(12, 4))
numbers = [context.create_decimal(x % 10) for x in range(10, 30)]
d = pa.array(numbers, pa.decimal128(12, 4))

# Write the table to a Parquet file so its column statistics can be read back.
table = pa.Table.from_arrays([a, b, c, d], ['A', 'B', 'C', 'D'])
with pq.ParquetWriter('test.parquet', table.schema) as writer:
    writer.write_table(table)

# Compare the min/max recorded in the row-group statistics with the min/max
# computed from the data actually stored in that row group.
reader = pq.ParquetFile('test.parquet')
for rowgroup in range(reader.num_row_groups):
    meta = reader.metadata.row_group(rowgroup)
    for i in range(meta.num_columns):
        print(f'Column {i}')
        min_price = meta.column(i).statistics.min
        max_price = meta.column(i).statistics.max
        # Read the same row group whose metadata is being inspected, so the
        # comparison stays valid for files with more than one row group.
        df = reader.read_row_group(rowgroup).column(i).to_pandas()
        actual_min, actual_max = df.min(), df.max()
        print(f'Min decoded: {decode_stats_decimal(min_price)} '
              f'Min actual: {actual_min}')
        print(f'Max decoded: {decode_stats_decimal(max_price)} '
              f'Max actual: {actual_max}')
{code}
 

The output:
{noformat}
Column 0
Min decoded: 1.0000 Min actual: 1.0000
Max decoded: 2.9000 Max actual: 2.9000
Column 1
Min decoded: 4.0000 Min actual: 0.0000
Max decoded: 16.0000 Max actual: 19.0000
Column 2
Min decoded: 4.0000 Min actual: 0.0000
Max decoded: 16.0000 Max actual: 19.0000
Column 3
Min decoded: 4.0000 Min actual: 0.0000
Max decoded: 9.0000 Max actual: 9.0000{noformat}

> [C++] Parquet statistics incorrect for decimal128
> -------------------------------------------------
>
>                 Key: ARROW-12054
>                 URL: https://issues.apache.org/jira/browse/ARROW-12054
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: C++
>    Affects Versions: 3.0.0
>            Reporter: Weston Pace
>            Priority: Major
>
> {code:java}
> import decimal
> import pyarrow as pa
> import pyarrow.parquet as pq
> dtype = pa.decimal128(12, 4)
> ctx = decimal.Context(prec=12)
> arr = pa.array([0, ctx.create_decimal(3.99)], dtype)
> table = pa.Table.from_arrays([arr], ["foo"])
> pq.write_table(table, '/tmp/foo.pq')
> meta = pq.read_metadata('/tmp/foo.pq')
> print(meta.row_group(0).column(0).statistics)
> {code}
> Expected 0 to be the min and 3.99 to be the max but got the reverse.



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to