[ https://issues.apache.org/jira/browse/ARROW-12054?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17307041#comment-17307041 ]
Taras Kuzyo commented on ARROW-12054:
-------------------------------------

Actually, it is even weirder
{code:java}
import decimal
import pyarrow as pa
import pyarrow.parquet as pq

def pad(b):
    # Left pad 0 or 1 based on leading digit (2's complement rules)
    if b[-1] & 128 == 0:
        return b.ljust(16, b'\x00')
    else:
        return b.ljust(16, b'\xff')

def to_pyarrow_bytes(b):
    # converts from big-endian (parquet's repr) to little endian (arrow's repr)
    # and then pads to 16 bytes
    return pad(b[::-1])

def decode_stats_decimal(b):
    pyarrow_bytes = to_pyarrow_bytes(b)
    arr = pa.Array.from_buffers(pa.decimal128(12, 4), 1, [None, pa.py_buffer(pyarrow_bytes)], 0)
    return arr[0].as_py()

context = decimal.Context(prec=12)
numbers = [context.create_decimal(x/10) for x in range(10, 30)]
a = pa.array(numbers, pa.decimal128(12, 4))
numbers = [context.create_decimal(x - 10) for x in range(10, 30)]
b = pa.array(numbers, pa.decimal128(12, 4))
numbers = [context.create_decimal(x % 20) for x in range(10, 30)]
c = pa.array(numbers, pa.decimal128(12, 4))
numbers = [context.create_decimal(x % 10) for x in range(10, 30)]
d = pa.array(numbers, pa.decimal128(12, 4))
table = pa.Table.from_arrays([a, b, c, d], ['A', 'B', 'C', 'D'])

with pq.ParquetWriter('test.parquet', table.schema) as writer:
    writer.write_table(table)

reader = pq.ParquetFile('test.parquet')
for rowgroup in range(reader.num_row_groups):
    meta = reader.metadata.row_group(rowgroup)
    for i in range(meta.num_columns):
        print(f'Column {i}')
        min_price = meta.column(i).statistics.min
        max_price = meta.column(i).statistics.max
        df = reader.read_row_group(0).column(i).to_pandas()
        actual_min, actual_max = df.min(), df.max()
        print(f'Min decoded: {decode_stats_decimal(min_price)} Min actual: {actual_min}')
        print(f'Max decoded: {decode_stats_decimal(max_price)} Max actual: {actual_max}')
{code}
The output:
{noformat}
Column 0
Min decoded: 1.0000 Min actual: 1.0000
Max decoded: 2.9000 Max actual: 2.9000
Column 1
Min decoded: 4.0000 Min actual: 0.0000
Max decoded: 16.0000 Max actual: 19.0000
Column 2
Min decoded: 4.0000 Min actual: 0.0000
Max decoded: 16.0000 Max actual: 19.0000
Column 3
Min decoded: 4.0000 Min actual: 0.0000
Max decoded: 9.0000 Max actual: 9.0000
{noformat}

> [C++] Parquet statistics incorrect for decimal128
> -------------------------------------------------
>
> Key: ARROW-12054
> URL: https://issues.apache.org/jira/browse/ARROW-12054
> Project: Apache Arrow
> Issue Type: Bug
> Components: C++
> Affects Versions: 3.0.0
> Reporter: Weston Pace
> Priority: Major
>
> {code:java}
> import decimal
> import pyarrow as pa
> import pyarrow.parquet as pq
> dtype = pa.decimal128(12, 4)
> ctx = decimal.Context(prec=12)
> arr = pa.array([0, ctx.create_decimal(3.99)], dtype)
> table = pa.Table.from_arrays([arr], ["foo"])
> pq.write_table(table, '/tmp/foo.pq')
> meta = pq.read_metadata('/tmp/foo.pq')
> print(meta.row_group(0).column(0).statistics)
> {code}
> Expected 0 to be the min and 3.99 to be the max but got the reverse.

--
This message was sent by Atlassian Jira
(v8.3.4#803005)