[ 
https://issues.apache.org/jira/browse/ARROW-8057?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Wes McKinney reassigned ARROW-8057:
-----------------------------------

    Assignee: Wes McKinney

> [C++] Schema equality not roundtrip safe through Parquet
> --------------------------------------------------------
>
>                 Key: ARROW-8057
>                 URL: https://issues.apache.org/jira/browse/ARROW-8057
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: C++, Python
>            Reporter: Florian Jetter
>            Assignee: Wes McKinney
>            Priority: Major
>
> When performing schema roundtrips, the equality check for fields breaks. This 
> is a regression from PyArrow 0.16.0.
> The equality check for entire schemas has never worked (but should, from my 
> point of view).
> {code:python}
> import pyarrow.parquet as pq
> import pyarrow as pa
> print(pa.__version__)
> fields = [
>     pa.field("bool", pa.bool_()),
>     pa.field("byte", pa.binary()),
>     pa.field("date", pa.date32()),
>     pa.field("datetime64", pa.timestamp("us")),
>     pa.field("float32", pa.float64()),
>     pa.field("float64", pa.float64()),
>     pa.field("int16", pa.int64()),
>     pa.field("int32", pa.int64()),
>     pa.field("int64", pa.int64()),
>     pa.field("int8", pa.int64()),
>     pa.field("null", pa.null()),
>     pa.field("uint16", pa.uint64()),
>     pa.field("uint32", pa.uint64()),
>     pa.field("uint64", pa.uint64()),
>     pa.field("uint8", pa.uint64()),
>     pa.field("unicode", pa.string()),
>     pa.field("array_float32", pa.list_(pa.float64())),
>     pa.field("array_float64", pa.list_(pa.float64())),
>     pa.field("array_int16", pa.list_(pa.int64())),
>     pa.field("array_int32", pa.list_(pa.int64())),
>     pa.field("array_int64", pa.list_(pa.int64())),
>     pa.field("array_int8", pa.list_(pa.int64())),
>     pa.field("array_uint16", pa.list_(pa.uint64())),
>     pa.field("array_uint32", pa.list_(pa.uint64())),
>     pa.field("array_uint64", pa.list_(pa.uint64())),
>     pa.field("array_uint8", pa.list_(pa.uint64())),
>     pa.field("array_unicode", pa.list_(pa.string())),
> ]
> schema = pa.schema(fields)
> buf = pa.BufferOutputStream()
> pq.write_metadata(schema, buf)
> reader = pa.BufferReader(buf.getvalue().to_pybytes())
> reconstructed_schema = pq.read_schema(reader)
> assert reconstructed_schema == reconstructed_schema
> assert reconstructed_schema[0] == reconstructed_schema[0]
> # This breaks on master / regression from 0.16.0 
> assert schema[0] == reconstructed_schema[0]
> # This never worked but should
> assert reconstructed_schema == schema
> assert schema == reconstructed_schema
> {code}



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to