Jim Pivarski created ARROW-17539: ------------------------------------ Summary: Reading a StructArray column with an ExtensionType causes segfault Key: ARROW-17539 URL: https://issues.apache.org/jira/browse/ARROW-17539 Project: Apache Arrow Issue Type: Bug Components: Python Affects Versions: 9.0.0 Reporter: Jim Pivarski
We can make nested columns in a Parquet file by putting a {{pa.StructArray}} in a {{pa.Table}} and writing that Table to Parquet. We can selectively read back that nested column by specifying it with dot syntax: {{pq.ParquetFile("f.parquet").read_row_groups([0], ["table_column.struct_field"])}} But if the Arrow types are ExtensionTypes, then the above causes a segfault. The segfault depends both on the nested struct field and the ExtensionTypes. Here is a minimally reproducing example of reading a nested struct field without extension types, which does not raise a segfault. (I'm building the {{pa.StructArray}} manually with {{from_buffers}} because I'll have to add the ExtensionTypes in the next example.) {code:python} import numpy as np import pyarrow as pa import pyarrow.parquet as pq one = pa.Array.from_buffers( pa.int64(), 3, [None, pa.py_buffer(np.array([10, 20, 30], dtype=np.int64))], ) two = pa.Array.from_buffers( pa.float64(), 3, [None, pa.py_buffer(np.array([1.1, 2.2, 3.3], dtype=np.float64))], ) record = pa.Array.from_buffers( pa.struct([ pa.field("one", one.type, False), pa.field("two", two.type, False), ]), 3, [None], children=[one, two], ) assert record.to_pylist() == [ {"one": 10, "two": 1.1}, {"one": 20, "two": 2.2}, {"one": 30, "two": 3.3}, ] table = pa.Table.from_arrays([record], names=["column"]) pq.write_table(table, "record.parquet") table2 = pq.ParquetFile("record.parquet").read_row_groups([0], ["column.one"]) assert table2.to_pylist() == [ {"column": {"one": 10}}, {"column": {"one": 20}}, {"column": {"one": 30}}, ] {code} So far, so good; no segfault. 
Next, we define and register an ExtensionType, {code:python} import json class AnnotatedType(pa.ExtensionType): def __init__(self, storage_type, annotation): self.annotation = annotation super().__init__(storage_type, "my:app") def __arrow_ext_serialize__(self): return json.dumps(self.annotation).encode() @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): annotation = json.loads(serialized.decode()) print(storage_type, annotation) return cls(storage_type, annotation) @property def num_buffers(self): return self.storage_type.num_buffers @property def num_fields(self): return self.storage_type.num_fields pa.register_extension_type(AnnotatedType(pa.null(), None)) {code} build the {{pa.StructArray}} again, {code:python} one = pa.Array.from_buffers( AnnotatedType(pa.int64(), {"annotated": "one"}), 3, [None, pa.py_buffer(np.array([10, 20, 30], dtype=np.int64))], ) two = pa.Array.from_buffers( AnnotatedType(pa.float64(), {"annotated": "two"}), 3, [None, pa.py_buffer(np.array([1.1, 2.2, 3.3], dtype=np.float64))], ) record = pa.Array.from_buffers( AnnotatedType( pa.struct([ pa.field("one", one.type, False), pa.field("two", two.type, False), ]), {"annotated": "record"}, ), 3, [None], children=[one, two], ) assert record.to_pylist() == [ {"one": 10, "two": 1.1}, {"one": 20, "two": 2.2}, {"one": 30, "two": 3.3}, ] {code} Now when we write and read this back, there's a segfault: {code:python} table = pa.Table.from_arrays([record], names=["column"]) pq.write_table(table, "record_annotated.parquet") print("before segfault") table2 = pq.ParquetFile("record_annotated.parquet").read_row_groups([0], ["column.one"]) print("after segfault") {code} The output, which prints each annotation as the ExtensionType is deserialized, is {code:java} before segfault int64 {'annotated': 'one'} double {'annotated': 'two'} int64 {'annotated': 'one'} double {'annotated': 'two'} struct<one: extension<my:app<AnnotatedType>> not null, two: extension<my:app<AnnotatedType>> not 
null> {'annotated': 'record'} Segmentation fault (core dumped) {code} Note that if we read back that file, {{record_annotated.parquet}}, without registering the ExtensionType, everything is fine: {code:java} Python 3.9.13 | packaged by conda-forge | (main, May 27 2022, 16:56:21) [GCC 10.3.0] on linux Type "help", "copyright", "credits" or "license" for more information. >>> import pyarrow as pa >>> import pyarrow.parquet as pq >>> table2 = pq.ParquetFile("record_annotated.parquet").read_row_groups([0], ["column.one"]) >>> assert table2.to_pylist() == [ ... {"column": {"one": 10}}, ... {"column": {"one": 20}}, ... {"column": {"one": 30}}, ... ] {code} and if we register the ExtensionType but don't select a column, everything is fine: {code:java} Python 3.9.13 | packaged by conda-forge | (main, May 27 2022, 16:56:21) [GCC 10.3.0] on linux Type "help", "copyright", "credits" or "license" for more information. >>> import pyarrow as pa >>> import pyarrow.parquet as pq >>> import json >>> >>> class AnnotatedType(pa.ExtensionType): ... def __init__(self, storage_type, annotation): ... self.annotation = annotation ... super().__init__(storage_type, "my:app") ... def __arrow_ext_serialize__(self): ... return json.dumps(self.annotation).encode() ... @classmethod ... def __arrow_ext_deserialize__(cls, storage_type, serialized): ... annotation = json.loads(serialized.decode()) ... print(storage_type, annotation) ... return cls(storage_type, annotation) ... @property ... def num_buffers(self): ... return self.storage_type.num_buffers ... @property ... def num_fields(self): ... return self.storage_type.num_fields ... 
>>> pa.register_extension_type(AnnotatedType(pa.null(), None)) >>> >>> table2 = pq.ParquetFile("record_annotated.parquet").read_row_groups([0]) int64 {'annotated': 'one'} double {'annotated': 'two'} int64 {'annotated': 'one'} double {'annotated': 'two'} struct<one: extension<my:app<AnnotatedType>> not null, two: extension<my:app<AnnotatedType>> not null> {'annotated': 'record'} >>> assert table2.to_pylist() == [ ... {"column": {"one": 10, "two": 1.1}}, ... {"column": {"one": 20, "two": 2.2}}, ... {"column": {"one": 30, "two": 3.3}}, ... ] int64 {'annotated': 'one'} double {'annotated': 'two'} struct<one: extension<my:app<AnnotatedType>> not null, two: extension<my:app<AnnotatedType>> not null> {'annotated': 'record'} int64 {'annotated': 'one'} double {'annotated': 'two'} struct<one: extension<my:app<AnnotatedType>> not null, two: extension<my:app<AnnotatedType>> not null> {'annotated': 'record'} int64 {'annotated': 'one'} double {'annotated': 'two'} struct<one: extension<my:app<AnnotatedType>> not null, two: extension<my:app<AnnotatedType>> not null> {'annotated': 'record'} int64 {'annotated': 'one'} double {'annotated': 'two'} struct<one: extension<my:app<AnnotatedType>> not null, two: extension<my:app<AnnotatedType>> not null> {'annotated': 'record'} int64 {'annotated': 'one'} double {'annotated': 'two'} struct<one: extension<my:app<AnnotatedType>> not null, two: extension<my:app<AnnotatedType>> not null> {'annotated': 'record'} int64 {'annotated': 'one'} double {'annotated': 'two'} int64 {'annotated': 'one'} double {'annotated': 'two'} struct<one: extension<my:app<AnnotatedType>> not null, two: extension<my:app<AnnotatedType>> not null> {'annotated': 'record'} int64 {'annotated': 'one'} double {'annotated': 'two'} int64 {'annotated': 'one'} double {'annotated': 'two'} struct<one: extension<my:app<AnnotatedType>> not null, two: extension<my:app<AnnotatedType>> not null> {'annotated': 'record'} int64 {'annotated': 'one'} double {'annotated': 'two'} {code} It's 
just the case of doing both that causes the segfault. -- This message was sent by Atlassian Jira (v8.20.10#820010)