emkornfield commented on a change in pull request #11385: URL: https://github.com/apache/arrow/pull/11385#discussion_r727665157
########## File path: python/pyarrow/tests/parquet/test_parquet_file.py ########## @@ -274,3 +274,75 @@ def test_pre_buffer(pre_buffer): buf.seek(0) pf = pq.ParquetFile(buf, pre_buffer=pre_buffer) assert pf.read().num_rows == N + + +def test_column_selection(tempdir): + # create a table with nested types + inner = pa.field('inner', pa.int64()) + middle = pa.field('middle', pa.struct([inner])) + fields = [ + pa.field('basic', pa.int32()), + pa.field( + 'list', pa.list_(pa.field('item', pa.int32())) + ), + pa.field( + 'struct', pa.struct([middle, pa.field('inner2', pa.int64())]) + ), + pa.field( + 'list-struct', pa.list_(pa.field( + 'item', pa.struct([ + pa.field('inner1', pa.int64()), + pa.field('inner2', pa.int64()) + ]) + )) + ), + pa.field('basic2', pa.int64()), + ] + arrs = [ + [0], [[1, 2]], [{"middle": {"inner": 3}, "inner2": 4}], + [[{"inner1": 5, "inner2": 6}, {"inner1": 7, "inner2": 8}]], [9]] + table = pa.table(arrs, schema=pa.schema(fields)) + + path = str(tempdir / 'test.parquet') + pq.write_table(table, path) + pf = pq.ParquetFile(path) + + # default selecting all columns + result1 = pf.read() + assert result1.equals(table) + + # selecting with columns names + result2 = pf.read(columns=["basic", "basic2"]) + assert result2.equals(table.select(["basic", "basic2"])) + + result3 = pf.read(columns=["list", "struct", "basic2"]) + assert result3.equals(table.select(["list", "struct", "basic2"])) + + # using dotted paths + result4 = pf.read(columns=["struct.middle.inner"]) + expected4 = pa.table({"struct": [{"middle": {"inner": 3}}]}) + assert result4.equals(expected4) + + result5 = pf.read(columns=["struct.inner2"]) + expected5 = pa.table({"struct": [{"inner2": 4}]}) + assert result5.equals(expected5) + + result6 = pf.read( + columns=["list", "struct.middle.inner", "struct.inner2"] + ) + assert result6.equals(table.select(["list", "struct"])) + + # dotted paths for lists with or without list.item + + # result7 = 
pf.read(columns=["list-struct.list.item.inner1"]) + # expected7 = pa.table({"list-struct": [[{"inner1": 5}, {"inner1": 7}]]}) + # assert result7.equals(expected7) + + # result7b = pf.read(columns=["list-struct.inner1"]) + # assert result7b.equals(expected7) Review comment: It's been merged now. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org