jorisvandenbossche opened a new issue, #40428: URL: https://github.com/apache/arrow/issues/40428
See eg https://github.com/ursacomputing/crossbow/actions/runs/8104554803/job/22151387751 ``` ___________________________ test_filters_equivalency ___________________________ tempdir = PosixPath('/tmp/pytest-of-root/pytest-0/test_filters_equivalency0') @pytest.mark.pandas def test_filters_equivalency(tempdir): local = LocalFileSystem() base_path = tempdir integer_keys = [0, 1] string_keys = ['a', 'b', 'c'] boolean_keys = [True, False] partition_spec = [ ['integer', integer_keys], ['string', string_keys], ['boolean', boolean_keys] ] df = pd.DataFrame({ 'integer': np.array(integer_keys, dtype='i4').repeat(15), 'string': np.tile(np.tile(np.array(string_keys, dtype=object), 5), 2), 'boolean': np.tile(np.tile(np.array(boolean_keys, dtype='bool'), 5), 3), }, columns=['integer', 'string', 'boolean']) _generate_partition_directories(local, base_path, partition_spec, df) # Old filters syntax: # integer == 1 AND string != b AND boolean == True dataset = pq.ParquetDataset( base_path, filesystem=local, filters=[('integer', '=', 1), ('string', '!=', 'b'), ('boolean', '==', 'True')], ) table = dataset.read() result_df = (table.to_pandas().reset_index(drop=True)) assert 0 not in result_df['integer'].values assert 'b' not in result_df['string'].values assert False not in result_df['boolean'].values # filters in disjunctive normal form: # (integer == 1 AND string != b AND boolean == True) OR # (integer == 2 AND boolean == False) # TODO(ARROW-3388): boolean columns are reconstructed as string filters = [ [ ('integer', '=', 1), ('string', '!=', 'b'), ('boolean', '==', 'True') ], [('integer', '=', 0), ('boolean', '==', 'False')] ] dataset = pq.ParquetDataset( base_path, filesystem=local, filters=filters) table = dataset.read() result_df = table.to_pandas().reset_index(drop=True) # Check that all rows in the DF fulfill the filter df_filter_1 = (result_df['integer'] == 1) \ & (result_df['string'] != 'b') \ & (result_df['boolean'] == 'True') df_filter_2 = (np.array(result_df['integer']) 
== 0) \ & (result_df['boolean'] == 'False') > assert df_filter_1.sum() > 0 E assert np.int64(0) > 0 E + where np.int64(0) = <bound method Series.sum of Series([], dtype: bool)>() E + where <bound method Series.sum of Series([], dtype: bool)> = Series([], dtype: bool).sum opt/conda/envs/arrow/lib/python3.10/site-packages/pyarrow/tests/parquet/test_dataset.py:153: AssertionError __________________________ test_filters_inclusive_set __________________________ tempdir = PosixPath('/tmp/pytest-of-root/pytest-0/test_filters_inclusive_set0') @pytest.mark.pandas def test_filters_inclusive_set(tempdir): local = LocalFileSystem() base_path = tempdir integer_keys = [0, 1] string_keys = ['a', 'b', 'c'] boolean_keys = [True, False] partition_spec = [ ['integer', integer_keys], ['string', string_keys], ['boolean', boolean_keys] ] df = pd.DataFrame({ 'integer': np.array(integer_keys, dtype='i4').repeat(15), 'string': np.tile(np.tile(np.array(string_keys, dtype=object), 5), 2), 'boolean': np.tile(np.tile(np.array(boolean_keys, dtype='bool'), 5), 3), }, columns=['integer', 'string', 'boolean']) _generate_partition_directories(local, base_path, partition_spec, df) dataset = pq.ParquetDataset( base_path, filesystem=local, filters=[('string', 'in', 'ab')], ) table = dataset.read() result_df = (table.to_pandas().reset_index(drop=True)) > assert 'a' in result_df['string'].values E AssertionError: assert 'a' in [], Categories (3, object): ['a', 'b', 'c'] E + where [], Categories (3, object): ['a', 'b', 'c'] = Series([], Name: string, dtype: category\nCategories (3, object): ['a', 'b', 'c']).values opt/conda/envs/arrow/lib/python3.10/site-packages/pyarrow/tests/parquet/test_dataset.py:328: AssertionError ``` From debugging the failure, it seems this is due to pandas changing a filter operation to sometimes preserve a RangeIndex now instead of returning an Integer64Index. 
The conversion to Arrow then changes accordingly: by default a RangeIndex is stored as metadata only, whereas an integer index is materialized as a regular column. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@arrow.apache.org.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org