jorisvandenbossche opened a new issue, #40428:
URL: https://github.com/apache/arrow/issues/40428

   See e.g. 
https://github.com/ursacomputing/crossbow/actions/runs/8104554803/job/22151387751
   
   ```
   ___________________________ test_filters_equivalency 
___________________________
   
   tempdir = PosixPath('/tmp/pytest-of-root/pytest-0/test_filters_equivalency0')
   
       @pytest.mark.pandas
       def test_filters_equivalency(tempdir):
           local = LocalFileSystem()
           base_path = tempdir
       
           integer_keys = [0, 1]
           string_keys = ['a', 'b', 'c']
           boolean_keys = [True, False]
           partition_spec = [
               ['integer', integer_keys],
               ['string', string_keys],
               ['boolean', boolean_keys]
           ]
       
           df = pd.DataFrame({
               'integer': np.array(integer_keys, dtype='i4').repeat(15),
               'string': np.tile(np.tile(np.array(string_keys, dtype=object), 
5), 2),
               'boolean': np.tile(np.tile(np.array(boolean_keys, dtype='bool'), 
5),
                                  3),
           }, columns=['integer', 'string', 'boolean'])
       
           _generate_partition_directories(local, base_path, partition_spec, df)
       
           # Old filters syntax:
           #  integer == 1 AND string != b AND boolean == True
           dataset = pq.ParquetDataset(
               base_path, filesystem=local,
               filters=[('integer', '=', 1), ('string', '!=', 'b'),
                        ('boolean', '==', 'True')],
           )
           table = dataset.read()
           result_df = (table.to_pandas().reset_index(drop=True))
       
           assert 0 not in result_df['integer'].values
           assert 'b' not in result_df['string'].values
           assert False not in result_df['boolean'].values
       
           # filters in disjunctive normal form:
           #  (integer == 1 AND string != b AND boolean == True) OR
           #  (integer == 2 AND boolean == False)
           # TODO(ARROW-3388): boolean columns are reconstructed as string
           filters = [
               [
                   ('integer', '=', 1),
                   ('string', '!=', 'b'),
                   ('boolean', '==', 'True')
               ],
               [('integer', '=', 0), ('boolean', '==', 'False')]
           ]
           dataset = pq.ParquetDataset(
               base_path, filesystem=local, filters=filters)
           table = dataset.read()
           result_df = table.to_pandas().reset_index(drop=True)
       
           # Check that all rows in the DF fulfill the filter
           df_filter_1 = (result_df['integer'] == 1) \
               & (result_df['string'] != 'b') \
               & (result_df['boolean'] == 'True')
           df_filter_2 = (np.array(result_df['integer']) == 0) \
               & (result_df['boolean'] == 'False')
   >       assert df_filter_1.sum() > 0
   E       assert np.int64(0) > 0
   E        +  where np.int64(0) = <bound method Series.sum of Series([], 
dtype: bool)>()
   E        +    where <bound method Series.sum of Series([], dtype: bool)> = 
Series([], dtype: bool).sum
   
   
opt/conda/envs/arrow/lib/python3.10/site-packages/pyarrow/tests/parquet/test_dataset.py:153:
 AssertionError
   __________________________ test_filters_inclusive_set 
__________________________
   
   tempdir = 
PosixPath('/tmp/pytest-of-root/pytest-0/test_filters_inclusive_set0')
   
       @pytest.mark.pandas
       def test_filters_inclusive_set(tempdir):
           local = LocalFileSystem()
           base_path = tempdir
       
           integer_keys = [0, 1]
           string_keys = ['a', 'b', 'c']
           boolean_keys = [True, False]
           partition_spec = [
               ['integer', integer_keys],
               ['string', string_keys],
               ['boolean', boolean_keys]
           ]
       
           df = pd.DataFrame({
               'integer': np.array(integer_keys, dtype='i4').repeat(15),
               'string': np.tile(np.tile(np.array(string_keys, dtype=object), 
5), 2),
               'boolean': np.tile(np.tile(np.array(boolean_keys, dtype='bool'), 
5),
                                  3),
           }, columns=['integer', 'string', 'boolean'])
       
           _generate_partition_directories(local, base_path, partition_spec, df)
       
           dataset = pq.ParquetDataset(
               base_path, filesystem=local,
               filters=[('string', 'in', 'ab')],
           )
           table = dataset.read()
           result_df = (table.to_pandas().reset_index(drop=True))
       
   >       assert 'a' in result_df['string'].values
   E       AssertionError: assert 'a' in [], Categories (3, object): ['a', 'b', 
'c']
   E        +  where [], Categories (3, object): ['a', 'b', 'c'] = Series([], 
Name: string, dtype: category\nCategories (3, object): ['a', 'b', 'c']).values
   
   
opt/conda/envs/arrow/lib/python3.10/site-packages/pyarrow/tests/parquet/test_dataset.py:328:
 AssertionError
   ```
   
   From debugging the failure, it seems this is caused by a change in pandas: 
a filter operation now sometimes preserves a RangeIndex instead of returning 
an Int64Index. The conversion to Arrow differs depending on the index type: 
a RangeIndex is stored as metadata only by default, whereas an integer index 
becomes a column.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@arrow.apache.org.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to