do-me commented on issue #14229: URL: https://github.com/apache/arrow/issues/14229#issuecomment-1826259855
Same here. I'm trying to load a 6GB parquet file with 3 cols two string cols and one with embeddings (array) in pandas with ```python df = pd.read_parquet("test.parquet") ``` ``` File size in bytes: 6207538015 bytes File size in kilobytes: 6062048.84 KB ``` Tried with pandas 2.0.3 and latest 2.1.3 on Windows (32Gb RAM) and Ubuntu (128Gb RAM): ``` --------------------------------------------------------------------------- OSError Traceback (most recent call last) File <timed exec>:4 File ~/anaconda3/lib/python3.11/site-packages/pandas/io/parquet.py:509, in read_parquet(path, engine, columns, storage_options, use_nullable_dtypes, dtype_backend, **kwargs) 506 use_nullable_dtypes = False 507 check_dtype_backend(dtype_backend) --> 509 return impl.read( 510 path, 511 columns=columns, 512 storage_options=storage_options, 513 use_nullable_dtypes=use_nullable_dtypes, 514 dtype_backend=dtype_backend, 515 **kwargs, 516 ) File ~/anaconda3/lib/python3.11/site-packages/pandas/io/parquet.py:227, in PyArrowImpl.read(self, path, columns, use_nullable_dtypes, dtype_backend, storage_options, **kwargs) 220 path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle( 221 path, 222 kwargs.pop("filesystem", None), 223 storage_options=storage_options, 224 mode="rb", 225 ) 226 try: --> 227 pa_table = self.api.parquet.read_table( 228 path_or_handle, columns=columns, **kwargs 229 ) 230 result = pa_table.to_pandas(**to_pandas_kwargs) 232 if manager == "array": File ~/anaconda3/lib/python3.11/site-packages/pyarrow/parquet/core.py:2973, in read_table(source, columns, use_threads, metadata, schema, use_pandas_metadata, read_dictionary, memory_map, buffer_size, partitioning, filesystem, filters, use_legacy_dataset, ignore_prefixes, pre_buffer, coerce_int96_timestamp_unit, decryption_properties, thrift_string_size_limit, thrift_container_size_limit) 2962 # TODO test that source is not a directory or a list 2963 dataset = ParquetFile( 2964 source, metadata=metadata, 
read_dictionary=read_dictionary, 2965 memory_map=memory_map, buffer_size=buffer_size, (...) 2970 thrift_container_size_limit=thrift_container_size_limit, 2971 ) -> 2973 return dataset.read(columns=columns, use_threads=use_threads, 2974 use_pandas_metadata=use_pandas_metadata) 2976 warnings.warn( 2977 "Passing 'use_legacy_dataset=True' to get the legacy behaviour is " 2978 "deprecated as of pyarrow 8.0.0, and the legacy implementation will " 2979 "be removed in a future version.", 2980 FutureWarning, stacklevel=2) 2982 if ignore_prefixes is not None: File ~/anaconda3/lib/python3.11/site-packages/pyarrow/parquet/core.py:2601, in _ParquetDatasetV2.read(self, columns, use_threads, use_pandas_metadata) 2593 index_columns = [ 2594 col for col in _get_pandas_index_columns(metadata) 2595 if not isinstance(col, dict) 2596 ] 2597 columns = ( 2598 list(columns) + list(set(index_columns) - set(columns)) 2599 ) -> 2601 table = self._dataset.to_table( 2602 columns=columns, filter=self._filter_expression, 2603 use_threads=use_threads 2604 ) 2606 # if use_pandas_metadata, restore the pandas metadata (which gets 2607 # lost if doing a specific `columns` selection in to_table) 2608 if use_pandas_metadata: File ~/anaconda3/lib/python3.11/site-packages/pyarrow/_dataset.pyx:369, in pyarrow._dataset.Dataset.to_table() File ~/anaconda3/lib/python3.11/site-packages/pyarrow/_dataset.pyx:2818, in pyarrow._dataset.Scanner.to_table() File ~/anaconda3/lib/python3.11/site-packages/pyarrow/error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status() File ~/anaconda3/lib/python3.11/site-packages/pyarrow/error.pxi:115, in pyarrow.lib.check_status() OSError: List index overflow. ``` The weird thing is that I have processed 20 of these files with different file sizes, and even bigger ones than this one (7GB) worked. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org