do-me commented on issue #14229:
URL: https://github.com/apache/arrow/issues/14229#issuecomment-1826259855

   Same here. I'm trying to load a 6 GB Parquet file with 3 columns (two string columns and one embeddings/array column) into pandas with
   
   ```python
    df = pd.read_parquet("test.parquet")
   ```
   
   ```
   File size in bytes: 6207538015 bytes
   File size in kilobytes: 6062048.84 KB
   ```
   
   Tried with pandas 2.0.3 and the latest 2.1.3 on Windows (32 GB RAM) and Ubuntu (128 GB RAM):
   
   ```
   ---------------------------------------------------------------------------
   OSError                                   Traceback (most recent call last)
   File <timed exec>:4
   
   File ~/anaconda3/lib/python3.11/site-packages/pandas/io/parquet.py:509, in 
read_parquet(path, engine, columns, storage_options, use_nullable_dtypes, 
dtype_backend, **kwargs)
       506     use_nullable_dtypes = False
       507 check_dtype_backend(dtype_backend)
   --> 509 return impl.read(
       510     path,
       511     columns=columns,
       512     storage_options=storage_options,
       513     use_nullable_dtypes=use_nullable_dtypes,
       514     dtype_backend=dtype_backend,
       515     **kwargs,
       516 )
   
   File ~/anaconda3/lib/python3.11/site-packages/pandas/io/parquet.py:227, in 
PyArrowImpl.read(self, path, columns, use_nullable_dtypes, dtype_backend, 
storage_options, **kwargs)
       220 path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
       221     path,
       222     kwargs.pop("filesystem", None),
       223     storage_options=storage_options,
       224     mode="rb",
       225 )
       226 try:
   --> 227     pa_table = self.api.parquet.read_table(
       228         path_or_handle, columns=columns, **kwargs
       229     )
       230     result = pa_table.to_pandas(**to_pandas_kwargs)
       232     if manager == "array":
   
   File ~/anaconda3/lib/python3.11/site-packages/pyarrow/parquet/core.py:2973, 
in read_table(source, columns, use_threads, metadata, schema, 
use_pandas_metadata, read_dictionary, memory_map, buffer_size, partitioning, 
filesystem, filters, use_legacy_dataset, ignore_prefixes, pre_buffer, 
coerce_int96_timestamp_unit, decryption_properties, thrift_string_size_limit, 
thrift_container_size_limit)
      2962         # TODO test that source is not a directory or a list
      2963         dataset = ParquetFile(
      2964             source, metadata=metadata, 
read_dictionary=read_dictionary,
      2965             memory_map=memory_map, buffer_size=buffer_size,
      (...)
      2970             thrift_container_size_limit=thrift_container_size_limit,
      2971         )
   -> 2973     return dataset.read(columns=columns, use_threads=use_threads,
      2974                         use_pandas_metadata=use_pandas_metadata)
      2976 warnings.warn(
      2977     "Passing 'use_legacy_dataset=True' to get the legacy behaviour 
is "
      2978     "deprecated as of pyarrow 8.0.0, and the legacy implementation 
will "
      2979     "be removed in a future version.",
      2980     FutureWarning, stacklevel=2)
      2982 if ignore_prefixes is not None:
   
   File ~/anaconda3/lib/python3.11/site-packages/pyarrow/parquet/core.py:2601, 
in _ParquetDatasetV2.read(self, columns, use_threads, use_pandas_metadata)
      2593         index_columns = [
      2594             col for col in _get_pandas_index_columns(metadata)
      2595             if not isinstance(col, dict)
      2596         ]
      2597         columns = (
      2598             list(columns) + list(set(index_columns) - set(columns))
      2599         )
   -> 2601 table = self._dataset.to_table(
      2602     columns=columns, filter=self._filter_expression,
      2603     use_threads=use_threads
      2604 )
      2606 # if use_pandas_metadata, restore the pandas metadata (which gets
      2607 # lost if doing a specific `columns` selection in to_table)
      2608 if use_pandas_metadata:
   
   File ~/anaconda3/lib/python3.11/site-packages/pyarrow/_dataset.pyx:369, in 
pyarrow._dataset.Dataset.to_table()
   
   File ~/anaconda3/lib/python3.11/site-packages/pyarrow/_dataset.pyx:2818, in 
pyarrow._dataset.Scanner.to_table()
   
   File ~/anaconda3/lib/python3.11/site-packages/pyarrow/error.pxi:144, in 
pyarrow.lib.pyarrow_internal_check_status()
   
   File ~/anaconda3/lib/python3.11/site-packages/pyarrow/error.pxi:115, in 
pyarrow.lib.check_status()
   
   OSError: List index overflow.
   ```
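   
   In case it helps with diagnosing this, my guess (not confirmed) is that the 32-bit list offsets overflow when the whole embeddings column is materialized as a single Arrow chunk, i.e. one chunk ends up with more than 2^31 - 1 list elements. A minimal sketch for checking how many leaf values each row group holds per column (`test.parquet` is the file from above; adjust the path as needed):
   
   ```python
   import pyarrow.parquet as pq
   
   pf = pq.ParquetFile("test.parquet")
   meta = pf.metadata
   
   for rg in range(meta.num_row_groups):
       row_group = meta.row_group(rg)
       for col in range(row_group.num_columns):
           chunk = row_group.column(col)
           # num_values counts the leaf values (individual list elements for
           # the embeddings column), which is what the list offsets index.
           print(rg, chunk.path_in_schema, chunk.num_values)
   ```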
   
   The weird thing is that I processed 20 of these files with different file sizes, and even bigger ones than this one (up to 7 GB) worked.
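   
   I haven't verified it on this file yet, but reading the file row group by row group and converting each piece to pandas separately might avoid holding all of the list offsets in one Arrow chunk. A minimal sketch, assuming the individual row groups are small enough:
   
   ```python
   import pandas as pd
   import pyarrow.parquet as pq
   
   pf = pq.ParquetFile("test.parquet")
   
   # Convert one row group at a time so no single Arrow chunk has to hold
   # all of the list offsets for the embeddings column at once.
   pieces = [
       pf.read_row_group(rg).to_pandas()
       for rg in range(pf.num_row_groups)
   ]
   df = pd.concat(pieces, ignore_index=True)
   ```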

