[ https://issues.apache.org/jira/browse/ARROW-2654?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16612122#comment-16612122 ]
Andy Reagan commented on ARROW-2654: ------------------------------------ I get a different error now on 0.10.0. Here is the full stacktrace: {code:java} ~/projects/2018/03-advisor-recruiting/pyenv/lib/python3.6/site-packages/pandas/io/parquet.py in read_parquet(path, engine, columns, **kwargs) 286 287 impl = get_engine(engine) --> 288 return impl.read(path, columns=columns, **kwargs) ~/projects/2018/03-advisor-recruiting/pyenv/lib/python3.6/site-packages/pandas/io/parquet.py in read(self, path, columns, **kwargs) 129 kwargs['use_pandas_metadata'] = True 130 result = self.api.parquet.read_table(path, columns=columns, --> 131 **kwargs).to_pandas() 132 if should_close: 133 try: ~/projects/2018/03-advisor-recruiting/pyenv/lib/python3.6/site-packages/pyarrow/parquet.py in read_table(source, columns, nthreads, metadata, use_pandas_metadata) 1044 fs = _get_fs_from_path(source) 1045 return fs.read_parquet(source, columns=columns, metadata=metadata, -> 1046 use_pandas_metadata=use_pandas_metadata) 1047 1048 pf = ParquetFile(source, metadata=metadata) ~/projects/2018/03-advisor-recruiting/pyenv/lib/python3.6/site-packages/pyarrow/filesystem.py in read_parquet(self, path, columns, metadata, schema, nthreads, use_pandas_metadata) 175 filesystem=self) 176 return dataset.read(columns=columns, nthreads=nthreads, --> 177 use_pandas_metadata=use_pandas_metadata) 178 179 def open(self, path, mode='rb'): ~/projects/2018/03-advisor-recruiting/pyenv/lib/python3.6/site-packages/pyarrow/parquet.py in read(self, columns, nthreads, use_pandas_metadata) 896 partitions=self.partitions, 897 open_file_func=open_file, --> 898 use_pandas_metadata=use_pandas_metadata) 899 tables.append(table) 900 ~/projects/2018/03-advisor-recruiting/pyenv/lib/python3.6/site-packages/pyarrow/parquet.py in read(self, columns, nthreads, partitions, open_file_func, file, use_pandas_metadata) 459 table = reader.read_row_group(self.row_group, **options) 460 else: --> 461 table = reader.read(**options) 462 463 if 
len(self.partition_keys) > 0: ~/projects/2018/03-advisor-recruiting/pyenv/lib/python3.6/site-packages/pyarrow/parquet.py in read(self, columns, nthreads, use_pandas_metadata) 150 columns, use_pandas_metadata=use_pandas_metadata) 151 return self.reader.read_all(column_indices=column_indices, --> 152 nthreads=nthreads) 153 154 def scan_contents(self, columns=None, batch_size=65536): ~/projects/2018/03-advisor-recruiting/pyenv/lib/python3.6/site-packages/pyarrow/_parquet.pyx in pyarrow._parquet.ParquetReader.read_all() ~/projects/2018/03-advisor-recruiting/pyenv/lib/python3.6/site-packages/pyarrow/error.pxi in pyarrow.lib.check_status() ArrowIOError: Arrow error: Capacity error: BinaryArray cannot contain more than 2147483646 bytes, have 2147484002 {code} > [Python] Error with errno 22 when loading 3.6 GB Parquet file > ------------------------------------------------------------- > > Key: ARROW-2654 > URL: https://issues.apache.org/jira/browse/ARROW-2654 > Project: Apache Arrow > Issue Type: Bug > Components: Python > Affects Versions: 0.9.0 > Reporter: Andy Reagan > Priority: Major > Labels: parquet > Fix For: 0.11.0 > > > I saved a file using pandas to_parquet method, but can't read it back in. 
> Here's the full stack trace: > > {code:java} > Traceback (most recent call last): > File "src/data/CLXP_pull.py", line 214, in <module> > main() > File > "/Users/mm51929/projects/2018/03-advisor-recruiting/pyenv/lib/python3.6/site-packages/click/core.py", > line 722, in __call__ > return self.main(*args, **kwargs) > File > "/Users/mm51929/projects/2018/03-advisor-recruiting/pyenv/lib/python3.6/site-packages/click/core.py", > line 697, in main > rv = self.invoke(ctx) > File > "/Users/mm51929/projects/2018/03-advisor-recruiting/pyenv/lib/python3.6/site-packages/click/core.py", > line 895, in invoke > return ctx.invoke(self.callback, **ctx.params) > File > "/Users/mm51929/projects/2018/03-advisor-recruiting/pyenv/lib/python3.6/site-packages/click/core.py", > line 535, in invoke > return callback(*args, **kwargs) > File "src/data/CLXP_pull.py", line 188, in main > results[fullname] = pd.read_parquet(os.path.join(project_dir, "data", "raw", > fullname+".parquet"), engine="pyarrow") > File > "/Users/mm51929/projects/2018/03-advisor-recruiting/pyenv/lib/python3.6/site-packages/pandas/io/parquet.py", > line 257, in read_parquet > return impl.read(path, columns=columns, **kwargs) > File > "/Users/mm51929/projects/2018/03-advisor-recruiting/pyenv/lib/python3.6/site-packages/pandas/io/parquet.py", > line 130, in read > **kwargs).to_pandas() > File > "/Users/mm51929/projects/2018/03-advisor-recruiting/pyenv/lib/python3.6/site-packages/pyarrow/parquet.py", > line 939, in read_table > pf = ParquetFile(source, metadata=metadata) > File > "/Users/mm51929/projects/2018/03-advisor-recruiting/pyenv/lib/python3.6/site-packages/pyarrow/parquet.py", > line 64, in __init__ > self.reader.open(source, metadata=metadata) > File "_parquet.pyx", line 651, in pyarrow._parquet.ParquetReader.open > File "error.pxi", line 79, in pyarrow.lib.check_status > pyarrow.lib.ArrowIOError: Arrow error: IOError: [Errno 22] Invalid argument > {code} > Any ideas what could cause this? The file itself is 3.6GB. 
> I'm running pandas==0.22.0. -- This message was sent by Atlassian JIRA (v7.6.3#76005)