This is an automated email from the ASF dual-hosted git repository.
pitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 210a856972 GH-49751: [Python] Add raw fd support to pa.OSFile (#49750)
210a856972 is described below
commit 210a856972ad3d008ab157290ad0004e5d597e49
Author: Ádám Lippai <[email protected]>
AuthorDate: Wed Apr 22 05:35:09 2026 -0400
GH-49751: [Python] Add raw fd support to pa.OSFile (#49750)
### Rationale for this change
The goal of this PR is to support low-level Python raw file descriptors (as returned by `os.open()`).
### What changes are in this PR?
As an alternative to a `str` path, `pa.OSFile` now also accepts an `int`
file descriptor.
### Are these changes tested?
Yes, new end-to-end tests were added.
### Are there any user-facing changes?
Yes, `pa.OSFile` now accepts an `int` file descriptor in addition to a `str` path.
This PR was created by Codex Cloud.
* GitHub Issue: #49751
Authored-by: Ádám Lippai <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
python/pyarrow/includes/libarrow.pxd | 9 +++++
python/pyarrow/io.pxi | 65 +++++++++++++++++++++++++-----
python/pyarrow/tests/parquet/test_basic.py | 17 ++++++++
python/pyarrow/tests/test_io.py | 25 ++++++++++++
4 files changed, 105 insertions(+), 11 deletions(-)
diff --git a/python/pyarrow/includes/libarrow.pxd
b/python/pyarrow/includes/libarrow.pxd
index e96a7d8469..8ee7784461 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -1677,6 +1677,9 @@ cdef extern from "arrow/io/api.h" namespace "arrow::io"
nogil:
@staticmethod
CResult[shared_ptr[COutputStream]] Open(const c_string& path)
+ @staticmethod
+ CResult[shared_ptr[COutputStream]] Open(int fd)
+
@staticmethod
CResult[shared_ptr[COutputStream]] OpenWithAppend" Open"(
const c_string& path, c_bool append)
@@ -1687,6 +1690,12 @@ cdef extern from "arrow/io/api.h" namespace "arrow::io"
nogil:
@staticmethod
CResult[shared_ptr[ReadableFile]] Open(const c_string& path)
+ @staticmethod
+ CResult[shared_ptr[ReadableFile]] Open(int fd)
+
+ @staticmethod
+ CResult[shared_ptr[ReadableFile]] Open(int fd, CMemoryPool*
memory_pool)
+
@staticmethod
CResult[shared_ptr[ReadableFile]] Open(const c_string& path,
CMemoryPool* memory_pool)
diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi
index fd2d4df42c..b648fbf669 100644
--- a/python/pyarrow/io.pxi
+++ b/python/pyarrow/io.pxi
@@ -1183,6 +1183,12 @@ cdef class OSFile(NativeFile):
"""
A stream backed by a regular file descriptor.
+ Parameters
+ ----------
+ path : str or int
+ A file path or an open file descriptor.
+ Passed file descriptors are owned and closed by OSFile.
+
Examples
--------
Create a new file to write to:
@@ -1228,22 +1234,33 @@ cdef class OSFile(NativeFile):
object path
def __cinit__(self, path, mode='r', MemoryPool memory_pool=None):
- _check_is_file(path)
self.path = path
cdef:
FileMode c_mode
shared_ptr[Readable] handle
- c_string c_path = encode_file_path(path)
-
- if mode in ('r', 'rb'):
- self._open_readable(c_path, maybe_unbox_memory_pool(memory_pool))
- elif mode in ('w', 'wb'):
- self._open_writable(c_path)
- elif mode in ('a', 'ab'):
- self._open_writable(c_path, append=True)
+ c_string c_path
+ int fd
+
+ if isinstance(path, int):
+ fd = path
+ if mode in ('r', 'rb'):
+ self._open_readable_fd(fd,
maybe_unbox_memory_pool(memory_pool))
+ elif mode in ('w', 'wb', 'a', 'ab'):
+ self._open_writable_fd(fd, append=(mode in ('a', 'ab')))
+ else:
+ raise ValueError(f'Invalid file mode: {mode}')
else:
- raise ValueError(f'Invalid file mode: {mode}')
+ _check_is_file(path)
+ c_path = encode_file_path(path)
+ if mode in ('r', 'rb'):
+ self._open_readable(c_path,
maybe_unbox_memory_pool(memory_pool))
+ elif mode in ('w', 'wb'):
+ self._open_writable(c_path)
+ elif mode in ('a', 'ab'):
+ self._open_writable(c_path, append=True)
+ else:
+ raise ValueError(f'Invalid file mode: {mode}')
cdef _open_readable(self, c_string path, CMemoryPool* pool):
cdef shared_ptr[ReadableFile] handle
@@ -1262,9 +1279,35 @@ cdef class OSFile(NativeFile):
self.is_writable = True
self._is_appending = append
+ cdef _open_readable_fd(self, int fd, CMemoryPool* pool):
+ cdef shared_ptr[ReadableFile] handle
+
+ with nogil:
+ handle = GetResultValue(ReadableFile.Open(fd, pool))
+
+ self.is_readable = True
+ self.set_random_access_file(<shared_ptr[CRandomAccessFile]> handle)
+
+ cdef _open_writable_fd(self, int fd, c_bool append=False):
+ with nogil:
+ self.output_stream = GetResultValue(FileOutputStream.Open(fd))
+ self.is_writable = True
+ self._is_appending = append
+
def fileno(self):
self._assert_open()
- return self.handle.file_descriptor()
+ cdef:
+ shared_ptr[ReadableFile] readable_handle
+ shared_ptr[FileOutputStream] writable_handle
+
+ if self.is_readable:
+ readable_handle = static_pointer_cast[ReadableFile,
CRandomAccessFile](
+ self.get_random_access_file())
+ return readable_handle.get().file_descriptor()
+ else:
+ writable_handle = static_pointer_cast[FileOutputStream,
COutputStream](
+ self.get_output_stream())
+ return writable_handle.get().file_descriptor()
cdef class FixedSizeBufferWriter(NativeFile):
diff --git a/python/pyarrow/tests/parquet/test_basic.py
b/python/pyarrow/tests/parquet/test_basic.py
index 4f26f22b10..01c84b4f35 100644
--- a/python/pyarrow/tests/parquet/test_basic.py
+++ b/python/pyarrow/tests/parquet/test_basic.py
@@ -133,6 +133,23 @@ def test_memory_map(tempdir):
assert table_read.equals(table)
+def test_parquet_read_write_table_raw_fd(tempdir):
+ table = pa.table({'a': [1, 2, 3]})
+ path = str(tempdir / 'raw-fd.parquet')
+ binary_flag = getattr(os, "O_BINARY", 0)
+
+ fd = os.open(path, os.O_CREAT | os.O_WRONLY | os.O_TRUNC | binary_flag,
+ 0o666)
+ with pa.OSFile(fd, mode='wb') as sink:
+ pq.write_table(table, sink)
+
+ fd = os.open(path, os.O_RDONLY | binary_flag)
+ with pa.OSFile(fd, mode='rb') as source:
+ result = pq.read_table(source)
+
+ assert result.equals(table)
+
+
@pytest.mark.pandas
def test_enable_buffered_stream(tempdir):
df = alltypes_sample(size=10)
diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py
index a6d3546e57..824c69c989 100644
--- a/python/pyarrow/tests/test_io.py
+++ b/python/pyarrow/tests/test_io.py
@@ -17,6 +17,7 @@
import bz2
from contextlib import contextmanager
+import errno
from io import (BytesIO, StringIO, TextIOWrapper, BufferedIOBase, IOBase)
import itertools
import gc
@@ -1280,6 +1281,30 @@ def test_os_file_writer(tmpdir):
assert f5.size() == 6 # foo + bar
+def test_os_file_raw_fd(tmpdir):
+ path = os.path.join(str(tmpdir), guid())
+ binary_flag = getattr(os, "O_BINARY", 0)
+
+ fd = os.open(path, os.O_CREAT | os.O_WRONLY | os.O_TRUNC | binary_flag,
+ 0o666)
+ with pa.OSFile(fd, mode='wb') as f:
+ assert f.fileno() == fd
+ f.write(b'foo')
+
+ with pytest.raises(OSError) as exc:
+ os.fstat(fd)
+ assert exc.value.errno == errno.EBADF
+
+ fd = os.open(path, os.O_RDONLY | binary_flag)
+ with pa.OSFile(fd, mode='rb') as f:
+ assert f.fileno() == fd
+ assert f.read() == b'foo'
+
+ with pytest.raises(OSError) as exc:
+ os.fstat(fd)
+ assert exc.value.errno == errno.EBADF
+
+
def test_native_file_write_reject_unicode():
# ARROW-3227
nf = pa.BufferOutputStream()