This is an automated email from the ASF dual-hosted git repository.
rok pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 0a2a0c75f9 GH-49452: [Python] Reintroduce docstring injection for
stubfiles (#49453)
0a2a0c75f9 is described below
commit 0a2a0c75f9e8106d0ad7e53d2e3183a89c043620
Author: Rok Mihevc <[email protected]>
AuthorDate: Fri Mar 13 11:49:26 2026 +0100
GH-49452: [Python] Reintroduce docstring injection for stubfiles (#49453)
### Rationale for this change
Warning: should not be merged before
https://github.com/apache/arrow/pull/49259.
See https://github.com/apache/arrow/issues/49452 and
https://github.com/apache/arrow/pull/48618
### What changes are included in this PR?
Adds a script, run at wheel build time, that populates the stub files
(.pyi) with runtime docstrings.
### Are these changes tested?
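Not with dedicated unit tests yet. The wheel validation script
(ci/scripts/python_wheel_validate_contents.py) now asserts that the wheel
contains the expected .pyi files and at least one docstring in them.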
### Are there any user-facing changes?
Yes. Wheels now include the .pyi stub files with runtime docstrings
injected.
* GitHub Issue: #49452
Lead-authored-by: Rok Mihevc <[email protected]>
Co-authored-by: Raúl Cumplido <[email protected]>
Signed-off-by: Rok Mihevc <[email protected]>
---
ci/scripts/python_test_type_annotations.sh | 4 +-
ci/scripts/python_wheel_macos_build.sh | 1 +
ci/scripts/python_wheel_validate_contents.py | 72 +++++++++++++++++----
ci/scripts/python_wheel_windows_build.bat | 1 +
ci/scripts/python_wheel_xlinux_build.sh | 1 +
python/CMakeLists.txt | 33 ++++++++++
python/pyproject.toml | 6 +-
python/scripts/update_stub_docstrings.py | 96 +++++++++++++++++++++-------
8 files changed, 176 insertions(+), 38 deletions(-)
diff --git a/ci/scripts/python_test_type_annotations.sh
b/ci/scripts/python_test_type_annotations.sh
index c1a051b1e5..092bedf3f5 100755
--- a/ci/scripts/python_test_type_annotations.sh
+++ b/ci/scripts/python_test_type_annotations.sh
@@ -34,5 +34,5 @@ pip install mypy pyright ty
# Run type checkers
cd "${pyarrow_dir}"
mypy
-pyright
-ty check
+pyright --stats
+ty check --verbose --output-format concise
diff --git a/ci/scripts/python_wheel_macos_build.sh
b/ci/scripts/python_wheel_macos_build.sh
index 1571cd57f2..31395e26c2 100755
--- a/ci/scripts/python_wheel_macos_build.sh
+++ b/ci/scripts/python_wheel_macos_build.sh
@@ -147,6 +147,7 @@ popd
echo "=== (${PYTHON_VERSION}) Building wheel ==="
export PYARROW_BUNDLE_ARROW_CPP=ON
+export PYARROW_REQUIRE_STUB_DOCSTRINGS=ON
export PYARROW_WITH_ACERO=${ARROW_ACERO}
export PYARROW_WITH_AZURE=${ARROW_AZURE}
export PYARROW_WITH_DATASET=${ARROW_DATASET}
diff --git a/ci/scripts/python_wheel_validate_contents.py
b/ci/scripts/python_wheel_validate_contents.py
index 153a70eb40..8388f6ebf3 100644
--- a/ci/scripts/python_wheel_validate_contents.py
+++ b/ci/scripts/python_wheel_validate_contents.py
@@ -16,29 +16,79 @@
# under the License.
import argparse
+import ast
from pathlib import Path
import re
import zipfile
+def _count_docstrings(source):
+ """Count docstrings in module, function, and class bodies."""
+ tree = ast.parse(source)
+ count = 0
+ for node in ast.walk(tree):
+ if isinstance(node, (ast.Module, ast.FunctionDef,
+ ast.AsyncFunctionDef, ast.ClassDef)):
+ if (node.body
+ and isinstance(node.body[0], ast.Expr)
+ and isinstance(node.body[0].value, ast.Constant)
+ and isinstance(node.body[0].value.value, str)):
+ count += 1
+ return count
+
+
def validate_wheel(path):
p = Path(path)
wheels = list(p.glob('*.whl'))
error_msg = f"{len(wheels)} wheels found but only 1 expected ({wheels})"
assert len(wheels) == 1, error_msg
- f = zipfile.ZipFile(wheels[0])
- outliers = [
- info.filename for info in f.filelist if not re.match(
- r'(pyarrow/|pyarrow-[-.\w\d]+\.dist-info/|pyarrow\.libs/)',
info.filename
+ with zipfile.ZipFile(wheels[0]) as wheel_zip:
+ outliers = [
+ info.filename for info in wheel_zip.filelist if not re.match(
+ r'(pyarrow/|pyarrow-[-.\w\d]+\.dist-info/|pyarrow\.libs/)',
info.filename
+ )
+ ]
+ assert not outliers, f"Unexpected contents in wheel:
{sorted(outliers)}"
+ for filename in ('LICENSE.txt', 'NOTICE.txt'):
+ assert any(
+ info.filename.split("/")[-1] == filename for info in
wheel_zip.filelist
+ ), f"{filename} is missing from the wheel."
+
+ assert any(
+ info.filename == "pyarrow/py.typed" for info in wheel_zip.filelist
+ ), "pyarrow/py.typed is missing from the wheel."
+
+ source_root = Path(__file__).resolve().parents[2]
+ stubs_dir = source_root / "python" / "pyarrow-stubs" / "pyarrow"
+ assert stubs_dir.exists(), f"Stub source directory not found:
{stubs_dir}"
+
+ expected_stub_files = {
+ f"pyarrow/{stub_file.relative_to(stubs_dir).as_posix()}"
+ for stub_file in stubs_dir.rglob("*.pyi")
+ }
+
+ wheel_stub_files = {
+ info.filename
+ for info in wheel_zip.filelist
+ if info.filename.startswith("pyarrow/") and
info.filename.endswith(".pyi")
+ }
+
+ assert wheel_stub_files == expected_stub_files, (
+ "Wheel .pyi files differ from python/pyarrow-stubs/pyarrow.\n"
+ f"Missing in wheel: {sorted(expected_stub_files -
wheel_stub_files)}\n"
+ f"Unexpected in wheel: {sorted(wheel_stub_files -
expected_stub_files)}"
+ )
+
+ wheel_docstring_count = sum(
+ _count_docstrings(wheel_zip.read(wsf).decode("utf-8"))
+ for wsf in wheel_stub_files
)
- ]
- assert not outliers, f"Unexpected contents in wheel: {sorted(outliers)}"
- for filename in ('LICENSE.txt', 'NOTICE.txt'):
- assert any(info.filename.split("/")[-1] == filename
- for info in f.filelist), \
- f"{filename} is missing from the wheel."
+
+ print(f"Found {wheel_docstring_count} docstring(s) in wheel stubs.")
+ assert wheel_docstring_count, "No docstrings found in wheel stub
files."
+
print(f"The wheel: {wheels[0]} seems valid.")
- # TODO(GH-32609): Validate some docstrings were generated and added.
+
def main():
parser = argparse.ArgumentParser()
diff --git a/ci/scripts/python_wheel_windows_build.bat
b/ci/scripts/python_wheel_windows_build.bat
index 14e3e5a629..e094d82861 100644
--- a/ci/scripts/python_wheel_windows_build.bat
+++ b/ci/scripts/python_wheel_windows_build.bat
@@ -116,6 +116,7 @@ popd
echo "=== (%PYTHON%) Building wheel ==="
set PYARROW_BUNDLE_ARROW_CPP=ON
+set PYARROW_REQUIRE_STUB_DOCSTRINGS=ON
set PYARROW_WITH_ACERO=%ARROW_ACERO%
set PYARROW_WITH_AZURE=%ARROW_AZURE%
set PYARROW_WITH_DATASET=%ARROW_DATASET%
diff --git a/ci/scripts/python_wheel_xlinux_build.sh
b/ci/scripts/python_wheel_xlinux_build.sh
index 960fe5bad6..223bd0b1cb 100755
--- a/ci/scripts/python_wheel_xlinux_build.sh
+++ b/ci/scripts/python_wheel_xlinux_build.sh
@@ -155,6 +155,7 @@ check_arrow_visibility
echo "=== (${PYTHON_VERSION}) Building wheel ==="
export PYARROW_BUNDLE_ARROW_CPP=ON
+export PYARROW_REQUIRE_STUB_DOCSTRINGS=ON
export PYARROW_WITH_ACERO=${ARROW_ACERO}
export PYARROW_WITH_AZURE=${ARROW_AZURE}
export PYARROW_WITH_DATASET=${ARROW_DATASET}
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 0630e0cff7..6395b3e1e7 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -1025,3 +1025,36 @@ if(PYARROW_BUILD_PARQUET)
target_link_libraries(_parquet_encryption PRIVATE
arrow_python_parquet_encryption)
endif()
endif()
+
+#
+# Type stubs with docstring injection
+#
+# Stubs live in pyarrow-stubs/pyarrow/ during development but are installed
+# alongside the package so type checkers can find them (PEP 561).
+set(PYARROW_STUBS_SOURCE_DIR
"${CMAKE_CURRENT_SOURCE_DIR}/pyarrow-stubs/pyarrow")
+if(EXISTS "${PYARROW_STUBS_SOURCE_DIR}")
+ install(DIRECTORY "${PYARROW_STUBS_SOURCE_DIR}/"
+ DESTINATION "."
+ FILES_MATCHING
+ PATTERN "*.pyi")
+
+ if(PYARROW_REQUIRE_STUB_DOCSTRINGS)
+ install(CODE "
+ execute_process(
+ COMMAND \"${Python3_EXECUTABLE}\"
+
\"${CMAKE_CURRENT_SOURCE_DIR}/scripts/update_stub_docstrings.py\"
+ \"${CMAKE_INSTALL_PREFIX}\"
+ \"${CMAKE_CURRENT_SOURCE_DIR}\"
+ RESULT_VARIABLE _pyarrow_stub_docstrings_result
+ )
+ if(NOT _pyarrow_stub_docstrings_result EQUAL 0)
+ message(FATAL_ERROR \"Stub docstring injection failed (exit code:
\${_pyarrow_stub_docstrings_result})\")
+ endif()
+ ")
+ endif()
+else()
+ if(PYARROW_REQUIRE_STUB_DOCSTRINGS)
+ message(FATAL_ERROR "PyArrow stub source directory not found at
${PYARROW_STUBS_SOURCE_DIR}; "
+ "cannot build wheel without .pyi files.")
+ endif()
+endif()
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 14aa37ed04..a6bba335b8 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -85,7 +85,7 @@ exclude = [
[tool.scikit-build]
cmake.build-type = "Release"
metadata.version.provider = "scikit_build_core.metadata.setuptools_scm"
-sdist.include = ["pyarrow/_generated_version.py", "cmake_modules/"]
+sdist.include = ["pyarrow/_generated_version.py", "cmake_modules/",
"pyarrow-stubs/"]
wheel.packages = ["pyarrow"]
wheel.install-dir = "pyarrow"
@@ -94,6 +94,7 @@ PYARROW_BUNDLE_ARROW_CPP = {env = "PYARROW_BUNDLE_ARROW_CPP",
default = "OFF"}
PYARROW_BUNDLE_CYTHON_CPP = {env = "PYARROW_BUNDLE_CYTHON_CPP", default =
"OFF"}
PYARROW_GENERATE_COVERAGE = {env = "PYARROW_GENERATE_COVERAGE", default =
"OFF"}
PYARROW_CXXFLAGS = {env = "PYARROW_CXXFLAGS", default = ""}
+PYARROW_REQUIRE_STUB_DOCSTRINGS = {env = "PYARROW_REQUIRE_STUB_DOCSTRINGS",
default = "OFF"}
[tool.setuptools_scm]
root = '..'
@@ -129,6 +130,9 @@ stubPath = "pyarrow-stubs"
typeCheckingMode = "basic"
# TODO: Enable type checking once stubs are merged
+[tool.ty.environment]
+extra-paths = ["pyarrow-stubs"]
+
[tool.ty.src]
include = ["pyarrow-stubs"]
exclude = [
diff --git a/python/scripts/update_stub_docstrings.py
b/python/scripts/update_stub_docstrings.py
index 5fd24014a0..44bd19bfdc 100644
--- a/python/scripts/update_stub_docstrings.py
+++ b/python/scripts/update_stub_docstrings.py
@@ -18,14 +18,17 @@
"""
Extract docstrings from pyarrow runtime and insert them into stub files.
-Usage (from python/ directory with pyarrow built):
- python scripts/update_stub_docstrings.py pyarrow-stubs
+Usage:
+ python scripts/update_stub_docstrings.py <install_prefix> <source_dir>
"""
import argparse
import importlib
import inspect
+import os
+import shutil
import sys
+import tempfile
from pathlib import Path
from textwrap import indent
@@ -178,7 +181,7 @@ def add_docstrings_to_stubs(stubs_dir):
pyarrow = importlib.import_module("pyarrow")
- for stub_file in stubs_dir.rglob('*.pyi'):
+ for stub_file in sorted(stubs_dir.rglob('*.pyi')):
if stub_file.name == "_stubs_typing.pyi":
continue
@@ -186,43 +189,88 @@ def add_docstrings_to_stubs(stubs_dir):
if module_name in LIB_MODULES:
namespace = "lib"
elif stub_file.parent.name in ("parquet", "interchange"):
- namespace = f"{stub_file.parent.name}.{module_name}"
+ namespace = (stub_file.parent.name if module_name == "__init__"
+ else f"{stub_file.parent.name}.{module_name}")
elif module_name == "__init__":
namespace = ""
else:
namespace = module_name
print(f" {stub_file.name} -> {namespace or '(root)'}")
- tree = libcst.parse_module(stub_file.read_text())
+ tree = libcst.parse_module(stub_file.read_text(encoding="utf-8"))
modified = tree.visit(DocstringInserter(pyarrow, namespace))
- stub_file.write_text(modified.code)
+ stub_file.write_text(modified.code, encoding="utf-8")
-def add_docstrings_from_build(stubs_dir, build_lib):
+def _link_or_copy(source, destination):
+ # Prefer symlinks (faster, no disk use) but fall back to copying when the
+ # filesystem doesn't support them (e.g. Docker volumes, network mounts).
+ if sys.platform != "win32":
+ try:
+ os.symlink(source, destination)
+ return
+ except OSError:
+ pass
+
+ if source.is_dir():
+ shutil.copytree(source, destination, symlinks=(sys.platform !=
"win32"))
+ else:
+ shutil.copy2(source, destination)
+
+
+def _create_importable_pyarrow(pyarrow_pkg, source_dir, install_pyarrow_dir):
"""
- Entry point for setup.py: update docstrings using pyarrow from build
directory.
+ Assemble an importable pyarrow package inside a temporary directory.
- During the build process, pyarrow is not installed in the system Python.
- We need to temporarily add the build directory to sys.path so we can
- import pyarrow and extract docstrings from it.
+ During wheel builds the .py sources and compiled binary artifacts live in
+ separate trees (source checkout vs CMake install prefix). This function
+ symlinks (or copies) both into pyarrow_pkg folder so that a plain
+ ``import pyarrow`` works and docstrings can be extracted at build time.
"""
- stubs_dir, build_lib = Path(stubs_dir), Path(build_lib)
+ source_pyarrow = source_dir / "pyarrow"
+ if not source_pyarrow.exists():
+ raise FileNotFoundError(f"PyArrow source package not found:
{source_pyarrow}")
+
+ for source_path in sorted(source_pyarrow.iterdir()):
+ if source_path.suffix == ".py":
+ _link_or_copy(source_path, pyarrow_pkg / source_path.name)
+ elif source_path.is_dir() and not source_path.name.startswith((".",
"__")):
+ _link_or_copy(source_path, pyarrow_pkg / source_path.name)
+
+ for artifact in sorted(install_pyarrow_dir.iterdir()):
+ if not artifact.is_file() or artifact.suffix == ".pyi":
+ continue
- sys.path.insert(0, str(build_lib))
- try:
- add_docstrings_to_stubs(stubs_dir)
- finally:
- sys.path.pop(0)
+ destination = pyarrow_pkg / artifact.name
+ if not destination.exists():
+ _link_or_copy(artifact, destination)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
- parser.add_argument("stubs_dir", type=Path, help="Path to pyarrow-stubs
folder")
+ parser.add_argument("install_prefix", type=Path,
+ help="CMAKE_INSTALL_PREFIX used by wheel build")
+ parser.add_argument("source_dir", type=Path,
+ help="PyArrow source directory")
args = parser.parse_args()
- # Add the directory containing this script's parent (python/) to sys.path
- # so pyarrow can be imported when running from the python/ directory
- script_dir = Path(__file__).resolve().parent
- python_dir = script_dir.parent
- sys.path.insert(0, str(python_dir))
- add_docstrings_to_stubs(args.stubs_dir.resolve())
+ install_prefix = args.install_prefix.resolve()
+ source_dir = args.source_dir.resolve()
+ install_pyarrow_dir = install_prefix / "pyarrow"
+ if not install_pyarrow_dir.exists():
+ install_pyarrow_dir = install_prefix
+
+ if not any(install_pyarrow_dir.rglob("*.pyi")):
+ print("No .pyi files found in install tree, skipping docstring
injection")
+ sys.exit(0)
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ pyarrow_pkg = Path(tmpdir) / "pyarrow"
+ pyarrow_pkg.mkdir()
+ _create_importable_pyarrow(pyarrow_pkg, source_dir,
install_pyarrow_dir)
+
+ sys.path.insert(0, tmpdir)
+ try:
+ add_docstrings_to_stubs(install_pyarrow_dir)
+ finally:
+ sys.path.pop(0)