This is an automated email from the ASF dual-hosted git repository.
damccorm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git
The following commit(s) were added to refs/heads/master by this push:
new 7323fb82cb5 split hdfs into extra (#36773)
7323fb82cb5 is described below
commit 7323fb82cb5652d2de416cb3b12a1b615e81018b
Author: Danny McCormick <[email protected]>
AuthorDate: Wed Nov 26 15:53:20 2025 -0500
split hdfs into extra (#36773)
* split hdfs into extra
* CHANGES
* tox
* try/catch
* test fixes
* add to coverage tasks
---
CHANGES.md | 2 +-
sdks/python/apache_beam/io/hadoopfilesystem.py | 11 +++++++++--
sdks/python/apache_beam/io/hadoopfilesystem_test.py | 7 +++++++
sdks/python/setup.py | 2 +-
sdks/python/tox.ini | 9 +++++----
5 files changed, 23 insertions(+), 8 deletions(-)
diff --git a/CHANGES.md b/CHANGES.md
index 50d04b6f0e4..2f393bde759 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -76,7 +76,7 @@
## Breaking Changes
-* X behavior was changed ([#X](https://github.com/apache/beam/issues/X)).
+* (Python) Some Python dependencies have been split out into extras. To ensure
all previously installed dependencies are installed, when installing Beam you
can `pip install apache-beam[gcp,interactive,yaml,redis,hadoop,tfrecord]`,
though most users will not need all of these extras
([#34554](https://github.com/apache/beam/issues/34554)).
## Deprecations
diff --git a/sdks/python/apache_beam/io/hadoopfilesystem.py
b/sdks/python/apache_beam/io/hadoopfilesystem.py
index cf488c228a2..3287644eed8 100644
--- a/sdks/python/apache_beam/io/hadoopfilesystem.py
+++ b/sdks/python/apache_beam/io/hadoopfilesystem.py
@@ -26,8 +26,6 @@ import posixpath
import re
from typing import BinaryIO # pylint: disable=unused-import
-import hdfs
-
from apache_beam.io import filesystemio
from apache_beam.io.filesystem import BeamIOError
from apache_beam.io.filesystem import CompressedFile
@@ -37,6 +35,11 @@ from apache_beam.io.filesystem import FileSystem
from apache_beam.options.pipeline_options import HadoopFileSystemOptions
from apache_beam.options.pipeline_options import PipelineOptions
+try:
+ import hdfs
+except ImportError:
+ hdfs = None
+
__all__ = ['HadoopFileSystem']
_HDFS_PREFIX = 'hdfs:/'
@@ -108,6 +111,10 @@ class HadoopFileSystem(FileSystem):
See :class:`~apache_beam.options.pipeline_options.HadoopFileSystemOptions`.
"""
super().__init__(pipeline_options)
+ if hdfs is None:
+ raise ImportError(
+ 'Failed to import hdfs. You can ensure it is '
+ 'installed by installing the hadoop beam extra')
logging.getLogger('hdfs.client').setLevel(logging.WARN)
if pipeline_options is None:
raise ValueError('pipeline_options is not set')
diff --git a/sdks/python/apache_beam/io/hadoopfilesystem_test.py
b/sdks/python/apache_beam/io/hadoopfilesystem_test.py
index 8c21effc882..eb0925224dd 100644
--- a/sdks/python/apache_beam/io/hadoopfilesystem_test.py
+++ b/sdks/python/apache_beam/io/hadoopfilesystem_test.py
@@ -32,6 +32,11 @@ from apache_beam.io.filesystem import BeamIOError
from apache_beam.options.pipeline_options import HadoopFileSystemOptions
from apache_beam.options.pipeline_options import PipelineOptions
+try:
+ import hdfs as actual_hdfs
+except ImportError:
+ actual_hdfs = None
+
class FakeFile(io.BytesIO):
"""File object for FakeHdfs"""
@@ -201,6 +206,7 @@ class FakeHdfs(object):
@parameterized_class(('full_urls', ), [(False, ), (True, )])
[email protected](actual_hdfs is None, "hdfs extra not installed")
class HadoopFileSystemTest(unittest.TestCase):
def setUp(self):
self._fake_hdfs = FakeHdfs()
@@ -607,6 +613,7 @@ class HadoopFileSystemTest(unittest.TestCase):
self.assertFalse(self.fs.exists(url2))
[email protected](actual_hdfs is None, "hdfs extra not installed")
class HadoopFileSystemRuntimeValueProviderTest(unittest.TestCase):
"""Tests pipeline_options, in the form of a
RuntimeValueProvider.runtime_options object."""
diff --git a/sdks/python/setup.py b/sdks/python/setup.py
index 289433f9ea5..b700d796983 100644
--- a/sdks/python/setup.py
+++ b/sdks/python/setup.py
@@ -379,7 +379,6 @@ if __name__ == '__main__':
# TODO(https://github.com/grpc/grpc/issues/37710): Unpin grpc
'grpcio>=1.33.1,<2,!=1.48.0,!=1.59.*,!=1.60.*,!=1.61.*,!=1.62.0,!=1.62.1,<1.66.0;
python_version <= "3.12"', # pylint: disable=line-too-long
'grpcio>=1.67.0; python_version >= "3.13"',
- 'hdfs>=2.1.0,<3.0.0',
'httplib2>=0.8,<0.23.0',
'jsonpickle>=3.0.0,<4.0.0',
# numpy can have breaking changes in minor versions.
@@ -563,6 +562,7 @@ if __name__ == '__main__':
# `--update` / `-U` flag to replace the dask release brought in
# by distributed.
],
+ 'hadoop': ['hdfs>=2.1.0,<3.0.0'],
'yaml': [
'docstring-parser>=0.15,<1.0',
'jinja2>=3.0,<3.2',
diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini
index da0932728b2..431cd186c1b 100644
--- a/sdks/python/tox.ini
+++ b/sdks/python/tox.ini
@@ -33,7 +33,7 @@ pip_pre = True
# allow apps that support color to use it.
passenv=TERM,CLOUDSDK_CONFIG,DOCKER_*,TESTCONTAINERS_*,TC_*,ALLOYDB_PASSWORD
# Set [] options for pip installation of apache-beam tarball.
-extras = test,dataframe,redis,tfrecord,yaml
+extras = test,dataframe,hadoop,redis,tfrecord,yaml
# Don't warn that these commands aren't installed.
allowlist_externals =
false
@@ -97,8 +97,8 @@ install_command = {envbindir}/python.exe {envbindir}/pip.exe
install --retries 1
list_dependencies_command = {envbindir}/python.exe {envbindir}/pip.exe freeze
[testenv:py{310,311,312,313}-cloud]
-; extras = test,gcp,interactive,dataframe,aws,azure,redis
-extras = test,gcp,interactive,dataframe,aws,azure
+; extras = test,gcp,interactive,dataframe,aws,azure
+extras = test,hadoop,gcp,interactive,dataframe,aws,azure
commands =
python apache_beam/examples/complete/autocomplete_test.py
bash {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}"
@@ -173,7 +173,7 @@ setenv =
TC_SLEEP_TIME = {env:TC_SLEEP_TIME:1}
# NOTE: we could add ml_test to increase the collected code coverage metrics,
but it would make the suite slower.
-extras = test,gcp,interactive,dataframe,aws,redis
+extras = test,hadoop,gcp,interactive,dataframe,aws,redis
commands =
bash {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}"
"--cov-report=xml --cov=. --cov-append"
@@ -228,6 +228,7 @@ deps =
holdup==1.8.0
extras =
gcp
+  hadoop
allowlist_externals =
bash
echo