This is an automated email from the ASF dual-hosted git repository. damccorm pushed a commit to branch users/damccorm/extras-cp in repository https://gitbox.apache.org/repos/asf/beam.git
commit 9e4cbde2649f330f8fc57f43b43361a1a11b3eba Author: Danny McCormick <[email protected]> AuthorDate: Wed Nov 26 15:53:20 2025 -0500 split hdfs into extra (#36773) * split hdfs into extra * CHANGES * tox * try/catch * test fixes * add to coverage tasks --- CHANGES.md | 2 +- sdks/python/apache_beam/io/hadoopfilesystem.py | 11 +++++++++-- sdks/python/apache_beam/io/hadoopfilesystem_test.py | 7 +++++++ sdks/python/setup.py | 2 +- sdks/python/tox.ini | 9 +++++---- 5 files changed, 23 insertions(+), 8 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 68af5a342d7..5dd07aab92b 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -81,7 +81,7 @@ Now Beam has full support for Milvus integration including Milvus enrichment and ## Breaking Changes -* X behavior was changed ([#X](https://github.com/apache/beam/issues/X)). +* (Python) Some Python dependencies have been split out into extras. To ensure all previously installed dependencies are installed, when installing Beam you can `pip install apache-beam[gcp,interactive,yaml,redis,hadoop,tfrecord]`, though most users will not need all of these extras ([#34554](https://github.com/apache/beam/issues/34554)). 
## Deprecations diff --git a/sdks/python/apache_beam/io/hadoopfilesystem.py b/sdks/python/apache_beam/io/hadoopfilesystem.py index cf488c228a2..3287644eed8 100644 --- a/sdks/python/apache_beam/io/hadoopfilesystem.py +++ b/sdks/python/apache_beam/io/hadoopfilesystem.py @@ -26,8 +26,6 @@ import posixpath import re from typing import BinaryIO # pylint: disable=unused-import -import hdfs - from apache_beam.io import filesystemio from apache_beam.io.filesystem import BeamIOError from apache_beam.io.filesystem import CompressedFile @@ -37,6 +35,11 @@ from apache_beam.io.filesystem import FileSystem from apache_beam.options.pipeline_options import HadoopFileSystemOptions from apache_beam.options.pipeline_options import PipelineOptions +try: + import hdfs +except ImportError: + hdfs = None + __all__ = ['HadoopFileSystem'] _HDFS_PREFIX = 'hdfs:/' @@ -108,6 +111,10 @@ class HadoopFileSystem(FileSystem): See :class:`~apache_beam.options.pipeline_options.HadoopFileSystemOptions`. """ super().__init__(pipeline_options) + if hdfs is None: + raise ImportError( + 'Failed to import hdfs. 
You can ensure it is ' + 'installed by installing the hadoop beam extra') logging.getLogger('hdfs.client').setLevel(logging.WARN) if pipeline_options is None: raise ValueError('pipeline_options is not set') diff --git a/sdks/python/apache_beam/io/hadoopfilesystem_test.py b/sdks/python/apache_beam/io/hadoopfilesystem_test.py index 8c21effc882..eb0925224dd 100644 --- a/sdks/python/apache_beam/io/hadoopfilesystem_test.py +++ b/sdks/python/apache_beam/io/hadoopfilesystem_test.py @@ -32,6 +32,11 @@ from apache_beam.io.filesystem import BeamIOError from apache_beam.options.pipeline_options import HadoopFileSystemOptions from apache_beam.options.pipeline_options import PipelineOptions +try: + import hdfs as actual_hdfs +except ImportError: + actual_hdfs = None + class FakeFile(io.BytesIO): """File object for FakeHdfs""" @@ -201,6 +206,7 @@ class FakeHdfs(object): @parameterized_class(('full_urls', ), [(False, ), (True, )]) [email protected](actual_hdfs is None, "hdfs extra not installed") class HadoopFileSystemTest(unittest.TestCase): def setUp(self): self._fake_hdfs = FakeHdfs() @@ -607,6 +613,7 @@ class HadoopFileSystemTest(unittest.TestCase): self.assertFalse(self.fs.exists(url2)) [email protected](actual_hdfs is None, "hdfs extra not installed") class HadoopFileSystemRuntimeValueProviderTest(unittest.TestCase): """Tests pipeline_options, in the form of a RuntimeValueProvider.runtime_options object.""" diff --git a/sdks/python/setup.py b/sdks/python/setup.py index 289433f9ea5..b700d796983 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -379,7 +379,6 @@ if __name__ == '__main__': # TODO(https://github.com/grpc/grpc/issues/37710): Unpin grpc 'grpcio>=1.33.1,<2,!=1.48.0,!=1.59.*,!=1.60.*,!=1.61.*,!=1.62.0,!=1.62.1,<1.66.0; python_version <= "3.12"', # pylint: disable=line-too-long 'grpcio>=1.67.0; python_version >= "3.13"', - 'hdfs>=2.1.0,<3.0.0', 'httplib2>=0.8,<0.23.0', 'jsonpickle>=3.0.0,<4.0.0', # numpy can have breaking changes in minor versions. 
@@ -563,6 +562,7 @@ if __name__ == '__main__': # `--update` / `-U` flag to replace the dask release brought in # by distributed. ], + 'hadoop': ['hdfs>=2.1.0,<3.0.0'], 'yaml': [ 'docstring-parser>=0.15,<1.0', 'jinja2>=3.0,<3.2', diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini index da0932728b2..431cd186c1b 100644 --- a/sdks/python/tox.ini +++ b/sdks/python/tox.ini @@ -33,7 +33,7 @@ pip_pre = True # allow apps that support color to use it. passenv=TERM,CLOUDSDK_CONFIG,DOCKER_*,TESTCONTAINERS_*,TC_*,ALLOYDB_PASSWORD # Set [] options for pip installation of apache-beam tarball. -extras = test,dataframe,redis,tfrecord,yaml +extras = test,dataframe,hadoop,redis,tfrecord,yaml # Don't warn that these commands aren't installed. allowlist_externals = false @@ -97,8 +97,8 @@ install_command = {envbindir}/python.exe {envbindir}/pip.exe install --retries 1 list_dependencies_command = {envbindir}/python.exe {envbindir}/pip.exe freeze [testenv:py{310,311,312,313}-cloud] -; extras = test,gcp,interactive,dataframe,aws,azure,redis -extras = test,gcp,interactive,dataframe,aws,azure +; extras = test,gcp,interactive,dataframe,aws,azure +extras = test,hadoop,gcp,interactive,dataframe,aws,azure commands = python apache_beam/examples/complete/autocomplete_test.py bash {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}" @@ -173,7 +173,7 @@ setenv = TC_SLEEP_TIME = {env:TC_SLEEP_TIME:1} # NOTE: we could add ml_test to increase the collected code coverage metrics, but it would make the suite slower. -extras = test,gcp,interactive,dataframe,aws,redis +extras = test,hadoop,gcp,interactive,dataframe,aws,redis commands = bash {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}" "--cov-report=xml --cov=. --cov-append" @@ -228,6 +228,7 @@ deps = holdup==1.8.0 extras = gcp + hadoop allowlist_externals = bash echo
