
Commit 9e4cbde

split hdfs into extra (#36773)
* split hdfs into extra
* CHANGES
* tox
* try/catch
* test fixes
* add to coverage tasks
1 parent e075a50 commit 9e4cbde

File tree (5 files changed: +23 -8 lines)

CHANGES.md
sdks/python/apache_beam/io/hadoopfilesystem.py
sdks/python/apache_beam/io/hadoopfilesystem_test.py
sdks/python/setup.py
sdks/python/tox.ini


CHANGES.md

Lines changed: 1 addition & 1 deletion

@@ -81,7 +81,7 @@ Now Beam has full support for Milvus integration including Milvus enrichment and
 
 ## Breaking Changes
 
-* X behavior was changed ([#X](https://github.com/apache/beam/issues/X)).
+* (Python) Some Python dependencies have been split out into extras. To ensure all previously installed dependencies are installed, when installing Beam you can `pip install apache-beam[gcp,interactive,yaml,redis,hadoop,tfrecord]`, though most users will not need all of these extras ([#34554](https://github.com/apache/beam/issues/34554)).
 
 ## Deprecations
 
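
For code that only sometimes needs HDFS, a minimal sketch (not part of this commit, standard library only) of detecting whether the dependency provided by the new hadoop extra is importable before using hdfs:// paths; the module name `hdfs` is the package the extra installs:

import importlib.util

# True when the 'hdfs' package (installed via `pip install apache-beam[hadoop]`)
# is importable in the current environment.
HDFS_AVAILABLE = importlib.util.find_spec('hdfs') is not None

if not HDFS_AVAILABLE:
  print('hdfs not installed; hdfs:// paths will be unavailable to Beam.')
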
sdks/python/apache_beam/io/hadoopfilesystem.py

Lines changed: 9 additions & 2 deletions

@@ -26,8 +26,6 @@
 import re
 from typing import BinaryIO # pylint: disable=unused-import
 
-import hdfs
-
 from apache_beam.io import filesystemio
 from apache_beam.io.filesystem import BeamIOError
 from apache_beam.io.filesystem import CompressedFile
@@ -37,6 +35,11 @@
 from apache_beam.options.pipeline_options import HadoopFileSystemOptions
 from apache_beam.options.pipeline_options import PipelineOptions
 
+try:
+  import hdfs
+except ImportError:
+  hdfs = None
+
 __all__ = ['HadoopFileSystem']
 
 _HDFS_PREFIX = 'hdfs:/'
@@ -108,6 +111,10 @@ def __init__(self, pipeline_options):
     See :class:`~apache_beam.options.pipeline_options.HadoopFileSystemOptions`.
     """
     super().__init__(pipeline_options)
+    if hdfs is None:
+      raise ImportError(
+          'Failed to import hdfs. You can ensure it is '
+          'installed by installing the hadoop beam extra')
     logging.getLogger('hdfs.client').setLevel(logging.WARN)
     if pipeline_options is None:
      raise ValueError('pipeline_options is not set')
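
From a caller's perspective, the guard above means importing the module still succeeds without the extra, while constructing `HadoopFileSystem` raises the new `ImportError`. A hedged usage sketch (the host, port, and user values are illustrative only; 9870 is assumed here as a common WebHDFS port):

from apache_beam.io.hadoopfilesystem import HadoopFileSystem  # import works even without hdfs
from apache_beam.options.pipeline_options import PipelineOptions

# Example connection settings; adjust to your cluster.
options = PipelineOptions(
    ['--hdfs_host=localhost', '--hdfs_port=9870', '--hdfs_user=beam'])

try:
  fs = HadoopFileSystem(options)
except ImportError:
  # Raised by the new check when the 'hdfs' package is missing; install the
  # extra with: pip install apache-beam[hadoop]
  fs = None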

sdks/python/apache_beam/io/hadoopfilesystem_test.py

Lines changed: 7 additions & 0 deletions

@@ -32,6 +32,11 @@
 from apache_beam.options.pipeline_options import HadoopFileSystemOptions
 from apache_beam.options.pipeline_options import PipelineOptions
 
+try:
+  import hdfs as actual_hdfs
+except ImportError:
+  actual_hdfs = None
+
 
 class FakeFile(io.BytesIO):
   """File object for FakeHdfs"""
@@ -201,6 +206,7 @@ def checksum(self, path):
 
 
 @parameterized_class(('full_urls', ), [(False, ), (True, )])
+@unittest.skipIf(actual_hdfs is None, "hdfs extra not installed")
 class HadoopFileSystemTest(unittest.TestCase):
   def setUp(self):
     self._fake_hdfs = FakeHdfs()
@@ -607,6 +613,7 @@ def test_delete_error(self):
     self.assertFalse(self.fs.exists(url2))
 
 
+@unittest.skipIf(actual_hdfs is None, "hdfs extra not installed")
 class HadoopFileSystemRuntimeValueProviderTest(unittest.TestCase):
   """Tests pipeline_options, in the form of a
   RuntimeValueProvider.runtime_options object."""

sdks/python/setup.py

Lines changed: 1 addition & 1 deletion

@@ -379,7 +379,6 @@ def get_portability_package_data():
       # TODO(https://github.com/grpc/grpc/issues/37710): Unpin grpc
       'grpcio>=1.33.1,<2,!=1.48.0,!=1.59.*,!=1.60.*,!=1.61.*,!=1.62.0,!=1.62.1,<1.66.0; python_version <= "3.12"', # pylint: disable=line-too-long
       'grpcio>=1.67.0; python_version >= "3.13"',
-      'hdfs>=2.1.0,<3.0.0',
       'httplib2>=0.8,<0.23.0',
       'jsonpickle>=3.0.0,<4.0.0',
       # numpy can have breaking changes in minor versions.
@@ -563,6 +562,7 @@ def get_portability_package_data():
           # `--update` / `-U` flag to replace the dask release brought in
           # by distributed.
       ],
+      'hadoop': ['hdfs>=2.1.0,<3.0.0'],
       'yaml': [
           'docstring-parser>=0.15,<1.0',
           'jinja2>=3.0,<3.2',
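
One way to confirm what the new extra declares after installation is to read the package metadata. A sketch, assuming the distribution is installed under its PyPI name `apache-beam` and that the requirement strings carry the usual `extra == "hadoop"` marker:

from importlib import metadata

# List only the requirements that apply when the 'hadoop' extra is requested.
reqs = metadata.requires('apache-beam') or []
hadoop_reqs = [r for r in reqs if 'extra ==' in r and 'hadoop' in r]
print(hadoop_reqs)  # expected to contain an entry for hdfs>=2.1.0,<3.0.0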

sdks/python/tox.ini

Lines changed: 5 additions & 4 deletions

@@ -33,7 +33,7 @@ pip_pre = True
 # allow apps that support color to use it.
 passenv=TERM,CLOUDSDK_CONFIG,DOCKER_*,TESTCONTAINERS_*,TC_*,ALLOYDB_PASSWORD
 # Set [] options for pip installation of apache-beam tarball.
-extras = test,dataframe,redis,tfrecord,yaml
+extras = test,dataframe,hadoop,redis,tfrecord,yaml
 # Don't warn that these commands aren't installed.
 allowlist_externals =
   false
@@ -97,8 +97,8 @@ install_command = {envbindir}/python.exe {envbindir}/pip.exe install --retries 1
 list_dependencies_command = {envbindir}/python.exe {envbindir}/pip.exe freeze
 
 [testenv:py{310,311,312,313}-cloud]
-; extras = test,gcp,interactive,dataframe,aws,azure,redis
-extras = test,gcp,interactive,dataframe,aws,azure
+; extras = test,gcp,interactive,dataframe,aws,azure
+extras = test,hadoop,gcp,interactive,dataframe,aws,azure
 commands =
   python apache_beam/examples/complete/autocomplete_test.py
   bash {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}"
@@ -173,7 +173,7 @@ setenv =
   TC_SLEEP_TIME = {env:TC_SLEEP_TIME:1}
 
 # NOTE: we could add ml_test to increase the collected code coverage metrics, but it would make the suite slower.
-extras = test,gcp,interactive,dataframe,aws,redis
+extras = test,hadoop,gcp,interactive,dataframe,aws,redis
 commands =
   bash {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}" "--cov-report=xml --cov=. --cov-append"
 
@@ -228,6 +228,7 @@ deps =
   holdup==1.8.0
 extras =
   gcp
+  hdfs
 allowlist_externals =
   bash
   echo
