This is an automated email from the ASF dual-hosted git repository.
shahar1 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/main by this push:
new 8da850235a2 Reject GCS blob names that escape the target directory on
download (#67509)
8da850235a2 is described below
commit 8da850235a2b2e870ca391233dfc86f03ea4c367
Author: Jarek Potiuk <[email protected]>
AuthorDate: Fri May 29 15:25:09 2026 +0200
Reject GCS blob names that escape the target directory on download (#67509)
``GCSHook.sync_to_local_dir`` and
``GCSTimeSpanFileTransformOperator._download``
joined GCS blob names into local paths without verifying the resolved
path stayed within the intended directory. GCS allows object names
containing ``..`` segments, so a hostile blob name could cause files
to be written outside ``local_dir`` / the operator's temp dir — a
classic CWE-22 path-traversal sink. The trust model matters: a DAG
author's own bucket is trusted, but operators are routinely pointed
at buckets shared with external partners or other tenants, where the
write side may not be fully trusted.
Resolve the destination path and verify that it ``is_relative_to`` the
target root before any download. On violation, raise ``ValueError`` with
a clear message instead of silently writing outside the target.
---
.../airflow/providers/google/cloud/hooks/gcs.py | 11 +++++++
.../providers/google/cloud/operators/gcs.py | 10 ++++++
.../tests/unit/google/cloud/hooks/test_gcs.py | 28 ++++++++++++++++
.../tests/unit/google/cloud/operators/test_gcs.py | 38 ++++++++++++++++++++++
4 files changed, 87 insertions(+)
diff --git a/providers/google/src/airflow/providers/google/cloud/hooks/gcs.py
b/providers/google/src/airflow/providers/google/cloud/hooks/gcs.py
index 2b527365ec7..e9eb0666cb9 100644
--- a/providers/google/src/airflow/providers/google/cloud/hooks/gcs.py
+++ b/providers/google/src/airflow/providers/google/cloud/hooks/gcs.py
@@ -1362,6 +1362,7 @@ class GCSHook(GoogleBaseHook):
gcs_bucket = self.get_bucket(bucket_name)
local_gcs_objects = []
+ local_dir_resolved = local_dir_path.resolve()
for blob in gcs_bucket.list_blobs(prefix=prefix):
# GCS lists "directories" as objects ending with a slash. We
should skip them.
if blob.name.endswith("/"):
@@ -1369,6 +1370,16 @@ class GCSHook(GoogleBaseHook):
blob_path = Path(blob.name)
local_target_path =
local_dir_path.joinpath(blob_path.relative_to(prefix))
+ # Containment check: ``blob.name`` originates outside the worker,
and GCS allows
+ # object names containing ``..``. Resolve the target and assert it
stays under
+ # ``local_dir`` so a hostile blob name cannot write outside the
intended directory
+ # (CWE-22).
+ if not
local_target_path.resolve().is_relative_to(local_dir_resolved):
+ raise ValueError(
+ f"Refusing to write GCS blob {blob.name!r}: resolved path "
+ f"{local_target_path} escapes the target directory
{local_dir_path}."
+ )
+
if not local_target_path.parent.exists():
local_target_path.parent.mkdir(parents=True, exist_ok=True)
self.log.debug("Created local directory: %s",
local_target_path.parent)
diff --git
a/providers/google/src/airflow/providers/google/cloud/operators/gcs.py
b/providers/google/src/airflow/providers/google/cloud/operators/gcs.py
index 6d9eb2aa6a9..c649962b217 100644
--- a/providers/google/src/airflow/providers/google/cloud/operators/gcs.py
+++ b/providers/google/src/airflow/providers/google/cloud/operators/gcs.py
@@ -877,6 +877,7 @@ class
GCSTimeSpanFileTransformOperator(GoogleCloudBaseOperator):
with TemporaryDirectory() as temp_input_dir, TemporaryDirectory() as
temp_output_dir:
temp_input_dir_path = Path(temp_input_dir)
+ temp_input_dir_resolved = temp_input_dir_path.resolve()
temp_output_dir_path = Path(temp_output_dir)
num_downloads = len(blobs_to_transform)
@@ -897,6 +898,15 @@ class
GCSTimeSpanFileTransformOperator(GoogleCloudBaseOperator):
blob = bucket.blob(blob_name=blob_name,
chunk_size=self.chunk_size)
destination_file = temp_input_dir_path / blob_name
+ # Containment check: ``blob_name`` originates outside the
worker, and GCS
+ # allows object names containing ``..``. Resolve the target
and assert it
+ # stays under ``temp_input_dir_path`` so a hostile blob name
cannot write
+ # outside the worker's temp directory (CWE-22).
+ if not
destination_file.resolve().is_relative_to(temp_input_dir_resolved):
+ raise ValueError(
+ f"Refusing to download GCS blob {blob_name!r}:
resolved path "
+ f"{destination_file} escapes the temp directory
{temp_input_dir_path}."
+ )
destination_file.parent.mkdir(parents=True, exist_ok=True)
blob.download_to_filename(filename=str(destination_file))
diff --git a/providers/google/tests/unit/google/cloud/hooks/test_gcs.py
b/providers/google/tests/unit/google/cloud/hooks/test_gcs.py
index 6d516889acd..59bf1da4f0b 100644
--- a/providers/google/tests/unit/google/cloud/hooks/test_gcs.py
+++ b/providers/google/tests/unit/google/cloud/hooks/test_gcs.py
@@ -2003,3 +2003,31 @@ class TestSyncGcsHook:
assert "GCS object size (15) and local file size (9) differ." in
logs_string
assert f"Downloading dag_03.py to {sync_local_dir}/dag_03.py" in
logs_string
self.gcs_hook.download.assert_called_once()
+
+ @mock.patch(GCS_STRING.format("GCSHook.get_conn"))
+ def test_sync_to_local_dir_rejects_path_traversal(self, mock_get_conn,
tmp_path):
+ """A blob name that resolves outside ``local_dir`` must be refused.
+
+ GCS allows ``..`` segments in object names. Without a containment
check,
+ ``local_dir.joinpath(blob.name)`` could write outside the intended
directory
+ (CWE-22) — exploitable when the bucket is shared with untrusted
writers.
+ """
+ test_bucket = "test_bucket"
+ mock_bucket = self._create_bucket(name=test_bucket)
+ mock_get_conn.return_value.bucket.return_value = mock_bucket
+ mock_bucket.list_blobs.return_value = [
+ self._create_blob("../escape.py", "C1", mock_bucket),
+ ]
+
+ sync_local_dir = tmp_path / "gcs_sync_dir"
+ sync_local_dir.mkdir()
+ self.gcs_hook.download = MagicMock()
+
+ with pytest.raises(ValueError, match="escapes the target directory"):
+ self.gcs_hook.sync_to_local_dir(
+ bucket_name=test_bucket, local_dir=sync_local_dir, prefix="",
delete_stale=False
+ )
+
+ self.gcs_hook.download.assert_not_called()
+ # Nothing should have been written outside the sync dir.
+ assert not (tmp_path / "escape.py").exists()
diff --git a/providers/google/tests/unit/google/cloud/operators/test_gcs.py
b/providers/google/tests/unit/google/cloud/operators/test_gcs.py
index 26281bdc7a8..f48aaddba75 100644
--- a/providers/google/tests/unit/google/cloud/operators/test_gcs.py
+++ b/providers/google/tests/unit/google/cloud/operators/test_gcs.py
@@ -991,6 +991,44 @@ class TestGCSTimeSpanFileTransformOperator:
other_future.cancel.assert_called()
+ @mock.patch("airflow.providers.google.cloud.operators.gcs.subprocess")
+ @mock.patch("airflow.providers.google.cloud.operators.gcs.GCSHook")
+ def test_execute_rejects_path_traversal_in_blob_name(self, mock_hook,
mock_subprocess, tmp_path):
+ """A blob name that resolves outside the temp input dir must be
refused.
+
+ GCS allows ``..`` segments in object names. Without a containment
check,
+ ``temp_input_dir_path / blob_name`` could write outside the worker's
temp
+ directory (CWE-22) — exploitable when the source bucket is shared with
+ untrusted writers.
+ """
+ timespan_start = datetime(2015, 2, 1, 15, 16, 17, 345,
tzinfo=timezone.utc)
+ timespan_end = timespan_start + timedelta(hours=1)
+ context = dict(
+ logical_date=timespan_start,
+ data_interval_start=timespan_start,
+ data_interval_end=timespan_end,
+ ti=mock.Mock(),
+ task=mock.MagicMock(),
+ )
+
+ mock_hook.return_value.list_by_timespan.return_value = ["../escape.py"]
+ mock_client, mock_bucket, mock_blob =
self._setup_gcs_client_chain(mock_hook)
+
+ op = GCSTimeSpanFileTransformOperator(
+ task_id=TASK_ID,
+ source_bucket=TEST_BUCKET,
+ source_prefix=None,
+ source_gcp_conn_id="",
+ destination_bucket=TEST_BUCKET + "_dest",
+ destination_prefix=None,
+ destination_gcp_conn_id="",
+ transform_script="script.py",
+ )
+
+ with pytest.raises(ValueError, match="escapes the temp directory"):
+ op.execute(context=context)
+ mock_blob.download_to_filename.assert_not_called()
+
class TestGCSDeleteBucketOperator:
@mock.patch("airflow.providers.google.cloud.operators.gcs.GCSHook")