This is an automated email from the ASF dual-hosted git repository.

Yicong-Huang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/texera.git


The following commit(s) were added to refs/heads/main by this push:
     new 6b7e8cf7ce test(pyamber): add unit tests for DatasetFileDocument 
(#4702)
6b7e8cf7ce is described below

commit 6b7e8cf7ce2b2a6f302eff52f2a05a85d23f2bb1
Author: Yicong Huang <[email protected]>
AuthorDate: Sat May 2 17:00:27 2026 -0700

    test(pyamber): add unit tests for DatasetFileDocument (#4702)
    
    ### What changes were proposed in this PR?
    
    Adds pytest coverage for
    `amber/src/main/python/pytexera/storage/dataset_file_document.py`.
    
    ### Any related issues, documentation, discussions?
    
    Closes #4701.
    
    Potential bug noted while reading the module (not pinned by these
    tests): `get_presigned_url` does `response.json().get("presignedUrl")`,
    so a 200 response that omits the `presignedUrl` field silently returns
    `None` rather than raising. `read_file` then calls `requests.get(None)`
    and the caller gets a less-actionable error than the explicit-status
    path.
    
    ### How was this PR tested?
    
    ```
    cd amber/src/main/python
    ruff check pytexera/storage/test_dataset_file_document.py
    ruff format --check pytexera/storage/test_dataset_file_document.py
    python -m pytest pytexera/storage/test_dataset_file_document.py
    ```
    
    ### Was this PR authored or co-authored using generative AI tooling?
    
    Generated-by: Claude Code (claude-opus-4-7)
---
 .../pytexera/storage/test_dataset_file_document.py | 192 +++++++++++++++++++++
 1 file changed, 192 insertions(+)

diff --git 
a/amber/src/main/python/pytexera/storage/test_dataset_file_document.py 
b/amber/src/main/python/pytexera/storage/test_dataset_file_document.py
new file mode 100644
index 0000000000..ecf9dd5b8c
--- /dev/null
+++ b/amber/src/main/python/pytexera/storage/test_dataset_file_document.py
@@ -0,0 +1,192 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import io
+
+import pytest
+from unittest.mock import patch, MagicMock
+
+from pytexera.storage.dataset_file_document import DatasetFileDocument
+
+
+DEFAULT_ENDPOINT = "http://localhost:9092/api/dataset/presign-download";
+CUSTOM_ENDPOINT = "https://example.test/api/presign";
+
+
[email protected]
+def auth_env(monkeypatch):
+    """Provide a JWT and pinned presign endpoint for the duration of one 
test."""
+    monkeypatch.setenv("USER_JWT_TOKEN", "test-jwt-token")
+    monkeypatch.setenv("FILE_SERVICE_GET_PRESIGNED_URL_ENDPOINT", 
CUSTOM_ENDPOINT)
+
+
+def make_response(status_code: int, body=None, content: bytes = b""):
+    response = MagicMock()
+    response.status_code = status_code
+    response.json.return_value = body or {}
+    response.text = "" if body is None else str(body)
+    response.content = content
+    return response
+
+
+class TestDatasetFileDocumentInit:
+    def test_parses_minimal_four_part_path(self, auth_env):
+        doc = DatasetFileDocument("/[email protected]/ds/v1/file.csv")
+        assert doc.owner_email == "[email protected]"
+        assert doc.dataset_name == "ds"
+        assert doc.version_name == "v1"
+        assert doc.file_relative_path == "file.csv"
+
+    def test_joins_nested_relative_path_back_with_slashes(self, auth_env):
+        doc = DatasetFileDocument("/[email protected]/ds/v1/a/b/c/file.csv")
+        assert doc.file_relative_path == "a/b/c/file.csv"
+
+    def test_strips_leading_and_trailing_slashes_before_parsing(self, 
auth_env):
+        doc = DatasetFileDocument("///[email protected]/ds/v1/file.csv///")
+        assert doc.owner_email == "[email protected]"
+        assert doc.file_relative_path == "file.csv"
+
+    def test_rejects_path_with_fewer_than_four_segments(self, auth_env):
+        with pytest.raises(ValueError, match="Invalid file path format"):
+            DatasetFileDocument("/[email protected]/ds/v1")
+
+    def test_requires_jwt_token_in_environment(self, monkeypatch):
+        monkeypatch.delenv("USER_JWT_TOKEN", raising=False)
+        monkeypatch.setenv("FILE_SERVICE_GET_PRESIGNED_URL_ENDPOINT", 
CUSTOM_ENDPOINT)
+        with pytest.raises(ValueError, match="JWT token is required"):
+            DatasetFileDocument("/[email protected]/ds/v1/file.csv")
+
+    def test_treats_empty_jwt_as_missing(self, monkeypatch):
+        # An empty string is falsy and should be rejected just like an unset 
var.
+        monkeypatch.setenv("USER_JWT_TOKEN", "")
+        with pytest.raises(ValueError, match="JWT token is required"):
+            DatasetFileDocument("/[email protected]/ds/v1/file.csv")
+
+    def test_falls_back_to_default_endpoint_when_env_missing(self, 
monkeypatch):
+        monkeypatch.setenv("USER_JWT_TOKEN", "tok")
+        monkeypatch.delenv("FILE_SERVICE_GET_PRESIGNED_URL_ENDPOINT", 
raising=False)
+        doc = DatasetFileDocument("/[email protected]/ds/v1/file.csv")
+        assert doc.presign_endpoint == DEFAULT_ENDPOINT
+
+    def test_uses_explicit_endpoint_from_environment(self, auth_env):
+        doc = DatasetFileDocument("/[email protected]/ds/v1/file.csv")
+        assert doc.presign_endpoint == CUSTOM_ENDPOINT
+
+
+class TestGetPresignedUrl:
+    def _make_doc(self, monkeypatch, path="/[email protected]/ds/v1/file.csv"):
+        monkeypatch.setenv("USER_JWT_TOKEN", "test-jwt-token")
+        monkeypatch.setenv("FILE_SERVICE_GET_PRESIGNED_URL_ENDPOINT", 
CUSTOM_ENDPOINT)
+        return DatasetFileDocument(path)
+
+    def test_returns_presigned_url_field_from_json_body(self, monkeypatch):
+        doc = self._make_doc(monkeypatch)
+        with patch("pytexera.storage.dataset_file_document.requests.get") as 
mock_get:
+            mock_get.return_value = make_response(
+                200, body={"presignedUrl": "https://signed.test/x"}
+            )
+            assert doc.get_presigned_url() == "https://signed.test/x";
+
+    def test_sends_bearer_authorization_header_with_jwt(self, monkeypatch):
+        doc = self._make_doc(monkeypatch)
+        with patch("pytexera.storage.dataset_file_document.requests.get") as 
mock_get:
+            mock_get.return_value = make_response(200, body={"presignedUrl": 
"u"})
+            doc.get_presigned_url()
+            _, kwargs = mock_get.call_args
+            assert kwargs["headers"] == {"Authorization": "Bearer 
test-jwt-token"}
+
+    def test_url_encodes_filepath_query_parameter(self, monkeypatch):
+        # urllib.parse.quote keeps "/" as safe by default, but encodes "@"
+        # and " " — pin both pieces so the contract is explicit.
+        doc = self._make_doc(monkeypatch, path="/[email protected]/ds/v1/data 
file.csv")
+        with patch("pytexera.storage.dataset_file_document.requests.get") as 
mock_get:
+            mock_get.return_value = make_response(200, body={"presignedUrl": 
"u"})
+            doc.get_presigned_url()
+            _, kwargs = mock_get.call_args
+            file_path = kwargs["params"]["filePath"]
+            assert "data%20file.csv" in file_path
+            assert "bob%40x.com" in file_path
+            assert file_path.startswith("/")
+
+    def test_calls_configured_endpoint(self, monkeypatch):
+        doc = self._make_doc(monkeypatch)
+        with patch("pytexera.storage.dataset_file_document.requests.get") as 
mock_get:
+            mock_get.return_value = make_response(200, body={"presignedUrl": 
"u"})
+            doc.get_presigned_url()
+            args, _ = mock_get.call_args
+            assert args[0] == CUSTOM_ENDPOINT
+
+    def test_raises_runtime_error_with_status_and_body_on_failure(self, 
monkeypatch):
+        doc = self._make_doc(monkeypatch)
+        with patch("pytexera.storage.dataset_file_document.requests.get") as 
mock_get:
+            mock_get.return_value = make_response(403, body="forbidden")
+            with pytest.raises(RuntimeError, match=r"403.*forbidden"):
+                doc.get_presigned_url()
+
+    def test_returns_none_when_response_body_lacks_presigned_url_key(self, 
monkeypatch):
+        # Pins current behavior: a 200 with no "presignedUrl" key yields None
+        # rather than raising. read_file() will then call requests.get(None).
+        doc = self._make_doc(monkeypatch)
+        with patch("pytexera.storage.dataset_file_document.requests.get") as 
mock_get:
+            mock_get.return_value = make_response(200, body={"other": "value"})
+            assert doc.get_presigned_url() is None
+
+
+class TestReadFile:
+    def _make_doc(self, monkeypatch):
+        monkeypatch.setenv("USER_JWT_TOKEN", "test-jwt-token")
+        monkeypatch.setenv("FILE_SERVICE_GET_PRESIGNED_URL_ENDPOINT", 
CUSTOM_ENDPOINT)
+        return DatasetFileDocument("/[email protected]/ds/v1/file.csv")
+
+    def test_returns_bytesio_with_downloaded_content(self, monkeypatch):
+        doc = self._make_doc(monkeypatch)
+        with patch("pytexera.storage.dataset_file_document.requests.get") as 
mock_get:
+            mock_get.side_effect = [
+                make_response(200, body={"presignedUrl": 
"https://signed.test/x"}),
+                make_response(200, content=b"hello-bytes"),
+            ]
+            buf = doc.read_file()
+            assert isinstance(buf, io.BytesIO)
+            assert buf.read() == b"hello-bytes"
+
+    def test_propagates_presigned_url_failure(self, monkeypatch):
+        doc = self._make_doc(monkeypatch)
+        with patch("pytexera.storage.dataset_file_document.requests.get") as 
mock_get:
+            mock_get.return_value = make_response(500, body="upstream down")
+            with pytest.raises(RuntimeError, match=r"500.*upstream down"):
+                doc.read_file()
+
+    def test_raises_runtime_error_when_download_fails(self, monkeypatch):
+        doc = self._make_doc(monkeypatch)
+        with patch("pytexera.storage.dataset_file_document.requests.get") as 
mock_get:
+            mock_get.side_effect = [
+                make_response(200, body={"presignedUrl": 
"https://signed.test/x"}),
+                make_response(404, body="missing"),
+            ]
+            with pytest.raises(RuntimeError, match=r"404.*missing"):
+                doc.read_file()
+
+    def test_downloads_from_presigned_url_returned_by_first_call(self, 
monkeypatch):
+        doc = self._make_doc(monkeypatch)
+        with patch("pytexera.storage.dataset_file_document.requests.get") as 
mock_get:
+            mock_get.side_effect = [
+                make_response(200, body={"presignedUrl": 
"https://signed.test/x"}),
+                make_response(200, content=b""),
+            ]
+            doc.read_file()
+            second_call_args, _ = mock_get.call_args_list[1]
+            assert second_call_args[0] == "https://signed.test/x";

Reply via email to