This is an automated email from the ASF dual-hosted git repository.
mengw15 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/texera.git
The following commit(s) were added to refs/heads/main by this push:
new bfa79a7c69 refactor(amber): stop hardcoding S3 in REST catalog init
(#4988)
bfa79a7c69 is described below
commit bfa79a7c69e43ac274553468a6cc07dfca5b7a6e
Author: Meng Wang <[email protected]>
AuthorDate: Sat May 16 10:39:03 2026 -0700
refactor(amber): stop hardcoding S3 in REST catalog init (#4988)
### What changes were proposed in this PR?
Stop hardcoding `s3.endpoint`, `s3.region`, `s3.path-style-access`,
`s3.access-key-id` and `s3.secret-access-key` at REST-catalog init in
both `IcebergUtil.createRestCatalog` (Scala) and
`iceberg_utils.create_rest_catalog` (Python). Both helpers now pass only
`warehouse` + catalog `uri` (and on the Scala side the `FileIO` impl
hint).
**Why:** When a Lakekeeper warehouse is created, its S3 settings
(endpoint, region, credentials, path-style) are registered against that
warehouse on the server. At catalog init the client only needs
`warehouse` + `uri` — Lakekeeper resolves the S3 config from the
warehouse record and serves it back. The hardcoded `StorageConfig.s3*`
values on the client were redundant, and forcing them everywhere also
pinned every warehouse to the single system bucket. Removing them lets
each warehouse own its own storage settings.
`StorageConfig.s3*` itself is kept —
`pytexera/storage/large_binary_manager.py` still uses it for the
non-Iceberg `texera-large-binaries` bucket (R UDF large-binary support),
which is out of scope.
### Any related issues, documentation, discussions?
Closes #4987
### How was this PR tested?
- `sbt "WorkflowCore/compile"` — passes; verifies no other Scala caller
depends on the removed properties.
- Python edits parse cleanly via `ast.parse`; the only caller
(`iceberg_catalog_instance.py`) is updated to match the new
`create_rest_catalog` signature.
End-to-end verification (warehouse with its own S3 settings → REST
catalog opened with only `warehouse` + `uri` → table round-trip)
requires a running Lakekeeper, which CI doesn't have today. #4276
(draft) wires Lakekeeper into CI; once that lands I'll add the
integration test on top of it.
### Was this PR authored or co-authored using generative AI tooling?
Generated-by: Claude Code (Opus 4.7)
---------
Co-authored-by: Claude Opus 4.7 (1M context) <[email protected]>
---
.../core/storage/iceberg/iceberg_catalog_instance.py | 4 ----
.../main/python/core/storage/iceberg/iceberg_utils.py | 17 ++---------------
.../iceberg/test_iceberg_rest_catalog_integration.py | 4 ----
.../org/apache/texera/amber/util/IcebergUtil.scala | 17 +++++------------
.../org/apache/texera/amber/util/IcebergUtilSpec.scala | 10 ++++++++++
5 files changed, 17 insertions(+), 35 deletions(-)
diff --git
a/amber/src/main/python/core/storage/iceberg/iceberg_catalog_instance.py
b/amber/src/main/python/core/storage/iceberg/iceberg_catalog_instance.py
index 0059808f9f..65a4f6beec 100644
--- a/amber/src/main/python/core/storage/iceberg/iceberg_catalog_instance.py
+++ b/amber/src/main/python/core/storage/iceberg/iceberg_catalog_instance.py
@@ -60,10 +60,6 @@ class IcebergCatalogInstance:
"texera_iceberg",
StorageConfig.ICEBERG_REST_CATALOG_WAREHOUSE_NAME,
StorageConfig.ICEBERG_REST_CATALOG_URI,
- StorageConfig.S3_ENDPOINT,
- StorageConfig.S3_REGION,
- StorageConfig.S3_AUTH_USERNAME,
- StorageConfig.S3_AUTH_PASSWORD,
)
else:
raise ValueError(f"Unsupported catalog type: {catalog_type}")
diff --git a/amber/src/main/python/core/storage/iceberg/iceberg_utils.py
b/amber/src/main/python/core/storage/iceberg/iceberg_utils.py
index 844ef3e00f..c66ba74ba2 100644
--- a/amber/src/main/python/core/storage/iceberg/iceberg_utils.py
+++ b/amber/src/main/python/core/storage/iceberg/iceberg_utils.py
@@ -157,23 +157,15 @@ def create_rest_catalog(
catalog_name: str,
warehouse_name: str,
rest_uri: str,
- s3_endpoint: str,
- s3_region: str,
- s3_username: str,
- s3_password: str,
) -> Catalog:
"""
Creates a REST catalog instance by connecting to a REST endpoint.
- - Configures the catalog to interact with a REST endpoint.
- The warehouse_name parameter specifies the warehouse identifier.
- - Configures S3FileIO for MinIO/S3 storage backend.
+ - S3 settings (endpoint, region, credentials) are supplied by the REST
+ catalog server at runtime.
:param catalog_name: the name of the catalog.
:param warehouse_name: the warehouse identifier.
:param rest_uri: the URI of the REST catalog endpoint.
- :param s3_endpoint: the S3 endpoint URL.
- :param s3_region: the S3 region.
- :param s3_username: the S3 access key ID.
- :param s3_password: the S3 secret access key.
:return: a Catalog instance (REST catalog).
"""
return load_catalog(
@@ -182,11 +174,6 @@ def create_rest_catalog(
"type": "rest",
"uri": rest_uri,
"warehouse": warehouse_name,
- "s3.endpoint": s3_endpoint,
- "s3.access-key-id": s3_username,
- "s3.secret-access-key": s3_password,
- "s3.region": s3_region,
- "s3.path-style-access": "true",
},
)
diff --git
a/amber/src/test/python/core/storage/iceberg/test_iceberg_rest_catalog_integration.py
b/amber/src/test/python/core/storage/iceberg/test_iceberg_rest_catalog_integration.py
index 642fbb08e5..f023bd70d5 100644
---
a/amber/src/test/python/core/storage/iceberg/test_iceberg_rest_catalog_integration.py
+++
b/amber/src/test/python/core/storage/iceberg/test_iceberg_rest_catalog_integration.py
@@ -33,10 +33,6 @@ def rest_catalog():
catalog_name="rest_integration_test",
warehouse_name="texera",
rest_uri="http://localhost:8181/catalog/",
- s3_endpoint="http://localhost:9000",
- s3_region="us-west-2",
- s3_username="texera_minio",
- s3_password="password",
)
diff --git
a/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala
b/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala
index d6e406a53f..cee293a758 100644
---
a/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala
+++
b/common/workflow-core/src/main/scala/org/apache/texera/amber/util/IcebergUtil.scala
@@ -108,19 +108,12 @@ object IcebergUtil {
): RESTCatalog = {
val catalog = new RESTCatalog()
- // Build base properties map
- var properties = Map(
+ // S3 settings (endpoint, region, credentials) are supplied by the REST
+ // catalog server at runtime.
+ val properties = Map(
"warehouse" -> warehouse,
- CatalogProperties.URI -> StorageConfig.icebergRESTCatalogUri
- )
-
- properties = properties ++ Map(
- CatalogProperties.FILE_IO_IMPL -> classOf[S3FileIO].getName,
- "s3.endpoint" -> StorageConfig.s3Endpoint,
- "s3.access-key-id" -> StorageConfig.s3Username,
- "s3.secret-access-key" -> StorageConfig.s3Password,
- "s3.region" -> StorageConfig.s3Region,
- "s3.path-style-access" -> "true"
+ CatalogProperties.URI -> StorageConfig.icebergRESTCatalogUri,
+ CatalogProperties.FILE_IO_IMPL -> classOf[S3FileIO].getName
)
catalog.initialize(catalogName, properties.asJava)
diff --git
a/common/workflow-core/src/test/scala/org/apache/texera/amber/util/IcebergUtilSpec.scala
b/common/workflow-core/src/test/scala/org/apache/texera/amber/util/IcebergUtilSpec.scala
index da15a8060d..75a997cbb5 100644
---
a/common/workflow-core/src/test/scala/org/apache/texera/amber/util/IcebergUtilSpec.scala
+++
b/common/workflow-core/src/test/scala/org/apache/texera/amber/util/IcebergUtilSpec.scala
@@ -22,6 +22,7 @@ package org.apache.texera.amber.util
import org.apache.texera.amber.core.tuple.{AttributeType, LargeBinary, Schema,
Tuple}
import org.apache.texera.amber.util.IcebergUtil.toIcebergSchema
import org.apache.iceberg.data.GenericRecord
+import org.apache.iceberg.exceptions.RESTException
import org.apache.iceberg.types.Types
import org.apache.iceberg.{Schema => IcebergSchema}
import org.scalatest.flatspec.AnyFlatSpec
@@ -298,4 +299,13 @@ class IcebergUtilSpec extends AnyFlatSpec {
assert(IcebergUtil.fromRecord(record, schema) == tuple)
}
+
+ it should "surface RESTException when createRestCatalog cannot reach the
REST endpoint" in {
+ // Property Map is built before any network call. With or without
+ // Lakekeeper reachable, .initialize surfaces a RESTException — the
+ // failure is on the server side, not from Map composition.
+ intercept[RESTException] {
+ IcebergUtil.createRestCatalog("test", "non-existent-warehouse")
+ }
+ }
}