gh-yzou commented on code in PR #1862:
URL: https://github.com/apache/polaris/pull/1862#discussion_r2178168915
##########
plugins/spark/v3.5/integration/build.gradle.kts:
##########
@@ -60,12 +60,51 @@ dependencies {
exclude("org.apache.logging.log4j", "log4j-core")
exclude("org.slf4j", "jul-to-slf4j")
}
+
+ // Add spark-hive for Hudi integration - provides HiveExternalCatalog that Hudi needs
+ testImplementation("org.apache.spark:spark-hive_${scalaVersion}:${spark35Version}") {
+ // exclude log4j dependencies to match spark-sql exclusions
Review Comment:
Can we remove the spark-sql dependency above?
##########
plugins/spark/v3.5/integration/src/intTest/resources/logback.xml:
##########
@@ -32,6 +32,9 @@ out the configuration if you would like ot see all spark debug log during the ru
</encoder>
</appender>
+ <!-- Hudi-specific loggers for test -->
+ <logger name="org.apache.hudi" level="INFO"/>
Review Comment:
Does Hudi output a lot of logs? Too much logging was hurting test efficiency, so we only turned on error-level logging here. How long does the integration test take now?
##########
plugins/spark/v3.5/spark/build.gradle.kts:
##########
@@ -46,6 +46,47 @@ dependencies {
// TODO: extract a polaris-rest module as a thin layer for
// client to depends on.
implementation(project(":polaris-core")) { isTransitive = false }
+ implementation(project(":polaris-api-iceberg-service")) {
Review Comment:
Why does Hudi need these dependencies?
##########
plugins/spark/v3.5/spark/src/main/java/org/apache/polaris/spark/utils/PolarisCatalogUtils.java:
##########
@@ -87,6 +119,32 @@ public static Table loadSparkTable(GenericTable genericTable) {
        provider, new CaseInsensitiveStringMap(tableProperties), scala.Option.empty());
}
+  public static Table loadHudiSparkTable(GenericTable genericTable, Identifier identifier) {
+    SparkSession sparkSession = SparkSession.active();
+    Map<String, String> tableProperties = Maps.newHashMap();
+    tableProperties.putAll(genericTable.getProperties());
+    tableProperties.put(
+        TABLE_PATH_KEY, genericTable.getProperties().get(TableCatalog.PROP_LOCATION));
+    String namespacePath = String.join(".", identifier.namespace());
+    TableIdentifier tableIdentifier =
+        new TableIdentifier(identifier.name(), Option.apply(namespacePath));
+    CatalogTable catalogTable = null;
+    try {
+      catalogTable = sparkSession.sessionState().catalog().getTableMetadata(tableIdentifier);
+    } catch (NoSuchDatabaseException e) {
+      throw new RuntimeException(
+          "No database found for the given tableIdentifier:" + tableIdentifier, e);
+    } catch (NoSuchTableException e) {
+      LOG.debug("No table currently exists, as an initial create table");
+    }
+    return new HoodieInternalV2Table(
Review Comment:
Does the Hudi catalog provide any load-table functionality that also reads the Hudi logs? If yes, we can load the Hudi catalog here and call those functions; it would be better if we could avoid any table-format-specific dependency in the client.
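
For example, something like this minimal sketch is what I have in mind (it assumes the Hudi catalog's loadTable can resolve the table from its own .hoodie metadata, and that the HudiHelper / PolarisSparkCatalog wiring used in SparkCatalog.java can be handed to this utility; the method shape is illustrative, not a drop-in, and imports are omitted):

```java
// Sketch only: delegate loading to the Hudi catalog instead of constructing a
// HoodieInternalV2Table in the Polaris client.
public static Table loadHudiSparkTable(
    HudiHelper hudiHelper, PolarisSparkCatalog polarisSparkCatalog, Identifier identifier)
    throws NoSuchTableException {
  TableCatalog hudiCatalog = hudiHelper.loadHudiCatalog(polarisSparkCatalog);
  return hudiCatalog.loadTable(identifier);
}
```

That way the client would not need any Hudi-specific classes at compile time.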
##########
plugins/spark/v3.5/spark/build.gradle.kts:
##########
@@ -46,6 +46,47 @@ dependencies {
// TODO: extract a polaris-rest module as a thin layer for
// client to depends on.
implementation(project(":polaris-core")) { isTransitive = false }
+ implementation(project(":polaris-api-iceberg-service")) {
+ // exclude the iceberg dependencies, use the ones pulled
+ // by iceberg-core
+ exclude("org.apache.iceberg", "*")
+ // exclude all cloud and quarkus specific dependencies to avoid
+ // running into problems with signature files.
+ exclude("com.azure", "*")
+ exclude("software.amazon.awssdk", "*")
+ exclude("com.google.cloud", "*")
+ exclude("io.airlift", "*")
+ exclude("io.smallrye", "*")
+ exclude("io.smallrye.common", "*")
+ exclude("io.swagger", "*")
+ exclude("org.apache.commons", "*")
+ }
+ implementation(project(":polaris-api-catalog-service")) {
+ exclude("org.apache.iceberg", "*")
+ exclude("com.azure", "*")
+ exclude("software.amazon.awssdk", "*")
+ exclude("com.google.cloud", "*")
+ exclude("io.airlift", "*")
+ exclude("io.smallrye", "*")
+ exclude("io.smallrye.common", "*")
+ exclude("io.swagger", "*")
+ exclude("org.apache.commons", "*")
+ }
+ implementation(project(":polaris-core")) {
+ exclude("org.apache.iceberg", "*")
+ exclude("com.azure", "*")
+ exclude("software.amazon.awssdk", "*")
+ exclude("com.google.cloud", "*")
+ exclude("io.airlift", "*")
+ exclude("io.smallrye", "*")
+ exclude("io.smallrye.common", "*")
+ exclude("io.swagger", "*")
+ exclude("org.apache.commons", "*")
+ }
+
+ implementation("org.apache.iceberg:iceberg-core:${icebergVersion}")
+ compileOnly("org.apache.hudi:hudi-spark3.5-bundle_${scalaVersion}:0.15.0")
Review Comment:
I don't think we need to depend on the Hudi bundle here; we can handle it similarly to how we handle Delta.
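
For reference, a minimal sketch of the Delta-style approach I mean (the class name, helper shape, and error handling are assumptions on my side; the point is that the client only needs the Hudi catalog implementation on the runtime classpath, not as a compile dependency):

```java
import org.apache.spark.sql.connector.catalog.TableCatalog;

public final class HudiCatalogLoader {
  // Sketch: instantiate the Hudi catalog by class name at runtime, the same way the
  // Delta path avoids a compile-time dependency on delta-spark. The class name below
  // is what the Hudi 0.15 Spark bundle ships, but treat it as an assumption.
  static TableCatalog loadHudiCatalog() {
    try {
      Class<?> catalogClass = Class.forName("org.apache.spark.sql.hudi.catalog.HoodieCatalog");
      return (TableCatalog) catalogClass.getDeclaredConstructor().newInstance();
    } catch (ReflectiveOperationException e) {
      throw new IllegalStateException(
          "Hudi catalog not found on the classpath; provide the Hudi Spark bundle at runtime", e);
    }
  }
}
```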
##########
plugins/spark/v3.5/spark/build.gradle.kts:
##########
@@ -46,6 +46,47 @@ dependencies {
// TODO: extract a polaris-rest module as a thin layer for
// client to depends on.
implementation(project(":polaris-core")) { isTransitive = false }
+ implementation(project(":polaris-api-iceberg-service")) {
+ // exclude the iceberg dependencies, use the ones pulled
+ // by iceberg-core
+ exclude("org.apache.iceberg", "*")
+ // exclude all cloud and quarkus specific dependencies to avoid
+ // running into problems with signature files.
+ exclude("com.azure", "*")
+ exclude("software.amazon.awssdk", "*")
+ exclude("com.google.cloud", "*")
+ exclude("io.airlift", "*")
+ exclude("io.smallrye", "*")
+ exclude("io.smallrye.common", "*")
+ exclude("io.swagger", "*")
+ exclude("org.apache.commons", "*")
+ }
+ implementation(project(":polaris-api-catalog-service")) {
+ exclude("org.apache.iceberg", "*")
+ exclude("com.azure", "*")
+ exclude("software.amazon.awssdk", "*")
+ exclude("com.google.cloud", "*")
+ exclude("io.airlift", "*")
+ exclude("io.smallrye", "*")
+ exclude("io.smallrye.common", "*")
+ exclude("io.swagger", "*")
+ exclude("org.apache.commons", "*")
+ }
+ implementation(project(":polaris-core")) {
+ exclude("org.apache.iceberg", "*")
+ exclude("com.azure", "*")
+ exclude("software.amazon.awssdk", "*")
+ exclude("com.google.cloud", "*")
+ exclude("io.airlift", "*")
+ exclude("io.smallrye", "*")
+ exclude("io.smallrye.common", "*")
+ exclude("io.swagger", "*")
+ exclude("org.apache.commons", "*")
+ }
+
+ implementation("org.apache.iceberg:iceberg-core:${icebergVersion}")
Review Comment:
We do not intend to depend on iceberg-core; it causes Spark compatibility issues.
##########
plugins/spark/v3.5/spark/src/main/java/org/apache/polaris/spark/SparkCatalog.java:
##########
@@ -270,18 +286,24 @@ public Map<String, String> loadNamespaceMetadata(String[] namespace)
public void createNamespace(String[] namespace, Map<String, String> metadata)
throws NamespaceAlreadyExistsException {
this.icebergsSparkCatalog.createNamespace(namespace, metadata);
+ HudiCatalogUtils.createNamespace(namespace, metadata);
Review Comment:
What is the difference between Delta and Hudi here? I assume the Hudi catalog will be used as the Spark session catalog and called directly for namespace operations.
##########
plugins/spark/v3.5/spark/src/main/java/org/apache/polaris/spark/utils/HudiCatalogUtils.java:
##########
@@ -0,0 +1,220 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.polaris.spark.utils;
+
+import java.util.Map;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.connector.catalog.NamespaceChange;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Utility class for Hudi-specific catalog operations, particularly namespace synchronization
+ * between Polaris catalog and Spark session catalog for Hudi compatibility.
+ *
+ * <p>Hudi table loading requires namespace validation through the session catalog, but only the
+ * Polaris catalog contains the actual namespace metadata. This class provides methods to
+ * synchronize namespace operations to maintain consistency between catalogs.
+ */
+public class HudiCatalogUtils {
+  private static final Logger LOG = LoggerFactory.getLogger(HudiCatalogUtils.class);
+
+ /**
+ * Synchronizes namespace creation to session catalog when Hudi extension is enabled. This ensures
Review Comment:
@rahil-c this part of the change is actually unexpected to me; I may need some more time to understand the motivation for it. Can you update the project README with instructions on how to use Hudi? I want to check out the code and try it out. Thanks!
##########
plugins/spark/v3.5/integration/build.gradle.kts:
##########
@@ -60,12 +60,51 @@ dependencies {
exclude("org.apache.logging.log4j", "log4j-core")
exclude("org.slf4j", "jul-to-slf4j")
}
+
+ // Add spark-hive for Hudi integration - provides HiveExternalCatalog that Hudi needs
+ testImplementation("org.apache.spark:spark-hive_${scalaVersion}:${spark35Version}") {
+ // exclude log4j dependencies to match spark-sql exclusions
+ exclude("org.apache.logging.log4j", "log4j-slf4j2-impl")
+ exclude("org.apache.logging.log4j", "log4j-1.2-api")
+ exclude("org.apache.logging.log4j", "log4j-core")
+ exclude("org.slf4j", "jul-to-slf4j")
+ // exclude old slf4j 1.x to log4j 2.x bridge that conflicts with slf4j 2.x bridge
+ exclude("org.apache.logging.log4j", "log4j-slf4j-impl")
+ }
// enforce the usage of log4j 2.24.3. This is for the log4j-api compatibility
// of spark-sql dependency
testRuntimeOnly("org.apache.logging.log4j:log4j-core:2.24.3")
testRuntimeOnly("org.apache.logging.log4j:log4j-slf4j2-impl:2.24.3")
testImplementation("io.delta:delta-spark_${scalaVersion}:3.3.1")
+ testImplementation("org.apache.hudi:hudi-spark3.5-bundle_${scalaVersion}:0.15.0") {
+ // exclude log4j dependencies to match spark-sql exclusions and prevent version conflicts
Review Comment:
Does the bundle already contain all the Spark dependencies needed? If that is the case, we shouldn't need the spark-hive dependency anymore, right?
##########
plugins/spark/v3.5/spark/src/main/java/org/apache/polaris/spark/SparkCatalog.java:
##########
@@ -156,15 +160,23 @@ public Table createTable(
      throw new UnsupportedOperationException(
          "Create table without location key is not supported by Polaris. Please provide location or path on table creation.");
}
-
if (PolarisCatalogUtils.useDelta(provider)) {
// For delta table, we load the delta catalog to help dealing with the
// delta log creation.
      TableCatalog deltaCatalog = deltaHelper.loadDeltaCatalog(this.polarisSparkCatalog);
return deltaCatalog.createTable(ident, schema, transforms, properties);
- } else {
-      return this.polarisSparkCatalog.createTable(ident, schema, transforms, properties);
}
+ if (PolarisCatalogUtils.useHudi(provider)) {
+ // First make a call via polaris's spark catalog
+ // to ensure an entity is created within the catalog and is authorized
+ polarisSparkCatalog.createTable(ident, schema, transforms, properties);
+
+ // Then for actually creating the hudi table, we load HoodieCatalog
+ // to create the .hoodie folder in cloud storage
+      TableCatalog hudiCatalog = hudiHelper.loadHudiCatalog(this.polarisSparkCatalog);
Review Comment:
It might be better to first make sure hudiCatalog.createTable succeeds, and then create the table with the remote service.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]