gh-yzou commented on code in PR #1862:
URL: https://github.com/apache/polaris/pull/1862#discussion_r2178168915
##########
plugins/spark/v3.5/integration/build.gradle.kts:
##########
@@ -60,12 +60,51 @@ dependencies {
exclude("org.apache.logging.log4j", "log4j-core")
exclude("org.slf4j", "jul-to-slf4j")
}
+
+ // Add spark-hive for Hudi integration - provides HiveExternalCatalog that Hudi needs
+ testImplementation("org.apache.spark:spark-hive_${scalaVersion}:${spark35Version}") {
+ // exclude log4j dependencies to match spark-sql exclusions
Review Comment:
Can we remove the spark-sql dependency above?
##########
plugins/spark/v3.5/integration/src/intTest/resources/logback.xml:
##########
@@ -32,6 +32,9 @@ out the configuration if you would like ot see all spark debug log during the ru
</encoder>
</appender>
+ <!-- Hudi-specific loggers for test -->
+ <logger name="org.apache.hudi" level="INFO"/>
Review Comment:
Does Hudi output a lot of logs? Too much logging was hurting test efficiency, so we only turned on error-level logging here. How long does the integration test take now?
##########
plugins/spark/v3.5/spark/build.gradle.kts:
##########
@@ -46,6 +46,47 @@ dependencies {
// TODO: extract a polaris-rest module as a thin layer for
// client to depends on.
implementation(project(":polaris-core")) { isTransitive = false }
+ implementation(project(":polaris-api-iceberg-service")) {
Review Comment:
Why does Hudi need these dependencies?
##########
plugins/spark/v3.5/spark/src/main/java/org/apache/polaris/spark/utils/PolarisCatalogUtils.java:
##########
@@ -87,6 +119,32 @@ public static Table loadSparkTable(GenericTable genericTable) {
        provider, new CaseInsensitiveStringMap(tableProperties), scala.Option.empty());
}
+  public static Table loadHudiSparkTable(GenericTable genericTable, Identifier identifier) {
+    SparkSession sparkSession = SparkSession.active();
+    Map<String, String> tableProperties = Maps.newHashMap();
+    tableProperties.putAll(genericTable.getProperties());
+    tableProperties.put(
+        TABLE_PATH_KEY, genericTable.getProperties().get(TableCatalog.PROP_LOCATION));
+    String namespacePath = String.join(".", identifier.namespace());
+    TableIdentifier tableIdentifier =
+        new TableIdentifier(identifier.name(), Option.apply(namespacePath));
+    CatalogTable catalogTable = null;
+    try {
+      catalogTable = sparkSession.sessionState().catalog().getTableMetadata(tableIdentifier);
+    } catch (NoSuchDatabaseException e) {
+      throw new RuntimeException(
+          "No database found for the given tableIdentifier:" + tableIdentifier, e);
+    } catch (NoSuchTableException e) {
+      LOG.debug("No table currently exists, as an initial create table");
+    }
+    return new HoodieInternalV2Table(
Review Comment:
Does the Hudi catalog provide any load-table functionality that also reads the Hudi logs? If yes, we can load the Hudi catalog here and call those functions; it would be better if we could avoid any table-format-specific dependency in the client.
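
For example, something like this minimal sketch is what I have in mind (it assumes the Hudi catalog's loadTable can resolve the table from its own .hoodie metadata, and that the HudiHelper / PolarisSparkCatalog wiring used in SparkCatalog.java can be handed to this utility; the method shape is illustrative, not a drop-in, and imports are omitted):

```java
// Sketch only: delegate loading to the Hudi catalog instead of constructing a
// HoodieInternalV2Table in the Polaris client.
public static Table loadHudiSparkTable(
    HudiHelper hudiHelper, PolarisSparkCatalog polarisSparkCatalog, Identifier identifier)
    throws NoSuchTableException {
  TableCatalog hudiCatalog = hudiHelper.loadHudiCatalog(polarisSparkCatalog);
  return hudiCatalog.loadTable(identifier);
}
```

That way the client would not need any Hudi-specific classes at compile time.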
##########
plugins/spark/v3.5/spark/build.gradle.kts:
##########
@@ -46,6 +46,47 @@ dependencies {
// TODO: extract a polaris-rest module as a thin layer for
// client to depends on.
implementation(project(":polaris-core")) { isTransitive = false }
+ implementation(project(":polaris-api-iceberg-service")) {
+ // exclude the iceberg dependencies, use the ones pulled
+ // by iceberg-core
+ exclude("org.apache.iceberg", "*")
+ // exclude all cloud and quarkus specific dependencies to avoid
+ // running into problems with signature files.
+ exclude("com.azure", "*")
+ exclude("software.amazon.awssdk", "*")
+ exclude("com.google.cloud", "*")
+ exclude("io.airlift", "*")
+ exclude("io.smallrye", "*")
+ exclude("io.smallrye.common", "*")
+ exclude("io.swagger", "*")
+ exclude("org.apache.commons", "*")
+ }
+ implementation(project(":polaris-api-catalog-service")) {
+ exclude("org.apache.iceberg", "*")
+ exclude("com.azure", "*")
+ exclude("software.amazon.awssdk", "*")
+ exclude("com.google.cloud", "*")
+ exclude("io.airlift", "*")
+ exclude("io.smallrye", "*")
+ exclude("io.smallrye.common", "*")
+ exclude("io.swagger", "*")
+ exclude("org.apache.commons", "*")
+ }
+ implementation(project(":polaris-core")) {
+ exclude("org.apache.iceberg", "*")
+ exclude("com.azure", "*")
+ exclude("software.amazon.awssdk", "*")
+ exclude("com.google.cloud", "*")
+ exclude("io.airlift", "*")
+ exclude("io.smallrye", "*")
+ exclude("io.smallrye.common", "*")
+ exclude("io.swagger", "*")
+ exclude("org.apache.commons", "*")
+ }
+
+ implementation("org.apache.iceberg:iceberg-core:${icebergVersion}")
+ compileOnly("org.apache.hudi:hudi-spark3.5-bundle_${scalaVersion}:0.15.0")
Review Comment:
I don't think we need to depend on the Hudi bundle here; we can handle it similarly to how we handle Delta.
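
For reference, a minimal sketch of the Delta-style approach I mean (the class name, helper shape, and error handling are assumptions on my side; the point is that the client only needs the Hudi catalog implementation on the runtime classpath, not as a compile dependency):

```java
import org.apache.spark.sql.connector.catalog.TableCatalog;

public final class HudiCatalogLoader {
  // Sketch: instantiate the Hudi catalog by class name at runtime, the same way the
  // Delta path avoids a compile-time dependency on delta-spark. The class name below
  // is what the Hudi 0.15 Spark bundle ships, but treat it as an assumption.
  static TableCatalog loadHudiCatalog() {
    try {
      Class<?> catalogClass = Class.forName("org.apache.spark.sql.hudi.catalog.HoodieCatalog");
      return (TableCatalog) catalogClass.getDeclaredConstructor().newInstance();
    } catch (ReflectiveOperationException e) {
      throw new IllegalStateException(
          "Hudi catalog not found on the classpath; provide the Hudi Spark bundle at runtime", e);
    }
  }
}
```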
##########
plugins/spark/v3.5/spark/build.gradle.kts:
##########
@@ -46,6 +46,47 @@ dependencies {
// TODO: extract a polaris-rest module as a thin layer for
// client to depends on.
implementation(project(":polaris-core")) { isTransitive = false }
+ implementation(project(":polaris-api-iceberg-service")) {
+ // exclude the iceberg dependencies, use the ones pulled
+ // by iceberg-core
+ exclude("org.apache.iceberg", "*")
+ // exclude all cloud and quarkus specific dependencies to avoid
+ // running into problems with signature files.
+ exclude("com.azure", "*")
+ exclude("software.amazon.awssdk", "*")
+ exclude("com.google.cloud", "*")
+ exclude("io.airlift", "*")
+ exclude("io.smallrye", "*")
+ exclude("io.smallrye.common", "*")
+ exclude("io.swagger", "*")
+ exclude("org.apache.commons", "*")
+ }
+ implementation(project(":polaris-api-catalog-service")) {
+ exclude("org.apache.iceberg", "*")
+ exclude("com.azure", "*")
+ exclude("software.amazon.awssdk", "*")
+ exclude("com.google.cloud", "*")
+ exclude("io.airlift", "*")
+ exclude("io.smallrye", "*")
+ exclude("io.smallrye.common", "*")
+ exclude("io.swagger", "*")
+ exclude("org.apache.commons", "*")
+ }
+ implementation(project(":polaris-core")) {
+ exclude("org.apache.iceberg", "*")
+ exclude("com.azure", "*")
+ exclude("software.amazon.awssdk", "*")
+ exclude("com.google.cloud", "*")
+ exclude("io.airlift", "*")
+ exclude("io.smallrye", "*")
+ exclude("io.smallrye.common", "*")
+ exclude("io.swagger", "*")
+ exclude("org.apache.commons", "*")
+ }
+
+ implementation("org.apache.iceberg:iceberg-core:${icebergVersion}")
Review Comment:
We do not intend to depend on iceberg-core; it causes Spark compatibility issues.
##########
plugins/spark/v3.5/spark/src/main/java/org/apache/polaris/spark/SparkCatalog.java:
##########
@@ -270,18 +286,24 @@ public Map<String, String> loadNamespaceMetadata(String[] namespace)
public void createNamespace(String[] namespace, Map<String, String> metadata)
throws NamespaceAlreadyExistsException {
this.icebergsSparkCatalog.createNamespace(namespace, metadata);
+ HudiCatalogUtils.createNamespace(namespace, metadata);
Review Comment:
What is the difference between Delta and Hudi here? I assume the Hudi catalog will be used as the Spark session catalog and called directly for namespace operations.
##########
plugins/spark/v3.5/spark/src/main/java/org/apache/polaris/spark/utils/HudiCatalogUtils.java:
##########
@@ -0,0 +1,220 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.polaris.spark.utils;
+
+import java.util.Map;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.connector.catalog.NamespaceChange;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Utility class for Hudi-specific catalog operations, particularly namespace synchronization
+ * between Polaris catalog and Spark session catalog for Hudi compatibility.
+ *
+ * <p>Hudi table loading requires namespace validation through the session catalog, but only the
+ * Polaris catalog contains the actual namespace metadata. This class provides methods to
+ * synchronize namespace operations to maintain consistency between catalogs.
+ */
+public class HudiCatalogUtils {
+  private static final Logger LOG = LoggerFactory.getLogger(HudiCatalogUtils.class);
+
+ /**
+ * Synchronizes namespace creation to session catalog when Hudi extension is enabled. This ensures
Review Comment:
@rahil-c this part of the change is actually unexpected to me; I may need some more time to understand the motivation for it. Can you update the project README with instructions on how to use Hudi? I want to check out the code and try it out. Thanks!
##########
plugins/spark/v3.5/integration/build.gradle.kts:
##########
@@ -60,12 +60,51 @@ dependencies {
exclude("org.apache.logging.log4j", "log4j-core")
exclude("org.slf4j", "jul-to-slf4j")
}
+
+ // Add spark-hive for Hudi integration - provides HiveExternalCatalog that Hudi needs
+ testImplementation("org.apache.spark:spark-hive_${scalaVersion}:${spark35Version}") {
+ // exclude log4j dependencies to match spark-sql exclusions
+ exclude("org.apache.logging.log4j", "log4j-slf4j2-impl")
+ exclude("org.apache.logging.log4j", "log4j-1.2-api")
+ exclude("org.apache.logging.log4j", "log4j-core")
+ exclude("org.slf4j", "jul-to-slf4j")
+ // exclude old slf4j 1.x to log4j 2.x bridge that conflicts with slf4j 2.x bridge
+ exclude("org.apache.logging.log4j", "log4j-slf4j-impl")
+ }
// enforce the usage of log4j 2.24.3. This is for the log4j-api compatibility
// of spark-sql dependency
testRuntimeOnly("org.apache.logging.log4j:log4j-core:2.24.3")
testRuntimeOnly("org.apache.logging.log4j:log4j-slf4j2-impl:2.24.3")
testImplementation("io.delta:delta-spark_${scalaVersion}:3.3.1")
+ testImplementation("org.apache.hudi:hudi-spark3.5-bundle_${scalaVersion}:0.15.0") {
+ // exclude log4j dependencies to match spark-sql exclusions and prevent version conflicts
Review Comment:
Does the bundle already contain all the Spark dependencies needed? If that is the case, we shouldn't need the spark-hive dependency anymore, right?
##########
plugins/spark/v3.5/spark/src/main/java/org/apache/polaris/spark/SparkCatalog.java:
##########
@@ -156,15 +160,23 @@ public Table createTable(
      throw new UnsupportedOperationException(
          "Create table without location key is not supported by Polaris. Please provide location or path on table creation.");
}
-
if (PolarisCatalogUtils.useDelta(provider)) {
// For delta table, we load the delta catalog to help dealing with the
// delta log creation.
      TableCatalog deltaCatalog = deltaHelper.loadDeltaCatalog(this.polarisSparkCatalog);
return deltaCatalog.createTable(ident, schema, transforms, properties);
- } else {
-      return this.polarisSparkCatalog.createTable(ident, schema, transforms, properties);
}
+ if (PolarisCatalogUtils.useHudi(provider)) {
+ // First make a call via polaris's spark catalog
+ // to ensure an entity is created within the catalog and is authorized
+ polarisSparkCatalog.createTable(ident, schema, transforms, properties);
+
+ // Then for actually creating the hudi table, we load HoodieCatalog
+ // to create the .hoodie folder in cloud storage
+      TableCatalog hudiCatalog = hudiHelper.loadHudiCatalog(this.polarisSparkCatalog);
Review Comment:
It might be better to first make sure hudiCatalog.createTable succeeds, and then create the table with the remote service.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]