This is an automated email from the ASF dual-hosted git repository.
sivabalan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 6156b74414d5 feat(spark): Add HoodieSparkSqlUtils APIs for tooling
(#18202)
6156b74414d5 is described below
commit 6156b74414d58250557b691fc7ae5fd6e25d5ae4
Author: Surya Prasanna <[email protected]>
AuthorDate: Tue Feb 24 11:08:26 2026 -0800
feat(spark): Add HoodieSparkSqlUtils APIs for tooling (#18202)
---
.../java/org/apache/hudi/HoodieSparkSQLUtils.java | 118 +++++++++++++++++++++
.../hudi/utils/TestHoodieSparkSQLUtils.scala | 109 +++++++++++++++++++
2 files changed, 227 insertions(+)
diff --git
a/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/HoodieSparkSQLUtils.java
b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/HoodieSparkSQLUtils.java
new file mode 100644
index 000000000000..399dc5ec648f
--- /dev/null
+++
b/hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/HoodieSparkSQLUtils.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi;
+
+import org.apache.hudi.common.util.StringUtils;
+import org.apache.hudi.common.util.collection.Pair;
+import org.apache.hudi.exception.HoodieException;
+
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
+
+public class HoodieSparkSQLUtils {
+ private static final Logger LOG =
LoggerFactory.getLogger(HoodieSparkSQLUtils.class);
+
+ /**
+ * Returns basepath of Hudi table given the table name.
+ *
+ * @param jsc java spark context
+ * @param fullTableName this is full table name including database.
+ */
+ public static String getBasePathFromTableName(JavaSparkContext jsc, String
fullTableName) {
+ SparkSession sparkSession =
SparkSession.builder().enableHiveSupport().sparkContext(jsc.sc()).getOrCreate();
+ return getBasePathFromTableName(sparkSession, fullTableName);
+ }
+
+ /**
+ * Returns basepath of Hudi table given the table name.
+ *
+ * @param sparkSession spark session.
+ * @param fullTableName this is full table name including database.
+ */
+ public static String getBasePathFromTableName(SparkSession sparkSession,
String fullTableName) {
+ try {
+ return sparkSession.sql(String.format("desc formatted %s",
fullTableName))
+ .collectAsList()
+ .stream()
+ .filter(row -> "Location".equals(row.getString(0)))
+ .map(v -> v.getString(1))
+ .findFirst()
+ .get();
+ } catch (Exception e) {
+ throw new HoodieException(String.format("Failed to fetch basepath for:
%s", fullTableName), e);
+ }
+ }
+
+ public static List<Pair<String, String>>
loadHoodiePathsFromHive(SparkSession sparkSession, String database,
+ boolean
filterHudiDatasets) {
+ return loadHoodiePathsFromHive(sparkSession, database, filterHudiDatasets,
"");
+ }
+
+ /**
+ * Get valid hoodie base path by checking hive registered tables.
+ *
+ * @param sparkSession sparkSession
+ * @param database hive database to check
+ * @param filterHudiDatasets decides whether to consider only hudi datasets.
+ * @return List of a pair of (tablename, basepath)
+ */
+ public static List<Pair<String, String>>
loadHoodiePathsFromHive(SparkSession sparkSession, String database,
+ boolean
filterHudiDatasets, String tableNamePrefix) {
+ Dataset<Row> tables = sparkSession.sql("SHOW TABLES FROM " + database);
+ List<Row> tableRows = !StringUtils.isNullOrEmpty(tableNamePrefix)
+ ?
tables.filter(tables.col("tableName").startsWith(tableNamePrefix)).collectAsList()
+ : tables.collectAsList();
+ List<Pair<String, String>> validHoodiePaths = new ArrayList<>();
+ tableRows.forEach(tableRow -> {
+ String fullTableName = null;
+ try {
+ fullTableName = tableRow.getString(0) + "." + tableRow.getString(1);
+ LOG.info("Table name {}", fullTableName);
+ List<Row> rows = sparkSession.sql(String.format("desc formatted %s",
fullTableName))
+ .collectAsList()
+ .stream()
+ .filter(row -> "Location".equals(row.getString(0)) ||
"InputFormat".equals(row.getString(0)))
+ .collect(Collectors.toList());
+ boolean skipDataset = false;
+ if (filterHudiDatasets) {
+ skipDataset = rows.stream().noneMatch(row ->
"InputFormat".equals(row.getString(0))
+ && row.getString(1).startsWith("Hoodie"));
+ }
+ if (!skipDataset) {
+ String basepath = rows.stream().filter(row ->
"Location".equals(row.getString(0)))
+ .map(v -> v.getString(1))
+ .findFirst()
+ .get();
+ validHoodiePaths.add(Pair.of(fullTableName, basepath));
+ }
+ } catch (Exception e) {
+ LOG.error("Exception in fetching basepath for table {}",
fullTableName, e);
+ }
+ });
+ return validHoodiePaths;
+ }
+}
diff --git
a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/utils/TestHoodieSparkSQLUtils.scala
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/utils/TestHoodieSparkSQLUtils.scala
new file mode 100644
index 000000000000..0406a9523347
--- /dev/null
+++
b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/utils/TestHoodieSparkSQLUtils.scala
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.utils
+
+import org.apache.hudi.HoodieSparkSQLUtils
+
+import org.apache.spark.api.java.JavaSparkContext
+import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase
+import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue}
+
+import scala.collection.JavaConverters._
+
/**
 * Tests for [[HoodieSparkSQLUtils]]: base-path lookup by table name and
 * Hive-backed table/path discovery.
 */
class TestHoodieSparkSQLUtils extends HoodieSparkSqlTestBase {

  test("Test getBasePathFromTableName APIs") {
    withTable(generateTableName) { tableName =>
      val qualifiedName = s"default.$tableName"
      createHudiTable(tableName)

      // The location reported by "desc formatted" is the expected base path
      // for both the SparkSession and the JavaSparkContext overloads.
      val locationFromDesc = getTableLocation(qualifiedName)
      assertEquals(locationFromDesc, HoodieSparkSQLUtils.getBasePathFromTableName(spark, qualifiedName))

      val javaSparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext)
      assertEquals(locationFromDesc, HoodieSparkSQLUtils.getBasePathFromTableName(javaSparkContext, qualifiedName))
    }
  }

  test("Test loadHoodiePathsFromHive APIs with and without prefix") {
    val firstTable = generateTableName
    val secondTable = generateTableName
    val unrelatedTable = generateTableName
    val prefix = firstTable.take(6)

    withTable(firstTable) { _ =>
      withTable(secondTable) { _ =>
        withTable(unrelatedTable) { _ =>
          Seq(firstTable, secondTable, unrelatedTable).foreach(createHudiTable)

          val expectedNames =
            Seq(firstTable, secondTable, unrelatedTable).map(t => s"default.$t")

          // Without a prefix, every created table must be discovered.
          val discovered = HoodieSparkSQLUtils
            .loadHoodiePathsFromHive(spark, "default", false)
            .asScala
            .map(_.getLeft)
            .toSet
          expectedNames.foreach(name => assertTrue(discovered.contains(name)))

          // With a prefix, every returned table name must start with it.
          val prefixed = HoodieSparkSQLUtils
            .loadHoodiePathsFromHive(spark, "default", false, prefix)
            .asScala
          assertTrue(prefixed.forall(_.getLeft.split("\\.")(1).startsWith(prefix)))
        }
      }
    }
  }

  test("Test loadHoodiePathsFromHive filterHudiDatasets mode") {
    withTable(generateTableName) { tableName =>
      createHudiTable(tableName)
      val hudiOnly =
        HoodieSparkSQLUtils.loadHoodiePathsFromHive(spark, "default", true).asScala
      // The current filter checks InputFormat value with startsWith("Hoodie").
      assertTrue(hudiOnly.isEmpty)
    }
  }

  /** Creates a small Hudi table and writes a single row into it. */
  private def createHudiTable(tableName: String): Unit = {
    spark.sql(
      s"""
         | create table $tableName (
         |   id int,
         |   name string,
         |   ts long
         | ) using hudi
         | tblproperties (
         |   primaryKey = 'id',
         |   orderingFields = 'ts'
         | )
         |""".stripMargin)
    spark.sql(s"insert into $tableName values (1, 'a1', 1000)")
  }

  /** Reads the table's "Location" value straight from "desc formatted" output. */
  private def getTableLocation(fullTableName: String): String = {
    spark.sql(s"desc formatted $fullTableName")
      .collect()
      .collectFirst { case row if "Location".equals(row.getString(0)) => row.getString(1) }
      .get
  }
}