This is an automated email from the ASF dual-hosted git repository.
lzljs3620320 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
     new bd5b88c498 [hive] Fix clone Hudi for table written by old Hudi version (#5794)
bd5b88c498 is described below
commit bd5b88c4984b11fad6611af9dd43d601f4ae23ff
Author: timmyyao <[email protected]>
AuthorDate: Thu Jun 26 13:55:34 2025 +0800
    [hive] Fix clone Hudi for table written by old Hudi version (#5794)
---
.../java/org/apache/paimon/hudi/HudiFileIndex.java | 7 +++----
.../apache/paimon/hudi/HudiHiveCloneExtractor.java | 24 +++++++++++++++-------
2 files changed, 20 insertions(+), 11 deletions(-)
diff --git a/paimon-hudi/src/main/java/org/apache/paimon/hudi/HudiFileIndex.java b/paimon-hudi/src/main/java/org/apache/paimon/hudi/HudiFileIndex.java
index 792228aa2a..1d9444a5d4 100644
--- a/paimon-hudi/src/main/java/org/apache/paimon/hudi/HudiFileIndex.java
+++ b/paimon-hudi/src/main/java/org/apache/paimon/hudi/HudiFileIndex.java
@@ -181,18 +181,17 @@ public class HudiFileIndex {
 
     private List<String> extractPartitionValues(String partitionPath) {
         String[] paths = partitionPath.split(StoragePath.SEPARATOR);
-        // TODO: because we only support table in HMS, the partition is hive-style by default
-        // pt1=v1/pt2=v2
         List<String> partitionValues = new ArrayList<>();
         Arrays.stream(paths)
                 .forEach(
                         p -> {
                             String[] kv = p.split("=");
                             if (kv.length == 2) {
+                                // partition path in the form of pt1=v1/pt2=v2
                                 partitionValues.add(kv[1]);
                             } else {
-                                throw new RuntimeException(
-                                        "Wrong hudi partition path: " + partitionPath);
+                                // partition path in the form of v1/v2
+                                partitionValues.add(kv[0]);
                             }
                         });
         return partitionValues;
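
For context, here is a minimal standalone sketch of the new partition-path handling. It is an illustration only, not the actual Paimon class: the separator is hard-coded as "/" (what Hudi's StoragePath.SEPARATOR resolves to), and the class name and main method are hypothetical.

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;

    public class PartitionPathSketch {

        // Mirrors the patched logic: hive-style segments (pt1=v1) contribute
        // the value part, while plain segments (v1), as written by old Hudi
        // versions, are now kept as-is instead of triggering the previous
        // RuntimeException.
        static List<String> extractPartitionValues(String partitionPath) {
            List<String> partitionValues = new ArrayList<>();
            Arrays.stream(partitionPath.split("/"))
                    .forEach(
                            p -> {
                                String[] kv = p.split("=");
                                if (kv.length == 2) {
                                    partitionValues.add(kv[1]); // pt1=v1/pt2=v2
                                } else {
                                    partitionValues.add(kv[0]); // v1/v2
                                }
                            });
            return partitionValues;
        }

        public static void main(String[] args) {
            System.out.println(extractPartitionValues("pt1=v1/pt2=v2")); // [v1, v2]
            System.out.println(extractPartitionValues("v1/v2"));         // [v1, v2]
        }
    }
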
diff --git a/paimon-hudi/src/main/java/org/apache/paimon/hudi/HudiHiveCloneExtractor.java b/paimon-hudi/src/main/java/org/apache/paimon/hudi/HudiHiveCloneExtractor.java
index 1d54bab0dc..9eacb127bf 100644
--- a/paimon-hudi/src/main/java/org/apache/paimon/hudi/HudiHiveCloneExtractor.java
+++ b/paimon-hudi/src/main/java/org/apache/paimon/hudi/HudiHiveCloneExtractor.java
@@ -48,12 +48,26 @@ import static org.apache.paimon.utils.Preconditions.checkArgument;
 
 /** A {@link HiveCloneExtractor} for Hudi tables. */
 public class HudiHiveCloneExtractor extends HiveTableCloneExtractor {
     private static final Logger LOG = LoggerFactory.getLogger(HudiHiveCloneExtractor.class);
 
+    private static final Set<String> HUDI_METADATA_FIELDS =
+            Arrays.stream(HoodieRecord.HoodieMetadataField.values())
+                    .map(HoodieRecord.HoodieMetadataField::getFieldName)
+                    .collect(Collectors.toSet());
+
     @Override
     public boolean matches(Table table) {
-        return table.getParameters()
+        if (table.getParameters()
                 .getOrDefault("spark.sql.sources.provider", "")
-                .equalsIgnoreCase("hudi");
+                .equalsIgnoreCase("hudi")) {
+            return true;
+        }
+        // For Hudi versions < 0.9, there is no spark-sql support, so we need to check Hudi
+        // metadata fields to determine whether it is a Hudi table.
+        for (FieldSchema field : table.getSd().getCols()) {
+            if (HUDI_METADATA_FIELDS.contains(field.getName())) {
+                return true;
+            }
+        }
+        return false;
     }
 
     @Override
@@ -61,13 +75,9 @@ public class HudiHiveCloneExtractor extends HiveTableCloneExtractor {
             IMetaStoreClient client, Table hiveTable, String database, String table)
             throws Exception {
         List<FieldSchema> fields = client.getSchema(database, table);
-        Set<String> hudiMetadataFields =
-                Arrays.stream(HoodieRecord.HoodieMetadataField.values())
-                        .map(HoodieRecord.HoodieMetadataField::getFieldName)
-                        .collect(Collectors.toSet());
         List<FieldSchema> resultFields =
                 fields.stream()
-                        .filter(f -> !hudiMetadataFields.contains(f.getName()))
+                        .filter(f -> !HUDI_METADATA_FIELDS.contains(f.getName()))
                         .collect(Collectors.toList());
         LOG.info(
                 "Hudi table {}.{} with total field count {}, and result field count {} after filter",
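
The new detection rule in matches() can be summarized by the following self-contained sketch. It is an approximation for illustration: looksLikeHudi, its parameters, and the hard-coded column list stand in for the real extractor, its Hive Table argument, and HoodieRecord.HoodieMetadataField; the five field names are Hudi's standard metadata columns (newer Hudi versions may define additional ones).

    import java.util.List;
    import java.util.Set;

    public class HudiDetectionSketch {

        // Hudi's standard metadata columns (HoodieRecord.HoodieMetadataField).
        private static final Set<String> HUDI_METADATA_FIELDS =
                Set.of(
                        "_hoodie_commit_time",
                        "_hoodie_commit_seqno",
                        "_hoodie_record_key",
                        "_hoodie_partition_path",
                        "_hoodie_file_name");

        // Approximation of the patched matches(): tables written with
        // spark-sql support carry the spark.sql.sources.provider parameter;
        // tables written by Hudi < 0.9 are recognized by their metadata columns.
        static boolean looksLikeHudi(String providerParameter, List<String> columnNames) {
            if ("hudi".equalsIgnoreCase(providerParameter)) {
                return true;
            }
            return columnNames.stream().anyMatch(HUDI_METADATA_FIELDS::contains);
        }

        public static void main(String[] args) {
            // An old Hudi table: no provider parameter, but metadata columns present.
            System.out.println(
                    looksLikeHudi("", List.of("_hoodie_commit_time", "id", "name"))); // true
        }
    }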