This is an automated email from the ASF dual-hosted git repository.

nagarwal pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
     new 022df0d  [HUDI-1611] Added a configuration to allow specific directories to be filtered out during Metadata Table bootstrap. (#2565)
022df0d is described below

commit 022df0d1b134422f7b6f305cd7ec04b25caa23f0
Author: Prashant Wason <pwa...@uber.com>
AuthorDate: Thu Feb 25 16:52:28 2021 -0800

    [HUDI-1611] Added a configuration to allow specific directories to be filtered out during Metadata Table bootstrap. (#2565)
---
 .../metadata/HoodieBackedTableMetadataWriter.java     |  6 ++++++
 .../hudi/metadata/TestHoodieBackedMetadata.java       | 19 +++++++++++++++++--
 .../hudi/common/config/HoodieMetadataConfig.java      | 15 +++++++++++++++
 3 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java
index 003ec7d..5aae7b7 100644
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java
@@ -318,6 +318,7 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
     Map<String, List<FileStatus>> partitionToFileStatus = new HashMap<>();
     final int fileListingParallelism = metadataWriteConfig.getFileListingParallelism();
     SerializableConfiguration conf = new SerializableConfiguration(datasetMetaClient.getHadoopConf());
+    final String dirFilterRegex = datasetWriteConfig.getMetadataConfig().getDirectoryFilterRegex();
 
     while (!pathsToList.isEmpty()) {
       int listingParallelism = Math.min(fileListingParallelism, pathsToList.size());
@@ -331,6 +332,11 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
       // If the listing reveals a directory, add it to queue. If the listing reveals a hoodie partition, add it to
       // the results.
       dirToFileListing.forEach(p -> {
+        if (!dirFilterRegex.isEmpty() && p.getLeft().getName().matches(dirFilterRegex)) {
+          LOG.info("Ignoring directory " + p.getLeft() + " which matches the filter regex " + dirFilterRegex);
+          return;
+        }
+
         List<FileStatus> filesInDir = Arrays.stream(p.getRight()).parallel()
             .filter(fs -> !fs.getPath().getName().equals(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE))
             .collect(Collectors.toList());
diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/metadata/TestHoodieBackedMetadata.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/metadata/TestHoodieBackedMetadata.java
index 3697ec1..4fa0bc8 100644
--- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/metadata/TestHoodieBackedMetadata.java
+++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/metadata/TestHoodieBackedMetadata.java
@@ -148,14 +148,22 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
     final String nonPartitionDirectory = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0] + "-nonpartition";
     Files.createDirectories(Paths.get(basePath, nonPartitionDirectory));
 
+    // Three directories which are partitions but will be ignored due to filter
+    final String filterDirRegex = ".*-filterDir\\d|\\..*";
+    final String filteredDirectoryOne = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0] + "-filterDir1";
+    final String filteredDirectoryTwo = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0] + "-filterDir2";
+    final String filteredDirectoryThree = ".backups";
+
     // Create some commits
     HoodieTestTable testTable = HoodieTestTable.of(metaClient);
-    testTable.withPartitionMetaFiles("p1", "p2")
+    testTable.withPartitionMetaFiles("p1", "p2", filteredDirectoryOne, filteredDirectoryTwo, filteredDirectoryThree)
         .addCommit("001").withBaseFilesInPartition("p1", 10).withBaseFilesInPartition("p2", 10, 10)
         .addCommit("002").withBaseFilesInPartition("p1", 10).withBaseFilesInPartition("p2", 10, 10, 10)
         .addInflightCommit("003").withBaseFilesInPartition("p1", 10).withBaseFilesInPartition("p2", 10);
 
-    try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
+    final HoodieWriteConfig writeConfig = getWriteConfigBuilder(true, true, false)
+        .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).withDirectoryFilterRegex(filterDirRegex).build()).build();
+    try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, writeConfig)) {
       client.startCommitWithTime("005");
 
       List<String> partitions = metadataWriter(client).metadata().getAllPartitionPaths();
@@ -164,6 +172,13 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
       assertTrue(partitions.contains("p1"), "Must contain partition p1");
       assertTrue(partitions.contains("p2"), "Must contain partition p2");
 
+      assertFalse(partitions.contains(filteredDirectoryOne),
+          "Must not contain the filtered directory " + filteredDirectoryOne);
+      assertFalse(partitions.contains(filteredDirectoryTwo),
+          "Must not contain the filtered directory " + filteredDirectoryTwo);
+      assertFalse(partitions.contains(filteredDirectoryThree),
+          "Must not contain the filtered directory " + filteredDirectoryThree);
+
       FileStatus[] statuses = metadata(client).getAllFilesInPartition(new Path(basePath, "p1"));
       assertTrue(statuses.length == 2);
       statuses = metadata(client).getAllFilesInPartition(new Path(basePath, "p2"));
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetadataConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetadataConfig.java
index 1ead9c8..6346a65 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetadataConfig.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieMetadataConfig.java
@@ -75,6 +75,10 @@ public final class HoodieMetadataConfig extends DefaultHoodieConfig {
   public static final String ENABLE_FALLBACK_PROP = METADATA_PREFIX + ".fallback.enable";
   public static final String DEFAULT_ENABLE_FALLBACK = "true";
 
+  // Regex to filter out matching directories during bootstrap
+  public static final String DIRECTORY_FILTER_REGEX = METADATA_PREFIX + ".dir.filter.regex";
+  public static final String DEFAULT_DIRECTORY_FILTER_REGEX = "";
+
   public static final String HOODIE_ASSUME_DATE_PARTITIONING_PROP = "hoodie.assume.date.partitioning";
   public static final String DEFAULT_ASSUME_DATE_PARTITIONING = "false";
 
@@ -117,6 +121,10 @@ public final class HoodieMetadataConfig extends DefaultHoodieConfig {
     return Boolean.parseBoolean(props.getProperty(METADATA_METRICS_ENABLE_PROP));
   }
 
+  public String getDirectoryFilterRegex() {
+    return props.getProperty(DIRECTORY_FILTER_REGEX);
+  }
+
   public static class Builder {
 
     private final Properties props = new Properties();
@@ -194,6 +202,11 @@ public final class HoodieMetadataConfig extends DefaultHoodieConfig {
       return this;
     }
 
+    public Builder withDirectoryFilterRegex(String regex) {
+      props.setProperty(DIRECTORY_FILTER_REGEX, regex);
+      return this;
+    }
+
     public HoodieMetadataConfig build() {
       HoodieMetadataConfig config = new HoodieMetadataConfig(props);
       setDefaultOnCondition(props, !props.containsKey(METADATA_ENABLE_PROP), METADATA_ENABLE_PROP,
@@ -222,6 +235,8 @@ public final class HoodieMetadataConfig extends DefaultHoodieConfig {
           DEFAULT_ENABLE_FALLBACK);
       setDefaultOnCondition(props, !props.containsKey(ENABLE_REUSE_PROP), ENABLE_REUSE_PROP,
           DEFAULT_ENABLE_REUSE);
+      setDefaultOnCondition(props, !props.containsKey(DIRECTORY_FILTER_REGEX), DIRECTORY_FILTER_REGEX,
+          DEFAULT_DIRECTORY_FILTER_REGEX);
       return config;
     }
   }

Reply via email to