This is an automated email from the ASF dual-hosted git repository.
sivabalan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 7e9abc71ed03 feat(table-services): Add config to filter partitions
during full clean (#17550)
7e9abc71ed03 is described below
commit 7e9abc71ed03ebebdfee63f7010aed8c6e4aac2a
Author: Prashant Wason <[email protected]>
AuthorDate: Fri Feb 27 15:11:52 2026 -0800
feat(table-services): Add config to filter partitions during full clean
(#17550)
When incremental cleaning is disabled, users can now use a regex or a
static list of partitions to restrict which partitions are cleaned during
full clean operations. This helps manage memory usage on large tables.
New configs:
- hoodie.clean.partition.filter.regex: Regex pattern; only partitions whose
  path fully matches it are cleaned
- hoodie.clean.partition.filter.selected: Comma-separated list of partitions
  to clean; takes precedence over the regex when both are set
Co-authored-by: Claude Opus 4.6 <[email protected]>
---
.../org/apache/hudi/config/HoodieCleanConfig.java | 25 +++++++++++++++++
.../org/apache/hudi/config/HoodieWriteConfig.java | 8 ++++++
.../hudi/table/action/clean/CleanPlanner.java | 32 +++++++++++++++++++++-
3 files changed, 64 insertions(+), 1 deletion(-)
diff --git
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCleanConfig.java
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCleanConfig.java
index 178495b07836..e5a638956575 100644
---
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCleanConfig.java
+++
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieCleanConfig.java
@@ -212,6 +212,31 @@ public class HoodieCleanConfig extends HoodieConfig {
+ "By using local engine context, file listing is performed on the
driver, allowing targeted memory scaling. "
+ "When enabled, both non-partitioned datasets and metadata tables
use the driver for scheduling cleans.");
+ private static final String CLEAN_PARTITION_FILTER_REGEX_KEY =
"hoodie.clean.partition.filter.regex";
+ private static final String CLEAN_PARTITION_FILTER_SELECTED_KEY =
"hoodie.clean.partition.filter.selected";
+
+ public static final ConfigProperty<String> CLEAN_PARTITION_FILTER_REGEX =
ConfigProperty
+ .key(CLEAN_PARTITION_FILTER_REGEX_KEY)
+ .noDefaultValue()
+ .withAlternatives("hoodie.cleaner.partition.filter.regex")
+ .markAdvanced()
+ .sinceVersion("1.2.0")
+ .withDocumentation("When incremental clean is disabled, this regex can
be used to filter the partitions to be cleaned. "
+ + "Only partitions matching this regex pattern will be cleaned. "
+ + "This can be useful for very large tables to avoid OOM issues
during cleaning. "
+ + "If both this config and " + CLEAN_PARTITION_FILTER_SELECTED_KEY +
" are set, the selected partitions take precedence.");
+
+ public static final ConfigProperty<String> CLEAN_PARTITION_FILTER_SELECTED =
ConfigProperty
+ .key(CLEAN_PARTITION_FILTER_SELECTED_KEY)
+ .noDefaultValue()
+ .withAlternatives("hoodie.cleaner.partition.filter.selected")
+ .markAdvanced()
+ .sinceVersion("1.2.0")
+ .withDocumentation("When incremental clean is disabled, this
comma-separated list of partitions can be used to filter the partitions to be
cleaned. "
+ + "Only the specified partitions will be cleaned. "
+ + "This can be useful for very large tables to avoid OOM issues
during cleaning. "
+ + "If both this config and " + CLEAN_PARTITION_FILTER_REGEX_KEY + "
are set, the selected partitions take precedence.");
+
/** @deprecated Use {@link #CLEANER_POLICY} and its methods instead */
@Deprecated
public static final String CLEANER_POLICY_PROP = CLEANER_POLICY.key();
diff --git
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
index 2187d03aa98d..b4b1af1059d9 100644
---
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
+++
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java
@@ -1838,6 +1838,14 @@ public class HoodieWriteConfig extends HoodieConfig {
return getBoolean(HoodieCleanConfig.CLEANER_INCREMENTAL_MODE_ENABLE);
}
+ public String getCleanerPartitionFilterRegex() {
+ return getString(HoodieCleanConfig.CLEAN_PARTITION_FILTER_REGEX);
+ }
+
+ public String getCleanerPartitionFilterSelected() {
+ return getString(HoodieCleanConfig.CLEAN_PARTITION_FILTER_SELECTED);
+ }
+
public boolean inlineCompactionEnabled() {
return getBoolean(HoodieCompactionConfig.INLINE_COMPACT);
}
diff --git
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanner.java
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanner.java
index 5b1a76f5da6b..d5d5b5ebabe2 100644
---
a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanner.java
+++
b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/clean/CleanPlanner.java
@@ -254,11 +254,41 @@ public class CleanPlanner<T, I, K, O> implements
Serializable {
*/
private List<String> getPartitionPathsForFullCleaning() {
// Go to brute force mode of scanning all partitions
+ List<String> allPartitionPaths;
try {
- return hoodieTable.getTableMetadata().getAllPartitionPaths();
+ allPartitionPaths =
hoodieTable.getTableMetadata().getAllPartitionPaths();
} catch (IOException ioe) {
throw new HoodieIOException("Fetching all partitions failed ", ioe);
}
+
+ String partitionSelected = config.getCleanerPartitionFilterSelected();
+ String partitionRegex = config.getCleanerPartitionFilterRegex();
+
+ // Return early if no partition filter is configured
+ if (StringUtils.isNullOrEmpty(partitionSelected) &&
StringUtils.isNullOrEmpty(partitionRegex)) {
+ return allPartitionPaths;
+ }
+
+ // Partition filter cannot be used with incremental cleaning mode
+ if (config.incrementalCleanerModeEnabled()) {
+ throw new IllegalArgumentException("Incremental Cleaning mode is
enabled. Partition filter for clean cannot be used.");
+ }
+
+ // Static list of partitions takes precedence over regex pattern
+ List<String> filteredPartitions;
+ if (!StringUtils.isNullOrEmpty(partitionSelected)) {
+ List<String> selectedPartitions =
Arrays.asList(partitionSelected.split(","));
+ filteredPartitions = allPartitionPaths.stream()
+ .filter(selectedPartitions::contains)
+ .collect(Collectors.toList());
+ log.info("Restricting partitions to clean using selected list.
Partitions to clean: {}", filteredPartitions);
+ } else {
+ filteredPartitions = allPartitionPaths.stream()
+ .filter(p -> p.matches(partitionRegex))
+ .collect(Collectors.toList());
+ log.info("Restricting partitions to clean using regex '{}'. Partitions
to clean: {}", partitionRegex, filteredPartitions);
+ }
+ return filteredPartitions;
}
/**