This is an automated email from the ASF dual-hosted git repository.

sivabalan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push: new 13a8e5c7297 [HUDI-5348] Cache file slices in HoodieBackedTableMetadata (#7436) 13a8e5c7297 is described below commit 13a8e5c729750ba5907d75df3d22473feaaa2a03 Author: Y Ethan Guo <ethan.guoyi...@gmail.com> AuthorDate: Mon Dec 12 17:00:10 2022 -0800 [HUDI-5348] Cache file slices in HoodieBackedTableMetadata (#7436) --- .../org/apache/hudi/metadata/HoodieBackedTableMetadata.java | 13 +++++++++++-- .../org/apache/hudi/metadata/HoodieTableMetadataUtil.java | 10 ++++++---- .../java/org/apache/hudi/utilities/TestHoodieIndexer.java | 7 +++++-- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java index 7743a65bf05..e2fbc4e6716 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java @@ -40,6 +40,7 @@ import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; +import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.util.ClosableIterator; import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.Option; @@ -78,6 +79,7 @@ import static org.apache.hudi.common.util.ValidationUtils.checkArgument; import static org.apache.hudi.metadata.HoodieTableMetadataUtil.PARTITION_NAME_BLOOM_FILTERS; import static org.apache.hudi.metadata.HoodieTableMetadataUtil.PARTITION_NAME_COLUMN_STATS; import static org.apache.hudi.metadata.HoodieTableMetadataUtil.PARTITION_NAME_FILES; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.getFileSystemView; /** * 
Table metadata provided by an internal DFS backed Hudi metadata table. @@ -92,6 +94,7 @@ public class HoodieBackedTableMetadata extends BaseTableMetadata { // Metadata table's timeline and metaclient private HoodieTableMetaClient metadataMetaClient; private HoodieTableConfig metadataTableConfig; + private HoodieTableFileSystemView metadataFileSystemView; // should we reuse the open file handles, across calls private final boolean reuse; @@ -120,6 +123,7 @@ public class HoodieBackedTableMetadata extends BaseTableMetadata { } else if (this.metadataMetaClient == null) { try { this.metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf.get()).setBasePath(metadataBasePath).build(); + this.metadataFileSystemView = getFileSystemView(metadataMetaClient); this.metadataTableConfig = metadataMetaClient.getTableConfig(); this.isBloomFilterIndexEnabled = metadataConfig.isBloomFilterIndexEnabled(); this.isColumnStatsIndexEnabled = metadataConfig.isColumnStatsIndexEnabled(); @@ -127,11 +131,13 @@ public class HoodieBackedTableMetadata extends BaseTableMetadata { LOG.warn("Metadata table was not found at path " + metadataBasePath); this.isMetadataTableEnabled = false; this.metadataMetaClient = null; + this.metadataFileSystemView = null; this.metadataTableConfig = null; } catch (Exception e) { LOG.error("Failed to initialize metadata table at path " + metadataBasePath, e); this.isMetadataTableEnabled = false; this.metadataMetaClient = null; + this.metadataFileSystemView = null; this.metadataTableConfig = null; } } @@ -162,7 +168,8 @@ public class HoodieBackedTableMetadata extends BaseTableMetadata { // to scan all file-groups for all key-prefixes as each of these might contain some // records matching the key-prefix List<FileSlice> partitionFileSlices = - HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices(metadataMetaClient, partitionName); + HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices( + metadataMetaClient, metadataFileSystemView, 
partitionName); return (shouldLoadInMemory ? HoodieListData.lazy(partitionFileSlices) : engineContext.parallelize(partitionFileSlices)) .flatMap((SerializableFunction<FileSlice, Iterator<HoodieRecord<HoodieMetadataPayload>>>) fileSlice -> { @@ -379,7 +386,8 @@ public class HoodieBackedTableMetadata extends BaseTableMetadata { private Map<Pair<String, FileSlice>, List<String>> getPartitionFileSliceToKeysMapping(final String partitionName, final List<String> keys) { // Metadata is in sync till the latest completed instant on the dataset List<FileSlice> latestFileSlices = - HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices(metadataMetaClient, partitionName); + HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices( + metadataMetaClient, metadataFileSystemView, partitionName); Map<Pair<String, FileSlice>, List<String>> partitionFileSliceToKeysMap = new HashMap<>(); for (String key : keys) { @@ -646,6 +654,7 @@ public class HoodieBackedTableMetadata extends BaseTableMetadata { dataMetaClient.reloadActiveTimeline(); if (metadataMetaClient != null) { metadataMetaClient.reloadActiveTimeline(); + metadataFileSystemView = getFileSystemView(metadataMetaClient); } // the cached reader has max instant time restriction, they should be cleared // because the metadata timeline may have changed. diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java index 5896c1a5ebb..0ceb43b86c6 100644 --- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java +++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java @@ -991,13 +991,15 @@ public class HoodieTableMetadataUtil { * just before the compaction instant time. The list of file slices returned is * sorted in the correct order of file group name. * - * @param metaClient - Instance of {@link HoodieTableMetaClient}. 
- * @param partition - The name of the partition whose file groups are to be loaded. + * @param metaClient Instance of {@link HoodieTableMetaClient}. + * @param fsView Metadata table filesystem view. + * @param partition The name of the partition whose file groups are to be loaded. * @return List of latest file slices for all file groups in a given partition. */ - public static List<FileSlice> getPartitionLatestMergedFileSlices(HoodieTableMetaClient metaClient, String partition) { + public static List<FileSlice> getPartitionLatestMergedFileSlices( + HoodieTableMetaClient metaClient, HoodieTableFileSystemView fsView, String partition) { LOG.info("Loading latest merged file slices for metadata table partition " + partition); - return getPartitionFileSlices(metaClient, Option.empty(), partition, true); + return getPartitionFileSlices(metaClient, Option.of(fsView), partition, true); } /** diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieIndexer.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieIndexer.java index f5a0fadc87f..ac7b86f4cfa 100644 --- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieIndexer.java +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieIndexer.java @@ -56,6 +56,7 @@ import java.util.stream.Stream; import static org.apache.hudi.common.table.HoodieTableMetaClient.reload; import static org.apache.hudi.common.table.timeline.HoodieInstant.State.REQUESTED; +import static org.apache.hudi.metadata.HoodieTableMetadataUtil.getFileSystemView; import static org.apache.hudi.metadata.HoodieTableMetadataUtil.metadataPartitionExists; import static org.apache.hudi.metadata.MetadataPartitionType.BLOOM_FILTERS; import static org.apache.hudi.metadata.MetadataPartitionType.COLUMN_STATS; @@ -175,7 +176,8 @@ public class TestHoodieIndexer extends SparkClientFunctionalTestHarness implemen HoodieTableMetaClient metadataMetaClient = 
HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(metaClient.getMetaPath() + "/metadata").build(); List<FileSlice> partitionFileSlices = - HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices(metadataMetaClient, COLUMN_STATS.getPartitionPath()); + HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices( + metadataMetaClient, getFileSystemView(metadataMetaClient), COLUMN_STATS.getPartitionPath()); assertEquals(partitionFileSlices.size(), colStatsFileGroupCount); } @@ -220,7 +222,8 @@ public class TestHoodieIndexer extends SparkClientFunctionalTestHarness implemen HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(metaClient.getMetaPath() + "/metadata").build(); List<FileSlice> partitionFileSlices = - HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices(metadataMetaClient, COLUMN_STATS.getPartitionPath()); + HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices( + metadataMetaClient, getFileSystemView(metadataMetaClient), COLUMN_STATS.getPartitionPath()); assertEquals(partitionFileSlices.size(), HoodieMetadataConfig.METADATA_INDEX_COLUMN_STATS_FILE_GROUP_COUNT.defaultValue()); }