danny0405 commented on code in PR #12779:
URL: https://github.com/apache/hudi/pull/12779#discussion_r1945885827
##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java:
##########
@@ -1706,4 +1711,87 @@ public boolean isInitialized() {
}
protected abstract BaseHoodieWriteClient<?, I, ?, ?> initializeWriteClient();
+
+ /**
+ * A class which represents a directory and the files and directories inside
it.
+ * <p>
+ * A {@code DirectoryInfo} object saves the name of the partition and
various properties of each file
+ * required for initializing the metadata table. Saving limited properties
reduces the total memory footprint when
+ * a very large number of files are present in the dataset being initialized.
+ */
+ static class DirectoryInfo implements Serializable {
+ // Relative path of the directory (relative to the base directory)
+ private final String relativePath;
+ // Map of filenames within this partition to their respective sizes
+ private final HashMap<String, Long> filenameToSizeMap;
+ // List of directories within this partition
+ private final List<StoragePath> subDirectories = new ArrayList<>();
+ // Is this a hoodie partition
+ private boolean isHoodiePartition = false;
+
+ public DirectoryInfo(String relativePath, FileStatus[] fileStatus, String
maxInstantTime) {
+ this.relativePath = relativePath;
+
+ // Pre-allocate with the maximum length possible
+ filenameToSizeMap = new HashMap<>(fileStatus.length);
+
+ // FileId to commit map. Used for ensuring we don't have two files with
the same filegroup and commit time.
+ HashMap<String, Set<String>> seenGroupCommitPairs = new HashMap<>();
+
+ // Presence of partition meta file implies this is a HUDI partition
+ isHoodiePartition = Arrays.stream(fileStatus).anyMatch(status ->
status.getPath().getName().startsWith(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX));
+ for (FileStatus status : fileStatus) {
+ StoragePath path = new StoragePath(status.getPath().toUri());
+ if (status.isDirectory()) {
+ // Ignore .hoodie directory as there cannot be any partitions inside
it
+ if
(!status.getPath().getName().equals(HoodieTableMetaClient.METAFOLDER_NAME)) {
+ this.subDirectories.add(new StoragePath(status.getPath().toUri()));
+ }
+ } else if (isHoodiePartition && FSUtils.isDataFile(path)) {
+ // Regular HUDI data file (base file or log file)
+ String dataFileCommitTime =
FSUtils.getCommitTime(status.getPath().getName());
+
+ // Sanity check: ensure that we don't have base files with duplicate
file groups and commit times.
+ if (FSUtils.isBaseFile(path)) {
+ String fileGroup = FSUtils.getFileId(status.getPath().getName());
+ if (seenGroupCommitPairs.containsKey(fileGroup)) {
+ if
(seenGroupCommitPairs.get(fileGroup).contains(dataFileCommitTime)) {
Review Comment:
I also think the MDT should not have to care about spurious data files;
we should do this in the DT before commit.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]