nsivabalan commented on code in PR #9337: URL: https://github.com/apache/hudi/pull/9337#discussion_r1283356063
########## hudi-common/src/main/java/org/apache/hudi/common/model/HoodieLogFile.java: ########## @@ -47,64 +52,86 @@ public class HoodieLogFile implements Serializable { private transient FileStatus fileStatus; private final String pathStr; + private final String fileId; + private final String baseCommitTime; + private final int logVersion; + private final String logWriteToken; + private final String fileExtension; + private final String suffix; + private final Path path; private long fileLen; public HoodieLogFile(HoodieLogFile logFile) { - this.fileStatus = logFile.fileStatus; - this.pathStr = logFile.pathStr; - this.fileLen = logFile.fileLen; + this(logFile.getFileStatus(), logFile.getPath(), logFile.pathStr, logFile.getFileSize()); } public HoodieLogFile(FileStatus fileStatus) { - this.fileStatus = fileStatus; - this.pathStr = fileStatus.getPath().toString(); - this.fileLen = fileStatus.getLen(); + this(fileStatus, fileStatus.getPath(), fileStatus.getPath().toString(), fileStatus.getLen()); } public HoodieLogFile(Path logPath) { - this.fileStatus = null; - this.pathStr = logPath.toString(); - this.fileLen = -1; + this(null, logPath, logPath.toString(), -1); } - public HoodieLogFile(Path logPath, Long fileLen) { - this.fileStatus = null; - this.pathStr = logPath.toString(); - this.fileLen = fileLen; + public HoodieLogFile(Path logPath, long fileLen) { + this(null, logPath, logPath.toString(), fileLen); } public HoodieLogFile(String logPathStr) { - this.fileStatus = null; + this(null, null, logPathStr, -1); + } + + private HoodieLogFile(FileStatus fileStatus, Path logPath, String logPathStr, long fileLen) { + this.fileStatus = fileStatus; this.pathStr = logPathStr; - this.fileLen = -1; + this.fileLen = fileLen; + if (logPath != null) { + if (logPath instanceof CachingPath) { + this.path = logPath; + } else { + this.path = new CachingPath(logPath.getParent(), logPath.getName()); + } + } else { + this.path = new CachingPath(pathStr); + } + Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName()); + if (!matcher.find()) { + throw new InvalidHoodiePathException(path, "LogFile"); + } + this.fileId = matcher.group(1); + this.baseCommitTime = matcher.group(2); + this.fileExtension = matcher.group(3); + this.logVersion = Integer.parseInt(matcher.group(4)); + this.logWriteToken = matcher.group(6); + this.suffix = matcher.group(10); } public String getFileId() { - return FSUtils.getFileIdFromLogPath(getPath()); Review Comment: 👍 ########## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java: ########## @@ -214,12 +214,16 @@ protected List<Pair<String, BloomIndexFileInfo>> loadColumnRangesFromMetaIndex( String keyField = hoodieTable.getMetaClient().getTableConfig().getRecordKeyFieldProp(); + List<Pair<String, HoodieBaseFile>> baseFilesForAllPartitions = HoodieIndexUtils.getLatestBaseFilesForAllPartitions(partitions, context, hoodieTable); + List<Pair<String, String>> partitionFileNameList = new ArrayList<>(baseFilesForAllPartitions.size()); + Map<Pair<String, String>, String> partitionAndFileNameToFileId = new HashMap<>(baseFilesForAllPartitions.size()); + baseFilesForAllPartitions.forEach(pair -> { + Pair<String, String> parititonAndFileName = Pair.of(pair.getKey(), pair.getValue().getFileName()); + partitionFileNameList.add(parititonAndFileName); + partitionAndFileNameToFileId.put(parititonAndFileName, pair.getValue().getFileId()); + }); // Partition and file name pairs - List<Pair<String, String>> partitionFileNameList = - HoodieIndexUtils.getLatestBaseFilesForAllPartitions(partitions, context, hoodieTable).stream() - .map(partitionBaseFilePair -> Pair.of(partitionBaseFilePair.getLeft(), partitionBaseFilePair.getRight().getFileName())) - .sorted() - .collect(toList()); + Collections.sort(partitionFileNameList); // TODO why does this need to be sorted? Review Comment: I checked the code. I don't think we need sorting here. anyways, internaly when polling col stats from MDT, after constructing all record keys to be looked up, we sort before looking up in hfile. we can probably remove this. ########## hudi-common/src/main/java/org/apache/hudi/common/model/HoodieLogFile.java: ########## @@ -47,64 +52,86 @@ public class HoodieLogFile implements Serializable { private transient FileStatus fileStatus; private final String pathStr; + private final String fileId; + private final String baseCommitTime; + private final int logVersion; + private final String logWriteToken; + private final String fileExtension; + private final String suffix; + private final Path path; private long fileLen; public HoodieLogFile(HoodieLogFile logFile) { - this.fileStatus = logFile.fileStatus; - this.pathStr = logFile.pathStr; - this.fileLen = logFile.fileLen; + this(logFile.getFileStatus(), logFile.getPath(), logFile.pathStr, logFile.getFileSize()); } public HoodieLogFile(FileStatus fileStatus) { - this.fileStatus = fileStatus; - this.pathStr = fileStatus.getPath().toString(); - this.fileLen = fileStatus.getLen(); + this(fileStatus, fileStatus.getPath(), fileStatus.getPath().toString(), fileStatus.getLen()); } public HoodieLogFile(Path logPath) { - this.fileStatus = null; - this.pathStr = logPath.toString(); - this.fileLen = -1; + this(null, logPath, logPath.toString(), -1); } - public HoodieLogFile(Path logPath, Long fileLen) { - this.fileStatus = null; - this.pathStr = logPath.toString(); - this.fileLen = fileLen; + public HoodieLogFile(Path logPath, long fileLen) { + this(null, logPath, logPath.toString(), fileLen); } public HoodieLogFile(String logPathStr) { - this.fileStatus = null; + this(null, null, logPathStr, -1); + } + + private HoodieLogFile(FileStatus fileStatus, Path logPath, String logPathStr, long fileLen) { + this.fileStatus = fileStatus; this.pathStr = logPathStr; - this.fileLen = -1; + this.fileLen = fileLen; + if (logPath != null) { + if (logPath instanceof CachingPath) { + this.path = logPath; + } else { + this.path = new CachingPath(logPath.getParent(), logPath.getName()); + } + } else { + this.path = new CachingPath(pathStr); + } + Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName()); + if (!matcher.find()) { + throw new InvalidHoodiePathException(path, "LogFile"); + } + this.fileId = matcher.group(1); + this.baseCommitTime = matcher.group(2); + this.fileExtension = matcher.group(3); + this.logVersion = Integer.parseInt(matcher.group(4)); + this.logWriteToken = matcher.group(6); + this.suffix = matcher.group(10); } public String getFileId() { - return FSUtils.getFileIdFromLogPath(getPath()); Review Comment: 👍 ########## hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFileReader.java: ########## @@ -114,7 +115,8 @@ public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSc // NOTE: We repackage {@code HoodieLogFile} here to make sure that the provided path // is prefixed with an appropriate scheme given that we're not propagating the FS // further - this.logFile = new HoodieLogFile(FSUtils.makeQualified(fs, logFile.getPath()), logFile.getFileSize()); + Path updatedPath = FSUtils.makeQualified(fs, logFile.getPath()); + this.logFile = updatedPath.equals(logFile.getPath()) ? logFile : new HoodieLogFile(updatedPath, logFile.getFileSize()); Review Comment: may I know what this change is for. ########## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java: ########## @@ -481,10 +482,10 @@ private Pair<Integer, HoodieData<HoodieRecord>> initializeRecordIndexPartition() // Collect the list of latest base files present in each partition List<String> partitions = metadata.getAllPartitionPaths(); fsView.loadAllPartitions(); - final List<Pair<String, String>> partitionBaseFilePairs = new ArrayList<>(); + final List<Pair<String, HoodieBaseFile>> partitionBaseFilePairs = new ArrayList<>(); Review Comment: we need to be cautious here. we are bringing in more memory to driver w/ this change. previously we had just a string. not its entire HoodieBaseFile. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org