nsivabalan commented on code in PR #9337:
URL: https://github.com/apache/hudi/pull/9337#discussion_r1283356063


##########
hudi-common/src/main/java/org/apache/hudi/common/model/HoodieLogFile.java:
##########
@@ -47,64 +52,86 @@ public class HoodieLogFile implements Serializable {
 
   private transient FileStatus fileStatus;
   private final String pathStr;
+  private final String fileId;
+  private final String baseCommitTime;
+  private final int logVersion;
+  private final String logWriteToken;
+  private final String fileExtension;
+  private final String suffix;
+  private final Path path;
   private long fileLen;
 
   public HoodieLogFile(HoodieLogFile logFile) {
-    this.fileStatus = logFile.fileStatus;
-    this.pathStr = logFile.pathStr;
-    this.fileLen = logFile.fileLen;
+    this(logFile.getFileStatus(), logFile.getPath(), logFile.pathStr, 
logFile.getFileSize());
   }
 
   public HoodieLogFile(FileStatus fileStatus) {
-    this.fileStatus = fileStatus;
-    this.pathStr = fileStatus.getPath().toString();
-    this.fileLen = fileStatus.getLen();
+    this(fileStatus, fileStatus.getPath(), fileStatus.getPath().toString(), 
fileStatus.getLen());
   }
 
   public HoodieLogFile(Path logPath) {
-    this.fileStatus = null;
-    this.pathStr = logPath.toString();
-    this.fileLen = -1;
+    this(null, logPath, logPath.toString(), -1);
   }
 
-  public HoodieLogFile(Path logPath, Long fileLen) {
-    this.fileStatus = null;
-    this.pathStr = logPath.toString();
-    this.fileLen = fileLen;
+  public HoodieLogFile(Path logPath, long fileLen) {
+    this(null, logPath, logPath.toString(), fileLen);
   }
 
   public HoodieLogFile(String logPathStr) {
-    this.fileStatus = null;
+    this(null, null, logPathStr, -1);
+  }
+
+  private HoodieLogFile(FileStatus fileStatus, Path logPath, String 
logPathStr, long fileLen) {
+    this.fileStatus = fileStatus;
     this.pathStr = logPathStr;
-    this.fileLen = -1;
+    this.fileLen = fileLen;
+    if (logPath != null) {
+      if (logPath instanceof CachingPath) {
+        this.path = logPath;
+      } else {
+        this.path = new CachingPath(logPath.getParent(), logPath.getName());
+      }
+    } else {
+      this.path = new CachingPath(pathStr);
+    }
+    Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName());
+    if (!matcher.find()) {
+      throw new InvalidHoodiePathException(path, "LogFile");
+    }
+    this.fileId = matcher.group(1);
+    this.baseCommitTime = matcher.group(2);
+    this.fileExtension = matcher.group(3);
+    this.logVersion = Integer.parseInt(matcher.group(4));
+    this.logWriteToken = matcher.group(6);
+    this.suffix = matcher.group(10);
   }
 
   public String getFileId() {
-    return FSUtils.getFileIdFromLogPath(getPath());

Review Comment:
   👍 



##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java:
##########
@@ -214,12 +214,16 @@ protected List<Pair<String, BloomIndexFileInfo>> 
loadColumnRangesFromMetaIndex(
 
     String keyField = 
hoodieTable.getMetaClient().getTableConfig().getRecordKeyFieldProp();
 
+    List<Pair<String, HoodieBaseFile>> baseFilesForAllPartitions = 
HoodieIndexUtils.getLatestBaseFilesForAllPartitions(partitions, context, 
hoodieTable);
+    List<Pair<String, String>> partitionFileNameList = new 
ArrayList<>(baseFilesForAllPartitions.size());
+    Map<Pair<String, String>, String> partitionAndFileNameToFileId = new 
HashMap<>(baseFilesForAllPartitions.size());
+    baseFilesForAllPartitions.forEach(pair -> {
+      Pair<String, String> parititonAndFileName = Pair.of(pair.getKey(), 
pair.getValue().getFileName());
+      partitionFileNameList.add(parititonAndFileName);
+      partitionAndFileNameToFileId.put(parititonAndFileName, 
pair.getValue().getFileId());
+    });
     // Partition and file name pairs
-    List<Pair<String, String>> partitionFileNameList =
-        HoodieIndexUtils.getLatestBaseFilesForAllPartitions(partitions, 
context, hoodieTable).stream()
-            .map(partitionBaseFilePair -> 
Pair.of(partitionBaseFilePair.getLeft(), 
partitionBaseFilePair.getRight().getFileName()))
-            .sorted()
-            .collect(toList());
+    Collections.sort(partitionFileNameList); // TODO why does this need to be 
sorted?

Review Comment:
   I checked the code. I don't think we need sorting here. anyways, internaly 
when polling col stats from MDT, after constructing all record keys to be 
looked up, we sort before looking up in hfile. we can probably remove this. 



##########
hudi-common/src/main/java/org/apache/hudi/common/model/HoodieLogFile.java:
##########
@@ -47,64 +52,86 @@ public class HoodieLogFile implements Serializable {
 
   private transient FileStatus fileStatus;
   private final String pathStr;
+  private final String fileId;
+  private final String baseCommitTime;
+  private final int logVersion;
+  private final String logWriteToken;
+  private final String fileExtension;
+  private final String suffix;
+  private final Path path;
   private long fileLen;
 
   public HoodieLogFile(HoodieLogFile logFile) {
-    this.fileStatus = logFile.fileStatus;
-    this.pathStr = logFile.pathStr;
-    this.fileLen = logFile.fileLen;
+    this(logFile.getFileStatus(), logFile.getPath(), logFile.pathStr, 
logFile.getFileSize());
   }
 
   public HoodieLogFile(FileStatus fileStatus) {
-    this.fileStatus = fileStatus;
-    this.pathStr = fileStatus.getPath().toString();
-    this.fileLen = fileStatus.getLen();
+    this(fileStatus, fileStatus.getPath(), fileStatus.getPath().toString(), 
fileStatus.getLen());
   }
 
   public HoodieLogFile(Path logPath) {
-    this.fileStatus = null;
-    this.pathStr = logPath.toString();
-    this.fileLen = -1;
+    this(null, logPath, logPath.toString(), -1);
   }
 
-  public HoodieLogFile(Path logPath, Long fileLen) {
-    this.fileStatus = null;
-    this.pathStr = logPath.toString();
-    this.fileLen = fileLen;
+  public HoodieLogFile(Path logPath, long fileLen) {
+    this(null, logPath, logPath.toString(), fileLen);
   }
 
   public HoodieLogFile(String logPathStr) {
-    this.fileStatus = null;
+    this(null, null, logPathStr, -1);
+  }
+
+  private HoodieLogFile(FileStatus fileStatus, Path logPath, String 
logPathStr, long fileLen) {
+    this.fileStatus = fileStatus;
     this.pathStr = logPathStr;
-    this.fileLen = -1;
+    this.fileLen = fileLen;
+    if (logPath != null) {
+      if (logPath instanceof CachingPath) {
+        this.path = logPath;
+      } else {
+        this.path = new CachingPath(logPath.getParent(), logPath.getName());
+      }
+    } else {
+      this.path = new CachingPath(pathStr);
+    }
+    Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName());
+    if (!matcher.find()) {
+      throw new InvalidHoodiePathException(path, "LogFile");
+    }
+    this.fileId = matcher.group(1);
+    this.baseCommitTime = matcher.group(2);
+    this.fileExtension = matcher.group(3);
+    this.logVersion = Integer.parseInt(matcher.group(4));
+    this.logWriteToken = matcher.group(6);
+    this.suffix = matcher.group(10);
   }
 
   public String getFileId() {
-    return FSUtils.getFileIdFromLogPath(getPath());

Review Comment:
   👍 



##########
hudi-common/src/main/java/org/apache/hudi/common/table/log/HoodieLogFileReader.java:
##########
@@ -114,7 +115,8 @@ public HoodieLogFileReader(FileSystem fs, HoodieLogFile 
logFile, Schema readerSc
     // NOTE: We repackage {@code HoodieLogFile} here to make sure that the 
provided path
     //       is prefixed with an appropriate scheme given that we're not 
propagating the FS
     //       further
-    this.logFile = new HoodieLogFile(FSUtils.makeQualified(fs, 
logFile.getPath()), logFile.getFileSize());
+    Path updatedPath = FSUtils.makeQualified(fs, logFile.getPath());
+    this.logFile = updatedPath.equals(logFile.getPath()) ? logFile : new 
HoodieLogFile(updatedPath, logFile.getFileSize());

Review Comment:
   may I know what this change is for.



##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java:
##########
@@ -481,10 +482,10 @@ private Pair<Integer, HoodieData<HoodieRecord>> 
initializeRecordIndexPartition()
     // Collect the list of latest base files present in each partition
     List<String> partitions = metadata.getAllPartitionPaths();
     fsView.loadAllPartitions();
-    final List<Pair<String, String>> partitionBaseFilePairs = new 
ArrayList<>();
+    final List<Pair<String, HoodieBaseFile>> partitionBaseFilePairs = new 
ArrayList<>();

Review Comment:
   we need to be cautious here. we are bringing in more memory to driver w/ 
this change. previously we had just a string. not its entire HoodieBaseFile. 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to