yihua commented on code in PR #9114: URL: https://github.com/apache/hudi/pull/9114#discussion_r1251176617
########## hudi-spark-datasource/hudi-spark/src/test/java/org/apache/hudi/functional/TestGlobalIndexEnableUpdatePartitions.java: ########## @@ -65,8 +65,8 @@ private static Stream<Arguments> getTableTypeAndIndexType() { Arguments.of(COPY_ON_WRITE, GLOBAL_BLOOM), Arguments.of(COPY_ON_WRITE, RECORD_INDEX), Arguments.of(MERGE_ON_READ, GLOBAL_SIMPLE), - Arguments.of(MERGE_ON_READ, GLOBAL_BLOOM), - Arguments.of(MERGE_ON_READ, RECORD_INDEX) + Arguments.of(MERGE_ON_READ, GLOBAL_BLOOM) + // Arguments.of(MERGE_ON_READ, RECORD_INDEX) Review Comment: Is this still failing? ########## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java: ########## @@ -1159,10 +1161,27 @@ protected boolean validateTimelineBeforeSchedulingCompaction(Option<String> inFl * @param writeStatuses {@code WriteStatus} from the write operation */ private HoodieData<HoodieRecord> getRecordIndexUpdates(HoodieData<WriteStatus> writeStatuses) { - return writeStatuses.flatMap(writeStatus -> { - List<HoodieRecord> recordList = new LinkedList<>(); - for (HoodieRecordDelegate recordDelegate : writeStatus.getWrittenRecordDelegates()) { - if (!writeStatus.isErrored(recordDelegate.getHoodieKey())) { + // 1. List<HoodieRecordDelegate> + // 2. 
Reduce by key: accept keys only when new location is not + return writeStatuses.map(writeStatus -> writeStatus.getWrittenRecordDelegates().stream() + .map(recordDelegate -> Pair.of(recordDelegate.getRecordKey(), recordDelegate))) + .flatMapToPair(Stream::iterator) + .reduceByKey((recordDelegate1, recordDelegate2) -> { + if (recordDelegate1.getRecordKey().equals(recordDelegate2.getRecordKey())) { + if (recordDelegate1.getNewLocation().isPresent() && recordDelegate1.getNewLocation().get().getFileId() != null) { + return recordDelegate1; + } else if (recordDelegate2.getNewLocation().isPresent() && recordDelegate2.getNewLocation().get().getFileId() != null) { + return recordDelegate2; + } else { + // should not come here, one of the above must have a new location set + return null; + } + } else { + return recordDelegate1; + } + }, 1) Review Comment: Should the `reduceByKey` parallelism be configurable rather than hard-coded to 1? ########## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java: ########## @@ -1159,10 +1161,27 @@ protected boolean validateTimelineBeforeSchedulingCompaction(Option<String> inFl * @param writeStatuses {@code WriteStatus} from the write operation */ private HoodieData<HoodieRecord> getRecordIndexUpdates(HoodieData<WriteStatus> writeStatuses) { - return writeStatuses.flatMap(writeStatus -> { - List<HoodieRecord> recordList = new LinkedList<>(); - for (HoodieRecordDelegate recordDelegate : writeStatus.getWrittenRecordDelegates()) { - if (!writeStatus.isErrored(recordDelegate.getHoodieKey())) { + // 1. List<HoodieRecordDelegate> + // 2. 
Reduce by key: accept keys only when new location is not + return writeStatuses.map(writeStatus -> writeStatus.getWrittenRecordDelegates().stream() + .map(recordDelegate -> Pair.of(recordDelegate.getRecordKey(), recordDelegate))) + .flatMapToPair(Stream::iterator) + .reduceByKey((recordDelegate1, recordDelegate2) -> { + if (recordDelegate1.getRecordKey().equals(recordDelegate2.getRecordKey())) { + if (recordDelegate1.getNewLocation().isPresent() && recordDelegate1.getNewLocation().get().getFileId() != null) { + return recordDelegate1; + } else if (recordDelegate2.getNewLocation().isPresent() && recordDelegate2.getNewLocation().get().getFileId() != null) { + return recordDelegate2; + } else { + // should not come here, one of the above must have a new location set + return null; Review Comment: Should this throw an exception? ########## hudi-client/hudi-client-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadataWriter.java: ########## @@ -1159,10 +1161,27 @@ protected boolean validateTimelineBeforeSchedulingCompaction(Option<String> inFl * @param writeStatuses {@code WriteStatus} from the write operation */ private HoodieData<HoodieRecord> getRecordIndexUpdates(HoodieData<WriteStatus> writeStatuses) { - return writeStatuses.flatMap(writeStatus -> { - List<HoodieRecord> recordList = new LinkedList<>(); - for (HoodieRecordDelegate recordDelegate : writeStatus.getWrittenRecordDelegates()) { - if (!writeStatus.isErrored(recordDelegate.getHoodieKey())) { + // 1. List<HoodieRecordDelegate> + // 2. Reduce by key: accept keys only when new location is not + return writeStatuses.map(writeStatus -> writeStatus.getWrittenRecordDelegates().stream() + .map(recordDelegate -> Pair.of(recordDelegate.getRecordKey(), recordDelegate))) + .flatMapToPair(Stream::iterator) + .reduceByKey((recordDelegate1, recordDelegate2) -> { Review Comment: `ReduceByKey` can be a costly operation. Have we measured the performance impact of this? 
-- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org