the-other-tim-brown commented on code in PR #10344: URL: https://github.com/apache/hudi/pull/10344#discussion_r1437340978
########## hudi-common/src/main/java/org/apache/hudi/common/util/collection/ExternalSpillableMap.java: ########## @@ -78,41 +78,49 @@ public class ExternalSpillableMap<T extends Serializable, R extends Serializable // Enables compression of values stored in disc private final boolean isCompressionEnabled; // current space occupied by this map in-memory - private Long currentInMemoryMapSize; + private long currentInMemoryMapSize; // An estimate of the size of each payload written to this map private volatile long estimatedPayloadSize = 0; // Base File Path private final String baseFilePath; - public ExternalSpillableMap(Long maxInMemorySizeInBytes, String baseFilePath, SizeEstimator<T> keySizeEstimator, + public ExternalSpillableMap(long maxInMemorySizeInBytes, String baseFilePath, SizeEstimator<T> keySizeEstimator, SizeEstimator<R> valueSizeEstimator) throws IOException { this(maxInMemorySizeInBytes, baseFilePath, keySizeEstimator, valueSizeEstimator, DiskMapType.BITCASK); } - public ExternalSpillableMap(Long maxInMemorySizeInBytes, String baseFilePath, SizeEstimator<T> keySizeEstimator, + public ExternalSpillableMap(long maxInMemorySizeInBytes, String baseFilePath, SizeEstimator<T> keySizeEstimator, SizeEstimator<R> valueSizeEstimator, DiskMapType diskMapType) throws IOException { this(maxInMemorySizeInBytes, baseFilePath, keySizeEstimator, valueSizeEstimator, diskMapType, false); } - public ExternalSpillableMap(Long maxInMemorySizeInBytes, String baseFilePath, SizeEstimator<T> keySizeEstimator, + public ExternalSpillableMap(long maxInMemorySizeInBytes, String baseFilePath, SizeEstimator<T> keySizeEstimator, SizeEstimator<R> valueSizeEstimator, DiskMapType diskMapType, boolean isCompressionEnabled) throws IOException { this.inMemoryMap = new HashMap<>(); this.baseFilePath = baseFilePath; - this.maxInMemorySizeInBytes = (long) Math.floor(maxInMemorySizeInBytes * sizingFactorForInMemoryMap); + this.maxInMemorySizeInBytes = (long) Math.floor(maxInMemorySizeInBytes * SIZING_FACTOR_FOR_IN_MEMORY_MAP); this.currentInMemoryMapSize = 0L; this.keySizeEstimator = keySizeEstimator; this.valueSizeEstimator = valueSizeEstimator; this.diskMapType = diskMapType; this.isCompressionEnabled = isCompressionEnabled; } + private DiskMap<T, R> getDiskBasedMap() { + return getDiskBasedMap(false); + } + + private DiskMap<T, R> getOrCreateDiskBasedMap() { + return getDiskBasedMap(true); + } + private DiskMap<T, R> getDiskBasedMap(boolean forceInitialization) { if (null == diskBasedMap) { - if (!forceInitialization) { - return DiskMap.empty(); - } synchronized (this) { if (null == diskBasedMap) { + if (!forceInitialization) { + return DiskMap.empty(); Review Comment: @danny0405 let's breakdown this suggestion. You are asking to: 1. Introduce a new abstract init method in `DiskMap` 2. Move the class initialization of DiskMap to its own init method that 3. Make sure the initialization methods are only called once there is a `put` called 4. In all other methods in DiskMap, account for whether or not the init method has been called. 5. Add unit tests or some test suite for DiskMap and its implementations to ensure all future iterations properly implement the lazy initialization. The changes I have in the `ExternalSpillableMap` map are around 30 lines. There is an existing set of tests that can be augmented to make sure that the logic is working properly. I want to bring up what you have claimed is your biggest concern with the PR: ``` My biggest concern is for write method (like #put) and read method (like #contains) we need to take care the disk map type to ensure the correctness. ``` If this is your biggest concern, why isn't a unit test sufficient? @danny0405 I have already suggested this much and you have not addressed whether this is sufficient. ########## hudi-common/src/main/java/org/apache/hudi/common/util/collection/ExternalSpillableMap.java: ########## @@ -78,41 +78,49 @@ public class ExternalSpillableMap<T extends Serializable, R extends Serializable // Enables compression of values stored in disc private final boolean isCompressionEnabled; // current space occupied by this map in-memory - private Long currentInMemoryMapSize; + private long currentInMemoryMapSize; // An estimate of the size of each payload written to this map private volatile long estimatedPayloadSize = 0; // Base File Path private final String baseFilePath; - public ExternalSpillableMap(Long maxInMemorySizeInBytes, String baseFilePath, SizeEstimator<T> keySizeEstimator, + public ExternalSpillableMap(long maxInMemorySizeInBytes, String baseFilePath, SizeEstimator<T> keySizeEstimator, SizeEstimator<R> valueSizeEstimator) throws IOException { this(maxInMemorySizeInBytes, baseFilePath, keySizeEstimator, valueSizeEstimator, DiskMapType.BITCASK); } - public ExternalSpillableMap(Long maxInMemorySizeInBytes, String baseFilePath, SizeEstimator<T> keySizeEstimator, + public ExternalSpillableMap(long maxInMemorySizeInBytes, String baseFilePath, SizeEstimator<T> keySizeEstimator, SizeEstimator<R> valueSizeEstimator, DiskMapType diskMapType) throws IOException { this(maxInMemorySizeInBytes, baseFilePath, keySizeEstimator, valueSizeEstimator, diskMapType, false); } - public ExternalSpillableMap(Long maxInMemorySizeInBytes, String baseFilePath, SizeEstimator<T> keySizeEstimator, + public ExternalSpillableMap(long maxInMemorySizeInBytes, String baseFilePath, SizeEstimator<T> keySizeEstimator, SizeEstimator<R> valueSizeEstimator, DiskMapType diskMapType, boolean isCompressionEnabled) throws IOException { this.inMemoryMap = new HashMap<>(); this.baseFilePath = baseFilePath; - this.maxInMemorySizeInBytes = (long) Math.floor(maxInMemorySizeInBytes * sizingFactorForInMemoryMap); + this.maxInMemorySizeInBytes = (long) Math.floor(maxInMemorySizeInBytes * SIZING_FACTOR_FOR_IN_MEMORY_MAP); this.currentInMemoryMapSize = 0L; this.keySizeEstimator = keySizeEstimator; this.valueSizeEstimator = valueSizeEstimator; this.diskMapType = diskMapType; this.isCompressionEnabled = isCompressionEnabled; } + private DiskMap<T, R> getDiskBasedMap() { + return getDiskBasedMap(false); + } + + private DiskMap<T, R> getOrCreateDiskBasedMap() { + return getDiskBasedMap(true); + } + private DiskMap<T, R> getDiskBasedMap(boolean forceInitialization) { if (null == diskBasedMap) { - if (!forceInitialization) { - return DiskMap.empty(); - } synchronized (this) { if (null == diskBasedMap) { + if (!forceInitialization) { + return DiskMap.empty(); Review Comment: @danny0405 let's breakdown this suggestion. You are asking for: 1. Introduce a new abstract init method in `DiskMap` 2. Move the class initialization of DiskMap to its own init method that 3. Make sure the initialization methods are only called once there is a `put` called 4. In all other methods in DiskMap, account for whether or not the init method has been called. 5. Add unit tests or some test suite for DiskMap and its implementations to ensure all future iterations properly implement the lazy initialization. The changes I have in the `ExternalSpillableMap` map are around 30 lines. There is an existing set of tests that can be augmented to make sure that the logic is working properly. I want to bring up what you have claimed is your biggest concern with the PR: ``` My biggest concern is for write method (like #put) and read method (like #contains) we need to take care the disk map type to ensure the correctness. ``` If this is your biggest concern, why isn't a unit test sufficient? @danny0405 I have already suggested this much and you have not addressed whether this is sufficient. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org