jiangxb1987 commented on a change in pull request #27968: [SPARK-31202][CORE]Improve SizeEstimator for AppendOnlyMap
URL: https://github.com/apache/spark/pull/27968#discussion_r396917895
##########
File path: core/src/main/scala/org/apache/spark/util/SizeEstimator.scala
##########
@@ -290,6 +306,88 @@ object SizeEstimator extends Logging {
     size
   }
+
+  /** Visit AppendOnlyMap data field which stored all the KVs, we handle this field separately
+   * because the underlying type of the elems of this array is different, and their size may vary
+   * significantly, for example, the value may be an array-like buffer to store merged or grouped
+   * values for aggregation.
+   * */
+  private def visitKVDataArray(
+      data: Array[AnyRef],
+      keyPositions: java.util.BitSet,
+      totalValueElements: Int,
+      state: SearchState): Unit = {
+    val length = data.length
+    var arrSize: Long = alignSize(objectSize + INT_SIZE)
+    state.size += arrSize
+    state.size += alignSize((length - keyPositions.size) * pointerSize)
+
+    if (length <= ARRAY_SIZE_FOR_SAMPLING) {
+      for (e <- data) {
+        state.enqueue(e)
+      }
+    } else {
+      val rand = new Random(42)
+      val drawn = new OpenHashSet[Int](2 * ARRAY_SAMPLE_SIZE)
+      val (numKeys1, keySize1, numValueElements1, valueSize1) =
+        sampleKVDataArray(data, keyPositions, state, rand, drawn, length)
+      val (numKeys2, keySize2, numValueElements2, valueSize2) =
+        sampleKVDataArray(data, keyPositions, state, rand, drawn, length)
+      val (_, keySizeForMax, numKeysForMin, keySizeForMin) = if (keySize1 > keySize2) {
+        (numKeys1, keySize1, numKeys2, keySize2)
+      } else (numKeys2, keySize2, numKeys1, keySize1)
+      val keySize = keySizeForMax + (keySizeForMin *

Review comment:
What does this try to do?

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org