Github user mridulm commented on a diff in the pull request: https://github.com/apache/spark/pull/16196#discussion_r91270825 --- Diff: core/src/main/scala/org/apache/spark/util/SizeEstimator.scala --- @@ -243,47 +253,59 @@ object SizeEstimator extends Logging { arrSize += alignSize(length.toLong * primitiveSize(elementClass)) state.size += arrSize } else { + // We know that the array we are dealing with is an array of references + // so explicitly expose this type so we can directly manipulate the array + // without help from the Scala runtime for efficiency arrSize += alignSize(length.toLong * pointerSize) state.size += arrSize + val objArray = array.asInstanceOf[Array[AnyRef]] + if (length <= ARRAY_SIZE_FOR_SAMPLING) { var arrayIndex = 0 while (arrayIndex < length) { - state.enqueue(ScalaRunTime.array_apply(array, arrayIndex).asInstanceOf[AnyRef]) + state.enqueue(objArray(arrayIndex)) arrayIndex += 1 } } else { // Estimate the size of a large array by sampling elements without replacement. // To exclude the shared objects that the array elements may link, sample twice - // and use the min one to calculate array size. - val rand = new Random(42) + // and use the min one to calculate array size. 
+ // Use ThreadLocalRandom here since the random is only accessed from 1 thread + // and we can save the overhead of the full thread-safe Random + val rand = ThreadLocalRandom.current val drawn = new OpenHashSet[Int](2 * ARRAY_SAMPLE_SIZE) - val s1 = sampleArray(array, state, rand, drawn, length) - val s2 = sampleArray(array, state, rand, drawn, length) + val s1 = sampleArray(objArray, state, rand, drawn, length) + val s2 = sampleArray(objArray, state, rand, drawn, length) val size = math.min(s1, s2) + state.size += math.max(s1, s2) + (size * ((length - ARRAY_SAMPLE_SIZE) / (ARRAY_SAMPLE_SIZE))).toLong } } } private def sampleArray( - array: AnyRef, + array: Array[AnyRef], state: SearchState, - rand: Random, + rand: ThreadLocalRandom, drawn: OpenHashSet[Int], length: Int): Long = { var size = 0L - for (i <- 0 until ARRAY_SAMPLE_SIZE) { + // avoid the use of an iterator derived from the range syntax here for performance + var count = 0 + val end = ARRAY_SAMPLE_SIZE + while (count <= end) { --- End diff -- Use `count < end` rather than `count <= end`: with `<=` the loop body executes ARRAY_SAMPLE_SIZE + 1 times, whereas the original `for (i <- 0 until ARRAY_SAMPLE_SIZE)` runs exactly ARRAY_SAMPLE_SIZE times — `<` preserves the `until` (exclusive upper bound) semantics.
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org