Github user mridulm commented on a diff in the pull request: https://github.com/apache/spark/pull/16196#discussion_r91270825 --- Diff: core/src/main/scala/org/apache/spark/util/SizeEstimator.scala --- @@ -243,47 +253,59 @@ object SizeEstimator extends Logging { arrSize += alignSize(length.toLong * primitiveSize(elementClass)) state.size += arrSize } else { + // We know that the array we are dealing with is an array of references + // so explicitly expose this type so we can directly manipulate the array + // without help from the Scala runtime for efficiency arrSize += alignSize(length.toLong * pointerSize) state.size += arrSize + val objArray = array.asInstanceOf[Array[AnyRef]] + if (length <= ARRAY_SIZE_FOR_SAMPLING) { var arrayIndex = 0 while (arrayIndex < length) { - state.enqueue(ScalaRunTime.array_apply(array, arrayIndex).asInstanceOf[AnyRef]) + state.enqueue(objArray(arrayIndex)) arrayIndex += 1 } } else { // Estimate the size of a large array by sampling elements without replacement. // To exclude the shared objects that the array elements may link, sample twice - // and use the min one to calculate array size. - val rand = new Random(42) + // and use the min one to calculate array size. 
+ // Use ThreadLocalRandom here since the random is only accessed from 1 thread + // and we can save the overhead of the full thread-safe Random + val rand = ThreadLocalRandom.current val drawn = new OpenHashSet[Int](2 * ARRAY_SAMPLE_SIZE) - val s1 = sampleArray(array, state, rand, drawn, length) - val s2 = sampleArray(array, state, rand, drawn, length) + val s1 = sampleArray(objArray, state, rand, drawn, length) + val s2 = sampleArray(objArray, state, rand, drawn, length) val size = math.min(s1, s2) + state.size += math.max(s1, s2) + (size * ((length - ARRAY_SAMPLE_SIZE) / (ARRAY_SAMPLE_SIZE))).toLong } } } private def sampleArray( - array: AnyRef, + array: Array[AnyRef], state: SearchState, - rand: Random, + rand: ThreadLocalRandom, drawn: OpenHashSet[Int], length: Int): Long = { var size = 0L - for (i <- 0 until ARRAY_SAMPLE_SIZE) { + // avoid the use of an iterator derived from the range syntax here for performance + var count = 0 + val end = ARRAY_SAMPLE_SIZE + while (count <= end) { --- End diff -- Use `count < end` rather than `count <= end`: with `<=` the loop body executes ARRAY_SAMPLE_SIZE + 1 times, whereas the original `for (i <- 0 until ARRAY_SAMPLE_SIZE)` runs exactly ARRAY_SAMPLE_SIZE times — `<` preserves the `until` (exclusive upper bound) semantics.
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org