GitHub user rxin commented on a diff in the pull request: https://github.com/apache/spark/pull/7592#discussion_r35399269 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala --- @@ -149,31 +158,141 @@ private[joins] object HashedRelation { } } +/** + * An extended CompactBuffer that could grow and update. + */ +private[joins] class MutableCompactBuffer[T: ClassTag] extends CompactBuffer[T] { + override def growToSize(newSize: Int): Unit = super.growToSize(newSize) + override def update(i: Int, v: T): Unit = super.update(i, v) +} /** * A HashedRelation for UnsafeRow, which is backed by BytesToBytesMap that maps the key into a * sequence of values. - * - * TODO(davies): use BytesToBytesMap */ private[joins] final class UnsafeHashedRelation( private var hashTable: JavaHashMap[UnsafeRow, CompactBuffer[UnsafeRow]]) extends HashedRelation with Externalizable { - def this() = this(null) // Needed for serialization + private[joins] def this() = this(null) // Needed for serialization + + // Use BytesToBytesMap in executor for better performance (it's created when deserialization) + @transient private[this] var binaryMap: BytesToBytesMap = _ + + // A pool of compact buffers to reduce memory garbage + @transient private[this] val bufferPool = new ThreadLocal[MutableCompactBuffer[UnsafeRow]] --- End diff -- "Buffer pool" has a special meaning in databases. We should pick a different name.
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and would like it enabled, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org