msamirkhan commented on a change in pull request #29354: URL: https://github.com/apache/spark/pull/29354#discussion_r465982616
########## File path: external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala ########## @@ -367,15 +372,45 @@ class AvroDeserializer( } } - private def createArrayData(elementType: DataType, length: Int): ArrayData = elementType match { - case BooleanType => UnsafeArrayData.fromPrimitiveArray(new Array[Boolean](length)) - case ByteType => UnsafeArrayData.fromPrimitiveArray(new Array[Byte](length)) - case ShortType => UnsafeArrayData.fromPrimitiveArray(new Array[Short](length)) - case IntegerType => UnsafeArrayData.fromPrimitiveArray(new Array[Int](length)) - case LongType => UnsafeArrayData.fromPrimitiveArray(new Array[Long](length)) - case FloatType => UnsafeArrayData.fromPrimitiveArray(new Array[Float](length)) - case DoubleType => UnsafeArrayData.fromPrimitiveArray(new Array[Double](length)) - case _ => new GenericArrayData(new Array[Any](length)) + private def getArrayDataCreator(elementType: DataType): Int => ArrayData = elementType match { + case BooleanType => length => UnsafeArrayData.createFreshArray(length, 1) + case ByteType => length => UnsafeArrayData.createFreshArray(length, 1) + case ShortType => length => UnsafeArrayData.createFreshArray(length, 2) + case IntegerType => length => UnsafeArrayData.createFreshArray(length, 4) + case LongType => length => UnsafeArrayData.createFreshArray(length, 8) + case FloatType => length => UnsafeArrayData.createFreshArray(length, 4) + case DoubleType => length => UnsafeArrayData.createFreshArray(length, 8) + case _ => length => new GenericArrayData(new Array[Any](length)) + } + + private def getRowCreator(st: StructType): () => InternalRow = { + val constructorsArray = new Array[Unit => MutableValue](st.fields.length) + var i = 0 + while (i < st.fields.length) { + st.fields(i).dataType match { + case BooleanType => constructorsArray(i) = _ => new MutableBoolean + case ByteType => constructorsArray(i) = _ => new MutableByte + case ShortType => constructorsArray(i) = _ => new MutableShort + // 
We use INT for DATE internally + case IntegerType | DateType => constructorsArray(i) = _ => new MutableInt + // We use Long for Timestamp internally + case LongType | TimestampType => constructorsArray(i) = _ => new MutableLong + case FloatType => constructorsArray(i) = _ => new MutableFloat + case DoubleType => constructorsArray(i) = _ => new MutableDouble + case _ => constructorsArray(i) = _ => new MutableAny + } + i += 1 + } + + () => { + val array = new Array[MutableValue](constructorsArray.length) + var i = 0 + while (i < constructorsArray.length) { + array(i) = constructorsArray(i)(Unit) + i += 1 + } + new SpecificInternalRow(array) + } Review comment: The profiler showed some time being spent in the SpecificInternalRow constructor, and we saw improvements when moving to this model, where we fill in an array of constructors based on the schema and, for each data point, call these constructors one by one. In retrospect, the changes can instead be made to the SpecificInternalRow constructor, which will benefit https://github.com/apache/spark/pull/29353 as well. So this has been reverted in a later commit. Read-time improvements can be found on page 2 of the PDF attached to the PR, under column B. The changes to the SpecificInternalRow constructor can be found here: https://github.com/apache/spark/pull/29353#issuecomment-669459288 ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org