msamirkhan commented on a change in pull request #29354: URL: https://github.com/apache/spark/pull/29354#discussion_r465994403
########## File path: external/avro/src/main/scala/org/apache/spark/sql/avro/SparkAvroDatumReader.scala ########## @@ -452,71 +452,73 @@ class SparkAvroDatumReader[T]( private[this] def getArrayReader(avroType: Schema, elementType: DataType, - path: List[String], - reuseObj: Boolean + path: List[String] ): (CatalystDataUpdater, Int, ResolvingDecoder) => Unit = { val elementReader = newReader(avroType.getElementType, elementType, path, false) - val array = new ArrayBuffer[Any] - val arrayUpdater = new ArrayBufferUpdater(array) - val toArrayConverter = getToArrayDataConverter(elementType, array) + val arrayCreator = getArrayDataCreator(elementType) + val arrayExpander = getArrayDataExpander(elementType) (updater, ordinal, in) => { var length = in.readArrayStart() - array.sizeHint(length.toInt) + var array = arrayCreator(length) // if (length == 0) 0 else 1) + val arrayUpdater = new ArrayDataUpdater(array) + var base: Int = 0 while (length > 0) { var i = 0 while (i < length) { - elementReader(arrayUpdater, i, in) + elementReader(arrayUpdater, base + i, in) i += 1 + // array = arrayExpander(arrayUpdater, if (i == length) 0 else 1) } + base += length.toInt length = in.arrayNext() - array.sizeHint((array.length + length).toInt) + array = arrayExpander(arrayUpdater, length) // if (length == 0) 0 else 1) } - updater.set(ordinal, toArrayConverter()) - array.clear() + updater.set(ordinal, array) } } Review comment: In this commit, arrays are read directly into ArrayData. This requires the ability to "expand" GenericArrayData and UnsafeArrayData; one method was added to UnsafeArrayData for this purpose (see https://github.com/apache/spark/pull/29354/commits/2c62ac9ae960582058e96860002f7768eebb95f2#r465994024). Read-time improvements can be found on page 2 of the benchmarks PDF, under column G. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org