[GitHub] [spark] HyukjinKwon commented on a change in pull request #29353: [SPARK-32532][SQL] Improve ORC read/write performance on nested structs and array of structs

GitBox Tue, 04 Aug 2020 21:13:11 -0700


HyukjinKwon commented on a change in pull request #29353:
URL: https://github.com/apache/spark/pull/29353#discussion_r465460837




##########
File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
##########
@@ -72,137 +74,191 @@ class OrcDeserializer(
   /**
    * Creates a writer to write ORC values to Catalyst data structure at the 
given ordinal.
    */
-  private def newWriter(
-      dataType: DataType, updater: CatalystDataUpdater): (Int, 
WritableComparable[_]) => Unit =
-    dataType match {
-      case NullType => (ordinal, _) =>
-        updater.setNullAt(ordinal)
-
-      case BooleanType => (ordinal, value) =>
-        updater.setBoolean(ordinal, value.asInstanceOf[BooleanWritable].get)
-
-      case ByteType => (ordinal, value) =>
-        updater.setByte(ordinal, value.asInstanceOf[ByteWritable].get)
-
-      case ShortType => (ordinal, value) =>
-        updater.setShort(ordinal, value.asInstanceOf[ShortWritable].get)
-
-      case IntegerType => (ordinal, value) =>
-        updater.setInt(ordinal, value.asInstanceOf[IntWritable].get)
-
-      case LongType => (ordinal, value) =>
-        updater.setLong(ordinal, value.asInstanceOf[LongWritable].get)
-
-      case FloatType => (ordinal, value) =>
-        updater.setFloat(ordinal, value.asInstanceOf[FloatWritable].get)
-
-      case DoubleType => (ordinal, value) =>
-        updater.setDouble(ordinal, value.asInstanceOf[DoubleWritable].get)
-
-      case StringType => (ordinal, value) =>
-        updater.set(ordinal, 
UTF8String.fromBytes(value.asInstanceOf[Text].copyBytes))
-
-      case BinaryType => (ordinal, value) =>
-        val binary = value.asInstanceOf[BytesWritable]
-        val bytes = new Array[Byte](binary.getLength)
-        System.arraycopy(binary.getBytes, 0, bytes, 0, binary.getLength)
-        updater.set(ordinal, bytes)
-
-      case DateType => (ordinal, value) =>
-        updater.setInt(ordinal, OrcShimUtils.getGregorianDays(value))
-
-      case TimestampType => (ordinal, value) =>
-        updater.setLong(ordinal, 
DateTimeUtils.fromJavaTimestamp(value.asInstanceOf[OrcTimestamp]))
-
-      case DecimalType.Fixed(precision, scale) => (ordinal, value) =>
-        val v = OrcShimUtils.getDecimal(value)
-        v.changePrecision(precision, scale)
-        updater.set(ordinal, v)
-
-      case st: StructType => (ordinal, value) =>
-        val result = new SpecificInternalRow(st)
-        val fieldUpdater = new RowUpdater(result)
-        val fieldConverters = st.map(_.dataType).map { dt =>
-          newWriter(dt, fieldUpdater)
-        }.toArray
-        val orcStruct = value.asInstanceOf[OrcStruct]
-
-        var i = 0
-        while (i < st.length) {
-          val value = orcStruct.getFieldValue(i)
-          if (value == null) {
-            result.setNullAt(i)
-          } else {
-            fieldConverters(i)(i, value)
+  private def newWriter(dataType: DataType, reuseObj: Boolean):
+  (CatalystDataUpdater, Int, WritableComparable[_]) => Unit = dataType match {

Review comment:
       I would keep this style as:
   
   ```scala
   private def newWriter(
       dataType: DataType, reuseObj: Boolean)
   : (CatalystDataUpdater, Int, WritableComparable[_]) => Unit =
     dataType match {
       case NullType => (updater, ordinal, _) =>
         ...
     }
   ```
   
   to reduce the diff.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] [spark] HyukjinKwon commented on a change in pull request #29353: [SPARK-32532][SQL] Improve ORC read/write performance on nested structs and array of structs

Reply via email to