huaxingao commented on a change in pull request #32049:
URL: https://github.com/apache/spark/pull/32049#discussion_r606863729



##########
File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala
##########
@@ -127,4 +147,328 @@ object ParquetUtils {
     file.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE ||
       file.getName == ParquetFileWriter.PARQUET_METADATA_FILE
   }
+
+  /**
+   * Writes one value per column into a single [[InternalRow]] shaped like `dataSchema`.
+   *
+   * For every index `i`, `values(i)` is interpreted according to the Parquet primitive type
+   * `parquetTypes(i)` and the Spark type `dataSchema.fields(i).dataType`, then stored in
+   * field `i` of the returned row. A `null` value becomes a null field.
+   *
+   * @param footer Parquet footer; its key/value metadata is used to resolve the rebase modes
+   * @param parquetTypes Parquet primitive type of each value, index-aligned with `values`
+   * @param values the values to write, index-aligned with `dataSchema.fields`
+   * @param dataSchema Spark schema of the resulting row
+   * @param datetimeRebaseModeInRead rebase mode passed to `DataSourceUtils.datetimeRebaseMode`
+   * @param int96RebaseModeInRead rebase mode passed to `DataSourceUtils.int96RebaseMode`
+   * @param convertTz optional zone applied to INT96 timestamps via `DateTimeUtils.convertTz`
+   * @return the populated row
+   * @throws IllegalArgumentException if a Parquet type / Spark type combination is unsupported
+   */
+  private[sql] def aggResultToSparkInternalRows(
+      footer: ParquetMetadata,
+      parquetTypes: Seq[PrimitiveType.PrimitiveTypeName],
+      values: Seq[Any],
+      dataSchema: StructType,
+      datetimeRebaseModeInRead: String,
+      int96RebaseModeInRead: String,
+      convertTz: Option[ZoneId]): InternalRow = {
+    val mutableRow = new SpecificInternalRow(dataSchema.fields.map(_.dataType))
+    val footerFileMetaData = footer.getFileMetaData
+    val datetimeRebaseMode = DataSourceUtils.datetimeRebaseMode(
+      footerFileMetaData.getKeyValueMetaData.get,
+      datetimeRebaseModeInRead)
+    val int96RebaseMode = DataSourceUtils.int96RebaseMode(
+      footerFileMetaData.getKeyValueMetaData.get,
+      int96RebaseModeInRead)
+
+    // Builds a Decimal from the unscaled value encoded in `bytes` (see java.math.BigInteger).
+    def decimalFromBytes(bytes: Array[Byte], d: DecimalType): Decimal =
+      Decimal(new BigDecimal(new BigInteger(bytes), d.scale), d.precision, d.scale)
+
+    // Only side effects on `mutableRow` are needed, so iterate with foreach rather than map.
+    parquetTypes.zipWithIndex.foreach {
+      // A null value becomes a null field for every known Parquet type. A null type name
+      // still falls through to the final catch-all below.
+      case (parquetType, i) if parquetType != null && values(i) == null =>
+        mutableRow.setNullAt(i)
+      case (PrimitiveType.PrimitiveTypeName.INT32, i) =>
+        // Evaluated lazily so an unsupported Spark type is reported before any cast happens.
+        def intValue: Integer = values(i).asInstanceOf[Integer]
+        dataSchema.fields(i).dataType match {
+          case _: ByteType =>
+            mutableRow.setByte(i, intValue.toByte)
+          case _: ShortType =>
+            mutableRow.setShort(i, intValue.toShort)
+          case _: IntegerType =>
+            mutableRow.setInt(i, intValue)
+          case _: DateType =>
+            val dateRebaseFunc = DataSourceUtils.creteDateRebaseFuncInRead(
+              datetimeRebaseMode, "Parquet")
+            mutableRow.update(i, dateRebaseFunc(intValue))
+          case d: DecimalType =>
+            val decimal = Decimal(intValue.toLong, d.precision, d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new IllegalArgumentException("Unexpected type for INT32")
+        }
+      case (PrimitiveType.PrimitiveTypeName.INT64, i) =>
+        dataSchema.fields(i).dataType match {
+          case _: LongType =>
+            mutableRow.setLong(i, values(i).asInstanceOf[Long])
+          case d: DecimalType =>
+            // INT64 values are Longs (as in the LongType branch above); casting to Integer
+            // here would fail with a ClassCastException.
+            val decimal = Decimal(values(i).asInstanceOf[Long], d.precision, d.scale)
+            mutableRow.setDecimal(i, decimal, d.precision)
+          case _ => throw new IllegalArgumentException("Unexpected type for INT64")
+        }
+      case (PrimitiveType.PrimitiveTypeName.INT96, i) =>
+        dataSchema.fields(i).dataType match {
+          case _: LongType =>
+            mutableRow.setLong(i, values(i).asInstanceOf[Long])
+          case _: TimestampType =>
+            // Decode the INT96 binary, rebase it, then optionally apply `convertTz`.
+            val int96RebaseFunc = DataSourceUtils.creteTimestampRebaseFuncInRead(
+              int96RebaseMode, "Parquet INT96")
+            val julianMicros =
+              ParquetRowConverter.binaryToSQLTimestamp(values(i).asInstanceOf[Binary])
+            val gregorianMicros = int96RebaseFunc(julianMicros)
+            val adjTime = convertTz
+              .map(DateTimeUtils.convertTz(gregorianMicros, _, ZoneOffset.UTC))
+              .getOrElse(gregorianMicros)
+            mutableRow.setLong(i, adjTime)
+          // Fail loudly instead of silently leaving the field unwritten.
+          case _ => throw new IllegalArgumentException("Unexpected type for INT96")
+        }
+      case (PrimitiveType.PrimitiveTypeName.FLOAT, i) =>
+        mutableRow.setFloat(i, values(i).asInstanceOf[Float])
+      case (PrimitiveType.PrimitiveTypeName.DOUBLE, i) =>
+        mutableRow.setDouble(i, values(i).asInstanceOf[Double])
+      case (PrimitiveType.PrimitiveTypeName.BOOLEAN, i) =>
+        mutableRow.setBoolean(i, values(i).asInstanceOf[Boolean])
+      case (PrimitiveType.PrimitiveTypeName.BINARY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        dataSchema.fields(i).dataType match {
+          case _: StringType =>
+            mutableRow.update(i, UTF8String.fromBytes(bytes))
+          case _: BinaryType =>
+            mutableRow.update(i, bytes)
+          case d: DecimalType =>
+            mutableRow.setDecimal(i, decimalFromBytes(bytes, d), d.precision)
+          case _ => throw new IllegalArgumentException("Unexpected type for Binary")
+        }
+      case (PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        dataSchema.fields(i).dataType match {
+          case d: DecimalType =>
+            mutableRow.setDecimal(i, decimalFromBytes(bytes, d), d.precision)
+          case _ =>
+            throw new IllegalArgumentException("Unexpected type for FIXED_LEN_BYTE_ARRAY")
+        }
+      case _ =>
+        throw new IllegalArgumentException("Unexpected parquet type name")
+    }
+    mutableRow
+  }
+
+  /**
+   * Writes one value per column into a single-row [[ColumnarBatch]] shaped like `dataSchema`.
+   *
+   * For every index `i`, `values(i)` is interpreted according to the Parquet primitive type
+   * `parquetTypes(i)` (and, for INT32 and INT96, the Spark type of field `i`) and appended
+   * to column vector `i`. A `null` value is appended as a null.
+   *
+   * @param footer Parquet footer; its key/value metadata is used to resolve the rebase modes
+   * @param parquetTypes Parquet primitive type of each value, index-aligned with `values`
+   * @param values the values to write, index-aligned with `dataSchema.fields`
+   * @param dataSchema Spark schema used to allocate the column vectors
+   * @param offHeap whether to allocate off-heap rather than on-heap column vectors
+   * @param datetimeRebaseModeInRead rebase mode passed to `DataSourceUtils.datetimeRebaseMode`
+   * @param int96RebaseModeInRead rebase mode passed to `DataSourceUtils.int96RebaseMode`
+   * @param convertTz optional zone applied to INT96 timestamps via `DateTimeUtils.convertTz`
+   * @return a batch wrapping the populated vectors, with a row count of 1
+   * @throws IllegalArgumentException if a Parquet type / Spark type combination is unsupported
+   */
+  private[sql] def aggResultToSparkColumnarBatch(
+      footer: ParquetMetadata,
+      parquetTypes: Seq[PrimitiveType.PrimitiveTypeName],
+      values: Seq[Any],
+      dataSchema: StructType,
+      offHeap: Boolean,
+      datetimeRebaseModeInRead: String,
+      int96RebaseModeInRead: String,
+      convertTz: Option[ZoneId]): ColumnarBatch = {
+    val capacity = 4 * 1024
+    val footerFileMetaData = footer.getFileMetaData
+    val datetimeRebaseMode = DataSourceUtils.datetimeRebaseMode(
+      footerFileMetaData.getKeyValueMetaData.get,
+      datetimeRebaseModeInRead)
+    val int96RebaseMode = DataSourceUtils.int96RebaseMode(
+      footerFileMetaData.getKeyValueMetaData.get,
+      int96RebaseModeInRead)
+    val columnVectors = if (offHeap) {
+      OffHeapColumnVector.allocateColumns(capacity, dataSchema)
+    } else {
+      OnHeapColumnVector.allocateColumns(capacity, dataSchema)
+    }
+
+    // Only side effects on the vectors are needed, so iterate with foreach rather than map.
+    parquetTypes.zipWithIndex.foreach {
+      // A null value is appended as null for every known Parquet type. A null type name
+      // still falls through to the final catch-all below.
+      case (parquetType, i) if parquetType != null && values(i) == null =>
+        columnVectors(i).appendNull()
+      case (PrimitiveType.PrimitiveTypeName.INT32, i) =>
+        // Evaluated lazily so an unsupported Spark type is reported before any cast happens.
+        def intValue: Integer = values(i).asInstanceOf[Integer]
+        dataSchema.fields(i).dataType match {
+          case _: ByteType =>
+            columnVectors(i).appendByte(intValue.toByte)
+          case _: ShortType =>
+            columnVectors(i).appendShort(intValue.toShort)
+          case _: IntegerType =>
+            columnVectors(i).appendInt(intValue)
+          case _: DateType =>
+            val dateRebaseFunc = DataSourceUtils.creteDateRebaseFuncInRead(
+              datetimeRebaseMode, "Parquet")
+            columnVectors(i).appendInt(dateRebaseFunc(intValue))
+          case _ => throw new IllegalArgumentException("Unexpected type for INT32")
+        }
+      case (PrimitiveType.PrimitiveTypeName.INT64, i) =>
+        columnVectors(i).appendLong(values(i).asInstanceOf[Long])
+      case (PrimitiveType.PrimitiveTypeName.INT96, i) =>
+        dataSchema.fields(i).dataType match {
+          case _: LongType =>
+            columnVectors(i).appendLong(values(i).asInstanceOf[Long])
+          case _: TimestampType =>
+            // Decode the INT96 binary, rebase it, then optionally apply `convertTz`.
+            val int96RebaseFunc = DataSourceUtils.creteTimestampRebaseFuncInRead(
+              int96RebaseMode, "Parquet INT96")
+            val julianMicros =
+              ParquetRowConverter.binaryToSQLTimestamp(values(i).asInstanceOf[Binary])
+            val gregorianMicros = int96RebaseFunc(julianMicros)
+            val adjTime = convertTz
+              .map(DateTimeUtils.convertTz(gregorianMicros, _, ZoneOffset.UTC))
+              .getOrElse(gregorianMicros)
+            columnVectors(i).appendLong(adjTime)
+          case _ => throw new IllegalArgumentException("Unexpected type for INT96")
+        }
+      case (PrimitiveType.PrimitiveTypeName.FLOAT, i) =>
+        columnVectors(i).appendFloat(values(i).asInstanceOf[Float])
+      case (PrimitiveType.PrimitiveTypeName.DOUBLE, i) =>
+        columnVectors(i).appendDouble(values(i).asInstanceOf[Double])
+      case (PrimitiveType.PrimitiveTypeName.BINARY, i) =>
+        // Append (rather than put at a fixed row id) to match every other branch, including
+        // the null case, so the vector's appended-element count stays consistent.
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        columnVectors(i).appendByteArray(bytes, 0, bytes.length)
+      case (PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, i) =>
+        val bytes = values(i).asInstanceOf[Binary].getBytes
+        columnVectors(i).appendByteArray(bytes, 0, bytes.length)
+      case (PrimitiveType.PrimitiveTypeName.BOOLEAN, i) =>
+        columnVectors(i).appendBoolean(values(i).asInstanceOf[Boolean])
+      case _ =>
+        throw new IllegalArgumentException("Unexpected parquet type name")
+    }
+    new ColumnarBatch(columnVectors.asInstanceOf[Array[ColumnVector]], 1)
+  }
+
+  private[sql] def getPushedDownAggResult(
+      footer: ParquetMetadata,
+      dataSchema: StructType,
+      aggregation: Aggregation)
+  : (Array[PrimitiveType.PrimitiveTypeName], Array[Any]) = {
+    val footerFileMetaData = footer.getFileMetaData
+    val fields = footerFileMetaData.getSchema.getFields
+    val blocks = footer.getBlocks()
+    val typesBuilder = ArrayBuilder.make[PrimitiveType.PrimitiveTypeName]
+    val valuesBuilder = ArrayBuilder.make[Any]
+
+    blocks.forEach { block =>
+      val columns = block.getColumns()
+      for (i <- 0 until aggregation.aggregateExpressions.size) {

Review comment:
       Sorry, I didn't do this right. Will fix this.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to