Github user henryr commented on a diff in the pull request: https://github.com/apache/spark/pull/19769#discussion_r151830623 --- Diff: sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java --- @@ -430,9 +439,11 @@ private void readBinaryBatch(int rowId, int num, WritableColumnVector column) { } else if (column.dataType() == DataTypes.TimestampType) { for (int i = 0; i < num; i++) { if (defColumn.readInteger() == maxDefLevel) { - column.putLong(rowId + i, - // Read 12 bytes for INT96 - ParquetRowConverter.binaryToSQLTimestamp(data.readBinary(12))); + // Read 12 bytes for INT96 + long rawTime = ParquetRowConverter.binaryToSQLTimestamp(data.readBinary(12)); + long adjTime = + convertTz == null ? rawTime : DateTimeUtils.convertTz(rawTime, convertTz, UTC); --- End diff -- It might be worth hoisting the conditional here as high as possible: if (convertTz == null) { for (int i = 0; i < num; i++) { /// etc } } else { for (int i = 0; i < num; i++) { /// etc } } In this case the code duplication might be worth it to avoid a branch on every read.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org