This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.5 by this push: new 45befc07d2a0 [SPARK-48019][SQL][FOLLOWUP] Use primitive arrays over object arrays when nulls exist 45befc07d2a0 is described below commit 45befc07d2a064ab2a279a113489ed5c66f7a69d Author: Gene Pang <gene.p...@databricks.com> AuthorDate: Sun May 5 21:50:15 2024 +0800 [SPARK-48019][SQL][FOLLOWUP] Use primitive arrays over object arrays when nulls exist ### What changes were proposed in this pull request? This is a followup to https://github.com/apache/spark/pull/46254 . Instead of using object arrays when nulls are present, continue to use primitive arrays when appropriate. This PR sets the null bits appropriately for the primitive array copy. Primitive arrays are faster than object arrays and won't create unnecessary objects. ### Why are the changes needed? This will improve performance and memory usage when nulls are present in the `ColumnarArray`. ### Does this PR introduce _any_ user-facing change? Copying a `ColumnarArray` that contains nulls is expected to be faster. ### How was this patch tested? Existing tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46372 from gene-db/primitive-nulls. 
Authored-by: Gene Pang <gene.p...@databricks.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> (cherry picked from commit bf2e25459fe46ca2b1d26e1c98c873923fc135e1) Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../apache/spark/sql/vectorized/ColumnarArray.java | 36 ++++++++++++++-------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java b/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java index c4de83cf8b82..1f8e679a4146 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java @@ -47,31 +47,43 @@ public final class ColumnarArray extends ArrayData { return length; } + /** + * Sets all the appropriate null bits in the input UnsafeArrayData. + * + * @param arrayData The UnsafeArrayData to set the null bits for + * @return The UnsafeArrayData with the null bits set + */ + private UnsafeArrayData setNullBits(UnsafeArrayData arrayData) { + if (data.hasNull()) { + for (int i = 0; i < length; i++) { + if (data.isNullAt(i)) { + arrayData.setNullAt(i); + } + } + } + return arrayData; + } + @Override public ArrayData copy() { DataType dt = data.dataType(); - if (data.hasNull()) { - // UnsafeArrayData cannot be used if there are any nulls. 
- return new GenericArrayData(toObjectArray(dt)).copy(); - } - if (dt instanceof BooleanType) { - return UnsafeArrayData.fromPrimitiveArray(toBooleanArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toBooleanArray())); } else if (dt instanceof ByteType) { - return UnsafeArrayData.fromPrimitiveArray(toByteArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toByteArray())); } else if (dt instanceof ShortType) { - return UnsafeArrayData.fromPrimitiveArray(toShortArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toShortArray())); } else if (dt instanceof IntegerType || dt instanceof DateType || dt instanceof YearMonthIntervalType) { - return UnsafeArrayData.fromPrimitiveArray(toIntArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toIntArray())); } else if (dt instanceof LongType || dt instanceof TimestampType || dt instanceof DayTimeIntervalType) { - return UnsafeArrayData.fromPrimitiveArray(toLongArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toLongArray())); } else if (dt instanceof FloatType) { - return UnsafeArrayData.fromPrimitiveArray(toFloatArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toFloatArray())); } else if (dt instanceof DoubleType) { - return UnsafeArrayData.fromPrimitiveArray(toDoubleArray()); + return setNullBits(UnsafeArrayData.fromPrimitiveArray(toDoubleArray())); } else { return new GenericArrayData(toObjectArray(dt)).copy(); // ensure the elements are copied. } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org