This is an automated email from the ASF dual-hosted git repository. pvary pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new 937b165d908 HIVE-22670: ArrayIndexOutOfBoundsException when vectorized reader is (#3328) (Ganesha Shreedhara and Abhay Chennagiri reviewed by Peter Vary) 937b165d908 is described below commit 937b165d908229d6b01f3ffaa064cf442de1d9ec Author: achennagiri <77031092+achennag...@users.noreply.github.com> AuthorDate: Tue May 31 00:05:49 2022 -0700 HIVE-22670: ArrayIndexOutOfBoundsException when vectorized reader is (#3328) (Ganesha Shreedhara and Abhay Chennagiri reviewed by Peter Vary) --- data/files/hive22670.parquet | Bin 0 -> 737 bytes .../vector/VectorizedPrimitiveColumnReader.java | 134 +++++++++++++-------- .../clientpositive/parquet_vectorization_18.q | 24 ++++ .../llap/parquet_vectorization_18.q.out | 74 ++++++++++++ 4 files changed, 179 insertions(+), 53 deletions(-) diff --git a/data/files/hive22670.parquet b/data/files/hive22670.parquet new file mode 100644 index 00000000000..2700b6fb711 Binary files /dev/null and b/data/files/hive22670.parquet differ diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedPrimitiveColumnReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedPrimitiveColumnReader.java index bb08c278668..db52d6a2964 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedPrimitiveColumnReader.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedPrimitiveColumnReader.java @@ -521,31 +521,37 @@ public class VectorizedPrimitiveColumnReader extends BaseVectorizedColumnReader switch (primitiveColumnType.getPrimitiveCategory()) { case INT: for (int i = rowId; i < rowId + num; ++i) { - ((LongColumnVector) column).vector[i] = - dictionary.readInteger((int) dictionaryIds.vector[i]); - if (!dictionary.isValid()) { - setNullValue(column, i); - ((LongColumnVector) column).vector[i] = 0; + if (!column.isNull[i]) { + ((LongColumnVector) column).vector[i] = + dictionary.readInteger((int) dictionaryIds.vector[i]); + if (!dictionary.isValid()) { + setNullValue(column, i); + ((LongColumnVector) column).vector[i] = 0; + } } } break; case BYTE: for (int i = rowId; i < rowId + num; ++i) { - ((LongColumnVector) column).vector[i] = - dictionary.readTinyInt((int) dictionaryIds.vector[i]); - if (!dictionary.isValid()) { - setNullValue(column, i); - ((LongColumnVector) column).vector[i] = 0; + if (!column.isNull[i]) { + ((LongColumnVector) column).vector[i] = + dictionary.readTinyInt((int) dictionaryIds.vector[i]); + if (!dictionary.isValid()) { + setNullValue(column, i); + ((LongColumnVector) column).vector[i] = 0; + } } } break; case SHORT: for (int i = rowId; i < rowId + num; ++i) { - ((LongColumnVector) column).vector[i] = - dictionary.readSmallInt((int) dictionaryIds.vector[i]); - if (!dictionary.isValid()) { - setNullValue(column, i); - ((LongColumnVector) column).vector[i] = 0; + if (!column.isNull[i]) { + ((LongColumnVector) column).vector[i] = + dictionary.readSmallInt((int) dictionaryIds.vector[i]); + if (!dictionary.isValid()) { + setNullValue(column, i); + ((LongColumnVector) column).vector[i] = 0; + } } } break; @@ -553,74 +559,92 @@ public class VectorizedPrimitiveColumnReader extends BaseVectorizedColumnReader DateColumnVector dc = (DateColumnVector) column; dc.setUsingProlepticCalendar(true); for (int i = rowId; i < rowId + num; ++i) { - dc.vector[i] = - skipProlepticConversion ? - dictionary.readLong((int) dictionaryIds.vector[i]) : - CalendarUtils.convertDateToProleptic((int) dictionary.readLong((int) dictionaryIds.vector[i])); - if (!dictionary.isValid()) { - setNullValue(column, i); - dc.vector[i] = 0; + if (!column.isNull[i]) { + dc.vector[i] = + skipProlepticConversion ? + dictionary.readLong((int) dictionaryIds.vector[i]) : + CalendarUtils.convertDateToProleptic((int) dictionary.readLong((int) dictionaryIds.vector[i])); + if (!dictionary.isValid()) { + setNullValue(column, i); + dc.vector[i] = 0; + } } } break; case INTERVAL_YEAR_MONTH: case LONG: for (int i = rowId; i < rowId + num; ++i) { - ((LongColumnVector) column).vector[i] = - dictionary.readLong((int) dictionaryIds.vector[i]); - if (!dictionary.isValid()) { - setNullValue(column, i); - ((LongColumnVector) column).vector[i] = 0; + if (!column.isNull[i]) { + ((LongColumnVector) column).vector[i] = + dictionary.readLong((int) dictionaryIds.vector[i]); + if (!dictionary.isValid()) { + setNullValue(column, i); + ((LongColumnVector) column).vector[i] = 0; + } } } break; case BOOLEAN: for (int i = rowId; i < rowId + num; ++i) { - ((LongColumnVector) column).vector[i] = - dictionary.readBoolean((int) dictionaryIds.vector[i]) ? 1 : 0; + if (!column.isNull[i]) { + ((LongColumnVector) column).vector[i] = + dictionary.readBoolean((int) dictionaryIds.vector[i]) ? 1 : 0; + } } break; case DOUBLE: for (int i = rowId; i < rowId + num; ++i) { - ((DoubleColumnVector) column).vector[i] = - dictionary.readDouble((int) dictionaryIds.vector[i]); - if (!dictionary.isValid()) { - setNullValue(column, i); - ((DoubleColumnVector) column).vector[i] = 0; + if (!column.isNull[i]) { + ((DoubleColumnVector) column).vector[i] = + dictionary.readDouble((int) dictionaryIds.vector[i]); + if (!dictionary.isValid()) { + setNullValue(column, i); + ((DoubleColumnVector) column).vector[i] = 0; + } } } break; case BINARY: for (int i = rowId; i < rowId + num; ++i) { - ((BytesColumnVector) column) - .setVal(i, dictionary.readBytes((int) dictionaryIds.vector[i])); + if (!column.isNull[i]) { + ((BytesColumnVector) column) + .setVal(i, dictionary.readBytes((int) dictionaryIds.vector[i])); + } } break; case STRING: for (int i = rowId; i < rowId + num; ++i) { - ((BytesColumnVector) column) - .setVal(i, dictionary.readString((int) dictionaryIds.vector[i])); + if (!column.isNull[i]) { + ((BytesColumnVector) column) + .setVal(i, dictionary.readString((int) dictionaryIds.vector[i])); + } } break; case VARCHAR: for (int i = rowId; i < rowId + num; ++i) { - ((BytesColumnVector) column) - .setVal(i, dictionary.readVarchar((int) dictionaryIds.vector[i])); + if (!column.isNull[i]) { + ((BytesColumnVector) column) + .setVal(i, dictionary.readVarchar((int) dictionaryIds.vector[i])); + } } break; case CHAR: for (int i = rowId; i < rowId + num; ++i) { - ((BytesColumnVector) column) - .setVal(i, dictionary.readChar((int) dictionaryIds.vector[i])); + if (!column.isNull[i]) { + ((BytesColumnVector) column) + .setVal(i, dictionary.readChar((int) dictionaryIds.vector[i])); + } } break; case FLOAT: for (int i = rowId; i < rowId + num; ++i) { - ((DoubleColumnVector) column).vector[i] = - dictionary.readFloat((int) dictionaryIds.vector[i]); - if (!dictionary.isValid()) { - setNullValue(column, i); - ((DoubleColumnVector) column).vector[i] = 0; + if (!column.isNull[i]) { + ((DoubleColumnVector) column).vector[i] = + dictionary.readFloat((int) dictionaryIds.vector[i]); + if (!dictionary.isValid()) { + setNullValue(column, i); + ((DoubleColumnVector) column).vector[i] = 0; + } } } break; @@ -635,11 +659,13 @@ public class VectorizedPrimitiveColumnReader extends BaseVectorizedColumnReader fillDecimalPrecisionScale(decimalLogicalType, decimalColumnVector); for (int i = rowId; i < rowId + num; ++i) { - decimalData = dictionary.readDecimal((int) dictionaryIds.vector[i]); - if (dictionary.isValid()) { - decimalColumnVector.vector[i].set(decimalData, decimalColumnVector.scale); - } else { - setNullValue(column, i); + if (!column.isNull[i]) { + decimalData = dictionary.readDecimal((int) dictionaryIds.vector[i]); + if (dictionary.isValid()) { + decimalColumnVector.vector[i].set(decimalData, decimalColumnVector.scale); + } else { + setNullValue(column, i); + } } } break; @@ -647,7 +673,9 @@ public class VectorizedPrimitiveColumnReader extends BaseVectorizedColumnReader TimestampColumnVector tsc = (TimestampColumnVector) column; tsc.setUsingProlepticCalendar(true); for (int i = rowId; i < rowId + num; ++i) { - tsc.set(i, dictionary.readTimestamp((int) dictionaryIds.vector[i]).toSqlTimestamp()); + if (!column.isNull[i]) { + tsc.set(i, dictionary.readTimestamp((int) dictionaryIds.vector[i]).toSqlTimestamp()); + } } break; case INTERVAL_DAY_TIME: diff --git a/ql/src/test/queries/clientpositive/parquet_vectorization_18.q b/ql/src/test/queries/clientpositive/parquet_vectorization_18.q new file mode 100644 index 00000000000..d7d707d5cae --- /dev/null +++ b/ql/src/test/queries/clientpositive/parquet_vectorization_18.q @@ -0,0 +1,24 @@ +dfs ${system:test.dfs.mkdir} -p ${system:test.tmp.dir}/hive22670; +dfs -copyFromLocal ../../data/files/hive22670.parquet ${system:test.tmp.dir}/hive22670/; +dfs -ls ${system:test.tmp.dir}/hive22670/; + +drop table if exists test_parquet_na; +create external table test_parquet_na( + x int, + y int) + stored as parquet + location '${system:test.tmp.dir}/hive22670'; + +set hive.vectorized.execution.enabled=false; +select * from test_parquet_na; +select * from test_parquet_na order by y; + +set hive.vectorized.execution.enabled=true; +select * from test_parquet_na; + +set hive.vectorized.execution.enabled=true; +select * from test_parquet_na order by y; + +drop table test_parquet_na; +dfs -ls ${system:test.tmp.dir}/hive22670/; +dfs -rmr ${system:test.tmp.dir}/hive22670; diff --git a/ql/src/test/results/clientpositive/llap/parquet_vectorization_18.q.out b/ql/src/test/results/clientpositive/llap/parquet_vectorization_18.q.out new file mode 100644 index 00000000000..49ace72fefe --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/parquet_vectorization_18.q.out @@ -0,0 +1,74 @@ +Found 1 items +#### A masked pattern was here #### +PREHOOK: query: drop table if exists test_parquet_na +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists test_parquet_na +POSTHOOK: type: DROPTABLE +PREHOOK: query: create external table test_parquet_na( + x int, + y int) + stored as parquet +#### A masked pattern was here #### +PREHOOK: type: CREATETABLE +#### A masked pattern was here #### +PREHOOK: Output: database:default +PREHOOK: Output: default@test_parquet_na +POSTHOOK: query: create external table test_parquet_na( + x int, + y int) + stored as parquet +#### A masked pattern was here #### +POSTHOOK: type: CREATETABLE +#### A masked pattern was here #### +POSTHOOK: Output: database:default +POSTHOOK: Output: default@test_parquet_na +PREHOOK: query: select * from test_parquet_na +PREHOOK: type: QUERY +PREHOOK: Input: default@test_parquet_na +#### A masked pattern was here #### +POSTHOOK: query: select * from test_parquet_na +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_parquet_na +#### A masked pattern was here #### +NULL 1 +NULL 2 +PREHOOK: query: select * from test_parquet_na order by y +PREHOOK: type: QUERY +PREHOOK: Input: default@test_parquet_na +#### A masked pattern was here #### +POSTHOOK: query: select * from test_parquet_na order by y +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_parquet_na +#### A masked pattern was here #### +NULL 1 +NULL 2 +PREHOOK: query: select * from test_parquet_na +PREHOOK: type: QUERY +PREHOOK: Input: default@test_parquet_na +#### A masked pattern was here #### +POSTHOOK: query: select * from test_parquet_na +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_parquet_na +#### A masked pattern was here #### +NULL 1 +NULL 2 +PREHOOK: query: select * from test_parquet_na order by y +PREHOOK: type: QUERY +PREHOOK: Input: default@test_parquet_na +#### A masked pattern was here #### +POSTHOOK: query: select * from test_parquet_na order by y +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_parquet_na +#### A masked pattern was here #### +NULL 1 +NULL 2 +PREHOOK: query: drop table test_parquet_na +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@test_parquet_na +PREHOOK: Output: default@test_parquet_na +POSTHOOK: query: drop table test_parquet_na +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@test_parquet_na +POSTHOOK: Output: default@test_parquet_na +Found 1 items +#### A masked pattern was here ####