This is an automated email from the ASF dual-hosted git repository. kgyrtkirk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
     new 8e693d1 HIVE-25443 : Arrow SerDe Cannot serialize/deserialize complex data types When there are more than 1024 values (#2581) (Syed Shameerur Rahman reviewed by Zoltan Haindrich)
8e693d1 is described below

commit 8e693d1b36e1ff0aacd802d16e1a3d0ec72ef04b
Author: Syed Shameerur Rahman <rhma...@amazon.com>
AuthorDate: Thu Nov 18 12:59:50 2021 +0530

    HIVE-25443 : Arrow SerDe Cannot serialize/deserialize complex data types When there are more than 1024 values (#2581) (Syed Shameerur Rahman reviewed by Zoltan Haindrich)
---
 .../hive/ql/io/arrow/ArrowColumnarBatchSerDe.java | 4 +-
 .../hadoop/hive/ql/io/arrow/Deserializer.java | 3 ++
 .../ql/io/arrow/TestArrowColumnarBatchSerDe.java | 43 ++++++++++++++++++++++
 3 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/ArrowColumnarBatchSerDe.java b/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/ArrowColumnarBatchSerDe.java
index fdef3b8..ceb794f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/ArrowColumnarBatchSerDe.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/ArrowColumnarBatchSerDe.java
@@ -210,9 +210,9 @@ public class ArrowColumnarBatchSerDe extends AbstractSerDe {
   static ListColumnVector toStructListVector(MapColumnVector mapVector) {
     final StructColumnVector structVector;
     final ListColumnVector structListVector;
-    structVector = new StructColumnVector();
+    structVector = new StructColumnVector(mapVector.childCount);
     structVector.fields = new ColumnVector[] {mapVector.keys, mapVector.values};
-    structListVector = new ListColumnVector();
+    structListVector = new ListColumnVector(mapVector.childCount, null);
     structListVector.child = structVector;
     structListVector.childCount = mapVector.childCount;
     structListVector.isRepeating = mapVector.isRepeating;
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Deserializer.java 
b/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Deserializer.java
index ac4d237..ce8488f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Deserializer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Deserializer.java
@@ -391,6 +391,7 @@ class Deserializer {
   private void readList(FieldVector arrowVector, ListColumnVector hiveVector, ListTypeInfo typeInfo) {
     final int size = arrowVector.getValueCount();
+    hiveVector.ensureSize(size, false);
     final ArrowBuf offsets = arrowVector.getOffsetBuffer();
     final int OFFSET_WIDTH = 4;
@@ -412,6 +413,7 @@ class Deserializer {
   private void readMap(FieldVector arrowVector, MapColumnVector hiveVector, MapTypeInfo typeInfo) {
     final int size = arrowVector.getValueCount();
+    hiveVector.ensureSize(size, false);
     final ListTypeInfo mapStructListTypeInfo = toStructListTypeInfo(typeInfo);
     final ListColumnVector mapStructListVector = toStructListVector(hiveVector);
     final StructColumnVector mapStructVector = (StructColumnVector) mapStructListVector.child;
@@ -430,6 +432,7 @@ class Deserializer {
   private void readStruct(FieldVector arrowVector, StructColumnVector hiveVector, StructTypeInfo typeInfo) {
     final int size = arrowVector.getValueCount();
+    hiveVector.ensureSize(size, false);
     final List<TypeInfo> fieldTypeInfos = typeInfo.getAllStructFieldTypeInfos();
     final int fieldSize = arrowVector.getChildrenFromFields().size();
     for (int i = 0; i < fieldSize; i++) {
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/arrow/TestArrowColumnarBatchSerDe.java b/ql/src/test/org/apache/hadoop/hive/ql/io/arrow/TestArrowColumnarBatchSerDe.java
index d803063..a4b296b 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/arrow/TestArrowColumnarBatchSerDe.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/arrow/TestArrowColumnarBatchSerDe.java
@@ -17,6 +17,7 @@
  */
 package org.apache.hadoop.hive.ql.io.arrow;
+import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ARROW_BATCH_SIZE;
 import com.google.common.base.Joiner;
 import 
com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
@@ -157,6 +158,7 @@ public class TestArrowColumnarBatchSerDe {
   @Before
   public void setUp() {
     conf = new Configuration();
+    conf.setInt(HIVE_ARROW_BATCH_SIZE.varname, 1025);
   }
   private static ByteWritable byteW(int value) {
@@ -1024,4 +1026,45 @@ public class TestArrowColumnarBatchSerDe {
     initAndSerializeAndDeserialize(schema, toList(DECIMAL_ROWS));
   }
+  @Test
+  public void testListBooleanWithMoreThan1024Values() throws SerDeException {
+    String[][] schema = {
+        {"boolean_list", "array<boolean>"},
+    };
+
+    Object[][] rows = new Object[1025][1];
+    for (int i = 0; i < 1025; i++) {
+      rows[i][0] = new BooleanWritable(true);
+    }
+
+    initAndSerializeAndDeserialize(schema, toList(rows));
+  }
+
+  @Test
+  public void testStructBooleanWithMoreThan1024Values() throws SerDeException {
+    String[][] schema = {
+        {"boolean_struct", "struct<boolean1:boolean>"},
+    };
+
+    Object[][] rows = new Object[1025][1];
+    for (int i = 0; i < 1025; i++) {
+      rows[i][0] = new BooleanWritable(true);
+    }
+
+    initAndSerializeAndDeserialize(schema, toStruct(rows));
+  }
+
+  @Test
+  public void testMapIntergerWithMoreThan1024Values() throws SerDeException {
+    String[][] schema = {
+        {"int_map", "map<string,int>"},
+    };
+
+    Object[][] rows = new Object[1025][1];
+    for (int i = 0; i < 1025; i++) {
+      rows[i][0] = intW(i);
+    }
+
+    initAndSerializeAndDeserialize(schema, toMap(rows));
+  }
 }