This is an automated email from the ASF dual-hosted git repository.
siddteotia pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 8986521 ARROW-1943: [JAVA] handle setInitialCapacity for deeply
nested lists
8986521 is described below
commit 8986521255f48a2aa775921eac0175b4e7afaa16
Author: siddharth <[email protected]>
AuthorDate: Fri Dec 22 11:24:43 2017 -0800
ARROW-1943: [JAVA] handle setInitialCapacity for deeply nested lists
The current implementation of setInitialCapacity() uses a factor of 5 for
every level we go into list:
So if the schema is LIST (LIST (LIST (LIST (LIST (LIST (LIST (BIGINT))))))
and we start with an initial capacity of 128, we end up throwing
OversizedAllocationException from the BigIntVector because at every level we
increased the capacity by 5 and by the time we reached inner scalar that
actually stores the data, we were well over max size limit per vector (1MB).
We saw this problem downstream when we failed to read deeply nested JSON
data.
The potential fix is to use the factor of 5 only when we are down to the
leaf vector. As the depth increases and we are still working with complex/list,
we don't use the factor of 5.
cc @jacques-n , @BryanCutler , @icexelloss
Author: siddharth <[email protected]>
Closes #1439 from siddharthteotia/ARROW-1943 and squashes the following
commits:
d0adbade [siddharth] unit tests
e2f21a8b [siddharth] fix imports
d103436b [siddharth] ARROW-1943: handle setInitialCapacity for deeply
nested lists
---
.../vector/complex/BaseRepeatedValueVector.java | 8 +-
.../org/apache/arrow/vector/TestListVector.java | 128 +++++++++++++++++++++
2 files changed, 135 insertions(+), 1 deletion(-)
diff --git
a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
index a9221f2..9a23fd8 100644
---
a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
+++
b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
@@ -29,6 +29,8 @@ import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.UInt4Vector;
import org.apache.arrow.vector.ValueVector;
import org.apache.arrow.vector.ZeroVector;
+import org.apache.arrow.vector.BaseVariableWidthVector;
+import org.apache.arrow.vector.BaseFixedWidthVector;
import org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeID;
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.util.CallBack;
@@ -134,7 +136,11 @@ public abstract class BaseRepeatedValueVector extends
BaseValueVector implements
@Override
public void setInitialCapacity(int numRecords) {
offsetAllocationSizeInBytes = (numRecords + 1) * OFFSET_WIDTH;
- vector.setInitialCapacity(numRecords *
RepeatedValueVector.DEFAULT_REPEAT_PER_RECORD);
+ if (vector instanceof BaseFixedWidthVector || vector instanceof
BaseVariableWidthVector) {
+ vector.setInitialCapacity(numRecords *
RepeatedValueVector.DEFAULT_REPEAT_PER_RECORD);
+ } else {
+ vector.setInitialCapacity(numRecords);
+ }
}
@Override
diff --git
a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
index 1acce7e..e2023f4 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
@@ -560,6 +560,134 @@ public class TestListVector {
}
@Test
+ public void testNestedListVector1() throws Exception {
+ try (ListVector listVector = ListVector.empty("sourceVector", allocator)) {
+
+ MinorType listType = MinorType.LIST;
+ MinorType scalarType = MinorType.BIGINT;
+
+ listVector.addOrGetVector(FieldType.nullable(listType.getType()));
+
+ ListVector innerList1 = (ListVector)listVector.getDataVector();
+ innerList1.addOrGetVector(FieldType.nullable(listType.getType()));
+
+ ListVector innerList2 = (ListVector)innerList1.getDataVector();
+ innerList2.addOrGetVector(FieldType.nullable(listType.getType()));
+
+ ListVector innerList3 = (ListVector)innerList2.getDataVector();
+ innerList3.addOrGetVector(FieldType.nullable(listType.getType()));
+
+ ListVector innerList4 = (ListVector)innerList3.getDataVector();
+ innerList4.addOrGetVector(FieldType.nullable(listType.getType()));
+
+ ListVector innerList5 = (ListVector)innerList4.getDataVector();
+ innerList5.addOrGetVector(FieldType.nullable(listType.getType()));
+
+ ListVector innerList6 = (ListVector)innerList5.getDataVector();
+ innerList6.addOrGetVector(FieldType.nullable(scalarType.getType()));
+
+ listVector.setInitialCapacity(128);
+ }
+ }
+
+ @Test
+ public void testNestedListVector2() throws Exception {
+ try (ListVector listVector = ListVector.empty("sourceVector", allocator)) {
+ listVector.setInitialCapacity(1);
+ UnionListWriter listWriter = listVector.getWriter();
+ /* allocate memory */
+ listWriter.allocate();
+
+ /* write one or more inner lists at index 0 */
+ listWriter.setPosition(0);
+ listWriter.startList();
+
+ listWriter.list().startList();
+ listWriter.list().bigInt().writeBigInt(50);
+ listWriter.list().bigInt().writeBigInt(100);
+ listWriter.list().bigInt().writeBigInt(200);
+ listWriter.list().endList();
+
+ listWriter.list().startList();
+ listWriter.list().bigInt().writeBigInt(75);
+ listWriter.list().bigInt().writeBigInt(125);
+ listWriter.list().endList();
+
+ listWriter.endList();
+
+ /* write one or more inner lists at index 1 */
+ listWriter.setPosition(1);
+ listWriter.startList();
+
+ listWriter.list().startList();
+ listWriter.list().bigInt().writeBigInt(15);
+ listWriter.list().bigInt().writeBigInt(20);
+ listWriter.list().endList();
+
+ listWriter.list().startList();
+ listWriter.list().bigInt().writeBigInt(25);
+ listWriter.list().bigInt().writeBigInt(30);
+ listWriter.list().bigInt().writeBigInt(35);
+ listWriter.list().endList();
+
+ listWriter.endList();
+
+ assertEquals(2, listVector.getLastSet());
+
+ listVector.setValueCount(2);
+
+ assertEquals(2, listVector.getValueCount());
+
+ /* get listVector value at index 0 -- the value itself is a listvector */
+ Object result = listVector.getObject(0);
+ ArrayList<ArrayList<Long>> resultSet = (ArrayList<ArrayList<Long>>)
result;
+ ArrayList<Long> list;
+
+ assertEquals(2, resultSet.size()); /* 2 inner lists at
index 0 */
+ assertEquals(3, resultSet.get(0).size()); /* size of first inner
list */
+ assertEquals(2, resultSet.get(1).size()); /* size of second inner
list */
+
+ list = resultSet.get(0);
+ assertEquals(new Long(50), list.get(0));
+ assertEquals(new Long(100), list.get(1));
+ assertEquals(new Long(200), list.get(2));
+
+ list = resultSet.get(1);
+ assertEquals(new Long(75), list.get(0));
+ assertEquals(new Long(125), list.get(1));
+
+ /* get listVector value at index 1 -- the value itself is a listvector
*/
+ result = listVector.getObject(1);
+ resultSet = (ArrayList<ArrayList<Long>>) result;
+
+ assertEquals(2, resultSet.size()); /* 3 inner lists at
index 1 */
+ assertEquals(2, resultSet.get(0).size()); /* size of first inner
list */
+ assertEquals(3, resultSet.get(1).size()); /* size of second inner
list */
+
+ list = resultSet.get(0);
+ assertEquals(new Long(15), list.get(0));
+ assertEquals(new Long(20), list.get(1));
+
+ list = resultSet.get(1);
+ assertEquals(new Long(25), list.get(0));
+ assertEquals(new Long(30), list.get(1));
+ assertEquals(new Long(35), list.get(2));
+
+ /* check underlying bitVector */
+ assertFalse(listVector.isNull(0));
+ assertFalse(listVector.isNull(1));
+
+ /* check underlying offsets */
+ final ArrowBuf offsetBuffer = listVector.getOffsetBuffer();
+
+ /* listVector has 2 lists at index 0 and 3 lists at index 1 */
+ assertEquals(0, offsetBuffer.getInt(0 * ListVector.OFFSET_WIDTH));
+ assertEquals(2, offsetBuffer.getInt(1 * ListVector.OFFSET_WIDTH));
+ assertEquals(4, offsetBuffer.getInt(2 * ListVector.OFFSET_WIDTH));
+ }
+ }
+
+ @Test
public void testGetBufferAddress() throws Exception {
try (ListVector listVector = ListVector.empty("vector", allocator)) {
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].