This is an automated email from the ASF dual-hosted git repository.

siddteotia pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 8986521  ARROW-1943: [JAVA] handle setInitialCapacity for deeply 
nested lists
8986521 is described below

commit 8986521255f48a2aa775921eac0175b4e7afaa16
Author: siddharth <[email protected]>
AuthorDate: Fri Dec 22 11:24:43 2017 -0800

    ARROW-1943: [JAVA] handle setInitialCapacity for deeply nested lists
    
    The current implementation of setInitialCapacity() uses a factor of 5 for 
every level we go into list:
    
    So if the schema is LIST (LIST (LIST (LIST (LIST (LIST (LIST (BIGINT)))))) 
and we start with an initial capacity of 128, we end up throwing 
OversizedAllocationException from the BigIntVector because at every level we 
increased the capacity by 5 and by the time we reached inner scalar that 
actually stores the data, we were well over max size limit per vector (1MB).
    
    We saw this problem downstream when we failed to read deeply nested JSON 
data.
    
    The potential fix is to use the factor of 5 only when we are down to the 
leaf vector. As the depth increases and we are still working with complex/list, 
we don't use the factor of 5.
    
    cc @jacques-n , @BryanCutler , @icexelloss
    
    Author: siddharth <[email protected]>
    
    Closes #1439 from siddharthteotia/ARROW-1943 and squashes the following 
commits:
    
    d0adbade [siddharth] unit tests
    e2f21a8b [siddharth] fix imports
    d103436b [siddharth] ARROW-1943: handle setInitialCapacity for deeply 
nested lists
---
 .../vector/complex/BaseRepeatedValueVector.java    |   8 +-
 .../org/apache/arrow/vector/TestListVector.java    | 128 +++++++++++++++++++++
 2 files changed, 135 insertions(+), 1 deletion(-)

diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
index a9221f2..9a23fd8 100644
--- 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
+++ 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java
@@ -29,6 +29,8 @@ import org.apache.arrow.vector.FieldVector;
 import org.apache.arrow.vector.UInt4Vector;
 import org.apache.arrow.vector.ValueVector;
 import org.apache.arrow.vector.ZeroVector;
+import org.apache.arrow.vector.BaseVariableWidthVector;
+import org.apache.arrow.vector.BaseFixedWidthVector;
 import org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeID;
 import org.apache.arrow.vector.types.pojo.FieldType;
 import org.apache.arrow.vector.util.CallBack;
@@ -134,7 +136,11 @@ public abstract class BaseRepeatedValueVector extends 
BaseValueVector implements
   @Override
   public void setInitialCapacity(int numRecords) {
     offsetAllocationSizeInBytes = (numRecords + 1) * OFFSET_WIDTH;
-    vector.setInitialCapacity(numRecords * 
RepeatedValueVector.DEFAULT_REPEAT_PER_RECORD);
+    if (vector instanceof BaseFixedWidthVector || vector instanceof 
BaseVariableWidthVector) {
+      vector.setInitialCapacity(numRecords * 
RepeatedValueVector.DEFAULT_REPEAT_PER_RECORD);
+    } else {
+     vector.setInitialCapacity(numRecords);
+    }
   }
 
   @Override
diff --git 
a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java 
b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
index 1acce7e..e2023f4 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
@@ -560,6 +560,134 @@ public class TestListVector {
   }
 
   @Test
+  public void testNestedListVector1() throws Exception {
+    try (ListVector listVector = ListVector.empty("sourceVector", allocator)) {
+
+      MinorType listType = MinorType.LIST;
+      MinorType scalarType = MinorType.BIGINT;
+
+      listVector.addOrGetVector(FieldType.nullable(listType.getType()));
+
+      ListVector innerList1 = (ListVector)listVector.getDataVector();
+      innerList1.addOrGetVector(FieldType.nullable(listType.getType()));
+
+      ListVector innerList2 = (ListVector)innerList1.getDataVector();
+      innerList2.addOrGetVector(FieldType.nullable(listType.getType()));
+
+      ListVector innerList3 = (ListVector)innerList2.getDataVector();
+      innerList3.addOrGetVector(FieldType.nullable(listType.getType()));
+
+      ListVector innerList4 = (ListVector)innerList3.getDataVector();
+      innerList4.addOrGetVector(FieldType.nullable(listType.getType()));
+
+      ListVector innerList5 = (ListVector)innerList4.getDataVector();
+      innerList5.addOrGetVector(FieldType.nullable(listType.getType()));
+
+      ListVector innerList6 = (ListVector)innerList5.getDataVector();
+      innerList6.addOrGetVector(FieldType.nullable(scalarType.getType()));
+
+      listVector.setInitialCapacity(128);
+    }
+  }
+
+  @Test
+  public void testNestedListVector2() throws Exception {
+    try (ListVector listVector = ListVector.empty("sourceVector", allocator)) {
+      listVector.setInitialCapacity(1);
+      UnionListWriter listWriter = listVector.getWriter();
+      /* allocate memory */
+      listWriter.allocate();
+
+      /* write one or more inner lists at index 0 */
+      listWriter.setPosition(0);
+      listWriter.startList();
+
+      listWriter.list().startList();
+      listWriter.list().bigInt().writeBigInt(50);
+      listWriter.list().bigInt().writeBigInt(100);
+      listWriter.list().bigInt().writeBigInt(200);
+      listWriter.list().endList();
+
+      listWriter.list().startList();
+      listWriter.list().bigInt().writeBigInt(75);
+      listWriter.list().bigInt().writeBigInt(125);
+      listWriter.list().endList();
+
+      listWriter.endList();
+
+      /* write one or more inner lists at index 1 */
+      listWriter.setPosition(1);
+      listWriter.startList();
+
+      listWriter.list().startList();
+      listWriter.list().bigInt().writeBigInt(15);
+      listWriter.list().bigInt().writeBigInt(20);
+      listWriter.list().endList();
+
+      listWriter.list().startList();
+      listWriter.list().bigInt().writeBigInt(25);
+      listWriter.list().bigInt().writeBigInt(30);
+      listWriter.list().bigInt().writeBigInt(35);
+      listWriter.list().endList();
+
+      listWriter.endList();
+
+      assertEquals(2, listVector.getLastSet());
+
+      listVector.setValueCount(2);
+
+      assertEquals(2, listVector.getValueCount());
+
+      /* get listVector value at index 0 -- the value itself is a listvector */
+      Object result = listVector.getObject(0);
+      ArrayList<ArrayList<Long>> resultSet = (ArrayList<ArrayList<Long>>) 
result;
+      ArrayList<Long> list;
+
+      assertEquals(2, resultSet.size());              /* 2 inner lists at 
index 0 */
+      assertEquals(3, resultSet.get(0).size());       /* size of first inner 
list */
+      assertEquals(2, resultSet.get(1).size());      /* size of second inner 
list */
+
+      list = resultSet.get(0);
+      assertEquals(new Long(50), list.get(0));
+      assertEquals(new Long(100), list.get(1));
+      assertEquals(new Long(200), list.get(2));
+
+      list = resultSet.get(1);
+      assertEquals(new Long(75), list.get(0));
+      assertEquals(new Long(125), list.get(1));
+
+       /* get listVector value at index 1 -- the value itself is a listvector 
*/
+      result = listVector.getObject(1);
+      resultSet = (ArrayList<ArrayList<Long>>) result;
+
+      assertEquals(2, resultSet.size());              /* 3 inner lists at 
index 1 */
+      assertEquals(2, resultSet.get(0).size());       /* size of first inner 
list */
+      assertEquals(3, resultSet.get(1).size());      /* size of second inner 
list */
+
+      list = resultSet.get(0);
+      assertEquals(new Long(15), list.get(0));
+      assertEquals(new Long(20), list.get(1));
+
+      list = resultSet.get(1);
+      assertEquals(new Long(25), list.get(0));
+      assertEquals(new Long(30), list.get(1));
+      assertEquals(new Long(35), list.get(2));
+
+      /* check underlying bitVector */
+      assertFalse(listVector.isNull(0));
+      assertFalse(listVector.isNull(1));
+
+      /* check underlying offsets */
+      final ArrowBuf offsetBuffer = listVector.getOffsetBuffer();
+
+      /* listVector has 2 lists at index 0 and 3 lists at index 1 */
+      assertEquals(0, offsetBuffer.getInt(0 * ListVector.OFFSET_WIDTH));
+      assertEquals(2, offsetBuffer.getInt(1 * ListVector.OFFSET_WIDTH));
+      assertEquals(4, offsetBuffer.getInt(2 * ListVector.OFFSET_WIDTH));
+    }
+  }
+
+  @Test
   public void testGetBufferAddress() throws Exception {
     try (ListVector listVector = ListVector.empty("vector", allocator)) {
 

-- 
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].

Reply via email to