PARQUET-585: Slowly ramp up sizes of int[]s in IntList to keep sizes small when 
data sets are small

One of the follow up items from PR - 
https://github.com/apache/parquet-mr/pull/339 was to slowly ramp up the size of 
the int[] created in IntList to ensure we don't allocate 64K arrays right off 
the bat. This PR updates the code to start with a 4K array then keeps doubling 
till 64K (and stays at 64K after that).

Author: Piyush Narang <pnar...@twitter.com>

Closes #341 from piyushnarang/master and squashes the following commits:

0bc6b84 [Piyush Narang] Fix review comments - add spaces, check slab size, fix 
slab init
d1b4df1 [Piyush Narang] Make IntListTest values relative to constants in IntList
9617015 [Piyush Narang] Update IntList slab creation to keep bumping up size 
gradually
ebf1c58 [Piyush Narang] Merge branch 'master' of 
https://github.com/apache/parquet-mr
3ecc577 [Piyush Narang] Remove redundant IntList ctor
f7dfd5f [Piyush Narang] Switch int[] initialization in IntList to be lazy


Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/7885f9fb
Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/7885f9fb
Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/7885f9fb

Branch: refs/heads/parquet-1.8.x
Commit: 7885f9fb9b51633b3b9a161cc88ed8b7a3cca79d
Parents: aa249a5
Author: Piyush Narang <pnar...@twitter.com>
Authored: Wed Apr 20 20:47:49 2016 -0700
Committer: Ryan Blue <b...@apache.org>
Committed: Mon Jan 9 16:54:53 2017 -0800

----------------------------------------------------------------------
 .../column/values/dictionary/IntList.java       | 64 ++++++++++++---
 .../column/values/dictionary/IntListTest.java   | 84 ++++++++++++++++++++
 2 files changed, 136 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/7885f9fb/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/IntList.java
----------------------------------------------------------------------
diff --git 
a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/IntList.java
 
b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/IntList.java
index 8e6228a..dca1470 100644
--- 
a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/IntList.java
+++ 
b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/IntList.java
@@ -31,7 +31,20 @@ import java.util.List;
  */
 public class IntList {
 
-  private static final int SLAB_SIZE = 64 * 1024;
+  static final int MAX_SLAB_SIZE = 64 * 1024;
+  static final int INITIAL_SLAB_SIZE = 4 * 1024;
+
+  // Double slab size till we reach the max slab size. At that point we just 
add slabs of size
+  // MAX_SLAB_SIZE. This ensures we don't allocate very large slabs from the 
start if we don't have
+  // too much data.
+  private int currentSlabSize = INITIAL_SLAB_SIZE;
+
+  /**
+   * Visible for testing to verify the current slab size
+   */
+  int getCurrentSlabSize() {
+    return currentSlabSize;
+  }
 
   /**
    * to iterate on the content of the list
@@ -43,14 +56,17 @@ public class IntList {
   public static class IntIterator {
 
     private final int[][] slabs;
-    private int current;
     private final int count;
 
+    private int current;
+    private int currentRow;
+    private int currentCol;
+
     /**
      * slabs will be iterated in order up to the provided count
      * as the last slab may not be full
      * @param slabs contain the ints
-     * @param count total count of ints
+     * @param count count of ints
      */
     public IntIterator(int[][] slabs, int count) {
       this.slabs = slabs;
@@ -68,11 +84,19 @@ public class IntList {
      * @return the next int
      */
     public int next() {
-      final int result = slabs[current / SLAB_SIZE][current % SLAB_SIZE];
-      ++ current;
+      final int result = slabs[currentRow][currentCol];
+      incrementPosition();
       return result;
     }
 
+    private void incrementPosition() {
+      current++;
+      currentCol++;
+      if (currentCol >= slabs[currentRow].length) {
+        currentCol = 0;
+        currentRow++;
+      }
+    }
   }
 
   private List<int[]> slabs = new ArrayList<int[]>();
@@ -82,20 +106,31 @@ public class IntList {
   private int[] currentSlab;
   private int currentSlabPos;
 
-  private void initSlab() {
-    currentSlab = new int[SLAB_SIZE];
+  private void allocateSlab() {
+    currentSlab = new int[currentSlabSize];
     currentSlabPos = 0;
   }
 
+  // Double slab size up to the MAX_SLAB_SIZE limit
+  private void updateCurrentSlabSize() {
+    if (currentSlabSize < MAX_SLAB_SIZE) {
+      currentSlabSize *= 2;
+      if (currentSlabSize > MAX_SLAB_SIZE) {
+        currentSlabSize = MAX_SLAB_SIZE;
+      }
+    }
+  }
+
   /**
    * @param i value to append to the end of the list
    */
   public void add(int i) {
     if (currentSlab == null) {
-      initSlab();
+      allocateSlab();
     } else if (currentSlabPos == currentSlab.length) {
       slabs.add(currentSlab);
-      initSlab();
+      updateCurrentSlabSize();
+      allocateSlab();
     }
 
     currentSlab[currentSlabPos] = i;
@@ -108,19 +143,24 @@ public class IntList {
    */
   public IntIterator iterator() {
     if (currentSlab == null) {
-      initSlab();
+      allocateSlab();
     }
 
     int[][] itSlabs = slabs.toArray(new int[slabs.size() + 1][]);
     itSlabs[slabs.size()] = currentSlab;
-    return new IntIterator(itSlabs, SLAB_SIZE * slabs.size() + currentSlabPos);
+    return new IntIterator(itSlabs, size());
   }
 
   /**
    * @return the current size of the list
    */
   public int size() {
-    return SLAB_SIZE * slabs.size() + currentSlabPos;
+    int size = currentSlabPos;
+    for (int [] slab : slabs) {
+      size += slab.length;
+    }
+
+    return size;
   }
 
 }

http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/7885f9fb/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/IntListTest.java
----------------------------------------------------------------------
diff --git 
a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/IntListTest.java
 
b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/IntListTest.java
new file mode 100644
index 0000000..e6df4db
--- /dev/null
+++ 
b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/IntListTest.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.column.values.dictionary;
+
+import org.junit.Test;
+
+import junit.framework.Assert;
+
+public class IntListTest {
+
+  /**
+   * Test IntList of fairly small size (< INITIAL_SLAB_SIZE), this tests a 
single slab being created
+   */
+  @Test
+  public void testSmallList() {
+    int testSize = IntList.INITIAL_SLAB_SIZE - 100;
+    doTestIntList(testSize, IntList.INITIAL_SLAB_SIZE);
+  }
+
+  /**
+   * Test IntList > INITIAL_SLAB_SIZE so that we have multiple slabs being 
created
+   */
+  @Test
+  public void testListGreaterThanInitialSlabSize() {
+    int testSize = IntList.INITIAL_SLAB_SIZE + 100;
+    doTestIntList(testSize, IntList.INITIAL_SLAB_SIZE * 2);
+  }
+
+  /**
+   * Test IntList of a fairly large size (> MAX_SLAB_SIZE) so that we have 
multiple slabs
+   * created of varying sizes
+   */
+  @Test
+  public void testListGreaterThanMaxSlabSize() {
+    int testSize = IntList.MAX_SLAB_SIZE * 4 + 100;
+    doTestIntList(testSize, IntList.MAX_SLAB_SIZE);
+  }
+
+  private void doTestIntList(int testSize, int expectedSlabSize) {
+    IntList testList = new IntList();
+    populateList(testList, testSize);
+
+    verifyIteratorResults(testSize, testList);
+
+    // confirm the size of the current slab
+    Assert.assertEquals(expectedSlabSize, testList.getCurrentSlabSize());
+  }
+
+  private void populateList(IntList testList, int size) {
+    for(int i = 0; i < size; i++) {
+      testList.add(i);
+    }
+  }
+
+  private void verifyIteratorResults(int testSize, IntList testList) {
+    IntList.IntIterator iterator = testList.iterator();
+    int expected = 0;
+
+    while (iterator.hasNext()) {
+      int val = iterator.next();
+      Assert.assertEquals(expected, val);
+      expected++;
+    }
+
+    // ensure we have the correct final value of expected
+    Assert.assertEquals(testSize, expected);
+  }
+}

Reply via email to