PARQUET-585: Slowly ramp up sizes of int[]s in IntList to keep sizes small when data sets are small
One of the follow up items from PR - https://github.com/apache/parquet-mr/pull/339 was to slowly ramp up the size of the int[] created in IntList to ensure we don't allocate 64K arrays right off the bat. This PR updates the code to start with a 4K array then keeps doubling till 64K (and stays at 64K after that). Author: Piyush Narang <pnar...@twitter.com> Closes #341 from piyushnarang/master and squashes the following commits: 0bc6b84 [Piyush Narang] Fix review comments - add spaces, check slab size, fix slab init d1b4df1 [Piyush Narang] Make IntListTest values relative to constants in IntList 9617015 [Piyush Narang] Update IntList slab creation to keep bumping up size gradually ebf1c58 [Piyush Narang] Merge branch 'master' of https://github.com/apache/parquet-mr 3ecc577 [Piyush Narang] Remove redundant IntList ctor f7dfd5f [Piyush Narang] Switch int[] initialization in IntList to be lazy Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/7885f9fb Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/7885f9fb Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/7885f9fb Branch: refs/heads/parquet-1.8.x Commit: 7885f9fb9b51633b3b9a161cc88ed8b7a3cca79d Parents: aa249a5 Author: Piyush Narang <pnar...@twitter.com> Authored: Wed Apr 20 20:47:49 2016 -0700 Committer: Ryan Blue <b...@apache.org> Committed: Mon Jan 9 16:54:53 2017 -0800 ---------------------------------------------------------------------- .../column/values/dictionary/IntList.java | 64 ++++++++++++--- .../column/values/dictionary/IntListTest.java | 84 ++++++++++++++++++++ 2 files changed, 136 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/7885f9fb/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/IntList.java ---------------------------------------------------------------------- diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/IntList.java b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/IntList.java index 8e6228a..dca1470 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/IntList.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/IntList.java @@ -31,7 +31,20 @@ import java.util.List; */ public class IntList { - private static final int SLAB_SIZE = 64 * 1024; + static final int MAX_SLAB_SIZE = 64 * 1024; + static final int INITIAL_SLAB_SIZE = 4 * 1024; + + // Double slab size till we reach the max slab size. At that point we just add slabs of size + // MAX_SLAB_SIZE. This ensures we don't allocate very large slabs from the start if we don't have + // too much data. + private int currentSlabSize = INITIAL_SLAB_SIZE; + + /** + * Visible for testing to verify the current slab size + */ + int getCurrentSlabSize() { + return currentSlabSize; + } /** * to iterate on the content of the list @@ -43,14 +56,17 @@ public class IntList { public static class IntIterator { private final int[][] slabs; - private int current; private final int count; + private int current; + private int currentRow; + private int currentCol; + /** * slabs will be iterated in order up to the provided count * as the last slab may not be full * @param slabs contain the ints - * @param count total count of ints + * @param count count of ints */ public IntIterator(int[][] slabs, int count) { this.slabs = slabs; @@ -68,11 +84,19 @@ public class IntList { * @return the next int */ public int next() { - final int result = slabs[current / SLAB_SIZE][current % SLAB_SIZE]; - ++ current; + final int result = slabs[currentRow][currentCol]; + incrementPosition(); return result; } + private void incrementPosition() { + current++; + currentCol++; + if (currentCol >= slabs[currentRow].length) { + currentCol = 0; + currentRow++; + } + } } private List<int[]> slabs = new ArrayList<int[]>(); @@ -82,20 +106,31 @@ public class IntList { private int[] currentSlab; private int currentSlabPos; - private void initSlab() { - currentSlab = new int[SLAB_SIZE]; + private void allocateSlab() { + currentSlab = new int[currentSlabSize]; currentSlabPos = 0; } + // Double slab size up to the MAX_SLAB_SIZE limit + private void updateCurrentSlabSize() { + if (currentSlabSize < MAX_SLAB_SIZE) { + currentSlabSize *= 2; + if (currentSlabSize > MAX_SLAB_SIZE) { + currentSlabSize = MAX_SLAB_SIZE; + } + } + } + /** * @param i value to append to the end of the list */ public void add(int i) { if (currentSlab == null) { - initSlab(); + allocateSlab(); } else if (currentSlabPos == currentSlab.length) { slabs.add(currentSlab); - initSlab(); + updateCurrentSlabSize(); + allocateSlab(); } currentSlab[currentSlabPos] = i; @@ -108,19 +143,24 @@ public class IntList { */ public IntIterator iterator() { if (currentSlab == null) { - initSlab(); + allocateSlab(); } int[][] itSlabs = slabs.toArray(new int[slabs.size() + 1][]); itSlabs[slabs.size()] = currentSlab; - return new IntIterator(itSlabs, SLAB_SIZE * slabs.size() + currentSlabPos); + return new IntIterator(itSlabs, size()); } /** * @return the current size of the list */ public int size() { - return SLAB_SIZE * slabs.size() + currentSlabPos; + int size = currentSlabPos; + for (int [] slab : slabs) { + size += slab.length; + } + + return size; } } http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/7885f9fb/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/IntListTest.java ---------------------------------------------------------------------- diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/IntListTest.java b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/IntListTest.java new file mode 100644 index 0000000..e6df4db --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/IntListTest.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.values.dictionary; + +import org.junit.Test; + +import junit.framework.Assert; + +public class IntListTest { + + /** + * Test IntList of fairly small size (< INITIAL_SLAB_SIZE), this tests a single slab being created + */ + @Test + public void testSmallList() { + int testSize = IntList.INITIAL_SLAB_SIZE - 100; + doTestIntList(testSize, IntList.INITIAL_SLAB_SIZE); + } + + /** + * Test IntList > INITIAL_SLAB_SIZE so that we have multiple slabs being created + */ + @Test + public void testListGreaterThanInitialSlabSize() { + int testSize = IntList.INITIAL_SLAB_SIZE + 100; + doTestIntList(testSize, IntList.INITIAL_SLAB_SIZE * 2); + } + + /** + * Test IntList of a fairly large size (> MAX_SLAB_SIZE) so that we have multiple slabs + * created of varying sizes + */ + @Test + public void testListGreaterThanMaxSlabSize() { + int testSize = IntList.MAX_SLAB_SIZE * 4 + 100; + doTestIntList(testSize, IntList.MAX_SLAB_SIZE); + } + + private void doTestIntList(int testSize, int expectedSlabSize) { + IntList testList = new IntList(); + populateList(testList, testSize); + + verifyIteratorResults(testSize, testList); + + // confirm the size of the current slab + Assert.assertEquals(expectedSlabSize, testList.getCurrentSlabSize()); + } + + private void populateList(IntList testList, int size) { + for(int i = 0; i < size; i++) { + testList.add(i); + } + } + + private void verifyIteratorResults(int testSize, IntList testList) { + IntList.IntIterator iterator = testList.iterator(); + int expected = 0; + + while (iterator.hasNext()) { + int val = iterator.next(); + Assert.assertEquals(expected, val); + expected++; + } + + // ensure we have the correct final value of expected + Assert.assertEquals(testSize, expected); + } +}