[CARBONDATA-2615][32K] Support page size less than 32000 in CarbondataV3 Since super-long strings are supported, a column page with 32000 rows can exceed 2GB if the strings are long enough, so a page with fewer than 32000 rows is now supported.
This closes #2383 Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/091a28bf Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/091a28bf Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/091a28bf Branch: refs/heads/carbonstore Commit: 091a28bf833a5296dd3878ddb11b243f7f37a8fc Parents: 2ea3b2d Author: xuchuanyin <xuchuan...@hust.edu.cn> Authored: Wed Jun 20 19:07:03 2018 +0800 Committer: kumarvishal09 <kumarvishal1...@gmail.com> Committed: Thu Jun 21 11:00:02 2018 +0530 ---------------------------------------------------------------------- .../testsuite/dataload/TestLoadDataGeneral.scala | 16 ++++++++++++++++ .../store/CarbonFactDataHandlerColumnar.java | 7 ++++++- 2 files changed, 22 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/carbondata/blob/091a28bf/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/dataload/TestLoadDataGeneral.scala ---------------------------------------------------------------------- diff --git a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/dataload/TestLoadDataGeneral.scala b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/dataload/TestLoadDataGeneral.scala index 688928f..8b51090 100644 --- a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/dataload/TestLoadDataGeneral.scala +++ b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/dataload/TestLoadDataGeneral.scala @@ -259,6 +259,22 @@ class TestLoadDataGeneral extends QueryTest with BeforeAndAfterEach { CarbonLoadOptionConstants.ENABLE_CARBON_LOAD_DIRECT_WRITE_HDFS, originStatus) } + + test("test data loading with page size less than 32000") { + 
CarbonProperties.getInstance().addProperty( + CarbonCommonConstants.BLOCKLET_SIZE, "16000") + + val testData = s"$resourcesPath/sample.csv" + sql(s"LOAD DATA LOCAL INPATH '$testData' into table loadtest") + checkAnswer( + sql("SELECT COUNT(*) FROM loadtest"), + Seq(Row(6)) + ) + + CarbonProperties.getInstance().addProperty(CarbonCommonConstants.BLOCKLET_SIZE, + CarbonCommonConstants.BLOCKLET_SIZE_DEFAULT_VAL) + } + override def afterEach { sql("DROP TABLE if exists loadtest") sql("drop table if exists invalidMeasures") http://git-wip-us.apache.org/repos/asf/carbondata/blob/091a28bf/processing/src/main/java/org/apache/carbondata/processing/store/CarbonFactDataHandlerColumnar.java ---------------------------------------------------------------------- diff --git a/processing/src/main/java/org/apache/carbondata/processing/store/CarbonFactDataHandlerColumnar.java b/processing/src/main/java/org/apache/carbondata/processing/store/CarbonFactDataHandlerColumnar.java index c0acadd..5fe3261 100644 --- a/processing/src/main/java/org/apache/carbondata/processing/store/CarbonFactDataHandlerColumnar.java +++ b/processing/src/main/java/org/apache/carbondata/processing/store/CarbonFactDataHandlerColumnar.java @@ -371,8 +371,13 @@ public class CarbonFactDataHandlerColumnar implements CarbonFactHandler { this.pageSize = Integer.parseInt(CarbonProperties.getInstance() .getProperty(CarbonCommonConstants.BLOCKLET_SIZE, CarbonCommonConstants.BLOCKLET_SIZE_DEFAULT_VAL)); + // support less than 32000 rows in one page, because we support super long string, + // if it is long enough, a clomun page with 32000 rows will exceed 2GB if (version == ColumnarFormatVersion.V3) { - this.pageSize = CarbonV3DataFormatConstants.NUMBER_OF_ROWS_PER_BLOCKLET_COLUMN_PAGE_DEFAULT; + this.pageSize = + pageSize < CarbonV3DataFormatConstants.NUMBER_OF_ROWS_PER_BLOCKLET_COLUMN_PAGE_DEFAULT ? 
+ pageSize : + CarbonV3DataFormatConstants.NUMBER_OF_ROWS_PER_BLOCKLET_COLUMN_PAGE_DEFAULT; } LOGGER.info("Number of rows per column blocklet " + pageSize); dataRows = new ArrayList<>(this.pageSize);