[CARBONDATA-2798] Fix Dictionary_Include for ComplexDataType Problem1: Select Filter query throws BufferUnderflowException because cardinality is filled for Non-Dictionary columns as well. Solution: Check whether a complex column has Encoding => Dictionary and fill cardinality only for such a column.
Problem2: Transactional Table is throwing NullPointerException if csv fileheader is not proper. Solution: Throw CarbonDataLoadingException if csv fileheader is not proper. This closes #2578 Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/2846eddb Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/2846eddb Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/2846eddb Branch: refs/heads/branch-1.4 Commit: 2846eddb9156276e8bf97d225fd5597d26c0cafb Parents: d4acf03 Author: Indhumathi27 <indhumathi...@gmail.com> Authored: Mon Jul 30 14:18:44 2018 +0530 Committer: ravipesala <ravi.pes...@gmail.com> Committed: Thu Aug 9 23:38:51 2018 +0530 ---------------------------------------------------------------------- .../src/test/resources/nontransactional1.csv | 2 ++ .../complexType/TestComplexDataType.scala | 7 +++++ .../TestNonTransactionalCarbonTable.scala | 30 +++++++++++++++++++ .../processing/datatypes/ArrayDataType.java | 24 +++++++++++++-- .../processing/datatypes/StructDataType.java | 31 +++++++++++++++----- .../converter/impl/FieldEncoderFactory.java | 6 ++-- .../processing/loading/model/LoadOption.java | 4 ++- 7 files changed, 91 insertions(+), 13 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/carbondata/blob/2846eddb/integration/spark-common-test/src/test/resources/nontransactional1.csv ---------------------------------------------------------------------- diff --git a/integration/spark-common-test/src/test/resources/nontransactional1.csv b/integration/spark-common-test/src/test/resources/nontransactional1.csv new file mode 100644 index 0000000..ac9ec54 --- /dev/null +++ b/integration/spark-common-test/src/test/resources/nontransactional1.csv @@ -0,0 +1,2 @@ +arvind, 33, 6.2 +bill, 35, 7.3 \ No newline at end of file 
http://git-wip-us.apache.org/repos/asf/carbondata/blob/2846eddb/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/complexType/TestComplexDataType.scala ---------------------------------------------------------------------- diff --git a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/complexType/TestComplexDataType.scala b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/complexType/TestComplexDataType.scala index 1451f7b..1ad7889 100644 --- a/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/complexType/TestComplexDataType.scala +++ b/integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/complexType/TestComplexDataType.scala @@ -971,6 +971,13 @@ class TestComplexDataType extends QueryTest with BeforeAndAfterAll { "('dictionary_include'='b')") sql("insert into test values(1,2) ") checkAnswer(sql("select b[0] from test"),Seq(Row(2))) + sql("DROP TABLE IF EXISTS test") + sql( + "create table test(intval array<array<int>>,str array<array<string>>, bool " + + "array<array<boolean>>, sint array<array<short>>, big array<array<bigint>>) stored by " + + "'carbondata' tblproperties('dictionary_include'='bool,sint,big')") + sql("insert into test values(1,'ab',true,22,33)") + checkExistence(sql("select * from test"), true, "33") } test("date with struct and array") { http://git-wip-us.apache.org/repos/asf/carbondata/blob/2846eddb/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/createTable/TestNonTransactionalCarbonTable.scala ---------------------------------------------------------------------- diff --git a/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/createTable/TestNonTransactionalCarbonTable.scala 
b/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/createTable/TestNonTransactionalCarbonTable.scala index 8a1d465..b92d41d 100644 --- a/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/createTable/TestNonTransactionalCarbonTable.scala +++ b/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/createTable/TestNonTransactionalCarbonTable.scala @@ -52,6 +52,7 @@ import org.apache.carbondata.core.datastore.page.encoding.DefaultEncodingFactory import org.apache.carbondata.core.metadata.ColumnarFormatVersion import org.apache.carbondata.core.metadata.datatype.DataTypes import org.apache.carbondata.core.util.{CarbonProperties, CarbonUtil, DataFileFooterConverterV3} +import org.apache.carbondata.processing.loading.exception.CarbonDataLoadingException import org.apache.carbondata.sdk.file._ @@ -350,6 +351,35 @@ class TestNonTransactionalCarbonTable extends QueryTest with BeforeAndAfterAll { cleanTestData() } + test(" test csv fileheader for transactional table") { + FileUtils.deleteDirectory(new File(writerPath)) + buildTestDataWithSameUUID(3, false, null, List("name")) + assert(new File(writerPath).exists()) + + sql("DROP TABLE IF EXISTS sdkOutputTable") + + sql( + s"""CREATE EXTERNAL TABLE sdkOutputTable STORED BY 'carbondata' LOCATION + |'$writerPath' """.stripMargin) + + checkAnswer(sql("SELECT name,name FROM sdkOutputTable"), Seq( + Row("robot0", "robot0"), + Row("robot1", "robot1"), + Row("robot2", "robot2"))) + //load csvfile without fileheader + var exception = intercept[CarbonDataLoadingException] { + sql(s"""load data inpath '$resourcesPath/nontransactional1.csv' into table sdkOutputTable""").show(200,false) + } + assert(exception.getMessage() + .contains("CSV header in input file is not proper. 
Column names in schema and csv header are not the same.")) + + sql("DROP TABLE sdkOutputTable") + // drop table should not delete the files + assert(new File(writerPath).exists()) + cleanTestData() + } + + test("test count star with multiple loads files with same schema and UUID") { FileUtils.deleteDirectory(new File(writerPath)) buildTestDataWithSameUUID(3, false, null, List("name")) http://git-wip-us.apache.org/repos/asf/carbondata/blob/2846eddb/processing/src/main/java/org/apache/carbondata/processing/datatypes/ArrayDataType.java ---------------------------------------------------------------------- diff --git a/processing/src/main/java/org/apache/carbondata/processing/datatypes/ArrayDataType.java b/processing/src/main/java/org/apache/carbondata/processing/datatypes/ArrayDataType.java index 60972e8..0a1eba8 100644 --- a/processing/src/main/java/org/apache/carbondata/processing/datatypes/ArrayDataType.java +++ b/processing/src/main/java/org/apache/carbondata/processing/datatypes/ArrayDataType.java @@ -63,6 +63,11 @@ public class ArrayDataType implements GenericDataType<ArrayObject> { private int outputArrayIndex; /** + * Dictionary column + */ + private boolean isDictionaryColumn; + + /** * current data counter */ private int dataCounter; @@ -88,6 +93,21 @@ public class ArrayDataType implements GenericDataType<ArrayObject> { this.columnId = columnId; } + /** + * constructor + * @param name + * @param parentname + * @param columnId + * @param isDictionaryColumn + */ + public ArrayDataType(String name, String parentname, String columnId, + Boolean isDictionaryColumn) { + this.name = name; + this.parentname = parentname; + this.columnId = columnId; + this.isDictionaryColumn = isDictionaryColumn; + } + /* * to add child dimensions */ @@ -153,7 +173,7 @@ public class ArrayDataType implements GenericDataType<ArrayObject> { } @Override public boolean getIsColumnDictionary() { - return true; + return isDictionaryColumn; } @Override public void writeByteArray(ArrayObject 
input, DataOutputStream dataOutputStream, @@ -172,7 +192,7 @@ public class ArrayDataType implements GenericDataType<ArrayObject> { @Override public void fillCardinality(List<Integer> dimCardWithComplex) { - if (children.getIsColumnDictionary()) { + if (this.getIsColumnDictionary()) { dimCardWithComplex.add(0); children.fillCardinality(dimCardWithComplex); } http://git-wip-us.apache.org/repos/asf/carbondata/blob/2846eddb/processing/src/main/java/org/apache/carbondata/processing/datatypes/StructDataType.java ---------------------------------------------------------------------- diff --git a/processing/src/main/java/org/apache/carbondata/processing/datatypes/StructDataType.java b/processing/src/main/java/org/apache/carbondata/processing/datatypes/StructDataType.java index af95de6..31f2234 100644 --- a/processing/src/main/java/org/apache/carbondata/processing/datatypes/StructDataType.java +++ b/processing/src/main/java/org/apache/carbondata/processing/datatypes/StructDataType.java @@ -57,6 +57,12 @@ public class StructDataType implements GenericDataType<StructObject> { * output array index */ private int outputArrayIndex; + + /** + * Dictionary column + */ + private boolean isDictionaryColumn; + /** * data counter */ @@ -82,6 +88,21 @@ public class StructDataType implements GenericDataType<StructObject> { this.columnId = columnId; } + /** + * constructor + * @param name + * @param parentname + * @param columnId + * @param isDictionaryColumn + */ + public StructDataType(String name, String parentname, String columnId, + Boolean isDictionaryColumn) { + this.name = name; + this.parentname = parentname; + this.columnId = columnId; + this.isDictionaryColumn = isDictionaryColumn; + } + /* * add child dimensions */ @@ -153,7 +174,7 @@ public class StructDataType implements GenericDataType<StructObject> { } @Override public boolean getIsColumnDictionary() { - return true; + return isDictionaryColumn; } @Override public void writeByteArray(StructObject input, DataOutputStream 
dataOutputStream, @@ -178,13 +199,7 @@ public class StructDataType implements GenericDataType<StructObject> { @Override public void fillCardinality(List<Integer> dimCardWithComplex) { - boolean isDictionaryColumn = false; - for (GenericDataType child : children) { - if (child.getIsColumnDictionary()) { - isDictionaryColumn = true; - } - } - if (isDictionaryColumn) { + if (this.getIsColumnDictionary()) { dimCardWithComplex.add(0); for (int i = 0; i < children.size(); i++) { children.get(i).fillCardinality(dimCardWithComplex); http://git-wip-us.apache.org/repos/asf/carbondata/blob/2846eddb/processing/src/main/java/org/apache/carbondata/processing/loading/converter/impl/FieldEncoderFactory.java ---------------------------------------------------------------------- diff --git a/processing/src/main/java/org/apache/carbondata/processing/loading/converter/impl/FieldEncoderFactory.java b/processing/src/main/java/org/apache/carbondata/processing/loading/converter/impl/FieldEncoderFactory.java index 39c12a9..e9d2b02 100644 --- a/processing/src/main/java/org/apache/carbondata/processing/loading/converter/impl/FieldEncoderFactory.java +++ b/processing/src/main/java/org/apache/carbondata/processing/loading/converter/impl/FieldEncoderFactory.java @@ -144,7 +144,8 @@ public class FieldEncoderFactory { ((CarbonDimension) carbonColumn).getListOfChildDimensions(); // Create array parser with complex delimiter ArrayDataType arrayDataType = - new ArrayDataType(carbonColumn.getColName(), parentName, carbonColumn.getColumnId()); + new ArrayDataType(carbonColumn.getColName(), parentName, carbonColumn.getColumnId(), + carbonColumn.hasEncoding(Encoding.DICTIONARY)); for (CarbonDimension dimension : listOfChildDimensions) { arrayDataType.addChildren( createComplexType(dimension, carbonColumn.getColName(), absoluteTableIdentifier, @@ -156,7 +157,8 @@ public class FieldEncoderFactory { ((CarbonDimension) carbonColumn).getListOfChildDimensions(); // Create struct parser with complex delimiter 
StructDataType structDataType = - new StructDataType(carbonColumn.getColName(), parentName, carbonColumn.getColumnId()); + new StructDataType(carbonColumn.getColName(), parentName, carbonColumn.getColumnId(), + carbonColumn.hasEncoding(Encoding.DICTIONARY)); for (CarbonDimension dimension : dimensions) { structDataType.addChildren( createComplexType(dimension, carbonColumn.getColName(), absoluteTableIdentifier, http://git-wip-us.apache.org/repos/asf/carbondata/blob/2846eddb/processing/src/main/java/org/apache/carbondata/processing/loading/model/LoadOption.java ---------------------------------------------------------------------- diff --git a/processing/src/main/java/org/apache/carbondata/processing/loading/model/LoadOption.java b/processing/src/main/java/org/apache/carbondata/processing/loading/model/LoadOption.java index 9733816..98cd90d 100644 --- a/processing/src/main/java/org/apache/carbondata/processing/loading/model/LoadOption.java +++ b/processing/src/main/java/org/apache/carbondata/processing/loading/model/LoadOption.java @@ -236,7 +236,9 @@ public class LoadOption { } } - if (carbonLoadModel.isCarbonTransactionalTable() && !CarbonDataProcessorUtil + // In SDK flow, hadoopConf will always be null, + // hence FileHeader check is not required for nontransactional table + if (hadoopConf != null && !CarbonDataProcessorUtil .isHeaderValid(carbonLoadModel.getTableName(), csvColumns, carbonLoadModel.getCarbonDataLoadSchema(), staticPartitionCols)) { if (csvFile == null) {