[CARBONDATA-2149]Fix complex type data displaying error when use DataFrame to write complex type data
The default values of 'complex_delimiter_level_1' and 'complex_delimiter_level_2' are wrong: they must be '$' and ':', not '\$' and '\:'. The escape character '\' needs to be added only when the delimiters are used in ArrayParserImpl or StructParserImpl. This closes #1962 Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/6f9016db Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/6f9016db Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/6f9016db Branch: refs/heads/branch-1.3 Commit: 6f9016db52dd3f9c31ba20e585debfc283e2594e Parents: 4bbbd4b Author: Zhang Zhichao <441586...@qq.com> Authored: Fri Feb 9 17:32:54 2018 +0800 Committer: Venkata Ramana G <ramana.gollam...@huawei.com> Committed: Tue Feb 27 12:50:55 2018 +0530 ---------------------------------------------------------------------- .../examples/DataFrameComplexTypeExample.scala | 90 ++++++++++++++++++++ .../hadoop/api/CarbonTableOutputFormat.java | 4 +- .../dataload/TestLoadDataWithNoMeasure.scala | 5 +- .../carbondata/spark/util/CarbonScalaUtil.scala | 4 +- .../carbondata/spark/util/DataLoadingUtil.scala | 10 +-- .../spark/util/GlobalDictionaryUtil.scala | 4 +- .../spark/util/AllDictionaryTestCase.scala | 4 +- .../util/ExternalColumnDictionaryTestCase.scala | 4 +- 8 files changed, 107 insertions(+), 18 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/carbondata/blob/6f9016db/examples/spark2/src/main/scala/org/apache/carbondata/examples/DataFrameComplexTypeExample.scala ---------------------------------------------------------------------- diff --git a/examples/spark2/src/main/scala/org/apache/carbondata/examples/DataFrameComplexTypeExample.scala b/examples/spark2/src/main/scala/org/apache/carbondata/examples/DataFrameComplexTypeExample.scala new file mode 100644 index 0000000..b5ff49b --- /dev/null +++ 
b/examples/spark2/src/main/scala/org/apache/carbondata/examples/DataFrameComplexTypeExample.scala @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.examples + +import org.apache.spark.sql.SaveMode + +case class StructElement(school: Array[String], age: Int) +case class ComplexTypeData(id: Int, name: String, city: String, salary: Float, file: StructElement) + +// scalastyle:off println +object DataFrameComplexTypeExample { + + def main(args: Array[String]) { + + val spark = ExampleUtils.createCarbonSession("DataFrameComplexTypeExample", 4) + val complexTableName = s"complex_type_table" + + import spark.implicits._ + + // drop table if exists previously + spark.sql(s"DROP TABLE IF EXISTS ${ complexTableName }") + spark.sql( + s""" + | CREATE TABLE ${ complexTableName }( + | id INT, + | name STRING, + | city STRING, + | salary FLOAT, + | file struct<school:array<string>, age:int> + | ) + | STORED BY 'carbondata' + | TBLPROPERTIES( + | 'sort_columns'='name', + | 'dictionary_include'='city') + | """.stripMargin) + + val sc = spark.sparkContext + // generate data + val df = sc.parallelize(Seq( + ComplexTypeData(1, "index_1", "city_1", 10000.0f, + 
StructElement(Array("struct_11", "struct_12"), 10)), + ComplexTypeData(2, "index_2", "city_2", 20000.0f, + StructElement(Array("struct_21", "struct_22"), 20)), + ComplexTypeData(3, "index_3", "city_3", 30000.0f, + StructElement(Array("struct_31", "struct_32"), 30)) + )).toDF + df.printSchema() + df.write + .format("carbondata") + .option("tableName", complexTableName) + .mode(SaveMode.Append) + .save() + + spark.sql(s"select count(*) from ${ complexTableName }").show(100, truncate = false) + + spark.sql(s"select * from ${ complexTableName } order by id desc").show(300, truncate = false) + + spark.sql(s"select * " + + s"from ${ complexTableName } " + + s"where id = 100000001 or id = 1 limit 100").show(100, truncate = false) + + spark.sql(s"select * " + + s"from ${ complexTableName } " + + s"where id > 10 limit 100").show(100, truncate = false) + + // show segments + spark.sql(s"SHOW SEGMENTS FOR TABLE ${complexTableName}").show(false) + + // drop table + spark.sql(s"DROP TABLE IF EXISTS ${ complexTableName }") + + spark.stop() + } +} +// scalastyle:on println http://git-wip-us.apache.org/repos/asf/carbondata/blob/6f9016db/hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonTableOutputFormat.java ---------------------------------------------------------------------- diff --git a/hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonTableOutputFormat.java b/hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonTableOutputFormat.java index e600f0c..47c8da9 100644 --- a/hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonTableOutputFormat.java +++ b/hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonTableOutputFormat.java @@ -299,11 +299,11 @@ public class CarbonTableOutputFormat extends FileOutputFormat<NullWritable, Stri SKIP_EMPTY_LINE, carbonProperty.getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_SKIP_EMPTY_LINE))); - String complexDelim = conf.get(COMPLEX_DELIMITERS, "\\$" + "," + "\\:"); + String complexDelim = 
conf.get(COMPLEX_DELIMITERS, "$" + "," + ":"); String[] split = complexDelim.split(","); model.setComplexDelimiterLevel1(split[0]); if (split.length > 1) { - model.setComplexDelimiterLevel1(split[1]); + model.setComplexDelimiterLevel2(split[1]); } model.setDateFormat( conf.get( http://git-wip-us.apache.org/repos/asf/carbondata/blob/6f9016db/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/dataload/TestLoadDataWithNoMeasure.scala ---------------------------------------------------------------------- diff --git a/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/dataload/TestLoadDataWithNoMeasure.scala b/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/dataload/TestLoadDataWithNoMeasure.scala index f0b5fe6..f1d25e6 100644 --- a/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/dataload/TestLoadDataWithNoMeasure.scala +++ b/integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/dataload/TestLoadDataWithNoMeasure.scala @@ -86,7 +86,8 @@ class TestLoadDataWithNoMeasure extends QueryTest with BeforeAndAfterAll { ) val testData = s"$resourcesPath/datasingleComplexCol.csv" sql("LOAD DATA LOCAL INPATH '" + testData + "' into table nomeasureTest_scd options " + - "('DELIMITER'=',','QUOTECHAR'='\"','FILEHEADER'='cityDetail','COMPLEX_DELIMITER_LEVEL_1'=':')" + "('DELIMITER'=',','QUOTECHAR'='\"','FILEHEADER'='cityDetail'," + + "'COMPLEX_DELIMITER_LEVEL_1'=':','COMPLEX_DELIMITER_LEVEL_2'='$')" ) } @@ -101,7 +102,7 @@ class TestLoadDataWithNoMeasure extends QueryTest with BeforeAndAfterAll { val testData = s"$resourcesPath/datasingleComplexCol.csv" sql("LOAD DATA LOCAL INPATH '" + testData + "' into table nomeasureTest_scd options " + "('DELIMITER'=',','QUOTECHAR'='\"','FILEHEADER'='cityDetail'," + - "'COMPLEX_DELIMITER_LEVEL_1'=':')" + "'COMPLEX_DELIMITER_LEVEL_1'=':','COMPLEX_DELIMITER_LEVEL_2'='$')" ) } 
http://git-wip-us.apache.org/repos/asf/carbondata/blob/6f9016db/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/CarbonScalaUtil.scala ---------------------------------------------------------------------- diff --git a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/CarbonScalaUtil.scala b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/CarbonScalaUtil.scala index 262adf2..748945d 100644 --- a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/CarbonScalaUtil.scala +++ b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/CarbonScalaUtil.scala @@ -137,7 +137,7 @@ object CarbonScalaUtil { builder.append(getString(x, serializationNullFormat, delimiterLevel1, delimiterLevel2, timeStampFormat, dateFormat, level + 1)).append(delimiter) } - builder.substring(0, builder.length - 1) + builder.substring(0, builder.length - delimiter.length()) case m: scala.collection.Map[Any, Any] => throw new Exception("Unsupported data type: Map") case r: org.apache.spark.sql.Row => @@ -151,7 +151,7 @@ object CarbonScalaUtil { builder.append(getString(r(i), serializationNullFormat, delimiterLevel1, delimiterLevel2, timeStampFormat, dateFormat, level + 1)).append(delimiter) } - builder.substring(0, builder.length - 1) + builder.substring(0, builder.length - delimiter.length()) case other => other.toString } } http://git-wip-us.apache.org/repos/asf/carbondata/blob/6f9016db/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/DataLoadingUtil.scala ---------------------------------------------------------------------- diff --git a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/DataLoadingUtil.scala b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/DataLoadingUtil.scala index 3696e23..a38eaba 100644 --- a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/DataLoadingUtil.scala +++ 
b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/DataLoadingUtil.scala @@ -122,11 +122,11 @@ object DataLoadingUtil { optionsFinal.put( "complex_delimiter_level_1", - options.getOrElse("complex_delimiter_level_1", "\\$")) + options.getOrElse("complex_delimiter_level_1", "$")) optionsFinal.put( "complex_delimiter_level_2", - options.getOrElse("complex_delimiter_level_2", "\\:")) + options.getOrElse("complex_delimiter_level_2", ":")) optionsFinal.put( "dateformat", @@ -323,10 +323,8 @@ object DataLoadingUtil { delimeter.equalsIgnoreCase(complex_delimeter_level2)) { CarbonException.analysisException(s"Field Delimiter and Complex types delimiter are same") } else { - carbonLoadModel.setComplexDelimiterLevel1( - CarbonUtil.delimiterConverter(complex_delimeter_level1)) - carbonLoadModel.setComplexDelimiterLevel2( - CarbonUtil.delimiterConverter(complex_delimeter_level2)) + carbonLoadModel.setComplexDelimiterLevel1(complex_delimeter_level1) + carbonLoadModel.setComplexDelimiterLevel2(complex_delimeter_level2) } // set local dictionary path, and dictionary file extension carbonLoadModel.setAllDictPath(all_dictionary_path) http://git-wip-us.apache.org/repos/asf/carbondata/blob/6f9016db/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/GlobalDictionaryUtil.scala ---------------------------------------------------------------------- diff --git a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/GlobalDictionaryUtil.scala b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/GlobalDictionaryUtil.scala index 5f44e43..9e1ece7 100644 --- a/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/GlobalDictionaryUtil.scala +++ b/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/GlobalDictionaryUtil.scala @@ -274,10 +274,10 @@ object GlobalDictionaryUtil { Pattern.compile(if (d == null) { "" } else { - d + CarbonUtil.delimiterConverter(d) }) } - 
DataFormat(delimiters, 0, patterns) + DataFormat(delimiters.map(CarbonUtil.delimiterConverter(_)), 0, patterns) } else { null } http://git-wip-us.apache.org/repos/asf/carbondata/blob/6f9016db/integration/spark2/src/test/scala/org/apache/carbondata/spark/util/AllDictionaryTestCase.scala ---------------------------------------------------------------------- diff --git a/integration/spark2/src/test/scala/org/apache/carbondata/spark/util/AllDictionaryTestCase.scala b/integration/spark2/src/test/scala/org/apache/carbondata/spark/util/AllDictionaryTestCase.scala index 56c5747..e3678cd 100644 --- a/integration/spark2/src/test/scala/org/apache/carbondata/spark/util/AllDictionaryTestCase.scala +++ b/integration/spark2/src/test/scala/org/apache/carbondata/spark/util/AllDictionaryTestCase.scala @@ -54,8 +54,8 @@ class AllDictionaryTestCase extends Spark2QueryTest with BeforeAndAfterAll { carbonLoadModel.setFactFilePath(filePath) carbonLoadModel.setCsvHeader(header) carbonLoadModel.setCsvDelimiter(",") - carbonLoadModel.setComplexDelimiterLevel1("\\$") - carbonLoadModel.setComplexDelimiterLevel2("\\:") + carbonLoadModel.setComplexDelimiterLevel1("$") + carbonLoadModel.setComplexDelimiterLevel2(":") carbonLoadModel.setAllDictPath(allDictFilePath) carbonLoadModel.setSerializationNullFormat( TableOptionConstant.SERIALIZATION_NULL_FORMAT.getName + ",\\N") http://git-wip-us.apache.org/repos/asf/carbondata/blob/6f9016db/integration/spark2/src/test/scala/org/apache/carbondata/spark/util/ExternalColumnDictionaryTestCase.scala ---------------------------------------------------------------------- diff --git a/integration/spark2/src/test/scala/org/apache/carbondata/spark/util/ExternalColumnDictionaryTestCase.scala b/integration/spark2/src/test/scala/org/apache/carbondata/spark/util/ExternalColumnDictionaryTestCase.scala index a1b39d8..e543893 100644 --- a/integration/spark2/src/test/scala/org/apache/carbondata/spark/util/ExternalColumnDictionaryTestCase.scala +++ 
b/integration/spark2/src/test/scala/org/apache/carbondata/spark/util/ExternalColumnDictionaryTestCase.scala @@ -163,8 +163,8 @@ class ExternalColumnDictionaryTestCase extends Spark2QueryTest with BeforeAndAft carbonLoadModel.setFactFilePath(filePath) carbonLoadModel.setCsvHeader(header) carbonLoadModel.setCsvDelimiter(csvDelimiter) - carbonLoadModel.setComplexDelimiterLevel1("\\$") - carbonLoadModel.setComplexDelimiterLevel2("\\:") + carbonLoadModel.setComplexDelimiterLevel1("$") + carbonLoadModel.setComplexDelimiterLevel2(":") carbonLoadModel.setColDictFilePath(extColFilePath) carbonLoadModel.setQuoteChar("\""); carbonLoadModel.setSerializationNullFormat(