[GitHub] carbondata pull request #2966: [CARBONDATA-3162][CARBONDATA-3163][CARBONDATA...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2966#discussion_r241643126 --- Diff: store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonWriterBuilder.java --- @@ -501,6 +502,17 @@ public CarbonLoadModel buildLoadModel(Schema carbonSchema) } // for the longstring field, change the datatype from string to varchar this.schema = updateSchemaFields(carbonSchema, longStringColumns); +if (sortColumns != null && sortColumns.length != 0) { + if (options == null || options.get("sort_scope") == null) { +// If sort_columns are specified and sort_scope is not specified, +// change sort scope to local_sort as now by default sort scope is no_sort. +if (options == null) { + options = new HashMap<>(); +} +//TODO: add in carbon property instead of load options --- End diff -- For example, sort_columns are mentioned in table properties, but sort_scope is present as "Batch_sort" in carbon properties; in that case, if I set sort_scope in table properties, the test case fails. That test even sets the carbon properties after creating the table! ---
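The defaulting rule debated in this thread reduces to the sketch below. It is reconstructed from the quoted diff; the options.put call is implied by the code comment rather than shown in the excerpt, so treat it as an illustration, not the committed code.

```java
import java.util.HashMap;
import java.util.Map;

// Reconstruction of the sort_scope defaulting logic quoted in the diff above.
final class SortScopeDefaulting {
  static Map<String, String> applyDefault(String[] sortColumns, Map<String, String> options) {
    if (sortColumns != null && sortColumns.length != 0) {
      if (options == null || options.get("sort_scope") == null) {
        // sort_columns given but no explicit sort_scope load option: since the
        // global default is now no_sort, promote the scope to local_sort.
        if (options == null) {
          options = new HashMap<>();
        }
        options.put("sort_scope", "local_sort");
      }
    }
    return options;
  }
}
```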
[GitHub] carbondata pull request #2966: [CARBONDATA-3162][CARBONDATA-3163][CARBONDATA...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2966#discussion_r241642790 --- Diff: integration/spark2/src/test/scala/org/apache/spark/carbondata/restructure/AlterTableValidationTestCase.scala --- @@ -523,7 +523,8 @@ class AlterTableValidationTestCase extends Spark2QueryTest with BeforeAndAfterAl } } - test("describe formatted for default sort_columns pre and post alter") { + // after changing default sort_scope to no_sort, all dimensions are not selected for sorting. + ignore("describe formatted for default sort_columns pre and post alter") { --- End diff -- ok. ---
[GitHub] carbondata pull request #2966: [CARBONDATA-3162][CARBONDATA-3163][CARBONDATA...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2966#discussion_r241642755 --- Diff: store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonWriterBuilder.java --- @@ -501,6 +502,17 @@ public CarbonLoadModel buildLoadModel(Schema carbonSchema) } // for the longstring field, change the datatype from string to varchar this.schema = updateSchemaFields(carbonSchema, longStringColumns); +if (sortColumns != null && sortColumns.length != 0) { + if (options == null || options.get("sort_scope") == null) { +// If sort_columns are specified and sort_scope is not specified, +// change sort scope to local_sort as now by default sort scope is no_sort. +if (options == null) { + options = new HashMap<>(); +} +//TODO: add in carbon property instead of load options --- End diff -- There are test cases that set only the sort scope in carbon properties but not the sort columns; those test cases fail because the table property I set has higher priority than the carbon properties. ---
[GitHub] carbondata pull request #2899: [CARBONDATA-3073][CARBONDATA-3044] Support co...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2899#discussion_r241334135 --- Diff: store/CSDK/test/main.cpp --- @@ -645,6 +653,278 @@ bool testWriteData(JNIEnv *env, char *path, int argc, char *argv[]) { } } +void writeData(JNIEnv *env, CarbonWriter writer, int size, jclass objClass, char *stringField, short shortField) { +jobjectArray arr = env->NewObjectArray(size, objClass, 0); + +jobject jStringField = env->NewStringUTF(stringField); +env->SetObjectArrayElement(arr, 0, jStringField); + +char ctrShort[10]; +gcvt(shortField % 1, 10, ctrShort); +jobject jShortField = env->NewStringUTF(ctrShort); +env->SetObjectArrayElement(arr, 1, jShortField); + +writer.write(arr); + +env->DeleteLocalRef(jStringField); +env->DeleteLocalRef(jShortField); +env->DeleteLocalRef(arr); +} + +/** + * test WithLoadOption interface + * + * @param env jni env + * @param path file path + * @param argc argument counter + * @param argv argument vector + * @return true or throw exception + */ +bool testWithLoadOption(JNIEnv *env, char *path, int argc, char *argv[]) { + +char *jsonSchema = "[{stringField:string},{shortField:short},{intField:int},{longField:long},{doubleField:double},{boolField:boolean},{dateField:date},{timeField:timestamp},{floatField:float},{arrayField:array}]"; +try { +CarbonWriter writer; +writer.builder(env); +writer.outputPath(path); +writer.withCsvInput(jsonSchema); +writer.withLoadOption("complex_delimiter_level_1", "#"); +writer.writtenBy("CSDK"); +writer.taskNo(185); +writer.withThreadSafe(1); +writer.uniqueIdentifier(154991181400); +writer.withBlockSize(1); +writer.withBlockletSize(16); +writer.enableLocalDictionary(true); +writer.localDictionaryThreshold(1); +if (argc > 3) { +writer.withHadoopConf("fs.s3a.access.key", argv[1]); +writer.withHadoopConf("fs.s3a.secret.key", argv[2]); +writer.withHadoopConf("fs.s3a.endpoint", argv[3]); +} +writer.build(); + +int rowNum = 7; +int size = 10; +long longValue = 0; +double doubleValue = 0; +float floatValue = 0; +jclass objClass = env->FindClass("java/lang/String"); +for (int i = 0; i < rowNum; ++i) { +jobjectArray arr = env->NewObjectArray(size, objClass, 0); +char ctrInt[10]; +gcvt(i, 10, ctrInt); + +char a[15] = "robot"; +strcat(a, ctrInt); +jobject stringField = env->NewStringUTF(a); +env->SetObjectArrayElement(arr, 0, stringField); + +char ctrShort[10]; +gcvt(i % 1, 10, ctrShort); +jobject shortField = env->NewStringUTF(ctrShort); +env->SetObjectArrayElement(arr, 1, shortField); + +jobject intField = env->NewStringUTF(ctrInt); +env->SetObjectArrayElement(arr, 2, intField); + + +char ctrLong[10]; +gcvt(longValue, 10, ctrLong); +longValue = longValue + 2; +jobject longField = env->NewStringUTF(ctrLong); +env->SetObjectArrayElement(arr, 3, longField); + +char ctrDouble[10]; +gcvt(doubleValue, 10, ctrDouble); +doubleValue = doubleValue + 2; +jobject doubleField = env->NewStringUTF(ctrDouble); +env->SetObjectArrayElement(arr, 4, doubleField); + +jobject boolField = env->NewStringUTF("true"); +env->SetObjectArrayElement(arr, 5, boolField); + +jobject dateField = env->NewStringUTF(" 2019-03-02"); +env->SetObjectArrayElement(arr, 6, dateField); + +jobject timeField = env->NewStringUTF("2019-02-12 03:03:34"); +env->SetObjectArrayElement(arr, 7, timeField); + +char ctrFloat[10]; +gcvt(floatValue, 10, ctrFloat); +floatValue = floatValue + 2; +jobject floatField = env->NewStringUTF(ctrFloat); +env->SetObjectArrayElement(arr, 8, floatField); + +jobject arrayField = 
env->NewStringUTF("Hello#World#From#Carbon"); +env->SetObjec
[GitHub] carbondata pull request #2899: [CARBONDATA-3073][CARBONDATA-3044] Support co...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2899#discussion_r241333995 --- Diff: store/CSDK/test/main.cpp --- @@ -645,6 +653,278 @@ bool testWriteData(JNIEnv *env, char *path, int argc, char *argv[]) { } } +void writeData(JNIEnv *env, CarbonWriter writer, int size, jclass objClass, char *stringField, short shortField) { +jobjectArray arr = env->NewObjectArray(size, objClass, 0); + +jobject jStringField = env->NewStringUTF(stringField); +env->SetObjectArrayElement(arr, 0, jStringField); + +char ctrShort[10]; +gcvt(shortField % 1, 10, ctrShort); +jobject jShortField = env->NewStringUTF(ctrShort); +env->SetObjectArrayElement(arr, 1, jShortField); + +writer.write(arr); + +env->DeleteLocalRef(jStringField); +env->DeleteLocalRef(jShortField); +env->DeleteLocalRef(arr); +} + +/** + * test WithLoadOption interface + * + * @param env jni env + * @param path file path + * @param argc argument counter + * @param argv argument vector + * @return true or throw exception + */ +bool testWithLoadOption(JNIEnv *env, char *path, int argc, char *argv[]) { + +char *jsonSchema = "[{stringField:string},{shortField:short},{intField:int},{longField:long},{doubleField:double},{boolField:boolean},{dateField:date},{timeField:timestamp},{floatField:float},{arrayField:array}]"; +try { +CarbonWriter writer; +writer.builder(env); --- End diff -- These load options can be added to the existing writer test case; there is no need to add a new one. ---
[GitHub] carbondata pull request #2899: [CARBONDATA-3073][CARBONDATA-3044] Support co...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2899#discussion_r241328653 --- Diff: store/sdk/src/test/java/org/apache/carbondata/sdk/file/CSVCarbonWriterTest.java --- @@ -543,4 +543,42 @@ public void testWritingAndReadingArrayOfFloatAndByte() throws IOException { } } + @Test + public void testWithTableProperties() throws IOException { --- End diff -- What's the use of this test case? How would you validate whether the table property is working or not? ---
[GitHub] carbondata pull request #2899: [CARBONDATA-3073][CARBONDATA-3044] Support co...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2899#discussion_r241327234 --- Diff: store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonWriterBuilder.java --- @@ -199,6 +203,22 @@ public CarbonWriterBuilder withLoadOptions(Map options) { return this; } + /** + * To support the load options for sdk writer + * + * @param key the key of load option + * @param value the value of load option + * @return updated CarbonWriterBuilder object + */ + public CarbonWriterBuilder withLoadOption(String key, String value) { +Objects.requireNonNull(key, "key of table properties should not be null"); --- End diff -- This is a load option, not a table property; the null-check message should say "load option". ---
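A minimal sketch of the message fix being requested. The signature and javadoc match the diff; the `options` field name is an assumption made for illustration.

```java
/**
 * To support the load options for sdk writer
 *
 * @param key the key of load option
 * @param value the value of load option
 * @return updated CarbonWriterBuilder object
 */
public CarbonWriterBuilder withLoadOption(String key, String value) {
  Objects.requireNonNull(key, "key of load option should not be null");
  Objects.requireNonNull(value, "value of load option should not be null");
  // `options` is assumed to be the builder's internal load-options map.
  this.options.put(key, value);
  return this;
}
```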
[GitHub] carbondata pull request #2899: [CARBONDATA-3073][CARBONDATA-3044] Support co...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2899#discussion_r241322296 --- Diff: store/CSDK/src/CarbonWriter.cpp --- @@ -58,6 +58,35 @@ void CarbonWriter::outputPath(char *path) { carbonWriterBuilderObject = jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args); } +void CarbonWriter::sortBy(int argc, char **argv) { +if (argc < 0) { +throw std::runtime_error("argc parameter can't be negative."); +} +if (argv == NULL) { +throw std::runtime_error("argv parameter can't be NULL."); +} +checkBuilder(); +jclass carbonReaderBuilderClass = jniEnv->GetObjectClass(carbonWriterBuilderObject); +jmethodID methodID = jniEnv->GetMethodID(carbonReaderBuilderClass, "sortBy", + "([Ljava/lang/String;)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;"); +if (methodID == NULL) { +throw std::runtime_error("Can't find the method in java: sortBy"); +} +jclass objectArrayClass = jniEnv->FindClass("Ljava/lang/String;"); +if (objectArrayClass == NULL) { +throw std::runtime_error("Can't find the class in java: java/lang/String"); +} +jobjectArray array = jniEnv->NewObjectArray(argc, objectArrayClass, NULL); +for (int i = 0; i < argc; ++i) { +jstring value = jniEnv->NewStringUTF(argv[i]); +jniEnv->SetObjectArrayElement(array, i, value); +} + +jvalue args[1]; +args[0].l = array; +carbonWriterBuilderObject = jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args); --- End diff -- Can this be modified to (void) jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args);? There is no need to collect the return value from the Java API, as the return type of the C++ method is void. Check the same for all the newly added void APIs. ---
[GitHub] carbondata pull request #2899: [CARBONDATA-3073][CARBONDATA-3044] Support co...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2899#discussion_r241321530 --- Diff: store/CSDK/src/CarbonWriter.cpp --- @@ -98,6 +127,158 @@ void CarbonWriter::withHadoopConf(char *key, char *value) { carbonWriterBuilderObject = jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args); } +void CarbonWriter::withTableProperty(char *key, char *value) { +if (key == NULL) { +throw std::runtime_error("key parameter can't be NULL."); +} +if (value == NULL) { +throw std::runtime_error("value parameter can't be NULL."); +} +checkBuilder(); +jclass carbonWriterBuilderClass = jniEnv->GetObjectClass(carbonWriterBuilderObject); +jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, "withTableProperty", + "(Ljava/lang/String;Ljava/lang/String;)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;"); +if (methodID == NULL) { +throw std::runtime_error("Can't find the method in java: withTableProperty"); +} +jvalue args[2]; +args[0].l = jniEnv->NewStringUTF(key); +args[1].l = jniEnv->NewStringUTF(value); +carbonWriterBuilderObject = jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args); +} + +void CarbonWriter::withLoadOption(char *key, char *value) { +if (key == NULL) { +throw std::runtime_error("key parameter can't be NULL."); +} +if (value == NULL) { +throw std::runtime_error("value parameter can't be NULL."); +} +checkBuilder(); +jclass carbonWriterBuilderClass = jniEnv->GetObjectClass(carbonWriterBuilderObject); +jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, "withLoadOption", + "(Ljava/lang/String;Ljava/lang/String;)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;"); +if (methodID == NULL) { +throw std::runtime_error("Can't find the method in java: withLoadOption"); +} +jvalue args[2]; +args[0].l = jniEnv->NewStringUTF(key); +args[1].l = jniEnv->NewStringUTF(value); +carbonWriterBuilderObject = jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args); +} + +void CarbonWriter::taskNo(long taskNo) { +if (taskNo < 0) { +throw std::runtime_error("taskNo parameter can't be negative."); +} +checkBuilder(); +jclass carbonWriterBuilderClass = jniEnv->GetObjectClass(carbonWriterBuilderObject); +jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, "taskNo", +"(J)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;"); +if (methodID == NULL) { +throw std::runtime_error("Can't find the method in java: taskNo"); +} +jvalue args[2]; +args[0].j = taskNo; +carbonWriterBuilderObject = jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args); +} + +void CarbonWriter::uniqueIdentifier(long timestamp) { +if (timestamp < 1) { +throw std::runtime_error("timestamp parameter can't be negative."); +} +checkBuilder(); +jclass carbonWriterBuilderClass = jniEnv->GetObjectClass(carbonWriterBuilderObject); +jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, "uniqueIdentifier", +"(J)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;"); +if (methodID == NULL) { +throw std::runtime_error("Can't find the method in java: uniqueIdentifier"); +} +jvalue args[2]; +args[0].j = timestamp; +carbonWriterBuilderObject = jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args); +} + +void CarbonWriter::withThreadSafe(short numOfThreads) { +if (numOfThreads < 1) { +throw std::runtime_error("numOfThreads parameter can't be negative."); +} +checkBuilder(); +jclass carbonWriterBuilderClass = jniEnv->GetObjectClass(carbonWriterBuilderObject); +jmethodID 
methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, "withThreadSafe", +"(S)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;"); +if (methodID == NULL) { +throw std::runtime_error("Can't find the method in java: withThreadSafe"); +} +jvalue args[2]; +args[0].s = numOfThreads; +carbonWriterBuilderObject = jniEnv->Cal
[GitHub] carbondata pull request #2899: [CARBONDATA-3073][CARBONDATA-3044] Support co...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2899#discussion_r241320959 --- Diff: store/CSDK/src/CarbonWriter.cpp --- @@ -98,6 +127,158 @@ void CarbonWriter::withHadoopConf(char *key, char *value) { carbonWriterBuilderObject = jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args); } +void CarbonWriter::withTableProperty(char *key, char *value) { +if (key == NULL) { +throw std::runtime_error("key parameter can't be NULL."); +} +if (value == NULL) { +throw std::runtime_error("value parameter can't be NULL."); +} +checkBuilder(); +jclass carbonWriterBuilderClass = jniEnv->GetObjectClass(carbonWriterBuilderObject); +jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, "withTableProperty", + "(Ljava/lang/String;Ljava/lang/String;)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;"); +if (methodID == NULL) { +throw std::runtime_error("Can't find the method in java: withTableProperty"); +} +jvalue args[2]; +args[0].l = jniEnv->NewStringUTF(key); +args[1].l = jniEnv->NewStringUTF(value); +carbonWriterBuilderObject = jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args); +} + +void CarbonWriter::withLoadOption(char *key, char *value) { +if (key == NULL) { +throw std::runtime_error("key parameter can't be NULL."); +} +if (value == NULL) { +throw std::runtime_error("value parameter can't be NULL."); +} +checkBuilder(); +jclass carbonWriterBuilderClass = jniEnv->GetObjectClass(carbonWriterBuilderObject); +jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, "withLoadOption", + "(Ljava/lang/String;Ljava/lang/String;)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;"); +if (methodID == NULL) { +throw std::runtime_error("Can't find the method in java: withLoadOption"); +} +jvalue args[2]; +args[0].l = jniEnv->NewStringUTF(key); +args[1].l = jniEnv->NewStringUTF(value); +carbonWriterBuilderObject = jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args); +} + +void CarbonWriter::taskNo(long taskNo) { +if (taskNo < 0) { +throw std::runtime_error("taskNo parameter can't be negative."); +} +checkBuilder(); +jclass carbonWriterBuilderClass = jniEnv->GetObjectClass(carbonWriterBuilderObject); +jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, "taskNo", +"(J)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;"); +if (methodID == NULL) { +throw std::runtime_error("Can't find the method in java: taskNo"); +} +jvalue args[2]; +args[0].j = taskNo; +carbonWriterBuilderObject = jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args); +} + +void CarbonWriter::uniqueIdentifier(long timestamp) { +if (timestamp < 1) { +throw std::runtime_error("timestamp parameter can't be negative."); +} +checkBuilder(); +jclass carbonWriterBuilderClass = jniEnv->GetObjectClass(carbonWriterBuilderObject); +jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, "uniqueIdentifier", +"(J)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;"); +if (methodID == NULL) { +throw std::runtime_error("Can't find the method in java: uniqueIdentifier"); +} +jvalue args[2]; +args[0].j = timestamp; +carbonWriterBuilderObject = jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args); +} + +void CarbonWriter::withThreadSafe(short numOfThreads) { +if (numOfThreads < 1) { +throw std::runtime_error("numOfThreads parameter can't be negative."); +} +checkBuilder(); +jclass carbonWriterBuilderClass = jniEnv->GetObjectClass(carbonWriterBuilderObject); +jmethodID 
methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, "withThreadSafe", +"(S)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;"); +if (methodID == NULL) { +throw std::runtime_error("Can't find the method in java: withThreadSafe"); +} +jvalue args[2]; --- End diff -- same as above ---
[GitHub] carbondata pull request #2899: [CARBONDATA-3073][CARBONDATA-3044] Support co...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2899#discussion_r241318527 --- Diff: store/CSDK/src/CarbonWriter.cpp --- @@ -98,6 +127,158 @@ void CarbonWriter::withHadoopConf(char *key, char *value) { carbonWriterBuilderObject = jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args); } +void CarbonWriter::withTableProperty(char *key, char *value) { +if (key == NULL) { +throw std::runtime_error("key parameter can't be NULL."); +} +if (value == NULL) { +throw std::runtime_error("value parameter can't be NULL."); +} +checkBuilder(); +jclass carbonWriterBuilderClass = jniEnv->GetObjectClass(carbonWriterBuilderObject); +jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, "withTableProperty", + "(Ljava/lang/String;Ljava/lang/String;)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;"); +if (methodID == NULL) { +throw std::runtime_error("Can't find the method in java: withTableProperty"); +} +jvalue args[2]; +args[0].l = jniEnv->NewStringUTF(key); +args[1].l = jniEnv->NewStringUTF(value); +carbonWriterBuilderObject = jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args); +} + +void CarbonWriter::withLoadOption(char *key, char *value) { +if (key == NULL) { +throw std::runtime_error("key parameter can't be NULL."); +} +if (value == NULL) { +throw std::runtime_error("value parameter can't be NULL."); +} +checkBuilder(); +jclass carbonWriterBuilderClass = jniEnv->GetObjectClass(carbonWriterBuilderObject); +jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, "withLoadOption", + "(Ljava/lang/String;Ljava/lang/String;)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;"); +if (methodID == NULL) { +throw std::runtime_error("Can't find the method in java: withLoadOption"); +} +jvalue args[2]; +args[0].l = jniEnv->NewStringUTF(key); +args[1].l = jniEnv->NewStringUTF(value); +carbonWriterBuilderObject = jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args); +} + +void CarbonWriter::taskNo(long taskNo) { +if (taskNo < 0) { +throw std::runtime_error("taskNo parameter can't be negative."); +} +checkBuilder(); +jclass carbonWriterBuilderClass = jniEnv->GetObjectClass(carbonWriterBuilderObject); +jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, "taskNo", +"(J)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;"); +if (methodID == NULL) { +throw std::runtime_error("Can't find the method in java: taskNo"); +} +jvalue args[2]; +args[0].j = taskNo; +carbonWriterBuilderObject = jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args); +} + +void CarbonWriter::uniqueIdentifier(long timestamp) { +if (timestamp < 1) { +throw std::runtime_error("timestamp parameter can't be negative."); +} +checkBuilder(); +jclass carbonWriterBuilderClass = jniEnv->GetObjectClass(carbonWriterBuilderObject); +jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, "uniqueIdentifier", +"(J)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;"); +if (methodID == NULL) { +throw std::runtime_error("Can't find the method in java: uniqueIdentifier"); +} +jvalue args[2]; --- End diff -- should be size 1 ? ---
[GitHub] carbondata pull request #2899: [CARBONDATA-3073][CARBONDATA-3044] Support co...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2899#discussion_r241318351 --- Diff: store/CSDK/src/CarbonWriter.cpp --- @@ -98,6 +127,158 @@ void CarbonWriter::withHadoopConf(char *key, char *value) { carbonWriterBuilderObject = jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args); } +void CarbonWriter::withTableProperty(char *key, char *value) { +if (key == NULL) { +throw std::runtime_error("key parameter can't be NULL."); +} +if (value == NULL) { +throw std::runtime_error("value parameter can't be NULL."); +} +checkBuilder(); +jclass carbonWriterBuilderClass = jniEnv->GetObjectClass(carbonWriterBuilderObject); +jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, "withTableProperty", + "(Ljava/lang/String;Ljava/lang/String;)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;"); +if (methodID == NULL) { +throw std::runtime_error("Can't find the method in java: withTableProperty"); +} +jvalue args[2]; +args[0].l = jniEnv->NewStringUTF(key); +args[1].l = jniEnv->NewStringUTF(value); +carbonWriterBuilderObject = jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args); +} + +void CarbonWriter::withLoadOption(char *key, char *value) { +if (key == NULL) { +throw std::runtime_error("key parameter can't be NULL."); +} +if (value == NULL) { +throw std::runtime_error("value parameter can't be NULL."); +} +checkBuilder(); +jclass carbonWriterBuilderClass = jniEnv->GetObjectClass(carbonWriterBuilderObject); +jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, "withLoadOption", + "(Ljava/lang/String;Ljava/lang/String;)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;"); +if (methodID == NULL) { +throw std::runtime_error("Can't find the method in java: withLoadOption"); +} +jvalue args[2]; +args[0].l = jniEnv->NewStringUTF(key); +args[1].l = jniEnv->NewStringUTF(value); +carbonWriterBuilderObject = jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args); +} + +void CarbonWriter::taskNo(long taskNo) { +if (taskNo < 0) { +throw std::runtime_error("taskNo parameter can't be negative."); +} +checkBuilder(); +jclass carbonWriterBuilderClass = jniEnv->GetObjectClass(carbonWriterBuilderObject); +jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, "taskNo", +"(J)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;"); +if (methodID == NULL) { +throw std::runtime_error("Can't find the method in java: taskNo"); +} +jvalue args[2]; --- End diff -- should be size 1 ? ---
[GitHub] carbondata pull request #2899: [CARBONDATA-3073][CARBONDATA-3044] Support co...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2899#discussion_r241315209 --- Diff: store/CSDK/src/CarbonWriter.cpp --- @@ -58,6 +58,35 @@ void CarbonWriter::outputPath(char *path) { carbonWriterBuilderObject = jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args); } +void CarbonWriter::sortBy(int argc, char **argv) { +if (argc < 0) { +throw std::runtime_error("argc parameter can't be negative."); +} +if (argv == NULL) { +throw std::runtime_error("argv parameter can't be NULL."); +} +checkBuilder(); +jclass carbonReaderBuilderClass = jniEnv->GetObjectClass(carbonWriterBuilderObject); --- End diff -- it is writerBuilder, not readerBuilder ---
[GitHub] carbondata pull request #2899: [CARBONDATA-3073][CARBONDATA-3044] Support co...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2899#discussion_r241313087 --- Diff: docs/csdk-guide.md --- @@ -214,6 +226,122 @@ release the memory and destroy JVM. void withHadoopConf(char *key, char *value); ``` +<<<<<<< HEAD +=== +``` + /** + * To support the table properties for writer + * + * @param key properties key + * @param value properties value + */ +void withTableProperty(char *key, char *value); +``` + +``` +/** + * To support the load options for C++ sdk writer + * + * @param options key,value pair of load options. + * supported keys values are + * a. bad_records_logger_enable -- true (write into separate logs), false + * b. bad_records_action -- FAIL, FORCE, IGNORE, REDIRECT + * c. bad_record_path -- path + * d. dateformat -- same as JAVA SimpleDateFormat + * e. timestampformat -- same as JAVA SimpleDateFormat + * f. complex_delimiter_level_1 -- value to Split the complexTypeData + * g. complex_delimiter_level_2 -- value to Split the nested complexTypeData + * h. quotechar + * i. escapechar + * + * Default values are as follows. + * + * a. bad_records_logger_enable -- "false" + * b. bad_records_action -- "FAIL" + * c. bad_record_path -- "" + * d. dateformat -- "" , uses from carbon.properties file + * e. timestampformat -- "", uses from carbon.properties file + * f. complex_delimiter_level_1 -- "$" + * g. complex_delimiter_level_2 -- ":" + * h. quotechar -- "\"" + * i. escapechar -- "\\" + * + * @return updated CarbonWriterBuilder + */ +void withLoadOption(char *key, char *value); +``` + +``` +/** + * sets the taskNo for the writer. CSDKs concurrently running + * will set taskNo in order to avoid conflicts in file's name during write. + * + * @param taskNo is the TaskNo user wants to specify. + * by default it is system time in nano seconds. + */ +void taskNo(long taskNo); +``` + +``` +/** + * to set the timestamp in the carbondata and carbonindex index files + * + * @param timestamp is a timestamp to be used in the carbondata and carbonindex index files. + * By default set to zero. + * @return updated CarbonWriterBuilder + */ +void uniqueIdentifier(long timestamp); +``` + +``` +/** + * To make c++ sdk writer thread safe. + * + * @param numOfThreads should number of threads in which writer is called in multi-thread scenario + * default C++ sdk writer is not thread safe. + * can use one writer instance in one thread only. + */ +void withThreadSafe(short numOfThreads) ; +``` + +``` +/** + * To set the carbondata file size in MB between 1MB-2048MB + * + * @param blockSize is size in MB between 1MB to 2048 MB + * default value is 1024 MB + */ +void withBlockSize(int blockSize); +``` + +``` +/** + * To set the blocklet size of CarbonData file + * + * @param blockletSize is blocklet size in MB + *default value is 64 MB + * @return updated CarbonWriterBuilder + */ +void withBlockletSize(int blockletSize); +``` + +``` +/** + * @param localDictionaryThreshold is localDictionaryThreshold, default is 1 + * @return updated CarbonWriterBuilder + */ +void localDictionaryThreshold(int localDictionaryThreshold); +``` + +``` +/** + * @param enableLocalDictionary enable local dictionary, default is false + * @return updated CarbonWriterBuilder + */ +void enableLocalDictionary(bool enableLocalDictionary); +``` + +>>>>>>> aebd066bc... [CARBONDATA-3073] Support configure TableProperties,withLoadOption etc. interface in carbon writer of C++ SDK --- End diff -- same as above ---
[GitHub] carbondata pull request #2899: [CARBONDATA-3073][CARBONDATA-3044] Support co...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2899#discussion_r241311890 --- Diff: docs/csdk-guide.md --- @@ -214,6 +226,122 @@ release the memory and destroy JVM. void withHadoopConf(char *key, char *value); ``` +<<<<<<< HEAD --- End diff -- what's this? problem while resolving a conflict? please check and redo this file. ---
[GitHub] carbondata pull request #2982: [CARBONDATA-3158] support presto-carbon to re...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2982#discussion_r241286067 --- Diff: integration/presto/src/main/java/org/apache/carbondata/presto/impl/CarbonTableReader.java --- @@ -364,23 +355,38 @@ private CarbonTable parseCarbonMetadata(SchemaTableName table) { String tablePath = storePath + "/" + carbonTableIdentifier.getDatabaseName() + "/" + carbonTableIdentifier.getTableName(); - //Step 2: read the metadata (tableInfo) of the table. - ThriftReader.TBaseCreator createTBase = new ThriftReader.TBaseCreator() { -// TBase is used to read and write thrift objects. -// TableInfo is a kind of TBase used to read and write table information. -// TableInfo is generated by thrift, -// see schema.thrift under format/src/main/thrift for details. -public TBase create() { - return new org.apache.carbondata.format.TableInfo(); + String metadataPath = CarbonTablePath.getSchemaFilePath(tablePath); + boolean isTransactionalTable = false; + try { +FileFactory.FileType fileType = FileFactory.getFileType(metadataPath); +if (FileFactory.getCarbonFile(metadataPath, fileType).isFileExist(metadataPath, fileType)) { --- End diff -- Currently this avoids one function call: if I don't pass the file type, the method computes it again internally. Let it be there for now; when we remove fileType from FileFactory in the future, this will also be optimized. ---
[GitHub] carbondata issue #2983: [CARBONDATA-3119] Fixed SDK Write for Complex Array ...
Github user ajantha-bhat commented on the issue: https://github.com/apache/carbondata/pull/2983 retest this please ---
[GitHub] carbondata pull request #2949: [CARBONDATA-3118] support parallel block prun...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2949#discussion_r240900313 --- Diff: core/src/main/java/org/apache/carbondata/core/datamap/TableDataMap.java --- @@ -205,26 +195,53 @@ public BlockletDetailsFetcher getBlockletDetailsFetcher() { final FilterResolverIntf filterExp, final List partitions, List blocklets, final Map> dataMaps, int totalFiles) { +/* + * + * Below is the example of how this part of code works. + * consider a scenario of having 5 segments, 10 datamaps in each segment, --- End diff -- BlockDataMap and BlockletDataMap can store information for multiple files; each file is one row in that datamap. Non-default datamaps are not like that, so for default datamaps the multi-thread distribution happens based on the number of entries in the datamaps, while for non-default datamaps the distribution is based on the number of datamaps (one datamap is counted as one record for non-default datamaps). Also, 10 datamaps in a segment means one merge index file has info of 10 index files. ---
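A toy illustration of the distribution rule described above. The class and method names are invented for this example, not the actual TableDataMap code: default datamaps are weighted by their entry counts, while each non-default datamap counts as one unit.

```java
import java.util.ArrayList;
import java.util.List;

// Greedy split of datamaps across pruning threads, weighting each default
// datamap by its entry count and each non-default datamap as a single unit.
final class PruneWorkSplit {
  static List<List<Integer>> split(long[] entryCounts, boolean isDefaultDataMap, int threads) {
    List<List<Integer>> buckets = new ArrayList<>();
    long[] load = new long[threads];
    for (int t = 0; t < threads; t++) {
      buckets.add(new ArrayList<>());
    }
    for (int i = 0; i < entryCounts.length; i++) {
      long weight = isDefaultDataMap ? entryCounts[i] : 1;
      int min = 0;
      for (int t = 1; t < threads; t++) {
        if (load[t] < load[min]) {
          min = t;
        }
      }
      buckets.get(min).add(i); // assign datamap i to the least-loaded thread
      load[min] += weight;
    }
    return buckets;
  }
}
```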
[GitHub] carbondata issue #2983: [CARBONDATA-3119] Fixed SDK Write for Complex Array ...
Github user ajantha-bhat commented on the issue: https://github.com/apache/carbondata/pull/2983 retest this please ---
[GitHub] carbondata issue #2983: [CARBONDATA-3119] Fixed SDK Write for Complex Array ...
Github user ajantha-bhat commented on the issue: https://github.com/apache/carbondata/pull/2983 @ravipesala , @chenliang613 : please add @shivamasn to whitelist ---
[GitHub] carbondata pull request #2983: [CARBONDATA-3119] Fixed SDK Write for Complex...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2983#discussion_r240889253 --- Diff: store/sdk/src/test/java/org/apache/carbondata/sdk/file/CarbonReaderTest.java --- @@ -2072,4 +1797,35 @@ public void testReadingNullValues() { } } + @Test public void testSdkWriteWhenArrayOfStringIsEmpty() + throws IOException, InvalidLoadOptionException { + +CarbonProperties.getInstance() +.addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, "FAIL"); --- End diff -- Same as above: take a backup of CARBON_BAD_RECORDS_ACTION and restore it at the end of the test case. ---
[GitHub] carbondata pull request #2983: [CARBONDATA-3119] Fixed SDK Write for Complex...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2983#discussion_r240889185 --- Diff: store/sdk/src/test/java/org/apache/carbondata/sdk/file/CarbonReaderTest.java --- @@ -23,8 +23,11 @@ import java.util.*; import org.apache.avro.generic.GenericData; + --- End diff -- Please revert the unwanted changes; only your changes should be in the diff. ---
[GitHub] carbondata pull request #2983: [CARBONDATA-3119] Fixed SDK Write for Complex...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2983#discussion_r240889074 --- Diff: integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/complexType/TestComplexDataType.scala --- @@ -39,6 +39,27 @@ class TestComplexDataType extends QueryTest with BeforeAndAfterAll { .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, badRecordAction) } + test("test Projection PushDown for Array - String type when Array is Empty") { +CarbonProperties.getInstance() + .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, "FAIL") --- End diff -- Take a backup of the previous CARBON_BAD_RECORDS_ACTION property and set it back after this test case; otherwise it affects other test suites. ---
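The backup-and-restore pattern the reviewer is asking for, sketched in Java against the CarbonProperties API used in these tests. That getProperty returns the previously set value is an assumption of this sketch.

```java
// Back up the current bad-records action, force FAIL for this test, and
// restore the original value afterwards so other suites are unaffected.
String backup = CarbonProperties.getInstance()
    .getProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION);
CarbonProperties.getInstance()
    .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, "FAIL");
try {
  // ... test body ...
} finally {
  if (backup != null) {
    CarbonProperties.getInstance()
        .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, backup);
  }
}
```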
[GitHub] carbondata pull request #2983: [CARBONDATA-3119] Fixed SDK Write for Complex...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2983#discussion_r240889108 --- Diff: integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/complexType/TestComplexDataType.scala --- @@ -39,6 +39,27 @@ class TestComplexDataType extends QueryTest with BeforeAndAfterAll { .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, badRecordAction) } + test("test Projection PushDown for Array - String type when Array is Empty") { +CarbonProperties.getInstance() + .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, "FAIL") +sql("drop table if exists table1") +sql("create table table1 (detail array) stored by 'carbondata'") +sql("insert into table1 values('')") +checkAnswer(sql("select detail[0] from table1"), Seq(Row(""))) +sql("drop table if exists table1") + } + + test("test Projection PushDown for Struct - Array type when Array is Empty") { +CarbonProperties.getInstance() + .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, "FAIL") --- End diff -- same as above ---
[jira] [Created] (CARBONDATA-3163) If tables have different time formats, data in no_sort columns goes as bad records (null) for the second table when it is loaded after the first table.
Ajantha Bhat created CARBONDATA-3163: Summary: If tables have different time formats, data in no_sort columns goes as bad records (null) for the second table when it is loaded after the first table. Key: CARBONDATA-3163 URL: https://issues.apache.org/jira/browse/CARBONDATA-3163 Project: CarbonData Issue Type: Bug Reporter: Ajantha Bhat Assignee: Ajantha Bhat If tables have different time formats, data in no_sort columns goes as bad records (null) for the second table when it is loaded after the first table. FilterProcessorTestCase.test("Between filter") -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Created] (CARBONDATA-3164) During no_sort, an exception that happens at the converter step does not reach the user. Same problem in SDK and Spark file format flows also.
Ajantha Bhat created CARBONDATA-3164: Summary: During no_sort, an exception that happens at the converter step does not reach the user. Same problem in SDK and Spark file format flows also. Key: CARBONDATA-3164 URL: https://issues.apache.org/jira/browse/CARBONDATA-3164 Project: CarbonData Issue Type: Bug Reporter: Ajantha Bhat Assignee: Ajantha Bhat During no_sort, an exception that happens at the converter step does not reach the user. Same problem in SDK and Spark file format flows also. TestLoadDataGeneral.test("test load / insert / update with data more than 32000 bytes - dictionary_exclude") -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Created] (CARBONDATA-3162) Range filters don't remove null values for no_sort direct dictionary dimension columns.
Ajantha Bhat created CARBONDATA-3162: Summary: Range filters don't remove null values for no_sort direct dictionary dimension columns. Key: CARBONDATA-3162 URL: https://issues.apache.org/jira/browse/CARBONDATA-3162 Project: CarbonData Issue Type: Bug Reporter: Ajantha Bhat Assignee: Ajantha Bhat Range filters don't remove null values for no_sort direct dictionary dimension columns. TimestampDataTypeDirectDictionaryTest.test("test timestamp with dictionary include and no_inverted index") -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[GitHub] carbondata issue #2982: [CARBONDATA-3158] support presto-carbon to read sdk ...
Github user ajantha-bhat commented on the issue: https://github.com/apache/carbondata/pull/2982 @ravipesala , @jackylk : PR is ready. please review ---
[GitHub] carbondata pull request #2982: [CARBONDATA-3158] support presto-carbon to re...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2982#discussion_r240477368 --- Diff: integration/presto/src/main/java/org/apache/carbondata/presto/impl/CarbonTableReader.java --- @@ -364,23 +355,38 @@ private CarbonTable parseCarbonMetadata(SchemaTableName table) { String tablePath = storePath + "/" + carbonTableIdentifier.getDatabaseName() + "/" + carbonTableIdentifier.getTableName(); - //Step 2: read the metadata (tableInfo) of the table. - ThriftReader.TBaseCreator createTBase = new ThriftReader.TBaseCreator() { -// TBase is used to read and write thrift objects. -// TableInfo is a kind of TBase used to read and write table information. -// TableInfo is generated by thrift, -// see schema.thrift under format/src/main/thrift for details. -public TBase create() { - return new org.apache.carbondata.format.TableInfo(); + String metadataPath = CarbonTablePath.getSchemaFilePath(tablePath); + boolean isTransactionalTable = false; + try { +if (FileFactory.getCarbonFile(metadataPath) --- End diff -- hmm. ok. ---
[GitHub] carbondata pull request #2982: [CARBONDATA-3158] support presto-carbon to re...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2982#discussion_r240475287 --- Diff: integration/presto/src/main/java/org/apache/carbondata/presto/impl/CarbonTableReader.java --- @@ -364,23 +355,38 @@ private CarbonTable parseCarbonMetadata(SchemaTableName table) { String tablePath = storePath + "/" + carbonTableIdentifier.getDatabaseName() + "/" + carbonTableIdentifier.getTableName(); - //Step 2: read the metadata (tableInfo) of the table. - ThriftReader.TBaseCreator createTBase = new ThriftReader.TBaseCreator() { -// TBase is used to read and write thrift objects. -// TableInfo is a kind of TBase used to read and write table information. -// TableInfo is generated by thrift, -// see schema.thrift under format/src/main/thrift for details. -public TBase create() { - return new org.apache.carbondata.format.TableInfo(); + String metadataPath = CarbonTablePath.getSchemaFilePath(tablePath); + boolean isTransactionalTable = false; + try { +if (FileFactory.getCarbonFile(metadataPath) +.isFileExist(metadataPath, FileFactory.getFileType(metadataPath))) { + // If metadata folder exists, it is a transactional table + isTransactionalTable = true; } - }; - ThriftReader thriftReader = - new ThriftReader(CarbonTablePath.getSchemaFilePath(tablePath), createTBase); - thriftReader.open(); - org.apache.carbondata.format.TableInfo tableInfo = - (org.apache.carbondata.format.TableInfo) thriftReader.read(); - thriftReader.close(); - + } catch (IOException e) { +throw new RuntimeException(e); + } + org.apache.carbondata.format.TableInfo tableInfo; + if (isTransactionalTable) { +//Step 2: read the metadata (tableInfo) of the table. +ThriftReader.TBaseCreator createTBase = new ThriftReader.TBaseCreator() { + // TBase is used to read and write thrift objects. + // TableInfo is a kind of TBase used to read and write table information. + // TableInfo is generated by thrift, + // see schema.thrift under format/src/main/thrift for details. + public TBase create() { +return new org.apache.carbondata.format.TableInfo(); + } +}; +ThriftReader thriftReader = +new ThriftReader(CarbonTablePath.getSchemaFilePath(tablePath), createTBase); +thriftReader.open(); +tableInfo = (org.apache.carbondata.format.TableInfo) thriftReader.read(); +thriftReader.close(); + } else { +tableInfo = +CarbonUtil.inferSchema(tablePath, table.getTableName(), false, new Configuration()); --- End diff -- I have tested it and it works, but it is better to use FileFactory.getConfiguration(). I will change to that. ---
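The change being agreed to, as a one-line sketch against the inferSchema call shown in the diff:

```java
// Use the factory-managed Hadoop configuration instead of a fresh one.
tableInfo = CarbonUtil.inferSchema(
    tablePath, table.getTableName(), false, FileFactory.getConfiguration());
```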
[GitHub] carbondata pull request #2982: [CARBONDATA-3158] support presto-carbon to re...
GitHub user ajantha-bhat opened a pull request: https://github.com/apache/carbondata/pull/2982 [CARBONDATA-3158] support presto-carbon to read SDK carbon files **problem:** Currently, carbon SDK files output (files without metadata folder and its contents) are read by spark using an external table with carbon session. But presto carbon integration doesn't support that. It can currently read only the transactional table output files. **solution:** Hence we can enhance presto to read SDK output files. This will increase the use cases for presto-carbon integration. The above scenario can be achieved by inferring the schema if the Metadata folder does not exist and setting the read committed scope to LatestFilesReadCommittedScope when non-transactional table output files are present. Be sure to do all of the following checklist to help us incorporate your contribution quickly and easily: - [ ] Any interfaces changed? NA - [ ] Any backward compatibility impacted? NA - [ ] Document update required? NA - [ ] Testing done. yes, added UT - [ ] For large changes, please consider breaking it into sub-tasks under an umbrella JIRA. NA You can merge this pull request into a Git repository by running: $ git pull https://github.com/ajantha-bhat/carbondata presto Alternatively you can review and apply these changes as the patch at: https://github.com/apache/carbondata/pull/2982.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #2982 commit e54253eff4546945a8c5a2543ac5ed5aa12df85b Author: ajantha-bhat Date: 2018-12-07T13:07:10Z presto with sdk ---
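In outline, the detection step this PR adds looks like the sketch below, condensed from the diffs quoted earlier in this thread; exception handling is elided.

```java
// Decide transactional vs. SDK (non-transactional) output by probing for the
// schema file under the Metadata folder.
String metadataPath = CarbonTablePath.getSchemaFilePath(tablePath);
boolean isTransactionalTable = FileFactory.getCarbonFile(metadataPath)
    .isFileExist(metadataPath, FileFactory.getFileType(metadataPath));
// true  -> read the thrift TableInfo via ThriftReader, as before
// false -> SDK files: infer the schema and use LatestFilesReadCommittedScope
```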
[jira] [Created] (CARBONDATA-3158) support presto-carbon to read SDK carbon files
Ajantha Bhat created CARBONDATA-3158: Summary: support presto-carbon to read SDK carbon files Key: CARBONDATA-3158 URL: https://issues.apache.org/jira/browse/CARBONDATA-3158 Project: CarbonData Issue Type: Improvement Reporter: Ajantha Bhat Assignee: Ajantha Bhat Currently, carbon SDK files output (files without metadata folder and its contents) are read by spark using an external table with carbon session. But presto carbon integration doesn't support that. It can currently read only the transactional table output files. Hence we can enhance presto to read SDK output files. This will increase the use cases for presto-carbon integration. The above scenario can be achieved by inferring the schema if the Metadata folder does not exist and setting the read committed scope to LatestFilesReadCommittedScope when non-transactional table output files are present. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[GitHub] carbondata issue #2966: [WIP] test and check no sort by default
Github user ajantha-bhat commented on the issue: https://github.com/apache/carbondata/pull/2966 retest this please ---
[GitHub] carbondata pull request #2966: [WIP] test and check no sort by default
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2966#discussion_r237747880 --- Diff: integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/datamap/DataMapWriterSuite.scala --- @@ -156,8 +156,7 @@ class DataMapWriterSuite extends QueryTest with BeforeAndAfterAll { CarbonProperties.getInstance() .addProperty("carbon.blockletgroup.size.in.mb", "16") CarbonProperties.getInstance() - .addProperty("carbon.number.of.cores.while.loading", -CarbonCommonConstants.NUM_CORES_DEFAULT_VAL) + .addProperty("carbon.number.of.cores.while.loading", "2") --- End diff -- And @qiuchenjian : If you have a doubt in PR changes, you can ask it in comment sections. Don't add as a review comment. Just add as a comment. ---
[GitHub] carbondata pull request #2966: [WIP] test and check no sort by default
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2966#discussion_r237747693 --- Diff: integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/datamap/DataMapWriterSuite.scala --- @@ -156,8 +156,7 @@ class DataMapWriterSuite extends QueryTest with BeforeAndAfterAll { CarbonProperties.getInstance() .addProperty("carbon.blockletgroup.size.in.mb", "16") CarbonProperties.getInstance() - .addProperty("carbon.number.of.cores.while.loading", -CarbonCommonConstants.NUM_CORES_DEFAULT_VAL) + .addProperty("carbon.number.of.cores.while.loading", "2") --- End diff -- CarbonCommonConstants.NUM_CORES_DEFAULT_VAL was removed, so I am setting CarbonCommonConstants.NUM_CORES_LOADING to 2. This is test code. ---
[GitHub] carbondata pull request #2964: [HOTFIX] Fix ArrayOutOfBound exception when d...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2964#discussion_r237548880 --- Diff: store/sdk/src/test/java/org/apache/carbondata/sdk/file/CarbonReaderTest.java --- @@ -575,7 +575,7 @@ public void testReadColumnTwice() throws IOException, InterruptedException { CarbonReader reader = CarbonReader .builder(path, "_temp") -.projection(new String[]{"name", "name", "age", "name"}) +.projection(new String[]{"name", "age", "age", "name"}) --- End diff -- @qiuchenjian: No, age is an int column (a measure); check the schema in the writer builder. No need for a new test case; this test case now covers duplicate measures and dimensions. ---
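For context, the scenario that test exercises looks roughly like this; the reader calls follow the diff, while the path is a placeholder:

```java
// Reading with a projection that repeats both a dimension ("name") and a
// measure ("age"); the duplicated measure previously caused ArrayOutOfBound.
CarbonReader reader = CarbonReader
    .builder("/tmp/carbon-out", "_temp")
    .projection(new String[]{"name", "age", "age", "name"})
    .build();
while (reader.hasNext()) {
  Object[] row = (Object[]) reader.readNextRow();
  // row[1] and row[2] both carry the "age" value
}
reader.close();
```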
[GitHub] carbondata pull request #2966: [WIP] test and check no sort by default
GitHub user ajantha-bhat opened a pull request: https://github.com/apache/carbondata/pull/2966 [WIP] test and check no sort by default [WIP] test and check no sort by default Be sure to do all of the following checklist to help us incorporate your contribution quickly and easily: - [ ] Any interfaces changed? - [ ] Any backward compatibility impacted? - [ ] Document update required? - [ ] Testing done Please provide details on - Whether new unit test cases have been added or why no new tests are required? - How it is tested? Please attach test report. - Is it a performance related change? Please attach the performance test report. - Any additional information to help reviewers in testing this change. - [ ] For large changes, please consider breaking it into sub-tasks under an umbrella JIRA. You can merge this pull request into a Git repository by running: $ git pull https://github.com/ajantha-bhat/carbondata optimize Alternatively you can review and apply these changes as the patch at: https://github.com/apache/carbondata/pull/2966.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #2966 commit 09410167efee56d69a3c18433c8e6e97a2fe18eb Author: ajantha-bhat Date: 2018-11-29T15:59:34Z no sort by default ---
[GitHub] carbondata issue #2964: [HOTFIX] Fix ArrayOutOfBound exception when duplicat...
Github user ajantha-bhat commented on the issue: https://github.com/apache/carbondata/pull/2964 @kunal642 , @kumarvishal09 : please check this ---
[GitHub] carbondata pull request #2964: [HOTFIX] Fix ArrayOutOfBound exception when d...
GitHub user ajantha-bhat opened a pull request: https://github.com/apache/carbondata/pull/2964 [HOTFIX] Fix ArrayOutOfBound exception when duplicate measure in projection column problem: ArrayOutOfBound exception when a duplicate measure appears in the projection columns. cause: In the query executor, when the reusable buffer was formed, only the unique values were considered; all the projections need to be considered. solution: Consider all the projections while forming the reusable buffer. Be sure to do all of the following checklist to help us incorporate your contribution quickly and easily: - [ ] Any interfaces changed? NA - [ ] Any backward compatibility impacted? NA - [ ] Document update required? NA - [ ] Testing done yes, updated UT. - [ ] For large changes, please consider breaking it into sub-tasks under an umbrella JIRA. NA You can merge this pull request into a Git repository by running: $ git pull https://github.com/ajantha-bhat/carbondata sdk Alternatively you can review and apply these changes as the patch at: https://github.com/apache/carbondata/pull/2964.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #2964 commit ca6fd01b92844f43f2472ccf3f17d498bbd73216 Author: ajantha-bhat Date: 2018-11-29T11:42:56Z fix ArrayOutOfBound exception when duplicate measure in projection column ---
[GitHub] carbondata pull request #2936: [CARBONDATA-3118] Parallelize block pruning o...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2936#discussion_r237174006 --- Diff: core/src/main/java/org/apache/carbondata/core/datamap/TableDataMap.java --- @@ -63,6 +75,8 @@ private SegmentPropertiesFetcher segmentPropertiesFetcher; + private static final Log LOG = LogFactory.getLog(TableDataMap.class); --- End diff -- ok ---
[GitHub] carbondata pull request #2936: [CARBONDATA-3118] Parallelize block pruning o...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2936#discussion_r237173860 --- Diff: core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java --- @@ -1399,6 +1399,17 @@ private CarbonCommonConstants() { public static final String CARBON_PUSH_ROW_FILTERS_FOR_VECTOR_DEFAULT = "false"; + /** + * max driver threads used for block pruning [1 to 4 threads] + */ + @CarbonProperty public static final String CARBON_MAX_DRIVER_THREADS_FOR_BLOCK_PRUNING = + "carbon.max.driver.threads.for.block.pruning"; --- End diff -- I have another PR for non-default datamaps; I will check this there. I feel this name is also OK. ---
[GitHub] carbondata pull request #2936: [CARBONDATA-3118] Parallelize block pruning o...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2936#discussion_r237173725 --- Diff: core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java --- @@ -1399,6 +1399,17 @@ private CarbonCommonConstants() { public static final String CARBON_PUSH_ROW_FILTERS_FOR_VECTOR_DEFAULT = "false"; + /** + * max driver threads used for block pruning [1 to 4 threads] + */ + @CarbonProperty public static final String CARBON_MAX_DRIVER_THREADS_FOR_BLOCK_PRUNING = + "carbon.max.driver.threads.for.block.pruning"; + + public static final String CARBON_MAX_DRIVER_THREADS_FOR_BLOCK_PRUNING_DEFAULT = "4"; + + // block prune in multi-thread if files size more than 100K files. + public static final int CARBON_DRIVER_PRUNING_MULTI_THREAD_ENABLE_FILES_COUNT = 10; --- End diff -- Because the driver doing multi-threaded pruning by default may impact concurrent queries. Also, testing showed that pruning 100k datamap entries takes about 1 second, so multi-threading is used only when block pruning would take more than a second. ---
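Concretely, the knob discussed here can be set like any other carbon property; the property name and its 1-to-4 range come from the diff above:

```java
// Raise the driver-side pruning parallelism to its maximum of 4 threads.
CarbonProperties.getInstance()
    .addProperty("carbon.max.driver.threads.for.block.pruning", "4");
```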
[GitHub] carbondata pull request #2936: [CARBONDATA-3118] Parallelize block pruning o...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2936#discussion_r237173126 --- Diff: hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonInputFormat.java --- @@ -487,6 +487,8 @@ private int getBlockCount(List blocklets) { // First prune using default datamap on driver side. TableDataMap defaultDataMap = DataMapStoreManager.getInstance().getDefaultDataMap(carbonTable); List prunedBlocklets = null; +// This is to log the event, so user will know what is happening by seeing logs. +LOG.info("Started block pruning ..."); --- End diff -- The log will anyway have a timestamp; we can subtract the start time from the stop time. I have another PR for non-default datamaps; I will check this there. ---
[GitHub] carbondata issue #2962: [CARBONDATA-3138] Fix random count mismatch with mul...
Github user ajantha-bhat commented on the issue: https://github.com/apache/carbondata/pull/2962 @ravipesala , @kumarvishal09 : please check this ---
[GitHub] carbondata pull request #2962: [CARBONDATA-3138] Fix random count mismatch w...
GitHub user ajantha-bhat opened a pull request: https://github.com/apache/carbondata/pull/2962 [CARBONDATA-3138] Fix random count mismatch with multi-thread block pruning problem: Random count mismatch in query in multi-thread block-pruning scenario. cause: The existing prune method was not meant for multi-threading, as synchronization was missing. Only in the implicit filter scenario, while preparing the block ID list, synchronization was missing; hence pruning was giving a wrong result. solution: Synchronize the implicit filter preparation, as prune is now called from multiple threads. Be sure to do all of the following checklist to help us incorporate your contribution quickly and easily: - [ ] Any interfaces changed? NA - [ ] Any backward compatibility impacted? NA - [ ] Document update required? NA - [ ] Testing done done with huge data - [ ] For large changes, please consider breaking it into sub-tasks under an umbrella JIRA. You can merge this pull request into a Git repository by running: $ git pull https://github.com/ajantha-bhat/carbondata issue_fix Alternatively you can review and apply these changes as the patch at: https://github.com/apache/carbondata/pull/2962.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #2962 commit 09a469e926d43b4992084e5dc1727899ee04366f Author: ajantha-bhat Date: 2018-11-28T13:48:16Z random count mismatch with multi-thread block pruning ---
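The shape of the fix described above, sketched with invented names; the real change lives inside the implicit-filter preparation path rather than in a standalone class.

```java
import java.util.ArrayList;
import java.util.List;

// Shared state built during implicit-filter preparation; prune() may now run
// from several threads, so mutation of the list must be synchronized.
final class ImplicitFilterState {
  private final List<String> blockIds = new ArrayList<>();

  void addBlockId(String id) {
    synchronized (blockIds) { // unsynchronized before, causing wrong counts
      blockIds.add(id);
    }
  }
}
```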
[jira] [Created] (CARBONDATA-3138) Random count mismatch in query in multi-thread block-pruning scenario
Ajantha Bhat created CARBONDATA-3138: Summary: Random count mismatch in query in multi-thread block-pruning scenario Key: CARBONDATA-3138 URL: https://issues.apache.org/jira/browse/CARBONDATA-3138 Project: CarbonData Issue Type: Bug Reporter: Ajantha Bhat Assignee: Ajantha Bhat problem: Random count mismatch in query in multi-thread block-pruning scenario. cause: The existing prune method was not meant for multi-threading, as synchronization was missing. Only in the implicit filter scenario, while preparing the block ID list, synchronization was missing; hence pruning was giving a wrong result. solution: Synchronize the implicit filter preparation, as prune is now called from multiple threads. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[GitHub] carbondata pull request #2949: [WIP] support parallel block pruning for non-...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2949#discussion_r236746764 --- Diff: core/src/main/java/org/apache/carbondata/core/datamap/dev/DataMap.java --- @@ -70,4 +70,6 @@ void init(DataMapModel dataMapModel) */ void finish(); + // can return , number of records information that are stored in datamap. --- End diff -- ok, changed to just "returns" ---
[GitHub] carbondata pull request #2958: [CARBONDATA-3136] JVM crash with preaggregate...
GitHub user ajantha-bhat opened a pull request: https://github.com/apache/carbondata/pull/2958 [CARBONDATA-3136] JVM crash with preaggregate datamap when average of decimal column is taken with orderby. problem: JVM crash with preaggregate datamap when the average of a decimal column is taken with order by. cause: When preparing the plan with the preaggregate datamap, decimal is cast to double in the average expression. This was leading to a JVM crash in Spark, as we were filling the vector with the wrong precision (callstack mentioned in the JIRA). solution: The division result of the average should be cast to decimal instead of double for the decimal datatype. Be sure to do all of the following checklist to help us incorporate your contribution quickly and easily: - [ ] Any interfaces changed? NA - [ ] Any backward compatibility impacted? NA - [ ] Document update required? NA - [ ] Testing done? Yes, added UT - [ ] For large changes, please consider breaking it into sub-tasks under an umbrella JIRA. NA You can merge this pull request into a Git repository by running: $ git pull https://github.com/ajantha-bhat/carbondata issue_fix Alternatively you can review and apply these changes as the patch at: https://github.com/apache/carbondata/pull/2958.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #2958 commit 8d95838e5d5991d7c355944d40a54972ea1c1424 Author: ajantha-bhat Date: 2018-11-27T14:07:49Z jvm crash when query pre-aggreagte table with avg(decimal_column) and order by ---
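To see why the cast matters, a small self-contained Java illustration (example values only; the actual fix is a Catalyst plan rewrite in the Spark integration layer, not this code): dividing the sum as double discards the column's declared precision and scale, while dividing as BigDecimal preserves them:

import java.math.BigDecimal;
import java.math.RoundingMode;

public final class DecimalAvg {
  public static void main(String[] args) {
    BigDecimal sum = new BigDecimal("452.5640000000000000"); // a decimal(30,16)-style value
    long count = 1;
    // Wrong: going through double loses the decimal type information.
    double asDouble = sum.doubleValue() / count;
    // Right: keep the division in decimal, with the column's scale.
    BigDecimal asDecimal = sum.divide(BigDecimal.valueOf(count), 16, RoundingMode.HALF_UP);
    System.out.println(asDouble + " vs " + asDecimal);
  }
}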
[jira] [Updated] (CARBONDATA-3136) JVM crash with preaggregate datamap
[ https://issues.apache.org/jira/browse/CARBONDATA-3136?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Ajantha Bhat updated CARBONDATA-3136: - Description: JVM crash with preaggregate datamap.
callstack:
Stack: [0x7efebd49a000,0x7efebd59b000], sp=0x7efebd598dc8, free space=1019k
Native frames: (J=compiled Java code, j=interpreted, Vv=VM code, C=native code)
V [libjvm.so+0x7b2b50]
J 7620 sun.misc.Unsafe.copyMemory(Ljava/lang/Object;JLjava/lang/Object;JJ)V (0 bytes) @ 0x7eff4a3479e1 [0x7eff4a347900+0xe1]
j org.apache.spark.unsafe.Platform.copyMemory(Ljava/lang/Object;JLjava/lang/Object;JJ)V+34
j org.apache.spark.sql.catalyst.expressions.UnsafeRow.getBinary(I)[B+54
j org.apache.spark.sql.catalyst.expressions.UnsafeRow.getDecimal(III)Lorg/apache/spark/sql/types/Decimal;+30
j org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Lorg/apache/spark/sql/catalyst/InternalRow;)Lorg/apache/spark/sql/catalyst/expressions/UnsafeRow;+36
j org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Ljava/lang/Object;)Ljava/lang/Object;+5
J 3104 C1 scala.collection.Iterator$$anon$11.next()Ljava/lang/Object; (19 bytes) @ 0x7eff49154724 [0x7eff49154560+0x1c4]
j org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(Lscala/collection/Iterator;)Lscala/collection/Iterator;+78
j org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(Ljava/lang/Object;)Ljava/lang/Object;+5
J 14007 C1 org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object; (17 bytes) @ 0x7eff4a6ed204 [0x7eff4a6ecc40+0x5c4]
J 11684 C1 org.apache.spark.rdd.MapPartitionsRDD.compute(Lorg/apache/spark/Partition;Lorg/apache/spark/TaskContext;)Lscala/collection/Iterator; (36 bytes) @ 0x7eff4ad11274 [0x7eff4ad10f60+0x314]
J 13771 C1 org.apache.spark.rdd.RDD.iterator(Lorg/apache/spark/Partition;Lorg/apache/spark/TaskContext;)Lscala/collection/Iterator; (46 bytes) @ 0x7eff4b39dd3c [0x7eff4b39d160+0xbdc]
reproducing test:
test("Test Pre_aggregate with decimal column with order by") {
  sql("drop table if exists maintable")
  sql("create table maintable(name string, decimal_col decimal(30,16)) stored by 'carbondata'")
  sql("insert into table maintable select 'abc',452.564")
  sql("create datamap ag1 on table maintable using 'preaggregate' as select name,avg(decimal_col)" +
    " from maintable group by name")
  checkAnswer(sql("select avg(decimal_col) from maintable group by name order by name"),
    Seq(Row(452.5640)))
}
was: JVM crash with preaggregate datamap. callstack: (same stack trace as above)
[jira] [Created] (CARBONDATA-3136) JVM crash with preaggregate datamap
Ajantha Bhat created CARBONDATA-3136: Summary: JVM crash with preaggregate datamap Key: CARBONDATA-3136 URL: https://issues.apache.org/jira/browse/CARBONDATA-3136 Project: CarbonData Issue Type: Improvement Reporter: Ajantha Bhat Assignee: Ajantha Bhat JVM crash with preaggregate datamap.
callstack:
Stack: [0x7efebd49a000,0x7efebd59b000], sp=0x7efebd598dc8, free space=1019k
Native frames: (J=compiled Java code, j=interpreted, Vv=VM code, C=native code)
V [libjvm.so+0x7b2b50]
J 7620 sun.misc.Unsafe.copyMemory(Ljava/lang/Object;JLjava/lang/Object;JJ)V (0 bytes) @ 0x7eff4a3479e1 [0x7eff4a347900+0xe1]
j org.apache.spark.unsafe.Platform.copyMemory(Ljava/lang/Object;JLjava/lang/Object;JJ)V+34
j org.apache.spark.sql.catalyst.expressions.UnsafeRow.getBinary(I)[B+54
j org.apache.spark.sql.catalyst.expressions.UnsafeRow.getDecimal(III)Lorg/apache/spark/sql/types/Decimal;+30
j org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Lorg/apache/spark/sql/catalyst/InternalRow;)Lorg/apache/spark/sql/catalyst/expressions/UnsafeRow;+36
j org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Ljava/lang/Object;)Ljava/lang/Object;+5
J 3104 C1 scala.collection.Iterator$$anon$11.next()Ljava/lang/Object; (19 bytes) @ 0x7eff49154724 [0x7eff49154560+0x1c4]
j org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(Lscala/collection/Iterator;)Lscala/collection/Iterator;+78
j org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(Ljava/lang/Object;)Ljava/lang/Object;+5
J 14007 C1 org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object; (17 bytes) @ 0x7eff4a6ed204 [0x7eff4a6ecc40+0x5c4]
J 11684 C1 org.apache.spark.rdd.MapPartitionsRDD.compute(Lorg/apache/spark/Partition;Lorg/apache/spark/TaskContext;)Lscala/collection/Iterator; (36 bytes) @ 0x7eff4ad11274 [0x7eff4ad10f60+0x314]
J 13771 C1 org.apache.spark.rdd.RDD.iterator(Lorg/apache/spark/Partition;Lorg/apache/spark/TaskContext;)Lscala/collection/Iterator; (46 bytes) @ 0x7eff4b39dd3c [0x7eff4b39d160+0xbdc]
-- This message was sent by Atlassian JIRA (v7.6.3#76005)
[GitHub] carbondata issue #2936: [CARBONDATA-3118] Parallelize block pruning of defau...
Github user ajantha-bhat commented on the issue: https://github.com/apache/carbondata/pull/2936 retest this please ---
[GitHub] carbondata issue #2936: [CARBONDATA-3118] Parallelize block pruning of defau...
Github user ajantha-bhat commented on the issue: https://github.com/apache/carbondata/pull/2936 Resolved a conflict with the documentation file. Let the build run again. ---
[GitHub] carbondata pull request #2949: [WIP] support parallel block pruning for non-...
GitHub user ajantha-bhat opened a pull request: https://github.com/apache/carbondata/pull/2949 [WIP] support parallel block pruning for non-default datamaps This PR depends on #2936 Be sure to do all of the following checklist to help us incorporate your contribution quickly and easily: - [ ] Any interfaces changed? - [ ] Any backward compatibility impacted? - [ ] Document update required? - [ ] Testing done Please provide details on - Whether new unit test cases have been added or why no new tests are required? - How it is tested? Please attach test report. - Is it a performance related change? Please attach the performance test report. - Any additional information to help reviewers in testing this change. - [ ] For large changes, please consider breaking it into sub-tasks under an umbrella JIRA. You can merge this pull request into a Git repository by running: $ git pull https://github.com/ajantha-bhat/carbondata working_backup Alternatively you can review and apply these changes as the patch at: https://github.com/apache/carbondata/pull/2949.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #2949 commit 6237d69fcc0ddc1a08c74579762b721108a251fe Author: ajantha-bhat Date: 2018-11-20T16:45:06Z parllelize block pruning commit e8e912daf3ada357352e006ec9ce435d4c4b1625 Author: ajantha-bhat Date: 2018-11-22T11:01:53Z reveiw comment fix commit d0bf82f276618f6fa09cbce65f714394b5fa4e0c Author: ajantha-bhat Date: 2018-11-23T13:22:07Z support parallel pruning for non-default datamaps ---
[GitHub] carbondata issue #2936: [CARBONDATA-3118] Parallelize block pruning of defau...
Github user ajantha-bhat commented on the issue: https://github.com/apache/carbondata/pull/2936 @ravipesala : PR is ready. please check ---
[GitHub] carbondata pull request #2936: [CARBONDATA-3118] Parallelize block pruning o...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2936#discussion_r235846200 --- Diff: core/src/main/java/org/apache/carbondata/core/datamap/TableDataMap.java --- @@ -120,37 +132,166 @@ public BlockletDetailsFetcher getBlockletDetailsFetcher() { * @param filterExp * @return */ - public List prune(List segments, FilterResolverIntf filterExp, - List partitions) throws IOException { -List blocklets = new ArrayList<>(); -SegmentProperties segmentProperties; -Map> dataMaps = dataMapFactory.getDataMaps(segments); + public List prune(List segments, final FilterResolverIntf filterExp, + final List partitions) throws IOException { +final List blocklets = new ArrayList<>(); +final Map> dataMaps = dataMapFactory.getDataMaps(segments); +// for non-filter queries +if (filterExp == null) { + // if filter is not passed, then return all the blocklets. + return pruneWithoutFilter(segments, partitions, blocklets); +} +// for filter queries +int totalFiles = 0; +boolean isBlockDataMapType = true; +for (Segment segment : segments) { + for (DataMap dataMap : dataMaps.get(segment)) { +if (!(dataMap instanceof BlockDataMap)) { --- End diff -- For this one, I have to figure out the number of entries in all kinds of datamaps and need to test those scenarios. I will handle it in the next PR ---
[GitHub] carbondata pull request #2936: [CARBONDATA-3118] Parallelize block pruning o...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2936#discussion_r235842114 --- Diff: core/src/main/java/org/apache/carbondata/core/datamap/TableDataMap.java --- @@ -120,37 +132,166 @@ public BlockletDetailsFetcher getBlockletDetailsFetcher() { * @param filterExp * @return */ - public List prune(List segments, FilterResolverIntf filterExp, - List partitions) throws IOException { -List blocklets = new ArrayList<>(); -SegmentProperties segmentProperties; -Map> dataMaps = dataMapFactory.getDataMaps(segments); + public List prune(List segments, final FilterResolverIntf filterExp, + final List partitions) throws IOException { +final List blocklets = new ArrayList<>(); +final Map> dataMaps = dataMapFactory.getDataMaps(segments); +// for non-filter queries +if (filterExp == null) { + // if filter is not passed, then return all the blocklets. + return pruneWithoutFilter(segments, partitions, blocklets); +} +// for filter queries +int totalFiles = 0; +boolean isBlockDataMapType = true; +for (Segment segment : segments) { + for (DataMap dataMap : dataMaps.get(segment)) { +if (!(dataMap instanceof BlockDataMap)) { + isBlockDataMapType = false; + break; +} +totalFiles += ((BlockDataMap) dataMap).getTotalBlocks(); + } + if (!isBlockDataMapType) { +// totalFiles fill be 0 for non-BlockDataMap Type. ex: lucene, bloom datamap. use old flow. +break; + } +} +int numOfThreadsForPruning = getNumOfThreadsForPruning(); +int filesPerEachThread = totalFiles / numOfThreadsForPruning; +if (numOfThreadsForPruning == 1 || filesPerEachThread == 1 +|| segments.size() < numOfThreadsForPruning || totalFiles +< CarbonCommonConstants.CARBON_DRIVER_PRUNING_MULTI_THREAD_ENABLE_FILES_COUNT) { + // use multi-thread, only if the files are more than 0.1 million. + // As 0.1 million files block pruning can take only 1 second. + // Doing multi-thread for smaller values is not recommended as + // driver should have minimum threads opened to support multiple concurrent queries. 
+ return pruneWithFilter(segments, filterExp, partitions, blocklets, dataMaps); +} +// handle by multi-thread +return pruneWithFilterMultiThread(segments, filterExp, partitions, blocklets, dataMaps, +totalFiles); + } + + private List pruneWithoutFilter(List segments, + List partitions, List blocklets) throws IOException { +for (Segment segment : segments) { + List allBlocklets = blockletDetailsFetcher.getAllBlocklets(segment, partitions); + blocklets.addAll( + addSegmentId(blockletDetailsFetcher.getExtendedBlocklets(allBlocklets, segment), + segment.toString())); +} +return blocklets; + } + + private List pruneWithFilter(List segments, + FilterResolverIntf filterExp, List partitions, + List blocklets, Map> dataMaps) throws IOException { for (Segment segment : segments) { List pruneBlocklets = new ArrayList<>(); - // if filter is not passed then return all the blocklets - if (filterExp == null) { -pruneBlocklets = blockletDetailsFetcher.getAllBlocklets(segment, partitions); - } else { -segmentProperties = segmentPropertiesFetcher.getSegmentProperties(segment); -for (DataMap dataMap : dataMaps.get(segment)) { - pruneBlocklets.addAll(dataMap.prune(filterExp, segmentProperties, partitions)); + SegmentProperties segmentProperties = segmentPropertiesFetcher.getSegmentProperties(segment); + for (DataMap dataMap : dataMaps.get(segment)) { +pruneBlocklets.addAll(dataMap.prune(filterExp, segmentProperties, partitions)); + } + blocklets.addAll( + addSegmentId(blockletDetailsFetcher.getExtendedBlocklets(pruneBlocklets, segment), + segment.toString())); +} +return blocklets; + } + + private List pruneWithFilterMultiThread(List segments, + final FilterResolverIntf filterExp, final List partitions, + List blocklets, final Map> dataMaps, + int totalFiles) { +int numOfThreadsForPruning = getNumOfThreadsForPruning(); +int filesPerEachThread = (int) Math.ceil((double)totalFiles / numOfThreadsForPruning); +int prev = 0; +int filesCount = 0; +int processedFileCount = 0; +List> segmentList = new ArrayList<>(); --- End diff -- done ---
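Restating the decision in the quoted diff as a small standalone sketch (the threshold constant is assumed from the review discussion, where pruning ~100k files takes roughly 1 second; the quoted diff names it CARBON_DRIVER_PRUNING_MULTI_THREAD_ENABLE_FILES_COUNT):

final class PruneModeSketch {
  static final int MULTI_THREAD_ENABLE_FILES_COUNT = 100_000; // assumption from the discussion

  static boolean useMultiThreadPruning(int numThreads, int totalFiles, int numSegments) {
    if (numThreads == 1) return false;                     // nothing to parallelize
    if (totalFiles / numThreads <= 1) return false;        // too little work per thread
    if (numSegments < numThreads) return false;            // fewer segments than threads
    return totalFiles >= MULTI_THREAD_ENABLE_FILES_COUNT;  // below this, single-thread is ~1s anyway
  }
}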
[GitHub] carbondata pull request #2936: [CARBONDATA-3118] Parallelize block pruning o...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2936#discussion_r235774840 --- Diff: core/src/main/java/org/apache/carbondata/core/datamap/TableDataMap.java --- @@ -120,37 +132,166 @@ public BlockletDetailsFetcher getBlockletDetailsFetcher() { * @param filterExp * @return */ - public List prune(List segments, FilterResolverIntf filterExp, - List partitions) throws IOException { -List blocklets = new ArrayList<>(); -SegmentProperties segmentProperties; -Map> dataMaps = dataMapFactory.getDataMaps(segments); + public List prune(List segments, final FilterResolverIntf filterExp, + final List partitions) throws IOException { +final List blocklets = new ArrayList<>(); +final Map> dataMaps = dataMapFactory.getDataMaps(segments); +// for non-filter queries +if (filterExp == null) { + // if filter is not passed, then return all the blocklets. + return pruneWithoutFilter(segments, partitions, blocklets); +} +// for filter queries +int totalFiles = 0; +boolean isBlockDataMapType = true; +for (Segment segment : segments) { + for (DataMap dataMap : dataMaps.get(segment)) { +if (!(dataMap instanceof BlockDataMap)) { --- End diff -- Two reasons: 1. The number of datamaps will be very small if it is not a block or blocklet datamap, hence multi-threading is not required (it is overhead for the driver in concurrent scenarios). 2. Other datamaps don't have an entry count in them. I will check. ---
[GitHub] carbondata pull request #2936: [CARBONDATA-3118] Parallelize block pruning o...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2936#discussion_r235615112 --- Diff: core/src/main/java/org/apache/carbondata/core/datamap/TableDataMap.java --- @@ -120,37 +132,166 @@ public BlockletDetailsFetcher getBlockletDetailsFetcher() { * @param filterExp * @return */ - public List prune(List segments, FilterResolverIntf filterExp, - List partitions) throws IOException { -List blocklets = new ArrayList<>(); -SegmentProperties segmentProperties; -Map> dataMaps = dataMapFactory.getDataMaps(segments); + public List prune(List segments, final FilterResolverIntf filterExp, + final List partitions) throws IOException { +final List blocklets = new ArrayList<>(); +final Map> dataMaps = dataMapFactory.getDataMaps(segments); +// for non-filter queries +if (filterExp == null) { + // if filter is not passed, then return all the blocklets. + return pruneWithoutFilter(segments, partitions, blocklets); --- End diff -- Yes, already tested this: for 100k files, pruning with a filter takes around 1 second, but without a filter it is only about 50 ms, which is very little; hence it is not handled. For filters, pruning was taking time. ---
[GitHub] carbondata issue #2936: [CARBONDATA-3118] Parallelize block pruning of defau...
Github user ajantha-bhat commented on the issue: https://github.com/apache/carbondata/pull/2936 retest this please ---
[GitHub] carbondata issue #2936: [CARBONDATA-3118] Parallelize block pruning of defau...
Github user ajantha-bhat commented on the issue: https://github.com/apache/carbondata/pull/2936 @manishgupta88 , @ravipesala : please review ---
[jira] [Created] (CARBONDATA-3118) Parallelize block pruning of default datamap in driver for filter query processing
Ajantha Bhat created CARBONDATA-3118: Summary: Parallelize block pruning of default datamap in driver for filter query processing Key: CARBONDATA-3118 URL: https://issues.apache.org/jira/browse/CARBONDATA-3118 Project: CarbonData Issue Type: Improvement Reporter: Ajantha Bhat Assignee: Ajantha Bhat *"Parallelize block pruning of default datamap in driver for filter query processing"* *Background:* We do block pruning for filter queries at the driver side. In a real-time big data scenario, we can have millions of carbon files for one carbon table. It is currently observed that for 1 million carbon files it takes around 5 seconds for block pruning, as each carbon file takes around 0.005 ms for pruning (with only one filter column set in the 'column_meta_cache' tblproperty). If there are more files, block pruning takes more time. Also, the Spark job will not be launched until block pruning is completed, so the user will not know what is happening at that time or why the Spark job is not launching. Currently, block pruning takes time because each segment is processed sequentially; we can reduce the time by parallelizing it. *solution:* Keep the default number of threads for block pruning as 4. The user can reduce this number via the carbon property "carbon.max.driver.threads.for.pruning", which can be set between 1 and 4. In TableDataMap.prune(), group the segments per thread by distributing an equal number of carbon files to each thread, then launch one thread per group of segments to handle block pruning. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
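A minimal, self-contained sketch of the described solution, with hypothetical names throughout (the real code works on Segment/ExtendedBlocklet types): segments are grouped so each group carries roughly the same number of carbon files, and each group is pruned on its own thread. A trailing remainder group, if any, simply queues on the pool:

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.*;

final class ParallelPruneSketch {
  static class Seg { final String id; final int fileCount; Seg(String id, int n) { this.id = id; this.fileCount = n; } }

  // Greedy grouping: close a group once it holds at least ceil(totalFiles / numThreads) files.
  static List<List<Seg>> group(List<Seg> segments, int numThreads) {
    int totalFiles = segments.stream().mapToInt(s -> s.fileCount).sum();
    int filesPerThread = (int) Math.ceil((double) totalFiles / numThreads);
    List<List<Seg>> groups = new ArrayList<>();
    List<Seg> current = new ArrayList<>();
    int running = 0;
    for (Seg s : segments) {
      current.add(s);
      running += s.fileCount;
      if (running >= filesPerThread) { groups.add(current); current = new ArrayList<>(); running = 0; }
    }
    if (!current.isEmpty()) groups.add(current);
    return groups;
  }

  static List<String> pruneAll(List<Seg> segments, int numThreads) throws Exception {
    ExecutorService pool = Executors.newFixedThreadPool(numThreads);
    try {
      List<Future<List<String>>> futures = new ArrayList<>();
      for (List<Seg> g : group(segments, numThreads)) {
        futures.add(pool.submit(() -> pruneGroup(g))); // one pruning task per group
      }
      List<String> blocklets = new ArrayList<>();
      for (Future<List<String>> f : futures) blocklets.addAll(f.get()); // merge in submit order
      return blocklets;
    } finally {
      pool.shutdown();
    }
  }

  private static List<String> pruneGroup(List<Seg> group) {
    List<String> out = new ArrayList<>();
    for (Seg s : group) out.add("blocklets-of-" + s.id); // stand-in for real pruning work
    return out;
  }
}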
[GitHub] carbondata pull request #2936: [WIP] Parallelize block pruning of default da...
GitHub user ajantha-bhat opened a pull request: https://github.com/apache/carbondata/pull/2936 [WIP] Parallelize block pruning of default datamap in driver for filter query processing Be sure to do all of the following checklist to help us incorporate your contribution quickly and easily: - [ ] Any interfaces changed? - [ ] Any backward compatibility impacted? - [ ] Document update required? - [ ] Testing done Please provide details on - Whether new unit test cases have been added or why no new tests are required? - How it is tested? Please attach test report. - Is it a performance related change? Please attach the performance test report. - Any additional information to help reviewers in testing this change. - [ ] For large changes, please consider breaking it into sub-tasks under an umbrella JIRA. You can merge this pull request into a Git repository by running: $ git pull https://github.com/ajantha-bhat/carbondata issue_fix Alternatively you can review and apply these changes as the patch at: https://github.com/apache/carbondata/pull/2936.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #2936 commit 7e2d16903565effab3c1c085291178865f6ad7ba Author: ajantha-bhat Date: 2018-11-20T16:45:06Z pruning compliling commit 059b69b46d5e543ba4a65af85ce694a1899de395 Author: ajantha-bhat Date: 2018-11-21T02:27:52Z issue fix ---
[GitHub] carbondata issue #2921: Removed unnecessary configuration in BlockletDataMap...
Github user ajantha-bhat commented on the issue: https://github.com/apache/carbondata/pull/2921 @ravipesala , @chenliang613 : please add @NamanRastogi to the whitelist ---
[GitHub] carbondata issue #2921: Removed unnecessary configuration in BlockletDataMap...
Github user ajantha-bhat commented on the issue: https://github.com/apache/carbondata/pull/2921 retest this please ---
[GitHub] carbondata issue #2895: [HOTFIX] Fix NPE in spark, when same vector reads fi...
Github user ajantha-bhat commented on the issue: https://github.com/apache/carbondata/pull/2895 retest this please ---
[GitHub] carbondata issue #2895: [HOTFIX] Fix NPE in spark, when same vector reads fi...
Github user ajantha-bhat commented on the issue: https://github.com/apache/carbondata/pull/2895 retest this please ---
[GitHub] carbondata issue #2895: [HOTFIX] Fix NPE in spark, when same vector reads fi...
Github user ajantha-bhat commented on the issue: https://github.com/apache/carbondata/pull/2895 retest this please ---
[GitHub] carbondata issue #2895: [HOTFIX] Fix NPE in spark, when same vector reads fi...
Github user ajantha-bhat commented on the issue: https://github.com/apache/carbondata/pull/2895 @manishgupta88 : please check. The same 2.3 build passed earlier without any code changes; it is a random failure. ---
[GitHub] carbondata issue #2895: [HOTFIX] Fix NPE in spark, when same vector reads fi...
Github user ajantha-bhat commented on the issue: https://github.com/apache/carbondata/pull/2895 retest this please ---
[GitHub] carbondata issue #2895: [HOTFIX] Fix NPE in spark, when same vector reads fi...
Github user ajantha-bhat commented on the issue: https://github.com/apache/carbondata/pull/2895 retest this please ---
[GitHub] carbondata pull request #2895: [HOTFIX] Fix NPE in spark, when same vector r...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2895#discussion_r231001948 --- Diff: core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/LocalDictDimensionDataChunkStore.java --- @@ -61,10 +61,7 @@ public void fillVector(int[] invertedIndex, int[] invertedIndexReverse, byte[] d int columnValueSize = dimensionDataChunkStore.getColumnValueSize(); int rowsNum = data.length / columnValueSize; CarbonColumnVector vector = vectorInfo.vector; -if (!dictionary.isDictionaryUsed()) { - vector.setDictionary(dictionary); - dictionary.setDictionaryUsed(); -} +vector.setDictionary(dictionary); --- End diff -- done ---
[GitHub] carbondata pull request #2895: [HOTFIX] Fix NPE in spark, when same vector r...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2895#discussion_r230997947 --- Diff: core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/LocalDictDimensionDataChunkStore.java --- @@ -61,10 +61,7 @@ public void fillVector(int[] invertedIndex, int[] invertedIndexReverse, byte[] d int columnValueSize = dimensionDataChunkStore.getColumnValueSize(); int rowsNum = data.length / columnValueSize; CarbonColumnVector vector = vectorInfo.vector; -if (!dictionary.isDictionaryUsed()) { - vector.setDictionary(dictionary); - dictionary.setDictionaryUsed(); -} +vector.setDictionary(dictionary); --- End diff -- I checked this while coding; fillRow is our own method, so no issues there. Only this vector is a Spark vector, and the dictionary needs to be cleared for it. ---
[GitHub] carbondata issue #2816: [CARBONDATA-3003] Suppor read batch row in CSDK
Github user ajantha-bhat commented on the issue: https://github.com/apache/carbondata/pull/2816 LGTM ---
[GitHub] carbondata pull request #2895: [HOTFIX] Fix NPE in spark, when same vector r...
GitHub user ajantha-bhat opened a pull request: https://github.com/apache/carbondata/pull/2895 [HOTFIX] Fix NPE in spark, when same vector reads files with local dictionary and without local dictionary problem: NPE in Spark when the same vector reads files with local dictionary and without local dictionary. cause: When two carbondata files are present, one with a local dictionary and one without, the same vector can be used to read both files [this can happen if a task is launched for a group of files]. If the local dictionary file is read first, the dictionary is set on that vector, but it was never reset before reading the other file. solution: Reset the dictionary once a batch is processed; set it only for local dictionary batch processing. Be sure to do all of the following checklist to help us incorporate your contribution quickly and easily: - [ ] Any interfaces changed? NA - [ ] Any backward compatibility impacted? NA - [ ] Document update required? NA - [ ] Testing done? Yes, cluster testing done. - [ ] For large changes, please consider breaking it into sub-tasks under an umbrella JIRA. NA You can merge this pull request into a Git repository by running: $ git pull https://github.com/ajantha-bhat/carbondata master_new Alternatively you can review and apply these changes as the patch at: https://github.com/apache/carbondata/pull/2895.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #2895 commit 99c7621336e3cf180bfa0c3a326a2f1fafe51631 Author: ajantha-bhat Date: 2018-11-05T10:00:27Z Fix vectcor reading with local dictionary and without local dictionary ---
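A minimal sketch of the described solution, with hypothetical vector and dictionary types (the actual change is in CarbonData's chunk-store vector filling code): the dictionary is set per batch and always cleared afterwards, so a file without a local dictionary never inherits a stale one:

interface Vector { void setDictionary(Object dictionary); }

final class BatchFiller {
  static void fillBatch(Vector vector, Object localDictionaryOrNull, Runnable fillData) {
    vector.setDictionary(localDictionaryOrNull); // null for files without a local dictionary
    try {
      fillData.run();               // decode and fill this batch
    } finally {
      vector.setDictionary(null);   // reset, so the next file starts clean
    }
  }
}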
[GitHub] carbondata issue #2888: [CARBONDATA-3066]add documentation for writtenBy and...
Github user ajantha-bhat commented on the issue: https://github.com/apache/carbondata/pull/2888 LGTM ---
[GitHub] carbondata pull request #2850: [CARBONDATA-3056] Added concurrent reading th...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2850#discussion_r230272267 --- Diff: store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonReader.java --- @@ -114,6 +115,57 @@ public static CarbonReaderBuilder builder(String tablePath) { return builder(tablePath, tableName); } + /** + * Breaks the list of CarbonRecordReader in CarbonReader into multiple + * CarbonReader objects, each iterating through some 'carbondata' files + * and return that list of CarbonReader objects + * + * If the no. of files is greater than maxSplits, then break the + * CarbonReader into maxSplits splits, with each split iterating + * through >= 1 file. + * + * If the no. of files is less than maxSplits, then return list of + * CarbonReader with size as the no. of files, with each CarbonReader + * iterating through exactly one file + * + * @param maxSplits: Int + * @return list of {@link CarbonReader} objects + */ + public List split(int maxSplits) throws IOException { --- End diff -- @ravipesala : Adding this to the builder would break the builder pattern; recently we removed arguments from build() and made them separate APIs for the SDK writer. The reader followed the same approach. ---
[GitHub] carbondata pull request #2888: [CARBONDATA-3066]add documentation for writte...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2888#discussion_r229733657 --- Diff: docs/sdk-guide.md --- @@ -686,6 +695,16 @@ Find example code at [CarbonReaderExample](https://github.com/apache/carbondata/ public static Schema readSchemaInIndexFile(String indexFilePath); ``` +``` + /** + * This method return the version details in formatted string by reading from carbondata file + * @param dataFilePath complete path including carbondata file name + * @return string with information of who has written this file in which carbondata project version --- End diff -- can mention one example in the header ---
[GitHub] carbondata pull request #2888: [CARBONDATA-3066]add documentation for writte...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2888#discussion_r229733094 --- Diff: docs/sdk-guide.md --- @@ -429,6 +429,15 @@ public CarbonWriterBuilder withAvroInput(org.apache.avro.Schema avroSchema); public CarbonWriterBuilder withJsonInput(Schema carbonSchema); ``` +``` +/** +* To support writing the ApplicationName which is writing the carbondata file +* @param application name which is writing the carbondata files +* @return CarbonWriterBuilder --- End diff -- mention that it is mandatory method to call ---
[GitHub] carbondata pull request #2816: [CARBONDATA-3003] Suppor read batch row in CS...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2816#discussion_r229629760 --- Diff: core/src/main/java/org/apache/carbondata/core/scan/result/RowBatch.java --- @@ -100,4 +100,19 @@ public int getSize() { counter++; return row; } + + /** + * read next batch + * + * @return rows + */ + public List nextBatch() { +if (!hasNext()) { + throw new NoSuchElementException(); +} +List row; +row = rows.subList(counter, rows.size()); --- End diff -- Where is the batch reading happening now? I think you just have to return the rows. ---
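A sketch of what the reviewer is pointing at, with field names following the quoted diff (element types simplified): nextBatch() should return the remaining rows and advance the internal counter, so a second call does not hand out the same rows again:

import java.util.List;
import java.util.NoSuchElementException;

final class RowBatchSketch {
  private List<Object[]> rows;
  private int counter;

  boolean hasNext() { return rows != null && counter < rows.size(); }

  List<Object[]> nextBatch() {
    if (!hasNext()) {
      throw new NoSuchElementException();
    }
    List<Object[]> remaining = rows.subList(counter, rows.size());
    counter = rows.size(); // mark everything as consumed
    return remaining;
  }
}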
[GitHub] carbondata issue #2807: [CARBONDATA-2997] Support read schema from index fil...
Github user ajantha-bhat commented on the issue: https://github.com/apache/carbondata/pull/2807 LGTM. Let one more reviewer review. ---
[GitHub] carbondata pull request #2804: [CARBONDATA-2996] CarbonSchemaReader support ...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2804#discussion_r229583983 --- Diff: store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonSchemaReader.java --- @@ -64,11 +66,70 @@ public static Schema readSchemaInSchemaFile(String schemaFilePath) throws IOExce /** * Read carbondata file and return the schema * - * @param dataFilePath complete path including carbondata file name + * @param path carbondata store path + * @return Schema object * @throws IOException */ - public static Schema readSchemaInDataFile(String dataFilePath) throws IOException { + public static Schema readSchemaFromFirstDataFile(String path) throws IOException { +String dataFilePath = getFirstCarbonDataFile(path); +return readSchemaInDataFile(dataFilePath); + } + + /** + * get first carbondata file in path and don't check all files schema + * + * @param path carbondata file path + * @return first carbondata file name + */ + public static String getFirstCarbonDataFile(String path) { --- End diff -- I have already suggested keeping getFirstCarbonFile(path, extension) -- this alone will give the data or index file based on the extension; there is no need to have duplicate code for both index and data files. ---
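A minimal sketch of the suggested single helper, using plain java.io listing rather than CarbonData's file API (the signature and error handling are assumptions):

import java.io.File;

final class FirstCarbonFile {
  // Returns the first file under `path` ending with the given extension,
  // e.g. ".carbondata" for data files or ".carbonindex" for index files.
  static String getFirstCarbonFile(String path, String extension) {
    File[] files = new File(path).listFiles((dir, name) -> name.endsWith(extension));
    if (files == null || files.length == 0) {
      throw new IllegalArgumentException("No " + extension + " file found under " + path);
    }
    return files[0].getAbsolutePath();
  }
}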
[GitHub] carbondata pull request #2869: [CARBONDATA-3057] Changes for improving carbo...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2869#discussion_r229298501 --- Diff: store/sdk/src/test/java/org/apache/carbondata/sdk/file/CarbonReaderTest.java --- @@ -1737,4 +1738,89 @@ public void testReadNextRowWithProjectionAndRowUtil() { } } + @Test + public void testVectorReader() { +String path = "./testWriteFiles"; +try { + FileUtils.deleteDirectory(new File(path)); + + Field[] fields = new Field[12]; + fields[0] = new Field("stringField", DataTypes.STRING); + fields[1] = new Field("shortField", DataTypes.SHORT); + fields[2] = new Field("intField", DataTypes.INT); + fields[3] = new Field("longField", DataTypes.LONG); + fields[4] = new Field("doubleField", DataTypes.DOUBLE); + fields[5] = new Field("boolField", DataTypes.BOOLEAN); + fields[6] = new Field("dateField", DataTypes.DATE); + fields[7] = new Field("timeField", DataTypes.TIMESTAMP); + fields[8] = new Field("decimalField", DataTypes.createDecimalType(8, 2)); + fields[9] = new Field("varcharField", DataTypes.VARCHAR); + fields[10] = new Field("byteField", DataTypes.BYTE); + fields[11] = new Field("floatField", DataTypes.FLOAT); + Map map = new HashMap<>(); + map.put("complex_delimiter_level_1", "#"); + CarbonWriter writer = CarbonWriter.builder() + .outputPath(path) + .withLoadOptions(map) + .withCsvInput(new Schema(fields)) + .writtenBy("CarbonReaderTest") + .build(); + + for (int i = 0; i < 10; i++) { +String[] row2 = new String[]{ +"robot" + (i % 10), +String.valueOf(i % 1), +String.valueOf(i), +String.valueOf(Long.MAX_VALUE - i), +String.valueOf((double) i / 2), +String.valueOf(true), +"2019-03-02", +"2019-02-12 03:03:34", +"12.345", +"varchar", +String.valueOf(i), +"1.23" +}; +writer.write(row2); + } + writer.close(); + + // Read data + CarbonReader reader = CarbonReader + .builder(path, "_temp") + .withVectorReader(true) + .build(); + + int i = 0; + while (reader.hasNext()) { +Object[] data = (Object[]) reader.readNextRow(); + +assert (RowUtil.getString(data, 0).equals("robot" + i)); +assertEquals(RowUtil.getShort(data, 4), i); +assertEquals(RowUtil.getInt(data, 5), i); +assert (RowUtil.getLong(data, 6) == Long.MAX_VALUE - i); +assertEquals(RowUtil.getDouble(data, 7), ((double) i) / 2); +assert (RowUtil.getByte(data, 8).equals(new Byte("1"))); +assertEquals(RowUtil.getInt(data, 1), 17957); +assertEquals(RowUtil.getLong(data, 2), 154992081400L); +assert (RowUtil.getDecimal(data, 9).equals("12.35")); +assert (RowUtil.getString(data, 3).equals("varchar")); +assertEquals(RowUtil.getByte(data, 10), new Byte(String.valueOf(i))); +assertEquals(RowUtil.getFloat(data, 11), new Float("1.23")); +i++; + } + reader.close(); --- End diff -- Add validation for total number of rows read. ---
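Concretely, the validation the comment asks for is a single assertion on the test's own loop counter, placed after the while loop (the quoted test writes 10 rows):

// before reader.close(), verify every written row was read back
assertEquals(10, i);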
[GitHub] carbondata pull request #2869: [CARBONDATA-3057] Changes for improving carbo...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2869#discussion_r229284011 --- Diff: hadoop/src/main/java/org/apache/carbondata/hadoop/util/CarbonVectorizedRecordReader.java --- @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.hadoop.util; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.carbondata.common.logging.LogServiceFactory; +import org.apache.carbondata.core.constants.CarbonV3DataFormatConstants; +import org.apache.carbondata.core.datastore.block.TableBlockInfo; +import org.apache.carbondata.core.metadata.datatype.DataType; +import org.apache.carbondata.core.metadata.datatype.DataTypes; +import org.apache.carbondata.core.metadata.datatype.DecimalType; +import org.apache.carbondata.core.metadata.datatype.StructField; +import org.apache.carbondata.core.scan.executor.QueryExecutor; +import org.apache.carbondata.core.scan.executor.QueryExecutorFactory; +import org.apache.carbondata.core.scan.executor.exception.QueryExecutionException; +import org.apache.carbondata.core.scan.model.ProjectionDimension; +import org.apache.carbondata.core.scan.model.ProjectionMeasure; +import org.apache.carbondata.core.scan.model.QueryModel; +import org.apache.carbondata.core.scan.result.iterator.AbstractDetailQueryResultIterator; +import org.apache.carbondata.core.scan.result.vector.CarbonColumnVector; +import org.apache.carbondata.core.scan.result.vector.CarbonColumnarBatch; +import org.apache.carbondata.core.scan.result.vector.impl.CarbonColumnVectorImpl; +import org.apache.carbondata.core.util.ByteUtil; +import org.apache.carbondata.hadoop.AbstractRecordReader; +import org.apache.carbondata.hadoop.CarbonInputSplit; + +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.log4j.Logger; + +/** + * A specialized RecordReader that reads into CarbonColumnarBatches directly using the + * carbondata column APIs and fills the data directly into columns. 
+ */ +public class CarbonVectorizedRecordReader extends AbstractRecordReader { + + private static final Logger LOGGER = + LogServiceFactory.getLogService(CarbonVectorizedRecordReader.class.getName()); + + private CarbonColumnarBatch carbonColumnarBatch; + + private QueryExecutor queryExecutor; + + private int batchIdx = 0; + + private int numBatched = 0; + + private AbstractDetailQueryResultIterator iterator; + + private QueryModel queryModel; + + public CarbonVectorizedRecordReader(QueryModel queryModel) { +this.queryModel = queryModel; + } + + @Override public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) + throws IOException, InterruptedException { +List splitList; +if (inputSplit instanceof CarbonInputSplit) { + splitList = new ArrayList<>(1); + splitList.add((CarbonInputSplit) inputSplit); +} else { + throw new RuntimeException("unsupported input split type: " + inputSplit); +} +List tableBlockInfoList = CarbonInputSplit.createBlocks(splitList); +queryModel.setTableBlockInfos(tableBlockInfoList); +queryModel.setVectorReader(true); +try { + queryExecutor = + QueryExecutorFactory.getQueryExecutor(queryModel, taskAttemptContext.getConfiguration()); + iterator = (AbstractDetailQueryResultIterator) queryExecutor.execute(queryModel); +} catch (QueryExecutionException e) { + LOGGER.error(e); + throw new InterruptedException(e.getMessage()); +} catch (Exception e) { + LOGGER.error(e); + throw e; +} + } + + @Override public boolean nextKeyValue() throws IOException, InterruptedException
[GitHub] carbondata pull request #2869: [CARBONDATA-3057] Changes for improving carbo...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2869#discussion_r229254647 --- Diff: hadoop/src/main/java/org/apache/carbondata/hadoop/util/CarbonVectorizedRecordReader.java --- @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.hadoop.util; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.carbondata.common.logging.LogServiceFactory; +import org.apache.carbondata.core.constants.CarbonV3DataFormatConstants; +import org.apache.carbondata.core.datastore.block.TableBlockInfo; +import org.apache.carbondata.core.metadata.datatype.DataType; +import org.apache.carbondata.core.metadata.datatype.DataTypes; +import org.apache.carbondata.core.metadata.datatype.DecimalType; +import org.apache.carbondata.core.metadata.datatype.StructField; +import org.apache.carbondata.core.scan.executor.QueryExecutor; +import org.apache.carbondata.core.scan.executor.QueryExecutorFactory; +import org.apache.carbondata.core.scan.executor.exception.QueryExecutionException; +import org.apache.carbondata.core.scan.model.ProjectionDimension; +import org.apache.carbondata.core.scan.model.ProjectionMeasure; +import org.apache.carbondata.core.scan.model.QueryModel; +import org.apache.carbondata.core.scan.result.iterator.AbstractDetailQueryResultIterator; +import org.apache.carbondata.core.scan.result.vector.CarbonColumnVector; +import org.apache.carbondata.core.scan.result.vector.CarbonColumnarBatch; +import org.apache.carbondata.core.scan.result.vector.impl.CarbonColumnVectorImpl; +import org.apache.carbondata.core.util.ByteUtil; +import org.apache.carbondata.hadoop.AbstractRecordReader; +import org.apache.carbondata.hadoop.CarbonInputSplit; + +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.log4j.Logger; + +/** + * A specialized RecordReader that reads into CarbonColumnarBatches directly using the + * carbondata column APIs and fills the data directly into columns. 
+ */ +public class CarbonVectorizedRecordReader extends AbstractRecordReader { + + private static final Logger LOGGER = + LogServiceFactory.getLogService(CarbonVectorizedRecordReader.class.getName()); + + private CarbonColumnarBatch carbonColumnarBatch; + + private QueryExecutor queryExecutor; + + private int batchIdx = 0; + + private int numBatched = 0; + + private AbstractDetailQueryResultIterator iterator; + + private QueryModel queryModel; + + public CarbonVectorizedRecordReader(QueryModel queryModel) { +this.queryModel = queryModel; + } + + @Override public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) + throws IOException, InterruptedException { +List splitList; +if (inputSplit instanceof CarbonInputSplit) { + splitList = new ArrayList<>(1); + splitList.add((CarbonInputSplit) inputSplit); +} else { + throw new RuntimeException("unsupported input split type: " + inputSplit); +} +List tableBlockInfoList = CarbonInputSplit.createBlocks(splitList); +queryModel.setTableBlockInfos(tableBlockInfoList); +queryModel.setVectorReader(true); +try { + queryExecutor = + QueryExecutorFactory.getQueryExecutor(queryModel, taskAttemptContext.getConfiguration()); + iterator = (AbstractDetailQueryResultIterator) queryExecutor.execute(queryModel); +} catch (QueryExecutionException e) { + LOGGER.error(e); + throw new InterruptedException(e.getMessage()); +} catch (Exception e) { + LOGGER.error(e); + throw e; +} + } + + @Override public boolean nextKeyValue() throws IOException, InterruptedException {
[GitHub] carbondata pull request #2869: [CARBONDATA-3057] Changes for improving carbo...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2869#discussion_r229248603 --- Diff: hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonFileInputFormat.java --- @@ -180,6 +235,11 @@ public CarbonTable getOrCreateCarbonTable(Configuration configuration) throws IO validSegments, partitionInfo, oldPartitionIdList); numBlocks = dataBlocksOfSegment.size(); result.addAll(dataBlocksOfSegment); +Collections.sort(result, new Comparator() { --- End diff -- Why sort the splits? Can you add a comment here? ---
[GitHub] carbondata pull request #2869: [CARBONDATA-3057] Changes for improving carbo...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2869#discussion_r229246953 --- Diff: hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonFileInputFormat.java --- @@ -145,9 +158,33 @@ public CarbonTable getOrCreateCarbonTable(Configuration configuration) throws IO externalTableSegments.add(seg); } } - // do block filtering and get split - List splits = - getSplits(job, filter, externalTableSegments, null, partitionInfo, null); + List splits = new ArrayList<>(); + if (isSDK) { --- End diff -- This is SDK logic, not FileInputFormat logic. You can do this in CarbonReaderBuilder.build() --> getSplits() is already called there; you can check whether filters are present and call FileInputFormat.getSplits(), else call a method getAllFileSplit() which gives all the files without blocklet loading. Also, we then don't need the isSDK flag at all. ---
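A structural sketch of the suggestion, with all types as hypothetical stand-ins (getAllFileSplits() is the method the comment proposes, not an existing API): branch on the presence of a filter inside the reader build step, so the isSDK flag disappears:

import java.util.List;

final class ReaderBuildSketch {
  interface Split {}
  interface Format {
    List<Split> getSplits();        // existing filter-driven pruning path
    List<Split> getAllFileSplits(); // proposed: one split per file, no blocklet loading
  }

  static List<Split> buildSplits(Format format, Object filterOrNull) {
    if (filterOrNull == null) {
      return format.getAllFileSplits(); // no filter: skip pruning work entirely
    }
    return format.getSplits();          // filter present: prune as before
  }
}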
[GitHub] carbondata pull request #2869: [CARBONDATA-3057] Changes for improving carbo...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2869#discussion_r229243919 --- Diff: hadoop/src/main/java/org/apache/carbondata/hadoop/CarbonInputSplit.java --- @@ -138,6 +138,19 @@ public CarbonInputSplit(String segmentId, Path path, long start, long length, St version = CarbonProperties.getInstance().getFormatVersion(); } + public CarbonInputSplit(String segmentId, Path path, long start, long length, --- End diff -- This is not required; we can use the above constructor and pass FileFormat.COLUMNAR_V3 for fileFormat. ---
[GitHub] carbondata pull request #2869: [CARBONDATA-3057] Changes for improving carbo...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2869#discussion_r229247445 --- Diff: hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonFileInputFormat.java --- @@ -145,9 +158,33 @@ public CarbonTable getOrCreateCarbonTable(Configuration configuration) throws IO externalTableSegments.add(seg); } } - // do block filtering and get split - List splits = - getSplits(job, filter, externalTableSegments, null, partitionInfo, null); + List splits = new ArrayList<>(); + if (isSDK) { +for (CarbonFile carbonFile : getAllCarbonDataFiles(carbonTable.getTablePath())) { --- End diff -- Move it to the method getAllFileSplits() and call it from CarbonReaderBuilder.build(), as discussed above. ---
[GitHub] carbondata pull request #2869: [CARBONDATA-3057] Changes for improving carbo...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2869#discussion_r229218201 --- Diff: core/src/main/java/org/apache/carbondata/core/scan/result/vector/impl/CarbonColumnVectorImpl.java --- @@ -305,7 +301,7 @@ public void setBlockDataType(DataType blockDataType) { } @Override public CarbonColumnVector getDictionaryVector() { -return dictionaryVector; +return null; --- End diff -- Should this throw UnsupportedOperationException instead of returning null? Or remove the overridden method itself. ---
[GitHub] carbondata pull request #2869: [CARBONDATA-3057] Changes for improving carbo...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2869#discussion_r229242570 --- Diff: core/src/main/java/org/apache/carbondata/core/datastore/filesystem/LocalCarbonFile.java --- @@ -178,6 +178,25 @@ public boolean delete() { return carbonFiles; } + @Override public List listFiles(Boolean recursive, CarbonFileFilter fileFilter) --- End diff -- The recursive parameter is not used in the logic. ---
[GitHub] carbondata pull request #2876: [CARBONDATA-3054] Fix Dictionary file cannot ...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2876#discussion_r229237604 --- Diff: integration/spark2/src/main/scala/org/apache/spark/sql/CarbonDictionaryDecoder.scala --- @@ -137,14 +139,15 @@ case class CarbonDictionaryDecoder( val forwardDictionaryCache: Cache[DictionaryColumnUniqueIdentifier, Dictionary] = cacheProvider.createCache(CacheType.FORWARD_DICTIONARY) val dicts: Seq[ForwardDictionaryWrapper] = getDictionaryWrapper(tableNameToCarbonTableMapping, -forwardDictionaryCache) +forwardDictionaryCache, conf) val exprs = child.output.map { exp => ExpressionCanonicalizer.execute(BindReferences.bindReference(exp, child.output)) } ctx.currentVars = input val resultVars = exprs.zipWithIndex.map { case (expr, index) => if (dicts(index) != null) { + ThreadLocalSessionInfo.setConfigurationToCurrentThread(conf.value.value) --- End diff -- Done. It is set in codegen now. ---
[GitHub] carbondata pull request #2876: [CARBONDATA-3054] Fix Dictionary file cannot ...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2876#discussion_r229237654 --- Diff: integration/spark2/src/main/scala/org/apache/spark/sql/CarbonDictionaryDecoder.scala --- @@ -557,16 +561,20 @@ class CarbonDecoderRDD( * It is a wrapper around Dictionary, it is a work around to keep the dictionary serializable in * case of codegen * @param dictIdentifier Dictionary column unique identifier + * @param broadcastConf hadoop broadcast conf for serialization, that contains carbon conf. */ class ForwardDictionaryWrapper( -dictIdentifier: DictionaryColumnUniqueIdentifier) extends Serializable { +dictIdentifier: DictionaryColumnUniqueIdentifier, +broadcastConf: Broadcast[SerializableConfiguration]) extends Serializable { var dictionary: Dictionary = null var dictionaryLoader: DictionaryLoader = _ def getDictionaryValueForKeyInBytes (surrogateKey: Int): Array[Byte] = { if (dictionary == null) { + ThreadLocalSessionInfo.getOrCreateCarbonSessionInfo().getNonSerializableExtraInfo --- End diff -- Done. It is set in codegen now. ---
[GitHub] carbondata pull request #2869: [CARBONDATA-3057] Changes for improving carbo...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2869#discussion_r229193317 --- Diff: core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/safe/SafeVariableLengthDimensionDataChunkStore.java --- @@ -169,7 +169,7 @@ public void fillRow(int rowId, CarbonColumnVector vector, int vectorRow) { length)) { vector.putNull(vectorRow); --- End diff -- Do we need to skip this for varchar also? ---
[GitHub] carbondata pull request #2850: [WIP] Added concurrent reading through SDK
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2850#discussion_r229183542 --- Diff: store/sdk/src/test/java/org/apache/carbondata/sdk/file/ConcurrentSdkReaderTest.java --- @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.sdk.file; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; + +import org.apache.carbondata.core.metadata.datatype.DataTypes; + +import junit.framework.TestCase; +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOExceptionWithCause; +import org.junit.*; + +/** + * multi-thread Test suite for {@link CarbonReader} + */ +public class ConcurrentSdkReaderTest { + + private static final String dataDir = "./testReadFiles"; + + @Before + @After + public void cleanTestData() { +try { + FileUtils.deleteDirectory(new File(dataDir)); +} catch (Exception e) { + e.printStackTrace(); + Assert.fail(e.getMessage()); +} + } + + private void writeTestData(long numRows, int tableBlockSize) { +Field[] fields = new Field[2]; +fields[0] = new Field("stringField", DataTypes.STRING); +fields[1] = new Field("intField", DataTypes.INT); + +Map tableProperties = new HashMap<>(); +tableProperties.put("table_blocksize", Integer.toString(tableBlockSize)); + +CarbonWriterBuilder builder = + CarbonWriter.builder().outputPath(dataDir).withTableProperties(tableProperties) +.withCsvInput(new Schema(fields)); + +try { + CarbonWriter writer = builder.build(); + + for (long i = 0; i < numRows; ++i) { +writer.write(new String[] { "robot_" + i, String.valueOf(i) }); + } + writer.close(); +} catch (Exception e) { + e.printStackTrace(); + Assert.fail(e.getMessage()); +} + } + + @Test public void testReadParallely() throws IOException, InterruptedException { +long numRows = 1000; --- End diff -- We must not add huge-record test cases in UTs; the PR builder time for every PR will be affected by this. Test locally with huge data, but update the UT with fewer rows, say 10 rows. ---
[GitHub] carbondata pull request #2850: [WIP] Added concurrent reading through SDK
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2850#discussion_r229183179 --- Diff: store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonReader.java --- @@ -114,6 +117,43 @@ public static CarbonReaderBuilder builder(String tablePath) { return builder(tablePath, tableName); } + /** + * Return a new list of {@link CarbonReader} objects + * + * @param maxSplits + */ + public List split(int maxSplits) throws IOException { +validateReader(); +if (maxSplits < 1) { + throw new RuntimeException( + this.getClass().getSimpleName() + ".split: maxSplits must be positive"); +} + +List carbonReaders = new ArrayList<>(); + +// If maxSplits < readers.size +// Split the reader into maxSplits splits with each +// element contains >= 1 CarbonRecordReader objects +if (maxSplits < this.readers.size()) { + for (int i = 0; i < maxSplits; ++i) { +carbonReaders.add(new CarbonReader<>(this.readers +.subList((int) Math.ceil((float) (i * this.readers.size()) / maxSplits), --- End diff -- This is constant; compute it outside the loop and reuse it each time. ---
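A sketch of the boundary math the comment refers to, covering the maxSplits < readers.size() branch of the quoted code: the list size is constant, so each iteration computes one new boundary and reuses the previous one instead of evaluating the ceil expression twice:

import java.util.ArrayList;
import java.util.List;

final class SplitSketch {
  static <T> List<List<T>> split(List<T> readers, int maxSplits) {
    int size = readers.size();           // constant: hoist out of the loop
    List<List<T>> parts = new ArrayList<>();
    int from = 0;
    for (int i = 0; i < maxSplits; i++) {
      int to = (int) Math.ceil((double) ((i + 1) * size) / maxSplits);
      parts.add(new ArrayList<>(readers.subList(from, to)));
      from = to;                         // reuse the previous boundary
    }
    return parts;
  }
}

For example, splitting 10 readers into 3 parts yields sizes 4, 3, and 3, matching the ceil-based boundaries in the quoted code.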
[GitHub] carbondata pull request #2850: [WIP] Added concurrent reading through SDK
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2850#discussion_r229179978 --- Diff: store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonReader.java --- @@ -114,6 +117,43 @@ public static CarbonReaderBuilder builder(String tablePath) {
     return builder(tablePath, tableName);
   }
+  /**
+   * Return a new list of {@link CarbonReader} objects
+   *
--- End diff --

Add a clear description: mention what happens when maxSplits is greater than the number of files and what happens when it is less than the number of files.

---
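One possible wording for that Javadoc, based on the behavior visible in the quoted implementation (a suggestion, not final documentation):

    /**
     * Splits this reader into a list of {@link CarbonReader} objects, each of
     * which can be consumed independently (for example, from its own thread).
     *
     * If maxSplits is smaller than the number of underlying files, each
     * returned reader covers one or more files; if maxSplits is greater than
     * or equal to the number of files, exactly one reader is returned per
     * file.
     *
     * @param maxSplits maximum number of readers to return; must be at least 1
     */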
[GitHub] carbondata pull request #2850: [WIP] Added concurrent reading through SDK
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2850#discussion_r229179713 --- Diff: store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonReader.java --- @@ -114,6 +117,43 @@ public static CarbonReaderBuilder builder(String tablePath) {
     return builder(tablePath, tableName);
   }
+  /**
+   * Return a new list of {@link CarbonReader} objects
+   *
+   * @param maxSplits
+   */
+  public List<CarbonReader> split(int maxSplits) throws IOException {
+    validateReader();
+    if (maxSplits < 1) {
+      throw new RuntimeException(
+          this.getClass().getSimpleName() + ".split: maxSplits must be positive");
+    }
+
+    List<CarbonReader> carbonReaders = new ArrayList<>();
+
+    // If maxSplits < readers.size
+    // Split the reader into maxSplits splits with each
+    // element contains >= 1 CarbonRecordReader objects
+    if (maxSplits < this.readers.size()) {
+      for (int i = 0; i < maxSplits; ++i) {
+        carbonReaders.add(new CarbonReader<>(this.readers
+            .subList((int) Math.ceil((float) (i * this.readers.size()) / maxSplits),
+                (int) Math.ceil((float) ((i + 1) * this.readers.size()) / maxSplits)));
+      }
+    }
+    // If maxSplits >= readers.size
+    // Split the reader into reader.size splits with each
--- End diff --

Keep these comments inside the else block for easier reading of the code.

---
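Continuing the hypothetical ReaderSplits sketch above, the explanatory comments would then sit inside the branches they describe, for example:

    // Same grouping as the quoted diff, with each comment moved inside the
    // branch it explains (reuses the hypothetical ReaderSplits.partition above).
    static <T> List<List<T>> group(List<T> readers, int maxSplits) {
      if (maxSplits < readers.size()) {
        // fewer splits than readers: each group wraps >= 1 reader
        return ReaderSplits.partition(readers, maxSplits);
      } else {
        // maxSplits >= readers.size(): one group per reader
        List<List<T>> groups = new ArrayList<>(readers.size());
        for (T reader : readers) {
          groups.add(Collections.singletonList(reader));
        }
        return groups;
      }
    }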
[GitHub] carbondata pull request #2850: [WIP] Added concurrent reading through SDK
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2850#discussion_r229179428 --- Diff: store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonReader.java --- @@ -114,6 +117,43 @@ public static CarbonReaderBuilder builder(String tablePath) {
     return builder(tablePath, tableName);
   }
+  /**
+   * Return a new list of {@link CarbonReader} objects
+   *
+   * @param maxSplits
+   */
+  public List<CarbonReader> split(int maxSplits) throws IOException {
--- End diff --

New interfaces exposed by the SDK need to be added to sdk-guide.md; you can add this API's information there.

---
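For that sdk-guide.md entry, a short usage example could accompany the API description; in the sketch below the path, table name, and split count are placeholders, and split is the method proposed in this PR:

    // Split one reader into several and consume each one independently.
    CarbonReader reader = CarbonReader.builder("./testWriteFiles", "_temp").build();
    List<CarbonReader> readers = reader.split(4);
    for (CarbonReader split : readers) {
      while (split.hasNext()) {
        Object[] row = (Object[]) split.readNextRow();
        // process row
      }
      split.close();
    }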
[GitHub] carbondata issue #2876: [CARBONDATA-3054] Fix Dictionary file cannot be read...
Github user ajantha-bhat commented on the issue: https://github.com/apache/carbondata/pull/2876 retest this please ---
[GitHub] carbondata pull request #2804: [CARBONDATA-2996] CarbonSchemaReader support ...
Github user ajantha-bhat commented on a diff in the pull request: https://github.com/apache/carbondata/pull/2804#discussion_r229173861 --- Diff: store/sdk/src/test/java/org/apache/carbondata/sdk/file/CarbonSchemaReaderTest.java --- @@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.sdk.file;
+
+import java.io.File;
+import java.util.HashMap;
+import java.util.Map;
+
+import junit.framework.TestCase;
+import org.apache.carbondata.core.metadata.datatype.DataTypes;
+import org.apache.commons.io.FileUtils;
+import org.junit.Test;
+
+public class CarbonSchemaReaderTest extends TestCase {
+
+  @Test
+  public void testReadSchemaFromDataFile() {
+    String path = "./testWriteFiles";
+    try {
+      FileUtils.deleteDirectory(new File(path));
+
+      Field[] fields = new Field[11];
+      fields[0] = new Field("stringField", DataTypes.STRING);
+      fields[1] = new Field("shortField", DataTypes.SHORT);
+      fields[2] = new Field("intField", DataTypes.INT);
+      fields[3] = new Field("longField", DataTypes.LONG);
+      fields[4] = new Field("doubleField", DataTypes.DOUBLE);
+      fields[5] = new Field("boolField", DataTypes.BOOLEAN);
+      fields[6] = new Field("dateField", DataTypes.DATE);
+      fields[7] = new Field("timeField", DataTypes.TIMESTAMP);
+      fields[8] = new Field("decimalField", DataTypes.createDecimalType(8, 2));
+      fields[9] = new Field("varcharField", DataTypes.VARCHAR);
+      fields[10] = new Field("arrayField", DataTypes.createArrayType(DataTypes.STRING));
+      Map<String, String> map = new HashMap<>();
+      map.put("complex_delimiter_level_1", "#");
+      CarbonWriter writer = CarbonWriter.builder()
+          .outputPath(path)
+          .withLoadOptions(map)
+          .withCsvInput(new Schema(fields)).build();
+
+      for (int i = 0; i < 10; i++) {
+        String[] row2 = new String[]{
+            "robot" + (i % 10),
+            String.valueOf(i % 1),
+            String.valueOf(i),
+            String.valueOf(Long.MAX_VALUE - i),
+            String.valueOf((double) i / 2),
+            String.valueOf(true),
+            "2019-03-02",
+            "2019-02-12 03:03:34",
+            "12.345",
+            "varchar",
+            "Hello#World#From#Carbon"
+        };
+        writer.write(row2);
+      }
+      writer.close();
+
+      Schema schema = CarbonSchemaReader
+          .readSchemaInDataFile(path)
+          .asOriginOrder();
+      // Transform the schema
+      assertEquals(schema.getFields().length, 11);
+      String[] strings = new String[schema.getFields().length];
+      for (int i = 0; i < schema.getFields().length; i++) {
+        strings[i] = (schema.getFields())[i].getFieldName();
+      }
+      assert (strings[0].equalsIgnoreCase("stringField"));
+      assert (strings[1].equalsIgnoreCase("shortField"));
+      assert (strings[2].equalsIgnoreCase("intField"));
+      assert (strings[3].equalsIgnoreCase("longField"));
+      assert (strings[4].equalsIgnoreCase("doubleField"));
--- End diff --

These assertions can be moved into a method and reused by both test cases.

---
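A hypothetical helper (the name is assumed) implementing that suggestion; since the class extends TestCase, the inherited assertEquals/assertTrue are used:

    private void assertFieldNames(Schema schema, String[] expectedNames) {
      // one shared assertion path for every schema-reading test case
      assertEquals(expectedNames.length, schema.getFields().length);
      for (int i = 0; i < expectedNames.length; i++) {
        assertTrue(schema.getFields()[i].getFieldName().equalsIgnoreCase(expectedNames[i]));
      }
    }

Both test cases can then build the expected-name array once and call this helper instead of repeating the per-field asserts.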