[GitHub] carbondata pull request #2966: [CARBONDATA-3162][CARBONDATA-3163][CARBONDATA...

2018-12-13 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2966#discussion_r241643126
  
--- Diff: 
store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonWriterBuilder.java 
---
@@ -501,6 +502,17 @@ public CarbonLoadModel buildLoadModel(Schema 
carbonSchema)
 }
 // for the longstring field, change the datatype from string to varchar
 this.schema = updateSchemaFields(carbonSchema, longStringColumns);
+if (sortColumns != null && sortColumns.length != 0) {
+  if (options == null || options.get("sort_scope") == null) {
+// If sort_columns are specified and sort_scope is not specified,
+// change sort scope to local_sort as now by default sort scope is 
no_sort.
+if (options == null) {
+  options = new HashMap<>();
+}
+//TODO: add in carbon property instead of load options
--- End diff --

For example, sort_columns are mentioned in table properties, but sort_scope is 
present as "Batch_sort" in carbon properties; in that case, if I set sort_scope in 
table properties, the test case fails. 

And on top of that, those tests set the carbon properties after creating the table!


---


[GitHub] carbondata pull request #2966: [CARBONDATA-3162][CARBONDATA-3163][CARBONDATA...

2018-12-13 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2966#discussion_r241642790
  
--- Diff: 
integration/spark2/src/test/scala/org/apache/spark/carbondata/restructure/AlterTableValidationTestCase.scala
 ---
@@ -523,7 +523,8 @@ class AlterTableValidationTestCase extends 
Spark2QueryTest with BeforeAndAfterAl
 }
   }
 
-  test("describe formatted for default sort_columns pre and post alter") {
+  // after changing default sort_scope to no_sort, all dimensions are not 
selected for sorting.
+  ignore("describe formatted for default sort_columns pre and post alter") 
{
--- End diff --

ok.


---


[GitHub] carbondata pull request #2966: [CARBONDATA-3162][CARBONDATA-3163][CARBONDATA...

2018-12-13 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2966#discussion_r241642755
  
--- Diff: 
store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonWriterBuilder.java 
---
@@ -501,6 +502,17 @@ public CarbonLoadModel buildLoadModel(Schema 
carbonSchema)
 }
 // for the longstring field, change the datatype from string to varchar
 this.schema = updateSchemaFields(carbonSchema, longStringColumns);
+if (sortColumns != null && sortColumns.length != 0) {
+  if (options == null || options.get("sort_scope") == null) {
+// If sort_columns are specified and sort_scope is not specified,
+// change sort scope to local_sort as now by default sort scope is 
no_sort.
+if (options == null) {
+  options = new HashMap<>();
+}
+//TODO: add in carbon property instead of load options
--- End diff --

There are test cases that set only the sort scope in carbon properties, not the 
sort columns. Those test cases fail because the table property that I set has higher 
priority than the carbon properties.
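
For illustration, a minimal sketch of honouring the carbon property before defaulting to local_sort. This assumes the builder's options field is a Map<String, String> (as in the diff above) and that CarbonCommonConstants.LOAD_SORT_SCOPE is the carbon-property key for the load sort scope; it is not the final code of this PR:

    if (sortColumns != null && sortColumns.length != 0
        && (options == null || options.get("sort_scope") == null)) {
      // check the globally configured sort scope before forcing local_sort,
      // so a value set in carbon.properties is not silently overridden
      String globalSortScope = CarbonProperties.getInstance()
          .getProperty(CarbonCommonConstants.LOAD_SORT_SCOPE);
      if (options == null) {
        options = new HashMap<>();
      }
      options.put("sort_scope", globalSortScope != null ? globalSortScope : "local_sort");
    }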


---


[GitHub] carbondata pull request #2899: [CARBONDATA-3073][CARBONDATA-3044] Support co...

2018-12-13 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2899#discussion_r241334135
  
--- Diff: store/CSDK/test/main.cpp ---
@@ -645,6 +653,278 @@ bool testWriteData(JNIEnv *env, char *path, int argc, 
char *argv[]) {
 }
 }
 
+void writeData(JNIEnv *env, CarbonWriter writer, int size, jclass 
objClass, char *stringField, short shortField) {
+jobjectArray arr = env->NewObjectArray(size, objClass, 0);
+
+jobject jStringField = env->NewStringUTF(stringField);
+env->SetObjectArrayElement(arr, 0, jStringField);
+
+char ctrShort[10];
+gcvt(shortField % 1, 10, ctrShort);
+jobject jShortField = env->NewStringUTF(ctrShort);
+env->SetObjectArrayElement(arr, 1, jShortField);
+
+writer.write(arr);
+
+env->DeleteLocalRef(jStringField);
+env->DeleteLocalRef(jShortField);
+env->DeleteLocalRef(arr);
+}
+
+/**
+ * test WithLoadOption interface
+ *
+ * @param env  jni env
+ * @param path file path
+ * @param argc argument counter
+ * @param argv argument vector
+ * @return true or throw exception
+ */
+bool testWithLoadOption(JNIEnv *env, char *path, int argc, char *argv[]) {
+
+char *jsonSchema = 
"[{stringField:string},{shortField:short},{intField:int},{longField:long},{doubleField:double},{boolField:boolean},{dateField:date},{timeField:timestamp},{floatField:float},{arrayField:array}]";
+try {
+CarbonWriter writer;
+writer.builder(env);
+writer.outputPath(path);
+writer.withCsvInput(jsonSchema);
+writer.withLoadOption("complex_delimiter_level_1", "#");
+writer.writtenBy("CSDK");
+writer.taskNo(185);
+writer.withThreadSafe(1);
+writer.uniqueIdentifier(154991181400);
+writer.withBlockSize(1);
+writer.withBlockletSize(16);
+writer.enableLocalDictionary(true);
+writer.localDictionaryThreshold(1);
+if (argc > 3) {
+writer.withHadoopConf("fs.s3a.access.key", argv[1]);
+writer.withHadoopConf("fs.s3a.secret.key", argv[2]);
+writer.withHadoopConf("fs.s3a.endpoint", argv[3]);
+}
+writer.build();
+
+int rowNum = 7;
+int size = 10;
+long longValue = 0;
+double doubleValue = 0;
+float floatValue = 0;
+jclass objClass = env->FindClass("java/lang/String");
+for (int i = 0; i < rowNum; ++i) {
+jobjectArray arr = env->NewObjectArray(size, objClass, 0);
+char ctrInt[10];
+gcvt(i, 10, ctrInt);
+
+char a[15] = "robot";
+strcat(a, ctrInt);
+jobject stringField = env->NewStringUTF(a);
+env->SetObjectArrayElement(arr, 0, stringField);
+
+char ctrShort[10];
+gcvt(i % 1, 10, ctrShort);
+jobject shortField = env->NewStringUTF(ctrShort);
+env->SetObjectArrayElement(arr, 1, shortField);
+
+jobject intField = env->NewStringUTF(ctrInt);
+env->SetObjectArrayElement(arr, 2, intField);
+
+
+char ctrLong[10];
+gcvt(longValue, 10, ctrLong);
+longValue = longValue + 2;
+jobject longField = env->NewStringUTF(ctrLong);
+env->SetObjectArrayElement(arr, 3, longField);
+
+char ctrDouble[10];
+gcvt(doubleValue, 10, ctrDouble);
+doubleValue = doubleValue + 2;
+jobject doubleField = env->NewStringUTF(ctrDouble);
+env->SetObjectArrayElement(arr, 4, doubleField);
+
+jobject boolField = env->NewStringUTF("true");
+env->SetObjectArrayElement(arr, 5, boolField);
+
+jobject dateField = env->NewStringUTF(" 2019-03-02");
+env->SetObjectArrayElement(arr, 6, dateField);
+
+jobject timeField = env->NewStringUTF("2019-02-12 03:03:34");
+env->SetObjectArrayElement(arr, 7, timeField);
+
+char ctrFloat[10];
+gcvt(floatValue, 10, ctrFloat);
+floatValue = floatValue + 2;
+jobject floatField = env->NewStringUTF(ctrFloat);
+env->SetObjectArrayElement(arr, 8, floatField);
+
+jobject arrayField = 
env->NewStringUTF("Hello#World#From#Carbon");
+env->SetObjec

[GitHub] carbondata pull request #2899: [CARBONDATA-3073][CARBONDATA-3044] Support co...

2018-12-13 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2899#discussion_r241333995
  
--- Diff: store/CSDK/test/main.cpp ---
@@ -645,6 +653,278 @@ bool testWriteData(JNIEnv *env, char *path, int argc, 
char *argv[]) {
 }
 }
 
+void writeData(JNIEnv *env, CarbonWriter writer, int size, jclass 
objClass, char *stringField, short shortField) {
+jobjectArray arr = env->NewObjectArray(size, objClass, 0);
+
+jobject jStringField = env->NewStringUTF(stringField);
+env->SetObjectArrayElement(arr, 0, jStringField);
+
+char ctrShort[10];
+gcvt(shortField % 1, 10, ctrShort);
+jobject jShortField = env->NewStringUTF(ctrShort);
+env->SetObjectArrayElement(arr, 1, jShortField);
+
+writer.write(arr);
+
+env->DeleteLocalRef(jStringField);
+env->DeleteLocalRef(jShortField);
+env->DeleteLocalRef(arr);
+}
+
+/**
+ * test WithLoadOption interface
+ *
+ * @param env  jni env
+ * @param path file path
+ * @param argc argument counter
+ * @param argv argument vector
+ * @return true or throw exception
+ */
+bool testWithLoadOption(JNIEnv *env, char *path, int argc, char *argv[]) {
+
+char *jsonSchema = 
"[{stringField:string},{shortField:short},{intField:int},{longField:long},{doubleField:double},{boolField:boolean},{dateField:date},{timeField:timestamp},{floatField:float},{arrayField:array}]";
+try {
+CarbonWriter writer;
+writer.builder(env);
--- End diff --

These load options can be added to the existing writer test case; no need to add a new one.


---


[GitHub] carbondata pull request #2899: [CARBONDATA-3073][CARBONDATA-3044] Support co...

2018-12-13 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2899#discussion_r241328653
  
--- Diff: 
store/sdk/src/test/java/org/apache/carbondata/sdk/file/CSVCarbonWriterTest.java 
---
@@ -543,4 +543,42 @@ public void testWritingAndReadingArrayOfFloatAndByte() 
throws IOException {
 }
   }
 
+  @Test
+  public void testWithTableProperties() throws IOException {
--- End diff --

What's the use of this test case? How would you validate whether the table property 
is working or not?


---


[GitHub] carbondata pull request #2899: [CARBONDATA-3073][CARBONDATA-3044] Support co...

2018-12-13 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2899#discussion_r241327234
  
--- Diff: 
store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonWriterBuilder.java 
---
@@ -199,6 +203,22 @@ public CarbonWriterBuilder withLoadOptions(Map options) {
 return this;
   }
 
+  /**
+   * To support the load options for sdk writer
+   *
+   * @param key   the key of load option
+   * @param value the value of load option
+   * @return updated CarbonWriterBuilder object
+   */
+  public CarbonWriterBuilder withLoadOption(String key, String value) {
+Objects.requireNonNull(key, "key of table properties should not be 
null");
--- End diff --

These are load options, not table properties.
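
A hedged sketch of how the corrected messages could read (the options field name is illustrative; the actual builder field may differ):

    public CarbonWriterBuilder withLoadOption(String key, String value) {
      Objects.requireNonNull(key, "key of load option should not be null");
      Objects.requireNonNull(value, "value of load option should not be null");
      if (this.options == null) {
        this.options = new HashMap<>();   // illustrative field name
      }
      this.options.put(key, value);
      return this;
    }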


---


[GitHub] carbondata pull request #2899: [CARBONDATA-3073][CARBONDATA-3044] Support co...

2018-12-13 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2899#discussion_r241322296
  
--- Diff: store/CSDK/src/CarbonWriter.cpp ---
@@ -58,6 +58,35 @@ void CarbonWriter::outputPath(char *path) {
 carbonWriterBuilderObject = 
jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args);
 }
 
+void CarbonWriter::sortBy(int argc, char **argv) {
+if (argc < 0) {
+throw std::runtime_error("argc parameter can't be negative.");
+}
+if (argv == NULL) {
+throw std::runtime_error("argv parameter can't be NULL.");
+}
+checkBuilder();
+jclass carbonReaderBuilderClass = 
jniEnv->GetObjectClass(carbonWriterBuilderObject);
+jmethodID methodID = jniEnv->GetMethodID(carbonReaderBuilderClass, 
"sortBy",
+
"([Ljava/lang/String;)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;");
+if (methodID == NULL) {
+throw std::runtime_error("Can't find the method in java: sortBy");
+}
+jclass objectArrayClass = jniEnv->FindClass("Ljava/lang/String;");
+if (objectArrayClass == NULL) {
+throw std::runtime_error("Can't find the class in java: 
java/lang/String");
+}
+jobjectArray array = jniEnv->NewObjectArray(argc, objectArrayClass, 
NULL);
+for (int i = 0; i < argc; ++i) {
+jstring value = jniEnv->NewStringUTF(argv[i]);
+jniEnv->SetObjectArrayElement(array, i, value);
+}
+
+jvalue args[1];
+args[0].l = array;
+carbonWriterBuilderObject = 
jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args);
--- End diff --

Can this be modified to

(void) jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args);

There is no need to collect the return value from the Java API, as the return type 
of the C++ method is void.

Please check the same for all the newly added void APIs.


---


[GitHub] carbondata pull request #2899: [CARBONDATA-3073][CARBONDATA-3044] Support co...

2018-12-13 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2899#discussion_r241321530
  
--- Diff: store/CSDK/src/CarbonWriter.cpp ---
@@ -98,6 +127,158 @@ void CarbonWriter::withHadoopConf(char *key, char 
*value) {
 carbonWriterBuilderObject = 
jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args);
 }
 
+void CarbonWriter::withTableProperty(char *key, char *value) {
+if (key == NULL) {
+throw std::runtime_error("key parameter can't be NULL.");
+}
+if (value == NULL) {
+throw std::runtime_error("value parameter can't be NULL.");
+}
+checkBuilder();
+jclass carbonWriterBuilderClass = 
jniEnv->GetObjectClass(carbonWriterBuilderObject);
+jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, 
"withTableProperty",
+
"(Ljava/lang/String;Ljava/lang/String;)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;");
+if (methodID == NULL) {
+throw std::runtime_error("Can't find the method in java: 
withTableProperty");
+}
+jvalue args[2];
+args[0].l = jniEnv->NewStringUTF(key);
+args[1].l = jniEnv->NewStringUTF(value);
+carbonWriterBuilderObject = 
jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args);
+}
+
+void CarbonWriter::withLoadOption(char *key, char *value) {
+if (key == NULL) {
+throw std::runtime_error("key parameter can't be NULL.");
+}
+if (value == NULL) {
+throw std::runtime_error("value parameter can't be NULL.");
+}
+checkBuilder();
+jclass carbonWriterBuilderClass = 
jniEnv->GetObjectClass(carbonWriterBuilderObject);
+jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, 
"withLoadOption",
+ 
"(Ljava/lang/String;Ljava/lang/String;)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;");
+if (methodID == NULL) {
+throw std::runtime_error("Can't find the method in java: 
withLoadOption");
+}
+jvalue args[2];
+args[0].l = jniEnv->NewStringUTF(key);
+args[1].l = jniEnv->NewStringUTF(value);
+carbonWriterBuilderObject = 
jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args);
+}
+
+void CarbonWriter::taskNo(long taskNo) {
+if (taskNo < 0) {
+throw std::runtime_error("taskNo parameter can't be negative.");
+}
+checkBuilder();
+jclass carbonWriterBuilderClass = 
jniEnv->GetObjectClass(carbonWriterBuilderObject);
+jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, 
"taskNo",
+"(J)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;");
+if (methodID == NULL) {
+throw std::runtime_error("Can't find the method in java: taskNo");
+}
+jvalue args[2];
+args[0].j = taskNo;
+carbonWriterBuilderObject = 
jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args);
+}
+
+void CarbonWriter::uniqueIdentifier(long timestamp) {
+if (timestamp < 1) {
+throw std::runtime_error("timestamp parameter can't be negative.");
+}
+checkBuilder();
+jclass carbonWriterBuilderClass = 
jniEnv->GetObjectClass(carbonWriterBuilderObject);
+jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, 
"uniqueIdentifier",
+"(J)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;");
+if (methodID == NULL) {
+throw std::runtime_error("Can't find the method in java: 
uniqueIdentifier");
+}
+jvalue args[2];
+args[0].j = timestamp;
+carbonWriterBuilderObject = 
jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args);
+}
+
+void CarbonWriter::withThreadSafe(short numOfThreads) {
+if (numOfThreads < 1) {
+throw std::runtime_error("numOfThreads parameter can't be 
negative.");
+}
+checkBuilder();
+jclass carbonWriterBuilderClass = 
jniEnv->GetObjectClass(carbonWriterBuilderObject);
+jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, 
"withThreadSafe",
+"(S)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;");
+if (methodID == NULL) {
+throw std::runtime_error("Can't find the method in java: 
withThreadSafe");
+}
+jvalue args[2];
+args[0].s = numOfThreads;
+carbonWriterBuilderObject = 
jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args);
 

[GitHub] carbondata pull request #2899: [CARBONDATA-3073][CARBONDATA-3044] Support co...

2018-12-13 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2899#discussion_r241320959
  
--- Diff: store/CSDK/src/CarbonWriter.cpp ---
@@ -98,6 +127,158 @@ void CarbonWriter::withHadoopConf(char *key, char 
*value) {
 carbonWriterBuilderObject = 
jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args);
 }
 
+void CarbonWriter::withTableProperty(char *key, char *value) {
+if (key == NULL) {
+throw std::runtime_error("key parameter can't be NULL.");
+}
+if (value == NULL) {
+throw std::runtime_error("value parameter can't be NULL.");
+}
+checkBuilder();
+jclass carbonWriterBuilderClass = 
jniEnv->GetObjectClass(carbonWriterBuilderObject);
+jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, 
"withTableProperty",
+
"(Ljava/lang/String;Ljava/lang/String;)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;");
+if (methodID == NULL) {
+throw std::runtime_error("Can't find the method in java: 
withTableProperty");
+}
+jvalue args[2];
+args[0].l = jniEnv->NewStringUTF(key);
+args[1].l = jniEnv->NewStringUTF(value);
+carbonWriterBuilderObject = 
jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args);
+}
+
+void CarbonWriter::withLoadOption(char *key, char *value) {
+if (key == NULL) {
+throw std::runtime_error("key parameter can't be NULL.");
+}
+if (value == NULL) {
+throw std::runtime_error("value parameter can't be NULL.");
+}
+checkBuilder();
+jclass carbonWriterBuilderClass = 
jniEnv->GetObjectClass(carbonWriterBuilderObject);
+jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, 
"withLoadOption",
+ 
"(Ljava/lang/String;Ljava/lang/String;)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;");
+if (methodID == NULL) {
+throw std::runtime_error("Can't find the method in java: 
withLoadOption");
+}
+jvalue args[2];
+args[0].l = jniEnv->NewStringUTF(key);
+args[1].l = jniEnv->NewStringUTF(value);
+carbonWriterBuilderObject = 
jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args);
+}
+
+void CarbonWriter::taskNo(long taskNo) {
+if (taskNo < 0) {
+throw std::runtime_error("taskNo parameter can't be negative.");
+}
+checkBuilder();
+jclass carbonWriterBuilderClass = 
jniEnv->GetObjectClass(carbonWriterBuilderObject);
+jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, 
"taskNo",
+"(J)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;");
+if (methodID == NULL) {
+throw std::runtime_error("Can't find the method in java: taskNo");
+}
+jvalue args[2];
+args[0].j = taskNo;
+carbonWriterBuilderObject = 
jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args);
+}
+
+void CarbonWriter::uniqueIdentifier(long timestamp) {
+if (timestamp < 1) {
+throw std::runtime_error("timestamp parameter can't be negative.");
+}
+checkBuilder();
+jclass carbonWriterBuilderClass = 
jniEnv->GetObjectClass(carbonWriterBuilderObject);
+jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, 
"uniqueIdentifier",
+"(J)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;");
+if (methodID == NULL) {
+throw std::runtime_error("Can't find the method in java: 
uniqueIdentifier");
+}
+jvalue args[2];
+args[0].j = timestamp;
+carbonWriterBuilderObject = 
jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args);
+}
+
+void CarbonWriter::withThreadSafe(short numOfThreads) {
+if (numOfThreads < 1) {
+throw std::runtime_error("numOfThreads parameter can't be 
negative.");
+}
+checkBuilder();
+jclass carbonWriterBuilderClass = 
jniEnv->GetObjectClass(carbonWriterBuilderObject);
+jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, 
"withThreadSafe",
+"(S)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;");
+if (methodID == NULL) {
+throw std::runtime_error("Can't find the method in java: 
withThreadSafe");
+}
+jvalue args[2];
--- End diff --

same as above


---


[GitHub] carbondata pull request #2899: [CARBONDATA-3073][CARBONDATA-3044] Support co...

2018-12-13 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2899#discussion_r241318527
  
--- Diff: store/CSDK/src/CarbonWriter.cpp ---
@@ -98,6 +127,158 @@ void CarbonWriter::withHadoopConf(char *key, char 
*value) {
 carbonWriterBuilderObject = 
jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args);
 }
 
+void CarbonWriter::withTableProperty(char *key, char *value) {
+if (key == NULL) {
+throw std::runtime_error("key parameter can't be NULL.");
+}
+if (value == NULL) {
+throw std::runtime_error("value parameter can't be NULL.");
+}
+checkBuilder();
+jclass carbonWriterBuilderClass = 
jniEnv->GetObjectClass(carbonWriterBuilderObject);
+jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, 
"withTableProperty",
+
"(Ljava/lang/String;Ljava/lang/String;)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;");
+if (methodID == NULL) {
+throw std::runtime_error("Can't find the method in java: 
withTableProperty");
+}
+jvalue args[2];
+args[0].l = jniEnv->NewStringUTF(key);
+args[1].l = jniEnv->NewStringUTF(value);
+carbonWriterBuilderObject = 
jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args);
+}
+
+void CarbonWriter::withLoadOption(char *key, char *value) {
+if (key == NULL) {
+throw std::runtime_error("key parameter can't be NULL.");
+}
+if (value == NULL) {
+throw std::runtime_error("value parameter can't be NULL.");
+}
+checkBuilder();
+jclass carbonWriterBuilderClass = 
jniEnv->GetObjectClass(carbonWriterBuilderObject);
+jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, 
"withLoadOption",
+ 
"(Ljava/lang/String;Ljava/lang/String;)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;");
+if (methodID == NULL) {
+throw std::runtime_error("Can't find the method in java: 
withLoadOption");
+}
+jvalue args[2];
+args[0].l = jniEnv->NewStringUTF(key);
+args[1].l = jniEnv->NewStringUTF(value);
+carbonWriterBuilderObject = 
jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args);
+}
+
+void CarbonWriter::taskNo(long taskNo) {
+if (taskNo < 0) {
+throw std::runtime_error("taskNo parameter can't be negative.");
+}
+checkBuilder();
+jclass carbonWriterBuilderClass = 
jniEnv->GetObjectClass(carbonWriterBuilderObject);
+jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, 
"taskNo",
+"(J)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;");
+if (methodID == NULL) {
+throw std::runtime_error("Can't find the method in java: taskNo");
+}
+jvalue args[2];
+args[0].j = taskNo;
+carbonWriterBuilderObject = 
jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args);
+}
+
+void CarbonWriter::uniqueIdentifier(long timestamp) {
+if (timestamp < 1) {
+throw std::runtime_error("timestamp parameter can't be negative.");
+}
+checkBuilder();
+jclass carbonWriterBuilderClass = 
jniEnv->GetObjectClass(carbonWriterBuilderObject);
+jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, 
"uniqueIdentifier",
+"(J)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;");
+if (methodID == NULL) {
+throw std::runtime_error("Can't find the method in java: 
uniqueIdentifier");
+}
+jvalue args[2];
--- End diff --

should be size 1 ?


---


[GitHub] carbondata pull request #2899: [CARBONDATA-3073][CARBONDATA-3044] Support co...

2018-12-13 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2899#discussion_r241318351
  
--- Diff: store/CSDK/src/CarbonWriter.cpp ---
@@ -98,6 +127,158 @@ void CarbonWriter::withHadoopConf(char *key, char 
*value) {
 carbonWriterBuilderObject = 
jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args);
 }
 
+void CarbonWriter::withTableProperty(char *key, char *value) {
+if (key == NULL) {
+throw std::runtime_error("key parameter can't be NULL.");
+}
+if (value == NULL) {
+throw std::runtime_error("value parameter can't be NULL.");
+}
+checkBuilder();
+jclass carbonWriterBuilderClass = 
jniEnv->GetObjectClass(carbonWriterBuilderObject);
+jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, 
"withTableProperty",
+
"(Ljava/lang/String;Ljava/lang/String;)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;");
+if (methodID == NULL) {
+throw std::runtime_error("Can't find the method in java: 
withTableProperty");
+}
+jvalue args[2];
+args[0].l = jniEnv->NewStringUTF(key);
+args[1].l = jniEnv->NewStringUTF(value);
+carbonWriterBuilderObject = 
jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args);
+}
+
+void CarbonWriter::withLoadOption(char *key, char *value) {
+if (key == NULL) {
+throw std::runtime_error("key parameter can't be NULL.");
+}
+if (value == NULL) {
+throw std::runtime_error("value parameter can't be NULL.");
+}
+checkBuilder();
+jclass carbonWriterBuilderClass = 
jniEnv->GetObjectClass(carbonWriterBuilderObject);
+jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, 
"withLoadOption",
+ 
"(Ljava/lang/String;Ljava/lang/String;)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;");
+if (methodID == NULL) {
+throw std::runtime_error("Can't find the method in java: 
withLoadOption");
+}
+jvalue args[2];
+args[0].l = jniEnv->NewStringUTF(key);
+args[1].l = jniEnv->NewStringUTF(value);
+carbonWriterBuilderObject = 
jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args);
+}
+
+void CarbonWriter::taskNo(long taskNo) {
+if (taskNo < 0) {
+throw std::runtime_error("taskNo parameter can't be negative.");
+}
+checkBuilder();
+jclass carbonWriterBuilderClass = 
jniEnv->GetObjectClass(carbonWriterBuilderObject);
+jmethodID methodID = jniEnv->GetMethodID(carbonWriterBuilderClass, 
"taskNo",
+"(J)Lorg/apache/carbondata/sdk/file/CarbonWriterBuilder;");
+if (methodID == NULL) {
+throw std::runtime_error("Can't find the method in java: taskNo");
+}
+jvalue args[2];
--- End diff --

should be size 1 ?


---


[GitHub] carbondata pull request #2899: [CARBONDATA-3073][CARBONDATA-3044] Support co...

2018-12-13 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2899#discussion_r241315209
  
--- Diff: store/CSDK/src/CarbonWriter.cpp ---
@@ -58,6 +58,35 @@ void CarbonWriter::outputPath(char *path) {
 carbonWriterBuilderObject = 
jniEnv->CallObjectMethodA(carbonWriterBuilderObject, methodID, args);
 }
 
+void CarbonWriter::sortBy(int argc, char **argv) {
+if (argc < 0) {
+throw std::runtime_error("argc parameter can't be negative.");
+}
+if (argv == NULL) {
+throw std::runtime_error("argv parameter can't be NULL.");
+}
+checkBuilder();
+jclass carbonReaderBuilderClass = 
jniEnv->GetObjectClass(carbonWriterBuilderObject);
--- End diff --

It is a writer builder, not a reader builder; the variable should be named carbonWriterBuilderClass.


---


[GitHub] carbondata pull request #2899: [CARBONDATA-3073][CARBONDATA-3044] Support co...

2018-12-13 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2899#discussion_r241313087
  
--- Diff: docs/csdk-guide.md ---
@@ -214,6 +226,122 @@ release the memory and destroy JVM.
 void withHadoopConf(char *key, char *value);
 ```
 
+<<<<<<< HEAD
+===
+```
+ /**
+ *  To support the table properties for writer
+ *
+ * @param key properties key
+ * @param value properties value
+ */
+void withTableProperty(char *key, char *value);
+```
+
+```
+/**
+ * To support the load options for C++ sdk writer
+ *
+ * @param options key,value pair of load options.
+ * supported keys values are
+ * a. bad_records_logger_enable -- true (write into separate logs), 
false
+ * b. bad_records_action -- FAIL, FORCE, IGNORE, REDIRECT
+ * c. bad_record_path -- path
+ * d. dateformat -- same as JAVA SimpleDateFormat
+ * e. timestampformat -- same as JAVA SimpleDateFormat
+ * f. complex_delimiter_level_1 -- value to Split the complexTypeData
+ * g. complex_delimiter_level_2 -- value to Split the nested 
complexTypeData
+ * h. quotechar
+ * i. escapechar
+ *
+ * Default values are as follows.
+ *
+ * a. bad_records_logger_enable -- "false"
+ * b. bad_records_action -- "FAIL"
+ * c. bad_record_path -- ""
+ * d. dateformat -- "" , uses from carbon.properties file
+ * e. timestampformat -- "", uses from carbon.properties file
+ * f. complex_delimiter_level_1 -- "$"
+ * g. complex_delimiter_level_2 -- ":"
+ * h. quotechar -- "\""
+ * i. escapechar -- "\\"
+ *
+ * @return updated CarbonWriterBuilder
+ */
+void withLoadOption(char *key, char *value);
+```
+
+```
+/**
+ * sets the taskNo for the writer. CSDKs concurrently running
+ * will set taskNo in order to avoid conflicts in file's name during 
write.
+ *
+ * @param taskNo is the TaskNo user wants to specify.
+ *   by default it is system time in nano seconds.
+ */
+void taskNo(long taskNo);
+```
+
+```
+/**
+ * to set the timestamp in the carbondata and carbonindex index files
+ *
+ * @param timestamp is a timestamp to be used in the carbondata and 
carbonindex index files.
+ * By default set to zero.
+ * @return updated CarbonWriterBuilder
+ */
+void uniqueIdentifier(long timestamp);
+```
+
+```
+/**
+ * To make c++ sdk writer thread safe.
+ *
+ * @param numOfThreads should number of threads in which writer is 
called in multi-thread scenario
+ *  default C++ sdk writer is not thread safe.
+ *  can use one writer instance in one thread only.
+ */
+void withThreadSafe(short numOfThreads) ;
+```
+
+```
+/**
+ * To set the carbondata file size in MB between 1MB-2048MB
+ *
+ * @param blockSize is size in MB between 1MB to 2048 MB
+ * default value is 1024 MB
+ */
+void withBlockSize(int blockSize);
+```
+
+```
+/**
+ * To set the blocklet size of CarbonData file
+ *
+ * @param blockletSize is blocklet size in MB
+ *default value is 64 MB
+ * @return updated CarbonWriterBuilder
+ */
+void withBlockletSize(int blockletSize);
+```
+
+```
+/**
+ * @param localDictionaryThreshold is localDictionaryThreshold, 
default is 1
+ * @return updated CarbonWriterBuilder
+ */
+void localDictionaryThreshold(int localDictionaryThreshold);
+```
+
+```
+/**
+ * @param enableLocalDictionary enable local dictionary, default is 
false
+ * @return updated CarbonWriterBuilder
+ */
+void enableLocalDictionary(bool enableLocalDictionary);
+```
+
+>>>>>>> aebd066bc... [CARBONDATA-3073] Support configure 
TableProperties,withLoadOption etc. interface in carbon writer of C++ SDK
--- End diff --

same as above


---


[GitHub] carbondata pull request #2899: [CARBONDATA-3073][CARBONDATA-3044] Support co...

2018-12-13 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2899#discussion_r241311890
  
--- Diff: docs/csdk-guide.md ---
@@ -214,6 +226,122 @@ release the memory and destroy JVM.
 void withHadoopConf(char *key, char *value);
 ```
 
+<<<<<<< HEAD
--- End diff --

What's this?

A leftover from resolving a merge conflict? Please check and redo this file.


---


[GitHub] carbondata pull request #2982: [CARBONDATA-3158] support presto-carbon to re...

2018-12-12 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2982#discussion_r241286067
  
--- Diff: 
integration/presto/src/main/java/org/apache/carbondata/presto/impl/CarbonTableReader.java
 ---
@@ -364,23 +355,38 @@ private CarbonTable 
parseCarbonMetadata(SchemaTableName table) {
   String tablePath = storePath + "/" + 
carbonTableIdentifier.getDatabaseName() + "/"
   + carbonTableIdentifier.getTableName();
 
-  //Step 2: read the metadata (tableInfo) of the table.
-  ThriftReader.TBaseCreator createTBase = new 
ThriftReader.TBaseCreator() {
-// TBase is used to read and write thrift objects.
-// TableInfo is a kind of TBase used to read and write table 
information.
-// TableInfo is generated by thrift,
-// see schema.thrift under format/src/main/thrift for details.
-public TBase create() {
-  return new org.apache.carbondata.format.TableInfo();
+  String metadataPath = CarbonTablePath.getSchemaFilePath(tablePath);
+  boolean isTransactionalTable = false;
+  try {
+FileFactory.FileType fileType = 
FileFactory.getFileType(metadataPath);
+if (FileFactory.getCarbonFile(metadataPath, 
fileType).isFileExist(metadataPath, fileType)) {
--- End diff --

Currently this saves one function call; if I don't pass the file type, this method 
gets called again implicitly. Let it be there for now. 

When we remove fileType from FileFactory in the future, this will also be optimized.


---


[GitHub] carbondata issue #2983: [CARBONDATA-3119] Fixed SDK Write for Complex Array ...

2018-12-12 Thread ajantha-bhat
Github user ajantha-bhat commented on the issue:

https://github.com/apache/carbondata/pull/2983
  
retest this please


---


[GitHub] carbondata pull request #2949: [CARBONDATA-3118] support parallel block prun...

2018-12-11 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2949#discussion_r240900313
  
--- Diff: 
core/src/main/java/org/apache/carbondata/core/datamap/TableDataMap.java ---
@@ -205,26 +195,53 @@ public BlockletDetailsFetcher 
getBlockletDetailsFetcher() {
   final FilterResolverIntf filterExp, final List 
partitions,
   List blocklets, final Map> 
dataMaps,
   int totalFiles) {
+/*
+ 
*
+ * Below is the example of how this part of code works.
+ * consider a scenario of having 5 segments, 10 datamaps in each 
segment,
--- End diff --

BlockDataMap and BlockletDataMap can store information about multiple files; each 
file is one row in that datamap. Non-default datamaps are not like that. So for 
default datamaps, the distribution across threads happens based on the number of 
entries in the datamaps, while for non-default datamaps the distribution is based on 
the number of datamaps (one datamap is considered one unit of work for non-default 
datamaps).

Also, 10 datamaps in a segment means that one merge index file has the info of 10 
index files.
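
As a rough illustration of that distribution rule (a hypothetical helper, not the actual TableDataMap code), the work units per datamap could be counted like this, and the per-thread split made over the total of these units rather than over the number of datamaps:

    // one row/file entry = one unit for default (block/blocklet) datamaps,
    // one whole datamap = one unit for non-default datamaps
    static int workUnits(int entriesInDataMap, boolean isDefaultDataMap) {
      return isDefaultDataMap ? entriesInDataMap : 1;
    }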


---


[GitHub] carbondata issue #2983: [CARBONDATA-3119] Fixed SDK Write for Complex Array ...

2018-12-11 Thread ajantha-bhat
Github user ajantha-bhat commented on the issue:

https://github.com/apache/carbondata/pull/2983
  
retest this please


---


[GitHub] carbondata issue #2983: [CARBONDATA-3119] Fixed SDK Write for Complex Array ...

2018-12-11 Thread ajantha-bhat
Github user ajantha-bhat commented on the issue:

https://github.com/apache/carbondata/pull/2983
  
@ravipesala , @chenliang613 : please add @shivamasn to whitelist


---


[GitHub] carbondata pull request #2983: [CARBONDATA-3119] Fixed SDK Write for Complex...

2018-12-11 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2983#discussion_r240889253
  
--- Diff: 
store/sdk/src/test/java/org/apache/carbondata/sdk/file/CarbonReaderTest.java ---
@@ -2072,4 +1797,35 @@ public void testReadingNullValues() {
 }
   }
 
+  @Test public void testSdkWriteWhenArrayOfStringIsEmpty()
+  throws IOException, InvalidLoadOptionException {
+
+CarbonProperties.getInstance()
+.addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, 
"FAIL");
--- End diff --

Same as above: take a backup of CARBON_BAD_RECORDS_ACTION and restore it at the 
end of the test case.
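
A small sketch of the backup/restore pattern being requested (test-side code, illustrative only; the test body placeholder stands for the existing assertions):

    String oldValue = CarbonProperties.getInstance()
        .getProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION);
    CarbonProperties.getInstance()
        .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, "FAIL");
    try {
      // ... existing test body ...
    } finally {
      if (oldValue != null) {
        CarbonProperties.getInstance()
            .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, oldValue);
      }
    }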


---


[GitHub] carbondata pull request #2983: [CARBONDATA-3119] Fixed SDK Write for Complex...

2018-12-11 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2983#discussion_r240889185
  
--- Diff: 
store/sdk/src/test/java/org/apache/carbondata/sdk/file/CarbonReaderTest.java ---
@@ -23,8 +23,11 @@
 import java.util.*;
 
 import org.apache.avro.generic.GenericData;
+
--- End diff --

Please revert the unwanted changes; only your changes should be in the diff.


---


[GitHub] carbondata pull request #2983: [CARBONDATA-3119] Fixed SDK Write for Complex...

2018-12-11 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2983#discussion_r240889074
  
--- Diff: 
integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/complexType/TestComplexDataType.scala
 ---
@@ -39,6 +39,27 @@ class TestComplexDataType extends QueryTest with 
BeforeAndAfterAll {
   .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, 
badRecordAction)
   }
 
+  test("test Projection PushDown for Array - String type when Array is 
Empty") {
+CarbonProperties.getInstance()
+  .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, "FAIL")
--- End diff --

Take a backup of the previous CARBON_BAD_RECORDS_ACTION property and set it back 
after this test case, else it affects other test suites.


---


[GitHub] carbondata pull request #2983: [CARBONDATA-3119] Fixed SDK Write for Complex...

2018-12-11 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2983#discussion_r240889108
  
--- Diff: 
integration/spark-common-test/src/test/scala/org/apache/carbondata/integration/spark/testsuite/complexType/TestComplexDataType.scala
 ---
@@ -39,6 +39,27 @@ class TestComplexDataType extends QueryTest with 
BeforeAndAfterAll {
   .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, 
badRecordAction)
   }
 
+  test("test Projection PushDown for Array - String type when Array is 
Empty") {
+CarbonProperties.getInstance()
+  .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, "FAIL")
+sql("drop table if exists table1")
+sql("create table table1 (detail array) stored by 
'carbondata'")
+sql("insert into table1 values('')")
+checkAnswer(sql("select detail[0] from table1"), Seq(Row("")))
+sql("drop table if exists table1")
+  }
+
+  test("test Projection PushDown for Struct - Array type when Array is 
Empty") {
+CarbonProperties.getInstance()
+  .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, "FAIL")
--- End diff --

same as above


---


[jira] [Created] (CARBONDATA-3163) If table has different time format, for no_sort columns data goes as bad record (null) for second table when loaded after first table.

2018-12-11 Thread Ajantha Bhat (JIRA)
Ajantha Bhat created CARBONDATA-3163:


 Summary: If table has different time format, for no_sort columns 
data goes as bad record (null) for second table when loaded after first table.
 Key: CARBONDATA-3163
 URL: https://issues.apache.org/jira/browse/CARBONDATA-3163
 Project: CarbonData
  Issue Type: Bug
Reporter: Ajantha Bhat
Assignee: Ajantha Bhat


If tables have different time formats, data for no_sort columns goes as a bad record 
(null) for the second table when it is loaded after the first table.
FilterProcessorTestCase.test("Between  filter")



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Created] (CARBONDATA-3164) During no_sort, excpetion happend at converter step is not reached to user. same problem in SDK and spark file format flow also.

2018-12-11 Thread Ajantha Bhat (JIRA)
Ajantha Bhat created CARBONDATA-3164:


 Summary: During no_sort, excpetion happend at converter step is 
not reached to user. same problem in SDK and spark file format flow also.
 Key: CARBONDATA-3164
 URL: https://issues.apache.org/jira/browse/CARBONDATA-3164
 Project: CarbonData
  Issue Type: Bug
Reporter: Ajantha Bhat
Assignee: Ajantha Bhat


During no_sort, an exception that happens at the converter step is not reported to 
the user. The same problem exists in the SDK and Spark file format flows also.

TestLoadDataGeneral.test("test load / insert / update with data more than 32000 bytes - dictionary_exclude")



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[jira] [Created] (CARBONDATA-3162) Range filters doesn't remove null values for no_sort direct dictionary dimension columns.

2018-12-11 Thread Ajantha Bhat (JIRA)
Ajantha Bhat created CARBONDATA-3162:


 Summary: Range filters doesn't remove null values for no_sort 
direct dictionary dimension columns.
 Key: CARBONDATA-3162
 URL: https://issues.apache.org/jira/browse/CARBONDATA-3162
 Project: CarbonData
  Issue Type: Bug
Reporter: Ajantha Bhat
Assignee: Ajantha Bhat


Range filters don't remove null values for no_sort direct dictionary dimension 
columns.

TimestampDataTypeDirectDictionaryTest.test("test timestamp with dictionary include and no_inverted index")



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[GitHub] carbondata issue #2982: [CARBONDATA-3158] support presto-carbon to read sdk ...

2018-12-10 Thread ajantha-bhat
Github user ajantha-bhat commented on the issue:

https://github.com/apache/carbondata/pull/2982
  
@ravipesala , @jackylk : PR is ready. please review


---


[GitHub] carbondata pull request #2982: [CARBONDATA-3158] support presto-carbon to re...

2018-12-10 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2982#discussion_r240477368
  
--- Diff: 
integration/presto/src/main/java/org/apache/carbondata/presto/impl/CarbonTableReader.java
 ---
@@ -364,23 +355,38 @@ private CarbonTable 
parseCarbonMetadata(SchemaTableName table) {
   String tablePath = storePath + "/" + 
carbonTableIdentifier.getDatabaseName() + "/"
   + carbonTableIdentifier.getTableName();
 
-  //Step 2: read the metadata (tableInfo) of the table.
-  ThriftReader.TBaseCreator createTBase = new 
ThriftReader.TBaseCreator() {
-// TBase is used to read and write thrift objects.
-// TableInfo is a kind of TBase used to read and write table 
information.
-// TableInfo is generated by thrift,
-// see schema.thrift under format/src/main/thrift for details.
-public TBase create() {
-  return new org.apache.carbondata.format.TableInfo();
+  String metadataPath = CarbonTablePath.getSchemaFilePath(tablePath);
+  boolean isTransactionalTable = false;
+  try {
+if (FileFactory.getCarbonFile(metadataPath)
--- End diff --

hmm. ok.


---


[GitHub] carbondata pull request #2982: [CARBONDATA-3158] support presto-carbon to re...

2018-12-10 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2982#discussion_r240475287
  
--- Diff: 
integration/presto/src/main/java/org/apache/carbondata/presto/impl/CarbonTableReader.java
 ---
@@ -364,23 +355,38 @@ private CarbonTable 
parseCarbonMetadata(SchemaTableName table) {
   String tablePath = storePath + "/" + 
carbonTableIdentifier.getDatabaseName() + "/"
   + carbonTableIdentifier.getTableName();
 
-  //Step 2: read the metadata (tableInfo) of the table.
-  ThriftReader.TBaseCreator createTBase = new 
ThriftReader.TBaseCreator() {
-// TBase is used to read and write thrift objects.
-// TableInfo is a kind of TBase used to read and write table 
information.
-// TableInfo is generated by thrift,
-// see schema.thrift under format/src/main/thrift for details.
-public TBase create() {
-  return new org.apache.carbondata.format.TableInfo();
+  String metadataPath = CarbonTablePath.getSchemaFilePath(tablePath);
+  boolean isTransactionalTable = false;
+  try {
+if (FileFactory.getCarbonFile(metadataPath)
+.isFileExist(metadataPath, 
FileFactory.getFileType(metadataPath))) {
+  // If metadata folder exists, it is a transactional table
+  isTransactionalTable = true;
 }
-  };
-  ThriftReader thriftReader =
-  new ThriftReader(CarbonTablePath.getSchemaFilePath(tablePath), 
createTBase);
-  thriftReader.open();
-  org.apache.carbondata.format.TableInfo tableInfo =
-  (org.apache.carbondata.format.TableInfo) thriftReader.read();
-  thriftReader.close();
-
+  } catch (IOException e) {
+throw new RuntimeException(e);
+  }
+  org.apache.carbondata.format.TableInfo tableInfo;
+  if (isTransactionalTable) {
+//Step 2: read the metadata (tableInfo) of the table.
+ThriftReader.TBaseCreator createTBase = new 
ThriftReader.TBaseCreator() {
+  // TBase is used to read and write thrift objects.
+  // TableInfo is a kind of TBase used to read and write table 
information.
+  // TableInfo is generated by thrift,
+  // see schema.thrift under format/src/main/thrift for details.
+  public TBase create() {
+return new org.apache.carbondata.format.TableInfo();
+  }
+};
+ThriftReader thriftReader =
+new ThriftReader(CarbonTablePath.getSchemaFilePath(tablePath), 
createTBase);
+thriftReader.open();
+tableInfo = (org.apache.carbondata.format.TableInfo) 
thriftReader.read();
+thriftReader.close();
+  } else {
+tableInfo =
+CarbonUtil.inferSchema(tablePath, table.getTableName(), false, 
new Configuration());
--- End diff --

I have tested it; it works.

But it is better to use FileFactory.getConfiguration(). I will change to it.


---


[GitHub] carbondata pull request #2982: [CARBONDATA-3158] support presto-carbon to re...

2018-12-10 Thread ajantha-bhat
GitHub user ajantha-bhat opened a pull request:

https://github.com/apache/carbondata/pull/2982

[CARBONDATA-3158] support presto-carbon to read sdk cabron files

**problem:**
Currently, carbon SDK files output (files without metadata folder and its
contents) are read by spark using an external table with carbon session.
But presto carbon integration doesn't support that. It can currently read
only the transactional table output files.

**solution:** 
Hence we can enhance presto to read SDK output files. This will increase
the use cases for presto-carbon integration.

The above scenario can be achieved by inferring the schema if the metadata folder
does not exist, and by setting the read committed scope to
LatestFilesReadCommittedScope when non-transactional table output files are present.

Be sure to do all of the following checklist to help us incorporate 
your contribution quickly and easily:

 - [ ] Any interfaces changed? NA
 
 - [ ] Any backward compatibility impacted? NA
 
 - [ ] Document update required? NA

 - [ ] Testing done.  
yes, added UT   
 - [ ] For large changes, please consider breaking it into sub-tasks under 
an umbrella JIRA. NA



You can merge this pull request into a Git repository by running:

$ git pull https://github.com/ajantha-bhat/carbondata presto

Alternatively you can review and apply these changes as the patch at:

https://github.com/apache/carbondata/pull/2982.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

This closes #2982


commit e54253eff4546945a8c5a2543ac5ed5aa12df85b
Author: ajantha-bhat 
Date:   2018-12-07T13:07:10Z

presto with sdk




---


[jira] [Created] (CARBONDATA-3158) support presto-carbon to read sdk cabron files

2018-12-10 Thread Ajantha Bhat (JIRA)
Ajantha Bhat created CARBONDATA-3158:


 Summary: support presto-carbon to read sdk cabron files
 Key: CARBONDATA-3158
 URL: https://issues.apache.org/jira/browse/CARBONDATA-3158
 Project: CarbonData
  Issue Type: Improvement
Reporter: Ajantha Bhat
Assignee: Ajantha Bhat


Currently, carbon SDK files output (files without metadata folder and its 
contents) are read by spark using an external table with carbon session. 
But presto carbon integration doesn't support that. It can currently read 
only the transactional table output files. 

Hence we can enhance presto to read SDK output files. This will increase 
the use cases for presto-carbon integration. 

The above scenario can be achieved by inferring the schema if the metadata folder 
does not exist, and by setting the read committed scope to 
LatestFilesReadCommittedScope when non-transactional table output files are present.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[GitHub] carbondata issue #2966: [WIP] test and check no sort by default

2018-11-30 Thread ajantha-bhat
Github user ajantha-bhat commented on the issue:

https://github.com/apache/carbondata/pull/2966
  
retest this please


---


[GitHub] carbondata pull request #2966: [WIP] test and check no sort by default

2018-11-29 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2966#discussion_r237747880
  
--- Diff: 
integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/datamap/DataMapWriterSuite.scala
 ---
@@ -156,8 +156,7 @@ class DataMapWriterSuite extends QueryTest with 
BeforeAndAfterAll {
 CarbonProperties.getInstance()
   .addProperty("carbon.blockletgroup.size.in.mb", "16")
 CarbonProperties.getInstance()
-  .addProperty("carbon.number.of.cores.while.loading",
-CarbonCommonConstants.NUM_CORES_DEFAULT_VAL)
+  .addProperty("carbon.number.of.cores.while.loading", "2")
--- End diff --

And @qiuchenjian: if you have a doubt about the PR changes, you can ask it in the 
comments section. Don't add it as a review comment; just add it as a comment.


---


[GitHub] carbondata pull request #2966: [WIP] test and check no sort by default

2018-11-29 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2966#discussion_r237747693
  
--- Diff: 
integration/spark-common-test/src/test/scala/org/apache/carbondata/spark/testsuite/datamap/DataMapWriterSuite.scala
 ---
@@ -156,8 +156,7 @@ class DataMapWriterSuite extends QueryTest with 
BeforeAndAfterAll {
 CarbonProperties.getInstance()
   .addProperty("carbon.blockletgroup.size.in.mb", "16")
 CarbonProperties.getInstance()
-  .addProperty("carbon.number.of.cores.while.loading",
-CarbonCommonConstants.NUM_CORES_DEFAULT_VAL)
+  .addProperty("carbon.number.of.cores.while.loading", "2")
--- End diff --

CarbonCommonConstants.NUM_CORES_DEFAULT_VAL is removed, so setting 
CarbonCommonConstants.NUM_CORES_LOADING as 2. It is test code.


---


[GitHub] carbondata pull request #2964: [HOTFIX] Fix ArrayOutOfBound exception when d...

2018-11-29 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2964#discussion_r237548880
  
--- Diff: 
store/sdk/src/test/java/org/apache/carbondata/sdk/file/CarbonReaderTest.java ---
@@ -575,7 +575,7 @@ public void testReadColumnTwice() throws IOException, 
InterruptedException {
 
 CarbonReader reader = CarbonReader
 .builder(path, "_temp")
-.projection(new String[]{"name", "name", "age", "name"})
+.projection(new String[]{"name", "age", "age", "name"})
--- End diff --

@qiuchenjian: No, age is an int column (measure); check the schema in the writer 
builder. 

No need for a new test case; this test case now handles testing duplicate 
measures and dimensions.


---


[GitHub] carbondata pull request #2966: [WIP] test and check no sort by default

2018-11-29 Thread ajantha-bhat
GitHub user ajantha-bhat opened a pull request:

https://github.com/apache/carbondata/pull/2966

[WIP] test and check no sort by default

[WIP] test and check no sort by default

Be sure to do all of the following checklist to help us incorporate 
your contribution quickly and easily:

 - [ ] Any interfaces changed?
 
 - [ ] Any backward compatibility impacted?
 
 - [ ] Document update required?

 - [ ] Testing done
Please provide details on 
- Whether new unit test cases have been added or why no new tests 
are required?
- How it is tested? Please attach test report.
- Is it a performance related change? Please attach the performance 
test report.
- Any additional information to help reviewers in testing this 
change.
   
 - [ ] For large changes, please consider breaking it into sub-tasks under 
an umbrella JIRA. 



You can merge this pull request into a Git repository by running:

$ git pull https://github.com/ajantha-bhat/carbondata optimize

Alternatively you can review and apply these changes as the patch at:

https://github.com/apache/carbondata/pull/2966.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

This closes #2966


commit 09410167efee56d69a3c18433c8e6e97a2fe18eb
Author: ajantha-bhat 
Date:   2018-11-29T15:59:34Z

no sort by default




---


[GitHub] carbondata issue #2964: [HOTFIX] Fix ArrayOutOfBound exception when duplicat...

2018-11-29 Thread ajantha-bhat
Github user ajantha-bhat commented on the issue:

https://github.com/apache/carbondata/pull/2964
  
@kunal642 , @kumarvishal09 : please check this


---


[GitHub] carbondata pull request #2964: [HOTFIX] Fix ArrayOutOfBound exception when d...

2018-11-29 Thread ajantha-bhat
GitHub user ajantha-bhat opened a pull request:

https://github.com/apache/carbondata/pull/2964

[HOTFIX] Fix ArrayOutOfBound exception when duplicate measure in projection 
column

problem: ArrayOutOfBound exception when there is a duplicate measure in the projection 
columns.

cause: In the query executor, when the reusable buffer is formed, only the unique 
values were being considered. All the projections need to be considered.

solution: consider all the projections while forming the reusable buffer.
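
As a hedged illustration of the fix (names are hypothetical, not the actual executor code): the reusable buffers are sized by the full projection list, duplicates included, rather than by the distinct column set.

    // one reusable buffer slot per projection entry, so a projection like
    // "name, age, age, name" gets four slots instead of two
    static Object[] createReusableBuffers(String[] projectionColumns) {
      Object[] buffers = new Object[projectionColumns.length];
      for (int i = 0; i < projectionColumns.length; i++) {
        buffers[i] = new Object();   // placeholder for the per-column buffer
      }
      return buffers;
    }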

Be sure to do all of the following checklist to help us incorporate 
your contribution quickly and easily:

 - [ ] Any interfaces changed? NA
 
 - [ ] Any backward compatibility impacted? NA
 
 - [ ] Document update required? NA

 - [ ] Testing done
yes, updated UT.
   
 - [ ] For large changes, please consider breaking it into sub-tasks under 
an umbrella JIRA.  NA



You can merge this pull request into a Git repository by running:

$ git pull https://github.com/ajantha-bhat/carbondata sdk

Alternatively you can review and apply these changes as the patch at:

https://github.com/apache/carbondata/pull/2964.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

This closes #2964


commit ca6fd01b92844f43f2472ccf3f17d498bbd73216
Author: ajantha-bhat 
Date:   2018-11-29T11:42:56Z

fix ArrayOutOfBound exception when duplicate measure in projection column




---


[GitHub] carbondata pull request #2936: [CARBONDATA-3118] Parallelize block pruning o...

2018-11-28 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2936#discussion_r237174006
  
--- Diff: 
core/src/main/java/org/apache/carbondata/core/datamap/TableDataMap.java ---
@@ -63,6 +75,8 @@
 
   private SegmentPropertiesFetcher segmentPropertiesFetcher;
 
+  private static final Log LOG = LogFactory.getLog(TableDataMap.class);
--- End diff --

ok


---


[GitHub] carbondata pull request #2936: [CARBONDATA-3118] Parallelize block pruning o...

2018-11-28 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2936#discussion_r237173860
  
--- Diff: 
core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java
 ---
@@ -1399,6 +1399,17 @@ private CarbonCommonConstants() {
 
   public static final String CARBON_PUSH_ROW_FILTERS_FOR_VECTOR_DEFAULT = 
"false";
 
+  /**
+   * max driver threads used for block pruning [1 to 4 threads]
+   */
+  @CarbonProperty public static final String 
CARBON_MAX_DRIVER_THREADS_FOR_BLOCK_PRUNING =
+  "carbon.max.driver.threads.for.block.pruning";
--- End diff --

I have another non-default datamap PR; I will check about this there. I feel this 
name is also OK.


---


[GitHub] carbondata pull request #2936: [CARBONDATA-3118] Parallelize block pruning o...

2018-11-28 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2936#discussion_r237173725
  
--- Diff: 
core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java
 ---
@@ -1399,6 +1399,17 @@ private CarbonCommonConstants() {
 
   public static final String CARBON_PUSH_ROW_FILTERS_FOR_VECTOR_DEFAULT = 
"false";
 
+  /**
+   * max driver threads used for block pruning [1 to 4 threads]
+   */
+  @CarbonProperty public static final String 
CARBON_MAX_DRIVER_THREADS_FOR_BLOCK_PRUNING =
+  "carbon.max.driver.threads.for.block.pruning";
+
+  public static final String 
CARBON_MAX_DRIVER_THREADS_FOR_BLOCK_PRUNING_DEFAULT = "4";
+
+  // block prune in multi-thread if files size more than 100K files.
+  public static final int 
CARBON_DRIVER_PRUNING_MULTI_THREAD_ENABLE_FILES_COUNT = 10;
--- End diff --

Because the driver doing multi-threaded pruning by default may impact concurrent 
queries. Also, by testing we observed that pruning 100K datamap entries takes about 
1 second, so multi-threading is used only when block pruning would take more than a 
second.
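
A hedged sketch of that gating rule, using the threshold constant from the diff above (the totalFiles and thread-count parameters are illustrative):

    static int threadsForPruning(int totalFiles, int configuredDriverThreads) {
      // stay single-threaded unless the file count crosses the configured threshold
      return totalFiles >
          CarbonCommonConstants.CARBON_DRIVER_PRUNING_MULTI_THREAD_ENABLE_FILES_COUNT
          ? configuredDriverThreads : 1;
    }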


---


[GitHub] carbondata pull request #2936: [CARBONDATA-3118] Parallelize block pruning o...

2018-11-28 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2936#discussion_r237173126
  
--- Diff: 
hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonInputFormat.java ---
@@ -487,6 +487,8 @@ private int getBlockCount(List 
blocklets) {
 // First prune using default datamap on driver side.
 TableDataMap defaultDataMap = 
DataMapStoreManager.getInstance().getDefaultDataMap(carbonTable);
 List prunedBlocklets = null;
+// This is to log the event, so user will know what is happening by 
seeing logs.
+LOG.info("Started block pruning ...");
--- End diff --

The log will anyway have a timestamp, so we can subtract the start time from the 
stop time. I have another non-default datamap PR; I will check about this there.


---


[GitHub] carbondata issue #2962: [CARBONDATA-3138] Fix random count mismatch with mul...

2018-11-28 Thread ajantha-bhat
Github user ajantha-bhat commented on the issue:

https://github.com/apache/carbondata/pull/2962
  
@ravipesala , @kumarvishal09 : please check this


---


[GitHub] carbondata pull request #2962: [CARBONDATA-3138] Fix random count mismatch w...

2018-11-28 Thread ajantha-bhat
GitHub user ajantha-bhat opened a pull request:

https://github.com/apache/carbondata/pull/2962

[CARBONDATA-3138] Fix random count mismatch with multi-thread block pruning

problem: Random count mismatch in query in multi-thread block-pruning 
scenario.

cause: The existing prune method was not meant for multi-threading, as 
synchronization was missing. Only in the implicit filter scenario, while 
preparing the block ID list, synchronization was missing; hence pruning was 
giving a wrong result.

solution: synchronize the implicit filter preparation, as prune is now called 
from multiple threads.
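
A minimal sketch of the idea (assumed names, not the actual PR code): writes to a shared block ID list must go through a synchronized collection when prune() runs on several driver threads.

```
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;

// Hypothetical holder for the implicit filter's block ID list. When prune()
// is called from several pruning threads, writes to the shared list must be
// synchronized, otherwise the block ID list can get corrupted and the query
// returns a wrong row count.
class ImplicitFilterBlockIdHolder {
  private final List<String> blockIds =
      Collections.synchronizedList(new ArrayList<String>());

  // Each pruning thread contributes the block IDs it selected for its segments.
  void addBlockIds(Set<String> blockIdsOfSegment) {
    blockIds.addAll(blockIdsOfSegment);
  }

  List<String> getBlockIds() {
    return blockIds;
  }
}
```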

Be sure to do all of the following checklist to help us incorporate 
your contribution quickly and easily:

 - [ ] Any interfaces changed? NA
 
 - [ ] Any backward compatibility impacted? NA
 
 - [ ] Document update required? NA

 - [ ] Testing done
   done with huge data   
 
- [ ] For large changes, please consider breaking it into sub-tasks under 
an umbrella JIRA. 



You can merge this pull request into a Git repository by running:

$ git pull https://github.com/ajantha-bhat/carbondata issue_fix

Alternatively you can review and apply these changes as the patch at:

https://github.com/apache/carbondata/pull/2962.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

This closes #2962


commit 09a469e926d43b4992084e5dc1727899ee04366f
Author: ajantha-bhat 
Date:   2018-11-28T13:48:16Z

random count mismatch with multi-thread block pruning




---


[jira] [Created] (CARBONDATA-3138) Random count mismatch in query in multi-thread block-pruning scenario

2018-11-28 Thread Ajantha Bhat (JIRA)
Ajantha Bhat created CARBONDATA-3138:


 Summary: Random count mismatch in query in multi-thread 
block-pruning scenario
 Key: CARBONDATA-3138
 URL: https://issues.apache.org/jira/browse/CARBONDATA-3138
 Project: CarbonData
  Issue Type: Bug
Reporter: Ajantha Bhat
Assignee: Ajantha Bhat


problem: Random count mismatch in query in multi-thread block-pruning scenario.

cause: The existing prune method was not meant for multi-threading, as 
synchronization was missing.

Only in the implicit filter scenario, while preparing the block ID list, 
synchronization was missing; hence pruning was giving a wrong result.

solution: synchronize the implicit filter preparation, as prune is now called 
from multiple threads.

 

 



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[GitHub] carbondata pull request #2949: [WIP] support parallel block pruning for non-...

2018-11-27 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2949#discussion_r236746764
  
--- Diff: 
core/src/main/java/org/apache/carbondata/core/datamap/dev/DataMap.java ---
@@ -70,4 +70,6 @@ void init(DataMapModel dataMapModel)
*/
   void finish();
 
+  // can return , number of records information that are stored in datamap.
--- End diff --

ok, changed to just "returns"


---


[GitHub] carbondata pull request #2958: [CARBONDATA-3136] JVM crash with preaggregate...

2018-11-27 Thread ajantha-bhat
GitHub user ajantha-bhat opened a pull request:

https://github.com/apache/carbondata/pull/2958

[CARBONDATA-3136] JVM crash with preaggregate datamap when average of 
decimal column is taken with orderby.  

problem: JVM crash with preaggregate datamap when the average of a decimal 
column is taken with order by.

cause: When preparing the plan with the preaggregate datamap, decimal is cast to 
double in the average expression. This was leading to a JVM crash in Spark as we 
were filling the vector with the wrong precision (callstack mentioned in the JIRA).

solution: the division result of the average should be cast to decimal instead 
of double for the decimal datatype.
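
A small standalone illustration of the idea (not the project's plan-rewrite code): for a decimal column, the sum/count division behind the average should stay a decimal with a known scale instead of being converted to double. The class name and values below are only for illustration.

```
import java.math.BigDecimal;
import java.math.RoundingMode;

public class DecimalAvgSketch {
  public static void main(String[] args) {
    // avg(decimal_col) is sum/count; for a decimal(30,16) column the result
    // should remain a decimal with the declared scale, not a double.
    BigDecimal sum = new BigDecimal("452.5640000000000000");
    long count = 1;
    BigDecimal avgAsDecimal =
        sum.divide(BigDecimal.valueOf(count), 16, RoundingMode.HALF_UP);
    double avgAsDouble = sum.doubleValue() / count; // drops the declared precision/scale

    System.out.println("decimal avg: " + avgAsDecimal);
    System.out.println("double avg : " + avgAsDouble);
  }
}
```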

Be sure to do all of the following checklist to help us incorporate 
your contribution quickly and easily:

 - [ ] Any interfaces changed? NA
 
 - [ ] Any backward compatibility impacted? NA
 
 - [ ] Document update required? NA

 - [ ] Testing done.   
   yes, added UT   

 - [ ] For large changes, please consider breaking it into sub-tasks under 
an umbrella JIRA.  NA



You can merge this pull request into a Git repository by running:

$ git pull https://github.com/ajantha-bhat/carbondata issue_fix

Alternatively you can review and apply these changes as the patch at:

https://github.com/apache/carbondata/pull/2958.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

This closes #2958


commit 8d95838e5d5991d7c355944d40a54972ea1c1424
Author: ajantha-bhat 
Date:   2018-11-27T14:07:49Z

jvm crash when query pre-aggreagte table with avg(decimal_column) and order 
by




---


[jira] [Updated] (CARBONDATA-3136) JVM crash with preaggregate datamap

2018-11-27 Thread Ajantha Bhat (JIRA)


 [ 
https://issues.apache.org/jira/browse/CARBONDATA-3136?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Ajantha Bhat updated CARBONDATA-3136:
-
Description: 
JVM crash with preaggregate datamap.

callstack:

Stack: [0x7efebd49a000,0x7efebd59b000],  sp=0x7efebd598dc8,  free 
space=1019k
 Native frames: (J=compiled Java code, j=interpreted, Vv=VM code, C=native code)
 V  [libjvm.so+0x7b2b50]
 J 7620  sun.misc.Unsafe.copyMemory(Ljava/lang/Object;JLjava/lang/Object;JJ)V 
(0 bytes) @ 0x7eff4a3479e1 [0x7eff4a347900+0xe1]
 j  
org.apache.spark.unsafe.Platform.copyMemory(Ljava/lang/Object;JLjava/lang/Object;JJ)V+34
 j  org.apache.spark.sql.catalyst.expressions.UnsafeRow.getBinary(I)[B+54
 j  
org.apache.spark.sql.catalyst.expressions.UnsafeRow.getDecimal(III)Lorg/apache/spark/sql/types/Decimal;+30
 j  
org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Lorg/apache/spark/sql/catalyst/InternalRow;)Lorg/apache/spark/sql/catalyst/expressions/UnsafeRow;+36
 j  
org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Ljava/lang/Object;)Ljava/lang/Object;+5
 J 3104 C1 scala.collection.Iterator$$anon$11.next()Ljava/lang/Object; (19 
bytes) @ 0x7eff49154724 [0x7eff49154560+0x1c4]
 j  
org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(Lscala/collection/Iterator;)Lscala/collection/Iterator;+78
 j  
org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(Ljava/lang/Object;)Ljava/lang/Object;+5
 J 14007 C1 
org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;
 (17 bytes) @ 0x7eff4a6ed204 [0x7eff4a6ecc40+0x5c4]
 J 11684 C1 
org.apache.spark.rdd.MapPartitionsRDD.compute(Lorg/apache/spark/Partition;Lorg/apache/spark/TaskContext;)Lscala/collection/Iterator;
 (36 bytes) @ 0x7eff4ad11274 [0x7eff4ad10f60+0x314]
 J 13771 C1 
org.apache.spark.rdd.RDD.iterator(Lorg/apache/spark/Partition;Lorg/apache/spark/TaskContext;)Lscala/collection/Iterator;
 (46 bytes) @ 0x7eff4b39dd3c  [0x7eff4b39d160+0xbdc]

 

 

test("Test Pre_aggregate with decimal column with order by") {
  sql("drop table if exists maintable")
  sql("create table maintable(name string, decimal_col decimal(30,16)) stored by 'carbondata'")
  sql("insert into table maintable select 'abc',452.564")
  sql(
    "create datamap ag1 on table maintable using 'preaggregate' as select name,avg(decimal_col)" +
    " from maintable group by name")
  checkAnswer(sql("select avg(decimal_col) from maintable group by name order by name"),
    Seq(Row(452.5640)))
}

  was:
JVM crash with preaggregate datamap.

callstack:

Stack: [0x7efebd49a000,0x7efebd59b000],  sp=0x7efebd598dc8,  free 
space=1019k
Native frames: (J=compiled Java code, j=interpreted, Vv=VM code, C=native code)
V  [libjvm.so+0x7b2b50]
J 7620  sun.misc.Unsafe.copyMemory(Ljava/lang/Object;JLjava/lang/Object;JJ)V (0 
bytes) @ 0x7eff4a3479e1 [0x7eff4a347900+0xe1]
j  
org.apache.spark.unsafe.Platform.copyMemory(Ljava/lang/Object;JLjava/lang/Object;JJ)V+34
j  org.apache.spark.sql.catalyst.expressions.UnsafeRow.getBinary(I)[B+54
j  
org.apache.spark.sql.catalyst.expressions.UnsafeRow.getDecimal(III)Lorg/apache/spark/sql/types/Decimal;+30
j  
org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Lorg/apache/spark/sql/catalyst/InternalRow;)Lorg/apache/spark/sql/catalyst/expressions/UnsafeRow;+36
j  
org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Ljava/lang/Object;)Ljava/lang/Object;+5
J 3104 C1 scala.collection.Iterator$$anon$11.next()Ljava/lang/Object; (19 
bytes) @ 0x7eff49154724 [0x7eff49154560+0x1c4]
j  
org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(Lscala/collection/Iterator;)Lscala/collection/Iterator;+78
j  
org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(Ljava/lang/Object;)Ljava/lang/Object;+5
J 14007 C1 
org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;
 (17 bytes) @ 0x7eff4a6ed204 [0x7eff4a6ecc40+0x5c4]
J 11684 C1 
org.apache.spark.rdd.MapPartitionsRDD.compute(Lorg/apache/spark/Partition;Lorg/apache/spark/TaskContext;)Lscala/collection/Iterator;
 (36 bytes) @ 0x7eff4ad11274 [0x7eff4ad10f60+0x314]
J 13771 C1 
org.apache.spark.rdd.RDD.iterator(Lorg/apache/spark/Partition;Lorg/apache/spark/TaskContext;)Lscala/collection/Iterator;
 (46 bytes) @ 0x7eff4b39dd

[jira] [Created] (CARBONDATA-3136) JVM crash with preaggregate datamap

2018-11-27 Thread Ajantha Bhat (JIRA)
Ajantha Bhat created CARBONDATA-3136:


 Summary: JVM crash with preaggregate datamap
 Key: CARBONDATA-3136
 URL: https://issues.apache.org/jira/browse/CARBONDATA-3136
 Project: CarbonData
  Issue Type: Improvement
Reporter: Ajantha Bhat
Assignee: Ajantha Bhat


JVM crash with preaggregate datamap.

callstack:

Stack: [0x7efebd49a000,0x7efebd59b000],  sp=0x7efebd598dc8,  free 
space=1019k
Native frames: (J=compiled Java code, j=interpreted, Vv=VM code, C=native code)
V  [libjvm.so+0x7b2b50]
J 7620  sun.misc.Unsafe.copyMemory(Ljava/lang/Object;JLjava/lang/Object;JJ)V (0 
bytes) @ 0x7eff4a3479e1 [0x7eff4a347900+0xe1]
j  
org.apache.spark.unsafe.Platform.copyMemory(Ljava/lang/Object;JLjava/lang/Object;JJ)V+34
j  org.apache.spark.sql.catalyst.expressions.UnsafeRow.getBinary(I)[B+54
j  
org.apache.spark.sql.catalyst.expressions.UnsafeRow.getDecimal(III)Lorg/apache/spark/sql/types/Decimal;+30
j  
org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Lorg/apache/spark/sql/catalyst/InternalRow;)Lorg/apache/spark/sql/catalyst/expressions/UnsafeRow;+36
j  
org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Ljava/lang/Object;)Ljava/lang/Object;+5
J 3104 C1 scala.collection.Iterator$$anon$11.next()Ljava/lang/Object; (19 
bytes) @ 0x7eff49154724 [0x7eff49154560+0x1c4]
j  
org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(Lscala/collection/Iterator;)Lscala/collection/Iterator;+78
j  
org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(Ljava/lang/Object;)Ljava/lang/Object;+5
J 14007 C1 
org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;
 (17 bytes) @ 0x7eff4a6ed204 [0x7eff4a6ecc40+0x5c4]
J 11684 C1 
org.apache.spark.rdd.MapPartitionsRDD.compute(Lorg/apache/spark/Partition;Lorg/apache/spark/TaskContext;)Lscala/collection/Iterator;
 (36 bytes) @ 0x7eff4ad11274 [0x7eff4ad10f60+0x314]
J 13771 C1 
org.apache.spark.rdd.RDD.iterator(Lorg/apache/spark/Partition;Lorg/apache/spark/TaskContext;)Lscala/collection/Iterator;
 (46 bytes) @ 0x7eff4b39dd3c  [0x7eff4b39d160+0xbdc]



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[GitHub] carbondata issue #2936: [CARBONDATA-3118] Parallelize block pruning of defau...

2018-11-23 Thread ajantha-bhat
Github user ajantha-bhat commented on the issue:

https://github.com/apache/carbondata/pull/2936
  
retest this please


---


[GitHub] carbondata issue #2936: [CARBONDATA-3118] Parallelize block pruning of defau...

2018-11-23 Thread ajantha-bhat
Github user ajantha-bhat commented on the issue:

https://github.com/apache/carbondata/pull/2936
  
Resolved a conflict with the documentation file. Let the build run again.


---


[GitHub] carbondata pull request #2949: [WIP] support parallel block pruning for non-...

2018-11-23 Thread ajantha-bhat
GitHub user ajantha-bhat opened a pull request:

https://github.com/apache/carbondata/pull/2949

[WIP] support parallel block pruning for non-default datamaps

[WIP] support parallel block pruning for non-default datamaps

This PR dependent on #2936 

Be sure to do all of the following checklist to help us incorporate 
your contribution quickly and easily:

 - [ ] Any interfaces changed?
 
 - [ ] Any backward compatibility impacted?
 
 - [ ] Document update required?

 - [ ] Testing done
Please provide details on 
- Whether new unit test cases have been added or why no new tests 
are required?
- How it is tested? Please attach test report.
- Is it a performance related change? Please attach the performance 
test report.
- Any additional information to help reviewers in testing this 
change.
   
 - [ ] For large changes, please consider breaking it into sub-tasks under 
an umbrella JIRA. 



You can merge this pull request into a Git repository by running:

$ git pull https://github.com/ajantha-bhat/carbondata working_backup

Alternatively you can review and apply these changes as the patch at:

https://github.com/apache/carbondata/pull/2949.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

This closes #2949


commit 6237d69fcc0ddc1a08c74579762b721108a251fe
Author: ajantha-bhat 
Date:   2018-11-20T16:45:06Z

parllelize block pruning

commit e8e912daf3ada357352e006ec9ce435d4c4b1625
Author: ajantha-bhat 
Date:   2018-11-22T11:01:53Z

reveiw comment fix

commit d0bf82f276618f6fa09cbce65f714394b5fa4e0c
Author: ajantha-bhat 
Date:   2018-11-23T13:22:07Z

support parallel pruning for non-default datamaps




---


[GitHub] carbondata issue #2936: [CARBONDATA-3118] Parallelize block pruning of defau...

2018-11-23 Thread ajantha-bhat
Github user ajantha-bhat commented on the issue:

https://github.com/apache/carbondata/pull/2936
  
@ravipesala : PR is ready. please check


---


[GitHub] carbondata pull request #2936: [CARBONDATA-3118] Parallelize block pruning o...

2018-11-22 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2936#discussion_r235846200
  
--- Diff: 
core/src/main/java/org/apache/carbondata/core/datamap/TableDataMap.java ---
@@ -120,37 +132,166 @@ public BlockletDetailsFetcher 
getBlockletDetailsFetcher() {
* @param filterExp
* @return
*/
-  public List prune(List segments, 
FilterResolverIntf filterExp,
-  List partitions) throws IOException {
-List blocklets = new ArrayList<>();
-SegmentProperties segmentProperties;
-Map> dataMaps = 
dataMapFactory.getDataMaps(segments);
+  public List prune(List segments, final 
FilterResolverIntf filterExp,
+  final List partitions) throws IOException {
+final List blocklets = new ArrayList<>();
+final Map> dataMaps = 
dataMapFactory.getDataMaps(segments);
+// for non-filter queries
+if (filterExp == null) {
+  // if filter is not passed, then return all the blocklets.
+  return pruneWithoutFilter(segments, partitions, blocklets);
+}
+// for filter queries
+int totalFiles = 0;
+boolean isBlockDataMapType = true;
+for (Segment segment : segments) {
+  for (DataMap dataMap : dataMaps.get(segment)) {
+if (!(dataMap instanceof BlockDataMap)) {
--- End diff --

For this one, I have to figure out the number of entries in all kinds of datamaps 
and need to test those scenarios. I will handle it in the next PR.


---


[GitHub] carbondata pull request #2936: [CARBONDATA-3118] Parallelize block pruning o...

2018-11-22 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2936#discussion_r235842114
  
--- Diff: 
core/src/main/java/org/apache/carbondata/core/datamap/TableDataMap.java ---
@@ -120,37 +132,166 @@ public BlockletDetailsFetcher 
getBlockletDetailsFetcher() {
* @param filterExp
* @return
*/
-  public List prune(List segments, 
FilterResolverIntf filterExp,
-  List partitions) throws IOException {
-List blocklets = new ArrayList<>();
-SegmentProperties segmentProperties;
-Map> dataMaps = 
dataMapFactory.getDataMaps(segments);
+  public List prune(List segments, final 
FilterResolverIntf filterExp,
+  final List partitions) throws IOException {
+final List blocklets = new ArrayList<>();
+final Map> dataMaps = 
dataMapFactory.getDataMaps(segments);
+// for non-filter queries
+if (filterExp == null) {
+  // if filter is not passed, then return all the blocklets.
+  return pruneWithoutFilter(segments, partitions, blocklets);
+}
+// for filter queries
+int totalFiles = 0;
+boolean isBlockDataMapType = true;
+for (Segment segment : segments) {
+  for (DataMap dataMap : dataMaps.get(segment)) {
+if (!(dataMap instanceof BlockDataMap)) {
+  isBlockDataMapType = false;
+  break;
+}
+totalFiles += ((BlockDataMap) dataMap).getTotalBlocks();
+  }
+  if (!isBlockDataMapType) {
+// totalFiles fill be 0 for non-BlockDataMap Type. ex: lucene, 
bloom datamap. use old flow.
+break;
+  }
+}
+int numOfThreadsForPruning = getNumOfThreadsForPruning();
+int filesPerEachThread = totalFiles / numOfThreadsForPruning;
+if (numOfThreadsForPruning == 1 || filesPerEachThread == 1
+|| segments.size() < numOfThreadsForPruning || totalFiles
+< 
CarbonCommonConstants.CARBON_DRIVER_PRUNING_MULTI_THREAD_ENABLE_FILES_COUNT) {
+  // use multi-thread, only if the files are more than 0.1 million.
+  // As 0.1 million files block pruning can take only 1 second.
+  // Doing multi-thread for smaller values is not recommended as
+  // driver should have minimum threads opened to support multiple 
concurrent queries.
+  return pruneWithFilter(segments, filterExp, partitions, blocklets, 
dataMaps);
+}
+// handle by multi-thread
+return pruneWithFilterMultiThread(segments, filterExp, partitions, 
blocklets, dataMaps,
+totalFiles);
+  }
+
+  private List pruneWithoutFilter(List segments,
+  List partitions, List blocklets) 
throws IOException {
+for (Segment segment : segments) {
+  List allBlocklets = 
blockletDetailsFetcher.getAllBlocklets(segment, partitions);
+  blocklets.addAll(
+  
addSegmentId(blockletDetailsFetcher.getExtendedBlocklets(allBlocklets, segment),
+  segment.toString()));
+}
+return blocklets;
+  }
+
+  private List pruneWithFilter(List segments,
+  FilterResolverIntf filterExp, List partitions,
+  List blocklets, Map> 
dataMaps) throws IOException {
 for (Segment segment : segments) {
   List pruneBlocklets = new ArrayList<>();
-  // if filter is not passed then return all the blocklets
-  if (filterExp == null) {
-pruneBlocklets = blockletDetailsFetcher.getAllBlocklets(segment, 
partitions);
-  } else {
-segmentProperties = 
segmentPropertiesFetcher.getSegmentProperties(segment);
-for (DataMap dataMap : dataMaps.get(segment)) {
-  pruneBlocklets.addAll(dataMap.prune(filterExp, 
segmentProperties, partitions));
+  SegmentProperties segmentProperties = 
segmentPropertiesFetcher.getSegmentProperties(segment);
+  for (DataMap dataMap : dataMaps.get(segment)) {
+pruneBlocklets.addAll(dataMap.prune(filterExp, segmentProperties, 
partitions));
+  }
+  blocklets.addAll(
+  
addSegmentId(blockletDetailsFetcher.getExtendedBlocklets(pruneBlocklets, 
segment),
+  segment.toString()));
+}
+return blocklets;
+  }
+
+  private List pruneWithFilterMultiThread(List 
segments,
+  final FilterResolverIntf filterExp, final List 
partitions,
+  List blocklets, final Map> 
dataMaps,
+  int totalFiles) {
+int numOfThreadsForPruning = getNumOfThreadsForPruning();
+int filesPerEachThread = (int) Math.ceil((double)totalFiles / 
numOfThreadsForPruning);
+int prev = 0;
+int filesCount = 0;
+int processedFileCount = 0;
+List> segmentList = new ArrayList<>();
--- End diff --

done


---


[GitHub] carbondata pull request #2936: [CARBONDATA-3118] Parallelize block pruning o...

2018-11-22 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2936#discussion_r235774840
  
--- Diff: 
core/src/main/java/org/apache/carbondata/core/datamap/TableDataMap.java ---
@@ -120,37 +132,166 @@ public BlockletDetailsFetcher 
getBlockletDetailsFetcher() {
* @param filterExp
* @return
*/
-  public List prune(List segments, 
FilterResolverIntf filterExp,
-  List partitions) throws IOException {
-List blocklets = new ArrayList<>();
-SegmentProperties segmentProperties;
-Map> dataMaps = 
dataMapFactory.getDataMaps(segments);
+  public List prune(List segments, final 
FilterResolverIntf filterExp,
+  final List partitions) throws IOException {
+final List blocklets = new ArrayList<>();
+final Map> dataMaps = 
dataMapFactory.getDataMaps(segments);
+// for non-filter queries
+if (filterExp == null) {
+  // if filter is not passed, then return all the blocklets.
+  return pruneWithoutFilter(segments, partitions, blocklets);
+}
+// for filter queries
+int totalFiles = 0;
+boolean isBlockDataMapType = true;
+for (Segment segment : segments) {
+  for (DataMap dataMap : dataMaps.get(segment)) {
+if (!(dataMap instanceof BlockDataMap)) {
--- End diff --

Two reasons:

1. The number of datamaps will be very small if it is not a block or blocklet 
datamap. Hence multi-threading is not required (as it is an overhead for the 
driver in concurrent scenarios).

2. Other datamaps don't have an entry count in them.

I will check.


---


[GitHub] carbondata pull request #2936: [CARBONDATA-3118] Parallelize block pruning o...

2018-11-21 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2936#discussion_r235615112
  
--- Diff: 
core/src/main/java/org/apache/carbondata/core/datamap/TableDataMap.java ---
@@ -120,37 +132,166 @@ public BlockletDetailsFetcher 
getBlockletDetailsFetcher() {
* @param filterExp
* @return
*/
-  public List prune(List segments, 
FilterResolverIntf filterExp,
-  List partitions) throws IOException {
-List blocklets = new ArrayList<>();
-SegmentProperties segmentProperties;
-Map> dataMaps = 
dataMapFactory.getDataMaps(segments);
+  public List prune(List segments, final 
FilterResolverIntf filterExp,
+  final List partitions) throws IOException {
+final List blocklets = new ArrayList<>();
+final Map> dataMaps = 
dataMapFactory.getDataMaps(segments);
+// for non-filter queries
+if (filterExp == null) {
+  // if filter is not passed, then return all the blocklets.
+  return pruneWithoutFilter(segments, partitions, blocklets);
--- End diff --

Yes, already tested this. For 100k files, pruning with a filter takes around 1 
second, but without a filter it is about 50 ms, which is very small. Hence it is 
not handled.

Only for filter queries was pruning taking time.


---


[GitHub] carbondata issue #2936: [CARBONDATA-3118] Parallelize block pruning of defau...

2018-11-21 Thread ajantha-bhat
Github user ajantha-bhat commented on the issue:

https://github.com/apache/carbondata/pull/2936
  
retest this please


---


[GitHub] carbondata issue #2936: [CARBONDATA-3118] Parallelize block pruning of defau...

2018-11-21 Thread ajantha-bhat
Github user ajantha-bhat commented on the issue:

https://github.com/apache/carbondata/pull/2936
  
@manishgupta88 , @ravipesala : please review


---


[jira] [Created] (CARBONDATA-3118) Parallelize block pruning of default datamap in driver for filter query processing

2018-11-21 Thread Ajantha Bhat (JIRA)
Ajantha Bhat created CARBONDATA-3118:


 Summary: Parallelize block pruning of default datamap in driver  
for filter query processing
 Key: CARBONDATA-3118
 URL: https://issues.apache.org/jira/browse/CARBONDATA-3118
 Project: CarbonData
  Issue Type: Improvement
Reporter: Ajantha Bhat
Assignee: Ajantha Bhat


*"Parallelize block pruning of default datamap in driver 
for filter query processing"* 

*Background:* 
We do block pruning for the filter queries at the driver side. 
In real time big data scenario, we can have millions of carbon files for 
one carbon table. 
It is currently observed that for 1 million carbon files it takes around 5 
seconds for block pruning, as each carbon file takes around 0.005 ms for pruning 
(with only one filter column set in the 'column_meta_cache' tblproperty). 
If there are more files, block pruning might take more time. 
Also, the Spark job will not be launched until block pruning is completed, so 
the user will not know what is happening at that time and why the Spark job is 
not launching. 
Currently, block pruning is taking time because each segment is processed 
sequentially. We can reduce the time by parallelizing it. 


*Solution:* Keep the default number of threads for block pruning as 4. 
The user can reduce this number via the carbon property 
"carbon.max.driver.threads.for.pruning", which can be set between 1 and 4. 

In TableDataMap.prune(), group the segments across the threads by distributing 
an equal number of carbon files to each thread, then launch the threads, each 
handling block pruning for its group of segments (see the sketch below).
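
A minimal, hypothetical sketch of that idea (type and method names are simplified placeholders, not the actual TableDataMap code); it splits by segment count only, whereas the actual change distributes by the carbon file count of each segment:

```
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

// Simplified sketch: split the segments into numThreads groups and prune each
// group on its own driver thread.
class ParallelPruningSketch {

  static <T> List<List<T>> groupSegments(List<T> segments, int numThreads) {
    List<List<T>> groups = new ArrayList<>();
    int perGroup = (int) Math.ceil((double) segments.size() / numThreads);
    for (int i = 0; i < segments.size(); i += perGroup) {
      groups.add(segments.subList(i, Math.min(i + perGroup, segments.size())));
    }
    return groups;
  }

  static <T> void pruneInParallel(List<T> segments, int numThreads) throws InterruptedException {
    ExecutorService pool = Executors.newFixedThreadPool(numThreads);
    for (List<T> group : groupSegments(segments, numThreads)) {
      pool.submit(() -> {
        // prune the segments of this group and add the pruned blocklets
        // to a thread-safe result collection (omitted in this sketch)
      });
    }
    pool.shutdown();
    pool.awaitTermination(1, TimeUnit.HOURS);
  }
}
```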



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


[GitHub] carbondata pull request #2936: [WIP] Parallelize block pruning of default da...

2018-11-20 Thread ajantha-bhat
GitHub user ajantha-bhat opened a pull request:

https://github.com/apache/carbondata/pull/2936

[WIP] Parallelize block pruning of default datamap in driver for filter 
query processing

Be sure to do all of the following checklist to help us incorporate 
your contribution quickly and easily:

 - [ ] Any interfaces changed?
 
 - [ ] Any backward compatibility impacted?
 
 - [ ] Document update required?

 - [ ] Testing done
Please provide details on 
- Whether new unit test cases have been added or why no new tests 
are required?
- How it is tested? Please attach test report.
- Is it a performance related change? Please attach the performance 
test report.
- Any additional information to help reviewers in testing this 
change.
   
 - [ ] For large changes, please consider breaking it into sub-tasks under 
an umbrella JIRA. 



You can merge this pull request into a Git repository by running:

$ git pull https://github.com/ajantha-bhat/carbondata issue_fix

Alternatively you can review and apply these changes as the patch at:

https://github.com/apache/carbondata/pull/2936.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

This closes #2936


commit 7e2d16903565effab3c1c085291178865f6ad7ba
Author: ajantha-bhat 
Date:   2018-11-20T16:45:06Z

pruning compliling

commit 059b69b46d5e543ba4a65af85ce694a1899de395
Author: ajantha-bhat 
Date:   2018-11-21T02:27:52Z

issue fix




---


[GitHub] carbondata issue #2921: Removed unnecessary configuration in BlockletDataMap...

2018-11-15 Thread ajantha-bhat
Github user ajantha-bhat commented on the issue:

https://github.com/apache/carbondata/pull/2921
  
@ravipesala, @chenliang613: please add @NamanRastogi to the whitelist.


---


[GitHub] carbondata issue #2921: Removed unnecessary configuration in BlockletDataMap...

2018-11-15 Thread ajantha-bhat
Github user ajantha-bhat commented on the issue:

https://github.com/apache/carbondata/pull/2921
  
retest this please


---


[GitHub] carbondata issue #2895: [HOTFIX] Fix NPE in spark, when same vector reads fi...

2018-11-13 Thread ajantha-bhat
Github user ajantha-bhat commented on the issue:

https://github.com/apache/carbondata/pull/2895
  
retest this please


---


[GitHub] carbondata issue #2895: [HOTFIX] Fix NPE in spark, when same vector reads fi...

2018-11-12 Thread ajantha-bhat
Github user ajantha-bhat commented on the issue:

https://github.com/apache/carbondata/pull/2895
  
retest this please


---


[GitHub] carbondata issue #2895: [HOTFIX] Fix NPE in spark, when same vector reads fi...

2018-11-11 Thread ajantha-bhat
Github user ajantha-bhat commented on the issue:

https://github.com/apache/carbondata/pull/2895
  
retest this please


---


[GitHub] carbondata issue #2895: [HOTFIX] Fix NPE in spark, when same vector reads fi...

2018-11-11 Thread ajantha-bhat
Github user ajantha-bhat commented on the issue:

https://github.com/apache/carbondata/pull/2895
  
@manishgupta88 : please check. The same 2.3 build has passed before without any 
code changes. It is a random failure.


---


[GitHub] carbondata issue #2895: [HOTFIX] Fix NPE in spark, when same vector reads fi...

2018-11-06 Thread ajantha-bhat
Github user ajantha-bhat commented on the issue:

https://github.com/apache/carbondata/pull/2895
  
retest this please


---


[GitHub] carbondata issue #2895: [HOTFIX] Fix NPE in spark, when same vector reads fi...

2018-11-06 Thread ajantha-bhat
Github user ajantha-bhat commented on the issue:

https://github.com/apache/carbondata/pull/2895
  
retest this please


---


[GitHub] carbondata pull request #2895: [HOTFIX] Fix NPE in spark, when same vector r...

2018-11-05 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2895#discussion_r231001948
  
--- Diff: 
core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/LocalDictDimensionDataChunkStore.java
 ---
@@ -61,10 +61,7 @@ public void fillVector(int[] invertedIndex, int[] 
invertedIndexReverse, byte[] d
 int columnValueSize = dimensionDataChunkStore.getColumnValueSize();
 int rowsNum = data.length / columnValueSize;
 CarbonColumnVector vector = vectorInfo.vector;
-if (!dictionary.isDictionaryUsed()) {
-  vector.setDictionary(dictionary);
-  dictionary.setDictionaryUsed();
-}
+vector.setDictionary(dictionary);
--- End diff --

done


---


[GitHub] carbondata pull request #2895: [HOTFIX] Fix NPE in spark, when same vector r...

2018-11-05 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2895#discussion_r230997947
  
--- Diff: 
core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/LocalDictDimensionDataChunkStore.java
 ---
@@ -61,10 +61,7 @@ public void fillVector(int[] invertedIndex, int[] 
invertedIndexReverse, byte[] d
 int columnValueSize = dimensionDataChunkStore.getColumnValueSize();
 int rowsNum = data.length / columnValueSize;
 CarbonColumnVector vector = vectorInfo.vector;
-if (!dictionary.isDictionaryUsed()) {
-  vector.setDictionary(dictionary);
-  dictionary.setDictionaryUsed();
-}
+vector.setDictionary(dictionary);
--- End diff --

I have checked this while coding; fillRow is our method, so there are no issues. 
Only this vector is a Spark vector, and the dictionary needs to be cleared for it.


---


[GitHub] carbondata issue #2816: [CARBONDATA-3003] Suppor read batch row in CSDK

2018-11-05 Thread ajantha-bhat
Github user ajantha-bhat commented on the issue:

https://github.com/apache/carbondata/pull/2816
  
LGTM


---


[GitHub] carbondata pull request #2895: [HOTFIX] Fix NPE in spark, when same vector r...

2018-11-05 Thread ajantha-bhat
GitHub user ajantha-bhat opened a pull request:

https://github.com/apache/carbondata/pull/2895

[HOTFIX] Fix NPE in spark, when same vector reads files with local 
dictionary and without local dictionary

problem: NPE in Spark when the same vector reads files with a local dictionary 
and without a local dictionary.

cause: When two carbondata files are present, one with a local dictionary and 
one without, the same vector can be used to read both files (this can happen if 
a task is launched for a group of files). If the local dictionary file is read 
first, the dictionary is set on that vector, but it was never reset when reading 
the other file.

solution: reset the dictionary once a batch is processed, and set it only while 
processing a local dictionary batch.
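
A minimal illustrative sketch of that idea (the types below are stand-ins, not the real CarbonColumnVector/CarbonDictionary classes):

```
// Hypothetical stand-in for the vector interface used while filling a batch.
interface VectorLike {
  void setDictionary(Object dictionary);
}

class LocalDictionaryResetSketch {
  // Set the dictionary only when the current blocklet was written with a
  // local dictionary, and always reset it after the batch has been filled,
  // so the next blocklet without a local dictionary does not see a stale one.
  static void fillVector(VectorLike vector, Object localDictionary, Runnable fillBatch) {
    if (localDictionary != null) {
      vector.setDictionary(localDictionary);
    }
    fillBatch.run();
    vector.setDictionary(null);
  }
}
```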


Be sure to do all of the following checklist to help us incorporate 
your contribution quickly and easily:

 - [ ] Any interfaces changed? NA
 
 - [ ] Any backward compatibility impacted? NA
 
 - [ ] Document update required? NA

 - [ ] Testing done
yes, cluster testing done.   
 - [ ] For large changes, please consider breaking it into sub-tasks under 
an umbrella JIRA.  NA



You can merge this pull request into a Git repository by running:

$ git pull https://github.com/ajantha-bhat/carbondata master_new

Alternatively you can review and apply these changes as the patch at:

https://github.com/apache/carbondata/pull/2895.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

This closes #2895


commit 99c7621336e3cf180bfa0c3a326a2f1fafe51631
Author: ajantha-bhat 
Date:   2018-11-05T10:00:27Z

Fix vectcor reading with local dictionary and without local dictionary




---


[GitHub] carbondata issue #2888: [CARBONDATA-3066]add documentation for writtenBy and...

2018-11-02 Thread ajantha-bhat
Github user ajantha-bhat commented on the issue:

https://github.com/apache/carbondata/pull/2888
  
LGTM


---


[GitHub] carbondata pull request #2850: [CARBONDATA-3056] Added concurrent reading th...

2018-11-01 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2850#discussion_r230272267
  
--- Diff: 
store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonReader.java ---
@@ -114,6 +115,57 @@ public static CarbonReaderBuilder builder(String 
tablePath) {
 return builder(tablePath, tableName);
   }
 
+  /**
+   * Breaks the list of CarbonRecordReader in CarbonReader into multiple
+   * CarbonReader objects, each iterating through some 'carbondata' files
+   * and return that list of CarbonReader objects
+   *
+   * If the no. of files is greater than maxSplits, then break the
+   * CarbonReader into maxSplits splits, with each split iterating
+   * through >= 1 file.
+   *
+   * If the no. of files is less than maxSplits, then return list of
+   * CarbonReader with size as the no. of files, with each CarbonReader
+   * iterating through exactly one file
+   *
+   * @param maxSplits: Int
+   * @return list of {@link CarbonReader} objects
+   */
+  public List split(int maxSplits) throws IOException {
--- End diff --

@ravipesala : Adding this to the builder will break the builder pattern; 
recently we removed arguments from build() and made this a separate API for the 
SDK writer. The reader also followed the same approach.
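
For reference, a hedged usage sketch of the split API from the diff above (the path and split count are placeholders):

```
import java.util.List;
import org.apache.carbondata.sdk.file.CarbonReader;

public class SplitReaderUsage {
  public static void main(String[] args) throws Exception {
    // Assumes carbondata files were already written under "./testWriteFiles".
    CarbonReader reader = CarbonReader.builder("./testWriteFiles", "_temp").build();
    List<CarbonReader> readers = reader.split(4); // at most 4 readers, each over >= 1 file
    for (CarbonReader split : readers) {
      while (split.hasNext()) {
        Object[] row = (Object[]) split.readNextRow();
        // process the row; typically each split runs on its own thread
      }
      split.close();
    }
  }
}
```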


---


[GitHub] carbondata pull request #2888: [CARBONDATA-3066]add documentation for writte...

2018-10-31 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2888#discussion_r229733657
  
--- Diff: docs/sdk-guide.md ---
@@ -686,6 +695,16 @@ Find example code at 
[CarbonReaderExample](https://github.com/apache/carbondata/
   public static Schema readSchemaInIndexFile(String indexFilePath);
 ```
 
+```
+  /**
+   * This method return the version details in formatted string by reading 
from carbondata file
+   * @param dataFilePath complete path including carbondata file name
+   * @return string with information of who has written this file in which 
carbondata project version
--- End diff --

You can mention one example in the header.


---


[GitHub] carbondata pull request #2888: [CARBONDATA-3066]add documentation for writte...

2018-10-31 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2888#discussion_r229733094
  
--- Diff: docs/sdk-guide.md ---
@@ -429,6 +429,15 @@ public CarbonWriterBuilder 
withAvroInput(org.apache.avro.Schema avroSchema);
 public CarbonWriterBuilder withJsonInput(Schema carbonSchema);
 ```
 
+```
+/**
+* To support writing the ApplicationName which is writing the carbondata 
file
+* @param application name which is writing the carbondata files
+* @return CarbonWriterBuilder
--- End diff --

Mention that it is a mandatory method to call.


---


[GitHub] carbondata pull request #2816: [CARBONDATA-3003] Suppor read batch row in CS...

2018-10-31 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2816#discussion_r229629760
  
--- Diff: 
core/src/main/java/org/apache/carbondata/core/scan/result/RowBatch.java ---
@@ -100,4 +100,19 @@ public int getSize() {
 counter++;
 return row;
   }
+
+  /**
+   * read next batch
+   *
+   * @return rows
+   */
+  public List nextBatch() {
+if (!hasNext()) {
+  throw new NoSuchElementException();
+}
+List row;
+row = rows.subList(counter, rows.size());
--- End diff --

Where is the batch reading happening now?

I think you just have to return rows.



---


[GitHub] carbondata issue #2807: [CARBONDATA-2997] Support read schema from index fil...

2018-10-31 Thread ajantha-bhat
Github user ajantha-bhat commented on the issue:

https://github.com/apache/carbondata/pull/2807
  
LGTM.

Let one more reviewer review.


---


[GitHub] carbondata pull request #2804: [CARBONDATA-2996] CarbonSchemaReader support ...

2018-10-31 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2804#discussion_r229583983
  
--- Diff: 
store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonSchemaReader.java 
---
@@ -64,11 +66,70 @@ public static Schema readSchemaInSchemaFile(String 
schemaFilePath) throws IOExce
   /**
* Read carbondata file and return the schema
*
-   * @param dataFilePath complete path including carbondata file name
+   * @param path carbondata store path
* @return Schema object
* @throws IOException
*/
-  public static Schema readSchemaInDataFile(String dataFilePath) throws 
IOException {
+  public static Schema readSchemaFromFirstDataFile(String path) throws 
IOException {
+String dataFilePath = getFirstCarbonDataFile(path);
+return readSchemaInDataFile(dataFilePath);
+  }
+
+  /**
+   * get first carbondata file in path and don't check all files schema
+   *
+   * @param path carbondata file path
+   * @return first carbondata file name
+   */
+  public static String getFirstCarbonDataFile(String path) {
--- End diff --

I have already suggested keeping getFirstCarbonFile(path, extension) -- this 
alone will give the data or index file based on the extension.

There is no need to have duplicate code for both the index and data files.


---


[GitHub] carbondata pull request #2869: [CARBONDATA-3057] Changes for improving carbo...

2018-10-30 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2869#discussion_r229298501
  
--- Diff: 
store/sdk/src/test/java/org/apache/carbondata/sdk/file/CarbonReaderTest.java ---
@@ -1737,4 +1738,89 @@ public void 
testReadNextRowWithProjectionAndRowUtil() {
 }
   }
 
+  @Test
+  public void testVectorReader() {
+String path = "./testWriteFiles";
+try {
+  FileUtils.deleteDirectory(new File(path));
+
+  Field[] fields = new Field[12];
+  fields[0] = new Field("stringField", DataTypes.STRING);
+  fields[1] = new Field("shortField", DataTypes.SHORT);
+  fields[2] = new Field("intField", DataTypes.INT);
+  fields[3] = new Field("longField", DataTypes.LONG);
+  fields[4] = new Field("doubleField", DataTypes.DOUBLE);
+  fields[5] = new Field("boolField", DataTypes.BOOLEAN);
+  fields[6] = new Field("dateField", DataTypes.DATE);
+  fields[7] = new Field("timeField", DataTypes.TIMESTAMP);
+  fields[8] = new Field("decimalField", DataTypes.createDecimalType(8, 
2));
+  fields[9] = new Field("varcharField", DataTypes.VARCHAR);
+  fields[10] = new Field("byteField", DataTypes.BYTE);
+  fields[11] = new Field("floatField", DataTypes.FLOAT);
+  Map map = new HashMap<>();
+  map.put("complex_delimiter_level_1", "#");
+  CarbonWriter writer = CarbonWriter.builder()
+  .outputPath(path)
+  .withLoadOptions(map)
+  .withCsvInput(new Schema(fields))
+  .writtenBy("CarbonReaderTest")
+  .build();
+
+  for (int i = 0; i < 10; i++) {
+String[] row2 = new String[]{
+"robot" + (i % 10),
+String.valueOf(i % 1),
+String.valueOf(i),
+String.valueOf(Long.MAX_VALUE - i),
+String.valueOf((double) i / 2),
+String.valueOf(true),
+"2019-03-02",
+"2019-02-12 03:03:34",
+"12.345",
+"varchar",
+String.valueOf(i),
+"1.23"
+};
+writer.write(row2);
+  }
+  writer.close();
+
+  // Read data
+  CarbonReader reader = CarbonReader
+  .builder(path, "_temp")
+  .withVectorReader(true)
+  .build();
+
+  int i = 0;
+  while (reader.hasNext()) {
+Object[] data = (Object[]) reader.readNextRow();
+
+assert (RowUtil.getString(data, 0).equals("robot" + i));
+assertEquals(RowUtil.getShort(data, 4), i);
+assertEquals(RowUtil.getInt(data, 5), i);
+assert (RowUtil.getLong(data, 6) == Long.MAX_VALUE - i);
+assertEquals(RowUtil.getDouble(data, 7), ((double) i) / 2);
+assert (RowUtil.getByte(data, 8).equals(new Byte("1")));
+assertEquals(RowUtil.getInt(data, 1), 17957);
+assertEquals(RowUtil.getLong(data, 2), 154992081400L);
+assert (RowUtil.getDecimal(data, 9).equals("12.35"));
+assert (RowUtil.getString(data, 3).equals("varchar"));
+assertEquals(RowUtil.getByte(data, 10), new 
Byte(String.valueOf(i)));
+assertEquals(RowUtil.getFloat(data, 11), new Float("1.23"));
+i++;
+  }
+  reader.close();
--- End diff --

Add validation for total number of rows read.  
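
For example (the quoted test writes 10 rows, and i is incremented once per row read; assuming the test's static import of assertEquals), something like:

```
// after the while loop in the quoted test
assertEquals(10, i);
```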


---


[GitHub] carbondata pull request #2869: [CARBONDATA-3057] Changes for improving carbo...

2018-10-30 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2869#discussion_r229284011
  
--- Diff: 
hadoop/src/main/java/org/apache/carbondata/hadoop/util/CarbonVectorizedRecordReader.java
 ---
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.hadoop.util;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.carbondata.common.logging.LogServiceFactory;
+import org.apache.carbondata.core.constants.CarbonV3DataFormatConstants;
+import org.apache.carbondata.core.datastore.block.TableBlockInfo;
+import org.apache.carbondata.core.metadata.datatype.DataType;
+import org.apache.carbondata.core.metadata.datatype.DataTypes;
+import org.apache.carbondata.core.metadata.datatype.DecimalType;
+import org.apache.carbondata.core.metadata.datatype.StructField;
+import org.apache.carbondata.core.scan.executor.QueryExecutor;
+import org.apache.carbondata.core.scan.executor.QueryExecutorFactory;
+import 
org.apache.carbondata.core.scan.executor.exception.QueryExecutionException;
+import org.apache.carbondata.core.scan.model.ProjectionDimension;
+import org.apache.carbondata.core.scan.model.ProjectionMeasure;
+import org.apache.carbondata.core.scan.model.QueryModel;
+import 
org.apache.carbondata.core.scan.result.iterator.AbstractDetailQueryResultIterator;
+import org.apache.carbondata.core.scan.result.vector.CarbonColumnVector;
+import org.apache.carbondata.core.scan.result.vector.CarbonColumnarBatch;
+import 
org.apache.carbondata.core.scan.result.vector.impl.CarbonColumnVectorImpl;
+import org.apache.carbondata.core.util.ByteUtil;
+import org.apache.carbondata.hadoop.AbstractRecordReader;
+import org.apache.carbondata.hadoop.CarbonInputSplit;
+
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.log4j.Logger;
+
+/**
+ * A specialized RecordReader that reads into CarbonColumnarBatches 
directly using the
+ * carbondata column APIs and fills the data directly into columns.
+ */
+public class CarbonVectorizedRecordReader extends 
AbstractRecordReader {
+
+  private static final Logger LOGGER =
+  
LogServiceFactory.getLogService(CarbonVectorizedRecordReader.class.getName());
+
+  private CarbonColumnarBatch carbonColumnarBatch;
+
+  private QueryExecutor queryExecutor;
+
+  private int batchIdx = 0;
+
+  private int numBatched = 0;
+
+  private AbstractDetailQueryResultIterator iterator;
+
+  private QueryModel queryModel;
+
+  public CarbonVectorizedRecordReader(QueryModel queryModel) {
+this.queryModel = queryModel;
+  }
+
+  @Override public void initialize(InputSplit inputSplit, 
TaskAttemptContext taskAttemptContext)
+  throws IOException, InterruptedException {
+List splitList;
+if (inputSplit instanceof CarbonInputSplit) {
+  splitList = new ArrayList<>(1);
+  splitList.add((CarbonInputSplit) inputSplit);
+} else {
+  throw new RuntimeException("unsupported input split type: " + 
inputSplit);
+}
+List tableBlockInfoList = 
CarbonInputSplit.createBlocks(splitList);
+queryModel.setTableBlockInfos(tableBlockInfoList);
+queryModel.setVectorReader(true);
+try {
+  queryExecutor =
+  QueryExecutorFactory.getQueryExecutor(queryModel, 
taskAttemptContext.getConfiguration());
+  iterator = (AbstractDetailQueryResultIterator) 
queryExecutor.execute(queryModel);
+} catch (QueryExecutionException e) {
+  LOGGER.error(e);
+  throw new InterruptedException(e.getMessage());
+} catch (Exception e) {
+  LOGGER.error(e);
+  throw e;
+}
+  }
+
+  @Override public boolean nextKeyValue() throws IOException, 
Inter

[GitHub] carbondata pull request #2869: [CARBONDATA-3057] Changes for improving carbo...

2018-10-30 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2869#discussion_r229254647
  
--- Diff: 
hadoop/src/main/java/org/apache/carbondata/hadoop/util/CarbonVectorizedRecordReader.java
 ---
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.hadoop.util;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.carbondata.common.logging.LogServiceFactory;
+import org.apache.carbondata.core.constants.CarbonV3DataFormatConstants;
+import org.apache.carbondata.core.datastore.block.TableBlockInfo;
+import org.apache.carbondata.core.metadata.datatype.DataType;
+import org.apache.carbondata.core.metadata.datatype.DataTypes;
+import org.apache.carbondata.core.metadata.datatype.DecimalType;
+import org.apache.carbondata.core.metadata.datatype.StructField;
+import org.apache.carbondata.core.scan.executor.QueryExecutor;
+import org.apache.carbondata.core.scan.executor.QueryExecutorFactory;
+import 
org.apache.carbondata.core.scan.executor.exception.QueryExecutionException;
+import org.apache.carbondata.core.scan.model.ProjectionDimension;
+import org.apache.carbondata.core.scan.model.ProjectionMeasure;
+import org.apache.carbondata.core.scan.model.QueryModel;
+import 
org.apache.carbondata.core.scan.result.iterator.AbstractDetailQueryResultIterator;
+import org.apache.carbondata.core.scan.result.vector.CarbonColumnVector;
+import org.apache.carbondata.core.scan.result.vector.CarbonColumnarBatch;
+import 
org.apache.carbondata.core.scan.result.vector.impl.CarbonColumnVectorImpl;
+import org.apache.carbondata.core.util.ByteUtil;
+import org.apache.carbondata.hadoop.AbstractRecordReader;
+import org.apache.carbondata.hadoop.CarbonInputSplit;
+
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.log4j.Logger;
+
+/**
+ * A specialized RecordReader that reads into CarbonColumnarBatches 
directly using the
+ * carbondata column APIs and fills the data directly into columns.
+ */
+public class CarbonVectorizedRecordReader extends 
AbstractRecordReader {
+
+  private static final Logger LOGGER =
+  
LogServiceFactory.getLogService(CarbonVectorizedRecordReader.class.getName());
+
+  private CarbonColumnarBatch carbonColumnarBatch;
+
+  private QueryExecutor queryExecutor;
+
+  private int batchIdx = 0;
+
+  private int numBatched = 0;
+
+  private AbstractDetailQueryResultIterator iterator;
+
+  private QueryModel queryModel;
+
+  public CarbonVectorizedRecordReader(QueryModel queryModel) {
+this.queryModel = queryModel;
+  }
+
+  @Override public void initialize(InputSplit inputSplit, 
TaskAttemptContext taskAttemptContext)
+  throws IOException, InterruptedException {
+List splitList;
+if (inputSplit instanceof CarbonInputSplit) {
+  splitList = new ArrayList<>(1);
+  splitList.add((CarbonInputSplit) inputSplit);
+} else {
+  throw new RuntimeException("unsupported input split type: " + 
inputSplit);
+}
+List tableBlockInfoList = 
CarbonInputSplit.createBlocks(splitList);
+queryModel.setTableBlockInfos(tableBlockInfoList);
+queryModel.setVectorReader(true);
+try {
+  queryExecutor =
+  QueryExecutorFactory.getQueryExecutor(queryModel, 
taskAttemptContext.getConfiguration());
+  iterator = (AbstractDetailQueryResultIterator) 
queryExecutor.execute(queryModel);
+} catch (QueryExecutionException e) {
+  LOGGER.error(e);
+  throw new InterruptedException(e.getMessage());
+} catch (Exception e) {
+  LOGGER.error(e);
+  throw e;
+}
+  }
+
+  @Override public boolean nextKeyValue() throws IOException, 
InterruptedException {
  

[GitHub] carbondata pull request #2869: [CARBONDATA-3057] Changes for improving carbo...

2018-10-30 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2869#discussion_r229248603
  
--- Diff: 
hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonFileInputFormat.java
 ---
@@ -180,6 +235,11 @@ public CarbonTable 
getOrCreateCarbonTable(Configuration configuration) throws IO
 validSegments, partitionInfo, oldPartitionIdList);
 numBlocks = dataBlocksOfSegment.size();
 result.addAll(dataBlocksOfSegment);
+Collections.sort(result, new Comparator() {
--- End diff --

Why sort the splits? Can you add a comment here?


---


[GitHub] carbondata pull request #2869: [CARBONDATA-3057] Changes for improving carbo...

2018-10-30 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2869#discussion_r229246953
  
--- Diff: 
hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonFileInputFormat.java
 ---
@@ -145,9 +158,33 @@ public CarbonTable 
getOrCreateCarbonTable(Configuration configuration) throws IO
   externalTableSegments.add(seg);
 }
   }
-  // do block filtering and get split
-  List splits =
-  getSplits(job, filter, externalTableSegments, null, 
partitionInfo, null);
+  List splits = new ArrayList<>();
+  if (isSDK) {
--- End diff --

This is SDK logic, not fileInputFormat logic.

You can do this in CarbonReaderBuilder.build() --> getSplits() is already called 
there; you can check whether filters are present and call 
FileInputFormat.getSplits(), else call a method getAllFileSplit() which gives 
all the files without blocklet loading (see the sketch below).

Also, we then don't need the isSDK flag.
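
A hedged sketch of that suggestion (placeholder types, not the actual CarbonReaderBuilder/CarbonFileInputFormat code):

```
import java.util.List;

// Placeholder abstraction standing in for the input format used by the builder.
interface SplitSource<S> {
  List<S> getSplits() throws Exception;        // pruning-based split path
  List<S> getAllFileSplits() throws Exception; // plain file listing, no blocklet loading
}

class ReaderBuilderSketch {
  // In build(): only run (costly) block pruning when a filter expression was
  // supplied; otherwise list all carbondata files directly.
  static <S> List<S> computeSplits(SplitSource<S> source, boolean hasFilter) throws Exception {
    return hasFilter ? source.getSplits() : source.getAllFileSplits();
  }
}
```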


---


[GitHub] carbondata pull request #2869: [CARBONDATA-3057] Changes for improving carbo...

2018-10-30 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2869#discussion_r229243919
  
--- Diff: 
hadoop/src/main/java/org/apache/carbondata/hadoop/CarbonInputSplit.java ---
@@ -138,6 +138,19 @@ public CarbonInputSplit(String segmentId, Path path, 
long start, long length, St
 version = CarbonProperties.getInstance().getFormatVersion();
   }
 
+  public CarbonInputSplit(String segmentId, Path path, long start, long 
length,
--- End diff --

This is not required. You can use the above constructor and pass 
FileFormat.COLUMNAR_V3 for fileFormat.


---


[GitHub] carbondata pull request #2869: [CARBONDATA-3057] Changes for improving carbo...

2018-10-30 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2869#discussion_r229247445
  
--- Diff: 
hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonFileInputFormat.java
 ---
@@ -145,9 +158,33 @@ public CarbonTable 
getOrCreateCarbonTable(Configuration configuration) throws IO
   externalTableSegments.add(seg);
 }
   }
-  // do block filtering and get split
-  List splits =
-  getSplits(job, filter, externalTableSegments, null, 
partitionInfo, null);
+  List splits = new ArrayList<>();
+  if (isSDK) {
+for (CarbonFile carbonFile : 
getAllCarbonDataFiles(carbonTable.getTablePath())) {
--- End diff --

Move it to a method getAllFileSplits(), and call it from 
CarbonReaderBuilder.build() as discussed above.


---


[GitHub] carbondata pull request #2869: [CARBONDATA-3057] Changes for improving carbo...

2018-10-30 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2869#discussion_r229218201
  
--- Diff: 
core/src/main/java/org/apache/carbondata/core/scan/result/vector/impl/CarbonColumnVectorImpl.java
 ---
@@ -305,7 +301,7 @@ public void setBlockDataType(DataType blockDataType) {
   }
 
   @Override public CarbonColumnVector getDictionaryVector() {
-return dictionaryVector;
+return null;
--- End diff --

Should this throw an UnsupportedOperationException instead of returning null?

Or remove the override method itself.
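
i.e., something along these lines (sketch only, applied to the quoted method):

```
@Override public CarbonColumnVector getDictionaryVector() {
  throw new UnsupportedOperationException(
      "dictionary vector is not supported by this vector implementation");
}
```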


---


[GitHub] carbondata pull request #2876: [CARBONDATA-3054] Fix Dictionary file cannot ...

2018-10-30 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2876#discussion_r229237604
  
--- Diff: 
integration/spark2/src/main/scala/org/apache/spark/sql/CarbonDictionaryDecoder.scala
 ---
@@ -137,14 +139,15 @@ case class CarbonDictionaryDecoder(
   val forwardDictionaryCache: Cache[DictionaryColumnUniqueIdentifier, 
Dictionary] =
 cacheProvider.createCache(CacheType.FORWARD_DICTIONARY)
   val dicts: Seq[ForwardDictionaryWrapper] = 
getDictionaryWrapper(tableNameToCarbonTableMapping,
-forwardDictionaryCache)
+forwardDictionaryCache, conf)
 
   val exprs = child.output.map { exp =>
 ExpressionCanonicalizer.execute(BindReferences.bindReference(exp, 
child.output))
   }
   ctx.currentVars = input
   val resultVars = exprs.zipWithIndex.map { case (expr, index) =>
 if (dicts(index) != null) {
+  
ThreadLocalSessionInfo.setConfigurationToCurrentThread(conf.value.value)
--- End diff --

Done. It is set in codeGen now.


---


[GitHub] carbondata pull request #2876: [CARBONDATA-3054] Fix Dictionary file cannot ...

2018-10-30 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2876#discussion_r229237654
  
--- Diff: 
integration/spark2/src/main/scala/org/apache/spark/sql/CarbonDictionaryDecoder.scala
 ---
@@ -557,16 +561,20 @@ class CarbonDecoderRDD(
  * It is a wrapper around Dictionary, it is a work around to keep the 
dictionary serializable in
  * case of codegen
  * @param dictIdentifier Dictionary column unique identifier
+ * @param broadcastConf hadoop broadcast conf for serialization, that 
contains carbon conf.
  */
 class ForwardDictionaryWrapper(
-dictIdentifier: DictionaryColumnUniqueIdentifier) extends Serializable 
{
+dictIdentifier: DictionaryColumnUniqueIdentifier,
+broadcastConf: Broadcast[SerializableConfiguration]) extends 
Serializable {
 
   var dictionary: Dictionary = null
 
   var dictionaryLoader: DictionaryLoader = _
 
   def getDictionaryValueForKeyInBytes (surrogateKey: Int): Array[Byte] = {
 if (dictionary == null) {
+  
ThreadLocalSessionInfo.getOrCreateCarbonSessionInfo().getNonSerializableExtraInfo
--- End diff --

Done. Set in codeGen now.


---


[GitHub] carbondata pull request #2869: [CARBONDATA-3057] Changes for improving carbo...

2018-10-30 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2869#discussion_r229193317
  
--- Diff: 
core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/safe/SafeVariableLengthDimensionDataChunkStore.java
 ---
@@ -169,7 +169,7 @@ public void fillRow(int rowId, CarbonColumnVector 
vector, int vectorRow) {
 length)) {
   vector.putNull(vectorRow);
--- End diff --

Need to skip this for varchar also?


---


[GitHub] carbondata pull request #2850: [WIP] Added concurrent reading through SDK

2018-10-29 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2850#discussion_r229183542
  
--- Diff: 
store/sdk/src/test/java/org/apache/carbondata/sdk/file/ConcurrentSdkReaderTest.java
 ---
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.sdk.file;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+import org.apache.carbondata.core.metadata.datatype.DataTypes;
+
+import junit.framework.TestCase;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOExceptionWithCause;
+import org.junit.*;
+
+/**
+ * multi-thread Test suite for {@link CarbonReader}
+ */
+public class ConcurrentSdkReaderTest {
+
+  private static final String dataDir = "./testReadFiles";
+
+  @Before
+  @After
+  public void cleanTestData() {
+try {
+  FileUtils.deleteDirectory(new File(dataDir));
+} catch (Exception e) {
+  e.printStackTrace();
+  Assert.fail(e.getMessage());
+}
+  }
+
+  private void writeTestData(long numRows, int tableBlockSize) {
+Field[] fields = new Field[2];
+fields[0] = new Field("stringField", DataTypes.STRING);
+fields[1] = new Field("intField", DataTypes.INT);
+
+Map<String, String> tableProperties = new HashMap<>();
+tableProperties.put("table_blocksize", 
Integer.toString(tableBlockSize));
+
+CarbonWriterBuilder builder =
+
CarbonWriter.builder().outputPath(dataDir).withTableProperties(tableProperties)
+.withCsvInput(new Schema(fields));
+
+try {
+  CarbonWriter writer = builder.build();
+
+  for (long i = 0; i < numRows; ++i) {
+writer.write(new String[] { "robot_" + i, String.valueOf(i) });
+  }
+  writer.close();
+} catch (Exception e) {
+  e.printStackTrace();
+  Assert.fail(e.getMessage());
+}
+  }
+
+  @Test public void testReadParallely() throws IOException, 
InterruptedException {
+long numRows = 1000;
--- End diff --

We must not add a huge-record test case in a UT; the PR builder time for every PR 
will be affected by it. Test locally with huge data, but keep the UT to fewer rows, 
say 10 rows.


---


[GitHub] carbondata pull request #2850: [WIP] Added concurrent reading through SDK

2018-10-29 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2850#discussion_r229183179
  
--- Diff: 
store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonReader.java ---
@@ -114,6 +117,43 @@ public static CarbonReaderBuilder builder(String 
tablePath) {
 return builder(tablePath, tableName);
   }
 
+  /**
+   * Return a new list of {@link CarbonReader} objects
+   *
+   * @param maxSplits
+   */
+  public List<CarbonReader> split(int maxSplits) throws IOException {
+validateReader();
+if (maxSplits < 1) {
+  throw new RuntimeException(
+  this.getClass().getSimpleName() + ".split: maxSplits must be 
positive");
+}
+
+List<CarbonReader> carbonReaders = new ArrayList<>();
+
+// If maxSplits < readers.size
+// Split the reader into maxSplits splits with each
+// element contains >= 1 CarbonRecordReader objects
+if (maxSplits < this.readers.size()) {
+  for (int i = 0; i < maxSplits; ++i) {
+carbonReaders.add(new CarbonReader<>(this.readers
+.subList((int) Math.ceil((float) (i * this.readers.size()) / 
maxSplits),
--- End diff --

This is constant; compute it outside the loop and use it each time.
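
Something along these lines (a sketch, assuming the invariant meant here is the readers size and the boundary computed inline inside subList):

    // hoist the loop-invariant size out of the loop and compute each
    // boundary once per iteration instead of inline inside subList()
    int totalReaders = this.readers.size();
    for (int i = 0; i < maxSplits; ++i) {
      int from = (int) Math.ceil((float) (i * totalReaders) / maxSplits);
      int to = (int) Math.ceil((float) ((i + 1) * totalReaders) / maxSplits);
      carbonReaders.add(new CarbonReader<>(this.readers.subList(from, to)));
    }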


---


[GitHub] carbondata pull request #2850: [WIP] Added concurrent reading through SDK

2018-10-29 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2850#discussion_r229179978
  
--- Diff: 
store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonReader.java ---
@@ -114,6 +117,43 @@ public static CarbonReaderBuilder builder(String 
tablePath) {
 return builder(tablePath, tableName);
   }
 
+  /**
+   * Return a new list of {@link CarbonReader} objects
+   *
--- End diff --

Add a clear description: mention what happens if maxSplits is greater than the 
number of files and what happens if it is less than the number of files.
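
Roughly along these lines for the javadoc (wording only; the behaviour is as described in this thread):

    /**
     * Breaks this reader into a list of independent {@link CarbonReader} objects.
     *
     * If maxSplits is less than the number of underlying record readers, the record
     * readers are grouped so that each returned reader wraps one or more of them.
     * If maxSplits is greater than or equal to the number of record readers, one
     * reader is returned per record reader.
     *
     * @param maxSplits upper bound on the number of readers returned; must be >= 1
     */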


---


[GitHub] carbondata pull request #2850: [WIP] Added concurrent reading through SDK

2018-10-29 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2850#discussion_r229179713
  
--- Diff: 
store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonReader.java ---
@@ -114,6 +117,43 @@ public static CarbonReaderBuilder builder(String 
tablePath) {
 return builder(tablePath, tableName);
   }
 
+  /**
+   * Return a new list of {@link CarbonReader} objects
+   *
+   * @param maxSplits
+   */
+  public List<CarbonReader> split(int maxSplits) throws IOException {
+validateReader();
+if (maxSplits < 1) {
+  throw new RuntimeException(
+  this.getClass().getSimpleName() + ".split: maxSplits must be 
positive");
+}
+
+List<CarbonReader> carbonReaders = new ArrayList<>();
+
+// If maxSplits < readers.size
+// Split the reader into maxSplits splits with each
+// element contains >= 1 CarbonRecordReader objects
+if (maxSplits < this.readers.size()) {
+  for (int i = 0; i < maxSplits; ++i) {
+carbonReaders.add(new CarbonReader<>(this.readers
+.subList((int) Math.ceil((float) (i * this.readers.size()) / 
maxSplits),
+(int) Math.ceil((float) ((i + 1) * this.readers.size()) / 
maxSplits;
+  }
+}
+// If maxSplits >= readers.size
+// Split the reader into reader.size splits with each
--- End diff --

Keep the comments inside the else block for easier reading of the code.


---


[GitHub] carbondata pull request #2850: [WIP] Added concurrent reading through SDK

2018-10-29 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2850#discussion_r229179428
  
--- Diff: 
store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonReader.java ---
@@ -114,6 +117,43 @@ public static CarbonReaderBuilder builder(String 
tablePath) {
 return builder(tablePath, tableName);
   }
 
+  /**
+   * Return a new list of {@link CarbonReader} objects
+   *
+   * @param maxSplits
+   */
+  public List<CarbonReader> split(int maxSplits) throws IOException {
--- End diff --

New interfaces exposed by the SDK need to be documented in sdk-guide.md; you can 
add this API's info there.


---


[GitHub] carbondata issue #2876: [CARBONDATA-3054] Fix Dictionary file cannot be read...

2018-10-29 Thread ajantha-bhat
Github user ajantha-bhat commented on the issue:

https://github.com/apache/carbondata/pull/2876
  
retest this please


---


[GitHub] carbondata pull request #2804: [CARBONDATA-2996] CarbonSchemaReader support ...

2018-10-29 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2804#discussion_r229173861
  
--- Diff: 
store/sdk/src/test/java/org/apache/carbondata/sdk/file/CarbonSchemaReaderTest.java
 ---
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.sdk.file;
+
+import java.io.File;
+import java.util.HashMap;
+import java.util.Map;
+
+import junit.framework.TestCase;
+import org.apache.carbondata.core.metadata.datatype.DataTypes;
+import org.apache.commons.io.FileUtils;
+import org.junit.Test;
+
+public class CarbonSchemaReaderTest extends TestCase {
+
+  @Test
+  public void testReadSchemaFromDataFile() {
+String path = "./testWriteFiles";
+try {
+  FileUtils.deleteDirectory(new File(path));
+
+  Field[] fields = new Field[11];
+  fields[0] = new Field("stringField", DataTypes.STRING);
+  fields[1] = new Field("shortField", DataTypes.SHORT);
+  fields[2] = new Field("intField", DataTypes.INT);
+  fields[3] = new Field("longField", DataTypes.LONG);
+  fields[4] = new Field("doubleField", DataTypes.DOUBLE);
+  fields[5] = new Field("boolField", DataTypes.BOOLEAN);
+  fields[6] = new Field("dateField", DataTypes.DATE);
+  fields[7] = new Field("timeField", DataTypes.TIMESTAMP);
+  fields[8] = new Field("decimalField", DataTypes.createDecimalType(8, 
2));
+  fields[9] = new Field("varcharField", DataTypes.VARCHAR);
+  fields[10] = new Field("arrayField", 
DataTypes.createArrayType(DataTypes.STRING));
+  Map<String, String> map = new HashMap<>();
+  map.put("complex_delimiter_level_1", "#");
+  CarbonWriter writer = CarbonWriter.builder()
+  .outputPath(path)
+  .withLoadOptions(map)
+  .withCsvInput(new Schema(fields)).build();
+
+  for (int i = 0; i < 10; i++) {
+String[] row2 = new String[]{
+"robot" + (i % 10),
+String.valueOf(i % 1),
+String.valueOf(i),
+String.valueOf(Long.MAX_VALUE - i),
+String.valueOf((double) i / 2),
+String.valueOf(true),
+"2019-03-02",
+"2019-02-12 03:03:34",
+"12.345",
+"varchar",
+"Hello#World#From#Carbon"
+};
+writer.write(row2);
+  }
+  writer.close();
+
+  Schema schema = CarbonSchemaReader
+  .readSchemaInDataFile(path)
+  .asOriginOrder();
+  // Transform the schema
+  assertEquals(schema.getFields().length, 11);
+  String[] strings = new String[schema.getFields().length];
+  for (int i = 0; i < schema.getFields().length; i++) {
+strings[i] = (schema.getFields())[i].getFieldName();
+  }
+  assert (strings[0].equalsIgnoreCase("stringField"));
+  assert (strings[1].equalsIgnoreCase("shortField"));
+  assert (strings[2].equalsIgnoreCase("intField"));
+  assert (strings[3].equalsIgnoreCase("longField"));
+  assert (strings[4].equalsIgnoreCase("doubleField"));
--- End diff --

Can move this into a method and reuse it for both test cases.
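
For example, a shared helper (hypothetical name) that both tests could call after reading the schema:

    // hypothetical helper shared by both test cases: verify the field names
    // come back in the original declaration order
    private void assertFieldNamesInOriginOrder(Schema schema) {
      String[] expected = {"stringField", "shortField", "intField", "longField",
          "doubleField", "boolField", "dateField", "timeField", "decimalField",
          "varcharField", "arrayField"};
      Assert.assertEquals(expected.length, schema.getFields().length);
      for (int i = 0; i < expected.length; i++) {
        Assert.assertTrue(expected[i].equalsIgnoreCase(schema.getFields()[i].getFieldName()));
      }
    }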



---


[GitHub] carbondata pull request #2804: [CARBONDATA-2996] CarbonSchemaReader support ...

2018-10-29 Thread ajantha-bhat
Github user ajantha-bhat commented on a diff in the pull request:

https://github.com/apache/carbondata/pull/2804#discussion_r229173916
  
--- Diff: 
store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonSchemaReader.java 
---
@@ -59,11 +60,30 @@ public static Schema readSchemaInSchemaFile(String 
schemaFilePath) throws IOExce
   /**
* Read carbondata file and return the schema
*
-   * @param dataFilePath complete path including carbondata file name
+   * @param path complete path including carbondata file name
* @return Schema object
* @throws IOException
*/
-  public static Schema readSchemaInDataFile(String dataFilePath) throws 
IOException {
+  public static Schema readSchemaInDataFile(String path) throws 
IOException {
+String dataFilePath = path;
+if (!(dataFilePath.contains(".carbondata"))) {
+  CarbonFile[] carbonFiles = FileFactory
+  .getCarbonFile(path)
+  .listFiles(new CarbonFileFilter() {
+@Override
+public boolean accept(CarbonFile file) {
+  if (file == null) {
+return false;
+  }
+  return file.getName().endsWith(".carbondata");
+}
+  });
+  if (carbonFiles == null || carbonFiles.length < 1) {
+throw new RuntimeException("Carbon data file not exists.");
+  }
+  dataFilePath = carbonFiles[0].getAbsolutePath();
--- End diff --

In that case you can implement

String getFirstCarbonFile(path, extensionType)

and pass the result to the existing method. ReadSchemaFromFile() must only read it; 
it should not do any extra work.
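
A sketch of that helper, reusing the listing logic already in this diff (name from the comment above; placement and visibility are assumptions):

    // hypothetical helper: resolve a folder path to its first file with the given
    // suffix, so readSchemaInDataFile() only has to read the file it is handed
    private static String getFirstCarbonFile(String path, final String extension) {
      CarbonFile[] carbonFiles = FileFactory.getCarbonFile(path)
          .listFiles(new CarbonFileFilter() {
            @Override public boolean accept(CarbonFile file) {
              return file != null && file.getName().endsWith(extension);
            }
          });
      if (carbonFiles == null || carbonFiles.length < 1) {
        throw new RuntimeException("Carbon data file does not exist.");
      }
      return carbonFiles[0].getAbsolutePath();
    }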


---

