Re: can't apply mappartitions to dataframe generated from carboncontext

2017-06-12 Thread Mic Sun
org.apache.spark.SparkException: Task not serializable
at
org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:304)
at
org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:294)
at
org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:122)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2054)
at
org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:926)
at
org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:925)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:323)
at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:925)
at
org.apache.spark.sql.DataFrame$$anonfun$foreachPartition$1.apply$mcV$sp(DataFrame.scala:1445)
at
org.apache.spark.sql.DataFrame$$anonfun$foreachPartition$1.apply(DataFrame.scala:1445)
at
org.apache.spark.sql.DataFrame$$anonfun$foreachPartition$1.apply(DataFrame.scala:1445)
at
org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:56)
at
org.apache.spark.sql.DataFrame.withNewExecutionId(DataFrame.scala:2087)
at
org.apache.spark.sql.DataFrame.foreachPartition(DataFrame.scala:1444)
at
$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.(:66)
at
$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.(:72)
at
$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.(:74)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.(:76)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.(:78)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.(:80)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.(:82)
at $iwC$$iwC$$iwC$$iwC$$iwC.(:84)
at $iwC$$iwC$$iwC$$iwC.(:86)
at $iwC$$iwC$$iwC.(:88)
at $iwC$$iwC.(:90)
at $iwC.(:92)
at (:94)
at .(:98)
at .()
at .(:7)
at .()
at $print()
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at
org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065)
at
org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1346)
at
org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:840)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:871)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:819)
at
org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$pasteCommand(SparkILoop.scala:825)
at
org.apache.spark.repl.SparkILoop$$anonfun$standardCommands$8.apply(SparkILoop.scala:345)
at
org.apache.spark.repl.SparkILoop$$anonfun$standardCommands$8.apply(SparkILoop.scala:345)
at
scala.tools.nsc.interpreter.LoopCommands$LoopCommand$$anonfun$nullary$1.apply(LoopCommands.scala:65)
at
scala.tools.nsc.interpreter.LoopCommands$LoopCommand$$anonfun$nullary$1.apply(LoopCommands.scala:65)
at
scala.tools.nsc.interpreter.LoopCommands$NullaryCmd.apply(LoopCommands.scala:76)
at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:809)
at
org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:657)
at
org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:665)
at
org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$loop(SparkILoop.scala:670)
at
org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply$mcZ$sp(SparkILoop.scala:997)
at
org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
at
org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
at
scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
at
org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$process(SparkILoop.scala:945)
at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1059)
at org.apache.spark.repl.Main$.main(Main.scala:31)
at org.apache.spark.repl.Main.main(Main.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at
org.apache.spark.deploy.SparkSubmit$.org$apach

[jira] [Created] (CARBONDATA-1154) Driver Side IUD Performance Optimization

2017-06-12 Thread sounak chakraborty (JIRA)
sounak chakraborty created CARBONDATA-1154:
--

 Summary: Driver Side IUD Performance Optimization
 Key: CARBONDATA-1154
 URL: https://issues.apache.org/jira/browse/CARBONDATA-1154
 Project: CarbonData
  Issue Type: Bug
Reporter: sounak chakraborty


Driver Side IUD Performance Optimization
1. Get invalid blocks only when there is an Update performed in the Table.

2. As UpdateVO is per segment basis no need to call it for each blocks.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Created] (CARBONDATA-1155) DataLoad failure for noDictionarySortColumns with 3Lakh data

2017-06-12 Thread Rahul Kumar (JIRA)
Rahul Kumar created CARBONDATA-1155:
---

 Summary: DataLoad failure for noDictionarySortColumns with 3Lakh 
data
 Key: CARBONDATA-1155
 URL: https://issues.apache.org/jira/browse/CARBONDATA-1155
 Project: CarbonData
  Issue Type: Bug
Reporter: Rahul Kumar
Assignee: Rahul Kumar


CREATE TABLE IF NOT EXISTS flow_carbon_test4
(
col1 String,
col2  String,
col3  String,
col4  String,
col5  String,
col6  String,
col7  String,
col8 String,
col9 String,
col10  String,
col11  String,
col12  String,
col13  String,
col14   String,
col15  String,
col16 String,
col17 int,
col18 int,
col19  String,
col20  String,
col21  String,
col22 String,
col23 String,
col24 String,
col25   String,
col26  String,
col27  String,
col28String,
col29 String,
col30 String,
col31  String,
col32  String,
col33  String,
col34 String,
col35 String,
col36  String,
col37  String,
col38 String,
col39 String,
col40 String,
col41 String,
col42  String,
col43  String,
col44 String,
col45 String,
col46 String,
col47 String,
col48 String,
col49  String,
col50 decimal(15,2),
col51 decimal(15,2),
col52 String,
col53 String,
col54 String,
col55  String,
col56  String,
col57  String,
col58 String,
col59 String,
col60 String,
col61 String,
col62 String,
val_dte String,
opp_ac_flg  String,
cmb_flg String,
ass_vch_flg String,
col63 String,
col64 String,
vch_bus_rmk String,
tec_rmk_cde String,
vch_tec_rmk String,
rsv_ara String,
col65 String,
col66 String,
col67   String,
col68String
)
STORED BY 'org.apache.carbondata.format'
TBLPROPERTIES('DICTIONARY_INCLUDE'='col2,col18,col3,col4,col31,col32,col34,
col37,col8,col41,col43,col46,col47,col48,col49,col52,col53,col55,
col56,col57,col59,col60,col61,col62,opp_ac_flg,cmb_flg,ass_vch_flg,
col63,col64,vch_bus_rmk,tec_rmk_cde,vch_tec_rmk,rsv_ara,col6,col5',
'DICTIONARY_EXCLUDE'='col15,col16,col19,col20,col21,col22,col23,
col24,col10,col25,col26,col11,col27,col14,col1,col28,col29,col30,
col33,col35,col36,col38,col39,col40,col9,
col42,col44,col45,col54,col58,col13,col12,col7,val_dte,
col65,col66,col67,col68','table_blocksize'='1',
'sort_columns'='col1')

 LOAD DATA  inpath 'D:/CSVs/20140101_3_3_1.csv' into table flow_carbon_test4 
options('DELIMITER'=',', 
'QUOTECHAR'='"','FILEHEADER'='col15,col16,col17,col18,col19,col20,col21,col22,col23,col24,col10,col25,col26,col11,col27,col14,col1,col28,col29,col3,col4,col30,col31,col32,col33,col34,col35,col36,col37,col8,col38,col39,col40,col9,col41,col42,col43,col44,col45,col46,col47,col48,col49,col50,col51,col52,col53,col54,col55,col56,col57,col58,col13,col12,col7,col59,col60,col61,col62,val_dte,opp_ac_flg,cmb_flg,ass_vch_flg,col63,col64,vch_bus_rmk,tec_rmk_cde,vch_tec_rmk,rsv_ara,col6,col5,col65,col66,col67,col68,col2','sort_scope'='BATCH_SORT','batch_sort_size_inmb'='64')
*Error: java.lang.Exception: DataLoad failure: There is an unexpected error: 
There is an unexpected error while closing data handler (state=,code=0)*





--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Created] (CARBONDATA-1156) IUD Performance Improvement And Synchronization issue

2017-06-12 Thread kumar vishal (JIRA)
kumar vishal created CARBONDATA-1156:


 Summary: IUD Performance Improvement And Synchronization issue 
 Key: CARBONDATA-1156
 URL: https://issues.apache.org/jira/browse/CARBONDATA-1156
 Project: CarbonData
  Issue Type: Bug
Reporter: kumar vishal
Assignee: kumar vishal


Delete delta file loading is taking more time because it is read at blocklet level. 
Now added code to read at block level.
In the current IUD design, delete delta files are listed for each block at 
executor level; in case of a parallel query and IUD operation this may give wrong 
results. Now passing delete delta information from the driver to the executor.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Created] (CARBONDATA-1157) Dependency conflict when executing load query on multinode cluster with Spark 1.6

2017-06-12 Thread SWATI RAO (JIRA)
SWATI RAO created CARBONDATA-1157:
-

 Summary: Dependency conflict when executing load query on 
multinode cluster with Spark 1.6
 Key: CARBONDATA-1157
 URL: https://issues.apache.org/jira/browse/CARBONDATA-1157
 Project: CarbonData
  Issue Type: Bug
  Components: data-load
Affects Versions: 1.1.0
 Environment: Spark 1.6
Reporter: SWATI RAO




java.io.InvalidClassException: org.apache.carbondata.spark.DataLoadResultImpl; 
local class incompatible: stream classdesc serialVersionUID = 
459643937457370671, local class serialVersionUID = 870423879879520920
 [exec] at 
java.io.ObjectStreamClass.initNonProxy(ObjectStreamClass.java:616)
 [exec] at 
java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1630)
 [exec] at 
java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1521)
 [exec] at 
java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1781)
 [exec] at 
java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1353)
 [exec] at 
java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2018)
 [exec] at 
java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1942)
 [exec] at 
java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1808)
 [exec] at 
java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1353)
 [exec] at 
java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2018)
 [exec] at 
java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1942)
 [exec] at 
java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1808)
 [exec] at 
java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1353)
 [exec] at 
java.io.ObjectInputStream.readObject(ObjectInputStream.java:373)
 [exec] at 
org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:76)
 [exec] at 
org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:115)
 [exec] at 
org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
 [exec] at org.apache.spark.scheduler.Task.run(Task.scala:89)
 [exec] at 
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
 [exec] at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
 [exec] at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
 [exec] at java.lang.Thread.run(Thread.java:745)
 [exec] 
 [exec] Driver stacktrace:



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Created] (CARBONDATA-1158) Hive integration code optimization

2017-06-12 Thread Liang Chen (JIRA)
Liang Chen created CARBONDATA-1158:
--

 Summary: Hive integration code optimization
 Key: CARBONDATA-1158
 URL: https://issues.apache.org/jira/browse/CARBONDATA-1158
 Project: CarbonData
  Issue Type: Sub-task
  Components: hive-integration
Reporter: Liang Chen


Hive integration code optimization:
1. Remove redundant and unused code.
2. Optimize some code
a) Convert some internal functions from public to private.
b) Fix some code which may generate error.
c) Change code as per java code style.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Created] (CARBONDATA-1159) Batch sort loading is not proper without synchronization

2017-06-12 Thread dhatchayani (JIRA)
dhatchayani created CARBONDATA-1159:
---

 Summary: Batch sort loading is not proper without synchronization
 Key: CARBONDATA-1159
 URL: https://issues.apache.org/jira/browse/CARBONDATA-1159
 Project: CarbonData
  Issue Type: Bug
Reporter: dhatchayani
Assignee: dhatchayani
Priority: Minor






--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


RE: About ColumnGroup feature

2017-06-12 Thread Jihong Ma
not a feature beneficial to real-world application, vote for A!

Jihong

-Original Message-
From: Jacky Li [mailto:jacky.li...@qq.com] 
Sent: Friday, June 09, 2017 6:05 PM
To: dev@carbondata.apache.org
Subject: About ColumnGroup feature

Hi Community,

In JIRA 1014, we are adding Unsafe ColumnPage in data load process to reduce 
GC, and adding EncodingStrategy for Encoding Overriding features to make open 
up encoding interface for both usability and extensibility.

When implementing these new features, I found ColumnGroup feature creates 
unnecessary burden on data loading code and makes developer interfaces more 
complex to use. For example, since ColumnGroup can appear in any field in the 
table, even in SORT_COLUMNS, it forces developer to understand ColumnGroup 
concept and handle it in its encoding implementation.

As far as I know, there is not much discussion on ColumnGroup in the mail list 
and I think it is high possibility that no one is using it. So I am proposing 
to remove this feature and it can enable carbon to make Encoding interface 
cleaner for developers. 

Please vote:
A. Remove it
B. Not remove it, and provide reason


Thanks,
Jacky Li



Re: About ColumnGroup feature

2017-06-12 Thread David CaiQiang
+1 for A

As far as I know, so far the ColumnGroup feature can't improve performance very well;
it has nearly become a useless feature. If necessary, we need to redesign this
feature to keep the code clean and tune it well to improve performance.



-
Best Regards
David Cai
--
View this message in context: 
http://apache-carbondata-dev-mailing-list-archive.1130556.n5.nabble.com/About-ColumnGroup-feature-tp14436p14729.html
Sent from the Apache CarbonData Dev Mailing List archive mailing list archive 
at Nabble.com.


[jira] [Created] (CARBONDATA-1160) Use spark multi-threads model to load data

2017-06-12 Thread Yadong Qi (JIRA)
Yadong Qi created CARBONDATA-1160:
-

 Summary: Use spark multi-threads model to load data
 Key: CARBONDATA-1160
 URL: https://issues.apache.org/jira/browse/CARBONDATA-1160
 Project: CarbonData
  Issue Type: New Feature
  Components: data-load
Affects Versions: 1.1.0
Reporter: Yadong Qi






--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Created] (CARBONDATA-1161) Generate one index file per segment to improve the performance of first query

2017-06-12 Thread Yadong Qi (JIRA)
Yadong Qi created CARBONDATA-1161:
-

 Summary: Generate one index file per segment to improve the 
performance of first query
 Key: CARBONDATA-1161
 URL: https://issues.apache.org/jira/browse/CARBONDATA-1161
 Project: CarbonData
  Issue Type: Sub-task
Reporter: Yadong Qi






--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Created] (CARBONDATA-1162) After compact segments, data sorted in global level.

2017-06-12 Thread Yadong Qi (JIRA)
Yadong Qi created CARBONDATA-1162:
-

 Summary: After compact segments, data sorted in global level.
 Key: CARBONDATA-1162
 URL: https://issues.apache.org/jira/browse/CARBONDATA-1162
 Project: CarbonData
  Issue Type: Sub-task
Reporter: Yadong Qi






--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Created] (CARBONDATA-1163) Use sortBy operator to load data

2017-06-12 Thread Yadong Qi (JIRA)
Yadong Qi created CARBONDATA-1163:
-

 Summary: Use sortBy operator to load data
 Key: CARBONDATA-1163
 URL: https://issues.apache.org/jira/browse/CARBONDATA-1163
 Project: CarbonData
  Issue Type: Sub-task
Reporter: Yadong Qi






--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Created] (CARBONDATA-1164) Make Column Group feature deprecated

2017-06-12 Thread Jacky Li (JIRA)
Jacky Li created CARBONDATA-1164:


 Summary: Make Column Group feature deprecated
 Key: CARBONDATA-1164
 URL: https://issues.apache.org/jira/browse/CARBONDATA-1164
 Project: CarbonData
  Issue Type: Improvement
Reporter: Jacky Li


After discussion in community 
(http://apache-carbondata-dev-mailing-list-archive.1130556.n5.nabble.com/About-ColumnGroup-feature-td14436.html),
 we conclude that column group feature will be deprecated. 




--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Created] (CARBONDATA-1165) Class Cast exception in intermediate file merger when loading data

2017-06-12 Thread anubhav tarar (JIRA)
anubhav tarar created CARBONDATA-1165:
-

 Summary: Class Cast exception in intermediate file merger when 
loading data
 Key: CARBONDATA-1165
 URL: https://issues.apache.org/jira/browse/CARBONDATA-1165
 Project: CarbonData
  Issue Type: Bug
  Components: data-load
Affects Versions: 1.2.0
 Environment: spark2.1
Reporter: anubhav tarar
Assignee: anubhav tarar
Priority: Trivial


query:

 spark.sql("CREATE TABLE ORDERS ( O_ORDERKEY INT ,\n O_CUSTKEY INT ,\n 
O_ORDERSTATUS STRING ," +
  "\n O_TOTALPRICE DECIMAL(15,2) ,\n O_ORDERDATE TIMESTAMP ,\n 
O_ORDERPRIORITY STRING" +
  " , \n O_CLERK STRING , \n O_SHIPPRIORITY INT ,\n O_COMMENT 
STRING ) STORED BY " +
  "'carbondata'")


  spark.sql("LOAD DATA INPATH \"hdfs://localhost:54310/user1/orders.csv\" INTO 
TABLE orders " +
  "OPTIONS('DELIMITER'='|' , 
'QUOTECHAR'='\"','FILEHEADER'='O_ORDERKEY,O_CUSTKEY," +
  
"O_ORDERSTATUS,O_TOTALPRICE,O_ORDERDATE,O_ORDERPRIORITY,O_CLERK,O_SHIPPRIORITY,"
 +
  "O_COMMENT')")

logs:

java.lang.ClassCastException: java.lang.Integer cannot be cast to java.lang.Long
at 
org.apache.carbondata.processing.sortandgroupby.sortdata.IntermediateFileMerger.writeDataTofile(IntermediateFileMerger.java:347)
at 
org.apache.carbondata.processing.sortandgroupby.sortdata.IntermediateFileMerger.call(IntermediateFileMerger.java:112)
at 
org.apache.carbondata.processing.sortandgroupby.sortdata.IntermediateFileMerger.call(IntermediateFileMerger.java:37)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Created] (CARBONDATA-1166) creating partition on decimal column is failing

2017-06-12 Thread QiangCai (JIRA)
QiangCai created CARBONDATA-1166:


 Summary: creating partition on decimal column is failing
 Key: CARBONDATA-1166
 URL: https://issues.apache.org/jira/browse/CARBONDATA-1166
 Project: CarbonData
  Issue Type: Bug
Reporter: QiangCai
Assignee: QiangCai






--
This message was sent by Atlassian JIRA
(v6.4.14#64029)