sql - group by on UDF not working

2014-11-07 Thread Tridib Samanta
I am trying to group by on a calculated field. Is it supported in Spark SQL? I 
am running it on a nested JSON structure.
 
Query: SELECT YEAR(c.Patient.DOB), sum(c.ClaimPay.TotalPayAmnt) FROM claim c 
group by YEAR(c.Patient.DOB)
 
Spark Version: spark-1.2.0-SNAPSHOT with Hive and Hadoop 2.4.
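
For context, the table comes from nested JSON, roughly along these lines (a rough sketch only; the thread doesn't show the loading code, so the file path and setup steps are assumptions):

// Rough reconstruction of the setup (assumed; the thread only shows the query).
// In the Spark 1.2 spark-shell, `sc` is predefined; a HiveContext is needed for YEAR().
import org.apache.spark.sql.hive.HiveContext

val sqlContext = new HiveContext(sc)
// "claims.json" is a placeholder path for the nested-JSON claim records.
val claims = sqlContext.jsonFile("claims.json")
claims.registerTempTable("claim")

// The failing query from the report.
sqlContext.sql(
  """SELECT YEAR(c.Patient.DOB), sum(c.ClaimPay.TotalPayAmnt)
    |FROM claim c
    |GROUP BY YEAR(c.Patient.DOB)""".stripMargin).collect()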
Error: 
 
org.apache.spark.sql.catalyst.errors.package$TreeNodeException: Expression not in GROUP BY: HiveSimpleUdf#org.apache.hadoop.hive.ql.udf.UDFYear(Patient#8.DOB AS DOB#191) AS c_0#185, tree:
Aggregate [HiveSimpleUdf#org.apache.hadoop.hive.ql.udf.UDFYear(Patient#8.DOB)], [HiveSimpleUdf#org.apache.hadoop.hive.ql.udf.UDFYear(Patient#8.DOB AS DOB#191) AS c_0#185,SUM(CAST(ClaimPay#5.TotalPayAmnt AS TotalPayAmnt#192, LongType)) AS c_1#186L]
 Subquery c
  Subquery claim
   LogicalRDD [AttendPhysician#0,BillProv#1,Claim#2,ClaimClinic#3,ClaimInfo#4,ClaimPay#5,ClaimTL#6,OpPhysician#7,Patient#8,PayToPhysician#9,Payer#10,Physician#11,RefProv#12,Services#13,Subscriber#14], MappedRDD[5] at map at JsonRDD.scala:43
at org.apache.spark.sql.catalyst.analysis.Analyzer$CheckAggregation$$anonfun$apply$3$$anonfun$applyOrElse$6.apply(Analyzer.scala:127)
at org.apache.spark.sql.catalyst.analysis.Analyzer$CheckAggregation$$anonfun$apply$3$$anonfun$applyOrElse$6.apply(Analyzer.scala:125)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.sql.catalyst.analysis.Analyzer$CheckAggregation$$anonfun$apply$3.applyOrElse(Analyzer.scala:125)
at org.apache.spark.sql.catalyst.analysis.Analyzer$CheckAggregation$$anonfun$apply$3.applyOrElse(Analyzer.scala:115)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:144)
at org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:135)
at org.apache.spark.sql.catalyst.analysis.Analyzer$CheckAggregation$.apply(Analyzer.scala:115)
at org.apache.spark.sql.catalyst.analysis.Analyzer$CheckAggregation$.apply(Analyzer.scala:113)
at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$apply$1$$anonfun$apply$2.apply(RuleExecutor.scala:61)
at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$apply$1$$anonfun$apply$2.apply(RuleExecutor.scala:59)
at scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:51)
at scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:60)
at scala.collection.mutable.WrappedArray.foldLeft(WrappedArray.scala:34)
at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$apply$1.apply(RuleExecutor.scala:59)
at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$apply$1.apply(RuleExecutor.scala:51)
at scala.collection.immutable.List.foreach(List.scala:318)
at org.apache.spark.sql.catalyst.rules.RuleExecutor.apply(RuleExecutor.scala:51)
at org.apache.spark.sql.SQLContext$QueryExecution.analyzed$lzycompute(SQLContext.scala:411)
at org.apache.spark.sql.SQLContext$QueryExecution.analyzed(SQLContext.scala:411)
at org.apache.spark.sql.SQLContext$QueryExecution.withCachedData$lzycompute(SQLContext.scala:412)
at org.apache.spark.sql.SQLContext$QueryExecution.withCachedData(SQLContext.scala:412)
at org.apache.spark.sql.SQLContext$QueryExecution.optimizedPlan$lzycompute(SQLContext.scala:413)
at org.apache.spark.sql.SQLContext$QueryExecution.optimizedPlan(SQLContext.scala:413)
at org.apache.spark.sql.SQLContext$QueryExecution.sparkPlan$lzycompute(SQLContext.scala:418)
at org.apache.spark.sql.SQLContext$QueryExecution.sparkPlan(SQLContext.scala:416)
at org.apache.spark.sql.SQLContext$QueryExecution.executedPlan$lzycompute(SQLContext.scala:422)
at org.apache.spark.sql.SQLContext$QueryExecution.executedPlan(SQLContext.scala:422)
at org.apache.spark.sql.SchemaRDD.collect(SchemaRDD.scala:423)
at $iwC$$iwC$$iwC$$iwC.init(console:17)
at $iwC$$iwC$$iwC.init(console:22)
at $iwC$$iwC.init(console:24)
at $iwC.init(console:26)
at init(console:28)
at .init(console:32)
at .clinit(console)
at .init(console:7)
at .clinit(console)
at $print(console)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)
at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)
at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)
at 

Re: sql - group by on UDF not working

2014-11-07 Thread Shixiong Zhu
Spark SQL doesn't support such a query at the moment. I can easily reproduce it.
Created a JIRA here: https://issues.apache.org/jira/browse/SPARK-4296
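
Until that's addressed, one possible workaround (an untested sketch, not taken from the JIRA) is to compute the expression once in a subquery, alias it, and group by the alias so no UDF call appears in the GROUP BY clause:

// Possible workaround sketch (untested): push YEAR() into an inner query and
// group on the resulting alias instead of repeating the UDF in GROUP BY.
val byYear = sqlContext.sql(
  """SELECT dobYear, SUM(totalPay)
    |FROM (SELECT YEAR(c.Patient.DOB) AS dobYear,
    |             c.ClaimPay.TotalPayAmnt AS totalPay
    |      FROM claim c) t
    |GROUP BY dobYear""".stripMargin)
byYear.collect().foreach(println)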

Best Regards,
Shixiong Zhu

2014-11-07 16:44 GMT+08:00 Tridib Samanta tridib.sama...@live.com:

 I am trying to group by on a calculated field. Is it supported in Spark
 SQL? I am running it on a nested JSON structure.

 Query: SELECT YEAR(c.Patient.DOB), sum(c.ClaimPay.TotalPayAmnt) FROM claim
 c group by YEAR(c.Patient.DOB)

 Spark Version: spark-1.2.0-SNAPSHOT with Hive and Hadoop 2.4.