[ https://issues.apache.org/jira/browse/SPARK-9740?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15116977#comment-15116977 ]
Emlyn Corrin edited comment on SPARK-9740 at 1/26/16 9:33 AM: -------------------------------------------------------------- I've put together a minimal example to demonstrate the problem: {code} package spark_test; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.functions; import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.expressions.Window; import org.apache.spark.sql.hive.HiveContext; public class SparkTestMain { public static void main(String[] args) { assert args.length == 1; SparkConf sparkConf = new SparkConf().setAppName("SparkTest"); JavaSparkContext ctx = new JavaSparkContext(sparkConf); SQLContext sqlCtx = new HiveContext(ctx); // I tried both SQLContext and HiveContext here DataFrame df = sqlCtx.read().json(args[0]); System.out.println(df.schema().simpleString()); DataFrame df2 = df.select(functions.expr("FIRST(value,true)") .over(Window.partitionBy(df.col("id")) .orderBy(df.col("time")) .rowsBetween(Long.MIN_VALUE, 0))); System.out.println(df2.take(5)); } } {code} I ran that with: {code} spark-submit --master local[*] spark-test.jar test.json {code} And it fails with: {code} struct<id:bigint,time:bigint,value:string> Exception in thread "main" java.lang.UnsupportedOperationException: 'FIRST('value,true) is not supported in a window operation. 
at org.apache.spark.sql.expressions.WindowSpec.withAggregate(WindowSpec.scala:191) at org.apache.spark.sql.Column.over(Column.scala:1049) at spark_test.SparkTestMain.main(SparkTestMain.java:23) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:483) at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:731) at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181) at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206) at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121) at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) {code} was (Author: emlyn): I've put together a minimal example to demonstrate the problem: {code} package spark_test; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.functions; import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.expressions.Window; import org.apache.spark.sql.hive.HiveContext; public class SparkTestMain { public static void main(String[] args) { assert args.length == 1; SparkConf sparkConf = new SparkConf().setAppName("SparkTest"); JavaSparkContext ctx = new JavaSparkContext(sparkConf); SQLContext sqlCtx = new HiveContext(ctx); // I tried both SQLContext and HiveContext here DataFrame df = sqlCtx.read().json(args[0]); System.out.println(df.schema().simpleString()); DataFrame df2 = df.select(functions.expr("FIRST(value,true)") .over(Window.partitionBy(df.col("id")) .orderBy(df.col("time")) .rowsBetween(Long.MIN_VALUE, 0))); System.out.println(df2.take(5)); } } {code} I ran that with: {code} spark-submit --master local[*] spark-test.jar test.json {code} And it fails with: {code} 
struct<id:bigint,time:bigint,value:string> Exception in thread "main" java.lang.UnsupportedOperationException: 'FIRST('value,true) is not supported in a window operation. at org.apache.spark.sql.expressions.WindowSpec.withAggregate(WindowSpec.scala:191) at org.apache.spark.sql.Column.over(Column.scala:1049) at spark_test.SparkTestMain.main(SparkTestMain.java:23) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:483) at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:731) at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181) at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206) at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121) at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) {code} > first/last aggregate NULL behavior > ---------------------------------- > > Key: SPARK-9740 > URL: https://issues.apache.org/jira/browse/SPARK-9740 > Project: Spark > Issue Type: Sub-task > Components: SQL > Reporter: Herman van Hovell > Assignee: Yin Huai > Labels: releasenotes > Fix For: 1.6.0 > > > The FIRST/LAST aggregates implemented as part of the new UDAF interface, > return the first or last non-null value (if any) found. This is a departure > from the behavior of the old FIRST/LAST aggregates and from the > FIRST_VALUE/LAST_VALUE aggregates in Hive. These would return a null value, > if that happened to be the first/last value seen. SPARK-9592 tries to 'fix' > this behavior for the old UDAF interface. > Hive makes this behavior configurable, by adding a skipNulls flag. I would > suggest to do the same, and make the default behavior compatible with Hive. 
-- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org For additional commands, e-mail: issues-help@spark.apache.org