Repository: spark Updated Branches: refs/heads/master 368513048 -> 78e133141
[SPARK-25708][SQL] HAVING without GROUP BY means global aggregate ## What changes were proposed in this pull request? According to the SQL standard, when a query contains `HAVING`, it indicates an aggregate operator. For more details please refer to https://blog.jooq.org/2014/12/04/do-you-really-understand-sqls-group-by-and-having-clauses/ However, in Spark SQL parser, we treat HAVING as a normal filter when there is no GROUP BY, which breaks SQL semantic and lead to wrong result. This PR fixes the parser. ## How was this patch tested? new test Closes #22696 from cloud-fan/having. Authored-by: Wenchen Fan <wenc...@databricks.com> Signed-off-by: gatorsmile <gatorsm...@gmail.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/78e13314 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/78e13314 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/78e13314 Branch: refs/heads/master Commit: 78e133141ce8131c60181f947346802864b0951a Parents: 3685130 Author: Wenchen Fan <wenc...@databricks.com> Authored: Fri Oct 12 00:24:06 2018 -0700 Committer: gatorsmile <gatorsm...@gmail.com> Committed: Fri Oct 12 00:24:06 2018 -0700 ---------------------------------------------------------------------- docs/sql-programming-guide.md | 1 + .../spark/sql/catalyst/parser/AstBuilder.scala | 41 +++++++++++++------- .../org/apache/spark/sql/internal/SQLConf.scala | 8 ++++ .../sql/catalyst/parser/PlanParserSuite.scala | 2 +- .../resources/sql-tests/inputs/group-by.sql | 7 ++++ .../sql-tests/results/group-by.sql.out | 27 ++++++++++++- .../sql/hive/execution/HiveQuerySuite.scala | 4 -- 7 files changed, 71 insertions(+), 19 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/78e13314/docs/sql-programming-guide.md ---------------------------------------------------------------------- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 0d29357..fb03ed2 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1977,6 +1977,7 @@ working with timestamps in `pandas_udf`s to get the best performance, see - Since Spark 2.4, Metadata files (e.g. Parquet summary files) and temporary files are not counted as data files when calculating table size during Statistics computation. - Since Spark 2.4, empty strings are saved as quoted empty strings `""`. In version 2.3 and earlier, empty strings are equal to `null` values and do not reflect to any characters in saved CSV files. For example, the row of `"a", null, "", 1` was writted as `a,,,1`. Since Spark 2.4, the same row is saved as `a,,"",1`. To restore the previous behavior, set the CSV option `emptyValue` to empty (not quoted) string. - Since Spark 2.4, The LOAD DATA command supports wildcard `?` and `*`, which match any one character, and zero or more characters, respectively. Example: `LOAD DATA INPATH '/tmp/folder*/'` or `LOAD DATA INPATH '/tmp/part-?'`. Special Characters like `space` also now work in paths. Example: `LOAD DATA INPATH '/tmp/folder name/'`. + - In Spark version 2.3 and earlier, HAVING without GROUP BY is treated as WHERE. This means, `SELECT 1 FROM range(10) HAVING true` is executed as `SELECT 1 FROM range(10) WHERE true` and returns 10 rows. This violates SQL standard, and has been fixed in Spark 2.4. Since Spark 2.4, HAVING without GROUP BY is treated as a global aggregate, which means `SELECT 1 FROM range(10) HAVING true` will return only one row. To restore the previous behavior, set `spark.sql.legacy.parser.havingWithoutGroupByAsWhere` to `true`. ## Upgrading From Spark SQL 2.3.0 to 2.3.1 and above http://git-wip-us.apache.org/repos/asf/spark/blob/78e13314/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index ba0b72e..672bffc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -394,6 +394,17 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging Filter(expression(ctx), plan) } + def withHaving(ctx: BooleanExpressionContext, plan: LogicalPlan): LogicalPlan = { + // Note that we add a cast to non-predicate expressions. If the expression itself is + // already boolean, the optimizer will get rid of the unnecessary cast. + val predicate = expression(ctx) match { + case p: Predicate => p + case e => Cast(e, BooleanType) + } + Filter(predicate, plan) + } + + // Expressions. val expressions = Option(namedExpressionSeq).toSeq .flatMap(_.namedExpression.asScala) @@ -446,30 +457,34 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging case e: NamedExpression => e case e: Expression => UnresolvedAlias(e) } - val withProject = if (aggregation != null) { - withAggregation(aggregation, namedExpressions, withFilter) - } else if (namedExpressions.nonEmpty) { + + def createProject() = if (namedExpressions.nonEmpty) { Project(namedExpressions, withFilter) } else { withFilter } - // Having - val withHaving = withProject.optional(having) { - // Note that we add a cast to non-predicate expressions. If the expression itself is - // already boolean, the optimizer will get rid of the unnecessary cast. - val predicate = expression(having) match { - case p: Predicate => p - case e => Cast(e, BooleanType) + val withProject = if (aggregation == null && having != null) { + if (conf.getConf(SQLConf.LEGACY_HAVING_WITHOUT_GROUP_BY_AS_WHERE)) { + // If the legacy conf is set, treat HAVING without GROUP BY as WHERE. + withHaving(having, createProject()) + } else { + // According to SQL standard, HAVING without GROUP BY means global aggregate. + withHaving(having, Aggregate(Nil, namedExpressions, withFilter)) } - Filter(predicate, withProject) + } else if (aggregation != null) { + val aggregate = withAggregation(aggregation, namedExpressions, withFilter) + aggregate.optionalMap(having)(withHaving) + } else { + // When hitting this branch, `having` must be null. + createProject() } // Distinct val withDistinct = if (setQuantifier() != null && setQuantifier().DISTINCT() != null) { - Distinct(withHaving) + Distinct(withProject) } else { - withHaving + withProject } // Window http://git-wip-us.apache.org/repos/asf/spark/blob/78e13314/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index b699707..da70d7d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1567,6 +1567,14 @@ object SQLConf { .internal() .booleanConf .createWithDefault(false) + + val LEGACY_HAVING_WITHOUT_GROUP_BY_AS_WHERE = + buildConf("spark.sql.legacy.parser.havingWithoutGroupByAsWhere") + .internal() + .doc("If it is set to true, the parser will treat HAVING without GROUP BY as a normal " + + "WHERE, which does not follow SQL standard.") + .booleanConf + .createWithDefault(false) } /** http://git-wip-us.apache.org/repos/asf/spark/blob/78e13314/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala index 422bf97..f5da90f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala @@ -108,7 +108,7 @@ class PlanParserSuite extends AnalysisTest { assertEqual("select a, b from db.c where x < 1", table("db", "c").where('x < 1).select('a, 'b)) assertEqual( "select a, b from db.c having x < 1", - table("db", "c").select('a, 'b).where('x < 1)) + table("db", "c").groupBy()('a, 'b).where('x < 1)) assertEqual("select distinct a, b from db.c", Distinct(table("db", "c").select('a, 'b))) assertEqual("select all a, b from db.c", table("db", "c").select('a, 'b)) assertEqual("select from tbl", OneRowRelation().select('from.as("tbl"))) http://git-wip-us.apache.org/repos/asf/spark/blob/78e13314/sql/core/src/test/resources/sql-tests/inputs/group-by.sql ---------------------------------------------------------------------- diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql index 2c18d6a..433db71 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql @@ -73,3 +73,10 @@ where b.z != b.z; -- SPARK-24369 multiple distinct aggregations having the same argument set SELECT corr(DISTINCT x, y), corr(DISTINCT y, x), count(*) FROM (VALUES (1, 1), (2, 2), (2, 2)) t(x, y); + +-- SPARK-25708 HAVING without GROUP BY means global aggregate +SELECT 1 FROM range(10) HAVING true; + +SELECT 1 FROM range(10) HAVING MAX(id) > 0; + +SELECT id FROM range(10) HAVING id > 0; http://git-wip-us.apache.org/repos/asf/spark/blob/78e13314/sql/core/src/test/resources/sql-tests/results/group-by.sql.out ---------------------------------------------------------------------- diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out index 581aa17..f9d1ee8 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 27 +-- Number of queries: 30 -- !query 0 @@ -250,3 +250,28 @@ SELECT corr(DISTINCT x, y), corr(DISTINCT y, x), count(*) struct<corr(DISTINCT CAST(x AS DOUBLE), CAST(y AS DOUBLE)):double,corr(DISTINCT CAST(y AS DOUBLE), CAST(x AS DOUBLE)):double,count(1):bigint> -- !query 26 output 1.0 1.0 3 + + +-- !query 27 +SELECT 1 FROM range(10) HAVING true +-- !query 27 schema +struct<1:int> +-- !query 27 output +1 + + +-- !query 28 +SELECT 1 FROM range(10) HAVING MAX(id) > 0 +-- !query 28 schema +struct<1:int> +-- !query 28 output +1 + + +-- !query 29 +SELECT id FROM range(10) HAVING id > 0 +-- !query 29 schema +struct<> +-- !query 29 output +org.apache.spark.sql.AnalysisException +grouping expressions sequence is empty, and '`id`' is not an aggregate function. Wrap '()' in windowing function(s) or wrap '`id`' in first() (or first_value) if you don't care which value you get.; http://git-wip-us.apache.org/repos/asf/spark/blob/78e13314/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala ---------------------------------------------------------------------- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index b9c32e7..a5cff35 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -740,10 +740,6 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd sql("select key, count(*) c from src group by key having c").collect() } - test("SPARK-2225: turn HAVING without GROUP BY into a simple filter") { - assert(sql("select key from src having key > 490").collect().size < 100) - } - test("union/except/intersect") { assertResult(Array(Row(1), Row(1))) { sql("select 1 as a union all select 1 as a").collect() --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org