[spark] branch branch-3.4 updated: [SPARK-42296][SQL] Apply spark.sql.inferTimestampNTZInDataSources.enabled on JDBC data source
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/branch-3.4 by this push:
     new b8c0fb9c760 [SPARK-42296][SQL] Apply spark.sql.inferTimestampNTZInDataSources.enabled on JDBC data source
b8c0fb9c760 is described below

commit b8c0fb9c7605f7ab51fbb0e900f9334f4e748218
Author: Gengliang Wang
AuthorDate: Fri Feb 3 10:06:26 2023 -0800

    [SPARK-42296][SQL] Apply spark.sql.inferTimestampNTZInDataSources.enabled on JDBC data source

    ### What changes were proposed in this pull request?

    Similar to https://github.com/apache/spark/pull/39777 and https://github.com/apache/spark/pull/39812, this PR proposes to use `spark.sql.inferTimestampNTZInDataSources.enabled` to control the behavior of timestamp type inference on JDBC data sources.

    ### Why are the changes needed?

    Unify the TimestampNTZ type inference behavior over data sources. In JDBC/JSON/CSV data sources, a column can be Timestamp type or TimestampNTZ type. We need a lightweight configuration to control the behavior.

    ### Does this PR introduce _any_ user-facing change?

    No, TimestampNTZ is not released yet.

    ### How was this patch tested?

    UTs

    Closes #39868 from gengliangwang/jdbcNTZ.

    Authored-by: Gengliang Wang
    Signed-off-by: Dongjoon Hyun
    (cherry picked from commit 4760a8bd845292f7d6d6a35320bd80082a76c7c5)
    Signed-off-by: Dongjoon Hyun
---
 .../org/apache/spark/sql/internal/SQLConf.scala    |  9 ++---
 .../execution/datasources/jdbc/JDBCOptions.scala   |  7 +++-
 .../org/apache/spark/sql/jdbc/JDBCSuite.scala      | 43 --
 3 files changed, 42 insertions(+), 17 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 1cc3b61b834..363e763be4f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -3520,11 +3520,10 @@ object SQLConf {

   val INFER_TIMESTAMP_NTZ_IN_DATA_SOURCES =
     buildConf("spark.sql.inferTimestampNTZInDataSources.enabled")
-      .doc("When true, the TimestampNTZ type is the prior choice of the schema inference " +
-        "over built-in data sources. Otherwise, the inference result will be TimestampLTZ for " +
-        "backward compatibility. As a result, for JSON/CSV files and partition directories " +
-        "written with TimestampNTZ columns, the inference results will still be of TimestampLTZ " +
-        "types.")
+      .doc("For the schema inference of JSON/CSV/JDBC data sources and partition directories, " +
+        "this config determines whether to choose the TimestampNTZ type if a column can be " +
+        "either TimestampNTZ or TimestampLTZ type. If set to true, the inference result of " +
+        "the column will be TimestampNTZ type. Otherwise, the result will be TimestampLTZ type.")
       .version("3.4.0")
       .booleanConf
       .createWithDefault(false)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala
index e725de95335..888951cf9a8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala
@@ -26,6 +26,7 @@ import org.apache.spark.SparkFiles
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
 import org.apache.spark.sql.errors.QueryExecutionErrors
+import org.apache.spark.sql.internal.SQLConf

 /**
  * Options for the JDBC data source.
@@ -232,7 +233,11 @@ class JDBCOptions(
   val prepareQuery = parameters.get(JDBC_PREPARE_QUERY).map(_ + " ").getOrElse("")

   // Infers timestamp values as TimestampNTZ type when reading data.
-  val inferTimestampNTZType = parameters.getOrElse(JDBC_INFER_TIMESTAMP_NTZ, "false").toBoolean
+  val inferTimestampNTZType =
+    parameters
+      .get(JDBC_INFER_TIMESTAMP_NTZ)
+      .map(_.toBoolean)
+      .getOrElse(SQLConf.get.inferTimestampNTZInDataSources)
 }

 class JdbcOptionsInWrite(

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
index 3e317dc9547..3b3b1bfdb60 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
@@ -23,6 +23,7 @@ import java.time.{Instant, LocalDate, LocalDateTime}
 import java.util.{Calendar, GregorianCalendar, Properties, TimeZone}

 import scala.collection.JavaConverters._
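A usage sketch for the new conf (spark-shell style; the H2 URL and table name are hypothetical placeholders, not from the commit):

    // With the conf enabled, a database TIMESTAMP (without time zone) column can be
    // inferred as TimestampNTZType instead of the default TimestampType (LTZ).
    spark.conf.set("spark.sql.inferTimestampNTZInDataSources.enabled", "true")

    val df = spark.read
      .format("jdbc")
      .option("url", "jdbc:h2:mem:testdb") // hypothetical connection URL
      .option("dbtable", "events")         // hypothetical table name
      .load()

    df.printSchema() // timestamp columns show as timestamp_ntz when the inference applies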
[spark] branch master updated: [SPARK-42296][SQL] Apply spark.sql.inferTimestampNTZInDataSources.enabled on JDBC data source
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 4760a8bd845 [SPARK-42296][SQL] Apply spark.sql.inferTimestampNTZInDataSources.enabled on JDBC data source
4760a8bd845 is described below

commit 4760a8bd845292f7d6d6a35320bd80082a76c7c5
Author: Gengliang Wang
AuthorDate: Fri Feb 3 10:06:26 2023 -0800

    [SPARK-42296][SQL] Apply spark.sql.inferTimestampNTZInDataSources.enabled on JDBC data source

    ### What changes were proposed in this pull request?

    Similar to https://github.com/apache/spark/pull/39777 and https://github.com/apache/spark/pull/39812, this PR proposes to use `spark.sql.inferTimestampNTZInDataSources.enabled` to control the behavior of timestamp type inference on JDBC data sources.

    ### Why are the changes needed?

    Unify the TimestampNTZ type inference behavior over data sources. In JDBC/JSON/CSV data sources, a column can be Timestamp type or TimestampNTZ type. We need a lightweight configuration to control the behavior.

    ### Does this PR introduce _any_ user-facing change?

    No, TimestampNTZ is not released yet.

    ### How was this patch tested?

    UTs

    Closes #39868 from gengliangwang/jdbcNTZ.

    Authored-by: Gengliang Wang
    Signed-off-by: Dongjoon Hyun
---
 .../org/apache/spark/sql/internal/SQLConf.scala    |  9 ++---
 .../execution/datasources/jdbc/JDBCOptions.scala   |  7 +++-
 .../org/apache/spark/sql/jdbc/JDBCSuite.scala      | 43 --
 3 files changed, 42 insertions(+), 17 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 1cc3b61b834..363e763be4f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -3520,11 +3520,10 @@ object SQLConf {

   val INFER_TIMESTAMP_NTZ_IN_DATA_SOURCES =
     buildConf("spark.sql.inferTimestampNTZInDataSources.enabled")
-      .doc("When true, the TimestampNTZ type is the prior choice of the schema inference " +
-        "over built-in data sources. Otherwise, the inference result will be TimestampLTZ for " +
-        "backward compatibility. As a result, for JSON/CSV files and partition directories " +
-        "written with TimestampNTZ columns, the inference results will still be of TimestampLTZ " +
-        "types.")
+      .doc("For the schema inference of JSON/CSV/JDBC data sources and partition directories, " +
+        "this config determines whether to choose the TimestampNTZ type if a column can be " +
+        "either TimestampNTZ or TimestampLTZ type. If set to true, the inference result of " +
+        "the column will be TimestampNTZ type. Otherwise, the result will be TimestampLTZ type.")
       .version("3.4.0")
       .booleanConf
       .createWithDefault(false)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala
index e725de95335..888951cf9a8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala
@@ -26,6 +26,7 @@ import org.apache.spark.SparkFiles
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
 import org.apache.spark.sql.errors.QueryExecutionErrors
+import org.apache.spark.sql.internal.SQLConf

 /**
  * Options for the JDBC data source.
@@ -232,7 +233,11 @@ class JDBCOptions(
   val prepareQuery = parameters.get(JDBC_PREPARE_QUERY).map(_ + " ").getOrElse("")

   // Infers timestamp values as TimestampNTZ type when reading data.
-  val inferTimestampNTZType = parameters.getOrElse(JDBC_INFER_TIMESTAMP_NTZ, "false").toBoolean
+  val inferTimestampNTZType =
+    parameters
+      .get(JDBC_INFER_TIMESTAMP_NTZ)
+      .map(_.toBoolean)
+      .getOrElse(SQLConf.get.inferTimestampNTZInDataSources)
 }

 class JdbcOptionsInWrite(

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
index 3e317dc9547..3b3b1bfdb60 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
@@ -23,6 +23,7 @@ import java.time.{Instant, LocalDate, LocalDateTime}
 import java.util.{Calendar, GregorianCalendar, Properties, TimeZone}

 import scala.collection.JavaConverters._
+import scala.util.Random

 import org.mockito.ArgumentMatchers._
 import org.mockito.Mockito._

@@ -1935,13 +1936,26
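Note the precedence the JDBCOptions change establishes: an explicit per-source `inferTimestampNTZType` option wins, and the session conf is only the fallback. A sketch (same hypothetical connection as above):

    spark.conf.set("spark.sql.inferTimestampNTZInDataSources.enabled", "true")

    // The explicit data source option overrides the session conf, so this read
    // still infers TimestampType (LTZ) for timestamp columns.
    val df = spark.read
      .format("jdbc")
      .option("url", "jdbc:h2:mem:testdb")      // hypothetical
      .option("dbtable", "events")              // hypothetical
      .option("inferTimestampNTZType", "false") // takes precedence over the conf
      .load()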
[spark] branch branch-3.4 updated: [SPARK-42333][SQL] Change log level to debug when fetching result set from SparkExecuteStatementOperation
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/branch-3.4 by this push:
     new cf8d35a1c75 [SPARK-42333][SQL] Change log level to debug when fetching result set from SparkExecuteStatementOperation
cf8d35a1c75 is described below

commit cf8d35a1c75dec91998288cc3d364b1d769423e9
Author: Yuming Wang
AuthorDate: Fri Feb 3 10:01:07 2023 -0800

    [SPARK-42333][SQL] Change log level to debug when fetching result set from SparkExecuteStatementOperation

    ### What changes were proposed in this pull request?

    Change log level from info to debug when fetching result set from `SparkExecuteStatementOperation`.

    ### Why are the changes needed?

    Avoid generating too many logs (screenshot: https://user-images.githubusercontent.com/5399861/216561187-6ad00458-d196-4f3a-a314-b2f309aec482.png)

    ### Does this PR introduce _any_ user-facing change?

    No.

    ### How was this patch tested?

    Unit test.

    Closes #39876 from wangyum/SPARK-42333.

    Authored-by: Yuming Wang
    Signed-off-by: Dongjoon Hyun
    (cherry picked from commit 4ebfc0e2e9405282e3fa044aaa59b69ddb8a1162)
    Signed-off-by: Dongjoon Hyun
---
 .../spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
index 3292cbef417..c41e92e618b 100644
--- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
+++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala
@@ -96,7 +96,7 @@ private[hive] class SparkExecuteStatementOperation(
   private def getNextRowSetInternal(
       order: FetchOrientation,
       maxRowsL: Long): TRowSet = withLocalProperties {
-    log.info(s"Received getNextRowSet request order=${order} and maxRowsL=${maxRowsL} " +
+    log.debug(s"Received getNextRowSet request order=${order} and maxRowsL=${maxRowsL} " +
       s"with ${statementId}")
     validateDefaultFetchOrientation(order)
     assertState(OperationState.FINISHED)
@@ -112,7 +112,7 @@ private[hive] class SparkExecuteStatementOperation(
     val maxRows = maxRowsL.toInt
     val offset = iter.getPosition
     val rows = iter.take(maxRows).toList
-    log.info(s"Returning result set with ${rows.length} rows from offsets " +
+    log.debug(s"Returning result set with ${rows.length} rows from offsets " +
       s"[${iter.getFetchStart}, ${offset}) with $statementId")
     RowSetUtils.toTRowSet(offset, rows, dataTypes, getProtocolVersion, getTimeFormatters)
   }
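Deployments that still want these fetch-progress messages after the change can raise just this class's logger to DEBUG in conf/log4j2.properties; a sketch (the logger key `thriftFetch` is an arbitrary name):

    logger.thriftFetch.name = org.apache.spark.sql.hive.thriftserver.SparkExecuteStatementOperation
    logger.thriftFetch.level = debug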
[spark] branch master updated (02b39f0b880 -> 4ebfc0e2e94)
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

    from 02b39f0b880 [SPARK-41985][SQL][FOLLOWUP] Remove alias in GROUP BY only when the expr is resolved
     add 4ebfc0e2e94 [SPARK-42333][SQL] Change log level to debug when fetching result set from SparkExecuteStatementOperation

No new revisions were added by this update.

Summary of changes:
 .../spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
[spark] branch branch-3.2 updated: [SPARK-41554] fix changing of Decimal scale when scale decreased by m…
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/branch-3.2 by this push:
     new d304a652823 [SPARK-41554] fix changing of Decimal scale when scale decreased by m…
d304a652823 is described below

commit d304a6528233798dc93d31da58be220cf6d0485e
Author: oleksii.diagiliev
AuthorDate: Fri Feb 3 10:49:42 2023 -0600

    [SPARK-41554] fix changing of Decimal scale when scale decreased by m…

    …ore than 18

    This is a backport PR for https://github.com/apache/spark/pull/39099

    Closes #39381 from fe2s/branch-3.2-fix-decimal-scaling.

    Authored-by: oleksii.diagiliev
    Signed-off-by: Sean Owen
---
 .../scala/org/apache/spark/sql/types/Decimal.scala | 60 +-
 .../org/apache/spark/sql/types/DecimalSuite.scala  | 53 ++-
 2 files changed, 88 insertions(+), 25 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
index bc5fba8d0d8..503a887d690 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
@@ -388,30 +388,42 @@ final class Decimal extends Ordered[Decimal] with Serializable {
     if (scale < _scale) {
       // Easier case: we just need to divide our scale down
       val diff = _scale - scale
-      val pow10diff = POW_10(diff)
-      // % and / always round to 0
-      val droppedDigits = longVal % pow10diff
-      longVal /= pow10diff
-      roundMode match {
-        case ROUND_FLOOR =>
-          if (droppedDigits < 0) {
-            longVal += -1L
-          }
-        case ROUND_CEILING =>
-          if (droppedDigits > 0) {
-            longVal += 1L
-          }
-        case ROUND_HALF_UP =>
-          if (math.abs(droppedDigits) * 2 >= pow10diff) {
-            longVal += (if (droppedDigits < 0) -1L else 1L)
-          }
-        case ROUND_HALF_EVEN =>
-          val doubled = math.abs(droppedDigits) * 2
-          if (doubled > pow10diff || doubled == pow10diff && longVal % 2 != 0) {
-            longVal += (if (droppedDigits < 0) -1L else 1L)
-          }
-        case _ =>
-          throw QueryExecutionErrors.unsupportedRoundingMode(roundMode)
+      // If diff is greater than max number of digits we store in Long, then
+      // value becomes 0. Otherwise we calculate new value dividing by power of 10.
+      // In both cases we apply rounding after that.
+      if (diff > MAX_LONG_DIGITS) {
+        longVal = roundMode match {
+          case ROUND_FLOOR => if (longVal < 0) -1L else 0L
+          case ROUND_CEILING => if (longVal > 0) 1L else 0L
+          case ROUND_HALF_UP | ROUND_HALF_EVEN => 0L
+          case _ => sys.error(s"Not supported rounding mode: $roundMode")
+        }
+      } else {
+        val pow10diff = POW_10(diff)
+        // % and / always round to 0
+        val droppedDigits = longVal % pow10diff
+        longVal /= pow10diff
+        roundMode match {
+          case ROUND_FLOOR =>
+            if (droppedDigits < 0) {
+              longVal += -1L
+            }
+          case ROUND_CEILING =>
+            if (droppedDigits > 0) {
+              longVal += 1L
+            }
+          case ROUND_HALF_UP =>
+            if (math.abs(droppedDigits) * 2 >= pow10diff) {
+              longVal += (if (droppedDigits < 0) -1L else 1L)
+            }
+          case ROUND_HALF_EVEN =>
+            val doubled = math.abs(droppedDigits) * 2
+            if (doubled > pow10diff || doubled == pow10diff && longVal % 2 != 0) {
+              longVal += (if (droppedDigits < 0) -1L else 1L)
+            }
+          case _ =>
+            throw QueryExecutionErrors.unsupportedRoundingMode(roundMode)
+        }
       }
     } else if (scale > _scale) {
       // We might be able to multiply longVal by a power of 10 and not overflow, but if not,

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DecimalSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DecimalSuite.scala
index 5433c561a03..1f4862fcbdc 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DecimalSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DecimalSuite.scala
@@ -27,6 +27,9 @@ import org.apache.spark.sql.types.Decimal._
 import org.apache.spark.unsafe.types.UTF8String

 class DecimalSuite extends SparkFunSuite with PrivateMethodTester with SQLHelper {
+
+  val allSupportedRoundModes = Seq(ROUND_HALF_UP, ROUND_HALF_EVEN, ROUND_CEILING, ROUND_FLOOR)
+
   /** Check that a Decimal has the given string representation, precision and scale */
   private
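Restated outside of Spark for clarity: when the scale drop `diff` exceeds the 18 decimal digits a Long-backed Decimal can hold, every stored digit is discarded and only the rounding direction matters. A self-contained sketch that mirrors (but is not) the patched branch:

    import scala.math.BigDecimal.RoundingMode
    import scala.math.BigDecimal.RoundingMode.RoundingMode

    // New unscaled Long value after the scale decreases by more than 18 digits:
    // |value| is now strictly below half a unit in the last place of the new scale,
    // so the result is 0, or +/-1 for the directed (CEILING/FLOOR) modes.
    def roundAfterLargeScaleDrop(longVal: Long, mode: RoundingMode): Long = mode match {
      case RoundingMode.FLOOR => if (longVal < 0) -1L else 0L
      case RoundingMode.CEILING => if (longVal > 0) 1L else 0L
      case RoundingMode.HALF_UP | RoundingMode.HALF_EVEN => 0L
      case other => sys.error(s"Not supported rounding mode: $other")
    }

    // 0.000000000000000123 (unscaled 123 at scale 18) rescaled to tens (scale -1):
    // CEILING yields unscaled 1 at the new scale (i.e. the value 10), HALF_UP yields 0.
    assert(roundAfterLargeScaleDrop(123L, RoundingMode.CEILING) == 1L)
    assert(roundAfterLargeScaleDrop(123L, RoundingMode.HALF_UP) == 0L)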
[spark] branch branch-3.3 updated: [SPARK-41554] fix changing of Decimal scale when scale decreased by m…
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/branch-3.3 by this push:
     new 2d539c5c702 [SPARK-41554] fix changing of Decimal scale when scale decreased by m…
2d539c5c702 is described below

commit 2d539c5c7022d44d8a2d53e752287c42c2601444
Author: oleksii.diagiliev
AuthorDate: Fri Feb 3 10:48:56 2023 -0600

    [SPARK-41554] fix changing of Decimal scale when scale decreased by m…

    …ore than 18

    This is a backport PR for https://github.com/apache/spark/pull/39099

    Closes #39813 from fe2s/branch-3.3-fix-decimal-scaling.

    Authored-by: oleksii.diagiliev
    Signed-off-by: Sean Owen
---
 .../scala/org/apache/spark/sql/types/Decimal.scala | 60 +-
 .../org/apache/spark/sql/types/DecimalSuite.scala  | 53 ++-
 2 files changed, 88 insertions(+), 25 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
index 7a43d01eb2f..07a2c47cff0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala
@@ -397,30 +397,42 @@ final class Decimal extends Ordered[Decimal] with Serializable {
     if (scale < _scale) {
       // Easier case: we just need to divide our scale down
       val diff = _scale - scale
-      val pow10diff = POW_10(diff)
-      // % and / always round to 0
-      val droppedDigits = longVal % pow10diff
-      longVal /= pow10diff
-      roundMode match {
-        case ROUND_FLOOR =>
-          if (droppedDigits < 0) {
-            longVal += -1L
-          }
-        case ROUND_CEILING =>
-          if (droppedDigits > 0) {
-            longVal += 1L
-          }
-        case ROUND_HALF_UP =>
-          if (math.abs(droppedDigits) * 2 >= pow10diff) {
-            longVal += (if (droppedDigits < 0) -1L else 1L)
-          }
-        case ROUND_HALF_EVEN =>
-          val doubled = math.abs(droppedDigits) * 2
-          if (doubled > pow10diff || doubled == pow10diff && longVal % 2 != 0) {
-            longVal += (if (droppedDigits < 0) -1L else 1L)
-          }
-        case _ =>
-          throw QueryExecutionErrors.unsupportedRoundingMode(roundMode)
+      // If diff is greater than max number of digits we store in Long, then
+      // value becomes 0. Otherwise we calculate new value dividing by power of 10.
+      // In both cases we apply rounding after that.
+      if (diff > MAX_LONG_DIGITS) {
+        longVal = roundMode match {
+          case ROUND_FLOOR => if (longVal < 0) -1L else 0L
+          case ROUND_CEILING => if (longVal > 0) 1L else 0L
+          case ROUND_HALF_UP | ROUND_HALF_EVEN => 0L
+          case _ => sys.error(s"Not supported rounding mode: $roundMode")
+        }
+      } else {
+        val pow10diff = POW_10(diff)
+        // % and / always round to 0
+        val droppedDigits = longVal % pow10diff
+        longVal /= pow10diff
+        roundMode match {
+          case ROUND_FLOOR =>
+            if (droppedDigits < 0) {
+              longVal += -1L
+            }
+          case ROUND_CEILING =>
+            if (droppedDigits > 0) {
+              longVal += 1L
+            }
+          case ROUND_HALF_UP =>
+            if (math.abs(droppedDigits) * 2 >= pow10diff) {
+              longVal += (if (droppedDigits < 0) -1L else 1L)
+            }
+          case ROUND_HALF_EVEN =>
+            val doubled = math.abs(droppedDigits) * 2
+            if (doubled > pow10diff || doubled == pow10diff && longVal % 2 != 0) {
+              longVal += (if (droppedDigits < 0) -1L else 1L)
+            }
+          case _ =>
+            throw QueryExecutionErrors.unsupportedRoundingMode(roundMode)
+        }
       }
     } else if (scale > _scale) {
       // We might be able to multiply longVal by a power of 10 and not overflow, but if not,

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DecimalSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DecimalSuite.scala
index 6f70dc51b95..6ccd2b9bd32 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DecimalSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DecimalSuite.scala
@@ -27,6 +27,9 @@ import org.apache.spark.sql.types.Decimal._
 import org.apache.spark.unsafe.types.UTF8String

 class DecimalSuite extends SparkFunSuite with PrivateMethodTester with SQLHelper {
+
+  val allSupportedRoundModes = Seq(ROUND_HALF_UP, ROUND_HALF_EVEN, ROUND_CEILING, ROUND_FLOOR)
+
   /** Check that a Decimal has the given string representation, precision and scale */
   private
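For context, the reason the unpatched branch failed outright: POW_10 only holds powers of ten that fit in a Long, so indexing it with a diff above 18 walks off the end of the array. A minimal illustration (the table construction here is an assumption mirroring Decimal.scala):

    val MAX_LONG_DIGITS = 18 // 10^18 is the largest power of ten that fits in a Long
    val POW_10: Array[Long] = Array.tabulate(MAX_LONG_DIGITS + 1)(i => math.pow(10, i).toLong)

    val diff = 19 // e.g. rescaling from scale 18 down to scale -1
    // Before the fix, the lookup below ran unconditionally for any scale decrease and
    // threw ArrayIndexOutOfBoundsException; the fix branches on diff > MAX_LONG_DIGITS first.
    // POW_10(diff) // would throw here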
[spark] branch branch-3.4 updated: [SPARK-41985][SQL][FOLLOWUP] Remove alias in GROUP BY only when the expr is resolved
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/branch-3.4 by this push:
     new 0106e17fca6 [SPARK-41985][SQL][FOLLOWUP] Remove alias in GROUP BY only when the expr is resolved
0106e17fca6 is described below

commit 0106e17fca6f1c51f737af67097fd28d89329b20
Author: Wenchen Fan
AuthorDate: Fri Feb 3 15:40:33 2023 +0300

    [SPARK-41985][SQL][FOLLOWUP] Remove alias in GROUP BY only when the expr is resolved

    ### What changes were proposed in this pull request?

    This is a followup of https://github.com/apache/spark/pull/39508 to fix a regression. We should not remove aliases from grouping expressions if they are not resolved, as the alias may be necessary for resolution, such as `CreateNamedStruct`.

    ### Why are the changes needed?

    fix a regression

    ### Does this PR introduce _any_ user-facing change?

    no

    ### How was this patch tested?

    new test

    Closes #39867 from cloud-fan/column.

    Lead-authored-by: Wenchen Fan
    Co-authored-by: Wenchen Fan
    Signed-off-by: Max Gekk
    (cherry picked from commit 02b39f0b880a2ecf63167355d9644e91c98588a8)
    Signed-off-by: Max Gekk
---
 .../sql/catalyst/analysis/ResolveReferencesInAggregate.scala  |  8 +++-
 sql/core/src/test/resources/sql-tests/inputs/group-by.sql     |  3 +++
 .../src/test/resources/sql-tests/results/group-by.sql.out     | 11 +++
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala
index 4af2ecc91ab..1a9ed4ce16e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala
@@ -96,7 +96,13 @@ object ResolveReferencesInAggregate extends SQLConfHelper
       // can't find the grouping expressions via `semanticEquals` and the analysis will fail.
       // Example rules: ResolveGroupingAnalytics (See SPARK-31670 for more details) and
       // ResolveLateralColumnAliasReference.
-      groupingExpressions = resolvedGroupExprs.map(trimAliases),
+      groupingExpressions = resolvedGroupExprs.map { e =>
+        // Only trim the alias if the expression is resolved, as the alias may be needed to resolve
+        // the expression, such as `NamePlaceHolder` in `CreateNamedStruct`.
+        // Note: this rule will be invoked even if the Aggregate is fully resolved. So alias in
+        // GROUP BY will be removed eventually, by following iterations.
+        if (e.resolved) trimAliases(e) else e
+      },
       aggregateExpressions = resolvedAggExprsWithOuter)
   }

diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
index 1615c43cc7e..c812403ba2c 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
@@ -34,6 +34,9 @@ SELECT a + b, COUNT(b) FROM testData GROUP BY a + b;
 SELECT a + 2, COUNT(b) FROM testData GROUP BY a + 1;
 SELECT a + 1 + 1, COUNT(b) FROM testData GROUP BY a + 1;

+-- struct() in group by
+SELECT count(1) FROM testData GROUP BY struct(a + 0.1 AS aa);
+
 -- Aggregate with nulls.
 SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a), AVG(a), VARIANCE(a), STDDEV(a), SUM(a), COUNT(a)
 FROM testData;

diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
index 0402039fafa..6e7592d6978 100644
--- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
@@ -145,6 +145,17 @@ struct<((a + 1) + 1):int,count(b):bigint>
 NULL 1


+-- !query
+SELECT count(1) FROM testData GROUP BY struct(a + 0.1 AS aa)
+-- !query schema
+struct
+-- !query output
+2
+2
+2
+3
+
+
 -- !query
 SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a), AVG(a), VARIANCE(a), STDDEV(a), SUM(a), COUNT(a)
 FROM testData
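The fixed behavior can be reproduced end-to-end in a spark-shell session; this sketch uses a simplified stand-in for the suite's testData table:

    spark.range(4).selectExpr("id AS a").createOrReplaceTempView("testData")

    // Before this fix, the analyzer stripped `AS aa` from the still-unresolved
    // struct(...), leaving CreateNamedStruct's NamePlaceHolder nothing to resolve to.
    spark.sql("SELECT count(1) FROM testData GROUP BY struct(a + 0.1 AS aa)").show()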
[spark] branch master updated: [SPARK-41985][SQL][FOLLOWUP] Remove alias in GROUP BY only when the expr is resolved
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 02b39f0b880 [SPARK-41985][SQL][FOLLOWUP] Remove alias in GROUP BY only when the expr is resolved
02b39f0b880 is described below

commit 02b39f0b880a2ecf63167355d9644e91c98588a8
Author: Wenchen Fan
AuthorDate: Fri Feb 3 15:40:33 2023 +0300

    [SPARK-41985][SQL][FOLLOWUP] Remove alias in GROUP BY only when the expr is resolved

    ### What changes were proposed in this pull request?

    This is a followup of https://github.com/apache/spark/pull/39508 to fix a regression. We should not remove aliases from grouping expressions if they are not resolved, as the alias may be necessary for resolution, such as `CreateNamedStruct`.

    ### Why are the changes needed?

    fix a regression

    ### Does this PR introduce _any_ user-facing change?

    no

    ### How was this patch tested?

    new test

    Closes #39867 from cloud-fan/column.

    Lead-authored-by: Wenchen Fan
    Co-authored-by: Wenchen Fan
    Signed-off-by: Max Gekk
---
 .../sql/catalyst/analysis/ResolveReferencesInAggregate.scala  |  8 +++-
 sql/core/src/test/resources/sql-tests/inputs/group-by.sql     |  3 +++
 .../src/test/resources/sql-tests/results/group-by.sql.out     | 11 +++
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala
index 4af2ecc91ab..1a9ed4ce16e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala
@@ -96,7 +96,13 @@ object ResolveReferencesInAggregate extends SQLConfHelper
       // can't find the grouping expressions via `semanticEquals` and the analysis will fail.
       // Example rules: ResolveGroupingAnalytics (See SPARK-31670 for more details) and
       // ResolveLateralColumnAliasReference.
-      groupingExpressions = resolvedGroupExprs.map(trimAliases),
+      groupingExpressions = resolvedGroupExprs.map { e =>
+        // Only trim the alias if the expression is resolved, as the alias may be needed to resolve
+        // the expression, such as `NamePlaceHolder` in `CreateNamedStruct`.
+        // Note: this rule will be invoked even if the Aggregate is fully resolved. So alias in
+        // GROUP BY will be removed eventually, by following iterations.
+        if (e.resolved) trimAliases(e) else e
+      },
       aggregateExpressions = resolvedAggExprsWithOuter)
   }

diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
index 1615c43cc7e..c812403ba2c 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
@@ -34,6 +34,9 @@ SELECT a + b, COUNT(b) FROM testData GROUP BY a + b;
 SELECT a + 2, COUNT(b) FROM testData GROUP BY a + 1;
 SELECT a + 1 + 1, COUNT(b) FROM testData GROUP BY a + 1;

+-- struct() in group by
+SELECT count(1) FROM testData GROUP BY struct(a + 0.1 AS aa);
+
 -- Aggregate with nulls.
 SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a), AVG(a), VARIANCE(a), STDDEV(a), SUM(a), COUNT(a)
 FROM testData;

diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
index 0402039fafa..6e7592d6978 100644
--- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
@@ -145,6 +145,17 @@ struct<((a + 1) + 1):int,count(b):bigint>
 NULL 1


+-- !query
+SELECT count(1) FROM testData GROUP BY struct(a + 0.1 AS aa)
+-- !query schema
+struct
+-- !query output
+2
+2
+2
+3
+
+
 -- !query
 SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a), AVG(a), VARIANCE(a), STDDEV(a), SUM(a), COUNT(a)
 FROM testData
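The same guard, read as a general analyzer idiom: rules in the fixed-point analyzer can fire on partially-resolved plans, so destructive rewrites should be gated on `resolved` and deferred to a later iteration otherwise. A simplified sketch (top-level aliases only, unlike Spark's recursive trimAliases):

    import org.apache.spark.sql.catalyst.expressions.{Alias, Expression}

    // Strip an alias only once the expression underneath is resolved; otherwise keep
    // it, and let a following analyzer iteration handle it after resolution.
    def trimAliasesWhenResolved(exprs: Seq[Expression]): Seq[Expression] =
      exprs.map {
        case a: Alias if a.resolved => a.child
        case e => e
      }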
[spark] branch branch-3.4 updated: [SPARK-42234][SQL] Rename error class: `UNSUPPORTED_FEATURE.REPEATED_PIVOT`
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/branch-3.4 by this push:
     new 5ca47b63328 [SPARK-42234][SQL] Rename error class: `UNSUPPORTED_FEATURE.REPEATED_PIVOT`
5ca47b63328 is described below

commit 5ca47b63328faf97c0b09af67f51814274c5f9bc
Author: itholic
AuthorDate: Fri Feb 3 15:17:00 2023 +0300

    [SPARK-42234][SQL] Rename error class: `UNSUPPORTED_FEATURE.REPEATED_PIVOT`

    ### What changes were proposed in this pull request?

    This PR proposes to rename `UNSUPPORTED_FEATURE.REPEATED_PIVOT` to `REPEATED_CLAUSE`.

    ### Why are the changes needed?

    `REPEATED_PIVOT` is actually not an `UNSUPPORTED_FEATURE`, and there must be other cases we should cover in a more generic way.

    ### Does this PR introduce _any_ user-facing change?

    No.

    ### How was this patch tested?

    Updated UTs.

    Closes #39795 from itholic/REPEATED_PIVOT.

    Lead-authored-by: itholic
    Co-authored-by: Haejoon Lee <44108233+itho...@users.noreply.github.com>
    Signed-off-by: Max Gekk
    (cherry picked from commit a916a059100a53583fb987b47ffde5745627fdb8)
    Signed-off-by: Max Gekk
---
 core/src/main/resources/error/error-classes.json              | 11 ++-
 .../org/apache/spark/sql/errors/QueryExecutionErrors.scala    |  6 +++---
 .../scala/org/apache/spark/sql/RelationalGroupedDataset.scala |  4 +++-
 .../apache/spark/sql/errors/QueryExecutionErrorsSuite.scala   |  6 +++---
 4 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json
index 0a929d5f48e..7cd70bda8bb 100644
--- a/core/src/main/resources/error/error-classes.json
+++ b/core/src/main/resources/error/error-classes.json
@@ -1208,6 +1208,12 @@
     ],
     "sqlState" : "42K03"
   },
+  "REPEATED_CLAUSE" : {
+    "message" : [
+      "The <clause> clause may be used at most once per <operation> operation."
+    ],
+    "sqlState" : "42614"
+  },
   "ROUTINE_ALREADY_EXISTS" : {
     "message" : [
       "Cannot create the function because it already exists.",
@@ -1597,11 +1603,6 @@
       "Python UDF in the ON clause of a JOIN. In case of an INNNER JOIN consider rewriting to a CROSS JOIN with a WHERE clause."
     ]
   },
-  "REPEATED_PIVOT" : {
-    "message" : [
-      "Repeated PIVOT operation."
-    ]
-  },
   "SET_NAMESPACE_PROPERTY" : {
     "message" : [
       " is a reserved namespace property, ."

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala
index c64c26e510b..b3bd7b727bf 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala
@@ -2602,10 +2602,10 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase {
       cause = null)
   }

-  def repeatedPivotsUnsupportedError(): Throwable = {
+  def repeatedPivotsUnsupportedError(clause: String, operation: String): Throwable = {
     new SparkUnsupportedOperationException(
-      errorClass = "UNSUPPORTED_FEATURE.REPEATED_PIVOT",
-      messageParameters = Map.empty[String, String])
+      errorClass = "REPEATED_CLAUSE",
+      messageParameters = Map("clause" -> clause, "operation" -> operation))
   }

   def pivotNotAfterGroupByUnsupportedError(): Throwable = {

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala
index 61517de0dfa..b168bbc4b42 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala
@@ -483,7 +483,9 @@ class RelationalGroupedDataset protected[sql](
           groupingExprs,
           RelationalGroupedDataset.PivotType(pivotColumn.expr, valueExprs))
       case _: RelationalGroupedDataset.PivotType =>
-        throw QueryExecutionErrors.repeatedPivotsUnsupportedError()
+        throw QueryExecutionErrors.repeatedPivotsUnsupportedError(
+          clause = "PIVOT", operation = "SUBQUERY"
+        )
       case _ =>
         throw QueryExecutionErrors.pivotNotAfterGroupByUnsupportedError()
     }

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionErrorsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionErrorsSuite.scala
index c679e4f707f..5d4b8e0b0c4 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionErrorsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionErrorsSuite.scala
@@
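What now triggers the renamed error, sketched in spark-shell style (toy data; the quoted message assumes the `<clause>`/`<operation>` template above):

    import spark.implicits._

    val df = Seq(("g1", "p1", 1), ("g1", "p2", 2)).toDF("k", "p", "v")

    // The second pivot matches the PivotType case in RelationalGroupedDataset and throws
    // SparkUnsupportedOperationException [REPEATED_CLAUSE]:
    //   "The PIVOT clause may be used at most once per SUBQUERY operation."
    df.groupBy("k").pivot("p").pivot("p")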
[spark] branch master updated: [SPARK-42234][SQL] Rename error class: `UNSUPPORTED_FEATURE.REPEATED_PIVOT`
This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new a916a059100 [SPARK-42234][SQL] Rename error class: `UNSUPPORTED_FEATURE.REPEATED_PIVOT`
a916a059100 is described below

commit a916a059100a53583fb987b47ffde5745627fdb8
Author: itholic
AuthorDate: Fri Feb 3 15:17:00 2023 +0300

    [SPARK-42234][SQL] Rename error class: `UNSUPPORTED_FEATURE.REPEATED_PIVOT`

    ### What changes were proposed in this pull request?

    This PR proposes to rename `UNSUPPORTED_FEATURE.REPEATED_PIVOT` to `REPEATED_CLAUSE`.

    ### Why are the changes needed?

    `REPEATED_PIVOT` is actually not an `UNSUPPORTED_FEATURE`, and there must be other cases we should cover in a more generic way.

    ### Does this PR introduce _any_ user-facing change?

    No.

    ### How was this patch tested?

    Updated UTs.

    Closes #39795 from itholic/REPEATED_PIVOT.

    Lead-authored-by: itholic
    Co-authored-by: Haejoon Lee <44108233+itho...@users.noreply.github.com>
    Signed-off-by: Max Gekk
---
 core/src/main/resources/error/error-classes.json              | 11 ++-
 .../org/apache/spark/sql/errors/QueryExecutionErrors.scala    |  6 +++---
 .../scala/org/apache/spark/sql/RelationalGroupedDataset.scala |  4 +++-
 .../apache/spark/sql/errors/QueryExecutionErrorsSuite.scala   |  6 +++---
 4 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json
index 178eda8ce11..030c65e2056 100644
--- a/core/src/main/resources/error/error-classes.json
+++ b/core/src/main/resources/error/error-classes.json
@@ -1208,6 +1208,12 @@
     ],
     "sqlState" : "42K03"
   },
+  "REPEATED_CLAUSE" : {
+    "message" : [
+      "The <clause> clause may be used at most once per <operation> operation."
+    ],
+    "sqlState" : "42614"
+  },
   "ROUTINE_ALREADY_EXISTS" : {
     "message" : [
       "Cannot create the function because it already exists.",
@@ -1597,11 +1603,6 @@
       "Python UDF in the ON clause of a JOIN. In case of an INNNER JOIN consider rewriting to a CROSS JOIN with a WHERE clause."
     ]
   },
-  "REPEATED_PIVOT" : {
-    "message" : [
-      "Repeated PIVOT operation."
-    ]
-  },
   "SET_NAMESPACE_PROPERTY" : {
     "message" : [
       " is a reserved namespace property, ."

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala
index c64c26e510b..b3bd7b727bf 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala
@@ -2602,10 +2602,10 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase {
       cause = null)
   }

-  def repeatedPivotsUnsupportedError(): Throwable = {
+  def repeatedPivotsUnsupportedError(clause: String, operation: String): Throwable = {
     new SparkUnsupportedOperationException(
-      errorClass = "UNSUPPORTED_FEATURE.REPEATED_PIVOT",
-      messageParameters = Map.empty[String, String])
+      errorClass = "REPEATED_CLAUSE",
+      messageParameters = Map("clause" -> clause, "operation" -> operation))
   }

   def pivotNotAfterGroupByUnsupportedError(): Throwable = {

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala
index 61517de0dfa..b168bbc4b42 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala
@@ -483,7 +483,9 @@ class RelationalGroupedDataset protected[sql](
           groupingExprs,
           RelationalGroupedDataset.PivotType(pivotColumn.expr, valueExprs))
       case _: RelationalGroupedDataset.PivotType =>
-        throw QueryExecutionErrors.repeatedPivotsUnsupportedError()
+        throw QueryExecutionErrors.repeatedPivotsUnsupportedError(
+          clause = "PIVOT", operation = "SUBQUERY"
+        )
      case _ =>
         throw QueryExecutionErrors.pivotNotAfterGroupByUnsupportedError()
     }

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionErrorsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionErrorsSuite.scala
index c679e4f707f..5d4b8e0b0c4 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionErrorsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionErrorsSuite.scala
@@ -205,9 +205,9 @@ class QueryExecutionErrorsSuite
     }
     checkError(
       exception = e1,
-      errorClass =