GuoPhilipse commented on a change in pull request #28593:
URL: https://github.com/apache/spark/pull/28593#discussion_r429990585

##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
##########
@@ -59,7 +59,7 @@ object Cast {
     case (StringType, TimestampType) => true
     case (BooleanType, TimestampType) => true
     case (DateType, TimestampType) => true
-    case (_: NumericType, TimestampType) => true
+    case (_: FractionalType, TimestampType) => true

Review comment:
   Do we need to forbid casting timestamp to numeric types at the same time? Otherwise someone may complain about the asymmetry later.



##########
File path: docs/sql-migration-guide.md
##########
@@ -27,6 +27,10 @@ license: |

   - In Spark 3.1, grouping_id() returns long values. In Spark version 3.0 and earlier, this function returns int values. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.integerGroupingId` to `true`.

   - In Spark 3.1, SQL UI data adopts the `formatted` mode for the query plan explain results. To restore the behavior before Spark 3.0, you can set `spark.sql.ui.explainMode` to `extended`.
+
+  - In Spark 3.1, casting numeric to timestamp will be forbidden by default. Users can enable it by setting `spark.sql.legacy.allowCastNumericToTimestamp` to `true`; the functions TIMESTAMP_SECONDS/TIMESTAMP_MILLIS/TIMESTAMP_MICROS are strongly recommended instead, to avoid possibly inaccurate results. See [SPARK-31710](https://issues.apache.org/jira/browse/SPARK-31710) for more details.
+
+  - In Spark 3.1, the to_date function with a date format as the second parameter will be forbidden by default. Users can enable it by setting `spark.sql.legacy.allowCastNumericToTimestamp` to `true`. See [SPARK-31710](https://issues.apache.org/jira/browse/SPARK-31710) for more details.

Review comment:
   Thanks @cloud-fan. As we discussed before, the functions behave well in individual SQL statements, but forbidding the cast by default affects all Spark jobs. Or do you mean Spark has decided not to forbid it by default?



##########
File path: python/pyspark/sql/dataframe.py
##########
@@ -521,7 +521,7 @@ def withWatermark(self, eventTime, delayThreshold):

         .. note:: Evolving

-        >>> sdf.select('name', sdf.time.cast('timestamp')).withWatermark('time', '10 minutes')
+        # >>> sdf.select('name', sdf.time.cast('timestamp')).withWatermark('time', '10 minutes')

Review comment:
   Good suggestion! Let me fix it.



##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
##########
@@ -411,19 +411,48 @@ abstract class NumberToTimestampBase extends UnaryExpression

   protected def upScaleFactor: Long

-  override def inputTypes: Seq[AbstractDataType] = Seq(IntegralType)
+  override def inputTypes: Seq[AbstractDataType] = Seq(NumericType)

   override def dataType: DataType = TimestampType

   override def nullSafeEval(input: Any): Any = {
-    Math.multiplyExact(input.asInstanceOf[Number].longValue(), upScaleFactor)
+    child.dataType match {
+      case ByteType | ShortType | IntegerType | LongType =>
+        Math.multiplyExact(input.asInstanceOf[Number].longValue(), upScaleFactor).longValue()
+      case DecimalType() =>

Review comment:
   @cloud-fan I plan to use a legacy config for the test cases that cast fractional values to timestamp, so we can keep compatibility with the previous tests.
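
For context, here is a minimal standalone sketch of the seconds-to-microseconds conversion the truncated hunk above is heading toward. The object and method names are illustrative, and the decimal branch is an assumption for illustration, not the PR's final code:

    object NumericToMicrosSketch {
      // For SecondsToTimestamp the upscale factor is micros per second.
      val upScaleFactor: Long = 1000000L

      // Integral input: exact multiplication, so overflow throws
      // ArithmeticException instead of silently wrapping.
      def fromIntegral(seconds: Long): Long =
        Math.multiplyExact(seconds, upScaleFactor)

      // Fractional input as a decimal: scale up, then demand an exact long so
      // precision loss surfaces as an error rather than a wrong timestamp.
      def fromDecimal(seconds: java.math.BigDecimal): Long =
        seconds.multiply(java.math.BigDecimal.valueOf(upScaleFactor)).longValueExact()
    }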
##########
File path: sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
##########
@@ -50,7 +50,17 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper {
   }

   protected def checkNullCast(from: DataType, to: DataType): Unit = {
-    checkEvaluation(cast(Literal.create(null, from), to, UTC_OPT), null)
+    withSQLConf(SQLConf.LEGACY_ALLOW_CAST_NUMERIC_TO_TIMESTAMP.key -> "true") {
+      checkEvaluation(cast(Literal.create(null, from), to, UTC_OPT), null)
+    }
+
+    withSQLConf(SQLConf.LEGACY_ALLOW_CAST_NUMERIC_TO_TIMESTAMP.key -> "false") {

Review comment:
   Good catch, I have removed it.



##########
File path: sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
##########
@@ -32,9 +34,17 @@
 import org.apache.spark.sql.test.SharedSparkSession
 import org.apache.spark.sql.types.DoubleType
 import org.apache.spark.unsafe.types.CalendarInterval

-class DateFunctionsSuite extends QueryTest with SharedSparkSession {
+class DateFunctionsSuite extends QueryTest with SharedSparkSession with BeforeAndAfter {
   import testImplicits._

+  before {
+    sqlContext.conf.setConf(SQLConf.LEGACY_ALLOW_CAST_NUMERIC_TO_TIMESTAMP, true)

Review comment:
   Now we forbid casting numeric types to timestamp, but the three functions above are designed for integral types, so we need to expand them to support all numeric types. I suggest we do that in a separate PR, since this PR already has too many changes. What do you think, @cloud-fan?



##########
File path: sql/core/src/main/scala/org/apache/spark/sql/functions.scala
##########
@@ -3358,6 +3358,17 @@ object functions {
     window(timeColumn, windowDuration, windowDuration, "0 second")
   }

+  /**
+   * Creates timestamp from the number of seconds since UTC epoch.",
+   * @group = "datetime_funcs",
+   * @since = "3.1.0")
+   */
+  def timestamp_seconds(e: Column): Column = withExpr { SecondsToTimestamp(e.expr) }
+
+  def array_contains1(column: Column, value: Any): Column = withExpr {

Review comment:
   Sorry, I forgot to delete the test one. Have removed it.
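
As a usage sketch, here is how the new function replaces the now-forbidden cast once this change is in. The session setup and column name are illustrative, and the output shown assumes a UTC session time zone:

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.functions.{col, timestamp_seconds}

    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("timestamp-seconds-demo")
      .config("spark.sql.session.timeZone", "UTC")
      .getOrCreate()
    import spark.implicits._

    val df = Seq(1230219000L).toDF("unix_time")

    // Previously: df.select(col("unix_time").cast("timestamp")); with this PR
    // that cast fails analysis unless
    // spark.sql.legacy.allowCastNumericToTimestamp is set to true.
    df.select(timestamp_seconds(col("unix_time")).as("ts")).show(false)
    // +-------------------+
    // |ts                 |
    // +-------------------+
    // |2008-12-25 15:30:00|
    // +-------------------+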
##########
File path: sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
##########
@@ -376,29 +409,46 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(cast(ts, LongType), 15.toLong)
     checkEvaluation(cast(ts, FloatType), 15.003f)
     checkEvaluation(cast(ts, DoubleType), 15.003)
-    checkEvaluation(cast(cast(tss, ShortType), TimestampType),
-      DateTimeUtils.fromJavaTimestamp(ts) * MILLIS_PER_SECOND)
-    checkEvaluation(cast(cast(tss, IntegerType), TimestampType),
-      DateTimeUtils.fromJavaTimestamp(ts) * MILLIS_PER_SECOND)
-    checkEvaluation(cast(cast(tss, LongType), TimestampType),
-      DateTimeUtils.fromJavaTimestamp(ts) * MILLIS_PER_SECOND)
-    checkEvaluation(
-      cast(cast(millis.toFloat / MILLIS_PER_SECOND, TimestampType), FloatType),
-      millis.toFloat / MILLIS_PER_SECOND)
-    checkEvaluation(
-      cast(cast(millis.toDouble / MILLIS_PER_SECOND, TimestampType), DoubleType),
-      millis.toDouble / MILLIS_PER_SECOND)
-    checkEvaluation(
-      cast(cast(Decimal(1), TimestampType), DecimalType.SYSTEM_DEFAULT),
-      Decimal(1))
-    // A test for higher precision than millis
-    checkEvaluation(cast(cast(0.000001, TimestampType), DoubleType), 0.000001)
+    withSQLConf(SQLConf.LEGACY_ALLOW_CAST_NUMERIC_TO_TIMESTAMP.key -> "true") {
+      checkEvaluation(cast(cast(tss, ShortType), TimestampType),
+        DateTimeUtils.fromJavaTimestamp(ts) * MILLIS_PER_SECOND)
+      checkEvaluation(cast(cast(tss, IntegerType), TimestampType),
+        DateTimeUtils.fromJavaTimestamp(ts) * MILLIS_PER_SECOND)
+      checkEvaluation(cast(cast(tss, LongType), TimestampType),
+        DateTimeUtils.fromJavaTimestamp(ts) * MILLIS_PER_SECOND)
+      checkEvaluation(
+        cast(cast(millis.toFloat / MILLIS_PER_SECOND, TimestampType), FloatType),
+        millis.toFloat / MILLIS_PER_SECOND)
+      checkEvaluation(
+        cast(cast(millis.toDouble / MILLIS_PER_SECOND, TimestampType), DoubleType),
+        millis.toDouble / MILLIS_PER_SECOND)
+      checkEvaluation(
+        cast(cast(Decimal(1), TimestampType), DecimalType.SYSTEM_DEFAULT),
+        Decimal(1))
+
+      // A test for higher precision than millis
+      checkEvaluation(cast(cast(0.000001, TimestampType), DoubleType), 0.000001)
+
+      checkEvaluation(cast(Double.NaN, TimestampType), null)
+      checkEvaluation(cast(1.0 / 0.0, TimestampType), null)
+      checkEvaluation(cast(Float.NaN, TimestampType), null)
+      checkEvaluation(cast(1.0f / 0.0f, TimestampType), null)
+    }

-    checkEvaluation(cast(Double.NaN, TimestampType), null)
-    checkEvaluation(cast(1.0 / 0.0, TimestampType), null)
-    checkEvaluation(cast(Float.NaN, TimestampType), null)
-    checkEvaluation(cast(1.0f / 0.0f, TimestampType), null)
+    withSQLConf(SQLConf.LEGACY_ALLOW_CAST_NUMERIC_TO_TIMESTAMP.key -> "false") {
+      assert(!cast(cast(tss, ShortType), TimestampType).resolved)

Review comment:
   Nice, have removed the duplicated ones.



##########
File path: sql/core/src/main/scala/org/apache/spark/sql/functions.scala
##########
@@ -3358,6 +3358,15 @@ object functions {
     window(timeColumn, windowDuration, windowDuration, "0 second")
   }

+  /**
+   * Creates timestamp from the number of seconds since UTC epoch.",
+   * @group = "datetime_funcs",
+   * @since = "3.1.0")
+   */
+  def timestamp_seconds(e: Column): Column = withExpr {

Review comment:
   Oops, let me fix it.



##########
File path: sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
##########
@@ -1311,6 +1311,27 @@ class CastSuite extends CastSuiteBase {
       checkEvaluation(cast(negativeTs, LongType), expectedSecs)
     }
   }
+
+  test("SPARK-31710: fail casting from numeric to timestamp by default") {
+    withSQLConf(SQLConf.LEGACY_ALLOW_CAST_NUMERIC_TO_TIMESTAMP.key -> "false") {
+      assert(!cast(2.toByte, TimestampType).resolved)
+      assert(!cast(10.toShort, TimestampType).resolved)
+      assert(!cast(3, TimestampType).resolved)
+      assert(!cast(10L, TimestampType).resolved)
+      assert(!cast(Decimal(1.2), TimestampType).resolved)
+      assert(!cast(1.7f, TimestampType).resolved)
+      assert(!cast(2.3d, TimestampType).resolved)
+    }
+    withSQLConf(SQLConf.LEGACY_ALLOW_CAST_NUMERIC_TO_TIMESTAMP.key -> "true") {
+      assert(cast(2.toByte, TimestampType).resolved)
+      assert(cast(10.toShort, TimestampType).resolved)
+      assert(cast(3, TimestampType).resolved)
+      assert(cast(10L, TimestampType).resolved)
+      assert(cast(Decimal(1.2), TimestampType).resolved)
+      assert(cast(1.7f, TimestampType).resolved)
+      assert(cast(2.3d, TimestampType).resolved)
+    }
+  }

Review comment:
   Nice! @kiszk



##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
##########
@@ -59,7 +59,7 @@ object Cast {
     case (StringType, TimestampType) => true
     case (BooleanType, TimestampType) => true
     case (DateType, TimestampType) => true
-    case (_: NumericType, TimestampType) => true
+    case (_: FractionalType, TimestampType) => true

Review comment:
   OK, will correct it.



##########
File path: sql/core/src/main/scala/org/apache/spark/sql/functions.scala
##########
@@ -3358,6 +3358,17 @@ object functions {
     window(timeColumn, windowDuration, windowDuration, "0 second")
   }

+  /**

Review comment:
   Yes, have fixed it.
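
For reference, the corrected declaration presumably looks as follows once the stray quotes, equals signs, and closing parenthesis are removed (an illustrative reconstruction following standard Scaladoc conventions, not a quote from the merged commit):

      /**
       * Creates timestamp from the number of seconds since UTC epoch.
       *
       * @group datetime_funcs
       * @since 3.1.0
       */
      def timestamp_seconds(e: Column): Column = withExpr { SecondsToTimestamp(e.expr) }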
##########
File path: python/pyspark/sql/functions.py
##########
@@ -1211,8 +1211,8 @@ def to_date(col, format=None):

     >>> df.select(to_date(df.t).alias('date')).collect()
     [Row(date=datetime.date(1997, 2, 28))]

-    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
-    >>> df.select(to_date(df.t, 'yyyy-MM-dd HH:mm:ss').alias('date')).collect()
+    # >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
+    # >>> df.select(to_date(df.t, 'yyyy-MM-dd HH:mm:ss').alias('date')).collect()

Review comment:
   Let me fix it.



##########
File path: python/pyspark/sql/functions.py
##########
@@ -1427,6 +1427,19 @@ def to_utc_timestamp(timestamp, tz):
     return Column(sc._jvm.functions.to_utc_timestamp(_to_java_column(timestamp), tz))


+@since(3.1)
+def timestamp_seconds(col):

Review comment:
   @HyukjinKwon I just counted the usages; there are 47 places where the function is called.



##########
File path: sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
##########
@@ -58,13 +60,16 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd
     TestHive.setCacheTables(true)
     // Ensures that cross joins are enabled so that we can test them
     TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, true)
+    TestHive.setConf(SQLConf.LEGACY_ALLOW_CAST_NUMERIC_TO_TIMESTAMP, true)

Review comment:
   11 test cases failed; 6 of them can be fixed by using the new functions, while the rest (createQueryTest) cast fractional values to timestamp, including one SQL statement in decimal_1.q. Maybe an overall legacy config is a good choice for them; meanwhile I am trying to find a more suitable way to fix them. @cloud-fan Do you have any good ideas?



##########
File path: sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
##########
@@ -58,13 +60,16 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd
     TestHive.setCacheTables(true)
     // Ensures that cross joins are enabled so that we can test them
     TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, true)
+    TestHive.setConf(SQLConf.LEGACY_ALLOW_CAST_NUMERIC_TO_TIMESTAMP, true)

Review comment:
   I will add the legacy config before the affected test cases instead of applying it to the whole test suite.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org