vranes commented on code in PR #55535:
URL: https://github.com/apache/spark/pull/55535#discussion_r3161865452
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala:
##########
@@ -3897,3 +3899,178 @@ case class TimestampDiff(
copy(startTimestamp = newLeft, endTimestamp = newRight)
}
}
+
+/**
+ * Aligns a timestamp to the start of a fixed-size interval bucket.
+ *
+ * Returns the start of the half-open bucket [start, start + bucketSize) containing ts.
+ * All computation is performed on UTC values.
+ */
+case class TimeBucket(
+ bucketSize: Expression,
+ ts: Expression,
+ originTs: Expression)
+ extends TernaryExpression with ExpectsInputTypes {
+
+ override def nullIntolerant: Boolean = true
+
+ override def first: Expression = bucketSize
+ override def second: Expression = ts
+ override def third: Expression = originTs
+
+ override def inputTypes: Seq[AbstractDataType] = Seq(
+ TypeCollection(DayTimeIntervalType, YearMonthIntervalType),
+ AnyTimestampType,
+ AnyTimestampType)
+
+ override def dataType: DataType = ts.dataType
+
+ override def checkInputDataTypes(): TypeCheckResult = {
+ val defaultCheck = super.checkInputDataTypes()
+ if (defaultCheck.isFailure) return defaultCheck
+
+ if (!bucketSize.foldable) {
+ return DataTypeMismatch(
+ errorSubClass = "NON_FOLDABLE_INPUT",
+ messageParameters = Map(
+ "inputName" -> toSQLId("bucketSize"),
+ "inputType" -> toSQLType(bucketSize.dataType),
+ "inputExpr" -> toSQLExpr(bucketSize)))
+ }
+
+ val bucketSizeValue = bucketSize.eval()
+ if (bucketSizeValue != null) {
+ val isNonPositive = bucketSize.dataType match {
+ case _: DayTimeIntervalType => bucketSizeValue.asInstanceOf[Long] <= 0
+ case _: YearMonthIntervalType => bucketSizeValue.asInstanceOf[Int] <= 0
+ case other => throw SparkException.internalError(
+ s"Unexpected bucketSize type: $other")
+ }
+ if (isNonPositive) {
+ return DataTypeMismatch(
+ errorSubClass = "VALUE_OUT_OF_RANGE",
+ messageParameters = Map(
+ "exprName" -> "time_bucket",
+ "valueRange" -> "(0, inf)",
Review Comment:
`exprName` fixed, thanks!
Regarding `valueRange`, I'm unsure: day-time (DT) intervals use a Long and
year-month (YM) intervals use an Int as their internal representation, but the
SQL function doesn't accept those raw values as interval input. Rendering
(0, 9223372036854775807] for DT and (0, 2147483647] for YM is technically
accurate, but it doesn't describe what a user can actually pass.
I've found 4 cases of the (0, positive) range in the repo. I could use that
form, or stick with your proposal. What do you think?
<img width="540" height="154" alt="image"
src="https://github.com/user-attachments/assets/0981a053-ac6c-4e5a-b583-1837c4aeb26f"
/>
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]