martin-g commented on code in PR #19078:
URL: https://github.com/apache/datafusion/pull/19078#discussion_r2588707683
##########
datafusion/functions/src/datetime/common.rs:
##########
@@ -15,33 +15,52 @@
// specific language governing permissions and limitations
// under the License.
+use std::marker::PhantomData;
use std::sync::Arc;
+use arrow::array::timezone::Tz;
use arrow::array::{
Array, ArrowPrimitiveType, AsArray, GenericStringArray, PrimitiveArray,
StringArrayType, StringViewArray,
};
-use arrow::compute::kernels::cast_utils::string_to_timestamp_nanos;
-use arrow::datatypes::DataType;
+use arrow::compute::kernels::cast_utils::{
+ string_to_datetime, string_to_timestamp_nanos,
+};
+use arrow::datatypes::{DataType, TimeUnit};
+use arrow_buffer::ArrowNativeType;
use chrono::format::{parse, Parsed, StrftimeItems};
use chrono::LocalResult::Single;
use chrono::{DateTime, TimeZone, Utc};
-
use datafusion_common::cast::as_generic_string_array;
use datafusion_common::{
- exec_datafusion_err, exec_err, unwrap_or_internal_err, DataFusionError,
Result,
- ScalarType, ScalarValue,
+ exec_datafusion_err, exec_err, internal_datafusion_err,
unwrap_or_internal_err,
+ DataFusionError, Result, ScalarValue,
};
use datafusion_expr::ColumnarValue;
+use num_traits::{PrimInt, ToPrimitive};
/// Error message if nanosecond conversion request beyond supported interval
const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented
as nanoseconds have to be between 1677-09-21T00:12:44.0 and
2262-04-11T23:47:16.854775804";
+#[expect(unused)]
/// Calls string_to_timestamp_nanos and converts the error type
pub(crate) fn string_to_timestamp_nanos_shim(s: &str) -> Result<i64> {
string_to_timestamp_nanos(s).map_err(|e| e.into())
}
+pub(crate) fn string_to_timestamp_nanos_with_timezone(
+ timezone: &Option<Tz>,
+ s: &str,
+) -> Result<i64> {
+ let tz = timezone.unwrap_or("UTC".parse()?);
Review Comment:
```suggestion
let tz = timezone.unwrap_or(chrono_tz::UTC);
```
##########
datafusion/functions/src/datetime/common.rs:
##########
@@ -89,11 +107,52 @@ pub(crate) fn string_to_datetime_formatted<T: TimeZone>(
)
};
+ let mut datetime_str = s;
+ let mut format = format;
+
+ // we manually handle the most common case of a named timezone at the end
of the timestamp
+ // not that %+ handles 'Z' at the end of the string without a space. This
code doesn't
Review Comment:
```suggestion
// note that %+ handles 'Z' at the end of the string without a space.
This code doesn't
```
##########
datafusion/functions/src/datetime/common.rs:
##########
@@ -15,33 +15,52 @@
// specific language governing permissions and limitations
// under the License.
+use std::marker::PhantomData;
use std::sync::Arc;
+use arrow::array::timezone::Tz;
use arrow::array::{
Array, ArrowPrimitiveType, AsArray, GenericStringArray, PrimitiveArray,
StringArrayType, StringViewArray,
};
-use arrow::compute::kernels::cast_utils::string_to_timestamp_nanos;
-use arrow::datatypes::DataType;
+use arrow::compute::kernels::cast_utils::{
+ string_to_datetime, string_to_timestamp_nanos,
+};
+use arrow::datatypes::{DataType, TimeUnit};
+use arrow_buffer::ArrowNativeType;
use chrono::format::{parse, Parsed, StrftimeItems};
use chrono::LocalResult::Single;
use chrono::{DateTime, TimeZone, Utc};
-
use datafusion_common::cast::as_generic_string_array;
use datafusion_common::{
- exec_datafusion_err, exec_err, unwrap_or_internal_err, DataFusionError,
Result,
- ScalarType, ScalarValue,
+ exec_datafusion_err, exec_err, internal_datafusion_err,
unwrap_or_internal_err,
+ DataFusionError, Result, ScalarValue,
};
use datafusion_expr::ColumnarValue;
+use num_traits::{PrimInt, ToPrimitive};
/// Error message if nanosecond conversion request beyond supported interval
const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented
as nanoseconds have to be between 1677-09-21T00:12:44.0 and
2262-04-11T23:47:16.854775804";
+#[expect(unused)]
Review Comment:
Since it is crate-private why not remove it ?
##########
datafusion/functions/src/datetime/common.rs:
##########
@@ -152,6 +212,20 @@ pub(crate) fn string_to_timestamp_nanos_formatted(
.ok_or_else(|| exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}"))
}
+pub(crate) fn string_to_timestamp_nanos_formatted_with_timezone(
+ timezone: &Option<Tz>,
+ s: &str,
+ format: &str,
+) -> Result<i64, DataFusionError> {
+ let dt =
+ string_to_datetime_formatted(&timezone.unwrap_or("UTC".parse()?), s,
format)?;
Review Comment:
```suggestion
string_to_datetime_formatted(&timezone.unwrap_or(chrono_tz::UTC)),
s, format)?;
```
##########
datafusion/functions/src/datetime/to_timestamp.rs:
##########
@@ -923,7 +1536,11 @@ mod tests {
}
fn parse_timestamp_formatted(s: &str, format: &str) -> Result<i64,
DataFusionError> {
- let result = string_to_timestamp_nanos_formatted(s, format);
+ let result = string_to_timestamp_nanos_formatted_with_timezone(
+ &Some("UTC".parse()?),
Review Comment:
```suggestion
&Some(chrono_tz::UTC)),
```
##########
datafusion/functions/src/datetime/common.rs:
##########
@@ -89,11 +107,52 @@ pub(crate) fn string_to_datetime_formatted<T: TimeZone>(
)
};
+ let mut datetime_str = s;
+ let mut format = format;
+
+ // we manually handle the most common case of a named timezone at the end
of the timestamp
+ // not that %+ handles 'Z' at the end of the string without a space. This
code doesn't
+ // handle named timezones with no preceding space since that would require
writing a
+ // custom parser (or switching to Jiff)
+ let tz: Option<chrono_tz::Tz> = if format.ends_with(" %Z") {
+ // grab the string after the last space as the named timezone
+ let parts: Vec<&str> = datetime_str.rsplitn(2, ' ').collect();
+ let timezone_name = parts[0];
+ datetime_str = parts[1];
+
+ // attempt to parse the timezone name
+ let result: Result<chrono_tz::Tz, chrono_tz::ParseError> =
timezone_name.parse();
+ let Ok(tz) = result else {
+ return Err(err(&result.unwrap_err().to_string()));
+ };
+
+ // successfully parsed the timezone name, remove the ' %Z' from the
format
+ format = format.trim_end_matches(" %Z");
Review Comment:
since it is guarded by `if format.ends_with(" %Z")`:
```suggestion
format = &format[..format.len() - 3];
```
##########
docs/source/user-guide/sql/scalar_functions.md:
##########
@@ -2873,7 +2909,13 @@ Additional examples can be found
[here](https://github.com/apache/datafusion/blo
### `to_timestamp_nanos`
-Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000000Z`). Supports
strings, integer, and unsigned integer types as input. Strings are parsed as
RFC3339 (e.g. '2023-07-20T05:44:00') if no [Chrono
format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are
provided. Integers and unsigned integers are interpreted as nanoseconds since
the unix epoch (`1970-01-01T00:00:00Z`). Returns the corresponding timestamp.
+Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000000Z<TZ>`) in the
session time zone. Supports strings,
Review Comment:
```suggestion
Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000000<TZ>`) in the
session time zone. Supports strings,
```
##########
datafusion/functions/src/datetime/common.rs:
##########
@@ -141,6 +200,7 @@ pub(crate) fn string_to_datetime_formatted<T: TimeZone>(
///
/// [`chrono::format::strftime`]:
https://docs.rs/chrono/latest/chrono/format/strftime/index.html
#[inline]
+#[expect(unused)]
Review Comment:
Since it is crate-private why not remove it ?
##########
datafusion/functions/src/datetime/to_timestamp.rs:
##########
@@ -165,17 +222,31 @@ Additional examples can be found
[here](https://github.com/apache/datafusion/blo
),
argument(
name = "format_n",
- description = "Optional [Chrono
format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
strings to use to parse the expression. Formats will be tried in the order they
appear with the first successful one being returned. If none of the formats
successfully parse the expression an error will be returned."
+ description = r#"
+Optional [Chrono
format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
strings to use to parse the expression.
+Formats will be tried in the order they appear with the first successful one
being returned. If none of the formats successfully
+parse the expression an error will be returned. Note: parsing of named
timezones (e.g. 'America/New_York') using %Z is
+only supported at the end of the string preceded by a space.
+"#
)
)]
#[derive(Debug, PartialEq, Eq, Hash)]
pub struct ToTimestampMicrosFunc {
signature: Signature,
+ timezone: Option<Arc<str>>,
}
#[user_doc(
doc_section(label = "Time and Date Functions"),
- description = "Converts a value to a timestamp
(`YYYY-MM-DDT00:00:00.000000000Z`). Supports strings, integer, and unsigned
integer types as input. Strings are parsed as RFC3339 (e.g.
'2023-07-20T05:44:00') if no [Chrono
format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are
provided. Integers and unsigned integers are interpreted as nanoseconds since
the unix epoch (`1970-01-01T00:00:00Z`). Returns the corresponding timestamp.",
+ description = r#"
+Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000000Z<TZ>`) in the
session time zone. Supports strings,
Review Comment:
```suggestion
Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000000<TZ>`) in the
session time zone. Supports strings,
```
##########
datafusion/sqllogictest/test_files/to_timestamp_timezone.slt:
##########
@@ -0,0 +1,204 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+##########
+## to_timestamp timezone tests
+##########
+
+## Reset timezone for other tests
+statement ok
+RESET datafusion.execution.time_zone
+
+## Test 1: Default timezone (None) with naive timestamp
+## Naive timestamps (without explicit timezone) should be interpreted as UTC
by default
+query P
+SELECT to_timestamp('2020-09-08T13:42:29');
+----
+2020-09-08T13:42:29
+
+## Test 2: Explicit UTC timezone ('Z' suffix)
+## Explicit timezone should be respected regardless of session timezone
+query P
+SELECT to_timestamp('2020-09-08T13:42:29Z');
+----
+2020-09-08T13:42:29
+
+## Test 3: Explicit timezone offset (+05:00)
+## Explicit offset should be respected - this is 13:42:29+05:00 which is
08:42:29 UTC
+query P
+SELECT to_timestamp('2020-09-08T13:42:29+05:00');
+----
+2020-09-08T08:42:29
+
+## Test 4: Explicit timezone offset without colon (+0500)
+## Should handle offset formats without colons
+query P
+SELECT to_timestamp('2020-09-08T13:42:29+0500');
+----
+2020-09-08T08:42:29
+
+## Test 5: Negative timezone offset
+query P
+SELECT to_timestamp('2020-09-08T13:42:29-03:30');
+----
+2020-09-08T17:12:29
+
+## Test 6: Configure session timezone to America/New_York
+statement ok
+SET datafusion.execution.time_zone = 'America/New_York';
+
+## Test 7: Naive timestamp with configured timezone
+## '2020-09-08T13:42:29' in America/New_York is EDT (UTC-4)
+## So this should become '2020-09-08T13:42:29Z-04:00'
Review Comment:
```suggestion
## So this should become '2020-09-08T13:42:29-04:00'
```
##########
datafusion/functions/Cargo.toml:
##########
@@ -71,6 +71,7 @@ base64 = { version = "0.22", optional = true }
blake2 = { version = "^0.10.2", optional = true }
blake3 = { version = "1.8", optional = true }
chrono = { workspace = true }
+chrono-tz = "0.10.4"
Review Comment:
Make it optional and gate it behind the `datetime_expressions` feature ?
##########
datafusion/functions/src/datetime/to_unixtime.rs:
##########
@@ -114,9 +114,11 @@ impl ScalarUDFImpl for ToUnixtimeFunc {
DataType::Timestamp(_, tz) => arg_args[0]
.cast_to(&DataType::Timestamp(TimeUnit::Second, tz), None)?
.cast_to(&DataType::Int64, None),
- DataType::Utf8 => ToTimestampSecondsFunc::new()
- .invoke_with_args(args)?
- .cast_to(&DataType::Int64, None),
+ DataType::Utf8 => {
Review Comment:
Should this also support LargeUtf8 and Utf8View ?
##########
datafusion/functions/src/datetime/common.rs:
##########
@@ -89,11 +107,52 @@ pub(crate) fn string_to_datetime_formatted<T: TimeZone>(
)
};
+ let mut datetime_str = s;
+ let mut format = format;
+
+ // we manually handle the most common case of a named timezone at the end
of the timestamp
+ // not that %+ handles 'Z' at the end of the string without a space. This
code doesn't
+ // handle named timezones with no preceding space since that would require
writing a
+ // custom parser (or switching to Jiff)
+ let tz: Option<chrono_tz::Tz> = if format.ends_with(" %Z") {
+ // grab the string after the last space as the named timezone
+ let parts: Vec<&str> = datetime_str.rsplitn(2, ' ').collect();
Review Comment:
You can use pattern matching to avoid the Vec allocation:
```suggestion
if let Some((datetime_str, timezone_name)) =
datetime_str.rsplit_once(' ') {
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]