martin-g commented on code in PR #19078:
URL: https://github.com/apache/datafusion/pull/19078#discussion_r2588707683


##########
datafusion/functions/src/datetime/common.rs:
##########
@@ -15,33 +15,52 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::marker::PhantomData;
 use std::sync::Arc;
 
+use arrow::array::timezone::Tz;
 use arrow::array::{
     Array, ArrowPrimitiveType, AsArray, GenericStringArray, PrimitiveArray,
     StringArrayType, StringViewArray,
 };
-use arrow::compute::kernels::cast_utils::string_to_timestamp_nanos;
-use arrow::datatypes::DataType;
+use arrow::compute::kernels::cast_utils::{
+    string_to_datetime, string_to_timestamp_nanos,
+};
+use arrow::datatypes::{DataType, TimeUnit};
+use arrow_buffer::ArrowNativeType;
 use chrono::format::{parse, Parsed, StrftimeItems};
 use chrono::LocalResult::Single;
 use chrono::{DateTime, TimeZone, Utc};
-
 use datafusion_common::cast::as_generic_string_array;
 use datafusion_common::{
-    exec_datafusion_err, exec_err, unwrap_or_internal_err, DataFusionError, 
Result,
-    ScalarType, ScalarValue,
+    exec_datafusion_err, exec_err, internal_datafusion_err, 
unwrap_or_internal_err,
+    DataFusionError, Result, ScalarValue,
 };
 use datafusion_expr::ColumnarValue;
+use num_traits::{PrimInt, ToPrimitive};
 
 /// Error message if nanosecond conversion request beyond supported interval
 const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented 
as nanoseconds have to be between 1677-09-21T00:12:44.0 and 
2262-04-11T23:47:16.854775804";
 
+#[expect(unused)]
 /// Calls string_to_timestamp_nanos and converts the error type
 pub(crate) fn string_to_timestamp_nanos_shim(s: &str) -> Result<i64> {
     string_to_timestamp_nanos(s).map_err(|e| e.into())
 }
 
+pub(crate) fn string_to_timestamp_nanos_with_timezone(
+    timezone: &Option<Tz>,
+    s: &str,
+) -> Result<i64> {
+    let tz = timezone.unwrap_or("UTC".parse()?);

Review Comment:
   ```suggestion
       let tz = timezone.unwrap_or(chrono_tz::UTC);
   ```



##########
datafusion/functions/src/datetime/common.rs:
##########
@@ -89,11 +107,52 @@ pub(crate) fn string_to_datetime_formatted<T: TimeZone>(
         )
     };
 
+    let mut datetime_str = s;
+    let mut format = format;
+
+    // we manually handle the most common case of a named timezone at the end 
of the timestamp
+    // not that %+ handles 'Z' at the end of the string without a space. This 
code doesn't

Review Comment:
   ```suggestion
       // note that %+ handles 'Z' at the end of the string without a space. 
This code doesn't
   ```



##########
datafusion/functions/src/datetime/common.rs:
##########
@@ -15,33 +15,52 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::marker::PhantomData;
 use std::sync::Arc;
 
+use arrow::array::timezone::Tz;
 use arrow::array::{
     Array, ArrowPrimitiveType, AsArray, GenericStringArray, PrimitiveArray,
     StringArrayType, StringViewArray,
 };
-use arrow::compute::kernels::cast_utils::string_to_timestamp_nanos;
-use arrow::datatypes::DataType;
+use arrow::compute::kernels::cast_utils::{
+    string_to_datetime, string_to_timestamp_nanos,
+};
+use arrow::datatypes::{DataType, TimeUnit};
+use arrow_buffer::ArrowNativeType;
 use chrono::format::{parse, Parsed, StrftimeItems};
 use chrono::LocalResult::Single;
 use chrono::{DateTime, TimeZone, Utc};
-
 use datafusion_common::cast::as_generic_string_array;
 use datafusion_common::{
-    exec_datafusion_err, exec_err, unwrap_or_internal_err, DataFusionError, 
Result,
-    ScalarType, ScalarValue,
+    exec_datafusion_err, exec_err, internal_datafusion_err, 
unwrap_or_internal_err,
+    DataFusionError, Result, ScalarValue,
 };
 use datafusion_expr::ColumnarValue;
+use num_traits::{PrimInt, ToPrimitive};
 
 /// Error message if nanosecond conversion request beyond supported interval
 const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented 
as nanoseconds have to be between 1677-09-21T00:12:44.0 and 
2262-04-11T23:47:16.854775804";
 
+#[expect(unused)]

Review Comment:
   Since it is crate-private why not remove it ?



##########
datafusion/functions/src/datetime/common.rs:
##########
@@ -152,6 +212,20 @@ pub(crate) fn string_to_timestamp_nanos_formatted(
         .ok_or_else(|| exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}"))
 }
 
+pub(crate) fn string_to_timestamp_nanos_formatted_with_timezone(
+    timezone: &Option<Tz>,
+    s: &str,
+    format: &str,
+) -> Result<i64, DataFusionError> {
+    let dt =
+        string_to_datetime_formatted(&timezone.unwrap_or("UTC".parse()?), s, 
format)?;

Review Comment:
   ```suggestion
           string_to_datetime_formatted(&timezone.unwrap_or(chrono_tz::UTC)), 
s, format)?;
   ```



##########
datafusion/functions/src/datetime/to_timestamp.rs:
##########
@@ -923,7 +1536,11 @@ mod tests {
     }
 
     fn parse_timestamp_formatted(s: &str, format: &str) -> Result<i64, 
DataFusionError> {
-        let result = string_to_timestamp_nanos_formatted(s, format);
+        let result = string_to_timestamp_nanos_formatted_with_timezone(
+            &Some("UTC".parse()?),

Review Comment:
   ```suggestion
               &Some(chrono_tz::UTC)),
   ```



##########
datafusion/functions/src/datetime/common.rs:
##########
@@ -89,11 +107,52 @@ pub(crate) fn string_to_datetime_formatted<T: TimeZone>(
         )
     };
 
+    let mut datetime_str = s;
+    let mut format = format;
+
+    // we manually handle the most common case of a named timezone at the end 
of the timestamp
+    // not that %+ handles 'Z' at the end of the string without a space. This 
code doesn't
+    // handle named timezones with no preceding space since that would require 
writing a
+    // custom parser (or switching to Jiff)
+    let tz: Option<chrono_tz::Tz> = if format.ends_with(" %Z") {
+        // grab the string after the last space as the named timezone
+        let parts: Vec<&str> = datetime_str.rsplitn(2, ' ').collect();
+        let timezone_name = parts[0];
+        datetime_str = parts[1];
+
+        // attempt to parse the timezone name
+        let result: Result<chrono_tz::Tz, chrono_tz::ParseError> = 
timezone_name.parse();
+        let Ok(tz) = result else {
+            return Err(err(&result.unwrap_err().to_string()));
+        };
+
+        // successfully parsed the timezone name, remove the ' %Z' from the 
format
+        format = format.trim_end_matches(" %Z");

Review Comment:
   since it is guarded by `if format.ends_with(" %Z")`:
   
   ```suggestion
           format = &format[..format.len() - 3];
   ```



##########
docs/source/user-guide/sql/scalar_functions.md:
##########
@@ -2873,7 +2909,13 @@ Additional examples can be found 
[here](https://github.com/apache/datafusion/blo
 
 ### `to_timestamp_nanos`
 
-Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000000Z`). Supports 
strings, integer, and unsigned integer types as input. Strings are parsed as 
RFC3339 (e.g. '2023-07-20T05:44:00') if no [Chrono 
format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are 
provided. Integers and unsigned integers are interpreted as nanoseconds since 
the unix epoch (`1970-01-01T00:00:00Z`). Returns the corresponding timestamp.
+Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000000Z<TZ>`) in the 
session time zone. Supports strings,

Review Comment:
   ```suggestion
   Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000000<TZ>`) in the 
session time zone. Supports strings,
   ```



##########
datafusion/functions/src/datetime/common.rs:
##########
@@ -141,6 +200,7 @@ pub(crate) fn string_to_datetime_formatted<T: TimeZone>(
 ///
 /// [`chrono::format::strftime`]: 
https://docs.rs/chrono/latest/chrono/format/strftime/index.html
 #[inline]
+#[expect(unused)]

Review Comment:
   Since it is crate-private why not remove it ?



##########
datafusion/functions/src/datetime/to_timestamp.rs:
##########
@@ -165,17 +222,31 @@ Additional examples can be found 
[here](https://github.com/apache/datafusion/blo
     ),
     argument(
         name = "format_n",
-        description = "Optional [Chrono 
format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) 
strings to use to parse the expression. Formats will be tried in the order they 
appear with the first successful one being returned. If none of the formats 
successfully parse the expression an error will be returned."
+        description = r#"
+Optional [Chrono 
format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) 
strings to use to parse the expression.
+Formats will be tried in the order they appear with the first successful one 
being returned. If none of the formats successfully
+parse the expression an error will be returned. Note: parsing of named 
timezones (e.g. 'America/New_York') using %Z is
+only supported at the end of the string preceded by a space.
+"#
     )
 )]
 #[derive(Debug, PartialEq, Eq, Hash)]
 pub struct ToTimestampMicrosFunc {
     signature: Signature,
+    timezone: Option<Arc<str>>,
 }
 
 #[user_doc(
     doc_section(label = "Time and Date Functions"),
-    description = "Converts a value to a timestamp 
(`YYYY-MM-DDT00:00:00.000000000Z`). Supports strings, integer, and unsigned 
integer types as input. Strings are parsed as RFC3339 (e.g. 
'2023-07-20T05:44:00') if no [Chrono 
format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are 
provided. Integers and unsigned integers are interpreted as nanoseconds since 
the unix epoch (`1970-01-01T00:00:00Z`). Returns the corresponding timestamp.",
+    description = r#"
+Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000000Z<TZ>`) in the 
session time zone. Supports strings,

Review Comment:
   ```suggestion
   Converts a value to a timestamp (`YYYY-MM-DDT00:00:00.000000000<TZ>`) in the 
session time zone. Supports strings,
   ```



##########
datafusion/sqllogictest/test_files/to_timestamp_timezone.slt:
##########
@@ -0,0 +1,204 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+##########
+## to_timestamp timezone tests
+##########
+
+## Reset timezone for other tests
+statement ok
+RESET datafusion.execution.time_zone
+
+## Test 1: Default timezone (None) with naive timestamp
+## Naive timestamps (without explicit timezone) should be interpreted as UTC 
by default
+query P
+SELECT to_timestamp('2020-09-08T13:42:29');
+----
+2020-09-08T13:42:29
+
+## Test 2: Explicit UTC timezone ('Z' suffix)
+## Explicit timezone should be respected regardless of session timezone
+query P
+SELECT to_timestamp('2020-09-08T13:42:29Z');
+----
+2020-09-08T13:42:29
+
+## Test 3: Explicit timezone offset (+05:00)
+## Explicit offset should be respected - this is 13:42:29+05:00 which is 
08:42:29 UTC
+query P
+SELECT to_timestamp('2020-09-08T13:42:29+05:00');
+----
+2020-09-08T08:42:29
+
+## Test 4: Explicit timezone offset without colon (+0500)
+## Should handle offset formats without colons
+query P
+SELECT to_timestamp('2020-09-08T13:42:29+0500');
+----
+2020-09-08T08:42:29
+
+## Test 5: Negative timezone offset
+query P
+SELECT to_timestamp('2020-09-08T13:42:29-03:30');
+----
+2020-09-08T17:12:29
+
+## Test 6: Configure session timezone to America/New_York
+statement ok
+SET datafusion.execution.time_zone = 'America/New_York';
+
+## Test 7: Naive timestamp with configured timezone
+## '2020-09-08T13:42:29' in America/New_York is EDT (UTC-4)
+## So this should become '2020-09-08T13:42:29Z-04:00'

Review Comment:
   ```suggestion
   ## So this should become '2020-09-08T13:42:29-04:00'
   ```



##########
datafusion/functions/Cargo.toml:
##########
@@ -71,6 +71,7 @@ base64 = { version = "0.22", optional = true }
 blake2 = { version = "^0.10.2", optional = true }
 blake3 = { version = "1.8", optional = true }
 chrono = { workspace = true }
+chrono-tz = "0.10.4"

Review Comment:
   Make it optional and gate it behind the `datetime_expressions` feature ?



##########
datafusion/functions/src/datetime/to_unixtime.rs:
##########
@@ -114,9 +114,11 @@ impl ScalarUDFImpl for ToUnixtimeFunc {
             DataType::Timestamp(_, tz) => arg_args[0]
                 .cast_to(&DataType::Timestamp(TimeUnit::Second, tz), None)?
                 .cast_to(&DataType::Int64, None),
-            DataType::Utf8 => ToTimestampSecondsFunc::new()
-                .invoke_with_args(args)?
-                .cast_to(&DataType::Int64, None),
+            DataType::Utf8 => {

Review Comment:
   Should this also support LargeUtf8 and Utf8View ?



##########
datafusion/functions/src/datetime/common.rs:
##########
@@ -89,11 +107,52 @@ pub(crate) fn string_to_datetime_formatted<T: TimeZone>(
         )
     };
 
+    let mut datetime_str = s;
+    let mut format = format;
+
+    // we manually handle the most common case of a named timezone at the end 
of the timestamp
+    // not that %+ handles 'Z' at the end of the string without a space. This 
code doesn't
+    // handle named timezones with no preceding space since that would require 
writing a
+    // custom parser (or switching to Jiff)
+    let tz: Option<chrono_tz::Tz> = if format.ends_with(" %Z") {
+        // grab the string after the last space as the named timezone
+        let parts: Vec<&str> = datetime_str.rsplitn(2, ' ').collect();

Review Comment:
   You can use pattern matching to avoid the Vec allocation:
   ```suggestion
           if let Some((datetime_str, timezone_name)) = 
datetime_str.rsplit_once(' ') {
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to