This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new cdb042ed0 Faster timestamp parsing (~70-90% faster) (#3801)
cdb042ed0 is described below

commit cdb042ed02a6c9afd0cf3f6f86072b09c599f59a
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Thu Mar 9 16:42:54 2023 +0100

    Faster timestamp parsing (~70-90% faster) (#3801)
    
    * Faster timestamp parsing
    
    * Faster timezone parsing
    
    * More tests
    
    * Review feedback
    
    * Review feedback
    
    * Fix test
    
    * Format
---
 arrow-array/src/timezone.rs           |  59 +++----
 arrow-cast/Cargo.toml                 |   5 +
 arrow-cast/benches/parse_timestamp.rs |  44 +++++
 arrow-cast/src/cast.rs                |  10 +-
 arrow-cast/src/parse.rs               | 299 ++++++++++++++++++++++------------
 arrow/Cargo.toml                      |   4 +
 arrow/tests/timezone.rs               |  81 +++++++++
 7 files changed, 371 insertions(+), 131 deletions(-)

diff --git a/arrow-array/src/timezone.rs b/arrow-array/src/timezone.rs
index 3af76c3da..f56189c46 100644
--- a/arrow-array/src/timezone.rs
+++ b/arrow-array/src/timezone.rs
@@ -18,29 +18,34 @@
 //! Timezone for timestamp arrays
 
 use arrow_schema::ArrowError;
-use chrono::format::{parse, Parsed, StrftimeItems};
 use chrono::FixedOffset;
 pub use private::{Tz, TzOffset};
 
-/// Parses a fixed offset of the form "+09:00"
-fn parse_fixed_offset(tz: &str) -> Result<FixedOffset, ArrowError> {
-    let mut parsed = Parsed::new();
-
-    if let Ok(fixed_offset) = parse(&mut parsed, tz, StrftimeItems::new("%:z"))
-        .and_then(|_| parsed.to_fixed_offset())
-    {
-        return Ok(fixed_offset);
+/// Parses a fixed offset of the form "+09:00", "-09" or "+0930"
+fn parse_fixed_offset(tz: &str) -> Option<FixedOffset> {
+    let bytes = tz.as_bytes();
+
+    let mut values = match bytes.len() {
+        // [+-]XX:XX
+        6 if bytes[3] == b':' => [bytes[1], bytes[2], bytes[4], bytes[5]],
+        // [+-]XXXX
+        5 => [bytes[1], bytes[2], bytes[3], bytes[4]],
+        // [+-]XX
+        3 => [bytes[1], bytes[2], b'0', b'0'],
+        _ => return None,
+    };
+    values.iter_mut().for_each(|x| *x = x.wrapping_sub(b'0'));
+    if values.iter().any(|x| *x > 9) {
+        return None;
     }
+    let secs = (values[0] * 10 + values[1]) as i32 * 60 * 60
+        + (values[2] * 10 + values[3]) as i32 * 60;
 
-    if let Ok(fixed_offset) = parse(&mut parsed, tz, StrftimeItems::new("%#z"))
-        .and_then(|_| parsed.to_fixed_offset())
-    {
-        return Ok(fixed_offset);
+    match bytes[0] {
+        b'+' => FixedOffset::east_opt(secs),
+        b'-' => FixedOffset::west_opt(secs),
+        _ => None,
     }
-
-    Err(ArrowError::ParseError(format!(
-        "Invalid timezone \"{tz}\": Expected format [+-]XX:XX, [+-]XX, or 
[+-]XXXX"
-    )))
 }
 
 #[cfg(feature = "chrono-tz")]
@@ -83,12 +88,11 @@ mod private {
         type Err = ArrowError;
 
         fn from_str(tz: &str) -> Result<Self, Self::Err> {
-            if tz.starts_with('+') || tz.starts_with('-') {
-                Ok(Self(TzInner::Offset(parse_fixed_offset(tz)?)))
-            } else {
-                Ok(Self(TzInner::Timezone(tz.parse().map_err(|e| {
+            match parse_fixed_offset(tz) {
+                Some(offset) => Ok(Self(TzInner::Offset(offset))),
+                None => Ok(Self(TzInner::Timezone(tz.parse().map_err(|e| {
                     ArrowError::ParseError(format!("Invalid timezone \"{tz}\": 
{e}"))
-                })?)))
+                })?))),
             }
         }
     }
@@ -261,13 +265,12 @@ mod private {
         type Err = ArrowError;
 
         fn from_str(tz: &str) -> Result<Self, Self::Err> {
-            if tz.starts_with('+') || tz.starts_with('-') {
-                Ok(Self(parse_fixed_offset(tz)?))
-            } else {
-                Err(ArrowError::ParseError(format!(
+            let offset = parse_fixed_offset(tz).ok_or_else(|| {
+                ArrowError::ParseError(format!(
                     "Invalid timezone \"{tz}\": only offset based timezones 
supported without chrono-tz feature"
-                )))
-            }
+                ))
+            })?;
+            Ok(Self(offset))
         }
     }
 
diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml
index 688e0001f..c383369c4 100644
--- a/arrow-cast/Cargo.toml
+++ b/arrow-cast/Cargo.toml
@@ -48,5 +48,10 @@ num = { version = "0.4", default-features = false, features 
= ["std"] }
 lexical-core = { version = "^0.8", default-features = false, features = 
["write-integers", "write-floats", "parse-integers", "parse-floats"] }
 
 [dev-dependencies]
+criterion = { version = "0.4", default-features = false }
 
 [build-dependencies]
+
+[[bench]]
+name = "parse_timestamp"
+harness = false
diff --git a/arrow-cast/benches/parse_timestamp.rs 
b/arrow-cast/benches/parse_timestamp.rs
new file mode 100644
index 000000000..d3ab41863
--- /dev/null
+++ b/arrow-cast/benches/parse_timestamp.rs
@@ -0,0 +1,44 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow_cast::parse::string_to_timestamp_nanos;
+use criterion::*;
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let timestamps = [
+        "2020-09-08",
+        "2020-09-08T13:42:29",
+        "2020-09-08T13:42:29.190",
+        "2020-09-08T13:42:29.190855",
+        "2020-09-08T13:42:29.190855999",
+        "2020-09-08T13:42:29+00:00",
+        "2020-09-08T13:42:29.190+00:00",
+        "2020-09-08T13:42:29.190855+00:00",
+        "2020-09-08T13:42:29.190855999-05:00",
+        "2020-09-08T13:42:29.190855Z",
+    ];
+
+    for timestamp in timestamps {
+        let t = black_box(timestamp);
+        c.bench_function(t, |b| {
+            b.iter(|| string_to_timestamp_nanos(t).unwrap());
+        });
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs
index ae9016654..35bf62969 100644
--- a/arrow-cast/src/cast.rs
+++ b/arrow-cast/src/cast.rs
@@ -4914,7 +4914,7 @@ mod tests {
                 let err = cast_with_options(array, &to_type, 
&options).unwrap_err();
                 assert_eq!(
                     err.to_string(),
-                    "Cast error: Error parsing 'Not a valid date' as timestamp"
+                    "Parser error: Error parsing timestamp from 'Not a valid 
date': error parsing date"
                 );
             }
         }
@@ -7899,8 +7899,12 @@ mod tests {
             ]);
 
             let array = Arc::new(valid) as ArrayRef;
-            let b = cast(&array, &DataType::Timestamp(TimeUnit::Nanosecond, 
Some(tz)))
-                .unwrap();
+            let b = cast_with_options(
+                &array,
+                &DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)),
+                &CastOptions { safe: false },
+            )
+            .unwrap();
 
             let c = b
                 .as_any()
diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs
index 7f6ca742d..38fb4fc29 100644
--- a/arrow-cast/src/parse.rs
+++ b/arrow-cast/src/parse.rs
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use arrow_array::timezone::Tz;
 use arrow_array::types::*;
 use arrow_array::{ArrowNativeTypeOp, ArrowPrimitiveType};
 use arrow_buffer::ArrowNativeType;
@@ -22,6 +23,116 @@ use arrow_schema::ArrowError;
 use chrono::prelude::*;
 use std::str::FromStr;
 
+/// Helper for parsing timestamps
+struct TimestampParser {
+    /// The timestamp bytes to parse minus `b'0'`
+    ///
+    /// This makes interpretation as an integer inexpensive
+    digits: [u8; 32],
+    /// A mask containing a `1` bit where the corresponding byte is a valid 
ASCII digit
+    mask: u32,
+}
+
+impl TimestampParser {
+    fn new(bytes: &[u8]) -> Self {
+        let mut digits = [0; 32];
+        let mut mask = 0;
+
+        // Treating all bytes the same way, helps LLVM vectorise this correctly
+        for (idx, (o, i)) in digits.iter_mut().zip(bytes).enumerate() {
+            *o = i.wrapping_sub(b'0');
+            mask |= ((*o < 10) as u32) << idx
+        }
+
+        Self { digits, mask }
+    }
+
+    /// Returns true if the byte at `idx` in the original string equals `b`
+    fn test(&self, idx: usize, b: u8) -> bool {
+        self.digits[idx] == b.wrapping_sub(b'0')
+    }
+
+    /// Parses a date of the form `1997-01-31`
+    fn date(&self) -> Option<NaiveDate> {
+        if self.mask & 0b1111111111 != 0b1101101111
+            || !self.test(4, b'-')
+            || !self.test(7, b'-')
+        {
+            return None;
+        }
+
+        let year = self.digits[0] as u16 * 1000
+            + self.digits[1] as u16 * 100
+            + self.digits[2] as u16 * 10
+            + self.digits[3] as u16;
+
+        let month = self.digits[5] * 10 + self.digits[6];
+        let day = self.digits[8] * 10 + self.digits[9];
+
+        NaiveDate::from_ymd_opt(year as _, month as _, day as _)
+    }
+
+    /// Parses a time of any of forms
+    /// - `09:26:56`
+    /// - `09:26:56.123`
+    /// - `09:26:56.123456`
+    /// - `09:26:56.123456789`
+    /// - `092656`
+    ///
+    /// Returning the end byte offset
+    fn time(&self) -> Option<(NaiveTime, usize)> {
+        match (self.mask >> 11) & 0b11111111 {
+            // 09:26:56
+            0b11011011 if self.test(13, b':') && self.test(16, b':') => {
+                let hour = self.digits[11] * 10 + self.digits[12];
+                let minute = self.digits[14] * 10 + self.digits[15];
+                let second = self.digits[17] * 10 + self.digits[18];
+                let time = NaiveTime::from_hms_opt(hour as _, minute as _, 
second as _)?;
+
+                let millis = || {
+                    self.digits[20] as u32 * 100_000_000
+                        + self.digits[21] as u32 * 10_000_000
+                        + self.digits[22] as u32 * 1_000_000
+                };
+
+                let micros = || {
+                    self.digits[23] as u32 * 100_000
+                        + self.digits[24] as u32 * 10_000
+                        + self.digits[25] as u32 * 1_000
+                };
+
+                let nanos = || {
+                    self.digits[26] as u32 * 100
+                        + self.digits[27] as u32 * 10
+                        + self.digits[28] as u32
+                };
+
+                match self.test(19, b'.') {
+                    true => match (self.mask >> 20).trailing_ones() {
+                        3 => Some((time.with_nanosecond(millis())?, 23)),
+                        6 => Some((time.with_nanosecond(millis() + micros())?, 
26)),
+                        9 => Some((
+                            time.with_nanosecond(millis() + micros() + 
nanos())?,
+                            29,
+                        )),
+                        _ => None,
+                    },
+                    false => Some((time, 19)),
+                }
+            }
+            // 092656
+            0b111111 => {
+                let hour = self.digits[11] * 10 + self.digits[12];
+                let minute = self.digits[13] * 10 + self.digits[14];
+                let second = self.digits[15] * 10 + self.digits[16];
+                let time = NaiveTime::from_hms_opt(hour as _, minute as _, 
second as _)?;
+                Some((time, 17))
+            }
+            _ => None,
+        }
+    }
+}
+
 /// Accepts a string and parses it relative to the provided `timezone`
 ///
 /// In addition to RFC3339 / ISO8601 standard timestamps, it also
@@ -32,105 +143,83 @@ use std::str::FromStr;
 /// * `1997-01-31T09:26:56.123Z`        # RCF3339
 /// * `1997-01-31T09:26:56.123-05:00`   # RCF3339
 /// * `1997-01-31 09:26:56.123-05:00`   # close to RCF3339 but with a space 
rather than T
+/// * `2023-01-01 04:05:06.789 -08`     # close to RCF3339, no fractional 
seconds or time separator
 /// * `1997-01-31T09:26:56.123`         # close to RCF3339 but no timezone 
offset specified
 /// * `1997-01-31 09:26:56.123`         # close to RCF3339 but uses a space 
and no timezone offset
 /// * `1997-01-31 09:26:56`             # close to RCF3339, no fractional 
seconds
+/// * `1997-01-31 092656`               # close to RCF3339, no fractional 
seconds
+/// * `1997-01-31 092656+04:00`         # close to RCF3339, no fractional 
seconds or time separator
 /// * `1997-01-31`                      # close to RCF3339, only date no time
 ///
-/// Some formats that supported by PostgresSql 
<https://www.postgresql.org/docs/current/datatype-datetime.html#DATATYPE-DATETIME-TIME-TABLE>
-/// still not supported by chrono, like
-///     "2023-01-01 040506 America/Los_Angeles",
-///     "2023-01-01 04:05:06.789 +07:30:00",
-///     "2023-01-01 040506 +07:30:00",
-///     "2023-01-01 04:05:06.789 PST",
-///     "2023-01-01 04:05:06.789 -08",
+/// [IANA timezones] are only supported if the `arrow-array/chrono-tz` feature 
is enabled
+///
+/// * `2023-01-01 040506 America/Los_Angeles`
+///
+/// If a timestamp is ambiguous, for example as a result of daylight-savings 
time, an error
+/// will be returned
+///
+/// Some formats supported by PostgresSql 
<https://www.postgresql.org/docs/current/datatype-datetime.html#DATATYPE-DATETIME-TIME-TABLE>
+/// are not supported, like
+///
+/// * "2023-01-01 04:05:06.789 +07:30:00",
+/// * "2023-01-01 040506 +07:30:00",
+/// * "2023-01-01 04:05:06.789 PST",
+///
+/// [IANA timezones]: https://www.iana.org/time-zones
 pub fn string_to_datetime<T: TimeZone>(
     timezone: &T,
     s: &str,
 ) -> Result<DateTime<T>, ArrowError> {
-    // Fast path:  RFC3339 timestamp (with a T)
-    // Example: 2020-09-08T13:42:29.190855Z
-    if let Ok(ts) = DateTime::parse_from_rfc3339(s) {
-        return Ok(ts.with_timezone(timezone));
-    }
-
-    // Implement quasi-RFC3339 support by trying to parse the
-    // timestamp with various other format specifiers to to support
-    // separating the date and time with a space ' ' rather than 'T' to be
-    // (more) compatible with Apache Spark SQL
-
-    let supported_formats = vec![
-        "%Y-%m-%d %H:%M:%S%.f%:z", // Example: 2020-09-08 13:42:29.190855-05:00
-        "%Y-%m-%d %H%M%S%.3f%:z",  // Example: "2023-01-01 040506 +07:30"
-    ];
-
-    for f in supported_formats.iter() {
-        if let Ok(ts) = DateTime::parse_from_str(s, f) {
-            return Ok(ts.with_timezone(timezone));
-        }
-    }
+    let err = |ctx: &str| {
+        ArrowError::ParseError(format!("Error parsing timestamp from '{s}': 
{ctx}"))
+    };
 
-    // with an explicit Z, using ' ' as a separator
-    // Example: 2020-09-08 13:42:29Z
-    if let Ok(ts) = Utc.datetime_from_str(s, "%Y-%m-%d %H:%M:%S%.fZ") {
-        return Ok(ts.with_timezone(timezone));
+    let bytes = s.as_bytes();
+    if bytes.len() < 10 {
+        return Err(err("timestamp must contain at least 10 characters"));
     }
 
-    // Support timestamps without an explicit timezone offset, again
-    // to be compatible with what Apache Spark SQL does.
+    let parser = TimestampParser::new(bytes);
+    let date = parser.date().ok_or_else(|| err("error parsing date"))?;
+    if bytes.len() == 10 {
+        let offset = timezone.offset_from_local_date(&date);
+        let offset = offset
+            .single()
+            .ok_or_else(|| err("error computing timezone offset"))?;
 
-    // without a timezone specifier as a local time, using T as a separator
-    // Example: 2020-09-08T13:42:29.190855
-    if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S%.f") {
-        if let Some(offset) = 
timezone.offset_from_local_datetime(&ts).single() {
-            return Ok(DateTime::from_local(ts, offset));
-        }
+        let time = NaiveTime::from_hms_opt(0, 0, 0).unwrap();
+        return Ok(DateTime::from_local(date.and_time(time), offset));
     }
 
-    // without a timezone specifier as a local time, using T as a
-    // separator, no fractional seconds
-    // Example: 2020-09-08T13:42:29
-    if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S") {
-        if let Some(offset) = 
timezone.offset_from_local_datetime(&ts).single() {
-            return Ok(DateTime::from_local(ts, offset));
-        }
+    if !parser.test(10, b'T') && !parser.test(10, b' ') {
+        return Err(err("invalid timestamp separator"));
     }
 
-    // without a timezone specifier as a local time, using ' ' as a separator
-    // Example: 2020-09-08 13:42:29.190855
-    if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f") {
-        if let Some(offset) = 
timezone.offset_from_local_datetime(&ts).single() {
-            return Ok(DateTime::from_local(ts, offset));
-        }
+    let (time, tz_offset) = parser.time().ok_or_else(|| err("error parsing 
time"))?;
+    let datetime = date.and_time(time);
+    if bytes.len() <= tz_offset {
+        let offset = timezone.offset_from_local_datetime(&datetime);
+        let offset = offset
+            .single()
+            .ok_or_else(|| err("error computing timezone offset"))?;
+        return Ok(DateTime::from_local(datetime, offset));
     }
 
-    // without a timezone specifier as a local time, using ' ' as a
-    // separator, no fractional seconds
-    // Example: 2020-09-08 13:42:29
-    if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S") {
-        if let Some(offset) = 
timezone.offset_from_local_datetime(&ts).single() {
-            return Ok(DateTime::from_local(ts, offset));
-        }
+    if bytes[tz_offset] == b'z' || bytes[tz_offset] == b'Z' {
+        let offset = timezone.offset_from_local_datetime(&datetime);
+        let offset = offset
+            .single()
+            .ok_or_else(|| err("error computing timezone offset"))?;
+        return Ok(DateTime::from_utc(datetime, offset));
     }
 
-    // without a timezone specifier as a local time, only date
-    // Example: 2020-09-08
-    if let Ok(dt) = NaiveDate::parse_from_str(s, "%Y-%m-%d") {
-        if let Some(ts) = dt.and_hms_opt(0, 0, 0) {
-            if let Some(offset) = 
timezone.offset_from_local_datetime(&ts).single() {
-                return Ok(DateTime::from_local(ts, offset));
-            }
-        }
-    }
-
-    // Note we don't pass along the error message from the underlying
-    // chrono parsing because we tried several different format
-    // strings and we don't know which the user was trying to
-    // match. Ths any of the specific error messages is likely to be
-    // be more confusing than helpful
-    Err(ArrowError::CastError(format!(
-        "Error parsing '{s}' as timestamp"
-    )))
+    // Parse remainder of string as timezone
+    let parsed_tz: Tz = s[tz_offset..].trim_start().parse()?;
+    let offset = parsed_tz.offset_from_local_datetime(&datetime);
+    let offset = offset
+        .single()
+        .ok_or_else(|| err("error computing timezone offset"))?;
+    Ok(DateTime::<Tz>::from_local(datetime, offset).with_timezone(timezone))
 }
 
 /// Accepts a string in RFC3339 / ISO8601 standard format and some
@@ -418,19 +507,20 @@ const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates 
that can be represented a
 
 impl Parser for Date32Type {
     fn parse(string: &str) -> Option<i32> {
-        let date = string.parse::<chrono::NaiveDate>().ok()?;
+        let parser = TimestampParser::new(string.as_bytes());
+        let date = parser.date()?;
         Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE)
     }
 
     fn parse_formatted(string: &str, format: &str) -> Option<i32> {
-        let date = chrono::NaiveDate::parse_from_str(string, format).ok()?;
+        let date = NaiveDate::parse_from_str(string, format).ok()?;
         Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE)
     }
 }
 
 impl Parser for Date64Type {
     fn parse(string: &str) -> Option<i64> {
-        let date_time = string.parse::<NaiveDateTime>().ok()?;
+        let date_time = string_to_datetime(&Utc, string).ok()?;
         Some(date_time.timestamp_millis())
     }
 
@@ -896,14 +986,35 @@ mod tests {
     #[test]
     fn string_to_timestamp_invalid() {
         // Test parsing invalid formats
+        let cases = [
+            ("", "timestamp must contain at least 10 characters"),
+            ("SS", "timestamp must contain at least 10 characters"),
+            ("Wed, 18 Feb 2015 23:16:09 GMT", "error parsing date"),
+            ("1997-01-31H09:26:56.123Z", "invalid timestamp separator"),
+            ("1997-01-31  09:26:56.123Z", "error parsing time"),
+            ("1997:01:31T09:26:56.123Z", "error parsing date"),
+            ("1997:1:31T09:26:56.123Z", "error parsing date"),
+            ("1997-01-32T09:26:56.123Z", "error parsing date"),
+            ("1997-13-32T09:26:56.123Z", "error parsing date"),
+            ("1997-02-29T09:26:56.123Z", "error parsing date"),
+            ("2015-02-30T17:35:20-08:00", "error parsing date"),
+            ("1997-01-10T9:26:56.123Z", "error parsing time"),
+            ("2015-01-20T25:35:20-08:00", "error parsing time"),
+            ("1997-01-10T09:61:56.123Z", "error parsing time"),
+            ("1997-01-10T09:61:90.123Z", "error parsing time"),
+            ("1997-01-10T12:00:56.12Z", "error parsing time"),
+            ("1997-01-10T12:00:56.1234Z", "error parsing time"),
+            ("1997-01-10T12:00:56.12345Z", "error parsing time"),
+            ("1997-01-10T12:00:6.123Z", "error parsing time"),
+            ("1997-01-31T092656.123Z", "error parsing time"),
+        ];
 
-        // It would be nice to make these messages better
-        expect_timestamp_parse_error("", "Error parsing '' as timestamp");
-        expect_timestamp_parse_error("SS", "Error parsing 'SS' as timestamp");
-        expect_timestamp_parse_error(
-            "Wed, 18 Feb 2015 23:16:09 GMT",
-            "Error parsing 'Wed, 18 Feb 2015 23:16:09 GMT' as timestamp",
-        );
+        for (s, ctx) in cases {
+            let expected =
+                format!("Parser error: Error parsing timestamp from '{s}': 
{ctx}");
+            let actual = string_to_datetime(&Utc, s).unwrap_err().to_string();
+            assert_eq!(actual, expected)
+        }
     }
 
     // Parse a timestamp to timestamp int with a useful human readable error 
message
@@ -915,18 +1026,6 @@ mod tests {
         result
     }
 
-    fn expect_timestamp_parse_error(s: &str, expected_err: &str) {
-        match string_to_timestamp_nanos(s) {
-            Ok(v) => panic!(
-                "Expected error '{expected_err}' while parsing '{s}', but 
parsed {v} instead"
-            ),
-            Err(e) => {
-                assert!(e.to_string().contains(expected_err),
-                        "Can not find expected error '{expected_err}' while 
parsing '{s}'. Actual error '{e}'");
-            }
-        }
-    }
-
     #[test]
     fn string_without_timezone_to_timestamp() {
         // string without timezone should always output the same regardless 
the local or session timezone
diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml
index 08fc5513d..587f42d7e 100644
--- a/arrow/Cargo.toml
+++ b/arrow/Cargo.toml
@@ -296,3 +296,7 @@ required-features = ["pyarrow"]
 [[test]]
 name = "array_cast"
 required-features = ["chrono-tz"]
+
+[[test]]
+name = "timezone"
+required-features = ["chrono-tz"]
diff --git a/arrow/tests/timezone.rs b/arrow/tests/timezone.rs
new file mode 100644
index 000000000..b71d04d64
--- /dev/null
+++ b/arrow/tests/timezone.rs
@@ -0,0 +1,81 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow_cast::parse::string_to_datetime;
+use chrono::Utc;
+
+#[test]
+fn test_parse_timezone() {
+    let cases = [
+        (
+            "2023-01-01 040506 America/Los_Angeles",
+            "2023-01-01T12:05:06+00:00",
+        ),
+        (
+            "2023-01-01 04:05:06.345 America/Los_Angeles",
+            "2023-01-01T12:05:06.345+00:00",
+        ),
+        (
+            "2023-01-01 04:05:06.345 America/Los_Angeles",
+            "2023-01-01T12:05:06.345+00:00",
+        ),
+        (
+            "2023-01-01 04:05:06.789 -08",
+            "2023-01-01T12:05:06.789+00:00",
+        ),
+        (
+            "2023-03-12 040506 America/Los_Angeles",
+            "2023-03-12T11:05:06+00:00",
+        ), // Daylight savings
+    ];
+
+    for (s, expected) in cases {
+        let actual = string_to_datetime(&Utc, s).unwrap().to_rfc3339();
+        assert_eq!(actual, expected, "{s}")
+    }
+}
+
+#[test]
+fn test_parse_timezone_invalid() {
+    let cases = [
+        (
+            "2015-01-20T17:35:20-24:00",
+            "Parser error: Invalid timezone \"-24:00\": '-24:00' is not a 
valid timezone",
+        ),
+        (
+            "2023-01-01 04:05:06.789 +07:30:00",
+            "Parser error: Invalid timezone \"+07:30:00\": '+07:30:00' is not 
a valid timezone"
+        ),
+        (
+            // Sunday, 12 March 2023, 02:00:00 clocks are turned forward 1 
hour to
+            // Sunday, 12 March 2023, 03:00:00 local daylight time instead.
+            "2023-03-12 02:05:06 America/Los_Angeles",
+            "Parser error: Error parsing timestamp from '2023-03-12 02:05:06 
America/Los_Angeles': error computing timezone offset",
+        ),
+        (
+            // Sunday, 5 November 2023, 02:00:00 clocks are turned backward 1 
hour to
+            // Sunday, 5 November 2023, 01:00:00 local standard time instead.
+            "2023-11-05 01:30:06 America/Los_Angeles",
+            "Parser error: Error parsing timestamp from '2023-11-05 01:30:06 
America/Los_Angeles': error computing timezone offset",
+        ),
+    ];
+
+    for (s, expected) in cases {
+        let actual = string_to_datetime(&Utc, s).unwrap_err().to_string();
+        assert_eq!(actual, expected)
+    }
+}

Reply via email to