kosiew commented on code in PR #18025:
URL: https://github.com/apache/datafusion/pull/18025#discussion_r2497250919


##########
datafusion/functions/src/datetime/common.rs:
##########
@@ -42,6 +47,506 @@ pub(crate) fn string_to_timestamp_nanos_shim(s: &str) -> Result<i64> {
     string_to_timestamp_nanos(s).map_err(|e| e.into())
 }
 
+/// Resolved form of a configured time zone.
+#[derive(Clone, Copy, Debug)]
+enum ConfiguredZone {
+    /// A zone held as a `Tz` value (produced by `Tz::from_str` in `ConfiguredTimeZone::parse`).
+    Named(Tz),
+    /// A zone held as a `FixedOffset` (produced by `parse_fixed_offset` or `ConfiguredTimeZone::utc`).
+    Offset(FixedOffset),
+}
+
+/// A parsed time zone together with the text it was parsed from.
+///
+/// Equality, hashing and `Debug` output for this type are based on `repr` only.
+#[derive(Clone)]
+pub(crate) struct ConfiguredTimeZone {
+    /// Textual form of the zone; passed along for use in error messages.
+    repr: Arc<str>,
+    /// Resolved zone used when converting local times.
+    zone: ConfiguredZone,
+}
+
+impl ConfiguredTimeZone {
+    /// Returns the UTC zone, represented as the fixed offset `+00:00`.
+    pub(crate) fn utc() -> Self {
+        Self {
+            repr: Arc::from("+00:00"),
+            zone: ConfiguredZone::Offset(
+                FixedOffset::east_opt(0).expect("a zero offset is always in range"),
+            ),
+        }
+    }
+
+    /// Parses a time zone setting.
+    ///
+    /// Surrounding whitespace is ignored. Returns `Ok(None)` for an empty (or
+    /// all-whitespace) string and `Ok(Some(_))` when the text is accepted by
+    /// `Tz::from_str` or, failing that, by `parse_fixed_offset`.
+    ///
+    /// # Errors
+    /// Returns an execution error when the text matches neither form.
+    pub(crate) fn parse(tz: &str) -> Result<Option<Self>> {
+        let tz = tz.trim();
+        if tz.is_empty() {
+            return Ok(None);
+        }
+
+        // Named zones are tried first; fixed offsets are the fallback.
+        let zone = if let Ok(named) = Tz::from_str(tz) {
+            ConfiguredZone::Named(named)
+        } else if let Some(offset) = parse_fixed_offset(tz) {
+            ConfiguredZone::Offset(offset)
+        } else {
+            // The message is kept on a single line so that no line break ends up
+            // inside the text shown to the user.
+            return Err(exec_datafusion_err!(
+                "Invalid execution timezone '{tz}'. Please provide an IANA timezone name (e.g. 'America/New_York') or an offset in the form '+HH:MM'."
+            ));
+        };
+
+        Ok(Some(Self {
+            repr: Arc::from(tz),
+            zone,
+        }))
+    }
+
+    /// Builds the zone from `config.execution.time_zone`.
+    ///
+    /// Falls back to UTC when the setting is empty or cannot be parsed.
+    pub(crate) fn from_config(config: &ConfigOptions) -> Self {
+        match Self::parse(&config.execution.time_zone) {
+            Ok(Some(tz)) => tz,
+            // NOTE(review): a parse error is discarded here, so an invalid setting is
+            // silently treated as UTC - confirm this best-effort fallback is intended.
+            Ok(None) | Err(_) => Self::utc(),
+        }
+    }
+
+    /// Interprets `naive` as a local time in this zone and returns the matching
+    /// UTC timestamp in nanoseconds.
+    ///
+    /// # Errors
+    /// Propagates the error from `local_datetime_to_timestamp` when the local
+    /// time is ambiguous or does not exist in this zone.
+    fn timestamp_from_naive(&self, naive: &NaiveDateTime) -> Result<i64> {
+        match self.zone {
+            ConfiguredZone::Named(tz) => {
+                local_datetime_to_timestamp(tz.from_local_datetime(naive), &self.repr)
+            }
+            ConfiguredZone::Offset(offset) => {
+                local_datetime_to_timestamp(offset.from_local_datetime(naive), &self.repr)
+            }
+        }
+    }
+
+    /// Parses `s` with `format` through `string_to_datetime_formatted`, using this
+    /// zone, and converts the result to UTC.
+    fn datetime_from_formatted(&self, s: &str, format: &str) -> Result<DateTime<Utc>> {
+        let datetime = match self.zone {
+            ConfiguredZone::Named(tz) => {
+                string_to_datetime_formatted(&tz, s, format)?.with_timezone(&Utc)
+            }
+            ConfiguredZone::Offset(offset) => {
+                string_to_datetime_formatted(&offset, s, format)?.with_timezone(&Utc)
+            }
+        };
+        Ok(datetime)
+    }
+}
+
+/// `Debug` output shows only the textual `repr`; the resolved `zone` field is omitted.
+impl fmt::Debug for ConfiguredTimeZone {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("ConfiguredTimeZone")
+            .field("repr", &self.repr)
+            .finish()
+    }
+}
+
+/// Two configured zones compare equal when their textual representations match.
+impl PartialEq for ConfiguredTimeZone {
+    fn eq(&self, other: &Self) -> bool {
+        // Compare the underlying string contents.
+        *self.repr == *other.repr
+    }
+}
+
+// String equality is reflexive, so the comparison above is a full equivalence relation.
+impl Eq for ConfiguredTimeZone {}
+
+/// Hashes only the textual representation, the same data `eq` compares.
+impl Hash for ConfiguredTimeZone {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        let text: &str = &self.repr;
+        text.hash(state);
+    }
+}
+
+/// Parses a fixed UTC offset such as `+05:30`, `-0800`, `UTC` or `Z`.
+///
+/// Accepted forms, after trimming surrounding whitespace:
+/// - `utc` or `z` (case-insensitive), meaning a zero offset
+/// - a `+`/`-` sign followed by hours and minutes separated by `:` (e.g. `+05:30`)
+/// - a `+`/`-` sign followed by exactly four characters read as `HHMM` (e.g. `-0800`)
+///
+/// Hours and minutes must consist of ASCII digits only, with hours at most 23
+/// and minutes at most 59. Returns `None` for anything else.
+fn parse_fixed_offset(tz: &str) -> Option<FixedOffset> {
+    let tz = tz.trim();
+    if tz.eq_ignore_ascii_case("utc") || tz.eq_ignore_ascii_case("z") {
+        return FixedOffset::east_opt(0);
+    }
+
+    let (sign, rest) = if let Some(rest) = tz.strip_prefix('+') {
+        (1, rest)
+    } else if let Some(rest) = tz.strip_prefix('-') {
+        (-1, rest)
+    } else {
+        return None;
+    };
+
+    let (hours, minutes) = match rest.split_once(':') {
+        Some(parts) => parts,
+        // `split_at` panics when the index is not a char boundary, which can happen
+        // for a 4-byte remainder containing a multi-byte character.
+        None if rest.len() == 4 && rest.is_char_boundary(2) => rest.split_at(2),
+        None => return None,
+    };
+
+    // `str::parse::<i32>` accepts a leading `+` or `-`, which would let inputs such
+    // as `+-05:00` or `+05:-30` through; require plain digits instead.
+    let is_all_digits = |s: &str| !s.is_empty() && s.bytes().all(|b| b.is_ascii_digit());
+    if !is_all_digits(hours) || !is_all_digits(minutes) {
+        return None;
+    }
+
+    let hours: i32 = hours.parse().ok()?;
+    let minutes: i32 = minutes.parse().ok()?;
+    if hours > 23 || minutes > 59 {
+        return None;
+    }
+
+    let total_minutes = hours * 60 + minutes;
+    let total_seconds = sign * total_minutes * 60;
+    FixedOffset::east_opt(total_seconds)
+}
+
+/// Converts a local datetime result to a UTC timestamp in nanoseconds.
+///
+/// # DST Transition Behavior
+///
+/// This function handles daylight saving time (DST) transitions by returning 
an error
+/// when the local time is ambiguous or invalid:
+///
+/// ## Ambiguous Times (Fall Back)
+/// When clocks "fall back" (e.g., 2:00 AM becomes 1:00 AM), times in the 
repeated hour
+/// exist twice. For example, in America/New_York on 2024-11-03:
+/// - `2024-11-03 01:30:00` occurs both at UTC 05:30 (EDT) and UTC 06:30 (EST)
+///
+/// DataFusion returns an error rather than silently choosing one 
interpretation,
+/// ensuring users are aware of the ambiguity.
+///
+/// ## Invalid Times (Spring Forward)
+/// When clocks "spring forward" (e.g., 2:00 AM becomes 3:00 AM), times in the 
skipped hour
+/// don't exist. For example, in America/New_York on 2024-03-10:
+/// - `2024-03-10 02:30:00` never occurred (clocks jumped from 02:00 to 03:00)
+///
+/// DataFusion returns an error for these non-existent times.
+///
+/// ## Workarounds
+/// To avoid ambiguity errors:
+/// 1. Use timestamps with explicit timezone offsets (e.g., `2024-11-03 
01:30:00-05:00`)
+/// 2. Convert to UTC before processing
+/// 3. Use a timezone without DST (e.g., UTC, `America/Phoenix`)
+fn local_datetime_to_timestamp<T: TimeZone>(
+    result: LocalResult<DateTime<T>>,
+    tz_repr: &str,
+) -> Result<i64> {
+    match result {
+        Single(dt) => datetime_to_timestamp(dt.with_timezone(&Utc)),
+        LocalResult::Ambiguous(dt1, dt2) => Err(exec_datafusion_err!(
+            "The local time '{:?}' is ambiguous in timezone '{tz_repr}' (also 
corresponds to '{:?}').",
+            dt1.naive_local(),
+            dt2.naive_local()
+        )),
+        LocalResult::None => Err(exec_datafusion_err!(
+            "The local time is invalid in timezone '{tz_repr}'."
+        )),
+    }
+}
+
+/// Converts a UTC datetime to a nanosecond timestamp.
+///
+/// # Errors
+/// Returns an execution error when `timestamp_nanos_opt` yields `None`.
+fn datetime_to_timestamp(datetime: DateTime<Utc>) -> Result<i64> {
+    match datetime.timestamp_nanos_opt() {
+        Some(nanos) => Ok(nanos),
+        None => Err(exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}")),
+    }
+}
+
+/// Converts a nanosecond timestamp into a naive UTC datetime.
+///
+/// # Errors
+/// Returns an execution error when `DateTime::<Utc>::from_timestamp` yields `None`.
+fn timestamp_to_naive(value: i64) -> Result<NaiveDateTime> {
+    const NANOS_PER_SECOND: i64 = 1_000_000_000;
+
+    // Euclidean division keeps the sub-second part non-negative for negative inputs.
+    let whole_seconds = value.div_euclid(NANOS_PER_SECOND);
+    let subsec_nanos = value.rem_euclid(NANOS_PER_SECOND) as u32;
+
+    match DateTime::<Utc>::from_timestamp(whole_seconds, subsec_nanos) {
+        Some(dt) => Ok(dt.naive_utc()),
+        None => Err(exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}")),
+    }
+}
+
+/// Reports whether a timestamp string carries explicit timezone information.
+///
+/// The checks run in this order and stop at the first that succeeds:
+/// 1. the whole string parses as RFC3339 (`has_rfc3339_timezone`)
+/// 2. it contains an offset marker or a `Z`/`z` suffix (`has_offset_marker`)
+/// 3. it contains a named timezone token (`has_named_timezone`)
+///
+/// # Examples
+/// ```ignore
+/// assert!(has_explicit_timezone("2020-09-08T13:42:29Z"));
+/// assert!(has_explicit_timezone("2020-09-08T13:42:29+05:00"));
+/// assert!(has_explicit_timezone("2020-09-08T13:42:29 UTC"));
+/// assert!(!has_explicit_timezone("2020-09-08T13:42:29"));
+/// ```
+fn has_explicit_timezone(value: &str) -> bool {
+    has_rfc3339_timezone(value) || has_offset_marker(value) || has_named_timezone(value)
+}
+
+/// Returns true when `value` parses successfully with `DateTime::parse_from_rfc3339`.
+#[inline]
+fn has_rfc3339_timezone(value: &str) -> bool {
+    let parsed = DateTime::parse_from_rfc3339(value);
+    parsed.is_ok()
+}
+
+/// Scans for a UTC indicator (`Z`/`z`) or a numeric timezone offset.
+///
+/// Matches, for example:
+/// - `2020-09-08T13:42:29Z` (a `Z` directly after a digit and not followed by a letter)
+/// - `2020-09-08T13:42:29+05:00` (offset with colons)
+/// - `2020-09-08T13:42:29+0500` (offset without colons)
+/// - `2020-09-08T13:42:29+05` (two-digit offset)
+///
+/// A sign directly after `e`/`E` is skipped so exponents such as `1.5e+10` are
+/// not treated as offsets. Whether any other sign starts an offset is decided
+/// by `is_valid_offset_at`.
+fn has_offset_marker(value: &str) -> bool {
+    let bytes = value.as_bytes();
+    let len = bytes.len();
+    let mut pos = 0;
+
+    while pos < len {
+        match bytes[pos] {
+            b'Z' | b'z' => {
+                let after_digit = pos > 0 && bytes[pos - 1].is_ascii_digit();
+                let ends_here = pos + 1 == len || !bytes[pos + 1].is_ascii_alphabetic();
+                if after_digit && ends_here {
+                    return true;
+                }
+                pos += 1;
+            }
+            b'+' | b'-' => {
+                // A sign right after an exponent marker belongs to a number.
+                let follows_exponent = pos > 0 && matches!(bytes[pos - 1], b'e' | b'E');
+                if follows_exponent {
+                    pos += 1;
+                    continue;
+                }
+
+                if is_valid_offset_at(bytes, pos, len) {
+                    return true;
+                }
+
+                // Not an offset: step over the sign and the digits that follow it.
+                pos += 1;
+                while pos < len && bytes[pos].is_ascii_digit() {
+                    pos += 1;
+                }
+            }
+            _ => pos += 1,
+        }
+    }
+
+    false
+}
+
+/// Decides whether the sign at position `i` begins a timezone offset.
+///
+/// The digits right after the sign are counted first. If they are followed by
+/// a `:`, the decision is delegated to `is_colon_separated_offset` (e.g.
+/// `+05:00`, `-03:30`). Otherwise exactly 2, 4 or 6 digits are required (e.g.
+/// `+05`, `-0800`, `+053045`) and the surrounding context must pass
+/// `is_context_valid_for_offset`.
+fn is_valid_offset_at(bytes: &[u8], i: usize, len: usize) -> bool {
+    let digits_start = i + 1;
+    let digit_count = bytes[..len]
+        .iter()
+        .skip(digits_start)
+        .take_while(|b| b.is_ascii_digit())
+        .count();
+    let digits_end = digits_start + digit_count;
+
+    // Colon form: the remaining `:MM` sections are validated by the helper.
+    if digits_end < len && bytes[digits_end] == b':' {
+        return is_colon_separated_offset(bytes, digits_end, len);
+    }
+
+    // Compact form: only HH, HHMM or HHMMSS digit runs qualify.
+    matches!(digit_count, 2 | 4 | 6) && is_context_valid_for_offset(bytes, i, digits_end, len)
+}
+
+/// Validates the `:DD` sections of a colon-separated offset (e.g. the `:00` in
+/// `+05:00`, or `:00:45` in `+05:00:45`).
+///
+/// `pos` is expected to point at the first `:`. Each section must be a colon
+/// followed by exactly two digits, at least one section must be present, and
+/// the last section must be followed by the end of input, ASCII whitespace, or
+/// one of `,`, `.`, `:`, `;`.
+fn is_colon_separated_offset(bytes: &[u8], mut pos: usize, len: usize) -> bool {
+    let mut saw_section = false;
+
+    while pos < len && bytes[pos] == b':' {
+        let digits_start = pos + 1;
+        let mut digits_end = digits_start;
+        while digits_end < len && bytes[digits_end].is_ascii_digit() {
+            digits_end += 1;
+        }
+        if digits_end - digits_start != 2 {
+            return false;
+        }
+        saw_section = true;
+        pos = digits_end;
+    }
+
+    if !saw_section {
+        return false;
+    }
+
+    pos == len
+        || bytes[pos].is_ascii_whitespace()
+        || matches!(bytes[pos], b',' | b'.' | b':' | b';')
+}
+
+/// Checks whether the text around a compact offset (`i` = sign position,
+/// `j` = first position after its digits) looks like a time-zone offset.
+///
+/// The sign must not be the first byte. The byte before it must be either:
+/// - `T`, `t`, a space or a tab, or
+/// - a digit, in which case all of the following must hold: a `:` occurs
+///   somewhere before the sign, the byte two positions back is also a digit,
+///   no `-` or `/` occurs in the four bytes before the sign, and the digits
+///   are not directly followed by `-`.
+///
+/// In both cases the digits must be followed by a delimiter
+/// (`is_followed_by_delimiter`). For example `13:42:29+0500` is accepted,
+/// while `05-17+2023` is not (no preceding colon).
+fn is_context_valid_for_offset(bytes: &[u8], i: usize, j: usize, len: usize) -> bool {
+    if i == 0 {
+        return false;
+    }
+
+    let context_ok = match bytes[i - 1] {
+        b'T' | b't' | b' ' | b'\t' => true,
+        prev if prev.is_ascii_digit() => {
+            bytes[..i].contains(&b':')
+                && i >= 2
+                && bytes[i - 2].is_ascii_digit()
+                && !has_recent_dash_or_slash(bytes, i)
+                && (j >= len || bytes[j] != b'-')
+        }
+        _ => false,
+    };
+
+    context_ok && is_followed_by_delimiter(bytes, j, len)
+}
+
+/// Returns true when a `-` or `/` occurs in the (up to) four bytes before position `i`.
+///
+/// Used to tell date separators apart from offset signs: in `05-17-2023` the
+/// second `-` has another `-` shortly before it.
+#[inline]
+fn has_recent_dash_or_slash(bytes: &[u8], i: usize) -> bool {
+    let window = &bytes[i.saturating_sub(4)..i];
+    window.contains(&b'-') || window.contains(&b'/')
+}
+
+/// Returns true when `j` is the end of the input, or the byte at `j` is ASCII
+/// whitespace or one of `,`, `.`, `:`, `;`.
+#[inline]
+fn is_followed_by_delimiter(bytes: &[u8], j: usize, len: usize) -> bool {
+    if j == len {
+        return true;
+    }
+
+    let next = bytes[j];
+    next.is_ascii_whitespace() || matches!(next, b',' | b'.' | b':' | b';')
+}
+
+/// Scans for named timezone identifiers (e.g., `UTC`, `GMT`, 
`America/New_York`).
+///
+/// This performs a token-based scan looking for:
+/// - Common abbreviations: `UTC`, `GMT`
+/// - IANA timezone names: `America/New_York`, `Europe/London`
+///
+/// The scan looks for timezone tokens anywhere in the string, as some formats
+/// place timezone names at the beginning or end (e.g., `UTC 2024-01-01` or
+/// `2024-01-01 America/New_York`).
+fn has_named_timezone(value: &str) -> bool {
+    let bytes = value.as_bytes();
+    let len = bytes.len();
+    let mut start = 0;
+
+    while start < len {
+        // Skip non-token characters
+        while start < len && !is_token_char(bytes[start]) {
+            start += 1;
+        }
+
+        if start == len {
+            break;
+        }
+
+        // Extract token
+        let mut end = start;
+        let mut has_alpha = false;
+        while end < len && is_token_char(bytes[end]) {
+            if bytes[end].is_ascii_alphabetic() {
+                has_alpha = true;
+            }
+            end += 1;
+        }
+
+        // Check if token (or suffix) is a timezone
+        if has_alpha {
+            let token = &value[start..end];
+            if is_timezone_name(token) {
+                return true;
+            }
+
+            // Check suffixes (e.g., "PST" in "12:00PST")
+            for (offset, ch) in token.char_indices().skip(1) {
+                if ch.is_ascii_alphabetic() {
+                    let candidate = &token[offset..];
+                    if is_timezone_name(candidate) {
+                        return true;
+                    }
+                }
+            }
+        }
+
+        start = end;
+    }
+
+    false
+}
+
+/// Returns true for bytes that may appear inside a timezone token: ASCII
+/// letters and digits plus `/`, `_`, `-` and `+`.
+#[inline]
+fn is_token_char(b: u8) -> bool {
+    b.is_ascii_alphanumeric() || matches!(b, b'/' | b'_' | b'-' | b'+')
+}
+
+/// Checks if a token is a recognized timezone name.
+///
+/// Recognizes:
+/// - Common abbreviations: `UTC`, `GMT`
+/// - IANA timezone database names (via `Tz::from_str`)
+/// - Timezone names with trailing offset info (e.g., `PST+8`)
+fn is_timezone_name(token: &str) -> bool {
+    if token.is_empty() {
+        return false;
+    }
+
+    // Check common abbreviations
+    if token.eq_ignore_ascii_case("utc") || token.eq_ignore_ascii_case("gmt") {
+        return true;
+    }
+
+    // Check IANA timezone database
+    if Tz::from_str(token).is_ok() {
+        return true;
+    }
+
+    // Handle timezone names with trailing offset (e.g., "PST+8")

Review Comment:
   Added `detects_named_timezones_with_trailing_offsets`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to