andygrove commented on code in PR #3730:
URL: https://github.com/apache/datafusion-comet/pull/3730#discussion_r2965853011


##########
native/spark-expr/src/conversion_funcs/string.rs:
##########
@@ -1092,40 +1109,119 @@ fn timestamp_parser<T: TimeZone>(
     if value.is_empty() {
         return Ok(None);
     }
-    // Define regex patterns and corresponding parsing functions
-    let patterns = &[
+
+    // Handle Z or ±HH:MM offset suffix: strip it and parse with the explicit 
fixed offset.
+    if let Some((stripped, offset_secs)) = extract_offset_suffix(value) {
+        let fixed_tz = chrono::FixedOffset::east_opt(offset_secs)
+            .ok_or_else(|| SparkError::Internal("Invalid timezone 
offset".to_string()))?;
+        return timestamp_parser_with_tz(stripped, eval_mode, &fixed_tz);
+    }
+
+    timestamp_parser_with_tz(value, eval_mode, tz)
+}
+
+/// If `value` ends with a UTC offset suffix (`Z`, `+HH:MM`, or `-HH:MM`), 
returns the
+/// stripped string and the offset in seconds. Returns `None` if no offset 
suffix is present.
+fn extract_offset_suffix(value: &str) -> Option<(&str, i32)> {
+    if let Some(stripped) = value.strip_suffix('Z') {
+        return Some((stripped, 0));
+    }
+    // Check for ±HH:MM at the end (exactly 6 chars: sign + 2 digits + ':' + 2 
digits)
+    if value.len() >= 6 {
+        let suffix_start = value.len() - 6;
+        let suffix = &value[suffix_start..];
+        let sign_byte = suffix.as_bytes()[0];
+        if (sign_byte == b'+' || sign_byte == b'-') && suffix.as_bytes()[3] == 
b':' {
+            if let (Ok(h), Ok(m)) = (suffix[1..3].parse::<i32>(), 
suffix[4..6].parse::<i32>()) {
+                let sign = if sign_byte == b'+' { 1i32 } else { -1i32 };
+                return Some((&value[..suffix_start], sign * (h * 3600 + m * 
60)));
+            }
+        }
+    }
+    None
+}
+
+type TimestampParsePattern<T> = (Regex, fn(&str, &T) -> 
SparkResult<Option<i64>>);
+
+fn timestamp_parser_with_tz<T: TimeZone>(
+    value: &str,
+    eval_mode: EvalMode,
+    tz: &T,
+) -> SparkResult<Option<i64>> {
+    // Define regex patterns and corresponding parsing functions.
+    // Both T-separator and space-separator date-time forms are supported.
+    // Negative years are handled by get_timestamp_values detecting a leading 
'-'.
+    let patterns: &[TimestampParsePattern<T>] = &[
+        // Year only: 4-7 digits, optionally negative
         (
-            Regex::new(r"^\d{4,5}$").unwrap(),
+            Regex::new(r"^-?\d{4,7}$").unwrap(),

Review Comment:
   Do these regex expressions get compiled for every invocation of 
`timestamp_parser_with_tz`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to