alamb commented on code in PR #6211:
URL: https://github.com/apache/arrow-rs/pull/6211#discussion_r1711807051


##########
arrow-cast/src/parse.rs:
##########
@@ -1369,51 +1396,30 @@ fn parse_interval_components(
     value: &str,
     config: &IntervalParseConfig,
 ) -> Result<Vec<(IntervalAmount, IntervalUnit)>, ArrowError> {
-    let parts = value.split_whitespace();
-
-    let raw_amounts = parts.clone().step_by(2);
-    let raw_units = parts.skip(1).step_by(2);
+    let raw_pairs = split_interval_components(value);
 
-    // parse amounts
-    let (amounts, invalid_amounts) = raw_amounts
-        .map(IntervalAmount::from_str)
-        .partition::<Vec<_>, _>(Result::is_ok);
-
-    // invalid amounts?
-    if !invalid_amounts.is_empty() {
-        return Err(ArrowError::ParseError(format!(
-            "Invalid input syntax for type interval: {value:?}"
-        )));
-    }
-
-    // parse units
-    let (units, invalid_units): (Vec<_>, Vec<_>) = raw_units
-        .clone()
-        .map(IntervalUnit::from_str)
-        .partition(Result::is_ok);
-
-    // invalid units?
-    if !invalid_units.is_empty() {
+    // parse amounts and units
+    let Ok(pairs): Result<Vec<(IntervalAmount, IntervalUnit)>, ArrowError> = 
raw_pairs
+        .iter()
+        .map(|(a, u)| Ok((a.parse()?, IntervalUnit::from_str_or_config(*u, 
config)?)))
+        .collect()
+    else {

Review Comment:
   this is certainly a nicer formulation



##########
arrow-cast/src/parse.rs:
##########
@@ -994,17 +994,24 @@ pub fn parse_interval_day_time(
     Ok(IntervalDayTimeType::make_value(days, millis))
 }
 
-pub fn parse_interval_month_day_nano(
+pub fn parse_interval_month_day_nano_config(

Review Comment:
   It appears that this function is part of the public API: 
https://docs.rs/arrow/latest/arrow/compute/kernels/cast_utils/fn.parse_interval_month_day_nano.html
   And therefore this change is an API change
   
   It seems to me that in an ideal world, DataFusion would simply use 
`Interval::parse` / `IntervalParseConfig` directly as it already supports the 
API required, however that would be a large new API surface area
   
   If we want to change `parse_interval_month_day_nano` I suggest:
   1. Make `IntervalParseConfig` pub
   2. Change the API signature to use `IntervalParseConfig` so we have a place 
to add future configuration options without breaking the API again (e.g. if 
someone wants the old behavior of `1` --> `1 second` rather than 
   
   ```rationale
   parse_interval_month_day_nano_config(
       value: &str,
       config: IntervalParseConfig,
   ```
   
   3. keep `pub fn parse_interval_month_day_nano` (possibly marking it 
deprecated) with the same behavor to help ease migration
   
   
   
   



##########
arrow-cast/src/parse.rs:
##########
@@ -1425,6 +1431,33 @@ fn parse_interval_components(
     Ok(result.collect::<Vec<_>>())
 }
 
+/// Split an interval into a vec of amounts and units.
+///
+/// Pairs are separated by spaces, but within a pair the amount and unit may 
or may not be separated by a space.
+///
+/// This should match the behavior of PostgreSQL's interval parser.
+fn split_interval_components(value: &str) -> Vec<(&str, Option<&str>)> {

Review Comment:
   I think you can make this faster by just returning the iterator directly 
rather than forming a vec
   
   However it seems needed to make the nice error message at the moment



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to