[GitHub] [arrow-datafusion] alamb commented on a diff in pull request #5166: feat: add `arrow_cast` function to support supports arbitrary arrow types

via GitHub Fri, 03 Mar 2023 10:10:41 -0800


alamb commented on code in PR #5166:
URL: https://github.com/apache/arrow-datafusion/pull/5166#discussion_r1124838520



##########
datafusion/core/tests/sqllogictests/test_files/arrow_typeof.slt:
##########
@@ -52,31 +52,203 @@ SELECT arrow_typeof(1.0::float)
 Float32
 
 # arrow_typeof_decimal
-# query T
-# SELECT arrow_typeof(1::Decimal)
-# ----
-# Decimal128(38, 10)
-
-# # arrow_typeof_timestamp
-# query T
-# SELECT arrow_typeof(now()::timestamp)
-# ----
-# Timestamp(Nanosecond, None)
-
-# # arrow_typeof_timestamp_utc
-# query T
-# SELECT arrow_typeof(now())
-# ----
-# Timestamp(Nanosecond, Some(\"+00:00\"))
-
-# # arrow_typeof_timestamp_date32(
-# query T
-# SELECT arrow_typeof(now()::date)
-# ----
-# Date32
-
-# # arrow_typeof_utf8
-# query T
-# SELECT arrow_typeof('1')
-# ----
-# Utf8
+query T
+SELECT arrow_typeof(1::Decimal)
+----
+Decimal128(38, 10)
+
+# arrow_typeof_timestamp
+query T
+SELECT arrow_typeof(now()::timestamp)
+----
+Timestamp(Nanosecond, None)
+
+# arrow_typeof_timestamp_utc
+query T
+SELECT arrow_typeof(now())
+----
+Timestamp(Nanosecond, Some("+00:00"))
+
+# arrow_typeof_timestamp_date32(
+query T
+SELECT arrow_typeof(now()::date)
+----
+Date32
+
+# arrow_typeof_utf8
+query T
+SELECT arrow_typeof('1')
+----
+Utf8
+
+
+#### arrow_cast (in some ways opposite of arrow_typeof)
+
+
+query I
+SELECT arrow_cast('1', 'Int16')
+----
+1
+
+query error Error during planning: arrow_cast needs 2 arguments, 1 provided
+SELECT arrow_cast('1')
+
+query error Error during planning: arrow_cast requires its second argument to 
be a constant string, got Int64\(43\)
+SELECT arrow_cast('1', 43)
+
+query error Error unrecognized word: unknown
+SELECT arrow_cast('1', 'unknown')
+
+
+## Basic types
+
+statement ok
+create table foo as select
+  arrow_cast(1, 'Int8') as col_i8,
+  arrow_cast(1, 'Int16') as col_i16,
+  arrow_cast(1, 'Int32') as col_i32,
+  arrow_cast(1, 'Int64') as col_i64,
+  arrow_cast(1, 'UInt8') as col_u8,
+  arrow_cast(1, 'UInt16') as col_u16,
+  arrow_cast(1, 'UInt32') as col_u32,
+  arrow_cast(1, 'UInt64') as col_u64,
+  -- can't seem to cast to Float16 for some reason
+  arrow_cast(1.0, 'Float32') as col_f32,
+  arrow_cast(1.0, 'Float64') as col_f64
+;
+
+
+query TTTTTTTTTT
+SELECT
+  arrow_typeof(col_i8),
+  arrow_typeof(col_i16),
+  arrow_typeof(col_i32),
+  arrow_typeof(col_i64),
+  arrow_typeof(col_u8),
+  arrow_typeof(col_u16),
+  arrow_typeof(col_u32),
+  arrow_typeof(col_u64),
+  arrow_typeof(col_f32),
+  arrow_typeof(col_f64)
+  FROM foo;
+----
+Int8 Int16 Int32 Int64 UInt8 UInt16 UInt32 UInt64 Float32 Float64
+
+
+
+statement ok
+drop table foo
+
+## Decimals
+
+statement ok
+create table foo as select
+  arrow_cast(100, 'Decimal128(3,2)') as col_d128
+  -- Can't make a decimal 156:
+  -- This feature is not implemented: Can't create a scalar from array of type 
"Decimal256(3, 2)"
+  --arrow_cast(100, 'Decimal256(3,2)') as col_d256
+;
+
+
+query T
+SELECT
+  arrow_typeof(col_d128)
+  -- arrow_typeof(col_d256),
+  FROM foo;
+----
+Decimal128(3, 2)
+
+
+statement ok
+drop table foo
+
+## strings, large strings
+
+statement ok
+create table foo as select
+  arrow_cast('foo', 'Utf8') as col_utf8,
+  arrow_cast('foo', 'LargeUtf8') as col_large_utf8,
+  arrow_cast('foo', 'Binary') as col_binary,
+  arrow_cast('foo', 'LargeBinary') as col_large_binary
+;
+
+
+query TTTT
+SELECT
+  arrow_typeof(col_utf8),
+  arrow_typeof(col_large_utf8),
+  arrow_typeof(col_binary),
+  arrow_typeof(col_large_binary)
+  FROM foo;
+----
+Utf8 LargeUtf8 Binary LargeBinary
+
+
+statement ok
+drop table foo
+
+
+## timestamps
+
+statement ok
+create table foo as select
+  arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 
'Timestamp(Second, None)') as col_ts_s,
+  arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 
'Timestamp(Millisecond, None)') as col_ts_ms,
+  arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 
'Timestamp(Microsecond, None)') as col_ts_us,
+  arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 
'Timestamp(Nanosecond, None)') as col_ts_ns
+;
+
+
+query TTTT
+SELECT
+  arrow_typeof(col_ts_s),
+  arrow_typeof(col_ts_ms),
+  arrow_typeof(col_ts_us),
+  arrow_typeof(col_ts_ns)
+  FROM foo;
+----
+Timestamp(Second, None) Timestamp(Millisecond, None) Timestamp(Microsecond, 
None) Timestamp(Nanosecond, None)
+
+
+statement ok
+drop table foo
+
+
+## durations
+
+statement ok
+create table foo as select
+  arrow_cast(to_timestamp('2020-01-02 01:01:11.1234567890Z'), 
'Timestamp(Second, None)') as col_ts_s,

Review Comment:
   This is kind of cool -- it shows how to convert to arbitrary Arrow timestamp 
types.
   
   I wonder if we should deprecate / remove `to_timestamp_millis`, 
`to_timstamp_nanos`, ... 🤔  Any thoughts @waitingkuo ?



##########
docs/source/user-guide/sql/data_types.md:
##########
@@ -100,3 +112,43 @@ the `arrow_typeof` function. For example:
 | `ENUM`        | _Not yet supported_ |
 | `SET`         | _Not yet supported_ |
 | `DATETIME`    | _Not yet supported_ |
+
+## Supported Arrow Types
+
+The following types are supported by the `arrow_typeof` function:

Review Comment:
   I am pretty stoked about the ability to cast to types that have no direct 
SQL mapping (for IOx it is dictionary types) but the different timestamp and 
durations are also quite relevant I think



##########
datafusion/sql/src/expr/arrow_cast.rs:
##########
@@ -0,0 +1,673 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Implementation of the `arrow_cast` function that allows
+//! casting to arbitrary arrow types (rather than SQL types)
+
+use std::{fmt::Display, iter::Peekable, str::Chars};
+
+use arrow_schema::{DataType, IntervalUnit, TimeUnit};
+use datafusion_common::{DFSchema, DataFusionError, Result, ScalarValue};
+
+use datafusion_expr::{Expr, ExprSchemable};
+
+pub const ARROW_CAST_NAME: &str = "arrow_cast";
+
+/// Create an [`Expr`] that evaluates the `arrow_cast` function
+///
+/// This function is not a [`BuiltInScalarFunction`] because the
+/// return type of [`BuiltInScalarFunction`] depends only on the
+/// *types* of the arguments. However, the type of `arrow_type` depends on
+/// the *value* of its second argument.
+///
+/// Use the `cast` function to cast to SQL type (which is then mapped
+/// to the corresponding arrow type). For example to cast to `int`
+/// (which is then mapped to the arrow type `Int32`)
+///
+/// ```sql
+/// select cast(column_x as int) ...
+/// ```
+///
+/// Use the `arrow_cast` functiont to cast to a specfic arrow type
+///
+/// For example
+/// ```sql
+/// select arrow_cast(column_x, 'Float64')
+/// ```
+pub fn create_arrow_cast(mut args: Vec<Expr>, schema: &DFSchema) -> 
Result<Expr> {
+    if args.len() != 2 {
+        return Err(DataFusionError::Plan(format!(
+            "arrow_cast needs 2 arguments, {} provided",
+            args.len()
+        )));
+    }
+    let arg1 = args.pop().unwrap();
+    let arg0 = args.pop().unwrap();
+
+    // arg1 must be a stirng
+    let data_type_string = if let Expr::Literal(ScalarValue::Utf8(Some(v))) = 
arg1 {
+        v
+    } else {
+        return Err(DataFusionError::Plan(format!(
+            "arrow_cast requires its second argument to be a constant string, 
got {arg1}"
+        )));
+    };
+
+    // do the actual lookup to the appropriate data type
+    let data_type = parse_data_type(&data_type_string)?;
+
+    arg0.cast_to(&data_type, schema)
+}
+
+/// Parses `str` into a `DataType`.

Review Comment:
   Here is the main entrypoint. This uses a Parser very much like the one in 
https://github.com/sqlparser-rs/sqlparser-rs because I was familiar with that 
structure and I figured that other contributors to DataFusion might be too
   
    I am not sure what reviewers think of proposing putting this in the 
arrow-rs crate -- it feels like a better fit there to me



##########
datafusion/sql/src/expr/arrow_cast.rs:
##########
@@ -0,0 +1,673 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Implementation of the `arrow_cast` function that allows
+//! casting to arbitrary arrow types (rather than SQL types)
+
+use std::{fmt::Display, iter::Peekable, str::Chars};
+
+use arrow_schema::{DataType, IntervalUnit, TimeUnit};
+use datafusion_common::{DFSchema, DataFusionError, Result, ScalarValue};
+
+use datafusion_expr::{Expr, ExprSchemable};
+
+pub const ARROW_CAST_NAME: &str = "arrow_cast";
+
+/// Create an [`Expr`] that evaluates the `arrow_cast` function
+///
+/// This function is not a [`BuiltInScalarFunction`] because the
+/// return type of [`BuiltInScalarFunction`] depends only on the
+/// *types* of the arguments. However, the type of `arrow_type` depends on
+/// the *value* of its second argument.
+///
+/// Use the `cast` function to cast to SQL type (which is then mapped
+/// to the corresponding arrow type). For example to cast to `int`
+/// (which is then mapped to the arrow type `Int32`)
+///
+/// ```sql
+/// select cast(column_x as int) ...
+/// ```
+///
+/// Use the `arrow_cast` functiont to cast to a specfic arrow type
+///
+/// For example
+/// ```sql
+/// select arrow_cast(column_x, 'Float64')
+/// ```
+pub fn create_arrow_cast(mut args: Vec<Expr>, schema: &DFSchema) -> 
Result<Expr> {
+    if args.len() != 2 {
+        return Err(DataFusionError::Plan(format!(
+            "arrow_cast needs 2 arguments, {} provided",
+            args.len()
+        )));
+    }
+    let arg1 = args.pop().unwrap();
+    let arg0 = args.pop().unwrap();
+
+    // arg1 must be a stirng
+    let data_type_string = if let Expr::Literal(ScalarValue::Utf8(Some(v))) = 
arg1 {
+        v
+    } else {
+        return Err(DataFusionError::Plan(format!(
+            "arrow_cast requires its second argument to be a constant string, 
got {arg1}"
+        )));
+    };
+
+    // do the actual lookup to the appropriate data type
+    let data_type = parse_data_type(&data_type_string)?;
+
+    arg0.cast_to(&data_type, schema)
+}
+
+/// Parses `str` into a `DataType`.
+///
+/// `parse_data_type` is the the reverse of [`DataType`]'s `Display`
+/// impl, and maintains the invariant that
+/// `parse_data_type(data_type.to_string()) == data_type`
+///
+/// Example:
+/// ```
+/// # use datafusion_sql::parse_data_type;
+/// # use arrow_schema::DataType;
+/// let display_value = "Int32";
+///
+/// // "Int32" is the Display value of `DataType`
+/// assert_eq!(display_value, &format!("{}", DataType::Int32));
+///
+/// // parse_data_type coverts "Int32" back to `DataType`:
+/// let data_type = parse_data_type(display_value).unwrap();
+/// assert_eq!(data_type, DataType::Int32);
+/// ```
+///
+/// TODO file a ticket about bringing this into arrow possibly
+pub fn parse_data_type(val: &str) -> Result<DataType> {
+    Parser::new(val).parse()
+}
+
+fn make_error(val: &str, msg: &str) -> DataFusionError {
+    DataFusionError::Plan(
+        format!("Unsupported type '{val}'. Must be a supported arrow type name 
such as 'Int32' or 'Timestamp(Nanoseconds, None)'. Error {msg}" )
+    )
+}
+
+fn make_error_expected(val: &str, expected: &Token, actual: &Token) -> 
DataFusionError {
+    make_error(val, &format!("Expected '{expected}', got '{actual}'"))
+}
+
+#[derive(Debug)]
+/// Implementation of `parse_data_type`, modeled after 
<https://github.com/sqlparser-rs/sqlparser-rs>
+struct Parser<'a> {
+    val: &'a str,
+    tokenizer: Tokenizer<'a>,
+}
+
+impl<'a> Parser<'a> {
+    fn new(val: &'a str) -> Self {
+        Self {
+            val,
+            tokenizer: Tokenizer::new(val),
+        }
+    }
+
+    fn parse(mut self) -> Result<DataType> {
+        let data_type = self.parse_next_type()?;
+        // ensure that there is no trailing content
+        if self.tokenizer.peek_next_char().is_some() {
+            return Err(make_error(
+                self.val,
+                &format!("checking trailing content after parsing 
'{data_type}'"),
+            ));
+        } else {
+            Ok(data_type)
+        }
+    }
+
+    /// parses the next full DataType
+    fn parse_next_type(&mut self) -> Result<DataType> {
+        match self.next_token()? {
+            Token::SimpleType(data_type) => Ok(data_type),
+            Token::Timestamp => self.parse_timestamp(),
+            Token::Time32 => self.parse_time32(),
+            Token::Time64 => self.parse_time64(),
+            Token::Duration => self.parse_duration(),
+            Token::Interval => self.parse_interval(),
+            Token::FixedSizeBinary => self.parse_fixed_size_binary(),
+            Token::Decimal128 => self.parse_decimal_128(),
+            Token::Decimal256 => self.parse_decimal_256(),
+            Token::Dictionary => self.parse_dictionary(),
+            tok => Err(make_error(
+                self.val,
+                &format!("finding next type, got unexpected '{tok}'"),
+            )),
+        }
+    }
+
+    /// Parses the next timeunit
+    fn parse_time_unit(&mut self, context: &str) -> Result<TimeUnit> {
+        match self.next_token()? {
+            Token::TimeUnit(time_unit) => Ok(time_unit),
+            tok => Err(make_error(
+                self.val,
+                &format!("finding TimeUnit for {context}, got {tok}"),
+            )),
+        }
+    }
+
+    /// Parses the next integer value
+    fn parse_i64(&mut self, context: &str) -> Result<i64> {
+        match self.next_token()? {
+            Token::Integer(v) => Ok(v),
+            tok => Err(make_error(
+                self.val,
+                &format!("finding i64 for {context}, got '{tok}'"),
+            )),
+        }
+    }
+
+    /// Parses the next i32 integer value
+    fn parse_i32(&mut self, context: &str) -> Result<i32> {
+        let length = self.parse_i64(context)?;
+        length.try_into().map_err(|e| {
+            make_error(
+                self.val,
+                &format!("converting {length} into i32 for {context}: {e}"),
+            )
+        })
+    }
+
+    /// Parses the next i8 integer value
+    fn parse_i8(&mut self, context: &str) -> Result<i8> {
+        let length = self.parse_i64(context)?;
+        length.try_into().map_err(|e| {
+            make_error(
+                self.val,
+                &format!("converting {length} into i8 for {context}: {e}"),
+            )
+        })
+    }
+
+    /// Parses the next u8 integer value
+    fn parse_u8(&mut self, context: &str) -> Result<u8> {
+        let length = self.parse_i64(context)?;
+        length.try_into().map_err(|e| {
+            make_error(
+                self.val,
+                &format!("converting {length} into u8 for {context}: {e}"),
+            )
+        })
+    }
+
+    /// Parses the next timestamp (called after `Timestamp` has been consumed)
+    fn parse_timestamp(&mut self) -> Result<DataType> {
+        self.expect_token(Token::LParen)?;
+        let time_unit = self.parse_time_unit("Timestamp")?;
+        self.expect_token(Token::Comma)?;
+        // TODO Support timezones other than Non
+        self.expect_token(Token::None)?;
+        let timezone = None;
+
+        self.expect_token(Token::RParen)?;
+        Ok(DataType::Timestamp(time_unit, timezone))
+    }
+
+    /// Parses the next Time32 (called after `Time32` has been consumed)
+    fn parse_time32(&mut self) -> Result<DataType> {
+        self.expect_token(Token::LParen)?;
+        let time_unit = self.parse_time_unit("Time32")?;
+        self.expect_token(Token::RParen)?;
+        Ok(DataType::Time32(time_unit))
+    }
+
+    /// Parses the next Time64 (called after `Time64` has been consumed)
+    fn parse_time64(&mut self) -> Result<DataType> {
+        self.expect_token(Token::LParen)?;
+        let time_unit = self.parse_time_unit("Time64")?;
+        self.expect_token(Token::RParen)?;
+        Ok(DataType::Time64(time_unit))
+    }
+
+    /// Parses the next Duration (called after `Duration` has been consumed)
+    fn parse_duration(&mut self) -> Result<DataType> {
+        self.expect_token(Token::LParen)?;
+        let time_unit = self.parse_time_unit("Duration")?;
+        self.expect_token(Token::RParen)?;
+        Ok(DataType::Duration(time_unit))
+    }
+
+    /// Parses the next Interval (called after `Interval` has been consumed)
+    fn parse_interval(&mut self) -> Result<DataType> {
+        self.expect_token(Token::LParen)?;
+        let interval_unit = match self.next_token()? {
+            Token::IntervalUnit(interval_unit) => interval_unit,
+            tok => {
+                return Err(make_error(
+                    self.val,
+                    &format!("finding IntervalUnit for Interval, got {tok}"),
+                ))
+            }
+        };
+        self.expect_token(Token::RParen)?;
+        Ok(DataType::Interval(interval_unit))
+    }
+
+    /// Parses the next FixedSizeBinary (called after `FixedSizeBinary` has 
been consumed)
+    fn parse_fixed_size_binary(&mut self) -> Result<DataType> {
+        self.expect_token(Token::LParen)?;
+        let length = self.parse_i32("FixedSizeBinary")?;
+        self.expect_token(Token::RParen)?;
+        Ok(DataType::FixedSizeBinary(length))
+    }
+
+    /// Parses the next Decimal128 (called after `Decimal128` has been 
consumed)
+    fn parse_decimal_128(&mut self) -> Result<DataType> {
+        self.expect_token(Token::LParen)?;
+        let precision = self.parse_u8("Decimal128")?;
+        self.expect_token(Token::Comma)?;
+        let scale = self.parse_i8("Decimal128")?;
+        self.expect_token(Token::RParen)?;
+        Ok(DataType::Decimal128(precision, scale))
+    }
+
+    /// Parses the next Decimal256 (called after `Decimal256` has been 
consumed)
+    fn parse_decimal_256(&mut self) -> Result<DataType> {
+        self.expect_token(Token::LParen)?;
+        let precision = self.parse_u8("Decimal256")?;
+        self.expect_token(Token::Comma)?;
+        let scale = self.parse_i8("Decimal256")?;
+        self.expect_token(Token::RParen)?;
+        Ok(DataType::Decimal256(precision, scale))
+    }
+
+    /// Parses the next Dictionary (called after `Dictionary` has been 
consumed)
+    fn parse_dictionary(&mut self) -> Result<DataType> {
+        self.expect_token(Token::LParen)?;
+        let key_type = self.parse_next_type()?;
+        self.expect_token(Token::Comma)?;
+        let value_type = self.parse_next_type()?;
+        self.expect_token(Token::RParen)?;
+        Ok(DataType::Dictionary(
+            Box::new(key_type),
+            Box::new(value_type),
+        ))
+    }
+
+    /// return the next token, or an error if there are none left
+    fn next_token(&mut self) -> Result<Token> {
+        match self.tokenizer.next() {
+            None => Err(make_error(self.val, "finding next token")),
+            Some(token) => token,
+        }
+    }
+
+    /// consume the next token, returning OK(()) if it matches tok, and Err if 
not
+    fn expect_token(&mut self, tok: Token) -> Result<()> {
+        let next_token = self.next_token()?;
+        if next_token == tok {
+            Ok(())
+        } else {
+            Err(make_error_expected(self.val, &tok, &next_token))
+        }
+    }
+}
+
+/// returns true if this character is a separator
+fn is_separator(c: char) -> bool {
+    c == '(' || c == ')' || c == ',' || c == ' '
+}
+
+#[derive(Debug)]
+/// Splits a strings like Dictionary(Int32, Int64) into tokens sutable for 
parsing
+///
+/// For example the string "Timestamp(Nanosecond, None)" would be parsed into:
+///
+/// * Token::Timestamp
+/// * Token::Lparen
+/// * Token::IntervalUnit(IntervalUnit::Nanoseconds)
+/// * Token::Comma,
+/// * Token::None,
+/// * Token::Rparen,
+struct Tokenizer<'a> {
+    val: &'a str,
+    chars: Peekable<Chars<'a>>,
+}
+
+impl<'a> Tokenizer<'a> {
+    fn new(val: &'a str) -> Self {
+        Self {
+            val,
+            chars: val.chars().peekable(),
+        }
+    }
+
+    /// returns the next char, without consuming it
+    fn peek_next_char(&mut self) -> Option<char> {
+        self.chars.peek().copied()
+    }
+
+    /// returns the next char, and consuming it
+    fn next_char(&mut self) -> Option<char> {
+        self.chars.next()
+    }
+
+    /// parse the characters in val starting at pos, until the next
+    /// `,`, `(`, or `)` or end of line
+    fn parse_word(&mut self) -> Result<Token> {
+        let mut word = String::new();
+        loop {
+            match self.peek_next_char() {
+                None => break,
+                Some(c) if is_separator(c) => break,
+                Some(c) => {
+                    self.next_char();
+                    word.push(c);
+                }
+            }
+        }
+
+        // if it started with a number, try parsing it as an integer
+        if let Some(c) = word.chars().next() {
+            if c == '-' || c.is_numeric() {
+                let val: i64 = word.parse().map_err(|e| {
+                    make_error(self.val, &format!("parsing {word} as integer: 
{e}"))
+                })?;
+                return Ok(Token::Integer(val));
+            }
+        }
+
+        // figure out what the word was
+        let token = match word.as_str() {
+            "Null" => Token::SimpleType(DataType::Null),
+            "Boolean" => Token::SimpleType(DataType::Boolean),
+
+            "Int8" => Token::SimpleType(DataType::Int8),
+            "Int16" => Token::SimpleType(DataType::Int16),
+            "Int32" => Token::SimpleType(DataType::Int32),
+            "Int64" => Token::SimpleType(DataType::Int64),
+
+            "UInt8" => Token::SimpleType(DataType::UInt8),
+            "UInt16" => Token::SimpleType(DataType::UInt16),
+            "UInt32" => Token::SimpleType(DataType::UInt32),
+            "UInt64" => Token::SimpleType(DataType::UInt64),
+
+            "Utf8" => Token::SimpleType(DataType::Utf8),
+            "LargeUtf8" => Token::SimpleType(DataType::LargeUtf8),
+            "Binary" => Token::SimpleType(DataType::Binary),
+            "LargeBinary" => Token::SimpleType(DataType::LargeBinary),
+
+            "Float16" => Token::SimpleType(DataType::Float16),
+            "Float32" => Token::SimpleType(DataType::Float32),
+            "Float64" => Token::SimpleType(DataType::Float64),
+
+            "Date32" => Token::SimpleType(DataType::Date32),
+            "Date64" => Token::SimpleType(DataType::Date64),
+
+            "Second" => Token::TimeUnit(TimeUnit::Second),
+            "Millisecond" => Token::TimeUnit(TimeUnit::Millisecond),
+            "Microsecond" => Token::TimeUnit(TimeUnit::Microsecond),
+            "Nanosecond" => Token::TimeUnit(TimeUnit::Nanosecond),
+
+            "Timestamp" => Token::Timestamp,
+            "Time32" => Token::Time32,
+            "Time64" => Token::Time64,
+            "Duration" => Token::Duration,
+            "Interval" => Token::Interval,
+            "Dictionary" => Token::Dictionary,
+
+            "FixedSizeBinary" => Token::FixedSizeBinary,
+            "Decimal128" => Token::Decimal128,
+            "Decimal256" => Token::Decimal256,
+
+            "YearMonth" => Token::IntervalUnit(IntervalUnit::YearMonth),
+            "DayTime" => Token::IntervalUnit(IntervalUnit::DayTime),
+            "MonthDayNano" => Token::IntervalUnit(IntervalUnit::MonthDayNano),
+
+            "None" => Token::None,
+
+            _ => return Err(make_error(self.val, &format!("unrecognized word: 
{word}"))),
+        };
+        Ok(token)
+    }
+}
+
+impl<'a> Iterator for Tokenizer<'a> {
+    type Item = Result<Token>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            match self.peek_next_char()? {
+                ' ' => {
+                    // skip whitespace
+                    self.next_char();
+                    continue;
+                }
+                '(' => {
+                    self.next_char();
+                    return Some(Ok(Token::LParen));
+                }
+                ')' => {
+                    self.next_char();
+                    return Some(Ok(Token::RParen));
+                }
+                ',' => {
+                    self.next_char();
+                    return Some(Ok(Token::Comma));
+                }
+                _ => return Some(self.parse_word()),
+            }
+        }
+    }
+}
+
+/// Grammar is
+///
+#[derive(Debug, PartialEq)]
+enum Token {
+    // Null, or Int32
+    SimpleType(DataType),
+    Timestamp,
+    Time32,
+    Time64,
+    Duration,
+    Interval,
+    FixedSizeBinary,
+    Decimal128,
+    Decimal256,
+    Dictionary,
+    TimeUnit(TimeUnit),
+    IntervalUnit(IntervalUnit),
+    LParen,
+    RParen,
+    Comma,
+    None,
+    Integer(i64),
+}
+
+impl Display for Token {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Token::SimpleType(t) => write!(f, "{t}"),
+            Token::Timestamp => write!(f, "Timestamp"),
+            Token::Time32 => write!(f, "Time32"),
+            Token::Time64 => write!(f, "Time64"),
+            Token::Duration => write!(f, "Duration"),
+            Token::Interval => write!(f, "Interval"),
+            Token::TimeUnit(u) => write!(f, "TimeUnit({u:?})"),
+            Token::IntervalUnit(u) => write!(f, "IntervalUnit({u:?})"),
+            Token::LParen => write!(f, "("),
+            Token::RParen => write!(f, ")"),
+            Token::Comma => write!(f, ","),
+            Token::None => write!(f, "None"),
+            Token::FixedSizeBinary => write!(f, "FixedSizeBinary"),
+            Token::Decimal128 => write!(f, "Decimal128"),
+            Token::Decimal256 => write!(f, "Decimal256"),
+            Token::Dictionary => write!(f, "Dictionary"),
+            Token::Integer(v) => write!(f, "Integer({v})"),
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use arrow_schema::{IntervalUnit, TimeUnit};
+
+    use super::*;
+
+    #[test]
+    fn test_parse_data_type() {
+        // this ensures types can be parsed correctly from their string 
representations
+        for dt in list_datatypes() {
+            round_trip(dt)
+        }
+    }
+
+    /// convert data_type to a string, and then parse it as a type
+    /// verifying it is the same
+    fn round_trip(data_type: DataType) {
+        let data_type_string = data_type.to_string();
+        println!("Input '{data_type_string}' ({data_type:?})");
+        let parsed_type = parse_data_type(&data_type_string).unwrap();
+        assert_eq!(
+            data_type, parsed_type,
+            "Mismatch parsing {data_type_string}"
+        );
+    }
+
+    fn list_datatypes() -> Vec<DataType> {
+        vec![
+            // ---------
+            // Non Nested types
+            // ---------
+            DataType::Null,
+            DataType::Boolean,
+            DataType::Int8,
+            DataType::Int16,
+            DataType::Int32,
+            DataType::Int64,
+            DataType::UInt8,
+            DataType::UInt16,
+            DataType::UInt32,
+            DataType::UInt64,
+            DataType::Float16,
+            DataType::Float32,
+            DataType::Float64,
+            DataType::Timestamp(TimeUnit::Second, None),
+            DataType::Timestamp(TimeUnit::Millisecond, None),
+            DataType::Timestamp(TimeUnit::Microsecond, None),
+            DataType::Timestamp(TimeUnit::Nanosecond, None),
+            // TODO support timezones
+            //DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())),
+            DataType::Date32,
+            DataType::Date64,
+            DataType::Time32(TimeUnit::Second),
+            DataType::Time32(TimeUnit::Millisecond),
+            DataType::Time32(TimeUnit::Microsecond),
+            DataType::Time32(TimeUnit::Nanosecond),
+            DataType::Time64(TimeUnit::Second),
+            DataType::Time64(TimeUnit::Millisecond),
+            DataType::Time64(TimeUnit::Microsecond),
+            DataType::Time64(TimeUnit::Nanosecond),
+            DataType::Duration(TimeUnit::Second),
+            DataType::Duration(TimeUnit::Millisecond),
+            DataType::Duration(TimeUnit::Microsecond),
+            DataType::Duration(TimeUnit::Nanosecond),
+            DataType::Interval(IntervalUnit::YearMonth),
+            DataType::Interval(IntervalUnit::DayTime),
+            DataType::Interval(IntervalUnit::MonthDayNano),
+            DataType::Binary,
+            DataType::FixedSizeBinary(0),
+            DataType::FixedSizeBinary(1234),
+            DataType::FixedSizeBinary(-432),
+            DataType::LargeBinary,
+            DataType::Utf8,
+            DataType::LargeUtf8,
+            DataType::Decimal128(7, 12),
+            DataType::Decimal256(6, 13),
+            // ---------
+            // Nested types
+            // ---------
+            DataType::Dictionary(Box::new(DataType::Int32), 
Box::new(DataType::Utf8)),
+            DataType::Dictionary(Box::new(DataType::Int8), 
Box::new(DataType::Utf8)),
+            DataType::Dictionary(
+                Box::new(DataType::Int8),
+                Box::new(DataType::FixedSizeBinary(23)),
+            ),
+            DataType::Dictionary(
+                Box::new(DataType::Int8),
+                Box::new(
+                    // nested dictionaries are probably a bad idea but they 
are possible
+                    DataType::Dictionary(
+                        Box::new(DataType::Int8),
+                        Box::new(DataType::Utf8),
+                    ),
+                ),
+            ),
+            // TODO support more structured types (List, LargeList, Struct, 
Union, Map, RunEndEncoded, etc)

Review Comment:
   List, Struct, etc is not important for my use cases (yet) so I didn't add 
support for the yet, but I think all the necessary patterns are present. 
Someone who cares just needs to implement them



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

[GitHub] [arrow-datafusion] alamb commented on a diff in pull request #5166: feat: add `arrow_cast` function to support supports arbitrary arrow types

Reply via email to