This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 9d03e410a3 Support old syntax for DataType parsing (#8541)
9d03e410a3 is described below
commit 9d03e410a3c52cc594d2541ec600dde6d26dfd9d
Author: Andrew Lamb <[email protected]>
AuthorDate: Fri Oct 3 14:03:45 2025 -0700
Support old syntax for DataType parsing (#8541)
# Which issue does this PR close?
- Closes https://github.com/apache/arrow-rs/issues/8539
# Rationale for this change
Systems like DataFusion use the string representation of DataType in
their public APIs but the type names have changed after
- https://github.com/apache/arrow-rs/pull/8425
We should retain backwards compatibility with the old type names too
# What changes are included in this PR?
1. Support old style `Timestamp(Nanosecond, None)` and
`Timestamp(Nanosecond, Some("Timezone"))` style timestamp specifiers
# Are these changes tested?
Yes, with new tests
# Are there any user-facing changes?
See above
---
arrow-schema/src/datatype_parse.rs | 216 ++++++++++++++++++++++++++++++++++++-
1 file changed, 211 insertions(+), 5 deletions(-)
diff --git a/arrow-schema/src/datatype_parse.rs
b/arrow-schema/src/datatype_parse.rs
index 60f92c2e2b..48b7089e8e 100644
--- a/arrow-schema/src/datatype_parse.rs
+++ b/arrow-schema/src/datatype_parse.rs
@@ -19,6 +19,9 @@ use std::{fmt::Display, iter::Peekable, str::Chars,
sync::Arc};
use crate::{ArrowError, DataType, Field, Fields, IntervalUnit, TimeUnit};
+/// Parses a DataType from a string representation
+///
+/// For example, the string "Int32" would be parsed into [`DataType::Int32`]
pub(crate) fn parse_data_type(val: &str) -> ArrowResult<DataType> {
Parser::new(val).parse()
}
@@ -36,8 +39,8 @@ fn make_error_expected(val: &str, expected: &Token, actual:
&Token) -> ArrowErro
make_error(val, &format!("Expected '{expected}', got '{actual}'"))
}
-#[derive(Debug)]
/// Implementation of `parse_data_type`, modeled after
<https://github.com/sqlparser-rs/sqlparser-rs>
+#[derive(Debug)]
struct Parser<'a> {
val: &'a str,
tokenizer: Peekable<Tokenizer<'a>>,
@@ -199,9 +202,31 @@ impl<'a> Parser<'a> {
let timezone;
match self.next_token()? {
Token::Comma => {
- timezone = Some(self.parse_double_quoted_string("Timezone")?);
+ match self.next_token()? {
+ // Support old style `Timestamp(Nanosecond, None)`
+ Token::None => {
+ timezone = None;
+ }
+ // Support old style `Timestamp(Nanosecond,
Some("Timezone"))`
+ Token::Some => {
+ self.expect_token(Token::LParen)?;
+ timezone =
Some(self.parse_double_quoted_string("Timezone")?);
+ self.expect_token(Token::RParen)?;
+ }
+ Token::DoubleQuotedString(tz) => {
+ // Support new style `Timestamp(Nanosecond,
"Timezone")`
+ timezone = Some(tz);
+ }
+ tok => {
+ return Err(make_error(
+ self.val,
+ &format!("Expected None, Some, or a timezone
string, got {tok:?}"),
+ ));
+ }
+ };
self.expect_token(Token::RParen)?;
}
+ // No timezone (e.g. `Timestamp(ns)`)
Token::RParen => {
timezone = None;
}
@@ -680,7 +705,7 @@ mod test {
}
}
- /// convert data_type to a string, and then parse it as a type
+ /// Round trip: convert data_type to a string, and then parse it as a type
/// verifying it is the same
fn round_trip(data_type: DataType) {
let data_type_string = data_type.to_string();
@@ -831,9 +856,190 @@ mod test {
];
for (data_type_string, expected_data_type) in cases {
- println!("Parsing '{data_type_string}', expecting
'{expected_data_type}'");
let parsed_data_type = parse_data_type(data_type_string).unwrap();
- assert_eq!(parsed_data_type, expected_data_type);
+ assert_eq!(
+ parsed_data_type, expected_data_type,
+ "Parsing '{data_type_string}', expecting
'{expected_data_type}'"
+ );
+ }
+ }
+
+ /// Ensure that old style types can still be parsed
+ #[test]
+ fn test_parse_data_type_backwards_compatibility() {
+ use DataType::*;
+ use IntervalUnit::*;
+ use TimeUnit::*;
+ // List below created with:
+ // for t in list_datatypes() {
+ // println!(r#"("{t}", {t:?}),"#)
+ // }
+ // (string to parse, expected DataType)
+ let cases = [
+ ("Timestamp(Nanosecond, None)", Timestamp(Nanosecond, None)),
+ ("Timestamp(Microsecond, None)", Timestamp(Microsecond, None)),
+ ("Timestamp(Millisecond, None)", Timestamp(Millisecond, None)),
+ ("Timestamp(Second, None)", Timestamp(Second, None)),
+ ("Timestamp(Nanosecond, None)", Timestamp(Nanosecond, None)),
+ // Timezones
+ (
+ r#"Timestamp(Nanosecond, Some("+00:00"))"#,
+ Timestamp(Nanosecond, Some("+00:00".into())),
+ ),
+ (
+ r#"Timestamp(Microsecond, Some("+00:00"))"#,
+ Timestamp(Microsecond, Some("+00:00".into())),
+ ),
+ (
+ r#"Timestamp(Millisecond, Some("+00:00"))"#,
+ Timestamp(Millisecond, Some("+00:00".into())),
+ ),
+ (
+ r#"Timestamp(Second, Some("+00:00"))"#,
+ Timestamp(Second, Some("+00:00".into())),
+ ),
+ ("Null", Null),
+ ("Boolean", Boolean),
+ ("Int8", Int8),
+ ("Int16", Int16),
+ ("Int32", Int32),
+ ("Int64", Int64),
+ ("UInt8", UInt8),
+ ("UInt16", UInt16),
+ ("UInt32", UInt32),
+ ("UInt64", UInt64),
+ ("Float16", Float16),
+ ("Float32", Float32),
+ ("Float64", Float64),
+ ("Timestamp(s)", Timestamp(Second, None)),
+ ("Timestamp(ms)", Timestamp(Millisecond, None)),
+ ("Timestamp(µs)", Timestamp(Microsecond, None)),
+ ("Timestamp(ns)", Timestamp(Nanosecond, None)),
+ (
+ r#"Timestamp(ns, "+00:00")"#,
+ Timestamp(Nanosecond, Some("+00:00".into())),
+ ),
+ (
+ r#"Timestamp(µs, "+00:00")"#,
+ Timestamp(Microsecond, Some("+00:00".into())),
+ ),
+ (
+ r#"Timestamp(ms, "+00:00")"#,
+ Timestamp(Millisecond, Some("+00:00".into())),
+ ),
+ (
+ r#"Timestamp(s, "+00:00")"#,
+ Timestamp(Second, Some("+00:00".into())),
+ ),
+ (
+ r#"Timestamp(ns, "+08:00")"#,
+ Timestamp(Nanosecond, Some("+08:00".into())),
+ ),
+ (
+ r#"Timestamp(µs, "+08:00")"#,
+ Timestamp(Microsecond, Some("+08:00".into())),
+ ),
+ (
+ r#"Timestamp(ms, "+08:00")"#,
+ Timestamp(Millisecond, Some("+08:00".into())),
+ ),
+ (
+ r#"Timestamp(s, "+08:00")"#,
+ Timestamp(Second, Some("+08:00".into())),
+ ),
+ ("Date32", Date32),
+ ("Date64", Date64),
+ ("Time32(s)", Time32(Second)),
+ ("Time32(ms)", Time32(Millisecond)),
+ ("Time32(µs)", Time32(Microsecond)),
+ ("Time32(ns)", Time32(Nanosecond)),
+ ("Time64(s)", Time64(Second)),
+ ("Time64(ms)", Time64(Millisecond)),
+ ("Time64(µs)", Time64(Microsecond)),
+ ("Time64(ns)", Time64(Nanosecond)),
+ ("Duration(s)", Duration(Second)),
+ ("Duration(ms)", Duration(Millisecond)),
+ ("Duration(µs)", Duration(Microsecond)),
+ ("Duration(ns)", Duration(Nanosecond)),
+ ("Interval(YearMonth)", Interval(YearMonth)),
+ ("Interval(DayTime)", Interval(DayTime)),
+ ("Interval(MonthDayNano)", Interval(MonthDayNano)),
+ ("Binary", Binary),
+ ("BinaryView", BinaryView),
+ ("FixedSizeBinary(0)", FixedSizeBinary(0)),
+ ("FixedSizeBinary(1234)", FixedSizeBinary(1234)),
+ ("FixedSizeBinary(-432)", FixedSizeBinary(-432)),
+ ("LargeBinary", LargeBinary),
+ ("Utf8", Utf8),
+ ("Utf8View", Utf8View),
+ ("LargeUtf8", LargeUtf8),
+ ("Decimal32(7, 8)", Decimal32(7, 8)),
+ ("Decimal64(6, 9)", Decimal64(6, 9)),
+ ("Decimal128(7, 12)", Decimal128(7, 12)),
+ ("Decimal256(6, 13)", Decimal256(6, 13)),
+ (
+ "Dictionary(Int32, Utf8)",
+ Dictionary(Box::new(Int32), Box::new(Utf8)),
+ ),
+ (
+ "Dictionary(Int8, Utf8)",
+ Dictionary(Box::new(Int8), Box::new(Utf8)),
+ ),
+ (
+ "Dictionary(Int8, Timestamp(ns))",
+ Dictionary(Box::new(Int8), Box::new(Timestamp(Nanosecond,
None))),
+ ),
+ (
+ "Dictionary(Int8, FixedSizeBinary(23))",
+ Dictionary(Box::new(Int8), Box::new(FixedSizeBinary(23))),
+ ),
+ (
+ "Dictionary(Int8, Dictionary(Int8, Utf8))",
+ Dictionary(
+ Box::new(Int8),
+ Box::new(Dictionary(Box::new(Int8), Box::new(Utf8))),
+ ),
+ ),
+ (
+ r#"Struct("f1": nullable Int64, "f2": nullable Float64, "f3":
nullable Timestamp(s, "+08:00"), "f4": nullable Dictionary(Int8,
FixedSizeBinary(23)))"#,
+ Struct(Fields::from(vec![
+ Field::new("f1", Int64, true),
+ Field::new("f2", Float64, true),
+ Field::new("f3", Timestamp(Second, Some("+08:00".into())),
true),
+ Field::new(
+ "f4",
+ Dictionary(Box::new(Int8),
Box::new(FixedSizeBinary(23))),
+ true,
+ ),
+ ])),
+ ),
+ (
+ r#"Struct("Int64": nullable Int64, "Float64": nullable
Float64)"#,
+ Struct(Fields::from(vec![
+ Field::new("Int64", Int64, true),
+ Field::new("Float64", Float64, true),
+ ])),
+ ),
+ (
+ r#"Struct("f1": nullable Int64, "nested_struct": nullable
Struct("n1": nullable Int64))"#,
+ Struct(Fields::from(vec![
+ Field::new("f1", Int64, true),
+ Field::new(
+ "nested_struct",
+ Struct(Fields::from(vec![Field::new("n1", Int64,
true)])),
+ true,
+ ),
+ ])),
+ ),
+ (r#"Struct()"#, Struct(Fields::empty())),
+ ];
+
+ for (data_type_string, expected_data_type) in cases {
+ let parsed_data_type = parse_data_type(data_type_string).unwrap();
+ assert_eq!(
+ parsed_data_type, expected_data_type,
+ "Parsing '{data_type_string}', expecting
'{expected_data_type}'"
+ );
}
}