nevi-me commented on a change in pull request #9705:
URL: https://github.com/apache/arrow/pull/9705#discussion_r600400608



##########
File path: rust/parquet/src/schema/parser.rs
##########
@@ -503,6 +675,129 @@ mod tests {
         assert!(result.is_ok());
     }
 
+    #[test]
+    fn test_parse_message_type_integer() {
+        // Invalid integer syntax
+        let schema = "
+    message root {
+      optional int64 f1 (INTEGER());
+    }
+    ";
+        let mut iter = Tokenizer::from_str(schema);
+        let result = Parser {
+            tokenizer: &mut iter,
+        }
+        .parse_message_type();
+        assert!(result.is_err());
+
+        // Invalid integer syntax, needs both bit-width and UTC sign
+        let schema = "
+    message root {
+      optional int64 f1 (INTEGER(32,));
+    }
+    ";
+        let mut iter = Tokenizer::from_str(schema);
+        let result = Parser {
+            tokenizer: &mut iter,
+        }
+        .parse_message_type();
+        assert!(result.is_err());
+
+        // Invalid integer because of non-numeric bit width
+        let schema = "
+    message root {
+      optional int32 f1 (INTEGER(eight,true));
+    }
+    ";
+        let mut iter = Tokenizer::from_str(schema);
+        let result = Parser {
+            tokenizer: &mut iter,
+        }
+        .parse_message_type();
+        assert!(result.is_err());
+
+        // Valid types
+        let schema = "
+    message root {
+      optional int32 f1 (INTEGER(8,false));
+      optional int32 f2 (INTEGER(8,true));
+      optional int32 f3 (INTEGER(16,false));
+      optional int32 f4 (INTEGER(16,true));
+      optional int32 f5 (INTEGER(32,false));
+      optional int32 f6 (INTEGER(32,true));
+      optional int64 f7 (INTEGER(64,false));
+      optional int64 f7 (INTEGER(64,true));
+    }
+    ";
+        let mut iter = Tokenizer::from_str(schema);
+        let result = Parser {
+            tokenizer: &mut iter,
+        }
+        .parse_message_type();
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_parse_message_type_temporal() {
+        // Invalid timestamp syntax
+        let schema = "
+    message root {
+      optional int64 f1 (TIMESTAMP();
+    }
+    ";
+        let mut iter = Tokenizer::from_str(schema);
+        let result = Parser {
+            tokenizer: &mut iter,
+        }
+        .parse_message_type();
+        assert!(result.is_err());
+
+        // Invalid timestamp syntax, needs both unit and UTC adjustment
+        let schema = "
+    message root {
+      optional int64 f1 (TIMESTAMP(MILLIS,));
+    }
+    ";
+        let mut iter = Tokenizer::from_str(schema);
+        let result = Parser {
+            tokenizer: &mut iter,
+        }
+        .parse_message_type();
+        assert!(result.is_err());
+
+        // Invalid timestamp because of unknown unit
+        let schema = "
+    message root {
+      optional int64 f1 (TIMESTAMP(YOCTOS,));
+    }
+    ";
+        let mut iter = Tokenizer::from_str(schema);
+        let result = Parser {
+            tokenizer: &mut iter,
+        }
+        .parse_message_type();
+        assert!(result.is_err());
+
+        // Valid types
+        let schema = "
+    message root {
+      optional int32 f1 (DATE);
+      optional int32 f2 (TIME(MILLIS,true));
+      optional int64 f3 (TIME(MICROS,false));
+      optional int64 f4 (TIME(NANOS,true));
+      optional int64 f5 (TIMESTAMP(MILLIS,true));
+      optional int64 f6 (TIMESTAMP(MICROS,true));
+      optional int64 f7 (TIMESTAMP(NANOS,false));
+    }
+    ";
+        let mut iter = Tokenizer::from_str(schema);
+        let result = Parser {
+            tokenizer: &mut iter,
+        }
+        .parse_message_type();
+        assert!(result.is_ok());

Review comment:
       Also covered this on other tests

##########
File path: rust/parquet/src/schema/parser.rs
##########
@@ -281,19 +329,142 @@ impl<'a> Parser<'a> {
             .ok_or_else(|| general_err!("Expected name, found None"))?;
 
         // Parse converted type
-        let (converted_type, precision, scale) = if let Some("(") = 
self.tokenizer.next()
+        let (logical_type, converted_type, precision, scale) = if let 
Some("(") =
+            self.tokenizer.next()
         {
-            let tpe = self
+            let (mut logical, mut converted) = self
                 .tokenizer
                 .next()
-                .ok_or_else(|| general_err!("Expected converted type, found 
None"))
-                .and_then(|v| v.to_uppercase().parse::<ConvertedType>())?;
+                .ok_or_else(|| {
+                    general_err!("Expected logical or converted type, found 
None")
+                })
+                .and_then(|v| {
+                    let upper = v.to_uppercase();
+                    let logical = upper.parse::<LogicalType>();
+                    match logical {
+                        Ok(logical) => Ok((
+                            Some(logical.clone()),
+                            ConvertedType::from(Some(logical)),
+                        )),
+                        Err(_) => Ok((None, upper.parse::<ConvertedType>()?)),
+                    }
+                })?;
 
             // Parse precision and scale for decimals
             let mut precision: i32 = -1;
             let mut scale: i32 = -1;
 
-            if tpe == ConvertedType::DECIMAL {
+            // Parse the concrete logical type
+            if let Some(tpe) = &logical {
+                match tpe {
+                    LogicalType::DECIMAL(_) => {
+                        if let Some("(") = self.tokenizer.next() {
+                            precision = parse_i32(
+                                self.tokenizer.next(),
+                                "Expected precision, found None",
+                                "Failed to parse precision for DECIMAL type",
+                            )?;
+                            if let Some(",") = self.tokenizer.next() {
+                                scale = parse_i32(
+                                    self.tokenizer.next(),
+                                    "Invalid boolean found",
+                                    "Failure to parse timezone info for TIME 
type",
+                                )?; // TODO: this might not cater for the case 
of no scale correctly

Review comment:
       The spec allows `DECIMAL(precision)` 
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal
   
   The TODO was because I got confused by the token parsing at the time. I've 
looked at it again, and it looks fine.

##########
File path: rust/parquet/src/schema/parser.rs
##########
@@ -222,18 +258,29 @@ impl<'a> Parser<'a> {
             .next()
             .ok_or_else(|| general_err!("Expected name, found None"))?;
 
-        // Parse converted type if exists
-        let converted_type = if let Some("(") = self.tokenizer.next() {
+        // Parse logical or converted type if exists
+        let (logical_type, converted_type) = if let Some("(") = 
self.tokenizer.next() {
             let tpe = self
                 .tokenizer
                 .next()
                 .ok_or_else(|| general_err!("Expected converted type, found 
None"))
-                .and_then(|v| v.to_uppercase().parse::<ConvertedType>())?;
+                .and_then(|v| {
+                    // Try logical type first
+                    let upper = v.to_uppercase();
+                    let logical = upper.parse::<LogicalType>();
+                    match logical {
+                        Ok(logical) => Ok((
+                            Some(logical.clone()),
+                            ConvertedType::from(Some(logical)),

Review comment:
       This has worked well for me so far, because ConvertedType has a `NONE` 
enum, so when using `ConvertedType::from(None)` I get `ConvertedType::NONE`. 
Also, not all LogicalTypes map to ConvertedType, so I can also return NONE in 
their instance.
   
   If I were to take `ConvertedType::from(LogicalType)`, I'd have to repeat `if 
let Some(logicla_type) = logicalType` in a few places.

##########
File path: rust/parquet/src/schema/parser.rs
##########
@@ -281,19 +329,142 @@ impl<'a> Parser<'a> {
             .ok_or_else(|| general_err!("Expected name, found None"))?;
 
         // Parse converted type
-        let (converted_type, precision, scale) = if let Some("(") = 
self.tokenizer.next()
+        let (logical_type, converted_type, precision, scale) = if let 
Some("(") =
+            self.tokenizer.next()
         {
-            let tpe = self
+            let (mut logical, mut converted) = self
                 .tokenizer
                 .next()
-                .ok_or_else(|| general_err!("Expected converted type, found 
None"))
-                .and_then(|v| v.to_uppercase().parse::<ConvertedType>())?;
+                .ok_or_else(|| {
+                    general_err!("Expected logical or converted type, found 
None")
+                })
+                .and_then(|v| {
+                    let upper = v.to_uppercase();
+                    let logical = upper.parse::<LogicalType>();
+                    match logical {
+                        Ok(logical) => Ok((
+                            Some(logical.clone()),
+                            ConvertedType::from(Some(logical)),
+                        )),
+                        Err(_) => Ok((None, upper.parse::<ConvertedType>()?)),
+                    }
+                })?;
 
             // Parse precision and scale for decimals
             let mut precision: i32 = -1;
             let mut scale: i32 = -1;
 
-            if tpe == ConvertedType::DECIMAL {
+            // Parse the concrete logical type
+            if let Some(tpe) = &logical {
+                match tpe {
+                    LogicalType::DECIMAL(_) => {
+                        if let Some("(") = self.tokenizer.next() {
+                            precision = parse_i32(
+                                self.tokenizer.next(),
+                                "Expected precision, found None",
+                                "Failed to parse precision for DECIMAL type",
+                            )?;
+                            if let Some(",") = self.tokenizer.next() {
+                                scale = parse_i32(
+                                    self.tokenizer.next(),
+                                    "Invalid boolean found",
+                                    "Failure to parse timezone info for TIME 
type",
+                                )?; // TODO: this might not cater for the case 
of no scale correctly
+                                assert_token(self.tokenizer.next(), ")")?;
+                                logical = 
Some(LogicalType::DECIMAL(DecimalType {
+                                    precision,
+                                    scale,
+                                }));
+                                converted = 
ConvertedType::from(logical.clone());
+                            } else {
+                                scale = 0;
+                                logical = 
Some(LogicalType::DECIMAL(DecimalType {
+                                    precision,
+                                    scale,
+                                }));
+                                converted = 
ConvertedType::from(logical.clone());
+                            }
+                        }
+                    }
+                    LogicalType::TIME(_) => {
+                        if let Some("(") = self.tokenizer.next() {
+                            let unit = parse_timeunit(
+                                self.tokenizer.next(),
+                                "Invalid timeunit found",
+                                "Failed to parse timeunit for TIME type",
+                            )?;
+                            if let Some(",") = self.tokenizer.next() {
+                                let is_adjusted_to_u_t_c = parse_bool(
+                                    self.tokenizer.next(),
+                                    "Invalid boolean found",
+                                    "Failure to parse timezone info for TIME 
type",
+                                )?;
+                                assert_token(self.tokenizer.next(), ")")?;
+                                logical = Some(LogicalType::TIME(TimeType {
+                                    unit,
+                                    is_adjusted_to_u_t_c,
+                                }));
+                                converted = 
ConvertedType::from(logical.clone());
+                            } else {
+                                // Invalid token for unit
+                                self.tokenizer.backtrack();
+                            }
+                        }
+                    }
+                    LogicalType::TIMESTAMP(_) => {
+                        if let Some("(") = self.tokenizer.next() {
+                            let unit = parse_timeunit(
+                                self.tokenizer.next(),
+                                "Invalid timeunit found",
+                                "Failed to parse timeunit for TIMESTAMP type",
+                            )?;
+                            if let Some(",") = self.tokenizer.next() {
+                                let is_adjusted_to_u_t_c = parse_bool(
+                                    self.tokenizer.next(),
+                                    "Invalid boolean found",
+                                    "Failure to parse timezone info for 
TIMESTAMP type",
+                                )?;
+                                assert_token(self.tokenizer.next(), ")")?;
+                                logical = 
Some(LogicalType::TIMESTAMP(TimestampType {
+                                    unit,
+                                    is_adjusted_to_u_t_c,
+                                }));
+                                converted = 
ConvertedType::from(logical.clone());
+                            } else {
+                                // Invalid token for unit
+                                self.tokenizer.backtrack();
+                            }
+                        }
+                    }
+                    LogicalType::INTEGER(_) => {
+                        if let Some("(") = self.tokenizer.next() {
+                            let bit_width = parse_i32(

Review comment:
       I've checked them against the physical type. Thanks

##########
File path: rust/parquet/src/schema/parser.rs
##########
@@ -503,6 +675,129 @@ mod tests {
         assert!(result.is_ok());
     }
 
+    #[test]
+    fn test_parse_message_type_integer() {
+        // Invalid integer syntax
+        let schema = "
+    message root {
+      optional int64 f1 (INTEGER());
+    }
+    ";
+        let mut iter = Tokenizer::from_str(schema);
+        let result = Parser {
+            tokenizer: &mut iter,
+        }
+        .parse_message_type();
+        assert!(result.is_err());
+
+        // Invalid integer syntax, needs both bit-width and UTC sign
+        let schema = "
+    message root {
+      optional int64 f1 (INTEGER(32,));
+    }
+    ";
+        let mut iter = Tokenizer::from_str(schema);
+        let result = Parser {
+            tokenizer: &mut iter,
+        }
+        .parse_message_type();
+        assert!(result.is_err());
+
+        // Invalid integer because of non-numeric bit width
+        let schema = "
+    message root {
+      optional int32 f1 (INTEGER(eight,true));
+    }
+    ";
+        let mut iter = Tokenizer::from_str(schema);
+        let result = Parser {
+            tokenizer: &mut iter,
+        }
+        .parse_message_type();
+        assert!(result.is_err());
+
+        // Valid types
+        let schema = "
+    message root {
+      optional int32 f1 (INTEGER(8,false));
+      optional int32 f2 (INTEGER(8,true));
+      optional int32 f3 (INTEGER(16,false));
+      optional int32 f4 (INTEGER(16,true));
+      optional int32 f5 (INTEGER(32,false));
+      optional int32 f6 (INTEGER(32,true));
+      optional int64 f7 (INTEGER(64,false));
+      optional int64 f7 (INTEGER(64,true));
+    }
+    ";
+        let mut iter = Tokenizer::from_str(schema);
+        let result = Parser {
+            tokenizer: &mut iter,
+        }
+        .parse_message_type();
+        assert!(result.is_ok());

Review comment:
       I've checked that the error messages are what's expected, but for this 
`is_ok()` case, I think it'd be duplicative as we test the schema in 
`test_parse_message_type_compare_*` further below in the code




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to