sunchao commented on a change in pull request #9592:
URL: https://github.com/apache/arrow/pull/9592#discussion_r584251475



##########
File path: rust/parquet/src/basic.rs
##########
@@ -337,8 +396,11 @@ impl ColumnOrder {
             // If the max is -0, the row group may contain +0 values as well.
             // When looking for NaN values, min and max should be ignored.
             Type::FLOAT | Type::DOUBLE => SortOrder::SIGNED,
-            // unsigned byte-wise comparison
-            Type::BYTE_ARRAY | Type::FIXED_LEN_BYTE_ARRAY => 
SortOrder::UNSIGNED,
+            // Unsigned byte-wise comparison
+            Type::BYTE_ARRAY => SortOrder::UNSIGNED,
+            // Only unsigned if there was a logical type that supports 
unsigned sort.
+            // Interval has no defined sort order, and should not use UNSIGNED.

Review comment:
       Not quite sure about this: since `INTERVAL` is already handled above 
with `UNDEFINED`, why should we use `UNDEFINED` for this? 
   
   Seems 
[parquet-mr](https://github.com/apache/parquet-mr/blob/master/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java#L507)
 only uses `UNDEFINED` for `INT96` and `INTERVAL`.

##########
File path: rust/parquet/src/schema/types.rs
##########
@@ -972,18 +1011,22 @@ fn from_thrift_helper(
 }
 
 /// Method to convert to Thrift.
-pub fn to_thrift(schema: &Type) -> Result<Vec<SchemaElement>> {
+pub fn to_thrift(schema: &Type, writer_version: i32) -> 
Result<Vec<SchemaElement>> {

Review comment:
       Can you elaborate a bit why `writer_version` is needed? I thought it 
would just be a straightforward conversion from the `SchemaElement` to the 
thrift counterpart.

##########
File path: rust/parquet/src/basic.rs
##########
@@ -647,34 +806,128 @@ impl str::FromStr for Type {
     }
 }
 
+impl str::FromStr for ConvertedType {
+    type Err = ParquetError;
+
+    fn from_str(s: &str) -> result::Result<Self, Self::Err> {
+        match s {
+            "NONE" => Ok(ConvertedType::NONE),
+            "UTF8" => Ok(ConvertedType::UTF8),
+            "MAP" => Ok(ConvertedType::MAP),
+            "MAP_KEY_VALUE" => Ok(ConvertedType::MAP_KEY_VALUE),
+            "LIST" => Ok(ConvertedType::LIST),
+            "ENUM" => Ok(ConvertedType::ENUM),
+            "DECIMAL" => Ok(ConvertedType::DECIMAL),
+            "DATE" => Ok(ConvertedType::DATE),
+            "TIME_MILLIS" => Ok(ConvertedType::TIME_MILLIS),
+            "TIME_MICROS" => Ok(ConvertedType::TIME_MICROS),
+            "TIMESTAMP_MILLIS" => Ok(ConvertedType::TIMESTAMP_MILLIS),
+            "TIMESTAMP_MICROS" => Ok(ConvertedType::TIMESTAMP_MICROS),
+            "UINT_8" => Ok(ConvertedType::UINT_8),
+            "UINT_16" => Ok(ConvertedType::UINT_16),
+            "UINT_32" => Ok(ConvertedType::UINT_32),
+            "UINT_64" => Ok(ConvertedType::UINT_64),
+            "INT_8" => Ok(ConvertedType::INT_8),
+            "INT_16" => Ok(ConvertedType::INT_16),
+            "INT_32" => Ok(ConvertedType::INT_32),
+            "INT_64" => Ok(ConvertedType::INT_64),
+            "JSON" => Ok(ConvertedType::JSON),
+            "BSON" => Ok(ConvertedType::BSON),
+            "INTERVAL" => Ok(ConvertedType::INTERVAL),
+            other => Err(general_err!("Invalid logical type {}", other)),
+        }
+    }
+}
+
 impl str::FromStr for LogicalType {
     type Err = ParquetError;
 
     fn from_str(s: &str) -> result::Result<Self, Self::Err> {
         match s {
-            "NONE" => Ok(LogicalType::NONE),
-            "UTF8" => Ok(LogicalType::UTF8),
-            "MAP" => Ok(LogicalType::MAP),
-            "MAP_KEY_VALUE" => Ok(LogicalType::MAP_KEY_VALUE),
-            "LIST" => Ok(LogicalType::LIST),
-            "ENUM" => Ok(LogicalType::ENUM),
-            "DECIMAL" => Ok(LogicalType::DECIMAL),
-            "DATE" => Ok(LogicalType::DATE),
-            "TIME_MILLIS" => Ok(LogicalType::TIME_MILLIS),
-            "TIME_MICROS" => Ok(LogicalType::TIME_MICROS),
-            "TIMESTAMP_MILLIS" => Ok(LogicalType::TIMESTAMP_MILLIS),
-            "TIMESTAMP_MICROS" => Ok(LogicalType::TIMESTAMP_MICROS),
-            "UINT_8" => Ok(LogicalType::UINT_8),
-            "UINT_16" => Ok(LogicalType::UINT_16),
-            "UINT_32" => Ok(LogicalType::UINT_32),
-            "UINT_64" => Ok(LogicalType::UINT_64),
-            "INT_8" => Ok(LogicalType::INT_8),
-            "INT_16" => Ok(LogicalType::INT_16),
-            "INT_32" => Ok(LogicalType::INT_32),
-            "INT_64" => Ok(LogicalType::INT_64),
-            "JSON" => Ok(LogicalType::JSON),
-            "BSON" => Ok(LogicalType::BSON),
-            "INTERVAL" => Ok(LogicalType::INTERVAL),
+            "INTEGER(8,true)" => Ok(LogicalType::INTEGER(IntType {
+                bit_width: 8,
+                is_signed: true,
+            })),
+            "INTEGER(16,true)" => Ok(LogicalType::INTEGER(IntType {
+                bit_width: 16,
+                is_signed: true,
+            })),
+            "INTEGER(32,true)" => Ok(LogicalType::INTEGER(IntType {
+                bit_width: 32,
+                is_signed: true,
+            })),
+            "INTEGER(64,true)" => Ok(LogicalType::INTEGER(IntType {
+                bit_width: 64,
+                is_signed: true,
+            })),
+            "INTEGER(8,false)" => Ok(LogicalType::INTEGER(IntType {
+                bit_width: 8,
+                is_signed: false,
+            })),
+            "INTEGER(16,false)" => Ok(LogicalType::INTEGER(IntType {
+                bit_width: 16,
+                is_signed: false,
+            })),
+            "INTEGER(32,false)" => Ok(LogicalType::INTEGER(IntType {
+                bit_width: 32,
+                is_signed: false,
+            })),
+            "INTEGER(64,false)" => Ok(LogicalType::INTEGER(IntType {
+                bit_width: 64,
+                is_signed: false,
+            })),
+            "MAP" => Ok(LogicalType::MAP(MapType {})),
+            // "MAP_KEY_VALUE" => Ok(ConvertedType::MAP_KEY_VALUE),

Review comment:
       nit: should we remove this?

##########
File path: rust/parquet/src/basic.rs
##########
@@ -647,34 +806,128 @@ impl str::FromStr for Type {
     }
 }
 
+impl str::FromStr for ConvertedType {
+    type Err = ParquetError;
+
+    fn from_str(s: &str) -> result::Result<Self, Self::Err> {
+        match s {
+            "NONE" => Ok(ConvertedType::NONE),
+            "UTF8" => Ok(ConvertedType::UTF8),
+            "MAP" => Ok(ConvertedType::MAP),
+            "MAP_KEY_VALUE" => Ok(ConvertedType::MAP_KEY_VALUE),
+            "LIST" => Ok(ConvertedType::LIST),
+            "ENUM" => Ok(ConvertedType::ENUM),
+            "DECIMAL" => Ok(ConvertedType::DECIMAL),
+            "DATE" => Ok(ConvertedType::DATE),
+            "TIME_MILLIS" => Ok(ConvertedType::TIME_MILLIS),
+            "TIME_MICROS" => Ok(ConvertedType::TIME_MICROS),
+            "TIMESTAMP_MILLIS" => Ok(ConvertedType::TIMESTAMP_MILLIS),
+            "TIMESTAMP_MICROS" => Ok(ConvertedType::TIMESTAMP_MICROS),
+            "UINT_8" => Ok(ConvertedType::UINT_8),
+            "UINT_16" => Ok(ConvertedType::UINT_16),
+            "UINT_32" => Ok(ConvertedType::UINT_32),
+            "UINT_64" => Ok(ConvertedType::UINT_64),
+            "INT_8" => Ok(ConvertedType::INT_8),
+            "INT_16" => Ok(ConvertedType::INT_16),
+            "INT_32" => Ok(ConvertedType::INT_32),
+            "INT_64" => Ok(ConvertedType::INT_64),
+            "JSON" => Ok(ConvertedType::JSON),
+            "BSON" => Ok(ConvertedType::BSON),
+            "INTERVAL" => Ok(ConvertedType::INTERVAL),
+            other => Err(general_err!("Invalid logical type {}", other)),

Review comment:
       nit: logical -> converted

##########
File path: rust/parquet/src/schema/types.rs
##########
@@ -1035,22 +1082,26 @@ fn to_thrift_helper(schema: &Type, elements: &mut 
Vec<SchemaElement>) {
                 repetition_type: repetition,
                 name: basic_info.name().to_owned(),
                 num_children: Some(fields.len() as i32),
-                converted_type: basic_info.logical_type().into(),
+                converted_type: basic_info.converted_type().into(),
                 scale: None,
                 precision: None,
                 field_id: if basic_info.has_id() {
                     Some(basic_info.id())
                 } else {
                     None
                 },
-                logical_type: None,
+                logical_type: if writer_version > 1 {

Review comment:
       Hmm populating what?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to