nevi-me commented on a change in pull request #9592: URL: https://github.com/apache/arrow/pull/9592#discussion_r584257095
########## File path: rust/parquet/src/schema/types.rs ########## @@ -1035,22 +1082,26 @@ fn to_thrift_helper(schema: &Type, elements: &mut Vec<SchemaElement>) { repetition_type: repetition, name: basic_info.name().to_owned(), num_children: Some(fields.len() as i32), - converted_type: basic_info.logical_type().into(), + converted_type: basic_info.converted_type().into(), scale: None, precision: None, field_id: if basic_info.has_id() { Some(basic_info.id()) } else { None }, - logical_type: None, + logical_type: if writer_version > 1 { Review comment: Sorry, wasn't detailed. We weren't populating the `logical_type` when writing files, so even if one was writing a v2 file; they were only populating the `converted_type`. ########## File path: rust/parquet/src/basic.rs ########## @@ -647,34 +806,128 @@ impl str::FromStr for Type { } } +impl str::FromStr for ConvertedType { + type Err = ParquetError; + + fn from_str(s: &str) -> result::Result<Self, Self::Err> { + match s { + "NONE" => Ok(ConvertedType::NONE), + "UTF8" => Ok(ConvertedType::UTF8), + "MAP" => Ok(ConvertedType::MAP), + "MAP_KEY_VALUE" => Ok(ConvertedType::MAP_KEY_VALUE), + "LIST" => Ok(ConvertedType::LIST), + "ENUM" => Ok(ConvertedType::ENUM), + "DECIMAL" => Ok(ConvertedType::DECIMAL), + "DATE" => Ok(ConvertedType::DATE), + "TIME_MILLIS" => Ok(ConvertedType::TIME_MILLIS), + "TIME_MICROS" => Ok(ConvertedType::TIME_MICROS), + "TIMESTAMP_MILLIS" => Ok(ConvertedType::TIMESTAMP_MILLIS), + "TIMESTAMP_MICROS" => Ok(ConvertedType::TIMESTAMP_MICROS), + "UINT_8" => Ok(ConvertedType::UINT_8), + "UINT_16" => Ok(ConvertedType::UINT_16), + "UINT_32" => Ok(ConvertedType::UINT_32), + "UINT_64" => Ok(ConvertedType::UINT_64), + "INT_8" => Ok(ConvertedType::INT_8), + "INT_16" => Ok(ConvertedType::INT_16), + "INT_32" => Ok(ConvertedType::INT_32), + "INT_64" => Ok(ConvertedType::INT_64), + "JSON" => Ok(ConvertedType::JSON), + "BSON" => Ok(ConvertedType::BSON), + "INTERVAL" => Ok(ConvertedType::INTERVAL), + other => Err(general_err!("Invalid logical type {}", other)), + } + } +} + impl str::FromStr for LogicalType { type Err = ParquetError; fn from_str(s: &str) -> result::Result<Self, Self::Err> { match s { - "NONE" => Ok(LogicalType::NONE), - "UTF8" => Ok(LogicalType::UTF8), - "MAP" => Ok(LogicalType::MAP), - "MAP_KEY_VALUE" => Ok(LogicalType::MAP_KEY_VALUE), - "LIST" => Ok(LogicalType::LIST), - "ENUM" => Ok(LogicalType::ENUM), - "DECIMAL" => Ok(LogicalType::DECIMAL), - "DATE" => Ok(LogicalType::DATE), - "TIME_MILLIS" => Ok(LogicalType::TIME_MILLIS), - "TIME_MICROS" => Ok(LogicalType::TIME_MICROS), - "TIMESTAMP_MILLIS" => Ok(LogicalType::TIMESTAMP_MILLIS), - "TIMESTAMP_MICROS" => Ok(LogicalType::TIMESTAMP_MICROS), - "UINT_8" => Ok(LogicalType::UINT_8), - "UINT_16" => Ok(LogicalType::UINT_16), - "UINT_32" => Ok(LogicalType::UINT_32), - "UINT_64" => Ok(LogicalType::UINT_64), - "INT_8" => Ok(LogicalType::INT_8), - "INT_16" => Ok(LogicalType::INT_16), - "INT_32" => Ok(LogicalType::INT_32), - "INT_64" => Ok(LogicalType::INT_64), - "JSON" => Ok(LogicalType::JSON), - "BSON" => Ok(LogicalType::BSON), - "INTERVAL" => Ok(LogicalType::INTERVAL), + "INTEGER(8,true)" => Ok(LogicalType::INTEGER(IntType { + bit_width: 8, + is_signed: true, + })), + "INTEGER(16,true)" => Ok(LogicalType::INTEGER(IntType { + bit_width: 16, + is_signed: true, + })), + "INTEGER(32,true)" => Ok(LogicalType::INTEGER(IntType { + bit_width: 32, + is_signed: true, + })), + "INTEGER(64,true)" => Ok(LogicalType::INTEGER(IntType { + bit_width: 64, + is_signed: true, + })), + "INTEGER(8,false)" => Ok(LogicalType::INTEGER(IntType { + bit_width: 8, + is_signed: false, + })), + "INTEGER(16,false)" => Ok(LogicalType::INTEGER(IntType { + bit_width: 16, + is_signed: false, + })), + "INTEGER(32,false)" => Ok(LogicalType::INTEGER(IntType { + bit_width: 32, + is_signed: false, + })), + "INTEGER(64,false)" => Ok(LogicalType::INTEGER(IntType { + bit_width: 64, + is_signed: false, + })), + "MAP" => Ok(LogicalType::MAP(MapType {})), + // "MAP_KEY_VALUE" => Ok(ConvertedType::MAP_KEY_VALUE), Review comment: Yes, I'll remove it, and add a TODO to the DECIMAL below ########## File path: rust/parquet/src/schema/types.rs ########## @@ -972,18 +1011,22 @@ fn from_thrift_helper( } /// Method to convert to Thrift. -pub fn to_thrift(schema: &Type) -> Result<Vec<SchemaElement>> { +pub fn to_thrift(schema: &Type, writer_version: i32) -> Result<Vec<SchemaElement>> { Review comment: I'll add a comment if you agree with my logic below. I understand the format to mean that `LogicalType` is a version 2 only detail, such that someone writing version 1 of the format, would not expect a `LogicalType` to be populated. So, I'm checking if one is intending on writing v2, and only populating `LogicalType` in that instance. This would become relevant when parsing the schema and displaying it (future PR). a v2 file's text schema has `INTEGER(32,true)` instead of `INT_32`, so to ensure that this is the case, we shouldn't write the `LogicalType` for v1 files; as a loose reader could end up misinterpreting the schema. ########## File path: rust/parquet/src/basic.rs ########## @@ -337,8 +396,11 @@ impl ColumnOrder { // If the max is -0, the row group may contain +0 values as well. // When looking for NaN values, min and max should be ignored. Type::FLOAT | Type::DOUBLE => SortOrder::SIGNED, - // unsigned byte-wise comparison - Type::BYTE_ARRAY | Type::FIXED_LEN_BYTE_ARRAY => SortOrder::UNSIGNED, + // Unsigned byte-wise comparison + Type::BYTE_ARRAY => SortOrder::UNSIGNED, + // Only unsigned if there was a logical type that supports unsigned sort. + // Interval has no defined sort order, and should not use UNSIGNED. Review comment: After thinking more about it, I agree with you. I initially resorted to only using LogicalType + PhysicalType, in which case there would have been no ConvertedType covering INTERVAL. I changed this after realising that it was a mistake; but forgot to revise this. I'll change this. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org