This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new a96252271a Support both 0x01 and 0x02 as type for list of booleans in
thrift metadata (#7052)
a96252271a is described below
commit a96252271aaaa2edf035dcef5e032146887ac51a
Author: Jörn Horstmann <[email protected]>
AuthorDate: Thu Feb 6 12:45:30 2025 +0100
Support both 0x01 and 0x02 as type for list of booleans in thrift metadata
(#7052)
* Support both 0x01 and 0x02 as type for list of booleans
* Also support 0 for false inside boolean collections
* Use hex notation in tests
---
parquet/src/thrift.rs | 62 +++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 60 insertions(+), 2 deletions(-)
diff --git a/parquet/src/thrift.rs b/parquet/src/thrift.rs
index b216fec6f3..bf8a2926aa 100644
--- a/parquet/src/thrift.rs
+++ b/parquet/src/thrift.rs
@@ -193,9 +193,13 @@ impl TInputProtocol for TCompactSliceInputProtocol<'_> {
Some(b) => Ok(b),
None => {
let b = self.read_byte()?;
+ // Previous versions of the thrift specification said to use 0
and 1 inside collections,
+ // but that differed from existing implementations.
+ // The specification was updated in
https://github.com/apache/thrift/commit/2c29c5665bc442e703480bb0ee60fe925ffe02e8.
+ // At least the go implementation seems to have followed the
previously documented values.
match b {
0x01 => Ok(true),
- 0x02 => Ok(false),
+ 0x00 | 0x02 => Ok(false),
unkn => Err(thrift::Error::Protocol(thrift::ProtocolError {
kind: thrift::ProtocolErrorKind::InvalidData,
message: format!("cannot convert {} into bool", unkn),
@@ -274,7 +278,12 @@ impl TInputProtocol for TCompactSliceInputProtocol<'_> {
fn collection_u8_to_type(b: u8) -> thrift::Result<TType> {
match b {
- 0x01 => Ok(TType::Bool),
+ // For historical and compatibility reasons, a reader should be
capable to deal with both cases.
+ // The only valid value in the original spec was 2, but due to an
widespread implementation bug
+ // the defacto standard across large parts of the library became 1
instead.
+ // As a result, both values are now allowed.
+ //
https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#list-and-set
+ 0x01 | 0x02 => Ok(TType::Bool),
o => u8_to_type(o),
}
}
@@ -305,3 +314,52 @@ fn eof_error() -> thrift::Error {
message: "Unexpected EOF".to_string(),
})
}
+
+#[cfg(test)]
+mod tests {
+ use crate::format::{BoundaryOrder, ColumnIndex};
+ use crate::thrift::{TCompactSliceInputProtocol, TSerializable};
+
+ #[test]
+ pub fn read_boolean_list_field_type() {
+ // Boolean collection type encoded as 0x01, as used by this crate when
writing.
+ // Values encoded as 1 (true) or 2 (false) as in the current version
of the thrift
+ // documentation.
+ let bytes = vec![0x19, 0x21, 2, 1, 0x19, 8, 0x19, 8, 0x15, 0, 0];
+
+ let mut protocol = TCompactSliceInputProtocol::new(bytes.as_slice());
+ let index = ColumnIndex::read_from_in_protocol(&mut protocol).unwrap();
+ let expected = ColumnIndex {
+ null_pages: vec![false, true],
+ min_values: vec![],
+ max_values: vec![],
+ boundary_order: BoundaryOrder::UNORDERED,
+ null_counts: None,
+ repetition_level_histograms: None,
+ definition_level_histograms: None,
+ };
+
+ assert_eq!(&index, &expected);
+ }
+
+ #[test]
+ pub fn read_boolean_list_alternative_encoding() {
+ // Boolean collection type encoded as 0x02, as allowed by the spec.
+ // Values encoded as 1 (true) or 0 (false) as before the thrift
documentation change on 2024-12-13.
+ let bytes = vec![0x19, 0x22, 0, 1, 0x19, 8, 0x19, 8, 0x15, 0, 0];
+
+ let mut protocol = TCompactSliceInputProtocol::new(bytes.as_slice());
+ let index = ColumnIndex::read_from_in_protocol(&mut protocol).unwrap();
+ let expected = ColumnIndex {
+ null_pages: vec![false, true],
+ min_values: vec![],
+ max_values: vec![],
+ boundary_order: BoundaryOrder::UNORDERED,
+ null_counts: None,
+ repetition_level_histograms: None,
+ definition_level_histograms: None,
+ };
+
+ assert_eq!(&index, &expected);
+ }
+}