alamb commented on code in PR #7807: URL: https://github.com/apache/arrow-rs/pull/7807#discussion_r2177427505
########## parquet-variant/src/variant/object.rs: ########## @@ -29,111 +29,195 @@ const NUM_HEADER_BYTES: usize = 1; /// Header structure for [`VariantObject`] #[derive(Clone, Debug, PartialEq)] pub(crate) struct VariantObjectHeader { - field_offset_size: OffsetSizeBytes, + num_elements_size: OffsetSizeBytes, field_id_size: OffsetSizeBytes, - is_large: bool, + field_offset_size: OffsetSizeBytes, } impl VariantObjectHeader { + // Hide the ugly casting + const fn num_elements_size(&self) -> usize { + self.num_elements_size as _ + } + const fn field_id_size(&self) -> usize { + self.field_id_size as _ + } + const fn field_offset_size(&self) -> usize { + self.field_offset_size as _ + } + + // Avoid materializing this offset, since it's cheaply and safely computable + const fn field_ids_start_byte(&self) -> usize { + NUM_HEADER_BYTES + self.num_elements_size() + } + pub(crate) fn try_new(header_byte: u8) -> Result<Self, ArrowError> { // Parse the header byte to get object parameters let value_header = header_byte >> 2; let field_offset_size_minus_one = value_header & 0x03; // Last 2 bits let field_id_size_minus_one = (value_header >> 2) & 0x03; // Next 2 bits let is_large = (value_header & 0x10) != 0; // 5th bit - + let num_elements_size = match is_large { + true => OffsetSizeBytes::Four, + false => OffsetSizeBytes::One, + }; Ok(Self { - field_offset_size: OffsetSizeBytes::try_new(field_offset_size_minus_one)?, + num_elements_size, field_id_size: OffsetSizeBytes::try_new(field_id_size_minus_one)?, - is_large, + field_offset_size: OffsetSizeBytes::try_new(field_offset_size_minus_one)?, }) } } /// A [`Variant`] Object (struct with named fields). +/// +/// See the [Variant spec] file for more information. +/// +/// # Validation +/// +/// Every instance of variant object is either _valid_ or _invalid_, depending on whether the +/// underlying bytes are a valid encoding of a variant object subtype (see below). 
+/// +/// Instances produced by [`Self::try_new`] or [`Self::validate`] are fully (and recursively) +/// _validated_. They always contain _valid_ data, and infallible accesses such as iteration and +/// indexing are panic-free. The validation cost is linear in the number of underlying bytes. +/// +/// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or +/// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying +/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are +/// provided as panic-free alternatives. [`Self::validate`] can also be used to _validate_ an +/// _unvalidated_ instance, if desired. +/// +/// _Unvalidated_ instances can be constructed in constant time. They can be useful if the caller +/// knows the underlying bytes were already validated previously, or if the caller intends to +/// perform a small number of (fallible) field accesses against a large object. +/// +/// A _validated_ instance guarantees that: +/// +/// - header byte is valid +/// - num_elements is in bounds +/// - field id array is in bounds +/// - field offset array is in bounds +/// - field value array is in bounds +/// - all field ids are valid metadata dictionary entries (*) +/// - field ids are lexically ordered according to their corresponding string values (*) +/// - all field offsets are in bounds (*) +/// - all field values are (recursively) _valid_ variant values (*) +/// - the associated variant metadata is [valid] (*) +/// +/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation checks (marked by `(*)` +/// in the list above); it panics if any of the other checks fails. +/// +/// # Safety +/// +/// Even an _invalid_ variant object instance is still _safe_ to use in the Rust sense. Accessing it +/// with infallible methods may cause panics but will never lead to undefined behavior. 
+/// +/// [valid]: VariantMetadata#Validation +/// [Variant spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-data-for-object-basic_type2 #[derive(Clone, Debug, PartialEq)] pub struct VariantObject<'m, 'v> { pub metadata: VariantMetadata<'m>, pub value: &'v [u8], header: VariantObjectHeader, num_elements: usize, - field_ids_start_byte: usize, - field_offsets_start_byte: usize, - values_start_byte: usize, + first_field_offset_byte: usize, Review Comment: these names are much easier to understand ########## parquet-variant/src/variant.rs: ########## @@ -177,6 +177,38 @@ impl Deref for ShortString<'_> { /// _ => println!("Other variant"), /// } /// ``` +/// +/// # Validation Review Comment: 👨🍳 👌 -- very nice ########## parquet-variant/src/variant/list.rs: ########## @@ -28,39 +29,103 @@ const NUM_HEADER_BYTES: usize = 1; /// A parsed version of the variant array value header byte. #[derive(Clone, Debug, PartialEq)] pub(crate) struct VariantListHeader { + num_elements_size: OffsetSizeBytes, offset_size: OffsetSizeBytes, - is_large: bool, } impl VariantListHeader { + // Hide the ugly casting + const fn num_elements_size(&self) -> usize { + self.num_elements_size as _ + } + const fn offset_size(&self) -> usize { + self.offset_size as _ + } + + // Avoid materializing this offset, since it's cheaply and safely computable + const fn first_offset_byte(&self) -> usize { + NUM_HEADER_BYTES + self.num_elements_size() + } + pub(crate) fn try_new(header_byte: u8) -> Result<Self, ArrowError> { // The 6 first bits to the left are the value_header and the 2 bits // to the right are the basic type, so we shift to get only the value_header let value_header = header_byte >> 2; let is_large = (value_header & 0x04) != 0; // 3rd bit from the right let field_offset_size_minus_one = value_header & 0x03; // Last two bits + + // The size of the num_elements entry in the array value_data is 4 bytes if + // is_large is true, otherwise 1 byte. 
+ let num_elements_size = match is_large { + true => OffsetSizeBytes::Four, + false => OffsetSizeBytes::One, + }; let offset_size = OffsetSizeBytes::try_new(field_offset_size_minus_one)?; Ok(Self { + num_elements_size, offset_size, - is_large, }) } } /// [`Variant`] Array. /// +/// See the [Variant spec] for details. +/// /// NOTE: The "list" naming differs from the variant spec -- which calls it "array" -- in order to be /// consistent with Parquet and Arrow type naming. Otherwise, the name would conflict with the /// `VariantArray : Array` we must eventually define for variant-typed arrow arrays. +/// +/// # Validation +/// +/// Every instance of variant list is either _valid_ or _invalid_, depending on whether the +/// underlying bytes are a valid encoding of a variant array (see below). +/// +/// Instances produced by [`Self::try_new`] or [`Self::validate`] are fully _validated_. They always Review Comment: This is somewhat duplicative of the comments in `Variant` and we could refer people back to those docs for a definition of valid vs invalid 🤔 However I think repeating the content is also fine ########## parquet-variant/src/variant.rs: ########## @@ -304,12 +381,45 @@ impl<'m, 'v> Variant<'m, 'v> { VariantBasicType::ShortString => { Variant::ShortString(decoder::decode_short_string(value_metadata, value_data)?) } - VariantBasicType::Object => Variant::Object(VariantObject::try_new(metadata, value)?), - VariantBasicType::Array => Variant::List(VariantList::try_new(metadata, value)?), + VariantBasicType::Object => { + Variant::Object(VariantObject::try_new_impl(metadata, value)?) 
+ } + VariantBasicType::Array => Variant::List(VariantList::try_new_impl(metadata, value)?), Review Comment: It makes sense to me and I don't really have any better solution ########## parquet-variant/src/variant/list.rs: ########## @@ -69,46 +134,88 @@ impl<'m, 'v> VariantList<'m, 'v> { /// # Validation /// /// This constructor verifies that `value` points to a valid variant array value. In particular, - /// that all offsets are in-bounds and point to valid objects. - // TODO: How to make the validation non-recursive while still making iterators safely infallible?? - // See https://github.com/apache/arrow-rs/issues/7711 + /// that all offsets are in-bounds and point to valid (recursively validated) objects. pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Result<Self, ArrowError> { + Self::try_new_impl(metadata, value)?.validate() + } + + pub fn new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self { + Self::try_new_impl(metadata, value).expect("Invalid variant list value") + } + + /// Attempts to interpret `metadata` and `value` as a variant array, performing only basic + /// (constant-cost) [validation]. + /// + /// [validation]: Self#Validation + pub(crate) fn try_new_impl( metadata: VariantMetadata<'m>, value: &'v [u8], ) -> Result<Self, ArrowError> { let header_byte = first_byte_from_slice(value)?; let header = VariantListHeader::try_new(header_byte)?; - // The size of the num_elements entry in the array value_data is 4 bytes if - // is_large is true, otherwise 1 byte. 
- let num_elements_size = match header.is_large { - true => OffsetSizeBytes::Four, - false => OffsetSizeBytes::One, - }; - // Skip the header byte to read the num_elements; the offset array immediately follows - let num_elements = num_elements_size.unpack_usize(value, NUM_HEADER_BYTES, 0)?; - let first_offset_byte = NUM_HEADER_BYTES + num_elements_size as usize; + let num_elements = + header + .num_elements_size + .unpack_usize_at_offset(value, NUM_HEADER_BYTES, 0)?; // (num_elements + 1) * offset_size + first_offset_byte let first_value_byte = num_elements .checked_add(1) - .and_then(|n| n.checked_mul(header.offset_size as usize)) - .and_then(|n| n.checked_add(first_offset_byte)) + .and_then(|n| n.checked_mul(header.offset_size())) + .and_then(|n| n.checked_add(header.first_offset_byte())) .ok_or_else(|| overflow_error("offset of variant list values"))?; - let new_self = Self { + let mut new_self = Self { metadata, value, header, num_elements, - first_offset_byte, first_value_byte, + validated: false, }; - // Iterate over all values of this array in order to validate the field_offset array and - // prove that the field values are all in bounds. Otherwise, `iter` might panic on `unwrap`. - validate_fallible_iterator(new_self.iter_checked())?; + // Validate just the first and last offset, ignoring the other offsets and all value bytes. Review Comment: I think validating the first and last offsets seems very reasonable to me ########## parquet-variant/src/decoder.rs: ########## @@ -453,57 +473,53 @@ mod tests { // One-byte offsets let buf_one = [0x01u8, 0xAB, 0xCD]; assert_eq!( - OffsetSizeBytes::One.unpack_usize(&buf_one, 0, 0).unwrap(), + OffsetSizeBytes::One.unpack_usize(&buf_one, 0).unwrap(), Review Comment: I think these tests are much more readable now -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org