alamb commented on code in PR #7704:
URL: https://github.com/apache/arrow-rs/pull/7704#discussion_r2155266762
##########
parquet-variant/src/variant.rs:
##########
@@ -154,64 +152,35 @@ impl<'m> VariantMetadata<'m> {
}
pub fn try_new(bytes: &'m [u8]) -> Result<Self, ArrowError> {
- let header = VariantMetadataHeader::try_new(bytes)?;
+ let header = first_byte_from_slice(bytes)?;
+ let header = VariantMetadataHeader::try_new(header)?;
+
    // Offset 1, index 0 because first element after header is dictionary size
let dict_size = header.offset_size.unpack_usize(bytes, 1, 0)?;
-        // Check that we have the correct metadata length according to dictionary_size, or return
- // error early.
- // Minimum number of bytes the metadata buffer must contain:
- // 1 byte header
- // + offset_size-byte `dictionary_size` field
-        // + (dict_size + 1) offset entries, each `offset_size` bytes. (Table size, essentially)
- // 1 + offset_size + (dict_size + 1) * offset_size
+ // Calculate the starting offset of the dictionary string bytes.
+ //
+ // Value header, dict_size (offset_size bytes), and dict_size+1 offsets
+ // = 1 + offset_size + (dict_size + 1) * offset_size
// = (dict_size + 2) * offset_size + 1
- let offset_size = header.offset_size as usize; // Cheap to copy
-
let dictionary_key_start_byte = dict_size
.checked_add(2)
- .and_then(|n| n.checked_mul(offset_size))
+ .and_then(|n| n.checked_mul(header.offset_size as usize))
.and_then(|n| n.checked_add(1))
            .ok_or_else(|| ArrowError::InvalidArgumentError("metadata length overflow".into()))?;
-
- if bytes.len() < dictionary_key_start_byte {
- return Err(ArrowError::InvalidArgumentError(
- "Metadata shorter than dictionary_size implies".to_string(),
- ));
- }
-
- // Check that all offsets are monotonically increasing
-        let mut offsets = (0..=dict_size).map(|i| header.offset_size.unpack_usize(bytes, 1, i + 1));
- let Some(Ok(mut end @ 0)) = offsets.next() else {
- return Err(ArrowError::InvalidArgumentError(
- "First offset is non-zero".to_string(),
- ));
- };
-
- for offset in offsets {
- let offset = offset?;
- if end >= offset {
- return Err(ArrowError::InvalidArgumentError(
- "Offsets are not monotonically increasing".to_string(),
- ));
- }
- end = offset;
- }
-
- // Verify the buffer covers the whole dictionary-string section
- if end > bytes.len() - dictionary_key_start_byte {
- // `prev` holds the last offset seen still
- return Err(ArrowError::InvalidArgumentError(
- "Last offset does not equal dictionary length".to_string(),
- ));
- }
-
- Ok(Self {
+ let s = Self {
bytes,
header,
dict_size,
dictionary_key_start_byte,
- })
+ };
+
+        // Verify that `iter` can safely `unwrap` the items produced by this iterator.
+ //
+        // This has the side effect of validating the offset array and proving that the string bytes
+ // are all in bounds.
Review Comment:
It might also help to explain what this is doing in more "end user visible" terms. Something like:
```suggestion
        // Verify the contents of the dictionary: validate the offset array and prove that the string bytes
        // are all in bounds.
```
##########
parquet-variant/src/utils.rs:
##########
@@ -74,28 +75,37 @@ pub(crate) fn string_from_slice(slice: &[u8], range: Range<usize>) -> Result<&st
/// * `Ok(Ok(index))` - Element found at the given index
/// * `Ok(Err(index))` - Element not found, but would be inserted at the given
index
/// * `Err(e)` - Key extraction failed with error `e`
-pub(crate) fn try_binary_search_by<T, K, E, F>(
- slice: &[T],
+pub(crate) fn try_binary_search_range_by<K, E, F>(
+ range: Range<usize>,
target: &K,
mut key_extractor: F,
) -> Result<Result<usize, usize>, E>
where
K: Ord,
- F: FnMut(&T) -> Result<K, E>,
+ F: FnMut(usize) -> Result<K, E>,
{
- let mut left = 0;
- let mut right = slice.len();
+ let Range { mut start, mut end } = range;
- while left < right {
- let mid = (left + right) / 2;
- let key = key_extractor(&slice[mid])?;
+ while start < end {
+ let mid = start + (end - start) / 2;
+ let key = key_extractor(mid)?;
match key.cmp(target) {
std::cmp::Ordering::Equal => return Ok(Ok(mid)),
- std::cmp::Ordering::Greater => right = mid,
- std::cmp::Ordering::Less => left = mid + 1,
+ std::cmp::Ordering::Greater => end = mid,
+ std::cmp::Ordering::Less => start = mid + 1,
}
}
- Ok(Err(left))
+ Ok(Err(start))
+}
+
+/// Attempts to prove a fallible iterator is actually infallible in practice, by consuming every
+/// element and returning the first error (if any).
+pub(crate) fn validate_fallible_iterator<T, E>(
+ mut it: impl Iterator<Item = Result<T, E>>,
+) -> Result<(), E> {
+    // NOTE: It should really be `let None = ...`, but the compiler can't prove that.
+ let _ = it.find(Result::is_err).transpose()?;
+ Ok(())
Review Comment:
You can also use something like this to discard the result
```suggestion
it.find(Result::is_err).transpose().map(|_| ())
```
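For context, here is a minimal self-contained sketch (independent of the crate, names are illustrative) showing that the suggested `.map(|_| ())` form behaves the same as the `let _ = ...; Ok(())` version above:
```rust
// Consume a fallible iterator, keeping only the first error (if any).
fn validate_fallible_iterator<T, E>(
    mut it: impl Iterator<Item = Result<T, E>>,
) -> Result<(), E> {
    // Suggested form: `transpose` turns Option<Result<T, E>> into Result<Option<T>, E>,
    // and `map(|_| ())` discards the Ok payload.
    it.find(Result::is_err).transpose().map(|_| ())
}

fn main() {
    let ok: Vec<Result<u32, String>> = vec![Ok(1), Ok(2)];
    let bad: Vec<Result<u32, String>> = vec![Ok(1), Err("boom".to_string()), Ok(3)];

    assert!(validate_fallible_iterator(ok.into_iter()).is_ok());
    assert_eq!(
        validate_fallible_iterator(bad.into_iter()),
        Err("boom".to_string())
    );
}
```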
##########
parquet-variant/src/variant.rs:
##########
@@ -223,113 +193,79 @@ impl<'m> VariantMetadata<'m> {
pub fn dictionary_size(&self) -> usize {
self.dict_size
}
+
+ /// The variant protocol version
pub fn version(&self) -> u8 {
self.header.version
}
- /// Helper method to get the offset start and end range for a key by index.
-    fn get_offsets_for_key_by(&self, index: usize) -> Result<Range<usize>, ArrowError> {
- if index >= self.dict_size {
- return Err(ArrowError::InvalidArgumentError(format!(
- "Index {} out of bounds for dictionary of length {}",
- index, self.dict_size
- )));
- }
-
-        // Skipping the header byte (setting byte_offset = 1) and the dictionary_size (setting offset_index +1)
-        let unpack = |i| self.header.offset_size.unpack_usize(self.bytes, 1, i + 1);
- Ok(unpack(index)?..unpack(index + 1)?)
- }
-
- /// Get a single offset by index
- pub fn get_offset_by(&self, index: usize) -> Result<usize, ArrowError> {
- if index >= self.dict_size {
- return Err(ArrowError::InvalidArgumentError(format!(
- "Index {} out of bounds for dictionary of length {}",
- index, self.dict_size
- )));
- }
-
+ /// Gets an offset array entry by index.
+ fn get_offset(&self, i: usize) -> Result<usize, ArrowError> {
        // Skipping the header byte (setting byte_offset = 1) and the dictionary_size (setting offset_index +1)
-        let unpack = |i| self.header.offset_size.unpack_usize(self.bytes, 1, i + 1);
- unpack(index)
+        let bytes = slice_from_slice(self.bytes, ..self.dictionary_key_start_byte)?;
+ self.header.offset_size.unpack_usize(bytes, 1, i + 1)
}
- /// Get the key-name by index
- pub fn get_field_by(&self, index: usize) -> Result<&'m str, ArrowError> {
- let offset_range = self.get_offsets_for_key_by(index)?;
- self.get_field_by_offset(offset_range)
+ /// Gets a dictionary entry by index
+ pub fn get(&self, i: usize) -> Result<&'m str, ArrowError> {
+        let dictionary_keys_bytes = slice_from_slice(self.bytes, self.dictionary_key_start_byte..)?;
+ let byte_range = self.get_offset(i)?..self.get_offset(i + 1)?;
+ string_from_slice(dictionary_keys_bytes, byte_range)
Review Comment:
Given we have already validated the strings on construction, this could in theory be an unchecked conversion (no need to revalidate utf8). This is a future potential optimization.
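A hypothetical sketch of what that future optimization might look like (the helper name is illustrative, not part of this PR):
```rust
/// Hypothetical helper: skip UTF-8 revalidation when the bytes were already
/// validated up front (e.g. by `VariantMetadata::try_new`).
///
/// # Safety
/// The caller must guarantee that `bytes` is valid UTF-8.
unsafe fn str_from_prevalidated_bytes(bytes: &[u8]) -> &str {
    std::str::from_utf8_unchecked(bytes)
}
```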
##########
parquet-variant/src/variant.rs:
##########
@@ -367,150 +297,129 @@ impl VariantObjectHeader {
value.len()
)));
}
- Ok(Self {
- field_offset_size,
- field_id_size,
+
+ let s = Self {
+ metadata,
+ value,
+ header,
num_elements,
field_ids_start_byte,
field_offsets_start_byte,
values_start_byte,
- })
+ };
+
+        // Verify that `iter` can safely `unwrap` the items produced by this iterator.
+ //
+        // This has the side effect of validating the field_id and field_offset arrays, and also
+ // proves the field values are all in bounds.
Review Comment:
```suggestion
        // Verify the contents of the object: validate that the field_id and field_offset arrays
        // are within bounds.
```
##########
parquet-variant/src/variant.rs:
##########
@@ -1527,26 +1372,21 @@ mod tests {
let md = VariantMetadata::try_new(bytes).expect("should parse");
assert_eq!(md.dictionary_size(), 2);
// Fields
- assert_eq!(md.get_field_by(0).unwrap(), "cat");
- assert_eq!(md.get_field_by(1).unwrap(), "dog");
+ assert_eq!(md.get(0).unwrap(), "cat");
+ assert_eq!(md.get(1).unwrap(), "dog");
// Offsets
- assert_eq!(md.get_offset_by(0).unwrap(), 0x00);
- assert_eq!(md.get_offset_by(1).unwrap(), 0x03);
-        // We only have 2 keys, the final offset should not be accessible using this method.
- let err = md.get_offset_by(2).unwrap_err();
+ assert_eq!(md.get_offset(0).unwrap(), 0x00);
+ assert_eq!(md.get_offset(1).unwrap(), 0x03);
+ assert_eq!(md.get_offset(2).unwrap(), 0x06);
+ let err = md.get_offset(3);
assert!(
- matches!(err, ArrowError::InvalidArgumentError(ref msg)
-                if msg.contains("Index 2 out of bounds for dictionary of length 2")),
- "unexpected error: {err:?}"
+ matches!(err, Err(ArrowError::InvalidArgumentError(_))),
Review Comment:
I know you are just following the pattern in the existing codebase, but one pattern I like for checking errors is:
```rust
let err = md.get_offset(3).unwrap_err(); // panics if not an Err
assert!(matches!(err, ArrowError::InvalidArgumentError(_)));
```
##########
parquet-variant/src/variant.rs:
##########
@@ -154,64 +152,36 @@ impl<'m> VariantMetadata<'m> {
}
pub fn try_new(bytes: &'m [u8]) -> Result<Self, ArrowError> {
Review Comment:
I think it would be useful to document the validation / expectations a bit on the `VariantMetadata` structure and this `try_new` function. Something like:
```suggestion
    /// Create a new `VariantMetadata` header from the provided metadata bytes
///
/// # Validation
/// This method validates that `bytes` contains valid metadata
    /// including checking that all offsets are within bounds and
/// point to valid utf8 strings
pub fn try_new(bytes: &'m [u8]) -> Result<Self, ArrowError> {
```
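For illustration, a hedged sketch of how that documented validation surfaces at a call site. The byte layout is assumed from the "cat"/"dog" test elsewhere in this PR (header byte, 1-byte dictionary_size of 2, offsets [0, 3, 6], then the bytes of "catdog"), and the `use` paths assume the types are re-exported at the crate roots:
```rust
use arrow_schema::ArrowError;
use parquet_variant::VariantMetadata;

fn demo() -> Result<(), ArrowError> {
    // header 0x01 = version 1, unsorted, 1-byte offsets (assumed encoding)
    let bytes = [0x01, 0x02, 0x00, 0x03, 0x06, b'c', b'a', b't', b'd', b'o', b'g'];

    // Offsets and string bytes are validated here, up front.
    let md = VariantMetadata::try_new(&bytes)?;
    assert_eq!(md.get(1)?, "dog");

    // A truncated buffer fails in `try_new`, not later in `get` / `iter`.
    assert!(VariantMetadata::try_new(&bytes[..bytes.len() - 2]).is_err());
    Ok(())
}
```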
##########
parquet-variant/src/variant.rs:
##########
@@ -223,113 +193,79 @@ impl<'m> VariantMetadata<'m> {
pub fn dictionary_size(&self) -> usize {
self.dict_size
}
+
+ /// The variant protocol version
pub fn version(&self) -> u8 {
self.header.version
}
- /// Helper method to get the offset start and end range for a key by index.
-    fn get_offsets_for_key_by(&self, index: usize) -> Result<Range<usize>, ArrowError> {
- if index >= self.dict_size {
- return Err(ArrowError::InvalidArgumentError(format!(
- "Index {} out of bounds for dictionary of length {}",
- index, self.dict_size
- )));
- }
-
-        // Skipping the header byte (setting byte_offset = 1) and the dictionary_size (setting offset_index +1)
-        let unpack = |i| self.header.offset_size.unpack_usize(self.bytes, 1, i + 1);
- Ok(unpack(index)?..unpack(index + 1)?)
- }
-
- /// Get a single offset by index
- pub fn get_offset_by(&self, index: usize) -> Result<usize, ArrowError> {
- if index >= self.dict_size {
- return Err(ArrowError::InvalidArgumentError(format!(
- "Index {} out of bounds for dictionary of length {}",
- index, self.dict_size
- )));
- }
-
+ /// Gets an offset array entry by index.
+ fn get_offset(&self, i: usize) -> Result<usize, ArrowError> {
        // Skipping the header byte (setting byte_offset = 1) and the dictionary_size (setting offset_index +1)
-        let unpack = |i| self.header.offset_size.unpack_usize(self.bytes, 1, i + 1);
- unpack(index)
+        let bytes = slice_from_slice(self.bytes, ..self.dictionary_key_start_byte)?;
+ self.header.offset_size.unpack_usize(bytes, 1, i + 1)
}
- /// Get the key-name by index
- pub fn get_field_by(&self, index: usize) -> Result<&'m str, ArrowError> {
- let offset_range = self.get_offsets_for_key_by(index)?;
- self.get_field_by_offset(offset_range)
+ /// Gets a dictionary entry by index
+ pub fn get(&self, i: usize) -> Result<&'m str, ArrowError> {
+        let dictionary_keys_bytes = slice_from_slice(self.bytes, self.dictionary_key_start_byte..)?;
+ let byte_range = self.get_offset(i)?..self.get_offset(i + 1)?;
+ string_from_slice(dictionary_keys_bytes, byte_range)
}
-    /// Gets the field using an offset (Range) - helper method to keep consistent API.
-    pub(crate) fn get_field_by_offset(&self, offset: Range<usize>) -> Result<&'m str, ArrowError> {
- let dictionary_keys_bytes =
-            slice_from_slice(self.bytes, self.dictionary_key_start_byte..self.bytes.len())?;
- let result = string_from_slice(dictionary_keys_bytes, offset)?;
-
- Ok(result)
+ /// Get all key-names as an Iterator of strings
Review Comment:
```suggestion
/// Get all dictionary entries as an Iterator of strings
///
    /// The entries are returned in dictionary order -- this iterator returns
    /// the same thing as calling [`Self::get`] for the indexes `0..self.dictionary_size()`
```
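A tiny sketch of that equivalence, reusing the hypothetical `md` from the `try_new` example above:
```rust
let from_iter: Vec<&str> = md.iter().collect();
let from_get: Vec<&str> = (0..md.dictionary_size())
    .map(|i| md.get(i).unwrap())
    .collect();
assert_eq!(from_iter, from_get); // ["cat", "dog"], in dictionary order
```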
##########
parquet-variant/src/variant.rs:
##########
@@ -223,113 +193,79 @@ impl<'m> VariantMetadata<'m> {
pub fn dictionary_size(&self) -> usize {
self.dict_size
}
+
+ /// The variant protocol version
pub fn version(&self) -> u8 {
self.header.version
}
- /// Helper method to get the offset start and end range for a key by index.
-    fn get_offsets_for_key_by(&self, index: usize) -> Result<Range<usize>, ArrowError> {
- if index >= self.dict_size {
- return Err(ArrowError::InvalidArgumentError(format!(
- "Index {} out of bounds for dictionary of length {}",
- index, self.dict_size
- )));
- }
-
-        // Skipping the header byte (setting byte_offset = 1) and the dictionary_size (setting offset_index +1)
-        let unpack = |i| self.header.offset_size.unpack_usize(self.bytes, 1, i + 1);
- Ok(unpack(index)?..unpack(index + 1)?)
- }
-
- /// Get a single offset by index
- pub fn get_offset_by(&self, index: usize) -> Result<usize, ArrowError> {
- if index >= self.dict_size {
- return Err(ArrowError::InvalidArgumentError(format!(
- "Index {} out of bounds for dictionary of length {}",
- index, self.dict_size
- )));
- }
-
+ /// Gets an offset array entry by index.
+ fn get_offset(&self, i: usize) -> Result<usize, ArrowError> {
        // Skipping the header byte (setting byte_offset = 1) and the dictionary_size (setting offset_index +1)
-        let unpack = |i| self.header.offset_size.unpack_usize(self.bytes, 1, i + 1);
- unpack(index)
+        let bytes = slice_from_slice(self.bytes, ..self.dictionary_key_start_byte)?;
+ self.header.offset_size.unpack_usize(bytes, 1, i + 1)
}
- /// Get the key-name by index
- pub fn get_field_by(&self, index: usize) -> Result<&'m str, ArrowError> {
- let offset_range = self.get_offsets_for_key_by(index)?;
- self.get_field_by_offset(offset_range)
+ /// Gets a dictionary entry by index
+ pub fn get(&self, i: usize) -> Result<&'m str, ArrowError> {
+        let dictionary_keys_bytes = slice_from_slice(self.bytes, self.dictionary_key_start_byte..)?;
+ let byte_range = self.get_offset(i)?..self.get_offset(i + 1)?;
+ string_from_slice(dictionary_keys_bytes, byte_range)
}
-    /// Gets the field using an offset (Range) - helper method to keep consistent API.
-    pub(crate) fn get_field_by_offset(&self, offset: Range<usize>) -> Result<&'m str, ArrowError> {
- let dictionary_keys_bytes =
-            slice_from_slice(self.bytes, self.dictionary_key_start_byte..self.bytes.len())?;
- let result = string_from_slice(dictionary_keys_bytes, offset)?;
-
- Ok(result)
+ /// Get all key-names as an Iterator of strings
+ pub fn iter(&self) -> impl Iterator<Item = &'m str> + '_ {
+ self.iter_checked().map(Result::unwrap)
}
- #[allow(unused)]
- pub(crate) fn header(&self) -> VariantMetadataHeader {
- self.header
+    // Fallible iteration over the fields of this dictionary. The constructor traverses the iterator
+    // to prove it has no errors, so that all other use sites can blindly `unwrap` the result.
+    fn iter_checked(&self) -> impl Iterator<Item = Result<&'m str, ArrowError>> + '_ {
+ (0..self.dict_size).map(move |i| self.get(i))
}
+}
- /// Get the offsets as an iterator
-    pub fn offsets(&self) -> impl Iterator<Item = Result<Range<usize>, ArrowError>> + 'm {
- let offset_size = self.header.offset_size; // `Copy`
- let bytes = self.bytes;
+#[derive(Clone, Debug, PartialEq)]
+pub(crate) struct VariantObjectHeader {
+ field_offset_size: OffsetSizeBytes,
+ field_id_size: OffsetSizeBytes,
+ is_large: bool,
+}
- (0..self.dict_size).map(move |i| {
-            // This wont be out of bounds as long as dict_size and offsets have been validated
-            // during construction via `try_new`, as it calls unpack_usize for the
- // indices `1..dict_size+1` already.
- let start = offset_size.unpack_usize(bytes, 1, i + 1);
- let end = offset_size.unpack_usize(bytes, 1, i + 2);
+impl VariantObjectHeader {
+ pub(crate) fn try_new(header_byte: u8) -> Result<Self, ArrowError> {
+ // Parse the header byte to get object parameters
+ let value_header = header_byte >> 2;
+ let field_offset_size_minus_one = value_header & 0x03; // Last 2 bits
+        let field_id_size_minus_one = (value_header >> 2) & 0x03; // Next 2 bits
+ let is_large = (value_header & 0x10) != 0; // 5th bit
- match (start, end) {
- (Ok(s), Ok(e)) => Ok(s..e),
- (Err(e), _) | (_, Err(e)) => Err(e),
- }
+ Ok(Self {
+            field_offset_size: OffsetSizeBytes::try_new(field_offset_size_minus_one)?,
+ field_id_size: OffsetSizeBytes::try_new(field_id_size_minus_one)?,
+ is_large,
})
}
-
- /// Get all key-names as an Iterator of strings
- pub fn fields(
- &'m self,
-    ) -> Result<impl Iterator<Item = Result<&'m str, ArrowError>>, ArrowError> {
- let iterator = self
- .offsets()
- .map(move |offset_range| self.get_field_by_offset(offset_range?));
- Ok(iterator)
- }
}
#[derive(Clone, Debug, PartialEq)]
-pub(crate) struct VariantObjectHeader {
- field_offset_size: OffsetSizeBytes,
- field_id_size: OffsetSizeBytes,
+pub struct VariantObject<'m, 'v> {
+ pub metadata: VariantMetadata<'m>,
+ pub value: &'v [u8],
+ header: VariantObjectHeader,
num_elements: usize,
field_ids_start_byte: usize,
field_offsets_start_byte: usize,
values_start_byte: usize,
}
-impl VariantObjectHeader {
- pub(crate) fn try_new(value: &[u8]) -> Result<Self, ArrowError> {
- // Parse the header byte to get object parameters
- let header = first_byte_from_slice(value)?;
- let value_header = header >> 2;
-
- let field_offset_size_minus_one = value_header & 0x03; // Last 2 bits
-        let field_id_size_minus_one = (value_header >> 2) & 0x03; // Next 2 bits
- let is_large = value_header & 0x10; // 5th bit
-
-        let field_offset_size = OffsetSizeBytes::try_new(field_offset_size_minus_one)?;
- let field_id_size = OffsetSizeBytes::try_new(field_id_size_minus_one)?;
+impl<'m, 'v> VariantObject<'m, 'v> {
+    pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Result<Self, ArrowError> {
Review Comment:
Same thing here -- I think it would be useful to add comments about what happens with validation.
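One possible wording, loosely mirroring the `VariantMetadata::try_new` suggestion above (a sketch, not a prescribed diff):
```rust
    /// Create a new `VariantObject` from `metadata` and `value` bytes
    ///
    /// # Validation
    /// This method validates the object header and walks every field once:
    /// field ids must resolve to metadata dictionary entries and field offsets
    /// must point at value bytes that parse as variants, so that `field`,
    /// `field_name` and `iter` can later unwrap their results.
    pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Result<Self, ArrowError> {
```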
##########
parquet-variant/src/variant.rs:
##########
@@ -367,150 +297,129 @@ impl VariantObjectHeader {
value.len()
)));
}
- Ok(Self {
- field_offset_size,
- field_id_size,
+
+ let s = Self {
+ metadata,
+ value,
+ header,
num_elements,
field_ids_start_byte,
field_offsets_start_byte,
values_start_byte,
- })
+ };
+
+        // Verify that `iter` can safely `unwrap` the items produced by this iterator.
+        //
+        // This has the side effect of validating the field_id and field_offset arrays, and also
+ // proves the field values are all in bounds.
+ validate_fallible_iterator(s.iter_checked())?;
+ Ok(s)
}
/// Returns the number of key-value pairs in this object
- pub(crate) fn num_elements(&self) -> usize {
+ pub fn len(&self) -> usize {
self.num_elements
}
-}
-#[derive(Clone, Debug, PartialEq)]
-pub struct VariantObject<'m, 'v> {
- pub metadata: VariantMetadata<'m>,
- pub value: &'v [u8],
- header: VariantObjectHeader,
-}
+ /// Returns true if the object contains no key-value pairs
+ pub fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
-impl<'m, 'v> VariantObject<'m, 'v> {
-    pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Result<Self, ArrowError> {
- Ok(Self {
- metadata,
- value,
- header: VariantObjectHeader::try_new(value)?,
- })
+ /// Get a field's value by index in `0..self.len()`
+ pub fn field(&self, i: usize) -> Result<Variant<'m, 'v>, ArrowError> {
+ let start_offset = self.header.field_offset_size.unpack_usize(
+ self.value,
+ self.field_offsets_start_byte,
+ i,
+ )?;
+        let value_bytes = slice_from_slice(self.value, self.values_start_byte + start_offset..)?;
+ Variant::try_new_with_metadata(self.metadata, value_bytes)
}
- /// Returns the number of key-value pairs in this object
- pub fn len(&self) -> usize {
- self.header.num_elements()
+ /// Get a field's name by index in `0..self.len()`
+ pub fn field_name(&self, i: usize) -> Result<&'m str, ArrowError> {
+ let field_id =
+ self.header
+ .field_id_size
+ .unpack_usize(self.value, self.field_ids_start_byte, i)?;
+ self.metadata.get(field_id)
}
- /// Returns true if the object contains no key-value pairs
- pub fn is_empty(&self) -> bool {
- self.len() == 0
+    /// Returns an iterator of (name, value) pairs over the fields of this object.
+    pub fn iter(&self) -> impl Iterator<Item = (&'m str, Variant<'m, 'v>)> + '_ {
+ self.iter_checked().map(Result::unwrap)
Review Comment:
```suggestion
        // It is safe to unwrap here as we validated the iterator in the constructor
self.iter_checked().map(Result::unwrap)
```
##########
parquet-variant/src/variant.rs:
##########
@@ -367,150 +297,129 @@ impl VariantObjectHeader {
value.len()
)));
}
- Ok(Self {
- field_offset_size,
- field_id_size,
+
+ let s = Self {
+ metadata,
+ value,
+ header,
num_elements,
field_ids_start_byte,
field_offsets_start_byte,
values_start_byte,
- })
+ };
+
+        // Verify that `iter` can safely `unwrap` the items produced by this iterator.
+        //
+        // This has the side effect of validating the field_id and field_offset arrays, and also
+ // proves the field values are all in bounds.
+ validate_fallible_iterator(s.iter_checked())?;
+ Ok(s)
}
/// Returns the number of key-value pairs in this object
- pub(crate) fn num_elements(&self) -> usize {
+ pub fn len(&self) -> usize {
Review Comment:
💯
##########
parquet-variant/src/variant.rs:
##########
@@ -223,113 +193,79 @@ impl<'m> VariantMetadata<'m> {
pub fn dictionary_size(&self) -> usize {
self.dict_size
}
+
+ /// The variant protocol version
pub fn version(&self) -> u8 {
self.header.version
}
- /// Helper method to get the offset start and end range for a key by index.
-    fn get_offsets_for_key_by(&self, index: usize) -> Result<Range<usize>, ArrowError> {
- if index >= self.dict_size {
- return Err(ArrowError::InvalidArgumentError(format!(
- "Index {} out of bounds for dictionary of length {}",
- index, self.dict_size
- )));
- }
-
-        // Skipping the header byte (setting byte_offset = 1) and the dictionary_size (setting offset_index +1)
-        let unpack = |i| self.header.offset_size.unpack_usize(self.bytes, 1, i + 1);
- Ok(unpack(index)?..unpack(index + 1)?)
- }
-
- /// Get a single offset by index
- pub fn get_offset_by(&self, index: usize) -> Result<usize, ArrowError> {
- if index >= self.dict_size {
- return Err(ArrowError::InvalidArgumentError(format!(
- "Index {} out of bounds for dictionary of length {}",
- index, self.dict_size
- )));
- }
-
+ /// Gets an offset array entry by index.
Review Comment:
I suggest this would be clearer if it also mentioned that the offset returned is an offset into the string dictionary.
```suggestion
/// Gets an offset array entry by index.
///
    /// This offset is a byte offset into the dictionary string bytes. See [`Self::get`] to
    /// retrieve the corresponding dictionary entry.
```
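To make the relationship concrete, a hedged illustration using the "cat"/"dog" dictionary from the tests in this PR (note that `get_offset` is private, so this only compiles inside the module, e.g. in its tests):
```rust
// Consecutive offsets bound the string bytes of one dictionary entry:
// `get(i)` reads bytes get_offset(i)..get_offset(i + 1) of the string section.
assert_eq!(md.get_offset(0).unwrap(), 0); // "cat" starts at byte 0
assert_eq!(md.get_offset(1).unwrap(), 3); // "cat" ends / "dog" starts at byte 3
assert_eq!(md.get_offset(2).unwrap(), 6); // "dog" ends at byte 6
assert_eq!(md.get(0).unwrap(), "cat");
assert_eq!(md.get(1).unwrap(), "dog");
```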
##########
parquet-variant/src/variant.rs:
##########
@@ -367,150 +297,129 @@ impl VariantObjectHeader {
value.len()
)));
}
- Ok(Self {
- field_offset_size,
- field_id_size,
+
+ let s = Self {
+ metadata,
+ value,
+ header,
num_elements,
field_ids_start_byte,
field_offsets_start_byte,
values_start_byte,
- })
+ };
+
+        // Verify that `iter` can safely `unwrap` the items produced by this iterator.
+        //
+        // This has the side effect of validating the field_id and field_offset arrays, and also
+ // proves the field values are all in bounds.
+ validate_fallible_iterator(s.iter_checked())?;
+ Ok(s)
}
/// Returns the number of key-value pairs in this object
- pub(crate) fn num_elements(&self) -> usize {
+ pub fn len(&self) -> usize {
self.num_elements
}
-}
-#[derive(Clone, Debug, PartialEq)]
-pub struct VariantObject<'m, 'v> {
- pub metadata: VariantMetadata<'m>,
- pub value: &'v [u8],
- header: VariantObjectHeader,
-}
+ /// Returns true if the object contains no key-value pairs
+ pub fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
-impl<'m, 'v> VariantObject<'m, 'v> {
-    pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Result<Self, ArrowError> {
- Ok(Self {
- metadata,
- value,
- header: VariantObjectHeader::try_new(value)?,
- })
+ /// Get a field's value by index in `0..self.len()`
+ pub fn field(&self, i: usize) -> Result<Variant<'m, 'v>, ArrowError> {
+ let start_offset = self.header.field_offset_size.unpack_usize(
+ self.value,
+ self.field_offsets_start_byte,
+ i,
+ )?;
+        let value_bytes = slice_from_slice(self.value, self.values_start_byte + start_offset..)?;
+ Variant::try_new_with_metadata(self.metadata, value_bytes)
}
- /// Returns the number of key-value pairs in this object
- pub fn len(&self) -> usize {
- self.header.num_elements()
+ /// Get a field's name by index in `0..self.len()`
+ pub fn field_name(&self, i: usize) -> Result<&'m str, ArrowError> {
+ let field_id =
+ self.header
+ .field_id_size
+ .unpack_usize(self.value, self.field_ids_start_byte, i)?;
+ self.metadata.get(field_id)
}
- /// Returns true if the object contains no key-value pairs
- pub fn is_empty(&self) -> bool {
- self.len() == 0
+    /// Returns an iterator of (name, value) pairs over the fields of this object.
Review Comment:
I like how this API now mirrors the rust collections iterator
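A sketch of what that reads like at a call site (assuming `object` is a `VariantObject` built with `try_new`):
```rust
// Hypothetical call site: iterate fields like a std collections map.
for (name, value) in object.iter() {
    // `name` is a &str from the metadata dictionary; `value` is a `Variant`
    // borrowing the underlying metadata and value buffers.
    let _ = (name, value);
}
```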
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]