This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 42b6c17930 [Variant] Reduce variant-related struct sizes (#7888)
42b6c17930 is described below
commit 42b6c1793000f8017c952966b0080927145bccff
Author: Ryan Johnson <[email protected]>
AuthorDate: Fri Jul 11 05:23:55 2025 -0700
[Variant] Reduce variant-related struct sizes (#7888)
# Which issue does this PR close?
- Closes https://github.com/apache/arrow-rs/issues/7831
# Rationale for this change
Variants naturally work with `u32` offsets, field ids, etc. Widening
them artificially to `usize` on 64-bit architectures causes several
problems:
1. A majority of developers will be using 64-bit architectures, and are
unlikely to think about integer overflow issues when working with
`usize`. But it's actually quite easy for malicious data or buggy code
to overflow the `u32` values that variant actually relies on. Worse, it
becomes difficult, if not impossible, to validate the code's resistance
to 32-bit integer overflow, when manipulating `usize` values on 64-bit
hardware.
2. Related to 1/, casting from `usize` to `u32` can clip the value on
64-bit hardware, which makes it harder to reason about the code's
correctness (always wondering whether the value _might_ be larger than
32-bits can hold). In contrast, casting from `u32` to `usize` is safe in
spite of being fallible in Rust (assumes we do _not_ need to support
16-bit architectures).
3. The variant-related data structures occupy significantly more space
than they need to, when storing (64-bit) `usize` offsets instead of
`u32`.
# What changes are included in this PR?
Store all variant-related offsets as `u32` instead of `usize`. The
`VariantMetadata`, `VariantObject` and `VariantList` structs shrink to
32/64/64 bytes (previously 40/88/80 bytes).
Also, rename `OffsetSizeBytes::unpack_usize[_at_offset]` methods to
`unpack_u32[_at_offset]`, to more accurately reflect what they actually
do now.
# Are these changes tested?
Existing unit tests cover the use of these values; new static assertions
will catch any future size changes.
# Are there any user-facing changes?
`VariantMetadata` is no longer `Copy`, reflecting the fact that this PR
still leaves it 2x larger than a fat pointer.
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
parquet-variant-json/src/from_json.rs | 2 +-
parquet-variant/src/builder.rs | 1 -
parquet-variant/src/decoder.rs | 66 +++++++++++++--------------------
parquet-variant/src/utils.rs | 9 +++++
parquet-variant/src/variant.rs | 3 ++
parquet-variant/src/variant/list.rs | 47 ++++++++++++-----------
parquet-variant/src/variant/metadata.rs | 44 ++++++++++++----------
parquet-variant/src/variant/object.rs | 60 +++++++++++++++---------------
8 files changed, 118 insertions(+), 114 deletions(-)
diff --git a/parquet-variant-json/src/from_json.rs
b/parquet-variant-json/src/from_json.rs
index c091095036..3052bc504d 100644
--- a/parquet-variant-json/src/from_json.rs
+++ b/parquet-variant-json/src/from_json.rs
@@ -165,7 +165,7 @@ mod test {
expected: Variant<'a, 'a>,
}
- impl<'a> JsonToVariantTest<'a> {
+ impl JsonToVariantTest<'_> {
fn run(self) -> Result<(), ArrowError> {
let mut variant_builder = VariantBuilder::new();
json_to_variant(self.json, &mut variant_builder)?;
diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs
index 542065045c..33608d27cb 100644
--- a/parquet-variant/src/builder.rs
+++ b/parquet-variant/src/builder.rs
@@ -1932,7 +1932,6 @@ mod tests {
assert!(metadata.is_empty());
let variant = Variant::try_new_with_metadata(metadata,
&value).unwrap();
- assert!(metadata.is_empty());
assert_eq!(variant, Variant::Int8(42));
}
diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs
index 5a6aab43ff..5d6a064793 100644
--- a/parquet-variant/src/decoder.rs
+++ b/parquet-variant/src/decoder.rs
@@ -22,8 +22,6 @@ use crate::ShortString;
use arrow_schema::ArrowError;
use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, Utc};
-use std::num::TryFromIntError;
-
/// The basic type of a [`Variant`] value, encoded in the first two bits of the
/// header byte.
///
@@ -147,11 +145,9 @@ impl OffsetSizeBytes {
/// * `bytes` – the byte buffer to index
/// * `index` – 0-based index into the buffer
///
- /// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
- /// Three-byte values are zero-extended to 32 bits before the final
- /// fallible cast to `usize`.
- pub(crate) fn unpack_usize(&self, bytes: &[u8], index: usize) ->
Result<usize, ArrowError> {
- self.unpack_usize_at_offset(bytes, 0, index)
+ /// Each value is `self as u32` bytes wide (1, 2, 3 or 4), zero-extended
to 32 bits as needed.
+ pub(crate) fn unpack_u32(&self, bytes: &[u8], index: usize) -> Result<u32,
ArrowError> {
+ self.unpack_u32_at_offset(bytes, 0, index)
}
/// Return one unsigned little-endian value from `bytes`.
@@ -162,15 +158,13 @@ impl OffsetSizeBytes {
/// * `offset_index` – 0-based index **after** the skipped bytes
/// (`0` is the first value, `1` the next, …).
///
- /// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
- /// Three-byte values are zero-extended to 32 bits before the final
- /// fallible cast to `usize`.
- pub(crate) fn unpack_usize_at_offset(
+ /// Each value is `self as u32` bytes wide (1, 2, 3 or 4), zero-extended
to 32 bits as needed.
+ pub(crate) fn unpack_u32_at_offset(
&self,
bytes: &[u8],
byte_offset: usize, // how many bytes to skip
offset_index: usize, // which offset in an array of offsets
- ) -> Result<usize, ArrowError> {
+ ) -> Result<u32, ArrowError> {
use OffsetSizeBytes::*;
// Index into the byte array:
@@ -179,7 +173,7 @@ impl OffsetSizeBytes {
.checked_mul(*self as usize)
.and_then(|n| n.checked_add(byte_offset))
.ok_or_else(|| overflow_error("unpacking offset array value"))?;
- let result = match self {
+ let value = match self {
One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
Three => {
@@ -192,11 +186,7 @@ impl OffsetSizeBytes {
}
Four => u32::from_le_bytes(array_from_slice(bytes, offset)?),
};
-
- // Convert the u32 we extracted to usize (should always succeed on 32-
and 64-bit arch)
- result
- .try_into()
- .map_err(|e: TryFromIntError|
ArrowError::InvalidArgumentError(e.to_string()))
+ Ok(value)
}
}
@@ -518,57 +508,51 @@ mod tests {
}
#[test]
- fn unpack_usize_all_widths() {
+ fn unpack_u32_all_widths() {
// One-byte offsets
let buf_one = [0x01u8, 0xAB, 0xCD];
- assert_eq!(
- OffsetSizeBytes::One.unpack_usize(&buf_one, 0).unwrap(),
- 0x01
- );
- assert_eq!(
- OffsetSizeBytes::One.unpack_usize(&buf_one, 2).unwrap(),
- 0xCD
- );
+ assert_eq!(OffsetSizeBytes::One.unpack_u32(&buf_one, 0).unwrap(),
0x01);
+ assert_eq!(OffsetSizeBytes::One.unpack_u32(&buf_one, 2).unwrap(),
0xCD);
// Two-byte offsets (little-endian 0x1234, 0x5678)
let buf_two = [0x34, 0x12, 0x78, 0x56];
assert_eq!(
- OffsetSizeBytes::Two.unpack_usize(&buf_two, 0).unwrap(),
+ OffsetSizeBytes::Two.unpack_u32(&buf_two, 0).unwrap(),
0x1234
);
assert_eq!(
- OffsetSizeBytes::Two.unpack_usize(&buf_two, 1).unwrap(),
+ OffsetSizeBytes::Two.unpack_u32(&buf_two, 1).unwrap(),
0x5678
);
// Three-byte offsets (0x030201 and 0x0000FF)
let buf_three = [0x01, 0x02, 0x03, 0xFF, 0x00, 0x00];
assert_eq!(
- OffsetSizeBytes::Three.unpack_usize(&buf_three, 0).unwrap(),
+ OffsetSizeBytes::Three.unpack_u32(&buf_three, 0).unwrap(),
0x030201
);
assert_eq!(
- OffsetSizeBytes::Three.unpack_usize(&buf_three, 1).unwrap(),
+ OffsetSizeBytes::Three.unpack_u32(&buf_three, 1).unwrap(),
0x0000FF
);
// Four-byte offsets (0x12345678, 0x90ABCDEF)
let buf_four = [0x78, 0x56, 0x34, 0x12, 0xEF, 0xCD, 0xAB, 0x90];
assert_eq!(
- OffsetSizeBytes::Four.unpack_usize(&buf_four, 0).unwrap(),
+ OffsetSizeBytes::Four.unpack_u32(&buf_four, 0).unwrap(),
0x1234_5678
);
assert_eq!(
- OffsetSizeBytes::Four.unpack_usize(&buf_four, 1).unwrap(),
+ OffsetSizeBytes::Four.unpack_u32(&buf_four, 1).unwrap(),
0x90AB_CDEF
);
}
#[test]
- fn unpack_usize_out_of_bounds() {
+ fn unpack_u32_out_of_bounds() {
let tiny = [0x00u8]; // deliberately too short
- assert!(OffsetSizeBytes::Two.unpack_usize(&tiny, 0).is_err());
- assert!(OffsetSizeBytes::Three.unpack_usize(&tiny, 0).is_err());
+ assert!(OffsetSizeBytes::Two.unpack_u32(&tiny, 0).is_err());
+ assert!(OffsetSizeBytes::Three.unpack_u32(&tiny, 0).is_err());
}
#[test]
@@ -584,20 +568,20 @@ mod tests {
let width = OffsetSizeBytes::Two;
// dictionary_size starts immediately after the header byte
- let dict_size = width.unpack_usize_at_offset(&buf, 1, 0).unwrap();
+ let dict_size = width.unpack_u32_at_offset(&buf, 1, 0).unwrap();
assert_eq!(dict_size, 2);
// offset array immediately follows the dictionary size
- let first = width.unpack_usize_at_offset(&buf, 1, 1).unwrap();
+ let first = width.unpack_u32_at_offset(&buf, 1, 1).unwrap();
assert_eq!(first, 0);
- let second = width.unpack_usize_at_offset(&buf, 1, 2).unwrap();
+ let second = width.unpack_u32_at_offset(&buf, 1, 2).unwrap();
assert_eq!(second, 5);
- let third = width.unpack_usize_at_offset(&buf, 1, 3).unwrap();
+ let third = width.unpack_u32_at_offset(&buf, 1, 3).unwrap();
assert_eq!(third, 9);
- let err = width.unpack_usize_at_offset(&buf, 1, 4);
+ let err = width.unpack_u32_at_offset(&buf, 1, 4);
assert!(err.is_err())
}
}
diff --git a/parquet-variant/src/utils.rs b/parquet-variant/src/utils.rs
index ef402064e9..a9751f0ab6 100644
--- a/parquet-variant/src/utils.rs
+++ b/parquet-variant/src/utils.rs
@@ -122,3 +122,12 @@ where
Some(Err(start))
}
+
+/// Verifies the expected size of type T, for a type that should only grow if
absolutely necessary.
+#[allow(unused)]
+pub(crate) const fn expect_size_of<T>(expected: usize) {
+ let size = std::mem::size_of::<T>();
+ if size != expected {
+ let _ = [""; 0][size];
+ }
+}
diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs
index dd8287fd1c..8138549b1a 100644
--- a/parquet-variant/src/variant.rs
+++ b/parquet-variant/src/variant.rs
@@ -256,6 +256,9 @@ pub enum Variant<'m, 'v> {
List(VariantList<'m, 'v>),
}
+// We don't want this to grow because it could hurt performance of a
frequently-created type.
+const _: () = crate::utils::expect_size_of::<Variant>(80);
+
impl<'m, 'v> Variant<'m, 'v> {
/// Attempts to interpret a metadata and value buffer pair as a new
`Variant`.
///
diff --git a/parquet-variant/src/variant/list.rs
b/parquet-variant/src/variant/list.rs
index 11122190b4..17f87a2e0d 100644
--- a/parquet-variant/src/variant/list.rs
+++ b/parquet-variant/src/variant/list.rs
@@ -23,7 +23,7 @@ use crate::variant::{Variant, VariantMetadata};
use arrow_schema::ArrowError;
// The value header occupies one byte; use a named constant for readability
-const NUM_HEADER_BYTES: usize = 1;
+const NUM_HEADER_BYTES: u32 = 1;
/// A parsed version of the variant array value header byte.
#[derive(Debug, Clone, PartialEq)]
@@ -34,15 +34,15 @@ pub(crate) struct VariantListHeader {
impl VariantListHeader {
// Hide the ugly casting
- const fn num_elements_size(&self) -> usize {
+ const fn num_elements_size(&self) -> u32 {
self.num_elements_size as _
}
- const fn offset_size(&self) -> usize {
+ const fn offset_size(&self) -> u32 {
self.offset_size as _
}
// Avoid materializing this offset, since it's cheaply and safely
computable
- const fn first_offset_byte(&self) -> usize {
+ const fn first_offset_byte(&self) -> u32 {
NUM_HEADER_BYTES + self.num_elements_size()
}
@@ -122,11 +122,14 @@ pub struct VariantList<'m, 'v> {
pub metadata: VariantMetadata<'m>,
pub value: &'v [u8],
header: VariantListHeader,
- num_elements: usize,
- first_value_byte: usize,
+ num_elements: u32,
+ first_value_byte: u32,
validated: bool,
}
+// We don't want this to grow because it could increase the size of `Variant`
and hurt performance.
+const _: () = crate::utils::expect_size_of::<VariantList>(64);
+
impl<'m, 'v> VariantList<'m, 'v> {
/// Attempts to interpret `value` as a variant array value.
///
@@ -157,7 +160,7 @@ impl<'m, 'v> VariantList<'m, 'v> {
let num_elements =
header
.num_elements_size
- .unpack_usize_at_offset(value, NUM_HEADER_BYTES, 0)?;
+ .unpack_u32_at_offset(value, NUM_HEADER_BYTES as _, 0)?;
// (num_elements + 1) * offset_size + first_offset_byte
let first_value_byte = num_elements
@@ -185,10 +188,10 @@ impl<'m, 'v> VariantList<'m, 'v> {
// Use the last offset to upper-bound the value buffer
let last_offset = new_self
- .get_offset(num_elements)?
+ .get_offset(num_elements as _)?
.checked_add(first_value_byte)
.ok_or_else(|| overflow_error("variant array size"))?;
- new_self.value = slice_from_slice(value, ..last_offset)?;
+ new_self.value = slice_from_slice(value, ..last_offset as _)?;
Ok(new_self)
}
@@ -210,7 +213,7 @@ impl<'m, 'v> VariantList<'m, 'v> {
let offset_buffer = slice_from_slice(
self.value,
- self.header.first_offset_byte()..self.first_value_byte,
+ self.header.first_offset_byte() as _..self.first_value_byte as
_,
)?;
let offsets =
@@ -226,7 +229,7 @@ impl<'m, 'v> VariantList<'m, 'v> {
));
}
- let value_buffer = slice_from_slice(self.value,
self.first_value_byte..)?;
+ let value_buffer = slice_from_slice(self.value,
self.first_value_byte as _..)?;
// Validate whether values are valid variant objects
for i in 1..offsets.len() {
@@ -234,7 +237,7 @@ impl<'m, 'v> VariantList<'m, 'v> {
let end_offset = offsets[i];
let value_bytes = slice_from_slice(value_buffer,
start_offset..end_offset)?;
- Variant::try_new_with_metadata(self.metadata, value_bytes)?;
+ Variant::try_new_with_metadata(self.metadata.clone(),
value_bytes)?;
}
self.validated = true;
@@ -244,7 +247,7 @@ impl<'m, 'v> VariantList<'m, 'v> {
/// Return the length of this array
pub fn len(&self) -> usize {
- self.num_elements
+ self.num_elements as _
}
/// Is the array of zero length
@@ -256,7 +259,7 @@ impl<'m, 'v> VariantList<'m, 'v> {
///
/// [invalid]: Self#Validation
pub fn get(&self, index: usize) -> Option<Variant<'m, 'v>> {
- (index < self.num_elements).then(|| {
+ (index < self.len()).then(|| {
self.try_get_with_shallow_validation(index)
.expect("Invalid variant array element")
})
@@ -272,10 +275,10 @@ impl<'m, 'v> VariantList<'m, 'v> {
fn try_get_with_shallow_validation(&self, index: usize) ->
Result<Variant<'m, 'v>, ArrowError> {
// Fetch the value bytes between the two offsets for this index, from
the value array region
// of the byte buffer
- let byte_range = self.get_offset(index)?..self.get_offset(index + 1)?;
+ let byte_range = self.get_offset(index)? as _..self.get_offset(index +
1)? as _;
let value_bytes =
- slice_from_slice_at_offset(self.value, self.first_value_byte,
byte_range)?;
- Variant::try_new_with_metadata_and_shallow_validation(self.metadata,
value_bytes)
+ slice_from_slice_at_offset(self.value, self.first_value_byte as _,
byte_range)?;
+
Variant::try_new_with_metadata_and_shallow_validation(self.metadata.clone(),
value_bytes)
}
/// Iterates over the values of this list. When working with [unvalidated]
input, consider
@@ -297,14 +300,14 @@ impl<'m, 'v> VariantList<'m, 'v> {
fn iter_try_with_shallow_validation(
&self,
) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ {
- (0..self.len()).map(move |i| self.try_get_with_shallow_validation(i))
+ (0..self.len()).map(|i| self.try_get_with_shallow_validation(i))
}
// Attempts to retrieve the ith offset from the offset array region of the
byte buffer.
- fn get_offset(&self, index: usize) -> Result<usize, ArrowError> {
- let byte_range =
self.header.first_offset_byte()..self.first_value_byte;
+ fn get_offset(&self, index: usize) -> Result<u32, ArrowError> {
+ let byte_range = self.header.first_offset_byte() as
_..self.first_value_byte as _;
let offset_bytes = slice_from_slice(self.value, byte_range)?;
- self.header.offset_size.unpack_usize(offset_bytes, index)
+ self.header.offset_size.unpack_u32(offset_bytes, index)
}
}
@@ -623,7 +626,7 @@ mod tests {
expected_num_element_size,
variant_list.header.num_elements_size
);
- assert_eq!(list_size, variant_list.num_elements);
+ assert_eq!(list_size, variant_list.num_elements as usize);
// verify the data in the variant
assert_eq!(list_size, variant_list.len());
diff --git a/parquet-variant/src/variant/metadata.rs
b/parquet-variant/src/variant/metadata.rs
index b50a766869..007122af75 100644
--- a/parquet-variant/src/variant/metadata.rs
+++ b/parquet-variant/src/variant/metadata.rs
@@ -34,16 +34,16 @@ pub(crate) struct VariantMetadataHeader {
const CORRECT_VERSION_VALUE: u8 = 1;
// The metadata header occupies one byte; use a named constant for readability
-const NUM_HEADER_BYTES: usize = 1;
+const NUM_HEADER_BYTES: u32 = 1;
impl VariantMetadataHeader {
// Hide the cast
- const fn offset_size(&self) -> usize {
- self.offset_size as usize
+ const fn offset_size(&self) -> u32 {
+ self.offset_size as u32
}
// Avoid materializing this offset, since it's cheaply and safely
computable
- const fn first_offset_byte(&self) -> usize {
+ const fn first_offset_byte(&self) -> u32 {
NUM_HEADER_BYTES + self.offset_size()
}
@@ -125,15 +125,19 @@ impl VariantMetadataHeader {
///
/// [`Variant`]: crate::Variant
/// [Variant Spec]:
https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding
-#[derive(Debug, Clone, Copy, PartialEq)]
+#[derive(Debug, Clone, PartialEq)]
pub struct VariantMetadata<'m> {
bytes: &'m [u8],
header: VariantMetadataHeader,
- dictionary_size: usize,
- first_value_byte: usize,
+ dictionary_size: u32,
+ first_value_byte: u32,
validated: bool,
}
+// We don't want this to grow because it increases the size of VariantList and
VariantObject, which
+// could increase the size of Variant. All those size increases could hurt
performance.
+const _: () = crate::utils::expect_size_of::<VariantMetadata>(32);
+
impl<'m> VariantMetadata<'m> {
/// Attempts to interpret `bytes` as a variant metadata instance, with
full [validation] of all
/// dictionary entries.
@@ -166,7 +170,7 @@ impl<'m> VariantMetadata<'m> {
let dictionary_size =
header
.offset_size
- .unpack_usize_at_offset(bytes, NUM_HEADER_BYTES, 0)?;
+ .unpack_u32_at_offset(bytes, NUM_HEADER_BYTES as usize, 0)?;
// Calculate the starting offset of the dictionary string bytes.
//
@@ -196,16 +200,16 @@ impl<'m> VariantMetadata<'m> {
// Use the last offset to upper-bound the byte slice
let last_offset = new_self
- .get_offset(dictionary_size)?
+ .get_offset(dictionary_size as _)?
.checked_add(first_value_byte)
.ok_or_else(|| overflow_error("variant metadata size"))?;
- new_self.bytes = slice_from_slice(bytes, ..last_offset)?;
+ new_self.bytes = slice_from_slice(bytes, ..last_offset as _)?;
Ok(new_self)
}
/// The number of metadata dictionary entries
pub fn len(&self) -> usize {
- self.dictionary_size
+ self.dictionary_size()
}
/// True if this metadata dictionary contains no entries
@@ -227,7 +231,7 @@ impl<'m> VariantMetadata<'m> {
if !self.validated {
let offset_bytes = slice_from_slice(
self.bytes,
- self.header.first_offset_byte()..self.first_value_byte,
+ self.header.first_offset_byte() as _..self.first_value_byte as
_,
)?;
let offsets =
@@ -245,7 +249,7 @@ impl<'m> VariantMetadata<'m> {
// Verify the string values in the dictionary are UTF-8 encoded
strings.
let value_buffer =
- string_from_slice(self.bytes, 0,
self.first_value_byte..self.bytes.len())?;
+ string_from_slice(self.bytes, 0, self.first_value_byte as
_..self.bytes.len())?;
if self.header.is_sorted {
// Validate the dictionary values are unique and
lexicographically sorted
@@ -278,7 +282,7 @@ impl<'m> VariantMetadata<'m> {
/// Get the dictionary size
pub const fn dictionary_size(&self) -> usize {
- self.dictionary_size
+ self.dictionary_size as _
}
/// The variant protocol version
@@ -290,10 +294,10 @@ impl<'m> VariantMetadata<'m> {
///
/// This offset is an index into the dictionary, at the boundary between
string `i-1` and string
/// `i`. See [`Self::get`] to retrieve a specific dictionary entry.
- fn get_offset(&self, i: usize) -> Result<usize, ArrowError> {
- let offset_byte_range =
self.header.first_offset_byte()..self.first_value_byte;
+ fn get_offset(&self, i: usize) -> Result<u32, ArrowError> {
+ let offset_byte_range = self.header.first_offset_byte() as
_..self.first_value_byte as _;
let bytes = slice_from_slice(self.bytes, offset_byte_range)?;
- self.header.offset_size.unpack_usize(bytes, i)
+ self.header.offset_size.unpack_u32(bytes, i)
}
/// Attempts to retrieve a dictionary entry by index, failing if out of
bounds or if the
@@ -301,8 +305,8 @@ impl<'m> VariantMetadata<'m> {
///
/// [invalid]: Self#Validation
pub fn get(&self, i: usize) -> Result<&'m str, ArrowError> {
- let byte_range = self.get_offset(i)?..self.get_offset(i + 1)?;
- string_from_slice(self.bytes, self.first_value_byte, byte_range)
+ let byte_range = self.get_offset(i)? as _..self.get_offset(i + 1)? as
_;
+ string_from_slice(self.bytes, self.first_value_byte as _, byte_range)
}
/// Returns an iterator that attempts to visit all dictionary entries,
producing `Err` if the
@@ -310,7 +314,7 @@ impl<'m> VariantMetadata<'m> {
///
/// [invalid]: Self#Validation
pub fn iter_try(&self) -> impl Iterator<Item = Result<&'m str,
ArrowError>> + '_ {
- (0..self.dictionary_size).map(move |i| self.get(i))
+ (0..self.len()).map(|i| self.get(i))
}
/// Iterates over all dictionary entries. When working with [unvalidated]
input, consider
diff --git a/parquet-variant/src/variant/object.rs
b/parquet-variant/src/variant/object.rs
index 36c8f999b2..dd6da08fbe 100644
--- a/parquet-variant/src/variant/object.rs
+++ b/parquet-variant/src/variant/object.rs
@@ -23,7 +23,7 @@ use crate::variant::{Variant, VariantMetadata};
use arrow_schema::ArrowError;
// The value header occupies one byte; use a named constant for readability
-const NUM_HEADER_BYTES: usize = 1;
+const NUM_HEADER_BYTES: u32 = 1;
/// Header structure for [`VariantObject`]
#[derive(Debug, Clone, PartialEq)]
@@ -35,18 +35,18 @@ pub(crate) struct VariantObjectHeader {
impl VariantObjectHeader {
// Hide the ugly casting
- const fn num_elements_size(&self) -> usize {
+ const fn num_elements_size(&self) -> u32 {
self.num_elements_size as _
}
- const fn field_id_size(&self) -> usize {
+ const fn field_id_size(&self) -> u32 {
self.field_id_size as _
}
- const fn field_offset_size(&self) -> usize {
+ const fn field_offset_size(&self) -> u32 {
self.field_offset_size as _
}
// Avoid materializing this offset, since it's cheaply and safely
computable
- const fn field_ids_start_byte(&self) -> usize {
+ const fn field_ids_start_byte(&self) -> u32 {
NUM_HEADER_BYTES + self.num_elements_size()
}
@@ -119,12 +119,15 @@ pub struct VariantObject<'m, 'v> {
pub metadata: VariantMetadata<'m>,
pub value: &'v [u8],
header: VariantObjectHeader,
- num_elements: usize,
- first_field_offset_byte: usize,
- first_value_byte: usize,
+ num_elements: u32,
+ first_field_offset_byte: u32,
+ first_value_byte: u32,
validated: bool,
}
+// We don't want this to grow because it could increase the size of `Variant`
and hurt performance.
+const _: () = crate::utils::expect_size_of::<VariantObject>(64);
+
impl<'m, 'v> VariantObject<'m, 'v> {
pub fn new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self {
Self::try_new_with_shallow_validation(metadata, value).expect("Invalid
variant object")
@@ -156,7 +159,7 @@ impl<'m, 'v> VariantObject<'m, 'v> {
let num_elements =
header
.num_elements_size
- .unpack_usize_at_offset(value, NUM_HEADER_BYTES, 0)?;
+ .unpack_u32_at_offset(value, NUM_HEADER_BYTES as _, 0)?;
// Calculate byte offsets for field offsets and values with overflow
protection, and verify
// they're in bounds
@@ -186,10 +189,10 @@ impl<'m, 'v> VariantObject<'m, 'v> {
// Use it to upper-bound the value bytes, which also verifies that the
field id and field
// offset arrays are in bounds.
let last_offset = new_self
- .get_offset(num_elements)?
+ .get_offset(num_elements as _)?
.checked_add(first_value_byte)
.ok_or_else(|| overflow_error("variant object size"))?;
- new_self.value = slice_from_slice(value, ..last_offset)?;
+ new_self.value = slice_from_slice(value, ..last_offset as _)?;
Ok(new_self)
}
@@ -211,7 +214,7 @@ impl<'m, 'v> VariantObject<'m, 'v> {
let field_id_buffer = slice_from_slice(
self.value,
-
self.header.field_ids_start_byte()..self.first_field_offset_byte,
+ self.header.field_ids_start_byte() as
_..self.first_field_offset_byte as _,
)?;
let field_ids = map_bytes_to_offsets(field_id_buffer,
self.header.field_id_size)
@@ -268,17 +271,17 @@ impl<'m, 'v> VariantObject<'m, 'v> {
// Validate whether values are valid variant objects
let field_offset_buffer = slice_from_slice(
self.value,
- self.first_field_offset_byte..self.first_value_byte,
+ self.first_field_offset_byte as _..self.first_value_byte as _,
)?;
- let num_offsets = field_offset_buffer.len() /
self.header.field_offset_size();
+ let num_offsets = field_offset_buffer.len() /
self.header.field_offset_size() as usize;
- let value_buffer = slice_from_slice(self.value,
self.first_value_byte..)?;
+ let value_buffer = slice_from_slice(self.value,
self.first_value_byte as _..)?;
map_bytes_to_offsets(field_offset_buffer,
self.header.field_offset_size)
.take(num_offsets.saturating_sub(1))
.try_for_each(|offset| {
let value_bytes = slice_from_slice(value_buffer,
offset..)?;
- Variant::try_new_with_metadata(self.metadata,
value_bytes)?;
+ Variant::try_new_with_metadata(self.metadata.clone(),
value_bytes)?;
Ok::<_, ArrowError>(())
})?;
@@ -290,7 +293,7 @@ impl<'m, 'v> VariantObject<'m, 'v> {
/// Returns the number of key-value pairs in this object
pub fn len(&self) -> usize {
- self.num_elements
+ self.num_elements as _
}
/// Returns true if the object contains no key-value pairs
@@ -321,16 +324,16 @@ impl<'m, 'v> VariantObject<'m, 'v> {
// Attempts to retrieve the ith field value from the value region of the
byte buffer; it
// performs only basic (constant-cost) validation.
fn try_field_with_shallow_validation(&self, i: usize) ->
Result<Variant<'m, 'v>, ArrowError> {
- let value_bytes = slice_from_slice(self.value,
self.first_value_byte..)?;
- let value_bytes = slice_from_slice(value_bytes,
self.get_offset(i)?..)?;
- Variant::try_new_with_metadata_and_shallow_validation(self.metadata,
value_bytes)
+ let value_bytes = slice_from_slice(self.value, self.first_value_byte
as _..)?;
+ let value_bytes = slice_from_slice(value_bytes, self.get_offset(i)? as
_..)?;
+
Variant::try_new_with_metadata_and_shallow_validation(self.metadata.clone(),
value_bytes)
}
// Attempts to retrieve the ith offset from the field offset region of the
byte buffer.
- fn get_offset(&self, i: usize) -> Result<usize, ArrowError> {
- let byte_range = self.first_field_offset_byte..self.first_value_byte;
+ fn get_offset(&self, i: usize) -> Result<u32, ArrowError> {
+ let byte_range = self.first_field_offset_byte as
_..self.first_value_byte as _;
let field_offsets = slice_from_slice(self.value, byte_range)?;
- self.header.field_offset_size.unpack_usize(field_offsets, i)
+ self.header.field_offset_size.unpack_u32(field_offsets, i)
}
/// Get a field's name by index in `0..self.len()`
@@ -347,10 +350,10 @@ impl<'m, 'v> VariantObject<'m, 'v> {
/// Fallible version of `field_name`. Returns field name by index,
capturing validation errors
fn try_field_name(&self, i: usize) -> Result<&'m str, ArrowError> {
- let byte_range =
self.header.field_ids_start_byte()..self.first_field_offset_byte;
+ let byte_range = self.header.field_ids_start_byte() as
_..self.first_field_offset_byte as _;
let field_id_bytes = slice_from_slice(self.value, byte_range)?;
- let field_id = self.header.field_id_size.unpack_usize(field_id_bytes,
i)?;
- self.metadata.get(field_id)
+ let field_id = self.header.field_id_size.unpack_u32(field_id_bytes,
i)?;
+ self.metadata.get(field_id as _)
}
/// Returns an iterator of (name, value) pairs over the fields of this
object.
@@ -374,7 +377,7 @@ impl<'m, 'v> VariantObject<'m, 'v> {
fn iter_try_with_shallow_validation(
&self,
) -> impl Iterator<Item = Result<(&'m str, Variant<'m, 'v>), ArrowError>>
+ '_ {
- (0..self.num_elements).map(move |i| {
+ (0..self.len()).map(|i| {
let field = self.try_field_with_shallow_validation(i)?;
Ok((self.try_field_name(i)?, field))
})
@@ -389,8 +392,7 @@ impl<'m, 'v> VariantObject<'m, 'v> {
// NOTE: This does not require a sorted metadata dictionary, because
the variant spec
// requires object field ids to be lexically sorted by their
corresponding string values,
// and probing the dictionary for a field id is always O(1) work.
- let i = try_binary_search_range_by(0..self.num_elements, &name, |i|
self.field_name(i))?
- .ok()?;
+ let i = try_binary_search_range_by(0..self.len(), &name, |i|
self.field_name(i))?.ok()?;
self.field(i)
}