alamb commented on code in PR #8208:
URL: https://github.com/apache/arrow-rs/pull/8208#discussion_r2295784217
##########
parquet-variant/src/builder.rs:
##########
@@ -431,13 +433,97 @@ impl ValueBuilder {
}
}
+/// A trait for building variant metadata dictionaries, to be used in
conjunction with a
+/// [`ValueBuilder`]. The trait provides methods for managing field names and
their IDs, as well as
+/// rolling back a failed builder operation that might have created new field
ids.
+pub trait MetadataBuilder: std::fmt::Debug {
+ /// Attempts to register a field name, returning the corresponding
(possibly newly-created)
+ /// field id on success. Attempting to register the same field name twice
will _generally_
+ /// produce the same field id both times, but the variant spec does not
actually require it.
+ fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32,
ArrowError>;
+
+ /// Retrieves the field name for a given field id, which must be less than
+ /// [`Self::num_field_names`]. Panics if the field id is out of bounds.
+ fn field_name(&self, field_id: usize) -> &str;
+
+ /// Returns the number of field names stored in this metadata builder. Any
number less than this
+ /// is a valid field id. The builder can be reverted back to this size
later on (discarding any
+ /// newer/higher field ids) by calling [`Self::truncate_field_names`].
+ fn num_field_names(&self) -> usize;
+
+ /// Reverts the field names to a previous size, discarding any newly out
of bounds field ids.
+ fn truncate_field_names(&mut self, new_size: usize);
+}
+
+impl MetadataBuilder for BasicMetadataBuilder {
+ fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32,
ArrowError> {
+ Ok(self.upsert_field_name(field_name))
+ }
+ fn field_name(&self, field_id: usize) -> &str {
+ self.field_name(field_id)
+ }
+ fn num_field_names(&self) -> usize {
+ self.num_field_names()
+ }
+ fn truncate_field_names(&mut self, new_size: usize) {
+ self.field_names.truncate(new_size)
+ }
+}
+
+/// A metadata builder that cannot register new field names, and merely
returns the field id
+/// associated with a known field name. This is useful for variant unshredding
operations, where the
+/// metadata column is fixed and -- per variant shredding spec -- already
contains all field names
+/// from the typed_value column. It is also useful when projecting a subset of
fields from a variant
+/// object value, since the bytes can be copied across directly without
re-encoding their field ids.
+#[derive(Debug)]
+pub struct ReadOnlyMetadataBuilder<'m> {
+ metadata: VariantMetadata<'m>,
+ known_field_names: HashMap<&'m str, u32>,
+}
+
+impl<'m> ReadOnlyMetadataBuilder<'m> {
+ /// Creates a new read-only metadata builder from the given metadata
dictionary.
+ pub fn new(metadata: VariantMetadata<'m>) -> Self {
+ Self {
+ metadata,
+ known_field_names: HashMap::new(),
+ }
+ }
+}
+
+impl MetadataBuilder for ReadOnlyMetadataBuilder<'_> {
+ fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32,
ArrowError> {
+ if let Some(field_id) = self.known_field_names.get(field_name) {
+ return Ok(*field_id);
+ }
+
+ let Some((field_id, field_name)) = self.metadata.get_entry(field_name)
else {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Field name '{field_name}' not found in metadata dictionary"
+ )));
+ };
+
+ self.known_field_names.insert(field_name, field_id);
+ Ok(field_id)
+ }
+ fn field_name(&self, field_id: usize) -> &str {
+ &self.metadata[field_id]
+ }
+ fn num_field_names(&self) -> usize {
+ self.metadata.len()
+ }
+ fn truncate_field_names(&mut self, new_size: usize) {
+ debug_assert_eq!(self.metadata.len(), new_size);
+ }
+}
+
/// Builder for constructing metadata for [`Variant`] values.
///
/// This is used internally by the [`VariantBuilder`] to construct the metadata
///
/// You can use an existing `Vec<u8>` as the metadata buffer by using the
`from` impl.
#[derive(Default, Debug)]
-struct MetadataBuilder {
+struct BasicMetadataBuilder {
Review Comment:
I was thinking that Basic is a bit generic and it is somewhat unclear what
the difference between a "ReadOnly" builder and a "Basic" builder are.
Here are some other ideas for names:
* `WriteableMetadataBuilder` -- can update the metadata (this is my
preference)
* `OwnedMetadataBuilder` --- the builder owns the underlying structures (can
thus can make changes)
##########
parquet-variant/src/builder.rs:
##########
@@ -431,13 +433,97 @@ impl ValueBuilder {
}
}
+/// A trait for building variant metadata dictionaries, to be used in
conjunction with a
+/// [`ValueBuilder`]. The trait provides methods for managing field names and
their IDs, as well as
+/// rolling back a failed builder operation that might have created new field
ids.
+pub trait MetadataBuilder: std::fmt::Debug {
+ /// Attempts to register a field name, returning the corresponding
(possibly newly-created)
+ /// field id on success. Attempting to register the same field name twice
will _generally_
+ /// produce the same field id both times, but the variant spec does not
actually require it.
+ fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32,
ArrowError>;
+
+ /// Retrieves the field name for a given field id, which must be less than
+ /// [`Self::num_field_names`]. Panics if the field id is out of bounds.
+ fn field_name(&self, field_id: usize) -> &str;
+
+ /// Returns the number of field names stored in this metadata builder. Any
number less than this
+ /// is a valid field id. The builder can be reverted back to this size
later on (discarding any
+ /// newer/higher field ids) by calling [`Self::truncate_field_names`].
+ fn num_field_names(&self) -> usize;
+
+ /// Reverts the field names to a previous size, discarding any newly out
of bounds field ids.
+ fn truncate_field_names(&mut self, new_size: usize);
+}
+
+impl MetadataBuilder for BasicMetadataBuilder {
+ fn try_upsert_field_name(&mut self, field_name: &str) -> Result<u32,
ArrowError> {
+ Ok(self.upsert_field_name(field_name))
+ }
+ fn field_name(&self, field_id: usize) -> &str {
+ self.field_name(field_id)
+ }
+ fn num_field_names(&self) -> usize {
+ self.num_field_names()
+ }
+ fn truncate_field_names(&mut self, new_size: usize) {
+ self.field_names.truncate(new_size)
+ }
+}
+
+/// A metadata builder that cannot register new field names, and merely
returns the field id
+/// associated with a known field name. This is useful for variant unshredding
operations, where the
+/// metadata column is fixed and -- per variant shredding spec -- already
contains all field names
+/// from the typed_value column. It is also useful when projecting a subset of
fields from a variant
+/// object value, since the bytes can be copied across directly without
re-encoding their field ids.
+#[derive(Debug)]
+pub struct ReadOnlyMetadataBuilder<'m> {
+ metadata: VariantMetadata<'m>,
+ known_field_names: HashMap<&'m str, u32>,
Review Comment:
it might be good to comment here that this structure does not include all
metadata fields, it is more like a cache of known field
##########
parquet-variant/src/builder.rs:
##########
@@ -589,14 +675,14 @@ enum ParentState<'a> {
Variant {
value_builder: &'a mut ValueBuilder,
saved_value_builder_offset: usize,
- metadata_builder: &'a mut MetadataBuilder,
+ metadata_builder: &'a mut dyn MetadataBuilder,
Review Comment:
using `&dyn` here I think means all accesses now need a level of indirection
(and I think we can see this slowdown of a few percent in the benchmarks)
I wonder if (maybe as a follow on PR) we can consider making parent state
generic and squeezing that performance back. It may also be premature
optimization for now
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]