alamb commented on code in PR #8940:
URL: https://github.com/apache/arrow-rs/pull/8940#discussion_r2586054975


##########
parquet-variant-compute/src/shred_variant.rs:
##########
@@ -992,26 +1125,28 @@ mod tests {
         let input = builder.build();
 
         // Test with schema containing only id field
-        let schema1 = DataType::Struct(Fields::from(vec![Field::new("id", DataType::Int32, true)]));
+        let schema1 = ShredTypeBuilder::default()
+            .with_path("id", &DataType::Int32)
+            .build();
         let result1 = shred_variant(&input, &schema1).unwrap();
         let value_field1 = result1.value_field().unwrap();
         assert!(!value_field1.is_null(0)); // should contain {"age": 25, "score": 95.5}
 
         // Test with schema containing id and age fields
-        let schema2 = DataType::Struct(Fields::from(vec![
-            Field::new("id", DataType::Int32, true),
-            Field::new("age", DataType::Int64, true),
-        ]));
+        let schema2 = ShredTypeBuilder::default()
+            .with_path("id", &DataType::Int32)

Review Comment:
   this certainly is much nicer now



##########
parquet-variant-compute/src/shred_variant.rs:
##########
@@ -348,6 +349,139 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> {
     }
 }
 
+fn split_variant_path(path: &str) -> Vec<String> {

Review Comment:
   You could also use `VariantPath` for this: https://docs.rs/parquet/latest/parquet/variant/struct.VariantPath.html
   
   ```rust
   let path = VariantPath::from(path);
   ```
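   
   For reference, a rough sketch of how `split_variant_path` could lean on `VariantPath` instead of hand-rolled splitting. The iteration and the `VariantPathElement::Field` matching below are assumptions about the current API, so please double-check against the linked docs:
   
   ```rust
   use parquet::variant::{VariantPath, VariantPathElement};
   
   // Sketch only: reuse VariantPath's dot-splitting rather than a custom helper.
   // Assumes VariantPath iterates over VariantPathElement values with a
   // `Field { name }` variant; verify against the docs linked above.
   fn split_variant_path(path: &str) -> Vec<String> {
       VariantPath::from(path)
           .iter()
           .filter_map(|element| match element {
               VariantPathElement::Field { name } => Some(name.to_string()),
               _ => None,
           })
           .collect()
   }
   ```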
   



##########
parquet-variant-compute/src/shred_variant.rs:
##########
@@ -348,6 +349,139 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> {
     }
 }
 
+fn split_variant_path(path: &str) -> Vec<String> {
+    path.split('.')
+        .filter(|segment| !segment.is_empty())
+        .map(|segment| segment.to_string())
+        .collect()
+}
+
+/// Builder for constructing a variant shredding schema.
+///
+/// The builder pattern makes it easy to incrementally define which fields
+/// should be shredded and with what types.
+///
+/// # Example
+///
+/// ```
+/// use arrow::datatypes::DataType;
+/// use parquet_variant_compute::ShredTypeBuilder;
+///
+/// // Define the shredding schema using the builder
+/// let shredding_type = ShredTypeBuilder::default()
+///     .with_path("time", &DataType::Int64)
+///     .with_path("hostname", &DataType::Utf8)
+///     .with_path("metrics.cpu", &DataType::Float64)
+///     .with_path("metrics.memory", &DataType::Float64)
+///     .build();
+///
+/// // The shredding_type can now be passed to shred_variant:
+/// // let shredded = shred_variant(&input, &shredding_type)?;
+/// ```
+#[derive(Default, Clone)]
+pub struct ShredTypeBuilder {
+    root: VariantSchemaNode,
+}
+
+impl ShredTypeBuilder {
+    /// Create a new empty schema builder.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Insert a typed path into the schema.
+    ///
+    /// The path uses dot notation to specify nested fields.
+    /// For example, "a.b.c" will create a nested structure.
+    ///
+    /// # Arguments
+    ///
+    /// * `path` - The dot-separated path (e.g., "user.name" or "metrics.cpu")
+    /// * `data_type` - The Arrow data type for this field
+    pub fn with_path(mut self, path: &str, data_type: &DataType) -> Self {
+        let segments = split_variant_path(path);
+        self.root.insert_path(&segments, data_type);
+        self
+    }
+
+    /// Build the final [`DataType`].
+    pub fn build(self) -> DataType {
+        let shredding_type = self.root.to_shredding_type();
+        match shredding_type {
+            Some(shredding_type) => shredding_type,
+            None => DataType::Null,
+        }
+    }
+}
+
+/// Internal tree node structure for building variant schemas.
+#[derive(Clone)]
+enum VariantSchemaNode {
+    /// A leaf node with a primitive/scalar type
+    Leaf(DataType),
+    /// An inner node with nested fields
+    Inner(BTreeMap<String, VariantSchemaNode>),
+}
+
+impl Default for VariantSchemaNode {
+    fn default() -> Self {
+        Self::Leaf(DataType::Null)
+    }
+}
+
+impl VariantSchemaNode {
+    /// Insert a path into this node with the given data type.
+    fn insert_path(&mut self, segments: &[String], data_type: &DataType) {
+        let Some((head, tail)) = segments.split_first() else {
+            *self = Self::Leaf(data_type.clone());
+            return;
+        };
+
+        // Ensure this node is an Inner node
+        let children = match self {
+            Self::Inner(children) => children,
+            Self::Leaf(_) => {
+                // Conflicting data type, override with inner node
+                *self = Self::Inner(BTreeMap::new());
+                match self {
+                    Self::Inner(children) => children,
+                    _ => unreachable!(),
+                }
+            }
+        };
+
+        children
+            .entry(head.clone())
+            .or_default()
+            .insert_path(tail, data_type);
+    }
+
+    /// Convert this node to a shredding type.
+    ///
+    /// Returns the [`DataType`] for passing to [`shred_variant`].
+    fn to_shredding_type(&self) -> Option<DataType> {
+        match self {
+            Self::Leaf(data_type) => Some(data_type.clone()),
+            Self::Inner(children) => {
+                let child_fields: Vec<_> = children
+                    .iter()
+                    .filter_map(|(name, child)| child.to_shredding_field(name))
+                    .collect();
+                if child_fields.is_empty() {
+                    None
+                } else {
+                    Some(DataType::Struct(Fields::from(child_fields)))

Review Comment:
   I am not sure what you mean by this



##########
parquet-variant-compute/src/shred_variant.rs:
##########
@@ -348,6 +349,139 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> {
     }
 }
 
+fn split_variant_path(path: &str) -> Vec<String> {
+    path.split('.')
+        .filter(|segment| !segment.is_empty())
+        .map(|segment| segment.to_string())
+        .collect()
+}
+
+/// Builder for constructing a variant shredding schema.
+///
+/// The builder pattern makes it easy to incrementally define which fields
+/// should be shredded and with what types.
+///
+/// # Example
+///
+/// ```
+/// use arrow::datatypes::DataType;
+/// use parquet_variant_compute::ShredTypeBuilder;
+///
+/// // Define the shredding schema using the builder
+/// let shredding_type = ShredTypeBuilder::default()
+///     .with_path("time", &DataType::Int64)

Review Comment:
   Maybe it would help to annotate these methods with context (but maybe this is redundant with other docs)
   ```suggestion
   ///     .with_path("time", &DataType::Int64) // store the "time" field as a 
separate Int64 when possible
   ```



##########
parquet-variant-compute/src/shred_variant.rs:
##########
@@ -348,6 +349,139 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> {
     }
 }
 
+fn split_variant_path(path: &str) -> Vec<String> {
+    path.split('.')
+        .filter(|segment| !segment.is_empty())
+        .map(|segment| segment.to_string())
+        .collect()
+}
+
+/// Builder for constructing a variant shredding schema.
+///
+/// The builder pattern makes it easy to incrementally define which fields
+/// should be shredded and with what types.
+///
+/// # Example
+///
+/// ```
+/// use arrow::datatypes::DataType;
+/// use parquet_variant_compute::ShredTypeBuilder;
+///
+/// // Define the shredding schema using the builder
+/// let shredding_type = ShredTypeBuilder::default()
+///     .with_path("time", &DataType::Int64)
+///     .with_path("hostname", &DataType::Utf8)
+///     .with_path("metrics.cpu", &DataType::Float64)
+///     .with_path("metrics.memory", &DataType::Float64)
+///     .build();
+///
+/// // The shredding_type can now be passed to shred_variant:
+/// // let shredded = shred_variant(&input, &shredding_type)?;
+/// ```
+#[derive(Default, Clone)]
+pub struct ShredTypeBuilder {

Review Comment:
   How about calling this `ShreddedSchemaBuilder`, as it builds schemas?



##########
parquet-variant-compute/src/shred_variant.rs:
##########
@@ -348,6 +349,139 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> {
     }
 }
 
+fn split_variant_path(path: &str) -> Vec<String> {
+    path.split('.')
+        .filter(|segment| !segment.is_empty())
+        .map(|segment| segment.to_string())
+        .collect()
+}
+
+/// Builder for constructing a variant shredding schema.
+///
+/// The builder pattern makes it easy to incrementally define which fields
+/// should be shredded and with what types.
+///
+/// # Example
+///
+/// ```
+/// use arrow::datatypes::DataType;
+/// use parquet_variant_compute::ShredTypeBuilder;
+///
+/// // Define the shredding schema using the builder
+/// let shredding_type = ShredTypeBuilder::default()
+///     .with_path("time", &DataType::Int64)
+///     .with_path("hostname", &DataType::Utf8)
+///     .with_path("metrics.cpu", &DataType::Float64)
+///     .with_path("metrics.memory", &DataType::Float64)
+///     .build();
+///
+/// // The shredding_type can now be passed to shred_variant:
+/// // let shredded = shred_variant(&input, &shredding_type)?;
+/// ```
+#[derive(Default, Clone)]
+pub struct ShredTypeBuilder {
+    root: VariantSchemaNode,
+}
+
+impl ShredTypeBuilder {
+    /// Create a new empty schema builder.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Insert a typed path into the schema.
+    ///
+    /// The path uses dot notation to specify nested fields.
+    /// For example, "a.b.c" will create a nested structure.
+    ///
+    /// # Arguments
+    ///
+    /// * `path` - The dot-separated path (e.g., "user.name" or "metrics.cpu")
+    /// * `data_type` - The Arrow data type for this field
+    pub fn with_path(mut self, path: &str, data_type: &DataType) -> Self {

Review Comment:
   Instead of adding a nullable parameter, I suggest passing a `FieldRef` directly (which would avoid the need for `to_shredding_field` as well).
   
   ```rust
       pub fn with_path(mut self, path: &str, field: FieldRef) -> Self {
   ```
   
   I do think the data type is easier to use for simple use cases, though. Maybe we could make some sort of trait or a second method.
   
   ```rust
   trait IntoShreddingField {
     fn to_field_ref(self) -> FieldRef;
   }
   
   impl IntoShreddingField for FieldRef {
   ..
   }
   
   impl IntoShreddingField for &DataType {
   ...
   }
   ```
   
   🤔 
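   
   A slightly more complete sketch of that idea. The `name` parameter on the trait method is an addition here (not in the snippet above) so the `&DataType` impl can name the field, e.g. from the last path segment; all names are illustrative:
   
   ```rust
   use std::sync::Arc;
   use arrow::datatypes::{DataType, Field, FieldRef};
   
   /// Hypothetical conversion trait so `with_path` can accept either a full
   /// `FieldRef` or just a `&DataType` for the simple case.
   trait IntoShreddingField {
       fn to_field_ref(self, name: &str) -> FieldRef;
   }
   
   impl IntoShreddingField for FieldRef {
       fn to_field_ref(self, _name: &str) -> FieldRef {
           // The caller already supplied a complete Field (name, type, nullability).
           self
       }
   }
   
   impl IntoShreddingField for &DataType {
       fn to_field_ref(self, name: &str) -> FieldRef {
           // Default to a nullable field named after the path segment.
           Arc::new(Field::new(name, self.clone(), true))
       }
   }
   
   // `with_path` could then be generic over either form:
   // pub fn with_path(mut self, path: &str, field: impl IntoShreddingField) -> Self { ... }
   ```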



##########
parquet-variant-compute/src/shred_variant.rs:
##########
@@ -348,6 +349,139 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> {
     }
 }
 
+fn split_variant_path(path: &str) -> Vec<String> {
+    path.split('.')
+        .filter(|segment| !segment.is_empty())
+        .map(|segment| segment.to_string())
+        .collect()
+}
+
+/// Builder for constructing a variant shredding schema.
+///
+/// The builder pattern makes it easy to incrementally define which fields
+/// should be shredded and with what types.
+///
+/// # Example
+///
+/// ```
+/// use arrow::datatypes::DataType;
+/// use parquet_variant_compute::ShredTypeBuilder;
+///
+/// // Define the shredding schema using the builder
+/// let shredding_type = ShredTypeBuilder::default()
+///     .with_path("time", &DataType::Int64)
+///     .with_path("hostname", &DataType::Utf8)
+///     .with_path("metrics.cpu", &DataType::Float64)
+///     .with_path("metrics.memory", &DataType::Float64)
+///     .build();
+///
+/// // The shredding_type can now be passed to shred_variant:
+/// // let shredded = shred_variant(&input, &shredding_type)?;

Review Comment:
   I recommend adding a note to `shred_variant` that points to this structure
   
   > "See [ShredTypeBuilder] for building the schema"



##########
parquet-variant-compute/src/shred_variant.rs:
##########
@@ -348,6 +349,139 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> {
     }
 }
 
+fn split_variant_path(path: &str) -> Vec<String> {

Review Comment:
   However, this is not obvious from the docs. I will update them.


