klion26 commented on code in PR #8940:
URL: https://github.com/apache/arrow-rs/pull/8940#discussion_r2580030664


##########
parquet-variant-compute/src/shred_variant.rs:
##########
@@ -348,6 +349,139 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> {
     }
 }
 
+fn split_variant_path(path: &str) -> Vec<String> {
+    path.split('.')
+        .filter(|segment| !segment.is_empty())
+        .map(|segment| segment.to_string())
+        .collect()
+}
+
+/// Builder for constructing a variant shredding schema.
+///
+/// The builder pattern makes it easy to incrementally define which fields
+/// should be shredded and with what types.
+///
+/// # Example
+///
+/// ```
+/// use arrow::datatypes::DataType;
+/// use parquet_variant_compute::ShredTypeBuilder;
+///
+/// // Define the shredding schema using the builder
+/// let shredding_type = ShredTypeBuilder::default()
+///     .with_path("time", &DataType::Int64)
+///     .with_path("hostname", &DataType::Utf8)
+///     .with_path("metrics.cpu", &DataType::Float64)
+///     .with_path("metrics.memory", &DataType::Float64)
+///     .build();
+///
+/// // The shredding_type can now be passed to shred_variant:
+/// // let shredded = shred_variant(&input, &shredding_type)?;
+/// ```
+#[derive(Default, Clone)]
+pub struct ShredTypeBuilder {
+    root: VariantSchemaNode,
+}
+
+impl ShredTypeBuilder {
+    /// Create a new empty schema builder.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Insert a typed path into the schema.
+    ///
+    /// The path uses dot notation to specify nested fields.
+    /// For example, "a.b.c" will create a nested structure.
+    ///
+    /// # Arguments
+    ///
+    /// * `path` - The dot-separated path (e.g., "user.name" or "metrics.cpu")
+    /// * `data_type` - The Arrow data type for this field
+    pub fn with_path(mut self, path: &str, data_type: &DataType) -> Self {

Review Comment:
   Do we need to add `nullable` as a parameter here?



##########
parquet-variant-compute/src/shred_variant.rs:
##########
@@ -348,6 +349,139 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> {
     }
 }
 
+fn split_variant_path(path: &str) -> Vec<String> {
+    path.split('.')
+        .filter(|segment| !segment.is_empty())
+        .map(|segment| segment.to_string())
+        .collect()
+}
+
+/// Builder for constructing a variant shredding schema.
+///
+/// The builder pattern makes it easy to incrementally define which fields
+/// should be shredded and with what types.
+///
+/// # Example
+///
+/// ```
+/// use arrow::datatypes::DataType;
+/// use parquet_variant_compute::ShredTypeBuilder;
+///
+/// // Define the shredding schema using the builder
+/// let shredding_type = ShredTypeBuilder::default()
+///     .with_path("time", &DataType::Int64)
+///     .with_path("hostname", &DataType::Utf8)
+///     .with_path("metrics.cpu", &DataType::Float64)
+///     .with_path("metrics.memory", &DataType::Float64)
+///     .build();
+///
+/// // The shredding_type can now be passed to shred_variant:
+/// // let shredded = shred_variant(&input, &shredding_type)?;
+/// ```
+#[derive(Default, Clone)]
+pub struct ShredTypeBuilder {
+    root: VariantSchemaNode,
+}
+
+impl ShredTypeBuilder {
+    /// Create a new empty schema builder.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Insert a typed path into the schema.
+    ///
+    /// The path uses dot notation to specify nested fields.
+    /// For example, "a.b.c" will create a nested structure.
+    ///
+    /// # Arguments
+    ///
+    /// * `path` - The dot-separated path (e.g., "user.name" or "metrics.cpu")
+    /// * `data_type` - The Arrow data type for this field
+    pub fn with_path(mut self, path: &str, data_type: &DataType) -> Self {
+        let segments = split_variant_path(path);
+        self.root.insert_path(&segments, data_type);
+        self
+    }
+
+    /// Build the final [`DataType`].
+    pub fn build(self) -> DataType {
+        let shredding_type = self.root.to_shredding_type();
+        match shredding_type {
+            Some(shredding_type) => shredding_type,
+            None => DataType::Null,
+        }
+    }
+}
+
+/// Internal tree node structure for building variant schemas.
+#[derive(Clone)]
+enum VariantSchemaNode {
+    /// A leaf node with a primitive/scalar type
+    Leaf(DataType),
+    /// An inner node with nested fields
+    Inner(BTreeMap<String, VariantSchemaNode>),
+}
+
+impl Default for VariantSchemaNode {
+    fn default() -> Self {
+        Self::Leaf(DataType::Null)
+    }
+}
+
+impl VariantSchemaNode {
+    /// Insert a path into this node with the given data type.
+    fn insert_path(&mut self, segments: &[String], data_type: &DataType) {
+        let Some((head, tail)) = segments.split_first() else {
+            *self = Self::Leaf(data_type.clone());
+            return;
+        };
+
+        // Ensure this node is an Inner node
+        let children = match self {
+            Self::Inner(children) => children,
+            Self::Leaf(_) => {
+                // Conflicting data type, override with inner node
+                *self = Self::Inner(BTreeMap::new());
+                match self {
+                    Self::Inner(children) => children,
+                    _ => unreachable!(),
+                }
+            }
+        };
+
+        children
+            .entry(head.clone())
+            .or_default()
+            .insert_path(tail, data_type);
+    }
+
+    /// Convert this node to a shredding type.
+    ///
+    /// Returns the [`DataType`] for passing to [`shred_variant`].
+    fn to_shredding_type(&self) -> Option<DataType> {
+        match self {
+            Self::Leaf(data_type) => Some(data_type.clone()),
+            Self::Inner(children) => {
+                let child_fields: Vec<_> = children
+                    .iter()
+                    .filter_map(|(name, child)| child.to_shredding_field(name))
+                    .collect();
+                if child_fields.is_empty() {
+                    None
+                } else {
+                    Some(DataType::Struct(Fields::from(child_fields)))

Review Comment:
   Seems we can construct `DataType::Struct` and primitive types (use "" as the 
path), and no `List` types, not sure if we need to add the information `struct` 
information in the builder name or anywhere.



##########
parquet-variant-compute/src/shred_variant.rs:
##########
@@ -348,6 +349,139 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> {
     }
 }
 
+fn split_variant_path(path: &str) -> Vec<String> {
+    path.split('.')
+        .filter(|segment| !segment.is_empty())
+        .map(|segment| segment.to_string())
+        .collect()
+}
+
+/// Builder for constructing a variant shredding schema.
+///
+/// The builder pattern makes it easy to incrementally define which fields
+/// should be shredded and with what types.
+///
+/// # Example
+///
+/// ```
+/// use arrow::datatypes::DataType;
+/// use parquet_variant_compute::ShredTypeBuilder;
+///
+/// // Define the shredding schema using the builder
+/// let shredding_type = ShredTypeBuilder::default()
+///     .with_path("time", &DataType::Int64)
+///     .with_path("hostname", &DataType::Utf8)
+///     .with_path("metrics.cpu", &DataType::Float64)
+///     .with_path("metrics.memory", &DataType::Float64)
+///     .build();
+///
+/// // The shredding_type can now be passed to shred_variant:
+/// // let shredded = shred_variant(&input, &shredding_type)?;
+/// ```
+#[derive(Default, Clone)]
+pub struct ShredTypeBuilder {
+    root: VariantSchemaNode,
+}
+
+impl ShredTypeBuilder {
+    /// Create a new empty schema builder.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Insert a typed path into the schema.
+    ///
+    /// The path uses dot notation to specify nested fields.
+    /// For example, "a.b.c" will create a nested structure.
+    ///
+    /// # Arguments
+    ///
+    /// * `path` - The dot-separated path (e.g., "user.name" or "metrics.cpu")
+    /// * `data_type` - The Arrow data type for this field
+    pub fn with_path(mut self, path: &str, data_type: &DataType) -> Self {

Review Comment:
   Do we need to add the behavior for the conflict cases, such as the test 
`test_variant_schema_builder_conflicting_path`



##########
parquet-variant-compute/src/shred_variant.rs:
##########
@@ -1244,4 +1378,173 @@ mod tests {
             }
         }
     }
+
+    #[test]
+    fn test_variant_schema_builder_simple() {
+        let shredding_type = ShredTypeBuilder::default()
+            .with_path("a", &DataType::Int64)
+            .with_path("b", &DataType::Float64)
+            .build();
+
+        assert_eq!(
+            shredding_type,
+            DataType::Struct(Fields::from(vec![
+                Field::new("a", DataType::Int64, true),
+                Field::new("b", DataType::Float64, true),
+            ]))
+        );
+    }
+
+    #[test]
+    fn test_variant_schema_builder_nested() {
+        let shredding_type = ShredTypeBuilder::default()
+            .with_path("a", &DataType::Int64)
+            .with_path("b.c", &DataType::Utf8)
+            .with_path("b.d", &DataType::Float64)
+            .build();
+
+        assert_eq!(
+            shredding_type,
+            DataType::Struct(Fields::from(vec![
+                Field::new("a", DataType::Int64, true),
+                Field::new(
+                    "b",
+                    DataType::Struct(Fields::from(vec![
+                        Field::new("c", DataType::Utf8, true),
+                        Field::new("d", DataType::Float64, true),
+                    ])),
+                    true
+                ),
+            ]))
+        );
+    }
+
+    #[test]
+    fn test_variant_schema_builder_with_shred_variant() {
+        let mut builder = VariantArrayBuilder::new(3);
+        builder
+            .new_object()
+            .with_field("time", 1234567890i64)
+            .with_field("hostname", "server1")
+            .with_field("extra", 42)
+            .finish();
+        builder
+            .new_object()
+            .with_field("time", 9876543210i64)
+            .with_field("hostname", "server2")
+            .finish();
+        builder.append_null();
+
+        let input = builder.build();
+
+        let shredding_type = ShredTypeBuilder::default()
+            .with_path("time", &DataType::Int64)
+            .with_path("hostname", &DataType::Utf8)
+            .build();
+
+        let result = shred_variant(&input, &shredding_type).unwrap();
+
+        assert_eq!(
+            result.data_type(),
+            &DataType::Struct(Fields::from(vec![
+                Field::new("metadata", DataType::BinaryView, false),
+                Field::new("value", DataType::BinaryView, true),
+                Field::new(
+                    "typed_value",
+                    DataType::Struct(Fields::from(vec![
+                        Field::new(
+                            "hostname",
+                            DataType::Struct(Fields::from(vec![
+                                Field::new("value", DataType::BinaryView, 
true),
+                                Field::new("typed_value", DataType::Utf8, 
true),
+                            ])),
+                            false,
+                        ),
+                        Field::new(
+                            "time",
+                            DataType::Struct(Fields::from(vec![
+                                Field::new("value", DataType::BinaryView, 
true),
+                                Field::new("typed_value", DataType::Int64, 
true),
+                            ])),
+                            false,
+                        ),
+                    ])),
+                    true,
+                ),
+            ]))
+        );
+
+        assert_eq!(result.len(), 3);
+        assert!(result.typed_value_field().is_some());
+
+        let typed_value = result
+            .typed_value_field()
+            .unwrap()
+            .as_any()
+            .downcast_ref::<arrow::array::StructArray>()
+            .unwrap();
+
+        let time_field =
+            
ShreddedVariantFieldArray::try_new(typed_value.column_by_name("time").unwrap())
+                .unwrap();
+        let hostname_field =
+            
ShreddedVariantFieldArray::try_new(typed_value.column_by_name("hostname").unwrap())
+                .unwrap();
+
+        let time_typed = time_field
+            .typed_value_field()
+            .unwrap()
+            .as_any()
+            .downcast_ref::<Int64Array>()
+            .unwrap();
+        let hostname_typed = hostname_field
+            .typed_value_field()
+            .unwrap()
+            .as_any()
+            .downcast_ref::<arrow::array::StringArray>()
+            .unwrap();
+
+        // Row 0
+        assert!(!result.is_null(0));
+        assert_eq!(time_typed.value(0), 1234567890);
+        assert_eq!(hostname_typed.value(0), "server1");
+
+        // Row 1
+        assert!(!result.is_null(1));
+        assert_eq!(time_typed.value(1), 9876543210);
+        assert_eq!(hostname_typed.value(1), "server2");
+
+        // Row 2
+        assert!(result.is_null(2));
+    }
+
+    #[test]
+    fn test_variant_schema_builder_conflicting_path() {

Review Comment:
   Nice tests!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to