alamb commented on code in PR #7905:
URL: https://github.com/apache/arrow-rs/pull/7905#discussion_r2200958571


##########
parquet-variant-compute/src/from_json.rs:
##########
@@ -135,43 +68,38 @@ mod test {
             None,
         ]);
         let array_ref: ArrayRef = Arc::new(input);
-        let output = batch_json_string_to_variant(&array_ref).unwrap();
+        let variant_array = batch_json_string_to_variant(&array_ref).unwrap();

Review Comment:
   This test uses the new VariantArray APIs to access the relevant fields / value / etc
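   As a rough sketch of that access pattern (assuming `batch_json_string_to_variant` and the `VariantArray` accessors used in this diff are re-exported from the `parquet_variant_compute` crate root):
   ```rust
   use std::sync::Arc;

   use arrow::array::{ArrayRef, AsArray, StringArray};
   use parquet_variant::Variant;
   use parquet_variant_compute::batch_json_string_to_variant;

   #[test]
   fn json_to_variant_access_sketch() {
       let input = StringArray::from(vec![Some("1"), None]);
       let array_ref: ArrayRef = Arc::new(input);
       let variant_array = batch_json_string_to_variant(&array_ref).unwrap();

       // Typed access to the underlying struct children
       let _metadata = variant_array.metadata_field().as_binary_view();
       let _value = variant_array.value_field().as_binary_view();

       // Row-level access yields a `Variant` directly
       assert!(!variant_array.is_null(0));
       assert_eq!(variant_array.value(0), Variant::Int8(1));
       assert!(variant_array.is_null(1));
   }
   ```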



##########
parquet-variant-compute/src/from_json.rs:
##########
@@ -135,43 +68,38 @@ mod test {
             None,
         ]);
         let array_ref: ArrayRef = Arc::new(input);
-        let output = batch_json_string_to_variant(&array_ref).unwrap();
+        let variant_array = batch_json_string_to_variant(&array_ref).unwrap();
 
-        let struct_array = &output;
-        let metadata_array = struct_array
-            .column(0)
-            .as_any()
-            .downcast_ref::<BinaryArray>()
-            .unwrap();
-        let value_array = struct_array
-            .column(1)
-            .as_any()
-            .downcast_ref::<BinaryArray>()
-            .unwrap();
+        let metadata_array = variant_array.metadata_field().as_binary_view();
+        let value_array = variant_array.value_field().as_binary_view();
 
-        assert!(!struct_array.is_null(0));
-        assert!(struct_array.is_null(1));
-        assert!(!struct_array.is_null(2));
-        assert!(!struct_array.is_null(3));
-        assert!(struct_array.is_null(4));
+        // Compare row 0
+        assert!(!variant_array.is_null(0));
+        assert_eq!(variant_array.value(0), Variant::Int8(1));

Review Comment:
   I think `VariantArray::value()`, which returns a `Variant`, is particularly nice to use :bowtie:
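   A small sketch of why that is handy: callers can match on typed values without touching the raw metadata/value bytes at all (the `VariantArray` type here is the one added in this PR; the integer variants of `Variant` are assumed from the parquet-variant crate):
   ```rust
   use parquet_variant::Variant;
   use parquet_variant_compute::VariantArray;

   /// Extract row `i` as an i64 if it holds an integer value, otherwise return None.
   fn row_as_i64(array: &VariantArray, i: usize) -> Option<i64> {
       if array.is_null(i) {
           return None;
       }
       match array.value(i) {
           Variant::Int8(v) => Some(v as i64),
           Variant::Int16(v) => Some(v as i64),
           Variant::Int32(v) => Some(v as i64),
           Variant::Int64(v) => Some(v),
           _ => None,
       }
   }
   ```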



##########
parquet-variant-compute/src/from_json.rs:
##########
@@ -18,109 +18,42 @@
 //! Module for transforming a batch of JSON strings into a batch of Variants represented as
 //! STRUCT<metadata: BINARY, value: BINARY>
 
-use std::sync::Arc;
-
-use arrow::array::{Array, ArrayRef, BinaryArray, BooleanBufferBuilder, StringArray, StructArray};
-use arrow::buffer::{Buffer, NullBuffer, OffsetBuffer, ScalarBuffer};
-use arrow::datatypes::{DataType, Field};
+use crate::{VariantArray, VariantArrayBuilder};
+use arrow::array::{Array, ArrayRef, StringArray};
 use arrow_schema::ArrowError;
 use parquet_variant::VariantBuilder;
 use parquet_variant_json::json_to_variant;
 
-fn variant_arrow_repr() -> DataType {
-    // The subfields are expected to be non-nullable according to the parquet variant spec.
-    let metadata_field = Field::new("metadata", DataType::Binary, false);
-    let value_field = Field::new("value", DataType::Binary, false);
-    let fields = vec![metadata_field, value_field];
-    DataType::Struct(fields.into())
-}
-
 /// Parse a batch of JSON strings into a batch of Variants represented as
 /// STRUCT<metadata: BINARY, value: BINARY> where nulls are preserved. The JSON strings in the input
 /// must be valid.
-pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result<StructArray, ArrowError> {
+pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result<VariantArray, ArrowError> {
     let input_string_array = match input.as_any().downcast_ref::<StringArray>() {
         Some(string_array) => Ok(string_array),
         None => Err(ArrowError::CastError(
             "Expected reference to StringArray as input".into(),
         )),
     }?;
 
-    // Zero-copy builders
-    let mut metadata_buffer: Vec<u8> = Vec::with_capacity(input.len() * 128);
-    let mut metadata_offsets: Vec<i32> = Vec::with_capacity(input.len() + 1);
-    let mut metadata_validity = BooleanBufferBuilder::new(input.len());
-    let mut metadata_current_offset: i32 = 0;
-    metadata_offsets.push(metadata_current_offset);
-
-    let mut value_buffer: Vec<u8> = Vec::with_capacity(input.len() * 128);
-    let mut value_offsets: Vec<i32> = Vec::with_capacity(input.len() + 1);
-    let mut value_validity = BooleanBufferBuilder::new(input.len());
-    let mut value_current_offset: i32 = 0;
-    value_offsets.push(value_current_offset);
-
-    let mut validity = BooleanBufferBuilder::new(input.len());
+    let mut variant_array_builder = VariantArrayBuilder::new(input_string_array.len());
     for i in 0..input.len() {
         if input.is_null(i) {
             // The subfields are expected to be non-nullable according to the parquet variant spec.
-            metadata_validity.append(true);
-            value_validity.append(true);
-            metadata_offsets.push(metadata_current_offset);
-            value_offsets.push(value_current_offset);
-            validity.append(false);
+            variant_array_builder.append_null();
         } else {
             let mut vb = VariantBuilder::new();
             json_to_variant(input_string_array.value(i), &mut vb)?;
             let (metadata, value) = vb.finish();
-            validity.append(true);
-
-            metadata_current_offset += metadata.len() as i32;
-            metadata_buffer.extend(metadata);
-            metadata_offsets.push(metadata_current_offset);
-            metadata_validity.append(true);
-
-            value_current_offset += value.len() as i32;
-            value_buffer.extend(value);
-            value_offsets.push(value_current_offset);
-            value_validity.append(true);
+            variant_array_builder.append_variant_buffers(&metadata, &value);

Review Comment:
   I have ideas on how to make this more efficient by writing directly into the target buffers. I will work on this in a follow-on PR.
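   For reference, a self-contained illustration of the difference that follow-on would make (this is not the parquet-variant API, just the general pattern: today each row is encoded into its own `Vec` and then copied, versus letting the encoder append straight into the output buffer):
   ```rust
   // Stand-in for the per-row variant encoder: it just writes bytes.
   fn encode_row_into(out: &mut Vec<u8>, row: &str) {
       out.extend_from_slice(row.as_bytes());
   }

   fn main() {
       let rows = ["a", "bc", "def"];

       // Current shape: one temporary allocation per row, then a copy into the target.
       let mut copied = Vec::new();
       for row in rows {
           let mut tmp = Vec::new(); // per-row allocation
           encode_row_into(&mut tmp, row);
           copied.extend_from_slice(&tmp); // extra copy
       }

       // Follow-on idea: encode directly into the target buffer.
       let mut direct = Vec::new();
       for row in rows {
           encode_row_into(&mut direct, row); // no temporary, no copy
       }

       assert_eq!(copied, direct);
   }
   ```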



##########
parquet-variant-compute/src/from_json.rs:
##########
@@ -18,109 +18,42 @@
 //! Module for transforming a batch of JSON strings into a batch of Variants represented as
 //! STRUCT<metadata: BINARY, value: BINARY>
 
-use std::sync::Arc;
-
-use arrow::array::{Array, ArrayRef, BinaryArray, BooleanBufferBuilder, StringArray, StructArray};
-use arrow::buffer::{Buffer, NullBuffer, OffsetBuffer, ScalarBuffer};
-use arrow::datatypes::{DataType, Field};
+use crate::{VariantArray, VariantArrayBuilder};
+use arrow::array::{Array, ArrayRef, StringArray};
 use arrow_schema::ArrowError;
 use parquet_variant::VariantBuilder;
 use parquet_variant_json::json_to_variant;
 
-fn variant_arrow_repr() -> DataType {
-    // The subfields are expected to be non-nullable according to the parquet variant spec.
-    let metadata_field = Field::new("metadata", DataType::Binary, false);
-    let value_field = Field::new("value", DataType::Binary, false);
-    let fields = vec![metadata_field, value_field];
-    DataType::Struct(fields.into())
-}
-
 /// Parse a batch of JSON strings into a batch of Variants represented as
 /// STRUCT<metadata: BINARY, value: BINARY> where nulls are preserved. The JSON strings in the input
 /// must be valid.
-pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result<StructArray, ArrowError> {
+pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result<VariantArray, ArrowError> {
     let input_string_array = match input.as_any().downcast_ref::<StringArray>() {
         Some(string_array) => Ok(string_array),
         None => Err(ArrowError::CastError(
             "Expected reference to StringArray as input".into(),
         )),
     }?;
 
-    // Zero-copy builders
-    let mut metadata_buffer: Vec<u8> = Vec::with_capacity(input.len() * 128);
-    let mut metadata_offsets: Vec<i32> = Vec::with_capacity(input.len() + 1);
-    let mut metadata_validity = BooleanBufferBuilder::new(input.len());
-    let mut metadata_current_offset: i32 = 0;
-    metadata_offsets.push(metadata_current_offset);
-
-    let mut value_buffer: Vec<u8> = Vec::with_capacity(input.len() * 128);
-    let mut value_offsets: Vec<i32> = Vec::with_capacity(input.len() + 1);
-    let mut value_validity = BooleanBufferBuilder::new(input.len());
-    let mut value_current_offset: i32 = 0;
-    value_offsets.push(value_current_offset);
-
-    let mut validity = BooleanBufferBuilder::new(input.len());
+    let mut variant_array_builder = VariantArrayBuilder::new(input_string_array.len());

Review Comment:
   This PR refactors the details of creating the output arrays into `VariantArrayBuilder` so it can be reused.
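   A sketch of how another caller might reuse it, using only the calls visible in this diff (`new`, `append_null`, `append_variant_buffers`); the final `build()` step is an assumption here and may be named differently:
   ```rust
   use parquet_variant::VariantBuilder;
   use parquet_variant_compute::{VariantArray, VariantArrayBuilder};

   /// Build a VariantArray of (possibly null) string variants.
   fn strings_to_variants(rows: &[Option<&str>]) -> VariantArray {
       let mut builder = VariantArrayBuilder::new(rows.len());
       for row in rows {
           match row {
               None => builder.append_null(),
               Some(s) => {
                   let mut vb = VariantBuilder::new();
                   vb.append_value(*s); // encode the string as a variant value
                   let (metadata, value) = vb.finish();
                   builder.append_variant_buffers(&metadata, &value);
               }
           }
       }
       builder.build() // assumed finisher returning VariantArray
   }
   ```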



##########
parquet-variant-compute/src/from_json.rs:
##########
@@ -18,109 +18,42 @@
 //! Module for transforming a batch of JSON strings into a batch of Variants represented as
 //! STRUCT<metadata: BINARY, value: BINARY>
 
-use std::sync::Arc;
-
-use arrow::array::{Array, ArrayRef, BinaryArray, BooleanBufferBuilder, StringArray, StructArray};
-use arrow::buffer::{Buffer, NullBuffer, OffsetBuffer, ScalarBuffer};
-use arrow::datatypes::{DataType, Field};
+use crate::{VariantArray, VariantArrayBuilder};
+use arrow::array::{Array, ArrayRef, StringArray};
 use arrow_schema::ArrowError;
 use parquet_variant::VariantBuilder;
 use parquet_variant_json::json_to_variant;
 
-fn variant_arrow_repr() -> DataType {
-    // The subfields are expected to be non-nullable according to the parquet variant spec.
-    let metadata_field = Field::new("metadata", DataType::Binary, false);
-    let value_field = Field::new("value", DataType::Binary, false);
-    let fields = vec![metadata_field, value_field];
-    DataType::Struct(fields.into())
-}
-
 /// Parse a batch of JSON strings into a batch of Variants represented as
 /// STRUCT<metadata: BINARY, value: BINARY> where nulls are preserved. The JSON strings in the input
 /// must be valid.
-pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result<StructArray, ArrowError> {
+pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result<VariantArray, ArrowError> {

Review Comment:
   Change 1: a new `VariantArray` that wraps a `StructArray` and does basic validation.
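   For readers following along, a conceptual sketch of what "wraps a `StructArray` and does basic validation" means (this is not the actual implementation, just the shape of the idea):
   ```rust
   use arrow::array::{Array, StructArray};
   use arrow_schema::{ArrowError, DataType};

   /// Conceptual stand-in for the new wrapper type.
   struct VariantArraySketch {
       inner: StructArray,
   }

   impl VariantArraySketch {
       /// Validate the struct's shape once, at construction time.
       fn try_new(inner: StructArray) -> Result<Self, ArrowError> {
           for name in ["metadata", "value"] {
               let child = inner.column_by_name(name).ok_or_else(|| {
                   ArrowError::InvalidArgumentError(format!("missing `{name}` field"))
               })?;
               if !matches!(
                   child.data_type(),
                   DataType::Binary | DataType::LargeBinary | DataType::BinaryView
               ) {
                   return Err(ArrowError::InvalidArgumentError(format!(
                       "`{name}` must be a binary array, got {}",
                       child.data_type()
                   )));
               }
           }
           Ok(Self { inner })
       }
   }
   ```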


