alamb commented on code in PR #7905:
URL: https://github.com/apache/arrow-rs/pull/7905#discussion_r2200958571
##########
parquet-variant-compute/src/from_json.rs:
##########
@@ -135,43 +68,38 @@ mod test {
             None,
         ]);
         let array_ref: ArrayRef = Arc::new(input);
-        let output = batch_json_string_to_variant(&array_ref).unwrap();
+        let variant_array = batch_json_string_to_variant(&array_ref).unwrap();

Review Comment:
   This test uses the new `VariantArray` APIs to access the relevant fields / value / etc.


##########
parquet-variant-compute/src/from_json.rs:
##########
@@ -135,43 +68,38 @@ mod test {
             None,
         ]);
         let array_ref: ArrayRef = Arc::new(input);
-        let output = batch_json_string_to_variant(&array_ref).unwrap();
+        let variant_array = batch_json_string_to_variant(&array_ref).unwrap();
 
-        let struct_array = &output;
-        let metadata_array = struct_array
-            .column(0)
-            .as_any()
-            .downcast_ref::<BinaryArray>()
-            .unwrap();
-        let value_array = struct_array
-            .column(1)
-            .as_any()
-            .downcast_ref::<BinaryArray>()
-            .unwrap();
+        let metadata_array = variant_array.metadata_field().as_binary_view();
+        let value_array = variant_array.value_field().as_binary_view();
 
-        assert!(!struct_array.is_null(0));
-        assert!(struct_array.is_null(1));
-        assert!(!struct_array.is_null(2));
-        assert!(!struct_array.is_null(3));
-        assert!(struct_array.is_null(4));
+        // Compare row 0
+        assert!(!variant_array.is_null(0));
+        assert_eq!(variant_array.value(0), Variant::Int8(1));

Review Comment:
   I think `VariantArray::value()`, which returns a `Variant`, is particularly nice to use :bowtie:
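For readers skimming the thread, the access pattern the new test relies on looks roughly like the sketch below. The `value()` and `is_null()` calls mirror the diff above; the `parquet_variant_compute` import path and the exact input rows are assumptions for illustration, not code from the PR.

```rust
// Sketch, not the PR's test: reads typed values back out of the
// VariantArray produced by batch_json_string_to_variant.
use std::sync::Arc;

use arrow::array::{ArrayRef, StringArray};
use arrow_schema::ArrowError;
use parquet_variant::Variant;
use parquet_variant_compute::batch_json_string_to_variant; // assumed export path

fn main() -> Result<(), ArrowError> {
    // One JSON document and one null input row (chosen for illustration)
    let input: ArrayRef = Arc::new(StringArray::from(vec![Some("1"), None]));
    let variant_array = batch_json_string_to_variant(&input)?;

    // Row 0 comes back as a typed Variant rather than raw metadata/value bytes
    assert_eq!(variant_array.value(0), Variant::Int8(1));
    // Null input rows stay null in the output
    assert!(variant_array.is_null(1));
    Ok(())
}
```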
##########
parquet-variant-compute/src/from_json.rs:
##########
@@ -18,109 +18,42 @@
 //! Module for transforming a batch of JSON strings into a batch of Variants represented as
 //! STRUCT<metadata: BINARY, value: BINARY>
 
-use std::sync::Arc;
-
-use arrow::array::{Array, ArrayRef, BinaryArray, BooleanBufferBuilder, StringArray, StructArray};
-use arrow::buffer::{Buffer, NullBuffer, OffsetBuffer, ScalarBuffer};
-use arrow::datatypes::{DataType, Field};
+use crate::{VariantArray, VariantArrayBuilder};
+use arrow::array::{Array, ArrayRef, StringArray};
 use arrow_schema::ArrowError;
 use parquet_variant::VariantBuilder;
 use parquet_variant_json::json_to_variant;
 
-fn variant_arrow_repr() -> DataType {
-    // The subfields are expected to be non-nullable according to the parquet variant spec.
-    let metadata_field = Field::new("metadata", DataType::Binary, false);
-    let value_field = Field::new("value", DataType::Binary, false);
-    let fields = vec![metadata_field, value_field];
-    DataType::Struct(fields.into())
-}
-
 /// Parse a batch of JSON strings into a batch of Variants represented as
 /// STRUCT<metadata: BINARY, value: BINARY> where nulls are preserved. The JSON strings in the input
 /// must be valid.
-pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result<StructArray, ArrowError> {
+pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result<VariantArray, ArrowError> {
     let input_string_array = match input.as_any().downcast_ref::<StringArray>() {
         Some(string_array) => Ok(string_array),
         None => Err(ArrowError::CastError(
             "Expected reference to StringArray as input".into(),
         )),
     }?;
 
-    // Zero-copy builders
-    let mut metadata_buffer: Vec<u8> = Vec::with_capacity(input.len() * 128);
-    let mut metadata_offsets: Vec<i32> = Vec::with_capacity(input.len() + 1);
-    let mut metadata_validity = BooleanBufferBuilder::new(input.len());
-    let mut metadata_current_offset: i32 = 0;
-    metadata_offsets.push(metadata_current_offset);
-
-    let mut value_buffer: Vec<u8> = Vec::with_capacity(input.len() * 128);
-    let mut value_offsets: Vec<i32> = Vec::with_capacity(input.len() + 1);
-    let mut value_validity = BooleanBufferBuilder::new(input.len());
-    let mut value_current_offset: i32 = 0;
-    value_offsets.push(value_current_offset);
-
-    let mut validity = BooleanBufferBuilder::new(input.len());
+    let mut variant_array_builder = VariantArrayBuilder::new(input_string_array.len());
 
     for i in 0..input.len() {
         if input.is_null(i) {
             // The subfields are expected to be non-nullable according to the parquet variant spec.
-            metadata_validity.append(true);
-            value_validity.append(true);
-            metadata_offsets.push(metadata_current_offset);
-            value_offsets.push(value_current_offset);
-            validity.append(false);
+            variant_array_builder.append_null();
         } else {
             let mut vb = VariantBuilder::new();
             json_to_variant(input_string_array.value(i), &mut vb)?;
             let (metadata, value) = vb.finish();
-            validity.append(true);
-
-            metadata_current_offset += metadata.len() as i32;
-            metadata_buffer.extend(metadata);
-            metadata_offsets.push(metadata_current_offset);
-            metadata_validity.append(true);
-
-            value_current_offset += value.len() as i32;
-            value_buffer.extend(value);
-            value_offsets.push(value_current_offset);
-            value_validity.append(true);
+            variant_array_builder.append_variant_buffers(&metadata, &value);

Review Comment:
   I have ideas on how to make this more efficient by writing directly into the target buffers. I will work on this in a follow-on PR.
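Stepping back, the loop above reduces to a small builder pattern. The sketch below is for orientation only: the `VariantArrayBuilder` method names (`new`, `append_null`, `append_variant_buffers`) come from the diff, the public import path is assumed, and `build()` stands in for whatever the real finishing call is named.

```rust
// Sketch of the builder pattern the new loop follows; not the crate's code.
use arrow_schema::ArrowError;
use parquet_variant::VariantBuilder;
use parquet_variant_compute::{VariantArray, VariantArrayBuilder}; // assumed export path
use parquet_variant_json::json_to_variant;

fn variants_from_json(docs: &[Option<&str>]) -> Result<VariantArray, ArrowError> {
    let mut builder = VariantArrayBuilder::new(docs.len());
    for doc in docs.iter().copied() {
        match doc {
            // Null inputs become null output rows
            None => builder.append_null(),
            Some(json) => {
                // Each row currently gets its own VariantBuilder; the follow-on
                // work mentioned above would write into the target buffers directly.
                let mut vb = VariantBuilder::new();
                json_to_variant(json, &mut vb)?;
                let (metadata, value) = vb.finish();
                builder.append_variant_buffers(&metadata, &value);
            }
        }
    }
    // `build()` is an assumed name for the step that produces the final VariantArray.
    Ok(builder.build())
}
```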
##########
parquet-variant-compute/src/from_json.rs:
##########
@@ -18,109 +18,42 @@
 //! Module for transforming a batch of JSON strings into a batch of Variants represented as
 //! STRUCT<metadata: BINARY, value: BINARY>
 
-use std::sync::Arc;
-
-use arrow::array::{Array, ArrayRef, BinaryArray, BooleanBufferBuilder, StringArray, StructArray};
-use arrow::buffer::{Buffer, NullBuffer, OffsetBuffer, ScalarBuffer};
-use arrow::datatypes::{DataType, Field};
+use crate::{VariantArray, VariantArrayBuilder};
+use arrow::array::{Array, ArrayRef, StringArray};
 use arrow_schema::ArrowError;
 use parquet_variant::VariantBuilder;
 use parquet_variant_json::json_to_variant;
 
-fn variant_arrow_repr() -> DataType {
-    // The subfields are expected to be non-nullable according to the parquet variant spec.
-    let metadata_field = Field::new("metadata", DataType::Binary, false);
-    let value_field = Field::new("value", DataType::Binary, false);
-    let fields = vec![metadata_field, value_field];
-    DataType::Struct(fields.into())
-}
-
 /// Parse a batch of JSON strings into a batch of Variants represented as
 /// STRUCT<metadata: BINARY, value: BINARY> where nulls are preserved. The JSON strings in the input
 /// must be valid.
-pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result<StructArray, ArrowError> {
+pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result<VariantArray, ArrowError> {
     let input_string_array = match input.as_any().downcast_ref::<StringArray>() {
         Some(string_array) => Ok(string_array),
         None => Err(ArrowError::CastError(
             "Expected reference to StringArray as input".into(),
         )),
     }?;
 
-    // Zero-copy builders
-    let mut metadata_buffer: Vec<u8> = Vec::with_capacity(input.len() * 128);
-    let mut metadata_offsets: Vec<i32> = Vec::with_capacity(input.len() + 1);
-    let mut metadata_validity = BooleanBufferBuilder::new(input.len());
-    let mut metadata_current_offset: i32 = 0;
-    metadata_offsets.push(metadata_current_offset);
-
-    let mut value_buffer: Vec<u8> = Vec::with_capacity(input.len() * 128);
-    let mut value_offsets: Vec<i32> = Vec::with_capacity(input.len() + 1);
-    let mut value_validity = BooleanBufferBuilder::new(input.len());
-    let mut value_current_offset: i32 = 0;
-    value_offsets.push(value_current_offset);
-
-    let mut validity = BooleanBufferBuilder::new(input.len());
+    let mut variant_array_builder = VariantArrayBuilder::new(input_string_array.len());

Review Comment:
   This PR refactors the details of creating the output arrays into `VariantArrayBuilder` so it can be reused


##########
parquet-variant-compute/src/from_json.rs:
##########
@@ -18,109 +18,42 @@
 //! Module for transforming a batch of JSON strings into a batch of Variants represented as
 //! STRUCT<metadata: BINARY, value: BINARY>
 
-use std::sync::Arc;
-
-use arrow::array::{Array, ArrayRef, BinaryArray, BooleanBufferBuilder, StringArray, StructArray};
-use arrow::buffer::{Buffer, NullBuffer, OffsetBuffer, ScalarBuffer};
-use arrow::datatypes::{DataType, Field};
+use crate::{VariantArray, VariantArrayBuilder};
+use arrow::array::{Array, ArrayRef, StringArray};
 use arrow_schema::ArrowError;
 use parquet_variant::VariantBuilder;
 use parquet_variant_json::json_to_variant;
 
-fn variant_arrow_repr() -> DataType {
-    // The subfields are expected to be non-nullable according to the parquet variant spec.
-    let metadata_field = Field::new("metadata", DataType::Binary, false);
-    let value_field = Field::new("value", DataType::Binary, false);
-    let fields = vec![metadata_field, value_field];
-    DataType::Struct(fields.into())
-}
-
 /// Parse a batch of JSON strings into a batch of Variants represented as
 /// STRUCT<metadata: BINARY, value: BINARY> where nulls are preserved. The JSON strings in the input
 /// must be valid.
-pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result<StructArray, ArrowError> {
+pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result<VariantArray, ArrowError> {

Review Comment:
   Change 1: a new `VariantArray` that wraps a `StructArray` and does basic validation
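To make "wraps a `StructArray` and does basic validation" concrete, here is a minimal sketch of that shape. It is illustrative only: the struct name, the validation performed, and the accessor signatures are assumptions, not the code added in this PR.

```rust
// Illustrative sketch only; not the PR's VariantArray implementation.
use arrow::array::{Array, ArrayRef, StructArray};
use arrow_schema::ArrowError;

struct VariantArraySketch {
    inner: StructArray,
    metadata: ArrayRef,
    value: ArrayRef,
}

impl VariantArraySketch {
    /// Basic validation: the wrapped struct must expose `metadata` and `value` children.
    fn try_new(inner: StructArray) -> Result<Self, ArrowError> {
        let metadata = inner
            .column_by_name("metadata")
            .ok_or_else(|| ArrowError::InvalidArgumentError("missing metadata field".into()))?
            .clone();
        let value = inner
            .column_by_name("value")
            .ok_or_else(|| ArrowError::InvalidArgumentError("missing value field".into()))?
            .clone();
        Ok(Self { inner, metadata, value })
    }

    /// The metadata child array (returned as a generic ArrayRef in this sketch).
    fn metadata_field(&self) -> &ArrayRef {
        &self.metadata
    }

    /// The value child array (returned as a generic ArrayRef in this sketch).
    fn value_field(&self) -> &ArrayRef {
        &self.value
    }

    /// A row is null when the underlying StructArray marks it null.
    fn is_null(&self, i: usize) -> bool {
        self.inner.is_null(i)
    }
}
```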