alamb commented on code in PR #7884: URL: https://github.com/apache/arrow-rs/pull/7884#discussion_r2193379346
########## parquet-variant-compute/Cargo.toml: ########## @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "parquet-variant-compute" +# This package is still in development and thus the version does +# not follow the versions of the rest of the crates in this repo. +version = "0.1.0" +license = { workspace = true } +description = "Apache Parquet Variant Batch Processing" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +keywords = ["arrow", "parquet", "variant"] +readme = "README.md" +edition = { workspace = true } +# needs a newer version than workspace due to +# rror: `Option::<T>::unwrap` is not yet stable as a const fn +rust-version = "1.83" Review Comment: this rust-version difference might not be needed in this one ########## parquet-variant-compute/Cargo.toml: ########## @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "parquet-variant-compute" +# This package is still in development and thus the version does +# not follow the versions of the rest of the crates in this repo. +version = "0.1.0" +license = { workspace = true } +description = "Apache Parquet Variant Batch Processing" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +keywords = ["arrow", "parquet", "variant"] +readme = "README.md" +edition = { workspace = true } +# needs a newer version than workspace due to +# rror: `Option::<T>::unwrap` is not yet stable as a const fn +rust-version = "1.83" + + +[dependencies] +arrow = { workspace = true } +arrow-schema = { workspace = true } +parquet-variant = { path = "../parquet-variant" } Review Comment: Can you please make this "workspace=true" (otherwise cargo publish gets angsty)? ```suggestion parquet-variant = { workspace = true } ``` ########## parquet-variant-compute/src/to_json.rs: ########## @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Module for transforming a batch of Variants represented as +//! STRUCT<metadata: BINARY, value: BINARY> into a batch of JSON strings. + +use arrow::array::{Array, ArrayRef, BinaryArray, BooleanBufferBuilder, StringArray, StructArray}; +use arrow::buffer::{Buffer, NullBuffer, OffsetBuffer, ScalarBuffer}; +use arrow::datatypes::DataType; +use arrow_schema::ArrowError; +use parquet_variant::{variant_to_json, Variant}; + +pub fn batch_variant_to_json_string(input: &ArrayRef) -> Result<StringArray, ArrowError> { + let struct_array = input + .as_any() + .downcast_ref::<StructArray>() + .ok_or_else(|| ArrowError::CastError("Expected StructArray as input".into()))?; + + // Validate field types + let data_type = struct_array.data_type(); + match data_type { + DataType::Struct(inner_fields) => { + if inner_fields.len() != 2 + || inner_fields[0].data_type() != &DataType::Binary + || inner_fields[1].data_type() != &DataType::Binary + { + return Err(ArrowError::CastError( + "Expected struct with two binary fields".into(), + )); + } + } + _ => { + return Err(ArrowError::CastError( + "Expected StructArray with known fields".into(), + )) + } + } + + let metadata_array = struct_array + .column(0) + .as_any() + .downcast_ref::<BinaryArray>() + .ok_or_else(|| ArrowError::CastError("Expected BinaryArray for 'metadata'".into()))?; + + let value_array = struct_array + .column(1) + .as_any() + .downcast_ref::<BinaryArray>() + .ok_or_else(|| ArrowError::CastError("Expected BinaryArray for 'value'".into()))?; + + // Zero-copy builder + // The size per JSON string is 
assumed to be 128 bytes. If this holds true, resizing could be + // minimized to improve performance. + let mut json_buffer: Vec<u8> = Vec::with_capacity(struct_array.len() * 128); + let mut offsets: Vec<i32> = Vec::with_capacity(struct_array.len() + 1); + let mut validity = BooleanBufferBuilder::new(struct_array.len()); + let mut current_offset: i32 = 0; + offsets.push(current_offset); + + for i in 0..struct_array.len() { + if struct_array.is_null(i) { + validity.append(false); + offsets.push(current_offset); + } else { + let metadata = metadata_array.value(i); + let value = value_array.value(i); + let variant = Variant::new(metadata, value); + let start_len = json_buffer.len(); + variant_to_json(&mut json_buffer, &variant)?; + let written = (json_buffer.len() - start_len) as i32; + current_offset += written; + offsets.push(current_offset); + validity.append(true); + } + } + + let offsets_buffer = OffsetBuffer::new(ScalarBuffer::from(offsets)); + let value_buffer = Buffer::from_vec(json_buffer); + let null_buffer = NullBuffer::new(validity.finish()); + + Ok(StringArray::new( Review Comment: I think if possible we should consider using StringViewArray here instead of StringArray ########## parquet-variant-compute/src/from_json.rs: ########## @@ -0,0 +1,136 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use arrow::array::{ + Array, ArrayRef, BinaryBuilder, BooleanBufferBuilder, StringArray, StructArray, +}; +use arrow::buffer::NullBuffer; +use arrow::datatypes::{DataType, Field}; +use arrow_schema::ArrowError; +use parquet_variant::{json_to_variant, VariantBuilder}; + +fn variant_arrow_repr() -> DataType { + let metadata_field = Field::new("metadata", DataType::Binary, true); + let value_field = Field::new("value", DataType::Binary, true); + let fields = vec![metadata_field, value_field]; + DataType::Struct(fields.into()) +} + +pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result<StructArray, ArrowError> { Review Comment: We should definitely add some docs / example to this kernel I also might suggest calling it `cast_to_variant` but that is more of a personal preference ########## parquet-variant-compute/src/lib.rs: ########## @@ -0,0 +1,27 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +mod from_json; +mod to_json; + +/// Parse a batch of JSON strings into a batch of Variants represented as Review Comment: Typically these comments would go on the function itself, not its `pub use` ########## parquet-variant-compute/Cargo.toml: ########## @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "parquet-variant-compute" +# This package is still in development and thus the version does +# not follow the versions of the rest of the crates in this repo. +version = "0.1.0" +license = { workspace = true } +description = "Apache Parquet Variant Batch Processing" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +keywords = ["arrow", "parquet", "variant"] +readme = "README.md" Review Comment: I think a placeholder that says "part of arrow-rs" and what it contains is probably enough I would basically follow along the model of other crates in the repo ########## parquet-variant-compute/src/from_json.rs: ########## @@ -0,0 +1,136 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use arrow::array::{ + Array, ArrayRef, BinaryBuilder, BooleanBufferBuilder, StringArray, StructArray, +}; +use arrow::buffer::NullBuffer; +use arrow::datatypes::{DataType, Field}; +use arrow_schema::ArrowError; +use parquet_variant::{json_to_variant, VariantBuilder}; + +fn variant_arrow_repr() -> DataType { + let metadata_field = Field::new("metadata", DataType::Binary, true); + let value_field = Field::new("value", DataType::Binary, true); + let fields = vec![metadata_field, value_field]; + DataType::Struct(fields.into()) +} + +pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result<StructArray, ArrowError> { + let input_string_array = match input.as_any().downcast_ref::<StringArray>() { + Some(string_array) => Ok(string_array), + None => Err(ArrowError::CastError( + "Expected reference to StringArray as input".into(), + )), + }?; + + let mut metadata_builder = BinaryBuilder::new(); + let mut value_builder = BinaryBuilder::new(); + let mut validity = BooleanBufferBuilder::new(input.len()); + for i in 0..input.len() { + if input.is_null(i) { + metadata_builder.append_null(); + value_builder.append_null(); + validity.append(false); + } else { + let mut vb = VariantBuilder::new(); Review Comment: This pattern will cause the variant values
to be copied twice -- once into the builder's buffers and then once into the output binary builder, which is probably ok for the first version; With some care I think we will be able to avoid copying the values, though it will take using the lower level APIs (and building offsets directly) ########## parquet-variant-compute/src/to_json.rs: ########## @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Module for transforming a batch of Variants represented as +//! STRUCT<metadata: BINARY, value: BINARY> into a batch of JSON strings. 
+ +use arrow::array::{Array, ArrayRef, BinaryArray, BooleanBufferBuilder, StringArray, StructArray}; +use arrow::buffer::{Buffer, NullBuffer, OffsetBuffer, ScalarBuffer}; +use arrow::datatypes::DataType; +use arrow_schema::ArrowError; +use parquet_variant::{variant_to_json, Variant}; + +pub fn batch_variant_to_json_string(input: &ArrayRef) -> Result<StringArray, ArrowError> { + let struct_array = input + .as_any() + .downcast_ref::<StructArray>() + .ok_or_else(|| ArrowError::CastError("Expected StructArray as input".into()))?; + + // Validate field types + let data_type = struct_array.data_type(); + match data_type { + DataType::Struct(inner_fields) => { + if inner_fields.len() != 2 + || inner_fields[0].data_type() != &DataType::Binary + || inner_fields[1].data_type() != &DataType::Binary + { + return Err(ArrowError::CastError( + "Expected struct with two binary fields".into(), + )); + } + } + _ => { + return Err(ArrowError::CastError( + "Expected StructArray with known fields".into(), + )) + } + } + + let metadata_array = struct_array + .column(0) + .as_any() + .downcast_ref::<BinaryArray>() + .ok_or_else(|| ArrowError::CastError("Expected BinaryArray for 'metadata'".into()))?; + + let value_array = struct_array + .column(1) + .as_any() + .downcast_ref::<BinaryArray>() + .ok_or_else(|| ArrowError::CastError("Expected BinaryArray for 'value'".into()))?; + + // Zero-copy builder + // The size per JSON string is assumed to be 128 bytes. If this holds true, resizing could be + // minimized to improve performance. 
+ let mut json_buffer: Vec<u8> = Vec::with_capacity(struct_array.len() * 128); + let mut offsets: Vec<i32> = Vec::with_capacity(struct_array.len() + 1); + let mut validity = BooleanBufferBuilder::new(struct_array.len()); + let mut current_offset: i32 = 0; + offsets.push(current_offset); + + for i in 0..struct_array.len() { + if struct_array.is_null(i) { + validity.append(false); + offsets.push(current_offset); + } else { + let metadata = metadata_array.value(i); + let value = value_array.value(i); + let variant = Variant::new(metadata, value); + let start_len = json_buffer.len(); + variant_to_json(&mut json_buffer, &variant)?; + let written = (json_buffer.len() - start_len) as i32; + current_offset += written; + offsets.push(current_offset); Review Comment: this looks good to me -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org