scovich commented on code in PR #8998: URL: https://github.com/apache/arrow-rs/pull/8998#discussion_r2623950092
########## arrow-json/src/reader/variant_array.rs: ########## @@ -0,0 +1,99 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::{Array, StructArray}; +use parquet_variant::{ObjectFieldBuilder, Variant, VariantBuilder, VariantBuilderExt}; +use parquet_variant_compute::VariantArrayBuilder; +use arrow_data::ArrayData; +use arrow_schema::ArrowError; + +use crate::reader::ArrayDecoder; +use crate::reader::tape::{Tape, TapeElement}; + +#[derive(Default)] +pub struct VariantArrayDecoder {} + +impl ArrayDecoder for VariantArrayDecoder { + fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> { + let mut array_builder = VariantArrayBuilder::new(pos.len()); + for p in pos { + let mut builder = VariantBuilder::new(); + variant_from_tape_element(&mut builder, *p, tape)?; + let (metadata, value) = builder.finish(); + array_builder.append_value(Variant::new(&metadata, &value)); + } + let variant_struct_array: StructArray = array_builder.build().into(); + Ok(variant_struct_array.into_data()) + } +} + +fn variant_from_tape_element(builder: &mut impl VariantBuilderExt, mut p: u32, tape: &Tape) -> Result<u32, ArrowError> { + match tape.get(p) { + TapeElement::StartObject(end_idx) => { + let mut object_builder = builder.try_new_object()?; + p += 1; + while p < end_idx { + // Read field name + let field_name = match tape.get(p) { + TapeElement::String(s) => tape.get_string(s), + _ => return Err(tape.error(p, "field name")), + }; + + let mut field_builder = ObjectFieldBuilder::new(field_name, &mut object_builder); + p = tape.next(p, "field value")?; + p = variant_from_tape_element(&mut field_builder, p, tape)?; + } + object_builder.finish(); + } + TapeElement::EndObject(_u32) => unreachable!(), Review Comment: Is it truly unreachable, even for invalid input JSON? Does the tape decoder have enough invariant checking to prove that this can never arise? What about bugs in the decoding process or tape navigation? (perhaps it's better to just return an Err here instead of panicking) ########## arrow-json/src/reader/variant_array.rs: ########## @@ -0,0 +1,99 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::{Array, StructArray}; +use parquet_variant::{ObjectFieldBuilder, Variant, VariantBuilder, VariantBuilderExt}; +use parquet_variant_compute::VariantArrayBuilder; +use arrow_data::ArrayData; +use arrow_schema::ArrowError; + +use crate::reader::ArrayDecoder; +use crate::reader::tape::{Tape, TapeElement}; + +#[derive(Default)] +pub struct VariantArrayDecoder {} + +impl ArrayDecoder for VariantArrayDecoder { + fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> { + let mut array_builder = VariantArrayBuilder::new(pos.len()); + for p in pos { + let mut builder = VariantBuilder::new(); + variant_from_tape_element(&mut builder, *p, tape)?; + let (metadata, value) = builder.finish(); + array_builder.append_value(Variant::new(&metadata, &value)); + } + let variant_struct_array: StructArray = array_builder.build().into(); + Ok(variant_struct_array.into_data()) + } +} + +fn variant_from_tape_element(builder: &mut impl VariantBuilderExt, mut p: u32, tape: &Tape) -> Result<u32, ArrowError> { + match tape.get(p) { + TapeElement::StartObject(end_idx) => { + let mut object_builder = builder.try_new_object()?; + p += 1; + while p < end_idx { + // Read field name + let field_name = match tape.get(p) { + TapeElement::String(s) => tape.get_string(s), + _ => return Err(tape.error(p, "field name")), + }; + + let mut field_builder = ObjectFieldBuilder::new(field_name, &mut object_builder); + p = tape.next(p, "field value")?; + p = variant_from_tape_element(&mut field_builder, p, tape)?; + } + object_builder.finish(); + } + TapeElement::EndObject(_u32) => unreachable!(), + TapeElement::StartList(end_idx) => { + let mut list_builder = builder.try_new_list()?; + p+= 1; + while p < end_idx { + p = variant_from_tape_element(&mut list_builder, p, tape)?; + } + list_builder.finish(); + } + TapeElement::EndList(_u32) => unreachable!(), + TapeElement::String(idx) => builder.append_value(tape.get_string(idx)), + TapeElement::Number(idx) => { + let s = tape.get_string(idx); + builder.append_value(parse_number(s)?) + }, + TapeElement::I64(i) => builder.append_value(i), + TapeElement::I32(i) => builder.append_value(i), + TapeElement::F64(f) => builder.append_value(f), + TapeElement::F32(f) => builder.append_value(f), + TapeElement::True => builder.append_value(true), + TapeElement::False => builder.append_value(false), + TapeElement::Null => builder.append_value(Variant::Null), + } + p += 1; + Ok(p) +} + +fn parse_number<'a, 'b>(s: &'a str) -> Result<Variant<'a, 'b>, ArrowError> { + match lexical_core::parse::<i64>(s.as_bytes()) { + Ok(v) => Ok(Variant::from(v)), + Err(_) => { + match lexical_core::parse::<f64>(s.as_bytes()) { + Ok(v) => Ok(Variant::from(v)), + Err(_) => Err(ArrowError::JsonError(format!("failed to parse {s} as number"))), + } + } + } Review Comment: nit: a few ideas to simplify the code ```suggestion if let Ok(v) = lexical_core::parse(s.as_bytes()) { return Ok(Variant::Int64(v)); } match lexical_core::parse(s.as_bytes()) { Ok(v) => Ok(Variant::Double(v)), Err(_) => Err(ArrowError::JsonError(format!("failed to parse {s} as number"))), } ``` ########## arrow-json/src/reader/variant_array.rs: ########## @@ -0,0 +1,99 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::{Array, StructArray}; +use parquet_variant::{ObjectFieldBuilder, Variant, VariantBuilder, VariantBuilderExt}; +use parquet_variant_compute::VariantArrayBuilder; +use arrow_data::ArrayData; +use arrow_schema::ArrowError; + +use crate::reader::ArrayDecoder; +use crate::reader::tape::{Tape, TapeElement}; + +#[derive(Default)] +pub struct VariantArrayDecoder {} + +impl ArrayDecoder for VariantArrayDecoder { + fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> { + let mut array_builder = VariantArrayBuilder::new(pos.len()); + for p in pos { + let mut builder = VariantBuilder::new(); + variant_from_tape_element(&mut builder, *p, tape)?; + let (metadata, value) = builder.finish(); + array_builder.append_value(Variant::new(&metadata, &value)); + } + let variant_struct_array: StructArray = array_builder.build().into(); + Ok(variant_struct_array.into_data()) + } +} + +fn variant_from_tape_element(builder: &mut impl VariantBuilderExt, mut p: u32, tape: &Tape) -> Result<u32, ArrowError> { + match tape.get(p) { + TapeElement::StartObject(end_idx) => { + let mut object_builder = builder.try_new_object()?; + p += 1; + while p < end_idx { + // Read field name + let field_name = match tape.get(p) { + TapeElement::String(s) => tape.get_string(s), + _ => return Err(tape.error(p, "field name")), + }; + + let mut field_builder = ObjectFieldBuilder::new(field_name, &mut object_builder); + p = tape.next(p, "field value")?; + p = variant_from_tape_element(&mut field_builder, p, tape)?; + } + object_builder.finish(); + } + TapeElement::EndObject(_u32) => unreachable!(), + TapeElement::StartList(end_idx) => { + let mut list_builder = builder.try_new_list()?; + p+= 1; + while p < end_idx { + p = variant_from_tape_element(&mut list_builder, p, tape)?; + } + list_builder.finish(); + } + TapeElement::EndList(_u32) => unreachable!(), + TapeElement::String(idx) => builder.append_value(tape.get_string(idx)), + TapeElement::Number(idx) => { + let s = tape.get_string(idx); + builder.append_value(parse_number(s)?) + }, + TapeElement::I64(i) => builder.append_value(i), + TapeElement::I32(i) => builder.append_value(i), + TapeElement::F64(f) => builder.append_value(f), + TapeElement::F32(f) => builder.append_value(f), + TapeElement::True => builder.append_value(true), + TapeElement::False => builder.append_value(false), + TapeElement::Null => builder.append_value(Variant::Null), + } + p += 1; + Ok(p) +} + +fn parse_number<'a, 'b>(s: &'a str) -> Result<Variant<'a, 'b>, ArrowError> { + match lexical_core::parse::<i64>(s.as_bytes()) { + Ok(v) => Ok(Variant::from(v)), + Err(_) => { + match lexical_core::parse::<f64>(s.as_bytes()) { + Ok(v) => Ok(Variant::from(v)), + Err(_) => Err(ArrowError::JsonError(format!("failed to parse {s} as number"))), + } + } + } +} Review Comment: Do we care about missing newline at EOF? ########## arrow-json/src/reader/mod.rs: ########## @@ -686,12 +694,18 @@ macro_rules! primitive_decoder { } fn make_decoder( + field: Option<FieldRef>, data_type: DataType, coerce_primitive: bool, strict_mode: bool, is_nullable: bool, Review Comment: But it looks like nullability in particular is not always directly taken from a field (nested struct case below), and the data type is not always from a field either (decoder builder above). So maybe we should consider passing the field's metadata instead? But I don't know if we have extension type support directly on metadata (or if it requires a Field instance to work with)? ########## parquet-variant-compute/src/shred_variant.rs: ########## @@ -22,16 +22,14 @@ use crate::variant_to_arrow::{ PrimitiveVariantToArrowRowBuilder, make_primitive_variant_to_arrow_row_builder, }; use crate::{VariantArray, VariantValueArrayBuilder}; -use arrow::array::{ - ArrayRef, BinaryViewArray, GenericListArray, GenericListViewArray, NullBufferBuilder, - OffsetSizeTrait, -}; -use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; -use arrow::compute::CastOptions; -use arrow::datatypes::{ArrowNativeTypeOp, DataType, Field, FieldRef, Fields, TimeUnit}; -use arrow::error::{ArrowError, Result}; +use arrow_array::{ArrayRef, BinaryViewArray}; +use arrow_buffer::{NullBuffer, NullBufferBuilder}; +use arrow_cast::CastOptions; +use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit}; +use std::result::Result; Review Comment: Isn't `Result` a prelude auto-import? ########## parquet-variant-compute/src/shred_variant.rs: ########## @@ -22,16 +22,14 @@ use crate::variant_to_arrow::{ PrimitiveVariantToArrowRowBuilder, make_primitive_variant_to_arrow_row_builder, }; use crate::{VariantArray, VariantValueArrayBuilder}; -use arrow::array::{ - ArrayRef, BinaryViewArray, GenericListArray, GenericListViewArray, NullBufferBuilder, - OffsetSizeTrait, -}; -use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; -use arrow::compute::CastOptions; -use arrow::datatypes::{ArrowNativeTypeOp, DataType, Field, FieldRef, Fields, TimeUnit}; -use arrow::error::{ArrowError, Result}; +use arrow_array::{ArrayRef, BinaryViewArray}; +use arrow_buffer::{NullBuffer, NullBufferBuilder}; +use arrow_cast::CastOptions; +use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit}; +use std::result::Result; Review Comment: Also, there seems to be at least some precedent in existing arrow packages, to just define the arrow result manually if desired? pub type Result<T> = std::result::Result<T, ArrowError>; (see arrow-array/ffi, arrow-integration-testing, parquet/errors, etc) Given the amount of churn it causes across multiple files in the variant package, a type alias seems helpful? ########## parquet-variant-compute/src/shred_variant.rs: ########## @@ -22,16 +22,14 @@ use crate::variant_to_arrow::{ PrimitiveVariantToArrowRowBuilder, make_primitive_variant_to_arrow_row_builder, }; use crate::{VariantArray, VariantValueArrayBuilder}; -use arrow::array::{ - ArrayRef, BinaryViewArray, GenericListArray, GenericListViewArray, NullBufferBuilder, - OffsetSizeTrait, -}; -use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; -use arrow::compute::CastOptions; -use arrow::datatypes::{ArrowNativeTypeOp, DataType, Field, FieldRef, Fields, TimeUnit}; -use arrow::error::{ArrowError, Result}; +use arrow_array::{ArrayRef, BinaryViewArray}; +use arrow_buffer::{NullBuffer, NullBufferBuilder}; +use arrow_cast::CastOptions; +use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit}; +use std::result::Result; Review Comment: Honestly, I've wondered why arrow-schema doesn't just provide that type alias for everyone to use, so we can simplify a _lot_ of code across all of arrow-rs. Basically everyone has to depend on arrow-schema AFAIK. ########## parquet-variant-compute/src/type_conversion.rs: ########## @@ -17,8 +17,8 @@ //! Module for transforming a typed arrow `Array` to `VariantArray`. -use arrow::compute::{DecimalCast, rescale_decimal}; -use arrow::datatypes::{ +use arrow_cast::{DecimalCast, rescale_decimal}; +use arrow_array::types::{ self, ArrowPrimitiveType, ArrowTimestampType, Decimal32Type, Decimal64Type, Decimal128Type, Review Comment: If you wanted, could import `self as datatypes` to reduce churn? (again below) ########## arrow-json/src/reader/mod.rs: ########## @@ -686,12 +694,18 @@ macro_rules! primitive_decoder { } fn make_decoder( + field: Option<FieldRef>, data_type: DataType, coerce_primitive: bool, strict_mode: bool, is_nullable: bool, Review Comment: This seems very awkward, to take a `Field` but still require both `data_type` and `is_nullable`? Should we just admit that the decoder officially uses fields now, and rely on [Field::is_nullalbe](https://docs.rs/arrow/latest/arrow/datatypes/struct.Field.html#method.is_nullable) and [Field::data_type](https://docs.rs/arrow/latest/arrow/datatypes/struct.Field.html#method.data_type)? Especially when `ReaderBuilder::build_decoder` (L280/305 above) is anyway deriving those two value from fields it was already working with? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
