alamb commented on code in PR #8408:
URL: https://github.com/apache/arrow-rs/pull/8408#discussion_r2370065342
##########
parquet/src/variant.rs:
##########
@@ -108,6 +98,203 @@
//! ```
//!
//! # Example: Reading a Parquet file with Variant column
-//! (TODO: add example)
+//!
+//! Use the [`VariantType`] extension type to find the Variant column:
+//!
+//! ```
+//! # use std::sync::Arc;
+//! # use std::path::PathBuf;
+//! # use arrow_array::{ArrayRef, RecordBatch, RecordBatchReader};
+//! # use parquet::variant::{Variant, VariantArray, VariantType};
+//! # use parquet::arrow::arrow_reader::ArrowReaderBuilder;
+//! # fn main() -> Result<(), parquet::errors::ParquetError> {
+//! # use arrow_array::StructArray;
+//! # fn file_path() -> PathBuf { // return a testing file path
+//! # PathBuf::from(arrow::util::test_util::parquet_test_data())
+//! # .join("..")
+//! # .join("shredded_variant")
+//! # .join("case-075.parquet")
+//! # }
+//! // Read the Parquet file using standard Arrow Parquet reader
+//! let file = std::fs::File::open(file_path())?;
+//! let mut reader = ArrowReaderBuilder::try_new(file)?.build()?;
+//!
+//! // You can check if a column contains a Variant using
+//! // the VariantType extension type
+//! let schema = reader.schema();
+//! let field = schema.field_with_name("var")?;
+//! assert!(field.try_extension_type::<VariantType>().is_ok());
+//!
+//! // The reader will yield RecordBatches with a StructArray
+//! // to convert them to VariantArray, use VariantArray::try_new
+//! let batch = reader.next().unwrap().unwrap();
+//! let col = batch.column_by_name("var").unwrap();
+//! let var_array = VariantArray::try_new(col)?;
+//! assert_eq!(var_array.len(), 1);
+//! let var_value: Variant = var_array.value(0);
+//! assert_eq!(var_value, Variant::from("iceberg")); // the value in
case-075.parquet
+//! # Ok(())
+//! # }
+//! ```
pub use parquet_variant::*;
-pub use parquet_variant_compute as compute;
+pub use parquet_variant_compute::*;
+
+#[cfg(test)]
+mod tests {
+ use crate::arrow::arrow_reader::ArrowReaderBuilder;
+ use crate::arrow::ArrowWriter;
+ use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader};
+ use crate::file::reader::ChunkReader;
+ use arrow::util::test_util::parquet_test_data;
+ use arrow_array::{ArrayRef, RecordBatch};
+ use arrow_schema::Schema;
+ use bytes::Bytes;
+ use parquet_variant::{Variant, VariantBuilderExt};
+ use parquet_variant_compute::{VariantArray, VariantArrayBuilder,
VariantType};
+ use std::path::PathBuf;
+ use std::sync::Arc;
+
+ #[test]
+ fn roundtrip_basic() {
+ roundtrip(variant_array());
+ }
+
+ /// Ensure a file with Variant LogicalType, written by another writer in
+ /// parquet-testing, can be read as a VariantArray
+ #[test]
+ fn read_logical_type() {
Review Comment:
@scovich noted in following comment of the pathfinding PR that this is very
similar to the doc test above.
- https://github.com/apache/arrow-rs/pull/8365#discussion_r2362316100
I re-reviewed it, and while I agree this is redundant with the doc test
above, I feel that putting the test in both places makes it easier to see
that there is test coverage for reading logical types, so I would like to
keep it here.
##########
parquet/src/arrow/schema/extension.rs:
##########
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Arrow Extension Type Support for Parquet
Review Comment:
I plan to consolidate the rest of the extension type handling in this
module, to try to improve the current situation where `#[cfg(..)]` is sprinkled
over the type conversion logic.
##########
parquet/src/variant.rs:
##########
@@ -108,6 +98,203 @@
//! ```
//!
//! # Example: Reading a Parquet file with Variant column
-//! (TODO: add example)
+//!
+//! Use the [`VariantType`] extension type to find the Variant column:
+//!
+//! ```
+//! # use std::sync::Arc;
+//! # use std::path::PathBuf;
+//! # use arrow_array::{ArrayRef, RecordBatch, RecordBatchReader};
+//! # use parquet::variant::{Variant, VariantArray, VariantType};
+//! # use parquet::arrow::arrow_reader::ArrowReaderBuilder;
+//! # fn main() -> Result<(), parquet::errors::ParquetError> {
+//! # use arrow_array::StructArray;
+//! # fn file_path() -> PathBuf { // return a testing file path
+//! # PathBuf::from(arrow::util::test_util::parquet_test_data())
+//! # .join("..")
+//! # .join("shredded_variant")
+//! # .join("case-075.parquet")
+//! # }
+//! // Read the Parquet file using standard Arrow Parquet reader
+//! let file = std::fs::File::open(file_path())?;
+//! let mut reader = ArrowReaderBuilder::try_new(file)?.build()?;
+//!
+//! // You can check if a column contains a Variant using
+//! // the VariantType extension type
+//! let schema = reader.schema();
+//! let field = schema.field_with_name("var")?;
+//! assert!(field.try_extension_type::<VariantType>().is_ok());
+//!
+//! // The reader will yield RecordBatches with a StructArray
+//! // to convert them to VariantArray, use VariantArray::try_new
+//! let batch = reader.next().unwrap().unwrap();
+//! let col = batch.column_by_name("var").unwrap();
+//! let var_array = VariantArray::try_new(col)?;
+//! assert_eq!(var_array.len(), 1);
+//! let var_value: Variant = var_array.value(0);
+//! assert_eq!(var_value, Variant::from("iceberg")); // the value in
case-075.parquet
+//! # Ok(())
+//! # }
+//! ```
pub use parquet_variant::*;
-pub use parquet_variant_compute as compute;
+pub use parquet_variant_compute::*;
+
+#[cfg(test)]
+mod tests {
+ use crate::arrow::arrow_reader::ArrowReaderBuilder;
+ use crate::arrow::ArrowWriter;
+ use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader};
+ use crate::file::reader::ChunkReader;
+ use arrow::util::test_util::parquet_test_data;
+ use arrow_array::{ArrayRef, RecordBatch};
+ use arrow_schema::Schema;
+ use bytes::Bytes;
+ use parquet_variant::{Variant, VariantBuilderExt};
+ use parquet_variant_compute::{VariantArray, VariantArrayBuilder,
VariantType};
+ use std::path::PathBuf;
+ use std::sync::Arc;
+
+ #[test]
Review Comment:
Here are end-to-end cases showing how to read/write the files to disk.
##########
parquet/src/arrow/schema/extension.rs:
##########
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Arrow Extension Type Support for Parquet
+//!
+//! This module contains mapping code to map Parquet [`LogicalType`]s to/from
+//! Arrow [`ExtensionType`]s.
+//!
+//! Extension types are represented using the metadata from Arrow [`Field`]s
+//! with the key "ARROW:extension:name".
+
+use crate::basic::LogicalType;
+use crate::schema::types::Type;
+use arrow_schema::extension::ExtensionType;
+use arrow_schema::Field;
+
+/// Adds extension type metadata, if necessary, based on the Parquet field's
+/// [`LogicalType`]
+///
+/// Some Parquet logical types, such as Variant, do not map directly to an
+/// Arrow DataType, and instead are represented by an Arrow ExtensionType.
+/// Extension types are attached to Arrow Fields via metadata.
+pub(crate) fn add_extension_type(arrow_field: Field, parquet_type: &Type) ->
Field {
+ let result = match parquet_type.get_basic_info().logical_type() {
+ #[cfg(feature = "variant_experimental")]
+ Some(LogicalType::Variant) => {
+
arrow_field.with_extension_type(parquet_variant_compute::VariantType)
+ }
+ // TODO add other LogicalTypes here
+ _ => arrow_field,
+ };
+ result
+}
+
+/// Return the Parquet logical type to use for the specified Arrow field, if
any.
+#[cfg(feature = "variant_experimental")]
+pub(crate) fn logical_type_for_struct(field: &Field) -> Option<LogicalType> {
+ use parquet_variant_compute::VariantType;
+ if field.extension_type_name()? == VariantType::NAME {
Review Comment:
I was worried here about testing for the extension type using
`try_extension_type` and then discarding any error via `ok()` -- creating an
ArrowError requires allocating a string, so that pattern can be expensive
(allocate and format a string just to throw it away)
@scovich also noticed this in the pathfinding PR here:
- https://github.com/apache/arrow-rs/pull/8365#discussion_r2362704747
##########
parquet/src/arrow/schema/complex.rs:
##########
@@ -352,13 +353,13 @@ impl Visitor {
// Need both columns to be projected
match (maybe_key, maybe_value) {
- (Some(key), Some(value)) => {
+ (Some(mut key), Some(mut value)) => {
let key_field = Arc::new(
- convert_field(map_key, &key, arrow_key)
+ convert_field(map_key, &mut key, arrow_key)
// The key is always non-nullable (#5630)
.with_nullable(false),
);
- let value_field = Arc::new(convert_field(map_value, &value,
arrow_value));
+ let value_field = Arc::new(convert_field(map_value, &mut
value, arrow_value));
Review Comment:
This logic is somewhat confusing to me in that the arrow type is encoded
twice -- once on a `ParquetField` (which has an arrow `Field` in it) and once
in this `ValueField`.
Any help simplifing this would be most appreciated
##########
parquet/src/variant.rs:
##########
@@ -108,6 +98,203 @@
//! ```
//!
//! # Example: Reading a Parquet file with Variant column
-//! (TODO: add example)
+//!
+//! Use the [`VariantType`] extension type to find the Variant column:
+//!
+//! ```
+//! # use std::sync::Arc;
+//! # use std::path::PathBuf;
+//! # use arrow_array::{ArrayRef, RecordBatch, RecordBatchReader};
+//! # use parquet::variant::{Variant, VariantArray, VariantType};
+//! # use parquet::arrow::arrow_reader::ArrowReaderBuilder;
+//! # fn main() -> Result<(), parquet::errors::ParquetError> {
+//! # use arrow_array::StructArray;
+//! # fn file_path() -> PathBuf { // return a testing file path
+//! # PathBuf::from(arrow::util::test_util::parquet_test_data())
+//! # .join("..")
+//! # .join("shredded_variant")
+//! # .join("case-075.parquet")
+//! # }
+//! // Read the Parquet file using standard Arrow Parquet reader
+//! let file = std::fs::File::open(file_path())?;
+//! let mut reader = ArrowReaderBuilder::try_new(file)?.build()?;
+//!
+//! // You can check if a column contains a Variant using
+//! // the VariantType extension type
+//! let schema = reader.schema();
+//! let field = schema.field_with_name("var")?;
+//! assert!(field.try_extension_type::<VariantType>().is_ok());
+//!
+//! // The reader will yield RecordBatches with a StructArray
+//! // to convert them to VariantArray, use VariantArray::try_new
+//! let batch = reader.next().unwrap().unwrap();
+//! let col = batch.column_by_name("var").unwrap();
+//! let var_array = VariantArray::try_new(col)?;
+//! assert_eq!(var_array.len(), 1);
+//! let var_value: Variant = var_array.value(0);
+//! assert_eq!(var_value, Variant::from("iceberg")); // the value in
case-075.parquet
+//! # Ok(())
+//! # }
+//! ```
pub use parquet_variant::*;
-pub use parquet_variant_compute as compute;
+pub use parquet_variant_compute::*;
Review Comment:
I also started re-exporting everything directly into `parquet::variant` rather
than also having a separate `parquet::variant::compute` module, which I think
makes it easier to work with this crate.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]