This is an automated email from the ASF dual-hosted git repository.

mbrobbel pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new d9a4b39815 Add `variant_experimental` feature to `parquet` crate 
(#8133)
d9a4b39815 is described below

commit d9a4b39815de52a15ca84b392a39fdf422361718
Author: Andrew Lamb <[email protected]>
AuthorDate: Mon Sep 8 13:52:32 2025 -0700

    Add `variant_experimental` feature to `parquet` crate (#8133)
    
    # Which issue does this PR close?
    
    - Closes https://github.com/apache/arrow-rs/issues/8132
    - Part of https://github.com/apache/arrow-rs/issues/8084
    - Follow on to https://github.com/apache/arrow-rs/pull/8104
    
    # Rationale for this change
    
    TLDR is we need a way to test and work out how Variant integration with
    the actual parquet reader/writer will look, so let's do it in the
    parquet crate.
    
    Please see the essay on https://github.com/apache/arrow-rs/issues/8132
    for background
    
    Follow on tasks (I will file tickets for these items if we agree on this
    as an integration mechanism):
    - [x] Do not `panic` when writing VariantArray with the ArrowWriter:
    https://github.com/apache/arrow-rs/issues/8296
    - [ ] Add some way to write the logical annotation to parquet metadata
    - [ ] Read arrays annotated with VARIANT logical type as VariantArrays
    in ArrowReader
    - [x] Update the variant_integration test to use `VariantArray` :
    https://github.com/apache/arrow-rs/issues/8084
    - [x] Rename `variant_experimental` flag to `variant` and remove
    warnings about being experimental:
    https://github.com/apache/arrow-rs/issues/8297
    
    
    Follow up tasks that came out of this PR but do not depend on it
    - [x] https://github.com/apache/arrow-rs/issues/8145
    - [x] https://github.com/apache/arrow-rs/issues/8144
    
    # What changes are included in this PR?
    
    1. Add the `variant_experimental` feature to the `parquet` crate
    2. Publicly export the variant crates
    3. Add docs and examples
    
    # Are these changes tested?
    Yes, by new CI
    
    # Are there any user-facing changes?
    
    This adds a new feature flag, and new public APIs for Variant support
    (re-exported from the `parquet-variant` crates).
    
    ---------
    
    Co-authored-by: Matthijs Brobbel <[email protected]>
---
 .github/workflows/parquet.yml                      |   4 +-
 Cargo.toml                                         |   2 +-
 parquet/Cargo.toml                                 |   8 +-
 parquet/README.md                                  |   2 +
 parquet/src/lib.rs                                 |  11 ++
 parquet/src/variant.rs                             | 115 +++++++++++++++++++++
 ...riant_integration.rs => variant_integration.rs} |   0
 7 files changed, 139 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml
index 126e4aa3a6..0d1a01ca5e 100644
--- a/.github/workflows/parquet.yml
+++ b/.github/workflows/parquet.yml
@@ -119,7 +119,9 @@ jobs:
         run: cargo check -p parquet --no-default-features --features flate2 
--features flate2-rust_backened
       - name: Check compilation --no-default-features --features flate2 
--features flate2-zlib-rs
         run: cargo check -p parquet --no-default-features --features flate2 
--features flate2-zlib-rs
-        
+      - name: Check compilation --no-default-features --features 
variant_experimental
+        run: cargo check -p parquet --no-default-features --features 
variant_experimental
+
 
   # test the parquet crate builds against wasm32 in stable rust
   wasm32-build:
diff --git a/Cargo.toml b/Cargo.toml
index 722a1cd7ea..bf0efc37d3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -104,7 +104,7 @@ parquet = { version = "56.1.0", path = "./parquet", 
default-features = false }
 # These crates have not yet been released and thus do not use the workspace 
version
 parquet-variant = { version = "0.1.0", path = "./parquet-variant" }
 parquet-variant-json = { version = "0.1.0", path = "./parquet-variant-json" }
-parquet-variant-compute = { version = "0.1.0", path = "./parquet-variant-json" 
}
+parquet-variant-compute = { version = "0.1.0", path = 
"./parquet-variant-compute" }
 
 chrono = { version = "0.4.40", default-features = false, features = ["clock"] }
 
diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml
index bae90a51f0..a39275fb25 100644
--- a/parquet/Cargo.toml
+++ b/parquet/Cargo.toml
@@ -45,6 +45,10 @@ arrow-data = { workspace = true, optional = true }
 arrow-schema = { workspace = true, optional = true }
 arrow-select = { workspace = true, optional = true }
 arrow-ipc = { workspace = true, optional = true }
+parquet-variant = { workspace = true, optional = true }
+parquet-variant-json = { workspace = true, optional = true }
+parquet-variant-compute = { workspace = true, optional = true }
+
 object_store = { version = "0.12.0", default-features = false, optional = true 
}
 
 bytes = { version = "1.1", default-features = false, features = ["std"] }
@@ -108,7 +112,7 @@ json = ["serde_json", "base64"]
 # Enable internal testing APIs
 test_common = ["arrow/test_utils"]
 # Experimental, unstable functionality primarily used for testing
-experimental = []
+experimental = ["variant_experimental"]
 # Enable async APIs
 async = ["futures", "tokio"]
 # Enable object_store integration
@@ -124,6 +128,8 @@ encryption = ["dep:ring"]
 # Explicitely enabling rust_backend and zlib-rs features for flate2
 flate2-rust_backened = ["flate2/rust_backend"]
 flate2-zlib-rs = ["flate2/zlib-rs"]
+# Enable parquet variant support
+variant_experimental = ["parquet-variant", "parquet-variant-json", 
"parquet-variant-compute"]
 
 
 [[example]]
diff --git a/parquet/README.md b/parquet/README.md
index 8fc72bfbc3..5e087ac6a9 100644
--- a/parquet/README.md
+++ b/parquet/README.md
@@ -64,9 +64,11 @@ The `parquet` crate provides the following features which 
may be enabled in your
 - `experimental` - Experimental APIs which may change, even between minor 
releases
 - `simdutf8` (default) - Use the [`simdutf8`] crate for SIMD-accelerated UTF-8 
validation
 - `encryption` - support for reading / writing encrypted Parquet files
+- `variant_experimental` - ⚠️ Experimental [Parquet Variant] support, which 
may change, even between minor releases.
 
 [`arrow`]: https://crates.io/crates/arrow
 [`simdutf8`]: https://crates.io/crates/simdutf8
+[parquet variant]: 
https://github.com/apache/parquet-format/blob/master/VariantEncoding.md
 
 ## Parquet Feature Status
 
diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs
index 07a673c295..1142a1c4a0 100644
--- a/parquet/src/lib.rs
+++ b/parquet/src/lib.rs
@@ -86,6 +86,14 @@
 //! [`ParquetRecordBatchStreamBuilder`]: 
arrow::async_reader::ParquetRecordBatchStreamBuilder
 //! [`ParquetObjectReader`]: arrow::async_reader::ParquetObjectReader
 //!
+//! ## Variant Logical Type (`variant_experimental` feature)
+//!
+//! The [`variant`] module supports reading and writing Parquet files
+//! with the [Variant Binary Encoding] logical type, which can represent
+//! semi-structured data such as JSON efficiently.
+//!
+//! [Variant Binary Encoding]: 
https://github.com/apache/parquet-format/blob/master/VariantEncoding.md
+//!
 //! ## Read/Write Parquet Directly
 //!
 //! Workloads needing finer-grained control, or to avoid a dependence on arrow,
@@ -179,3 +187,6 @@ pub mod record;
 pub mod schema;
 
 pub mod thrift;
+
+#[cfg(feature = "variant_experimental")]
+pub mod variant;
diff --git a/parquet/src/variant.rs b/parquet/src/variant.rs
new file mode 100644
index 0000000000..a837a877df
--- /dev/null
+++ b/parquet/src/variant.rs
@@ -0,0 +1,115 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! ⚠️ Experimental Support for reading and writing [`Variant`]s to / from 
Parquet files ⚠️
+//!
+//! This is a 🚧 Work In Progress
+//!
+//! Note: Requires the `variant_experimental` feature of the `parquet` crate 
to be enabled.
+//!
+//! # Features
+//! * [`Variant`] represents variant value, which can be an object, list, or 
primitive.
+//! * [`VariantBuilder`] for building `Variant` values.
+//! * [`VariantArray`] for representing a column of Variant values.
+//! * [`compute`] module with functions for manipulating Variants, such as
+//!   [`variant_get`] to extracting a value by path and functions to convert
+//!   between `Variant` and JSON.
+//!
+//! [Variant Logical Type]: Variant
+//! [`VariantArray`]: compute::VariantArray
+//! [`variant_get`]: compute::variant_get
+//!
+//! # Example: Writing a Parquet file with Variant column
+//! ```rust
+//! # use parquet::variant::compute::{VariantArray, VariantArrayBuilder};
+//! # use parquet::variant::VariantBuilderExt;
+//! # use std::sync::Arc;
+//! # use arrow_array::{ArrayRef, RecordBatch};
+//! # use parquet::arrow::ArrowWriter;
+//! # fn main() -> Result<(), parquet::errors::ParquetError> {
+//!  // Use the VariantArrayBuilder to build a VariantArray
+//!  let mut builder = VariantArrayBuilder::new(3);
+//!  // row 1: {"name": "Alice"}
+//!  let mut variant_builder = builder.variant_builder();
+//!  variant_builder.new_object().with_field("name", "Alice").finish();
+//!  variant_builder.finish();
+//!  let array = builder.build();
+//!
+//! // TODO support writing VariantArray directly
+//! // at the moment it panics when trying to downcast to a struct array
+//! // https://github.com/apache/arrow-rs/issues/8296
+//! //  let array: ArrayRef = Arc::new(array);
+//! let array: ArrayRef = Arc::new(array.into_inner());
+//!
+//!  // create a RecordBatch with the VariantArray
+//!  let batch = RecordBatch::try_from_iter(vec![("data", array)])?;
+//!
+//!  // write the RecordBatch to a Parquet file
+//!  let file = std::fs::File::create("variant.parquet")?;
+//!  let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?;
+//!  writer.write(&batch)?;
+//!  writer.close()?;
+//!
+//! # std::fs::remove_file("variant.parquet")?;
+//! # Ok(())
+//! # }
+//! ```
+//!
+//! # Example: Writing JSON with a Parquet file with Variant column
+//! ```rust
+//! # use std::sync::Arc;
+//! # use arrow_array::{ArrayRef, RecordBatch, StringArray};
+//! # use parquet::variant::compute::json_to_variant;
+//! # use parquet::variant::compute::VariantArray;
+//! # use parquet::arrow::ArrowWriter;
+//! # fn main() -> Result<(), parquet::errors::ParquetError> {
+//! // Create an array of JSON strings, simulating a column of JSON data
+//! // TODO use StringViewArray when available
+//! let input_array = StringArray::from(vec![
+//!   Some(r#"{"name": "Alice", "age": 30}"#),
+//!   Some(r#"{"name": "Bob", "age": 25, "address": {"city": "New York"}}"#),
+//!   None,
+//!   Some("{}"),
+//! ]);
+//! let input_array: ArrayRef = Arc::new(input_array);
+//!
+//! // Convert the JSON strings to a VariantArray
+//! let array: VariantArray = json_to_variant(&input_array)?;
+//!
+//! // TODO support writing VariantArray directly
+//! // at the moment it panics when trying to downcast to a struct array
+//! // https://github.com/apache/arrow-rs/issues/8296
+//! //  let array: ArrayRef = Arc::new(array);
+//! let array: ArrayRef = Arc::new(array.into_inner());
+//!
+//!  // create a RecordBatch with the VariantArray
+//!  let batch = RecordBatch::try_from_iter(vec![("data", array)])?;
+//!
+//!  // write the RecordBatch to a Parquet file
+//!  let file = std::fs::File::create("variant-json.parquet")?;
+//!  let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?;
+//!  writer.write(&batch)?;
+//!  writer.close()?;
+//! # std::fs::remove_file("variant-json.parquet")?;
+//! # Ok(())
+//! # }
+//! ```
+//!
+//! # Example: Reading a Parquet file with Variant column
+//! (TODO: add example)
+pub use parquet_variant::*;
+pub use parquet_variant_compute as compute;
diff --git a/parquet/tests/simple_variant_integration.rs 
b/parquet/tests/variant_integration.rs
similarity index 100%
rename from parquet/tests/simple_variant_integration.rs
rename to parquet/tests/variant_integration.rs

Reply via email to