This is an automated email from the ASF dual-hosted git repository.
mbrobbel pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new d9a4b39815 Add `variant_experimental` feature to `parquet` crate
(#8133)
d9a4b39815 is described below
commit d9a4b39815de52a15ca84b392a39fdf422361718
Author: Andrew Lamb <[email protected]>
AuthorDate: Mon Sep 8 13:52:32 2025 -0700
Add `variant_experimental` feature to `parquet` crate (#8133)
# Which issue does this PR close?
- Closes https://github.com/apache/arrow-rs/issues/8132
- Part of https://github.com/apache/arrow-rs/issues/8084
- Follow on to https://github.com/apache/arrow-rs/pull/8104
# Rationale for this change
TLDR is we need a way to test and work out how Variant integration with
the actual parquet reader/writer will look, so let's do it in the
parquet crate.
Please see the essay on https://github.com/apache/arrow-rs/issues/8132
for background
Follow on tasks (I will file tickets for these items if we agree on this
as an integration mechanism):
- [x] Do not `panic` when writing VariantArray with the ArrowWriter:
https://github.com/apache/arrow-rs/issues/8296
- [ ] Add some way to write the logical annotation to parquet metadata
- [ ] Read arrays annotated with VARIANT logical type as VariantArrays
in ArrowReader
- [x] Update the variant_integration test to use `VariantArray` :
https://github.com/apache/arrow-rs/issues/8084
- [x] Rename `variant_experimental` flag to `variant` and remove
warnings about being experimental:
https://github.com/apache/arrow-rs/issues/8297
Follow up tasks that came out of this PR but do not depend on it
- [x] https://github.com/apache/arrow-rs/issues/8145
- [x] https://github.com/apache/arrow-rs/issues/8144
# What changes are included in this PR?
1. Add the `variant_experimental` feature to the `parquet` crate
2. Publicly export the variant crates
3. Add docs and examples
# Are these changes tested?
Yes, by new CI checks
# Are there any user-facing changes?
This adds a new feature flag, and new public APIs.
Co-authored-by: Matthijs Brobbel <[email protected]>
---
.github/workflows/parquet.yml | 4 +-
Cargo.toml | 2 +-
parquet/Cargo.toml | 8 +-
parquet/README.md | 2 +
parquet/src/lib.rs | 11 ++
parquet/src/variant.rs | 115 +++++++++++++++++++++
...riant_integration.rs => variant_integration.rs} | 0
7 files changed, 139 insertions(+), 3 deletions(-)
diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml
index 126e4aa3a6..0d1a01ca5e 100644
--- a/.github/workflows/parquet.yml
+++ b/.github/workflows/parquet.yml
@@ -119,7 +119,9 @@ jobs:
run: cargo check -p parquet --no-default-features --features flate2
--features flate2-rust_backened
- name: Check compilation --no-default-features --features flate2
--features flate2-zlib-rs
run: cargo check -p parquet --no-default-features --features flate2
--features flate2-zlib-rs
-
+ - name: Check compilation --no-default-features --features
variant_experimental
+ run: cargo check -p parquet --no-default-features --features
variant_experimental
+
# test the parquet crate builds against wasm32 in stable rust
wasm32-build:
diff --git a/Cargo.toml b/Cargo.toml
index 722a1cd7ea..bf0efc37d3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -104,7 +104,7 @@ parquet = { version = "56.1.0", path = "./parquet",
default-features = false }
# These crates have not yet been released and thus do not use the workspace
version
parquet-variant = { version = "0.1.0", path = "./parquet-variant" }
parquet-variant-json = { version = "0.1.0", path = "./parquet-variant-json" }
-parquet-variant-compute = { version = "0.1.0", path = "./parquet-variant-json"
}
+parquet-variant-compute = { version = "0.1.0", path =
"./parquet-variant-compute" }
chrono = { version = "0.4.40", default-features = false, features = ["clock"] }
diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml
index bae90a51f0..a39275fb25 100644
--- a/parquet/Cargo.toml
+++ b/parquet/Cargo.toml
@@ -45,6 +45,10 @@ arrow-data = { workspace = true, optional = true }
arrow-schema = { workspace = true, optional = true }
arrow-select = { workspace = true, optional = true }
arrow-ipc = { workspace = true, optional = true }
+parquet-variant = { workspace = true, optional = true }
+parquet-variant-json = { workspace = true, optional = true }
+parquet-variant-compute = { workspace = true, optional = true }
+
object_store = { version = "0.12.0", default-features = false, optional = true
}
bytes = { version = "1.1", default-features = false, features = ["std"] }
@@ -108,7 +112,7 @@ json = ["serde_json", "base64"]
# Enable internal testing APIs
test_common = ["arrow/test_utils"]
# Experimental, unstable functionality primarily used for testing
-experimental = []
+experimental = ["variant_experimental"]
# Enable async APIs
async = ["futures", "tokio"]
# Enable object_store integration
@@ -124,6 +128,8 @@ encryption = ["dep:ring"]
# Explicitely enabling rust_backend and zlib-rs features for flate2
flate2-rust_backened = ["flate2/rust_backend"]
flate2-zlib-rs = ["flate2/zlib-rs"]
+# Enable parquet variant support
+variant_experimental = ["parquet-variant", "parquet-variant-json",
"parquet-variant-compute"]
[[example]]
diff --git a/parquet/README.md b/parquet/README.md
index 8fc72bfbc3..5e087ac6a9 100644
--- a/parquet/README.md
+++ b/parquet/README.md
@@ -64,9 +64,11 @@ The `parquet` crate provides the following features which
may be enabled in your
- `experimental` - Experimental APIs which may change, even between minor
releases
- `simdutf8` (default) - Use the [`simdutf8`] crate for SIMD-accelerated UTF-8
validation
- `encryption` - support for reading / writing encrypted Parquet files
+- `variant_experimental` - ⚠️ Experimental [Parquet Variant] support, which
may change, even between minor releases.
[`arrow`]: https://crates.io/crates/arrow
[`simdutf8`]: https://crates.io/crates/simdutf8
+[parquet variant]:
https://github.com/apache/parquet-format/blob/master/VariantEncoding.md
## Parquet Feature Status
diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs
index 07a673c295..1142a1c4a0 100644
--- a/parquet/src/lib.rs
+++ b/parquet/src/lib.rs
@@ -86,6 +86,14 @@
//! [`ParquetRecordBatchStreamBuilder`]:
arrow::async_reader::ParquetRecordBatchStreamBuilder
//! [`ParquetObjectReader`]: arrow::async_reader::ParquetObjectReader
//!
+//! ## Variant Logical Type (`variant_experimental` feature)
+//!
+//! The [`variant`] module supports reading and writing Parquet files
+//! with the [Variant Binary Encoding] logical type, which can represent
+//! semi-structured data such as JSON efficiently.
+//!
+//! [Variant Binary Encoding]:
https://github.com/apache/parquet-format/blob/master/VariantEncoding.md
+//!
//! ## Read/Write Parquet Directly
//!
//! Workloads needing finer-grained control, or to avoid a dependence on arrow,
@@ -179,3 +187,6 @@ pub mod record;
pub mod schema;
pub mod thrift;
+
+#[cfg(feature = "variant_experimental")]
+pub mod variant;
diff --git a/parquet/src/variant.rs b/parquet/src/variant.rs
new file mode 100644
index 0000000000..a837a877df
--- /dev/null
+++ b/parquet/src/variant.rs
@@ -0,0 +1,115 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! ⚠️ Experimental Support for reading and writing [`Variant`]s to / from
Parquet files ⚠️
+//!
+//! This is a 🚧 Work In Progress
+//!
+//! Note: Requires the `variant_experimental` feature of the `parquet` crate
to be enabled.
+//!
+//! # Features
+//! * [`Variant`] represents variant value, which can be an object, list, or
primitive.
+//! * [`VariantBuilder`] for building `Variant` values.
+//! * [`VariantArray`] for representing a column of Variant values.
+//! * [`compute`] module with functions for manipulating Variants, such as
+//! [`variant_get`] to extracting a value by path and functions to convert
+//! between `Variant` and JSON.
+//!
+//! [Variant Logical Type]: Variant
+//! [`VariantArray`]: compute::VariantArray
+//! [`variant_get`]: compute::variant_get
+//!
+//! # Example: Writing a Parquet file with Variant column
+//! ```rust
+//! # use parquet::variant::compute::{VariantArray, VariantArrayBuilder};
+//! # use parquet::variant::VariantBuilderExt;
+//! # use std::sync::Arc;
+//! # use arrow_array::{ArrayRef, RecordBatch};
+//! # use parquet::arrow::ArrowWriter;
+//! # fn main() -> Result<(), parquet::errors::ParquetError> {
+//! // Use the VariantArrayBuilder to build a VariantArray
+//! let mut builder = VariantArrayBuilder::new(3);
+//! // row 1: {"name": "Alice"}
+//! let mut variant_builder = builder.variant_builder();
+//! variant_builder.new_object().with_field("name", "Alice").finish();
+//! variant_builder.finish();
+//! let array = builder.build();
+//!
+//! // TODO support writing VariantArray directly
+//! // at the moment it panics when trying to downcast to a struct array
+//! // https://github.com/apache/arrow-rs/issues/8296
+//! // let array: ArrayRef = Arc::new(array);
+//! let array: ArrayRef = Arc::new(array.into_inner());
+//!
+//! // create a RecordBatch with the VariantArray
+//! let batch = RecordBatch::try_from_iter(vec![("data", array)])?;
+//!
+//! // write the RecordBatch to a Parquet file
+//! let file = std::fs::File::create("variant.parquet")?;
+//! let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?;
+//! writer.write(&batch)?;
+//! writer.close()?;
+//!
+//! # std::fs::remove_file("variant.parquet")?;
+//! # Ok(())
+//! # }
+//! ```
+//!
+//! # Example: Writing JSON with a Parquet file with Variant column
+//! ```rust
+//! # use std::sync::Arc;
+//! # use arrow_array::{ArrayRef, RecordBatch, StringArray};
+//! # use parquet::variant::compute::json_to_variant;
+//! # use parquet::variant::compute::VariantArray;
+//! # use parquet::arrow::ArrowWriter;
+//! # fn main() -> Result<(), parquet::errors::ParquetError> {
+//! // Create an array of JSON strings, simulating a column of JSON data
+//! // TODO use StringViewArray when available
+//! let input_array = StringArray::from(vec![
+//! Some(r#"{"name": "Alice", "age": 30}"#),
+//! Some(r#"{"name": "Bob", "age": 25, "address": {"city": "New York"}}"#),
+//! None,
+//! Some("{}"),
+//! ]);
+//! let input_array: ArrayRef = Arc::new(input_array);
+//!
+//! // Convert the JSON strings to a VariantArray
+//! let array: VariantArray = json_to_variant(&input_array)?;
+//!
+//! // TODO support writing VariantArray directly
+//! // at the moment it panics when trying to downcast to a struct array
+//! // https://github.com/apache/arrow-rs/issues/8296
+//! // let array: ArrayRef = Arc::new(array);
+//! let array: ArrayRef = Arc::new(array.into_inner());
+//!
+//! // create a RecordBatch with the VariantArray
+//! let batch = RecordBatch::try_from_iter(vec![("data", array)])?;
+//!
+//! // write the RecordBatch to a Parquet file
+//! let file = std::fs::File::create("variant-json.parquet")?;
+//! let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?;
+//! writer.write(&batch)?;
+//! writer.close()?;
+//! # std::fs::remove_file("variant-json.parquet")?;
+//! # Ok(())
+//! # }
+//! ```
+//!
+//! # Example: Reading a Parquet file with Variant column
+//! (TODO: add example)
+pub use parquet_variant::*;
+pub use parquet_variant_compute as compute;
diff --git a/parquet/tests/simple_variant_integration.rs
b/parquet/tests/variant_integration.rs
similarity index 100%
rename from parquet/tests/simple_variant_integration.rs
rename to parquet/tests/variant_integration.rs