This is an automated email from the ASF dual-hosted git repository.
blackmwk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-rust.git
The following commit(s) were added to refs/heads/main by this push:
new 023f4ea12 feat(arrow): pass `&FieldRef` instead of `&Field` in
`ArrowSchemaVisitor` to achieve cheap clone (#2471)
023f4ea12 is described below
commit 023f4ea12a763e7af63cce91d52d717247923fd6
Author: Liang Gong <[email protected]>
AuthorDate: Tue May 26 20:44:13 2026 +0800
feat(arrow): pass `&FieldRef` instead of `&Field` in `ArrowSchemaVisitor`
to achieve cheap clone (#2471)
## Which issue does this PR close?
<!--
We generally require a GitHub issue to be filed for all bug fixes and
enhancements and this helps us generate change logs for our releases.
You can link an issue to this PR using the GitHub syntax. For example
`Closes #123` indicates that this PR will close issue #123.
-->
- Closes #2310
## What changes are included in this PR?
<!--
Provide a summary of the modifications in this PR. List the main changes
such as new features, bug fixes, refactoring, or any other updates.
-->
Trait `ArrowSchemaVisitor` accepts `&FieldRef` instead of `&Field`.
Correspondingly, struct that implements this trait has also changed,
such as `Int96CoercionVisitor`, `ArrowSchemaConverter` and
`MetadataStripVisitor`.
All changes happen in `crates/iceberg/src/arrow/schema.rs` and
`crates/iceberg/src/arrow/int96.rs`
## Are these changes tested?
<!--
Specify what test covers (unit test, integration test, etc.).
If tests are not included in your PR, please explain why (for example,
are they covered by existing tests)?
-->
Yes.
---
crates/iceberg/src/arrow/int96.rs | 31 +++++++++++++++----------------
crates/iceberg/src/arrow/schema.rs | 30 +++++++++++++++---------------
2 files changed, 30 insertions(+), 31 deletions(-)
diff --git a/crates/iceberg/src/arrow/int96.rs
b/crates/iceberg/src/arrow/int96.rs
index 63a7a30f1..6add02309 100644
--- a/crates/iceberg/src/arrow/int96.rs
+++ b/crates/iceberg/src/arrow/int96.rs
@@ -20,7 +20,7 @@
use std::sync::Arc;
use arrow_schema::{
- DataType, Field, Fields, Schema as ArrowSchema, SchemaRef as
ArrowSchemaRef, TimeUnit,
+ DataType, Field, FieldRef, Fields, Schema as ArrowSchema, SchemaRef as
ArrowSchemaRef, TimeUnit,
};
use parquet::arrow::PARQUET_FIELD_ID_META_KEY;
@@ -58,8 +58,7 @@ pub(crate) fn coerce_int96_timestamps(
/// indicated by the Iceberg schema.
struct Int96CoercionVisitor<'a> {
iceberg_schema: &'a Schema,
- // TODO(#2310): use FieldRef (Arc<Field>) once ArrowSchemaVisitor passes
FieldRef.
- field_stack: Vec<Field>,
+ field_stack: Vec<FieldRef>,
changed: bool,
}
@@ -75,7 +74,7 @@ impl<'a> Int96CoercionVisitor<'a> {
/// Determine the target TimeUnit for a Timestamp(Nanosecond) field based
on the
/// Iceberg schema. Falls back to microsecond when field IDs are
unavailable,
/// matching Iceberg Java behavior.
- fn target_unit(&self, field: &Field) -> Option<TimeUnit> {
+ fn target_unit(&self, field: &FieldRef) -> Option<TimeUnit> {
if !matches!(
field.data_type(),
DataType::Timestamp(TimeUnit::Nanosecond, _)
@@ -112,42 +111,42 @@ impl ArrowSchemaVisitor for Int96CoercionVisitor<'_> {
type T = Field;
type U = ArrowSchema;
- fn before_field(&mut self, field: &Field) -> Result<()> {
- self.field_stack.push(field.as_ref().clone());
+ fn before_field(&mut self, field: &FieldRef) -> Result<()> {
+ self.field_stack.push(field.clone());
Ok(())
}
- fn after_field(&mut self, _field: &Field) -> Result<()> {
+ fn after_field(&mut self, _field: &FieldRef) -> Result<()> {
self.field_stack.pop();
Ok(())
}
- fn before_list_element(&mut self, field: &Field) -> Result<()> {
- self.field_stack.push(field.as_ref().clone());
+ fn before_list_element(&mut self, field: &FieldRef) -> Result<()> {
+ self.field_stack.push(field.clone());
Ok(())
}
- fn after_list_element(&mut self, _field: &Field) -> Result<()> {
+ fn after_list_element(&mut self, _field: &FieldRef) -> Result<()> {
self.field_stack.pop();
Ok(())
}
- fn before_map_key(&mut self, field: &Field) -> Result<()> {
- self.field_stack.push(field.as_ref().clone());
+ fn before_map_key(&mut self, field: &FieldRef) -> Result<()> {
+ self.field_stack.push(field.clone());
Ok(())
}
- fn after_map_key(&mut self, _field: &Field) -> Result<()> {
+ fn after_map_key(&mut self, _field: &FieldRef) -> Result<()> {
self.field_stack.pop();
Ok(())
}
- fn before_map_value(&mut self, field: &Field) -> Result<()> {
- self.field_stack.push(field.as_ref().clone());
+ fn before_map_value(&mut self, field: &FieldRef) -> Result<()> {
+ self.field_stack.push(field.clone());
Ok(())
}
- fn after_map_value(&mut self, _field: &Field) -> Result<()> {
+ fn after_map_value(&mut self, _field: &FieldRef) -> Result<()> {
self.field_stack.pop();
Ok(())
}
diff --git a/crates/iceberg/src/arrow/schema.rs
b/crates/iceberg/src/arrow/schema.rs
index 9b504421a..e6649d926 100644
--- a/crates/iceberg/src/arrow/schema.rs
+++ b/crates/iceberg/src/arrow/schema.rs
@@ -26,7 +26,7 @@ use arrow_array::{
FixedSizeBinaryArray, Float32Array, Float64Array, Int32Array, Int64Array,
Scalar, StringArray,
TimestampMicrosecondArray, TimestampNanosecondArray,
};
-use arrow_schema::{DataType, Field, Fields, Schema as ArrowSchema, TimeUnit};
+use arrow_schema::{DataType, Field, FieldRef, Fields, Schema as ArrowSchema,
TimeUnit};
use parquet::arrow::PARQUET_FIELD_ID_META_KEY;
use parquet::file::statistics::Statistics;
use uuid::Uuid;
@@ -55,42 +55,42 @@ pub trait ArrowSchemaVisitor {
type U;
/// Called before struct/list/map field.
- fn before_field(&mut self, _field: &Field) -> Result<()> {
+ fn before_field(&mut self, _field: &FieldRef) -> Result<()> {
Ok(())
}
/// Called after struct/list/map field.
- fn after_field(&mut self, _field: &Field) -> Result<()> {
+ fn after_field(&mut self, _field: &FieldRef) -> Result<()> {
Ok(())
}
/// Called before list element.
- fn before_list_element(&mut self, _field: &Field) -> Result<()> {
+ fn before_list_element(&mut self, _field: &FieldRef) -> Result<()> {
Ok(())
}
/// Called after list element.
- fn after_list_element(&mut self, _field: &Field) -> Result<()> {
+ fn after_list_element(&mut self, _field: &FieldRef) -> Result<()> {
Ok(())
}
/// Called before map key.
- fn before_map_key(&mut self, _field: &Field) -> Result<()> {
+ fn before_map_key(&mut self, _field: &FieldRef) -> Result<()> {
Ok(())
}
/// Called after map key.
- fn after_map_key(&mut self, _field: &Field) -> Result<()> {
+ fn after_map_key(&mut self, _field: &FieldRef) -> Result<()> {
Ok(())
}
/// Called before map value.
- fn before_map_value(&mut self, _field: &Field) -> Result<()> {
+ fn before_map_value(&mut self, _field: &FieldRef) -> Result<()> {
Ok(())
}
/// Called after map value.
- fn after_map_value(&mut self, _field: &Field) -> Result<()> {
+ fn after_map_value(&mut self, _field: &FieldRef) -> Result<()> {
Ok(())
}
@@ -176,7 +176,7 @@ fn visit_type<V: ArrowSchemaVisitor>(r#type: &DataType,
visitor: &mut V) -> Resu
/// Visit list types in post order.
fn visit_list<V: ArrowSchemaVisitor>(
data_type: &DataType,
- element_field: &Field,
+ element_field: &FieldRef,
visitor: &mut V,
) -> Result<V::T> {
visitor.before_list_element(element_field)?;
@@ -244,7 +244,7 @@ pub fn arrow_type_to_type(ty: &DataType) -> Result<Type> {
const ARROW_FIELD_DOC_KEY: &str = "doc";
-pub(super) fn get_field_id_from_metadata(field: &Field) -> Result<i32> {
+pub(super) fn get_field_id_from_metadata(field: &FieldRef) -> Result<i32> {
if let Some(value) = field.metadata().get(PARQUET_FIELD_ID_META_KEY) {
return value.parse::<i32>().map_err(|e| {
Error::new(
@@ -261,7 +261,7 @@ pub(super) fn get_field_id_from_metadata(field: &Field) ->
Result<i32> {
))
}
-fn get_field_doc(field: &Field) -> Option<String> {
+fn get_field_doc(field: &FieldRef) -> Option<String> {
if let Some(value) = field.metadata().get(ARROW_FIELD_DOC_KEY) {
return Some(value.clone());
}
@@ -293,7 +293,7 @@ impl ArrowSchemaConverter {
}
}
- fn get_field_id(&mut self, field: &Field) -> Result<i32> {
+ fn get_field_id(&mut self, field: &FieldRef) -> Result<i32> {
if self.reassign_field_ids_from.is_some() {
// Field IDs will be reassigned by the schema builder.
// We need unique temporary IDs because ReassignFieldIds builds an
@@ -1159,7 +1159,7 @@ impl ArrowSchemaVisitor for MetadataStripVisitor {
type T = Field;
type U = ArrowSchema;
- fn before_field(&mut self, field: &Field) -> Result<()> {
+ fn before_field(&mut self, field: &FieldRef) -> Result<()> {
// Store field name and nullability for later reconstruction
self.field_stack.push(Field::new(
field.name(),
@@ -1169,7 +1169,7 @@ impl ArrowSchemaVisitor for MetadataStripVisitor {
Ok(())
}
- fn after_field(&mut self, _field: &Field) -> Result<()> {
+ fn after_field(&mut self, _field: &FieldRef) -> Result<()> {
Ok(())
}