This is an automated email from the ASF dual-hosted git repository.
liurenjie1024 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-rust.git
The following commit(s) were added to refs/heads/main by this push:
new 8dd4a221 Read ManifestList V1 with V2 projection. (#1482)
8dd4a221 is described below
commit 8dd4a2216728c6fcad42a4d48ea7e39bef30517d
Author: Alex Stephen <[email protected]>
AuthorDate: Mon Jul 14 02:55:57 2025 -0700
Read ManifestList V1 with V2 projection. (#1482)
## Which issue does this PR close?
- Closes #1471
## What changes are included in this PR?
On ManifestList data files in v1, this sets the default content-type to
DATA (0).
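## Are these changes tested?
Yes. A new unit test, `test_data_file_serialize_deserialize_v1_data_on_v2_reader`,
writes data files as V1 and reads them back with a V2 reader, asserting that
`content` is `DataContentType::Data`.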
---
crates/iceberg/src/avro/schema.rs | 13 ++++++---
crates/iceberg/src/spec/manifest/_serde.rs | 46 ++++++++++++++++++++++++++----
crates/iceberg/src/spec/manifest/entry.rs | 14 ++++-----
3 files changed, 56 insertions(+), 17 deletions(-)
diff --git a/crates/iceberg/src/avro/schema.rs b/crates/iceberg/src/avro/schema.rs
index 2411c318..b08a6730 100644
--- a/crates/iceberg/src/avro/schema.rs
+++ b/crates/iceberg/src/avro/schema.rs
@@ -81,6 +81,14 @@ impl SchemaVisitor for SchemaToAvroSchema {
field_schema = avro_optional(field_schema)?;
}
+ let default = if let Some(literal) = &field.initial_default {
+ Some(literal.clone().try_into_json(&field.field_type)?)
+ } else if !field.required {
+ Some(Value::Null)
+ } else {
+ None
+ };
+
let mut avro_record_field = AvroRecordField {
name: field.name.clone(),
schema: field_schema,
@@ -88,13 +96,10 @@ impl SchemaVisitor for SchemaToAvroSchema {
position: 0,
doc: field.doc.clone(),
aliases: None,
- default: None,
+ default,
custom_attributes: Default::default(),
};
- if !field.required {
- avro_record_field.default = Some(Value::Null);
- }
avro_record_field.custom_attributes.insert(
FIELD_ID_PROP.to_string(),
Value::Number(Number::from(field.id)),
diff --git a/crates/iceberg/src/spec/manifest/_serde.rs b/crates/iceberg/src/spec/manifest/_serde.rs
index 97923c7a..fd7bc2e6 100644
--- a/crates/iceberg/src/spec/manifest/_serde.rs
+++ b/crates/iceberg/src/spec/manifest/_serde.rs
@@ -330,9 +330,8 @@ mod tests {
assert_eq!(ret, expected_ret, "Negative i64 entry should be ignored!");
}
- #[tokio::test]
- async fn test_data_file_serialize_deserialize() {
- let schema = Arc::new(
+ fn schema() -> Arc<Schema> {
+ Arc::new(
Schema::builder()
.with_fields(vec![
Arc::new(NestedField::optional(
@@ -353,8 +352,11 @@ mod tests {
])
.build()
.unwrap(),
- );
- let data_files = vec![DataFile {
+ )
+ }
+
+ fn data_files() -> Vec<DataFile> {
+ vec![DataFile {
content: DataContentType::Data,
file_path: "s3://testbucket/iceberg_data/iceberg_ctl/iceberg_db/iceberg_tbl/data/00000-7-45268d71-54eb-476c-b42c-942d880c04a1-00001.parquet".to_string(),
file_format: DataFileFormat::Parquet,
@@ -376,7 +378,13 @@ mod tests {
referenced_data_file: None,
content_offset: None,
content_size_in_bytes: None,
- }];
+ }]
+ }
+
+ #[tokio::test]
+ async fn test_data_file_serialize_deserialize() {
+ let schema = schema();
+ let data_files = data_files();
let mut buffer = Vec::new();
let _ = write_data_files_to_avro(
@@ -398,4 +406,30 @@ mod tests {
assert_eq!(data_files, actual_data_file);
}
+
+ #[tokio::test]
+ async fn test_data_file_serialize_deserialize_v1_data_on_v2_reader() {
+ let schema = schema();
+ let data_files = data_files();
+
+ let mut buffer = Vec::new();
+ let _ = write_data_files_to_avro(
+ &mut buffer,
+ data_files.clone().into_iter(),
+ &StructType::new(vec![]),
+ FormatVersion::V1,
+ )
+ .unwrap();
+
+ let actual_data_file = read_data_files_from_avro(
+ &mut Cursor::new(buffer),
+ &schema,
+ 0,
+ &StructType::new(vec![]),
+ FormatVersion::V2,
+ )
+ .unwrap();
+
+ assert_eq!(actual_data_file[0].content, DataContentType::Data)
+ }
}
diff --git a/crates/iceberg/src/spec/manifest/entry.rs b/crates/iceberg/src/spec/manifest/entry.rs
index 7d2f982d..7ba9efb3 100644
--- a/crates/iceberg/src/spec/manifest/entry.rs
+++ b/crates/iceberg/src/spec/manifest/entry.rs
@@ -24,8 +24,8 @@ use typed_builder::TypedBuilder;
use crate::avro::schema_to_avro_schema;
use crate::error::Result;
use crate::spec::{
- DataContentType, DataFile, INITIAL_SEQUENCE_NUMBER, ListType, ManifestFile, MapType,
- NestedField, NestedFieldRef, PrimitiveType, Schema, StructType, Type,
+ DataContentType, DataFile, INITIAL_SEQUENCE_NUMBER, ListType, Literal, ManifestFile, MapType,
+ NestedField, NestedFieldRef, PrimitiveLiteral, PrimitiveType, Schema, StructType, Type,
};
use crate::{Error, ErrorKind};
@@ -232,11 +232,11 @@ static FILE_SEQUENCE_NUMBER: Lazy<NestedFieldRef> = {
static CONTENT: Lazy<NestedFieldRef> = {
Lazy::new(|| {
- Arc::new(NestedField::required(
- 134,
- "content",
- Type::Primitive(PrimitiveType::Int),
- ))
+ Arc::new(
+ NestedField::required(134, "content", Type::Primitive(PrimitiveType::Int))
+ // 0 refers to DataContentType::DATA
+ .with_initial_default(Literal::Primitive(PrimitiveLiteral::Int(0))),
+ )
})
};