Re: [PR] feat: support serialize/deserialize DataFile into avro bytes [iceberg-rust]

via GitHub Mon, 30 Dec 2024 19:53:21 -0800


liurenjie1024 commented on code in PR #797:
URL: https://github.com/apache/iceberg-rust/pull/797#discussion_r1899889761



##########
crates/iceberg/src/spec/manifest.rs:
##########
@@ -656,6 +656,38 @@ mod _const_schema {
         })
     };
 
+    fn data_file_fields_v2(partition_type: StructType) -> Vec<NestedFieldRef> {

Review Comment:
   ```suggestion
       fn data_file_fields_v2(partition_type: &StructType) -> 
Vec<NestedFieldRef> {
   ```



##########
crates/iceberg/src/spec/manifest.rs:
##########
@@ -656,6 +656,38 @@ mod _const_schema {
         })
     };
 
+    fn data_file_fields_v2(partition_type: StructType) -> Vec<NestedFieldRef> {
+        vec![
+            CONTENT.clone(),
+            FILE_PATH.clone(),
+            FILE_FORMAT.clone(),
+            Arc::new(NestedField::required(
+                102,
+                "partition",
+                Type::Struct(partition_type),
+            )),
+            RECORD_COUNT.clone(),
+            FILE_SIZE_IN_BYTES.clone(),
+            COLUMN_SIZES.clone(),
+            VALUE_COUNTS.clone(),
+            NULL_VALUE_COUNTS.clone(),
+            NAN_VALUE_COUNTS.clone(),
+            LOWER_BOUNDS.clone(),
+            UPPER_BOUNDS.clone(),
+            KEY_METADATA.clone(),
+            SPLIT_OFFSETS.clone(),
+            EQUALITY_IDS.clone(),
+            SORT_ORDER_ID.clone(),
+        ]
+    }
+
+    pub(super) fn data_file_schema_v2(partition_type: StructType) -> 
Result<AvroSchema, Error> {

Review Comment:
   ```suggestion
       pub(super) fn data_file_schema_v2(partition_type: &StructType) -> 
Result<AvroSchema, Error> {
   ```



##########
crates/iceberg/src/spec/manifest.rs:
##########
@@ -1189,6 +1211,47 @@ impl DataFile {
         self.sort_order_id
     }
 }
+
+/// Convert data files to avro bytes.
+pub fn data_files_to_avro(

Review Comment:
   To maintain api consistency, I would suggest to have sth like following:
   
   ```rust
   /// comments
   pub struct DataFileWriter<W> {
     writer: W,
     version: FormatVersion,
     partition_type: &StructType
   }
   
   impl<W> DataFileWriter<W> {
     pub fn v1_writer(w: W, partition_type: &StructType) {
     }
   }
   
   impl<W: Write> DataFileWriter {
     pub fn append(data_file: &DataFile) { ... }
     pub fn fluse(mut self) {...}
   }
   ```
   
   It would be better to have similar data structures for reader. 



##########
crates/iceberg/src/spec/manifest.rs:
##########
@@ -665,62 +697,52 @@ mod _const_schema {
             Arc::new(NestedField::required(
                 2,
                 "data_file",
-                Type::Struct(StructType::new(vec![
-                    CONTENT.clone(),
-                    FILE_PATH.clone(),
-                    FILE_FORMAT.clone(),
-                    Arc::new(NestedField::required(
-                        102,
-                        "partition",
-                        Type::Struct(partition_type),
-                    )),
-                    RECORD_COUNT.clone(),
-                    FILE_SIZE_IN_BYTES.clone(),
-                    COLUMN_SIZES.clone(),
-                    VALUE_COUNTS.clone(),
-                    NULL_VALUE_COUNTS.clone(),
-                    NAN_VALUE_COUNTS.clone(),
-                    LOWER_BOUNDS.clone(),
-                    UPPER_BOUNDS.clone(),
-                    KEY_METADATA.clone(),
-                    SPLIT_OFFSETS.clone(),
-                    EQUALITY_IDS.clone(),
-                    SORT_ORDER_ID.clone(),
-                ])),
+                
Type::Struct(StructType::new(data_file_fields_v2(partition_type))),
             )),
         ];
         let schema = Schema::builder().with_fields(fields).build()?;
         schema_to_avro_schema("manifest_entry", &schema)
     }
 
+    fn data_file_fields_v1(partition_type: StructType) -> Vec<NestedFieldRef> {

Review Comment:
   ```suggestion
       fn data_file_fields_v1(partition_type: &StructType) -> 
Vec<NestedFieldRef> {
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] feat: support serialize/deserialize DataFile into avro bytes [iceberg-rust]

Reply via email to