This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-rust.git


The following commit(s) were added to refs/heads/main by this push:
     new 7e46240  feat: Allow Schema Serialization/deserialization (#46)
7e46240 is described below

commit 7e462408b06fa37f2332cb5245b5394000e8e7d7
Author: y0psolo <[email protected]>
AuthorDate: Thu Aug 31 13:29:14 2023 +0200

    feat: Allow Schema Serialization/deserialization (#46)
    
    * Implement Schema serialize/deserialize
    
    * review comments
    
    * Convert Schema to SchemaV2 only
    
    * Add a test for serialization/deserialization without schema-id
    Add a test to ensure the correct enum is chosen by the ser/deser crate
    Reuse existing table_schema_simple schema for test
    Pretty indent of JSON Schema
    
    * Move json string inside the table_schema_simple function with schema build
    
    * fix cargo fmt errors
---
 crates/iceberg/src/spec/schema.rs | 104 +++++++++++++++++++++++++++++++++++---
 1 file changed, 96 insertions(+), 8 deletions(-)

diff --git a/crates/iceberg/src/spec/schema.rs 
b/crates/iceberg/src/spec/schema.rs
index 865df25..2e9ead2 100644
--- a/crates/iceberg/src/spec/schema.rs
+++ b/crates/iceberg/src/spec/schema.rs
@@ -25,14 +25,18 @@ use crate::spec::datatypes::{
 use crate::{ensure_data_valid, Error, ErrorKind};
 use bimap::BiHashMap;
 use itertools::Itertools;
+use serde::{Deserialize, Serialize};
 use std::collections::{HashMap, HashSet};
 use std::fmt::{Display, Formatter};
 use std::sync::OnceLock;
 
+use _serde::SchemaEnum;
+
 const DEFAULT_SCHEMA_ID: i32 = 0;
 
 /// Defines schema in iceberg.
-#[derive(Debug, PartialEq, Eq, Clone)]
+#[derive(Debug, PartialEq, Serialize, Deserialize, Eq, Clone)]
+#[serde(try_from = "SchemaEnum", into = "SchemaEnum")]
 pub struct Schema {
     r#struct: StructType,
     schema_id: i32,
@@ -620,6 +624,14 @@ pub(super) mod _serde {
 
     use super::{Schema, DEFAULT_SCHEMA_ID};
 
+    #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
+    #[serde(untagged)]
+    /// Enum for Schema serialization/deserialization
+    pub(super) enum SchemaEnum {
+        V2(SchemaV2),
+        V1(SchemaV1),
+    }
+
     #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
     #[serde(rename_all = "kebab-case")]
     /// Defines the structure of a v2 schema for serialization/deserialization
@@ -643,6 +655,23 @@ pub(super) mod _serde {
         pub fields: StructType,
     }
 
+    /// Helper to serialize/deserialize Schema
+    impl TryFrom<SchemaEnum> for Schema {
+        type Error = Error;
+        fn try_from(value: SchemaEnum) -> Result<Self> {
+            match value {
+                SchemaEnum::V2(value) => value.try_into(),
+                SchemaEnum::V1(value) => value.try_into(),
+            }
+        }
+    }
+
+    impl From<Schema> for SchemaEnum {
+        fn from(value: Schema) -> Self {
+            SchemaEnum::V2(value.into())
+        }
+    }
+
     impl TryFrom<SchemaV2> for Schema {
         type Error = Error;
         fn try_from(value: SchemaV2) -> Result<Self> {
@@ -702,9 +731,42 @@ mod tests {
         ListType, MapType, NestedField, NestedFieldRef, PrimitiveType, 
StructType, Type,
     };
     use crate::spec::schema::Schema;
-    use crate::spec::schema::_serde::SchemaV2;
+    use crate::spec::schema::_serde::{SchemaEnum, SchemaV1, SchemaV2};
     use std::collections::HashMap;
 
+    use super::DEFAULT_SCHEMA_ID;
+
+    fn check_schema_serde(json: &str, expected_type: Schema, _expected_enum: 
SchemaEnum) {
+        let desered_type: Schema = serde_json::from_str(json).unwrap();
+        assert_eq!(desered_type, expected_type);
+        assert!(matches!(desered_type.clone(), _expected_enum));
+
+        let sered_json = serde_json::to_string(&expected_type).unwrap();
+        let parsed_json_value = 
serde_json::from_str::<Schema>(&sered_json).unwrap();
+
+        assert_eq!(parsed_json_value, desered_type);
+    }
+
+    #[test]
+    fn test_serde_with_schema_id() {
+        let (schema, record) = table_schema_simple();
+
+        let x: SchemaV2 = serde_json::from_str(record).unwrap();
+        check_schema_serde(record, schema, SchemaEnum::V2(x));
+    }
+
+    #[test]
+    fn test_serde_without_schema_id() {
+        let (mut schema, record) = table_schema_simple();
+        // Remove the `"schema-id":1,` entry from the example JSON
+        let new_record = record.replace("\"schema-id\":1,", "");
+        // By default schema_id field is set to DEFAULT_SCHEMA_ID when no 
value is set in json
+        schema.schema_id = DEFAULT_SCHEMA_ID;
+
+        let x: SchemaV1 = serde_json::from_str(new_record.as_str()).unwrap();
+        check_schema_serde(&new_record, schema, SchemaEnum::V1(x));
+    }
+
     #[test]
     fn test_construct_schema() {
         let field1: NestedFieldRef =
@@ -763,8 +825,8 @@ mod tests {
         assert!(!result.fields[1].required);
     }
 
-    fn table_schema_simple() -> Schema {
-        Schema::builder()
+    fn table_schema_simple<'a>() -> (Schema, &'a str) {
+        let schema = Schema::builder()
             .with_schema_id(1)
             .with_identifier_field_ids(vec![2])
             .with_fields(vec![
@@ -773,7 +835,33 @@ mod tests {
                 NestedField::optional(3, "baz", 
Type::Primitive(PrimitiveType::Boolean)).into(),
             ])
             .build()
-            .unwrap()
+            .unwrap();
+        let record = r#"{
+            "type":"struct",
+            "schema-id":1,
+            "fields":[
+                {
+                    "id":1,
+                    "name":"foo",
+                    "required":false,
+                    "type":"string"
+                },
+                {
+                    "id":2,
+                    "name":"bar",
+                    "required":true,
+                    "type":"int"
+                },
+                {
+                    "id":3,
+                    "name":"baz",
+                    "required":false,
+                    "type":"boolean"
+                }
+            ],
+            "identifier-field-ids":[2]
+        }"#;
+        (schema, record)
     }
 
     fn table_schema_nested() -> Schema {
@@ -879,7 +967,7 @@ table {
 }
 "#;
 
-        assert_eq!(expected_str, format!("\n{}", table_schema_simple()));
+        assert_eq!(expected_str, format!("\n{}", table_schema_simple().0));
     }
 
     #[test]
@@ -973,7 +1061,7 @@ table {
     fn test_schema_find_column_name_by_id_simple() {
         let expected_id_to_name = HashMap::from([(1, "foo"), (2, "bar"), (3, 
"baz")]);
 
-        let schema = table_schema_simple();
+        let schema = table_schema_simple().0;
 
         for (id, name) in expected_id_to_name {
             assert_eq!(
@@ -987,7 +1075,7 @@ table {
 
     #[test]
     fn test_schema_find_simple() {
-        let schema = table_schema_simple();
+        let schema = table_schema_simple().0;
 
         assert_eq!(
             Some(schema.r#struct.fields()[0].clone()),

Reply via email to