liurenjie1024 commented on code in PR #261:
URL: https://github.com/apache/iceberg-rust/pull/261#discussion_r1535397546


##########
crates/iceberg/src/spec/schema.rs:
##########
@@ -642,6 +644,204 @@ impl SchemaVisitor for IndexByName {
     }
 }
 
+struct PruneColumn {
+    selected: HashSet<i32>,
+    select_full_types: bool,
+}
+
+/// Visit a schema and returns only the fields selected by id set
+pub fn prune_columns(
+    schema: &Schema,
+    selected: HashSet<i32>,
+    select_full_types: bool,
+) -> Result<Option<Type>> {

Review Comment:
   It's odd to me that the ok type is `Option`, I think when it's `None`, it's 
either an error or an empty struct?



##########
crates/iceberg/src/spec/schema.rs:
##########
@@ -642,6 +644,204 @@ impl SchemaVisitor for IndexByName {
     }
 }
 
+struct PruneColumn {
+    selected: HashSet<i32>,
+    select_full_types: bool,
+}
+
+/// Visit a schema and returns only the fields selected by id set
+pub fn prune_columns(
+    schema: &Schema,
+    selected: HashSet<i32>,
+    select_full_types: bool,
+) -> Result<Option<Type>> {
+    let mut visitor = PruneColumn::new(selected, select_full_types);
+    visit_schema(schema, &mut visitor)
+}
+
+impl PruneColumn {
+    fn new(selected: HashSet<i32>, select_full_types: bool) -> Self {
+        Self {
+            selected,
+            select_full_types,
+        }
+    }
+
+    fn project_selected_struct(projected_field: Option<Type>) -> 
Result<StructType> {
+        match projected_field {
+            // If the field is a StructType, return it as such
+            Some(Type::Struct(s)) => Ok(s),
+            Some(_) => Err(Error::new(
+                ErrorKind::DataInvalid,
+                "Projected Field must be Struct".to_string(),
+            )),
+            // If projected_field is None or not a StructType, return an empty 
StructType
+            None => Ok(StructType::default()),
+        }
+    }
+    fn project_list(list: &ListType, element_result: Type) -> Result<ListType> 
{
+        if *list.element_field.field_type == element_result {
+            return Ok(list.clone());
+        }
+        Ok(ListType {
+            element_field: Arc::new(NestedField {
+                id: list.element_field.id,
+                name: list.element_field.name.clone(),
+                required: list.element_field.required,
+                field_type: Box::new(element_result),
+                doc: list.element_field.doc.clone(),
+                initial_default: list.element_field.initial_default.clone(),
+                write_default: list.element_field.write_default.clone(),
+            }),
+        })
+    }
+    fn project_map(map: &MapType, value_result: Type) -> Result<MapType> {
+        if *map.value_field.field_type == value_result {
+            return Ok(map.clone());
+        }
+        Ok(MapType {
+            key_field: map.key_field.clone(),
+            value_field: Arc::new(NestedField {
+                id: map.value_field.id,
+                name: map.value_field.name.clone(),
+                required: map.value_field.required,
+                field_type: Box::new(value_result),
+                doc: map.value_field.doc.clone(),
+                initial_default: map.value_field.initial_default.clone(),
+                write_default: map.value_field.write_default.clone(),
+            }),
+        })
+    }
+}
+
+impl SchemaVisitor for PruneColumn {
+    type T = Option<Type>;
+
+    fn schema(&mut self, _schema: &Schema, value: Option<Type>) -> 
Result<Self::T> {
+        Ok(Some(value.unwrap()))
+    }
+
+    fn field(&mut self, field: &NestedFieldRef, value: Self::T) -> 
Result<Self::T> {

Review Comment:
   ```suggestion
       fn field(&mut self, field: &NestedFieldRef, value: Option<Type>) -> 
Result<Self::T> {
   ```
   
   Sorry for not being clear, I mean we should replace all `Self::T` with 
concrete type for better readability.



##########
crates/iceberg/src/spec/schema.rs:
##########
@@ -642,6 +644,204 @@ impl SchemaVisitor for IndexByName {
     }
 }
 
+struct PruneColumn {
+    selected: HashSet<i32>,
+    select_full_types: bool,
+}
+
+/// Visit a schema and returns only the fields selected by id set
+pub fn prune_columns(
+    schema: &Schema,
+    selected: HashSet<i32>,
+    select_full_types: bool,
+) -> Result<Option<Type>> {
+    let mut visitor = PruneColumn::new(selected, select_full_types);
+    visit_schema(schema, &mut visitor)
+}
+
+impl PruneColumn {
+    fn new(selected: HashSet<i32>, select_full_types: bool) -> Self {
+        Self {
+            selected,
+            select_full_types,
+        }
+    }
+
+    fn project_selected_struct(projected_field: Option<Type>) -> 
Result<StructType> {
+        match projected_field {
+            // If the field is a StructType, return it as such
+            Some(Type::Struct(s)) => Ok(s),
+            Some(_) => Err(Error::new(
+                ErrorKind::DataInvalid,
+                "Projected Field must be Struct".to_string(),
+            )),
+            // If projected_field is None or not a StructType, return an empty 
StructType
+            None => Ok(StructType::default()),
+        }
+    }
+    fn project_list(list: &ListType, element_result: Type) -> Result<ListType> 
{
+        if *list.element_field.field_type == element_result {
+            return Ok(list.clone());
+        }
+        Ok(ListType {
+            element_field: Arc::new(NestedField {
+                id: list.element_field.id,
+                name: list.element_field.name.clone(),
+                required: list.element_field.required,
+                field_type: Box::new(element_result),
+                doc: list.element_field.doc.clone(),
+                initial_default: list.element_field.initial_default.clone(),
+                write_default: list.element_field.write_default.clone(),
+            }),
+        })
+    }
+    fn project_map(map: &MapType, value_result: Type) -> Result<MapType> {
+        if *map.value_field.field_type == value_result {
+            return Ok(map.clone());
+        }
+        Ok(MapType {
+            key_field: map.key_field.clone(),
+            value_field: Arc::new(NestedField {
+                id: map.value_field.id,
+                name: map.value_field.name.clone(),
+                required: map.value_field.required,
+                field_type: Box::new(value_result),
+                doc: map.value_field.doc.clone(),
+                initial_default: map.value_field.initial_default.clone(),
+                write_default: map.value_field.write_default.clone(),
+            }),
+        })
+    }
+}
+
+impl SchemaVisitor for PruneColumn {
+    type T = Option<Type>;
+
+    fn schema(&mut self, _schema: &Schema, value: Option<Type>) -> 
Result<Self::T> {
+        Ok(Some(value.unwrap()))
+    }
+
+    fn field(&mut self, field: &NestedFieldRef, value: Self::T) -> 
Result<Self::T> {
+        if self.selected.contains(&field.id) {
+            if self.select_full_types {
+                Ok(Some(*field.field_type.clone()))
+            } else if field.field_type.is_struct() {
+                return 
Ok(Some(Type::Struct(PruneColumn::project_selected_struct(
+                    value,
+                )?)));
+            } else if !field.field_type.is_nested() {
+                return Ok(Some(*field.field_type.clone()));
+            } else {
+                return Err(Error::new(
+                    ErrorKind::DataInvalid,
+                    "Can't project list or map field directly when not 
selecting full type."
+                        .to_string(),
+                )
+                .with_context("field_id", field.id.to_string())
+                .with_context("field_type", field.field_type.to_string()));
+            }
+        } else {
+            Ok(value)
+        }
+    }
+
+    fn r#struct(&mut self, r#struct: &StructType, results: Vec<Self::T>) -> 
Result<Self::T> {
+        let fields = r#struct.fields();
+        let mut selected_field = Vec::with_capacity(fields.len());
+        let mut same_type = true;
+
+        for (field, projected_type) in zip_eq(fields.iter(), results.iter()) {
+            if let Some(projected_type) = projected_type {
+                if *field.field_type == *projected_type {
+                    selected_field.push(field.clone());
+                } else {
+                    same_type = false;
+                    let new_field = NestedField {
+                        id: field.id,
+                        name: field.name.clone(),
+                        required: field.required,
+                        field_type: Box::new(projected_type.clone()),
+                        doc: field.doc.clone(),
+                        initial_default: field.initial_default.clone(),
+                        write_default: field.write_default.clone(),
+                    };
+                    selected_field.push(Arc::new(new_field));
+                }
+            }
+        }
+
+        if !selected_field.is_empty() {
+            if selected_field.len() == fields.len() && same_type {
+                return Ok(Some(Type::Struct(r#struct.clone())));
+            } else {
+                return Ok(Some(Type::Struct(StructType::new(selected_field))));
+            }
+        }
+        Ok(None)
+    }
+
+    fn list(&mut self, list: &ListType, value: Self::T) -> Result<Self::T> {
+        if self.selected.contains(&list.element_field.id) {
+            if self.select_full_types {
+                Ok(Some(Type::List(list.clone())))
+            } else if list.element_field.field_type.is_struct() {
+                let projected_struct =
+                    
PruneColumn::project_selected_struct(Some(value.unwrap())).unwrap();

Review Comment:
   ```suggestion
                       PruneColumn::project_selected_struct(value).unwrap();
   ```



##########
crates/iceberg/src/spec/schema.rs:
##########
@@ -1338,4 +1538,522 @@ table {
             );
         }
     }
+    #[test]
+    fn test_schema_prune_columns_string() {
+        let expected_type = Type::from(
+            Schema::builder()
+                .with_fields(vec![NestedField::optional(
+                    1,
+                    "foo",
+                    Type::Primitive(PrimitiveType::String),
+                )
+                .into()])
+                .build()
+                .unwrap()
+                .as_struct()
+                .clone(),
+        );
+        let schema = table_schema_nested();
+        let selected: HashSet<i32> = HashSet::from([1]);
+        let mut visitor = PruneColumn::new(selected, false);
+        let result = visit_schema(&schema, &mut visitor);
+        assert!(result.is_ok());
+        assert_eq!(result.unwrap().unwrap(), expected_type);
+    }
+
+    #[test]
+    fn test_schema_prune_columns_string_full() {
+        let expected_type = Type::from(
+            Schema::builder()
+                .with_fields(vec![NestedField::optional(
+                    1,
+                    "foo",
+                    Type::Primitive(PrimitiveType::String),
+                )
+                .into()])
+                .build()
+                .unwrap()
+                .as_struct()
+                .clone(),
+        );
+        let schema = table_schema_nested();
+        let selected: HashSet<i32> = HashSet::from([1]);
+        let mut visitor = PruneColumn::new(selected, true);
+        let result = visit_schema(&schema, &mut visitor);

Review Comment:
   Ditto



##########
crates/iceberg/src/spec/schema.rs:
##########
@@ -642,6 +644,204 @@ impl SchemaVisitor for IndexByName {
     }
 }
 
+struct PruneColumn {
+    selected: HashSet<i32>,
+    select_full_types: bool,
+}
+
+/// Visit a schema and returns only the fields selected by id set
+pub fn prune_columns(
+    schema: &Schema,
+    selected: HashSet<i32>,
+    select_full_types: bool,
+) -> Result<Option<Type>> {
+    let mut visitor = PruneColumn::new(selected, select_full_types);
+    visit_schema(schema, &mut visitor)
+}
+
+impl PruneColumn {
+    fn new(selected: HashSet<i32>, select_full_types: bool) -> Self {
+        Self {
+            selected,
+            select_full_types,
+        }
+    }
+
+    fn project_selected_struct(projected_field: Option<Type>) -> 
Result<StructType> {
+        match projected_field {
+            // If the field is a StructType, return it as such
+            Some(Type::Struct(s)) => Ok(s),
+            Some(_) => Err(Error::new(
+                ErrorKind::DataInvalid,

Review Comment:
   ```suggestion
                   ErrorKind::Unexpected,
   ```
   This should be a bug, and we should return `Unexpected` error.



##########
crates/iceberg/src/spec/schema.rs:
##########
@@ -642,6 +644,204 @@ impl SchemaVisitor for IndexByName {
     }
 }
 
+struct PruneColumn {
+    selected: HashSet<i32>,
+    select_full_types: bool,
+}
+
+/// Visit a schema and returns only the fields selected by id set
+pub fn prune_columns(
+    schema: &Schema,
+    selected: HashSet<i32>,
+    select_full_types: bool,
+) -> Result<Option<Type>> {
+    let mut visitor = PruneColumn::new(selected, select_full_types);
+    visit_schema(schema, &mut visitor)
+}
+
+impl PruneColumn {
+    fn new(selected: HashSet<i32>, select_full_types: bool) -> Self {
+        Self {
+            selected,
+            select_full_types,
+        }
+    }
+
+    fn project_selected_struct(projected_field: Option<Type>) -> 
Result<StructType> {
+        match projected_field {
+            // If the field is a StructType, return it as such
+            Some(Type::Struct(s)) => Ok(s),
+            Some(_) => Err(Error::new(
+                ErrorKind::DataInvalid,
+                "Projected Field must be Struct".to_string(),

Review Comment:
   ```suggestion
                   "Projected field with struct type must be 
struct".to_string(),
   ```



##########
crates/iceberg/src/spec/schema.rs:
##########
@@ -1338,4 +1538,522 @@ table {
             );
         }
     }
+    #[test]
+    fn test_schema_prune_columns_string() {
+        let expected_type = Type::from(
+            Schema::builder()
+                .with_fields(vec![NestedField::optional(
+                    1,
+                    "foo",
+                    Type::Primitive(PrimitiveType::String),
+                )
+                .into()])
+                .build()
+                .unwrap()
+                .as_struct()
+                .clone(),
+        );
+        let schema = table_schema_nested();
+        let selected: HashSet<i32> = HashSet::from([1]);
+        let mut visitor = PruneColumn::new(selected, false);
+        let result = visit_schema(&schema, &mut visitor);

Review Comment:
   nit: We can use `prune_columns` directly here.



##########
crates/iceberg/src/spec/schema.rs:
##########
@@ -642,6 +644,204 @@ impl SchemaVisitor for IndexByName {
     }
 }
 
+struct PruneColumn {
+    selected: HashSet<i32>,
+    select_full_types: bool,
+}
+
+/// Visit a schema and returns only the fields selected by id set
+pub fn prune_columns(
+    schema: &Schema,
+    selected: HashSet<i32>,

Review Comment:
   ```suggestion
       selected: impl IntoIterator<item=i32>,
   ```
   nit: This makes apis easier to use.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to