Re: [PR] Set Glue Table Information when creating/updating tables [iceberg-python]

via GitHub Sat, 20 Jan 2024 22:44:43 -0800


HonahX commented on code in PR #288:
URL: https://github.com/apache/iceberg-python/pull/288#discussion_r1460723261



##########
pyiceberg/catalog/glue.py:
##########
@@ -84,19 +110,105 @@ def _construct_parameters(
     return new_parameters
 
 
+def _type_to_glue_type_string(input_type: IcebergType) -> str:
+    if isinstance(input_type, BooleanType):
+        return "boolean"
+    if isinstance(input_type, IntegerType):
+        return "int"
+    if isinstance(input_type, LongType):
+        return "bigint"
+    if isinstance(input_type, FloatType):
+        return "float"
+    if isinstance(input_type, DoubleType):
+        return "double"
+    if isinstance(input_type, DateType):
+        return "date"
+    if isinstance(
+        input_type,
+        (
+            TimeType,
+            StringType,
+            UUIDType,
+        ),
+    ):
+        return "string"
+    if isinstance(input_type, TimestampType):
+        return "timestamp"
+    if isinstance(
+        input_type,
+        (
+            FixedType,
+            BinaryType,
+        ),
+    ):
+        return "binary"
+    if isinstance(input_type, DecimalType):
+        return f"decimal({input_type.precision},{input_type.scale})"
+    if isinstance(input_type, StructType):
+        name_to_type = 
",".join(f"{f.name}:{_type_to_glue_type_string(f.field_type)}" for f in 
input_type.fields)
+        return f"struct<{name_to_type}>"
+    if isinstance(input_type, ListType):
+        return f"array<{_type_to_glue_type_string(input_type.element_type)}>"
+    if isinstance(input_type, MapType):
+        return 
f"map<{_type_to_glue_type_string(input_type.key_type)},{_type_to_glue_type_string(input_type.value_type)}>"
+
+    raise ValueError(f"Unknown Type {input_type}")
+
+
+def _to_columns(metadata: TableMetadataCommonFields) -> List[ColumnTypeDef]:
+    results: Dict[str, ColumnTypeDef] = {}
+
+    def _append_to_results(field: NestedField, is_current: bool) -> None:
+        if field.name in results:
+            return
+
+        results[field.name] = cast(
+            ColumnTypeDef,
+            {
+                "Name": field.name,
+                "Type": _type_to_glue_type_string(field.field_type),
+                "Parameters": {
+                    ICEBERG_FIELD_ID: str(field.field_id),
+                    ICEBERG_FIELD_OPTIONAL: str(field.optional).lower(),
+                    ICEBERG_FIELD_CURRENT: str(is_current).lower(),
+                },
+            },
+        )
+        if field.doc:
+            results[field.name]["Comment"] = field.doc
+
+    if current_schema := metadata.schema_by_id(metadata.current_schema_id):
+        for field in current_schema.columns:
+            _append_to_results(field, True)
+
+    for schema in metadata.schemas:
+        if schema.schema_id == metadata.current_schema_id:
+            continue
+        for field in schema.columns:
+            _append_to_results(field, False)
+
+    return list(results.values())
+
+
 def _construct_table_input(
     table_name: str,
     metadata_location: str,
     properties: Properties,
+    metadata: TableMetadataCommonFields,

Review Comment:
   ```suggestion
       metadata: TableMetadata,
   ```



##########
pyiceberg/catalog/glue.py:
##########
@@ -84,19 +110,105 @@ def _construct_parameters(
     return new_parameters
 
 
+def _type_to_glue_type_string(input_type: IcebergType) -> str:
+    if isinstance(input_type, BooleanType):
+        return "boolean"
+    if isinstance(input_type, IntegerType):
+        return "int"
+    if isinstance(input_type, LongType):
+        return "bigint"
+    if isinstance(input_type, FloatType):
+        return "float"
+    if isinstance(input_type, DoubleType):
+        return "double"
+    if isinstance(input_type, DateType):
+        return "date"
+    if isinstance(
+        input_type,
+        (
+            TimeType,
+            StringType,
+            UUIDType,
+        ),
+    ):
+        return "string"
+    if isinstance(input_type, TimestampType):
+        return "timestamp"
+    if isinstance(
+        input_type,
+        (
+            FixedType,
+            BinaryType,
+        ),
+    ):
+        return "binary"
+    if isinstance(input_type, DecimalType):
+        return f"decimal({input_type.precision},{input_type.scale})"
+    if isinstance(input_type, StructType):
+        name_to_type = 
",".join(f"{f.name}:{_type_to_glue_type_string(f.field_type)}" for f in 
input_type.fields)
+        return f"struct<{name_to_type}>"
+    if isinstance(input_type, ListType):
+        return f"array<{_type_to_glue_type_string(input_type.element_type)}>"
+    if isinstance(input_type, MapType):
+        return 
f"map<{_type_to_glue_type_string(input_type.key_type)},{_type_to_glue_type_string(input_type.value_type)}>"
+
+    raise ValueError(f"Unknown Type {input_type}")
+
+
+def _to_columns(metadata: TableMetadataCommonFields) -> List[ColumnTypeDef]:

Review Comment:
   ```suggestion
   def _to_columns(metadata: TableMetadata) -> List[ColumnTypeDef]:
   ```



##########
pyiceberg/catalog/glue.py:
##########
@@ -84,19 +110,105 @@ def _construct_parameters(
     return new_parameters
 
 
+def _type_to_glue_type_string(input_type: IcebergType) -> str:
+    if isinstance(input_type, BooleanType):
+        return "boolean"
+    if isinstance(input_type, IntegerType):
+        return "int"
+    if isinstance(input_type, LongType):
+        return "bigint"
+    if isinstance(input_type, FloatType):
+        return "float"
+    if isinstance(input_type, DoubleType):
+        return "double"
+    if isinstance(input_type, DateType):
+        return "date"
+    if isinstance(
+        input_type,
+        (
+            TimeType,
+            StringType,
+            UUIDType,
+        ),
+    ):
+        return "string"
+    if isinstance(input_type, TimestampType):
+        return "timestamp"
+    if isinstance(
+        input_type,
+        (
+            FixedType,
+            BinaryType,
+        ),
+    ):
+        return "binary"
+    if isinstance(input_type, DecimalType):
+        return f"decimal({input_type.precision},{input_type.scale})"
+    if isinstance(input_type, StructType):
+        name_to_type = 
",".join(f"{f.name}:{_type_to_glue_type_string(f.field_type)}" for f in 
input_type.fields)
+        return f"struct<{name_to_type}>"
+    if isinstance(input_type, ListType):
+        return f"array<{_type_to_glue_type_string(input_type.element_type)}>"
+    if isinstance(input_type, MapType):
+        return 
f"map<{_type_to_glue_type_string(input_type.key_type)},{_type_to_glue_type_string(input_type.value_type)}>"
+
+    raise ValueError(f"Unknown Type {input_type}")
+
+
+def _to_columns(metadata: TableMetadataCommonFields) -> List[ColumnTypeDef]:
+    results: Dict[str, ColumnTypeDef] = {}
+
+    def _append_to_results(field: NestedField, is_current: bool) -> None:
+        if field.name in results:
+            return
+
+        results[field.name] = cast(
+            ColumnTypeDef,
+            {
+                "Name": field.name,
+                "Type": _type_to_glue_type_string(field.field_type),
+                "Parameters": {
+                    ICEBERG_FIELD_ID: str(field.field_id),
+                    ICEBERG_FIELD_OPTIONAL: str(field.optional).lower(),
+                    ICEBERG_FIELD_CURRENT: str(is_current).lower(),
+                },
+            },
+        )
+        if field.doc:
+            results[field.name]["Comment"] = field.doc
+
+    if current_schema := metadata.schema_by_id(metadata.current_schema_id):
+        for field in current_schema.columns:
+            _append_to_results(field, True)
+
+    for schema in metadata.schemas:
+        if schema.schema_id == metadata.current_schema_id:
+            continue
+        for field in schema.columns:
+            _append_to_results(field, False)
+
+    return list(results.values())
+
+
 def _construct_table_input(
     table_name: str,
     metadata_location: str,
     properties: Properties,
+    metadata: TableMetadataCommonFields,
+    location: Optional[str] = None,
     glue_table: Optional[TableTypeDef] = None,
     prev_metadata_location: Optional[str] = None,
 ) -> TableInputTypeDef:
     table_input: TableInputTypeDef = {
         "Name": table_name,
         "TableType": EXTERNAL_TABLE,
         "Parameters": _construct_parameters(metadata_location, glue_table, 
prev_metadata_location),
+        "StorageDescriptor": {"Columns": _to_columns(metadata)},
     }
 
+    if location:
+        table_input["StorageDescriptor"]["Location"] = location

Review Comment:
   The `location` is a required field in metadata, so we can fetch it from the 
metadata directly:
   ```python
   table_input["StorageDescriptor"]["Location"] = metadata.location
   ```
   and we can safely delete the "location" parameter
   
   



##########
pyiceberg/catalog/glue.py:
##########
@@ -84,19 +110,105 @@ def _construct_parameters(
     return new_parameters
 
 
+def _type_to_glue_type_string(input_type: IcebergType) -> str:

Review Comment:
   I think we might utilize the existing schema visitor for this task:
   
https://github.com/apache/iceberg-python/blob/4616d036440bf4cb3733e8d091220587cf290b75/pyiceberg/schema.py#L345-L348
   
https://github.com/apache/iceberg-python/blob/4616d036440bf4cb3733e8d091220587cf290b75/pyiceberg/schema.py#L784-L797
   
   We could implement a `_TypeToGlueTypeString` class that implements 
`SchemaVisitor[str]`, like this:
   ```python
   class _TypeToGlueTypeString(SchemaVisitor[str]):
       def schema(self, schema: Schema, struct_result: str) -> str:
           return struct_result
   
       def struct(self, struct: StructType, field_results: List[str]) -> str:
           name_to_type = ",".join(field_results)
           return f"struct<{name_to_type}>"
   
       def field(self, field: NestedField, field_result: str) -> str:
           return f"{field.name}:{field_result}"
   
       def list(self, list_type: ListType, element_result: str) -> str:
           return f"array<{element_result}>"
   
       def map(self, map_type: MapType, key_result: str, value_result: str) -> str:
           return f"map<{key_result},{value_result}>"
   
       def primitive(self, primitive: IcebergType) -> str:
           if isinstance(primitive, BooleanType):
               return "boolean"
           elif isinstance(primitive, IntegerType):
               return "int"
           elif isinstance(primitive, LongType):
               return "bigint"
           ...(more types)
   
           raise ValueError(f"Unknown Type {primitive}")
   ```
   
   This can also let us benefit from the `@singledispatch` decorator, which 
makes the visitor faster in practice. Does this idea sound reasonable to you?



##########
tests/catalog/test_glue.py:
##########
@@ -554,6 +580,19 @@ def test_commit_table_update_schema(
     assert new_schema == update._apply()
     assert new_schema.find_field("b").field_type == IntegerType()
 
+    # Ensure schema is also pushed to Glue
+    table_info = _glue.get_table(
+        DatabaseName=database_name,
+        Name=table_name,
+    )
+    columns = table_info["Table"]["StorageDescriptor"]["Columns"]
+    assert len(columns) == len(table_schema_nested.fields) + 1
+    assert columns[-1] == {
+        "Name": "b",
+        "Type": "int",
+        "Parameters": {"iceberg.field.id": "18", "iceberg.field.optional": 
"true", "iceberg.field.current": "true"},
+    }
+

Review Comment:
   Shall we also check the `StorageDescriptor.Location` here?
   ```suggestion
   assert table_info["Table"]["StorageDescriptor"]["Location"] == f"s3://{BUCKET_NAME}/{database_name}.db/{table_name}"
   ```



##########
pyiceberg/catalog/glue.py:
##########
@@ -84,19 +110,105 @@ def _construct_parameters(
     return new_parameters
 
 
+def _type_to_glue_type_string(input_type: IcebergType) -> str:
+    if isinstance(input_type, BooleanType):
+        return "boolean"
+    if isinstance(input_type, IntegerType):
+        return "int"
+    if isinstance(input_type, LongType):
+        return "bigint"
+    if isinstance(input_type, FloatType):
+        return "float"
+    if isinstance(input_type, DoubleType):
+        return "double"
+    if isinstance(input_type, DateType):
+        return "date"
+    if isinstance(
+        input_type,
+        (
+            TimeType,
+            StringType,
+            UUIDType,
+        ),
+    ):
+        return "string"
+    if isinstance(input_type, TimestampType):
+        return "timestamp"
+    if isinstance(
+        input_type,
+        (
+            FixedType,
+            BinaryType,
+        ),
+    ):
+        return "binary"
+    if isinstance(input_type, DecimalType):
+        return f"decimal({input_type.precision},{input_type.scale})"
+    if isinstance(input_type, StructType):
+        name_to_type = 
",".join(f"{f.name}:{_type_to_glue_type_string(f.field_type)}" for f in 
input_type.fields)
+        return f"struct<{name_to_type}>"
+    if isinstance(input_type, ListType):
+        return f"array<{_type_to_glue_type_string(input_type.element_type)}>"
+    if isinstance(input_type, MapType):
+        return 
f"map<{_type_to_glue_type_string(input_type.key_type)},{_type_to_glue_type_string(input_type.value_type)}>"
+
+    raise ValueError(f"Unknown Type {input_type}")
+
+
+def _to_columns(metadata: TableMetadataCommonFields) -> List[ColumnTypeDef]:
+    results: Dict[str, ColumnTypeDef] = {}
+
+    def _append_to_results(field: NestedField, is_current: bool) -> None:
+        if field.name in results:
+            return
+
+        results[field.name] = cast(
+            ColumnTypeDef,
+            {
+                "Name": field.name,
+                "Type": _type_to_glue_type_string(field.field_type),

Review Comment:
   If we decide to use the existing visitor as mentioned above, this will become
   ```python
   "Type": visit(field.field_type, _TypeToGlueTypeString())
   ```



##########
pyiceberg/catalog/glue.py:
##########
@@ -84,19 +110,105 @@ def _construct_parameters(
     return new_parameters
 
 
+def _type_to_glue_type_string(input_type: IcebergType) -> str:
+    if isinstance(input_type, BooleanType):
+        return "boolean"
+    if isinstance(input_type, IntegerType):
+        return "int"
+    if isinstance(input_type, LongType):
+        return "bigint"
+    if isinstance(input_type, FloatType):
+        return "float"
+    if isinstance(input_type, DoubleType):
+        return "double"
+    if isinstance(input_type, DateType):
+        return "date"
+    if isinstance(
+        input_type,
+        (
+            TimeType,
+            StringType,
+            UUIDType,
+        ),
+    ):
+        return "string"
+    if isinstance(input_type, TimestampType):
+        return "timestamp"
+    if isinstance(
+        input_type,
+        (
+            FixedType,
+            BinaryType,
+        ),
+    ):
+        return "binary"
+    if isinstance(input_type, DecimalType):
+        return f"decimal({input_type.precision},{input_type.scale})"
+    if isinstance(input_type, StructType):
+        name_to_type = 
",".join(f"{f.name}:{_type_to_glue_type_string(f.field_type)}" for f in 
input_type.fields)
+        return f"struct<{name_to_type}>"
+    if isinstance(input_type, ListType):
+        return f"array<{_type_to_glue_type_string(input_type.element_type)}>"
+    if isinstance(input_type, MapType):
+        return 
f"map<{_type_to_glue_type_string(input_type.key_type)},{_type_to_glue_type_string(input_type.value_type)}>"
+
+    raise ValueError(f"Unknown Type {input_type}")
+
+
+def _to_columns(metadata: TableMetadataCommonFields) -> List[ColumnTypeDef]:
+    results: Dict[str, ColumnTypeDef] = {}
+
+    def _append_to_results(field: NestedField, is_current: bool) -> None:
+        if field.name in results:
+            return
+
+        results[field.name] = cast(
+            ColumnTypeDef,
+            {
+                "Name": field.name,
+                "Type": _type_to_glue_type_string(field.field_type),
+                "Parameters": {
+                    ICEBERG_FIELD_ID: str(field.field_id),
+                    ICEBERG_FIELD_OPTIONAL: str(field.optional).lower(),
+                    ICEBERG_FIELD_CURRENT: str(is_current).lower(),
+                },
+            },
+        )
+        if field.doc:
+            results[field.name]["Comment"] = field.doc
+
+    if current_schema := metadata.schema_by_id(metadata.current_schema_id):
+        for field in current_schema.columns:
+            _append_to_results(field, True)
+
+    for schema in metadata.schemas:
+        if schema.schema_id == metadata.current_schema_id:
+            continue
+        for field in schema.columns:
+            _append_to_results(field, False)

Review Comment:
   I think these can be combined to:
   ```python
       for schema in metadata.schemas:
           for field in schema.columns:
               _append_to_results(field=field, is_current=schema.schema_id == metadata.current_schema_id)
   ```
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Set Glue Table Information when creating/updating tables [iceberg-python]

Reply via email to