HonahX commented on code in PR #288:
URL: https://github.com/apache/iceberg-python/pull/288#discussion_r1460723261
##########
pyiceberg/catalog/glue.py:
##########
@@ -84,19 +110,105 @@ def _construct_parameters(
return new_parameters
+def _type_to_glue_type_string(input_type: IcebergType) -> str:
+ if isinstance(input_type, BooleanType):
+ return "boolean"
+ if isinstance(input_type, IntegerType):
+ return "int"
+ if isinstance(input_type, LongType):
+ return "bigint"
+ if isinstance(input_type, FloatType):
+ return "float"
+ if isinstance(input_type, DoubleType):
+ return "double"
+ if isinstance(input_type, DateType):
+ return "date"
+ if isinstance(
+ input_type,
+ (
+ TimeType,
+ StringType,
+ UUIDType,
+ ),
+ ):
+ return "string"
+ if isinstance(input_type, TimestampType):
+ return "timestamp"
+ if isinstance(
+ input_type,
+ (
+ FixedType,
+ BinaryType,
+ ),
+ ):
+ return "binary"
+ if isinstance(input_type, DecimalType):
+ return f"decimal({input_type.precision},{input_type.scale})"
+ if isinstance(input_type, StructType):
+ name_to_type =
",".join(f"{f.name}:{_type_to_glue_type_string(f.field_type)}" for f in
input_type.fields)
+ return f"struct<{name_to_type}>"
+ if isinstance(input_type, ListType):
+ return f"array<{_type_to_glue_type_string(input_type.element_type)}>"
+ if isinstance(input_type, MapType):
+ return
f"map<{_type_to_glue_type_string(input_type.key_type)},{_type_to_glue_type_string(input_type.value_type)}>"
+
+ raise ValueError(f"Unknown Type {input_type}")
+
+
+def _to_columns(metadata: TableMetadataCommonFields) -> List[ColumnTypeDef]:
+ results: Dict[str, ColumnTypeDef] = {}
+
+ def _append_to_results(field: NestedField, is_current: bool) -> None:
+ if field.name in results:
+ return
+
+ results[field.name] = cast(
+ ColumnTypeDef,
+ {
+ "Name": field.name,
+ "Type": _type_to_glue_type_string(field.field_type),
+ "Parameters": {
+ ICEBERG_FIELD_ID: str(field.field_id),
+ ICEBERG_FIELD_OPTIONAL: str(field.optional).lower(),
+ ICEBERG_FIELD_CURRENT: str(is_current).lower(),
+ },
+ },
+ )
+ if field.doc:
+ results[field.name]["Comment"] = field.doc
+
+ if current_schema := metadata.schema_by_id(metadata.current_schema_id):
+ for field in current_schema.columns:
+ _append_to_results(field, True)
+
+ for schema in metadata.schemas:
+ if schema.schema_id == metadata.current_schema_id:
+ continue
+ for field in schema.columns:
+ _append_to_results(field, False)
+
+ return list(results.values())
+
+
def _construct_table_input(
table_name: str,
metadata_location: str,
properties: Properties,
+ metadata: TableMetadataCommonFields,
Review Comment:
```suggestion
metadata: TableMetadata,
```
##########
pyiceberg/catalog/glue.py:
##########
@@ -84,19 +110,105 @@ def _construct_parameters(
return new_parameters
+def _type_to_glue_type_string(input_type: IcebergType) -> str:
+ if isinstance(input_type, BooleanType):
+ return "boolean"
+ if isinstance(input_type, IntegerType):
+ return "int"
+ if isinstance(input_type, LongType):
+ return "bigint"
+ if isinstance(input_type, FloatType):
+ return "float"
+ if isinstance(input_type, DoubleType):
+ return "double"
+ if isinstance(input_type, DateType):
+ return "date"
+ if isinstance(
+ input_type,
+ (
+ TimeType,
+ StringType,
+ UUIDType,
+ ),
+ ):
+ return "string"
+ if isinstance(input_type, TimestampType):
+ return "timestamp"
+ if isinstance(
+ input_type,
+ (
+ FixedType,
+ BinaryType,
+ ),
+ ):
+ return "binary"
+ if isinstance(input_type, DecimalType):
+ return f"decimal({input_type.precision},{input_type.scale})"
+ if isinstance(input_type, StructType):
+ name_to_type =
",".join(f"{f.name}:{_type_to_glue_type_string(f.field_type)}" for f in
input_type.fields)
+ return f"struct<{name_to_type}>"
+ if isinstance(input_type, ListType):
+ return f"array<{_type_to_glue_type_string(input_type.element_type)}>"
+ if isinstance(input_type, MapType):
+ return
f"map<{_type_to_glue_type_string(input_type.key_type)},{_type_to_glue_type_string(input_type.value_type)}>"
+
+ raise ValueError(f"Unknown Type {input_type}")
+
+
+def _to_columns(metadata: TableMetadataCommonFields) -> List[ColumnTypeDef]:
Review Comment:
```suggestion
def _to_columns(metadata: TableMetadata) -> List[ColumnTypeDef]:
```
##########
pyiceberg/catalog/glue.py:
##########
@@ -84,19 +110,105 @@ def _construct_parameters(
return new_parameters
+def _type_to_glue_type_string(input_type: IcebergType) -> str:
+ if isinstance(input_type, BooleanType):
+ return "boolean"
+ if isinstance(input_type, IntegerType):
+ return "int"
+ if isinstance(input_type, LongType):
+ return "bigint"
+ if isinstance(input_type, FloatType):
+ return "float"
+ if isinstance(input_type, DoubleType):
+ return "double"
+ if isinstance(input_type, DateType):
+ return "date"
+ if isinstance(
+ input_type,
+ (
+ TimeType,
+ StringType,
+ UUIDType,
+ ),
+ ):
+ return "string"
+ if isinstance(input_type, TimestampType):
+ return "timestamp"
+ if isinstance(
+ input_type,
+ (
+ FixedType,
+ BinaryType,
+ ),
+ ):
+ return "binary"
+ if isinstance(input_type, DecimalType):
+ return f"decimal({input_type.precision},{input_type.scale})"
+ if isinstance(input_type, StructType):
+ name_to_type =
",".join(f"{f.name}:{_type_to_glue_type_string(f.field_type)}" for f in
input_type.fields)
+ return f"struct<{name_to_type}>"
+ if isinstance(input_type, ListType):
+ return f"array<{_type_to_glue_type_string(input_type.element_type)}>"
+ if isinstance(input_type, MapType):
+ return
f"map<{_type_to_glue_type_string(input_type.key_type)},{_type_to_glue_type_string(input_type.value_type)}>"
+
+ raise ValueError(f"Unknown Type {input_type}")
+
+
+def _to_columns(metadata: TableMetadataCommonFields) -> List[ColumnTypeDef]:
+ results: Dict[str, ColumnTypeDef] = {}
+
+ def _append_to_results(field: NestedField, is_current: bool) -> None:
+ if field.name in results:
+ return
+
+ results[field.name] = cast(
+ ColumnTypeDef,
+ {
+ "Name": field.name,
+ "Type": _type_to_glue_type_string(field.field_type),
+ "Parameters": {
+ ICEBERG_FIELD_ID: str(field.field_id),
+ ICEBERG_FIELD_OPTIONAL: str(field.optional).lower(),
+ ICEBERG_FIELD_CURRENT: str(is_current).lower(),
+ },
+ },
+ )
+ if field.doc:
+ results[field.name]["Comment"] = field.doc
+
+ if current_schema := metadata.schema_by_id(metadata.current_schema_id):
+ for field in current_schema.columns:
+ _append_to_results(field, True)
+
+ for schema in metadata.schemas:
+ if schema.schema_id == metadata.current_schema_id:
+ continue
+ for field in schema.columns:
+ _append_to_results(field, False)
+
+ return list(results.values())
+
+
def _construct_table_input(
table_name: str,
metadata_location: str,
properties: Properties,
+ metadata: TableMetadataCommonFields,
+ location: Optional[str] = None,
glue_table: Optional[TableTypeDef] = None,
prev_metadata_location: Optional[str] = None,
) -> TableInputTypeDef:
table_input: TableInputTypeDef = {
"Name": table_name,
"TableType": EXTERNAL_TABLE,
"Parameters": _construct_parameters(metadata_location, glue_table,
prev_metadata_location),
+ "StorageDescriptor": {"Columns": _to_columns(metadata)},
}
+ if location:
+ table_input["StorageDescriptor"]["Location"] = location
Review Comment:
`location` is a required field in the table metadata, so we can read it
directly from the metadata:
```python
table_input["StorageDescriptor"]["Location"] = metadata.location
```
With that, we can safely delete the `location` parameter.
##########
pyiceberg/catalog/glue.py:
##########
@@ -84,19 +110,105 @@ def _construct_parameters(
return new_parameters
+def _type_to_glue_type_string(input_type: IcebergType) -> str:
Review Comment:
I think we might utilize the existing schema visitor for this task:
https://github.com/apache/iceberg-python/blob/4616d036440bf4cb3733e8d091220587cf290b75/pyiceberg/schema.py#L345-L348
https://github.com/apache/iceberg-python/blob/4616d036440bf4cb3733e8d091220587cf290b75/pyiceberg/schema.py#L784-L797
We could add a `_TypeToGlueTypeString` class that implements
`SchemaVisitor[str]`, like this:
```python
class _TypeToGlueTypeString(SchemaVisitor[str]):
    def schema(self, schema: Schema, struct_result: str) -> str:
        return struct_result
    def struct(self, struct: StructType, field_results: List[str]) -> str:
        name_to_type = ",".join(field_results)
        return f"struct<{name_to_type}>"
    def field(self, field: NestedField, field_result: str) -> str:
        return f"{field.name}:{field_result}"
    def list(self, list_type: ListType, element_result: str) -> str:
        return f"array<{element_result}>"
    def map(self, map_type: MapType, key_result: str, value_result: str) -> str:
        return f"map<{key_result},{value_result}>"
    def primitive(self, primitive: IcebergType) -> str:
        if isinstance(primitive, BooleanType):
            return "boolean"
        elif isinstance(primitive, IntegerType):
            return "int"
        elif isinstance(primitive, LongType):
            return "bigint"
        ...(more types)
        raise ValueError(f"Unknown Type {primitive}")
```
This also lets us benefit from the `@singledispatch` decorator, which
makes the visitor faster in practice. Does this idea sound reasonable to you?
##########
tests/catalog/test_glue.py:
##########
@@ -554,6 +580,19 @@ def test_commit_table_update_schema(
assert new_schema == update._apply()
assert new_schema.find_field("b").field_type == IntegerType()
+ # Ensure schema is also pushed to Glue
+ table_info = _glue.get_table(
+ DatabaseName=database_name,
+ Name=table_name,
+ )
+ columns = table_info["Table"]["StorageDescriptor"]["Columns"]
+ assert len(columns) == len(table_schema_nested.fields) + 1
+ assert columns[-1] == {
+ "Name": "b",
+ "Type": "int",
+ "Parameters": {"iceberg.field.id": "18", "iceberg.field.optional":
"true", "iceberg.field.current": "true"},
+ }
+
Review Comment:
Shall we also check the `StorageDescriptor` location here?
```suggestion
assert table_info["Table"]["StorageDescriptor"]["Location"] == f"s3://{BUCKET_NAME}/{database_name}.db/{table_name}"
```
##########
pyiceberg/catalog/glue.py:
##########
@@ -84,19 +110,105 @@ def _construct_parameters(
return new_parameters
+def _type_to_glue_type_string(input_type: IcebergType) -> str:
+ if isinstance(input_type, BooleanType):
+ return "boolean"
+ if isinstance(input_type, IntegerType):
+ return "int"
+ if isinstance(input_type, LongType):
+ return "bigint"
+ if isinstance(input_type, FloatType):
+ return "float"
+ if isinstance(input_type, DoubleType):
+ return "double"
+ if isinstance(input_type, DateType):
+ return "date"
+ if isinstance(
+ input_type,
+ (
+ TimeType,
+ StringType,
+ UUIDType,
+ ),
+ ):
+ return "string"
+ if isinstance(input_type, TimestampType):
+ return "timestamp"
+ if isinstance(
+ input_type,
+ (
+ FixedType,
+ BinaryType,
+ ),
+ ):
+ return "binary"
+ if isinstance(input_type, DecimalType):
+ return f"decimal({input_type.precision},{input_type.scale})"
+ if isinstance(input_type, StructType):
+ name_to_type =
",".join(f"{f.name}:{_type_to_glue_type_string(f.field_type)}" for f in
input_type.fields)
+ return f"struct<{name_to_type}>"
+ if isinstance(input_type, ListType):
+ return f"array<{_type_to_glue_type_string(input_type.element_type)}>"
+ if isinstance(input_type, MapType):
+ return
f"map<{_type_to_glue_type_string(input_type.key_type)},{_type_to_glue_type_string(input_type.value_type)}>"
+
+ raise ValueError(f"Unknown Type {input_type}")
+
+
+def _to_columns(metadata: TableMetadataCommonFields) -> List[ColumnTypeDef]:
+ results: Dict[str, ColumnTypeDef] = {}
+
+ def _append_to_results(field: NestedField, is_current: bool) -> None:
+ if field.name in results:
+ return
+
+ results[field.name] = cast(
+ ColumnTypeDef,
+ {
+ "Name": field.name,
+ "Type": _type_to_glue_type_string(field.field_type),
Review Comment:
If we decide to use the existing visitor as mentioned above, this will become:
```python
"Type": visit(field.field_type, _TypeToGlueTypeString())
```
##########
pyiceberg/catalog/glue.py:
##########
@@ -84,19 +110,105 @@ def _construct_parameters(
return new_parameters
+def _type_to_glue_type_string(input_type: IcebergType) -> str:
+ if isinstance(input_type, BooleanType):
+ return "boolean"
+ if isinstance(input_type, IntegerType):
+ return "int"
+ if isinstance(input_type, LongType):
+ return "bigint"
+ if isinstance(input_type, FloatType):
+ return "float"
+ if isinstance(input_type, DoubleType):
+ return "double"
+ if isinstance(input_type, DateType):
+ return "date"
+ if isinstance(
+ input_type,
+ (
+ TimeType,
+ StringType,
+ UUIDType,
+ ),
+ ):
+ return "string"
+ if isinstance(input_type, TimestampType):
+ return "timestamp"
+ if isinstance(
+ input_type,
+ (
+ FixedType,
+ BinaryType,
+ ),
+ ):
+ return "binary"
+ if isinstance(input_type, DecimalType):
+ return f"decimal({input_type.precision},{input_type.scale})"
+ if isinstance(input_type, StructType):
+ name_to_type =
",".join(f"{f.name}:{_type_to_glue_type_string(f.field_type)}" for f in
input_type.fields)
+ return f"struct<{name_to_type}>"
+ if isinstance(input_type, ListType):
+ return f"array<{_type_to_glue_type_string(input_type.element_type)}>"
+ if isinstance(input_type, MapType):
+ return
f"map<{_type_to_glue_type_string(input_type.key_type)},{_type_to_glue_type_string(input_type.value_type)}>"
+
+ raise ValueError(f"Unknown Type {input_type}")
+
+
+def _to_columns(metadata: TableMetadataCommonFields) -> List[ColumnTypeDef]:
+ results: Dict[str, ColumnTypeDef] = {}
+
+ def _append_to_results(field: NestedField, is_current: bool) -> None:
+ if field.name in results:
+ return
+
+ results[field.name] = cast(
+ ColumnTypeDef,
+ {
+ "Name": field.name,
+ "Type": _type_to_glue_type_string(field.field_type),
+ "Parameters": {
+ ICEBERG_FIELD_ID: str(field.field_id),
+ ICEBERG_FIELD_OPTIONAL: str(field.optional).lower(),
+ ICEBERG_FIELD_CURRENT: str(is_current).lower(),
+ },
+ },
+ )
+ if field.doc:
+ results[field.name]["Comment"] = field.doc
+
+ if current_schema := metadata.schema_by_id(metadata.current_schema_id):
+ for field in current_schema.columns:
+ _append_to_results(field, True)
+
+ for schema in metadata.schemas:
+ if schema.schema_id == metadata.current_schema_id:
+ continue
+ for field in schema.columns:
+ _append_to_results(field, False)
Review Comment:
I think these can be combined into:
```python
for schema in metadata.schemas:
    for field in schema.columns:
        _append_to_results(field=field, is_current=schema.schema_id == metadata.current_schema_id)
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]