Yicong-Huang commented on code in PR #4100:
URL: https://github.com/apache/texera/pull/4100#discussion_r2579621464
##########
amber/src/main/python/core/models/schema/schema.py:
##########
@@ -81,26 +81,47 @@ def _from_raw_schema(self, raw_schema: Mapping[str, str])
-> None:
def _from_arrow_schema(self, arrow_schema: pa.Schema) -> None:
"""
Resets the Schema by converting a pyarrow.Schema.
+ Checks field metadata to detect BIG_OBJECT types.
:param arrow_schema: a pyarrow.Schema.
:return:
"""
self._name_type_mapping = OrderedDict()
for attr_name in arrow_schema.names:
- arrow_type = arrow_schema.field(attr_name).type # type: ignore
- attr_type = FROM_ARROW_MAPPING[arrow_type.id]
+ field = arrow_schema.field(attr_name)
+
+ # Check metadata for BIG_OBJECT type
+ # (can be stored by either Scala ArrowUtils or Python)
+ is_big_object = (
+ field.metadata and field.metadata.get(b"texera_type") ==
b"BIG_OBJECT"
+ )
+
+ attr_type = (
+ AttributeType.BIG_OBJECT
+ if is_big_object
+ else FROM_ARROW_MAPPING[field.type.id]
+ )
+
self.add(attr_name, attr_type)
def as_arrow_schema(self) -> pa.Schema:
"""
Creates a new pyarrow.Schema according to the current Schema.
+ Includes metadata for BIG_OBJECT types to preserve type information.
:return: pyarrow.Schema
"""
- return pa.schema(
- [
- pa.field(attr_name, TO_ARROW_MAPPING[attr_type])
- for attr_name, attr_type in self._name_type_mapping.items()
- ]
- )
+ fields = [
+ pa.field(
+ attr_name,
+ TO_ARROW_MAPPING[attr_type],
+ metadata=(
+ {b"texera_type": b"BIG_OBJECT"}
+ if attr_type == AttributeType.BIG_OBJECT
+ else None
+ ),
+ )
+ for attr_name, attr_type in self._name_type_mapping.items()
+ ]
+ return pa.schema(fields)
Review Comment:
On second thought: if we are only using this field to store a
pointer/metadata, then `large_binary` may not be the ideal underlying data
type, since it is designed for data > 2GB. So I take back this
suggestion.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]