Fokko commented on code in PR #6525:
URL: https://github.com/apache/iceberg/pull/6525#discussion_r1066862697
##########
python/pyiceberg/avro/resolver.py:
##########
@@ -109,38 +109,46 @@ def resolve(
class SchemaResolver(PrimitiveWithPartnerVisitor[IcebergType, Reader]):
- read_types: Optional[Dict[int, Callable[[Schema], StructProtocol]]]
+ read_types: Dict[int, Type[StructProtocol]]
+ context: List[int]
- def __init__(self, read_types: Optional[Dict[int, Callable[[Schema],
StructProtocol]]]):
+ def __init__(self, read_types: Dict[int, Type[StructProtocol]] =
EMPTY_DICT) -> None:
self.read_types = read_types
+ self.context = []
def schema(self, schema: Schema, expected_schema: Optional[IcebergType],
result: Reader) -> Reader:
return result
+ def before_field(self, field: NestedField, field_partner:
Optional[NestedField]) -> None:
+ self.context.append(field.field_id)
+
+ def after_field(self, field: NestedField, field_partner:
Optional[NestedField]) -> None:
+ self.context.pop()
+
def struct(self, struct: StructType, expected_struct:
Optional[IcebergType], field_readers: List[Reader]) -> Reader:
+ # -1 indicates the struct root
+ read_struct_id = self.context[-1] if len(self.context) > 0 else -1
+ struct_callable = self.read_types.get(read_struct_id, Record)
Review Comment:
Got it. Currently, there is no record schema for the typed classes
(`ManifestEntry`, `ManifestFile`, etc). I would be hesitant to add yet another
schema there. Are there any cases where we want to have the Record schema and
the Read schema differ? Ideally, you want to shape the read schema to the data
that you need, so you don't read anything that's not being used.
> It's also better to pass the read schema because we can make generic
records more friendly.
This is something that we do for the `Record` class:
```python
def read(self, decoder: BinaryDecoder) -> Any:
if issubclass(self.create_struct, Record):
struct = self.create_struct(length=len(self.field_readers),
fields=self.fields)
elif issubclass(self.create_struct, PydanticStruct):
struct = self.create_struct.construct()
else:
raise ValueError(f"Expected a subclass of PydanticStruct, got:
{self.create_struct}")
```
And then we have the attributes available:
```python
def test_named_record() -> None:
r = Record(fields=(NestedField(0, "id", IntegerType()), NestedField(1,
"name", StringType())))
assert r.id is None
assert r.name is None
r[0] = 123
r[1] = "abc"
assert r[0] == 123
assert r[1] == "abc"
assert r.id == 123
assert r.name == "abc"
```
> It looks like we can use inspect.getfullargspec(class.__init__) to inspect
the init arguments. So we could do something like this:
I'm not a big fan of doing inspections, similar to Java, it is quite
expensive. And it looks like there is a [performance
issue](https://bugs.python.org/issue37010) in `getfullargspec`. What do you
think of passing in the fields as we have currently?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]