Fokko commented on code in PR #6525:
URL: https://github.com/apache/iceberg/pull/6525#discussion_r1064451131


##########
python/pyiceberg/manifest.py:
##########
@@ -97,48 +178,82 @@ class DataFile(IcebergBaseModel):
     sort_order_id: Optional[int] = Field()
 
 
-class ManifestEntry(IcebergBaseModel):
+MANIFEST_ENTRY_SCHEMA = Schema(
+    NestedField(0, "status", IntegerType(), required=True),
+    NestedField(1, "snapshot_id", LongType(), required=False),
+    NestedField(3, "sequence_number", LongType(), required=False),
+    NestedField(4, "file_sequence_number", LongType(), required=False),
+    NestedField(2, "data_file", DATA_FILE, required=False),
+)
+
+
+class ManifestEntry(PydanticStruct):
     status: ManifestEntryStatus = Field()
     snapshot_id: Optional[int] = Field()
     sequence_number: Optional[int] = Field()
+    file_sequence_number: Optional[int] = Field()
     data_file: DataFile = Field()
 
 
-class PartitionFieldSummary(IcebergBaseModel):
+PARTITION_FIELD_SUMMARY_TYPE = StructType(
+    NestedField(509, "contains_null", BooleanType(), required=True),
+    NestedField(518, "contains_nan", BooleanType(), required=False),
+    NestedField(510, "lower_bound", BinaryType(), required=False),
+    NestedField(511, "upper_bound", BinaryType(), required=False),
+)
+
+
+class PartitionFieldSummary(PydanticStruct):
     contains_null: bool = Field()
     contains_nan: Optional[bool] = Field()
     lower_bound: Optional[bytes] = Field()
     upper_bound: Optional[bytes] = Field()
 
 
-class ManifestFile(IcebergBaseModel):
+MANIFEST_FILE_SCHEMA: Schema = Schema(
+    NestedField(500, "manifest_path", StringType(), required=True, 
doc="Location URI with FS scheme"),
+    NestedField(501, "manifest_length", LongType(), required=True),
+    NestedField(502, "partition_spec_id", IntegerType(), required=True),
+    NestedField(517, "content", IntegerType(), required=False),
+    NestedField(515, "sequence_number", LongType(), required=False),
+    NestedField(516, "min_sequence_number", LongType(), required=False),
+    NestedField(503, "added_snapshot_id", LongType(), required=False),
+    NestedField(504, "added_files_count", IntegerType(), required=False),
+    NestedField(505, "existing_files_count", IntegerType(), required=False),
+    NestedField(506, "deleted_files_count", IntegerType(), required=False),
+    NestedField(512, "added_rows_count", LongType(), required=False),
+    NestedField(513, "existing_rows_count", LongType(), required=False),
+    NestedField(514, "deleted_rows_count", LongType(), required=False),
+    NestedField(507, "partitions", ListType(508, PARTITION_FIELD_SUMMARY_TYPE, 
element_required=True), required=False),
+    NestedField(519, "key_metadata", BinaryType(), required=False),
+)
+
+
+class ManifestFile(PydanticStruct):
     manifest_path: str = Field()
     manifest_length: int = Field()
     partition_spec_id: int = Field()
     content: ManifestContent = Field(default=ManifestContent.DATA)
-    sequence_number: int = Field(default=0)
-    min_sequence_number: int = Field(default=0)
+    sequence_number: Optional[int] = Field()
+    min_sequence_number: Optional[int] = Field()
     added_snapshot_id: Optional[int] = Field()
-    added_data_files_count: Optional[int] = Field()
-    existing_data_files_count: Optional[int] = Field()
-    deleted_data_files_count: Optional[int] = Field()
+    added_files_count: Optional[int] = Field()
+    existing_files_count: Optional[int] = Field()
+    deleted_files_count: Optional[int] = Field()
     added_rows_count: Optional[int] = Field()
-    existing_rows_counts: Optional[int] = Field()
+    existing_rows_count: Optional[int] = Field()
     deleted_rows_count: Optional[int] = Field()
-    partitions: Optional[List[PartitionFieldSummary]] = Field()
+    partitions: List[PartitionFieldSummary] = Field(default_factory=list)
     key_metadata: Optional[bytes] = Field()
 
     def fetch_manifest_entry(self, io: FileIO) -> List[ManifestEntry]:
         file = io.new_input(self.manifest_path)
         return list(read_manifest_entry(file))
 
 
-def read_manifest_entry(input_file: InputFile) -> Iterator[ManifestEntry]:
-    with AvroFile(input_file) as reader:
-        schema = reader.schema
-        for record in reader:
-            dict_repr = _convert_pos_to_dict(schema, record)
-            yield ManifestEntry(**dict_repr)
+def read_manifest_entry(input_file: InputFile) -> List[ManifestEntry]:
+    with AvroFile[ManifestEntry](input_file, MANIFEST_ENTRY_SCHEMA, {-1: 
ManifestEntry, 2: DataFile, 102: Record}) as reader:
+        return list(reader)

Review Comment:
   This is easier for debugging, because if it is a generator, then you have to 
materialize the generator first to inspect it. Reverted it back to an iterator.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to