Re: [PR] Write support [iceberg-python]

via GitHub Sun, 17 Dec 2023 11:06:00 -0800


rdblue commented on code in PR #41:
URL: https://github.com/apache/iceberg-python/pull/41#discussion_r1429244651



##########
pyiceberg/table/__init__.py:
##########
@@ -1904,3 +2001,144 @@ def _generate_snapshot_id() -> int:
     snapshot_id = snapshot_id if snapshot_id >= 0 else snapshot_id * -1
 
     return snapshot_id
+
+
+@dataclass(frozen=True)
+class WriteTask:
+    df: pa.Table
+    # Later to be extended with partition information
+
+
+def _generate_datafile_filename(extension: str) -> str:
+    # Mimics the behavior in the Java API:
+    # https://github.com/apache/iceberg/blob/a582968975dd30ff4917fbbe999f1be903efac02/core/src/main/java/org/apache/iceberg/io/OutputFileFactory.java#L92-L101
+    return f"00000-0-{uuid.uuid4()}-0.{extension}"
+
+
+def _new_manifest_path(location: str, num: int = 0) -> str:
+    return f'{location}/metadata/{uuid.uuid4()}-m{num}.avro'
+
+
+def _generate_manifest_list_filename(snapshot_id: int, attempt: int = 0) -> str:
+    # Mimics the behavior in Java:
+    # https://github.com/apache/iceberg/blob/c862b9177af8e2d83122220764a056f3b96fd00c/core/src/main/java/org/apache/iceberg/SnapshotProducer.java#L491
+    return f"snap-{snapshot_id}-{attempt}-{uuid.uuid4()}.avro"
+
+
+def _dataframe_to_data_files(table: Table, snapshot_id: int, df: pa.Table) -> Iterable[DataFile]:
+    from pyiceberg.io.pyarrow import write_file
+
+    # This is an iter, so we don't have to materialize everything every time
+    # This will be more relevant when we start doing partitioned writes
+    yield from write_file(table, iter([WriteTask(df)]))
+
+
+def _manifests_to_manifest_list(
+    table: Table, snapshot_id: int, parent_snapshot_id: Optional[int], manifests: List[ManifestFile], summary: Dict[str, str]
+) -> Snapshot:
+    manifest_list_file_path = f'{table.location()}/metadata/{_generate_manifest_list_filename(snapshot_id=snapshot_id)}'
+    with write_manifest_list(
+        format_version=table.metadata.format_version,
+        output_file=table.io.new_output(manifest_list_file_path),
+        snapshot_id=snapshot_id,
+        parent_snapshot_id=parent_snapshot_id,
+        sequence_number=None,
+    ) as writer:
+        writer.add_manifests(manifests)
+
+    return Snapshot(
+        snapshot_id=snapshot_id,
+        parent_snapshot_id=parent_snapshot_id,
+        manifest_list=manifest_list_file_path,
+        sequence_number=table._next_sequence_number(),
+        summary=summary,
+        schema_id=table.schema().schema_id,
+    )
+
+
+class _AppendManifest(ABC):
+    @abstractmethod
+    def append_manifest(self, manifest_file: ManifestFile) -> _AppendManifest:
+        pass
+
+    @abstractmethod
+    def manifests(self) -> List[ManifestFile]:
+        pass
+
+
+class _MergeAppend:
+    _table: Table
+    _snapshot_id: int
+    _added_manifests: List[ManifestFile]
+    _added_datafiles: List[DataFile]
+
+    def __init__(self, table: Table, snapshot_id: int) -> None:
+        self._table = table
+        self._snapshot_id = snapshot_id
+        self._added_manifests = []
+        self._added_datafiles = []
+
+    def append_datafile(self, data_file: DataFile) -> _MergeAppend:
+        self._added_datafiles.append(data_file)
+        return self
+
+    def append_manifest(self, manifest_file: ManifestFile) -> _MergeAppend:

Review Comment:
   Do we need to append manifests? This seems like something we won't use right 
away, which means we should remove it so that we can add it with thorough 
testing when someone needs it.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Re: [PR] Write support [iceberg-python]

Reply via email to