HonahX commented on code in PR #363:
URL: https://github.com/apache/iceberg-python/pull/363#discussion_r1671812813
##########
tests/integration/test_writes/test_writes.py:
##########
@@ -1031,3 +1031,174 @@ def test_write_all_timestamp_precision(mocker:
MockerFixture, session_catalog: C
])
assert written_arrow_table.schema == expected_schema_in_all_us
assert written_arrow_table ==
input_arrow_table.cast(expected_schema_in_all_us)
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_merge_manifests(session_catalog: Catalog, arrow_table_with_null:
pa.Table, format_version: int) -> None:
+ tbl_a = _create_table(
+ session_catalog,
+ "default.merge_manifest_a",
+ {"commit.manifest-merge.enabled": "true",
"commit.manifest.min-count-to-merge": "1", "format-version": format_version},
+ [],
+ )
+ tbl_b = _create_table(
+ session_catalog,
+ "default.merge_manifest_b",
+ {
+ "commit.manifest-merge.enabled": "true",
+ "commit.manifest.min-count-to-merge": "1",
+ "commit.manifest.target-size-bytes": "1",
+ "format-version": format_version,
+ },
+ [],
+ )
+ tbl_c = _create_table(
+ session_catalog,
+ "default.merge_manifest_c",
+ {"commit.manifest.min-count-to-merge": "1", "format-version":
format_version},
+ [],
+ )
+
+ # tbl_a should merge all manifests into 1
+ tbl_a.append(arrow_table_with_null)
+ tbl_a.append(arrow_table_with_null)
+ tbl_a.append(arrow_table_with_null)
+
+ # tbl_b should not merge any manifests because the target size is too small
+ tbl_b.append(arrow_table_with_null)
+ tbl_b.append(arrow_table_with_null)
+ tbl_b.append(arrow_table_with_null)
+
+ # tbl_c should not merge any manifests because merging is disabled
+ tbl_c.append(arrow_table_with_null)
+ tbl_c.append(arrow_table_with_null)
+ tbl_c.append(arrow_table_with_null)
+
+ assert len(tbl_a.current_snapshot().manifests(tbl_a.io)) == 1 # type:
ignore
+ assert len(tbl_b.current_snapshot().manifests(tbl_b.io)) == 3 # type:
ignore
+ assert len(tbl_c.current_snapshot().manifests(tbl_c.io)) == 3 # type:
ignore
+
+ # tbl_a and tbl_c should contain the same data
+ assert tbl_a.scan().to_arrow().equals(tbl_c.scan().to_arrow())
+ # tbl_b and tbl_c should contain the same data
+ assert tbl_b.scan().to_arrow().equals(tbl_c.scan().to_arrow())
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_merge_manifests_file_content(session_catalog: Catalog,
arrow_table_with_null: pa.Table, format_version: int) -> None:
+ tbl_a = _create_table(
+ session_catalog,
+ "default.merge_manifest_a",
+ {"commit.manifest-merge.enabled": "true",
"commit.manifest.min-count-to-merge": "1", "format-version": format_version},
+ [],
+ )
+
+ # tbl_a should merge all manifests into 1
+ tbl_a.append(arrow_table_with_null)
+
+ tbl_a_first_entries = tbl_a.inspect.entries().to_pydict()
+ first_snapshot_id = tbl_a_first_entries["snapshot_id"][0]
+ first_data_file_path = tbl_a_first_entries["data_file"][0]["file_path"]
+
+ tbl_a.append(arrow_table_with_null)
+ tbl_a.append(arrow_table_with_null)
+
+ assert len(tbl_a.current_snapshot().manifests(tbl_a.io)) == 1 # type:
ignore
+
+ # verify the sequence number of tbl_a's only manifest file
+ tbl_a_manifest = tbl_a.current_snapshot().manifests(tbl_a.io)[0] # type:
ignore
+ assert tbl_a_manifest.sequence_number == (3 if format_version == 2 else 0)
+ assert tbl_a_manifest.min_sequence_number == (1 if format_version == 2
else 0)
+
+ # verify the manifest entries of tbl_a, in which the manifests are merged
+ tbl_a_entries = tbl_a.inspect.entries().to_pydict()
+ assert tbl_a_entries["status"] == [1, 0, 0]
+ assert tbl_a_entries["sequence_number"] == [3, 2, 1] if format_version ==
2 else [0, 0, 0]
+ assert tbl_a_entries["file_sequence_number"] == [3, 2, 1] if
format_version == 2 else [0, 0, 0]
+ for i in range(3):
+ tbl_a_data_file = tbl_a_entries["data_file"][i]
+ assert tbl_a_data_file["column_sizes"] == [
+ (1, 49),
+ (2, 78),
+ (3, 128),
+ (4, 94),
+ (5, 118),
+ (6, 94),
+ (7, 118),
+ (8, 118),
+ (9, 118),
+ (10, 94),
+ (11, 78),
+ (12, 109),
+ ]
+ assert tbl_a_data_file["content"] == 0
+ assert tbl_a_data_file["equality_ids"] is None
+ assert tbl_a_data_file["file_format"] == "PARQUET"
+ assert
tbl_a_data_file["file_path"].startswith("s3://warehouse/default/merge_manifest_a/data/")
+ if tbl_a_data_file["file_path"] == first_data_file_path:
+ # verify that the snapshot id recorded should be the one where the
file was added
+ assert tbl_a_entries["snapshot_id"][i] == first_snapshot_id
Review Comment:
Added a test to verify the snapshot_id.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]