Copilot commented on code in PR #536:
URL: https://github.com/apache/hudi-rs/pull/536#discussion_r2900883240


##########
python/tests/test_table_read.py:
##########
@@ -81,139 +80,116 @@ def 
test_read_table_can_read_from_batches(get_sample_table):
         )
     )
     t = pa.Table.from_batches([batch])
-    assert t.num_rows == 1
+    assert t.num_rows > 0
     assert t.num_columns == 11
 
     file_slices_gen = iter(table.get_file_slices_splits(2))
-    assert len(next(file_slices_gen)) == 3
     assert len(next(file_slices_gen)) == 2
+    assert len(next(file_slices_gen)) == 1
 
 
-def test_read_table_returns_correct_data(get_sample_table):
-    table_path = get_sample_table
-    table = HudiTable(table_path)
+def test_read_table_returns_correct_data(v8_trips_table):
+    table = HudiTable(v8_trips_table)
 
     batches = table.read_snapshot()
-    t = pa.Table.from_batches(batches).select([0, 5, 6, 9]).sort_by("ts")
-    assert t.to_pylist() == [
-        {
-            "_hoodie_commit_time": "20240402144910683",
-            "ts": 1695046462179,
-            "uuid": "9909a8b1-2d15-4d3d-8ec9-efc48c536a00",
-            "fare": 339.0,
-        },
-        {
-            "_hoodie_commit_time": "20240402123035233",
-            "ts": 1695091554788,
-            "uuid": "e96c4396-3fad-413a-a942-4cb36106d721",
-            "fare": 27.7,
-        },
-        {
-            "_hoodie_commit_time": "20240402123035233",
-            "ts": 1695115999911,
-            "uuid": "c8abbe79-8d89-47ea-b4ce-4d224bae5bfa",
-            "fare": 17.85,
-        },
-        {
-            "_hoodie_commit_time": "20240402123035233",
-            "ts": 1695159649087,
-            "uuid": "334e26e9-8355-45cc-97c6-c31daf0df330",
-            "fare": 19.1,
-        },
-        {
-            "_hoodie_commit_time": "20240402123035233",
-            "ts": 1695516137016,
-            "uuid": "e3cf430c-889d-4015-bc98-59bdce1e530c",
-            "fare": 34.15,
-        },
-    ]
-
+    t = (
+        pa.Table.from_batches(batches)
+        .select(["ts", "uuid", "rider", "fare"])
+        .sort_by("ts")
+    )
+    rows = t.to_pylist()
+
+    # 6 surviving rows (8 inserts - 2 deletes: rider-F, rider-J)
+    assert len(rows) == 6
+
+    # Verify riders and fares derived from SQL operations
+    rider_fares = {r["rider"]: r["fare"] for r in rows}
+    assert rider_fares == {
+        "rider-D": 33.9,
+        "rider-C": 27.7,
+        "rider-A": 0.0,  # updated fare=0
+        "rider-I": 41.06,
+        "rider-E": 93.5,
+        "rider-G": 0.0,  # updated fare=0
+    }
 
-def test_read_table_for_partition(get_sample_table):
-    table_path = get_sample_table
-    table = HudiTable(table_path)
+    # Deleted riders should not be present
+    assert "rider-F" not in rider_fares
+    assert "rider-J" not in rider_fares
 
-    batches = table.read_snapshot([("city", "=", "san_francisco")])
-    t = pa.Table.from_batches(batches).select([0, 5, 6, 9]).sort_by("ts")
-    assert t.to_pylist() == [
-        {
-            "_hoodie_commit_time": "20240402144910683",
-            "ts": 1695046462179,
-            "uuid": "9909a8b1-2d15-4d3d-8ec9-efc48c536a00",
-            "fare": 339.0,
-        },
-        {
-            "_hoodie_commit_time": "20240402123035233",
-            "ts": 1695091554788,
-            "uuid": "e96c4396-3fad-413a-a942-4cb36106d721",
-            "fare": 27.7,
-        },
-        {
-            "_hoodie_commit_time": "20240402123035233",
-            "ts": 1695159649087,
-            "uuid": "334e26e9-8355-45cc-97c6-c31daf0df330",
-            "fare": 19.1,
-        },
-    ]
+    # Verify UUIDs match the original inserts
+    uuid_riders = {r["uuid"]: r["rider"] for r in rows}
+    assert uuid_riders["334e26e9-8355-45cc-97c6-c31daf0df330"] == "rider-A"
+    assert uuid_riders["9909a8b1-2d15-4d3d-8ec9-efc48c536a00"] == "rider-D"
+    assert uuid_riders["7a84095f-737f-40bc-b62f-6b69664712d2"] == "rider-G"
 
 
-def test_table_apis_as_of_timestamp(get_sample_table):
-    table_path = get_sample_table
-    table = HudiTable(table_path)
-    timestamp = "20240402123035233"
+def test_read_table_for_partition(v8_trips_table):
+    table = HudiTable(v8_trips_table)
 
-    file_slices_gen = table.get_file_slices_splits_as_of(2, timestamp)
-    file_slices_base_paths = set(
-        f.base_file_relative_path() for f in 
chain.from_iterable(file_slices_gen)
+    batches = table.read_snapshot([("city", "=", "san_francisco")])
+    t = (
+        pa.Table.from_batches(batches)
+        .select(["ts", "uuid", "rider", "fare"])
+        .sort_by("ts")
     )
-    assert file_slices_base_paths == {
-        
"san_francisco/780b8586-3ad0-48ef-a6a1-d2217845ce4a-0_0-8-0_20240402123035233.parquet",
-        
"san_francisco/d9082ffd-2eb1-4394-aefc-deb4a61ecc57-0_1-9-0_20240402123035233.parquet",
-        
"san_francisco/5a226868-2934-4f84-a16f-55124630c68d-0_2-10-0_20240402123035233.parquet",
-        
"sao_paulo/ee915c68-d7f8-44f6-9759-e691add290d8-0_3-11-0_20240402123035233.parquet",
-        
"chennai/68d3c349-f621-4cd8-9e8b-c6dd8eb20d08-0_4-12-0_20240402123035233.parquet",
+    rows = t.to_pylist()
+
+    # san_francisco has 4 surviving rows: rider-D, C, A, E
+    assert len(rows) == 4
+    rider_fares = {r["rider"]: r["fare"] for r in rows}
+    assert rider_fares == {
+        "rider-D": 33.9,
+        "rider-C": 27.7,
+        "rider-A": 0.0,
+        "rider-E": 93.5,
     }
 
-    batches = table.read_snapshot_as_of(timestamp)
-    t = pa.Table.from_batches(batches).select([0, 5, 6, 9]).sort_by("ts")
-    assert t.to_pylist() == [
-        {
-            "_hoodie_commit_time": "20240402123035233",
-            "ts": 1695046462179,
-            "uuid": "9909a8b1-2d15-4d3d-8ec9-efc48c536a00",
-            "fare": 33.9,
-        },
-        {
-            "_hoodie_commit_time": "20240402123035233",
-            "ts": 1695091554788,
-            "uuid": "e96c4396-3fad-413a-a942-4cb36106d721",
-            "fare": 27.7,
-        },
-        {
-            "_hoodie_commit_time": "20240402123035233",
-            "ts": 1695115999911,
-            "uuid": "c8abbe79-8d89-47ea-b4ce-4d224bae5bfa",
-            "fare": 17.85,
-        },
-        {
-            "_hoodie_commit_time": "20240402123035233",
-            "ts": 1695159649087,
-            "uuid": "334e26e9-8355-45cc-97c6-c31daf0df330",
-            "fare": 19.1,
-        },
-        {
-            "_hoodie_commit_time": "20240402123035233",
-            "ts": 1695516137016,
-            "uuid": "e3cf430c-889d-4015-bc98-59bdce1e530c",
-            "fare": 34.15,
-        },
+
+def test_table_apis_as_of_timestamp(v8_trips_table):
+    table = HudiTable(v8_trips_table)
+
+    # Get the first commit timestamp from the timeline
+    timeline = table.get_timeline()
+    all_commits = timeline.get_completed_commits()
+    first_commit = all_commits[0].timestamp
+

Review Comment:
  `all_commits[0]` will raise an `IndexError` if the timeline is unexpectedly 
empty (e.g., due to a table extraction failure or a corrupt fixture). Adding an 
explicit `assert all_commits` (or `assert len(all_commits) > 0`) before 
indexing will make such failures easier to diagnose.



##########
python/tests/test_table_incremental_read.py:
##########
@@ -15,24 +15,36 @@
 #  specific language governing permissions and limitations
 #  under the License.
 
+"""Incremental read tests using v9_txns_simple_nometa (COW, partitioned by 
region).
+
+SQL sequence: INSERT 8 rows, then UPDATE TXN-001 txn_type='reversal', etc.
+The incremental read from the insert commit to the update commit should
+return the updated TXN-001 row.
+"""
+
 import pyarrow as pa
 
 from hudi import HudiTable
+from hudi._internal import get_test_table_path
 
 
-def test_table_incremental_read_returns_correct_data(get_sample_table):
-    table_path = get_sample_table
+def test_table_incremental_read_returns_correct_data():
+    table_path = get_test_table_path("v9_txns_simple_nometa", "cow")
     table = HudiTable(table_path)

Review Comment:
   This test relies on `hudi._internal.get_test_table_path`, which is only 
exposed when the extension is built with the `testing` feature. Without that 
feature, pytest will fail during import/collection. Either ensure the test 
build enables `testing` or guard the import with a skip and a clear message.



##########
python/src/testing_internal.rs:
##########
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+use hudi_test::{QuickstartTripsTable, SampleTable};
+use pyo3::exceptions::PyValueError;
+use pyo3::prelude::*;
+
+use crate::internal::rt;
+
+#[pyfunction]
+#[pyo3(signature = (table_name, table_type))]
+pub fn get_test_table_path(table_name: &str, table_type: &str) -> 
PyResult<String> {
+    if let Ok(table) = table_name.parse::<SampleTable>() {
+        return match table_type {
+            "cow" => Ok(table.path_to_cow()),
+            "mor_avro" => Ok(table.path_to_mor_avro()),
+            "mor_parquet" => Ok(table.path_to_mor_parquet()),
+            _ => Err(PyValueError::new_err(format!(
+                "Unknown table type: {table_type}. Use 'cow', 'mor_avro', or 
'mor_parquet'"
+            ))),
+        };
+    }
+
+    if let Ok(table) = table_name.parse::<QuickstartTripsTable>() {
+        return match table_type {
+            "mor_avro" => Ok(table.path_to_mor_avro()),
+            _ => Err(PyValueError::new_err(format!(
+                "QuickstartTripsTable only supports 'mor_avro'. Got: 
{table_type}"
+            ))),
+        };
+    }
+
+    Err(PyValueError::new_err(format!(
+        "Unknown test table: {table_name}"
+    )))
+}
+
+#[pyfunction]
+#[pyo3(signature = (table_name, cow, partitioned))]
+pub fn verify_v9_txns_table(table_name: &str, cow: bool, partitioned: bool) -> 
PyResult<()> {
+    let table: SampleTable = table_name
+        .parse()
+        .map_err(|_| PyValueError::new_err(format!("Unknown sample table: 
{table_name}")))?;
+    rt().block_on(hudi_test::v9_verification::verify_v9_txns_table(
+        &table,
+        cow,
+        partitioned,
+    ));
+    Ok(())

Review Comment:
   `verify_v9_txns_table` runs a potentially long DataFusion verification via 
`rt().block_on(...)` while holding the Python GIL (the function signature 
doesn't take `py: Python` and doesn't use `allow_threads`/`detach`). This can 
block other Python threads during test runs. Consider accepting a `py: Python` 
argument and running the verification under `py.allow_threads(...)` (and 
ideally converting failures into a `PyErr` rather than panicking inside Rust).



##########
python/tests/test_file_group_read.py:
##########
@@ -15,56 +15,61 @@
 #  specific language governing permissions and limitations
 #  under the License.
 
+"""File group reader tests using v8_trips_8i3u1d (MOR, partitioned by city).
+
+SQL: 8 inserts, 3 updates (A, J, G fare=0), 2 deletes (F, J).
+Final: san_francisco(4 rows), sao_paulo(1), chennai(1).
+"""
+
 import pyarrow as pa
 
-from hudi import HudiFileGroupReader
+from hudi import HudiFileGroupReader, HudiTable
+
 
-TEST_SAMPLE_BASE_FILE = 
"san_francisco/780b8586-3ad0-48ef-a6a1-d2217845ce4a-0_0-8-0_20240402123035233.parquet"
+def test_file_group_api_read_file_slice(v8_trips_table):
+    table = HudiTable(v8_trips_table)
+    file_group_reader = HudiFileGroupReader(v8_trips_table)
 
+    # Get the san_francisco file slice (has 4 rows: rider-A, C, D, E)
+    file_slices = table.get_file_slices()
+    sf_slice = [
+        f for f in file_slices if "san_francisco" in 
f.base_file_relative_path()
+    ][0]

Review Comment:
  Using `[0]` on the filtered `file_slices` list will raise an `IndexError`, 
producing a less informative failure, if the expected partition isn't present. 
Prefer asserting that a matching slice exists (or using `next(..., None)` with 
an explicit assertion) so test failures are clearer.



##########
python/src/lib.rs:
##########
@@ -41,6 +44,13 @@ fn _internal(_py: Python, m: &Bound<'_, PyModule>) -> 
PyResult<()> {
         m.add_class::<HudiDataFusionDataSource>()?;
     }
 
+    #[cfg(feature = "testing")]
+    {
+        use testing_internal::{get_test_table_path, verify_v9_txns_table};
+        m.add_function(wrap_pyfunction!(get_test_table_path, m)?)?;
+        m.add_function(wrap_pyfunction!(verify_v9_txns_table, m)?)?;
+    }

Review Comment:
   `get_test_table_path`/`verify_v9_txns_table` are only registered when the 
Rust extension is built with the `testing` Cargo feature (`#[cfg(feature = 
"testing")]`). The updated Python tests import these symbols unconditionally, 
so running the test suite with a non-`testing` build of `hudi._internal` will 
raise `ImportError` during collection. Consider either enabling the `testing` 
feature in the test build pipeline, or registering these helpers under an 
existing test-enabled feature used in CI so the Python tests can run 
consistently.



##########
python/src/testing_internal.rs:
##########
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+use hudi_test::{QuickstartTripsTable, SampleTable};
+use pyo3::exceptions::PyValueError;
+use pyo3::prelude::*;
+
+use crate::internal::rt;
+
+#[pyfunction]
+#[pyo3(signature = (table_name, table_type))]
+pub fn get_test_table_path(table_name: &str, table_type: &str) -> 
PyResult<String> {
+    if let Ok(table) = table_name.parse::<SampleTable>() {
+        return match table_type {
+            "cow" => Ok(table.path_to_cow()),
+            "mor_avro" => Ok(table.path_to_mor_avro()),
+            "mor_parquet" => Ok(table.path_to_mor_parquet()),
+            _ => Err(PyValueError::new_err(format!(
+                "Unknown table type: {table_type}. Use 'cow', 'mor_avro', or 
'mor_parquet'"
+            ))),
+        };

Review Comment:
  `get_test_table_path` currently advertises support for `"mor_avro"` and 
`"mor_parquet"` for any `SampleTable`, but several sample tables do not have 
those zip assets (e.g., v9 tables appear to be MOR-avro-only; v8 sample tables 
appear cow-only). Calling `table.path_to_*()` when the zip is missing will panic 
inside `hudi_test::extract_test_table` (due to `fs::read(...).unwrap()`), 
crashing the Python process instead of raising a Python exception. Consider 
restricting the allowed `table_type` values based on `table_name`/version, or 
adding an existence check and returning a `PyValueError` when the requested 
format isn't available.



##########
python/tests/conftest.py:
##########
@@ -15,26 +15,11 @@
 #  specific language governing permissions and limitations
 #  under the License.
 
-import os
-import pathlib
-import zipfile
-
 import pytest
 
-
-def _extract_testing_table(zip_file_path, target_path) -> str:
-    with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
-        zip_ref.extractall(target_path)
-    return os.path.join(target_path, "trips_table")
+from hudi._internal import get_test_table_path
 
 
[email protected](
-    params=[
-        "0.x_cow_partitioned",
-    ]
-)
-def get_sample_table(request, tmp_path) -> str:
-    fixture_path = pathlib.Path(__file__).parent.joinpath("table")
-    table_name = request.param
-    zip_file_path = pathlib.Path(fixture_path).joinpath(f"{table_name}.zip")
-    return _extract_testing_table(zip_file_path, tmp_path)
[email protected]
+def v8_trips_table() -> str:
+    return get_test_table_path("v8_trips_8i3u1d", "mor_avro")

Review Comment:
   This fixture depends on `hudi._internal.get_test_table_path`, which is only 
exposed when the extension is built with the `testing` feature. As written, 
pytest collection will fail with `ImportError` if tests run against a 
non-`testing` wheel/build. Either adjust the test build to enable the `testing` 
feature or make this fixture robust to missing test-only helpers (e.g., skip 
with a clear message).



##########
python/tests/test_datafusion_read.py:
##########
@@ -15,18 +15,71 @@
 #  specific language governing permissions and limitations
 #  under the License.
 
-from datafusion import SessionContext
+"""Query result verification tests for DataFusion integration with v9 Hudi 
tables.
 
-from hudi import HudiDataFusionDataSource
+Tests actual record contents after INSERT, UPDATE, DELETE, INSERT OVERWRITE,
+compaction (MOR), and clustering operations.
 
+Verification logic is shared with Rust tests via hudi_test::v9_verification.
+"""
 
-def test_datafusion_table_registry(get_sample_table):
-    table_path = get_sample_table
+import pytest
 
-    table = HudiDataFusionDataSource(
-        table_path, [("hoodie.read.use.read_optimized.mode", "true")]
-    )
-    ctx = SessionContext()
-    ctx.register_table("trips", table)
-    df = ctx.sql("SELECT  city from trips order by city desc limit 
1").to_arrow_table()
-    assert df.to_pylist() == [{"city": "sao_paulo"}]
+from hudi._internal import verify_v9_txns_table
+

Review Comment:
   This test imports `verify_v9_txns_table` from `hudi._internal`, but that 
symbol is only available when the extension is built with the `testing` Cargo 
feature. If CI/dev builds only enable `datafusion`, this test file will fail to 
import during collection. Ensure the test build enables `testing` as well, or 
guard the import with a skip so the suite remains runnable.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to