Copilot commented on code in PR #5706:
URL: https://github.com/apache/texera/pull/5706#discussion_r3408881588


##########
amber/src/test/python/core/util/test_virtual_identity.py:
##########
@@ -76,6 +77,35 @@ def 
test_extracts_trailing_index_even_when_layer_name_contains_hyphens(self):
         assert get_worker_index("Worker:WF1-myOp-1st-physical-op-3") == 3
 
 
+class TestGetOperatorId:
+    def test_extracts_operator_id_from_canonical_name(self):
+        assert get_operator_id("Worker:WF1-myOp-main-0") == "myOp"
+
+    def test_isolates_operator_id_containing_hyphens(self):
+        # Load-bearing: operator ids contain dashes; greedy `.+` must still
+        # stop at the final <layer>-<index> tokens.
+        assert (
+            get_operator_id("Worker:WF12-PythonUDFV2-abc-def-main-0")
+            == "PythonUDFV2-abc-def"
+        )
+
+    def test_handles_non_main_layer_and_nonzero_index(self):
+        # The exact case the old `rsplit("-main-0")` got silently wrong.
+        assert get_operator_id("Worker:WF3-op-loopLayer-7") == "op"
+
+    def test_operator_id_ending_in_digits(self):
+        assert get_operator_id("Worker:WF1-op123-main-0") == "op123"
+
+    def test_raises_value_error_on_special_actor_id(self):
+        # Companions like CONTROLLER / SELF must fail loudly, not return junk.
+        with pytest.raises(ValueError, match="Invalid worker ID format"):
+            get_operator_id("CONTROLLER")
+
+    def test_raises_value_error_on_partial_match(self):
+        with pytest.raises(ValueError, match="Invalid worker ID format"):
+            get_operator_id("Worker:WF1-myOp-main")
+

Review Comment:
   `get_operator_id` now uses `fullmatch` to fail on malformed IDs with 
trailing junk, but the test suite doesn't currently cover that specific 
behavior change. Adding a trailing-junk regression test will pin the stricter 
parsing semantics.



##########
amber/src/test/python/core/util/test_virtual_identity.py:
##########
@@ -76,6 +77,35 @@ def 
test_extracts_trailing_index_even_when_layer_name_contains_hyphens(self):
         assert get_worker_index("Worker:WF1-myOp-1st-physical-op-3") == 3

Review Comment:
   The behavior change to `re.fullmatch` is intended to reject trailing junk 
(e.g., `...-7-extra`) rather than silently parsing a prefix. There isn't 
currently a test that exercises this new failure mode for `get_worker_index`, 
so a regression back to `match()` (or equivalent) could slip through.



##########
amber/src/main/python/core/util/virtual_identity.py:
##########
@@ -24,18 +24,34 @@
     ActorVirtualIdentity,
 )
 
-worker_name_pattern = re.compile(r"Worker:WF\d+-.+-(\w+)-(\d+)")
+worker_name_pattern = re.compile(r"Worker:WF(\d+)-(.+)-(\w+)-(\d+)")
 
 MATERIALIZATION_READER_ACTOR_PREFIX = "MATERIALIZATION_READER_"
 
 
 def get_worker_index(worker_id: str) -> int:
-    match = worker_name_pattern.match(worker_id)
+    match = worker_name_pattern.fullmatch(worker_id)
     if match:
-        return int(match.group(2))
+        return int(match.group(4))
     raise ValueError("Invalid worker ID format")

Review Comment:
   `get_operator_id` includes the offending `worker_id` in its `ValueError`, 
but `get_worker_index` still raises a generic message. Including the input in 
the exception would make debugging malformed actor IDs easier and keeps the two 
helpers consistent.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to