(texera) branch main updated: fix: Adding Floor to idle_time (#4900)

yiconghuang Mon, 04 May 2026 17:16:53 -0700

This is an automated email from the ASF dual-hosted git repository.

Yicong-Huang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/texera.git



The following commit(s) were added to refs/heads/main by this push:
     new 056e39d0d9 fix: Adding Floor to idle_time (#4900)
056e39d0d9 is described below

commit 056e39d0d9a46b25648e671cedee4337ffe3a29c
Author: Matthew B. <[email protected]>
AuthorDate: Mon May 4 17:16:38 2026 -0700

    fix: Adding Floor to idle_time (#4900)
    
    ### What changes were proposed in this PR?
    
    - idle_time was computed as total_execution_time - data_processing_time
    - control_processing_time, with no lower bound — timing races could make
    it negative and propagate bad values to the dashboard
    - Added drift detection warning in update_total_execution_time when the
    new total is less than the accumulated processing time
    - Added monotonicity check in update_total_execution_time to catch
    non-increasing timestamps
      - get_statistics() clamps idle_time to 0 via max(0, ...)
    
    ### Any related issues, documentation, or discussions?
    
    closes: #4892
    
    ### How was this PR tested?
    - Updated TestStatisticsManagerExecutionTime to assert idle_time == 0
    for the overshoot case instead of -30
      - Removed the xfail marker that was tracking the unfixed bug
    
    ### Was this PR authored or co-authored using generative AI tooling?
    <!--
    If generative AI tooling has been used in the process of authoring this
    PR,
    please include the phrase: 'Generated-by: ' followed by the name of the
    tool
    and its version. If no, write 'No'.
    Please refer to the [ASF Generative Tooling
    Guidance](https://www.apache.org/legal/generative-tooling.html) for
    details.
    -->
    Co-Authored with Claude Opus 4.6 in Compliance with ASF
    
    ---------
    
    Signed-off-by: Matthew B. <[email protected]>
---
 .../architecture/managers/statistics_manager.py    | 29 ++++++++++++++++++---
 .../managers/test_statistics_manager.py            | 30 +++++-----------------
 2 files changed, 31 insertions(+), 28 deletions(-)

diff --git 
a/amber/src/main/python/core/architecture/managers/statistics_manager.py 
b/amber/src/main/python/core/architecture/managers/statistics_manager.py
index 6b36b78e57..8151ca3bf1 100644
--- a/amber/src/main/python/core/architecture/managers/statistics_manager.py
+++ b/amber/src/main/python/core/architecture/managers/statistics_manager.py
@@ -18,6 +18,8 @@
 from collections import defaultdict
 from typing import DefaultDict
 
+from loguru import logger
+
 from proto.org.apache.texera.amber.core import PortIdentity
 from proto.org.apache.texera.amber.engine.architecture.worker import (
     WorkerStatistics,
@@ -53,9 +55,12 @@ class StatisticsManager:
             ],
             self._data_processing_time,
             self._control_processing_time,
-            self._total_execution_time
-            - self._data_processing_time
-            - self._control_processing_time,
+            max(
+                0,
+                self._total_execution_time
+                - self._data_processing_time
+                - self._control_processing_time,
+            ),
         )
 
     def increase_input_statistics(self, port_id: PortIdentity, size: int) -> 
None:
@@ -85,7 +90,23 @@ class StatisticsManager:
             raise ValueError(
                 "Current time must be greater than or equal to worker start 
time"
             )
-        self._total_execution_time = time - self._worker_start_time
+        new_total = time - self._worker_start_time
+        if new_total < self._total_execution_time:
+            logger.warning(
+                f"update_total_execution_time called with non-monotonic time: "
+                f"new total {new_total}ns < current total 
{self._total_execution_time}ns. "
+                "Clock skew or out-of-order call detected."
+            )
+        processing_total = self._data_processing_time + 
self._control_processing_time
+        if new_total < processing_total:
+            logger.warning(
+                f"idle_time drift: total_execution_time ({new_total}ns) < "
+                f"data ({self._data_processing_time}ns) + control "
+                f"({self._control_processing_time}ns). "
+                "update_total_execution_time should be called after 
increase_*_processing_time "
+                "with the same end timestamp. idle_time will be clamped to 0."
+            )
+        self._total_execution_time = new_total
 
     def initialize_worker_start_time(self, time: int) -> None:
         # Set the worker start time
diff --git 
a/amber/src/main/python/core/architecture/managers/test_statistics_manager.py 
b/amber/src/main/python/core/architecture/managers/test_statistics_manager.py
index 26d1a3ec64..5abf7a36b4 100644
--- 
a/amber/src/main/python/core/architecture/managers/test_statistics_manager.py
+++ 
b/amber/src/main/python/core/architecture/managers/test_statistics_manager.py
@@ -135,31 +135,13 @@ class TestStatisticsManagerExecutionTime:
         ):
             mgr.update_total_execution_time(999)
 
-    def test_idle_time_can_go_negative_when_processing_exceeds_total(self):
-        # Pin a real-but-questionable behavior: get_statistics computes
-        # idle_time = total_execution - data - control with NO clamp. If
-        # instrumentation overcounts (or update_total_execution_time was
-        # called early), idle goes negative. Filed as a Bug — see the
-        # accompanying issue. A future fix that floors at 0 must also
-        # update this test deliberately.
+    def test_idle_time_clamped_to_zero_when_processing_overshoots(self):
+        # When data+control exceed total_execution_time (e.g. update_total was
+        # called before all increase_* calls for that interval), idle_time is
+        # clamped to 0 and a warning is logged. It must never be negative.
         mgr = StatisticsManager()
         mgr.initialize_worker_start_time(1_000)
         mgr.update_total_execution_time(1_100)  # 100ns total
         mgr.increase_data_processing_time(80)
-        mgr.increase_control_processing_time(50)  # 130 > 100 total
-        stats = mgr.get_statistics()
-        assert stats.idle_time == -30
-
-    @pytest.mark.xfail(
-        strict=True,
-        reason="Bug: idle_time goes negative when data+control processing time 
"
-        "overshoots total_execution_time. The fix should floor at 0 (or 
surface "
-        "the inconsistency); flips to XPASS when corrected.",
-    )
-    def test_idle_time_should_never_be_negative(self):
-        mgr = StatisticsManager()
-        mgr.initialize_worker_start_time(1_000)
-        mgr.update_total_execution_time(1_100)
-        mgr.increase_data_processing_time(80)
-        mgr.increase_control_processing_time(50)
-        assert mgr.get_statistics().idle_time >= 0
+        mgr.increase_control_processing_time(50)  # 130 > 100
+        assert mgr.get_statistics().idle_time == 0

(texera) branch main updated: fix: Adding Floor to idle_time (#4900)

Reply via email to