(airflow) branch v3-2-test updated: [v3-2-test] Prevent AlreadyRunningBackfill error caused by invalid date range request (#66874) (#67250)

bbovenzi Thu, 21 May 2026 08:44:45 -0700

This is an automated email from the ASF dual-hosted git repository.

bbovenzi pushed a commit to branch v3-2-test
in repository https://gitbox.apache.org/repos/asf/airflow.git



The following commit(s) were added to refs/heads/v3-2-test by this push:
     new b6692ee6ef8 [v3-2-test] Prevent AlreadyRunningBackfill error caused by invalid date range request (#66874) (#67250)
b6692ee6ef8 is described below

commit b6692ee6ef80f2f7a35798c8bf9c8bd6b52818e6
Author: Yeonguk Choo <[email protected]>
AuthorDate: Fri May 22 00:44:27 2026 +0900

    [v3-2-test] Prevent AlreadyRunningBackfill error caused by invalid date range request (#66874) (#67250)
    
    * Fix OTel timer metrics using Gauge instead of Histogram (#64207) (#66865)
    
    * Fix OTel timer metrics using Gauge instead of Histogram
    
    * Use ExponentialBucketHistogramAggregation for timing metrics
    
    * Use public API import path for ExponentialBucketHistogramAggregation and fix histogram map isolation
    
    (cherry picked from commit b2dadd2b7623d0d99f6fea0521bf008b7b957cac)
    
    Co-authored-by: namratachaudhary <[email protected]>
    
    * [v3-2-test] Prevent AlreadyRunningBackfill error caused by invalid date range request (#66874)
    
    When a backfill is requested with from_date after to_date, the Backfill
    record was committed before _get_info_list() returned an empty list, leaving
    an orphaned record that blocked subsequent backfills with
    AlreadyRunningBackfill until the scheduler's 2-minute cleanup ran.
    
    Add an InvalidBackfillDateRange exception and validate from_date <= to_date
    at the top of _validate_backfill_params(), before any DB operations.
    
    (cherry picked from commit 153623856efb44a3f60ef9fdb67e22e05609e067)
    
    ---------
    
    Co-authored-by: Rahul Vats <[email protected]>
    Co-authored-by: namratachaudhary <[email protected]>
    Co-authored-by: Park Jiwon <[email protected]>
---
 airflow-core/newsfragments/64207.significant.rst   |  1 +
 .../core_api/routes/public/backfills.py            |  3 ++
 airflow-core/src/airflow/models/backfill.py        | 21 ++++++--
 airflow-core/tests/unit/models/test_backfill.py    | 18 +++++++
 .../observability/metrics/otel_logger.py           | 58 +++++++++++++++++++---
 .../observability/metrics/test_otel_logger.py      | 47 ++++++++++++------
 6 files changed, 125 insertions(+), 23 deletions(-)

diff --git a/airflow-core/newsfragments/64207.significant.rst 
b/airflow-core/newsfragments/64207.significant.rst
new file mode 100644
index 00000000000..3254fa20a54
--- /dev/null
+++ b/airflow-core/newsfragments/64207.significant.rst
@@ -0,0 +1 @@
+OTel timer and timing metrics now use Histogram instead of Gauge, preserving count, sum, and bucket distribution across recordings.
diff --git 
a/airflow-core/src/airflow/api_fastapi/core_api/routes/public/backfills.py 
b/airflow-core/src/airflow/api_fastapi/core_api/routes/public/backfills.py
index 5d4e112df18..d6373f1d52a 100644
--- a/airflow-core/src/airflow/api_fastapi/core_api/routes/public/backfills.py
+++ b/airflow-core/src/airflow/api_fastapi/core_api/routes/public/backfills.py
@@ -52,6 +52,7 @@ from airflow.models.backfill import (
     DagNoScheduleException,
     InvalidBackfillConf,
     InvalidBackfillDate,
+    InvalidBackfillDateRange,
     InvalidBackfillDirection,
     InvalidReprocessBehavior,
     _create_backfill,
@@ -265,6 +266,7 @@ def create_backfill(
         InvalidBackfillDirection,
         DagNoScheduleException,
         InvalidBackfillDate,
+        InvalidBackfillDateRange,
         InvalidBackfillConf,
     ) as e:
         raise RequestValidationError(str(e))
@@ -314,6 +316,7 @@ def create_backfill_dry_run(
         InvalidBackfillDirection,
         DagNoScheduleException,
         InvalidBackfillDate,
+        InvalidBackfillDateRange,
         InvalidBackfillConf,
     ) as e:
         raise RequestValidationError(str(e))
diff --git a/airflow-core/src/airflow/models/backfill.py 
b/airflow-core/src/airflow/models/backfill.py
index 24c102f1cfc..4f4dba38bf0 100644
--- a/airflow-core/src/airflow/models/backfill.py
+++ b/airflow-core/src/airflow/models/backfill.py
@@ -100,6 +100,14 @@ class InvalidBackfillDate(AirflowException):
     """
 
 
+class InvalidBackfillDateRange(AirflowException):
+    """
+    Raised when from_date is after to_date in a backfill request.
+
+    :meta private:
+    """
+
+
 class InvalidBackfillConf(AirflowException):
     """
     Raised when the provided ``dag_run_conf`` fails validation against the 
DAG's params.
@@ -259,6 +267,16 @@ def _validate_backfill_params(
     reprocess_behavior: ReprocessBehavior | None,
     dag_run_conf: dict | None = None,
 ) -> None:
+
+    if from_date > to_date:
+        raise InvalidBackfillDateRange(
+            f"from_date ({from_date.isoformat()}) must not be after to_date 
({to_date.isoformat()})."
+        )
+
+    current_time = timezone.utcnow()
+    if from_date >= current_time and to_date >= current_time:
+        raise InvalidBackfillDate("Backfill cannot be executed for future 
dates.")
+
     depends_on_past = any(x.depends_on_past for x in dag.tasks)
     if depends_on_past:
         if reverse is True:
@@ -270,9 +288,6 @@ def _validate_backfill_params(
                 "Dag has tasks for which depends_on_past=True. "
                 "You must set reprocess behavior to reprocess completed or 
reprocess failed."
             )
-    current_time = timezone.utcnow()
-    if from_date >= current_time and to_date >= current_time:
-        raise InvalidBackfillDate("Backfill cannot be executed for future 
dates.")
     if dag_run_conf is not None:
         try:
             dag.params.deep_merge(dag_run_conf).validate()
diff --git a/airflow-core/tests/unit/models/test_backfill.py 
b/airflow-core/tests/unit/models/test_backfill.py
index 2749e0e6776..67b2b727a76 100644
--- a/airflow-core/tests/unit/models/test_backfill.py
+++ b/airflow-core/tests/unit/models/test_backfill.py
@@ -33,6 +33,7 @@ from airflow.models.backfill import (
     BackfillDagRun,
     BackfillDagRunExceptionReason,
     InvalidBackfillConf,
+    InvalidBackfillDateRange,
     InvalidBackfillDirection,
     InvalidReprocessBehavior,
     ReprocessBehavior,
@@ -779,3 +780,20 @@ def test_get_latest_dag_run_row_partitioned(session: 
Session):
     dr = session.scalar(stmt)
     assert dr is not None
     assert dr.start_date == timezone.parse("2026-02-23")
+
+
+def test_create_backfill_from_date_after_to_date_raises(dag_maker, session):
+    with dag_maker(schedule="@daily") as dag:
+        PythonOperator(task_id="hi", python_callable=print)
+    session.commit()
+
+    with pytest.raises(InvalidBackfillDateRange, match="must not be after 
to_date"):
+        _create_backfill(
+            dag_id=dag.dag_id,
+            from_date=pendulum.parse("2026-05-13"),
+            to_date=pendulum.parse("2026-05-12"),
+            max_active_runs=2,
+            reverse=False,
+            triggering_user_name="pytest",
+            dag_run_conf={},
+        )
diff --git 
a/shared/observability/src/airflow_shared/observability/metrics/otel_logger.py 
b/shared/observability/src/airflow_shared/observability/metrics/otel_logger.py
index c8db3ee02f9..b7b8effe8dc 100644
--- 
a/shared/observability/src/airflow_shared/observability/metrics/otel_logger.py
+++ 
b/shared/observability/src/airflow_shared/observability/metrics/otel_logger.py
@@ -30,6 +30,7 @@ from opentelemetry.sdk.metrics._internal.export import (
     ConsoleMetricExporter,
     PeriodicExportingMetricReader,
 )
+from opentelemetry.sdk.metrics.view import 
ExponentialBucketHistogramAggregation, View
 from opentelemetry.sdk.resources import SERVICE_NAME, Resource
 
 from ..common import get_otel_data_exporter
@@ -146,7 +147,8 @@ class _OtelTimer(Timer):
     """
     An implementation of Stats.Timer() which records the result in the OTel 
Metrics Map.
 
-    OpenTelemetry does not have a native timer, we will store the values as a 
Gauge.
+    OpenTelemetry does not have a native timer; values are stored as a 
Histogram so that
+    all observations (count, sum, bucket distribution) are preserved across 
multiple recordings.
 
     :param name: The name of the timer.
     :param tags: Tags to append to the timer.
@@ -160,9 +162,9 @@ class _OtelTimer(Timer):
 
     def stop(self, send: bool = True) -> None:
         super().stop(send)
-        if self.name and send and self.duration:
-            self.otel_logger.metrics_map.set_gauge_value(
-                full_name(prefix=self.otel_logger.prefix, name=self.name), 
self.duration, False, self.tags
+        if self.name and send and self.duration is not None:
+            self.otel_logger.metrics_map.record_histogram_value(
+                full_name(prefix=self.otel_logger.prefix, name=self.name), 
self.duration, self.tags
             )
 
 
@@ -278,11 +280,11 @@ class SafeOtelLogger:
         *,
         tags: Attributes = None,
     ) -> None:
-        """OTel does not have a native timer, stored as a Gauge whose value is 
elapsed ms."""
+        """Record a timing observation as a Histogram to preserve distribution 
information."""
         if self.metrics_validator.test(stat) and 
name_is_otel_safe(self.prefix, stat):
             if isinstance(dt, datetime.timedelta):
                 dt = dt.total_seconds() * 1000.0
-            self.metrics_map.set_gauge_value(full_name(prefix=self.prefix, 
name=stat), float(dt), False, tags)
+            
self.metrics_map.record_histogram_value(full_name(prefix=self.prefix, 
name=stat), float(dt), tags)
 
     def timer(
         self,
@@ -314,15 +316,29 @@ class InternalGauge:
         self.gauge.set(new_value, attributes=self.attributes)
 
 
+class InternalHistogram:
+    """Stores a histogram instrument for timer/timing metrics."""
+
+    def __init__(self, meter, name: str):
+        otel_safe_name = _get_otel_safe_name(name)
+        self.histogram = meter.create_histogram(name=otel_safe_name, unit="ms")
+        log.debug("Created %s as type: %s", otel_safe_name, 
_type_as_str(self.histogram))
+
+    def record(self, value: float, tags: Attributes) -> None:
+        self.histogram.record(value, attributes=tags)
+
+
 class MetricsMap:
     """Stores Otel Instruments."""
 
     def __init__(self, meter):
         self.meter = meter
         self.map = {}
+        self.histograms: dict[str, InternalHistogram] = {}
 
     def clear(self) -> None:
         self.map.clear()
+        self.histograms.clear()
 
     def _create_counter(self, name):
         """Create a new counter or up_down_counter for the provided name."""
@@ -376,6 +392,21 @@ class MetricsMap:
 
         self.map[key].set_value(value, delta)
 
+    def record_histogram_value(self, name: str, value: float, tags: 
Attributes) -> None:
+        """
+        Record a timing observation in a Histogram instrument.
+
+        Unlike a Gauge, a Histogram accumulates all observations so that 
count, sum,
+        and bucket distribution are preserved across multiple recordings.
+
+        :param name: The name of the histogram to record.
+        :param value: The timing observation in milliseconds.
+        :param tags: Attributes to attach to the observation.
+        """
+        if name not in self.histograms:
+            self.histograms[name] = InternalHistogram(meter=self.meter, 
name=name)
+        self.histograms[name].record(value, tags)
+
 
 def flush_otel_metrics():
     provider = metrics.get_meter_provider()
@@ -400,6 +431,15 @@ def get_otel_logger(
     stat_name_handler: Callable[[str], str] | None = None,
     statsd_influxdb_enabled: bool = False,
 ) -> SafeOtelLogger:
+    """
+    Build and return a :class:`SafeOtelLogger` backed by a configured 
:class:`MeterProvider`.
+
+    Histogram instruments (used for ``timing()`` / ``timer()`` metrics) are 
aggregated with
+    
:class:`~opentelemetry.sdk.metrics.view.ExponentialBucketHistogramAggregation`
+    so that bucket boundaries adapt automatically to the observed data range.  
This avoids
+    the need to hand-tune explicit bucket boundaries for metrics that span 
very different
+    scales (milliseconds to hours).
+    """
     otel_env_config = load_metrics_env_config()
 
     effective_service_name: str = otel_env_config.service_name or service_name 
or "airflow"
@@ -453,6 +493,12 @@ def get_otel_logger(
         MeterProvider(
             resource=resource,
             metric_readers=readers,
+            views=[
+                View(
+                    instrument_type=metrics.Histogram,
+                    aggregation=ExponentialBucketHistogramAggregation(),
+                )
+            ],
             shutdown_on_exit=False,
         ),
     )
diff --git 
a/shared/observability/tests/observability/metrics/test_otel_logger.py 
b/shared/observability/tests/observability/metrics/test_otel_logger.py
index f7b348354d7..3c1369ec44b 100644
--- a/shared/observability/tests/observability/metrics/test_otel_logger.py
+++ b/shared/observability/tests/observability/metrics/test_otel_logger.py
@@ -25,6 +25,7 @@ from unittest import mock
 
 import pytest
 from opentelemetry.metrics import MeterProvider
+from opentelemetry.sdk.metrics.view import 
ExponentialBucketHistogramAggregation, View
 
 from airflow_shared.observability.common import get_otel_data_exporter
 from airflow_shared.observability.exceptions import InvalidStatsNameException
@@ -244,25 +245,28 @@ class TestOtelMetrics:
 
         self.stats.timing(name, dt=datetime.timedelta(seconds=123))
 
-        
self.meter.get_meter().create_gauge.assert_called_once_with(name=full_name(name))
-        expected_value = 123000.0
-        assert self.map[full_name(name)].value == expected_value
+        
self.meter.get_meter().create_histogram.assert_called_once_with(name=full_name(name),
 unit="ms")
+        
self.meter.get_meter().create_histogram.return_value.record.assert_called_once_with(
+            123000.0, attributes=None
+        )
 
     def test_timing_new_metric_with_tags(self, name):
         tags = {"hello": "world"}
-        key = _generate_key_name(full_name(name), tags)
 
         self.stats.timing(name, dt=1, tags=tags)
 
-        
self.meter.get_meter().create_gauge.assert_called_once_with(name=full_name(name))
-        self.map[key].attributes == tags
+        
self.meter.get_meter().create_histogram.assert_called_once_with(name=full_name(name),
 unit="ms")
+        
self.meter.get_meter().create_histogram.return_value.record.assert_called_once_with(
+            1.0, attributes=tags
+        )
 
     def test_timing_existing_metric(self, name):
         self.stats.timing(name, dt=1)
         self.stats.timing(name, dt=2)
 
-        
self.meter.get_meter().create_gauge.assert_called_once_with(name=full_name(name))
-        assert self.map[full_name(name)].value == 2
+        # histogram created only once, but both observations are recorded
+        
self.meter.get_meter().create_histogram.assert_called_once_with(name=full_name(name),
 unit="ms")
+        assert 
self.meter.get_meter().create_histogram.return_value.record.call_count == 2
 
     # For the four test_timer_foo tests below:
     #   time.perf_count() is called once to get the starting timestamp and 
again
@@ -277,7 +281,7 @@ class TestOtelMetrics:
         expected_duration = 3140.0
         assert timer.duration == expected_duration
         assert mock_time.call_count == 2
-        
self.meter.get_meter().create_gauge.assert_called_once_with(name=full_name(name))
+        
self.meter.get_meter().create_histogram.assert_called_once_with(name=full_name(name),
 unit="ms")
 
     @mock.patch.object(time, "perf_counter", side_effect=[0.0, 3.14])
     def test_timer_no_name_returns_float_but_does_not_store_value(self, 
mock_time, name):
@@ -288,7 +292,7 @@ class TestOtelMetrics:
         expected_duration = 3140.0
         assert timer.duration == expected_duration
         assert mock_time.call_count == 2
-        self.meter.get_meter().create_gauge.assert_not_called()
+        self.meter.get_meter().create_histogram.assert_not_called()
 
     @mock.patch.object(time, "perf_counter", side_effect=[0.0, 3.14])
     def test_timer_start_and_stop_manually_send_false(self, mock_time, name):
@@ -301,7 +305,7 @@ class TestOtelMetrics:
         expected_value = 3140.0
         assert timer.duration == expected_value
         assert mock_time.call_count == 2
-        self.meter.get_meter().create_gauge.assert_not_called()
+        self.meter.get_meter().create_histogram.assert_not_called()
 
     @mock.patch.object(time, "perf_counter", side_effect=[0.0, 3.14])
     def test_timer_start_and_stop_manually_send_true(self, mock_time, name):
@@ -314,7 +318,7 @@ class TestOtelMetrics:
         expected_value = 3140.0
         assert timer.duration == expected_value
         assert mock_time.call_count == 2
-        
self.meter.get_meter().create_gauge.assert_called_once_with(name=full_name(name))
+        
self.meter.get_meter().create_histogram.assert_called_once_with(name=full_name(name),
 unit="ms")
 
     @pytest.mark.parametrize(
         (
@@ -415,6 +419,18 @@ class TestOtelMetrics:
                 == 
f"opentelemetry.exporter.otlp.proto.{expected_exporter_module}.metric_exporter"
             )
 
+    @mock.patch("airflow_shared.observability.metrics.otel_logger.metrics")
+    
@mock.patch("airflow_shared.observability.metrics.otel_logger.MeterProvider")
+    def test_get_otel_logger_uses_exponential_histogram_view(self, 
mock_provider, mock_metrics):
+        get_otel_logger(host="localhost", port=4318)
+
+        call_kwargs = mock_provider.call_args.kwargs
+        views = call_kwargs["views"]
+        assert len(views) == 1
+        view = views[0]
+        assert isinstance(view, View)
+        assert isinstance(view._aggregation, 
ExponentialBucketHistogramAggregation)
+
     def test_atexit_flush_on_process_exit(self):
         """
         Run a process that initializes a logger, creates a stat and then exits.
@@ -422,8 +438,11 @@ class TestOtelMetrics:
         The logger initialization registers an atexit hook.
         Test that the hook runs and flushes the created stat at shutdown.
         """
-        test_module_name = "tests.observability.metrics.test_otel_logger"
-        function_call_str = f"import {test_module_name} as m; 
m.mock_service_run()"
+        function_call_str = (
+            "from airflow_shared.observability.metrics.otel_logger import 
get_otel_logger; "
+            "logger = get_otel_logger(debug=True); "
+            "logger.incr('my_test_stat')"
+        )
 
         proc = subprocess.run(
             [sys.executable, "-c", function_call_str],

(airflow) branch v3-2-test updated: [v3-2-test] Prevent AlreadyRunningBackfill error caused by invalid date range request (#66874) (#67250)

Reply via email to