kaxil commented on code in PR #63491:
URL: https://github.com/apache/airflow/pull/63491#discussion_r3335927338
##########
providers/cncf/kubernetes/src/airflow/providers/cncf/kubernetes/executors/local_kubernetes_executor.py:
##########
@@ -144,7 +135,9 @@ def slots_available(self) -> int:
@property
def slots_occupied(self):
"""Number of tasks this executor instance is currently managing."""
- return len(self.running) + len(self.queued_tasks)
+ return (
+ self.local_executor.slots_occupied +
self.kubernetes_executor.slots_occupied - len(self.running)
Review Comment:
The `- len(self.running)` term is an over-subtraction. Each child's `slots_occupied`
already counts its own running set, and the two children's running sets are
disjoint (a TI runs on exactly one of them), so `local.slots_occupied +
kube.slots_occupied` already equals the old `len(self.running) +
len(self.queued_tasks)`. Subtracting `len(self.running)` on top cancels the
running count entirely, so this returns only the queued total. The scheduler
computes `max_tis = parallelism - sum(e.slots_occupied)`
(scheduler_job_runner.py:1081), so undercounting here lets it dispatch past
`core.parallelism`. Drop the `- len(self.running)`. The same one-line fix
applies to `celery_kubernetes_executor.py`'s `slots_occupied`.
##########
airflow-core/src/airflow/executors/base_executor.py:
##########
@@ -254,58 +330,45 @@ def log_task_event(self, *, event: str, extra: str,
ti_key: WorkloadKey):
return
self._task_event_logs.append(Log(event=event, task_instance=ti_key,
extra=extra))
- def queue_workload(self, workload: ExecutorWorkload, session: Session) ->
None:
- if isinstance(workload, workloads.ExecuteTask):
- ti = workload.ti
- self.queued_tasks[ti.key] = workload
- elif isinstance(workload, workloads.ExecuteCallback):
- if not self.supports_callbacks:
- raise NotImplementedError(
- f"{type(self).__name__} does not support ExecuteCallback
workloads. "
- f"Set supports_callbacks = True and implement callback
handling in _process_workloads(). "
- f"See LocalExecutor or CeleryExecutor for reference
implementation."
- )
- self.queued_callbacks[workload.key] = workload
- else:
- raise ValueError(
- f"Un-handled workload type {type(workload).__name__!r} in
{type(self).__name__}. "
- f"Workload must be one of: ExecuteTask, ExecuteCallback."
+ def queue_workload(self, workload: QueueableWorkload, session: Session) ->
None:
+ if workload.type not in self.supported_workload_types:
+ raise NotImplementedError(
+ f"{type(self).__name__} does not support {workload.type!r}
workloads. "
+ f"Add {workload.type!r} to supported_workload_types and
implement handling "
+ f"in _process_workloads()."
)
+ self.executor_queues[workload.type][workload.key] = workload
- def _get_workloads_to_schedule(self, open_slots: int) ->
list[tuple[WorkloadKey, ExecutorWorkload]]:
+ def _get_workloads_to_schedule(self, open_slots: int) ->
list[tuple[WorkloadKey, QueueableWorkload]]:
"""
Select and return the next batch of workloads to schedule, respecting
priority policy.
- Priority Policy: Callbacks are scheduled before tasks (callbacks
complete existing work).
- Callbacks are processed in FIFO order. Tasks are sorted by
priority_weight (higher priority first).
+ Workloads are sorted by ``WORKLOAD_TYPE_PRIORITY`` (priority assigned
by workload type) first,
+ then by ``sort_key`` within the same priority. Lower priority values
are scheduled first;
+ within the same priority, lower ``sort_key`` values come first
(``sort_key=0`` gives FIFO).
:param open_slots: Number of available execution slots
"""
- workloads_to_schedule: list[tuple[WorkloadKey, ExecutorWorkload]] = []
-
- if self.queued_callbacks:
- for key, workload in self.queued_callbacks.items():
- if len(workloads_to_schedule) >= open_slots:
- break
- workloads_to_schedule.append((key, workload))
-
- if open_slots > len(workloads_to_schedule) and self.queued_tasks:
- for task_key, task_workload in
self.order_queued_tasks_by_priority():
- if len(workloads_to_schedule) >= open_slots:
- break
- workloads_to_schedule.append((task_key, task_workload))
-
- return workloads_to_schedule
+ all_workloads: list[tuple[WorkloadKey, QueueableWorkload]] = [
+ (key, workload) for queue in self.executor_queues.values() for
key, workload in queue.items()
+ ]
+ all_workloads.sort(
+ key=lambda item: (
+ workloads.WORKLOAD_TYPE_PRIORITY.get(item[1].type,
len(workloads.WORKLOAD_TYPE_PRIORITY)),
+ item[1].sort_key,
+ )
+ )
+ return all_workloads[:open_slots]
Review Comment:
When `open_slots` is negative (over-subscribed, e.g. after
`try_adopt_task_instances` adopts more than the free slots on scheduler
restart), `all_workloads[:open_slots]` is a negative-stop slice: it returns
everything except the last `-open_slots` items, so workloads are still
dispatched even though no slots are free. The old accumulate loop broke
immediately when `open_slots <= 0` and returned nothing.
`all_workloads[: max(0, open_slots)]` restores that.
##########
providers/cncf/kubernetes/tests/unit/cncf/kubernetes/executors/test_local_kubernetes_executor.py:
##########
@@ -51,21 +51,18 @@ def test_serve_logs_default_value(self):
def test_cli_commands_vended(self):
assert LocalKubernetesExecutor.get_cli_commands()
- def test_queued_tasks(self):
+ def test_slots_occupied_sums_children_without_deprecation(self):
local_executor_mock = mock.MagicMock()
k8s_executor_mock = mock.MagicMock()
- local_kubernetes_executor =
LocalKubernetesExecutor(local_executor_mock, k8s_executor_mock)
-
- local_queued_tasks = {("dag_id", "task_id", "2020-08-30", 1):
"queued_command"}
- k8s_queued_tasks = {("dag_id_2", "task_id_2", "2020-08-30", 2):
"queued_command"}
+ local_executor_mock.slots_occupied = 3
+ k8s_executor_mock.slots_occupied = 2
+ local_executor_mock.running = {("dag_id", "task_id", "2020-08-30", 1)}
+ k8s_executor_mock.running = set()
- local_executor_mock.queued_tasks = local_queued_tasks
- k8s_executor_mock.queued_tasks = k8s_queued_tasks
-
- expected_queued_tasks = {**local_queued_tasks, **k8s_queued_tasks}
+ local_kubernetes_executor =
LocalKubernetesExecutor(local_executor_mock, k8s_executor_mock)
- assert local_kubernetes_executor.queued_tasks == expected_queued_tasks
- assert len(local_kubernetes_executor.queued_tasks) == 2
+ assert local_kubernetes_executor.slots_occupied == 4
Review Comment:
With `local.slots_occupied=3` and `kube.slots_occupied=2`, the correct
hybrid total is 5, not 4. This assertion bakes in the `- len(self.running)`
over-subtraction from the production property, so it'll need to become `== 5`
once that's fixed.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]