diogosilva30 commented on code in PR #65943:
URL: https://github.com/apache/airflow/pull/65943#discussion_r3201592827


##########
providers/edge3/src/airflow/providers/edge3/cli/worker.py:
##########
@@ -595,37 +659,65 @@ async def fetch_and_run_job(self) -> None:
             self.background_tasks.add(task)
             task.add_done_callback(self.background_tasks.discard)
 
-        while job.is_running and results_queue.empty():
+        # Fork path: keep pushing logs while the child is running and has not sent a result yet.
+        # Subprocess path: keep pushing logs while the child is running; status comes from Popen.
+        while job.is_running and (results_queue is None or results_queue.empty()):
             await self._push_logs_in_chunks(job)
             for _ in range(0, self.job_poll_interval * 10):
                 await sleep(0.1)
                 if not job.is_running:
                     break
         await self._push_logs_in_chunks(job)
-        supervisor_msg = (
-            "(Unknown error, no exception details available)"
-            if results_queue.empty()
-            else results_queue.get()
-        )
-        # Ensure that supervisor really ended after we grabbed results from queue
-        while True:
-            if not job.is_running:
-                break
+        # Fork path: drain the result queue BEFORE waiting for the child to fully exit.
+        # A large exception travels through multiprocessing's pipe-backed queue; reading it
+        # here lets the child's feeder thread flush and avoids deadlocking on process exit.
+        # Fresh-interpreter subprocesses do not share Python exception objects with the parent.
+        fork_result = None if (results_queue is None or results_queue.empty()) else results_queue.get()
+        # Wait for the child process to fully exit (fork path: queue is already drained above).
+        while job.is_running:  # noqa: ASYNC110
             await sleep(0.1)
 
         self.jobs.remove(job)
+        # Subprocess stderr is keyed by PID because Job intentionally stores only the process
+        # object. Pop it once the process is done so every completion path owns cleanup.
+        stderr_file_path = (
+            self._subprocess_stderr_files.pop(job.process.pid, None)
+            if isinstance(job.process, subprocess.Popen)
+            else None
+        )
         if job.is_success:
             logger.info("Job completed: %s", job.edge_job.identifier)
             await jobs_set_state(job.edge_job.key, TaskInstanceState.SUCCESS)
+            if stderr_file_path:
+                stderr_file_path.unlink(missing_ok=True)
         else:
-            if isinstance(supervisor_msg, Exception):
-                supervisor_msg = "\n".join(traceback.format_exception(supervisor_msg))
-            logger.error("Job failed: %s with:\n%s", job.edge_job.identifier, supervisor_msg)
+            if isinstance(job.process, subprocess.Popen):

Review Comment:
   Also moved to Job dataclass



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to