New submission from Zhesi Huang <huangzh...@gmail.com>:
i see a case, the worker proc of pool can't be terminated, i try kill -SIGTERM or -SIGINT, but it still can't terminated the worker proc, then the pool __exit__ will hang forever ``` class NonDaemonProcess(multiprocessing.Process): # https://stackoverflow.com/questions/6974695/python-process-pool-non-daemonic # make 'daemon' attribute always return False def _get_daemon(self): return False def _set_daemon(self, value): pass daemon = property(_get_daemon, _set_daemon) def wrapper_copy_parallel(output_name, local_output_path, obs_path): log_thread('upload the content of [%s] outputs' % output_name) try: if os.path.exists(local_output_path): log_thread('%s has %d files to be uploaded' % (output_name, len(file_list))) xxxxxx log_thread('upload the content of [%s] outputs successfully' % output_name) log_thread('it can be accessed at obs path [%s]' % obs_path) else: log_thread('local output path is not found, skip upload the content of [%s] outputs' % output_name) except Exception as upload_exception: err_thread('upload the content of [%s] outputs failed: %s' % (output_name, str(upload_exception))) return 255 return 0 def upload_to_s3(): """ upload the content of local path to s3, handle action [on_completed] :return: """ outputs = [] for local_output_path, (output_name, obs_path, action, _, _) in local_to_target.items(): if action == ACTION_ON_COMPLETED: outputs.append((output_name, local_output_path, obs_path)) if len(outputs) == 0: return 0 with NonDaemonPool(processes=len(outputs)) as pool: results = pool.starmap(wrapper_copy_parallel, outputs) for result in results: if result != 0: return result return 0 ``` stdout/stderr log [ma-test Service Log][INFO][2021/01/05 03:07:49,218]: registered signal handler [ma-test Service Log][INFO][2021/01/05 03:09:40,299]: output-handler finalizing [ma-test Service Log][INFO][2021-01-05 03:09:40,309][NonDaemonPoolWorker-1]: child process calling self.run() [ma-test Service Log][INFO][2021-01-05 03:09:40,311][NonDaemonPoolWorker-1]: upload the content of [] outputs [ma-test Service Log][INFO][2021-01-05 03:09:41,331][Process-1:1]: child process calling self.run() [ma-test Service Log][INFO][2021-01-05 03:09:41,333][Process-1:2]: child process calling self.run() [ma-test Service Log][INFO][2021-01-05 03:09:41,338][Process-1:3]: child process calling self.run() [ma-test Service Log][INFO][2021-01-05 03:09:41,351][Process-1:1]: process shutting down [ma-test Service Log][INFO][2021-01-05 03:09:41,351][Process-1:1]: process exiting with exitcode 0 [ma-test Service Log][INFO][2021-01-05 03:09:41,386][Process-1:2]: process shutting down [ma-test Service Log][INFO][2021-01-05 03:09:41,386][Process-1:2]: process exiting with exitcode 0 [ma-test Service Log][INFO][2021-01-05 03:09:41,410][Process-1:3]: process shutting down [ma-test Service Log][INFO][2021-01-05 03:09:41,410][Process-1:3]: process exiting with exitcode 0 [ma-test Service Log][INFO][2021-01-05 03:09:41,415][NonDaemonPoolWorker-1]: upload the content of [] outputs successfully [ma-test Service Log][INFO][2021-01-05 03:09:41,415][NonDaemonPoolWorker-1]: it can be accessed at obs path [s3://ma-test-algorancher-intel/model_evaluation/6e5746ff-2839-400a-ba93-df38311415f4/dac957b0-b43b-43e2-ab19-0b45672a7ea0/] 18 (pid) proc python stacktrace >>> Interrupting process at following point: File "/home/ma-user/runtime-scripts-v2/init-container/outputs-handler.py", line 396, in <module> ret_code = upload_to_obs() File "/home/ma-user/runtime-scripts-v2/init-container/outputs-handler.py", line 287, in upload_to_obs results = pool.starmap(wrapper_copy_parallel, outputs) File "/home/ma-user/miniconda3/lib/python3.7/multiprocessing/pool.py", line 623, in __exit__ self.terminate() File "/home/ma-user/miniconda3/lib/python3.7/multiprocessing/pool.py", line 548, in terminate self._terminate() File "/home/ma-user/miniconda3/lib/python3.7/multiprocessing/util.py", line 201, in __call__ res = self._callback(*self._args, **self._kwargs) File "/home/ma-user/miniconda3/lib/python3.7/multiprocessing/pool.py", line 617, in _terminate_pool p.join() File "/home/ma-user/miniconda3/lib/python3.7/multiprocessing/process.py", line 140, in join res = self._popen.wait(timeout) File "/home/ma-user/miniconda3/lib/python3.7/multiprocessing/popen_fork.py", line 48, in wait return self.poll(os.WNOHANG if timeout == 0.0 else 0) File "/home/ma-user/miniconda3/lib/python3.7/multiprocessing/popen_fork.py", line 28, in poll pid, sts = os.waitpid(self.pid, flag) 18 (pid) proc gdb bt #0 0x00007f15c2c5ff7b in waitpid () from /lib/x86_64-linux-gnu/libpthread.so.0 #1 0x000055f54a8a5180 in os_waitpid_impl.isra.78 (options=0, pid=28) at /tmp/build/80754af9/python_1588882889832/work/Modules/posixmodule.c:7066 #2 os_waitpid () at /tmp/build/80754af9/python_1588882889832/work/Modules/clinic/posixmodule.c.h:3049 #3 0x000055f54a7f1b19 in _PyMethodDef_RawFastCallKeywords () at /tmp/build/80754af9/python_1588882889832/work/Objects/call.c:655 #4 0x000055f54a7f1db1 in _PyCFunction_FastCallKeywords (func=0x7f15c1f175f0, args=<optimized out>, nargs=<optimized out>, kwnames=<optimized out>) at /tmp/build/80754af9/python_1588882889832/work/Objects/call.c:734 28 (pid) proc python stacktrace Interrupting process at following point: File "/home/ma-user/runtime-scripts-v2/init-container/outputs-handler.py", line 396, in <module> ret_code = upload_to_obs() File "/home/ma-user/runtime-scripts-v2/init-container/outputs-handler.py", line 286, in upload_to_obs with NonDaemonPool(processes=len(outputs)) as pool: File "/home/ma-user/miniconda3/lib/python3.7/multiprocessing/pool.py", line 176, in __init__ self._repopulate_pool() File "/home/ma-user/miniconda3/lib/python3.7/multiprocessing/pool.py", line 241, in _repopulate_pool w.start() File "/home/ma-user/miniconda3/lib/python3.7/multiprocessing/process.py", line 112, in start self._popen = self._Popen(self) File "/home/ma-user/miniconda3/lib/python3.7/multiprocessing/context.py", line 223, in _Popen return _default_context.get_context().Process._Popen(process_obj) File "/home/ma-user/miniconda3/lib/python3.7/multiprocessing/context.py", line 277, in _Popen return Popen(process_obj) File "/home/ma-user/miniconda3/lib/python3.7/multiprocessing/popen_fork.py", line 20, in __init__ self._launch(process_obj) File "/home/ma-user/miniconda3/lib/python3.7/multiprocessing/popen_fork.py", line 74, in _launch code = process_obj._bootstrap() File "/home/ma-user/miniconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap self.run() File "/home/ma-user/miniconda3/lib/python3.7/multiprocessing/process.py", line 99, in run self._target(*self._args, **self._kwargs) File "/home/ma-user/miniconda3/lib/python3.7/multiprocessing/pool.py", line 110, in worker task = get() File "/home/ma-user/miniconda3/lib/python3.7/multiprocessing/queues.py", line 351, in get with self._rlock: File "/home/ma-user/miniconda3/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__ return self._semlock.__enter__() 28 (pid) proc gdb bt #0 0x00007f15c2c5e827 in do_futex_wait.constprop () from /lib/x86_64-linux-gnu/libpthread.so.0 #1 0x00007f15c2c5e8d4 in __new_sem_wait_slow.constprop.0 () from /lib/x86_64-linux-gnu/libpthread.so.0 #2 0x00007f15c2c5e97a in sem_wait@@GLIBC_2.2.5 () from /lib/x86_64-linux-gnu/libpthread.so.0 #3 0x00007f15a98058b1 in semlock_acquire () at /usr/local/src/conda/python-3.7.7/Modules/_multiprocessing/semaphore.c:319 #4 0x000055f54a7f1c94 in _PyMethodDef_RawFastCallKeywords () at /tmp/build/80754af9/python_1588882889832/work/Objects/call.c:694 #5 0x000055f54a7f8aef in _PyMethodDescr_FastCallKeywords () at /tmp/build/80754af9/python_1588882889832/work/Objects/descrobject.c:288 proc list ma-user@ma-test-eval-dac957b0-b43b-43e2-ab19-0b45672a7ea0-worker-0:/$ ps -ef UID PID PPID C STIME TTY TIME CMD ma-user 1 0 0 Jan05 ? 00:00:00 /bin/bash /home/ma-user/sidecar.sh ma-user 16 1 0 Jan05 ? 00:00:00 tail -n 0 -F /home/ma-user/ma-test/log/-0---ma-platform-sidecar.log ma-user 18 1 0 Jan05 ? 00:00:00 python /home/ma-user/runtime-scripts-v2/init-container/outputs-handler.py ma-user 28 18 0 Jan05 ? 00:00:00 python /home/ma-user/runtime-scripts-v2/init-container/outputs-handler.py ma-user 102 0 0 Jan05 pts/0 00:00:00 bash ma-user 911 0 0 14:56 pts/1 00:00:00 bash ma-user 931 911 0 14:57 pts/1 00:00:00 ps -ef ma-user@ma-test-eval-dac957b0-b43b-43e2-ab19-0b45672a7ea0-worker-0:/$ 28 fd ma-user@ma-test-eval-dac957b0-b43b-43e2-ab19-0b45672a7ea0-worker-0:/$ ls -alh /proc/28/fd total 0 dr-x------ 2 ma-user ma-group 0 Jan 6 20:23 . dr-xr-xr-x 9 ma-user ma-group 0 Jan 5 17:27 .. lr-x------ 1 ma-user ma-group 64 Jan 6 20:23 0 -> /dev/null l-wx------ 1 ma-user ma-group 64 Jan 6 20:23 1 -> /home/ma-user/ma-test/log/-0---ma-platform-sidecar.log l-wx------ 1 ma-user ma-group 64 Jan 6 20:23 2 -> /home/ma-user/ma-test/log/-0---ma-platform-sidecar.log lr-x------ 1 ma-user ma-group 64 Jan 6 20:23 3 -> /dev/urandom lr-x------ 1 ma-user ma-group 64 Jan 6 20:23 4 -> pipe:[952852668] lrwx------ 1 ma-user ma-group 64 Jan 6 20:23 5 -> socket:[952857499] lrwx------ 1 ma-user ma-group 64 Jan 6 20:23 6 -> /home/ma-user/conf/scc_logger.conf l-wx------ 1 ma-user ma-group 64 Jan 6 20:23 7 -> pipe:[952852671] lr-x------ 1 ma-user ma-group 64 Jan 6 20:23 8 -> /dev/null l-wx------ 1 ma-user ma-group 64 Jan 6 20:23 9 -> pipe:[952852675] 18 fd ma-user@ma-test-eval-dac957b0-b43b-43e2-ab19-0b45672a7ea0-worker-0:/$ ls -alh /proc/18/fd total 0 dr-x------ 2 ma-user ma-group 0 Jan 6 20:26 . dr-xr-xr-x 9 ma-user ma-group 0 Jan 5 03:07 .. lr-x------ 1 ma-user ma-group 64 Jan 6 20:26 0 -> /dev/null l-wx------ 1 ma-user ma-group 64 Jan 6 20:26 1 -> /home/ma-user/ma-test/log/-0---ma-platform-sidecar.log l-wx------ 1 ma-user ma-group 64 Jan 6 20:26 2 -> /home/ma-user/ma-test/log/-0---ma-platform-sidecar.log lr-x------ 1 ma-user ma-group 64 Jan 6 20:26 3 -> /dev/urandom lr-x------ 1 ma-user ma-group 64 Jan 6 20:26 4 -> pipe:[952852668] l-wx------ 1 ma-user ma-group 64 Jan 6 20:26 5 -> pipe:[952852668] lr-x------ 1 ma-user ma-group 64 Jan 6 20:26 6 -> pipe:[952852671] l-wx------ 1 ma-user ma-group 64 Jan 6 20:26 7 -> pipe:[952852671] lr-x------ 1 ma-user ma-group 64 Jan 6 20:26 8 -> pipe:[952852675] SIGTERM / SIGINT not work ma-user@ma-test-eval-dac957b0-b43b-43e2-ab19-0b45672a7ea0-worker-0:/$ ps -ef UID PID PPID C STIME TTY TIME CMD ma-user 1 0 0 Jan05 ? 00:00:00 /bin/bash /home/ma-user/sidecar.sh ma-user 16 1 0 Jan05 ? 00:00:00 tail -n 0 -F /home/ma-user/ma-test/log/-0---ma-platform-sidecar.log ma-user 18 1 0 Jan05 ? 00:00:00 python /home/ma-user/runtime-scripts-v2/init-container/outputs-handler.py ma-user 28 18 0 Jan05 ? 00:00:00 python /home/ma-user/runtime-scripts-v2/init-container/outputs-handler.py ma-user 102 0 0 Jan05 pts/0 00:00:00 bash ma-user 1166 0 0 Jan06 pts/1 00:00:00 bash ma-user 1216 1166 0 10:51 pts/1 00:00:00 ps -ef ma-user@ma-test-eval-dac957b0-b43b-43e2-ab19-0b45672a7ea0-worker-0:/$ kill -SIGTERM 28 ma-user@ma-test-eval-dac957b0-b43b-43e2-ab19-0b45672a7ea0-worker-0:/$ ma-user@ma-test-eval-dac957b0-b43b-43e2-ab19-0b45672a7ea0-worker-0:/$ kill -SIGINT 28 ma-user@ma-test-eval-dac957b0-b43b-43e2-ab19-0b45672a7ea0-worker-0:/$ ma-user@ma-test-eval-dac957b0-b43b-43e2-ab19-0b45672a7ea0-worker-0:/$ finally, pool hang ---------- components: Library (Lib) messages: 384560 nosy: huangzhesi priority: normal severity: normal status: open title: pool worker can't be terminated type: behavior versions: Python 3.7 _______________________________________ Python tracker <rep...@bugs.python.org> <https://bugs.python.org/issue42849> _______________________________________ _______________________________________________ Python-bugs-list mailing list Unsubscribe: https://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com