Hi Ludo, Now that I have root access to overdrive1, I was able to strace the sshd process (I ran 'strace -p340', using the PID of sshd reported by 'herd status sshd'):
--8<---------------cut here---------------start------------->8--- pselect6(87, [3 4], NULL, NULL, NULL, NULL) = 1 (in [3]) accept(3, {sa_family=AF_INET, sin_port=htons(33262), sin_addr=inet_addr("66.158.152.121")}, [128->16]) = 5 fcntl(5, F_GETFL) = 0x2 (flags O_RDWR) pipe2([6, 7], 0) = 0 socketpair(AF_UNIX, SOCK_STREAM, 0, [8, 9]) = 0 clone(child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0xffff8e0ef0e0) = 644 close(7) = 0 close(9) = 0 write(8, "\0\0\1\245\0", 5) = 5 write(8, "\0\0\1\234\nPort 22\nPermitRootLogin no\n"..., 420) = 420 close(8) = 0 close(5) = 0 getpid() = 340 getpid() = 340 getpid() = 340 getpid() = 340 getpid() = 340 getpid() = 340 getpid() = 340 pselect6(87, [3 4 6], NULL, NULL, NULL, NULL) = 1 (in [6]) read(6, "\0", 1) = 1 pselect6(87, [3 4 6], NULL, NULL, NULL, NULL) = 1 (in [6]) read(6, "", 1) = 0 close(6) = 0 pselect6(87, [3 4], NULL, NULL, NULL, NULL) = ? ERESTARTNOHAND (To be restarted if no handler) --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=644, si_uid=0, si_status=255, si_utime=1, si_stime=0} --- wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 255}], WNOHANG, NULL) = 644 wait4(-1, 0xfffffa4d90e4, WNOHANG, NULL) = 0 rt_sigreturn({mask=[]}) = -1 EINTR (Interrupted system call) pselect6(87, [3 4], NULL, NULL, NULL, NULL <detached ...> --8<---------------cut here---------------end--------------->8--- With the attached v3 patch, the corresponding output (still problematic) was: --8<---------------cut here---------------start------------->8--- $ ./pre-inst-env guix offload test /etc/guix/machines.scm overdrive1 guix offload: Testing 1 build machines defined in '/etc/guix/machines.scm'... guix offload: got premature EOF from machine 'overdrive1.guix.gnu.org' from inferior '#<inferior pipe (0 1 1) 7f6ee10d3400>' on port '#<input-output: channel (open) 7f6ee10c5500>'; retrying connection Backtrace: In ice-9/boot-9.scm: 1752:10 10 (with-exception-handler _ _ #:unwind? 
_ #:unwind-for-type _) In unknown file: 9 (apply-smob/0 #<thunk 7f6ee5ae9f60>) In ice-9/boot-9.scm: 724:2 8 (call-with-prompt _ _ #<procedure default-prompt-handler (k proc)>) In ice-9/eval.scm: 619:8 7 (_ #(#(#<directory (guile-user) 7f6ee5ae3c80>))) In guix/ui.scm: 2161:12 6 (run-guix-command _ . _) In ice-9/boot-9.scm: 1752:10 5 (with-exception-handler _ _ #:unwind? _ #:unwind-for-type _) 1747:15 4 (with-exception-handler #<procedure 7f6ee25e1480 at ice-9/boot-9.scm:1831:7 (exn)> _ # _ # …) In ice-9/threads.scm: 288:21 3 (loop _) In guix/scripts/offload.scm: 719:29 2 (_ _) 719:29 1 (_ _) In ice-9/boot-9.scm: 1685:16 0 (raise-exception _ #:continuable? _) ice-9/boot-9.scm:1685:16: In procedure raise-exception: Wrong type to apply: 2 --8<---------------cut here---------------end--------------->8--- I haven't tried analyzing the strace output yet. Maxim
>From c7b2ec1c58adf8c795df0a6aaf075dbc331f41e8 Mon Sep 17 00:00:00 2001 From: Maxim Cournoyer <maxim.courno...@gmail.com> Date: Thu, 27 May 2021 08:44:44 -0400 Subject: [PATCH 1/2] offload: Parallelize machine check in offload test. * guix/scripts/offload.scm (check-machine-availability): Refactor so that it takes a single machine object. Ensure the cleanup code is always run. (check-machines-availability): New procedure. Call CHECK-MACHINE-AVAILABILITY in parallel, which improves performance (about twice as fast with 4 build machines, from ~30 s to ~15 s). --- guix/scripts/offload.scm | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/guix/scripts/offload.scm b/guix/scripts/offload.scm index 835078cb97..b0fd20e158 100644 --- a/guix/scripts/offload.scm +++ b/guix/scripts/offload.scm @@ -1,7 +1,7 @@ ;;; GNU Guix --- Functional package management for GNU ;;; Copyright © 2014, 2015, 2016, 2017, 2018, 2019, 2020 Ludovic Courtès <l...@gnu.org> ;;; Copyright © 2017 Ricardo Wurmus <rek...@elephly.net> -;;; Copyright © 2020 Maxim Cournoyer <maxim.courno...@gmail.com> +;;; Copyright © 2020, 2021 Maxim Cournoyer <maxim.courno...@gmail.com> ;;; Copyright © 2020 Julien Lepiller <jul...@lepiller.eu> ;;; ;;; This file is part of GNU Guix. @@ -53,6 +53,7 @@ #:use-module (ice-9 regex) #:use-module (ice-9 format) #:use-module (ice-9 binary-ports) + #:use-module (ice-9 threads) #:export (build-machine build-machine? build-machine-name @@ -684,7 +685,7 @@ daemon is not running." (leave (G_ "failed to import '~a' from '~a'~%") item name))))) -(define (check-machine-availability machine-file pred) +(define (check-machines-availability machine-file pred) "Check that each machine matching PRED in MACHINE-FILE is usable as a build machine." (define (build-machine=? m1 m2) @@ -696,18 +697,28 @@ machine." 
(let ((machines (filter pred (delete-duplicates (build-machines machine-file) build-machine=?)))) - (info (G_ "testing ~a build machines defined in '~a'...~%") + (info (G_ "Testing ~a build machines defined in '~a'...~%") (length machines) machine-file) - (let* ((names (map build-machine-name machines)) - (sockets (map build-machine-daemon-socket machines)) - (sessions (map (cut open-ssh-session <> %short-timeout) machines)) - (nodes (map remote-inferior sessions))) - (for-each assert-node-has-guix nodes names) - (for-each assert-node-repl nodes names) - (for-each assert-node-can-import sessions nodes names sockets) - (for-each assert-node-can-export sessions nodes names sockets) - (for-each close-inferior nodes) - (for-each disconnect! sessions)))) + (par-for-each check-machine-availability machines))) + +(define (check-machine-availability machine) + "Check whether MACHINE is available. Exit with an error upon failure." + ;; Sometimes, the machine remote port may return EOF, presumably because the + ;; connection was lost. Retry up to 3 times. + (let* ((name (build-machine-name machine)) + (socket (build-machine-daemon-socket machine)) + (session (open-ssh-session machine %short-timeout)) + (node (remote-inferior session))) + (dynamic-wind + (lambda () #t) + (lambda () + (assert-node-has-guix node name) + (assert-node-repl node name) + (assert-node-can-import session node name socket) + (assert-node-can-export session node name socket)) + (lambda () + (close-inferior node) + (disconnect! session))))) (define (check-machine-status machine-file pred) "Print the load of each machine matching PRED in MACHINE-FILE." @@ -824,7 +835,7 @@ machine." ((file) (values file (const #t))) (() (values %machine-file (const #t))) (x (leave (G_ "wrong number of arguments~%")))))) - (check-machine-availability (or file %machine-file) pred)))) + (check-machines-availability (or file %machine-file) pred)))) (("status" rest ...) (with-error-handling (let-values (((file pred) -- 2.31.1
>From b5558777617e4674a150895458d57d202de56120 Mon Sep 17 00:00:00 2001 From: Maxim Cournoyer <maxim.courno...@gmail.com> Date: Tue, 25 May 2021 08:42:06 -0400 Subject: [PATCH 2/2] offload: Handle a possible EOF response from read-repl-response. Partially fixes <https://issues.guix.gnu.org/41625>. * guix/scripts/offload.scm (check-machine-availability): Handle the case where the checks raised an exception due to receiving EOF prematurely, and retry up to 3 times. * guix/inferior.scm (&inferior-premature-eof): New condition type. (read-repl-response): Raise a condition of the above type when reading EOF from the build machine's port. --- guix/inferior.scm | 15 ++++++++++++++ guix/scripts/offload.scm | 42 ++++++++++++++++++++++++++-------------- 2 files changed, 43 insertions(+), 14 deletions(-) diff --git a/guix/inferior.scm b/guix/inferior.scm index 7c8e478f2a..e63b37a7dd 100644 --- a/guix/inferior.scm +++ b/guix/inferior.scm @@ -1,5 +1,6 @@ ;;; GNU Guix --- Functional package management for GNU ;;; Copyright © 2018, 2019, 2020, 2021 Ludovic Courtès <l...@gnu.org> +;;; Copyright © 2021 Maxim Cournoyer <maxim.courno...@gmail.com> ;;; ;;; This file is part of GNU Guix. ;;; @@ -70,6 +71,9 @@ inferior-exception-arguments inferior-exception-inferior inferior-exception-stack + inferior-premature-eof? + inferior-premature-eof-port + inferior-premature-eof-inferior read-repl-response inferior-packages @@ -228,6 +232,11 @@ equivalent. Return #f if the inferior could not be launched." (inferior inferior-exception-inferior) ;<inferior> | #f (stack inferior-exception-stack)) ;list of (FILE COLUMN LINE) +(define-condition-type &inferior-premature-eof &error + inferior-premature-eof? + (port inferior-premature-eof-port) + (inferior inferior-premature-eof-inferior)) + (define* (read-repl-response port #:optional inferior) "Read a (guix repl) response from PORT and return it as a Scheme object. Raise '&inferior-exception' when an exception is read from PORT." 
@@ -241,6 +250,12 @@ Raise '&inferior-exception' when an exception is read from PORT." (match (read port) (('values objects ...) (apply values (map sexp->object objects))) + ((? eof-object?) + ;; Unexpectedly read EOF from the port. This can happen for example when + ;; the underlying connection for PORT was lost with Guile-SSH. + (raise (condition (&inferior-premature-eof + (inferior inferior) + (port port))))) (('exception ('arguments key objects ...) ('stack frames ...)) ;; Protocol (0 1 1) and later. diff --git a/guix/scripts/offload.scm b/guix/scripts/offload.scm index b0fd20e158..4312eb4e22 100644 --- a/guix/scripts/offload.scm +++ b/guix/scripts/offload.scm @@ -705,20 +705,34 @@ machine." "Check whether MACHINE is available. Exit with an error upon failure." ;; Sometimes, the machine remote port may return EOF, presumably because the ;; connection was lost. Retry up to 3 times. - (let* ((name (build-machine-name machine)) - (socket (build-machine-daemon-socket machine)) - (session (open-ssh-session machine %short-timeout)) - (node (remote-inferior session))) - (dynamic-wind - (lambda () #t) - (lambda () - (assert-node-has-guix node name) - (assert-node-repl node name) - (assert-node-can-import session node name socket) - (assert-node-can-export session node name socket)) - (lambda () - (close-inferior node) - (disconnect! session))))) + (let loop ((retries 3)) + (guard (c ((inferior-premature-eof? 
c) + (let ((retries-left (1- retries)) + (inferior (inferior-premature-eof-inferior c))) + (if (> retries-left 0) + (begin + (info (G_ "got premature EOF from machine '~a' from \ +inferior '~a' on port '~a'; retrying connection~%") + (build-machine-name machine) + inferior + (inferior-premature-eof-port c)) + (loop retries-left)) + (leave (G_ "connection repeatedly lost with machine '~a'~%") + (build-machine-name machine)))))) + (let* ((name (build-machine-name machine)) + (socket (build-machine-daemon-socket machine)) + (session (open-ssh-session machine %short-timeout)) + (node (remote-inferior session))) + (dynamic-wind + (lambda () #t) + (lambda () + (assert-node-has-guix node name) + (assert-node-repl node name) + (assert-node-can-import session node name socket) + (assert-node-can-export session node name socket)) + (lambda () + (close-inferior node) + (disconnect! session))))))) (define (check-machine-status machine-file pred) "Print the load of each machine matching PRED in MACHINE-FILE." -- 2.31.1