Hi Ludo,
Now that I have root access to overdrive1, I could strace the sshd
process (I just ran 'strace -p340', using the PID of sshd reported by
'herd status sshd'):
--8<---------------cut here---------------start------------->8---
pselect6(87, [3 4], NULL, NULL, NULL, NULL) = 1 (in [3])
accept(3, {sa_family=AF_INET, sin_port=htons(33262),
sin_addr=inet_addr("66.158.152.121")}, [128->16]) = 5
fcntl(5, F_GETFL) = 0x2 (flags O_RDWR)
pipe2([6, 7], 0) = 0
socketpair(AF_UNIX, SOCK_STREAM, 0, [8, 9]) = 0
clone(child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD,
child_tidptr=0xffff8e0ef0e0) = 644
close(7) = 0
close(9) = 0
write(8, "\0\0\1\245\0", 5) = 5
write(8, "\0\0\1\234\nPort 22\nPermitRootLogin no\n"..., 420) = 420
close(8) = 0
close(5) = 0
getpid() = 340
getpid() = 340
getpid() = 340
getpid() = 340
getpid() = 340
getpid() = 340
getpid() = 340
pselect6(87, [3 4 6], NULL, NULL, NULL, NULL) = 1 (in [6])
read(6, "\0", 1) = 1
pselect6(87, [3 4 6], NULL, NULL, NULL, NULL) = 1 (in [6])
read(6, "", 1) = 0
close(6) = 0
pselect6(87, [3 4], NULL, NULL, NULL, NULL) = ? ERESTARTNOHAND (To be restarted
if no handler)
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=644, si_uid=0,
si_status=255, si_utime=1, si_stime=0} ---
wait4(-1, [{WIFEXITED(s) && WEXITSTATUS(s) == 255}], WNOHANG, NULL) = 644
wait4(-1, 0xfffffa4d90e4, WNOHANG, NULL) = 0
rt_sigreturn({mask=[]}) = -1 EINTR (Interrupted system call)
pselect6(87, [3 4], NULL, NULL, NULL, NULL <detached ...>
--8<---------------cut here---------------end--------------->8---
With the attached v3 patch, the corresponding output (still problematic)
was:
--8<---------------cut here---------------start------------->8---
$ ./pre-inst-env guix offload test /etc/guix/machines.scm overdrive1
guix offload: Testing 1 build machines defined in '/etc/guix/machines.scm'...
guix offload: got premature EOF from machine 'overdrive1.guix.gnu.org' from
inferior '#<inferior pipe (0 1 1) 7f6ee10d3400>' on port '#<input-output:
channel (open) 7f6ee10c5500>'; retrying connection
Backtrace:
In ice-9/boot-9.scm:
1752:10 10 (with-exception-handler _ _ #:unwind? _ #:unwind-for-type _)
In unknown file:
9 (apply-smob/0 #<thunk 7f6ee5ae9f60>)
In ice-9/boot-9.scm:
724:2 8 (call-with-prompt _ _ #<procedure default-prompt-handler (k proc)>)
In ice-9/eval.scm:
619:8 7 (_ #(#(#<directory (guile-user) 7f6ee5ae3c80>)))
In guix/ui.scm:
2161:12 6 (run-guix-command _ . _)
In ice-9/boot-9.scm:
1752:10 5 (with-exception-handler _ _ #:unwind? _ #:unwind-for-type _)
1747:15 4 (with-exception-handler #<procedure 7f6ee25e1480 at
ice-9/boot-9.scm:1831:7 (exn)> _ # _ # …)
In ice-9/threads.scm:
288:21 3 (loop _)
In guix/scripts/offload.scm:
719:29 2 (_ _)
719:29 1 (_ _)
In ice-9/boot-9.scm:
1685:16 0 (raise-exception _ #:continuable? _)
ice-9/boot-9.scm:1685:16: In procedure raise-exception:
Wrong type to apply: 2
--8<---------------cut here---------------end--------------->8---
I haven't tried analyzing the strace output yet.
Maxim
>From c7b2ec1c58adf8c795df0a6aaf075dbc331f41e8 Mon Sep 17 00:00:00 2001
From: Maxim Cournoyer <[email protected]>
Date: Thu, 27 May 2021 08:44:44 -0400
Subject: [PATCH 1/2] offload: Parallelize machine check in offload test.
* guix/scripts/offload.scm (check-machine-availability): Refactor so that it
takes a single machine object. Ensure the cleanup code is always run.
(check-machines-availability): New procedure. Call
CHECK-MACHINE-AVAILABILITY in parallel, which improves performance (about
twice as fast with 4 build machines, from ~30 s to ~15 s).
---
guix/scripts/offload.scm | 39 +++++++++++++++++++++++++--------------
1 file changed, 25 insertions(+), 14 deletions(-)
diff --git a/guix/scripts/offload.scm b/guix/scripts/offload.scm
index 835078cb97..b0fd20e158 100644
--- a/guix/scripts/offload.scm
+++ b/guix/scripts/offload.scm
@@ -1,7 +1,7 @@
;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2014, 2015, 2016, 2017, 2018, 2019, 2020 Ludovic Courtès <[email protected]>
;;; Copyright © 2017 Ricardo Wurmus <[email protected]>
-;;; Copyright © 2020 Maxim Cournoyer <[email protected]>
+;;; Copyright © 2020, 2021 Maxim Cournoyer <[email protected]>
;;; Copyright © 2020 Julien Lepiller <[email protected]>
;;;
;;; This file is part of GNU Guix.
@@ -53,6 +53,7 @@
#:use-module (ice-9 regex)
#:use-module (ice-9 format)
#:use-module (ice-9 binary-ports)
+ #:use-module (ice-9 threads)
#:export (build-machine
build-machine?
build-machine-name
@@ -684,7 +685,7 @@ daemon is not running."
(leave (G_ "failed to import '~a' from '~a'~%")
item name)))))
-(define (check-machine-availability machine-file pred)
+(define (check-machines-availability machine-file pred)
"Check that each machine matching PRED in MACHINE-FILE is usable as a build
machine."
(define (build-machine=? m1 m2)
@@ -696,18 +697,28 @@ machine."
(let ((machines (filter pred
(delete-duplicates (build-machines machine-file)
build-machine=?))))
- (info (G_ "testing ~a build machines defined in '~a'...~%")
+ (info (G_ "Testing ~a build machines defined in '~a'...~%")
(length machines) machine-file)
- (let* ((names (map build-machine-name machines))
- (sockets (map build-machine-daemon-socket machines))
- (sessions (map (cut open-ssh-session <> %short-timeout) machines))
- (nodes (map remote-inferior sessions)))
- (for-each assert-node-has-guix nodes names)
- (for-each assert-node-repl nodes names)
- (for-each assert-node-can-import sessions nodes names sockets)
- (for-each assert-node-can-export sessions nodes names sockets)
- (for-each close-inferior nodes)
- (for-each disconnect! sessions))))
+ (par-for-each check-machine-availability machines)))
+
+(define (check-machine-availability machine)
+ "Check whether MACHINE is available. Exit with an error upon failure."
+ ;; Sometimes, the machine remote port may return EOF, presumably because the
+ ;; connection was lost. Retry up to 3 times.
+ (let* ((name (build-machine-name machine))
+ (socket (build-machine-daemon-socket machine))
+ (session (open-ssh-session machine %short-timeout))
+ (node (remote-inferior session)))
+ (dynamic-wind
+ (lambda () #t)
+ (lambda ()
+ (assert-node-has-guix node name)
+ (assert-node-repl node name)
+ (assert-node-can-import session node name socket)
+ (assert-node-can-export session node name socket))
+ (lambda ()
+ (close-inferior node)
+ (disconnect! session)))))
(define (check-machine-status machine-file pred)
"Print the load of each machine matching PRED in MACHINE-FILE."
@@ -824,7 +835,7 @@ machine."
((file) (values file (const #t)))
(() (values %machine-file (const #t)))
(x (leave (G_ "wrong number of arguments~%"))))))
- (check-machine-availability (or file %machine-file) pred))))
+ (check-machines-availability (or file %machine-file) pred))))
(("status" rest ...)
(with-error-handling
(let-values (((file pred)
--
2.31.1
>From b5558777617e4674a150895458d57d202de56120 Mon Sep 17 00:00:00 2001
From: Maxim Cournoyer <[email protected]>
Date: Tue, 25 May 2021 08:42:06 -0400
Subject: [PATCH 2/2] offload: Handle a possible EOF response from
read-repl-response.
Partially fixes <https://issues.guix.gnu.org/41625>.
* guix/scripts/offload.scm (check-machine-availability): Handle the case where
the checks raised an exception due to receiving EOF prematurely, and retry up
to 3 times.
* guix/inferior.scm (&inferior-premature-eof): New condition type.
(read-repl-response): Raise a condition of the above type when reading EOF
from the build machine's port.
---
guix/inferior.scm | 15 ++++++++++++++
guix/scripts/offload.scm | 42 ++++++++++++++++++++++++++--------------
2 files changed, 43 insertions(+), 14 deletions(-)
diff --git a/guix/inferior.scm b/guix/inferior.scm
index 7c8e478f2a..e63b37a7dd 100644
--- a/guix/inferior.scm
+++ b/guix/inferior.scm
@@ -1,5 +1,6 @@
;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2018, 2019, 2020, 2021 Ludovic Courtès <[email protected]>
+;;; Copyright © 2021 Maxim Cournoyer <[email protected]>
;;;
;;; This file is part of GNU Guix.
;;;
@@ -70,6 +71,9 @@
inferior-exception-arguments
inferior-exception-inferior
inferior-exception-stack
+ inferior-premature-eof?
+ inferior-premature-eof-port
+ inferior-premature-eof-inferior
read-repl-response
inferior-packages
@@ -228,6 +232,11 @@ equivalent. Return #f if the inferior could not be launched."
(inferior inferior-exception-inferior) ;<inferior> | #f
(stack inferior-exception-stack)) ;list of (FILE COLUMN LINE)
+(define-condition-type &inferior-premature-eof &error
+ inferior-premature-eof?
+ (port inferior-premature-eof-port)
+ (inferior inferior-premature-eof-inferior))
+
(define* (read-repl-response port #:optional inferior)
"Read a (guix repl) response from PORT and return it as a Scheme object.
Raise '&inferior-exception' when an exception is read from PORT."
@@ -241,6 +250,12 @@ Raise '&inferior-exception' when an exception is read from PORT."
(match (read port)
(('values objects ...)
(apply values (map sexp->object objects)))
+ ((? eof-object?)
+ ;; Unexpectedly read EOF from the port. This can happen for example when
+ ;; the underlying connection for PORT was lost with Guile-SSH.
+ (raise (condition (&inferior-premature-eof
+ (inferior inferior)
+ (port port)))))
(('exception ('arguments key objects ...)
('stack frames ...))
;; Protocol (0 1 1) and later.
diff --git a/guix/scripts/offload.scm b/guix/scripts/offload.scm
index b0fd20e158..4312eb4e22 100644
--- a/guix/scripts/offload.scm
+++ b/guix/scripts/offload.scm
@@ -705,20 +705,34 @@ machine."
"Check whether MACHINE is available. Exit with an error upon failure."
;; Sometimes, the machine remote port may return EOF, presumably because the
;; connection was lost. Retry up to 3 times.
- (let* ((name (build-machine-name machine))
- (socket (build-machine-daemon-socket machine))
- (session (open-ssh-session machine %short-timeout))
- (node (remote-inferior session)))
- (dynamic-wind
- (lambda () #t)
- (lambda ()
- (assert-node-has-guix node name)
- (assert-node-repl node name)
- (assert-node-can-import session node name socket)
- (assert-node-can-export session node name socket))
- (lambda ()
- (close-inferior node)
- (disconnect! session)))))
+  (let loop ((retries 3))
+    (guard (c ((inferior-premature-eof? c)
+               (let ((retries-left (1- retries))
+                     (inferior (inferior-premature-eof-inferior c)))
+                 (if (> retries-left 0)
+                     (begin
+                       (info (G_ "got premature EOF from machine '~a' from \
+inferior '~a' on port '~a'; retrying connection~%")
+                             (build-machine-name machine)
+                             inferior
+                             (inferior-premature-eof-port c))
+                       (loop retries-left)) ;a number: pass it, do not apply it
+                     (leave (G_ "connection repeatedly lost with machine '~a'~%")
+                            (build-machine-name machine))))))
+      (let* ((name (build-machine-name machine))
+             (socket (build-machine-daemon-socket machine))
+             (session (open-ssh-session machine %short-timeout))
+             (node (remote-inferior session)))
+        (dynamic-wind
+          (lambda () #t)
+          (lambda ()
+            (assert-node-has-guix node name)
+            (assert-node-repl node name)
+            (assert-node-can-import session node name socket)
+            (assert-node-can-export session node name socket))
+          (lambda ()                      ;cleanup, run on every exit
+            (close-inferior node)
+            (disconnect! session)))))))
(define (check-machine-status machine-file pred)
"Print the load of each machine matching PRED in MACHINE-FILE."
--
2.31.1