[ 
https://ovirt-jira.atlassian.net/browse/OVIRT-938?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=24837#comment-24837
 ] 

Barak Korren commented on OVIRT-938:
------------------------------------

Trying to get more info about this, here are the journal lines we get on the 
slave when the job fails:
{code}
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org libvirtd[17504]: End of file while 
reading data: Input/output error
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org audit[5410]: USER_END pid=5410 
uid=0 auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 
msg='op=login id=1012 exe="/usr/sbin/sshd" hostname=66.187.230.92 
addr=66.187.230.92 terminal=ssh res=success'
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org audit[5410]: USER_LOGOUT pid=5410 
uid=0 auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 
msg='op=login id=1012 exe="/usr/sbin/sshd" hostname=66.187.230.92 
addr=66.187.230.92 terminal=ssh res=success'
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org kernel: audit: type=1106 
audit(1482333814.930:3694): pid=5410 uid=0 auid=1012 ses=3 
subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=login id=1012 
exe="/usr/sbin/sshd" hostname=66.187.230.92 addr=66.187.230.92 terminal=ssh 
res=success'
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org kernel: audit: type=1113 
audit(1482333814.930:3695): pid=5410 uid=0 auid=1012 ses=3 
subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=login id=1012 
exe="/usr/sbin/sshd" hostname=66.187.230.92 addr=66.187.230.92 terminal=ssh 
res=success'
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org kernel: audit: type=1112 
audit(1482333814.952:3696): pid=5410 uid=0 auid=1012 ses=3 
subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=login id=1012 
exe="/usr/sbin/sshd" hostname=66.187.230.92 addr=66.187.230.92 terminal=ssh 
res=success'
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org kernel: audit: type=1105 
audit(1482333814.952:3697): pid=5410 uid=0 auid=1012 ses=3 
subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=login id=1012 
exe="/usr/sbin/sshd" hostname=66.187.230.92 addr=66.187.230.92 terminal=ssh 
res=success'
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org audit[5410]: USER_LOGIN pid=5410 
uid=0 auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 
msg='op=login id=1012 exe="/usr/sbin/sshd" hostname=66.187.230.92 
addr=66.187.230.92 terminal=ssh res=success'
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org audit[5410]: USER_START pid=5410 
uid=0 auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 
msg='op=login id=1012 exe="/usr/sbin/sshd" hostname=66.187.230.92 
addr=66.187.230.92 terminal=ssh res=success'
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org audit[5410]: CRYPTO_KEY_USER 
pid=5410 uid=0 auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 
msg='op=destroy kind=server 
fp=SHA256:95:42:59:76:99:08:09:ae:94:d8:46:3e:61:7d:53:b6:e5:1e:f5:f5:73:f2:04:87:38:ca:80:4f:44:6d:df:d5
 direction=? spid=25269 suid=1012  exe="/usr/sbin/sshd" hostname=? 
addr=66.187.230.92 terminal=? res=success'
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org kernel: audit: type=2404 
audit(1482333814.953:3698): pid=5410 uid=0 auid=1012 ses=3 
subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=destroy kind=server 
fp=SHA256:95:42:59:76:99:08:09:ae:94:d8:46:3e:61:7d:53:b6:e5:1e:f5:f5:73:f2:04:87:38:ca:80:4f:44:6d:df:d5
 direction=? spid=25269 suid=1012  exe="/usr/sbin/sshd" hostname=? 
addr=66.187.230.92 terminal=? res=success'
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org sshd[5421]: Received disconnect 
from 66.187.230.92 port 53024:11: Closed due to user request.
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org kernel: audit: type=2404 
audit(1482333814.989:3699): pid=5410 uid=0 auid=1012 ses=3 
subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=destroy kind=server 
fp=SHA256:95:42:59:76:99:08:09:ae:94:d8:46:3e:61:7d:53:b6:e5:1e:f5:f5:73:f2:04:87:38:ca:80:4f:44:6d:df:d5
 direction=? spid=5421 suid=1012  exe="/usr/sbin/sshd" hostname=? 
addr=66.187.230.92 terminal=? res=success'
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org kernel: audit: type=2404 
audit(1482333814.989:3700): pid=5410 uid=0 auid=1012 ses=3 
subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=destroy kind=session fp=? 
direction=both spid=5421 suid=1012 rport=53024 laddr=172.19.12.136 lport=22  
exe="/usr/sbin/sshd" hostname=? addr=66.187.230.92 terminal=? res=success'
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org audit[5410]: CRYPTO_KEY_USER 
pid=5410 uid=0 auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 
msg='op=destroy kind=server 
fp=SHA256:95:42:59:76:99:08:09:ae:94:d8:46:3e:61:7d:53:b6:e5:1e:f5:f5:73:f2:04:87:38:ca:80:4f:44:6d:df:d5
 direction=? spid=5421 suid=1012  exe="/usr/sbin/sshd" hostname=? 
addr=66.187.230.92 terminal=? res=success'
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org audit[5410]: CRYPTO_KEY_USER 
pid=5410 uid=0 auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 
msg='op=destroy kind=session fp=? direction=both spid=5421 suid=1012 
rport=53024 laddr=172.19.12.136 lport=22  exe="/usr/sbin/sshd" hostname=? 
addr=66.187.230.92 terminal=? res=success'
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org sshd[5421]: Disconnected from 
66.187.230.92 port 53024
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org kernel: audit: type=1106 
audit(1482333814.995:3701): pid=5410 uid=0 auid=1012 ses=3 
subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=PAM:session_close 
grantors=pam_selinux,pam_loginuid,pam_selinux,pam_namespace,pam_keyinit,pam_keyinit,pam_limits,pam_systemd,pam_unix,pam_lastlog
 acct="jenkins" exe="/usr/sbin/sshd" hostname=66.187.230.92 addr=66.187.230.92 
terminal=ssh res=success'
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org kernel: audit: type=1104 
audit(1482333814.995:3702): pid=5410 uid=0 auid=1012 ses=3 
subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=PAM:setcred 
grantors=pam_env,pam_unix acct="jenkins" exe="/usr/sbin/sshd" 
hostname=66.187.230.92 addr=66.187.230.92 terminal=ssh res=success'
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org kernel: audit: type=1106 
audit(1482333814.996:3703): pid=5410 uid=0 auid=1012 ses=3 
subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 msg='op=login id=1012 
exe="/usr/sbin/sshd" hostname=66.187.230.92 addr=66.187.230.92 terminal=ssh 
res=success'
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org audit[5410]: USER_END pid=5410 
uid=0 auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 
msg='op=PAM:session_close 
grantors=pam_selinux,pam_loginuid,pam_selinux,pam_namespace,pam_keyinit,pam_keyinit,pam_limits,pam_systemd,pam_unix,pam_lastlog
 acct="jenkins" exe="/usr/sbin/sshd" hostname=66.187.230.92 addr=66.187.230.92 
terminal=ssh res=success'
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org audit[5410]: CRED_DISP pid=5410 
uid=0 auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 
msg='op=PAM:setcred grantors=pam_env,pam_unix acct="jenkins" 
exe="/usr/sbin/sshd" hostname=66.187.230.92 addr=66.187.230.92 terminal=ssh 
res=success'
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org audit[5410]: USER_END pid=5410 
uid=0 auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 
msg='op=login id=1012 exe="/usr/sbin/sshd" hostname=66.187.230.92 
addr=66.187.230.92 terminal=ssh res=success'
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org audit[5410]: USER_LOGOUT pid=5410 
uid=0 auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 
msg='op=login id=1012 exe="/usr/sbin/sshd" hostname=66.187.230.92 
addr=66.187.230.92 terminal=ssh res=success'
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org audit[5410]: CRYPTO_KEY_USER 
pid=5410 uid=0 auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 
msg='op=destroy kind=server 
fp=SHA256:c1:30:1f:20:35:5f:1a:4a:cc:9f:c9:eb:05:0c:0f:41:06:32:52:d7:d3:e9:fc:0a:aa:ae:f4:ce:98:15:b3:4c
 direction=? spid=5410 suid=0  exe="/usr/sbin/sshd" hostname=? 
addr=66.187.230.92 terminal=? res=success'
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org audit[5410]: CRYPTO_KEY_USER 
pid=5410 uid=0 auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 
msg='op=destroy kind=server 
fp=SHA256:3d:5f:aa:8f:5f:6d:01:ba:8a:6c:ca:2a:ae:4e:78:f6:19:cd:44:7c:df:6a:4d:58:ef:be:16:c0:41:b9:a8:a4
 direction=? spid=5410 suid=0  exe="/usr/sbin/sshd" hostname=? 
addr=66.187.230.92 terminal=? res=success'
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org audit[5410]: CRYPTO_KEY_USER 
pid=5410 uid=0 auid=1012 ses=3 subj=system_u:system_r:sshd_t:s0-s0:c0.c1023 
msg='op=destroy kind=server 
fp=SHA256:95:42:59:76:99:08:09:ae:94:d8:46:3e:61:7d:53:b6:e5:1e:f5:f5:73:f2:04:87:38:ca:80:4f:44:6d:df:d5
 direction=? spid=5410 suid=0  exe="/usr/sbin/sshd" hostname=? 
addr=66.187.230.92 terminal=? res=success'
Dec 21 15:23:34 vm0136.workers-phx.ovirt.org sshd[5410]: 
pam_unix(sshd:session): session closed for user jenkins
Dec 21 15:23:35 vm0136.workers-phx.ovirt.org systemd-logind[444]: Removed 
session 3.
Dec 21 15:23:35 vm0136.workers-phx.ovirt.org systemd[1]: Stopping User Manager 
for UID 1012...
Dec 21 15:23:35 vm0136.workers-phx.ovirt.org systemd[5414]: Stopped target 
Default.
Dec 21 15:23:35 vm0136.workers-phx.ovirt.org systemd[5414]: Stopped target 
Basic System.
Dec 21 15:23:35 vm0136.workers-phx.ovirt.org systemd[5414]: Stopped target 
Timers.
Dec 21 15:23:35 vm0136.workers-phx.ovirt.org systemd[5414]: Stopped target 
Paths.
Dec 21 15:23:35 vm0136.workers-phx.ovirt.org systemd[5414]: Stopped target 
Sockets.
Dec 21 15:23:35 vm0136.workers-phx.ovirt.org systemd[5414]: Closed D-Bus User 
Message Bus Socket.
Dec 21 15:23:35 vm0136.workers-phx.ovirt.org systemd[5414]: Reached target 
Shutdown.
Dec 21 15:23:35 vm0136.workers-phx.ovirt.org systemd[5414]: Starting Exit the 
Session...
Dec 21 15:23:35 vm0136.workers-phx.ovirt.org systemd[5414]: Received 
SIGRTMIN+24 from PID 25273 (kill).
Dec 21 15:23:35 vm0136.workers-phx.ovirt.org systemd[1]: Stopped User Manager 
for UID 1012.
Dec 21 15:23:35 vm0136.workers-phx.ovirt.org audit[1]: SERVICE_STOP pid=1 uid=0 
auid=4294967295 ses=4294967295 subj=system_u:system_r:init_t:s0 
msg='unit=user@1012 comm="systemd" exe="/usr/lib/systemd/systemd" hostname=? 
addr=? terminal=? res=success'
Dec 21 15:23:35 vm0136.workers-phx.ovirt.org systemd[1]: Removed slice User 
Slice of jenkins.
{code}


> Fix Jenkins slave connection dying on vdsm check_merged jobs
> ------------------------------------------------------------
>
>                 Key: OVIRT-938
>                 URL: https://ovirt-jira.atlassian.net/browse/OVIRT-938
>             Project: oVirt - virtualization made easy
>          Issue Type: Bug
>          Components: Jenkins
>            Reporter: Barak Korren
>            Assignee: infra
>
> Something in the vdsm build_artifacs job makes the Jenkins slave disconnect 
> when it is running. This in turn makes the cleanup scripts not run on the 
> slave leaving it dirty enough to make the next job on that slave fail.
> Example of this can be seen here:
> http://jenkins.ovirt.org/job/vdsm_master_check-merged-el7-x86_64/692/console
> Relevant log lines:
> {code}
> 21:49:00 Ran 44 tests in 1231.988s
> 21:49:00 
> 21:49:00 OK
> 21:49:00 + return 0
> 21:49:00 sh: [13086: 1 (255)] tcsetattr: Inappropriate ioctl for device
> 21:49:00 Took 2464 seconds
> 21:49:00 ===================================
> 21:49:00 logout
> 21:49:01 Slave went offline during the build
> 21:49:01 ERROR: Connection was broken: java.io.IOException: Unexpected 
> termination of the channel
> 21:49:01      at 
> hudson.remoting.SynchronousCommandTransport$ReaderThread.run(SynchronousCommandTransport.java:50)
> 21:49:01 Caused by: java.io.EOFException
> 21:49:01      at 
> java.io.ObjectInputStream$PeekInputStream.readFully(ObjectInputStream.java:2353)
> 21:49:01      at 
> java.io.ObjectInputStream$BlockDataInputStream.readShort(ObjectInputStream.java:2822)
> 21:49:01      at 
> java.io.ObjectInputStream.readStreamHeader(ObjectInputStream.java:804)
> 21:49:01      at java.io.ObjectInputStream.<init>(ObjectInputStream.java:301)
> 21:49:01      at 
> hudson.remoting.ObjectInputStreamEx.<init>(ObjectInputStreamEx.java:48)
> 21:49:01      at 
> hudson.remoting.AbstractSynchronousByteArrayCommandTransport.read(AbstractSynchronousByteArrayCommandTransport.java:34)
> 21:49:01      at 
> hudson.remoting.SynchronousCommandTransport$ReaderThread.run(SynchronousCommandTransport.java:48)
> 21:49:01 
> 21:49:01 Build step 'Execute shell' marked build as failure
> 21:49:01 Performing Post build task...
> 21:49:01 Match found for :.* : True
> 21:49:01 Logical operation result is TRUE
> 21:49:01 Running script  : #!/bin/bash -x
> 21:49:01 echo "shell-scripts/mock_cleanup.sh"
> ... SNIP ...
> 21:49:01 Exception when executing the batch command : no workspace from node 
> hudson.slaves.DumbSlave[fc24-vm06.phx.ovirt.org] which is computer 
> hudson.slaves.SlaveComputer@30863c81 and has channel null
> 21:49:01 Build step 'Post build task' marked build as failure
> 21:49:02 ERROR: Step ?Archive the artifacts? failed: no workspace for 
> vdsm_master_check-merged-el7-x86_64 #692
> 21:49:02 ERROR: Failed to evaluate groovy script.
> 21:49:02 java.lang.NullPointerException: Cannot invoke method child() on null 
> object
> 21:49:02      at 
> org.codehaus.groovy.runtime.NullObject.invokeMethod(NullObject.java:77)
> 21:49:02      at 
> org.codehaus.groovy.runtime.callsite.PogoMetaClassSite.call(PogoMetaClassSite.java:45)
> 21:49:02      at 
> org.codehaus.groovy.runtime.callsite.CallSiteArray.defaultCall(CallSiteArray.java:42)
> 21:49:02      at 
> org.codehaus.groovy.runtime.callsite.NullCallSite.call(NullCallSite.java:32)
> 21:49:02      at 
> org.codehaus.groovy.runtime.callsite.CallSiteArray.defaultCall(CallSiteArray.java:42)
> 21:49:02      at 
> org.codehaus.groovy.runtime.callsite.AbstractCallSite.call(AbstractCallSite.java:108)
> 21:49:02      at 
> org.codehaus.groovy.runtime.callsite.AbstractCallSite.call(AbstractCallSite.java:116)
> 21:49:02      at Script1.run(Script1.groovy:2)
> 21:49:02      at groovy.lang.GroovyShell.evaluate(GroovyShell.java:580)
> 21:49:02      at groovy.lang.GroovyShell.evaluate(GroovyShell.java:618)
> 21:49:02      at groovy.lang.GroovyShell.evaluate(GroovyShell.java:589)
> 21:49:02      at 
> org.jenkinsci.plugins.scriptsecurity.sandbox.groovy.SecureGroovyScript.evaluate(SecureGroovyScript.java:166)
> 21:49:02      at 
> org.jvnet.hudson.plugins.groovypostbuild.GroovyPostbuildRecorder.perform(GroovyPostbuildRecorder.java:361)
> 21:49:02      at 
> hudson.tasks.BuildStepMonitor$1.perform(BuildStepMonitor.java:20)
> 21:49:02      at 
> hudson.model.AbstractBuild$AbstractBuildExecution.perform(AbstractBuild.java:782)
> 21:49:02      at 
> hudson.model.AbstractBuild$AbstractBuildExecution.performAllBuildSteps(AbstractBuild.java:723)
> 21:49:02      at hudson.model.Build$BuildExecution.post2(Build.java:185)
> 21:49:02      at 
> hudson.model.AbstractBuild$AbstractBuildExecution.post(AbstractBuild.java:668)
> 21:49:02      at hudson.model.Run.execute(Run.java:1763)
> 21:49:02      at hudson.model.FreeStyleBuild.run(FreeStyleBuild.java:43)
> 21:49:02      at 
> hudson.model.ResourceController.execute(ResourceController.java:98)
> 21:49:02      at hudson.model.Executor.run(Executor.java:410)
> 21:49:02 Build step 'Groovy Postbuild' marked build as failure
> 21:49:02 Started calculate disk usage of build
> 21:49:02 Finished Calculation of disk usage of build in 0 seconds
> 21:49:02 Finished: FAILURE
> {code}



--
This message was sent by Atlassian JIRA
(v1000.621.5#100023)
_______________________________________________
Infra mailing list
Infra@ovirt.org
http://lists.ovirt.org/mailman/listinfo/infra

Reply via email to