[
https://issues.apache.org/jira/browse/QPID-6297?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Pavel Moravec updated QPID-6297:
--------------------------------
Attachment: goferBug.cap
tcpdump from the error.
(I changed SSL to non-SSL traffic by purpose, for easily follofing tcpdump)
What happens:
- I bounced qpidd at the beginning to see goferd reconnect & reestablish
sessions
- two TCP connections established (ports 42292 and 42293), the 2nd one creates
an AMQP session named "42d8f2cf-.." (important below)
- at 14:37:49 GMT, I blocked goferd->qpidd traffic (iptables -j DROP) for 14
seconds; since the client machine had 4 TCP retries set, it was enough
- retries on the TCP connection on port 42293 causes goferd detecting
connection loss - but only _this_ connection seems to be lost, not the 42292 one
- at 14:38:04 GMT, new (one) connection established from port 42296. It tried
to attach to the session named "42d8f2cf-.." and was bounced by the broker by
"session-busy" / session already attached. TCP connection closed, no further
goferd activity on wire
netstat outputs after the test:
goferd client:
[root@localhost ~]# netstat -anp | grep 5672
tcp 0 0 10.34.84.221:42292 10.34.84.76:5672 ESTABLISHED
44483/python
tcp 0 0 10.34.84.221:42296 10.34.84.76:5672 TIME_WAIT
-
[root@localhost ~]#
qpidd server:
[root@pmoravec-sat6 ~]# netstat -anp | grep 5672
tcp 0 0 0.0.0.0:5672 0.0.0.0:*
LISTEN 119588/qpidd
tcp 0 0 10.34.84.76:5672 10.34.84.221:42292
ESTABLISHED 119588/qpidd
tcp 0 0 10.34.84.76:5672 10.34.84.221:42288
TIME_WAIT -
tcp 0 0 10.34.84.76:5672 10.34.84.221:42289
TIME_WAIT -
tcp 0 0 10.34.84.76:5672 10.34.84.221:42293
ESTABLISHED 119588/qpidd
tcp 0 0 :::5672 :::*
LISTEN 119588/qpidd
[root@pmoravec-sat6 ~]#
gofer logs from the test:
Jan 11 15:37:40 localhost goferd: [WARNING][Thread-2] qpid.messaging:453 -
recoverable error[attempt 1]: connection aborted
Jan 11 15:37:40 localhost goferd: [WARNING][Thread-2] qpid.messaging:455 -
sleeping 1 seconds
Jan 11 15:37:40 localhost goferd: [WARNING][Thread-2] qpid.messaging:453 -
recoverable error[attempt 1]: connection aborted
Jan 11 15:37:40 localhost goferd: [WARNING][Thread-2] qpid.messaging:455 -
sleeping 1 seconds
Jan 11 15:37:41 localhost goferd: [WARNING][Thread-2] qpid.messaging:537 -
trying: pmoravec-sat6.gsslab.brq.redhat.com:5672
Jan 11 15:37:41 localhost goferd: [WARNING][Thread-2] qpid.messaging:453 -
recoverable error[attempt 2]: [Errno 111] Connection refused
Jan 11 15:37:41 localhost goferd: [WARNING][Thread-2] qpid.messaging:455 -
sleeping 2 seconds
Jan 11 15:37:41 localhost goferd: [WARNING][Thread-2] qpid.messaging:537 -
trying: pmoravec-sat6.gsslab.brq.redhat.com:5672
Jan 11 15:37:41 localhost goferd: [WARNING][Thread-2] qpid.messaging:453 -
recoverable error[attempt 2]: [Errno 111] Connection refused
Jan 11 15:37:41 localhost goferd: [WARNING][Thread-2] qpid.messaging:455 -
sleeping 2 seconds
Jan 11 15:37:43 localhost goferd: [WARNING][Thread-2] qpid.messaging:537 -
trying: pmoravec-sat6.gsslab.brq.redhat.com:5672
Jan 11 15:37:43 localhost goferd: [WARNING][Thread-2] qpid.messaging:537 -
trying: pmoravec-sat6.gsslab.brq.redhat.com:5672
Jan 11 15:37:43 localhost goferd: [WARNING][Thread-2] qpid.messaging:407 -
reconnect succeeded: pmoravec-sat6.gsslab.brq.redhat.com:5672
Jan 11 15:37:43 localhost goferd: [WARNING][Thread-2] qpid.messaging:407 -
reconnect succeeded: pmoravec-sat6.gsslab.brq.redhat.com:5672
Jan 11 15:37:57 localhost goferd: [WARNING][Thread-2] qpid.messaging:453 -
recoverable error[attempt 0]: [Errno 111] Connection refused
Jan 11 15:37:57 localhost goferd: [WARNING][Thread-2] qpid.messaging:455 -
sleeping 1 seconds
Jan 11 15:37:58 localhost goferd: [WARNING][Thread-2] qpid.messaging:537 -
trying: pmoravec-sat6.gsslab.brq.redhat.com:5672
Jan 11 15:37:58 localhost goferd: [WARNING][Thread-2] qpid.messaging:453 -
recoverable error[attempt 1]: [Errno 111] Connection refused
Jan 11 15:37:58 localhost goferd: [WARNING][Thread-2] qpid.messaging:455 -
sleeping 2 seconds
Jan 11 15:38:00 localhost goferd: [WARNING][Thread-2] qpid.messaging:537 -
trying: pmoravec-sat6.gsslab.brq.redhat.com:5672
Jan 11 15:38:00 localhost goferd: [WARNING][Thread-2] qpid.messaging:453 -
recoverable error[attempt 2]: [Errno 111] Connection refused
Jan 11 15:38:00 localhost goferd: [WARNING][Thread-2] qpid.messaging:455 -
sleeping 4 seconds
Jan 11 15:38:04 localhost goferd: [WARNING][Thread-2] qpid.messaging:537 -
trying: pmoravec-sat6.gsslab.brq.redhat.com:5672
Jan 11 15:38:04 localhost goferd: [WARNING][Thread-2] qpid.messaging:407 -
reconnect succeeded: pmoravec-sat6.gsslab.brq.redhat.com:5672
Jan 11 15:38:04 localhost goferd:
[ERROR][pulp.agent.0e6a5d29-33aa-40bd-a1e1-ed1ed31da69a]
gofer.transport.qpid.consumer:117 - fb934bf3-cd6e-451c-a6c3-46af86dda59d
Jan 11 15:38:04 localhost goferd:
[ERROR][pulp.agent.0e6a5d29-33aa-40bd-a1e1-ed1ed31da69a]
gofer.transport.qpid.consumer:117 - Traceback (most recent call last): File
"/usr/lib/python2.7/site-packages/gofer/transport/qpid/consumer.py", line 113,
in get return self.__receiver.fetch(timeout=timeout) File "<string>", line 6,
in fetch File "/usr/lib/python2.7/site-packages/qpid/messaging/endpoints.py",
line 1041, in fetch self._ecwait(lambda: not self.draining) File
"/usr/lib/python2.7/site-packages/qpid/messaging/endpoints.py", line 50, in
_ecwait result = self._ewait(lambda: self.closed or predicate(), timeout) File
"/usr/lib/python2.7/site-packages/qpid/messaging/endpoints.py", line 993, in
_ewait result = self.session._ewait(lambda: self.error or predicate(), timeout)
File "/usr/lib/python2.7/site-packages/qpid/messaging/endpoints.py", line 580,
in _ewait result = self.connection._ewait(lambda: self.error or predicate(),
timeout) File "/usr/lib/python2.7/site-packages/qpid/messaging/endpoints.py",
line 219, in _ewait self.check_error() File
"/usr/lib/python2.7/site-packages/qpid/messaging/endpoints.py", line 212, in
check_error raise e InternalError: Traceback (most recent call last): File
"/usr/lib/python2.7/site-packages/qpid/messaging/driver.py", line 660, in write
op.dispatch(self) File "/usr/lib/python2.7/site-packages/qpid/ops.py", line 84,
in dispatch getattr(target, handler)(self, *args) File
"/usr/lib/python2.7/site-packages/qpid/messaging/driver.py", line 877, in
do_session_detached sst = self._sessions.pop(dtc.channel) KeyError: 0
> Python client (qpid.messaging) raises KeyError insead of reconnecting
> ---------------------------------------------------------------------
>
> Key: QPID-6297
> URL: https://issues.apache.org/jira/browse/QPID-6297
> Project: Qpid
> Issue Type: Bug
> Components: Python Client
> Affects Versions: 0.22
> Environment: EL6
> Reporter: Jeff Ortel
> Attachments: goferBug.cap
>
>
> Description of problem:
> Having some temporary network outage causing gofer loses TCP connection to
> AMQP broker, it does not try to reconnect.
> How reproducible:
> 100%
> Steps to Reproduce:
> 1. Just to speedup reproducer, lower kernel tunable net.ipv4.tcp_retries2 to
> e.g. 4:
> echo 4 > /proc/sys/net/ipv4/tcp_retries2
> 2. Have consumer connected (with auto-reconnect enabled and heartbeats not
> enabled) and receiver open on a queue address and check its TCP connections
> to AMQP broker:
> netstat -anp | grep 5671
> (there should be 2 TCP connections)
> 3. Emulate network outage via iptables:
> iptables -A OUTPUT -p tcp --dport 5671 -j REJECT
> 4. Monitor /var/log/messages; once it logs WARNING "recoverable error", flush
> iptables (iptables -F).
> 5. Wait few seconds.
> 6. Check gofer TCP connections:
> netstat -anp | grep 5671
> Actual results:
> 6. shows just 1 TCP connection
> /var/log/messages repeatedly logs:
> Dec 1 16:39:02 pmoravec-rhel6-3 goferd:
> [ERROR][pulp.agent.a726580c-5f1e-4a79-9f11-de0adc52c1e9]
> gofer.transport.qpid.consumer:117 - 046d2084-b0f1-4de4-a039-89499d9e680d
> Dec 1 16:39:02 pmoravec-rhel6-3 goferd:
> [ERROR][pulp.agent.a726580c-5f1e-4a79-9f11-de0adc52c1e9]
> gofer.transport.qpid.consumer:117 - Traceback (most recent call last): File
> "/usr/lib/python2.6/site-packages/gofer/transport/qpid/consumer.py", line
> 113, in get return self.__receiver.fetch(timeout=timeout) File "<string>",
> line 6, in fetch File
> "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 1030, in
> fetch self._ecwait(lambda: self.linked) File
> "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 50, in
> _ecwait result = self._ewait(lambda: self.closed or predicate(), timeout)
> File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line
> 993, in _ewait result = self.session._ewait(lambda: self.error or
> predicate(), timeout) File
> "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 580, in
> _ewait result = self.connection._ewait(lambda: self.error or predicate(),
> timeout) File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py",
> line 219, in _ewait self.check_error() File
> "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 212, in
> check_error raise e InternalError: Traceback (most recent call last): File
> "/usr/lib/python2.6/site-packages/qpid/messaging/driver.py", line 660, in
> write op.dispatch(self) File "/usr/lib/python2.6/site-packages/qpid/ops.py",
> line 84, in dispatch getattr(target, handler)(self, *args) File
> "/usr/lib/python2.6/site-packages/qpid/messaging/driver.py", line 877, in
> do_session_detached sst = self._sessions.pop(dtc.channel) KeyError: 'pop():
> dictionary is empty'
> Expected results:
> 2nd TCP connection re-established, no errors in /var/log/messages
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]