Hit this today on the latest Havana build, logs below. I reproduced it by doing some stress testing: creating 50 boot-from-volume instances in one operation. Need to try it in my Icehouse setup next.
2014-03-20 00:42:51.725 17580 INFO nova.compute.manager [req-ef61a326-288b-494d-9d30-f533e7739949 None None] Updating bandwidth usage cache 2014-03-20 00:43:51.843 17580 AUDIT nova.compute.resource_tracker [-] Auditing locally available compute resources 2014-03-20 00:43:52.607 17580 AUDIT nova.compute.resource_tracker [-] Free ram (MB): 257773 2014-03-20 00:43:52.607 17580 AUDIT nova.compute.resource_tracker [-] Free disk (GB): 49 2014-03-20 00:43:52.607 17580 AUDIT nova.compute.resource_tracker [-] Free VCPUS: 48 2014-03-20 00:43:52.677 17580 INFO nova.compute.resource_tracker [-] Compute_service record updated for os-1.solidfire.net:os-1.solidfire.net 2014-03-20 00:44:52.808 17580 AUDIT nova.compute.resource_tracker [-] Auditing locally available compute resources 2014-03-20 00:44:53.589 17580 AUDIT nova.compute.resource_tracker [-] Free ram (MB): 257773 2014-03-20 00:44:53.589 17580 AUDIT nova.compute.resource_tracker [-] Free disk (GB): 49 2014-03-20 00:44:53.590 17580 AUDIT nova.compute.resource_tracker [-] Free VCPUS: 48 2014-03-20 00:44:53.657 17580 INFO nova.compute.resource_tracker [-] Compute_service record updated for os-1.solidfire.net:os-1.solidfire.net 2014-03-20 08:47:05.277 17580 ERROR nova.openstack.common.rpc.impl_qpid [-] Failed to publish message to topic 'conductor': heartbeat timeout 2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid Traceback (most recent call last): 2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid File "/usr/lib/python2.6/site-packages/nova/openstack/common/rpc/impl_qpid.py", line 540, in ensure 2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid return method(*args, **kwargs) 2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid File "/usr/lib/python2.6/site-packages/nova/openstack/common/rpc/impl_qpid.py", line 632, in _publisher_send 2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid publisher = cls(self.conf, 
self.session, topic) 2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid File "/usr/lib/python2.6/site-packages/nova/openstack/common/rpc/impl_qpid.py", line 398, in __init__ 2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid super(TopicPublisher, self).__init__(conf, session, node_name) 2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid File "/usr/lib/python2.6/site-packages/nova/openstack/common/rpc/impl_qpid.py", line 328, in __init__ 2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid self.reconnect(session) 2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid File "/usr/lib/python2.6/site-packages/nova/openstack/common/rpc/impl_qpid.py", line 332, in reconnect 2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid self.sender = session.sender(self.address) 2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid File "<string>", line 6, in sender 2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 592, in sender 2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid sender._ewait(lambda: sender.linked) 2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 799, in _ewait 2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid result = self.session._ewait(lambda: self.error or predicate(), timeout) 2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 566, in _ewait 2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid result = self.connection._ewait(lambda: self.error or predicate(), timeout) 2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid File 
"/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 209, in _ewait 2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid self.check_error() 2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 202, in check_error 2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid raise self.error 2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid HeartbeatTimeout: heartbeat timeout 2014-03-20 08:47:05.277 17580 TRACE nova.openstack.common.rpc.impl_qpid 2014-03-20 08:47:05.295 17580 ERROR nova.openstack.common.rpc.impl_qpid [-] Failed to consume message from queue: heartbeat timeout 2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid Traceback (most recent call last): 2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid File "/usr/lib/python2.6/site-packages/nova/openstack/common/rpc/impl_qpid.py", line 540, in ensure 2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid return method(*args, **kwargs) 2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid File "/usr/lib/python2.6/site-packages/nova/openstack/common/rpc/impl_qpid.py", line 597, in _consume 2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid nxt_receiver = self.session.next_receiver(timeout=timeout) 2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid File "<string>", line 6, in next_receiver 2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 660, in next_receiver 2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid if self._ecwait(lambda: self.incoming, timeout): 2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 50, in _ecwait 
2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid result = self._ewait(lambda: self.closed or predicate(), timeout) 2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 566, in _ewait 2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid result = self.connection._ewait(lambda: self.error or predicate(), timeout) 2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 209, in _ewait 2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid self.check_error() 2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 202, in check_error 2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid raise self.error 2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid HeartbeatTimeout: heartbeat timeout 2014-03-20 08:47:05.295 17580 TRACE nova.openstack.common.rpc.impl_qpid 2014-03-20 08:47:05.301 17580 ERROR nova.openstack.common.rpc.impl_qpid [-] Failed to consume message from queue: heartbeat timeout 2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid Traceback (most recent call last): 2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid File "/usr/lib/python2.6/site-packages/nova/openstack/common/rpc/impl_qpid.py", line 540, in ensure 2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid return method(*args, **kwargs) 2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid File "/usr/lib/python2.6/site-packages/nova/openstack/common/rpc/impl_qpid.py", line 597, in _consume 2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid nxt_receiver = self.session.next_receiver(timeout=timeout) 2014-03-20 08:47:05.301 17580 TRACE 
nova.openstack.common.rpc.impl_qpid File "<string>", line 6, in next_receiver 2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 660, in next_receiver 2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid if self._ecwait(lambda: self.incoming, timeout): 2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 50, in _ecwait 2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid result = self._ewait(lambda: self.closed or predicate(), timeout) 2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 566, in _ewait 2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid result = self.connection._ewait(lambda: self.error or predicate(), timeout) 2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 209, in _ewait 2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid self.check_error() 2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid File "/usr/lib/python2.6/site-packages/qpid/messaging/endpoints.py", line 202, in check_error 2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid raise self.error 2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid HeartbeatTimeout: heartbeat timeout 2014-03-20 08:47:05.301 17580 TRACE nova.openstack.common.rpc.impl_qpid 2014-03-20 08:47:05.326 17580 INFO nova.openstack.common.rpc.impl_qpid [-] Connected to AMQP server on 172.27.33.21:5672 2014-03-20 08:47:05.330 17580 INFO nova.openstack.common.rpc.impl_qpid [-] Connected to AMQP server on 172.27.33.21:5672 2014-03-20 08:47:05.332 17580 INFO nova.openstack.common.rpc.impl_qpid [-] Connected to AMQP 
server on 172.27.33.21:5672 2014-03-20 08:47:05.335 17580 INFO nova.openstack.common.rpc.impl_qpid [-] Connected to AMQP server on 172.27.33.21:5672 2014-03-20 08:47:05.458 17580 INFO nova.compute.manager [-] Running instance usage audit for host os-1.solidfire.net from 2014-03-20 13:00:00 to 2014-03-20 14:00:00. 0 instances. 2014-03-20 08:47:05.532 17580 AUDIT nova.compute.resource_tracker [-] Auditing locally available compute resources 2014-03-20 08:47:06.269 17580 AUDIT nova.compute.resource_tracker [-] Free ram (MB): 257773 2014-03-20 08:47:06.269 17580 AUDIT nova.compute.resource_tracker [-] Free disk (GB): 49 2014-03-20 08:47:06.269 17580 AUDIT nova.compute.resource_tracker [-] Free VCPUS: 48 2014-03-20 08:47:06.336 17580 INFO nova.compute.resource_tracker [-] Compute_service record updated for os-1.solidfire.net:os-1.solidfire.net 2014-03-20 08:47:06.482 17580 INFO nova.compute.manager [-] Updating bandwidth usage cache 2014-03-20 08:48:06.692 17580 AUDIT nova.compute.resource_tracker [-] Auditing locally available compute resources ** Changed in: nova Status: Invalid => New -- You received this bug notification because you are a member of Yahoo! Engineering Team, which is subscribed to OpenStack Compute (nova). https://bugs.launchpad.net/bugs/1247603 Title: nova-conductor process can't create consumer connection to qpid after HeartbeatTimeout in heavy workload Status in OpenStack Compute (Nova): New Bug description: nova-conductor will lose the queue and will not be able to get requests anymore after running a workload for some time. This also occurred in the nova-compute process. They share the same impl_qpid.py. When nova-conductor is under heavy workload, a HeartbeatTimeout exception will be raised. The exception will be caught and a reconnect to the qpid server will be attempted. Logs show we can't reconnect to qpid in method iterconsume, but can reconnect to the qpid server in method publisher_send. That means we can only send messages to the qpid queue, but can't receive messages from the qpid queue. 
impl_qpid.py def ensure(self, error_callback, method, *args, **kwargs): while True: try: return method(*args, **kwargs) ---------------------------> raise HeartbeatTimeout except (qpid_exceptions.Empty, qpid_exceptions.ConnectionError), e: if error_callback: error_callback(e) self.reconnect() ------------------------------> retry method ensure is used in def iterconsume(self, limit=None, timeout=None): """Return an iterator that will consume from all queues/consumers""" def _error_callback(exc): if isinstance(exc, qpid_exceptions.Empty): LOG.debug(_('Timed out waiting for RPC response: %s') % str(exc)) raise rpc_common.Timeout() else: LOG.exception(_('Failed to consume message from queue: %s') % str(exc)) def _consume(): nxt_receiver = self.session.next_receiver(timeout=timeout) try: self._lookup_consumer(nxt_receiver).consume() except Exception: LOG.exception(_("Error processing message. Skipping it.")) for iteration in itertools.count(0): if limit and iteration >= limit: raise StopIteration yield self.ensure(_error_callback, _consume) ----------------------> here can't reconnect and def publisher_send(self, cls, topic, msg): """Send to a publisher based on the publisher class""" def _connect_error(exc): log_info = {'topic': topic, 'err_str': str(exc)} LOG.exception(_("Failed to publish message to topic " "'%(topic)s': %(err_str)s") % log_info) def _publisher_send(): publisher = cls(self.conf, self.session, topic) publisher.send(msg) return self.ensure(_connect_error, _publisher_send) ------------------> here can reconnect. To manage notifications about this bug go to: https://bugs.launchpad.net/nova/+bug/1247603/+subscriptions -- Mailing list: https://launchpad.net/~yahoo-eng-team Post to : yahoo-eng-team@lists.launchpad.net Unsubscribe : https://launchpad.net/~yahoo-eng-team More help : https://help.launchpad.net/ListHelp