Mike Gerdts has proposed merging ~mgerdts/cloud-init:bug-1746605 into cloud-init:master.
Requested reviews: cloud-init commiters (cloud-init-dev) Related bugs: Bug #1746605 in cloud-init: "DataSourceSmartOS needs locking and retries" https://bugs.launchpad.net/cloud-init/+bug/1746605 For more details, see: https://code.launchpad.net/~mgerdts/cloud-init/+git/cloud-init/+merge/337271 Fixing a race between cloud-init and mdata-get (called from rc.local) which results in cross talk on the serial port. The locking strategy used by mdata-get is used for reasons discussed at https://github.com/joyent/mdata-client/issues/11#issuecomment-362490774 Additionally, a retry is performed to help when 1) something else had the port open but not locked before cloud-init starts and 2) the metadata server in the host restarts. -- Your team cloud-init commiters is requested to review the proposed merge of ~mgerdts/cloud-init:bug-1746605 into cloud-init:master.
diff --git a/cloudinit/sources/DataSourceSmartOS.py b/cloudinit/sources/DataSourceSmartOS.py index 86bfa5d..24187d6 100644 --- a/cloudinit/sources/DataSourceSmartOS.py +++ b/cloudinit/sources/DataSourceSmartOS.py @@ -1,4 +1,5 @@ # Copyright (C) 2013 Canonical Ltd. +# Copyright (c) 2018, Joyent, Inc. # # Author: Ben Howard <ben.how...@canonical.com> # @@ -21,6 +22,8 @@ import base64 import binascii +import errno +import fcntl import json import os import random @@ -339,7 +342,11 @@ class JoyentMetadataClient(object): binascii.crc32(body.encode('utf-8')) & 0xffffffff) def _get_value_from_frame(self, expected_request_id, frame): - frame_data = self.line_regex.match(frame).groupdict() + match = self.line_regex.match(frame) + if match is None: + raise JoyentMetadataFetchException( + 'No regex match for frame "%s"' % frame) + frame_data = match.groupdict() if int(frame_data['length']) != len(frame_data['body']): raise JoyentMetadataFetchException( 'Incorrect frame length given ({0} != {1}).'.format( @@ -378,9 +385,16 @@ class JoyentMetadataClient(object): self.fp.flush() response = bytearray() - response.extend(self.fp.read(1)) - while response[-1:] != b'\n': - response.extend(self.fp.read(1)) + while True: + try: + response.extend(self.fp.read(1)) + if response[-1:] == b'\n': + break + except OSError as exc: + if exc.errno == errno.EAGAIN: + raise JoyentMetadataFetchException( + "Timeout during read. Partial response: b'%s'" % + response) if need_close: self.close_transport() @@ -395,12 +409,21 @@ class JoyentMetadataClient(object): return value def get(self, key, default=None, strip=False): - result = self.request(rtype='GET', param=key) - if result is None: - return default - if result and strip: - result = result.strip() - return result + # Do a couple tries in case someone else has the serial port open + # before this process opened it. This also helps in the event that + # the metadata server goes away in middle of a conversation. + for tries in [1, 2]: + try: + result = self.request(rtype='GET', param=key) + if result is None: + return default + if result and strip: + result = result.strip() + return result + except JoyentMetadataFetchException as exc: + last_exc = exc + pass + raise(last_exc) def get_json(self, key, default=None): result = self.get(key, default=default) @@ -471,6 +494,7 @@ class JoyentMetadataSerialClient(JoyentMetadataClient): ser = serial.Serial(self.device, timeout=self.timeout) if not ser.isOpen(): raise SystemError("Unable to open %s" % self.device) + fcntl.lockf(ser, fcntl.LOCK_EX) self.fp = ser def __repr__(self): diff --git a/packages/bddeb b/packages/bddeb index 4f2e2dd..7e96130 100755 --- a/packages/bddeb +++ b/packages/bddeb @@ -68,7 +68,8 @@ def write_debian_folder(root, templ_data, is_python2, cloud_util_deps): test_reqs = run_helper( 'read-dependencies', ['--requirements-file', 'test-requirements.txt', - '--system-pkg-names', '--python-version', pyver]).splitlines() + '--system-pkg-names', '--python-version', pyver, + '-d', 'debian']).splitlines() requires = ['cloud-utils | cloud-guest-utils'] if cloud_util_deps else [] # We consolidate all deps as Build-Depends as our package build runs all diff --git a/test-requirements.txt b/test-requirements.txt index d9d41b5..548132c 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -11,3 +11,5 @@ coverage # Only really needed on older versions of python contextlib2 setuptools + +pyserial diff --git a/tests/unittests/test_datasource/test_smartos.py b/tests/unittests/test_datasource/test_smartos.py index 88bae5f..4d9f9ab 100644 --- a/tests/unittests/test_datasource/test_smartos.py +++ b/tests/unittests/test_datasource/test_smartos.py @@ -1,4 +1,5 @@ # Copyright (C) 2013 Canonical Ltd. +# Copyright (c) 2018, Joyent, Inc. # # Author: Ben Howard <ben.how...@canonical.com> # @@ -15,12 +16,16 @@ from __future__ import print_function from binascii import crc32 import json +import multiprocessing import os import os.path import re import shutil +import signal import stat +import subprocess import tempfile +import unittest import uuid from cloudinit import serial @@ -872,4 +877,67 @@ class TestNetworkConversion(TestCase): found = convert_net(SDC_NICS_SINGLE_GATEWAY) self.assertEqual(expected, found) + +@unittest.skipUnless(DataSourceSmartOS.get_smartos_environ() == + DataSourceSmartOS.SMARTOS_ENV_KVM, + "Only supported on KVM and bhyve guests under SmartOS") +@unittest.skipUnless(os.access(DataSourceSmartOS.SERIAL_DEVICE, os.W_OK), + "Requires write access to " + + DataSourceSmartOS.SERIAL_DEVICE) +class TestSerialConcurrency(TestCase): + """ + This class tests locking on an actual serial port, and as such can only + be run in a kvm or bhyve guest running on a SmartOS host. A test run on + a metadata socket will not be valid because a metadata socket ensures + there is only one session over a connection. In contrast, in the + absence of proper locking multiple processes opening the same serial + port can corrupt each others' exchanges with the metadata server. + """ + def setUp(self): + self.mdata_proc = multiprocessing.Process(target=self.start_mdata_loop) + self.mdata_proc.start() + super(TestSerialConcurrency, self).setUp() + + def tearDown(self): + # os.kill() rather than mdata_proc.terminate() to avoid console spam. + os.kill(self.mdata_proc.pid, signal.SIGKILL) + self.mdata_proc.join() + super(TestSerialConcurrency, self).tearDown() + + def start_mdata_loop(self): + """ + The mdata-get command is repeatedly run in a separate process so + that it may try to race with metadata operations performed in the + main test process. Use of mdata-get is better than two processes + using the protocol implementation in DataSourceSmartOS because we + are testing to be sure that cloud-init and mdata-get respect each + others locks. + """ + while True: + try: + subprocess.check_output(['/usr/sbin/mdata-get', 'sdc:routes']) + except subprocess.CalledProcessError: + pass + + def test_all_keys(self): + self.assertIsNotNone(self.mdata_proc.pid) + ds = DataSourceSmartOS + keys = [tup[0] for tup in ds.SMARTOS_ATTRIB_MAP.values()] + keys.extend(ds.SMARTOS_ATTRIB_JSON.values()) + + client = ds.jmc_client_factory() + self.assertIsNotNone(client) + + # The behavior that we are testing for was observed mdata-get running + # 10 times at roughly the same time as cloud-init fetched each key + # once. cloud-init would regularly see failures before making it + # through all keys once. + for it in range(0, 3): + for key in keys: + # We don't care about the return value, just that it doesn't + # thrown any exceptions. + client.get(key) + + self.assertIsNone(self.mdata_proc.exitcode) + # vi: ts=4 expandtab
_______________________________________________ Mailing list: https://launchpad.net/~cloud-init-dev Post to : cloud-init-dev@lists.launchpad.net Unsubscribe : https://launchpad.net/~cloud-init-dev More help : https://help.launchpad.net/ListHelp