Hotplug will not be supported for: - KVM < 1.0 - existing devices in the cluster - systems where the python-fdsend module is not installed (NIC hotplug) - chroot (Disk hotplug) - security model other than None (Disk hotplug) If no hotplug takes place, modifications will take effect after reboot.
New methods: Introduce new method HotplugDevice() and HotAddNic/HotDelNic, HotAddDisk/HotDelDisk helper methods that eventually make use of the QEMU monitor interface for hotplugging. Device naming: The QEMU monitor expects devices to be uniquely named. Device ids derive from the following function: kvm_devid = <device_type>-<part of uuid>-pci-<pci_slot> Device ids must be reproducible when we want to remove them. For that reason we store the pci slot inside the runtime file, and in case we want to remove a device we obtain its pci slot by parsing the corresponding runtime entry and matching the device by its uuid. Finding the PCI slot: For newly added devices the hypervisor parses existing PCI allocations (via _AnnotateFreePCISlot() and eventually the ``info pci`` monitor command) and decides the PCI slot to plug the device into. During instance startup the hypervisor invokes _UpdatePCISlots() for every device of the instance. Initial PCI reservations derive from the KVM default setup, which allocates 4 slots for devices other than disks and NICs. NIC hotplug: - open a tap and get its file descriptor. - pass fd with SCM rights (using python-fdsend) via monitor socket - create netdev and device with id=kvm_devid and proper pci info Disk hotplug: - create drive with id=kvm_devid - create device with id=kvm_devid and corresponding pci info In order to migrate a VM, an identical VM should be booted with exactly the same pci configuration (and with -incoming option). PCI info is passed via the runtime file. To this end, every time a hotplug takes place the runtime file must be updated. Introduce _GenerateKVMBlockDevicesOptions(): The runtime file contains one more field: block_devices. kvm_cmd is extended with block device options during _ExecuteKVMRuntime(). Handle old style format of runtime files: In case block_devices are already encapsulated inside kvm_cmd and runtime files have only 3 entries, set block_devices to []. This way migration will not fail and hotplug will succeed for new disks only.
Signed-off-by: Dimitris Aragiorgis <[email protected]> --- lib/hypervisor/hv_kvm.py | 356 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 312 insertions(+), 44 deletions(-) diff --git a/lib/hypervisor/hv_kvm.py b/lib/hypervisor/hv_kvm.py index 4bd8a9e..5bf4255 100644 --- a/lib/hypervisor/hv_kvm.py +++ b/lib/hypervisor/hv_kvm.py @@ -37,10 +37,16 @@ import shutil import socket import stat import StringIO +import copy +from bitarray import bitarray try: import affinity # pylint: disable=F0401 except ImportError: affinity = None +try: + import fdsend # pylint: disable=F0401 +except ImportError: + fdsend = None from ganeti import utils from ganeti import constants @@ -79,6 +85,40 @@ _SPICE_ADDITIONAL_PARAMS = frozenset([ constants.HV_KVM_SPICE_USE_TLS, ]) +FREE = bitarray("0") + +def _GenerateDeviceKVMId(dev_type, dev): + + if not dev or not dev.pci: + return None + + return "%s-%s-pci-%d" % (dev_type.lower(), dev.uuid.split("-")[0], dev.pci) + + +def _UpdatePCISlots(dev, pci_reservations): + """Update pci configuration for a stopped instance + + If dev has a pci slot the reserve it, else find first available. + + """ + if dev.pci: + free = dev.pci + else: + [free] = pci_reservations.search(FREE, 1) # pylint: disable=E1103 + if not free: + raise errors.HypervisorError("All PCI slots occupied") + dev.pci = int(free) + + pci_reservations[free] = True + + +def _RemoveFromRuntimeEntry(devices, device, fn): + try: + [rem] = [x for x in fn(devices) if x.uuid == device.uuid] + devices.remove(rem) + except (ValueError, IndexError): + logging.info("No device with uuid %s in runtime file", device.uuid) + def _GetTunFeatures(fd, _ioctl=fcntl.ioctl): """Retrieves supported TUN features from file descriptor. 
@@ -569,6 +609,14 @@ class KVMHypervisor(hv_base.BaseHypervisor): _BOOT_RE = re.compile(r"^-drive\s([^-]|(?<!^)-)*,boot=on\|off", re.M | re.S) _UUID_RE = re.compile(r"^-uuid\s", re.M) + _INFO_PCI_RE = re.compile(r'Bus.*device[ ]*(\d+).*') + _INFO_PCI_CMD = "info pci" + _INFO_VERSION_RE = \ + re.compile(r'^QEMU (\d+)\.(\d+)(\.(\d+))?.*monitor.*', re.M) + _INFO_VERSION_CMD = "info version" + + _DEFAULT_PCI_RESERVATIONS = "11110000000000000000000000000000" + ANCILLARY_FILES = [ _KVM_NETWORK_SCRIPT, ] @@ -1021,6 +1069,81 @@ class KVMHypervisor(hv_base.BaseHypervisor): data.append(info) return data + def _GetExistingDeviceKVMId(self, instance, dev_type, dev): + (_, kvm_nics, __, block_devices) = self._LoadKVMRuntime(instance) + if dev_type == constants.HOTPLUG_NIC: + found = [n for n in kvm_nics + if n.uuid == dev.uuid] + elif dev_type == constants.HOTPLUG_DISK: + found = [d for d, _ in block_devices + if d.uuid == dev.uuid] + dev_info = None + if found: + dev_info = found[0] + return _GenerateDeviceKVMId(dev_type, dev_info) + + def _GenerateKVMBlockDevicesOptions(self, instance, kvm_cmd, block_devices, + pci_reservations, kvmhelp): + + hvp = instance.hvparams + boot_disk = hvp[constants.HV_BOOT_ORDER] == constants.HT_BO_DISK + + # whether this is an older KVM version that uses the boot=on flag + # on devices + needs_boot_flag = self._BOOT_RE.search(kvmhelp) + + disk_type = hvp[constants.HV_DISK_TYPE] + if disk_type == constants.HT_DISK_PARAVIRTUAL: + #TODO: parse kvm -device ? 
output + disk_model = "virtio-blk-pci" + if_val = ",if=virtio" + else: + if_val = ",if=%s" % disk_type + # Cache mode + disk_cache = hvp[constants.HV_DISK_CACHE] + if instance.disk_template in constants.DTS_EXT_MIRROR: + if disk_cache != "none": + # TODO: make this a hard error, instead of a silent overwrite + logging.warning("KVM: overriding disk_cache setting '%s' with 'none'" + " to prevent shared storage corruption on migration", + disk_cache) + cache_val = ",cache=none" + elif disk_cache != constants.HT_CACHE_DEFAULT: + cache_val = ",cache=%s" % disk_cache + else: + cache_val = "" + for cfdev, dev_path in block_devices: + if cfdev.mode != constants.DISK_RDWR: + raise errors.HypervisorError("Instance has read-only disks which" + " are not supported by KVM") + # TODO: handle FD_LOOP and FD_BLKTAP (?) + boot_val = "" + if boot_disk: + kvm_cmd.extend(["-boot", "c"]) + boot_disk = False + if needs_boot_flag and disk_type != constants.HT_DISK_IDE: + boot_val = ",boot=on" + drive_val = "file=%s,format=raw%s%s" % \ + (dev_path, boot_val, cache_val) + _UpdatePCISlots(cfdev, pci_reservations) + kvm_devid = _GenerateDeviceKVMId("DISK", cfdev) + if kvm_devid: + #TODO: name id after model + drive_val += (",if=none,id=%s" % kvm_devid) + drive_val += (",bus=0,unit=%d" % cfdev.pci) + else: + drive_val += if_val + + kvm_cmd.extend(["-drive", drive_val]) + + if kvm_devid: + dev_val = ("%s,drive=%s,id=%s" % + (disk_model, kvm_devid, kvm_devid)) + dev_val += ",bus=pci.0,addr=%s" % hex(cfdev.pci) + kvm_cmd.extend(["-device", dev_val]) + + return kvm_cmd + def _GenerateKVMRuntime(self, instance, block_devices, startup_paused, kvmhelp): """Generate KVM information to start an instance. 
@@ -1089,9 +1212,8 @@ class KVMHypervisor(hv_base.BaseHypervisor): kernel_path = hvp[constants.HV_KERNEL_PATH] if kernel_path: - boot_disk = boot_cdrom = boot_floppy = boot_network = False + boot_cdrom = boot_floppy = boot_network = False else: - boot_disk = hvp[constants.HV_BOOT_ORDER] == constants.HT_BO_DISK boot_cdrom = hvp[constants.HV_BOOT_ORDER] == constants.HT_BO_CDROM boot_floppy = hvp[constants.HV_BOOT_ORDER] == constants.HT_BO_FLOPPY boot_network = hvp[constants.HV_BOOT_ORDER] == constants.HT_BO_NETWORK @@ -1107,38 +1229,6 @@ class KVMHypervisor(hv_base.BaseHypervisor): needs_boot_flag = self._BOOT_RE.search(kvmhelp) disk_type = hvp[constants.HV_DISK_TYPE] - if disk_type == constants.HT_DISK_PARAVIRTUAL: - if_val = ",if=virtio" - else: - if_val = ",if=%s" % disk_type - # Cache mode - disk_cache = hvp[constants.HV_DISK_CACHE] - if instance.disk_template in constants.DTS_EXT_MIRROR: - if disk_cache != "none": - # TODO: make this a hard error, instead of a silent overwrite - logging.warning("KVM: overriding disk_cache setting '%s' with 'none'" - " to prevent shared storage corruption on migration", - disk_cache) - cache_val = ",cache=none" - elif disk_cache != constants.HT_CACHE_DEFAULT: - cache_val = ",cache=%s" % disk_cache - else: - cache_val = "" - for cfdev, dev_path in block_devices: - if cfdev.mode != constants.DISK_RDWR: - raise errors.HypervisorError("Instance has read-only disks which" - " are not supported by KVM") - # TODO: handle FD_LOOP and FD_BLKTAP (?) - boot_val = "" - if boot_disk: - kvm_cmd.extend(["-boot", "c"]) - boot_disk = False - if needs_boot_flag and disk_type != constants.HT_DISK_IDE: - boot_val = ",boot=on" - - drive_val = "file=%s,format=raw%s%s%s" % (dev_path, if_val, boot_val, - cache_val) - kvm_cmd.extend(["-drive", drive_val]) #Now we can specify a different device type for CDROM devices. 
cdrom_disk_type = hvp[constants.HV_KVM_CDROM_DISK_TYPE] @@ -1407,7 +1497,7 @@ class KVMHypervisor(hv_base.BaseHypervisor): kvm_nics = instance.nics hvparams = hvp - return (kvm_cmd, kvm_nics, hvparams) + return (kvm_cmd, kvm_nics, hvparams, block_devices) def _WriteKVMRuntime(self, instance_name, data): """Write an instance's KVM runtime @@ -1433,9 +1523,13 @@ class KVMHypervisor(hv_base.BaseHypervisor): """Save an instance's KVM runtime """ - kvm_cmd, kvm_nics, hvparams = kvm_runtime + kvm_cmd, kvm_nics, hvparams, block_devices = kvm_runtime + serialized_nics = [nic.ToDict() for nic in kvm_nics] - serialized_form = serializer.Dump((kvm_cmd, serialized_nics, hvparams)) + serialized_blockdevs = [(blk.ToDict(), link) for blk, link in block_devices] + serialized_form = serializer.Dump((kvm_cmd, serialized_nics, hvparams, + serialized_blockdevs)) + self._WriteKVMRuntime(instance.name, serialized_form) def _LoadKVMRuntime(self, instance, serialized_runtime=None): @@ -1444,10 +1538,19 @@ class KVMHypervisor(hv_base.BaseHypervisor): """ if not serialized_runtime: serialized_runtime = self._ReadKVMRuntime(instance.name) + loaded_runtime = serializer.Load(serialized_runtime) - kvm_cmd, serialized_nics, hvparams = loaded_runtime + if len(loaded_runtime)==3: + serialized_blockdevs = [] + kvm_cmd, serialized_nics, hvparams = loaded_runtime + else: + kvm_cmd, serialized_nics, hvparams, serialized_blockdevs = loaded_runtime + kvm_nics = [objects.NIC.FromDict(snic) for snic in serialized_nics] - return (kvm_cmd, kvm_nics, hvparams) + block_devices = [(objects.Disk.FromDict(sdisk), link) + for sdisk, link in serialized_blockdevs] + + return (kvm_cmd, kvm_nics, hvparams, block_devices) def _RunKVMCmd(self, name, kvm_cmd, tap_fds=None): """Run the KVM cmd and check for errors @@ -1472,6 +1575,7 @@ class KVMHypervisor(hv_base.BaseHypervisor): if not self._InstancePidAlive(name)[2]: raise errors.HypervisorError("Failed to start instance %s" % name) + # pylint: disable=R0914 def 
_ExecuteKVMRuntime(self, instance, kvm_runtime, kvmhelp, incoming=None): """Execute a KVM cmd, after completing it with some last minute data. @@ -1495,9 +1599,12 @@ class KVMHypervisor(hv_base.BaseHypervisor): temp_files = [] - kvm_cmd, kvm_nics, up_hvp = kvm_runtime + kvm_cmd, kvm_nics, up_hvp, block_devices = kvm_runtime # the first element of kvm_cmd is always the path to the kvm binary kvm_path = kvm_cmd[0] + + kvm_cmd_runtime = copy.deepcopy(kvm_cmd) + up_hvp = objects.FillDict(conf_hvp, up_hvp) # We know it's safe to run as a different user upon migration, so we'll use @@ -1516,6 +1623,13 @@ class KVMHypervisor(hv_base.BaseHypervisor): utils.WriteFile(keymap_path, data="include en-us\ninclude %s\n" % keymap) kvm_cmd.extend(["-k", keymap_path]) + pci_reservations = bitarray(self._DEFAULT_PCI_RESERVATIONS) + + kvm_cmd = self._GenerateKVMBlockDevicesOptions(instance, kvm_cmd, + block_devices, + pci_reservations, + kvmhelp) + # We have reasons to believe changing something like the nic driver/type # upon migration won't exactly fly with the instance kernel, so for nic # related parameters we'll use up_hvp @@ -1556,8 +1670,15 @@ class KVMHypervisor(hv_base.BaseHypervisor): tapfds.append(tapfd) taps.append(tapname) if kvm_supports_netdev: - nic_val = "%s,mac=%s,netdev=netdev%s" % (nic_model, nic.mac, nic_seq) - tap_val = "type=tap,id=netdev%s,fd=%d%s" % (nic_seq, tapfd, tap_extra) + nic_val = "%s,mac=%s" % (nic_model, nic.mac) + _UpdatePCISlots(nic, pci_reservations) + kvm_devid = _GenerateDeviceKVMId("NIC", nic) + netdev = kvm_devid or "netdev%d" % nic_seq + nic_val += (",netdev=%s" % netdev) + if kvm_devid: + nic_val += (",id=%s,bus=pci.0,addr=%s" % (kvm_devid, hex(nic.pci))) + tap_val = ("type=tap,id=%s,fd=%d%s" % + (netdev, tapfd, tap_extra)) kvm_cmd.extend(["-netdev", tap_val, "-device", nic_val]) else: nic_val = "nic,vlan=%s,macaddr=%s,model=%s" % (nic_seq, @@ -1680,6 +1801,10 @@ class KVMHypervisor(hv_base.BaseHypervisor): # explicitly requested resume the 
vm status. self._CallMonitorCommand(instance.name, self._CONT_CMD) + kvm_runtime_with_pci_info = (kvm_cmd_runtime, kvm_nics, + up_hvp, block_devices) + return kvm_runtime_with_pci_info + def StartInstance(self, instance, block_devices, startup_paused): """Start an instance. @@ -1690,7 +1815,9 @@ class KVMHypervisor(hv_base.BaseHypervisor): kvm_runtime = self._GenerateKVMRuntime(instance, block_devices, startup_paused, kvmhelp) self._SaveKVMRuntime(instance, kvm_runtime) - self._ExecuteKVMRuntime(instance, kvm_runtime, kvmhelp) + kvm_runtime_with_pci_info = self._ExecuteKVMRuntime(instance, kvm_runtime, + kvmhelp) + self._SaveKVMRuntime(instance, kvm_runtime_with_pci_info) def _CallMonitorCommand(self, instance_name, command): """Invoke a command on the instance monitor. @@ -1716,6 +1843,145 @@ class KVMHypervisor(hv_base.BaseHypervisor): return result + def _AnnotateFreePCISlot(self, instance, dev): + """Get the first available pci slot of a runnung instance. + + """ + slots = bitarray(32) + slots.setall(False) # pylint: disable=E1101 + output = self._CallMonitorCommand(instance.name, self._INFO_PCI_CMD) + for line in output.stdout.splitlines(): + match = self._INFO_PCI_RE.search(line) + if match: + slot = int(match.group(1)) + slots[slot] = True + + [free] = slots.search(FREE, 1) # pylint: disable=E1101 + if not free: + raise errors.HypervisorError("All PCI slots occupied") + + dev.pci = int(free) + + def _TryHotplug(self, instance, dev_type): + """Get QEMU version from the instance's monitor. + + Hotplug is supported for running instances and for versions >= 1.0. + """ + if dev_type == constants.HOTPLUG_DISK: + hvp = instance.hvparams + security_model = hvp[constants.HV_SECURITY_MODEL] + use_chroot = hvp[constants.HV_KVM_USE_CHROOT] + if use_chroot or security_model != constants.HT_SM_NONE: + return False + output = self._CallMonitorCommand(instance.name, self._INFO_VERSION_CMD) + #TODO: search for netdev_add, drive_add, device_add..... 
+ match = self._INFO_VERSION_RE.search(output.stdout) + if not match: + return False + v_major, v_min, _, _ = match.groups() + return (v_major, v_min) >= (1, 0) + + def _CallMonitorHotplugCommand(self, name, cmd): + output = self._CallMonitorCommand(name, cmd) + #TODO: parse output and check if succeeded + for line in output.stdout.splitlines(): + logging.info("%s", line) + + def HotplugDevice(self, instance, action, dev_type, device, extra, seq): + """ Generic method to hotplug device + + Depending on action and dev_type invoke the coresponding method that + does the actual hotplug (via KVM monitor commands). + Before and after, do all hotplug related steps (e.g. check if hotplug + is possible, annotate pci slot to device and generate/get existing + KVM device ids, read/write runtime file) + + """ + if self._TryHotplug(instance, dev_type): + (kvm_cmd, kvm_nics, hvparams, \ + block_devices) = self._LoadKVMRuntime(instance) + if action == constants.HOTPLUG_ADD: + self._AnnotateFreePCISlot(instance, device) + kvm_devid = _GenerateDeviceKVMId(dev_type, device) + if dev_type == constants.HOTPLUG_DISK: + self._HotAddDisk(instance, + device, extra, seq, kvm_devid, block_devices) + elif dev_type == constants.HOTPLUG_NIC and fdsend: + self._HotAddNic(instance, device, extra, seq, kvm_devid, kvm_nics) + elif action == constants.HOTPLUG_REMOVE: + kvm_devid = self._GetExistingDeviceKVMId(instance, dev_type, device) + if dev_type == constants.HOTPLUG_DISK: + self._HotDelDisk(instance, + device, extra, seq, kvm_devid, block_devices) + elif dev_type == constants.HOTPLUG_NIC and fdsend: + self._HotDelNic(instance, device, extra, seq, kvm_devid, kvm_nics) + self._SaveKVMRuntime(instance, + (kvm_cmd, kvm_nics, hvparams, block_devices)) + time.sleep(2) + + def _HotAddDisk(self, instance, disk, dev_path, _, kvm_devid, block_devices): + """Hotplug/add new disk to and instance + + """ + command = ("drive_add dummy file=%s,if=none,id=%s,format=raw" % + (dev_path, kvm_devid)) + 
self._CallMonitorHotplugCommand(instance.name, command) + command = ("device_add virtio-blk-pci,bus=pci.0,addr=%s," + "drive=%s,id=%s" + % (hex(disk.pci), kvm_devid, kvm_devid)) + self._CallMonitorHotplugCommand(instance.name, command) + block_devices.append((disk, dev_path)) + + def _HotAddNic(self, instance, nic, _, seq, kvm_devid, kvm_nics): + """Hotplug/add nic to an instance + + """ + (tap, fd) = _OpenTap() + self._PassTapFd(instance, fd, nic) + command = ("netdev_add tap,id=%s,fd=%s" % (kvm_devid, kvm_devid)) + self._CallMonitorHotplugCommand(instance.name, command) + command = ("device_add virtio-net-pci,bus=pci.0,addr=%s,mac=%s," + "netdev=%s,id=%s" + % (hex(nic.pci), nic.mac, kvm_devid, kvm_devid)) + self._CallMonitorHotplugCommand(instance.name, command) + self._ConfigureNIC(instance, seq, nic, tap) + utils.WriteFile(self._InstanceNICFile(instance.name, seq), data=tap) + kvm_nics.append(nic) + + def _HotDelDisk(self, instance, disk, _, __, kvm_devid, block_devices): + """Hotplug/remove disk from an instance + + """ + command = "device_del %s" % kvm_devid + self._CallMonitorHotplugCommand(instance.name, command) + #command = "drive_del %s" % uuid + #self._CallMonitorHotplugCommand(instance.name, command) + _RemoveFromRuntimeEntry(block_devices, disk, lambda x: [d for d, l in x]) + + def _HotDelNic(self, instance, nic, _, __, kvm_devid, kvm_nics): + """Hotplug/remove existing nic from an instance + + """ + command = "device_del %s" % kvm_devid + self._CallMonitorHotplugCommand(instance.name, command) + command = "netdev_del %s" % kvm_devid + self._CallMonitorHotplugCommand(instance.name, command) + _RemoveFromRuntimeEntry(kvm_nics, nic, lambda x: x) + + def _PassTapFd(self, instance, fd, nic): + """Pass file descriptor to kvm process via monitor socket using SCM_RIGHTS + + """ + monsock = utils.ShellQuote(self._InstanceMonitor(instance.name)) + s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + s.connect(monsock) + kvm_devid = 
_GenerateDeviceKVMId("NIC", nic) + command = "getfd %s\n" % kvm_devid + fds = [fd] + logging.info("%s", fds) + fdsend.sendfds(s, command, fds = fds) + s.close() + @classmethod def _ParseKVMVersion(cls, text): """Parse the KVM version from the --help output. @@ -1831,7 +2097,9 @@ class KVMHypervisor(hv_base.BaseHypervisor): self._SaveKVMRuntime(instance, kvm_runtime) kvmpath = instance.hvparams[constants.HV_KVM_PATH] kvmhelp = self._GetKVMOutput(kvmpath, self._KVMOPT_HELP) - self._ExecuteKVMRuntime(instance, kvm_runtime, kvmhelp) + kvm_runtime_with_pci_info = \ + self._ExecuteKVMRuntime(instance, kvm_runtime, kvmhelp) + self._SaveKVMRuntime(instance, kvm_runtime_with_pci_info) def MigrationInfo(self, instance): """Get instance information to perform a migration. -- 1.7.10.4
