Re: [Qemu-devel] [PATCH 2/3] VCPU hotplug support

Igor Mammedov Mon, 23 Jan 2012 08:29:54 -0800

On 01/17/2012 03:17 PM, Jan Kiszka wrote:

On 2012-01-17 14:17, Igor Mammedov wrote:

Rebase of the missing bits from qemu-kvm for vcpu hotplug


Description, please. Please try to split up, at least into PIIX4
preparations and "the rest". Maybe also a patch for a generic CPU
hotplug infrastructure.


Signed-off-by: Igor Mammedov<imamm...@redhat.com>
---
  Makefile.objs     |    2 +-
  Makefile.target   |    2 +-
  hw/acpi_piix4.c   |   83 ++++++++++++++++++++++++++++++++++++++++++++++++++--
  hw/pc.c           |   21 +++++++++++++-
  hw/pc_piix.c      |    4 ++
  sysemu.h          |    2 +
  target-i386/cpu.h |    1 +
  7 files changed, 108 insertions(+), 7 deletions(-)

diff --git a/Makefile.objs b/Makefile.objs
index 45df666..2a8ccc1 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -212,7 +212,7 @@ hw-obj-$(CONFIG_USB_UHCI) += usb-uhci.o
  hw-obj-$(CONFIG_USB_OHCI) += usb-ohci.o
  hw-obj-$(CONFIG_USB_EHCI) += usb-ehci.o
  hw-obj-$(CONFIG_FDC) += fdc.o
-hw-obj-$(CONFIG_ACPI) += acpi.o acpi_piix4.o icc_bus.o
+hw-obj-$(CONFIG_ACPI) += acpi.o icc_bus.o
  hw-obj-$(CONFIG_APM) += pm_smbus.o apm.o
  hw-obj-$(CONFIG_DMA) += dma.o
  hw-obj-$(CONFIG_HPET) += hpet.o
diff --git a/Makefile.target b/Makefile.target
index 06d79b8..4865dde 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -226,7 +226,7 @@ obj-y += device-hotplug.o
  # Hardware support
  obj-i386-y += vga.o
  obj-i386-y += mc146818rtc.o pc.o
-obj-i386-y += cirrus_vga.o sga.o apic.o ioapic.o piix_pci.o
+obj-i386-y += cirrus_vga.o sga.o apic.o ioapic.o piix_pci.o acpi_piix4.o


That's a qemu-kvm hack, see below for the discussion. No-go for upstream
- likely.

  obj-i386-y += vmport.o
  obj-i386-y += pci-hotplug.o smbios.o wdt_ib700.o
  obj-i386-y += debugcon.o multiboot.o
diff --git a/hw/acpi_piix4.c b/hw/acpi_piix4.c
index bdc55a1..f1cdcc1 100644
--- a/hw/acpi_piix4.c
+++ b/hw/acpi_piix4.c
@@ -39,13 +39,19 @@
  #define ACPI_DBG_IO_ADDR  0xb044

  #define GPE_BASE 0xafe0
+#define PROC_BASE 0xaf00
  #define GPE_LEN 4
  #define PCI_BASE 0xae00
  #define PCI_EJ_BASE 0xae08
  #define PCI_RMV_BASE 0xae0c

+#define PIIX4_CPU_HOTPLUG_STATUS 4
  #define PIIX4_PCI_HOTPLUG_STATUS 2

+struct gpe_regs {
+    uint8_t cpus_sts[32];
+};
+
  struct pci_status {
      uint32_t up;
      uint32_t down;
@@ -71,6 +77,7 @@ typedef struct PIIX4PMState {

      /* for pci hotplug */
      ACPIGPE gpe;
+    struct gpe_regs gpe_cpu;
      struct pci_status pci0_status;
      uint32_t pci0_hotplug_enable;
  } PIIX4PMState;
@@ -90,9 +97,11 @@ static void pm_update_sci(PIIX4PMState *s)
                     ACPI_BITMASK_POWER_BUTTON_ENABLE |
                     ACPI_BITMASK_GLOBAL_LOCK_ENABLE |
                     ACPI_BITMASK_TIMER_ENABLE)) != 0) ||
-        (((s->gpe.sts[0]&  s->gpe.en[0])&  PIIX4_PCI_HOTPLUG_STATUS) != 0);
+        (((s->gpe.sts[0]&  s->gpe.en[0])&
+         (PIIX4_PCI_HOTPLUG_STATUS | PIIX4_CPU_HOTPLUG_STATUS)) != 0);

      qemu_set_irq(s->irq, sci_level);
+
      /* schedule a timer interruption if needed */
      acpi_pm_tmr_update(&s->tmr, (s->pm1a.en&  ACPI_BITMASK_TIMER_ENABLE)&&
                         !(pmsts&  ACPI_BITMASK_TIMER_STATUS));
@@ -219,10 +228,9 @@ static int vmstate_acpi_post_load(void *opaque, int 
version_id)
   {                                                                   \
       .name       = (stringify(_field)),                              \
       .version_id = 0,                                                \
-     .num        = GPE_LEN,                                          \
       .info       =&vmstate_info_uint16,                             \
       .size       = sizeof(uint16_t),                                 \
-     .flags      = VMS_ARRAY | VMS_POINTER,                          \
+     .flags      = VMS_SINGLE | VMS_POINTER,                         \


Does this make the vmstate incompatible to the current version?

I suspect it makes it incompatible, but not sure what to do about it.
According to b2e4a396f7 in qemu-kvm it fixes migration bug. It probably
should be an independed patch not related to hotplug since qemu.git has
commit 23910d3f6 that introduced the code there and it applies to gpe
struct in general.

       .offset     = vmstate_offset_pointer(_state, _field, uint8_t),  \
   }

@@ -329,11 +337,16 @@ static void piix4_pm_machine_ready(Notifier *n, void 
*opaque)

  }

+static PIIX4PMState *global_piix4_pm_state; /* cpu hotadd */
+
  static int piix4_pm_initfn(PCIDevice *dev)
  {
      PIIX4PMState *s = DO_UPCAST(PIIX4PMState, dev, dev);
      uint8_t *pci_conf;

+    /* for cpu hotadd */
+    global_piix4_pm_state = s;
+
      pci_conf = s->dev.config;
      pci_conf[0x06] = 0x80;
      pci_conf[0x07] = 0x02;
@@ -425,7 +438,16 @@ device_init(piix4_pm_register);
  static uint32_t gpe_readb(void *opaque, uint32_t addr)
  {
      PIIX4PMState *s = opaque;
-    uint32_t val = acpi_gpe_ioport_readb(&s->gpe, addr);
+    uint32_t val = 0;
+    struct gpe_regs *g =&s->gpe_cpu;
+
+    switch (addr) {
+    case PROC_BASE ... PROC_BASE+31:
+        val = g->cpus_sts[addr - PROC_BASE];
+        break;
+    default:
+        val = acpi_gpe_ioport_readb(&s->gpe, addr);
+    }

      PIIX4_DPRINTF("gpe read %x == %x\n", addr, val);
      return val;
@@ -519,11 +541,20 @@ static int piix4_device_hotplug(DeviceState *qdev, 
PCIDevice *dev,
  static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s)
  {
      struct pci_status *pci0_status =&s->pci0_status;
+    int i = 0, cpus = smp_cpus;
+
+    while (cpus>  0) {
+        s->gpe_cpu.cpus_sts[i++] = (cpus<  8) ? (1<<  cpus) - 1 : 0xff;
+        cpus -= 8;
+    }

      register_ioport_write(GPE_BASE, GPE_LEN, 1, gpe_writeb, s);
      register_ioport_read(GPE_BASE, GPE_LEN, 1,  gpe_readb, s);
      acpi_gpe_blk(&s->gpe, GPE_BASE);

+    register_ioport_write(PROC_BASE, 32, 1, gpe_writeb, s);
+    register_ioport_read(PROC_BASE, 32, 1,  gpe_readb, s);
+
      register_ioport_write(PCI_BASE, 8, 4, pcihotplug_write, pci0_status);
      register_ioport_read(PCI_BASE, 8, 4,  pcihotplug_read, pci0_status);

@@ -536,6 +567,50 @@ static void piix4_acpi_system_hot_add_init(PCIBus *bus, 
PIIX4PMState *s)
      pci_bus_hotplug(bus, piix4_device_hotplug,&s->dev.qdev);
  }

+extern const char *global_cpu_model;
+
+#ifdef TARGET_I386


Why only TARGET_I386? If the PIIX4 is used on other targets, the


May be there is no sence in spending time on abstacting PIIX4 that
will be used only by x86 target. Is there an even remote chance that PIIX4
could/will be used with other targets?

infrastructure should be prepared to enable hotplugging for them as well
(at a later point). pc_new_cpu is a bad name then. And APIC fiddling
should be pushed into the arch-specific new-cpu handler.


I've started to implement 'new-cpu' as you suggested but could you clarify what
you've meant under "APIC fiddling"?
Is there an arch specific place for pc like hw to put this in (pc_piix.c 
perhaps)?

Also note that target dependencies are a no-go for hwlib compilations
which is clearly preferrable today.

I guess that target specific deps are here because of the code below
needed access to CPUState of specific target, with proper abstraction
this deps could be avoided.

+static void enable_processor(PIIX4PMState *s, int cpu)
+{
+    struct gpe_regs *g =&s->gpe_cpu;
+    ACPIGPE *gpe =&s->gpe;
+
+    *gpe->sts = *gpe->sts | PIIX4_CPU_HOTPLUG_STATUS;
+    g->cpus_sts[cpu/8] |= (1<<  (cpu%8));


"cpu / 8", "cpu % 8", please. Here and below. checkpatch doesn't complain?


checkpatch was happy, I'll fix it.

+}
+
+static void disable_processor(PIIX4PMState *s, int cpu)
+{
+    struct gpe_regs *g =&s->gpe_cpu;
+    ACPIGPE *gpe =&s->gpe;
+
+    *gpe->sts = *gpe->sts | PIIX4_CPU_HOTPLUG_STATUS;
+    g->cpus_sts[cpu/8]&= ~(1<<  (cpu%8));
+}
+
+void qemu_system_cpu_hot_add(int cpu, int state)
+{
+    CPUState *env;
+    PIIX4PMState *s = global_piix4_pm_state;
+
+    if (state&&  !qemu_get_cpu(cpu)) {
+        env = pc_new_cpu(global_cpu_model);
+        if (!env) {
+            fprintf(stderr, "cpu %d creation failed\n", cpu);
+            return;
+        }
+        env->cpuid_apic_id = cpu;


See comment above about proper target abstraction.


Let suppose that we have on command line option "-smp 1,maxcpus=3"
and then call
  qemu_system_cpu_hot_add( 2, 1)
  qemu_system_cpu_hot_add( 2, 1)
and we end up with
  [cpu_index = 1, cpuid_apic_id = 2],
  [cpu_index = 2, cpuid_apic_id = 2]
qemu_get_cpu(cpu) uses cpu_index field to lookup vcpu and
cpu_init -> cpu_x86_init -> cpu_exec_init doesn't have an ability
to create vcpu with specific cpu_index and numbers them according
to position in vcpu list. So we end up with 2 new vcpus with the
same cpuid_apic_id. That for sure might confuse guest since cpuid
for last 2 cpus will return the same value. And I'm not sure what
will happen if we call
  qemu_system_cpu_hot_add( 2, 1)
  qemu_system_cpu_hot_add( 1, 1)
and end up with
  [cpu_index = 1, cpuid_apic_id = 2],
  [cpu_index = 2, cpuid_apic_id = 1]
I don't know what kind of trouble this could cause.

It seams that "env->cpuid_apic_id = cpu" is pointless especcialy
taking in account that in cpu_x86_init cpuid_apic_id is initialized
by cpu_index.
What we gain in having cpuid_apic_id that actually duplicate cpu_index?
May be there is sence to get rid of cpuid_apic_id?

Another question is about how hot-plug/unplug should be designed:
1st approach:
   With the current code we can't create vcpu with specific index.
   But we can implement xen like approach, where hot-plug command says
   which amount of active vcpus guest should have. This way we can
   leave current cpu_init -> cpu_x86_init -> cpu_exec_init call
   chain without change and plug/unplug next/last vcpu.

2nd approach:
   Ability to plug/unplug individual vcpus based on their cpu_index.
   to do this we need add cpu_index argument to cpu_init ->
   cpu_x86_init -> cpu_exec_init call chain. It will look more
   like the real hardware cpu hot-plug, but do virtual guests really
   need it. And what for if this way is more preferrable than the 1st.

+    }
+
+    if (state) {
+        enable_processor(s, cpu);
+    } else {
+        disable_processor(s, cpu);
+    }
+    pm_update_sci(s);
+}
+#endif
+
  static void enable_device(PIIX4PMState *s, int slot)
  {
      s->gpe.sts[0] |= PIIX4_PCI_HOTPLUG_STATUS;
diff --git a/hw/pc.c b/hw/pc.c
index 33d8090..c7342d8 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -44,6 +44,8 @@
  #include "ui/qemu-spice.h"
  #include "memory.h"
  #include "exec-memory.h"
+#include "cpus.h"
+#include "kvm.h"


kvm? Likely not what you want.

cpu_synchronize_post_init is declared in kvm.h
Any suggestions are welcome.


  /* output Bochs bios info messages */
  //#define DEBUG_BIOS
@@ -930,10 +932,22 @@ static void pc_cpu_reset(void *opaque)
      env->halted = !cpu_is_bsp(env);
  }

-static CPUState *pc_new_cpu(const char *cpu_model)
+CPUState *pc_new_cpu(const char *cpu_model)
  {
      CPUState *env;

+    if (cpu_model == NULL) {
+#ifdef TARGET_X86_64
+        cpu_model = "qemu64";
+#else
+        cpu_model = "qemu32";
+#endif


Just always initialize global_cpu_model to a non-NULL value.

+    }
+
+    if (runstate_is_running()) {
+        pause_all_vcpus();
+    }
+
      env = cpu_init(cpu_model);
      if (!env) {
          fprintf(stderr, "Unable to find x86 CPU definition\n");
@@ -944,6 +958,11 @@ static CPUState *pc_new_cpu(const char *cpu_model)
      }
      qemu_register_reset(pc_cpu_reset, env);
      pc_cpu_reset(env);
+
+    cpu_synchronize_post_init(env);
+    if (runstate_is_running()) {
+        resume_all_vcpus();
+    }
      return env;
  }


--
Thanks,
 Igor

Re: [Qemu-devel] [PATCH 2/3] VCPU hotplug support

Reply via email to