date:20151030

[PATCH v6 18/33] dimm: get mapped memory region from DIMMDeviceClass->get_memory_region

2015-10-30 Thread Xiao Guangrong

Currently, the memory region of the backend memory is directly mapped
into the guest's address space; however, this is not true for the nvdimm
device.
This patch makes the dimm device aware of this fact by using the
DIMMDeviceClass->get_memory_region method to get the mapped memory
region.

Signed-off-by: Xiao Guangrong 
---
 hw/mem/dimm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/mem/dimm.c b/hw/mem/dimm.c
index 4a63409..498d380 100644
--- a/hw/mem/dimm.c
+++ b/hw/mem/dimm.c
@@ -377,8 +377,9 @@ static void dimm_get_size(Object *obj, Visitor *v, void 
*opaque,
 int64_t value;
 MemoryRegion *mr;
 DIMMDevice *dimm = DIMM(obj);
+DIMMDeviceClass *ddc = DIMM_GET_CLASS(obj);
 
-mr = host_memory_backend_get_memory(dimm->hostmem, errp);
+mr = ddc->get_memory_region(dimm);
 value = memory_region_size(mr);
 
 visit_type_int(v, , name, errp);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v6 20/33] dimm: introduce realize callback

2015-10-30 Thread Xiao Guangrong

nvdimm needs to check whether the backend memory is large enough to contain
the label data, and to init its memory region when the device is realized, so
introduce a realize callback which is called after the common dimm has been
realized.

Signed-off-by: Xiao Guangrong 
---
 hw/mem/dimm.c | 5 +
 include/hw/mem/dimm.h | 1 +
 2 files changed, 6 insertions(+)

diff --git a/hw/mem/dimm.c b/hw/mem/dimm.c
index 7d1..0ae23ce 100644
--- a/hw/mem/dimm.c
+++ b/hw/mem/dimm.c
@@ -426,6 +426,7 @@ static void dimm_init(Object *obj)
 static void dimm_realize(DeviceState *dev, Error **errp)
 {
 DIMMDevice *dimm = DIMM(dev);
+DIMMDeviceClass *ddc = DIMM_GET_CLASS(dimm);
 
 if (!dimm->hostmem) {
 error_setg(errp, "'" DIMM_MEMDEV_PROP "' property is not set");
@@ -438,6 +439,10 @@ static void dimm_realize(DeviceState *dev, Error **errp)
dimm->node, nb_numa_nodes ? nb_numa_nodes : 1);
 return;
 }
+
+if (ddc->realize) {
+ddc->realize(dimm, errp);
+}
 }
 
 static void dimm_class_init(ObjectClass *oc, void *data)
diff --git a/include/hw/mem/dimm.h b/include/hw/mem/dimm.h
index 50f768a..72ec24c 100644
--- a/include/hw/mem/dimm.h
+++ b/include/hw/mem/dimm.h
@@ -65,6 +65,7 @@ typedef struct DIMMDeviceClass {
 DeviceClass parent_class;
 
 /* public */
+void (*realize)(DIMMDevice *dimm, Error **errp);
 MemoryRegion *(*get_memory_region)(DIMMDevice *dimm);
 } DIMMDeviceClass;
 
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v6 06/33] acpi: add aml_method_serialized

2015-10-30 Thread Xiao Guangrong

It avoids an explicit Mutex and will be used by NVDIMM ACPI

Signed-off-by: Xiao Guangrong 
---
 hw/acpi/aml-build.c | 26 --
 include/hw/acpi/aml-build.h |  1 +
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index 9f792ab..8bee8b2 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -696,14 +696,36 @@ Aml *aml_while(Aml *predicate)
 }
 
 /* ACPI 1.0b: 16.2.5.2 Named Objects Encoding: DefMethod */
-Aml *aml_method(const char *name, int arg_count)
+static Aml *__aml_method(const char *name, int arg_count, bool serialized)
 {
 Aml *var = aml_bundle(0x14 /* MethodOp */, AML_PACKAGE);
+int methodflags;
+
+/*
+ * MethodFlags:
+ *   bit 0-2: ArgCount (0-7)
+ *   bit 3: SerializeFlag
+ * 0: NotSerialized
+ * 1: Serialized
+ *   bit 4-7: reserved (must be 0)
+ */
+assert(!(arg_count & ~7));
+methodflags = arg_count | (serialized << 3);
 build_append_namestring(var->buf, "%s", name);
-build_append_byte(var->buf, arg_count); /* MethodFlags: ArgCount */
+build_append_byte(var->buf, methodflags);
 return var;
 }
 
+Aml *aml_method(const char *name, int arg_count)
+{
+return __aml_method(name, arg_count, false);
+}
+
+Aml *aml_method_serialized(const char *name, int arg_count)
+{
+return __aml_method(name, arg_count, true);
+}
+
 /* ACPI 1.0b: 16.2.5.2 Named Objects Encoding: DefDevice */
 Aml *aml_device(const char *name_format, ...)
 {
diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h
index 5b8a118..00cf40e 100644
--- a/include/hw/acpi/aml-build.h
+++ b/include/hw/acpi/aml-build.h
@@ -263,6 +263,7 @@ Aml *aml_qword_memory(AmlDecode dec, AmlMinFixed min_fixed,
 Aml *aml_scope(const char *name_format, ...) GCC_FMT_ATTR(1, 2);
 Aml *aml_device(const char *name_format, ...) GCC_FMT_ATTR(1, 2);
 Aml *aml_method(const char *name, int arg_count);
+Aml *aml_method_serialized(const char *name, int arg_count);
 Aml *aml_if(Aml *predicate);
 Aml *aml_else(void);
 Aml *aml_while(Aml *predicate);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v6 13/33] pc-dimm: make pc_existing_dimms_capacity static and rename it

2015-10-30 Thread Xiao Guangrong

pc_existing_dimms_capacity() can be static since it is not used outside of
pc-dimm.c. Also drop the pc_ prefix to prepare for the work which abstracts
the dimm device type from pc-dimm.

Signed-off-by: Xiao Guangrong 
---
 hw/mem/pc-dimm.c | 73 
 include/hw/mem/pc-dimm.h |  1 -
 2 files changed, 36 insertions(+), 38 deletions(-)

diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c
index 80f424b..2dcbbcd 100644
--- a/hw/mem/pc-dimm.c
+++ b/hw/mem/pc-dimm.c
@@ -32,6 +32,38 @@ typedef struct pc_dimms_capacity {
  Error**errp;
 } pc_dimms_capacity;
 
+static int existing_dimms_capacity_internal(Object *obj, void *opaque)
+{
+pc_dimms_capacity *cap = opaque;
+uint64_t *size = >size;
+
+if (object_dynamic_cast(obj, TYPE_PC_DIMM)) {
+DeviceState *dev = DEVICE(obj);
+
+if (dev->realized) {
+(*size) += object_property_get_int(obj, PC_DIMM_SIZE_PROP,
+cap->errp);
+}
+
+if (cap->errp && *cap->errp) {
+return 1;
+}
+}
+object_child_foreach(obj, existing_dimms_capacity_internal, opaque);
+return 0;
+}
+
+static uint64_t existing_dimms_capacity(Error **errp)
+{
+pc_dimms_capacity cap;
+
+cap.size = 0;
+cap.errp = errp;
+
+existing_dimms_capacity_internal(qdev_get_machine(), );
+return cap.size;
+}
+
 void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState *hpms,
  MemoryRegion *mr, uint64_t align, Error **errp)
 {
@@ -39,7 +71,7 @@ void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState 
*hpms,
 MachineState *machine = MACHINE(qdev_get_machine());
 PCDIMMDevice *dimm = PC_DIMM(dev);
 Error *local_err = NULL;
-uint64_t existing_dimms_capacity = 0;
+uint64_t dimms_capacity = 0;
 uint64_t addr;
 
 addr = object_property_get_int(OBJECT(dimm), PC_DIMM_ADDR_PROP, 
_err);
@@ -55,17 +87,16 @@ void pc_dimm_memory_plug(DeviceState *dev, 
MemoryHotplugState *hpms,
 goto out;
 }
 
-existing_dimms_capacity = pc_existing_dimms_capacity(_err);
+dimms_capacity = existing_dimms_capacity(_err);
 if (local_err) {
 goto out;
 }
 
-if (existing_dimms_capacity + memory_region_size(mr) >
+if (dimms_capacity + memory_region_size(mr) >
 machine->maxram_size - machine->ram_size) {
 error_setg(_err, "not enough space, currently 0x%" PRIx64
" in use of total hot pluggable 0x" RAM_ADDR_FMT,
-   existing_dimms_capacity,
-   machine->maxram_size - machine->ram_size);
+   dimms_capacity, machine->maxram_size - machine->ram_size);
 goto out;
 }
 
@@ -120,38 +151,6 @@ void pc_dimm_memory_unplug(DeviceState *dev, 
MemoryHotplugState *hpms,
 vmstate_unregister_ram(mr, dev);
 }
 
-static int pc_existing_dimms_capacity_internal(Object *obj, void *opaque)
-{
-pc_dimms_capacity *cap = opaque;
-uint64_t *size = >size;
-
-if (object_dynamic_cast(obj, TYPE_PC_DIMM)) {
-DeviceState *dev = DEVICE(obj);
-
-if (dev->realized) {
-(*size) += object_property_get_int(obj, PC_DIMM_SIZE_PROP,
-cap->errp);
-}
-
-if (cap->errp && *cap->errp) {
-return 1;
-}
-}
-object_child_foreach(obj, pc_existing_dimms_capacity_internal, opaque);
-return 0;
-}
-
-uint64_t pc_existing_dimms_capacity(Error **errp)
-{
-pc_dimms_capacity cap;
-
-cap.size = 0;
-cap.errp = errp;
-
-pc_existing_dimms_capacity_internal(qdev_get_machine(), );
-return cap.size;
-}
-
 int qmp_pc_dimm_device_list(Object *obj, void *opaque)
 {
 MemoryDeviceInfoList ***prev = opaque;
diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h
index 11a8937..8a43548 100644
--- a/include/hw/mem/pc-dimm.h
+++ b/include/hw/mem/pc-dimm.h
@@ -87,7 +87,6 @@ uint64_t pc_dimm_get_free_addr(uint64_t address_space_start,
 int pc_dimm_get_free_slot(const int *hint, int max_slots, Error **errp);
 
 int qmp_pc_dimm_device_list(Object *obj, void *opaque);
-uint64_t pc_existing_dimms_capacity(Error **errp);
 void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState *hpms,
  MemoryRegion *mr, uint64_t align, Error **errp);
 void pc_dimm_memory_unplug(DeviceState *dev, MemoryHotplugState *hpms,
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v6 21/33] nvdimm: implement NVDIMM device abstract

2015-10-30 Thread Xiao Guangrong

Introduce "nvdimm" device which is based on dimm device type

A 128K memory region, which is the minimum namespace label size
required by the NVDIMM Namespace Spec, is located at the end of the
backend memory device and is reserved for label data.

We can use "-m 1G,maxmem=100G,slots=10 -object memory-backend-file,
id=mem1,size=1G,mem-path=/dev/pmem0 -device nvdimm,memdev=mem1" to
create NVDIMM device for guest

Signed-off-by: Xiao Guangrong 
---
 default-configs/i386-softmmu.mak   |   1 +
 default-configs/x86_64-softmmu.mak |   1 +
 hw/acpi/memory_hotplug.c   |   6 ++
 hw/mem/Makefile.objs   |   1 +
 hw/mem/nvdimm.c| 113 +
 include/hw/mem/nvdimm.h|  83 +++
 6 files changed, 205 insertions(+)
 create mode 100644 hw/mem/nvdimm.c
 create mode 100644 include/hw/mem/nvdimm.h

diff --git a/default-configs/i386-softmmu.mak b/default-configs/i386-softmmu.mak
index 3ece8bb..4e84a1c 100644
--- a/default-configs/i386-softmmu.mak
+++ b/default-configs/i386-softmmu.mak
@@ -47,6 +47,7 @@ CONFIG_APIC=y
 CONFIG_IOAPIC=y
 CONFIG_PVPANIC=y
 CONFIG_MEM_HOTPLUG=y
+CONFIG_NVDIMM=y
 CONFIG_XIO3130=y
 CONFIG_IOH3420=y
 CONFIG_I82801B11=y
diff --git a/default-configs/x86_64-softmmu.mak 
b/default-configs/x86_64-softmmu.mak
index 92ea7c1..e877a86 100644
--- a/default-configs/x86_64-softmmu.mak
+++ b/default-configs/x86_64-softmmu.mak
@@ -47,6 +47,7 @@ CONFIG_APIC=y
 CONFIG_IOAPIC=y
 CONFIG_PVPANIC=y
 CONFIG_MEM_HOTPLUG=y
+CONFIG_NVDIMM=y
 CONFIG_XIO3130=y
 CONFIG_IOH3420=y
 CONFIG_I82801B11=y
diff --git a/hw/acpi/memory_hotplug.c b/hw/acpi/memory_hotplug.c
index 20d3093..bb5a29f 100644
--- a/hw/acpi/memory_hotplug.c
+++ b/hw/acpi/memory_hotplug.c
@@ -1,6 +1,7 @@
 #include "hw/acpi/memory_hotplug.h"
 #include "hw/acpi/pc-hotplug.h"
 #include "hw/mem/dimm.h"
+#include "hw/mem/nvdimm.h"
 #include "hw/boards.h"
 #include "hw/qdev-core.h"
 #include "trace.h"
@@ -231,6 +232,11 @@ void acpi_memory_plug_cb(ACPIREGS *ar, qemu_irq irq, 
MemHotplugState *mem_st,
 {
 MemStatus *mdev;
 
+/* Currently, NVDIMM hotplug has not been supported yet. */
+if (object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)) {
+return;
+}
+
 mdev = acpi_memory_slot_status(mem_st, dev, errp);
 if (!mdev) {
 return;
diff --git a/hw/mem/Makefile.objs b/hw/mem/Makefile.objs
index cebb4b1..12d9b72 100644
--- a/hw/mem/Makefile.objs
+++ b/hw/mem/Makefile.objs
@@ -1,2 +1,3 @@
 common-obj-$(CONFIG_DIMM) += dimm.o
 common-obj-$(CONFIG_MEM_HOTPLUG) += pc-dimm.o
+common-obj-$(CONFIG_NVDIMM) += nvdimm.o
diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
new file mode 100644
index 000..185aa1a
--- /dev/null
+++ b/hw/mem/nvdimm.c
@@ -0,0 +1,113 @@
+/*
+ * Non-Volatile Dual In-line Memory Module Virtualization Implementation
+ *
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Author:
+ *  Xiao Guangrong 
+ *
+ * Currently, it only supports PMEM Virtualization.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see 
+ */
+
+#include "qapi/visitor.h"
+#include "hw/mem/nvdimm.h"
+
+static MemoryRegion *nvdimm_get_memory_region(DIMMDevice *dimm)
+{
+NVDIMMDevice *nvdimm = NVDIMM(dimm);
+
+return memory_region_size(>nvdimm_mr) ? >nvdimm_mr : NULL;
+}
+
+static void nvdimm_realize(DIMMDevice *dimm, Error **errp)
+{
+MemoryRegion *mr;
+NVDIMMDevice *nvdimm = NVDIMM(dimm);
+uint64_t size;
+
+nvdimm->label_size = MIN_NAMESPACE_LABEL_SIZE;
+
+mr = host_memory_backend_get_memory(dimm->hostmem, errp);
+size = memory_region_size(mr);
+
+if (size <= nvdimm->label_size) {
+char *path = 
object_get_canonical_path_component(OBJECT(dimm->hostmem));
+error_setg(errp, "the size of memdev %s (0x%" PRIx64 ") is too small"
+   " to contain nvdimm namespace label (0x%" PRIx64 ")", path,
+   memory_region_size(mr), nvdimm->label_size);
+return;
+}
+
+memory_region_init_alias(>nvdimm_mr, OBJECT(dimm), "nvdimm-memory",
+ mr, 0, size - nvdimm->label_size);
+nvdimm->label_data = memory_region_get_ram_ptr(mr) +
+ memory_region_size(>nvdimm_mr);
+}
+
+static void nvdimm_read_label_data(NVDIMMDevice *nvdimm, void *buf,
+

[PATCH v6 03/33] acpi: add aml_create_field

2015-10-30 Thread Xiao Guangrong

Implement CreateField term which is used by NVDIMM _DSM method in later patch

Signed-off-by: Xiao Guangrong 
---
 hw/acpi/aml-build.c | 13 +
 include/hw/acpi/aml-build.h |  1 +
 2 files changed, 14 insertions(+)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index a72214d..9fe5e7b 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -1151,6 +1151,19 @@ Aml *aml_sizeof(Aml *arg)
 return var;
 }
 
+/* ACPI 1.0b: 16.2.5.2 Named Objects Encoding: DefCreateField */
+Aml *aml_create_field(Aml *srcbuf, Aml *index, Aml *len, const char *name)
+{
+Aml *var = aml_alloc();
+build_append_byte(var->buf, 0x5B); /* ExtOpPrefix */
+build_append_byte(var->buf, 0x13); /* CreateFieldOp */
+aml_append(var, srcbuf);
+aml_append(var, index);
+aml_append(var, len);
+build_append_namestring(var->buf, "%s", name);
+return var;
+}
+
 void
 build_header(GArray *linker, GArray *table_data,
  AcpiTableHeader *h, const char *sig, int len, uint8_t rev)
diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h
index 7296efb..7e1c43b 100644
--- a/include/hw/acpi/aml-build.h
+++ b/include/hw/acpi/aml-build.h
@@ -276,6 +276,7 @@ Aml *aml_touuid(const char *uuid);
 Aml *aml_unicode(const char *str);
 Aml *aml_derefof(Aml *arg);
 Aml *aml_sizeof(Aml *arg);
+Aml *aml_create_field(Aml *srcbuf, Aml *index, Aml *len, const char *name);
 
 void
 build_header(GArray *linker, GArray *table_data,
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v6 14/33] pc-dimm: drop the prefix of pc-dimm

2015-10-30 Thread Xiao Guangrong

This patch is generated by this script:

find ./ -name "*.[ch]" -o -name "*.json" -o -name "trace-events" -type f \
| xargs sed -i "s/PC_DIMM/DIMM/g"

find ./ -name "*.[ch]" -o -name "*.json" -o -name "trace-events" -type f \
| xargs sed -i "s/PCDIMM/DIMM/g"

find ./ -name "*.[ch]" -o -name "*.json" -o -name "trace-events" -type f \
| xargs sed -i "s/pc_dimm/dimm/g"

find ./ -name "trace-events" -type f | xargs sed -i "s/pc-dimm/dimm/g"

It prepares the work which abstracts dimm device type for both pc-dimm and
nvdimm

Signed-off-by: Xiao Guangrong 
---
 hmp.c   |   2 +-
 hw/acpi/ich9.c  |   6 +-
 hw/acpi/memory_hotplug.c|  16 ++---
 hw/acpi/piix4.c |   6 +-
 hw/i386/pc.c|  32 -
 hw/mem/pc-dimm.c| 148 
 hw/ppc/spapr.c  |  18 ++---
 include/hw/mem/pc-dimm.h|  62 -
 numa.c  |   2 +-
 qapi-schema.json|   8 +--
 qmp.c   |   2 +-
 stubs/qmp_pc_dimm_device_list.c |   2 +-
 trace-events|   8 +--
 13 files changed, 156 insertions(+), 156 deletions(-)

diff --git a/hmp.c b/hmp.c
index 5048eee..5c617d2 100644
--- a/hmp.c
+++ b/hmp.c
@@ -1952,7 +1952,7 @@ void hmp_info_memory_devices(Monitor *mon, const QDict 
*qdict)
 MemoryDeviceInfoList *info_list = qmp_query_memory_devices();
 MemoryDeviceInfoList *info;
 MemoryDeviceInfo *value;
-PCDIMMDeviceInfo *di;
+DIMMDeviceInfo *di;
 
 for (info = info_list; info; info = info->next) {
 value = info->value;
diff --git a/hw/acpi/ich9.c b/hw/acpi/ich9.c
index 1c7fcfa..b0d6a67 100644
--- a/hw/acpi/ich9.c
+++ b/hw/acpi/ich9.c
@@ -440,7 +440,7 @@ void ich9_pm_add_properties(Object *obj, ICH9LPCPMRegs *pm, 
Error **errp)
 void ich9_pm_device_plug_cb(ICH9LPCPMRegs *pm, DeviceState *dev, Error **errp)
 {
 if (pm->acpi_memory_hotplug.is_enabled &&
-object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
+object_dynamic_cast(OBJECT(dev), TYPE_DIMM)) {
 acpi_memory_plug_cb(>acpi_regs, pm->irq, >acpi_memory_hotplug,
 dev, errp);
 } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
@@ -455,7 +455,7 @@ void ich9_pm_device_unplug_request_cb(ICH9LPCPMRegs *pm, 
DeviceState *dev,
   Error **errp)
 {
 if (pm->acpi_memory_hotplug.is_enabled &&
-object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
+object_dynamic_cast(OBJECT(dev), TYPE_DIMM)) {
 acpi_memory_unplug_request_cb(>acpi_regs, pm->irq,
   >acpi_memory_hotplug, dev, errp);
 } else {
@@ -468,7 +468,7 @@ void ich9_pm_device_unplug_cb(ICH9LPCPMRegs *pm, 
DeviceState *dev,
   Error **errp)
 {
 if (pm->acpi_memory_hotplug.is_enabled &&
-object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
+object_dynamic_cast(OBJECT(dev), TYPE_DIMM)) {
 acpi_memory_unplug_cb(>acpi_memory_hotplug, dev, errp);
 } else {
 error_setg(errp, "acpi: device unplug for not supported device"
diff --git a/hw/acpi/memory_hotplug.c b/hw/acpi/memory_hotplug.c
index ce428df..e687852 100644
--- a/hw/acpi/memory_hotplug.c
+++ b/hw/acpi/memory_hotplug.c
@@ -54,23 +54,23 @@ static uint64_t acpi_memory_hotplug_read(void *opaque, 
hwaddr addr,
 o = OBJECT(mdev->dimm);
 switch (addr) {
 case 0x0: /* Lo part of phys address where DIMM is mapped */
-val = o ? object_property_get_int(o, PC_DIMM_ADDR_PROP, NULL) : 0;
+val = o ? object_property_get_int(o, DIMM_ADDR_PROP, NULL) : 0;
 trace_mhp_acpi_read_addr_lo(mem_st->selector, val);
 break;
 case 0x4: /* Hi part of phys address where DIMM is mapped */
-val = o ? object_property_get_int(o, PC_DIMM_ADDR_PROP, NULL) >> 32 : 
0;
+val = o ? object_property_get_int(o, DIMM_ADDR_PROP, NULL) >> 32 : 0;
 trace_mhp_acpi_read_addr_hi(mem_st->selector, val);
 break;
 case 0x8: /* Lo part of DIMM size */
-val = o ? object_property_get_int(o, PC_DIMM_SIZE_PROP, NULL) : 0;
+val = o ? object_property_get_int(o, DIMM_SIZE_PROP, NULL) : 0;
 trace_mhp_acpi_read_size_lo(mem_st->selector, val);
 break;
 case 0xc: /* Hi part of DIMM size */
-val = o ? object_property_get_int(o, PC_DIMM_SIZE_PROP, NULL) >> 32 : 
0;
+val = o ? object_property_get_int(o, DIMM_SIZE_PROP, NULL) >> 32 : 0;
 trace_mhp_acpi_read_size_hi(mem_st->selector, val);
 break;
 case 0x10: /* node proximity for _PXM method */
-val = o ? object_property_get_int(o, PC_DIMM_NODE_PROP, NULL) : 0;
+val = o ? object_property_get_int(o, DIMM_NODE_PROP, NULL) : 0;
 trace_mhp_acpi_read_pxm(mem_st->selector, val);
 break;
 case 0x14: /* pack and return is_*

[PATCH v6 12/33] pc-dimm: remove DEFAULT_PC_DIMMSIZE

2015-10-30 Thread Xiao Guangrong

It's not used any more

Signed-off-by: Xiao Guangrong 
---
 include/hw/mem/pc-dimm.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h
index d83bf30..11a8937 100644
--- a/include/hw/mem/pc-dimm.h
+++ b/include/hw/mem/pc-dimm.h
@@ -20,8 +20,6 @@
 #include "sysemu/hostmem.h"
 #include "hw/qdev.h"
 
-#define DEFAULT_PC_DIMMSIZE (1024*1024*1024)
-
 #define TYPE_PC_DIMM "pc-dimm"
 #define PC_DIMM(obj) \
 OBJECT_CHECK(PCDIMMDevice, (obj), TYPE_PC_DIMM)
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v6 05/33] acpi: add aml_object_type

2015-10-30 Thread Xiao Guangrong

Implement ObjectType which is used by NVDIMM _DSM method in
later patch

Signed-off-by: Xiao Guangrong 
---
 hw/acpi/aml-build.c | 8 
 include/hw/acpi/aml-build.h | 1 +
 2 files changed, 9 insertions(+)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index efc06ab..9f792ab 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -1178,6 +1178,14 @@ Aml *aml_concatenate(Aml *source1, Aml *source2, Aml 
*target)
 return var;
 }
 
+/* ACPI 1.0b: 16.2.5.4 Type 2 Opcodes Encoding: DefObjectType */
+Aml *aml_object_type(Aml *object)
+{
+Aml *var = aml_opcode(0x8E /* ObjectTypeOp */);
+aml_append(var, object);
+return var;
+}
+
 void
 build_header(GArray *linker, GArray *table_data,
  AcpiTableHeader *h, const char *sig, int len, uint8_t rev)
diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h
index 325782d..5b8a118 100644
--- a/include/hw/acpi/aml-build.h
+++ b/include/hw/acpi/aml-build.h
@@ -278,6 +278,7 @@ Aml *aml_derefof(Aml *arg);
 Aml *aml_sizeof(Aml *arg);
 Aml *aml_create_field(Aml *srcbuf, Aml *index, Aml *len, const char *name);
 Aml *aml_concatenate(Aml *source1, Aml *source2, Aml *target);
+Aml *aml_object_type(Aml *object);
 
 void
 build_header(GArray *linker, GArray *table_data,
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v6 16/33] pc-dimm: rename pc-dimm.c and pc-dimm.h

2015-10-30 Thread Xiao Guangrong

Rename:
   pc-dimm.c => dimm.c
   pc-dimm.h => dimm.h

It prepares the work which abstracts dimm device type for both pc-dimm and
nvdimm

Signed-off-by: Xiao Guangrong 
---
 hw/Makefile.objs | 2 +-
 hw/acpi/ich9.c   | 2 +-
 hw/acpi/memory_hotplug.c | 4 ++--
 hw/acpi/piix4.c  | 2 +-
 hw/i386/pc.c | 2 +-
 hw/mem/Makefile.objs | 2 +-
 hw/mem/{pc-dimm.c => dimm.c} | 2 +-
 hw/ppc/spapr.c   | 2 +-
 include/hw/i386/pc.h | 2 +-
 include/hw/mem/{pc-dimm.h => dimm.h} | 0
 include/hw/ppc/spapr.h   | 2 +-
 numa.c   | 2 +-
 qmp.c| 2 +-
 stubs/qmp_dimm_device_list.c | 2 +-
 14 files changed, 14 insertions(+), 14 deletions(-)
 rename hw/mem/{pc-dimm.c => dimm.c} (99%)
 rename include/hw/mem/{pc-dimm.h => dimm.h} (100%)

diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index 7e7c241..12ecda9 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -30,8 +30,8 @@ devices-dirs-$(CONFIG_SOFTMMU) += vfio/
 devices-dirs-$(CONFIG_VIRTIO) += virtio/
 devices-dirs-$(CONFIG_SOFTMMU) += watchdog/
 devices-dirs-$(CONFIG_SOFTMMU) += xen/
-devices-dirs-$(CONFIG_MEM_HOTPLUG) += mem/
 devices-dirs-$(CONFIG_SMBIOS) += smbios/
+devices-dirs-y += mem/
 devices-dirs-y += core/
 common-obj-y += $(devices-dirs-y)
 obj-y += $(devices-dirs-y)
diff --git a/hw/acpi/ich9.c b/hw/acpi/ich9.c
index b0d6a67..1e9ae20 100644
--- a/hw/acpi/ich9.c
+++ b/hw/acpi/ich9.c
@@ -35,7 +35,7 @@
 #include "exec/address-spaces.h"
 
 #include "hw/i386/ich9.h"
-#include "hw/mem/pc-dimm.h"
+#include "hw/mem/dimm.h"
 
 //#define DEBUG
 
diff --git a/hw/acpi/memory_hotplug.c b/hw/acpi/memory_hotplug.c
index e687852..20d3093 100644
--- a/hw/acpi/memory_hotplug.c
+++ b/hw/acpi/memory_hotplug.c
@@ -1,6 +1,6 @@
 #include "hw/acpi/memory_hotplug.h"
 #include "hw/acpi/pc-hotplug.h"
-#include "hw/mem/pc-dimm.h"
+#include "hw/mem/dimm.h"
 #include "hw/boards.h"
 #include "hw/qdev-core.h"
 #include "trace.h"
@@ -148,7 +148,7 @@ static void acpi_memory_hotplug_write(void *opaque, hwaddr 
addr, uint64_t data,
 
 dev = DEVICE(mdev->dimm);
 hotplug_ctrl = qdev_get_hotplug_handler(dev);
-/* call pc-dimm unplug cb */
+/* call dimm unplug cb */
 hotplug_handler_unplug(hotplug_ctrl, dev, _err);
 if (local_err) {
 trace_mhp_acpi_dimm_delete_failed(mem_st->selector);
diff --git a/hw/acpi/piix4.c b/hw/acpi/piix4.c
index 0b2cb6e..b2f5b2c 100644
--- a/hw/acpi/piix4.c
+++ b/hw/acpi/piix4.c
@@ -33,7 +33,7 @@
 #include "hw/acpi/pcihp.h"
 #include "hw/acpi/cpu_hotplug.h"
 #include "hw/hotplug.h"
-#include "hw/mem/pc-dimm.h"
+#include "hw/mem/dimm.h"
 #include "hw/acpi/memory_hotplug.h"
 #include "hw/acpi/acpi_dev_interface.h"
 #include "hw/xen/xen.h"
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 67ecc4f..6bf569a 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -62,7 +62,7 @@
 #include "hw/boards.h"
 #include "hw/pci/pci_host.h"
 #include "acpi-build.h"
-#include "hw/mem/pc-dimm.h"
+#include "hw/mem/dimm.h"
 #include "qapi/visitor.h"
 #include "qapi-visit.h"
 
diff --git a/hw/mem/Makefile.objs b/hw/mem/Makefile.objs
index b000fb4..7563ef5 100644
--- a/hw/mem/Makefile.objs
+++ b/hw/mem/Makefile.objs
@@ -1 +1 @@
-common-obj-$(CONFIG_MEM_HOTPLUG) += pc-dimm.o
+common-obj-$(CONFIG_MEM_HOTPLUG) += dimm.o
diff --git a/hw/mem/pc-dimm.c b/hw/mem/dimm.c
similarity index 99%
rename from hw/mem/pc-dimm.c
rename to hw/mem/dimm.c
index 67afc53..9f55cee 100644
--- a/hw/mem/pc-dimm.c
+++ b/hw/mem/dimm.c
@@ -18,7 +18,7 @@
  * License along with this library; if not, see 
  */
 
-#include "hw/mem/pc-dimm.h"
+#include "hw/mem/dimm.h"
 #include "qemu/config-file.h"
 #include "qapi/visitor.h"
 #include "qemu/range.h"
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index ab6eb83..9ff24fd 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2199,7 +2199,7 @@ static void spapr_machine_device_plug(HotplugHandler 
*hotplug_dev,
  *
  * - Memory gets hotplugged to a different node than what the user
  *   specified.
- * - Since pc-dimm subsystem in QEMU still thinks that memory belongs
+ * - Since dimm subsystem in QEMU still thinks that memory belongs
  *   to memory-less node, a reboot will set things accordingly
  *   and the previously hotplugged memory now ends in the right node.
  *   This appears as if some memory moved from one node to another.
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 606dbc2..62e8fb5 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -16,7 +16,7 @@
 #include "hw/pci/pci.h"
 #include "hw/boards.h"
 #include "hw/compat.h"
-#include "hw/mem/pc-dimm.h"
+#include "hw/mem/dimm.h"
 
 #define HPET_INTCAP "hpet-intcap"
 
diff --git

[PATCH v6 19/33] dimm: keep the state of the whole backend memory

2015-10-30 Thread Xiao Guangrong

QEMU keeps the state of the memory of a dimm device during live migration;
however, this is not enough for an nvdimm device because its mapped memory
does not contain its label data, so we should keep the state of the whole
backend memory instead.

Signed-off-by: Xiao Guangrong 
---
 hw/mem/dimm.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/hw/mem/dimm.c b/hw/mem/dimm.c
index 498d380..7d1 100644
--- a/hw/mem/dimm.c
+++ b/hw/mem/dimm.c
@@ -134,9 +134,16 @@ void dimm_memory_plug(DeviceState *dev, MemoryHotplugState 
*hpms,
 }
 
 memory_region_add_subregion(>mr, addr - hpms->base, mr);
-vmstate_register_ram(mr, dev);
 numa_set_mem_node_id(addr, memory_region_size(mr), dimm->node);
 
+/*
+ * save the state only for @mr is not enough as it does not contain
+ * the label data of NVDIMM device, so that we keep the state of
+ * whole hostmem instead.
+ */
+vmstate_register_ram(host_memory_backend_get_memory(dimm->hostmem, errp),
+ dev);
+
 out:
 error_propagate(errp, local_err);
 }
@@ -145,10 +152,13 @@ void dimm_memory_unplug(DeviceState *dev, 
MemoryHotplugState *hpms,
MemoryRegion *mr)
 {
 DIMMDevice *dimm = DIMM(dev);
+MemoryRegion *backend_mr;
+
+backend_mr = host_memory_backend_get_memory(dimm->hostmem, _abort);
 
 numa_unset_mem_node_id(dimm->addr, memory_region_size(mr), dimm->node);
 memory_region_del_subregion(>mr, mr);
-vmstate_unregister_ram(mr, dev);
+vmstate_unregister_ram(backend_mr, dev);
 }
 
 int qmp_dimm_device_list(Object *obj, void *opaque)
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v6 22/33] docs: add NVDIMM ACPI documentation

2015-10-30 Thread Xiao Guangrong

It describes the basic concepts of NVDIMM ACPI and the interface
between QEMU and the ACPI BIOS

Signed-off-by: Xiao Guangrong 
---
 docs/specs/acpi_nvdimm.txt | 179 +
 1 file changed, 179 insertions(+)
 create mode 100644 docs/specs/acpi_nvdimm.txt

diff --git a/docs/specs/acpi_nvdimm.txt b/docs/specs/acpi_nvdimm.txt
new file mode 100644
index 000..cc5db2c
--- /dev/null
+++ b/docs/specs/acpi_nvdimm.txt
@@ -0,0 +1,179 @@
+QEMU<->ACPI BIOS NVDIMM interface
+-
+
+QEMU supports NVDIMM via ACPI. This document describes the basic concepts of
+NVDIMM ACPI and the interface between QEMU and the ACPI BIOS.
+
+NVDIMM ACPI Background
+--
+NVDIMM is introduced in ACPI 6.0 which defines an NVDIMM root device under
+_SB scope with a _HID of “ACPI0012”. For each NVDIMM present or intended
+to be supported by platform, platform firmware also exposes an ACPI
+Namespace Device under the root device.
+
+The NVDIMM child devices under the NVDIMM root device are defined with _ADR
+corresponding to the NFIT device handle. The NVDIMM root device and the
+NVDIMM devices can have device specific methods (_DSM) to provide additional
+functions specific to a particular NVDIMM implementation.
+
+This is an example from ACPI 6.0, a platform contains one NVDIMM:
+
+Scope (\_SB){
+   Device (NVDR) // Root device
+   {
+  Name (_HID, “ACPI0012”)
+  Method (_STA) {...}
+  Method (_FIT) {...}
+  Method (_DSM, ...) {...}
+  Device (NVD)
+  {
+ Name(_ADR, h) //where h is NFIT Device Handle for this NVDIMM
+ Method (_DSM, ...) {...}
+  }
+   }
+}
+
+Methods supported on both NVDIMM root device and NVDIMM device are
+1) _STA(Status)
+   It returns the current status of a device, which can be one of the
+   following: enabled, disabled, or removed.
+
+   Arguments: None
+
+   Return Value:
+   It returns an Integer which is defined as follows:
+   Bit [0] – Set if the device is present.
+   Bit [1] – Set if the device is enabled and decoding its resources.
+   Bit [2] – Set if the device should be shown in the UI.
+   Bit [3] – Set if the device is functioning properly (cleared if device
+ failed its diagnostics).
+   Bit [4] – Set if the battery is present.
+   Bits [31:5] – Reserved (must be cleared).
+
+2) _DSM (Device Specific Method)
+   It is a control method that enables devices to provide device specific
+   control functions that are consumed by the device driver.
+   The NVDIMM DSM specification can be found at:
+http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf
+
+   Arguments:
+   Arg0 – A Buffer containing a UUID (16 Bytes)
+   Arg1 – An Integer containing the Revision ID (4 Bytes)
+   Arg2 – An Integer containing the Function Index (4 Bytes)
+   Arg3 – A package containing parameters for the function specified by the
+  UUID, Revision ID, and Function Index
+
+   Return Value:
+   If Function Index = 0, a Buffer containing a function index bitfield.
+   Otherwise, the return value and type depends on the UUID, revision ID
+   and function index which are described in the DSM specification.
+
+Methods on NVDIMM ROOT Device
+_FIT(Firmware Interface Table)
+   It evaluates to a buffer returning data in the format of a series of NFIT
+   Type Structure.
+
+   Arguments: None
+
+   Return Value:
+   A Buffer containing a list of NFIT Type structure entries.
+
+   The detailed definition of the structure can be found at ACPI 6.0: 5.2.25
+   NVDIMM Firmware Interface Table (NFIT).
+
+QEMU NVDIMM Implementation
+
+QEMU reserves a page starting from 0xFF00000 and 4 bytes IO Port starting
+from 0x0a18 for NVDIMM ACPI.
+
+Memory 0xFF00000 - 0xFF00FFF:
+   This page is RAM-based and it is used to transfer data between _DSM
+   method and QEMU. If ACPI has control, this page is owned by ACPI which
+   writes _DSM input data to it, otherwise, it is owned by QEMU which
+   emulates _DSM access and writes the output data to it.
+
+   ACPI Writes _DSM Input Data:
+   [0xFF00000 - 0xFF00003]: 4 bytes, NVDIMM Device Handle, 0 is reserved
+for NVDIMM Root device.
+   [0xFF00004 - 0xFF00007]: 4 bytes, Revision ID, that is the Arg1 of _DSM
+method.
+   [0xFF00008 - 0xFF0000B]: 4 bytes, Function Index, that is the Arg2 of
+_DSM method.
+   [0xFF0000C - 0xFF00FFF]: 4084 bytes, the Arg3 of _DSM method
+
+   QEMU Writes Output Data:
+   [0xFF00000 - 0xFF00FFF]: the DSM return result filled by QEMU
+
+IO Port 0x0a18 - 0xa1b:
+   ACPI uses it to transfer control from guest to QEMU and read the size
+   of return result filled by QEMU
+
+   Read Access:
+   [0x0a18 - 0xa1b]: 4 bytes, the buffer size of _DSM output data.
+
+_DSM process diagram:
+-
+The page, 0xFF00000 - 0xFF00FFF, is used by _DSM Virtualization.
+

[PATCH v6 04/33] acpi: add aml_concatenate

2015-10-30 Thread Xiao Guangrong

Implement Concatenate term which is used by NVDIMM _DSM method
in later patch

Signed-off-by: Xiao Guangrong 
---
 hw/acpi/aml-build.c | 14 ++
 include/hw/acpi/aml-build.h |  1 +
 2 files changed, 15 insertions(+)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index 9fe5e7b..efc06ab 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -1164,6 +1164,20 @@ Aml *aml_create_field(Aml *srcbuf, Aml *index, Aml *len, 
const char *name)
 return var;
 }
 
+/* ACPI 1.0b: 16.2.5.4 Type 2 Opcodes Encoding: DefConcat */
+Aml *aml_concatenate(Aml *source1, Aml *source2, Aml *target)
+{
+Aml *var = aml_opcode(0x73 /* ConcatOp */);
+aml_append(var, source1);
+aml_append(var, source2);
+
+if (target) {
+aml_append(var, target);
+}
+
+return var;
+}
+
 void
 build_header(GArray *linker, GArray *table_data,
  AcpiTableHeader *h, const char *sig, int len, uint8_t rev)
diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h
index 7e1c43b..325782d 100644
--- a/include/hw/acpi/aml-build.h
+++ b/include/hw/acpi/aml-build.h
@@ -277,6 +277,7 @@ Aml *aml_unicode(const char *str);
 Aml *aml_derefof(Aml *arg);
 Aml *aml_sizeof(Aml *arg);
 Aml *aml_create_field(Aml *srcbuf, Aml *index, Aml *len, const char *name);
+Aml *aml_concatenate(Aml *source1, Aml *source2, Aml *target);
 
 void
 build_header(GArray *linker, GArray *table_data,
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v6 17/33] dimm: abstract dimm device from pc-dimm

2015-10-30 Thread Xiao Guangrong

A base device, dimm, is abstracted from pc-dimm, so that we can
build the nvdimm device on top of dimm in a later patch

Signed-off-by: Xiao Guangrong 
---
 default-configs/i386-softmmu.mak   |  1 +
 default-configs/ppc64-softmmu.mak  |  1 +
 default-configs/x86_64-softmmu.mak |  1 +
 hw/mem/Makefile.objs   |  3 ++-
 hw/mem/dimm.c  | 11 ++---
 hw/mem/pc-dimm.c   | 46 ++
 include/hw/mem/dimm.h  |  4 ++--
 include/hw/mem/pc-dimm.h   |  7 ++
 8 files changed, 62 insertions(+), 12 deletions(-)
 create mode 100644 hw/mem/pc-dimm.c
 create mode 100644 include/hw/mem/pc-dimm.h

diff --git a/default-configs/i386-softmmu.mak b/default-configs/i386-softmmu.mak
index 43c96d1..3ece8bb 100644
--- a/default-configs/i386-softmmu.mak
+++ b/default-configs/i386-softmmu.mak
@@ -18,6 +18,7 @@ CONFIG_FDC=y
 CONFIG_ACPI=y
 CONFIG_ACPI_X86=y
 CONFIG_ACPI_X86_ICH=y
+CONFIG_DIMM=y
 CONFIG_ACPI_MEMORY_HOTPLUG=y
 CONFIG_ACPI_CPU_HOTPLUG=y
 CONFIG_APM=y
diff --git a/default-configs/ppc64-softmmu.mak 
b/default-configs/ppc64-softmmu.mak
index bb71b23..482b8a1 100644
--- a/default-configs/ppc64-softmmu.mak
+++ b/default-configs/ppc64-softmmu.mak
@@ -54,3 +54,4 @@ CONFIG_XICS_KVM=$(and $(CONFIG_PSERIES),$(CONFIG_KVM))
 CONFIG_MC146818RTC=y
 CONFIG_ISA_TESTDEV=y
 CONFIG_MEM_HOTPLUG=y
+CONFIG_DIMM=y
diff --git a/default-configs/x86_64-softmmu.mak 
b/default-configs/x86_64-softmmu.mak
index dfb8095..92ea7c1 100644
--- a/default-configs/x86_64-softmmu.mak
+++ b/default-configs/x86_64-softmmu.mak
@@ -18,6 +18,7 @@ CONFIG_FDC=y
 CONFIG_ACPI=y
 CONFIG_ACPI_X86=y
 CONFIG_ACPI_X86_ICH=y
+CONFIG_DIMM=y
 CONFIG_ACPI_MEMORY_HOTPLUG=y
 CONFIG_ACPI_CPU_HOTPLUG=y
 CONFIG_APM=y
diff --git a/hw/mem/Makefile.objs b/hw/mem/Makefile.objs
index 7563ef5..cebb4b1 100644
--- a/hw/mem/Makefile.objs
+++ b/hw/mem/Makefile.objs
@@ -1 +1,2 @@
-common-obj-$(CONFIG_MEM_HOTPLUG) += dimm.o
+common-obj-$(CONFIG_DIMM) += dimm.o
+common-obj-$(CONFIG_MEM_HOTPLUG) += pc-dimm.o
diff --git a/hw/mem/dimm.c b/hw/mem/dimm.c
index 9f55cee..4a63409 100644
--- a/hw/mem/dimm.c
+++ b/hw/mem/dimm.c
@@ -1,5 +1,5 @@
 /*
- * Dimm device for Memory Hotplug
+ * Dimm device abstraction
  *
  * Copyright ProfitBricks GmbH 2012
  * Copyright (C) 2014 Red Hat Inc
@@ -429,21 +429,13 @@ static void dimm_realize(DeviceState *dev, Error **errp)
 }
 }
 
-static MemoryRegion *dimm_get_memory_region(DIMMDevice *dimm)
-{
-return host_memory_backend_get_memory(dimm->hostmem, _abort);
-}
-
 static void dimm_class_init(ObjectClass *oc, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(oc);
-DIMMDeviceClass *ddc = DIMM_CLASS(oc);
 
 dc->realize = dimm_realize;
 dc->props = dimm_properties;
 dc->desc = "DIMM memory module";
-
-ddc->get_memory_region = dimm_get_memory_region;
 }
 
 static TypeInfo dimm_info = {
@@ -453,6 +445,7 @@ static TypeInfo dimm_info = {
 .instance_init = dimm_init,
 .class_init= dimm_class_init,
 .class_size= sizeof(DIMMDeviceClass),
+.abstract  = true,
 };
 
 static void dimm_register_types(void)
diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c
new file mode 100644
index 000..38323e9
--- /dev/null
+++ b/hw/mem/pc-dimm.c
@@ -0,0 +1,46 @@
+/*
+ * Dimm device for Memory Hotplug
+ *
+ * Copyright ProfitBricks GmbH 2012
+ * Copyright (C) 2014 Red Hat Inc
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>
+ */
+
+#include "hw/mem/pc-dimm.h"
+
+static MemoryRegion *pc_dimm_get_memory_region(DIMMDevice *dimm)
+{
+return host_memory_backend_get_memory(dimm->hostmem, &error_abort);
+}
+
+static void pc_dimm_class_init(ObjectClass *oc, void *data)
+{
+DIMMDeviceClass *ddc = DIMM_CLASS(oc);
+
+ddc->get_memory_region = pc_dimm_get_memory_region;
+}
+
+static TypeInfo pc_dimm_info = {
+.name  = TYPE_PC_DIMM,
+.parent= TYPE_DIMM,
+.class_init= pc_dimm_class_init,
+};
+
+static void pc_dimm_register_types(void)
+{
+type_register_static(&pc_dimm_info);
+}
+
+type_init(pc_dimm_register_types)
diff --git a/include/hw/mem/dimm.h b/include/hw/mem/dimm.h
index ece8786..50f768a 100644
--- a/include/hw/mem/dimm.h
+++ b/include/hw/mem/dimm.h
@@ -1,5 +1,5 @@
 /*
- * PC DIMM device
+ * Dimm device abstraction
  *
  * Copyright ProfitBricks GmbH 2012
  *

[PATCH v6 09/33] exec: allow file_ram_alloc to work on file

2015-10-30 Thread Xiao Guangrong

Currently, file_ram_alloc() only works on a directory - it creates a file
under @path and does mmap on it

This patch allows it to work on a file directly: if @path is a
directory it works as before, otherwise it treats @path as the target
file and directly allocates memory from it

Signed-off-by: Xiao Guangrong 
---
 exec.c | 80 ++
 1 file changed, 51 insertions(+), 29 deletions(-)

diff --git a/exec.c b/exec.c
index 3ca7e50..f219010 100644
--- a/exec.c
+++ b/exec.c
@@ -1174,14 +1174,60 @@ void qemu_mutex_unlock_ramlist(void)
 }
 
 #ifdef __linux__
+static bool path_is_dir(const char *path)
+{
+struct stat fs;
+
+return stat(path, &fs) == 0 && S_ISDIR(fs.st_mode);
+}
+
+static int open_file_path(RAMBlock *block, const char *path, size_t size)
+{
+char *filename;
+char *sanitized_name;
+char *c;
+int fd;
+
+if (!path_is_dir(path)) {
+int flags = (block->flags & RAM_SHARED) ? O_RDWR : O_RDONLY;
+
+flags |= O_EXCL;
+return open(path, flags);
+}
+
+/* Make name safe to use with mkstemp by replacing '/' with '_'. */
+sanitized_name = g_strdup(memory_region_name(block->mr));
+for (c = sanitized_name; *c != '\0'; c++) {
+if (*c == '/') {
+*c = '_';
+}
+}
+filename = g_strdup_printf("%s/qemu_back_mem.%s.XX", path,
+   sanitized_name);
+g_free(sanitized_name);
+fd = mkstemp(filename);
+if (fd >= 0) {
+unlink(filename);
+/*
+ * ftruncate is not supported by hugetlbfs in older
+ * hosts, so don't bother bailing out on errors.
+ * If anything goes wrong with it under other filesystems,
+ * mmap will fail.
+ */
+if (ftruncate(fd, size)) {
+perror("ftruncate");
+}
+}
+g_free(filename);
+
+return fd;
+}
+
 static void *file_ram_alloc(RAMBlock *block,
 ram_addr_t memory,
 const char *path,
 Error **errp)
 {
-char *filename;
-char *sanitized_name;
-char *c;
 void *area;
 int fd;
 uint64_t pagesize;
@@ -1211,38 +1257,14 @@ static void *file_ram_alloc(RAMBlock *block,
 goto error;
 }
 
-/* Make name safe to use with mkstemp by replacing '/' with '_'. */
-sanitized_name = g_strdup(memory_region_name(block->mr));
-for (c = sanitized_name; *c != '\0'; c++) {
-if (*c == '/')
-*c = '_';
-}
-
-filename = g_strdup_printf("%s/qemu_back_mem.%s.XX", path,
-   sanitized_name);
-g_free(sanitized_name);
+memory = ROUND_UP(memory, pagesize);
 
-fd = mkstemp(filename);
+fd = open_file_path(block, path, memory);
 if (fd < 0) {
 error_setg_errno(errp, errno,
  "unable to create backing store for path %s", path);
-g_free(filename);
 goto error;
 }
-unlink(filename);
-g_free(filename);
-
-memory = ROUND_UP(memory, pagesize);
-
-/*
- * ftruncate is not supported by hugetlbfs in older
- * hosts, so don't bother bailing out on errors.
- * If anything goes wrong with it under other filesystems,
- * mmap will fail.
- */
-if (ftruncate(fd, memory)) {
-perror("ftruncate");
-}
 
 area = qemu_ram_mmap(fd, memory, pagesize, block->flags & RAM_SHARED);
 if (area == MAP_FAILED) {
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 18/21] KVM: ARM64: Add PMU overflow interrupt routing

2015-10-30 Thread Shannon Zhao

From: Shannon Zhao 

When calling perf_event_create_kernel_counter to create a perf_event,
assign an overflow handler. Then when the perf event overflows, set
irq_pending and call kvm_vcpu_kick() to sync the interrupt.

Signed-off-by: Shannon Zhao 
---
 arch/arm/kvm/arm.c|  4 +++
 include/kvm/arm_pmu.h |  4 +++
 virt/kvm/arm/pmu.c| 76 ++-
 3 files changed, 83 insertions(+), 1 deletion(-)

diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 78b2869..9c0fec4 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -28,6 +28,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
@@ -551,6 +552,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct 
kvm_run *run)
 
if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) {
local_irq_enable();
+   kvm_pmu_sync_hwstate(vcpu);
kvm_vgic_sync_hwstate(vcpu);
preempt_enable();
kvm_timer_sync_hwstate(vcpu);
@@ -598,6 +600,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct 
kvm_run *run)
kvm_guest_exit();
trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
 
+   kvm_pmu_post_sync_hwstate(vcpu);
+
kvm_vgic_sync_hwstate(vcpu);
 
preempt_enable();
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index acd025a..5e7f943 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -39,6 +39,8 @@ struct kvm_pmu {
 };
 
 #ifdef CONFIG_KVM_ARM_PMU
+void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu);
+void kvm_pmu_post_sync_hwstate(struct kvm_vcpu *vcpu);
 unsigned long kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u32 select_idx);
 void kvm_pmu_disable_counter(struct kvm_vcpu *vcpu, u32 val);
 void kvm_pmu_enable_counter(struct kvm_vcpu *vcpu, u32 val, bool all_enable);
@@ -49,6 +51,8 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, 
u32 data,
u32 select_idx);
 void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u32 val);
 #else
+void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) {}
+void kvm_pmu_post_sync_hwstate(struct kvm_vcpu *vcpu) {}
 unsigned long kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u32 select_idx)
 {
return 0;
diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
index 11d1bfb..6d48d9a 100644
--- a/virt/kvm/arm/pmu.c
+++ b/virt/kvm/arm/pmu.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /**
  * kvm_pmu_get_counter_value - get PMU counter value
@@ -69,6 +70,78 @@ static void kvm_pmu_stop_counter(struct kvm_pmc *pmc)
 }
 
 /**
+ * kvm_pmu_sync_hwstate - sync pmu state for cpu
+ * @vcpu: The vcpu pointer
+ *
+ * Inject virtual PMU IRQ if IRQ is pending for this cpu.
+ */
+void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu)
+{
+   struct kvm_pmu *pmu = &vcpu->arch.pmu;
+   u32 overflow;
+
+   if (!vcpu_mode_is_32bit(vcpu))
+   overflow = vcpu_sys_reg(vcpu, PMOVSSET_EL0);
+   else
+   overflow = vcpu_cp15(vcpu, c9_PMOVSSET);
+
+   if ((pmu->irq_pending || overflow != 0) && (pmu->irq_num != -1))
+   kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, pmu->irq_num, 1);
+
+   pmu->irq_pending = false;
+}
+
+/**
+ * kvm_pmu_post_sync_hwstate - post sync pmu state for cpu
+ * @vcpu: The vcpu pointer
+ *
+ * Inject virtual PMU IRQ if IRQ is pending for this cpu when back from guest.
+ */
+void kvm_pmu_post_sync_hwstate(struct kvm_vcpu *vcpu)
+{
+   struct kvm_pmu *pmu = >arch.pmu;
+
+   if (pmu->irq_pending && (pmu->irq_num != -1))
+   kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, pmu->irq_num, 1);
+
+   pmu->irq_pending = false;
+}
+
+/**
+ * When perf event overflows, set irq_pending and call kvm_vcpu_kick() to 
inject
+ * the interrupt.
+ */
+static void kvm_pmu_perf_overflow(struct perf_event *perf_event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+   struct kvm_pmc *pmc = perf_event->overflow_handler_context;
+   struct kvm_vcpu *vcpu = pmc->vcpu;
+   struct kvm_pmu *pmu = >arch.pmu;
+   int idx = pmc->idx;
+
+   if (!vcpu_mode_is_32bit(vcpu)) {
+   if ((vcpu_sys_reg(vcpu, PMINTENSET_EL1) >> idx) & 0x1) {
+   __set_bit(idx,
+   (unsigned long *)_sys_reg(vcpu, PMOVSSET_EL0));
+   __set_bit(idx,
+   (unsigned long *)_sys_reg(vcpu, PMOVSCLR_EL0));
+   pmu->irq_pending = true;
+   kvm_vcpu_kick(vcpu);
+   }
+   } else {
+   if ((vcpu_cp15(vcpu, c9_PMINTENSET) >> idx) & 0x1) {
+   __set_bit(idx,
+   (unsigned long *)_cp15(vcpu,

[PATCH v4 13/21] KVM: ARM64: Add reset and access handlers for PMOVSSET and PMOVSCLR register

2015-10-30 Thread Shannon Zhao

From: Shannon Zhao 

Since the reset value of PMOVSSET and PMOVSCLR is UNKNOWN, use
reset_unknown for its reset handler. Add a new case to emulate writing
PMOVSSET or PMOVSCLR register.

When a non-zero value is written to PMOVSSET, pend the PMU interrupt. When
the value written to PMOVSCLR is equal to the current value, clear the
pending PMU interrupt.

Signed-off-by: Shannon Zhao 
---
 arch/arm64/kvm/sys_regs.c | 39 ---
 include/kvm/arm_pmu.h |  4 
 virt/kvm/arm/pmu.c| 30 ++
 3 files changed, 70 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 6d2febf..e03d3b8d 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -552,6 +552,21 @@ static bool access_pmu_regs(struct kvm_vcpu *vcpu,
vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= ~val;
break;
}
+   case PMOVSSET_EL0: {
+   val = *vcpu_reg(vcpu, p->Rt);
+   kvm_pmu_overflow_set(vcpu, val);
+   vcpu_sys_reg(vcpu, r->reg) |= val;
+   vcpu_sys_reg(vcpu, PMOVSCLR_EL0) |= val;
+   break;
+   }
+   case PMOVSCLR_EL0: {
+   val = *vcpu_reg(vcpu, p->Rt);
+   kvm_pmu_overflow_clear(vcpu, val,
+  vcpu_sys_reg(vcpu, r->reg));
+   vcpu_sys_reg(vcpu, r->reg) &= ~val;
+   vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~val;
+   break;
+   }
case PMCR_EL0: {
/* Only update writeable bits of PMCR */
val = vcpu_sys_reg(vcpu, r->reg);
@@ -790,7 +805,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
  access_pmu_regs, reset_unknown, PMCNTENCLR_EL0 },
/* PMOVSCLR_EL0 */
{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b011),
- trap_raz_wi },
+ access_pmu_regs, reset_unknown, PMOVSCLR_EL0 },
/* PMSWINC_EL0 */
{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b100),
  trap_raz_wi },
@@ -817,7 +832,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
  trap_raz_wi },
/* PMOVSSET_EL0 */
{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1110), Op2(0b011),
- trap_raz_wi },
+ access_pmu_regs, reset_unknown, PMOVSSET_EL0 },
 
/* TPIDR_EL0 */
{ Op0(0b11), Op1(0b011), CRn(0b1101), CRm(0b), Op2(0b010),
@@ -1083,6 +1098,21 @@ static bool access_pmu_cp15_regs(struct kvm_vcpu *vcpu,
vcpu_cp15(vcpu, c9_PMINTENSET) &= ~val;
break;
}
+   case c9_PMOVSSET: {
+   val = *vcpu_reg(vcpu, p->Rt);
+   kvm_pmu_overflow_set(vcpu, val);
+   vcpu_cp15(vcpu, r->reg) |= val;
+   vcpu_cp15(vcpu, c9_PMOVSCLR) |= val;
+   break;
+   }
+   case c9_PMOVSCLR: {
+   val = *vcpu_reg(vcpu, p->Rt);
+   kvm_pmu_overflow_clear(vcpu, val,
+  vcpu_cp15(vcpu, r->reg));
+   vcpu_cp15(vcpu, r->reg) &= ~val;
+   vcpu_cp15(vcpu, c9_PMOVSSET) &= ~val;
+   break;
+   }
case c9_PMCR: {
/* Only update writeable bits of PMCR */
val = vcpu_cp15(vcpu, r->reg);
@@ -1162,7 +1192,8 @@ static const struct sys_reg_desc cp15_regs[] = {
  reset_unknown_cp15, c9_PMCNTENSET },
{ Op1( 0), CRn( 9), CRm(12), Op2( 2), access_pmu_cp15_regs,
  reset_unknown_cp15, c9_PMCNTENCLR },
-   { Op1( 0), CRn( 9), CRm(12), Op2( 3), trap_raz_wi },
+   { Op1( 0), CRn( 9), CRm(12), Op2( 3), access_pmu_cp15_regs,
+ reset_unknown_cp15, c9_PMOVSCLR },
{ Op1( 0), CRn( 9), CRm(12), Op2( 5), access_pmu_cp15_regs,
  reset_unknown_cp15, c9_PMSELR },
{ Op1( 0), CRn( 9), CRm(12), Op2( 6), access_pmu_cp15_regs,
@@ -1180,6 +1211,8 @@ static const struct sys_reg_desc cp15_regs[] = {
  reset_unknown_cp15, c9_PMINTENSET },
{ Op1( 0), CRn( 9), CRm(14), Op2( 2), access_pmu_cp15_regs,
  reset_unknown_cp15, c9_PMINTENCLR },
+   { Op1( 0), CRn( 9), CRm(14), Op2( 3), access_pmu_cp15_regs,
+ reset_unknown_cp15, c9_PMOVSSET },
 
{ Op1( 0), CRn(10), CRm( 2), Op2( 0), access_vm_reg, NULL, c10_PRRR },
{ Op1( 0), CRn(10), CRm( 2), Op2( 1), access_vm_reg, NULL, c10_NMRR },
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index 53d5907..ff17578 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -42,6 +42,8 @@

[PATCH v4 17/21] KVM: ARM64: Add helper to handle PMCR register bits

2015-10-30 Thread Shannon Zhao

From: Shannon Zhao 

According to ARMv8 spec, when writing 1 to PMCR.E, all counters are
enabled by PMCNTENSET, while writing 0 to PMCR.E, all counters are
disabled. When writing 1 to PMCR.P, reset all event counters, not
including PMCCNTR, to zero. When writing 1 to PMCR.C, reset PMCCNTR to
zero.

Signed-off-by: Shannon Zhao 
---
 arch/arm64/kvm/sys_regs.c |  2 ++
 include/kvm/arm_pmu.h |  2 ++
 virt/kvm/arm/pmu.c| 50 +++
 3 files changed, 54 insertions(+)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 50bf3fb..a0bb9d2 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -578,6 +578,7 @@ static bool access_pmu_regs(struct kvm_vcpu *vcpu,
val &= ~ARMV8_PMCR_MASK;
val |= *vcpu_reg(vcpu, p->Rt) & ARMV8_PMCR_MASK;
vcpu_sys_reg(vcpu, r->reg) = val;
+   kvm_pmu_handle_pmcr(vcpu, val);
break;
}
case PMCEID0_EL0:
@@ -1213,6 +1214,7 @@ static bool access_pmu_cp15_regs(struct kvm_vcpu *vcpu,
val &= ~ARMV8_PMCR_MASK;
val |= *vcpu_reg(vcpu, p->Rt) & ARMV8_PMCR_MASK;
vcpu_cp15(vcpu, r->reg) = val;
+   kvm_pmu_handle_pmcr(vcpu, val);
break;
}
case c9_PMCEID0:
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index d7de7f1..acd025a 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -47,6 +47,7 @@ void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u32 val);
 void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u32 val);
 void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u32 data,
u32 select_idx);
+void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u32 val);
 #else
 unsigned long kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u32 select_idx)
 {
@@ -59,6 +60,7 @@ void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u32 val) {}
 void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u32 val) {}
 void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u32 data,
u32 select_idx) {}
+void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u32 val) {}
 #endif
 
 #endif
diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
index ae21089..11d1bfb 100644
--- a/virt/kvm/arm/pmu.c
+++ b/virt/kvm/arm/pmu.c
@@ -121,6 +121,56 @@ void kvm_pmu_disable_counter(struct kvm_vcpu *vcpu, u32 
val)
 }
 
 /**
+ * kvm_pmu_handle_pmcr - handle PMCR register
+ * @vcpu: The vcpu pointer
+ * @val: the value guest writes to PMCR register
+ */
+void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u32 val)
+{
+   struct kvm_pmu *pmu = >arch.pmu;
+   struct kvm_pmc *pmc;
+   u32 enable;
+   int i;
+
+   if (val & ARMV8_PMCR_E) {
+   if (!vcpu_mode_is_32bit(vcpu))
+   enable = vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
+   else
+   enable = vcpu_cp15(vcpu, c9_PMCNTENSET);
+
+   kvm_pmu_enable_counter(vcpu, enable, true);
+   } else
+   kvm_pmu_disable_counter(vcpu, 0xUL);
+
+   if (val & ARMV8_PMCR_C) {
+   pmc = >pmc[ARMV8_MAX_COUNTERS - 1];
+   if (pmc->perf_event)
+   local64_set(>perf_event->count, 0);
+   if (!vcpu_mode_is_32bit(vcpu))
+   vcpu_sys_reg(vcpu, PMCCNTR_EL0) = 0;
+   else
+   vcpu_cp15(vcpu, c9_PMCCNTR) = 0;
+   }
+
+   if (val & ARMV8_PMCR_P) {
+   for (i = 0; i < ARMV8_MAX_COUNTERS - 1; i++) {
+   pmc = >pmc[i];
+   if (pmc->perf_event)
+   local64_set(>perf_event->count, 0);
+   if (!vcpu_mode_is_32bit(vcpu))
+   vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = 0;
+   else
+   vcpu_cp15(vcpu, c14_PMEVCNTR0 + i) = 0;
+   }
+   }
+
+   if (val & ARMV8_PMCR_LC) {
+   pmc = >pmc[ARMV8_MAX_COUNTERS - 1];
+   pmc->bitmask = 0xUL;
+   }
+}
+
+/**
  * kvm_pmu_overflow_clear - clear PMU overflow interrupt
  * @vcpu: The vcpu pointer
  * @val: the value guest writes to PMOVSCLR register
-- 
2.0.4


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 21/21] KVM: ARM64: Add a new kvm ARM PMU device

2015-10-30 Thread Shannon Zhao

From: Shannon Zhao 

Add a new kvm device type KVM_DEV_TYPE_ARM_PMU_V3 for ARM PMU. Implement
the kvm_device_ops for it.

Signed-off-by: Shannon Zhao 
---
 Documentation/virtual/kvm/devices/arm-pmu.txt | 15 +
 arch/arm64/include/uapi/asm/kvm.h |  3 +
 include/linux/kvm_host.h  |  1 +
 include/uapi/linux/kvm.h  |  2 +
 virt/kvm/arm/pmu.c| 92 +++
 virt/kvm/arm/vgic.c   |  8 +++
 virt/kvm/arm/vgic.h   |  1 +
 virt/kvm/kvm_main.c   |  4 ++
 8 files changed, 126 insertions(+)
 create mode 100644 Documentation/virtual/kvm/devices/arm-pmu.txt

diff --git a/Documentation/virtual/kvm/devices/arm-pmu.txt 
b/Documentation/virtual/kvm/devices/arm-pmu.txt
new file mode 100644
index 000..49481c4
--- /dev/null
+++ b/Documentation/virtual/kvm/devices/arm-pmu.txt
@@ -0,0 +1,15 @@
+ARM Virtual Performance Monitor Unit (vPMU)
+===
+
+Device types supported:
+  KVM_DEV_TYPE_ARM_PMU_V3 ARM Performance Monitor Unit v3
+
+Instantiate one PMU instance per VCPU through this API.
+
+Groups:
+  KVM_DEV_ARM_PMU_GRP_IRQ
+  Attributes:
+A value describing the interrupt number of PMU overflow interrupt.
+
+  Errors:
+-EINVAL: Value set is out of the expected range
diff --git a/arch/arm64/include/uapi/asm/kvm.h 
b/arch/arm64/include/uapi/asm/kvm.h
index 0cd7b59..1309a93 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -204,6 +204,9 @@ struct kvm_arch_memory_slot {
 #define KVM_DEV_ARM_VGIC_GRP_CTRL  4
 #define   KVM_DEV_ARM_VGIC_CTRL_INIT   0
 
+/* Device Control API: ARM PMU */
+#define KVM_DEV_ARM_PMU_GRP_IRQ0
+
 /* KVM_IRQ_LINE irq field index values */
 #define KVM_ARM_IRQ_TYPE_SHIFT 24
 #define KVM_ARM_IRQ_TYPE_MASK  0xff
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1bef9e2..f6be696 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1122,6 +1122,7 @@ extern struct kvm_device_ops kvm_mpic_ops;
 extern struct kvm_device_ops kvm_xics_ops;
 extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
 extern struct kvm_device_ops kvm_arm_vgic_v3_ops;
+extern struct kvm_device_ops kvm_arm_pmu_ops;
 
 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index a9256f0..f41e6b6 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1025,6 +1025,8 @@ enum kvm_device_type {
 #define KVM_DEV_TYPE_FLIC  KVM_DEV_TYPE_FLIC
KVM_DEV_TYPE_ARM_VGIC_V3,
 #define KVM_DEV_TYPE_ARM_VGIC_V3   KVM_DEV_TYPE_ARM_VGIC_V3
+   KVM_DEV_TYPE_ARM_PMU_V3,
+#defineKVM_DEV_TYPE_ARM_PMU_V3 KVM_DEV_TYPE_ARM_PMU_V3
KVM_DEV_TYPE_MAX,
 };
 
diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
index d78ce7b..0a00d04 100644
--- a/virt/kvm/arm/pmu.c
+++ b/virt/kvm/arm/pmu.c
@@ -19,10 +19,13 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 
+#include "vgic.h"
+
 /**
  * kvm_pmu_get_counter_value - get PMU counter value
  * @vcpu: The vcpu pointer
@@ -416,3 +419,92 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, 
u32 data,
 
pmc->perf_event = event;
 }
+
+static int kvm_arm_pmu_set_irq(struct kvm *kvm, int irq)
+{
+   int j;
+   struct kvm_vcpu *vcpu;
+
+   kvm_for_each_vcpu(j, vcpu, kvm) {
+   struct kvm_pmu *pmu = >arch.pmu;
+
+   kvm_debug("Set kvm ARM PMU irq: %d\n", irq);
+   pmu->irq_num = irq;
+   vgic_dist_irq_set_cfg(vcpu, irq, true);
+   }
+
+   return 0;
+}
+
+static int kvm_arm_pmu_create(struct kvm_device *dev, u32 type)
+{
+   int i, j;
+   struct kvm_vcpu *vcpu;
+   struct kvm *kvm = dev->kvm;
+
+   kvm_for_each_vcpu(j, vcpu, kvm) {
+   struct kvm_pmu *pmu = >arch.pmu;
+
+   memset(pmu, 0, sizeof(*pmu));
+   for (i = 0; i < ARMV8_MAX_COUNTERS; i++) {
+   pmu->pmc[i].idx = i;
+   pmu->pmc[i].vcpu = vcpu;
+   pmu->pmc[i].bitmask = 0xUL;
+   }
+   pmu->irq_num = -1;
+   }
+
+   return 0;
+}
+
+static void kvm_arm_pmu_destroy(struct kvm_device *dev)
+{
+   kfree(dev);
+}
+
+static int kvm_arm_pmu_set_attr(struct kvm_device *dev,
+   struct kvm_device_attr *attr)
+{
+   switch (attr->group) {
+   case KVM_DEV_ARM_PMU_GRP_IRQ: {
+   int __user *uaddr = (int __user *)(long)attr->addr;
+   int reg;
+
+   if (get_user(reg, uaddr))
+   return -EFAULT;
+
+   if (reg < VGIC_NR_SGIS || reg > dev->kvm->arch.vgic.nr_irqs)
+   return -EINVAL;
+
+

[PATCH v4 04/21] KVM: ARM64: Add reset and access handlers for PMCR_EL0 register

2015-10-30 Thread Shannon Zhao

From: Shannon Zhao 

Add a reset handler which reads the host value of PMCR_EL0 and sets its
writable bits to an architecturally UNKNOWN value, except PMCR.E, which is
reset to zero. Add a common access handler for PMU registers which emulates
writing and reading the registers, and add emulation for PMCR.

Signed-off-by: Shannon Zhao 
---
 arch/arm64/kvm/sys_regs.c | 106 +-
 1 file changed, 104 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index d03d3af..5b591d6 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -33,6 +33,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -446,6 +447,67 @@ static void reset_mpidr(struct kvm_vcpu *vcpu, const 
struct sys_reg_desc *r)
vcpu_sys_reg(vcpu, MPIDR_EL1) = (1ULL << 31) | mpidr;
 }
 
+static void vcpu_sysreg_write(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *r, u64 val)
+{
+   if (!vcpu_mode_is_32bit(vcpu))
+   vcpu_sys_reg(vcpu, r->reg) = val;
+   else
+   vcpu_cp15(vcpu, r->reg) = lower_32_bits(val);
+}
+
+static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+   u64 pmcr, val;
+
+   asm volatile("mrs %0, pmcr_el0\n" : "=r" (pmcr));
+   /* Writable bits of PMCR_EL0 (ARMV8_PMCR_MASK) is reset to UNKNOWN
+* except PMCR.E resetting to zero.
+*/
+   val = ((pmcr & ~ARMV8_PMCR_MASK) | (ARMV8_PMCR_MASK & 0xdecafbad))
+ & (~ARMV8_PMCR_E);
+   vcpu_sysreg_write(vcpu, r, val);
+}
+
+/* PMU registers accessor. */
+static bool access_pmu_regs(struct kvm_vcpu *vcpu,
+   const struct sys_reg_params *p,
+   const struct sys_reg_desc *r)
+{
+   unsigned long val;
+
+   if (p->is_write) {
+   switch (r->reg) {
+   case PMCR_EL0: {
+   /* Only update writeable bits of PMCR */
+   val = vcpu_sys_reg(vcpu, r->reg);
+   val &= ~ARMV8_PMCR_MASK;
+   val |= *vcpu_reg(vcpu, p->Rt) & ARMV8_PMCR_MASK;
+   vcpu_sys_reg(vcpu, r->reg) = val;
+   break;
+   }
+   default:
+   vcpu_sys_reg(vcpu, r->reg) = *vcpu_reg(vcpu, p->Rt);
+   break;
+   }
+   } else {
+   switch (r->reg) {
+   case PMCR_EL0: {
+   /* PMCR.P & PMCR.C are RAZ */
+   val = vcpu_sys_reg(vcpu, r->reg)
+ & ~(ARMV8_PMCR_P | ARMV8_PMCR_C);
+   *vcpu_reg(vcpu, p->Rt) = val;
+   break;
+   }
+   default:
+   *vcpu_reg(vcpu, p->Rt) = vcpu_sys_reg(vcpu, r->reg);
+   break;
+   }
+   }
+
+   return true;
+}
+
 /* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */
 #define DBG_BCR_BVR_WCR_WVR_EL1(n) \
/* DBGBVRn_EL1 */   \
@@ -630,7 +692,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 
/* PMCR_EL0 */
{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b000),
- trap_raz_wi },
+ access_pmu_regs, reset_pmcr, PMCR_EL0, },
/* PMCNTENSET_EL0 */
{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b001),
  trap_raz_wi },
@@ -864,6 +926,45 @@ static const struct sys_reg_desc cp14_64_regs[] = {
{ Op1( 0), CRm( 2), .access = trap_raz_wi },
 };
 
+/* PMU CP15 registers accessor. */
+static bool access_pmu_cp15_regs(struct kvm_vcpu *vcpu,
+const struct sys_reg_params *p,
+const struct sys_reg_desc *r)
+{
+   unsigned long val;
+
+   if (p->is_write) {
+   switch (r->reg) {
+   case c9_PMCR: {
+   /* Only update writeable bits of PMCR */
+   val = vcpu_cp15(vcpu, r->reg);
+   val &= ~ARMV8_PMCR_MASK;
+   val |= *vcpu_reg(vcpu, p->Rt) & ARMV8_PMCR_MASK;
+   vcpu_cp15(vcpu, r->reg) = val;
+   break;
+   }
+   default:
+   vcpu_cp15(vcpu, r->reg) = *vcpu_reg(vcpu, p->Rt);
+   break;
+   }
+   } else {
+   switch (r->reg) {
+   case c9_PMCR: {
+   /* PMCR.P & PMCR.C are RAZ */
+   val = vcpu_cp15(vcpu, r->reg)
+ & ~(ARMV8_PMCR_P | ARMV8_PMCR_C);
+   *vcpu_reg(vcpu, p->Rt) = val;
+   break;
+   }
+   default:
+   *vcpu_reg(vcpu,

[PATCH v4 01/21] ARM64: Move PMU register related defines to asm/pmu.h

2015-10-30 Thread Shannon Zhao

From: Shannon Zhao 

To use the ARMv8 PMU related register defines from the KVM code,
we move the relevant definitions to asm/pmu.h header file.

Signed-off-by: Anup Patel 
Signed-off-by: Shannon Zhao 
---
 arch/arm64/include/asm/pmu.h   | 45 ++
 arch/arm64/kernel/perf_event.c | 35 
 2 files changed, 45 insertions(+), 35 deletions(-)

diff --git a/arch/arm64/include/asm/pmu.h b/arch/arm64/include/asm/pmu.h
index b7710a5..b9f394a 100644
--- a/arch/arm64/include/asm/pmu.h
+++ b/arch/arm64/include/asm/pmu.h
@@ -19,6 +19,51 @@
 #ifndef __ASM_PMU_H
 #define __ASM_PMU_H
 
+#define ARMV8_MAX_COUNTERS  32
+#define ARMV8_COUNTER_MASK  (ARMV8_MAX_COUNTERS - 1)
+
+/*
+ * Per-CPU PMCR: config reg
+ */
+#define ARMV8_PMCR_E   (1 << 0) /* Enable all counters */
+#define ARMV8_PMCR_P   (1 << 1) /* Reset all counters */
+#define ARMV8_PMCR_C   (1 << 2) /* Cycle counter reset */
+#define ARMV8_PMCR_D   (1 << 3) /* CCNT counts every 64th cpu cycle */
+#define ARMV8_PMCR_X   (1 << 4) /* Export to ETM */
+#define ARMV8_PMCR_DP  (1 << 5) /* Disable CCNT if non-invasive debug*/
+#defineARMV8_PMCR_N_SHIFT  11   /* Number of counters 
supported */
+#defineARMV8_PMCR_N_MASK   0x1f
+#defineARMV8_PMCR_MASK 0x3f /* Mask for writable bits */
+
+/*
+ * PMCNTEN: counters enable reg
+ */
+#defineARMV8_CNTEN_MASK0x  /* Mask for writable 
bits */
+
+/*
+ * PMINTEN: counters interrupt enable reg
+ */
+#defineARMV8_INTEN_MASK0x  /* Mask for writable 
bits */
+
+/*
+ * PMOVSR: counters overflow flag status reg
+ */
+#defineARMV8_OVSR_MASK 0x  /* Mask for writable 
bits */
+#defineARMV8_OVERFLOWED_MASK   ARMV8_OVSR_MASK
+
+/*
+ * PMXEVTYPER: Event selection reg
+ */
+#defineARMV8_EVTYPE_MASK   0xc80003ff  /* Mask for writable 
bits */
+#defineARMV8_EVTYPE_EVENT  0x3ff   /* Mask for EVENT bits 
*/
+
+/*
+ * Event filters for PMUv3
+ */
+#defineARMV8_EXCLUDE_EL1   (1 << 31)
+#defineARMV8_EXCLUDE_EL0   (1 << 30)
+#defineARMV8_INCLUDE_EL2   (1 << 27)
+
 #ifdef CONFIG_HW_PERF_EVENTS
 
 /* The events for a given PMU register set. */
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index f9a74d4..534e8ad 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -741,9 +741,6 @@ static const unsigned 
armv8_pmuv3_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
 #defineARMV8_IDX_COUNTER0  1
 #defineARMV8_IDX_COUNTER_LAST  (ARMV8_IDX_CYCLE_COUNTER + 
cpu_pmu->num_events - 1)
 
-#defineARMV8_MAX_COUNTERS  32
-#defineARMV8_COUNTER_MASK  (ARMV8_MAX_COUNTERS - 1)
-
 /*
  * ARMv8 low level PMU access
  */
@@ -754,38 +751,6 @@ static const unsigned 
armv8_pmuv3_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
 #defineARMV8_IDX_TO_COUNTER(x) \
(((x) - ARMV8_IDX_COUNTER0) & ARMV8_COUNTER_MASK)
 
-/*
- * Per-CPU PMCR: config reg
- */
-#define ARMV8_PMCR_E   (1 << 0) /* Enable all counters */
-#define ARMV8_PMCR_P   (1 << 1) /* Reset all counters */
-#define ARMV8_PMCR_C   (1 << 2) /* Cycle counter reset */
-#define ARMV8_PMCR_D   (1 << 3) /* CCNT counts every 64th cpu cycle */
-#define ARMV8_PMCR_X   (1 << 4) /* Export to ETM */
-#define ARMV8_PMCR_DP  (1 << 5) /* Disable CCNT if non-invasive debug*/
-#defineARMV8_PMCR_N_SHIFT  11   /* Number of counters 
supported */
-#defineARMV8_PMCR_N_MASK   0x1f
-#defineARMV8_PMCR_MASK 0x3f /* Mask for writable bits */
-
-/*
- * PMOVSR: counters overflow flag status reg
- */
-#defineARMV8_OVSR_MASK 0x  /* Mask for writable 
bits */
-#defineARMV8_OVERFLOWED_MASK   ARMV8_OVSR_MASK
-
-/*
- * PMXEVTYPER: Event selection reg
- */
-#defineARMV8_EVTYPE_MASK   0xc80003ff  /* Mask for writable 
bits */
-#defineARMV8_EVTYPE_EVENT  0x3ff   /* Mask for EVENT bits 
*/
-
-/*
- * Event filters for PMUv3
- */
-#defineARMV8_EXCLUDE_EL1   (1 << 31)
-#defineARMV8_EXCLUDE_EL0   (1 << 30)
-#defineARMV8_INCLUDE_EL2   (1 << 27)
-
 static inline u32 armv8pmu_pmcr_read(void)
 {
u32 val;
-- 
2.0.4


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 20/21] KVM: ARM64: Free perf event of PMU when destroying vcpu

2015-10-30 Thread Shannon Zhao

From: Shannon Zhao 

When KVM frees a VCPU, it also needs to free the perf_events of that VCPU's PMU.

Signed-off-by: Shannon Zhao 
---
 arch/arm/kvm/arm.c|  1 +
 include/kvm/arm_pmu.h |  2 ++
 virt/kvm/arm/pmu.c| 21 +
 3 files changed, 24 insertions(+)

diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 9c0fec4..90ddb93 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -259,6 +259,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
kvm_mmu_free_memory_caches(vcpu);
kvm_timer_vcpu_terminate(vcpu);
kvm_vgic_vcpu_destroy(vcpu);
+   kvm_pmu_vcpu_destroy(vcpu);
kmem_cache_free(kvm_vcpu_cache, vcpu);
 }
 
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index e708c49..f2cd8d9 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -40,6 +40,7 @@ struct kvm_pmu {
 
 #ifdef CONFIG_KVM_ARM_PMU
 void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu);
+void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu);
 void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu);
 void kvm_pmu_post_sync_hwstate(struct kvm_vcpu *vcpu);
 unsigned long kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u32 select_idx);
@@ -53,6 +54,7 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, 
u32 data,
 void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u32 val);
 #else
 void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) {}
+void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) {}
 void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) {}
 void kvm_pmu_post_sync_hwstate(struct kvm_vcpu *vcpu) {}
 unsigned long kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u32 select_idx)
diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
index 84720a2..d78ce7b 100644
--- a/virt/kvm/arm/pmu.c
+++ b/virt/kvm/arm/pmu.c
@@ -89,6 +89,27 @@ void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu)
 }
 
 /**
+ * kvm_pmu_vcpu_destroy - free perf event of PMU for cpu
+ * @vcpu: The vcpu pointer
+ *
+ */
+void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+   int i;
+   struct kvm_pmu *pmu = >arch.pmu;
+
+   for (i = 0; i < ARMV8_MAX_COUNTERS; i++) {
+   struct kvm_pmc *pmc = >pmc[i];
+
+   if (pmc->perf_event) {
+   perf_event_disable(pmc->perf_event);
+   perf_event_release_kernel(pmc->perf_event);
+   pmc->perf_event = NULL;
+   }
+   }
+}
+
+/**
  * kvm_pmu_sync_hwstate - sync pmu state for cpu
  * @vcpu: The vcpu pointer
  *
-- 
2.0.4


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 19/21] KVM: ARM64: Reset PMU state when resetting vcpu

2015-10-30 Thread Shannon Zhao

From: Shannon Zhao 

When resetting a vcpu, its PMU state needs to be reset to the initial state as well.

Signed-off-by: Shannon Zhao 
---
 arch/arm64/kvm/reset.c |  3 +++
 include/kvm/arm_pmu.h  |  2 ++
 virt/kvm/arm/pmu.c | 19 +++
 3 files changed, 24 insertions(+)

diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 91cf535..4da7f6c 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -120,6 +120,9 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
/* Reset system registers */
kvm_reset_sys_regs(vcpu);
 
+   /* Reset PMU */
+   kvm_pmu_vcpu_reset(vcpu);
+
/* Reset timer */
return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
 }
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index 5e7f943..e708c49 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -39,6 +39,7 @@ struct kvm_pmu {
 };
 
 #ifdef CONFIG_KVM_ARM_PMU
+void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu);
 void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu);
 void kvm_pmu_post_sync_hwstate(struct kvm_vcpu *vcpu);
 unsigned long kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u32 select_idx);
@@ -51,6 +52,7 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, 
u32 data,
u32 select_idx);
 void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u32 val);
 #else
+void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) {}
 void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) {}
 void kvm_pmu_post_sync_hwstate(struct kvm_vcpu *vcpu) {}
 unsigned long kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u32 select_idx)
diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
index 6d48d9a..84720a2 100644
--- a/virt/kvm/arm/pmu.c
+++ b/virt/kvm/arm/pmu.c
@@ -70,6 +70,25 @@ static void kvm_pmu_stop_counter(struct kvm_pmc *pmc)
 }
 
 /**
+ * kvm_pmu_vcpu_reset - reset pmu state for cpu
+ * @vcpu: The vcpu pointer
+ *
+ */
+void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+   int i;
+   struct kvm_pmu *pmu = >arch.pmu;
+
+   for (i = 0; i < ARMV8_MAX_COUNTERS; i++) {
+   kvm_pmu_stop_counter(>pmc[i]);
+   pmu->pmc[i].idx = i;
+   pmu->pmc[i].vcpu = vcpu;
+   pmu->pmc[i].bitmask = 0xUL;
+   }
+   pmu->irq_pending = false;
+}
+
+/**
  * kvm_pmu_sync_hwstate - sync pmu state for cpu
  * @vcpu: The vcpu pointer
  *
-- 
2.0.4


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 00/21] KVM: ARM64: Add guest PMU support

2015-10-30 Thread Shannon Zhao

From: Shannon Zhao 

This patchset adds guest PMU support for KVM on ARM64. It takes a
trap-and-emulate approach. When the guest wants to monitor an event, the
access is trapped by KVM, which calls the perf_event API to create a perf
event and then uses the relevant perf_event APIs to read the event count.

Use perf to test this patchset in guest. When using "perf list", it
shows the list of the hardware events and hardware cache events perf
supports. Then use "perf stat -e EVENT" to monitor some event. For
example, use "perf stat -e cycles" to count cpu cycles and
"perf stat -e cache-misses" to count cache misses.

Below are the outputs of "perf stat -r 5 sleep 5" when running in host
and guest.

Host:
 Performance counter stats for 'sleep 5' (5 runs):

  0.522048  task-clock (msec) #0.000 CPUs utilized  
  ( +-  1.50% )
 1  context-switches  #0.002 M/sec
 0  cpu-migrations#0.383 K/sec  
  ( +-100.00% )
48  page-faults   #0.092 M/sec  
  ( +-  0.66% )
   1088597  cycles#2.085 GHz
  ( +-  1.50% )
 stalled-cycles-frontend
 stalled-cycles-backend
524457  instructions  #0.48  insns per cycle
  ( +-  0.89% )
 branches
  9688  branch-misses #   18.557 M/sec  
  ( +-  1.78% )

   5.000851736 seconds time elapsed 
 ( +-  0.00% )

Guest:
 Performance counter stats for 'sleep 5' (5 runs):

  0.632288  task-clock (msec) #0.000 CPUs utilized  
  ( +-  1.11% )
 1  context-switches  #0.002 M/sec
 0  cpu-migrations#0.000 K/sec
49  page-faults   #0.078 M/sec  
  ( +-  1.19% )
   1119933  cycles#1.771 GHz
  ( +-  1.19% )
 stalled-cycles-frontend
 stalled-cycles-backend
568318  instructions  #0.51  insns per cycle
  ( +-  0.91% )
 branches
 10227  branch-misses #   16.175 M/sec  
  ( +-  1.71% )

   5.001170616 seconds time elapsed 
 ( +-  0.00% )

Run a cycle counter read test like the one below in both guest and host:

static void test(void)
{
unsigned long count, count1, count2;
count1 = read_cycles();
count++;
count2 = read_cycles();
}

Host:
count1: 3044948797
count2: 3044948931
delta: 134

Guest:
count1: 5782364731
count2: 5782364885
delta: 154

The gap between guest and host is very small. One reason for this, I
think, is that the cycles spent in EL2 and in the host are not counted,
since we set exclude_hv = 1. So the cycles spent saving/restoring
registers, which happens at EL2, are not included.

This patchset can be fetched from [1] and the relevant QEMU version for
test can be fetched from [2].

The results of 'perf test' can be found from [3][4].
The results of perf_event_tests test suite can be found from [5][6].

Thanks,
Shannon

[1] https://git.linaro.org/people/shannon.zhao/linux-mainline.git  
KVM_ARM64_PMU_v4
[2] https://git.linaro.org/people/shannon.zhao/qemu.git  virtual_PMU
[3] http://people.linaro.org/~shannon.zhao/PMU/perf-test-host.txt
[4] http://people.linaro.org/~shannon.zhao/PMU/perf-test-guest.txt
[5] http://people.linaro.org/~shannon.zhao/PMU/perf_event_tests-host.txt
[6] http://people.linaro.org/~shannon.zhao/PMU/perf_event_tests-guest.txt

Changes since v3:
* Rebase on new linux kernel mainline 
* Use ARMV8_MAX_COUNTERS instead of 32
* Reset PMCR.E to zero.
* Trigger overflow for software increment.
* Optimize PMU interrupt inject logic.
* Add handler for E,C,P bits of PMCR
* Fix the overflow bug found by perf_event_tests
* Run 'perf test', 'perf top' and perf_event_tests test suite
* Add exclude_hv = 1 configuration to not count in EL2

Changes since v2:
* Directly use perf raw event type to create perf_event in KVM
* Add a helper vcpu_sysreg_write
* remove unrelated header file

Changes since v1:
* Use switch...case in the register access handler instead of adding
  a standalone handler for each register
* Try to use the sys_regs to store the register value instead of adding
  new variables in struct kvm_pmc
* Fix the handling of cp15 regs
* Create a new kvm device vPMU, then userspace could choose whether to
  create PMU
* Fix the handling of PMU overflow interrupt

Shannon Zhao (21):
  ARM64: Move PMU register related defines to asm/pmu.h
  KVM: ARM64: Define PMU data structure for each vcpu
  KVM: ARM64: Add offset defines for PMU registers
  KVM: ARM64: Add reset and access handlers for PMCR_EL0 register
  KVM: ARM64: Add reset and access handlers for

[PATCH v4 14/21] KVM: ARM64: Add reset and access handlers for PMUSERENR register

2015-10-30 Thread Shannon Zhao

From: Shannon Zhao 

The reset value of PMUSERENR_EL0 is UNKNOWN, so use reset_unknown for
it. The reset value of the 32-bit (CP15) PMUSERENR is zero, so use
reset_val_cp15 with zero as its reset handler.

Add a helper for CP15 registers reset to specified value.

Signed-off-by: Shannon Zhao 
---
 arch/arm64/kvm/sys_regs.c | 5 +++--
 arch/arm64/kvm/sys_regs.h | 8 
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index e03d3b8d..c44c8e1 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -829,7 +829,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
  access_pmu_regs, reset_unknown, PMXEVCNTR_EL0 },
/* PMUSERENR_EL0 */
{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1110), Op2(0b000),
- trap_raz_wi },
+ access_pmu_regs, reset_unknown, PMUSERENR_EL0 },
/* PMOVSSET_EL0 */
{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1110), Op2(0b011),
  access_pmu_regs, reset_unknown, PMOVSSET_EL0 },
@@ -1206,7 +1206,8 @@ static const struct sys_reg_desc cp15_regs[] = {
  reset_unknown_cp15, c9_PMXEVTYPER },
{ Op1( 0), CRn( 9), CRm(13), Op2( 2), access_pmu_cp15_regs,
  reset_unknown_cp15, c9_PMXEVCNTR },
-   { Op1( 0), CRn( 9), CRm(14), Op2( 0), trap_raz_wi },
+   { Op1( 0), CRn( 9), CRm(14), Op2( 0), access_pmu_cp15_regs,
+ reset_val_cp15,  c9_PMUSERENR, 0 },
{ Op1( 0), CRn( 9), CRm(14), Op2( 1), access_pmu_cp15_regs,
  reset_unknown_cp15, c9_PMINTENSET },
{ Op1( 0), CRn( 9), CRm(14), Op2( 2), access_pmu_cp15_regs,
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index 8afeff7..aba997d 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -125,6 +125,14 @@ static inline void reset_val(struct kvm_vcpu *vcpu, const 
struct sys_reg_desc *r
vcpu_sys_reg(vcpu, r->reg) = r->val;
 }
 
+static inline void reset_val_cp15(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *r)
+{
+   BUG_ON(!r->reg);
+   BUG_ON(r->reg >= NR_SYS_REGS);
+   vcpu_cp15(vcpu, r->reg) = r->val;
+}
+
 static inline int cmp_sys_reg(const struct sys_reg_desc *i1,
  const struct sys_reg_desc *i2)
 {
-- 
2.0.4


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 02/21] KVM: ARM64: Define PMU data structure for each vcpu

2015-10-30 Thread Shannon Zhao

From: Shannon Zhao 

Here we plan to support a virtual PMU for the guest by full software
emulation, so define some basic structs and functions in preparation for
further steps. Define struct kvm_pmc for a performance monitor counter and
struct kvm_pmu for the performance monitor unit of each vcpu. According to
the ARMv8 spec, the PMU contains at most 32 (ARMV8_MAX_COUNTERS) counters.

Since this only supports ARM64 (or PMUv3), add a separate config symbol
for it.

Signed-off-by: Shannon Zhao 
---
 arch/arm64/include/asm/kvm_host.h |  2 ++
 arch/arm64/kvm/Kconfig|  8 
 include/kvm/arm_pmu.h | 41 +++
 3 files changed, 51 insertions(+)
 create mode 100644 include/kvm/arm_pmu.h

diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index ed03968..cc843ca 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -37,6 +37,7 @@
 
 #include 
 #include 
+#include 
 
 #define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS
 
@@ -132,6 +133,7 @@ struct kvm_vcpu_arch {
/* VGIC state */
struct vgic_cpu vgic_cpu;
struct arch_timer_cpu timer_cpu;
+   struct kvm_pmu pmu;
 
/*
 * Anything that is not used directly from assembly code goes
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 5c7e920..8f321b1 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -31,6 +31,7 @@ config KVM
select KVM_VFIO
select HAVE_KVM_EVENTFD
select HAVE_KVM_IRQFD
+   select KVM_ARM_PMU
---help---
  Support hosting virtualized guest machines.
 
@@ -41,4 +42,11 @@ config KVM_ARM_HOST
---help---
  Provides host support for ARM processors.
 
+config KVM_ARM_PMU
+   bool
+   depends on KVM_ARM_HOST
+   ---help---
+ Adds support for a virtual Performance Monitoring Unit (PMU) in
+ virtual machines.
+
 endif # VIRTUALIZATION
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
new file mode 100644
index 000..254d2b4
--- /dev/null
+++ b/include/kvm/arm_pmu.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2015 Linaro Ltd.
+ * Author: Shannon Zhao 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see .
+ */
+
+#ifndef __ASM_ARM_KVM_PMU_H
+#define __ASM_ARM_KVM_PMU_H
+
+#include 
+#include 
+
+struct kvm_pmc {
+   u8 idx;/* index into the pmu->pmc array */
+   struct perf_event *perf_event;
+   struct kvm_vcpu *vcpu;
+   u64 bitmask;
+};
+
+struct kvm_pmu {
+#ifdef CONFIG_KVM_ARM_PMU
+   /* PMU IRQ Number per VCPU */
+   int irq_num;
+   /* IRQ pending flag */
+   bool irq_pending;
+   struct kvm_pmc pmc[ARMV8_MAX_COUNTERS];
+#endif
+};
+
+#endif
-- 
2.0.4


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 03/21] KVM: ARM64: Add offset defines for PMU registers

2015-10-30 Thread Shannon Zhao

From: Shannon Zhao 

We are about to trap and emulate accesses to each PMU register
individually. This adds the context offsets for the AArch64 PMU
registers and their AArch32 counterparts.

Signed-off-by: Shannon Zhao 
---
 arch/arm64/include/asm/kvm_asm.h | 55 
 1 file changed, 50 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 5e37710..4f804c1 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -48,12 +48,34 @@
 #define MDSCR_EL1  22  /* Monitor Debug System Control Register */
 #define MDCCINT_EL123  /* Monitor Debug Comms Channel Interrupt Enable 
Reg */
 
+/* Performance Monitors Registers */
+#define PMCR_EL0   24  /* Control Register */
+#define PMOVSSET_EL0   25  /* Overflow Flag Status Set Register */
+#define PMOVSCLR_EL0   26  /* Overflow Flag Status Clear Register */
+#define PMSELR_EL0 27  /* Event Counter Selection Register */
+#define PMCEID0_EL028  /* Common Event Identification Register 0 */
+#define PMCEID1_EL029  /* Common Event Identification Register 1 */
+#define PMEVCNTR0_EL0  30  /* Event Counter Register (0-30) */
+#define PMEVCNTR30_EL0 60
+#define PMCCNTR_EL061  /* Cycle Counter Register */
+#define PMEVTYPER0_EL0 62  /* Event Type Register (0-30) */
+#define PMEVTYPER30_EL092
+#define PMCCFILTR_EL0  93  /* Cycle Count Filter Register */
+#define PMXEVCNTR_EL0  94  /* Selected Event Count Register */
+#define PMXEVTYPER_EL0 95  /* Selected Event Type Register */
+#define PMCNTENSET_EL0 96  /* Count Enable Set Register */
+#define PMCNTENCLR_EL0 97  /* Count Enable Clear Register */
+#define PMINTENSET_EL1 98  /* Interrupt Enable Set Register */
+#define PMINTENCLR_EL1 99  /* Interrupt Enable Clear Register */
+#define PMUSERENR_EL0  100 /* User Enable Register */
+#define PMSWINC_EL0101 /* Software Increment Register */
+
 /* 32bit specific registers. Keep them at the end of the range */
-#defineDACR32_EL2  24  /* Domain Access Control Register */
-#defineIFSR32_EL2  25  /* Instruction Fault Status Register */
-#defineFPEXC32_EL2 26  /* Floating-Point Exception Control 
Register */
-#defineDBGVCR32_EL227  /* Debug Vector Catch Register */
-#defineNR_SYS_REGS 28
+#defineDACR32_EL2  102 /* Domain Access Control Register */
+#defineIFSR32_EL2  103 /* Instruction Fault Status Register */
+#defineFPEXC32_EL2 104 /* Floating-Point Exception Control 
Register */
+#defineDBGVCR32_EL2105 /* Debug Vector Catch Register */
+#defineNR_SYS_REGS 106
 
 /* 32bit mapping */
 #define c0_MPIDR   (MPIDR_EL1 * 2) /* MultiProcessor ID Register */
@@ -75,6 +97,24 @@
 #define c6_IFAR(c6_DFAR + 1)   /* Instruction Fault Address 
Register */
 #define c7_PAR (PAR_EL1 * 2)   /* Physical Address Register */
 #define c7_PAR_high(c7_PAR + 1)/* PAR top 32 bits */
+
+/* Performance Monitors*/
+#define c9_PMCR(PMCR_EL0 * 2)
+#define c9_PMOVSSET(PMOVSSET_EL0 * 2)
+#define c9_PMOVSCLR(PMOVSCLR_EL0 * 2)
+#define c9_PMCCNTR (PMCCNTR_EL0 * 2)
+#define c9_PMSELR  (PMSELR_EL0 * 2)
+#define c9_PMCEID0 (PMCEID0_EL0 * 2)
+#define c9_PMCEID1 (PMCEID1_EL0 * 2)
+#define c9_PMXEVCNTR   (PMXEVCNTR_EL0 * 2)
+#define c9_PMXEVTYPER  (PMXEVTYPER_EL0 * 2)
+#define c9_PMCNTENSET  (PMCNTENSET_EL0 * 2)
+#define c9_PMCNTENCLR  (PMCNTENCLR_EL0 * 2)
+#define c9_PMINTENSET  (PMINTENSET_EL1 * 2)
+#define c9_PMINTENCLR  (PMINTENCLR_EL1 * 2)
+#define c9_PMUSERENR   (PMUSERENR_EL0 * 2)
+#define c9_PMSWINC (PMSWINC_EL0 * 2)
+
 #define c10_PRRR   (MAIR_EL1 * 2)  /* Primary Region Remap Register */
 #define c10_NMRR   (c10_PRRR + 1)  /* Normal Memory Remap Register */
 #define c12_VBAR   (VBAR_EL1 * 2)  /* Vector Base Address Register */
@@ -86,6 +126,11 @@
 #define c10_AMAIR1 (c10_AMAIR0 + 1)/* Aux Memory Attr Indirection Reg */
 #define c14_CNTKCTL(CNTKCTL_EL1 * 2) /* Timer Control Register (PL1) */
 
+/* Performance Monitors*/
+#define c14_PMEVCNTR0  (PMEVCNTR0_EL0 * 2)
+#define c14_PMEVTYPER0 (PMEVTYPER0_EL0 * 2)
+#define c14_PMCCFILTR  (PMCCFILTR_EL0 * 2)
+
 #define cp14_DBGDSCRext(MDSCR_EL1 * 2)
 #define cp14_DBGBCR0   (DBGBCR0_EL1 * 2)
 #define cp14_DBGBVR0   (DBGBVR0_EL1 * 2)
-- 
2.0.4


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 11/21] KVM: ARM64: Add reset and access handlers for PMCNTENSET and PMCNTENCLR register

2015-10-30 Thread Shannon Zhao

From: Shannon Zhao 

Since the reset values of PMCNTENSET and PMCNTENCLR are UNKNOWN, use
reset_unknown as their reset handler. Add a new case to emulate writing
the PMCNTENSET or PMCNTENCLR register.

When writing to PMCNTENSET, call perf_event_enable to enable the perf
event. When writing to PMCNTENCLR, call perf_event_disable to disable
the perf event.

Signed-off-by: Shannon Zhao 
---
 arch/arm64/kvm/sys_regs.c | 52 +++
 include/kvm/arm_pmu.h |  4 
 virt/kvm/arm/pmu.c| 52 +++
 3 files changed, 104 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 059c84c..c358ae0 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -519,6 +519,27 @@ static bool access_pmu_regs(struct kvm_vcpu *vcpu,
 *vcpu_reg(vcpu, p->Rt);
break;
}
+   case PMCNTENSET_EL0: {
+   val = *vcpu_reg(vcpu, p->Rt);
+   kvm_pmu_enable_counter(vcpu, val,
+  vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMCR_E);
+   /* Value 1 of PMCNTENSET_EL0 and PMCNTENCLR_EL0 means
+* corresponding counter enabled.
+*/
+   vcpu_sys_reg(vcpu, r->reg) |= val;
+   vcpu_sys_reg(vcpu, PMCNTENCLR_EL0) |= val;
+   break;
+   }
+   case PMCNTENCLR_EL0: {
+   val = *vcpu_reg(vcpu, p->Rt);
+   kvm_pmu_disable_counter(vcpu, val);
+   /* Value 0 of PMCNTENSET_EL0 and PMCNTENCLR_EL0 means
+* corresponding counter disabled.
+*/
+   vcpu_sys_reg(vcpu, r->reg) &= ~val;
+   vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= ~val;
+   break;
+   }
case PMCR_EL0: {
/* Only update writeable bits of PMCR */
val = vcpu_sys_reg(vcpu, r->reg);
@@ -751,10 +772,10 @@ static const struct sys_reg_desc sys_reg_descs[] = {
  access_pmu_regs, reset_pmcr, PMCR_EL0, },
/* PMCNTENSET_EL0 */
{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b001),
- trap_raz_wi },
+ access_pmu_regs, reset_unknown, PMCNTENSET_EL0 },
/* PMCNTENCLR_EL0 */
{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b010),
- trap_raz_wi },
+ access_pmu_regs, reset_unknown, PMCNTENCLR_EL0 },
/* PMOVSCLR_EL0 */
{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b011),
  trap_raz_wi },
@@ -1017,6 +1038,27 @@ static bool access_pmu_cp15_regs(struct kvm_vcpu *vcpu,
 *vcpu_reg(vcpu, p->Rt);
break;
}
+   case c9_PMCNTENSET: {
+   val = *vcpu_reg(vcpu, p->Rt);
+   kvm_pmu_enable_counter(vcpu, val,
+  vcpu_cp15(vcpu, c9_PMCR) & ARMV8_PMCR_E);
+   /* Value 1 of PMCNTENSET_EL0 and PMCNTENCLR_EL0 means
+* corresponding counter enabled.
+*/
+   vcpu_cp15(vcpu, r->reg) |= val;
+   vcpu_cp15(vcpu, c9_PMCNTENCLR) |= val;
+   break;
+   }
+   case c9_PMCNTENCLR: {
+   val = *vcpu_reg(vcpu, p->Rt);
+   kvm_pmu_disable_counter(vcpu, val);
+   /* Value 0 of PMCNTENSET_EL0 and PMCNTENCLR_EL0 means
+* corresponding counter disabled.
+*/
+   vcpu_cp15(vcpu, r->reg) &= ~val;
+   vcpu_cp15(vcpu, c9_PMCNTENSET) &= ~val;
+   break;
+   }
case c9_PMCR: {
/* Only update writeable bits of PMCR */
val = vcpu_cp15(vcpu, r->reg);
@@ -1092,8 +1134,10 @@ static const struct sys_reg_desc cp15_regs[] = {
/* PMU */
{ Op1( 0), CRn( 9), CRm(12), Op2( 0), access_pmu_cp15_regs,
  reset_pmcr, c9_PMCR },
-   { Op1( 0), CRn( 9), CRm(12), Op2( 1), trap_raz_wi },
-   { Op1( 0), CRn( 9), CRm(12), Op2( 2), trap_raz_wi },
+   { Op1( 0), CRn( 9), CRm(12), Op2( 1), access_pmu_cp15_regs,
+ reset_unknown_cp15, c9_PMCNTENSET },
+   { Op1( 0), CRn( 9), CRm(12), Op2( 2), access_pmu_cp15_regs,
+ reset_unknown_cp15, c9_PMCNTENCLR },
{ Op1( 0), CRn( 9), CRm(12), Op2( 3), trap_raz_wi },
{ Op1( 0), CRn( 9), CRm(12), Op2( 5), access_pmu_cp15_regs,

[PATCH v4 15/21] KVM: ARM64: Add reset and access handlers for PMSWINC register

2015-10-30 Thread Shannon Zhao

From: Shannon Zhao 

Add access handler which emulates writing and reading PMSWINC
register and add support for creating software increment event.

Signed-off-by: Shannon Zhao 
---
 arch/arm64/kvm/sys_regs.c | 18 +++-
 include/kvm/arm_pmu.h |  2 ++
 virt/kvm/arm/pmu.c| 55 +++
 3 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index c44c8e1..c86f8dd 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -567,6 +567,11 @@ static bool access_pmu_regs(struct kvm_vcpu *vcpu,
vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~val;
break;
}
+   case PMSWINC_EL0: {
+   val = *vcpu_reg(vcpu, p->Rt);
+   kvm_pmu_software_increment(vcpu, val);
+   break;
+   }
case PMCR_EL0: {
/* Only update writeable bits of PMCR */
val = vcpu_sys_reg(vcpu, r->reg);
@@ -596,6 +601,8 @@ static bool access_pmu_regs(struct kvm_vcpu *vcpu,
*vcpu_reg(vcpu, p->Rt) = val;
break;
}
+   case PMSWINC_EL0:
+   return read_zero(vcpu, p);
case PMCR_EL0: {
/* PMCR.P & PMCR.C are RAZ */
val = vcpu_sys_reg(vcpu, r->reg)
@@ -808,7 +815,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
  access_pmu_regs, reset_unknown, PMOVSCLR_EL0 },
/* PMSWINC_EL0 */
{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b100),
- trap_raz_wi },
+ access_pmu_regs, reset_unknown, PMSWINC_EL0 },
/* PMSELR_EL0 */
{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b101),
  access_pmu_regs, reset_unknown, PMSELR_EL0 },
@@ -1113,6 +1120,11 @@ static bool access_pmu_cp15_regs(struct kvm_vcpu *vcpu,
vcpu_cp15(vcpu, c9_PMOVSSET) &= ~val;
break;
}
+   case c9_PMSWINC: {
+   val = *vcpu_reg(vcpu, p->Rt);
+   kvm_pmu_software_increment(vcpu, val);
+   break;
+   }
case c9_PMCR: {
/* Only update writeable bits of PMCR */
val = vcpu_cp15(vcpu, r->reg);
@@ -1142,6 +1154,8 @@ static bool access_pmu_cp15_regs(struct kvm_vcpu *vcpu,
*vcpu_reg(vcpu, p->Rt) = val;
break;
}
+   case c9_PMSWINC:
+   return read_zero(vcpu, p);
case c9_PMCR: {
/* PMCR.P & PMCR.C are RAZ */
val = vcpu_cp15(vcpu, r->reg)
@@ -1194,6 +1208,8 @@ static const struct sys_reg_desc cp15_regs[] = {
  reset_unknown_cp15, c9_PMCNTENCLR },
{ Op1( 0), CRn( 9), CRm(12), Op2( 3), access_pmu_cp15_regs,
  reset_unknown_cp15, c9_PMOVSCLR },
+   { Op1( 0), CRn( 9), CRm(12), Op2( 4), access_pmu_cp15_regs,
+ reset_unknown_cp15, c9_PMSWINC },
{ Op1( 0), CRn( 9), CRm(12), Op2( 5), access_pmu_cp15_regs,
  reset_unknown_cp15, c9_PMSELR },
{ Op1( 0), CRn( 9), CRm(12), Op2( 6), access_pmu_cp15_regs,
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index ff17578..d7de7f1 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -44,6 +44,7 @@ void kvm_pmu_disable_counter(struct kvm_vcpu *vcpu, u32 val);
 void kvm_pmu_enable_counter(struct kvm_vcpu *vcpu, u32 val, bool all_enable);
 void kvm_pmu_overflow_clear(struct kvm_vcpu *vcpu, u32 val, u32 reg);
 void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u32 val);
+void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u32 val);
 void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u32 data,
u32 select_idx);
 #else
@@ -55,6 +56,7 @@ void kvm_pmu_disable_counter(struct kvm_vcpu *vcpu, u32 val) 
{}
 void kvm_pmu_enable_counter(struct kvm_vcpu *vcpu, u32 val, bool all_enable) {}
 void kvm_pmu_overflow_clear(struct kvm_vcpu *vcpu, u32 val, u32 reg) {}
 void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u32 val) {}
+void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u32 val) {}
 void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u32 data,
u32 select_idx) {}
 #endif
diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
index 5761386..ae21089 100644
--- a/virt/kvm/arm/pmu.c
+++ b/virt/kvm/arm/pmu.c
@@ -151,6 +151,57 @@ void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u32 val)
 }
 
 /**
+ * kvm_pmu_software_increment - do software increment
+ * @vcpu: The vcpu pointer
+ * @val: the value guest writes to PMSWINC register
+ */
+void

Re: [PATCH] vfio/platform: store mapped memory in region, instead of an on-stack copy

2015-10-30 Thread Baptiste Reynal

Hi James,

Thanks for this fix.

Acked-by: Baptiste Reynal 
Tested-by: Baptiste Reynal 

On Thu, Oct 29, 2015 at 5:50 PM, James Morse  wrote:
> vfio_platform_{read,write}_mmio() call ioremap_nocache() to map
> a region of io memory, which they store in struct vfio_platform_region to
> be eventually re-used, or unmapped by vfio_platform_regions_cleanup().
>
> These functions receive a copy of their struct vfio_platform_region
> argument on the stack - so these mapped areas are always allocated, and
> always leaked.
>
> Pass this argument as a pointer instead.
>
> Fixes: 6e3f26456009 "vfio/platform: read and write support for the device fd"
> Signed-off-by: James Morse 
> ---
>  drivers/vfio/platform/vfio_platform_common.c | 36 
> ++--
>  1 file changed, 18 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/vfio/platform/vfio_platform_common.c 
> b/drivers/vfio/platform/vfio_platform_common.c
> index f3b6299..ccf5da5 100644
> --- a/drivers/vfio/platform/vfio_platform_common.c
> +++ b/drivers/vfio/platform/vfio_platform_common.c
> @@ -308,17 +308,17 @@ static long vfio_platform_ioctl(void *device_data,
> return -ENOTTY;
>  }
>
> -static ssize_t vfio_platform_read_mmio(struct vfio_platform_region reg,
> +static ssize_t vfio_platform_read_mmio(struct vfio_platform_region *reg,
>char __user *buf, size_t count,
>loff_t off)
>  {
> unsigned int done = 0;
>
> -   if (!reg.ioaddr) {
> -   reg.ioaddr =
> -   ioremap_nocache(reg.addr, reg.size);
> +   if (!reg->ioaddr) {
> +   reg->ioaddr =
> +   ioremap_nocache(reg->addr, reg->size);
>
> -   if (!reg.ioaddr)
> +   if (!reg->ioaddr)
> return -ENOMEM;
> }
>
> @@ -328,7 +328,7 @@ static ssize_t vfio_platform_read_mmio(struct 
> vfio_platform_region reg,
> if (count >= 4 && !(off % 4)) {
> u32 val;
>
> -   val = ioread32(reg.ioaddr + off);
> +   val = ioread32(reg->ioaddr + off);
> if (copy_to_user(buf, , 4))
> goto err;
>
> @@ -336,7 +336,7 @@ static ssize_t vfio_platform_read_mmio(struct 
> vfio_platform_region reg,
> } else if (count >= 2 && !(off % 2)) {
> u16 val;
>
> -   val = ioread16(reg.ioaddr + off);
> +   val = ioread16(reg->ioaddr + off);
> if (copy_to_user(buf, , 2))
> goto err;
>
> @@ -344,7 +344,7 @@ static ssize_t vfio_platform_read_mmio(struct 
> vfio_platform_region reg,
> } else {
> u8 val;
>
> -   val = ioread8(reg.ioaddr + off);
> +   val = ioread8(reg->ioaddr + off);
> if (copy_to_user(buf, , 1))
> goto err;
>
> @@ -377,7 +377,7 @@ static ssize_t vfio_platform_read(void *device_data, char 
> __user *buf,
> return -EINVAL;
>
> if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_MMIO)
> -   return vfio_platform_read_mmio(vdev->regions[index],
> +   return vfio_platform_read_mmio(>regions[index],
> buf, count, off);
> else if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_PIO)
> return -EINVAL; /* not implemented */
> @@ -385,17 +385,17 @@ static ssize_t vfio_platform_read(void *device_data, 
> char __user *buf,
> return -EINVAL;
>  }
>
> -static ssize_t vfio_platform_write_mmio(struct vfio_platform_region reg,
> +static ssize_t vfio_platform_write_mmio(struct vfio_platform_region *reg,
> const char __user *buf, size_t count,
> loff_t off)
>  {
> unsigned int done = 0;
>
> -   if (!reg.ioaddr) {
> -   reg.ioaddr =
> -   ioremap_nocache(reg.addr, reg.size);
> +   if (!reg->ioaddr) {
> +   reg->ioaddr =
> +   ioremap_nocache(reg->addr, reg->size);
>
> -   if (!reg.ioaddr)
> +   if (!reg->ioaddr)
> return -ENOMEM;
> }
>
> @@ -407,7 +407,7 @@ static ssize_t vfio_platform_write_mmio(struct 
> vfio_platform_region reg,
>
> if (copy_from_user(, buf, 4))
> goto err;
> -   iowrite32(val, reg.ioaddr + off);
> +   iowrite32(val, reg->ioaddr + off);
>
> filled = 4;
> } else if (count >= 2 && !(off % 2)) {
> @@ -415,7

[PATCH v4 12/21] KVM: ARM64: Add reset and access handlers for PMINTENSET and PMINTENCLR register

2015-10-30 Thread Shannon Zhao

From: Shannon Zhao 

Since the reset values of PMINTENSET and PMINTENCLR are UNKNOWN, use
reset_unknown as their reset handler. Add a new case to emulate writing
the PMINTENSET or PMINTENCLR register.

Signed-off-by: Shannon Zhao 
---
 arch/arm64/kvm/sys_regs.c | 34 ++
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index c358ae0..6d2febf 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -540,6 +540,18 @@ static bool access_pmu_regs(struct kvm_vcpu *vcpu,
vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= ~val;
break;
}
+   case PMINTENSET_EL1: {
+   val = *vcpu_reg(vcpu, p->Rt);
+   vcpu_sys_reg(vcpu, r->reg) |= val;
+   vcpu_sys_reg(vcpu, PMINTENCLR_EL1) |= val;
+   break;
+   }
+   case PMINTENCLR_EL1: {
+   val = *vcpu_reg(vcpu, p->Rt);
+   vcpu_sys_reg(vcpu, r->reg) &= ~val;
+   vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= ~val;
+   break;
+   }
case PMCR_EL0: {
/* Only update writeable bits of PMCR */
val = vcpu_sys_reg(vcpu, r->reg);
@@ -729,10 +741,10 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 
/* PMINTENSET_EL1 */
{ Op0(0b11), Op1(0b000), CRn(0b1001), CRm(0b1110), Op2(0b001),
- trap_raz_wi },
+ access_pmu_regs, reset_unknown, PMINTENSET_EL1 },
/* PMINTENCLR_EL1 */
{ Op0(0b11), Op1(0b000), CRn(0b1001), CRm(0b1110), Op2(0b010),
- trap_raz_wi },
+ access_pmu_regs, reset_unknown, PMINTENCLR_EL1 },
 
/* MAIR_EL1 */
{ Op0(0b11), Op1(0b000), CRn(0b1010), CRm(0b0010), Op2(0b000),
@@ -1059,6 +1071,18 @@ static bool access_pmu_cp15_regs(struct kvm_vcpu *vcpu,
vcpu_cp15(vcpu, c9_PMCNTENSET) &= ~val;
break;
}
+   case c9_PMINTENSET: {
+   val = *vcpu_reg(vcpu, p->Rt);
+   vcpu_cp15(vcpu, r->reg) |= val;
+   vcpu_cp15(vcpu, c9_PMINTENCLR) |= val;
+   break;
+   }
+   case c9_PMINTENCLR: {
+   val = *vcpu_reg(vcpu, p->Rt);
+   vcpu_cp15(vcpu, r->reg) &= ~val;
+   vcpu_cp15(vcpu, c9_PMINTENSET) &= ~val;
+   break;
+   }
case c9_PMCR: {
/* Only update writeable bits of PMCR */
val = vcpu_cp15(vcpu, r->reg);
@@ -1152,8 +1176,10 @@ static const struct sys_reg_desc cp15_regs[] = {
{ Op1( 0), CRn( 9), CRm(13), Op2( 2), access_pmu_cp15_regs,
  reset_unknown_cp15, c9_PMXEVCNTR },
{ Op1( 0), CRn( 9), CRm(14), Op2( 0), trap_raz_wi },
-   { Op1( 0), CRn( 9), CRm(14), Op2( 1), trap_raz_wi },
-   { Op1( 0), CRn( 9), CRm(14), Op2( 2), trap_raz_wi },
+   { Op1( 0), CRn( 9), CRm(14), Op2( 1), access_pmu_cp15_regs,
+ reset_unknown_cp15, c9_PMINTENSET },
+   { Op1( 0), CRn( 9), CRm(14), Op2( 2), access_pmu_cp15_regs,
+ reset_unknown_cp15, c9_PMINTENCLR },
 
{ Op1( 0), CRn(10), CRm( 2), Op2( 0), access_vm_reg, NULL, c10_PRRR },
{ Op1( 0), CRn(10), CRm( 2), Op2( 1), access_vm_reg, NULL, c10_NMRR },
-- 
2.0.4


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 07/21] KVM: ARM64: PMU: Add perf event map and introduce perf event creating function

2015-10-30 Thread Shannon Zhao

From: Shannon Zhao 

When we use tools like perf on the host, perf passes the event type and
the id within that event type category to the kernel; the kernel then
maps them to a hardware event number and writes this number to the PMU
PMEVTYPER_EL0 register. When KVM gets the event number, it directly uses
the raw event type to create a perf_event for it.

Signed-off-by: Shannon Zhao 
---
 arch/arm64/include/asm/pmu.h |   2 +
 arch/arm64/kvm/Makefile  |   1 +
 include/kvm/arm_pmu.h|  13 +
 virt/kvm/arm/pmu.c   | 117 +++
 4 files changed, 133 insertions(+)
 create mode 100644 virt/kvm/arm/pmu.c

diff --git a/arch/arm64/include/asm/pmu.h b/arch/arm64/include/asm/pmu.h
index b9f394a..2c025f2 100644
--- a/arch/arm64/include/asm/pmu.h
+++ b/arch/arm64/include/asm/pmu.h
@@ -31,6 +31,8 @@
 #define ARMV8_PMCR_D   (1 << 3) /* CCNT counts every 64th cpu cycle */
 #define ARMV8_PMCR_X   (1 << 4) /* Export to ETM */
 #define ARMV8_PMCR_DP  (1 << 5) /* Disable CCNT if non-invasive debug*/
+/* Determines which PMCCNTR_EL0 bit generates an overflow */
+#define ARMV8_PMCR_LC  (1 << 6)
 #defineARMV8_PMCR_N_SHIFT  11   /* Number of counters 
supported */
 #defineARMV8_PMCR_N_MASK   0x1f
 #defineARMV8_PMCR_MASK 0x3f /* Mask for writable bits */
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 1949fe5..18d56d8 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -27,3 +27,4 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3-emul.o
 kvm-$(CONFIG_KVM_ARM_HOST) += vgic-v3-switch.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o
+kvm-$(CONFIG_KVM_ARM_PMU) += $(KVM)/arm/pmu.o
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index 254d2b4..1908c88 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -38,4 +38,17 @@ struct kvm_pmu {
 #endif
 };
 
+#ifdef CONFIG_KVM_ARM_PMU
+unsigned long kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u32 select_idx);
+void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u32 data,
+   u32 select_idx);
+#else
+unsigned long kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u32 select_idx)
+{
+   return 0;
+}
+void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u32 data,
+   u32 select_idx) {}
+#endif
+
 #endif
diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
new file mode 100644
index 000..900a64c
--- /dev/null
+++ b/virt/kvm/arm/pmu.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (C) 2015 Linaro Ltd.
+ * Author: Shannon Zhao 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see .
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/**
+ * kvm_pmu_get_counter_value - get PMU counter value
+ * @vcpu: The vcpu pointer
+ * @select_idx: The counter index
+ */
+unsigned long kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u32 select_idx)
+{
+   u64 counter, enabled, running;
+   struct kvm_pmu *pmu = >arch.pmu;
+   struct kvm_pmc *pmc = >pmc[select_idx];
+
+   if (!vcpu_mode_is_32bit(vcpu))
+   counter = vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + select_idx);
+   else
+   counter = vcpu_cp15(vcpu, c14_PMEVCNTR0 + select_idx);
+
+   if (pmc->perf_event)
+   counter += perf_event_read_value(pmc->perf_event, ,
+);
+
+   return counter & pmc->bitmask;
+}
+
+/**
+ * kvm_pmu_stop_counter - stop PMU counter
+ * @pmc: The PMU counter pointer
+ *
+ * If this counter has been configured to monitor some event, release it here.
+ */
+static void kvm_pmu_stop_counter(struct kvm_pmc *pmc)
+{
+   struct kvm_vcpu *vcpu = pmc->vcpu;
+   u64 counter;
+
+   if (pmc->perf_event) {
+   counter = kvm_pmu_get_counter_value(vcpu, pmc->idx);
+   if (!vcpu_mode_is_32bit(vcpu))
+   vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + pmc->idx) = counter;
+   else
+   vcpu_cp15(vcpu, c14_PMEVCNTR0 + pmc->idx) = counter;
+
+   perf_event_release_kernel(pmc->perf_event);
+   pmc->perf_event = NULL;
+   }
+}
+
+/**
+ * kvm_pmu_set_counter_event_type - set selected counter to monitor some event

[PATCH v4 09/21] KVM: ARM64: Add reset and access handlers for PMXEVCNTR register

2015-10-30 Thread Shannon Zhao

From: Shannon Zhao 

Since the reset value of PMXEVCNTR is UNKNOWN, use reset_unknown for
its reset handler. Add access handler which emulates writing and reading
PMXEVCNTR register. When reading PMXEVCNTR, call perf_event_read_value
to get the count value of the perf event.

Signed-off-by: Shannon Zhao 
---
 arch/arm64/kvm/sys_regs.c | 36 ++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 4e606ea..b7ca2cd 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -491,6 +491,16 @@ static bool access_pmu_regs(struct kvm_vcpu *vcpu,
 
if (p->is_write) {
switch (r->reg) {
+   case PMXEVCNTR_EL0: {
+   int index = PMEVCNTR0_EL0
+   + vcpu_sys_reg(vcpu, PMSELR_EL0);
+
+   val = kvm_pmu_get_counter_value(vcpu,
+   vcpu_sys_reg(vcpu, PMSELR_EL0));
+   vcpu_sys_reg(vcpu, index) += (s64)*vcpu_reg(vcpu, p->Rt)
+- val;
+   break;
+   }
case PMXEVTYPER_EL0: {
val = vcpu_sys_reg(vcpu, PMSELR_EL0);
kvm_pmu_set_counter_event_type(vcpu,
@@ -519,6 +529,12 @@ static bool access_pmu_regs(struct kvm_vcpu *vcpu,
}
} else {
switch (r->reg) {
+   case PMXEVCNTR_EL0: {
+   val = kvm_pmu_get_counter_value(vcpu,
+   vcpu_sys_reg(vcpu, PMSELR_EL0));
+   *vcpu_reg(vcpu, p->Rt) = val;
+   break;
+   }
case PMCR_EL0: {
/* PMCR.P & PMCR.C are RAZ */
val = vcpu_sys_reg(vcpu, r->reg)
@@ -749,7 +765,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
  access_pmu_regs, reset_unknown, PMXEVTYPER_EL0 },
/* PMXEVCNTR_EL0 */
{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b010),
- trap_raz_wi },
+ access_pmu_regs, reset_unknown, PMXEVCNTR_EL0 },
/* PMUSERENR_EL0 */
{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1110), Op2(0b000),
  trap_raz_wi },
@@ -962,6 +978,15 @@ static bool access_pmu_cp15_regs(struct kvm_vcpu *vcpu,
 
if (p->is_write) {
switch (r->reg) {
+   case c9_PMXEVCNTR: {
+   int index = c14_PMEVCNTR0 + vcpu_cp15(vcpu, c9_PMSELR);
+
+   val = kvm_pmu_get_counter_value(vcpu,
+   vcpu_cp15(vcpu, c9_PMSELR));
+   vcpu_cp15(vcpu, index) += (s64)*vcpu_reg(vcpu, p->Rt)
+ - val;
+   break;
+   }
case c9_PMXEVTYPER: {
val = vcpu_cp15(vcpu, c9_PMSELR);
kvm_pmu_set_counter_event_type(vcpu,
@@ -989,6 +1014,12 @@ static bool access_pmu_cp15_regs(struct kvm_vcpu *vcpu,
}
} else {
switch (r->reg) {
+   case c9_PMXEVCNTR: {
+   val = kvm_pmu_get_counter_value(vcpu,
+   vcpu_cp15(vcpu, c9_PMSELR));
+   *vcpu_reg(vcpu, p->Rt) = val;
+   break;
+   }
case c9_PMCR: {
/* PMCR.P & PMCR.C are RAZ */
val = vcpu_cp15(vcpu, r->reg)
@@ -1047,7 +1078,8 @@ static const struct sys_reg_desc cp15_regs[] = {
{ Op1( 0), CRn( 9), CRm(13), Op2( 0), trap_raz_wi },
{ Op1( 0), CRn( 9), CRm(13), Op2( 1), access_pmu_cp15_regs,
  reset_unknown_cp15, c9_PMXEVTYPER },
-   { Op1( 0), CRn( 9), CRm(13), Op2( 2), trap_raz_wi },
+   { Op1( 0), CRn( 9), CRm(13), Op2( 2), access_pmu_cp15_regs,
+ reset_unknown_cp15, c9_PMXEVCNTR },
{ Op1( 0), CRn( 9), CRm(14), Op2( 0), trap_raz_wi },
{ Op1( 0), CRn( 9), CRm(14), Op2( 1), trap_raz_wi },
{ Op1( 0), CRn( 9), CRm(14), Op2( 2), trap_raz_wi },
-- 
2.0.4


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC PATCH] VFIO: Add a parameter to force nonthread IRQ

2015-10-30 Thread Yunhong Jiang

On Thu, Oct 29, 2015 at 10:45:44AM +0100, Paolo Bonzini wrote:
> 
> 
> On 29/10/2015 04:11, Alex Williamson wrote:
> > > The irqfd is already able to schedule a work item, because it runs with
> > > interrupts disabled, so I think we can always return IRQ_HANDLED.
> >
> > I'm confused by this.  The problem with adding IRQF_NO_THREAD to our
> > current handler is that it hits the spinlock that can sleep in
> > eventfd_signal() and the waitqueue further down the stack before we get
> > to the irqfd.  So if we split to a non-threaded handler vs a threaded
> > handler, where the non-threaded handler either returns IRQ_HANDLED or
> > IRQ_WAKE_THREAD to queue the threaded handler, there's only so much that
> > the non-threaded handler can do before we start running into the same
> > problem.
> 
> You're right.  I thought schedule_work used raw spinlocks (and then
> everything would be done in the inject callback), but I was wrong.
> 
> Basically where irqfd_wakeup now does schedule_work, it would need to
> return IRQ_WAKE_THREAD.  The threaded handler then can just do the
> eventfd_signal.
> 

And with this change, we don't even need the module option anymore: we first
try the primary handler, which runs in hard irq context, and if that fails, we
fall back to the threaded irq handler. Am I right?

Paolo/Alex, do you want to work on the patch yourself? If not, I will be 
happy to try this method.

Thanks
--jyh

> Paolo
> 
> > I think that means that the non-threaded handler needs to
> > return IRQ_WAKE_THREAD if we need to use the current eventfd_signal()
> > path, such as if the bypass path is not available.  If we can get
> > through the bypass path and the KVM irqfd side is safe for the
> > non-threaded handler, inject succeeds and we return IRQ_HANDLED, right?
> > Thanks,
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 16/21] KVM: ARM64: Add access handlers for PMEVCNTRn and PMEVTYPERn register

2015-10-30 Thread Shannon Zhao

From: Shannon Zhao 

Add access handler which emulates writing and reading PMEVCNTRn and
PMEVTYPERn.

Signed-off-by: Shannon Zhao 
---
 arch/arm64/kvm/sys_regs.c | 164 ++
 1 file changed, 164 insertions(+)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index c86f8dd..50bf3fb 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -634,6 +634,20 @@ static bool access_pmu_regs(struct kvm_vcpu *vcpu,
{ Op0(0b10), Op1(0b000), CRn(0b), CRm((n)), Op2(0b111), \
  trap_wcr, reset_wcr, n, 0,  get_wcr, set_wcr }
 
+/* Macro to expand the PMEVCNTRn_EL0 register */
+#define PMU_PMEVCNTR_EL0(n)\
+   /* PMEVCNTRn_EL0 */ \
+   { Op0(0b11), Op1(0b011), CRn(0b1110),   \
+ CRm((0b1000 | (((n) >> 3) & 0x3))), Op2(((n) & 0x7)), \
+ access_pmu_regs, reset_unknown, (PMEVCNTR0_EL0 + n), }
+
+/* Macro to expand the PMEVTYPERn_EL0 register */
+#define PMU_PMEVTYPER_EL0(n)   \
+   /* PMEVTYPERn_EL0 */\
+   { Op0(0b11), Op1(0b011), CRn(0b1110),   \
+ CRm((0b1100 | (((n) >> 3) & 0x3))), Op2(((n) & 0x7)), \
+ access_pmu_regs, reset_unknown, (PMEVTYPER0_EL0 + n), }
+
 /*
  * Architected system registers.
  * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2
@@ -848,6 +862,74 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ Op0(0b11), Op1(0b011), CRn(0b1101), CRm(0b), Op2(0b011),
  NULL, reset_unknown, TPIDRRO_EL0 },
 
+   /* PMEVCNTRn_EL0 */
+   PMU_PMEVCNTR_EL0(0),
+   PMU_PMEVCNTR_EL0(1),
+   PMU_PMEVCNTR_EL0(2),
+   PMU_PMEVCNTR_EL0(3),
+   PMU_PMEVCNTR_EL0(4),
+   PMU_PMEVCNTR_EL0(5),
+   PMU_PMEVCNTR_EL0(6),
+   PMU_PMEVCNTR_EL0(7),
+   PMU_PMEVCNTR_EL0(8),
+   PMU_PMEVCNTR_EL0(9),
+   PMU_PMEVCNTR_EL0(10),
+   PMU_PMEVCNTR_EL0(11),
+   PMU_PMEVCNTR_EL0(12),
+   PMU_PMEVCNTR_EL0(13),
+   PMU_PMEVCNTR_EL0(14),
+   PMU_PMEVCNTR_EL0(15),
+   PMU_PMEVCNTR_EL0(16),
+   PMU_PMEVCNTR_EL0(17),
+   PMU_PMEVCNTR_EL0(18),
+   PMU_PMEVCNTR_EL0(19),
+   PMU_PMEVCNTR_EL0(20),
+   PMU_PMEVCNTR_EL0(21),
+   PMU_PMEVCNTR_EL0(22),
+   PMU_PMEVCNTR_EL0(23),
+   PMU_PMEVCNTR_EL0(24),
+   PMU_PMEVCNTR_EL0(25),
+   PMU_PMEVCNTR_EL0(26),
+   PMU_PMEVCNTR_EL0(27),
+   PMU_PMEVCNTR_EL0(28),
+   PMU_PMEVCNTR_EL0(29),
+   PMU_PMEVCNTR_EL0(30),
+   /* PMEVTYPERn_EL0 */
+   PMU_PMEVTYPER_EL0(0),
+   PMU_PMEVTYPER_EL0(1),
+   PMU_PMEVTYPER_EL0(2),
+   PMU_PMEVTYPER_EL0(3),
+   PMU_PMEVTYPER_EL0(4),
+   PMU_PMEVTYPER_EL0(5),
+   PMU_PMEVTYPER_EL0(6),
+   PMU_PMEVTYPER_EL0(7),
+   PMU_PMEVTYPER_EL0(8),
+   PMU_PMEVTYPER_EL0(9),
+   PMU_PMEVTYPER_EL0(10),
+   PMU_PMEVTYPER_EL0(11),
+   PMU_PMEVTYPER_EL0(12),
+   PMU_PMEVTYPER_EL0(13),
+   PMU_PMEVTYPER_EL0(14),
+   PMU_PMEVTYPER_EL0(15),
+   PMU_PMEVTYPER_EL0(16),
+   PMU_PMEVTYPER_EL0(17),
+   PMU_PMEVTYPER_EL0(18),
+   PMU_PMEVTYPER_EL0(19),
+   PMU_PMEVTYPER_EL0(20),
+   PMU_PMEVTYPER_EL0(21),
+   PMU_PMEVTYPER_EL0(22),
+   PMU_PMEVTYPER_EL0(23),
+   PMU_PMEVTYPER_EL0(24),
+   PMU_PMEVTYPER_EL0(25),
+   PMU_PMEVTYPER_EL0(26),
+   PMU_PMEVTYPER_EL0(27),
+   PMU_PMEVTYPER_EL0(28),
+   PMU_PMEVTYPER_EL0(29),
+   PMU_PMEVTYPER_EL0(30),
+   /* PMCCFILTR_EL0 */
+   { Op0(0b11), Op1(0b011), CRn(0b1110), CRm(0b), Op2(0b111),
+ access_pmu_regs, reset_unknown, PMCCFILTR_EL0, },
+
/* DACR32_EL2 */
{ Op0(0b11), Op1(0b100), CRn(0b0011), CRm(0b), Op2(0b000),
  NULL, reset_unknown, DACR32_EL2 },
@@ -1172,6 +1254,20 @@ static bool access_pmu_cp15_regs(struct kvm_vcpu *vcpu,
return true;
 }
 
+/* Macro to expand the PMEVCNTRn register */
+#define PMU_PMEVCNTR(n)
\
+   /* PMEVCNTRn */ \
+   { Op1(0), CRn(0b1110),  \
+ CRm((0b1000 | (((n) >> 3) & 0x3))), Op2(((n) & 0x7)), \
+ access_pmu_cp15_regs, reset_unknown_cp15, (c14_PMEVCNTR0 + n), }
+
+/* Macro to expand the PMEVTYPERn register */
+#define PMU_PMEVTYPER(n)   \
+   /* PMEVTYPERn */\
+   { Op1(0), CRn(0b1110),  \
+ CRm((0b1100 | (((n) >> 3) & 0x3))), Op2(((n) & 0x7)), \
+ access_pmu_cp15_regs, reset_unknown_cp15, (c14_PMEVTYPER0 + n), }
+
 /*
  * Trapped cp15

[PATCH v4 06/21] KVM: ARM64: Add reset and access handlers for PMCEID0 and PMCEID1 register

2015-10-30 Thread Shannon Zhao

From: Shannon Zhao 

Add a reset handler which gets the host value of PMCEID0 or PMCEID1. Since
writes to PMCEID0 or PMCEID1 are ignored, add a new case for this.

Signed-off-by: Shannon Zhao 
---
 arch/arm64/kvm/sys_regs.c | 29 +
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 35d232e..cb82b15 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -469,6 +469,19 @@ static void reset_pmcr(struct kvm_vcpu *vcpu, const struct 
sys_reg_desc *r)
vcpu_sysreg_write(vcpu, r, val);
 }
 
+static void reset_pmceid(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+   u64 pmceid;
+
+   if (r->reg == PMCEID0_EL0 || r->reg == c9_PMCEID0)
+   asm volatile("mrs %0, pmceid0_el0\n" : "=r" (pmceid));
+   else
+   /* PMCEID1_EL0 or c9_PMCEID1 */
+   asm volatile("mrs %0, pmceid1_el0\n" : "=r" (pmceid));
+
+   vcpu_sysreg_write(vcpu, r, pmceid);
+}
+
 /* PMU registers accessor. */
 static bool access_pmu_regs(struct kvm_vcpu *vcpu,
const struct sys_reg_params *p,
@@ -486,6 +499,9 @@ static bool access_pmu_regs(struct kvm_vcpu *vcpu,
vcpu_sys_reg(vcpu, r->reg) = val;
break;
}
+   case PMCEID0_EL0:
+   case PMCEID1_EL0:
+   return ignore_write(vcpu, p);
default:
vcpu_sys_reg(vcpu, r->reg) = *vcpu_reg(vcpu, p->Rt);
break;
@@ -710,10 +726,10 @@ static const struct sys_reg_desc sys_reg_descs[] = {
  access_pmu_regs, reset_unknown, PMSELR_EL0 },
/* PMCEID0_EL0 */
{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b110),
- trap_raz_wi },
+ access_pmu_regs, reset_pmceid, PMCEID0_EL0 },
/* PMCEID1_EL0 */
{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b111),
- trap_raz_wi },
+ access_pmu_regs, reset_pmceid, PMCEID1_EL0 },
/* PMCCNTR_EL0 */
{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b000),
  trap_raz_wi },
@@ -943,6 +959,9 @@ static bool access_pmu_cp15_regs(struct kvm_vcpu *vcpu,
vcpu_cp15(vcpu, r->reg) = val;
break;
}
+   case c9_PMCEID0:
+   case c9_PMCEID1:
+   return ignore_write(vcpu, p);
default:
vcpu_cp15(vcpu, r->reg) = *vcpu_reg(vcpu, p->Rt);
break;
@@ -1000,8 +1019,10 @@ static const struct sys_reg_desc cp15_regs[] = {
{ Op1( 0), CRn( 9), CRm(12), Op2( 3), trap_raz_wi },
{ Op1( 0), CRn( 9), CRm(12), Op2( 5), access_pmu_cp15_regs,
  reset_unknown_cp15, c9_PMSELR },
-   { Op1( 0), CRn( 9), CRm(12), Op2( 6), trap_raz_wi },
-   { Op1( 0), CRn( 9), CRm(12), Op2( 7), trap_raz_wi },
+   { Op1( 0), CRn( 9), CRm(12), Op2( 6), access_pmu_cp15_regs,
+ reset_pmceid, c9_PMCEID0 },
+   { Op1( 0), CRn( 9), CRm(12), Op2( 7), access_pmu_cp15_regs,
+ reset_pmceid, c9_PMCEID1 },
{ Op1( 0), CRn( 9), CRm(13), Op2( 0), trap_raz_wi },
{ Op1( 0), CRn( 9), CRm(13), Op2( 1), trap_raz_wi },
{ Op1( 0), CRn( 9), CRm(13), Op2( 2), trap_raz_wi },
-- 
2.0.4


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 08/21] KVM: ARM64: Add reset and access handlers for PMXEVTYPER register

2015-10-30 Thread Shannon Zhao

From: Shannon Zhao 

Since the reset value of PMXEVTYPER is UNKNOWN, use reset_unknown or
reset_unknown_cp15 for its reset handler. Add access handler which
emulates writing and reading PMXEVTYPER register. When writing to
PMXEVTYPER, call kvm_pmu_set_counter_event_type to create a perf_event
for the selected event type.

Signed-off-by: Shannon Zhao 
---
 arch/arm64/kvm/sys_regs.c | 26 --
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index cb82b15..4e606ea 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -491,6 +491,17 @@ static bool access_pmu_regs(struct kvm_vcpu *vcpu,
 
if (p->is_write) {
switch (r->reg) {
+   case PMXEVTYPER_EL0: {
+   val = vcpu_sys_reg(vcpu, PMSELR_EL0);
+   kvm_pmu_set_counter_event_type(vcpu,
+  *vcpu_reg(vcpu, p->Rt),
+  val);
+   vcpu_sys_reg(vcpu, PMXEVTYPER_EL0) =
+*vcpu_reg(vcpu, p->Rt);
+   vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + val) =
+*vcpu_reg(vcpu, p->Rt);
+   break;
+   }
case PMCR_EL0: {
/* Only update writeable bits of PMCR */
val = vcpu_sys_reg(vcpu, r->reg);
@@ -735,7 +746,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
  trap_raz_wi },
/* PMXEVTYPER_EL0 */
{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b001),
- trap_raz_wi },
+ access_pmu_regs, reset_unknown, PMXEVTYPER_EL0 },
/* PMXEVCNTR_EL0 */
{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b010),
  trap_raz_wi },
@@ -951,6 +962,16 @@ static bool access_pmu_cp15_regs(struct kvm_vcpu *vcpu,
 
if (p->is_write) {
switch (r->reg) {
+   case c9_PMXEVTYPER: {
+   val = vcpu_cp15(vcpu, c9_PMSELR);
+   kvm_pmu_set_counter_event_type(vcpu,
+  *vcpu_reg(vcpu, p->Rt),
+  val);
+   vcpu_cp15(vcpu, c9_PMXEVTYPER) = *vcpu_reg(vcpu, p->Rt);
+   vcpu_cp15(vcpu, c14_PMEVTYPER0 + val) =
+*vcpu_reg(vcpu, p->Rt);
+   break;
+   }
case c9_PMCR: {
/* Only update writeable bits of PMCR */
val = vcpu_cp15(vcpu, r->reg);
@@ -1024,7 +1045,8 @@ static const struct sys_reg_desc cp15_regs[] = {
{ Op1( 0), CRn( 9), CRm(12), Op2( 7), access_pmu_cp15_regs,
  reset_pmceid, c9_PMCEID1 },
{ Op1( 0), CRn( 9), CRm(13), Op2( 0), trap_raz_wi },
-   { Op1( 0), CRn( 9), CRm(13), Op2( 1), trap_raz_wi },
+   { Op1( 0), CRn( 9), CRm(13), Op2( 1), access_pmu_cp15_regs,
+ reset_unknown_cp15, c9_PMXEVTYPER },
{ Op1( 0), CRn( 9), CRm(13), Op2( 2), trap_raz_wi },
{ Op1( 0), CRn( 9), CRm(14), Op2( 0), trap_raz_wi },
{ Op1( 0), CRn( 9), CRm(14), Op2( 1), trap_raz_wi },
-- 
2.0.4


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH/RFC 0/4] dma ops and virtio

2015-10-30 Thread Cornelia Huck

On Thu, 29 Oct 2015 15:50:38 -0700
Andy Lutomirski  wrote:

> Progress!  After getting that sort-of-working, I figured out what was
> wrong with my earlier command, and I got that working, too.  Now I
> get:
> 
> qemu-system-s390x -fsdev
> local,id=virtfs1,path=/,security_model=none,readonly -device
> virtio-9p-ccw,fsdev=virtfs1,mount_tag=/dev/root -M s390-ccw-virtio
> -nodefaults -device sclpconsole,chardev=console -parallel none -net
> none -echr 1 -serial none -chardev stdio,id=console,signal=off,mux=on
> -serial chardev:console -mon chardev=console -vga none -display none
> -kernel arch/s390/boot/bzImage -append
> 'init=/home/luto/devel/virtme/virtme/guest/virtme-init
> psmouse.proto=exps "virtme_stty_con=rows 24 cols 150 iutf8"
> TERM=xterm-256color rootfstype=9p
> rootflags=ro,version=9p2000.L,trans=virtio,access=any
> raid=noautodetect debug'

The commandline looks sane AFAICS.

(...)

> vrfy: device 0.0.: rc=0 pgroup=0 mpath=0 vpm=80
> virtio_ccw 0.0.: Failed to set online: -5
> 
> ^^^ bad news!

I'd like to see where in the onlining process this fails. Could you set
up qemu tracing for css_* and virtio_ccw_* (instructions in
qemu/docs/tracing.txt)?

Which qemu version is this, btw.?

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v6 32/33] nvdimm acpi: support _FIT method

2015-10-30 Thread Xiao Guangrong

The FIT buffer is not completely mapped into the guest address space, so a
new function, Read FIT (function index 0xFFFFFFFF), is reserved by QEMU to
read the FIT buffer piece by piece. The pieces are concatenated before _FIT
returns.

Refer to docs/specs/acpi_nvdimm.txt for the detailed design.

Signed-off-by: Xiao Guangrong 
---
 hw/acpi/nvdimm.c | 168 +--
 1 file changed, 164 insertions(+), 4 deletions(-)

diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index f8d7d19..3f35220 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -384,6 +384,18 @@ static void nvdimm_build_nfit(GSList *device_list, GArray 
*table_offsets,
 g_array_free(structures, true);
 }
 
+/*
+ * define UUID for NVDIMM Root Device according to Chapter 3 DSM Interface
+ * for NVDIMM Root Device - Example in DSM Spec Rev1.
+ */
+#define NVDIMM_DSM_ROOT_UUID "2F10E7A4-9E91-11E4-89D3-123B93F75CBA"
+
+/*
+ * Read FIT Function, which is a QEMU internal use only function, more detail
+ * refer to docs/specs/acpi_nvdimm.txt
+ */
+#define NVDIMM_DSM_FUNC_READ_FIT 0x
+
 /* define NVDIMM DSM return status codes according to DSM Spec Rev1. */
 enum {
 /* Common return status codes. */
@@ -420,6 +432,11 @@ struct NvdimmFuncInSetLabelData {
 } QEMU_PACKED;
 typedef struct NvdimmFuncInSetLabelData NvdimmFuncInSetLabelData;
 
+struct NvdimmFuncInReadFit {
+uint32_t offset; /* fit offset */
+} QEMU_PACKED;
+typedef struct NvdimmFuncInReadFit NvdimmFuncInReadFit;
+
 struct NvdimmDsmIn {
 uint32_t handle;
 uint32_t revision;
@@ -429,6 +446,7 @@ struct NvdimmDsmIn {
 uint8_t arg3[0];
 NvdimmFuncInSetLabelData func_set_label_data;
 NvdimmFuncInGetLabelData func_get_label_data;
+NvdimmFuncInReadFit func_read_fit;
 };
 } QEMU_PACKED;
 typedef struct NvdimmDsmIn NvdimmDsmIn;
@@ -450,13 +468,71 @@ struct NvdimmFuncOutGetLabelData {
 } QEMU_PACKED;
 typedef struct NvdimmFuncOutGetLabelData NvdimmFuncOutGetLabelData;
 
+struct NvdimmFuncOutReadFit {
+uint32_t status;/* return status code. */
+uint32_t length;/* the length of fit data we read. */
+uint8_t fit_data[0]; /* fit data. */
+} QEMU_PACKED;
+typedef struct NvdimmFuncOutReadFit NvdimmFuncOutReadFit;
+
 static void nvdimm_dsm_write_status(GArray *out, uint32_t status)
 {
 status = cpu_to_le32(status);
 build_append_int_noprefix(out, status, sizeof(status));
 }
 
-static void nvdimm_dsm_root(NvdimmDsmIn *in, GArray *out)
+/* Build fit memory which is presented to guest via _FIT method. */
+static void nvdimm_build_fit(AcpiNVDIMMState *state)
+{
+if (!state->fit) {
+GSList *device_list = nvdimm_get_plugged_device_list();
+
+nvdimm_debug("Rebuild FIT...\n");
+state->fit = nvdimm_build_device_structure(device_list);
+g_slist_free(device_list);
+}
+}
+
+/* Read FIT data, defined in docs/specs/acpi_nvdimm.txt. */
+static void nvdimm_dsm_func_read_fit(AcpiNVDIMMState *state,
+ NvdimmDsmIn *in, GArray *out)
+{
+NvdimmFuncInReadFit *read_fit = >func_read_fit;
+NvdimmFuncOutReadFit fit_out;
+uint32_t read_length = TARGET_PAGE_SIZE - sizeof(NvdimmFuncOutReadFit);
+uint32_t status = NVDIMM_DSM_ROOT_DEV_STATUS_INVALID_PARAS;
+
+nvdimm_build_fit(state);
+
+le32_to_cpus(_fit->offset);
+
+nvdimm_debug("Read FIT offset %#x.\n", read_fit->offset);
+
+if (read_fit->offset > state->fit->len) {
+nvdimm_debug("offset %#x is beyond fit size (%#x).\n",
+ read_fit->offset, state->fit->len);
+goto exit;
+}
+
+read_length = MIN(read_length, state->fit->len - read_fit->offset);
+nvdimm_debug("read length %#x.\n", read_length);
+
+fit_out.status = cpu_to_le32(NVDIMM_DSM_STATUS_SUCCESS);
+fit_out.length = cpu_to_le32(read_length);
+g_array_append_vals(out, _out, sizeof(fit_out));
+
+if (read_length) {
+g_array_append_vals(out, state->fit->data + read_fit->offset,
+read_length);
+}
+return;
+
+exit:
+nvdimm_dsm_write_status(out, status);
+}
+
+static void nvdimm_dsm_root(AcpiNVDIMMState *state, NvdimmDsmIn *in,
+GArray *out)
 {
 uint32_t status = NVDIMM_DSM_STATUS_NOT_SUPPORTED;
 
@@ -475,6 +551,10 @@ static void nvdimm_dsm_root(NvdimmDsmIn *in, GArray *out)
 return;
 }
 
+if (in->function == NVDIMM_DSM_FUNC_READ_FIT /* FIT Read */) {
+return nvdimm_dsm_func_read_fit(state, in, out);
+}
+
 nvdimm_debug("Return status %#x.\n", status);
 nvdimm_dsm_write_status(out, status);
 }
@@ -710,7 +790,7 @@ nvdimm_dsm_read(void *opaque, hwaddr addr, unsigned size)
 
 /* Handle 0 is reserved for NVDIMM Root Device. */
 if (!in->handle) {
-nvdimm_dsm_root(in, out);
+nvdimm_dsm_root(state, in, out);
 goto exit;
 }
 
@@ -925,8 +1005,88 @@ static void

[PATCH v6 23/33] nvdimm acpi: init the resource used by NVDIMM ACPI

2015-10-30 Thread Xiao Guangrong

A page starting from 0xFF0 and IO ports 0x0a18 - 0xa1b in the guest are
reserved for NVDIMM ACPI emulation; refer to docs/specs/acpi_nvdimm.txt
for the detailed design.

A parameter, 'nvdimm-support', is introduced for PIIX4_PM and ICH9-LPC
to control whether nvdimm support is enabled. It is true by default and
false for 2.4 and earlier versions to keep compatibility.

Signed-off-by: Xiao Guangrong 
---
 default-configs/i386-softmmu.mak |  1 +
 default-configs/mips-softmmu.mak |  1 +
 default-configs/mips64-softmmu.mak   |  1 +
 default-configs/mips64el-softmmu.mak |  1 +
 default-configs/mipsel-softmmu.mak   |  1 +
 default-configs/x86_64-softmmu.mak   |  1 +
 hw/acpi/Makefile.objs|  1 +
 hw/acpi/ich9.c   | 24 ++
 hw/acpi/nvdimm.c | 63 
 hw/acpi/piix4.c  | 27 
 include/hw/acpi/ich9.h   |  3 ++
 include/hw/i386/pc.h | 10 ++
 include/hw/mem/nvdimm.h  | 34 +++
 13 files changed, 161 insertions(+), 7 deletions(-)
 create mode 100644 hw/acpi/nvdimm.c

diff --git a/default-configs/i386-softmmu.mak b/default-configs/i386-softmmu.mak
index 4e84a1c..51e71d4 100644
--- a/default-configs/i386-softmmu.mak
+++ b/default-configs/i386-softmmu.mak
@@ -48,6 +48,7 @@ CONFIG_IOAPIC=y
 CONFIG_PVPANIC=y
 CONFIG_MEM_HOTPLUG=y
 CONFIG_NVDIMM=y
+CONFIG_ACPI_NVDIMM=y
 CONFIG_XIO3130=y
 CONFIG_IOH3420=y
 CONFIG_I82801B11=y
diff --git a/default-configs/mips-softmmu.mak b/default-configs/mips-softmmu.mak
index 44467c3..6b8b70e 100644
--- a/default-configs/mips-softmmu.mak
+++ b/default-configs/mips-softmmu.mak
@@ -17,6 +17,7 @@ CONFIG_FDC=y
 CONFIG_ACPI=y
 CONFIG_ACPI_X86=y
 CONFIG_ACPI_MEMORY_HOTPLUG=y
+CONFIG_ACPI_NVDIMM=y
 CONFIG_ACPI_CPU_HOTPLUG=y
 CONFIG_APM=y
 CONFIG_I8257=y
diff --git a/default-configs/mips64-softmmu.mak 
b/default-configs/mips64-softmmu.mak
index 66ed5f9..ea820f6 100644
--- a/default-configs/mips64-softmmu.mak
+++ b/default-configs/mips64-softmmu.mak
@@ -17,6 +17,7 @@ CONFIG_FDC=y
 CONFIG_ACPI=y
 CONFIG_ACPI_X86=y
 CONFIG_ACPI_MEMORY_HOTPLUG=y
+CONFIG_ACPI_NVDIMM=y
 CONFIG_ACPI_CPU_HOTPLUG=y
 CONFIG_APM=y
 CONFIG_I8257=y
diff --git a/default-configs/mips64el-softmmu.mak 
b/default-configs/mips64el-softmmu.mak
index bfca2b2..8993851 100644
--- a/default-configs/mips64el-softmmu.mak
+++ b/default-configs/mips64el-softmmu.mak
@@ -17,6 +17,7 @@ CONFIG_FDC=y
 CONFIG_ACPI=y
 CONFIG_ACPI_X86=y
 CONFIG_ACPI_MEMORY_HOTPLUG=y
+CONFIG_ACPI_NVDIMM=y
 CONFIG_ACPI_CPU_HOTPLUG=y
 CONFIG_APM=y
 CONFIG_I8257=y
diff --git a/default-configs/mipsel-softmmu.mak 
b/default-configs/mipsel-softmmu.mak
index 0162ef0..87ab964 100644
--- a/default-configs/mipsel-softmmu.mak
+++ b/default-configs/mipsel-softmmu.mak
@@ -17,6 +17,7 @@ CONFIG_FDC=y
 CONFIG_ACPI=y
 CONFIG_ACPI_X86=y
 CONFIG_ACPI_MEMORY_HOTPLUG=y
+CONFIG_ACPI_NVDIMM=y
 CONFIG_ACPI_CPU_HOTPLUG=y
 CONFIG_APM=y
 CONFIG_I8257=y
diff --git a/default-configs/x86_64-softmmu.mak 
b/default-configs/x86_64-softmmu.mak
index e877a86..0a7dc10 100644
--- a/default-configs/x86_64-softmmu.mak
+++ b/default-configs/x86_64-softmmu.mak
@@ -48,6 +48,7 @@ CONFIG_IOAPIC=y
 CONFIG_PVPANIC=y
 CONFIG_MEM_HOTPLUG=y
 CONFIG_NVDIMM=y
+CONFIG_ACPI_NVDIMM=y
 CONFIG_XIO3130=y
 CONFIG_IOH3420=y
 CONFIG_I82801B11=y
diff --git a/hw/acpi/Makefile.objs b/hw/acpi/Makefile.objs
index 7d3230c..84c082d 100644
--- a/hw/acpi/Makefile.objs
+++ b/hw/acpi/Makefile.objs
@@ -2,6 +2,7 @@ common-obj-$(CONFIG_ACPI_X86) += core.o piix4.o pcihp.o
 common-obj-$(CONFIG_ACPI_X86_ICH) += ich9.o tco.o
 common-obj-$(CONFIG_ACPI_CPU_HOTPLUG) += cpu_hotplug.o
 common-obj-$(CONFIG_ACPI_MEMORY_HOTPLUG) += memory_hotplug.o
+obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o
 common-obj-$(CONFIG_ACPI) += acpi_interface.o
 common-obj-$(CONFIG_ACPI) += bios-linker-loader.o
 common-obj-$(CONFIG_ACPI) += aml-build.o
diff --git a/hw/acpi/ich9.c b/hw/acpi/ich9.c
index 1e9ae20..603c1bd 100644
--- a/hw/acpi/ich9.c
+++ b/hw/acpi/ich9.c
@@ -280,6 +280,12 @@ void ich9_pm_init(PCIDevice *lpc_pci, ICH9LPCPMRegs *pm,
 acpi_memory_hotplug_init(pci_address_space_io(lpc_pci), 
OBJECT(lpc_pci),
  &pm->acpi_memory_hotplug);
 }
+
+if (pm->acpi_nvdimm_state.is_enabled) {
+nvdimm_init_acpi_state(pci_address_space(lpc_pci),
+   pci_address_space_io(lpc_pci), OBJECT(lpc_pci),
+   &pm->acpi_nvdimm_state);
+}
 }
 
 static void ich9_pm_get_gpe0_blk(Object *obj, Visitor *v,
@@ -307,6 +313,20 @@ static void ich9_pm_set_memory_hotplug_support(Object 
*obj, bool value,
 s->pm.acpi_memory_hotplug.is_enabled = value;
 }
 
+static bool ich9_pm_get_nvdimm_support(Object *obj, Error **errp)
+{
+ICH9LPCState *s = ICH9_LPC_DEVICE(obj);
+
+return s->pm.acpi_nvdimm_state.is_enabled;
+}
+
+static void ich9_pm_set_nvdimm_support(Object

[PATCH v6 28/33] nvdimm acpi: support Get Namespace Label Size function

2015-10-30 Thread Xiao Guangrong

Function 4 is used to get Namespace label size

Signed-off-by: Xiao Guangrong 
---
 hw/acpi/nvdimm.c | 87 +++-
 1 file changed, 86 insertions(+), 1 deletion(-)

diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index 300a3aa..67c4699 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -407,15 +407,48 @@ enum {
 NVDIMM_DSM_DEV_STATUS_VENDOR_SPECIFIC_ERROR = 4,
 };
 
+struct NvdimmFuncInGetLabelData {
+uint32_t offset; /* the offset in the namespace label data area. */
+uint32_t length; /* the size of data is to be read via the function. */
+} QEMU_PACKED;
+typedef struct NvdimmFuncInGetLabelData NvdimmFuncInGetLabelData;
+
+struct NvdimmFuncInSetLabelData {
+uint32_t offset; /* the offset in the namespace label data area. */
+uint32_t length; /* the size of data is to be written via the function. */
+uint8_t in_buf[0]; /* the data written to label data area. */
+} QEMU_PACKED;
+typedef struct NvdimmFuncInSetLabelData NvdimmFuncInSetLabelData;
+
 struct NvdimmDsmIn {
 uint32_t handle;
 uint32_t revision;
 uint32_t function;
/* the remaining size in the page is used by arg3. */
-uint8_t arg3[0];
+union {
+uint8_t arg3[0];
+NvdimmFuncInSetLabelData func_set_label_data;
+};
 } QEMU_PACKED;
 typedef struct NvdimmDsmIn NvdimmDsmIn;
 
+struct NvdimmFuncOutLabelSize {
+uint32_t status; /* return status code. */
+uint32_t label_size; /* the size of label data area. */
+/*
+ * Maximum size of the namespace label data length supported by
+ * the platform in Get/Set Namespace Label Data functions.
+ */
+uint32_t max_xfer;
+} QEMU_PACKED;
+typedef struct NvdimmFuncOutLabelSize NvdimmFuncOutLabelSize;
+
+struct NvdimmFuncOutGetLabelData {
+uint32_t status;/*return status code. */
+uint8_t out_buf[0]; /* the data got via Get Namesapce Label function. */
+} QEMU_PACKED;
+typedef struct NvdimmFuncOutGetLabelData NvdimmFuncOutGetLabelData;
+
 static void nvdimm_dsm_write_status(GArray *out, uint32_t status)
 {
 status = cpu_to_le32(status);
@@ -445,6 +478,55 @@ static void nvdimm_dsm_root(NvdimmDsmIn *in, GArray *out)
 nvdimm_dsm_write_status(out, status);
 }
 
+/*
+ * the max transfer size is the max size transferred by both a
+ * 'Get Namespace Label Data' function and a 'Set Namespace Label Data'
+ * function.
+ */
+static uint32_t nvdimm_get_max_xfer_label_size(void)
+{
+NvdimmDsmIn *in;
+uint32_t max_get_size, max_set_size, dsm_memory_size = TARGET_PAGE_SIZE;
+
+/*
+ * the max data ACPI can read one time which is transferred by
+ * the response of 'Get Namespace Label Data' function.
+ */
+max_get_size = dsm_memory_size - sizeof(NvdimmFuncOutGetLabelData);
+
+/*
+ * the max data ACPI can write one time which is transferred by
+ * 'Set Namespace Label Data' function.
+ */
+max_set_size = dsm_memory_size - offsetof(NvdimmDsmIn, arg3) -
+   sizeof(in->func_set_label_data);
+
+return MIN(max_get_size, max_set_size);
+}
+
+/*
+ * DSM Spec Rev1 4.4 Get Namespace Label Size (Function Index 4).
+ *
+ * It gets the size of Namespace Label data area and the max data size
+ * that Get/Set Namespace Label Data functions can transfer.
+ */
+static void nvdimm_dsm_func_label_size(NVDIMMDevice *nvdimm, GArray *out)
+{
+NvdimmFuncOutLabelSize func_label_size;
+uint32_t label_size, mxfer;
+
+label_size = nvdimm->label_size;
+mxfer = nvdimm_get_max_xfer_label_size();
+
+nvdimm_debug("label_size %#x, max_xfer %#x.\n", label_size, mxfer);
+
+func_label_size.status = cpu_to_le32(NVDIMM_DSM_STATUS_SUCCESS);
+func_label_size.label_size = cpu_to_le32(label_size);
+func_label_size.max_xfer = cpu_to_le32(mxfer);
+
+g_array_append_vals(out, &func_label_size, sizeof(func_label_size));
+}
+
 static void nvdimm_dsm_device(NvdimmDsmIn *in, GArray *out)
 {
 GSList *list = nvdimm_get_plugged_device_list();
@@ -469,6 +551,9 @@ static void nvdimm_dsm_device(NvdimmDsmIn *in, GArray *out)
1 << 6 /* Set Namespace Label Data */);
 build_append_int_noprefix(out, cmd_list, sizeof(cmd_list));
 goto free;
+case 0x4 /* Get Namespace Label Size */:
+nvdimm_dsm_func_label_size(nvdimm, out);
+goto free;
 default:
 status = NVDIMM_DSM_STATUS_NOT_SUPPORTED;
 };
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v6 00/33] implement vNVDIMM

2015-10-30 Thread Xiao Guangrong

This patchset can be found at:
  https://github.com/xiaogr/qemu.git nvdimm-v6

It is based on pci branch on Michael's tree and the top commit is:
commit 6f96a31a06c2a1 (tests: re-enable vhost-user-test).

Changelog in v6:
- changes from Stefan's comments:
  1) fix code style of struct naming by CamelCase way
  2) fix offset + length overflow when read/write label data
  3) compile hw/acpi/nvdimm.c for per target so that TARGET_PAGE_SIZE can
 be used to replace getpagesize()

Changelog in v5:
- changes from Michael's comments:
  1) prefix nvdimm_ to everything in NVDIMM source files
  2) make parsing _DSM Arg3 more clear
  3) comment style fix
  5) drop single used definition
  6) fix dirty dsm buffer lost due to memory write happened on host
  7) check dsm buffer if it is big enough to contain input data
  8) use build_append_int_noprefix to store single value to GArray

- changes from Michael's and Igor's comments:
  1) introduce 'nvdimm-support' parameter to control nvdimm
 enablement and it is disabled for 2.4 and its earlier versions
 to make live migration compatible
  2) only reserve 1 RAM page and 4 bytes IO Port for NVDIMM ACPI
 virtualization

- changes from Stefan's comments:
  1) do endian adjustment for the buffer length

- changes from Bharata B Rao's comments:
  1) fix compile on ppc

- others:
  1) the buffer length is directly got from IO read rather than got
 from dsm memory
  2) fix dirty label data lost due to memory write happened on host

Changelog in v4:
- changes from Michael's comments:
  1) show the message, "Memory is not allocated from HugeTlbfs", if file
 based memory is not allocated from hugetlbfs.
  2) introduce function, acpi_get_nvdimm_state(), to get NVDIMMState
 from Machine.
  3) statically define UUID and make its operation more clear
  4) use GArray to build device structures to avoid potential buffer
 overflow
  4) improve comments in the code
  5) improve code style

- changes from Igor's comments:
  1) add NVDIMM ACPI spec document
  2) use serialized method to avoid Mutex
  3) move NVDIMM ACPI's code to hw/acpi/nvdimm.c
  4) introduce a common ASL method used by _DSM for all devices to reduce
 ACPI size
  5) handle UUID in ACPI AML code. BTW, i'd keep handling revision in QEMU
 it's better to upgrade QEMU to support Rev2 in the future

- changes from Stefan's comments:
  1) copy input data from DSM memory to local buffer to avoid potential
 issues as DSM memory is visible to guest. Output data is handled
 in a similar way

- changes from Dan's comments:
  1) drop static namespace as Linux has already supported label-less
 nvdimm devices

- changes from Vladimir's comments:
  1) print a better message, "failed to get file size for %s, can't create
 backend on it", if any file operation fails to obtain the file size

- others:
  create a git repo on github.com for better review/test

Also, thanks for Eric Blake's review on QAPI's side.

Thank you all for reviewing this patchset.

Changelog in v3:
There are huge changes in this version. Thanks to Igor, Stefan, Paolo, Eduardo
and Michael for their valuable comments; the patchset is finally in better shape.
- changes from Igor's comments:
  1) abstract dimm device type from pc-dimm and create nvdimm device based on
 dimm, then it uses memory backend device as nvdimm's memory and NUMA has
 easily been implemented.
  2) let file-backend device support any kind of filesystem not only for
 hugetlbfs and let it work on file not only for directory which is
 achieved by extending 'mem-path' - if it's a directory then it works as
 current behavior, otherwise if it's file then directly allocates memory
 from it.
  3) we figure out a unused memory hole below 4G that is 0xFF0 ~ 
 0xFFF0, this range is large enough for NVDIMM ACPI as build 64-bit
 ACPI SSDT/DSDT table will break windows XP.
 BTW, only make SSDT.rev = 2 can not work since the width is only depended
 on DSDT.rev based on 19.6.28 DefinitionBlock (Declare Definition Block)
 in ACPI spec:
| Note: For compatibility with ACPI versions before ACPI 2.0, the bit 
| width of Integer objects is dependent on the ComplianceRevision of the DSDT.
| If the ComplianceRevision is less than 2, all integers are restricted to 32 
| bits. Otherwise, full 64-bit integers are used. The version of the DSDT sets 
| the global integer width for all integers, including integers in SSDTs.
  4) use the lowest ACPI spec version to document AML terms.
  5) use "nvdimm" as nvdimm device name instead of "pc-nvdimm"

- changes from Stefan's comments:
  1) do not do endian adjustment in-place since _DSM memory is visible to guest
  2) use target platform's target page size instead of fixed PAGE_SIZE
 definition
  3) lots of code style improvement and typo fixes.
  4) live migration fix
- changes from Paolo's comments:
  1) improve the name of memory region
  
- other changes:
  1) return exact buffer

[PATCH v6 07/33] util: introduce qemu_file_get_page_size()

2015-10-30 Thread Xiao Guangrong

There are three places that use the same logic to get the page size of
a file path or file fd.

This patch introduces qemu_file_get_page_size() to unify the code

Signed-off-by: Xiao Guangrong 
---
 include/qemu/osdep.h |  1 +
 target-ppc/kvm.c | 21 +++--
 util/oslib-posix.c   | 16 
 util/oslib-win32.c   |  5 +
 4 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index b568424..d4dde02 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -302,4 +302,5 @@ int qemu_read_password(char *buf, int buf_size);
  */
 pid_t qemu_fork(Error **errp);
 
+size_t qemu_file_get_page_size(const char *mem_path);
 #endif
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index ac70f08..c661f1c 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -308,28 +308,13 @@ static void kvm_get_smmu_info(PowerPCCPU *cpu, struct 
kvm_ppc_smmu_info *info)
 
 static long gethugepagesize(const char *mem_path)
 {
-struct statfs fs;
-int ret;
-
-do {
-ret = statfs(mem_path, &fs);
-} while (ret != 0 && errno == EINTR);
+long size = qemu_file_get_page_size(mem_path);
 
-if (ret != 0) {
-fprintf(stderr, "Couldn't statfs() memory path: %s\n",
-strerror(errno));
+if (!size) {
 exit(1);
 }
 
-#define HUGETLBFS_MAGIC   0x958458f6
-
-if (fs.f_type != HUGETLBFS_MAGIC) {
-/* Explicit mempath, but it's ordinary pages */
-return getpagesize();
-}
-
-/* It's hugepage, return the huge page size */
-return fs.f_bsize;
+return size;
 }
 
 static int find_max_supported_pagesize(Object *obj, void *opaque)
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index 914cef5..ad94c5a 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -360,6 +360,22 @@ static size_t fd_getpagesize(int fd)
 return getpagesize();
 }
 
+size_t qemu_file_get_page_size(const char *path)
+{
+size_t size = 0;
+int fd = qemu_open(path, O_RDONLY);
+
+if (fd < 0) {
+fprintf(stderr, "Could not open %s.\n", path);
+goto exit;
+}
+
+size = fd_getpagesize(fd);
+qemu_close(fd);
+exit:
+return size;
+}
+
 void os_mem_prealloc(int fd, char *area, size_t memory)
 {
 int ret;
diff --git a/util/oslib-win32.c b/util/oslib-win32.c
index 09f9e98..a18aa87 100644
--- a/util/oslib-win32.c
+++ b/util/oslib-win32.c
@@ -462,6 +462,11 @@ size_t getpagesize(void)
 return system_info.dwPageSize;
 }
 
+size_t qemu_file_get_page_size(const char *path)
+{
+return getpagesize();
+}
+
 void os_mem_prealloc(int fd, char *area, size_t memory)
 {
 int i;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v6 24/33] nvdimm acpi: build ACPI NFIT table

2015-10-30 Thread Xiao Guangrong

NFIT is defined in ACPI 6.0: 5.2.25 NVDIMM Firmware Interface Table (NFIT)

Currently, we only support PMEM mode. Each device has 3 structures:
- SPA structure, defines the PMEM region info

- MEM DEV structure, which has the @handle used to associate it with the
  ACPI NVDIMM device that we will introduce in a later patch.
  We can also safely ignore the memory device's interleave, since the real
  nvdimm hardware access is hidden behind the host

- DCR structure, which defines the vendor ID used to associate the device
  with a vendor-specific nvdimm driver. Since we only implement PMEM mode
  this time, the Command window and Data window are not needed

Signed-off-by: Xiao Guangrong 
---
 hw/acpi/nvdimm.c| 355 
 hw/i386/acpi-build.c|   6 +
 include/hw/mem/nvdimm.h |  10 ++
 3 files changed, 371 insertions(+)

diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index 1223da2..dd84e5f 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -26,8 +26,348 @@
  * License along with this library; if not, see <http://www.gnu.org/licenses/>
  */
 
+#include "hw/acpi/acpi.h"
+#include "hw/acpi/aml-build.h"
 #include "hw/mem/nvdimm.h"
 
+static int nvdimm_plugged_device_list(Object *obj, void *opaque)
+{
+GSList **list = opaque;
+
+if (object_dynamic_cast(obj, TYPE_NVDIMM)) {
+NVDIMMDevice *nvdimm = NVDIMM(obj);
+
+if (memory_region_is_mapped(&nvdimm->nvdimm_mr)) {
+*list = g_slist_append(*list, DEVICE(obj));
+}
+}
+
+object_child_foreach(obj, nvdimm_plugged_device_list, opaque);
+return 0;
+}
+
+/*
+ * inquire plugged NVDIMM devices and link them into the list which is
+ * returned to the caller.
+ *
+ * Note: it is the caller's responsibility to free the list to avoid
+ * memory leak.
+ */
+static GSList *nvdimm_get_plugged_device_list(void)
+{
+GSList *list = NULL;
+
+object_child_foreach(qdev_get_machine(), nvdimm_plugged_device_list,
+ &list);
+return list;
+}
+
+#define NVDIMM_UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \
+   { (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
+ (b) & 0xff, ((b) >> 8) & 0xff, (c) & 0xff, ((c) >> 8) & 0xff,  \
+ (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }
+/*
+ * define Byte Addressable Persistent Memory (PM) Region according to
+ * ACPI 6.0: 5.2.25.1 System Physical Address Range Structure.
+ */
+static const uint8_t nvdimm_nfit_spa_uuid[] =
+  NVDIMM_UUID_LE(0x66f0d379, 0xb4f3, 0x4074, 0xac, 0x43, 0x0d, 0x33,
+ 0x18, 0xb7, 0x8c, 0xdb);
+
+/*
+ * NVDIMM Firmware Interface Table
+ * @signature: "NFIT"
+ *
+ * It provides information that allows OSPM to enumerate NVDIMM present in
+ * the platform and associate system physical address ranges created by the
+ * NVDIMMs.
+ *
+ * It is defined in ACPI 6.0: 5.2.25 NVDIMM Firmware Interface Table (NFIT)
+ */
+struct NvdimmNfitHeader {
+ACPI_TABLE_HEADER_DEF
+uint32_t reserved;
+} QEMU_PACKED;
+typedef struct NvdimmNfitHeader NvdimmNfitHeader;
+
+/*
+ * define NFIT structures according to ACPI 6.0: 5.2.25 NVDIMM Firmware
+ * Interface Table (NFIT).
+ */
+
+/*
+ * System Physical Address Range Structure
+ *
+ * It describes the system physical address ranges occupied by NVDIMMs and
+ * the types of the regions.
+ */
+struct NvdimmNfitSpa {
+uint16_t type;
+uint16_t length;
+uint16_t spa_index;
+uint16_t flags;
+uint32_t reserved;
+uint32_t proximity_domain;
+uint8_t type_guid[16];
+uint64_t spa_base;
+uint64_t spa_length;
+uint64_t mem_attr;
+} QEMU_PACKED;
+typedef struct NvdimmNfitSpa NvdimmNfitSpa;
+
+/*
+ * Memory Device to System Physical Address Range Mapping Structure
+ *
+ * It enables identifying each NVDIMM region and the corresponding SPA
+ * describing the memory interleave
+ */
+struct NvdimmNfitMemDev {
+uint16_t type;
+uint16_t length;
+uint32_t nfit_handle;
+uint16_t phys_id;
+uint16_t region_id;
+uint16_t spa_index;
+uint16_t dcr_index;
+uint64_t region_len;
+uint64_t region_offset;
+uint64_t region_dpa;
+uint16_t interleave_index;
+uint16_t interleave_ways;
+uint16_t flags;
+uint16_t reserved;
+} QEMU_PACKED;
+typedef struct NvdimmNfitMemDev NvdimmNfitMemDev;
+
+/*
+ * NVDIMM Control Region Structure
+ *
+ * It describes the NVDIMM and if applicable, Block Control Window.
+ */
+struct NvdimmNfitControlRegion {
+uint16_t type;
+uint16_t length;
+uint16_t dcr_index;
+uint16_t vendor_id;
+uint16_t device_id;
+uint16_t revision_id;
+uint16_t sub_vendor_id;
+uint16_t sub_device_id;
+uint16_t sub_revision_id;
+uint8_t reserved[6];
+uint32_t serial_number;
+uint16_t fic;
+uint16_t num_bcw;
+uint64_t bcw_size;
+uint64_t cmd_offset;
+uint64_t cmd_size;
+uint64_t status_offset;
+uint64_t status_size;
+uint16_t flags;
+uint8_t

[PATCH v6 15/33] stubs: rename qmp_pc_dimm_device_list.c

2015-10-30 Thread Xiao Guangrong

Rename qmp_pc_dimm_device_list.c to qmp_dimm_device_list.c

Signed-off-by: Xiao Guangrong 
---
 stubs/Makefile.objs | 2 +-
 stubs/{qmp_pc_dimm_device_list.c => qmp_dimm_device_list.c} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename stubs/{qmp_pc_dimm_device_list.c => qmp_dimm_device_list.c} (100%)

diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs
index 251443b..d5c862a 100644
--- a/stubs/Makefile.objs
+++ b/stubs/Makefile.objs
@@ -32,6 +32,6 @@ stub-obj-y += vmstate.o
 stub-obj-$(CONFIG_WIN32) += fd-register.o
 stub-obj-y += cpus.o
 stub-obj-y += kvm.o
-stub-obj-y += qmp_pc_dimm_device_list.o
+stub-obj-y += qmp_dimm_device_list.o
 stub-obj-y += target-monitor-defs.o
 stub-obj-y += vhost.o
diff --git a/stubs/qmp_pc_dimm_device_list.c b/stubs/qmp_dimm_device_list.c
similarity index 100%
rename from stubs/qmp_pc_dimm_device_list.c
rename to stubs/qmp_dimm_device_list.c
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v6 27/33] nvdimm acpi: support function 0

2015-10-30 Thread Xiao Guangrong

_DSM is defined in ACPI 6.0: 9.14.1 _DSM (Device Specific Method)

Function 0 is a query function. We do not support any function on the root
device, and only 3 functions are supported for an NVDIMM device: Get Namespace
Label Size, Get Namespace Label Data and Set Namespace Label Data. That
means we currently only allow access to the device's Label Namespace

Reviewed-by: Stefan Hajnoczi 
Signed-off-by: Xiao Guangrong 
---
 hw/acpi/aml-build.c |   2 +-
 hw/acpi/nvdimm.c| 156 +++-
 include/hw/acpi/aml-build.h |   1 +
 3 files changed, 157 insertions(+), 2 deletions(-)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index 8bee8b2..90229c5 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -231,7 +231,7 @@ static void build_extop_package(GArray *package, uint8_t op)
 build_prepend_byte(package, 0x5B); /* ExtOpPrefix */
 }
 
-static void build_append_int_noprefix(GArray *table, uint64_t value, int size)
+void build_append_int_noprefix(GArray *table, uint64_t value, int size)
 {
 int i;
 
diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index e179a72..300a3aa 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -212,6 +212,22 @@ static uint32_t nvdimm_slot_to_dcr_index(int slot)
 return nvdimm_slot_to_spa_index(slot) + 1;
 }
 
+static NVDIMMDevice
+*nvdimm_get_device_by_handle(GSList *list, uint32_t handle)
+{
+for (; list; list = list->next) {
+NVDIMMDevice *nvdimm = list->data;
+int slot = object_property_get_int(OBJECT(nvdimm), DIMM_SLOT_PROP,
+   NULL);
+
+if (nvdimm_slot_to_handle(slot) == handle) {
+return nvdimm;
+}
+}
+
+return NULL;
+}
+
 /* ACPI 6.0: 5.2.25.1 System Physical Address Range Structure */
 static void
 nvdimm_build_structure_spa(GArray *structures, NVDIMMDevice *nvdimm)
@@ -368,6 +384,29 @@ static void nvdimm_build_nfit(GSList *device_list, GArray 
*table_offsets,
 g_array_free(structures, true);
 }
 
+/* define NVDIMM DSM return status codes according to DSM Spec Rev1. */
+enum {
+/* Common return status codes. */
+/* Success */
+NVDIMM_DSM_STATUS_SUCCESS = 0,
+/* Not Supported */
+NVDIMM_DSM_STATUS_NOT_SUPPORTED = 1,
+
+/* NVDIMM Root Device _DSM function return status codes*/
+/* Invalid Input Parameters */
+NVDIMM_DSM_ROOT_DEV_STATUS_INVALID_PARAS = 2,
+/* Function-Specific Error */
+NVDIMM_DSM_ROOT_DEV_STATUS_FUNCTION_SPECIFIC_ERROR = 3,
+
+/* NVDIMM Device (non-root) _DSM function return status codes*/
+/* Non-Existing Memory Device */
+NVDIMM_DSM_DEV_STATUS_NON_EXISTING_MEM_DEV = 2,
+/* Invalid Input Parameters */
+NVDIMM_DSM_DEV_STATUS_INVALID_PARAS = 3,
+/* Vendor Specific Error */
+NVDIMM_DSM_DEV_STATUS_VENDOR_SPECIFIC_ERROR = 4,
+};
+
 struct NvdimmDsmIn {
 uint32_t handle;
 uint32_t revision;
@@ -377,10 +416,125 @@ struct NvdimmDsmIn {
 } QEMU_PACKED;
 typedef struct NvdimmDsmIn NvdimmDsmIn;
 
+static void nvdimm_dsm_write_status(GArray *out, uint32_t status)
+{
+status = cpu_to_le32(status);
+build_append_int_noprefix(out, status, sizeof(status));
+}
+
+static void nvdimm_dsm_root(NvdimmDsmIn *in, GArray *out)
+{
+uint32_t status = NVDIMM_DSM_STATUS_NOT_SUPPORTED;
+
+/*
+ * Query command implemented per ACPI Specification, it is defined in
+ * ACPI 6.0: 9.14.1 _DSM (Device Specific Method).
+ */
+if (in->function == 0x0) {
+/*
+ * Set it to zero to indicate no function is supported for NVDIMM
+ * root.
+ */
+uint64_t cmd_list = cpu_to_le64(0);
+
+build_append_int_noprefix(out, cmd_list, sizeof(cmd_list));
+return;
+}
+
+nvdimm_debug("Return status %#x.\n", status);
+nvdimm_dsm_write_status(out, status);
+}
+
+static void nvdimm_dsm_device(NvdimmDsmIn *in, GArray *out)
+{
+GSList *list = nvdimm_get_plugged_device_list();
+NVDIMMDevice *nvdimm = nvdimm_get_device_by_handle(list, in->handle);
+uint32_t status = NVDIMM_DSM_DEV_STATUS_NON_EXISTING_MEM_DEV;
+uint64_t cmd_list;
+
+if (!nvdimm) {
+goto set_status_free;
+}
+
+/* Encode DSM function according to DSM Spec Rev1. */
+switch (in->function) {
+/* see comments in nvdimm_dsm_root(). */
+case 0x0:
+cmd_list = cpu_to_le64(0x1 /* Bit 0 indicates whether there is
+  support for any functions other
+  than function 0.
+*/   |
+   1 << 4 /* Get Namespace Label Size */ |
+   1 << 5 /* Get Namespace Label Data */ |
+   1 << 6 /* Set Namespace Label Data */);
+build_append_int_noprefix(out, cmd_list, sizeof(cmd_list));
+goto free;
+default:

[PATCH v6 11/33] hostmem-file: use whole file size if possible

2015-10-30 Thread Xiao Guangrong

Use the whole file size if @size is not specified, which is useful
if we want to pass a file directly to the guest.

Signed-off-by: Xiao Guangrong 
---
 backends/hostmem-file.c | 48 
 1 file changed, 44 insertions(+), 4 deletions(-)

diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c
index 9097a57..e1bc9ff 100644
--- a/backends/hostmem-file.c
+++ b/backends/hostmem-file.c
@@ -9,6 +9,9 @@
  * This work is licensed under the terms of the GNU GPL, version 2 or later.
  * See the COPYING file in the top-level directory.
  */
+#include 
+#include 
+
 #include "qemu-common.h"
 #include "sysemu/hostmem.h"
 #include "sysemu/sysemu.h"
@@ -33,20 +36,57 @@ struct HostMemoryBackendFile {
 char *mem_path;
 };
 
+static uint64_t get_file_size(const char *file)
+{
+struct stat stat_buf;
+uint64_t size = 0;
+int fd;
+
+fd = open(file, O_RDONLY);
+if (fd < 0) {
+return 0;
+}
+
+if (stat(file, &stat_buf) < 0) {
+goto exit;
+}
+
+if ((S_ISBLK(stat_buf.st_mode)) && !ioctl(fd, BLKGETSIZE64, &size)) {
+goto exit;
+}
+
+size = lseek(fd, 0, SEEK_END);
+if (size == -1) {
+size = 0;
+}
+exit:
+close(fd);
+return size;
+}
+
 static void
 file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
 {
 HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(backend);
 
-if (!backend->size) {
-error_setg(errp, "can't create backend with size 0");
-return;
-}
 if (!fb->mem_path) {
 error_setg(errp, "mem-path property not set");
 return;
 }
 
+if (!backend->size) {
+/*
+ * use the whole file size if @size is not specified.
+ */
+backend->size = get_file_size(fb->mem_path);
+}
+
+if (!backend->size) {
+error_setg(errp, "failed to get file size for %s, can't create "
+ "backend on it", mem_path);
+return;
+}
+
 backend->force_prealloc = mem_prealloc;
 memory_region_init_ram_from_file(&backend->mr, OBJECT(backend),
  object_get_canonical_path(OBJECT(backend)),
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v6 08/33] exec: allow memory to be allocated from any kind of path

2015-10-30 Thread Xiao Guangrong

Currently file_ram_alloc() is designed for hugetlbfs; however, the memory
of an nvdimm can come from either a raw pmem device, e.g. /dev/pmem, or a
file located on a DAX-enabled filesystem.

So this patch lets it work on any kind of path.

Signed-off-by: Xiao Guangrong 
---
 exec.c | 56 +---
 1 file changed, 17 insertions(+), 39 deletions(-)

diff --git a/exec.c b/exec.c
index 8af2570..3ca7e50 100644
--- a/exec.c
+++ b/exec.c
@@ -1174,32 +1174,6 @@ void qemu_mutex_unlock_ramlist(void)
 }
 
 #ifdef __linux__
-
-#include 
-
-#define HUGETLBFS_MAGIC   0x958458f6
-
-static long gethugepagesize(const char *path, Error **errp)
-{
-struct statfs fs;
-int ret;
-
-do {
-ret = statfs(path, &fs);
-} while (ret != 0 && errno == EINTR);
-
-if (ret != 0) {
-error_setg_errno(errp, errno, "failed to get page size of file %s",
- path);
-return 0;
-}
-
-if (fs.f_type != HUGETLBFS_MAGIC)
-fprintf(stderr, "Warning: path not on HugeTLBFS: %s\n", path);
-
-return fs.f_bsize;
-}
-
 static void *file_ram_alloc(RAMBlock *block,
 ram_addr_t memory,
 const char *path,
@@ -1210,20 +1184,24 @@ static void *file_ram_alloc(RAMBlock *block,
 char *c;
 void *area;
 int fd;
-uint64_t hpagesize;
-Error *local_err = NULL;
+uint64_t pagesize;
 
-hpagesize = gethugepagesize(path, &local_err);
-if (local_err) {
-error_propagate(errp, local_err);
+pagesize = qemu_file_get_page_size(path);
+if (!pagesize) {
+error_setg(errp, "can't get page size for %s", path);
 goto error;
 }
-block->mr->align = hpagesize;
 
-if (memory < hpagesize) {
+if (pagesize == getpagesize()) {
+fprintf(stderr, "Memory is not allocated from HugeTlbfs.\n");
+}
+
+block->mr->align = pagesize;
+
+if (memory < pagesize) {
 error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
-   "or larger than huge page size 0x%" PRIx64,
-   memory, hpagesize);
+   "or larger than page size 0x%" PRIx64,
+   memory, pagesize);
 goto error;
 }
 
@@ -1247,14 +1225,14 @@ static void *file_ram_alloc(RAMBlock *block,
 fd = mkstemp(filename);
 if (fd < 0) {
 error_setg_errno(errp, errno,
- "unable to create backing store for hugepages");
+ "unable to create backing store for path %s", path);
 g_free(filename);
 goto error;
 }
 unlink(filename);
 g_free(filename);
 
-memory = ROUND_UP(memory, hpagesize);
+memory = ROUND_UP(memory, pagesize);
 
 /*
  * ftruncate is not supported by hugetlbfs in older
@@ -1266,10 +1244,10 @@ static void *file_ram_alloc(RAMBlock *block,
 perror("ftruncate");
 }
 
-area = qemu_ram_mmap(fd, memory, hpagesize, block->flags & RAM_SHARED);
+area = qemu_ram_mmap(fd, memory, pagesize, block->flags & RAM_SHARED);
 if (area == MAP_FAILED) {
 error_setg_errno(errp, errno,
- "unable to map backing store for hugepages");
+ "unable to map backing store for path %s", path);
 close(fd);
 goto error;
 }
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v6 31/33] nvdimm: allow using whole backend memory as pmem

2015-10-30 Thread Xiao Guangrong

Introduce a parameter named "reserve-label-data". If it is
false, QEMU does not reserve any region on the backend memory
to support label data. This is a 'label-less' NVDIMM device
mode in which Linux uses the whole memory on the device as a
single namespace.

This is useful for users who want to pass through a whole nvdimm
device and make its data completely visible to the guest.

The parameter is false by default.

Signed-off-by: Xiao Guangrong 
---
 hw/acpi/nvdimm.c| 12 
 hw/mem/nvdimm.c | 43 ---
 include/hw/mem/nvdimm.h |  6 ++
 3 files changed, 54 insertions(+), 7 deletions(-)

diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index 5c8be41..f8d7d19 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -531,6 +531,12 @@ static void nvdimm_dsm_func_label_size(NVDIMMDevice 
*nvdimm, GArray *out)
 static uint32_t nvdimm_rw_label_data_check(NVDIMMDevice *nvdimm,
uint32_t offset, uint32_t length)
 {
+if (!nvdimm->reserve_label_data) {
+nvdimm_debug("read/write label request on the device without "
+ "label data reserved.\n");
+return NVDIMM_DSM_STATUS_NOT_SUPPORTED;
+}
+
 if (offset + length < offset) {
 nvdimm_debug("offset %#x + length %#x is overflow.\n", offset,
  length);
@@ -637,6 +643,12 @@ static void nvdimm_dsm_device(NvdimmDsmIn *in, GArray *out)
1 << 4 /* Get Namespace Label Size */ |
1 << 5 /* Get Namespace Label Data */ |
1 << 6 /* Set Namespace Label Data */);
+
+/* no function support if the device does not have label data. */
+if (!nvdimm->reserve_label_data) {
+cmd_list = cpu_to_le64(0);
+}
+
 build_append_int_noprefix(out, cmd_list, sizeof(cmd_list));
 goto free;
 case 0x4 /* Get Namespace Label Size */:
diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index 185aa1a..1d89165 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -36,14 +36,15 @@ static void nvdimm_realize(DIMMDevice *dimm, Error **errp)
 {
 MemoryRegion *mr;
 NVDIMMDevice *nvdimm = NVDIMM(dimm);
-uint64_t size;
+uint64_t reserved_label_size, size;
 
 nvdimm->label_size = MIN_NAMESPACE_LABEL_SIZE;
+reserved_label_size = nvdimm->reserve_label_data ? nvdimm->label_size : 0;
 
 mr = host_memory_backend_get_memory(dimm->hostmem, errp);
 size = memory_region_size(mr);
 
-if (size <= nvdimm->label_size) {
+if (size <= reserved_label_size) {
 char *path = 
object_get_canonical_path_component(OBJECT(dimm->hostmem));
 error_setg(errp, "the size of memdev %s (0x%" PRIx64 ") is too small"
" to contain nvdimm namespace label (0x%" PRIx64 ")", path,
@@ -52,15 +53,19 @@ static void nvdimm_realize(DIMMDevice *dimm, Error **errp)
 }
 
 memory_region_init_alias(&nvdimm->nvdimm_mr, OBJECT(dimm), "nvdimm-memory",
- mr, 0, size - nvdimm->label_size);
-nvdimm->label_data = memory_region_get_ram_ptr(mr) +
- memory_region_size(&nvdimm->nvdimm_mr);
+ mr, 0, size - reserved_label_size);
+
+if (reserved_label_size) {
+nvdimm->label_data = memory_region_get_ram_ptr(mr) +
+ memory_region_size(&nvdimm->nvdimm_mr);
+}
 }
 
 static void nvdimm_read_label_data(NVDIMMDevice *nvdimm, void *buf,
uint64_t size, uint64_t offset)
 {
-assert((nvdimm->label_size >= size + offset) && (offset + size > offset));
+assert(nvdimm->reserve_label_data &&
+   (nvdimm->label_size >= size + offset) && (offset + size > offset));
 
 memcpy(buf, nvdimm->label_data + offset, size);
 }
@@ -72,7 +77,8 @@ static void nvdimm_write_label_data(NVDIMMDevice *nvdimm, 
const void *buf,
 DIMMDevice *dimm = DIMM(nvdimm);
 uint64_t backend_offset;
 
-assert((nvdimm->label_size >= size + offset) && (offset + size > offset));
+assert(nvdimm->reserve_label_data &&
+   (nvdimm->label_size >= size + offset) && (offset + size > offset));
 
 memcpy(nvdimm->label_data + offset, buf, size);
 
@@ -97,10 +103,33 @@ static void nvdimm_class_init(ObjectClass *oc, void *data)
 nvc->write_label_data = nvdimm_write_label_data;
 }
 
+static bool nvdimm_get_reserve_label_data(Object *obj, Error **errp)
+{
+NVDIMMDevice *nvdimm = NVDIMM(obj);
+
+return nvdimm->reserve_label_data;
+}
+
+static void
+nvdimm_set_reserve_label_data(Object *obj, bool value, Error **errp)
+{
+NVDIMMDevice *nvdimm = NVDIMM(obj);
+
+nvdimm->reserve_label_data = value;
+}
+
+static void nvdimm_init(Object *obj)
+{
+object_property_add_bool(obj, "reserve-label-data",
+ nvdimm_get_reserve_label_data,
+

[PATCH v6 33/33] nvdimm: add maintain info

2015-10-30 Thread Xiao Guangrong

Add NVDIMM maintainer

Signed-off-by: Xiao Guangrong 
---
 MAINTAINERS | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 3144113..865c0cf 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -907,6 +907,13 @@ M: Jiri Pirko 
 S: Maintained
 F: hw/net/rocker/
 
+NVDIMM
+M: Xiao Guangrong 
+S: Maintained
+F: hw/acpi/nvdimm.c
+F: hw/mem/nvdimm.c
+F: include/hw/mem/nvdimm.h
+
 Subsystems
 --
 Audio
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v6 10/33] hostmem-file: clean up memory allocation

2015-10-30 Thread Xiao Guangrong

- hostmem-file.c is compiled only if CONFIG_LINUX is enabled, so it is
  unnecessary to do the same check in the source file

- the interface, HostMemoryBackendClass->alloc(), is not called many
  times, so there is no need to check if the memory-region is initialized

Signed-off-by: Xiao Guangrong 
---
 backends/hostmem-file.c | 11 +++
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c
index e9b6d21..9097a57 100644
--- a/backends/hostmem-file.c
+++ b/backends/hostmem-file.c
@@ -46,17 +46,12 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error 
**errp)
 error_setg(errp, "mem-path property not set");
 return;
 }
-#ifndef CONFIG_LINUX
-error_setg(errp, "-mem-path not supported on this host");
-#else
-if (!memory_region_size(>mr)) {
-backend->force_prealloc = mem_prealloc;
-memory_region_init_ram_from_file(>mr, OBJECT(backend),
+
+backend->force_prealloc = mem_prealloc;
+memory_region_init_ram_from_file(>mr, OBJECT(backend),
  object_get_canonical_path(OBJECT(backend)),
  backend->size, fb->share,
  fb->mem_path, errp);
-}
-#endif
 }
 
 static void
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v6 29/33] nvdimm acpi: support Get Namespace Label Data function

2015-10-30 Thread Xiao Guangrong

Function 5 is used to get Namespace Label Data

Signed-off-by: Xiao Guangrong 
---
 hw/acpi/nvdimm.c | 63 
 1 file changed, 63 insertions(+)

diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index 67c4699..8c27b25 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -428,6 +428,7 @@ struct NvdimmDsmIn {
 union {
 uint8_t arg3[0];
 NvdimmFuncInSetLabelData func_set_label_data;
+NvdimmFuncInGetLabelData func_get_label_data;
 };
 } QEMU_PACKED;
 typedef struct NvdimmDsmIn NvdimmDsmIn;
@@ -527,6 +528,65 @@ static void nvdimm_dsm_func_label_size(NVDIMMDevice 
*nvdimm, GArray *out)
 g_array_append_vals(out, _label_size, sizeof(func_label_size));
 }
 
+static uint32_t nvdimm_rw_label_data_check(NVDIMMDevice *nvdimm,
+   uint32_t offset, uint32_t length)
+{
+if (offset + length < offset) {
+nvdimm_debug("offset %#x + length %#x is overflow.\n", offset,
+ length);
+return NVDIMM_DSM_DEV_STATUS_INVALID_PARAS;
+}
+
+if (nvdimm->label_size < offset + length) {
+nvdimm_debug("position %#x is beyond label data (len = %#lx).\n",
+ offset + length, nvdimm->label_size);
+return NVDIMM_DSM_DEV_STATUS_INVALID_PARAS;
+}
+
+if (length > nvdimm_get_max_xfer_label_size()) {
+nvdimm_debug("length (%#x) is larger than max_xfer (%#x).\n",
+ length, nvdimm_get_max_xfer_label_size());
+return NVDIMM_DSM_DEV_STATUS_INVALID_PARAS;
+}
+
+return NVDIMM_DSM_STATUS_SUCCESS;
+}
+
+/*
+ * DSM Spec Rev1 4.5 Get Namespace Label Data (Function Index 5).
+ */
+static void nvdimm_dsm_func_get_label_data(NVDIMMDevice *nvdimm,
+   NvdimmDsmIn *in, GArray *out)
+{
+NVDIMMClass *nvc = NVDIMM_GET_CLASS(nvdimm);
+NvdimmFuncInGetLabelData *get_label_data = >func_get_label_data;
+void *buf;
+uint32_t status;
+
+le32_to_cpus(_label_data->offset);
+le32_to_cpus(_label_data->length);
+
+nvdimm_debug("Read Label Data: offset %#x length %#x.\n",
+ get_label_data->offset, get_label_data->length);
+
+status = nvdimm_rw_label_data_check(nvdimm, get_label_data->offset,
+get_label_data->length);
+if (status != NVDIMM_DSM_STATUS_SUCCESS) {
+goto exit;
+}
+
+/* write nvdimm_func_out_get_label_data.status. */
+nvdimm_dsm_write_status(out, status);
+/* write nvdimm_func_out_get_label_data.out_buf. */
+buf = acpi_data_push(out, get_label_data->length);
+nvc->read_label_data(nvdimm, buf, get_label_data->length,
+ get_label_data->offset);
+return;
+
+exit:
+nvdimm_dsm_write_status(out, status);
+}
+
 static void nvdimm_dsm_device(NvdimmDsmIn *in, GArray *out)
 {
 GSList *list = nvdimm_get_plugged_device_list();
@@ -554,6 +614,9 @@ static void nvdimm_dsm_device(NvdimmDsmIn *in, GArray *out)
 case 0x4 /* Get Namespace Label Size */:
 nvdimm_dsm_func_label_size(nvdimm, out);
 goto free;
+case 0x5 /* Get Namespace Label Data */:
+nvdimm_dsm_func_get_label_data(nvdimm, in, out);
+goto free;
 default:
 status = NVDIMM_DSM_STATUS_NOT_SUPPORTED;
 };
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v6 26/33] nvdimm acpi: save arg3 for NVDIMM device _DSM method

2015-10-30 Thread Xiao Guangrong

Check if the input Arg3 is valid then store it into dsm_in if needed

Signed-off-by: Xiao Guangrong 
---
 hw/acpi/nvdimm.c | 27 ++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index 53ed675..e179a72 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -524,13 +524,38 @@ static void nvdimm_build_acpi_devices(GSList 
*device_list, Aml *sb_scope)
 
 method = aml_method_serialized("NCAL", 4);
 {
-Aml *buffer_size = aml_local(0);
+Aml *ifctx, *pckg, *buffer_size = aml_local(0);
 
 aml_append(method, aml_store(aml_arg(0), aml_name("HDLE")));
 aml_append(method, aml_store(aml_arg(1), aml_name("REVS")));
 aml_append(method, aml_store(aml_arg(2), aml_name("FUNC")));
 
 /*
+ * The fourth parameter (Arg3) of _DSM is a package which contains
+ * a buffer, the layout of the buffer is specified by UUID (Arg0),
+ * Revision ID (Arg1) and Function Index (Arg2) which are documented
+ * in the DSM Spec.
+ */
+pckg = aml_arg(3);
+ifctx = aml_if(aml_and(aml_equal(aml_object_type(pckg),
+ aml_int(4 /* Package */)),
+   aml_equal(aml_sizeof(pckg),
+ aml_int(1;
+{
+Aml *pckg_index, *pckg_buf;
+
+pckg_index = aml_local(2);
+pckg_buf = aml_local(3);
+
+aml_append(ifctx, aml_store(aml_index(pckg, aml_int(0)),
+pckg_index));
+aml_append(ifctx, aml_store(aml_derefof(pckg_index),
+pckg_buf));
+aml_append(ifctx, aml_store(pckg_buf, aml_name("ARG3")));
+}
+aml_append(method, ifctx);
+
+/*
  * transfer control to QEMU and the buffer size filled by
  * QEMU is returned.
  */
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v6 01/33] acpi: add aml_derefof

2015-10-30 Thread Xiao Guangrong

Implement DeRefOf term which is used by NVDIMM _DSM method in later patch

Reviewed-by: Igor Mammedov 
Signed-off-by: Xiao Guangrong 
---
 hw/acpi/aml-build.c | 8 
 include/hw/acpi/aml-build.h | 1 +
 2 files changed, 9 insertions(+)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index 0d4b324..cbd53f4 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -1135,6 +1135,14 @@ Aml *aml_unicode(const char *str)
 return var;
 }
 
+/* ACPI 1.0b: 16.2.5.4 Type 2 Opcodes Encoding: DefDerefOf */
+Aml *aml_derefof(Aml *arg)
+{
+Aml *var = aml_opcode(0x83 /* DerefOfOp */);
+aml_append(var, arg);
+return var;
+}
+
 void
 build_header(GArray *linker, GArray *table_data,
  AcpiTableHeader *h, const char *sig, int len, uint8_t rev)
diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h
index 1b632dc..5a03d33 100644
--- a/include/hw/acpi/aml-build.h
+++ b/include/hw/acpi/aml-build.h
@@ -274,6 +274,7 @@ Aml *aml_create_dword_field(Aml *srcbuf, Aml *index, const 
char *name);
 Aml *aml_varpackage(uint32_t num_elements);
 Aml *aml_touuid(const char *uuid);
 Aml *aml_unicode(const char *str);
+Aml *aml_derefof(Aml *arg);
 
 void
 build_header(GArray *linker, GArray *table_data,
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v6 25/33] nvdimm acpi: build ACPI nvdimm devices

2015-10-30 Thread Xiao Guangrong

NVDIMM devices are defined in ACPI 6.0 9.20 NVDIMM Devices

There is a root device under \_SB and specified NVDIMM devices are under the
root device. Each NVDIMM device has _ADR which returns its handle used to
associate MEMDEV structure in NFIT

We reserve handle 0 for root device. In this patch, we save handle,
arg1 and arg2 to dsm memory. Arg3 is conditionally saved in later patch

Signed-off-by: Xiao Guangrong 
---
 hw/acpi/nvdimm.c | 184 +++
 1 file changed, 184 insertions(+)

diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index dd84e5f..53ed675 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -368,6 +368,15 @@ static void nvdimm_build_nfit(GSList *device_list, GArray 
*table_offsets,
 g_array_free(structures, true);
 }
 
+struct NvdimmDsmIn {
+uint32_t handle;
+uint32_t revision;
+uint32_t function;
+   /* the remaining size in the page is used by arg3. */
+uint8_t arg3[0];
+} QEMU_PACKED;
+typedef struct NvdimmDsmIn NvdimmDsmIn;
+
 static uint64_t
 nvdimm_dsm_read(void *opaque, hwaddr addr, unsigned size)
 {
@@ -377,6 +386,7 @@ nvdimm_dsm_read(void *opaque, hwaddr addr, unsigned size)
 static void
 nvdimm_dsm_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
 {
+fprintf(stderr, "BUG: we never write DSM notification IO Port.\n");
 }
 
 static const MemoryRegionOps nvdimm_dsm_ops = {
@@ -402,6 +412,179 @@ void nvdimm_init_acpi_state(MemoryRegion *memory, 
MemoryRegion *io,
 memory_region_add_subregion(io, NVDIMM_ACPI_IO_BASE, >io_mr);
 }
 
+#define BUILD_STA_METHOD(_dev_, _method_)  \
+do {   \
+_method_ = aml_method("_STA", 0);  \
+aml_append(_method_, aml_return(aml_int(0x0f)));   \
+aml_append(_dev_, _method_);   \
+} while (0)
+
+#define BUILD_DSM_METHOD(_dev_, _method_, _handle_, _uuid_)\
+do {   \
+Aml *ifctx, *uuid; \
+_method_ = aml_method("_DSM", 4);  \
+/* check UUID if it is we expect, return the errorcode if not.*/   \
+uuid = aml_touuid(_uuid_); \
+ifctx = aml_if(aml_lnot(aml_equal(aml_arg(0), uuid))); \
+aml_append(ifctx, aml_return(aml_int(1 /* Not Supported */))); \
+aml_append(method, ifctx); \
+aml_append(method, aml_return(aml_call4("NCAL", aml_int(_handle_), \
+   aml_arg(1), aml_arg(2), aml_arg(3;  \
+aml_append(_dev_, _method_);   \
+} while (0)
+
+#define BUILD_FIELD_UNIT_SIZE(_field_, _byte_, _name_) \
+aml_append(_field_, aml_named_field(_name_, (_byte_) * BITS_PER_BYTE))
+
+#define BUILD_FIELD_UNIT_STRUCT(_field_, _s_, _f_, _name_) \
+BUILD_FIELD_UNIT_SIZE(_field_, sizeof(typeof_field(_s_, _f_)), _name_)
+
+static void build_nvdimm_devices(GSList *device_list, Aml *root_dev)
+{
+for (; device_list; device_list = device_list->next) {
+NVDIMMDevice *nvdimm = device_list->data;
+int slot = object_property_get_int(OBJECT(nvdimm), DIMM_SLOT_PROP,
+   NULL);
+uint32_t handle = nvdimm_slot_to_handle(slot);
+Aml *dev, *method;
+
+dev = aml_device("NV%02X", slot);
+aml_append(dev, aml_name_decl("_ADR", aml_int(handle)));
+
+BUILD_STA_METHOD(dev, method);
+
+/*
+ * Chapter 4: _DSM Interface for NVDIMM Device (non-root) - Example
+ * in DSM Spec Rev1.
+ */
+BUILD_DSM_METHOD(dev, method,
+ handle /* NVDIMM Device Handle */,
+ "4309AC30-0D11-11E4-9191-0800200C9A66"
+ /* UUID for NVDIMM Devices. */);
+
+aml_append(root_dev, dev);
+}
+}
+
+static void nvdimm_build_acpi_devices(GSList *device_list, Aml *sb_scope)
+{
+Aml *dev, *method, *field;
+uint64_t page_size = TARGET_PAGE_SIZE;
+
+dev = aml_device("NVDR");
+aml_append(dev, aml_name_decl("_HID", aml_string("ACPI0012")));
+
+/* map DSM memory and IO into ACPI namespace. */
+aml_append(dev, aml_operation_region("NPIO", AML_SYSTEM_IO,
+   NVDIMM_ACPI_IO_BASE, NVDIMM_ACPI_IO_LEN));
+aml_append(dev, aml_operation_region("NRAM", AML_SYSTEM_MEMORY,
+   NVDIMM_ACPI_MEM_BASE, page_size));
+
+/*
+ * DSM notifier:
+ * @NOTI: Read it will notify QEMU that _DSM method is being
+ *called and the parameters can be found in NvdimmDsmIn.
+ *The value read

[PATCH v6 30/33] nvdimm acpi: support Set Namespace Label Data function

2015-10-30 Thread Xiao Guangrong

Function 6 is used to set Namespace Label Data

Signed-off-by: Xiao Guangrong 
---
 hw/acpi/nvdimm.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index 8c27b25..5c8be41 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -587,6 +587,34 @@ exit:
 nvdimm_dsm_write_status(out, status);
 }
 
+/*
+ * DSM Spec Rev1 4.6 Set Namespace Label Data (Function Index 6).
+ */
+static void nvdimm_dsm_func_set_label_data(NVDIMMDevice *nvdimm,
+   NvdimmDsmIn *in, GArray *out)
+{
+NVDIMMClass *nvc = NVDIMM_GET_CLASS(nvdimm);
+NvdimmFuncInSetLabelData *set_label_data = >func_set_label_data;
+uint32_t status;
+
+le32_to_cpus(_label_data->offset);
+le32_to_cpus(_label_data->length);
+
+nvdimm_debug("Write Label Data: offset %#x length %#x.\n",
+ set_label_data->offset, set_label_data->length);
+
+status = nvdimm_rw_label_data_check(nvdimm, set_label_data->offset,
+set_label_data->length);
+if (status != NVDIMM_DSM_STATUS_SUCCESS) {
+goto exit;
+}
+
+nvc->write_label_data(nvdimm, set_label_data->in_buf,
+  set_label_data->length, set_label_data->offset);
+exit:
+nvdimm_dsm_write_status(out, status);
+}
+
 static void nvdimm_dsm_device(NvdimmDsmIn *in, GArray *out)
 {
 GSList *list = nvdimm_get_plugged_device_list();
@@ -617,6 +645,9 @@ static void nvdimm_dsm_device(NvdimmDsmIn *in, GArray *out)
 case 0x5 /* Get Namespace Label Data */:
 nvdimm_dsm_func_get_label_data(nvdimm, in, out);
 goto free;
+case 0x6 /* Set Namespace Label Data */:
+nvdimm_dsm_func_set_label_data(nvdimm, in, out);
+goto free;
 default:
 status = NVDIMM_DSM_STATUS_NOT_SUPPORTED;
 };
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v6 02/33] acpi: add aml_sizeof

2015-10-30 Thread Xiao Guangrong

Implement SizeOf term which is used by NVDIMM _DSM method in later patch

Reviewed-by: Igor Mammedov 
Signed-off-by: Xiao Guangrong 
---
 hw/acpi/aml-build.c | 8 
 include/hw/acpi/aml-build.h | 1 +
 2 files changed, 9 insertions(+)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index cbd53f4..a72214d 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -1143,6 +1143,14 @@ Aml *aml_derefof(Aml *arg)
 return var;
 }
 
+/* ACPI 1.0b: 16.2.5.4 Type 2 Opcodes Encoding: DefSizeOf */
+Aml *aml_sizeof(Aml *arg)
+{
+Aml *var = aml_opcode(0x87 /* SizeOfOp */);
+aml_append(var, arg);
+return var;
+}
+
 void
 build_header(GArray *linker, GArray *table_data,
  AcpiTableHeader *h, const char *sig, int len, uint8_t rev)
diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h
index 5a03d33..7296efb 100644
--- a/include/hw/acpi/aml-build.h
+++ b/include/hw/acpi/aml-build.h
@@ -275,6 +275,7 @@ Aml *aml_varpackage(uint32_t num_elements);
 Aml *aml_touuid(const char *uuid);
 Aml *aml_unicode(const char *str);
 Aml *aml_derefof(Aml *arg);
+Aml *aml_sizeof(Aml *arg);
 
 void
 build_header(GArray *linker, GArray *table_data,
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 05/21] KVM: ARM64: Add reset and access handlers for PMSELR register

2015-10-30 Thread Shannon Zhao

From: Shannon Zhao 

Since the reset value of PMSELR_EL0 is UNKNOWN, use reset_unknown for
its reset handler. As it doesn't need to deal with the accessing action
specially, it uses default case to emulate writing and reading PMSELR
register.

Add a helper for CP15 registers reset to UNKNOWN.

Signed-off-by: Shannon Zhao 
---
 arch/arm64/kvm/sys_regs.c | 5 +++--
 arch/arm64/kvm/sys_regs.h | 8 
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 5b591d6..35d232e 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -707,7 +707,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
  trap_raz_wi },
/* PMSELR_EL0 */
{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b101),
- trap_raz_wi },
+ access_pmu_regs, reset_unknown, PMSELR_EL0 },
/* PMCEID0_EL0 */
{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b110),
  trap_raz_wi },
@@ -998,7 +998,8 @@ static const struct sys_reg_desc cp15_regs[] = {
{ Op1( 0), CRn( 9), CRm(12), Op2( 1), trap_raz_wi },
{ Op1( 0), CRn( 9), CRm(12), Op2( 2), trap_raz_wi },
{ Op1( 0), CRn( 9), CRm(12), Op2( 3), trap_raz_wi },
-   { Op1( 0), CRn( 9), CRm(12), Op2( 5), trap_raz_wi },
+   { Op1( 0), CRn( 9), CRm(12), Op2( 5), access_pmu_cp15_regs,
+ reset_unknown_cp15, c9_PMSELR },
{ Op1( 0), CRn( 9), CRm(12), Op2( 6), trap_raz_wi },
{ Op1( 0), CRn( 9), CRm(12), Op2( 7), trap_raz_wi },
{ Op1( 0), CRn( 9), CRm(13), Op2( 0), trap_raz_wi },
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index eaa324e..8afeff7 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -110,6 +110,14 @@ static inline void reset_unknown(struct kvm_vcpu *vcpu,
vcpu_sys_reg(vcpu, r->reg) = 0x1de7ec7edbadc0deULL;
 }
 
+static inline void reset_unknown_cp15(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *r)
+{
+   BUG_ON(!r->reg);
+   BUG_ON(r->reg >= NR_COPRO_REGS);
+   vcpu_cp15(vcpu, r->reg) = 0xdecafbad;
+}
+
 static inline void reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc 
*r)
 {
BUG_ON(!r->reg);
-- 
2.0.4


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v4 10/21] KVM: ARM64: Add reset and access handlers for PMCCNTR register

2015-10-30 Thread Shannon Zhao

From: Shannon Zhao 

Since the reset value of PMCCNTR is UNKNOWN, use reset_unknown for its
reset handler. Add a new case to emulate reading and writing to PMCCNTR
register.

Signed-off-by: Shannon Zhao 
---
 arch/arm64/kvm/sys_regs.c | 31 +--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index b7ca2cd..059c84c 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -491,6 +491,13 @@ static bool access_pmu_regs(struct kvm_vcpu *vcpu,
 
if (p->is_write) {
switch (r->reg) {
+   case PMCCNTR_EL0: {
+   val = kvm_pmu_get_counter_value(vcpu,
+   ARMV8_MAX_COUNTERS - 1);
+   vcpu_sys_reg(vcpu, r->reg) +=
+ (s64)*vcpu_reg(vcpu, p->Rt) - val;
+   break;
+   }
case PMXEVCNTR_EL0: {
int index = PMEVCNTR0_EL0
+ vcpu_sys_reg(vcpu, PMSELR_EL0);
@@ -529,6 +536,12 @@ static bool access_pmu_regs(struct kvm_vcpu *vcpu,
}
} else {
switch (r->reg) {
+   case PMCCNTR_EL0: {
+   val = kvm_pmu_get_counter_value(vcpu,
+   ARMV8_MAX_COUNTERS - 1);
+   *vcpu_reg(vcpu, p->Rt) = val;
+   break;
+   }
case PMXEVCNTR_EL0: {
val = kvm_pmu_get_counter_value(vcpu,
vcpu_sys_reg(vcpu, PMSELR_EL0));
@@ -759,7 +772,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
  access_pmu_regs, reset_pmceid, PMCEID1_EL0 },
/* PMCCNTR_EL0 */
{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b000),
- trap_raz_wi },
+ access_pmu_regs, reset_unknown, PMCCNTR_EL0 },
/* PMXEVTYPER_EL0 */
{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b001),
  access_pmu_regs, reset_unknown, PMXEVTYPER_EL0 },
@@ -978,6 +991,13 @@ static bool access_pmu_cp15_regs(struct kvm_vcpu *vcpu,
 
if (p->is_write) {
switch (r->reg) {
+   case c9_PMCCNTR: {
+   val = kvm_pmu_get_counter_value(vcpu,
+   ARMV8_MAX_COUNTERS - 1);
+   vcpu_cp15(vcpu, r->reg) += (s64)*vcpu_reg(vcpu, p->Rt)
+  - val;
+   break;
+   }
case c9_PMXEVCNTR: {
int index = c14_PMEVCNTR0 + vcpu_cp15(vcpu, c9_PMSELR);
 
@@ -1014,6 +1034,12 @@ static bool access_pmu_cp15_regs(struct kvm_vcpu *vcpu,
}
} else {
switch (r->reg) {
+   case c9_PMCCNTR: {
+   val = kvm_pmu_get_counter_value(vcpu,
+   ARMV8_MAX_COUNTERS - 1);
+   *vcpu_reg(vcpu, p->Rt) = val;
+   break;
+   }
case c9_PMXEVCNTR: {
val = kvm_pmu_get_counter_value(vcpu,
vcpu_cp15(vcpu, c9_PMSELR));
@@ -1075,7 +1101,8 @@ static const struct sys_reg_desc cp15_regs[] = {
  reset_pmceid, c9_PMCEID0 },
{ Op1( 0), CRn( 9), CRm(12), Op2( 7), access_pmu_cp15_regs,
  reset_pmceid, c9_PMCEID1 },
-   { Op1( 0), CRn( 9), CRm(13), Op2( 0), trap_raz_wi },
+   { Op1( 0), CRn( 9), CRm(13), Op2( 0), access_pmu_cp15_regs,
+ reset_unknown_cp15, c9_PMCCNTR },
{ Op1( 0), CRn( 9), CRm(13), Op2( 1), access_pmu_cp15_regs,
  reset_unknown_cp15, c9_PMXEVTYPER },
{ Op1( 0), CRn( 9), CRm(13), Op2( 2), access_pmu_cp15_regs,
-- 
2.0.4


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] KVM: x86: removing unused variable

2015-10-30 Thread Saurabh Sengar

removing unused variables, found by coccinelle

Signed-off-by: Saurabh Sengar 
---
 arch/x86/kvm/x86.c | 16 +---
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9a9a198..ec15294 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3424,41 +3424,35 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, 
struct kvm_irqchip *chip)
 
 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
 {
-   int r = 0;
-
mutex_lock(>arch.vpit->pit_state.lock);
memcpy(ps, >arch.vpit->pit_state, sizeof(struct kvm_pit_state));
mutex_unlock(>arch.vpit->pit_state.lock);
-   return r;
+   return 0;
 }
 
 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
 {
-   int r = 0;
-
mutex_lock(>arch.vpit->pit_state.lock);
memcpy(>arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
mutex_unlock(>arch.vpit->pit_state.lock);
-   return r;
+   return 0;
 }
 
 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
 {
-   int r = 0;
-
mutex_lock(>arch.vpit->pit_state.lock);
memcpy(ps->channels, >arch.vpit->pit_state.channels,
sizeof(ps->channels));
ps->flags = kvm->arch.vpit->pit_state.flags;
mutex_unlock(>arch.vpit->pit_state.lock);
memset(>reserved, 0, sizeof(ps->reserved));
-   return r;
+   return 0;
 }
 
 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
 {
-   int r = 0, start = 0;
+   int start = 0;
u32 prev_legacy, cur_legacy;
mutex_lock(>arch.vpit->pit_state.lock);
prev_legacy = kvm->arch.vpit->pit_state.flags & 
KVM_PIT_FLAGS_HPET_LEGACY;
@@ -3470,7 +3464,7 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct 
kvm_pit_state2 *ps)
kvm->arch.vpit->pit_state.flags = ps->flags;
kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, 
start);
mutex_unlock(>arch.vpit->pit_state.lock);
-   return r;
+   return 0;
 }
 
 static int kvm_vm_ioctl_reinject(struct kvm *kvm,
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH/RFC 0/4] dma ops and virtio

2015-10-30 Thread Andy Lutomirski

On Fri, Oct 30, 2015 at 1:25 AM, Cornelia Huck  wrote:
> On Thu, 29 Oct 2015 15:50:38 -0700
> Andy Lutomirski  wrote:
>
>> Progress!  After getting that sort-of-working, I figured out what was
>> wrong with my earlier command, and I got that working, too.  Now I
>> get:
>>
>> qemu-system-s390x -fsdev
>> local,id=virtfs1,path=/,security_model=none,readonly -device
>> virtio-9p-ccw,fsdev=virtfs1,mount_tag=/dev/root -M s390-ccw-virtio
>> -nodefaults -device sclpconsole,chardev=console -parallel none -net
>> none -echr 1 -serial none -chardev stdio,id=console,signal=off,mux=on
>> -serial chardev:console -mon chardev=console -vga none -display none
>> -kernel arch/s390/boot/bzImage -append
>> 'init=/home/luto/devel/virtme/virtme/guest/virtme-init
>> psmouse.proto=exps "virtme_stty_con=rows 24 cols 150 iutf8"
>> TERM=xterm-256color rootfstype=9p
>> rootflags=ro,version=9p2000.L,trans=virtio,access=any
>> raid=noautodetect debug'
>
> The commandline looks sane AFAICS.
>
> (...)
>
>> vrfy: device 0.0.: rc=0 pgroup=0 mpath=0 vpm=80
>> virtio_ccw 0.0.: Failed to set online: -5
>>
>> ^^^ bad news!
>
> I'd like to see where in the onlining process this fails. Could you set
> up qemu tracing for css_* and virtio_ccw_* (instructions in
> qemu/docs/tracing.txt)?

I have a file called events that contains:

css_*
virtio_ccw_*

pointing -trace events= at it results in a trace- file that's 549
bytes long and contains nothing.  Are wildcards not as well-supported
as the docs suggest?

>
> Which qemu version is this, btw.?
>

git from yesterday.

--Andy



-- 
Andy Lutomirski
AMA Capital Management, LLC
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v1 2/2] dma-mapping-common: add DMA attribute - DMA_ATTR_IOMMU_BYPASS

2015-10-30 Thread Arnd Bergmann

On Saturday 31 October 2015 10:17:22 Benjamin Herrenschmidt wrote:
> On Fri, 2015-10-30 at 11:32 +0100, Arnd Bergmann wrote:
> > On Thursday 29 October 2015 10:10:46 Benjamin Herrenschmidt wrote:
> > > 
> > > > Maybe we should at least coordinate IOMMU 'paranoid/fast' modes
> > > > across
> > > > architectures, and then the DMA_ATTR_IOMMU_BYPASS flag would have
> > > > a
> > > > sane meaning in the paranoid mode (and perhaps we'd want an ultra
> > > > -paranoid mode where it's not honoured).
> > > 
> > > Possibly, though ideally that would be a user policy but of course
> > > by
> > > the time you get to userspace it's generally too late.
> > 
> > IIRC, we have an 'iommu=force' command line switch for this, to
> > ensure
> > that no device can use a linear mapping and everything goes through
> > the page tables. This is often useful for both debugging and as a
> > security measure when dealing with unprivileged DMA access (virtual
> > machines, vfio, ...).
> 
> That was used to force-enable the iommu on platforms like G5s where we
> would otherwise only do so if the memory was larger than 32-bit but we
> never implemented using it to prevent the bypass region.

Ah, I see. Thanks for the clarification.

> > If we add a DMA_ATTR_IOMMU_BYPASS attribute, we should clearly
> > document
> > which of the two we expect to take
> > priority in cases where we have a
> > choice.
> >
> > I wonder if the 'iommu=force' attribute is too coarse-grained though,
> > and if we should perhaps allow a per-device setting on architectures
> > that allow this.
> 
> The interesting thing, if we can make it work, is the bypass attribute
> being per mapping... 

I would say we want both: for the device driver it can make sense to
choose per mapping what it can do, but for the iommu driver, it
can also make sense to ensure we never provide a linear mapping,
because otherwise the additional security aspect is moot.

In particular for the unprivileged VM guest or vfio access, the
code that gives access to the device to something else should
have a way to tell the IOMMU that the linear mapping can no longer
be used.

Arnd
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH v3 0/3] KVM/arm64/arm: enhance armv7/8 fp/simd lazy switch

2015-10-30 Thread Mario Smarduch

This short patch series combines the previous armv7 and armv8 versions.
For an FP and lmbench load it reduces fp/simd context switch from 30-50% down 
to 2%. Results will vary with load but are no worse than the current
approach.

In summary current lazy vfp/simd implementation switches hardware context only 
on guest access and again on exit to host, otherwise hardware context is
skipped. This patch set builds on that functionality and executes a hardware 
context switch only when the vCPU is scheduled out or returns to user space.

Patches were tested on FVP sw platform. FP crunching applications summing up
values, with outcome compared to known result were executed on several guests,
and host.

The test can be found here, https://github.com/mjsmar/arm-arm64-fpsimd-test
Tests executed 24 hours.

armv7 test:
- On host executed 12 fp crunching applications - used taskset to bind 
- Two guests - with 12 fp crunching processes - used taskset to bind
- half ran with 1ms sleep, remaining with no sleep

armv8 test: 
- same as above except used mix of armv7 and armv8 guests.

Every so often injected a fault (via proc file entry) and mismatch between 
expected and crunched summed value was reported. The FP crunch processes could 
continue to run but with bad results.

Looked at 'paranoia.c' - appears like a comprehensive hardware FP 
precision/behavior test.  It will test various behaviors and may fail having 
nothing to do with world switch of fp/simd - 
- Adequacy of guard digits for Mult., Div. and Subt.
- UnderflowThreshold = an underflow threshold.
- V = an overflow threshold, roughly.
...

With outcomes like -
- Smallest strictly positive number found is E0 = 4.94066e-324
- Searching for Overflow threshold: This may generate an error.
...

Personally don't understand everything it's doing.

Opted to use the simple tst-float executable.

These patches are based on earlier arm64 fp/simd optimization work -
https://lists.cs.columbia.edu/pipermail/kvmarm/2015-July/015748.html

And subsequent fixes by Marc and Christoffer at KVM Forum hackathon to handle
32-bit guest on 64 bit host - 
https://lists.cs.columbia.edu/pipermail/kvmarm/2015-August/016128.html

Changes since v2->v3:
- combined arm v7 and v8 into one short patch series
- moved access to fpexec_el2 back to EL2
- Move host restore to EL1 from EL2 and call directly from host
- optimize trap enable code 
- renamed some variables to match usage

Changes since v1->v2:
- Fixed vfp/simd trap configuration to enable trace trapping
- Removed set_hcptr branch label
- Fixed handling of FPEXC to restore guest and host versions on vcpu_put
- Tested arm32/arm64
- rebased to 4.3-rc2
- changed a couple register accesses from 64 to 32 bit


Mario Smarduch (3):
  hooks for armv7 fp/simd lazy switch support
  enable enhanced armv7 fp/simd lazy switch
  enable enhanced armv8 fp/simd lazy switch

 arch/arm/include/asm/kvm_host.h   |  7 +
 arch/arm/kernel/asm-offsets.c |  2 ++
 arch/arm/kvm/arm.c|  6 
 arch/arm/kvm/interrupts.S | 60 ---
 arch/arm/kvm/interrupts_head.S| 14 +
 arch/arm64/include/asm/kvm_host.h |  4 +++
 arch/arm64/kernel/asm-offsets.c   |  1 +
 arch/arm64/kvm/hyp.S  | 37 
 8 files changed, 103 insertions(+), 28 deletions(-)

-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v1 2/2] dma-mapping-common: add DMA attribute - DMA_ATTR_IOMMU_BYPASS

2015-10-30 Thread Benjamin Herrenschmidt

On Fri, 2015-10-30 at 11:32 +0100, Arnd Bergmann wrote:
> On Thursday 29 October 2015 10:10:46 Benjamin Herrenschmidt wrote:
> > 
> > > Maybe we should at least coordinate IOMMU 'paranoid/fast' modes
> > > across
> > > architectures, and then the DMA_ATTR_IOMMU_BYPASS flag would have
> > > a
> > > sane meaning in the paranoid mode (and perhaps we'd want an ultra
> > > -paranoid mode where it's not honoured).
> > 
> > Possibly, though ideally that would be a user policy but of course
> > by
> > the time you get to userspace it's generally too late.
> 
> IIRC, we have an 'iommu=force' command line switch for this, to
> ensure
> that no device can use a linear mapping and everything goes through
> the page tables. This is often useful for both debugging and as a
> security measure when dealing with unprivileged DMA access (virtual
> machines, vfio, ...).

That was used to force-enable the iommu on platforms like G5s where we
would otherwise only do so if the memory was larger than 32-bit but we
never implemented using it to prevent the bypass region.

> If we add a DMA_ATTR_IOMMU_BYPASS attribute, we should clearly
> document
> which of the two we expect to take
> priority in cases where we have a
> choice.
>
> I wonder if the 'iommu=force' attribute is too coarse-grained though,
> and if we should perhaps allow a per-device setting on architectures
> that allow this.

The interesting thing, if we can make it work, is the bypass attribute
being per mapping... 

Ben. 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v6 00/33] implement vNVDIMM

2015-10-30 Thread Stefan Hajnoczi

On Fri, Oct 30, 2015 at 01:55:54PM +0800, Xiao Guangrong wrote:
> This patchset can be found at:
>   https://github.com/xiaogr/qemu.git nvdimm-v6
> 
> It is based on pci branch on Michael's tree and the top commit is:
> commit 6f96a31a06c2a1 (tests: re-enable vhost-user-test).
> 
> Changelog in v6:
> - changes from Stefan's comments:
>   1) fix code style of struct naming by CamelCase way
>   2) fix offset + length overflow when read/write label data
>   3) compile hw/acpi/nvdimm.c for per target so that TARGET_PAGE_SIZE can
>  be used to replace getpagesize()
> 
> Changelog in v5:
> - changes from Michael's comments:
>   1) prefix nvdimm_ to everything in NVDIMM source files
>   2) make parsing _DSM Arg3 more clear
>   3) comment style fix
>   5) drop single used definition
>   6) fix dirty dsm buffer lost due to memory write happened on host
>   7) check dsm buffer if it is big enough to contain input data
>   8) use build_append_int_noprefix to store single value to GArray
> 
> - changes from Michael's and Igor's comments:
>   1) introduce 'nvdimm-support' parameter to control nvdimm
>  enablement and it is disabled for 2.4 and its earlier versions
>  to make live migration compatible
>   2) only reserve 1 RAM page and 4 bytes IO Port for NVDIMM ACPI
>  virtualization
> 
> - changes from Stefan's comments:
>   1) do endian adjustment for the buffer length
> 
> - changes from Bharata B Rao's comments:
>   1) fix compile on ppc
> 
> - others:
>   1) the buffer length is directly got from IO read rather than got
>  from dsm memory
>   2) fix dirty label data lost due to memory write happened on host
> 
> Changelog in v4:
> - changes from Michael's comments:
>   1) show the message, "Memory is not allocated from HugeTlbfs", if file
>  based memory is not allocated from hugetlbfs.
>   2) introduce function, acpi_get_nvdimm_state(), to get NVDIMMState
>  from Machine.
>   3) statically define UUID and make its operation more clear
>   4) use GArray to build device structures to avoid potential buffer
>  overflow
>   4) improve comments in the code
>   5) improve code style
> 
> - changes from Igor's comments:
>   1) add NVDIMM ACPI spec document
>   2) use serialized method to avoid Mutex
>   3) move NVDIMM ACPI's code to hw/acpi/nvdimm.c
>   4) introduce a common ASL method used by _DSM for all devices to reduce
>  ACPI size
>   5) handle UUID in ACPI AML code. BTW, i'd keep handling revision in QEMU
>  it's better to upgrade QEMU to support Rev2 in the future
> 
> - changes from Stefan's comments:
>   1) copy input data from DSM memory to local buffer to avoid potential
>  issues as DSM memory is visible to guest. Output data is handled
>  in a similar way
> 
> - changes from Dan's comments:
>   1) drop static namespace as Linux has already supported label-less
>  nvdimm devices
> 
> - changes from Vladimir's comments:
>   1) print better message, "failed to get file size for %s, can't create
>  backend on it", if any file operation failed to obtain file size
> 
> - others:
>   create a git repo on github.com for better review/test
> 
> Also, thanks for Eric Blake's review on QAPI's side.
> 
> Thank all of you to review this patchset.
> 
> Changelog in v3:
> There is huge change in this version, thank Igor, Stefan, Paolo, Eduardo,
> Michael for their valuable comments, the patchset finally gets better shape.
> - changes from Igor's comments:
>   1) abstract dimm device type from pc-dimm and create nvdimm device based on
>  dimm, then it uses memory backend device as nvdimm's memory and NUMA has
>  easily been implemented.
>   2) let file-backend device support any kind of filesystem not only for
>  hugetlbfs and let it work on file not only for directory which is
>  achieved by extending 'mem-path' - if it's a directory then it works as
>  current behavior, otherwise if it's file then directly allocates memory
>  from it.
>   3) we figure out a unused memory hole below 4G that is 0xFF0 ~ 
>  0xFFF0, this range is large enough for NVDIMM ACPI as build 64-bit
>  ACPI SSDT/DSDT table will break windows XP.
>  BTW, only make SSDT.rev = 2 can not work since the width is only depended
>  on DSDT.rev based on 19.6.28 DefinitionBlock (Declare Definition Block)
>  in ACPI spec:
> | Note: For compatibility with ACPI versions before ACPI 2.0, the bit 
> | width of Integer objects is dependent on the ComplianceRevision of the DSDT.
> | If the ComplianceRevision is less than 2, all integers are restricted to 32 
> | bits. Otherwise, full 64-bit integers are used. The version of the DSDT 
> sets 
> | the global integer width for all integers, including integers in SSDTs.
>   4) use the lowest ACPI spec version to document AML terms.
>   5) use "nvdimm" as nvdimm device name instead of "pc-nvdimm"
> 
> - changes from Stefan's comments:
>   1) do not do endian adjustment in-place since _DSM memory is visible to

Re: [PATCH v6 27/33] nvdimm acpi: support function 0

2015-10-30 Thread Stefan Hajnoczi

On Fri, Oct 30, 2015 at 01:56:21PM +0800, Xiao Guangrong wrote:
>  static uint64_t
>  nvdimm_dsm_read(void *opaque, hwaddr addr, unsigned size)
>  {
> -return 0;
> +AcpiNVDIMMState *state = opaque;
> +MemoryRegion *dsm_ram_mr = >ram_mr;
> +NvdimmDsmIn *in;
> +GArray *out;
> +void *dsm_ram_addr;
> +uint32_t buf_size;
> +
> +assert(memory_region_size(dsm_ram_mr) >= sizeof(NvdimmDsmIn));
> +dsm_ram_addr = memory_region_get_ram_ptr(dsm_ram_mr);
> +
> +/*
> + * The DSM memory is mapped to guest address space so an evil guest
> + * can change its content while we are doing DSM emulation. Avoid
> + * this by copying DSM memory to QEMU local memory.
> + */
> +in = g_malloc(memory_region_size(dsm_ram_mr));
> +memcpy(in, dsm_ram_addr, memory_region_size(dsm_ram_mr));
> +
> +le32_to_cpus(>revision);
> +le32_to_cpus(>function);
> +le32_to_cpus(>handle);
> +
> +nvdimm_debug("Revision %#x Handler %#x Function %#x.\n", in->revision,
> + in->handle, in->function);
> +
> +out = g_array_new(false, true /* clear */, 1);
> +
> +if (in->revision != 0x1 /* Current we support DSM Spec Rev1. */) {
> +nvdimm_debug("Revision %#x is not supported, expect %#x.\n",
> +  in->revision, 0x1);
> +nvdimm_dsm_write_status(out, NVDIMM_DSM_STATUS_NOT_SUPPORTED);
> +goto exit;
> +}
> +
> +/* Handle 0 is reserved for NVDIMM Root Device. */
> +if (!in->handle) {
> +nvdimm_dsm_root(in, out);
> +goto exit;
> +}
> +
> +nvdimm_dsm_device(in, out);
> +
> +exit:
> +/* Write output result to dsm memory. */
> +memcpy(dsm_ram_addr, out->data, out->len);
> +memory_region_set_dirty(dsm_ram_mr, 0, out->len);

If you respin this series, please add this before the memcpy out:

  assert(out->len <= memory_region_size(dsm_ram_mr))

That way we can catch situations where too much output data was
generated by mistake.


signature.asc
Description: PGP signature

Re: [PATCH v4 2/6] virtio_ring: Support DMA APIs

2015-10-30 Thread Cornelia Huck

On Thu, 29 Oct 2015 18:09:47 -0700
Andy Lutomirski  wrote:

> virtio_ring currently sends the device (usually a hypervisor)
> physical addresses of its I/O buffers.  This is okay when DMA
> addresses and physical addresses are the same thing, but this isn't
> always the case.  For example, this never works on Xen guests, and
> it is likely to fail if a physical "virtio" device ever ends up
> behind an IOMMU or swiotlb.
> 
> The immediate use case for me is to enable virtio on Xen guests.
> For that to work, we need vring to support DMA address translation
> as well as a corresponding change to virtio_pci or to another
> driver.
> 
> With this patch, if enabled, virtfs survives kmemleak and
> CONFIG_DMA_API_DEBUG.
> 
> Signed-off-by: Andy Lutomirski 
> ---
>  drivers/virtio/Kconfig   |   2 +-
>  drivers/virtio/virtio_ring.c | 190 
> +++
>  tools/virtio/linux/dma-mapping.h |  17 
>  3 files changed, 172 insertions(+), 37 deletions(-)
>  create mode 100644 tools/virtio/linux/dma-mapping.h

>  static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
>  {
> - unsigned int i;
> + unsigned int i, j;
> + u16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
> 
>   /* Clear data ptr. */
> - vq->data[head] = NULL;
> + vq->desc_state[head].data = NULL;
> 
> - /* Put back on free list: find end */
> + /* Put back on free list: unmap first-level descriptors and find end */
>   i = head;
> 
> - /* Free the indirect table */
> - if (vq->vring.desc[i].flags & cpu_to_virtio16(vq->vq.vdev, 
> VRING_DESC_F_INDIRECT))
> - kfree(phys_to_virt(virtio64_to_cpu(vq->vq.vdev, 
> vq->vring.desc[i].addr)));
> -
> - while (vq->vring.desc[i].flags & cpu_to_virtio16(vq->vq.vdev, 
> VRING_DESC_F_NEXT)) {
> + while (vq->vring.desc[i].flags & nextflag) {
> + vring_unmap_one(vq, >vring.desc[i]);
>   i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
>   vq->vq.num_free++;
>   }
> 
> + vring_unmap_one(vq, >vring.desc[i]);
>   vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
>   vq->free_head = head;
> +
>   /* Plus final descriptor */
>   vq->vq.num_free++;
> +
> + /* Free the indirect table, if any, now that it's unmapped. */
> + if (vq->desc_state[head].indir_desc) {
> + struct vring_desc *indir_desc = vq->desc_state[head].indir_desc;
> + u32 len = vq->vring.desc[head].len;

This one needs to be virtio32_to_cpu(...) as well.

> +
> + BUG_ON(!(vq->vring.desc[head].flags &
> +  cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
> + BUG_ON(len == 0 || len % sizeof(struct vring_desc));
> +
> + for (j = 0; j < len / sizeof(struct vring_desc); j++)
> + vring_unmap_one(vq, _desc[j]);
> +
> + kfree(vq->desc_state[head].indir_desc);
> + vq->desc_state[head].indir_desc = NULL;
> + }
>  }

With that change on top of your current branch, I can boot (root on
virtio-blk, either virtio-1 or legacy virtio) on current qemu master
with kvm enabled on s390. Haven't tried anything further.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v4 2/6] virtio_ring: Support DMA APIs

2015-10-30 Thread Christian Borntraeger

Am 30.10.2015 um 13:01 schrieb Cornelia Huck:
> On Thu, 29 Oct 2015 18:09:47 -0700
> Andy Lutomirski  wrote:
> 
>> virtio_ring currently sends the device (usually a hypervisor)
>> physical addresses of its I/O buffers.  This is okay when DMA
>> addresses and physical addresses are the same thing, but this isn't
>> always the case.  For example, this never works on Xen guests, and
>> it is likely to fail if a physical "virtio" device ever ends up
>> behind an IOMMU or swiotlb.
>>
>> The immediate use case for me is to enable virtio on Xen guests.
>> For that to work, we need vring to support DMA address translation
>> as well as a corresponding change to virtio_pci or to another
>> driver.
>>
>> With this patch, if enabled, virtfs survives kmemleak and
>> CONFIG_DMA_API_DEBUG.
>>
>> Signed-off-by: Andy Lutomirski 
>> ---
>>  drivers/virtio/Kconfig   |   2 +-
>>  drivers/virtio/virtio_ring.c | 190 
>> +++
>>  tools/virtio/linux/dma-mapping.h |  17 
>>  3 files changed, 172 insertions(+), 37 deletions(-)
>>  create mode 100644 tools/virtio/linux/dma-mapping.h
> 
>>  static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
>>  {
>> -unsigned int i;
>> +unsigned int i, j;
>> +u16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
>>
>>  /* Clear data ptr. */
>> -vq->data[head] = NULL;
>> +vq->desc_state[head].data = NULL;
>>
>> -/* Put back on free list: find end */
>> +/* Put back on free list: unmap first-level descriptors and find end */
>>  i = head;
>>
>> -/* Free the indirect table */
>> -if (vq->vring.desc[i].flags & cpu_to_virtio16(vq->vq.vdev, 
>> VRING_DESC_F_INDIRECT))
>> -kfree(phys_to_virt(virtio64_to_cpu(vq->vq.vdev, 
>> vq->vring.desc[i].addr)));
>> -
>> -while (vq->vring.desc[i].flags & cpu_to_virtio16(vq->vq.vdev, 
>> VRING_DESC_F_NEXT)) {
>> +while (vq->vring.desc[i].flags & nextflag) {
>> +vring_unmap_one(vq, >vring.desc[i]);
>>  i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
>>  vq->vq.num_free++;
>>  }
>>
>> +vring_unmap_one(vq, >vring.desc[i]);
>>  vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
>>  vq->free_head = head;
>> +
>>  /* Plus final descriptor */
>>  vq->vq.num_free++;
>> +
>> +/* Free the indirect table, if any, now that it's unmapped. */
>> +if (vq->desc_state[head].indir_desc) {
>> +struct vring_desc *indir_desc = vq->desc_state[head].indir_desc;
>> +u32 len = vq->vring.desc[head].len;
> 
> This one needs to be virtio32_to_cpu(...) as well.

Yes, just did the exact same change
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index f269e1c..f2249df 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -556,7 +556,7 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned 
int head)
/* Free the indirect table, if any, now that it's unmapped. */
if (vq->desc_state[head].indir_desc) {
struct vring_desc *indir_desc = vq->desc_state[head].indir_desc;
-   u32 len = vq->vring.desc[head].len;
+   u32 len = virtio32_to_cpu(vq->vq.vdev, 
vq->vring.desc[head].len);
 
BUG_ON(!(vq->vring.desc[head].flags &
 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));


now it boots.
> 
>> +
>> +BUG_ON(!(vq->vring.desc[head].flags &
>> + cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
>> +BUG_ON(len == 0 || len % sizeof(struct vring_desc));
>> +
>> +for (j = 0; j < len / sizeof(struct vring_desc); j++)
>> +vring_unmap_one(vq, _desc[j]);
>> +
>> +kfree(vq->desc_state[head].indir_desc);
>> +vq->desc_state[head].indir_desc = NULL;
>> +}
>>  }
> 
> With that change on top of your current branch, I can boot (root on
> virtio-blk, either virtio-1 or legacy virtio) on current qemu master
> with kvm enabled on s390. Haven't tried anything further.
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] vhost: move is_le setup to the backend

2015-10-30 Thread Greg Kurz

The vq->is_le field is used to fix endianness when accessing the vring via
the cpu_to_vhost16() and vhost16_to_cpu() helpers in the following cases:

1) host is big endian and device is modern virtio

2) host has cross-endian support and device is legacy virtio with a different
   endianness than the host

Both cases rely on the VHOST_SET_FEATURES ioctl, but 2) also needs the
VHOST_SET_VRING_ENDIAN ioctl to be called by userspace. Since vq->is_le
is only needed when the backend is active, it was decided to set it at
backend start.

This is currently done in vhost_init_used()->vhost_init_is_le() but it
obfuscates the core vhost code. This patch moves the is_le setup to a
dedicated function that is called from the backend code.

Note vhost_net is the only backend that can pass vq->private_data == NULL to
vhost_init_used(), hence the "if (sock)" branch.

No behaviour change.

Signed-off-by: Greg Kurz 
---
 drivers/vhost/net.c   |6 ++
 drivers/vhost/scsi.c  |3 +++
 drivers/vhost/test.c  |2 ++
 drivers/vhost/vhost.c |   12 +++-
 drivers/vhost/vhost.h |1 +
 5 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 9eda69e40678..d6319cb2664c 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -917,6 +917,12 @@ static long vhost_net_set_backend(struct vhost_net *n, 
unsigned index, int fd)
 
vhost_net_disable_vq(n, vq);
vq->private_data = sock;
+
+   if (sock)
+   vhost_set_is_le(vq);
+   else
+   vq->is_le = virtio_legacy_is_little_endian();
+
r = vhost_init_used(vq);
if (r)
goto err_used;
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index e25a23692822..e2644a301fa5 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -1276,6 +1276,9 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs,
vq = >vqs[i].vq;
mutex_lock(>mutex);
vq->private_data = vs_tpg;
+
+   vhost_set_is_le(vq);
+
vhost_init_used(vq);
mutex_unlock(>mutex);
}
diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
index f2882ac98726..b1c7df502211 100644
--- a/drivers/vhost/test.c
+++ b/drivers/vhost/test.c
@@ -196,6 +196,8 @@ static long vhost_test_run(struct vhost_test *n, int test)
oldpriv = vq->private_data;
vq->private_data = priv;
 
+   vhost_set_is_le(vq);
+
r = vhost_init_used(>vqs[index]);
 
mutex_unlock(>mutex);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index eec2f11809ff..6be863dcbd13 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -113,6 +113,12 @@ static void vhost_init_is_le(struct vhost_virtqueue *vq)
 }
 #endif /* CONFIG_VHOST_CROSS_ENDIAN_LEGACY */
 
+void vhost_set_is_le(struct vhost_virtqueue *vq)
+{
+   vhost_init_is_le(vq);
+}
+EXPORT_SYMBOL_GPL(vhost_set_is_le);
+
 static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
poll_table *pt)
 {
@@ -1156,12 +1162,8 @@ int vhost_init_used(struct vhost_virtqueue *vq)
 {
__virtio16 last_used_idx;
int r;
-   if (!vq->private_data) {
-   vq->is_le = virtio_legacy_is_little_endian();
+   if (!vq->private_data)
return 0;
-   }
-
-   vhost_init_is_le(vq);
 
r = vhost_update_used_flags(vq);
if (r)
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 4772862b71a7..8a62041959fe 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -162,6 +162,7 @@ bool vhost_enable_notify(struct vhost_dev *, struct 
vhost_virtqueue *);
 
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
unsigned int log_num, u64 len);
+void vhost_set_is_le(struct vhost_virtqueue *vq);
 
 #define vq_err(vq, fmt, ...) do {  \
pr_debug(pr_fmt(fmt), ##__VA_ARGS__);   \

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] vfio/platform: store mapped memory in region, instead of an on-stack copy

2015-10-30 Thread Eric Auger

Hi,
On 10/30/2015 09:51 AM, Baptiste Reynal wrote:
> Hi James,
> 
> Thanks for this fix.
> 
> Acked-by: Baptiste Reynal 
> Tested-by: Baptiste Reynal 
> 
> On Thu, Oct 29, 2015 at 5:50 PM, James Morse  wrote:
>> vfio_platform_{read,write}_mmio() call ioremap_nocache() to map
>> a region of io memory, which they store in struct vfio_platform_region to
>> be eventually re-used, or unmapped by vfio_platform_regions_cleanup().
>>
>> These functions receive a copy of their struct vfio_platform_region
>> argument on the stack - so these mapped areas are always allocated, and
>> always leaked.
I just noticed I have a leak in reset modules too. I am going to correct
this.

Thanks

Eric
>>
>> Pass this argument as a pointer instead.
>>
>> Fixes: 6e3f26456009 "vfio/platform: read and write support for the device fd"
>> Signed-off-by: James Morse 
>> ---
>>  drivers/vfio/platform/vfio_platform_common.c | 36 
>> ++--
>>  1 file changed, 18 insertions(+), 18 deletions(-)
>>
>> diff --git a/drivers/vfio/platform/vfio_platform_common.c 
>> b/drivers/vfio/platform/vfio_platform_common.c
>> index f3b6299..ccf5da5 100644
>> --- a/drivers/vfio/platform/vfio_platform_common.c
>> +++ b/drivers/vfio/platform/vfio_platform_common.c
>> @@ -308,17 +308,17 @@ static long vfio_platform_ioctl(void *device_data,
>> return -ENOTTY;
>>  }
>>
>> -static ssize_t vfio_platform_read_mmio(struct vfio_platform_region reg,
>> +static ssize_t vfio_platform_read_mmio(struct vfio_platform_region *reg,
>>char __user *buf, size_t count,
>>loff_t off)
>>  {
>> unsigned int done = 0;
>>
>> -   if (!reg.ioaddr) {
>> -   reg.ioaddr =
>> -   ioremap_nocache(reg.addr, reg.size);
>> +   if (!reg->ioaddr) {
>> +   reg->ioaddr =
>> +   ioremap_nocache(reg->addr, reg->size);
>>
>> -   if (!reg.ioaddr)
>> +   if (!reg->ioaddr)
>> return -ENOMEM;
>> }
>>
>> @@ -328,7 +328,7 @@ static ssize_t vfio_platform_read_mmio(struct 
>> vfio_platform_region reg,
>> if (count >= 4 && !(off % 4)) {
>> u32 val;
>>
>> -   val = ioread32(reg.ioaddr + off);
>> +   val = ioread32(reg->ioaddr + off);
>> if (copy_to_user(buf, , 4))
>> goto err;
>>
>> @@ -336,7 +336,7 @@ static ssize_t vfio_platform_read_mmio(struct 
>> vfio_platform_region reg,
>> } else if (count >= 2 && !(off % 2)) {
>> u16 val;
>>
>> -   val = ioread16(reg.ioaddr + off);
>> +   val = ioread16(reg->ioaddr + off);
>> if (copy_to_user(buf, , 2))
>> goto err;
>>
>> @@ -344,7 +344,7 @@ static ssize_t vfio_platform_read_mmio(struct 
>> vfio_platform_region reg,
>> } else {
>> u8 val;
>>
>> -   val = ioread8(reg.ioaddr + off);
>> +   val = ioread8(reg->ioaddr + off);
>> if (copy_to_user(buf, , 1))
>> goto err;
>>
>> @@ -377,7 +377,7 @@ static ssize_t vfio_platform_read(void *device_data, 
>> char __user *buf,
>> return -EINVAL;
>>
>> if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_MMIO)
>> -   return vfio_platform_read_mmio(vdev->regions[index],
>> +   return vfio_platform_read_mmio(>regions[index],
>> buf, count, off);
>> else if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_PIO)
>> return -EINVAL; /* not implemented */
>> @@ -385,17 +385,17 @@ static ssize_t vfio_platform_read(void *device_data, 
>> char __user *buf,
>> return -EINVAL;
>>  }
>>
>> -static ssize_t vfio_platform_write_mmio(struct vfio_platform_region reg,
>> +static ssize_t vfio_platform_write_mmio(struct vfio_platform_region *reg,
>> const char __user *buf, size_t count,
>> loff_t off)
>>  {
>> unsigned int done = 0;
>>
>> -   if (!reg.ioaddr) {
>> -   reg.ioaddr =
>> -   ioremap_nocache(reg.addr, reg.size);
>> +   if (!reg->ioaddr) {
>> +   reg->ioaddr =
>> +   ioremap_nocache(reg->addr, reg->size);
>>
>> -   if (!reg.ioaddr)
>> +   if (!reg->ioaddr)
>> return -ENOMEM;
>> }
>>
>> @@ -407,7 +407,7 @@ static ssize_t vfio_platform_write_mmio(struct 
>> vfio_platform_region reg,
>>
>> if (copy_from_user(, buf, 4))

Re: [PATCH v1 2/2] dma-mapping-common: add DMA attribute - DMA_ATTR_IOMMU_BYPASS

2015-10-30 Thread Arnd Bergmann

On Thursday 29 October 2015 10:10:46 Benjamin Herrenschmidt wrote:
> 
> > Maybe we should at least coordinate IOMMU 'paranoid/fast' modes across
> > architectures, and then the DMA_ATTR_IOMMU_BYPASS flag would have a
> > sane meaning in the paranoid mode (and perhaps we'd want an ultra
> > -paranoid mode where it's not honoured).
> 
> Possibly, though ideally that would be a user policy but of course by
> the time you get to userspace it's generally too late.

IIRC, we have an 'iommu=force' command line switch for this, to ensure
that no device can use a linear mapping and everything goes through
the page tables. This is often useful for both debugging and as a
security measure when dealing with unprivileged DMA access (virtual
machines, vfio, ...).

If we add a DMA_ATTR_IOMMU_BYPASS attribute, we should clearly document
which of the two we expect to take priority in cases where we have a
choice.

I wonder if the 'iommu=force' attribute is too coarse-grained though,
and if we should perhaps allow a per-device setting on architectures
that allow this.

Arnd
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH net-next rfc V2 0/2] basic busy polling support for vhost_net

2015-10-30 Thread Jason Wang



On 10/29/2015 04:45 PM, Jason Wang wrote:
> Hi all:
>
> This series tries to add basic busy polling for vhost net. The idea is
> simple: at the end of tx processing, busy polling for new tx added
> descriptor and rx receive socket for a while. The maximum number of
> time (in us) could be spent on busy polling was specified through
> module parameter.
>
> Test were done through:
>
> - 50 us as busy loop timeout
> - Netperf 2.6
> - Two machines with back to back connected mlx4
> - Guest with 8 vcpus and 1 queue
>
> Result shows very huge improvement on both tx (at most 158%) and rr
> (at most 53%) while rx is as much as in the past. Most cases the cpu
> utilization is also improved:
>

Just noticed there's something wrong in the setup, so the numbers are
incorrect here. Will re-run and post the correct numbers here.

Sorry.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v4 18/21] KVM: ARM64: Add PMU overflow interrupt routing

2015-10-30 Thread kbuild test robot

Hi Shannon,

[auto build test ERROR on kvm/linux-next -- if it's inappropriate base, please 
suggest rules for selecting the more suitable base]

url:
https://github.com/0day-ci/linux/commits/Shannon-Zhao/KVM-ARM64-Add-guest-PMU-support/20151030-143148
config: arm-axm55xx_defconfig (attached as .config)
reproduce:
wget 
https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross
 -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=arm 

All errors (new ones prefixed by >>):

   In file included from arch/arm/kvm/arm.c:31:0:
>> include/kvm/arm_pmu.h:22:21: fatal error: asm/pmu.h: No such file or 
>> directory
#include 
^
   compilation terminated.

vim +22 include/kvm/arm_pmu.h

219856f5 Shannon Zhao 2015-10-30  16   */
219856f5 Shannon Zhao 2015-10-30  17  
219856f5 Shannon Zhao 2015-10-30  18  #ifndef __ASM_ARM_KVM_PMU_H
219856f5 Shannon Zhao 2015-10-30  19  #define __ASM_ARM_KVM_PMU_H
219856f5 Shannon Zhao 2015-10-30  20  
219856f5 Shannon Zhao 2015-10-30  21  #include 
219856f5 Shannon Zhao 2015-10-30 @22  #include 
219856f5 Shannon Zhao 2015-10-30  23  
219856f5 Shannon Zhao 2015-10-30  24  struct kvm_pmc {
219856f5 Shannon Zhao 2015-10-30  25u8 idx;/* index into the pmu->pmc array 
*/

:: The code at line 22 was first introduced by commit
:: 219856f54d23298fff48e6e20e7e87fc45e42798 KVM: ARM64: Define PMU data 
structure for each vcpu

:: TO: Shannon Zhao <shannon.z...@linaro.org>
:: CC: 0day robot <fengguang...@intel.com>

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: Binary data

Re: [PATCH 5/6] KVM: PPC: Book3S HV: Send IPI to host core to wake VCPU

2015-10-30 Thread kbuild test robot

Hi Suresh,

[auto build test ERROR on kvm/linux-next -- if it's inappropriate base, please 
suggest rules for selecting the more suitable base]

url:
https://github.com/0day-ci/linux/commits/Suresh-Warrier/KVM-PPC-Book3S-HV-Optimize-wakeup-VCPU-from-H_IPI/20151030-081329
config: powerpc-defconfig (attached as .config)
reproduce:
wget 
https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross
 -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=powerpc 

All errors (new ones prefixed by >>):

   arch/powerpc/kernel/smp.c: In function 'smp_ipi_demux':
>> arch/powerpc/kernel/smp.c:261:25: error: 'PPC_MSG_RM_HOST_ACTION' undeclared 
>> (first use in this function)
  if (all & IPI_MESSAGE(PPC_MSG_RM_HOST_ACTION))
^
   arch/powerpc/kernel/smp.c:239:41: note: in definition of macro 'IPI_MESSAGE'
#define IPI_MESSAGE(A) (1 << (24 - 8 * (A)))
^
   arch/powerpc/kernel/smp.c:261:25: note: each undeclared identifier is 
reported only once for each function it appears in
  if (all & IPI_MESSAGE(PPC_MSG_RM_HOST_ACTION))
^
   arch/powerpc/kernel/smp.c:239:41: note: in definition of macro 'IPI_MESSAGE'
#define IPI_MESSAGE(A) (1 << (24 - 8 * (A)))
^
--
   arch/powerpc/kvm/book3s_hv_rm_xics.c: In function 'icp_rm_set_vcpu_irq':
   arch/powerpc/kvm/book3s_hv_rm_xics.c:142:4: error: implicit declaration of 
function 'smp_muxed_ipi_rm_message_pass' [-Werror=implicit-function-declaration]
   smp_muxed_ipi_rm_message_pass(hcpu,
   ^
>> arch/powerpc/kvm/book3s_hv_rm_xics.c:143:7: error: 'PPC_MSG_RM_HOST_ACTION' 
>> undeclared (first use in this function)
  PPC_MSG_RM_HOST_ACTION);
  ^
   arch/powerpc/kvm/book3s_hv_rm_xics.c:143:7: note: each undeclared identifier 
is reported only once for each function it appears in
   cc1: all warnings being treated as errors

vim +/PPC_MSG_RM_HOST_ACTION +261 arch/powerpc/kernel/smp.c

   255   * Must check for PPC_MSG_RM_HOST_ACTION messages
   256   * before PPC_MSG_CALL_FUNCTION messages because when
   257   * a VM is destroyed, we call kick_all_cpus_sync()
   258   * to ensure that any pending PPC_MSG_RM_HOST_ACTION
   259   * messages have completed before we free any VCPUs.
   260   */
 > 261  if (all & IPI_MESSAGE(PPC_MSG_RM_HOST_ACTION))
   262  kvmppc_xics_ipi_action();
   263  #endif
   264  if (all & IPI_MESSAGE(PPC_MSG_CALL_FUNCTION))

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: Binary data

Re: [PATCH v4 18/21] KVM: ARM64: Add PMU overflow interrupt routing

2015-10-30 Thread Shannon Zhao

Hi,

Thanks for your test :)
It fails because there is no arch/arm/include/asm/pmu.h, while
arch/arm64/include/asm/pmu.h exists. I will fix this in the next version.

On 2015/10/30 20:08, kbuild test robot wrote:
> Hi Shannon,
> 
> [auto build test ERROR on kvm/linux-next -- if it's inappropriate base, 
> please suggest rules for selecting the more suitable base]
> 
> url:
> https://github.com/0day-ci/linux/commits/Shannon-Zhao/KVM-ARM64-Add-guest-PMU-support/20151030-143148
> config: arm-axm55xx_defconfig (attached as .config)
> reproduce:
> wget 
> https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross
>  -O ~/bin/make.cross
> chmod +x ~/bin/make.cross
> # save the attached .config to linux build tree
> make.cross ARCH=arm 
> 
> All errors (new ones prefixed by >>):
> 
>In file included from arch/arm/kvm/arm.c:31:0:
>>> include/kvm/arm_pmu.h:22:21: fatal error: asm/pmu.h: No such file or 
>>> directory
> #include 
> ^
>compilation terminated.
> 
> vim +22 include/kvm/arm_pmu.h
> 
> 219856f5 Shannon Zhao 2015-10-30  16   */
> 219856f5 Shannon Zhao 2015-10-30  17  
> 219856f5 Shannon Zhao 2015-10-30  18  #ifndef __ASM_ARM_KVM_PMU_H
> 219856f5 Shannon Zhao 2015-10-30  19  #define __ASM_ARM_KVM_PMU_H
> 219856f5 Shannon Zhao 2015-10-30  20  
> 219856f5 Shannon Zhao 2015-10-30  21  #include 
> 219856f5 Shannon Zhao 2015-10-30 @22  #include 
> 219856f5 Shannon Zhao 2015-10-30  23  
> 219856f5 Shannon Zhao 2015-10-30  24  struct kvm_pmc {
> 219856f5 Shannon Zhao 2015-10-30  25  u8 idx;/* index into the 
> pmu->pmc array */
> 
> :: The code at line 22 was first introduced by commit
> :: 219856f54d23298fff48e6e20e7e87fc45e42798 KVM: ARM64: Define PMU data 
> structure for each vcpu
> 
> :: TO: Shannon Zhao <shannon.z...@linaro.org>
> :: CC: 0day robot <fengguang...@intel.com>
> 
> ---
> 0-DAY kernel test infrastructureOpen Source Technology Center
> https://lists.01.org/pipermail/kbuild-all   Intel Corporation
> 

-- 
Shannon

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v4 1/6] virtio-net: Stop doing DMA from the stack

2015-10-30 Thread Andy Lutomirski

On Fri, Oct 30, 2015 at 6:55 AM, Christian Borntraeger
 wrote:
> Am 30.10.2015 um 02:09 schrieb Andy Lutomirski:
>> From: "Michael S. Tsirkin" 
>>
>> Once virtio starts using the DMA API, we won't be able to safely DMA
>> from the stack.  virtio-net does a couple of config DMA requests
>> from small stack buffers -- switch to using dynamically-allocated
>> memory.
>>
>> This should have no effect on any performance-critical code paths.
>>
>> [I wrote the subject and commit message.  mst wrote the code. --luto]
>>
>> Signed-off-by: Andy Lutomirski 
>> signed-off-by: Michael S. Tsirkin 
>
> I still get an error when using multiqueue:
>
> #  ethtool -L eth0 combined 4
> [   33.534686] virtio_ccw 0.0.000d: DMA-API: device driver maps memory from 
> stack [addr=629e7c06]

Fixed in my branch, I think.

--Andy
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCHv2 0/3] dma ops and virtio

2015-10-30 Thread Christian Borntraeger

Here is the 2nd version of providing a DMA API for s390.

There are some attempts to unify the dma ops (Christoph) as well
as some attempts to make virtio use the dma API (Andy).

At kernel summit we concluded that we want to use the same code on all
platforms, wherever possible, so having a dummy dma_op might be the
easiest solution to keep virtio-ccw as similar as possible to
virtio-pci. Together with a fixed up patch set from Andy Lutomirski
this seems to work.

We will also need a fixup for powerpc and QEMU changes to make virtio
work with iommu on power and x86.

TODO:
- future add-on patches to also fold in x86 no iommu
- dma_mask
- checking?
- make compilation of dma-noop dependent on something

v1->v2:
- initial testing
- always use dma_noop_ops if device has no private dma_ops
- get rid of setup in virtio_ccw,kvm_virtio
- set CONFIG_HAS_DMA(ATTRS) for virtio (fixes compile for !PCI)
- rename s390_dma_ops to s390_pci_dma_ops

Christian Borntraeger (3):
  Provide simple noop dma ops
  alpha: use common noop dma ops
  s390/dma: Allow per device dma ops

 arch/alpha/kernel/pci-noop.c| 46 ++
 arch/s390/Kconfig   |  3 +-
 arch/s390/include/asm/device.h  |  6 ++-
 arch/s390/include/asm/dma-mapping.h |  6 ++-
 arch/s390/pci/pci.c |  1 +
 arch/s390/pci/pci_dma.c |  4 +-
 include/linux/dma-mapping.h |  2 +
 lib/Makefile|  2 +-
 lib/dma-noop.c  | 77 +
 9 files changed, 98 insertions(+), 49 deletions(-)
 create mode 100644 lib/dma-noop.c

-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v4 1/6] virtio-net: Stop doing DMA from the stack

2015-10-30 Thread Christian Borntraeger

Am 30.10.2015 um 02:09 schrieb Andy Lutomirski:
> From: "Michael S. Tsirkin" 
> 
> Once virtio starts using the DMA API, we won't be able to safely DMA
> from the stack.  virtio-net does a couple of config DMA requests
> from small stack buffers -- switch to using dynamically-allocated
> memory.
> 
> This should have no effect on any performance-critical code paths.
> 
> [I wrote the subject and commit message.  mst wrote the code. --luto]
> 
> Signed-off-by: Andy Lutomirski 
> signed-off-by: Michael S. Tsirkin 

I still get an error when using multiqueue:

#  ethtool -L eth0 combined 4
[   33.534686] virtio_ccw 0.0.000d: DMA-API: device driver maps memory from 
stack [addr=629e7c06]
[   33.534704] [ cut here ]
[   33.534705] WARNING: at lib/dma-debug.c:1169
[   33.534706] Modules linked in: dm_multipath
[   33.534709] CPU: 1 PID: 1087 Comm: ethtool Not tainted 4.3.0-rc3+ #269
[   33.534710] task: 616f9978 ti: 629e4000 task.ti: 
629e4000
[   33.534712] Krnl PSW : 0704d0018000 005869d2 
(check_for_stack+0xb2/0x118)
[   33.534716]R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 
EA:3
Krnl GPRS: 006a 00d60f44 005a 64ee0870
[   33.534718]005869ce  0001 
629e7c06
[   33.534719] 0c06 0002 
6467f800
[   33.534720]64673428 629e7c06 005869ce 
629e7928
[   33.534726] Krnl Code: 005869c2: c0200024ad4elarl
%r2,a1c45e
   005869c8: c0e5ffe6d6fc   brasl   %r14,2617c0
  #005869ce: a7f40001   brc 15,5869d0
  >005869d2: c010003465eb   larl%r1,c135a8
   005869d8: e3101012   lt  %r1,0(%r1)
   005869de: a784000a   brc 8,5869f2
   005869e2: e340f0b4   lg  %r4,176(%r15)
   005869e8: ebcff0a4   lmg %r12,%r15,160(%r15)
[   33.534736] Call Trace:
[   33.534737] ([<005869ce>] check_for_stack+0xae/0x118)
[   33.534738]  [<00586e3c>] debug_dma_map_page+0x114/0x160
[   33.534740]  [<005a31f8>] vring_map_one_sg.isra.7+0x98/0xc0
[   33.534742]  [<005a3b72>] virtqueue_add_sgs+0x1e2/0x788
[   33.534744]  [<00618afc>] virtnet_send_command+0xcc/0x140
[   33.534745]  [<00618c0c>] virtnet_set_queues+0x9c/0x110
[   33.534747]  [<00619928>] virtnet_set_channels+0x78/0xe0
[   33.534748]  [<006f63ea>] ethtool_set_channels+0x62/0x88
[   33.534750]  [<006f8900>] dev_ethtool+0x10d8/0x1a48
[   33.534752]  [<0070c540>] dev_ioctl+0x190/0x510
[   33.534754]  [<006cf2da>] sock_do_ioctl+0x7a/0x90
[   33.534755]  [<006cf840>] sock_ioctl+0x1e8/0x2d0
[   33.534758]  [<002e6c78>] do_vfs_ioctl+0x3a8/0x508
[   33.534759]  [<002e6e7c>] SyS_ioctl+0xa4/0xb8
[   33.534762]  [<008231ec>] system_call+0x244/0x264
[   33.534763]  [<03ff922026d2>] 0x3ff922026d2
[   33.534764] Last Breaking-Event-Address:
[   33.534765]  [<005869ce>] check_for_stack+0xae/0x118
[   33.534766] ---[ end trace 2379df65f4decfc4 ]---


> ---
>  drivers/net/virtio_net.c | 34 +++---
>  1 file changed, 19 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index d8838dedb7a4..f94ab786088f 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -140,6 +140,12 @@ struct virtnet_info {
> 
>   /* CPU hot plug notifier */
>   struct notifier_block nb;
> +
> + /* Control VQ buffers: protected by the rtnl lock */
> + struct virtio_net_ctrl_hdr ctrl_hdr;
> + virtio_net_ctrl_ack ctrl_status;
> + u8 ctrl_promisc;
> + u8 ctrl_allmulti;
>  };
> 
>  struct padded_vnet_hdr {
> @@ -976,31 +982,30 @@ static bool virtnet_send_command(struct virtnet_info 
> *vi, u8 class, u8 cmd,
>struct scatterlist *out)
>  {
>   struct scatterlist *sgs[4], hdr, stat;
> - struct virtio_net_ctrl_hdr ctrl;
> - virtio_net_ctrl_ack status = ~0;
>   unsigned out_num = 0, tmp;
> 
>   /* Caller should know better */
>   BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
> 
> - ctrl.class = class;
> - ctrl.cmd = cmd;
> + vi->ctrl_status = ~0;
> + vi->ctrl_hdr.class = class;
> + vi->ctrl_hdr.cmd = cmd;
>   /* Add header */
> - sg_init_one(, , sizeof(ctrl));
> + sg_init_one(, >ctrl_hdr, sizeof(vi->ctrl_hdr));
>   sgs[out_num++] = 
> 
>   if (out)
>   sgs[out_num++] = out;
> 
>   /* Add return status. */
> - sg_init_one(, , sizeof(status));
> + sg_init_one(, >ctrl_status, sizeof(vi->ctrl_status));
>   sgs[out_num] = 
> 
>

Re: [PATCH 4/4] s390/virtio: use noop dma ops

2015-10-30 Thread Cornelia Huck

On Fri, 30 Oct 2015 13:26:09 +0100
Christian Borntraeger  wrote:

> I am currently reworking this to 
> 
>  static inline struct dma_map_ops *get_dma_ops(struct device *dev)
>  {
>   if (dev && dev->archdata.dma_ops)
>   return dev->archdata.dma_ops;
>   return &dma_noop_ops;
>  }
> 
> 
> Which uses the dma_noop_ops for everything unless the device overrides (PCI 
> does)

Yes, opt-in seems less error-prone here.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v6 27/33] nvdimm acpi: support function 0

2015-10-30 Thread Xiao Guangrong




On 10/30/2015 06:14 PM, Stefan Hajnoczi wrote:

On Fri, Oct 30, 2015 at 01:56:21PM +0800, Xiao Guangrong wrote:

  static uint64_t
  nvdimm_dsm_read(void *opaque, hwaddr addr, unsigned size)
  {
-return 0;
+AcpiNVDIMMState *state = opaque;
+MemoryRegion *dsm_ram_mr = >ram_mr;
+NvdimmDsmIn *in;
+GArray *out;
+void *dsm_ram_addr;
+uint32_t buf_size;
+
+assert(memory_region_size(dsm_ram_mr) >= sizeof(NvdimmDsmIn));
+dsm_ram_addr = memory_region_get_ram_ptr(dsm_ram_mr);
+
+/*
+ * The DSM memory is mapped to guest address space so an evil guest
+ * can change its content while we are doing DSM emulation. Avoid
+ * this by copying DSM memory to QEMU local memory.
+ */
+in = g_malloc(memory_region_size(dsm_ram_mr));
+memcpy(in, dsm_ram_addr, memory_region_size(dsm_ram_mr));
+
+le32_to_cpus(&in->revision);
+le32_to_cpus(&in->function);
+le32_to_cpus(&in->handle);
+
+nvdimm_debug("Revision %#x Handler %#x Function %#x.\n", in->revision,
+ in->handle, in->function);
+
+out = g_array_new(false, true /* clear */, 1);
+
+if (in->revision != 0x1 /* Current we support DSM Spec Rev1. */) {
+nvdimm_debug("Revision %#x is not supported, expect %#x.\n",
+  in->revision, 0x1);
+nvdimm_dsm_write_status(out, NVDIMM_DSM_STATUS_NOT_SUPPORTED);
+goto exit;
+}
+
+/* Handle 0 is reserved for NVDIMM Root Device. */
+if (!in->handle) {
+nvdimm_dsm_root(in, out);
+goto exit;
+}
+
+nvdimm_dsm_device(in, out);
+
+exit:
+/* Write output result to dsm memory. */
+memcpy(dsm_ram_addr, out->data, out->len);
+memory_region_set_dirty(dsm_ram_mr, 0, out->len);


If you respin this series, please add this before the memcpy out:

   assert(out->len <= memory_region_size(dsm_ram_mr))

That way we can catch situations where too much output data was
generated by mistake.



Okay. If this patchset is okay to be merged, I will add this in
future development.

Thanks for your continuous and active review, Stefan!

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] [PATCH v6 07/33] util: introduce qemu_file_get_page_size()

2015-10-30 Thread Vladimir Sementsov-Ogievskiy


The logic has changed:
in the old version, gethugepagesize() calls exit(1) on a statfs error;
in the new version it returns getpagesize() in this case (through fd_getpagesize).
(I think fd_getpagesize should be fixed to handle the error.)

Also, in the new version we have getpagesize() for Windows, whereas in the old
version there was no difference (how did it work?). Maybe it's OK, but it
should be mentioned in the commit message.



On 30.10.2015 08:56, Xiao Guangrong wrote:

There are three places that use the same logic to get the page size of
a file path or file fd.

This patch introduces qemu_file_get_page_size() to unify the code

Signed-off-by: Xiao Guangrong 
---
  include/qemu/osdep.h |  1 +
  target-ppc/kvm.c | 21 +++--
  util/oslib-posix.c   | 16 
  util/oslib-win32.c   |  5 +
  4 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index b568424..d4dde02 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -302,4 +302,5 @@ int qemu_read_password(char *buf, int buf_size);
   */
  pid_t qemu_fork(Error **errp);
  
+size_t qemu_file_get_page_size(const char *mem_path);

  #endif
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index ac70f08..c661f1c 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -308,28 +308,13 @@ static void kvm_get_smmu_info(PowerPCCPU *cpu, struct 
kvm_ppc_smmu_info *info)
  
  static long gethugepagesize(const char *mem_path)

  {
-struct statfs fs;
-int ret;
-
-do {
-ret = statfs(mem_path, );
-} while (ret != 0 && errno == EINTR);
+long size = qemu_file_get_page_size(mem_path);
  
-if (ret != 0) {

-fprintf(stderr, "Couldn't statfs() memory path: %s\n",
-strerror(errno));
+if (!size) {
  exit(1);
  }
  
-#define HUGETLBFS_MAGIC   0x958458f6

-
-if (fs.f_type != HUGETLBFS_MAGIC) {
-/* Explicit mempath, but it's ordinary pages */
-return getpagesize();
-}
-
-/* It's hugepage, return the huge page size */
-return fs.f_bsize;
+return size;
  }
  
  static int find_max_supported_pagesize(Object *obj, void *opaque)

diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index 914cef5..ad94c5a 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -360,6 +360,22 @@ static size_t fd_getpagesize(int fd)
  return getpagesize();
  }
  
+size_t qemu_file_get_page_size(const char *path)

+{
+size_t size = 0;
+int fd = qemu_open(path, O_RDONLY);
+
+if (fd < 0) {
+fprintf(stderr, "Could not open %s.\n", path);
+goto exit;
+}
+
+size = fd_getpagesize(fd);
+qemu_close(fd);
+exit:
+return size;
+}
+
  void os_mem_prealloc(int fd, char *area, size_t memory)
  {
  int ret;
diff --git a/util/oslib-win32.c b/util/oslib-win32.c
index 09f9e98..a18aa87 100644
--- a/util/oslib-win32.c
+++ b/util/oslib-win32.c
@@ -462,6 +462,11 @@ size_t getpagesize(void)
  return system_info.dwPageSize;
  }
  
+size_t qemu_file_get_page_size(const char *path)

+{
+return getpagesize();
+}
+
  void os_mem_prealloc(int fd, char *area, size_t memory)
  {
  int i;



--
Best regards,
Vladimir
* now, @virtuozzo.com instead of @parallels.com. Sorry for this inconvenience.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 4/4] s390/virtio: use noop dma ops

2015-10-30 Thread Christian Borntraeger

Am 30.10.2015 um 13:17 schrieb Cornelia Huck:
> On Tue, 27 Oct 2015 23:48:51 +0100
> Christian Borntraeger  wrote:
> 
>> With all infrastructure in place, lets provide dma_ops for virtio
>> devices on s390.
>>
>> Signed-off-by: Christian Borntraeger 
>> ---
>>  drivers/s390/virtio/kvm_virtio.c | 2 ++
>>  drivers/s390/virtio/virtio_ccw.c | 2 ++
>>  2 files changed, 4 insertions(+)
>>
>> diff --git a/drivers/s390/virtio/kvm_virtio.c 
>> b/drivers/s390/virtio/kvm_virtio.c
>> index 53fb975..05adaa9 100644
>> --- a/drivers/s390/virtio/kvm_virtio.c
>> +++ b/drivers/s390/virtio/kvm_virtio.c
>> @@ -13,6 +13,7 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>  #include 
>>  #include 
>>  #include 
>> @@ -318,6 +319,7 @@ static void add_kvm_device(struct kvm_device_desc *d, 
>> unsigned int offset)
>>  return;
>>  }
>>
>> +kdev->vdev.dev.archdata.dma_ops = _noop_ops;
> 
> This provides dma_ops for the vdev, while Andy's virtio code looks for
> dma_ops in the vdev's parent (in the ccw and pci cases, the proxy
> device; in this case, it would be our root device).
> 
> With
> 
> diff --git a/drivers/s390/virtio/kvm_virtio.c 
> b/drivers/s390/virtio/kvm_virtio.c
> index 05adaa9..5f79c52 100644
> --- a/drivers/s390/virtio/kvm_virtio.c
> +++ b/drivers/s390/virtio/kvm_virtio.c
> @@ -319,7 +319,6 @@ static void add_kvm_device(struct kvm_device_desc *d, 
> unsigned int offset)
>   return;
>   }
> 
> - kdev->vdev.dev.archdata.dma_ops = _noop_ops;
>   kdev->vdev.dev.parent = kvm_root;
>   kdev->vdev.id.device = d->type;
>   kdev->vdev.config = _vq_configspace_ops;
> @@ -473,6 +472,7 @@ static int __init kvm_devices_init(void)
>   vmem_remove_mapping(total_memory_size, PAGE_SIZE);
>   return rc;
>   }
> + kvm_root->archdata.dma_ops = _noop_ops;
> 
>   INIT_WORK(_work, hotplug_devices);
> 
> applied (and the endianness fix in the virtio code), I can boot a
> s390-virtio guest as well.

I am currently reworking this to 

 static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 {
if (dev && dev->archdata.dma_ops)
return dev->archdata.dma_ops;
return &dma_noop_ops;
 }


Which uses the dma_noop_ops for everything unless the device overrides (PCI 
does)

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] [kvm-unit-tests PATCHv5 3/3] arm: pmu: Add CPI checking

2015-10-30 Thread Andrew Jones

On Wed, Oct 28, 2015 at 03:12:55PM -0400, Christopher Covington wrote:
> Calculate the numbers of cycles per instruction (CPI) implied by ARM
> PMU cycle counter values. The code includes a strict checking facility
> intended for the -icount option in TCG mode but it is not yet enabled
> in the configuration file. Enabling it must wait on infrastructure
> improvements which allow for different tests to be run on TCG versus
> KVM.
> 
> Signed-off-by: Christopher Covington 
> ---
>  arm/pmu.c | 103 
> +-
>  1 file changed, 102 insertions(+), 1 deletion(-)
> 
> diff --git a/arm/pmu.c b/arm/pmu.c
> index 4334de4..76a 100644
> --- a/arm/pmu.c
> +++ b/arm/pmu.c
> @@ -43,6 +43,23 @@ static inline unsigned long get_pmccntr(void)
>   asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (cycles));
>   return cycles;
>  }
> +
> +/*
> + * Extra instructions inserted by the compiler would be difficult to 
> compensate
> + * for, so hand assemble everything between, and including, the PMCR accesses
> + * to start and stop counting.
> + */
> +static inline void loop(int i, uint32_t pmcr)
> +{
> + asm volatile(
> + "   mcr p15, 0, %[pmcr], c9, c12, 0\n"
> + "1: subs%[i], %[i], #1\n"
> + "   bgt 1b\n"
> + "   mcr p15, 0, %[z], c9, c12, 0\n"
> + : [i] "+r" (i)
> + : [pmcr] "r" (pmcr), [z] "r" (0)
> + : "cc");
> +}
>  #elif defined(__aarch64__)
>  static inline uint32_t get_pmcr(void)
>  {
> @@ -64,6 +81,23 @@ static inline unsigned long get_pmccntr(void)
>   asm volatile("mrs %0, pmccntr_el0" : "=r" (cycles));
>   return cycles;
>  }
> +
> +/*
> + * Extra instructions inserted by the compiler would be difficult to 
> compensate
> + * for, so hand assemble everything between, and including, the PMCR accesses
> + * to start and stop counting.
> + */
> +static inline void loop(int i, uint32_t pmcr)
> +{
> + asm volatile(
> + "   msr pmcr_el0, %[pmcr]\n"
> + "1: subs%[i], %[i], #1\n"
> + "   b.gt1b\n"
> + "   msr pmcr_el0, xzr\n"
> + : [i] "+r" (i)
> + : [pmcr] "r" (pmcr)
> + : "cc");
> +}
>  #endif
>  
>  struct pmu_data {
> @@ -131,12 +165,79 @@ static bool check_cycles_increase(void)
>   return true;
>  }
>  
> -int main(void)
> +/*
> + * Execute a known number of guest instructions. Only odd instruction counts
> + * greater than or equal to 3 are supported by the in-line assembly code. The
> + * control register (PMCR_EL0) is initialized with the provided value 
> (allowing
> + * for example for the cycle counter or event counters to be reset). At the 
> end
> + * of the exact instruction loop, zero is written to PMCR_EL0 to disable
> + * counting, allowing the cycle counter or event counters to be read at the
> + * leisure of the calling code.
> + */
> +static void measure_instrs(int num, uint32_t pmcr)
> +{
> + int i = (num - 1) / 2;
> +
> + assert(num >= 3 && ((num - 1) % 2 == 0));
> + loop(i, pmcr);
> +}
> +
> +/*
> + * Measure cycle counts for various known instruction counts. Ensure that the
> + * cycle counter progresses (similar to check_cycles_increase() but with more
> + * instructions and using reset and stop controls). If supplied a positive,
> + * nonzero CPI parameter, also strictly check that every measurement matches
> + * it. Strict CPI checking is used to test -icount mode.
> + */
> +static bool check_cpi(int cpi)
> +{
> + struct pmu_data pmu = {0};
> +
> + pmu.cycle_counter_reset = 1;
> + pmu.enable = 1;
> +
> + if (cpi > 0)
> + printf("Checking for CPI=%d.\n", cpi);
> + printf("instrs : cycles0 cycles1 ...\n");
> +
> + for (int i = 3; i < 300; i += 32) {
> + int avg, sum = 0;
> +
> + printf("%d :", i);
> + for (int j = 0; j < NR_SAMPLES; j++) {
> + int cycles;
> +
> + measure_instrs(i, pmu.pmcr_el0);
> + cycles = get_pmccntr();
> + printf(" %d", cycles);
> +
> + if (!cycles || (cpi > 0 && cycles != i * cpi)) {
> + printf("\n");
> + return false;
> + }
> +
> + sum += cycles;
> + }
> + avg = sum / NR_SAMPLES;
> + printf(" sum=%d avg=%d avg_ipc=%d avg_cpi=%d\n",
> + sum, avg, i / avg, avg / i);
> + }
> +
> + return true;
> +}
> +
> +int main(int argc, char *argv[])
>  {
> + int cpi = 0;
> +
> + if (argc >= 1)
> + cpi = atol(argv[0]);
> +
>   report_prefix_push("pmu");
>  
>   report("Control register", check_pmcr());
>   report("Monotonically increasing cycle count", check_cycles_increase());
> + report("Cycle/instruction ratio", check_cpi(cpi));
>  
>   return report_summary();
>  }
> -- 
> Qualcomm

Re: [Qemu-devel] [PATCH v6 08/33] exec: allow memory to be allocated from any kind of path

2015-10-30 Thread Vladimir Sementsov-Ogievskiy


On 30.10.2015 08:56, Xiao Guangrong wrote:

Currently file_ram_alloc() is designed for hugetlbfs, however, the memory
of nvdimm can come from either raw pmem device eg, /dev/pmem, or the file
locates at DAX enabled filesystem

So this patch let it work on any kind of path

Signed-off-by: Xiao Guangrong 
---
  exec.c | 56 +---
  1 file changed, 17 insertions(+), 39 deletions(-)

diff --git a/exec.c b/exec.c
index 8af2570..3ca7e50 100644
--- a/exec.c
+++ b/exec.c
@@ -1174,32 +1174,6 @@ void qemu_mutex_unlock_ramlist(void)
  }
  
  #ifdef __linux__

-
-#include 
-
-#define HUGETLBFS_MAGIC   0x958458f6
-
-static long gethugepagesize(const char *path, Error **errp)
-{
-struct statfs fs;
-int ret;
-
-do {
-ret = statfs(path, );
-} while (ret != 0 && errno == EINTR);
-
-if (ret != 0) {
-error_setg_errno(errp, errno, "failed to get page size of file %s",
- path);
-return 0;
-}
-
-if (fs.f_type != HUGETLBFS_MAGIC)
-fprintf(stderr, "Warning: path not on HugeTLBFS: %s\n", path);
-
-return fs.f_bsize;
-}
-
  static void *file_ram_alloc(RAMBlock *block,
  ram_addr_t memory,
  const char *path,
@@ -1210,20 +1184,24 @@ static void *file_ram_alloc(RAMBlock *block,
  char *c;
  void *area;
  int fd;
-uint64_t hpagesize;
-Error *local_err = NULL;
+uint64_t pagesize;
  
-hpagesize = gethugepagesize(path, _err);

-if (local_err) {
-error_propagate(errp, local_err);
+pagesize = qemu_file_get_page_size(path);
+if (!pagesize) {
+error_setg(errp, "can't get page size for %s", path);
  goto error;
  }
-block->mr->align = hpagesize;
  
-if (memory < hpagesize) {

+if (pagesize == getpagesize()) {
+fprintf(stderr, "Memory is not allocated from HugeTlbfs.\n");
+}


It is strange to see this warning every time.

Shouldn't the differentiation be done explicitly on the command line? Maybe a
separate option mem-tlb, or a separate flag tlbfs=on, or, for the new feature, a
new option mem-file, or prefixes for paths (tlbfs://, file://)... Or some
other way that does not mix things but splits them.



+
+block->mr->align = pagesize;
+
+if (memory < pagesize) {
  error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
-   "or larger than huge page size 0x%" PRIx64,
-   memory, hpagesize);
+   "or larger than page size 0x%" PRIx64,
+   memory, pagesize);
  goto error;
  }
  
@@ -1247,14 +1225,14 @@ static void *file_ram_alloc(RAMBlock *block,

  fd = mkstemp(filename);
  if (fd < 0) {
  error_setg_errno(errp, errno,
- "unable to create backing store for hugepages");
+ "unable to create backing store for path %s", path);
  g_free(filename);
  goto error;
  }
  unlink(filename);
  g_free(filename);
  
-memory = ROUND_UP(memory, hpagesize);

+memory = ROUND_UP(memory, pagesize);
  
  /*

   * ftruncate is not supported by hugetlbfs in older
@@ -1266,10 +1244,10 @@ static void *file_ram_alloc(RAMBlock *block,
  perror("ftruncate");
  }
  
-area = qemu_ram_mmap(fd, memory, hpagesize, block->flags & RAM_SHARED);

+area = qemu_ram_mmap(fd, memory, pagesize, block->flags & RAM_SHARED);
  if (area == MAP_FAILED) {
  error_setg_errno(errp, errno,
- "unable to map backing store for hugepages");
+ "unable to map backing store for path %s", path);
  close(fd);
  goto error;
  }



--
Best regards,
Vladimir
* now, @virtuozzo.com instead of @parallels.com. Sorry for this inconvenience.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 1/4] Provide simple noop dma ops

2015-10-30 Thread Christian Borntraeger

Am 28.10.2015 um 01:41 schrieb Joerg Roedel:
> Hi Christian,
> 
> On Tue, Oct 27, 2015 at 11:48:48PM +0100, Christian Borntraeger wrote:
>> +static dma_addr_t dma_noop_map_page(struct device *dev, struct page *page,
>> +  unsigned long offset, size_t size,
>> +  enum dma_data_direction dir,
>> +  struct dma_attrs *attrs)
>> +{
>> +return page_to_phys(page) + offset;
>> +}
> 
> X86 also has its own version of these noop dma_ops, see
> arch/x86/kernel/pci-nommu.c. This one also checks the dma_mask and
> prints a warning if the physical address doesn't fit into the mask.
> 
> I think this would make sense here too, and that we can also make x86
> use the same generic noop-dma-ops your are introducing.

It is not trivial without understanding the dma mask details. Do I read the x86
implementation right, that it limits the dma to 32 bit? Then we cannot collapse
both implementations. Or maybe we can hide this in dma_capable. Don't know.

So I would prefer to keep it as is and let someone with an x86 test environment
do the unification. Christoph, I think you wanted to do that anyway; are you
willing to do that?

Christian

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 4/4] s390/virtio: use noop dma ops

2015-10-30 Thread Christian Borntraeger

Am 28.10.2015 um 01:43 schrieb Joerg Roedel:
> On Tue, Oct 27, 2015 at 11:48:51PM +0100, Christian Borntraeger wrote:
>> @@ -1093,6 +1094,7 @@ static void virtio_ccw_auto_online(void *data, 
>> async_cookie_t cookie)
>>  struct ccw_device *cdev = data;
>>  int ret;
>>  
>> +cdev->dev.archdata.dma_ops = _noop_ops;
>>  ret = ccw_device_set_online(cdev);
>>  if (ret)
>>  dev_warn(>dev, "Failed to set online: %d\n", ret);
> 
> Hmm, drivers usually don't deal with setting the dma_ops for their
> devices, as they depend on the platform and not so much on the device
> itself.
> 
> Can you do this special-case handling from device independent platform
> code, where you also setup dma_ops for other devices?
> 

Yes, fixed in v2.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 4/4] s390/virtio: use noop dma ops

2015-10-30 Thread Cornelia Huck

On Tue, 27 Oct 2015 23:48:51 +0100
Christian Borntraeger  wrote:

> With all infrastructure in place, lets provide dma_ops for virtio
> devices on s390.
> 
> Signed-off-by: Christian Borntraeger 
> ---
>  drivers/s390/virtio/kvm_virtio.c | 2 ++
>  drivers/s390/virtio/virtio_ccw.c | 2 ++
>  2 files changed, 4 insertions(+)
> 
> diff --git a/drivers/s390/virtio/kvm_virtio.c 
> b/drivers/s390/virtio/kvm_virtio.c
> index 53fb975..05adaa9 100644
> --- a/drivers/s390/virtio/kvm_virtio.c
> +++ b/drivers/s390/virtio/kvm_virtio.c
> @@ -13,6 +13,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -318,6 +319,7 @@ static void add_kvm_device(struct kvm_device_desc *d, 
> unsigned int offset)
>   return;
>   }
> 
> + kdev->vdev.dev.archdata.dma_ops = _noop_ops;

This provides dma_ops for the vdev, while Andy's virtio code looks for
dma_ops in the vdev's parent (in the ccw and pci cases, the proxy
device; in this case, it would be our root device).

With

diff --git a/drivers/s390/virtio/kvm_virtio.c b/drivers/s390/virtio/kvm_virtio.c
index 05adaa9..5f79c52 100644
--- a/drivers/s390/virtio/kvm_virtio.c
+++ b/drivers/s390/virtio/kvm_virtio.c
@@ -319,7 +319,6 @@ static void add_kvm_device(struct kvm_device_desc *d, 
unsigned int offset)
return;
}
 
-   kdev->vdev.dev.archdata.dma_ops = _noop_ops;
kdev->vdev.dev.parent = kvm_root;
kdev->vdev.id.device = d->type;
kdev->vdev.config = _vq_configspace_ops;
@@ -473,6 +472,7 @@ static int __init kvm_devices_init(void)
vmem_remove_mapping(total_memory_size, PAGE_SIZE);
return rc;
}
+   kvm_root->archdata.dma_ops = _noop_ops;
 
INIT_WORK(_work, hotplug_devices);
 
applied (and the endianness fix in the virtio code), I can boot a
s390-virtio guest as well.

>   kdev->vdev.dev.parent = kvm_root;
>   kdev->vdev.id.device = d->type;
>   kdev->vdev.config = _vq_configspace_ops;

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/3] Provide simple noop dma ops

2015-10-30 Thread Christian Borntraeger

We are going to require dma_ops for several common drivers, even for
systems that do have an identity mapping. Let's provide some minimal
no-op dma_ops that can be used for that purpose.

Signed-off-by: Christian Borntraeger 
---
 include/linux/dma-mapping.h |  2 ++
 lib/Makefile|  2 +-
 lib/dma-noop.c  | 77 +
 3 files changed, 80 insertions(+), 1 deletion(-)
 create mode 100644 lib/dma-noop.c

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index ac07ff0..7912f54 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -66,6 +66,8 @@ struct dma_map_ops {
int is_phys;
 };
 
+extern struct dma_map_ops dma_noop_ops;
+
 #define DMA_BIT_MASK(n)(((n) == 64) ? ~0ULL : ((1ULL<<(n))-1))
 
 #define DMA_MASK_NONE  0x0ULL
diff --git a/lib/Makefile b/lib/Makefile
index 13a7c6a..b04ba71 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -13,7 +13,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 sha1.o md5.o irq_regs.o argv_split.o \
 proportions.o flex_proportions.o ratelimit.o show_mem.o \
 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
-earlycpio.o seq_buf.o nmi_backtrace.o
+earlycpio.o seq_buf.o nmi_backtrace.o dma-noop.o
 
 obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o
 lib-$(CONFIG_MMU) += ioremap.o
diff --git a/lib/dma-noop.c b/lib/dma-noop.c
new file mode 100644
index 000..3ce31302
--- /dev/null
+++ b/lib/dma-noop.c
@@ -0,0 +1,77 @@
+/*
+ * lib/dma-noop.c
+ *
+ * Stub DMA noop-ops
+ */
+#include 
+#include 
+#include 
+#include 
+
+static void *dma_noop_alloc(struct device *dev, size_t size,
+   dma_addr_t *dma_handle, gfp_t gfp,
+   struct dma_attrs *attrs)
+{
+   void *ret;
+
+   ret = (void *)__get_free_pages(gfp, get_order(size));
+   if (ret) {
+   memset(ret, 0, size);
+   *dma_handle = virt_to_phys(ret);
+   }
+   return ret;
+}
+
+static void dma_noop_free(struct device *dev, size_t size,
+ void *cpu_addr, dma_addr_t dma_addr,
+ struct dma_attrs *attrs)
+{
+   free_pages((unsigned long)cpu_addr, get_order(size));
+}
+
+static dma_addr_t dma_noop_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+{
+   return page_to_phys(page) + offset;
+}
+
+static int dma_noop_map_sg(struct device *dev, struct scatterlist *sgl, int 
nents,
+enum dma_data_direction dir, struct dma_attrs 
*attrs)
+{
+   int i;
+   struct scatterlist *sg;
+
+   for_each_sg(sgl, sg, nents, i) {
+   void *va;
+
+   BUG_ON(!sg_page(sg));
+   va = sg_virt(sg);
+   sg_dma_address(sg) = (dma_addr_t)virt_to_phys(va);
+   sg_dma_len(sg) = sg->length;
+   }
+
+   return nents;
+}
+
+static int dma_noop_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+   return 0;
+}
+
+static int dma_noop_supported(struct device *dev, u64 mask)
+{
+   return 1;
+}
+
+struct dma_map_ops dma_noop_ops = {
+   .alloc  = dma_noop_alloc,
+   .free   = dma_noop_free,
+   .map_page   = dma_noop_map_page,
+   .map_sg = dma_noop_map_sg,
+   .mapping_error  = dma_noop_mapping_error,
+   .dma_supported  = dma_noop_supported,
+};
+
+EXPORT_SYMBOL(dma_noop_ops);
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 2/3] alpha: use common noop dma ops

2015-10-30 Thread Christian Borntraeger

Some of the alpha pci noop dma ops are identical to the common ones.
Use them.

Signed-off-by: Christian Borntraeger 
---
 arch/alpha/kernel/pci-noop.c | 46 
 1 file changed, 4 insertions(+), 42 deletions(-)

diff --git a/arch/alpha/kernel/pci-noop.c b/arch/alpha/kernel/pci-noop.c
index 2b1f4a1..8e735b5e 100644
--- a/arch/alpha/kernel/pci-noop.c
+++ b/arch/alpha/kernel/pci-noop.c
@@ -123,44 +123,6 @@ static void *alpha_noop_alloc_coherent(struct device *dev, 
size_t size,
return ret;
 }
 
-static void alpha_noop_free_coherent(struct device *dev, size_t size,
-void *cpu_addr, dma_addr_t dma_addr,
-struct dma_attrs *attrs)
-{
-   free_pages((unsigned long)cpu_addr, get_order(size));
-}
-
-static dma_addr_t alpha_noop_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
-   return page_to_pa(page) + offset;
-}
-
-static int alpha_noop_map_sg(struct device *dev, struct scatterlist *sgl, int 
nents,
-enum dma_data_direction dir, struct dma_attrs 
*attrs)
-{
-   int i;
-   struct scatterlist *sg;
-
-   for_each_sg(sgl, sg, nents, i) {
-   void *va;
-
-   BUG_ON(!sg_page(sg));
-   va = sg_virt(sg);
-   sg_dma_address(sg) = (dma_addr_t)virt_to_phys(va);
-   sg_dma_len(sg) = sg->length;
-   }
-
-   return nents;
-}
-
-static int alpha_noop_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-   return 0;
-}
-
 static int alpha_noop_supported(struct device *dev, u64 mask)
 {
return mask < 0x00ffUL ? 0 : 1;
@@ -168,10 +130,10 @@ static int alpha_noop_supported(struct device *dev, u64 
mask)
 
 struct dma_map_ops alpha_noop_ops = {
.alloc  = alpha_noop_alloc_coherent,
-   .free   = alpha_noop_free_coherent,
-   .map_page   = alpha_noop_map_page,
-   .map_sg = alpha_noop_map_sg,
-   .mapping_error  = alpha_noop_mapping_error,
+   .free   = dma_noop_free_coherent,
+   .map_page   = dma_noop_map_page,
+   .map_sg = dma_noop_map_sg,
+   .mapping_error  = dma_noop_mapping_error,
.dma_supported  = alpha_noop_supported,
 };
 
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 3/3] s390/dma: Allow per device dma ops

2015-10-30 Thread Christian Borntraeger

As virtio-ccw now has dma ops, we can no longer default to the PCI ones.
Make use of dev_archdata to keep the dma_ops per device. The pci devices
now use that to override the default, and the default is changed to use
the noop ops for everything that is not PCI. To compile without PCI
support we also have to enable the DMA api with virtio.

Signed-off-by: Christian Borntraeger 
---
 arch/s390/Kconfig   | 3 ++-
 arch/s390/include/asm/device.h  | 6 +-
 arch/s390/include/asm/dma-mapping.h | 6 --
 arch/s390/pci/pci.c | 1 +
 arch/s390/pci/pci_dma.c | 4 ++--
 5 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 1d57000..04f0e02 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -113,6 +113,7 @@ config S390
select GENERIC_FIND_FIRST_BIT
select GENERIC_SMP_IDLE_THREAD
select GENERIC_TIME_VSYSCALL
+   select HAS_DMA
select HAVE_ALIGNED_STRUCT_PAGE if SLUB
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_EARLY_PFN_TO_NID
@@ -124,6 +125,7 @@ config S390
select HAVE_CMPXCHG_DOUBLE
select HAVE_CMPXCHG_LOCAL
select HAVE_DEBUG_KMEMLEAK
+   select HAVE_DMA_ATTRS
select HAVE_DYNAMIC_FTRACE
select HAVE_DYNAMIC_FTRACE_WITH_REGS
select HAVE_FTRACE_MCOUNT_RECORD
@@ -580,7 +582,6 @@ config QDIO
 
 menuconfig PCI
bool "PCI support"
-   select HAVE_DMA_ATTRS
select PCI_MSI
help
  Enable PCI support.
diff --git a/arch/s390/include/asm/device.h b/arch/s390/include/asm/device.h
index d8f9872..4a9f35e 100644
--- a/arch/s390/include/asm/device.h
+++ b/arch/s390/include/asm/device.h
@@ -3,5 +3,9 @@
  *
  * This file is released under the GPLv2
  */
-#include 
+struct dev_archdata {
+   struct dma_map_ops *dma_ops;
+};
 
+struct pdev_archdata {
+};
diff --git a/arch/s390/include/asm/dma-mapping.h 
b/arch/s390/include/asm/dma-mapping.h
index b3fd54d..cb05f5c 100644
--- a/arch/s390/include/asm/dma-mapping.h
+++ b/arch/s390/include/asm/dma-mapping.h
@@ -11,11 +11,13 @@
 
 #define DMA_ERROR_CODE (~(dma_addr_t) 0x0)
 
-extern struct dma_map_ops s390_dma_ops;
+extern struct dma_map_ops s390_pci_dma_ops;
 
 static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 {
-   return _dma_ops;
+   if (dev && dev->archdata.dma_ops)
+   return dev->archdata.dma_ops;
+   return _noop_ops;
 }
 
 static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index 7ef12a3..fa41605 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -649,6 +649,7 @@ int pcibios_add_device(struct pci_dev *pdev)
 
zdev->pdev = pdev;
pdev->dev.groups = zpci_attr_groups;
+   pdev->dev.archdata.dma_ops = _pci_dma_ops;
zpci_map_resources(pdev);
 
for (i = 0; i < PCI_BAR_COUNT; i++) {
diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index 37505b8..ea39c3f 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -495,7 +495,7 @@ static int __init dma_debug_do_init(void)
 }
 fs_initcall(dma_debug_do_init);
 
-struct dma_map_ops s390_dma_ops = {
+struct dma_map_ops s390_pci_dma_ops = {
.alloc  = s390_dma_alloc,
.free   = s390_dma_free,
.map_sg = s390_dma_map_sg,
@@ -506,7 +506,7 @@ struct dma_map_ops s390_dma_ops = {
.is_phys= 0,
/* dma_supported is unconditionally true without a callback */
 };
-EXPORT_SYMBOL_GPL(s390_dma_ops);
+EXPORT_SYMBOL_GPL(s390_pci_dma_ops);
 
 static int __init s390_iommu_setup(char *str)
 {
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 1/4] Provide simple noop dma ops

2015-10-30 Thread Joerg Roedel

On Fri, Oct 30, 2015 at 01:55:56PM +0100, Christian Borntraeger wrote:
> It's not trivial without understanding the dma mask details. Do I read
> the x86 implementation right, that it limits the dma to 32 bit? Then
> we cannot collapse both implementations. Or maybe we can hide this in
> dma_capable. Don't know

No, DMA is not limited to 32bit on x86. Each device has its own
dma_mask, and the requested address+size is checked against it. The
DMA_BIT_MASK(32) check is only there to print a warning when a 32bit
capable device tries to access memory above 4GB.

Joerg

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] KVM: x86: zero apic_arb_prio on reset

2015-10-30 Thread Radim Krčmář

BSP doesn't get INIT so its apic_arb_prio isn't zeroed after reboot.
BSP won't get lowest priority interrupts until other VCPUs get enough
interrupts to match their pre-reboot apic_arb_prio.

That behavior doesn't fit into KVM's round-robin-like interpretation of
lowest priority delivery ... userspace should KVM_SET_LAPIC on reset, so
just zero apic_arb_prio there.

Reported-by: Yuki Shibuya 
Signed-off-by: Radim Krčmář 
---
 arch/x86/kvm/lapic.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 1b02c44c7b8b..08655020417d 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1924,6 +1924,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
kvm_make_request(KVM_REQ_EVENT, vcpu);
if (ioapic_in_kernel(vcpu->kvm))
kvm_rtc_eoi_tracking_restore_one(vcpu);
+
+   vcpu->arch.apic_arb_prio = 0;
 }
 
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
-- 
2.5.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] [PATCH v6 09/33] exec: allow file_ram_alloc to work on file

2015-10-30 Thread Vladimir Sementsov-Ogievskiy


On 30.10.2015 08:56, Xiao Guangrong wrote:

Currently, file_ram_alloc() only works on a directory - it creates a file
under @path and does mmap on it

This patch tries to allow it to work on a file directly: if @path is a
directory it works as before, otherwise it treats @path as the target
file and directly allocates memory from it

Signed-off-by: Xiao Guangrong 
---
  exec.c | 80 ++
  1 file changed, 51 insertions(+), 29 deletions(-)

diff --git a/exec.c b/exec.c
index 3ca7e50..f219010 100644
--- a/exec.c
+++ b/exec.c
@@ -1174,14 +1174,60 @@ void qemu_mutex_unlock_ramlist(void)
  }
  
  #ifdef __linux__

+static bool path_is_dir(const char *path)
+{
+struct stat fs;
+
+return stat(path, ) == 0 && S_ISDIR(fs.st_mode);
+}
+
+static int open_file_path(RAMBlock *block, const char *path, size_t size)


I think the name should be more descriptive, as it is a very special
function in the common exec.c and it doesn't "just open a file path". Maybe
open_ram_file_path




+{
+char *filename;
+char *sanitized_name;
+char *c;
+int fd;
+
+if (!path_is_dir(path)) {
+int flags = (block->flags & RAM_SHARED) ? O_RDWR : O_RDONLY;
+
+flags |= O_EXCL;
+return open(path, flags);
+}


Weren't there old scenarios where path is a file? statfs will succeed for
any file within a mounted filesystem.


Also, maybe we shouldn't warn about "Memory is not allocated from
HugeTlbfs.\n" in the case of !path_is_dir?



+
+/* Make name safe to use with mkstemp by replacing '/' with '_'. */
+sanitized_name = g_strdup(memory_region_name(block->mr));
+for (c = sanitized_name; *c != '\0'; c++) {
+if (*c == '/') {
+*c = '_';
+}
+}
+filename = g_strdup_printf("%s/qemu_back_mem.%s.XX", path,
+   sanitized_name);
+g_free(sanitized_name);
+fd = mkstemp(filename);
+if (fd >= 0) {
+unlink(filename);
+/*
+ * ftruncate is not supported by hugetlbfs in older
+ * hosts, so don't bother bailing out on errors.
+ * If anything goes wrong with it under other filesystems,
+ * mmap will fail.
+ */
+if (ftruncate(fd, size)) {
+perror("ftruncate");
+}
+}
+g_free(filename);
+
+return fd;
+}
+
  static void *file_ram_alloc(RAMBlock *block,
  ram_addr_t memory,
  const char *path,
  Error **errp)
  {
-char *filename;
-char *sanitized_name;
-char *c;
  void *area;
  int fd;
  uint64_t pagesize;
@@ -1211,38 +1257,14 @@ static void *file_ram_alloc(RAMBlock *block,
  goto error;
  }
  
-/* Make name safe to use with mkstemp by replacing '/' with '_'. */

-sanitized_name = g_strdup(memory_region_name(block->mr));
-for (c = sanitized_name; *c != '\0'; c++) {
-if (*c == '/')
-*c = '_';
-}
-
-filename = g_strdup_printf("%s/qemu_back_mem.%s.XX", path,
-   sanitized_name);
-g_free(sanitized_name);
+memory = ROUND_UP(memory, pagesize);
  
-fd = mkstemp(filename);

+fd = open_file_path(block, path, memory);
  if (fd < 0) {
  error_setg_errno(errp, errno,
   "unable to create backing store for path %s", path);
-g_free(filename);
  goto error;
  }
-unlink(filename);
-g_free(filename);
-
-memory = ROUND_UP(memory, pagesize);
-
-/*
- * ftruncate is not supported by hugetlbfs in older
- * hosts, so don't bother bailing out on errors.
- * If anything goes wrong with it under other filesystems,
- * mmap will fail.
- */
-if (ftruncate(fd, memory)) {
-perror("ftruncate");
-}
  
  area = qemu_ram_mmap(fd, memory, pagesize, block->flags & RAM_SHARED);

  if (area == MAP_FAILED) {



--
Best regards,
Vladimir
* now, @virtuozzo.com instead of @parallels.com. Sorry for this inconvenience.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] [PATCH v6 10/33] hostmem-file: clean up memory allocation

2015-10-30 Thread Vladimir Sementsov-Ogievskiy


On 30.10.2015 08:56, Xiao Guangrong wrote:

- hostmem-file.c is compiled only if CONFIG_LINUX is enabled so that is
   unnecessary to do the same check in the source file

- the interface, HostMemoryBackendClass->alloc(), is not called many
   times, do not need to check if the memory-region is initialized

Signed-off-by: Xiao Guangrong 
---
  backends/hostmem-file.c | 11 +++
  1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c
index e9b6d21..9097a57 100644
--- a/backends/hostmem-file.c
+++ b/backends/hostmem-file.c
@@ -46,17 +46,12 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error 
**errp)
  error_setg(errp, "mem-path property not set");
  return;
  }
-#ifndef CONFIG_LINUX
-error_setg(errp, "-mem-path not supported on this host");
-#else
-if (!memory_region_size(>mr)) {
-backend->force_prealloc = mem_prealloc;
-memory_region_init_ram_from_file(>mr, OBJECT(backend),
+
+backend->force_prealloc = mem_prealloc;
+memory_region_init_ram_from_file(>mr, OBJECT(backend),
   object_get_canonical_path(OBJECT(backend)),
   backend->size, fb->share,
   fb->mem_path, errp);
-}
-#endif
  }
  
  static void


The similar function for the memory backend (the only other
HostMemoryBackendClass) - ram_backend_memory_alloc - has no such check
either. It's OK.


Reviewed-by: Vladimir Sementsov-Ogievskiy 

--
Best regards,
Vladimir
* now, @virtuozzo.com instead of @parallels.com. Sorry for this inconvenience.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v3 0/3] virtio DMA API core stuff

2015-10-30 Thread Joerg Roedel

On Thu, Oct 29, 2015 at 11:01:41AM +0200, Michael S. Tsirkin wrote:
> Example: you have a mix of assigned devices and virtio devices. You
> don't trust your assigned device vendor not to corrupt your memory so
> you want to limit the damage your assigned device can do to your guest,
> so you use an IOMMU for that.  Thus existing iommu=pt within guest is out.
> 
> But you trust your hypervisor (you have no choice anyway),
> and you don't want the overhead of tweaking IOMMU
> on data path for virtio. Thus iommu=on is out too.

IOMMUs on x86 usually come with an ACPI table that describes which
IOMMUs are in the system and which devices they translate. So you can
easily describe all devices there that are not behind an IOMMU.

The ACPI table is built by the BIOS, and the platform initialization code
sets the device dma_ops accordingly. If the BIOS provides wrong
information in the ACPI table this is a platform bug.

> I'm not sure what ACPI has to do with it.  It's about a way for guest
> users to specify whether they want to bypass an IOMMU for a given
> device.

We have no way yet to request passthrough-mode per-device from the IOMMU
drivers, but that can easily be added. But as I see it:

> By the way, a bunch of code is missing on the QEMU side
> to make this useful:
> 1. virtio ignores the iommu
> 2. vhost user ignores the iommu
> 3. dataplane ignores the iommu
> 4. vhost-net ignores the iommu
> 5. VFIO ignores the iommu

Qemu does not implement IOMMU translation for virtio devices anyway
(which is fine), so it just should tell the guest so in the ACPI table
built to describe the emulated IOMMU.

Joerg

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] [PATCH v6 11/33] hostmem-file: use whole file size if possible

2015-10-30 Thread Vladimir Sementsov-Ogievskiy


On 30.10.2015 08:56, Xiao Guangrong wrote:

Use the whole file size if @size is not specified which is useful
if we want to directly pass a file to guest

Signed-off-by: Xiao Guangrong 
---
  backends/hostmem-file.c | 48 
  1 file changed, 44 insertions(+), 4 deletions(-)

diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c
index 9097a57..e1bc9ff 100644
--- a/backends/hostmem-file.c
+++ b/backends/hostmem-file.c
@@ -9,6 +9,9 @@
   * This work is licensed under the terms of the GNU GPL, version 2 or later.
   * See the COPYING file in the top-level directory.
   */
+#include 
+#include 
+
  #include "qemu-common.h"
  #include "sysemu/hostmem.h"
  #include "sysemu/sysemu.h"
@@ -33,20 +36,57 @@ struct HostMemoryBackendFile {
  char *mem_path;
  };
  
+static uint64_t get_file_size(const char *file)

+{
+struct stat stat_buf;
+uint64_t size = 0;
+int fd;
+
+fd = open(file, O_RDONLY);
+if (fd < 0) {
+return 0;
+}
+
+if (stat(file, _buf) < 0) {
+goto exit;
+}
+
+if ((S_ISBLK(stat_buf.st_mode)) && !ioctl(fd, BLKGETSIZE64, )) {
+goto exit;
+}
+
+size = lseek(fd, 0, SEEK_END);
+if (size == -1) {
+size = 0;
+}
+exit:
+close(fd);
+return size;
+}
+
  static void
  file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
  {
  HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(backend);
  
-if (!backend->size) {

-error_setg(errp, "can't create backend with size 0");
-return;
-}
  if (!fb->mem_path) {
  error_setg(errp, "mem-path property not set");
  return;
  }
  
+if (!backend->size) {

+/*
+ * use the whole file size if @size is not specified.
+ */
+backend->size = get_file_size(fb->mem_path);
+}
+
+if (!backend->size) {
+error_setg(errp, "failed to get file size for %s, can't create "
+ "backend on it", mem_path);
+return;
+}
+
  backend->force_prealloc = mem_prealloc;
  memory_region_init_ram_from_file(>mr, OBJECT(backend),
   object_get_canonical_path(OBJECT(backend)),


You can use error_setg_errno to get an error description for free, as
all possible errors from get_file_size come with errno. Only the zero-size
case should be separated for this, for example by using int64_t as the return
type and -1 for an error.


Sorry for this nit-picking)

--
Best regards,
Vladimir
* now, @virtuozzo.com instead of @parallels.com. Sorry for this inconvenience.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 5/6] KVM: PPC: Book3S HV: Send IPI to host core to wake VCPU

2015-10-30 Thread Suresh E. Warrier

This patch set depends upon a previous patch set that I had submitted to
linux-ppc. The URL for that is:

https://lists.ozlabs.org/pipermail/linuxppc-dev/2015-October/135794.html

-suresh

On 10/29/2015 11:52 PM, kbuild test robot wrote:
> Hi Suresh,
> 
> [auto build test ERROR on kvm/linux-next -- if it's inappropriate base, 
> please suggest rules for selecting the more suitable base]
> 
> url:
> https://github.com/0day-ci/linux/commits/Suresh-Warrier/KVM-PPC-Book3S-HV-Optimize-wakeup-VCPU-from-H_IPI/20151030-081329
> config: powerpc-defconfig (attached as .config)
> reproduce:
> wget 
> https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross
>  -O ~/bin/make.cross
> chmod +x ~/bin/make.cross
> # save the attached .config to linux build tree
> make.cross ARCH=powerpc 
> 
> All errors (new ones prefixed by >>):
> 
>arch/powerpc/kvm/book3s_hv_rm_xics.c: In function 'icp_rm_set_vcpu_irq':
>>> arch/powerpc/kvm/book3s_hv_rm_xics.c:142:4: error: implicit declaration of 
>>> function 'smp_muxed_ipi_rm_message_pass' 
>>> [-Werror=implicit-function-declaration]
>smp_muxed_ipi_rm_message_pass(hcpu,
>^
>arch/powerpc/kvm/book3s_hv_rm_xics.c:143:7: error: 
> 'PPC_MSG_RM_HOST_ACTION' undeclared (first use in this function)
>   PPC_MSG_RM_HOST_ACTION);
>   ^
>arch/powerpc/kvm/book3s_hv_rm_xics.c:143:7: note: each undeclared 
> identifier is reported only once for each function it appears in
>cc1: all warnings being treated as errors
> 
> vim +/smp_muxed_ipi_rm_message_pass +142 arch/powerpc/kvm/book3s_hv_rm_xics.c
> 
>136hcore = -1;
>137if (kvmppc_host_rm_ops_hv)
>138hcore = 
> find_available_hostcore(XICS_RM_KICK_VCPU);
>139if (hcore != -1) {
>140hcpu = hcore << threads_shift;
>141
> kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu;
>  > 142smp_muxed_ipi_rm_message_pass(hcpu,
>143
> PPC_MSG_RM_HOST_ACTION);
>144} else {
>145this_icp->rm_action |= 
> XICS_RM_KICK_VCPU;
> 
> ---
> 0-DAY kernel test infrastructureOpen Source Technology Center
> https://lists.01.org/pipermail/kbuild-all   Intel Corporation
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] KVM: x86: fix RSM into 64-bit protected mode, round 2

2015-10-30 Thread Radim Krčmář

2015-10-26 17:32+0100, Paolo Bonzini:
> On 26/10/2015 16:43, Laszlo Ersek wrote:
>>> The code would be cleaner if we had a different approach, but this works
>>> too and is safer for stable. In case you prefer to leave the rewrite for
>>> a future victim,
>> 
>> It's hard to express how much I prefer that.
> 
> Radim, if you want to have a try go ahead since I cannot apply the patch
> until next Monday.

The future I originally had in mind was more hoverboardy, but a series
just landed, "KVM: x86: simplify RSM into 64-bit protected mode".

Laszlo, I'd be grateful if you could check that it works.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/3] KVM: x86: add read_phys to x86_emulate_ops

2015-10-30 Thread Radim Krčmář

We want to read the physical memory when emulating RSM.

X86EMUL_IO_NEEDED is returned on all errors for consistency with other
helpers.

Signed-off-by: Radim Krčmář 
---
 arch/x86/include/asm/kvm_emulate.h | 10 ++
 arch/x86/kvm/x86.c | 10 ++
 2 files changed, 20 insertions(+)

diff --git a/arch/x86/include/asm/kvm_emulate.h 
b/arch/x86/include/asm/kvm_emulate.h
index e16466ec473c..96f1d1c5e6cb 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -112,6 +112,16 @@ struct x86_emulate_ops {
struct x86_exception *fault);
 
/*
+* read_phys: Read bytes of standard (non-emulated/special) memory.
+*Used for descriptor reading.
+*  @addr:  [IN ] Physical address from which to read.
+*  @val:   [OUT] Value read from memory.
+*  @bytes: [IN ] Number of bytes to read from memory.
+*/
+   int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr,
+   void *val, unsigned int bytes);
+
+   /*
 * write_std: Write bytes of standard (non-emulated/special) memory.
 *Used for descriptor writing.
 *  @addr:  [IN ] Linear address to which to write.
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 441cb9d4ec8a..ae5af651af89 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4097,6 +4097,15 @@ static int kvm_read_guest_virt_system(struct 
x86_emulate_ctxt *ctxt,
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
 }
 
+static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
+   unsigned long addr, void *val, unsigned int bytes)
+{
+   struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+   int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes);
+
+   return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
+}
+
 int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
   gva_t addr, void *val,
   unsigned int bytes,
@@ -4832,6 +4841,7 @@ static const struct x86_emulate_ops emulate_ops = {
.write_gpr   = emulator_write_gpr,
.read_std= kvm_read_guest_virt_system,
.write_std   = kvm_write_guest_virt_system,
+   .read_phys   = kvm_read_guest_phys_system,
.fetch   = kvm_fetch_guest_virt,
.read_emulated   = emulator_read_emulated,
.write_emulated  = emulator_write_emulated,
-- 
2.5.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 0/3] KVM: x86: simplify RSM into 64-bit protected mode

2015-10-30 Thread Radim Krčmář

This series is based on "KVM: x86: fix RSM into 64-bit protected mode,
round 2" and reverts it in [3/3].  To avoid regressions after doing so,
[1/3] introduces a helper that is used in [2/3] to hopefully get the
same behavior.

I'll set up test environment next week, unless a random act of kindness
allows me not to.


Radim Krčmář (3):
  KVM: x86: add read_phys to x86_emulate_ops
  KVM: x86: handle SMBASE as physical address in RSM
  KVM: x86: simplify RSM into 64-bit protected mode

 arch/x86/include/asm/kvm_emulate.h | 10 +
 arch/x86/kvm/emulate.c | 44 +-
 arch/x86/kvm/x86.c | 10 +
 3 files changed, 30 insertions(+), 34 deletions(-)

-- 
2.5.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

1 2 >

1 - 100 of 135 matches

Mail list logo