[PATCH v2 11/27] hw/xen: Pass grant ref to gnttab unmap operation

2023-03-07 Thread David Woodhouse
From: David Woodhouse 

The previous commit introduced redirectable gnttab operations fairly
much like-for-like, with the exception of the extra arguments to the
->open() call which were always NULL/0 anyway.

This *changes* the arguments to the ->unmap() operation to include the
original ref# that was mapped. Under real Xen it isn't necessary; all we
need to do from QEMU is munmap(), then the kernel will release the grant,
and Xen does the tracking/refcounting for the guest.

When we have emulated grant tables though, we need to do all that for
ourselves. So let's have the back ends keep track of what they mapped
and pass it in to the ->unmap() method for us.
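
Roughly, the interface change looks like this (a sketch of the new
declarations, not the exact hunks below):

    /* Sketch only: the single-ref variant wraps the multi-ref one. */
    void xen_be_unmap_grant_refs(struct XenLegacyDevice *xendev, void *ptr,
                                 uint32_t *refs, unsigned int nr_refs);

    static inline void xen_be_unmap_grant_ref(struct XenLegacyDevice *xendev,
                                              void *ptr, uint32_t ref)
    {
        xen_be_unmap_grant_refs(xendev, ptr, &ref, 1);
    }

So a back end which mapped a ring with xen_be_map_grant_ref() simply hands
the same ref back at teardown, as the xen_console and xen_nic changes below
do.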

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/9pfs/xen-9p-backend.c|  7 ---
 hw/block/dataplane/xen-block.c  |  1 +
 hw/char/xen_console.c   |  2 +-
 hw/net/xen_nic.c| 13 -
 hw/usb/xen-usb.c| 21 -
 hw/xen/xen-bus.c|  4 ++--
 hw/xen/xen-legacy-backend.c |  4 ++--
 hw/xen/xen-operations.c |  9 -
 include/hw/xen/xen-bus.h|  2 +-
 include/hw/xen/xen-legacy-backend.h |  6 +++---
 include/hw/xen/xen_backend_ops.h|  7 ---
 11 files changed, 50 insertions(+), 26 deletions(-)

diff --git a/hw/9pfs/xen-9p-backend.c b/hw/9pfs/xen-9p-backend.c
index 864bdaf952..d8bb0e847c 100644
--- a/hw/9pfs/xen-9p-backend.c
+++ b/hw/9pfs/xen-9p-backend.c
@@ -359,12 +359,13 @@ static int xen_9pfs_free(struct XenLegacyDevice *xendev)
 if (xen_9pdev->rings[i].data != NULL) {
 xen_be_unmap_grant_refs(&xen_9pdev->xendev,
 xen_9pdev->rings[i].data,
+xen_9pdev->rings[i].intf->ref,
 (1 << xen_9pdev->rings[i].ring_order));
 }
 if (xen_9pdev->rings[i].intf != NULL) {
-xen_be_unmap_grant_refs(&xen_9pdev->xendev,
-xen_9pdev->rings[i].intf,
-1);
+xen_be_unmap_grant_ref(&xen_9pdev->xendev,
+   xen_9pdev->rings[i].intf,
+   xen_9pdev->rings[i].ref);
 }
 if (xen_9pdev->rings[i].bh != NULL) {
 qemu_bh_delete(xen_9pdev->rings[i].bh);
diff --git a/hw/block/dataplane/xen-block.c b/hw/block/dataplane/xen-block.c
index 2785b9e849..e55b713002 100644
--- a/hw/block/dataplane/xen-block.c
+++ b/hw/block/dataplane/xen-block.c
@@ -705,6 +705,7 @@ void xen_block_dataplane_stop(XenBlockDataPlane *dataplane)
 Error *local_err = NULL;
 
 xen_device_unmap_grant_refs(xendev, dataplane->sring,
+dataplane->ring_ref,
 dataplane->nr_ring_ref, &local_err);
 dataplane->sring = NULL;
 
diff --git a/hw/char/xen_console.c b/hw/char/xen_console.c
index 63153dfde4..19ad6c946a 100644
--- a/hw/char/xen_console.c
+++ b/hw/char/xen_console.c
@@ -271,7 +271,7 @@ static void con_disconnect(struct XenLegacyDevice *xendev)
 if (!xendev->dev) {
 xenforeignmemory_unmap(xen_fmem, con->sring, 1);
 } else {
-xen_be_unmap_grant_ref(xendev, con->sring);
+xen_be_unmap_grant_ref(xendev, con->sring, con->ring_ref);
 }
 con->sring = NULL;
 }
diff --git a/hw/net/xen_nic.c b/hw/net/xen_nic.c
index 7d92c2d022..166d03787d 100644
--- a/hw/net/xen_nic.c
+++ b/hw/net/xen_nic.c
@@ -181,7 +181,7 @@ static void net_tx_packets(struct XenNetDev *netdev)
 qemu_send_packet(qemu_get_queue(netdev->nic),
  page + txreq.offset, txreq.size);
 }
-xen_be_unmap_grant_ref(&netdev->xendev, page);
+xen_be_unmap_grant_ref(&netdev->xendev, page, txreq.gref);
 net_tx_response(netdev, &txreq, NETIF_RSP_OKAY);
 }
 if (!netdev->tx_work) {
@@ -261,7 +261,7 @@ static ssize_t net_rx_packet(NetClientState *nc, const uint8_t *buf, size_t size
 return -1;
 }
 memcpy(page + NET_IP_ALIGN, buf, size);
-xen_be_unmap_grant_ref(&netdev->xendev, page);
+xen_be_unmap_grant_ref(&netdev->xendev, page, rxreq.gref);
 net_rx_response(netdev, &rxreq, NETIF_RSP_OKAY, NET_IP_ALIGN, size, 0);
 
 return size;
@@ -343,7 +343,8 @@ static int net_connect(struct XenLegacyDevice *xendev)
netdev->rx_ring_ref,
PROT_READ | PROT_WRITE);
 if (!netdev->rxs) {
-xen_be_unmap_grant_ref(&netdev->xendev, netdev->txs);
+xen_be_unmap_grant_ref(&netdev->xendev, netdev->txs,
+   net

[PATCH v2 27/27] docs: Update Xen-on-KVM documentation for PV disk support

2023-03-07 Thread David Woodhouse
From: David Woodhouse 

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 docs/system/i386/xen.rst | 30 +++---
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/docs/system/i386/xen.rst b/docs/system/i386/xen.rst
index a00523b492..f06765e88c 100644
--- a/docs/system/i386/xen.rst
+++ b/docs/system/i386/xen.rst
@@ -9,6 +9,8 @@ KVM has support for hosting Xen guests, intercepting Xen hypercalls and event
 channel (Xen PV interrupt) delivery. This allows guests which expect to be
 run under Xen to be hosted in QEMU under Linux/KVM instead.
 
+Using the split irqchip is mandatory for Xen support.
+
 Setup
 -----
 
@@ -17,14 +19,14 @@ accelerator, for example for Xen 4.10:
 
 .. parsed-literal::
 
-  |qemu_system| --accel kvm,xen-version=0x4000a
+  |qemu_system| --accel kvm,xen-version=0x4000a,kernel-irqchip=split
 
 Additionally, virtual APIC support can be advertised to the guest through the
 ``xen-vapic`` CPU flag:
 
 .. parsed-literal::
 
-  |qemu_system| --accel kvm,xen-version=0x4000a --cpu host,+xen_vapic
+  |qemu_system| --accel kvm,xen-version=0x4000a,kernel-irqchip=split --cpu host,+xen_vapic
 
 When Xen support is enabled, QEMU changes hypervisor identification (CPUID
 0x4000..0x400A) to Xen. The KVM identification and features are not
@@ -33,11 +35,25 @@ moves to leaves 0x4100..0x410A.
 
 The Xen platform device is enabled automatically for a Xen guest. This allows
 a guest to unplug all emulated devices, in order to use Xen PV block and network
-drivers instead. Note that until the Xen PV device back ends are enabled to work
-with Xen mode in QEMU, that is unlikely to cause significant joy. Linux guests
-can be dissuaded from this by adding 'xen_emul_unplug=never' on their command
-line, and it can also be noted that AHCI disk controllers are exempt from being
-unplugged, as are passthrough VFIO PCI devices.
+drivers instead. Under Xen, the boot disk is typically available both via IDE
+emulation, and as a PV block device. Guest bootloaders typically use IDE to load
+the guest kernel, which then unplugs the IDE and continues with the Xen PV block
+device.
+
+This configuration can be achieved as follows
+
+.. parsed-literal::
+
+  |qemu_system| -M pc --accel kvm,xen-version=0x4000a,kernel-irqchip=split \\
+   -drive file=${GUEST_IMAGE},if=none,id=disk,file.locking=off -device xen-disk,drive=disk,vdev=xvda \\
+   -drive file=${GUEST_IMAGE},index=2,media=disk,file.locking=off,if=ide
+
+It is necessary to use the pc machine type, as the q35 machine uses AHCI instead
+of legacy IDE, and AHCI disks are not unplugged through the Xen PV unplug
+mechanism.
+
+VirtIO devices can also be used; Linux guests may need to be dissuaded from
+unplugging them by adding 'xen_emul_unplug=never' on their command line.
 
 Properties
 ----------
-- 
2.39.0




[PATCH v2 04/27] hw/xen: Implement XenStore transactions

2023-03-07 Thread David Woodhouse
From: David Woodhouse 

Given that the whole thing supported copy on write from the beginning,
transactions end up being fairly simple. On starting a transaction, just
take a ref of the existing root; swap it back in on a successful commit.

The main tree has a transaction ID too, and we keep a record of the last
transaction ID given out. If the main tree is ever modified when it isn't
the latest, it gets a new transaction ID.

A commit can only succeed if the main tree hasn't moved on since it was
forked. Strictly speaking, the XenStore protocol allows a transaction to
succeed as long as nothing *it* read or wrote has changed in the interim,
but no implementations do that; *any* change is sufficient to abort a
transaction.
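
The commit path therefore boils down to something like this (a rough sketch
using the patch's field names, not the exact code):

    /* Sketch: succeed only if nothing else has touched the main tree. */
    static int transaction_commit_sketch(XenstoreImplState *s, XsTransaction *tx)
    {
        if (s->root_tx != tx->base_tx) {
            return EAGAIN;          /* main tree moved on since the fork */
        }

        xs_node_unref(s->root);     /* drop the old root... */
        s->root = tx->root;         /* ...and swap in the transaction's copy */
        tx->root = NULL;
        s->root_tx = tx->tx_id;     /* a commit counts as a change, too */
        s->nr_nodes = tx->nr_nodes;
        return 0;
    }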

This does not yet fire watches on the changed nodes on a commit. That bit
is more fun and will come in a follow-on commit.

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/i386/kvm/xenstore_impl.c | 150 ++--
 tests/unit/test-xs-node.c   | 118 
 2 files changed, 262 insertions(+), 6 deletions(-)

diff --git a/hw/i386/kvm/xenstore_impl.c b/hw/i386/kvm/xenstore_impl.c
index 9c2348835f..0812e367b0 100644
--- a/hw/i386/kvm/xenstore_impl.c
+++ b/hw/i386/kvm/xenstore_impl.c
@@ -46,13 +46,56 @@ typedef struct XsWatch {
 int rel_prefix;
 } XsWatch;
 
+typedef struct XsTransaction {
+XsNode *root;
+unsigned int nr_nodes;
+unsigned int base_tx;
+unsigned int tx_id;
+unsigned int dom_id;
+} XsTransaction;
+
 struct XenstoreImplState {
 XsNode *root;
 unsigned int nr_nodes;
 GHashTable *watches;
 unsigned int nr_domu_watches;
+GHashTable *transactions;
+unsigned int nr_domu_transactions;
+unsigned int root_tx;
+unsigned int last_tx;
 };
 
+
+static void nobble_tx(gpointer key, gpointer value, gpointer user_data)
+{
+unsigned int *new_tx_id = user_data;
+XsTransaction *tx = value;
+
+if (tx->base_tx == *new_tx_id) {
+/* Transactions based on XBT_NULL will always fail */
+tx->base_tx = XBT_NULL;
+}
+}
+
+static inline unsigned int next_tx(struct XenstoreImplState *s)
+{
+unsigned int tx_id;
+
+/* Find the next TX id which isn't either XBT_NULL or in use. */
+do {
+tx_id = ++s->last_tx;
+} while (tx_id == XBT_NULL || tx_id == s->root_tx ||
+ g_hash_table_lookup(s->transactions, GINT_TO_POINTER(tx_id)));
+
+/*
+ * It is vanishingly unlikely, but ensure that no outstanding transaction
+ * is based on the (previous incarnation of the) newly-allocated TX id.
+ */
+g_hash_table_foreach(s->transactions, nobble_tx, &tx_id);
+
+return tx_id;
+}
+
 static inline XsNode *xs_node_new(void)
 {
 XsNode *n = g_new0(XsNode, 1);
@@ -159,6 +202,7 @@ struct walk_op {
 
 GList *watches;
 unsigned int dom_id;
+unsigned int tx_id;
 
 /* The number of nodes which will exist in the tree if this op succeeds. */
 unsigned int new_nr_nodes;
@@ -176,6 +220,7 @@ struct walk_op {
 bool inplace;
 bool mutating;
 bool create_dirs;
+bool in_transaction;
 };
 
 static void fire_watches(struct walk_op *op, bool parents)
@@ -183,7 +228,7 @@ static void fire_watches(struct walk_op *op, bool parents)
 GList *l = NULL;
 XsWatch *w;
 
-if (!op->mutating) {
+if (!op->mutating || op->in_transaction) {
 return;
 }
 
@@ -450,10 +495,23 @@ static int xs_node_walk(XsNode **n, struct walk_op *op)
 assert(!op->watches);
 /*
  * On completing the recursion back up the path walk and reaching the
- * top, assign the new node count if the operation was successful.
+ * top, assign the new node count if the operation was successful. If
+ * the main tree was changed, bump its tx ID so that outstanding
+ * transactions correctly fail. But don't bump it every time; only
+ * if it makes a difference.
  */
 if (!err && op->mutating) {
-op->s->nr_nodes = op->new_nr_nodes;
+if (!op->in_transaction) {
+if (op->s->root_tx != op->s->last_tx) {
+op->s->root_tx = next_tx(op->s);
+}
+op->s->nr_nodes = op->new_nr_nodes;
+} else {
+XsTransaction *tx = g_hash_table_lookup(op->s->transactions,
+GINT_TO_POINTER(op->tx_id));
+assert(tx);
+tx->nr_nodes = op->new_nr_nodes;
+}
 }
 }
 return err;
@@ -535,14 +593,23 @@ static int init_walk_op(XenstoreImplState *s, struct walk_op *op,
 op->inplace = true;
 op->mutating = false;
 op->create_dirs = false;
+op->in_transaction = false;
 op->dom_id = dom_id;
+op->tx_

[PATCH v2 13/27] hw/xen: Add xenstore operations to allow redirection to internal emulation

2023-03-07 Thread David Woodhouse
From: Paul Durrant 

Signed-off-by: Paul Durrant 
Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 accel/xen/xen-all.c |  11 +-
 hw/char/xen_console.c   |   2 +-
 hw/i386/kvm/xen_xenstore.c  |   3 -
 hw/i386/kvm/xenstore_impl.h |   8 +-
 hw/xen/xen-bus-helper.c |  62 +++
 hw/xen/xen-bus.c| 261 
 hw/xen/xen-legacy-backend.c | 119 +++--
 hw/xen/xen-operations.c | 198 +
 hw/xen/xen_devconfig.c  |   4 +-
 hw/xen/xen_pt_graphics.c|   1 -
 hw/xen/xen_pvdev.c  |  49 +-
 include/hw/xen/xen-bus-helper.h |  26 +--
 include/hw/xen/xen-bus.h|  17 +-
 include/hw/xen/xen-legacy-backend.h |   6 +-
 include/hw/xen/xen_backend_ops.h| 163 +
 include/hw/xen/xen_common.h |   1 -
 include/hw/xen/xen_pvdev.h  |   2 +-
 softmmu/globals.c   |   1 +
 18 files changed, 525 insertions(+), 409 deletions(-)

diff --git a/accel/xen/xen-all.c b/accel/xen/xen-all.c
index e85e4aeba5..425216230f 100644
--- a/accel/xen/xen-all.c
+++ b/accel/xen/xen-all.c
@@ -90,12 +90,15 @@ void xenstore_store_pv_console_info(int i, Chardev *chr)
 }
 
 
-static void xenstore_record_dm_state(struct xs_handle *xs, const char *state)
+static void xenstore_record_dm_state(const char *state)
 {
+struct xs_handle *xs;
 char path[50];
 
+/* We now have everything we need to set the xenstore entry. */
+xs = xs_open(0);
 if (xs == NULL) {
-error_report("xenstore connection not initialized");
+fprintf(stderr, "Could not contact XenStore\n");
 exit(1);
 }
 
@@ -109,6 +112,8 @@ static void xenstore_record_dm_state(struct xs_handle *xs, const char *state)
 error_report("error recording dm state");
 exit(1);
 }
+
+xs_close(xs);
 }
 
 
@@ -117,7 +122,7 @@ static void xen_change_state_handler(void *opaque, bool running,
 {
 if (running) {
 /* record state running */
-xenstore_record_dm_state(xenstore, "running");
+xenstore_record_dm_state("running");
 }
 }
 
diff --git a/hw/char/xen_console.c b/hw/char/xen_console.c
index e9cef3e1ef..ad8638a86d 100644
--- a/hw/char/xen_console.c
+++ b/hw/char/xen_console.c
@@ -181,7 +181,7 @@ static int con_init(struct XenLegacyDevice *xendev)
 const char *output;
 
 /* setup */
-dom = xs_get_domain_path(xenstore, con->xendev.dom);
+dom = qemu_xen_xs_get_domain_path(xenstore, con->xendev.dom);
 if (!xendev->dev) {
 snprintf(con->console, sizeof(con->console), "%s/console", dom);
 } else {
diff --git a/hw/i386/kvm/xen_xenstore.c b/hw/i386/kvm/xen_xenstore.c
index fb3648a058..35898e9b37 100644
--- a/hw/i386/kvm/xen_xenstore.c
+++ b/hw/i386/kvm/xen_xenstore.c
@@ -38,9 +38,6 @@
 #define TYPE_XEN_XENSTORE "xen-xenstore"
 OBJECT_DECLARE_SIMPLE_TYPE(XenXenstoreState, XEN_XENSTORE)
 
-#define XEN_PAGE_SHIFT 12
-#define XEN_PAGE_SIZE (1ULL << XEN_PAGE_SHIFT)
-
 #define ENTRIES_PER_FRAME_V1 (XEN_PAGE_SIZE / sizeof(grant_entry_v1_t))
 #define ENTRIES_PER_FRAME_V2 (XEN_PAGE_SIZE / sizeof(grant_entry_v2_t))
 
diff --git a/hw/i386/kvm/xenstore_impl.h b/hw/i386/kvm/xenstore_impl.h
index bbe2391e2e..0df2a91aae 100644
--- a/hw/i386/kvm/xenstore_impl.h
+++ b/hw/i386/kvm/xenstore_impl.h
@@ -12,13 +12,7 @@
 #ifndef QEMU_XENSTORE_IMPL_H
 #define QEMU_XENSTORE_IMPL_H
 
-typedef uint32_t xs_transaction_t;
-
-#define XBT_NULL 0
-
-#define XS_PERM_NONE  0x00
-#define XS_PERM_READ  0x01
-#define XS_PERM_WRITE 0x02
+#include "hw/xen/xen_backend_ops.h"
 
 typedef struct XenstoreImplState XenstoreImplState;
 
diff --git a/hw/xen/xen-bus-helper.c b/hw/xen/xen-bus-helper.c
index 5a1e12b374..b2b2cc9c5d 100644
--- a/hw/xen/xen-bus-helper.c
+++ b/hw/xen/xen-bus-helper.c
@@ -10,6 +10,7 @@
 #include "hw/xen/xen-bus.h"
 #include "hw/xen/xen-bus-helper.h"
 #include "qapi/error.h"
+#include "trace.h"
 
 #include 
 
@@ -46,34 +47,28 @@ const char *xs_strstate(enum xenbus_state state)
 return "INVALID";
 }
 
-void xs_node_create(struct xs_handle *xsh, xs_transaction_t tid,
-const char *node, struct xs_permissions perms[],
-unsigned int nr_perms, Error **errp)
+void xs_node_create(struct qemu_xs_handle *h, xs_transaction_t tid,
+const char *node, unsigned int owner, unsigned int domid,
+unsigned int perms, Error **errp)
 {
 trace_xs_node_create(node);
 
-if (!xs_write(xsh, tid, node, "", 0)) {
+if (!qemu_xen_xs_create(h, tid, owner, domid, perms, node)) {
 error_setg_errno(errp, errno, "failed to create node '%s'", node);
-return;
-}
-
-if (!xs_set_permissions(xsh, t

[PATCH v2 02/27] hw/xen: Add basic XenStore tree walk and write/read/directory support

2023-03-07 Thread David Woodhouse
From: David Woodhouse 

This is a fairly simple implementation of a copy-on-write tree.

The node walk function starts off at the root, with 'inplace == true'.
If it ever encounters a node with a refcount greater than one (including
the root node), then that node is shared with other trees, and cannot
be modified in place, so the inplace flag is cleared and we copy on
write from there on down.

Xenstore write has 'mkdir -p' semantics and will create the intermediate
nodes if they don't already exist, so in that case we flip the inplace
flag back to true as we populate the newly-created nodes.

We put a copy of the absolute path into the buffer in the struct walk_op,
with *two* NUL terminators at the end. As xs_node_walk() goes down the
tree, it replaces the next '/' separator with a NUL so that it can use
the 'child name' in place. The next recursion down then puts the '/'
back and repeats the exercise for the next path element... if it doesn't
hit that *second* NUL termination which indicates the true end of the
path.
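
As a small illustration of that walk (fragment only, not from the patch):

    /* "local/domain/1/foo" is stored with *two* trailing NULs. */
    char path[32] = "local/domain/1/foo";
    path[strlen(path) + 1] = '\0';     /* second NUL marks the true end */

    char *slash = strchr(path, '/');
    if (slash) {
        *slash = '\0';   /* "local" is now usable as the child name in place */
        /* ... recurse into the child with slash + 1 as the remaining path, */
        *slash = '/';    /* then restore the separator on the way back up.  */
    }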

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/i386/kvm/xenstore_impl.c | 527 +++-
 tests/unit/meson.build  |   1 +
 tests/unit/test-xs-node.c   | 197 ++
 3 files changed, 718 insertions(+), 7 deletions(-)
 create mode 100644 tests/unit/test-xs-node.c

diff --git a/hw/i386/kvm/xenstore_impl.c b/hw/i386/kvm/xenstore_impl.c
index 31dbc98fe0..9e10a31bea 100644
--- a/hw/i386/kvm/xenstore_impl.c
+++ b/hw/i386/kvm/xenstore_impl.c
@@ -10,13 +10,470 @@
  */
 
 #include "qemu/osdep.h"
+#include "qom/object.h"
 
 #include "xen_xenstore.h"
 #include "xenstore_impl.h"
 
+#include "hw/xen/interface/io/xs_wire.h"
+
+#define XS_MAX_WATCHES  128
+#define XS_MAX_DOMAIN_NODES 1000
+#define XS_MAX_NODE_SIZE2048
+#define XS_MAX_TRANSACTIONS 10
+#define XS_MAX_PERMS_PER_NODE   5
+
+#define XS_VALID_CHARS "abcdefghijklmnopqrstuvwxyz" \
+   "ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
+   "0123456789-/_"
+
+typedef struct XsNode {
+uint32_t ref;
+GByteArray *content;
+GHashTable *children;
+uint64_t gencnt;
+#ifdef XS_NODE_UNIT_TEST
+gchar *name; /* debug only */
+#endif
+} XsNode;
+
 struct XenstoreImplState {
+XsNode *root;
+unsigned int nr_nodes;
 };
 
+static inline XsNode *xs_node_new(void)
+{
+XsNode *n = g_new0(XsNode, 1);
+n->ref = 1;
+
+#ifdef XS_NODE_UNIT_TEST
+nr_xs_nodes++;
+xs_node_list = g_list_prepend(xs_node_list, n);
+#endif
+return n;
+}
+
+static inline XsNode *xs_node_ref(XsNode *n)
+{
+/* With just 10 transactions, it can never get anywhere near this. */
+g_assert(n->ref < INT_MAX);
+
+g_assert(n->ref);
+n->ref++;
+return n;
+}
+
+static inline void xs_node_unref(XsNode *n)
+{
+if (!n) {
+return;
+}
+g_assert(n->ref);
+if (--n->ref) {
+return;
+}
+
+if (n->content) {
+g_byte_array_unref(n->content);
+}
+if (n->children) {
+g_hash_table_unref(n->children);
+}
+#ifdef XS_NODE_UNIT_TEST
+g_free(n->name);
+nr_xs_nodes--;
+xs_node_list = g_list_remove(xs_node_list, n);
+#endif
+g_free(n);
+}
+
+/* For copying from one hash table to another using g_hash_table_foreach() */
+static void do_insert(gpointer key, gpointer value, gpointer user_data)
+{
+g_hash_table_insert(user_data, g_strdup(key), xs_node_ref(value));
+}
+
+static XsNode *xs_node_copy(XsNode *old)
+{
+XsNode *n = xs_node_new();
+
+n->gencnt = old->gencnt;
+if (old->children) {
+n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
+(GDestroyNotify)xs_node_unref);
+g_hash_table_foreach(old->children, do_insert, n->children);
+}
+if (old && old->content) {
+n->content = g_byte_array_ref(old->content);
+}
+return n;
+}
+
+/* Returns true if it made a change to the hash table */
+static bool xs_node_add_child(XsNode *n, const char *path_elem, XsNode *child)
+{
+assert(!strchr(path_elem, '/'));
+
+if (!child) {
+assert(n->children);
+return g_hash_table_remove(n->children, path_elem);
+}
+
+#ifdef XS_NODE_UNIT_TEST
+g_free(child->name);
+child->name = g_strdup(path_elem);
+#endif
+if (!n->children) {
+n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
+(GDestroyNotify)xs_node_unref);
+}
+
+/*
+ * The documentation for g_hash_table_insert() says that it "returns a
+ * boolean value to indicate whether the newly added value was already
+ * in the hash table or not."
+ *
+ * It could 

[PATCH v2 19/27] hw/xen: Only advertise ring-page-order for xen-block if gnttab supports it

2023-03-07 Thread David Woodhouse
From: David Woodhouse 

When emulating Xen, multi-page grants are distinctly non-trivial and we
have elected not to support them for the time being. Don't advertise
them to the guest.

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/block/xen-block.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/hw/block/xen-block.c b/hw/block/xen-block.c
index 87299615e3..f5a744589d 100644
--- a/hw/block/xen-block.c
+++ b/hw/block/xen-block.c
@@ -83,7 +83,8 @@ static void xen_block_connect(XenDevice *xendev, Error **errp)
 g_free(ring_ref);
 return;
 }
-} else if (order <= blockdev->props.max_ring_page_order) {
+} else if (qemu_xen_gnttab_can_map_multi() &&
+   order <= blockdev->props.max_ring_page_order) {
 unsigned int i;
 
 nr_ring_ref = 1 << order;
@@ -255,8 +256,12 @@ static void xen_block_realize(XenDevice *xendev, Error **errp)
 }
 
 xen_device_backend_printf(xendev, "feature-flush-cache", "%u", 1);
-xen_device_backend_printf(xendev, "max-ring-page-order", "%u",
-  blockdev->props.max_ring_page_order);
+
+if (qemu_xen_gnttab_can_map_multi()) {
+xen_device_backend_printf(xendev, "max-ring-page-order", "%u",
+  blockdev->props.max_ring_page_order);
+}
+
 xen_device_backend_printf(xendev, "info", "%u", blockdev->info);
 
 xen_device_frontend_printf(xendev, "virtual-device", "%lu",
-- 
2.39.0




[PATCH v2 10/27] hw/xen: Add gnttab operations to allow redirection to internal emulation

2023-03-07 Thread David Woodhouse
From: David Woodhouse 

Move the existing code using libxengnttab to xen-operations.c and allow
the operations to be redirected so that we can add emulation of grant
table mapping for backend drivers.

In emulation, mapping more than one grant ref to be virtually contiguous
would be fairly difficult. The best way to do it might be to make the
ram_block mappings actually backed by a file (shmem or a deleted file,
perhaps) so that we can have multiple *shared* mappings of it. But that
would be fairly intrusive.

Making the backend drivers cope with page *lists* instead of expecting
the mapping to be contiguous is also non-trivial, since some structures
would actually *cross* page boundaries (e.g. the 32-bit blkif responses
which are 12 bytes).

So for now, we'll support only single-page mappings in emulation. Add a
XEN_GNTTAB_OP_FEATURE_MAP_MULTIPLE flag to indicate that the native Xen
implementation *does* support multi-page maps, and a helper function to
query it.
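
The query helper amounts to checking a feature bit on the active ops
structure, roughly like this (sketch; see xen_backend_ops.h for the real
definitions):

    #define XEN_GNTTAB_OP_FEATURE_MAP_MULTIPLE  (1U << 0)

    static inline bool qemu_xen_gnttab_can_map_multi(void)
    {
        return xen_gnttab_ops &&
            !!(xen_gnttab_ops->features & XEN_GNTTAB_OP_FEATURE_MAP_MULTIPLE);
    }

The native libxengnttab backend sets the flag; the emulated one (added later
in this series) leaves it clear.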

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/xen/xen-bus.c| 112 ++--
 hw/xen/xen-legacy-backend.c | 125 ++
 hw/xen/xen-operations.c | 157 
 hw/xen/xen_pvdev.c  |   2 +-
 include/hw/xen/xen-bus.h|   3 +-
 include/hw/xen/xen-legacy-backend.h |  13 +--
 include/hw/xen/xen_backend_ops.h| 100 ++
 include/hw/xen/xen_common.h |  39 ---
 softmmu/globals.c   |   1 +
 9 files changed, 280 insertions(+), 272 deletions(-)

diff --git a/hw/xen/xen-bus.c b/hw/xen/xen-bus.c
index d0b1ae93da..b247e86f28 100644
--- a/hw/xen/xen-bus.c
+++ b/hw/xen/xen-bus.c
@@ -947,7 +947,7 @@ static void xen_device_frontend_destroy(XenDevice *xendev)
 void xen_device_set_max_grant_refs(XenDevice *xendev, unsigned int nr_refs,
Error **errp)
 {
-if (xengnttab_set_max_grants(xendev->xgth, nr_refs)) {
+if (qemu_xen_gnttab_set_max_grants(xendev->xgth, nr_refs)) {
 error_setg_errno(errp, errno, "xengnttab_set_max_grants failed");
 }
 }
@@ -956,9 +956,8 @@ void *xen_device_map_grant_refs(XenDevice *xendev, uint32_t *refs,
 unsigned int nr_refs, int prot,
 Error **errp)
 {
-void *map = xengnttab_map_domain_grant_refs(xendev->xgth, nr_refs,
-xendev->frontend_id, refs,
-prot);
+void *map = qemu_xen_gnttab_map_refs(xendev->xgth, nr_refs,
+ xendev->frontend_id, refs, prot);
 
 if (!map) {
 error_setg_errno(errp, errno,
@@ -971,109 +970,17 @@ void *xen_device_map_grant_refs(XenDevice *xendev, uint32_t *refs,
 void xen_device_unmap_grant_refs(XenDevice *xendev, void *map,
  unsigned int nr_refs, Error **errp)
 {
-if (xengnttab_unmap(xendev->xgth, map, nr_refs)) {
+if (qemu_xen_gnttab_unmap(xendev->xgth, map, nr_refs)) {
 error_setg_errno(errp, errno, "xengnttab_unmap failed");
 }
 }
 
-static void compat_copy_grant_refs(XenDevice *xendev, bool to_domain,
-   XenDeviceGrantCopySegment segs[],
-   unsigned int nr_segs, Error **errp)
-{
-uint32_t *refs = g_new(uint32_t, nr_segs);
-int prot = to_domain ? PROT_WRITE : PROT_READ;
-void *map;
-unsigned int i;
-
-for (i = 0; i < nr_segs; i++) {
-XenDeviceGrantCopySegment *seg = &segs[i];
-
-refs[i] = to_domain ? seg->dest.foreign.ref :
-seg->source.foreign.ref;
-}
-
-map = xengnttab_map_domain_grant_refs(xendev->xgth, nr_segs,
-  xendev->frontend_id, refs,
-  prot);
-if (!map) {
-error_setg_errno(errp, errno,
- "xengnttab_map_domain_grant_refs failed");
-goto done;
-}
-
-for (i = 0; i < nr_segs; i++) {
-XenDeviceGrantCopySegment *seg = &segs[i];
-void *page = map + (i * XC_PAGE_SIZE);
-
-if (to_domain) {
-memcpy(page + seg->dest.foreign.offset, seg->source.virt,
-   seg->len);
-} else {
-memcpy(seg->dest.virt, page + seg->source.foreign.offset,
-   seg->len);
-}
-}
-
-if (xengnttab_unmap(xendev->xgth, map, nr_segs)) {
-error_setg_errno(errp, errno, "xengnttab_unmap failed");
-}
-
-done:
-g_free(refs);
-}
-
 void xen_device_copy_grant_refs(XenDevice *xendev, bool to_domain,
 XenDeviceGrantCopySegment segs[],
 unsigned int nr_segs, Error **errp)
 {
-xengnttab_grant_copy_segment_t *x

[PATCH v2 03/27] hw/xen: Implement XenStore watches

2023-03-07 Thread David Woodhouse
From: David Woodhouse 

Starts out fairly simple: a hash table of watches based on the path.

Except there can be multiple watches on the same path, so the watch ends
up being a simple linked list, and the head of that list is in the hash
table. Which makes removal a bit of a PITA but it's not so bad; we just
special-case "I had to remove the head of the list and now I have to
replace it in / remove it from the hash table". And if we don't remove
the head, it's a simple linked-list operation.
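
The removal special case then looks roughly like this (names illustrative,
and assuming the hash table owns only its keys):

    /* Sketch: unlink watch 'w' whose predecessor in the list is 'prev'. */
    XsWatch *head = g_hash_table_lookup(s->watches, path);

    if (w == head) {
        if (w->next) {
            /* New head: replace the value stored in the hash table */
            g_hash_table_replace(s->watches, g_strdup(path), w->next);
        } else {
            /* Last watch on this path: drop the entry entirely */
            g_hash_table_remove(s->watches, path);
        }
    } else {
        prev->next = w->next;      /* plain linked-list unlink */
    }
    g_free(w->token);
    g_free(w);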

We do need to fire watches on *deleted* nodes, so instead of just a simple
xs_node_unref() on the topmost victim, we need to recurse down and fire
watches on them all.

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/i386/kvm/xenstore_impl.c | 253 +---
 tests/unit/test-xs-node.c   |  85 
 2 files changed, 323 insertions(+), 15 deletions(-)

diff --git a/hw/i386/kvm/xenstore_impl.c b/hw/i386/kvm/xenstore_impl.c
index 9e10a31bea..9c2348835f 100644
--- a/hw/i386/kvm/xenstore_impl.c
+++ b/hw/i386/kvm/xenstore_impl.c
@@ -37,9 +37,20 @@ typedef struct XsNode {
 #endif
 } XsNode;
 
+typedef struct XsWatch {
+struct XsWatch *next;
+xs_impl_watch_fn *cb;
+void *cb_opaque;
+char *token;
+unsigned int dom_id;
+int rel_prefix;
+} XsWatch;
+
 struct XenstoreImplState {
 XsNode *root;
 unsigned int nr_nodes;
+GHashTable *watches;
+unsigned int nr_domu_watches;
 };
 
 static inline XsNode *xs_node_new(void)
@@ -146,6 +157,7 @@ struct walk_op {
 void *op_opaque;
 void *op_opaque2;
 
+GList *watches;
 unsigned int dom_id;
 
 /* The number of nodes which will exist in the tree if this op succeeds. */
@@ -166,6 +178,35 @@ struct walk_op {
 bool create_dirs;
 };
 
+static void fire_watches(struct walk_op *op, bool parents)
+{
+GList *l = NULL;
+XsWatch *w;
+
+if (!op->mutating) {
+return;
+}
+
+if (parents) {
+l = op->watches;
+}
+
+w = g_hash_table_lookup(op->s->watches, op->path);
+while (w || l) {
+if (!w) {
+/* Fire the parent nodes from 'op' if asked to */
+w = l->data;
+l = l->next;
+continue;
+}
+
+assert(strlen(op->path) > w->rel_prefix);
+w->cb(w->cb_opaque, op->path + w->rel_prefix, w->token);
+
+w = w->next;
+}
+}
+
 static int xs_node_add_content(XsNode **n, struct walk_op *op)
 {
 GByteArray *data = op->op_opaque;
@@ -213,6 +254,8 @@ static int xs_node_get_content(XsNode **n, struct walk_op *op)
 static int node_rm_recurse(gpointer key, gpointer value, gpointer user_data)
 {
 struct walk_op *op = user_data;
+int path_len = strlen(op->path);
+int key_len = strlen(key);
 XsNode *n = value;
 bool this_inplace = op->inplace;
 
@@ -220,11 +263,22 @@ static int node_rm_recurse(gpointer key, gpointer value, gpointer user_data)
 op->inplace = 0;
 }
 
+assert(key_len + path_len + 2 <= sizeof(op->path));
+op->path[path_len] = '/';
+memcpy(op->path + path_len + 1, key, key_len + 1);
+
 if (n->children) {
 g_hash_table_foreach_remove(n->children, node_rm_recurse, op);
 }
 op->new_nr_nodes--;
 
+/*
+ * Fire watches on *this* node but not the parents because they are
+ * going to be deleted too, so the watch will fire for them anyway.
+ */
+fire_watches(op, false);
+op->path[path_len] = '\0';
+
 /*
  * Actually deleting the child here is just an optimisation; if we
  * don't then the final unref on the topmost victim will just have
@@ -238,7 +292,7 @@ static int xs_node_rm(XsNode **n, struct walk_op *op)
 {
 bool this_inplace = op->inplace;
 
-/* Keep count of the nodes in the subtree which gets deleted. */
+/* Fire watches for, and count, nodes in the subtree which get deleted */
 if ((*n)->children) {
 g_hash_table_foreach_remove((*n)->children, node_rm_recurse, op);
 }
@@ -269,9 +323,11 @@ static int xs_node_walk(XsNode **n, struct walk_op *op)
 XsNode *old = *n, *child = NULL;
 bool stole_child = false;
 bool this_inplace;
+XsWatch *watch;
 int err;
 
 namelen = strlen(op->path);
+watch = g_hash_table_lookup(op->s->watches, op->path);
 
 /* Is there a child, or do we hit the double-NUL termination? */
 if (op->path[namelen + 1]) {
@@ -292,6 +348,9 @@ static int xs_node_walk(XsNode **n, struct walk_op *op)
 if (!child_name) {
 /* This is the actual node on which the operation shall be performed */
 err = op->op_fn(n, op);
+if (!err) {
+fire_watches(op, true);
+}
 goto out;
 }
 
@@ -333,11 +392,24 @@ static int xs_node_walk(XsNode **n, struct walk_op *op)
 g

[PATCH v2 16/27] hw/xen: Rename xen_common.h to xen_native.h

2023-03-07 Thread David Woodhouse
From: David Woodhouse 

This header is now only for native Xen code, not PV backends that may be
used in Xen emulation. Since the toolstack libraries may depend on the
specific version of Xen headers that they pull in (and will set the
__XEN_TOOLS__ macro to enable internal definitions that they depend on),
the rule is that xen_native.h (and thus the toolstack library headers)
must be included *before* any of the headers in include/hw/xen/interface.
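
In practice that means a unit which needs both looks like this (an
illustrative ordering, not a specific file from this series):

    #include "qemu/osdep.h"

    /* Toolstack libraries (via xen_native.h) come first... */
    #include "hw/xen/xen_native.h"

    /* ...and only then the ABI headers that QEMU ships itself. */
    #include "hw/xen/interface/io/ring.h"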

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 accel/xen/xen-all.c   |  1 +
 hw/9pfs/xen-9p-backend.c  |  1 +
 hw/block/dataplane/xen-block.c|  3 ++-
 hw/block/xen-block.c  |  1 -
 hw/i386/pc_piix.c |  4 ++--
 hw/i386/xen/xen-hvm.c | 11 +-
 hw/i386/xen/xen-mapcache.c|  2 +-
 hw/i386/xen/xen_platform.c|  7 +++---
 hw/xen/trace-events   |  2 +-
 hw/xen/xen-operations.c   |  2 +-
 hw/xen/xen_pt.c   |  2 +-
 hw/xen/xen_pt.h   |  2 +-
 hw/xen/xen_pt_config_init.c   |  2 +-
 hw/xen/xen_pt_msi.c   |  4 ++--
 include/hw/xen/xen.h  | 22 ---
 include/hw/xen/{xen_common.h => xen_native.h} | 10 ++---
 include/hw/xen/xen_pvdev.h|  3 ++-
 17 files changed, 47 insertions(+), 32 deletions(-)
 rename include/hw/xen/{xen_common.h => xen_native.h} (98%)

diff --git a/accel/xen/xen-all.c b/accel/xen/xen-all.c
index 2d51c41e40..00221e23c5 100644
--- a/accel/xen/xen-all.c
+++ b/accel/xen/xen-all.c
@@ -12,6 +12,7 @@
 #include "qemu/error-report.h"
 #include "qemu/module.h"
 #include "qapi/error.h"
+#include "hw/xen/xen_native.h"
 #include "hw/xen/xen-legacy-backend.h"
 #include "hw/xen/xen_pt.h"
 #include "chardev/char.h"
diff --git a/hw/9pfs/xen-9p-backend.c b/hw/9pfs/xen-9p-backend.c
index d8bb0e847c..74f3a05f88 100644
--- a/hw/9pfs/xen-9p-backend.c
+++ b/hw/9pfs/xen-9p-backend.c
@@ -22,6 +22,7 @@
 #include "qemu/config-file.h"
 #include "qemu/main-loop.h"
 #include "qemu/option.h"
+#include "qemu/iov.h"
 #include "fsdev/qemu-fsdev.h"
 
 #define VERSIONS "1"
diff --git a/hw/block/dataplane/xen-block.c b/hw/block/dataplane/xen-block.c
index 8322a1de82..734da42ea7 100644
--- a/hw/block/dataplane/xen-block.c
+++ b/hw/block/dataplane/xen-block.c
@@ -23,8 +23,9 @@
 #include "qemu/main-loop.h"
 #include "qemu/memalign.h"
 #include "qapi/error.h"
-#include "hw/xen/xen_common.h"
+#include "hw/xen/xen.h"
 #include "hw/block/xen_blkif.h"
+#include "hw/xen/interface/io/ring.h"
 #include "sysemu/block-backend.h"
 #include "sysemu/iothread.h"
 #include "xen-block.h"
diff --git a/hw/block/xen-block.c b/hw/block/xen-block.c
index 345b284d70..87299615e3 100644
--- a/hw/block/xen-block.c
+++ b/hw/block/xen-block.c
@@ -19,7 +19,6 @@
 #include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qstring.h"
 #include "qom/object_interfaces.h"
-#include "hw/xen/xen_common.h"
 #include "hw/block/xen_blkif.h"
 #include "hw/qdev-properties.h"
 #include "hw/xen/xen-block.h"
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 4bf15f9c1f..30eedd62a3 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -47,8 +47,6 @@
 #include "hw/kvm/clock.h"
 #include "hw/sysbus.h"
 #include "hw/i2c/smbus_eeprom.h"
-#include "hw/xen/xen-x86.h"
-#include "hw/xen/xen.h"
 #include "exec/memory.h"
 #include "hw/acpi/acpi.h"
 #include "hw/acpi/piix4.h"
@@ -60,6 +58,8 @@
 #include 
 #include "hw/xen/xen_pt.h"
 #endif
+#include "hw/xen/xen-x86.h"
+#include "hw/xen/xen.h"
 #include "migration/global_state.h"
 #include "migration/misc.h"
 #include "sysemu/numa.h"
diff --git a/hw/i386/xen/xen-hvm.c b/hw/i386/xen/xen-hvm.c
index cb1d24f592..56641a550e 100644
--- a/hw/i386/xen/xen-hvm.c
+++ b/hw/i386/xen/xen-hvm.c
@@ -18,7 +18,7 @@
 #include "hw/irq.h"
 #include "hw/hw.h"
 #include "hw/i386/apic-msidef.h"
-#include "hw/xen/xen_common.h"
+#include "hw/xen/xen_native.h"
 #include "hw/xen/xen-legacy-backend.h"
 #include "hw/xen/xen-bus.h"
 #include "hw/xen/xen-x86.h"
@@ -52,10 +52,11 @@ static bool xen_in_migration;
 
 /* Compatibility with older version */
 
-/* This allows QEMU to build on a system that has Xen 4.5 or earlier
- * installed.  This here (not in hw/xen/xen_common.h) because xen/hvm/ioreq.h
- * needs to be 

[PATCH v2 15/27] hw/xen: Use XEN_PAGE_SIZE in PV backend drivers

2023-03-07 Thread David Woodhouse
From: David Woodhouse 

XC_PAGE_SIZE comes from the actual Xen libraries, while XEN_PAGE_SIZE is
provided by QEMU itself in xen_backend_ops.h. For backends which may be
built for emulation mode, use the latter.
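
For reference, the QEMU-provided constants (removed from local copies
elsewhere in this series and now centralised in xen_backend_ops.h) are
simply:

    #define XEN_PAGE_SHIFT 12
    #define XEN_PAGE_SIZE (1ULL << XEN_PAGE_SHIFT)

so the substitution changes where the definition comes from, not its value.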

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/block/dataplane/xen-block.c |  8 
 hw/display/xenfb.c | 12 ++--
 hw/net/xen_nic.c   | 12 ++--
 hw/usb/xen-usb.c   |  8 
 4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/hw/block/dataplane/xen-block.c b/hw/block/dataplane/xen-block.c
index e55b713002..8322a1de82 100644
--- a/hw/block/dataplane/xen-block.c
+++ b/hw/block/dataplane/xen-block.c
@@ -101,9 +101,9 @@ static XenBlockRequest *xen_block_start_request(XenBlockDataPlane *dataplane)
  * re-use requests, allocate the memory once here. It will be freed
  * xen_block_dataplane_destroy() when the request list is freed.
  */
-request->buf = qemu_memalign(XC_PAGE_SIZE,
+request->buf = qemu_memalign(XEN_PAGE_SIZE,
  BLKIF_MAX_SEGMENTS_PER_REQUEST *
- XC_PAGE_SIZE);
+ XEN_PAGE_SIZE);
 dataplane->requests_total++;
 qemu_iovec_init(&request->v, 1);
 } else {
@@ -185,7 +185,7 @@ static int xen_block_parse_request(XenBlockRequest *request)
 goto err;
 }
 if (request->req.seg[i].last_sect * dataplane->sector_size >=
-XC_PAGE_SIZE) {
+XEN_PAGE_SIZE) {
 error_report("error: page crossing");
 goto err;
 }
@@ -740,7 +740,7 @@ void xen_block_dataplane_start(XenBlockDataPlane *dataplane,
 
 dataplane->protocol = protocol;
 
-ring_size = XC_PAGE_SIZE * dataplane->nr_ring_ref;
+ring_size = XEN_PAGE_SIZE * dataplane->nr_ring_ref;
 switch (dataplane->protocol) {
 case BLKIF_PROTOCOL_NATIVE:
 {
diff --git a/hw/display/xenfb.c b/hw/display/xenfb.c
index 2c4016fcbd..0074a9b6f8 100644
--- a/hw/display/xenfb.c
+++ b/hw/display/xenfb.c
@@ -489,13 +489,13 @@ static int xenfb_map_fb(struct XenFB *xenfb)
 }
 
 if (xenfb->pixels) {
-munmap(xenfb->pixels, xenfb->fbpages * XC_PAGE_SIZE);
+munmap(xenfb->pixels, xenfb->fbpages * XEN_PAGE_SIZE);
 xenfb->pixels = NULL;
 }
 
-xenfb->fbpages = DIV_ROUND_UP(xenfb->fb_len, XC_PAGE_SIZE);
+xenfb->fbpages = DIV_ROUND_UP(xenfb->fb_len, XEN_PAGE_SIZE);
 n_fbdirs = xenfb->fbpages * mode / 8;
-n_fbdirs = DIV_ROUND_UP(n_fbdirs, XC_PAGE_SIZE);
+n_fbdirs = DIV_ROUND_UP(n_fbdirs, XEN_PAGE_SIZE);
 
 pgmfns = g_new0(xen_pfn_t, n_fbdirs);
 fbmfns = g_new0(xen_pfn_t, xenfb->fbpages);
@@ -528,8 +528,8 @@ static int xenfb_configure_fb(struct XenFB *xenfb, size_t fb_len_lim,
 {
 size_t mfn_sz = sizeof_field(struct xenfb_page, pd[0]);
 size_t pd_len = sizeof_field(struct xenfb_page, pd) / mfn_sz;
-size_t fb_pages = pd_len * XC_PAGE_SIZE / mfn_sz;
-size_t fb_len_max = fb_pages * XC_PAGE_SIZE;
+size_t fb_pages = pd_len * XEN_PAGE_SIZE / mfn_sz;
+size_t fb_len_max = fb_pages * XEN_PAGE_SIZE;
 int max_width, max_height;
 
 if (fb_len_lim > fb_len_max) {
@@ -930,7 +930,7 @@ static void fb_disconnect(struct XenLegacyDevice *xendev)
  *   instead.  This releases the guest pages and keeps qemu happy.
  */
 qemu_xen_foreignmem_unmap(fb->pixels, fb->fbpages);
-fb->pixels = mmap(fb->pixels, fb->fbpages * XC_PAGE_SIZE,
+fb->pixels = mmap(fb->pixels, fb->fbpages * XEN_PAGE_SIZE,
   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON,
   -1, 0);
 if (fb->pixels == MAP_FAILED) {
diff --git a/hw/net/xen_nic.c b/hw/net/xen_nic.c
index 166d03787d..9bbf6599fc 100644
--- a/hw/net/xen_nic.c
+++ b/hw/net/xen_nic.c
@@ -145,7 +145,7 @@ static void net_tx_packets(struct XenNetDev *netdev)
 continue;
 }
 
-if ((txreq.offset + txreq.size) > XC_PAGE_SIZE) {
+if ((txreq.offset + txreq.size) > XEN_PAGE_SIZE) {
 xen_pv_printf(&netdev->xendev, 0, "error: page crossing\n");
 net_tx_error(netdev, &txreq, rc);
 continue;
@@ -171,7 +171,7 @@ static void net_tx_packets(struct XenNetDev *netdev)
 if (txreq.flags & NETTXF_csum_blank) {
 /* have read-only mapping -> can't fill checksum in-place */
 if (!tmpbuf) {
-tmpbuf = g_malloc(XC_PAGE_SIZE);
+tmpbuf = g_malloc(XEN_PAGE_SIZE);
 }
 memcpy(tmpbuf, page + txreq.offset, txreq.size);
 net_checksum_calculate(tmpbuf, txreq.size, CSUM_ALL);
@@ -243,9 +243,9 @@ static ssize_t net_rx_packet(NetClie

[PATCH v2 08/27] hw/xen: Create initial XenStore nodes

2023-03-07 Thread David Woodhouse
From: Paul Durrant 

Signed-off-by: Paul Durrant 
Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/i386/kvm/xen_xenstore.c | 70 ++
 1 file changed, 70 insertions(+)

diff --git a/hw/i386/kvm/xen_xenstore.c b/hw/i386/kvm/xen_xenstore.c
index 520422b147..fb3648a058 100644
--- a/hw/i386/kvm/xen_xenstore.c
+++ b/hw/i386/kvm/xen_xenstore.c
@@ -76,9 +76,39 @@ struct XenXenstoreState *xen_xenstore_singleton;
 static void xen_xenstore_event(void *opaque);
 static void fire_watch_cb(void *opaque, const char *path, const char *token);
 
+static void G_GNUC_PRINTF (4, 5) relpath_printf(XenXenstoreState *s,
+GList *perms,
+const char *relpath,
+const char *fmt, ...)
+{
+gchar *abspath;
+gchar *value;
+va_list args;
+GByteArray *data;
+int err;
+
+abspath = g_strdup_printf("/local/domain/%u/%s", xen_domid, relpath);
+va_start(args, fmt);
+value = g_strdup_vprintf(fmt, args);
+va_end(args);
+
+data = g_byte_array_new_take((void *)value, strlen(value));
+
+err = xs_impl_write(s->impl, DOMID_QEMU, XBT_NULL, abspath, data);
+assert(!err);
+
+g_byte_array_unref(data);
+
+err = xs_impl_set_perms(s->impl, DOMID_QEMU, XBT_NULL, abspath, perms);
+assert(!err);
+
+g_free(abspath);
+}
+
 static void xen_xenstore_realize(DeviceState *dev, Error **errp)
 {
 XenXenstoreState *s = XEN_XENSTORE(dev);
+GList *perms;
 
 if (xen_mode != XEN_EMULATE) {
 error_setg(errp, "Xen xenstore support is for Xen emulation");
@@ -102,6 +132,46 @@ static void xen_xenstore_realize(DeviceState *dev, Error **errp)
xen_xenstore_event, NULL, NULL, NULL, s);
 
 s->impl = xs_impl_create(xen_domid);
+
+/* Populate the default nodes */
+
+/* Nodes owned by 'dom0' but readable by the guest */
+perms = g_list_append(NULL, xs_perm_as_string(XS_PERM_NONE, DOMID_QEMU));
+perms = g_list_append(perms, xs_perm_as_string(XS_PERM_READ, xen_domid));
+
+relpath_printf(s, perms, "", "%s", "");
+
+relpath_printf(s, perms, "domid", "%u", xen_domid);
+
+relpath_printf(s, perms, "control/platform-feature-xs_reset_watches", 
"%u", 1);
+relpath_printf(s, perms, 
"control/platform-feature-multiprocessor-suspend", "%u", 1);
+
+relpath_printf(s, perms, "platform/acpi", "%u", 1);
+relpath_printf(s, perms, "platform/acpi_s3", "%u", 1);
+relpath_printf(s, perms, "platform/acpi_s4", "%u", 1);
+relpath_printf(s, perms, "platform/acpi_laptop_slate", "%u", 0);
+
+g_list_free_full(perms, g_free);
+
+/* Nodes owned by the guest */
+perms = g_list_append(NULL, xs_perm_as_string(XS_PERM_NONE, xen_domid));
+
+relpath_printf(s, perms, "attr", "%s", "");
+
+relpath_printf(s, perms, "control/shutdown", "%s", "");
+relpath_printf(s, perms, "control/feature-poweroff", "%u", 1);
+relpath_printf(s, perms, "control/feature-reboot", "%u", 1);
+relpath_printf(s, perms, "control/feature-suspend", "%u", 1);
+relpath_printf(s, perms, "control/feature-s3", "%u", 1);
+relpath_printf(s, perms, "control/feature-s4", "%u", 1);
+
+relpath_printf(s, perms, "data", "%s", "");
+relpath_printf(s, perms, "device", "%s", "");
+relpath_printf(s, perms, "drivers", "%s", "");
+relpath_printf(s, perms, "error", "%s", "");
+relpath_printf(s, perms, "feature", "%s", "");
+
+g_list_free_full(perms, g_free);
 }
 
 static bool xen_xenstore_is_needed(void *opaque)
-- 
2.39.0




[PATCH v2 21/27] hw/xen: Add emulated implementation of grant table operations

2023-03-07 Thread David Woodhouse
From: David Woodhouse 

This is limited to mapping a single grant at a time, because under Xen the
pages are mapped *contiguously* into qemu's address space, and that's very
hard to do when those pages actually come from anonymous mappings in qemu
in the first place.

Eventually perhaps we can look at using shared mappings of actual objects
for system RAM, and then we can make new mappings of the same backing
store (be it deleted files, shmem, whatever). But for now let's stick to
a page at a time.
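
From the backend operations' point of view the restriction is simply
enforced at map time, along these lines (sketch; map_one_grant() is a
hypothetical helper standing in for the real lookup below):

    static void *emu_gnttab_map_refs_sketch(struct xengntdev_handle *xgt,
                                            uint32_t count, uint32_t domid,
                                            uint32_t *refs, int prot)
    {
        if (count != 1) {
            /* Multi-page maps would have to be virtually contiguous */
            errno = EINVAL;
            return NULL;
        }
        return map_one_grant(xgt, domid, refs[0], prot);
    }

Callers that genuinely want multi-page rings check
qemu_xen_gnttab_can_map_multi() first and fall back to a single page.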

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/i386/kvm/xen_gnttab.c | 299 ++-
 1 file changed, 296 insertions(+), 3 deletions(-)

diff --git a/hw/i386/kvm/xen_gnttab.c b/hw/i386/kvm/xen_gnttab.c
index 1e691ded32..2bf91d36c0 100644
--- a/hw/i386/kvm/xen_gnttab.c
+++ b/hw/i386/kvm/xen_gnttab.c
@@ -22,6 +22,7 @@
 
 #include "hw/sysbus.h"
 #include "hw/xen/xen.h"
+#include "hw/xen/xen_backend_ops.h"
 #include "xen_overlay.h"
 #include "xen_gnttab.h"
 
@@ -34,11 +35,10 @@
 #define TYPE_XEN_GNTTAB "xen-gnttab"
 OBJECT_DECLARE_SIMPLE_TYPE(XenGnttabState, XEN_GNTTAB)
 
-#define XEN_PAGE_SHIFT 12
-#define XEN_PAGE_SIZE (1ULL << XEN_PAGE_SHIFT)
-
 #define ENTRIES_PER_FRAME_V1 (XEN_PAGE_SIZE / sizeof(grant_entry_v1_t))
 
+static struct gnttab_backend_ops emu_gnttab_backend_ops;
+
 struct XenGnttabState {
 /*< private >*/
 SysBusDevice busdev;
@@ -57,6 +57,8 @@ struct XenGnttabState {
 MemoryRegion gnt_frames;
 MemoryRegion *gnt_aliases;
 uint64_t *gnt_frame_gpas;
+
+uint8_t *map_track;
 };
 
 struct XenGnttabState *xen_gnttab_singleton;
@@ -88,9 +90,15 @@ static void xen_gnttab_realize(DeviceState *dev, Error **errp)
 s->gnt_frame_gpas[i] = INVALID_GPA;
 }
 
+s->entries.v1[GNTTAB_RESERVED_XENSTORE].flags = GTF_permit_access;
+s->entries.v1[GNTTAB_RESERVED_XENSTORE].frame = XEN_SPECIAL_PFN(XENSTORE);
 qemu_mutex_init(&s->gnt_lock);
 
 xen_gnttab_singleton = s;
+
+s->map_track = g_new0(uint8_t, s->max_frames * ENTRIES_PER_FRAME_V1);
+
+xen_gnttab_ops = &emu_gnttab_backend_ops;
 }
 
 static int xen_gnttab_post_load(void *opaque, int version_id)
@@ -230,3 +238,288 @@ int xen_gnttab_query_size_op(struct gnttab_query_size *size)
 size->max_nr_frames = s->max_frames;
 return 0;
 }
+
+/* Track per-open refs, to allow close() to clean up. */
+struct active_ref {
+MemoryRegionSection mrs;
+void *virtaddr;
+uint32_t refcnt;
+int prot;
+};
+
+static void gnt_unref(XenGnttabState *s, grant_ref_t ref,
+  MemoryRegionSection *mrs, int prot)
+{
+if (mrs && mrs->mr) {
+if (prot & PROT_WRITE) {
+memory_region_set_dirty(mrs->mr, mrs->offset_within_region,
+XEN_PAGE_SIZE);
+}
+memory_region_unref(mrs->mr);
+mrs->mr = NULL;
+}
+assert(s->map_track[ref] != 0);
+
+if (--s->map_track[ref] == 0) {
+grant_entry_v1_t *gnt_p = &s->entries.v1[ref];
+qatomic_and(&gnt_p->flags, (uint16_t)~(GTF_reading | GTF_writing));
+}
+}
+
+static uint64_t gnt_ref(XenGnttabState *s, grant_ref_t ref, int prot)
+{
+uint16_t mask = GTF_type_mask | GTF_sub_page;
+grant_entry_v1_t gnt, *gnt_p;
+int retries = 0;
+
+if (ref >= s->max_frames * ENTRIES_PER_FRAME_V1 ||
+s->map_track[ref] == UINT8_MAX) {
+return INVALID_GPA;
+}
+
+if (prot & PROT_WRITE) {
+mask |= GTF_readonly;
+}
+
+gnt_p = &s->entries.v1[ref];
+
+/*
+ * The guest can legitimately be changing the GTF_readonly flag. Allow
+ * that, but don't let a malicious guest cause a livelock.
+ */
+for (retries = 0; retries < 5; retries++) {
+uint16_t new_flags;
+
+/* Read the entry before an atomic operation on its flags */
+gnt = *(volatile grant_entry_v1_t *)gnt_p;
+
+if ((gnt.flags & mask) != GTF_permit_access ||
+gnt.domid != DOMID_QEMU) {
+return INVALID_GPA;
+}
+
+new_flags = gnt.flags | GTF_reading;
+if (prot & PROT_WRITE) {
+new_flags |= GTF_writing;
+}
+
+if (qatomic_cmpxchg(&gnt_p->flags, gnt.flags, new_flags) == gnt.flags) {
+return (uint64_t)gnt.frame << XEN_PAGE_SHIFT;
+}
+}
+
+return INVALID_GPA;
+}
+
+struct xengntdev_handle {
+GHashTable *active_maps;
+};
+
+static int xen_be_gnttab_set_max_grants(struct xengntdev_handle *xgt,
+uint32_t nr_grants)
+{
+return 0;
+}
+
+static void *xen_be_gnttab_map_refs(struct xengntdev_handle *xgt,
+uint32_t count, uint32_t domid,
+uint32_t *refs, in

[PATCH v2 17/27] hw/xen: Build PV backend drivers for CONFIG_XEN_BUS

2023-03-07 Thread David Woodhouse
From: David Woodhouse 

Now that we have the redirectable Xen backend operations we can build the
PV backends even without the Xen libraries.

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/9pfs/meson.build| 2 +-
 hw/block/dataplane/meson.build | 2 +-
 hw/block/meson.build   | 2 +-
 hw/char/meson.build| 2 +-
 hw/display/meson.build | 2 +-
 hw/usb/meson.build | 2 +-
 hw/xen/meson.build | 5 -
 7 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/hw/9pfs/meson.build b/hw/9pfs/meson.build
index 12443b6ad5..fd37b7a02d 100644
--- a/hw/9pfs/meson.build
+++ b/hw/9pfs/meson.build
@@ -15,7 +15,7 @@ fs_ss.add(files(
 ))
 fs_ss.add(when: 'CONFIG_LINUX', if_true: files('9p-util-linux.c'))
 fs_ss.add(when: 'CONFIG_DARWIN', if_true: files('9p-util-darwin.c'))
-fs_ss.add(when: 'CONFIG_XEN', if_true: files('xen-9p-backend.c'))
+fs_ss.add(when: 'CONFIG_XEN_BUS', if_true: files('xen-9p-backend.c'))
 softmmu_ss.add_all(when: 'CONFIG_FSDEV_9P', if_true: fs_ss)
 
 specific_ss.add(when: 'CONFIG_VIRTIO_9P', if_true: files('virtio-9p-device.c'))
diff --git a/hw/block/dataplane/meson.build b/hw/block/dataplane/meson.build
index 12c6a264f1..78d7ac1a11 100644
--- a/hw/block/dataplane/meson.build
+++ b/hw/block/dataplane/meson.build
@@ -1,2 +1,2 @@
 specific_ss.add(when: 'CONFIG_VIRTIO_BLK', if_true: files('virtio-blk.c'))
-specific_ss.add(when: 'CONFIG_XEN', if_true: files('xen-block.c'))
+specific_ss.add(when: 'CONFIG_XEN_BUS', if_true: files('xen-block.c'))
diff --git a/hw/block/meson.build b/hw/block/meson.build
index b434d5654c..cc2a75cc50 100644
--- a/hw/block/meson.build
+++ b/hw/block/meson.build
@@ -14,7 +14,7 @@ softmmu_ss.add(when: 'CONFIG_PFLASH_CFI02', if_true: files('pflash_cfi02.c'))
 softmmu_ss.add(when: 'CONFIG_SSI_M25P80', if_true: files('m25p80.c'))
 softmmu_ss.add(when: 'CONFIG_SSI_M25P80', if_true: files('m25p80_sfdp.c'))
 softmmu_ss.add(when: 'CONFIG_SWIM', if_true: files('swim.c'))
-softmmu_ss.add(when: 'CONFIG_XEN', if_true: files('xen-block.c'))
+softmmu_ss.add(when: 'CONFIG_XEN_BUS', if_true: files('xen-block.c'))
 softmmu_ss.add(when: 'CONFIG_TC58128', if_true: files('tc58128.c'))
 
 specific_ss.add(when: 'CONFIG_VIRTIO_BLK', if_true: files('virtio-blk.c', 'virtio-blk-common.c'))
diff --git a/hw/char/meson.build b/hw/char/meson.build
index 7b594f51b8..e02c60dd54 100644
--- a/hw/char/meson.build
+++ b/hw/char/meson.build
@@ -18,7 +18,7 @@ softmmu_ss.add(when: 'CONFIG_SERIAL_PCI', if_true: files('serial-pci.c'))
 softmmu_ss.add(when: 'CONFIG_SERIAL_PCI_MULTI', if_true: files('serial-pci-multi.c'))
 softmmu_ss.add(when: 'CONFIG_SHAKTI_UART', if_true: files('shakti_uart.c'))
 softmmu_ss.add(when: 'CONFIG_VIRTIO_SERIAL', if_true: files('virtio-console.c'))
-softmmu_ss.add(when: 'CONFIG_XEN', if_true: files('xen_console.c'))
+softmmu_ss.add(when: 'CONFIG_XEN_BUS', if_true: files('xen_console.c'))
 softmmu_ss.add(when: 'CONFIG_XILINX', if_true: files('xilinx_uartlite.c'))
 
 softmmu_ss.add(when: 'CONFIG_AVR_USART', if_true: files('avr_usart.c'))
diff --git a/hw/display/meson.build b/hw/display/meson.build
index f470179122..4191694380 100644
--- a/hw/display/meson.build
+++ b/hw/display/meson.build
@@ -14,7 +14,7 @@ softmmu_ss.add(when: 'CONFIG_PL110', if_true: files('pl110.c'))
 softmmu_ss.add(when: 'CONFIG_SII9022', if_true: files('sii9022.c'))
 softmmu_ss.add(when: 'CONFIG_SSD0303', if_true: files('ssd0303.c'))
 softmmu_ss.add(when: 'CONFIG_SSD0323', if_true: files('ssd0323.c'))
-softmmu_ss.add(when: 'CONFIG_XEN', if_true: files('xenfb.c'))
+softmmu_ss.add(when: 'CONFIG_XEN_BUS', if_true: files('xenfb.c'))
 
 softmmu_ss.add(when: 'CONFIG_VGA_PCI', if_true: files('vga-pci.c'))
 softmmu_ss.add(when: 'CONFIG_VGA_ISA', if_true: files('vga-isa.c'))
diff --git a/hw/usb/meson.build b/hw/usb/meson.build
index bdf34cbd3e..599dc24f0d 100644
--- a/hw/usb/meson.build
+++ b/hw/usb/meson.build
@@ -84,6 +84,6 @@ if libusb.found()
   hw_usb_modules += {'host': usbhost_ss}
 endif
 
-softmmu_ss.add(when: ['CONFIG_USB', 'CONFIG_XEN', libusb], if_true: files('xen-usb.c'))
+softmmu_ss.add(when: ['CONFIG_USB', 'CONFIG_XEN_BUS', libusb], if_true: files('xen-usb.c'))
 
 modules += { 'hw-usb': hw_usb_modules }
diff --git a/hw/xen/meson.build b/hw/xen/meson.build
index f195bbd25c..19c6aabc7c 100644
--- a/hw/xen/meson.build
+++ b/hw/xen/meson.build
@@ -1,10 +1,13 @@
-softmmu_ss.add(when: ['CONFIG_XEN', xen], if_true: files(
+softmmu_ss.add(when: ['CONFIG_XEN_BUS'], if_true: files(
   'xen-backend.c',
   'xen-bus-helper.c',
   'xen-bus.c',
   'xen-legacy-backend.c',
   'xen_devconfig.c',
   'xen_pvdev.c',
+))
+
+softmmu_ss.add(when: ['CONFIG_XEN', xen], if_true: files(
   'xen-operations.c',
 ))
 
-- 
2.39.0




[PATCH v2 25/27] i386/xen: Initialize Xen backends from pc_basic_device_init() for emulation

2023-03-07 Thread David Woodhouse
From: David Woodhouse 

Now that all the work is done to enable the PV backends to work without
actual Xen, instantiate the bus from pc_basic_device_init() for emulated
mode.

This allows us finally to launch an emulated Xen guest with PV disk.

   qemu-system-x86_64 -serial mon:stdio -M q35 -cpu host -display none \
 -m 1G -smp 2 -accel kvm,xen-version=0x4000a,kernel-irqchip=split \
 -kernel bzImage -append "console=ttyS0 root=/dev/xvda1" \
 -drive file=/var/lib/libvirt/images/fedora28.qcow2,if=none,id=disk \
 -device xen-disk,drive=disk,vdev=xvda

If we use -M pc instead of q35, we can even add an IDE disk and boot a
guest image normally through grub. But q35 gives us AHCI and that isn't
unplugged by the Xen magic, so the guest ends up seeing "both" disks.

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/i386/pc.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 7bebea57e3..1489abf010 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -102,6 +102,11 @@
 #include "trace.h"
 #include CONFIG_DEVICES
 
+#ifdef CONFIG_XEN_EMU
+#include "hw/xen/xen-legacy-backend.h"
+#include "hw/xen/xen-bus.h"
+#endif
+
 /*
  * Helper for setting model-id for CPU models that changed model-id
  * depending on QEMU versions up to QEMU 2.4.
@@ -1318,6 +1323,8 @@ void pc_basic_device_init(struct PCMachineState *pcms,
 if (pcms->bus) {
 pci_create_simple(pcms->bus, -1, "xen-platform");
 }
+xen_bus_init();
+xen_be_init();
 }
 #endif
 
-- 
2.39.0




[PATCH v2 20/27] hw/xen: Hook up emulated implementation for event channel operations

2023-03-07 Thread David Woodhouse
From: David Woodhouse 

We provided the backend-facing evtchn functions very early on as part of
the core Xen platform support, since things like timers and xenstore need
to use them.

By what may or may not be an astonishing coincidence, those functions
just *happen* all to have exactly the right function prototypes to slot
into the evtchn_backend_ops table and be called by the PV backends.

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/i386/kvm/xen_evtchn.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index 886fbf6b3b..98a7b85047 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -34,6 +34,7 @@
 #include "hw/pci/msi.h"
 #include "hw/pci/msix.h"
 #include "hw/irq.h"
+#include "hw/xen/xen_backend_ops.h"
 
 #include "xen_evtchn.h"
 #include "xen_overlay.h"
@@ -278,6 +279,17 @@ static const TypeInfo xen_evtchn_info = {
 .class_init= xen_evtchn_class_init,
 };
 
+static struct evtchn_backend_ops emu_evtchn_backend_ops = {
+.open = xen_be_evtchn_open,
+.bind_interdomain = xen_be_evtchn_bind_interdomain,
+.unbind = xen_be_evtchn_unbind,
+.close = xen_be_evtchn_close,
+.get_fd = xen_be_evtchn_fd,
+.notify = xen_be_evtchn_notify,
+.unmask = xen_be_evtchn_unmask,
+.pending = xen_be_evtchn_pending,
+};
+
 static void gsi_assert_bh(void *opaque)
 {
 struct vcpu_info *vi = kvm_xen_get_vcpu_info_hva(0);
@@ -318,6 +330,9 @@ void xen_evtchn_create(void)
 s->nr_pirq_inuse_words = DIV_ROUND_UP(s->nr_pirqs, 64);
 s->pirq_inuse_bitmap = g_new0(uint64_t, s->nr_pirq_inuse_words);
 s->pirq = g_new0(struct pirq_info, s->nr_pirqs);
+
+/* Set event channel functions for backend drivers to use */
+xen_evtchn_ops = &emu_evtchn_backend_ops;
 }
 
 void xen_evtchn_connect_gsis(qemu_irq *system_gsis)
-- 
2.39.0




[PATCH v2 12/27] hw/xen: Add foreignmem operations to allow redirection to internal emulation

2023-03-07 Thread David Woodhouse
From: David Woodhouse 

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/char/xen_console.c|  8 +++---
 hw/display/xenfb.c   | 20 +++---
 hw/xen/xen-operations.c  | 45 
 include/hw/xen/xen_backend_ops.h | 26 ++
 include/hw/xen/xen_common.h  | 13 -
 softmmu/globals.c|  1 +
 tests/unit/test-xs-node.c|  1 +
 7 files changed, 88 insertions(+), 26 deletions(-)

diff --git a/hw/char/xen_console.c b/hw/char/xen_console.c
index 19ad6c946a..e9cef3e1ef 100644
--- a/hw/char/xen_console.c
+++ b/hw/char/xen_console.c
@@ -237,9 +237,9 @@ static int con_initialise(struct XenLegacyDevice *xendev)
 
 if (!xendev->dev) {
 xen_pfn_t mfn = con->ring_ref;
-con->sring = xenforeignmemory_map(xen_fmem, con->xendev.dom,
-  PROT_READ | PROT_WRITE,
-  1, &mfn, NULL);
+con->sring = qemu_xen_foreignmem_map(con->xendev.dom, NULL,
+ PROT_READ | PROT_WRITE,
+ 1, &mfn, NULL);
 } else {
 con->sring = xen_be_map_grant_ref(xendev, con->ring_ref,
   PROT_READ | PROT_WRITE);
@@ -269,7 +269,7 @@ static void con_disconnect(struct XenLegacyDevice *xendev)
 
 if (con->sring) {
 if (!xendev->dev) {
-xenforeignmemory_unmap(xen_fmem, con->sring, 1);
+qemu_xen_foreignmem_unmap(con->sring, 1);
 } else {
 xen_be_unmap_grant_ref(xendev, con->sring, con->ring_ref);
 }
diff --git a/hw/display/xenfb.c b/hw/display/xenfb.c
index 260eb38a76..2c4016fcbd 100644
--- a/hw/display/xenfb.c
+++ b/hw/display/xenfb.c
@@ -98,8 +98,9 @@ static int common_bind(struct common *c)
 if (xenstore_read_fe_int(&c->xendev, "event-channel", &c->xendev.remote_port) == -1)
 return -1;
 
-c->page = xenforeignmemory_map(xen_fmem, c->xendev.dom,
-   PROT_READ | PROT_WRITE, 1, &mfn, NULL);
+c->page = qemu_xen_foreignmem_map(c->xendev.dom, NULL,
+  PROT_READ | PROT_WRITE, 1, &mfn,
+  NULL);
 if (c->page == NULL)
 return -1;
 
@@ -115,7 +116,7 @@ static void common_unbind(struct common *c)
 {
 xen_pv_unbind_evtchn(&c->xendev);
 if (c->page) {
-xenforeignmemory_unmap(xen_fmem, c->page, 1);
+qemu_xen_foreignmem_unmap(c->page, 1);
 c->page = NULL;
 }
 }
@@ -500,15 +501,16 @@ static int xenfb_map_fb(struct XenFB *xenfb)
 fbmfns = g_new0(xen_pfn_t, xenfb->fbpages);
 
 xenfb_copy_mfns(mode, n_fbdirs, pgmfns, pd);
-map = xenforeignmemory_map(xen_fmem, xenfb->c.xendev.dom,
-   PROT_READ, n_fbdirs, pgmfns, NULL);
+map = qemu_xen_foreignmem_map(xenfb->c.xendev.dom, NULL, PROT_READ,
+  n_fbdirs, pgmfns, NULL);
 if (map == NULL)
 goto out;
 xenfb_copy_mfns(mode, xenfb->fbpages, fbmfns, map);
-xenforeignmemory_unmap(xen_fmem, map, n_fbdirs);
+qemu_xen_foreignmem_unmap(map, n_fbdirs);
 
-xenfb->pixels = xenforeignmemory_map(xen_fmem, xenfb->c.xendev.dom,
-PROT_READ, xenfb->fbpages, fbmfns, NULL);
+xenfb->pixels = qemu_xen_foreignmem_map(xenfb->c.xendev.dom, NULL,
+PROT_READ, xenfb->fbpages,
+fbmfns, NULL);
 if (xenfb->pixels == NULL)
 goto out;
 
@@ -927,7 +929,7 @@ static void fb_disconnect(struct XenLegacyDevice *xendev)
  *   Replacing the framebuffer with anonymous shared memory
  *   instead.  This releases the guest pages and keeps qemu happy.
  */
-xenforeignmemory_unmap(xen_fmem, fb->pixels, fb->fbpages);
+qemu_xen_foreignmem_unmap(fb->pixels, fb->fbpages);
 fb->pixels = mmap(fb->pixels, fb->fbpages * XC_PAGE_SIZE,
   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON,
   -1, 0);
diff --git a/hw/xen/xen-operations.c b/hw/xen/xen-operations.c
index c5956d28c6..440e566bb1 100644
--- a/hw/xen/xen-operations.c
+++ b/hw/xen/xen-operations.c
@@ -22,6 +22,7 @@
  */
 #undef XC_WANT_COMPAT_EVTCHN_API
 #undef XC_WANT_COMPAT_GNTTAB_API
+#undef XC_WANT_COMPAT_MAP_FOREIGN_API
 
 #include 
 
@@ -56,10 +57,13 @@ typedef xc_gnttab xengnttab_handle;
 #define xengnttab_map_domain_grant_refs(h, c, d, r, p) \
 xc_gnttab_map_domain_grant_refs(h, c, d, r, p)
 
+typedef xc_interface xenforeignmemory_handle;
+
 #else /* CONFIG_XEN_CTRL_INTERFACE_VERSION >= 40701 */
 
 #include 
 #include 
+#include 
 
 #endif
 
@@ -218,6 +222,46 @@

[PATCH v2 18/27] hw/xen: Avoid crash when backend watch fires too early

2023-03-07 Thread David Woodhouse
From: Paul Durrant 

The xen-block code ends up calling aio_poll() through blkconf_geometry(),
which means we see watch events during the indirect call to
xendev_class->realize() in xen_device_realize(). Unfortunately this call
is made before populating the initial frontend and backend device nodes
in xenstore and hence xen_block_frontend_changed() (which is called from
a watch event) fails to read the frontend's 'state' node, and hence
believes the device is being torn down. This in turn sets the backend
state to XenbusStateClosed and causes the device to be deleted before it
is fully set up, leading to the crash.

By simply moving the call to xendev_class->realize() after the initial
xenstore nodes are populated, this sorry state of affairs is avoided.

Reported-by: David Woodhouse 
Signed-off-by: Paul Durrant 
Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/xen/xen-bus.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/hw/xen/xen-bus.c b/hw/xen/xen-bus.c
index 9fe54967d4..c59850b1de 100644
--- a/hw/xen/xen-bus.c
+++ b/hw/xen/xen-bus.c
@@ -1034,13 +1034,6 @@ static void xen_device_realize(DeviceState *dev, Error 
**errp)
 goto unrealize;
 }
 
-if (xendev_class->realize) {
-xendev_class->realize(xendev, errp);
-if (*errp) {
-goto unrealize;
-}
-}
-
 xen_device_backend_printf(xendev, "frontend", "%s",
   xendev->frontend_path);
 xen_device_backend_printf(xendev, "frontend-id", "%u",
@@ -1059,6 +1052,13 @@ static void xen_device_realize(DeviceState *dev, Error 
**errp)
 xen_device_frontend_set_state(xendev, XenbusStateInitialising, true);
 }
 
+if (xendev_class->realize) {
+xendev_class->realize(xendev, errp);
+if (*errp) {
+goto unrealize;
+}
+}
+
 xendev->exit.notify = xen_device_exit;
 qemu_add_exit_notifier(&xendev->exit);
 return;
-- 
2.39.0




[PATCH v2 14/27] hw/xen: Move xenstore_store_pv_console_info to xen_console.c

2023-03-07 Thread David Woodhouse
From: David Woodhouse 

There's no need for this to be in the Xen accel code, and as we want to
use the Xen console support with KVM-emulated Xen we'll want to have a
platform-agnostic version of it. Make it use GString to build up the
path while we're at it.

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 accel/xen/xen-all.c   | 61 ---
 hw/char/xen_console.c | 45 +--
 include/hw/xen/xen.h  |  2 --
 3 files changed, 43 insertions(+), 65 deletions(-)

diff --git a/accel/xen/xen-all.c b/accel/xen/xen-all.c
index 425216230f..2d51c41e40 100644
--- a/accel/xen/xen-all.c
+++ b/accel/xen/xen-all.c
@@ -29,67 +29,6 @@ xc_interface *xen_xc;
 xenforeignmemory_handle *xen_fmem;
 xendevicemodel_handle *xen_dmod;
 
-static int store_dev_info(int domid, Chardev *cs, const char *string)
-{
-struct xs_handle *xs = NULL;
-char *path = NULL;
-char *newpath = NULL;
-char *pts = NULL;
-int ret = -1;
-
-/* Only continue if we're talking to a pty. */
-if (!CHARDEV_IS_PTY(cs)) {
-return 0;
-}
-pts = cs->filename + 4;
-
-/* We now have everything we need to set the xenstore entry. */
-xs = xs_open(0);
-if (xs == NULL) {
-fprintf(stderr, "Could not contact XenStore\n");
-goto out;
-}
-
-path = xs_get_domain_path(xs, domid);
-if (path == NULL) {
-fprintf(stderr, "xs_get_domain_path() error\n");
-goto out;
-}
-newpath = realloc(path, (strlen(path) + strlen(string) +
-strlen("/tty") + 1));
-if (newpath == NULL) {
-fprintf(stderr, "realloc error\n");
-goto out;
-}
-path = newpath;
-
-strcat(path, string);
-strcat(path, "/tty");
-if (!xs_write(xs, XBT_NULL, path, pts, strlen(pts))) {
-fprintf(stderr, "xs_write for '%s' fail", string);
-goto out;
-}
-ret = 0;
-
-out:
-free(path);
-xs_close(xs);
-
-return ret;
-}
-
-void xenstore_store_pv_console_info(int i, Chardev *chr)
-{
-if (i == 0) {
-store_dev_info(xen_domid, chr, "/console");
-} else {
-char buf[32];
-snprintf(buf, sizeof(buf), "/device/console/%d", i);
-store_dev_info(xen_domid, chr, buf);
-}
-}
-
-
 static void xenstore_record_dm_state(const char *state)
 {
 struct xs_handle *xs;
diff --git a/hw/char/xen_console.c b/hw/char/xen_console.c
index ad8638a86d..c7a19c0e7c 100644
--- a/hw/char/xen_console.c
+++ b/hw/char/xen_console.c
@@ -173,6 +173,48 @@ static void xencons_send(struct XenConsole *con)
 
 /*  */
 
+static int store_con_info(struct XenConsole *con)
+{
+Chardev *cs = qemu_chr_fe_get_driver(&con->chr);
+char *pts = NULL;
+char *dom_path;
+GString *path;
+int ret = -1;
+
+/* Only continue if we're talking to a pty. */
+if (!CHARDEV_IS_PTY(cs)) {
+return 0;
+}
+pts = cs->filename + 4;
+
+dom_path = qemu_xen_xs_get_domain_path(xenstore, xen_domid);
+if (!dom_path) {
+return 0;
+}
+
+path = g_string_new(dom_path);
+free(dom_path);
+
+if (con->xendev.dev) {
+g_string_append_printf(path, "/device/console/%d", con->xendev.dev);
+} else {
+g_string_append(path, "/console");
+}
+g_string_append(path, "/tty");
+
+if (xenstore_write_str(con->console, path->str, pts)) {
+fprintf(stderr, "xenstore_write_str for '%s' fail", path->str);
+goto out;
+}
+ret = 0;
+
+out:
+g_string_free(path, true);
+
+return ret;
+}
+
 static int con_init(struct XenLegacyDevice *xendev)
 {
 struct XenConsole *con = container_of(xendev, struct XenConsole, xendev);
@@ -215,8 +257,7 @@ static int con_init(struct XenLegacyDevice *xendev)
  &error_abort);
 }
 
-xenstore_store_pv_console_info(con->xendev.dev,
-   qemu_chr_fe_get_driver(&con->chr));
+store_con_info(con);
 
 out:
 g_free(type);
diff --git a/include/hw/xen/xen.h b/include/hw/xen/xen.h
index 03983939f9..56b1c2a827 100644
--- a/include/hw/xen/xen.h
+++ b/include/hw/xen/xen.h
@@ -39,8 +39,6 @@ int xen_is_pirq_msi(uint32_t msi_data);
 
 qemu_irq *xen_interrupt_controller_init(void);
 
-void xenstore_store_pv_console_info(int i, Chardev *chr);
-
 void xen_register_framebuffer(struct MemoryRegion *mr);
 
 #endif /* QEMU_HW_XEN_H */
-- 
2.39.0




[PATCH v2 24/27] hw/xen: Implement soft reset for emulated gnttab

2023-03-07 Thread David Woodhouse
From: David Woodhouse 

This is only part of it; we will also need to get the PV back end drivers
to tear down their own mappings (or do it for them, but they kind of need
to stop using the pointers too).

Some more work on the actual PV back ends and xen-bus code is going to be
needed to really make soft reset and migration fully functional, and this
part is the basis for that.

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/i386/kvm/xen_gnttab.c  | 26 --
 hw/i386/kvm/xen_gnttab.h  |  1 +
 target/i386/kvm/xen-emu.c |  5 +
 3 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/hw/i386/kvm/xen_gnttab.c b/hw/i386/kvm/xen_gnttab.c
index 2bf91d36c0..21c30e3659 100644
--- a/hw/i386/kvm/xen_gnttab.c
+++ b/hw/i386/kvm/xen_gnttab.c
@@ -72,13 +72,11 @@ static void xen_gnttab_realize(DeviceState *dev, Error 
**errp)
 error_setg(errp, "Xen grant table support is for Xen emulation");
 return;
 }
-s->nr_frames = 0;
 s->max_frames = kvm_xen_get_gnttab_max_frames();
 memory_region_init_ram(&s->gnt_frames, OBJECT(dev), "xen:grant_table",
XEN_PAGE_SIZE * s->max_frames, &error_abort);
 memory_region_set_enabled(&s->gnt_frames, true);
 s->entries.v1 = memory_region_get_ram_ptr(&s->gnt_frames);
-memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames);
 
 /* Create individual page-sizes aliases for overlays */
 s->gnt_aliases = (void *)g_new0(MemoryRegion, s->max_frames);
@@ -90,8 +88,11 @@ static void xen_gnttab_realize(DeviceState *dev, Error 
**errp)
 s->gnt_frame_gpas[i] = INVALID_GPA;
 }
 
+s->nr_frames = 0;
+memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames);
 s->entries.v1[GNTTAB_RESERVED_XENSTORE].flags = GTF_permit_access;
 s->entries.v1[GNTTAB_RESERVED_XENSTORE].frame = XEN_SPECIAL_PFN(XENSTORE);
+
 qemu_mutex_init(&s->gnt_lock);
 
 xen_gnttab_singleton = s;
@@ -523,3 +524,24 @@ static struct gnttab_backend_ops emu_gnttab_backend_ops = {
 .unmap = xen_be_gnttab_unmap,
 };
 
+int xen_gnttab_reset(void)
+{
+XenGnttabState *s = xen_gnttab_singleton;
+
+if (!s) {
+return -ENOTSUP;
+}
+
+QEMU_LOCK_GUARD(&s->gnt_lock);
+
+s->nr_frames = 0;
+
+memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames);
+
+s->entries.v1[GNTTAB_RESERVED_XENSTORE].flags = GTF_permit_access;
+s->entries.v1[GNTTAB_RESERVED_XENSTORE].frame = XEN_SPECIAL_PFN(XENSTORE);
+
+memset(s->map_track, 0, s->max_frames * ENTRIES_PER_FRAME_V1);
+
+return 0;
+}
diff --git a/hw/i386/kvm/xen_gnttab.h b/hw/i386/kvm/xen_gnttab.h
index 3bdbe96191..ee215239b0 100644
--- a/hw/i386/kvm/xen_gnttab.h
+++ b/hw/i386/kvm/xen_gnttab.h
@@ -13,6 +13,7 @@
 #define QEMU_XEN_GNTTAB_H
 
 void xen_gnttab_create(void);
+int xen_gnttab_reset(void);
 int xen_gnttab_map_page(uint64_t idx, uint64_t gfn);
 
 struct gnttab_set_version;
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index bad3131d08..0bb6c601c9 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -1406,6 +1406,11 @@ int kvm_xen_soft_reset(void)
 return err;
 }
 
+err = xen_gnttab_reset();
+if (err) {
+return err;
+}
+
 err = xen_xenstore_reset();
 if (err) {
 return err;
-- 
2.39.0




Re: [RFC PATCH v1 07/25] hw/xen: Implement core serialize/deserialize methods for xenstore_impl

2023-03-07 Thread David Woodhouse
On Tue, 2023-03-07 at 16:39 +, Paul Durrant wrote:
> On 07/03/2023 16:33, David Woodhouse wrote:
> > On Thu, 2023-03-02 at 15:34 +0000, David Woodhouse wrote:
> > > From: David Woodhouse 
> > > 
> > > In fact I think we want to only serialize the contents of the domain's
> > > path in /local/domain/${domid} and leave the rest to be recreated? Will
> > > defer to Paul for that.
> > > 
> > > Signed-off-by: David Woodhouse 
> > 
> > Paul, your Reviewed-by: on this one is conspicuous in its absence. I
> > mentioned migration in the cover letter — this much is working fine,
> > but it's the PV back ends that don't yet work.
> > 
> > I'd quite like to merge the basic serialization/deserialization of
> > XenStore itself, with the unit tests.
> 
> The patch is basically ok, I just think the serialization should be 
> limited to the guest nodes... filtering out those not owned by xen_domid 
> would probably work for that.

Yeah, so let's just do this (as part of this patch #7) for now:

--- a/hw/i386/kvm/xen_xenstore.c
+++ b/hw/i386/kvm/xen_xenstore.c
@@ -235,6 +235,7 @@ static int xen_xenstore_post_load(void *opaque, int
ver)
 
 static const VMStateDescription xen_xenstore_vmstate = {
 .name = "xen_xenstore",
+.unmigratable = 1, /* The PV back ends don't migrate yet */
 .version_id = 1,
 .minimum_version_id = 1,
 .needed = xen_xenstore_is_needed,


It means we can't migrate guests even if they're only using fully
emulated devices... but I think that's a reasonable limitation until we
implement it fully.






Re: [RFC PATCH v1 07/25] hw/xen: Implement core serialize/deserialize methods for xenstore_impl

2023-03-07 Thread David Woodhouse
On Thu, 2023-03-02 at 15:34 +, David Woodhouse wrote:
> From: David Woodhouse 
> 
> In fact I think we want to only serialize the contents of the domain's
> path in /local/domain/${domid} and leave the rest to be recreated? Will
> defer to Paul for that.
> 
> Signed-off-by: David Woodhouse 

Paul, your Reviewed-by: on this one is conspicuous in its absence. I
mentioned migration in the cover letter — this much is working fine,
but it's the PV back ends that don't yet work.

I'd quite like to merge the basic serialization/deserialization of
XenStore itself, with the unit tests.

Perhaps we can also set TYPE_XEN_DEVICE or TYPE_XEN_BLOCK_DEVICE to be
unmigratable? Ideally I think we want TYPE_XEN_DEVICE to be
unmigratable by default *unless* the specific device class (including
net and other as we port them from XenLegacyDevice) says otherwise.

Is there a way to do that?
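
Roughly the shape I have in mind, as a sketch only (the vmstate and the
names here are made up, not code from this series): hang an unmigratable
VMStateDescription off the base class by default, and let an individual
device class override dc->vmsd in its own class_init once it actually
supports migration.

static const VMStateDescription vmstate_xen_device_unmigratable = {
    .name = "xen-device",
    .unmigratable = 1,
};

static void xen_device_class_init(ObjectClass *class, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(class);

    /* ... existing realize/unrealize/property setup ... */
    dc->vmsd = &vmstate_xen_device_unmigratable;
}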




[RFC PATCH v1 27/25] docs: Update Xen-on-KVM documentation for PV disk support

2023-03-07 Thread David Woodhouse
From: David Woodhouse 

Signed-off-by: David Woodhouse 
---
 docs/system/i386/xen.rst | 30 +++---
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/docs/system/i386/xen.rst b/docs/system/i386/xen.rst
index a00523b492..f06765e88c 100644
--- a/docs/system/i386/xen.rst
+++ b/docs/system/i386/xen.rst
@@ -9,6 +9,8 @@ KVM has support for hosting Xen guests, intercepting Xen 
hypercalls and event
 channel (Xen PV interrupt) delivery. This allows guests which expect to be
 run under Xen to be hosted in QEMU under Linux/KVM instead.
 
+Using the split irqchip is mandatory for Xen support.
+
 Setup
 -
 
@@ -17,14 +19,14 @@ accelerator, for example for Xen 4.10:
 
 .. parsed-literal::
 
-  |qemu_system| --accel kvm,xen-version=0x4000a
+  |qemu_system| --accel kvm,xen-version=0x4000a,kernel-irqchip=split
 
 Additionally, virtual APIC support can be advertised to the guest through the
 ``xen-vapic`` CPU flag:
 
 .. parsed-literal::
 
-  |qemu_system| --accel kvm,xen-version=0x4000a --cpu host,+xen_vapic
+  |qemu_system| --accel kvm,xen-version=0x4000a,kernel-irqchip=split --cpu 
host,+xen_vapic
 
 When Xen support is enabled, QEMU changes hypervisor identification (CPUID
 0x40000000..0x4000000A) to Xen. The KVM identification and features are not
@@ -33,11 +35,25 @@ moves to leaves 0x40000100..0x4000010A.
 
 The Xen platform device is enabled automatically for a Xen guest. This allows
 a guest to unplug all emulated devices, in order to use Xen PV block and 
network
-drivers instead. Note that until the Xen PV device back ends are enabled to 
work
-with Xen mode in QEMU, that is unlikely to cause significant joy. Linux guests
-can be dissuaded from this by adding 'xen_emul_unplug=never' on their command
-line, and it can also be noted that AHCI disk controllers are exempt from being
-unplugged, as are passthrough VFIO PCI devices.
+drivers instead. Under Xen, the boot disk is typically available both via IDE
+emulation, and as a PV block device. Guest bootloaders typically use IDE to 
load
+the guest kernel, which then unplugs the IDE and continues with the Xen PV 
block
+device.
+
+This configuration can be achieved as follows
+
+.. parsed-literal::
+
+  |qemu_system| -M pc --accel kvm,xen-version=0x4000a,kernel-irqchip=split \\
+   -drive file=${GUEST_IMAGE},if=none,id=disk,file.locking=off -device 
xen-disk,drive=disk,vdev=xvda \\
+   -drive file=${GUEST_IMAGE},index=2,media=disk,file.locking=off,if=ide
+
+It is necessary to use the pc machine type, as the q35 machine uses AHCI 
instead
+of legacy IDE, and AHCI disks are not unplugged through the Xen PV unplug
+mechanism.
+
+VirtIO devices can also be used; Linux guests may need to be dissuaded from
+unplugging them by adding 'xen_emul_unplug=never' on their command line.
 
 Properties
 --
-- 
2.34.1






[RFC PATCH v1 26/25] MAINTAINERS: Add entry for Xen on KVM emulation

2023-03-07 Thread David Woodhouse
From: David Woodhouse 

Signed-off-by: David Woodhouse 
---
 MAINTAINERS | 9 +
 1 file changed, 9 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index da29661b37..76b705e467 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -443,6 +443,15 @@ F: target/i386/kvm/
 F: target/i386/sev*
 F: scripts/kvm/vmxcap
 
+Xen emulation on X86 KVM CPUs
+M: David Woodhouse 
+M: Paul Durrant 
+S: Supported
+F: include/sysemu/kvm_xen.h
+F: target/i386/kvm/xen*
+F: hw/i386/kvm/xen*
+F: tests/avocado/xen_guest.py
+
 Guest CPU Cores (other accelerators)
 
 Overall
-- 
2.34.1






Re: [RFC PATCH v1 21/25] hw/xen: Add emulated implementation of grant table operations

2023-03-07 Thread David Woodhouse
On Tue, 2023-03-07 at 16:07 +, Paul Durrant wrote:
> On 02/03/2023 15:34, David Woodhouse wrote:
> > From: David Woodhouse 
> > 
> > This is limited to mapping a single grant at a time, because under Xen the
> > pages are mapped *contiguously* into qemu's address space, and that's very
> > hard to do when those pages actually come from anonymous mappings in qemu
> > in the first place.
> > 
> > Eventually perhaps we can look at using shared mappings of actual objects
> > for system RAM, and then we can make new mappings of the same backing
> > store (be it deleted files, shmem, whatever). But for now let's stick to
> > a page at a time.
> > 
> > Signed-off-by: David Woodhouse 
> > ---
> >   hw/i386/kvm/xen_gnttab.c | 299 ++-
> >   1 file changed, 296 insertions(+), 3 deletions(-)
> > 
> [snip]
> > +static uint64_t gnt_ref(XenGnttabState *s, grant_ref_t ref, int prot)
> > +{
> > +    uint16_t mask = GTF_type_mask | GTF_sub_page;
> > +    grant_entry_v1_t gnt, *gnt_p;
> > +    int retries = 0;
> > +
> > +    if (ref >= s->max_frames * ENTRIES_PER_FRAME_V1 ||
> > +    s->map_track[ref] == UINT8_MAX) {
> > +    return INVALID_GPA;
> > +    }
> > +
> > +    if (prot & PROT_WRITE) {
> > +    mask |= GTF_readonly;
> > +    }
> > +
> > +    gnt_p = &s->entries.v1[ref];
> > +
> > +    /*
> > + * The guest can legitimately be changing the GTF_readonly flag. Allow
> 
> I'd call a guest playing with the ref after setting GTF_permit_access a 
> buggy guest and not bother with the loop.

Didn't we have this argument already when I tried to get you to change
your one? :)

I argue that it's OK for a guest to *increase* permissions to include
write permission, even while a read-only grant may be in progress.

And also to *decrease* permissions to take away write permission, even
while read-only grants may be in progress.

The loop is what Xen does, so let's do it that way.

> 
> > + * that, but don't let a malicious guest cause a livelock.
> > + */
> > +    for (retries = 0; retries < 5; retries++) {
> > +    uint16_t new_flags;
> > +
> > +    /* Read the entry before an atomic operation on its flags */
> > +    gnt = *(volatile grant_entry_v1_t *)gnt_p;
> > +
> > +    if ((gnt.flags & mask) != GTF_permit_access ||
> > +    gnt.domid != DOMID_QEMU) {
> > +    return INVALID_GPA;
> > +    }
> > +
> > +    new_flags = gnt.flags | GTF_reading;
> > +    if (prot & PROT_WRITE) {
> > +    new_flags |= GTF_writing;
> > +    }
> > +
> > +    if (qatomic_cmpxchg(&gnt_p->flags, gnt.flags, new_flags) == 
> > gnt.flags) {
> 
> Xen actually does a cmpxchg on both the flags and the domid. We probably 
> ought to fail to set the flags if the guest is playing with the domid
> but since we're single-tenant it doesn't *really* matter... just a 
> nice-to-have. So...

Yeah, changing the *domid* while it's active is definitely not an OK
thing to do.
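
For reference, checking both atomically could look something like this on
the QEMU side (a sketch only; the union and the helper name are invented
here), doing the cmpxchg on the combined 16-bit flags + 16-bit domid word
the way Xen itself does:

union grant_combo {
    uint32_t raw;
    struct {
        uint16_t flags;
        uint16_t domid;
    };
};

static bool claim_grant_v1(grant_entry_v1_t *gnt_p, uint16_t new_flags)
{
    union grant_combo old, want;

    /* flags and domid are adjacent 16-bit fields at the start of the entry */
    old.raw = qatomic_read((uint32_t *)&gnt_p->flags);
    want = old;
    want.flags = new_flags;

    /* Fails if either the flags *or* the domid changed under our feet */
    return qatomic_cmpxchg((uint32_t *)&gnt_p->flags, old.raw, want.raw) == old.raw;
}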





Re: [RFC PATCH v1 13/25] hw/xen: Add xenstore operations to allow redirection to internal emulation

2023-03-07 Thread David Woodhouse
On Tue, 2023-03-07 at 14:44 +, Paul Durrant wrote:
> On 02/03/2023 15:34, David Woodhouse wrote:
> > From: Paul Durrant 
> > 
> > Signed-off-by: Paul Durrant 
> > Signed-off-by: David Woodhouse 
> > ---
> 
> Reviewed-by: Paul Durrant 

You're reviewing your own code on some of those... :)

Do we need to get review from *another* person listed in MAINTAINERS
for Xen? Or shall I add my own R-by tags for those ones too?






Re: [RFC PATCH v1 12/25] hw/xen: Add foreignmem operations to allow redirection to internal emulation

2023-03-07 Thread David Woodhouse
On Tue, 2023-03-07 at 14:40 +, Paul Durrant wrote:
> 
> > -
> > -#define xenforeignmemory_unmap(h, p, s) munmap(p, s * XC_PAGE_SIZE)
> > -
> 
> Actually, probably best 'static inline' that, or at least put brackets 
> round the 'p' and 's' for safety.
> 
That's the one we're *removing* :)

> With either one of those options chosen...
> 
> Reviewed-by: Paul Durrant 

Taking that anyway.




Re: [PATCH v9 2/7] hw/intc/i8259: Implement legacy LTIM Edge/Level Bank Select

2023-03-07 Thread David Woodhouse
On Tue, 2023-03-07 at 10:36 -0300, Daniel Henrique Barboza wrote:
> 
> Checkpatch will nag about it claiming that we need spaces between
> '*'. The maintainer can fix it in-tree though.

I saw that and explicitly didn't care. 3/5 of the existing examples
within the tree look like that one — which is how the docs tell us to
do it — so I figured checkpatch could just be sad.

> Reviewed-by: Daniel Henrique Barboza 

Thanks.




Re: [RFC PATCH v1 05/25] hw/xen: Watches on XenStore transactions

2023-03-07 Thread David Woodhouse
On Tue, 2023-03-07 at 13:32 +, Paul Durrant wrote:
> 
> Reviewed-by: Paul Durrant 
> 
> ... with a couple of nits in comments called out below...

Thanks, I'm fixing these and pushing them out to my tree at
https://git.infradead.org/users/dwmw2/qemu.git/shortlog/refs/heads/xenfv
(and occasionally to Gitlab CI) as you go.

Still harbouring the fantasy that we might manage to make the soft
freeze deadline :)




Re: [RFC PATCH v1 03/25] hw/xen: Implement XenStore watches

2023-03-07 Thread David Woodhouse
On Tue, 2023-03-07 at 11:29 +, Paul Durrant wrote:
> 
> I think you could stash a tail pointer here...
> 
> > +    }
> > +
> > +    if (dom_id && s->nr_domu_watches >= XS_MAX_WATCHES) {
> > +    return E2BIG;
> > +    }
> > +
> > +    w = g_new0(XsWatch, 1);
> > +    w->token = g_strdup(token);
> > +    w->cb = fn;
> > +    w->cb_opaque = opaque;
> > +    w->dom_id = dom_id;
> > +    w->rel_prefix = strlen(abspath) - strlen(path);
> > +
> > +    l = g_hash_table_lookup(s->watches, abspath);
> 
> ... to avoid the duplicate hash lookup here.

Good point. The EEXIST check was added later as I was reviewing and
comparing with the real xenstored, so I didn't spot that. Thanks.

--- a/hw/i386/kvm/xenstore_impl.c
+++ b/hw/i386/kvm/xenstore_impl.c
@@ -673,7 +673,7 @@ int xs_impl_watch(XenstoreImplState *s, unsigned int 
dom_id, const char *path,
 }
 
 /* Check for duplicates */
-w = g_hash_table_lookup(s->watches, abspath);
+l = w = g_hash_table_lookup(s->watches, abspath);
 while (w) {
 if (!g_strcmp0(token, w->token) &&  opaque == w->cb_opaque &&
 fn == w->cb && dom_id == w->dom_id) {
@@ -693,7 +693,7 @@ int xs_impl_watch(XenstoreImplState *s, unsigned int 
dom_id, const char *path,
 w->dom_id = dom_id;
 w->rel_prefix = strlen(abspath) - strlen(path);
 
-l = g_hash_table_lookup(s->watches, abspath);
+/* l was looked up above when checking for duplicates */
 if (l) {
 w->next = l->next;
 l->next = w;





Re: [SeaBIOS] Re: [SeaBIOS PATCH] xen: require Xen info structure at 0x1000 to detect Xen

2023-03-07 Thread David Woodhouse
On Thu, 2023-02-02 at 10:10 +0100, Gerd Hoffmann wrote:
> > Thanks, Kevin.
> > 
> > I'd like to get the rest of the Xen platform support in to qemu 8.0
> > if
> > possible. Which is currently scheduled for March.
> > 
> > Is there likely to be a SeaBIOS release before then which Gerd
> > would
> > pull into qemu anyway, or should I submit a submodule update to a
> > snapshot of today's tree? That would just pull in this commit, and
> > the
> > one other fix that's in the SeaBIOS tree since 1.16.1?
> 
> Tagging 1.16.2 in time for the qemu 8.0 should not be a problem given
> that we have only bugfixes in master.  Roughly around soft freeze is
> probably a good time for that.

That's, erm, at the *end* of today 2023-03-07, and the freeze happens
only *after* Paul has reviewed the phase 2 Xen PV back end support,
right?

https://wiki.qemu.org/Planning/8.0




Re: [PATCH 4/4] test/avocado: test Linux boot up in x2APIC with userspace local APIC

2023-03-06 Thread David Woodhouse
On Tue, 2023-02-21 at 23:05 +0700, Bui Quang Minh wrote:
> 
> +    def test_physical_x2apic(self):
> +    """
> +    :avocado: tags=physical_x2apic
> +    """
> +
> +    self.common_vm_setup(True)
> +
> +    self.kernel_params = (self.distro.default_kernel_params +
> +  ' quiet intel_iommu=on x2apic_phys')
> +    self.run_and_check()
> +    self.ssh_command('dmesg | grep "Switched APIC routing to physical 
> x2apic"')
> +
> +    def test_cluster_x2apic(self):
> +    """
> +    :avocado: tags=cluster_x2apic
> +    """
> +
> +    self.common_vm_setup(True)
> +    self.kernel_params = (self.distro.default_kernel_params +
> +  ' quiet intel_iommu=on')
> +    self.run_and_check()
> +    self.ssh_command('dmesg | grep "Switched APIC routing to cluster 
> x2apic"')

Shouldn't Linux also enable X2APIC in physical mode if it doesn't have
an IOMMU? It'll just refuse to online any CPUs with APIC ID > 255.

For bonus points, make the Linux guest realise that we can do those
extra 7 bits of Extended Destinatation ID, which it normally detects
from the KVM_FEATURE_MSI_EXT_DEST_ID bit under KVM.




Re: [PATCH 0/4] Support x2APIC mode with TCG accelerator

2023-03-06 Thread David Woodhouse
On Tue, 2023-02-21 at 23:04 +0700, Bui Quang Minh wrote:
> This series implements x2APIC mode in userspace local APIC and the
> RDMSR/WRMSR helper to access x2APIC registers in x2APIC mode. With this
> series, we can now boot up Linux kernel in x2APIC with TCG accelerator.
> 
> Bui Quang Minh (4):
>   apic: add support for x2APIC mode
>   i386/tcg: implement x2APIC registers MSR access
>   intel_iommu: allow Extended Interrupt Mode when using userspace local
>     APIC
>   test/avocado: test Linux boot up in x2APIC with userspace local APIC

Please can you ensure CPUID 0x0B is correctly populated with the APIC
ID (and other information) when X2APIC is enabled?
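
For reference, a trivial guest-side check along these lines is what I have
in mind for verifying it (illustrative only, not part of the series):

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx, level;

    /* Leaf 0x0B: EDX is the full 32-bit x2APIC ID, ECX[15:8] the level
     * type (1 = SMT, 2 = Core), EAX[4:0] the shift to the next level. */
    for (level = 0; level < 2; level++) {
        __cpuid_count(0x0b, level, eax, ebx, ecx, edx);
        printf("level %u: type=%u shift=%u nr_logical=%u x2apic_id=%u\n",
               level, (ecx >> 8) & 0xff, eax & 0x1f, ebx & 0xffff, edx);
    }
    return 0;
}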




Re: [PATCH] hw/intc/ioapic: Update KVM routes before redelivering IRQ, on RTE update

2023-03-06 Thread David Woodhouse
On Mon, 2023-03-06 at 11:39 -0500, Peter Xu wrote:
> On Mon, Mar 06, 2023 at 09:25:35AM +0000, David Woodhouse wrote:
> > I think if we want to fix the lack of IR translation faults from the
> > IOMMU, we have to change this anyway.
> > 
> > At the very least, it can only use KVM to deliver the IRQ if there *is*
> > a valid translation. And if not, it needs to ask the IOMMU to
> > retranslate, with a 'delivering_now' flag indicating that the fault
> > should be raised because this isn't a preemptive translation in
> > advance.
> 
> I see that as a separate problem of what this patch wants to (rightfully)
> fix. 
> 

Agreed. Which is why I didn't just rip the kvm_set_irq() out and call
it fixed, and put that discussion below the ^--- cutoff.

>  I also agree we'll need that if e.g. we want to support ioapic in
> kvm.
>
> Sorry in advance since I don't think I have the whole picture here.  Could
> you remind me the whole purpose of having such?  Is that for part of Xen's
> effort?
> 
> It was a long time ago, but IIRC such mechanism wasn't implemented when we
> worked on vIOMMU IR initially, because at least at that time the "keeping
> kvm irq routes always updated when IRQ entries modified" was good enough
> for qemu to work with IR.
> 
> The only outlier was in-kernel ioapic, but there was a talk from Google
> showing that in kernel ioapic brought mostly nothing good but more exposure
> on attack surface.  It does sound reasonable since fast devices shouldn't
> use ioapic at all to me, so IIRC the plan was split irqchip should be
> always optimal hence no immediate concern of not supporting IR with
> in-kernel ioapic.  But I definitely can miss important things along the
> way.

Indeed, I don't think we care about the in-kernel I/OAPIC. I don't
think we care about the kernel knowing about e.g. "GSI #11" at all. We
can just deliver it as MSI (for the I/OAPIC) or using KVM_INTERRUPT and
the interrupt window as we do for the PIC. Which is why I'd happily rip
that out and let it be delivered via the APIC intercept at 0xfeexxxxx.

The existing code which just keeps IRQ routes updated when they're
valid is kind of OK, and well-behaved guests can function. But it isn't
*right* in the case where they aren't valid.

What *ought* to happen is that the IOMMU should raise a fault at the
moment the interrupt occurs, if the translation isn't valid. And we
don't have that at all.

As for why I care? I don't really *need* it, as I have everything I
need for Xen PIRQ support already merged in 
https://gitlab.com/qemu-project/qemu/-/commit/6096cf7877

So while the thread at
https://lore.kernel.org/qemu-devel/aaef9961d210ac1873153bf3cf01d984708744fc.ca...@infradead.org/
was partly driven by expecting to need this for Xen PIRQ support
(because in $DAYJOB I did those things in the other order and the PIRQ
support ended up just being a trivial different translator like the
IOMMU's IR)... I'd still quite like to fix it up in QEMU anyway, just
for correctness and fidelity in the faulting cases.

We can do more efficient invalidation too, rather than blowing away the
entire routing table every time. Just disconnect the IRQFD for the
specific interrupts that get invalidated, and let them get fixed up
again next time they occur.
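
Hand-wavy sketch of that last idea (IRRoute, ir_translate() and the fields
used here are invented for illustration, not existing QEMU code):

static void ir_invalidate_one(IRRoute *r)
{
    /* Detach just this route from KVM; leave the rest of the table alone */
    kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &r->notifier, r->virq);
    r->translated = false;
}

static void ir_retranslate_and_rearm(IRRoute *r)
{
    /*
     * Called the next time the interrupt actually fires. This is also the
     * point where an IR fault would be raised if the IRTE is no longer
     * valid, which is the fidelity we're missing today.
     */
    if (ir_translate(r, &r->msg) < 0) {
        return;
    }
    kvm_irqchip_update_msi_route(kvm_state, r->virq, r->msg, r->dev);
    kvm_irqchip_commit_routes(kvm_state);
    kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &r->notifier, NULL, r->virq);
    r->translated = true;
}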




Re: [PATCH 1/4] apic: add support for x2APIC mode

2023-03-06 Thread David Woodhouse
On Mon, 2023-03-06 at 23:39 +0700, Bui Quang Minh wrote:
> On 3/6/23 22:51, David Woodhouse wrote:
> > On Mon, 2023-03-06 at 11:43 +0100, Igor Mammedov wrote:
> > > > However, there are still problems while trying to extending support to
> > > > APIC ID larger than 255 because there are many places assume APIC ID is
> > > > 8-bit long.
> > > 
> > > that's what I was concerned about (i.e. just enabling x2apic without 
> > > fixing
> > > with all code that just assumes 8bit apicid).
> > 
> > Even before you extend the physical APIC IDs past 254 or 255, there's
> > still the issue that 255 isn't a broadcast in X2APIC mode. And that
> > *logical* addressing will use more than 8 bits even when the physical
> > IDs are lower.
> > 
> > > > One of that is interrupt remapping which returns 32-bit
> > > > destination ID but uses MSI (which has 8-bit destination) to send to
> > > > APIC. I will look more into this.
> > 
> > The translated 'output' from the interrupt remapping doesn't "use MSI",
> > in the sense of a write transaction which happens to go to addresses in
> > the 0xFEEx range. The *input* to interrupt remapping comes
> > from that intercept.
> > 
> > The output is already "known" to be an MSI message, with a full 64-bit
> > address and 32-bit data, and the KVM API puts the high 24 bits of the
> > target APIC ID into the high word of the address.
> > 
> > If you look at the "generic" x86_iommu_irq_to_msi_message() in
> > hw/intc/x86-iommu.c you'll see it's also using the same trick:
> > 
> >  msg.__addr_hi = irq->dest & 0xffffff00;
> 
> Yeah, I see that trick too, with this hunk and temporarily disable 
> broadcast destination id 0xff in physical mode, I am able to boot Linux 
> with 255 CPUs (I can't see how to use few CPUs but configure with high 
> APIC ID)

I never worked out how to explicitly assign high APIC IDs but you can
at least spread them out by explicitly setting the topology to
something weird like sockets=17,cores=3,threads=3 so that some APIC IDs
get skipped.
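
For reference, the packing works out roughly like this (a simplified sketch
of the topology maths, ignoring dies; the helper is illustrative):

static unsigned int apic_id_from_topo(unsigned socket, unsigned core,
                                      unsigned thread)
{
    const unsigned thread_bits = 2;   /* 3 threads rounds up to 4 => 2 bits */
    const unsigned core_bits = 2;     /* 3 cores rounds up to 4 => 2 bits */

    return (socket << (core_bits + thread_bits)) |
           (core << thread_bits) | thread;
}

/* e.g. socket 0, core 2, thread 2 gets APIC ID 10, so IDs 3, 7, 11 and
 * 12-15 are never used, and socket 1 starts at APIC ID 16. */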

Of course, that doesn't let you exercise the interesting corner case of
physical APIC ID 0xff though. I wonder if there's a way of doing it
such that only CPU#0 and CPU#255 are *online* at boot, even if the rest
theoretically exist? 

> @@ -814,7 +816,12 @@ static void apic_send_msi(MSIMessage *msi)
>   {
>   uint64_t addr = msi->address;
>   uint32_t data = msi->data;
> -    uint8_t dest = (addr & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
> +    uint32_t dest = (addr & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
> +    /*
> + * The higher 3 bytes of destination id is stored in higher word of
> + * msi address. See x86_iommu_irq_to_msi_message()
> + */
> +    dest = dest | (addr >> 32);
> 
> I am reading the manual now, looks like broadcast destination id in 
> x2APIC is 0xffff_ffff in both physical and logic mode.

Yep, that looks about right.




Re: [PATCH 1/4] apic: add support for x2APIC mode

2023-03-06 Thread David Woodhouse
On Tue, 2023-02-21 at 23:04 +0700, Bui Quang Minh wrote:
> @@ -454,7 +500,7 @@ static int apic_find_dest(uint8_t dest)
>  }
>  
>  static void apic_get_delivery_bitmask(uint32_t *deliver_bitmask,
> -  uint8_t dest, uint8_t dest_mode)
> +  uint32_t dest, uint8_t dest_mode)
>  {
>  APICCommonState *apic_iter;
>  int i;


I think somewhere here between these two hunks, you've forgotten to
stop interpreting 0xFF as broadcast when you're in X2APIC mode.

> @@ -474,14 +520,22 @@ static void apic_get_delivery_bitmask(uint32_t 
> *deliver_bitmask,
>  for(i = 0; i < MAX_APICS; i++) {
>  apic_iter = local_apics[i];
>  if (apic_iter) {
> -    if (apic_iter->dest_mode == 0xf) {
> -    if (dest & apic_iter->log_dest)
> -    apic_set_bit(deliver_bitmask, i);
> -    } else if (apic_iter->dest_mode == 0x0) {
> -    if ((dest & 0xf0) == (apic_iter->log_dest & 0xf0) &&
> -    (dest & apic_iter->log_dest & 0x0f)) {
> +    /* x2APIC mode */
> +    if (apic_iter->apicbase & MSR_IA32_APICBASE_EXTD) {
> +    if ((dest & 0xffff0000) == (apic_iter->log_dest & 0xffff0000) &&
> +    (dest & apic_iter->log_dest & 0xffff)) {






Re: [PATCH 1/4] apic: add support for x2APIC mode

2023-03-06 Thread David Woodhouse
On Mon, 2023-03-06 at 11:43 +0100, Igor Mammedov wrote:
> > However, there are still problems while trying to extending support to 
> > APIC ID larger than 255 because there are many places assume APIC ID is 
> > 8-bit long.
>
> that's what I was concerned about (i.e. just enabling x2apic without fixing
> with all code that just assumes 8bit apicid).

Even before you extend the physical APIC IDs past 254 or 255, there's
still the issue that 255 isn't a broadcast in X2APIC mode. And that
*logical* addressing will use more than 8 bits even when the physical
IDs are lower.

> > One of that is interrupt remapping which returns 32-bit 
> > destination ID but uses MSI (which has 8-bit destination) to send to 
> > APIC. I will look more into this.

The translated 'output' from the interrupt remapping doesn't "use MSI",
in the sense of a write transaction which happens to go to addresses in
the 0xFEExxxxx range. The *input* to interrupt remapping comes
from that intercept.

The output is already "known" to be an MSI message, with a full 64-bit
address and 32-bit data, and the KVM API puts the high 24 bits of the
target APIC ID into the high word of the address.

If you look at the "generic" x86_iommu_irq_to_msi_message() in
hw/intc/x86-iommu.c you'll see it's also using the same trick:

msg.__addr_hi = irq->dest & 0xffffff00;


That hack isn't guest-visible. There *is* a trick which is guest-
visible, implemented by both Hyper-V and KVM, which is to recognise
that actually there was an 'Extended Destination ID' field in bits 4-11
of the actual 32-bit MSI. Intel eventually used the low bit as the
selector for 'Remappable Format' MSI messages, but we've used the other
seven to extend the APIC ID to 15 bits even in a guest-visible way,
allowing up to 32768 CPUs without having to expose a virtual IOMMU.
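
For the curious, that guest-visible encoding works out roughly like this
(an illustrative helper, not code from QEMU or Linux):

#define MSI_ADDR_BASE  0xfee00000u

/*
 * Compose the MSI address for an APIC ID of up to 15 bits when the
 * extended destination ID feature is advertised: bits 19:12 carry the
 * low 8 bits as always, bits 11:5 carry bits 14:8 of the APIC ID, and
 * bit 4 is left alone since Intel IR uses it as the remappable-format
 * selector.
 */
static uint32_t msi_addr_for_dest(uint32_t apic_id)
{
    return MSI_ADDR_BASE |
           ((apic_id & 0xff) << 12) |
           (((apic_id >> 8) & 0x7f) << 5);
}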

But that should get translated to the KVM format with the high bits in
the top 32 bits of the address, fairly much transparently. All you need
to do is ensure that the TCG X2APIC support takes that KVM format, and
that it is all enabled in the right circumstances.




Re: [PATCH] hw/intc/ioapic: Update KVM routes before redelivering IRQ, on RTE update

2023-03-06 Thread David Woodhouse
On Mon, 2023-03-06 at 06:51 +, David Woodhouse wrote:
> On 5 March 2023 22:36:18 GMT, Peter Xu  wrote:
> > On Sun, Mar 05, 2023 at 06:43:42PM +, 
> > > ---
> > > Alternative fixes might have been just to remove the part in
> > > ioapic_service() which delivers the IRQ via kvm_set_irq() because
> > > surely delivering as MSI ought to work just fine anyway in all cases?
> > > That code lacks a comment justifying its existence.
> > 
> > Didn't check all details, but AFAIU there're still some different paths
> > triggered so at least it looks still clean to use the path it's for.
> > 
> > E.g., I think if someone traces kvm_set_irq() in kernel this specific irq
> > triggered right after unmasking might seem to be missed misterously (but
> > actually it was not).
> 
> Hm, not sure that's a reason we care about. The I/OAPIC is purely a
> device to turn line interrupts into MSIs. Which these days need to be
> translated by IOMMU interrupt remapping device models in userspace. I
> don't think a user has any valid reason to expect that the kernel
> will even know about any GSIs with any specific numbers. Tracing on
> that in the kernel would be making some dodgy assumptions.

I think if we want to fix the lack of IR translation faults from the
IOMMU, we have to change this anyway.

At the very least, it can only use KVM to deliver the IRQ if there *is*
a valid translation. And if not, it needs to ask the IOMMU to
retranslate, with a 'delivering_now' flag indicating that the fault
should be raised because this isn't a preemptive translation in
advance.






Re: [PATCH] hw/intc/ioapic: Update KVM routes before redelivering IRQ, on RTE update

2023-03-05 Thread David Woodhouse



On 5 March 2023 22:36:18 GMT, Peter Xu  wrote:
>On Sun, Mar 05, 2023 at 06:43:42PM +, 
>> ---
>> Alternative fixes might have been just to remove the part in
>> ioapic_service() which delivers the IRQ via kvm_set_irq() because
>> surely delivering as MSI ought to work just fine anyway in all cases?
>> That code lacks a comment justifying its existence.
>
>Didn't check all details, but AFAIU there're still some different paths
>triggered so at least it looks still clean to use the path it's for.
>
>E.g., I think if someone traces kvm_set_irq() in kernel this specific irq
>triggered right after unmasking might seem to be missed misterously (but
>actually it was not).

Hm, not sure that's a reason we care about. The I/OAPIC is purely a device to 
turn line interrupts into MSIs. Which these days need to be translated by IOMMU 
interrupt remapping device models in userspace. I don't think a user has any 
valid reason to expect that the kernel will even know about any GSIs with any 
specific numbers. Tracing on that in the kernel would be making some dodgy 
assumptions.



[PATCH] hw/intc/ioapic: Update KVM routes before redelivering IRQ, on RTE update

2023-03-05 Thread David Woodhouse
From: David Woodhouse 

A Linux guest will perform IRQ migration after the IRQ has happened,
updating the RTE to point to the new destination CPU and then unmasking
the interrupt.

However, when the guest updates the RTE, ioapic_mem_write() calls
ioapic_service(), which redelivers the pending level interrupt via
kvm_set_irq(), *before* calling ioapic_update_kvm_routes() which sets
the new target CPU.

Thus, the IRQ which is supposed to go to the new target CPU is instead
misdelivered to the previous target. An example where the guest kernel
is attempting to migrate from CPU#2 to CPU#0 shows:

xenstore_read tx 0 path control/platform-feature-xs_reset_watches
ioapic_set_irq vector: 11 level: 1
ioapic_set_remote_irr set remote irr for pin 11
ioapic_service: trigger KVM IRQ 11
[0.523627] The affinity mask was 0-3 and the handler is on 2
ioapic_mem_write ioapic mem write addr 0x0 regsel: 0x27 size 0x4 val 0x26
ioapic_update_kvm_routes: update KVM route for IRQ 11: fee02000 8021
ioapic_mem_write ioapic mem write addr 0x10 regsel: 0x26 size 0x4 val 0x18021
xenstore_reset_watches
ioapic_set_irq vector: 11 level: 1
ioapic_mem_read ioapic mem read addr 0x10 regsel: 0x26 size 0x4 retval 0x1c021
[0.524569] ioapic_ack_level IRQ 11 moveit = 1
ioapic_eoi_broadcast EOI broadcast for vector 33
ioapic_clear_remote_irr clear remote irr for pin 11 vector 33
ioapic_mem_write ioapic mem write addr 0x0 regsel: 0x26 size 0x4 val 0x26
ioapic_mem_read ioapic mem read addr 0x10 regsel: 0x26 size 0x4 retval 0x18021
[0.525235] ioapic_finish_move IRQ 11 calls irq_move_masked_irq()
[0.526147] irq_do_set_affinity for IRQ 11, 0
[0.526732] ioapic_set_affinity for IRQ 11, 0
[0.527330] ioapic_setup_msg_from_msi for IRQ11 target 0
ioapic_mem_write ioapic mem write addr 0x0 regsel: 0x26 size 0x4 val 0x27
ioapic_mem_write ioapic mem write addr 0x10 regsel: 0x27 size 0x4 val 0x0
ioapic_mem_write ioapic mem write addr 0x0 regsel: 0x27 size 0x4 val 0x26
ioapic_mem_write ioapic mem write addr 0x10 regsel: 0x26 size 0x4 val 0x18021
[0.527623] ioapic_set_affinity returns 0
[0.527623] ioapic_finish_move IRQ 11 calls unmask_ioapic_irq()
ioapic_mem_write ioapic mem write addr 0x0 regsel: 0x26 size 0x4 val 0x26
ioapic_mem_write ioapic mem write addr 0x10 regsel: 0x26 size 0x4 val 0x8021
ioapic_set_remote_irr set remote irr for pin 11
ioapic_service: trigger KVM IRQ 11
ioapic_update_kvm_routes: update KVM route for IRQ 11: fee00000 8021
[0.529571] The affinity mask was 0 and the handler is on 2
[xenstore_watch path memory/target token 92847D40

There are no other code paths in ioapic_mem_write() which need the KVM
IRQ routing table to be updated, so just shift the call from the end
of the function to happen right before the call to ioapic_service()
and thus deliver the re-enabled IRQ to the right place.

Fixes: 15eafc2e602f "kvm: x86: add support for KVM_CAP_SPLIT_IRQCHIP"
Signed-off-by: David Woodhouse 
---
Alternative fixes might have been just to remove the part in
ioapic_service() which delivers the IRQ via kvm_set_irq() because
surely delivering as MSI ought to work just fine anyway in all cases?
That code lacks a comment justifying its existence.

Or maybe in the specific case shown in the above log, it would have
sufficed for ioapic_update_kvm_routes() to update the route *even*
when the IRQ is masked. It's not like it's actually going to get
triggered unless QEMU deliberately does so, anyway? But that only
works because the target CPU happens to be in the high word of the
RTE; if something in the *low* word (vector, perhaps) was changed
at the same time as the unmask, we'd still trigger with stale data.

 hw/intc/ioapic.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c
index 6364ecab1b..716ffc8bbb 100644
--- a/hw/intc/ioapic.c
+++ b/hw/intc/ioapic.c
@@ -405,6 +405,7 @@ ioapic_mem_write(void *opaque, hwaddr addr, uint64_t val,
 s->ioredtbl[index] |= ro_bits;
 s->irq_eoi[index] = 0;
 ioapic_fix_edge_remote_irr(&s->ioredtbl[index]);
+ioapic_update_kvm_routes(s);
 ioapic_service(s);
 }
 }
@@ -417,8 +418,6 @@ ioapic_mem_write(void *opaque, hwaddr addr, uint64_t val,
 ioapic_eoi_broadcast(val);
 break;
 }
-
-ioapic_update_kvm_routes(s);
 }
 
 static const MemoryRegionOps ioapic_io_ops = {
-- 
2.39.0






Re: IRQ affinity not working on Xen pci-platform device^W^W^W QEMU split-irqchip I/O APIC.

2023-03-04 Thread David Woodhouse
On Sat, 2023-03-04 at 09:57 +, David Woodhouse wrote:
> I wonder if the EOI is going missing because it's coming
> from the wrong CPU? Note no 'EOI broadcast' after the last line in the
> log I showed above; it isn't just that I trimmed it there.

I'm running on a host kernel without commit fceb3a36c29a so that's
probably it.

https://git.kernel.org/torvalds/c/fceb3a36c29a




Re: IRQ affinity not working on Xen pci-platform device^W^W^W QEMU split-irqchip I/O APIC.

2023-03-04 Thread David Woodhouse
On Sat, 2023-03-04 at 01:28 +0100, Thomas Gleixner wrote:
> David!
> 
> On Fri, Mar 03 2023 at 16:54, David Woodhouse wrote:
> > On Fri, 2023-03-03 at 17:51 +0100, Thomas Gleixner wrote:
> > > > 
> > > > [    0.577173] ACPI: \_SB_.LNKC: Enabled at IRQ 11
> > > > [    0.578149] The affinity mask was 0-3
> > > > [    0.579081] The affinity mask is 0-3 and the handler is on 2
> > > > [    0.580288] The affinity mask is 0 and the handler is on 2
> > > 
> > > What happens is that once the interrupt is requested, the affinity
> > > setting is deferred to the first interrupt. See the marvelous dance in
> > > arch/x86/kernel/apic/msi.c::msi_set_affinity().
> > > 
> > > If you do the setting before request_irq() then the startup will assign
> > > it to the target mask right away.
> > > 
> > > Btw, you are using irq_get_affinity_mask(), which gives you the desired
> > > target mask. irq_get_effective_affinity_mask() gives you the real one.
> > > 
> > > Can you verify that the thing moves over after the first interrupt or is
> > > that too late already?
> > 
> > It doesn't seem to move. The hack to just return IRQ_NONE if invoked on
> > CPU != 0 was intended to do just that. It's a level-triggered interrupt
> > so when the handler does nothing on the "wrong" CPU, it ought to get
> > invoked again on the *correct* CPU and actually work that time.
> 
> So much for the theory. This is virt after all so it does not
> necessarily behave like real hardware.

I think you're right. This looks like a QEMU bug with the "split
irqchip" I/OAPIC.

For reasons I'm unclear about, and which lack a comment in the code,
QEMU still injects I/OAPIC events into the kernel with kvm_set_irq().
(I think it's do to with caching, because QEMU doesn't cache interrupt-
remapping translations anywhere *except* in the KVM IRQ routing table,
so if it just synthesised an MSI message every time it'd have to
retranslate it every time?)

Tracing the behaviour here shows:

 • First interrupt happens on CPU2.
 • Linux updates the I/OAPIC RTE to point to CPU0, but QEMU doesn't
   update the KVM IRQ routing table yet.
 • QEMU retriggers the (still-high, level triggered) IRQ.
 • QEMU calls kvm_set_irq(11), delivering it to CPU2 again.
 • QEMU *finally* calls ioapic_update_kvm_routes().
 • Linux sees the interrupt on CPU2 again.

  $ qemu-system-x86_64 -display none -serial mon:stdio \
 -accel kvm,xen-version=0x4000a,kernel-irqchip=split \
 -kernel ~/git/linux/arch/x86/boot//bzImage \
 -append "console=ttyS0,115200 xen_no_vector_callback" \
 -smp 4 --trace ioapic\* --trace xenstore\*


...

xenstore_read tx 0 path control/platform-feature-xs_reset_watches
ioapic_set_irq vector: 11 level: 1
ioapic_set_remote_irr set remote irr for pin 11
ioapic_service: trigger KVM IRQ 11
[0.523627] The affinity mask was 0-3 and the handler is on 2
ioapic_mem_write ioapic mem write addr 0x0 regsel: 0x27 size 0x4 val 0x26
ioapic_update_kvm_routes: update KVM route for IRQ 11: fee02000 8021
ioapic_mem_write ioapic mem write addr 0x10 regsel: 0x26 size 0x4 val 0x18021
xenstore_reset_watches 
ioapic_set_irq vector: 11 level: 1
ioapic_mem_read ioapic mem read addr 0x10 regsel: 0x26 size 0x4 retval 0x1c021
[0.524569] ioapic_ack_level IRQ 11 moveit = 1
ioapic_eoi_broadcast EOI broadcast for vector 33
ioapic_clear_remote_irr clear remote irr for pin 11 vector 33
ioapic_mem_write ioapic mem write addr 0x0 regsel: 0x26 size 0x4 val 0x26
ioapic_mem_read ioapic mem read addr 0x10 regsel: 0x26 size 0x4 retval 0x18021
[0.525235] ioapic_finish_move IRQ 11 calls irq_move_masked_irq()
[0.526147] irq_do_set_affinity for IRQ 11, 0
[0.526732] ioapic_set_affinity for IRQ 11, 0
[0.527330] ioapic_setup_msg_from_msi for IRQ11 target 0
ioapic_mem_write ioapic mem write addr 0x0 regsel: 0x26 size 0x4 val 0x27
ioapic_mem_write ioapic mem write addr 0x10 regsel: 0x27 size 0x4 val 0x0
ioapic_mem_write ioapic mem write addr 0x0 regsel: 0x27 size 0x4 val 0x26
ioapic_mem_write ioapic mem write addr 0x10 regsel: 0x26 size 0x4 val 0x18021
[0.527623] ioapic_set_affinity returns 0
[0.527623] ioapic_finish_move IRQ 11 calls unmask_ioapic_irq()
ioapic_mem_write ioapic mem write addr 0x0 regsel: 0x26 size 0x4 val 0x26
ioapic_mem_write ioapic mem write addr 0x10 regsel: 0x26 size 0x4 val 0x8021
ioapic_set_remote_irr set remote irr for pin 11
ioapic_service: trigger KVM IRQ 11
ioapic_update_kvm_routes: update KVM route for IRQ 11: fee00000 8021
[0.529571] The affinity mask was 0 and the handler is on 2
[xenstore_watch path memory/target token 92847D40
xenstore_watch_event path memory/target token 92847D40
ioapic_set_irq vector: 11 level: 1
0.530486

Re: [PULL 00/62] i386, misc changes for QEMU 8.0 soft freeze

2023-03-03 Thread David Woodhouse
On Fri, 2023-03-03 at 13:46 +, Peter Maydell wrote:
> On Fri, 3 Mar 2023 at 13:44, David Woodhouse 
> wrote:
> > 
> > On Fri, 2023-03-03 at 11:03 +, Peter Maydell wrote:
> > > 
> > > Applied, thanks.
> > > 
> > > Please update the changelog at
> > > https://wiki.qemu.org/ChangeLog/8.0
> > > for any user-visible changes.
> > 
> > Can I have a wiki account 'dwmw2' with which to do so, please?
> 
> Per our irc conversation, I've created one for you.

Thanks. Updated https://wiki.qemu.org/ChangeLog/8.0#x86 with a
reference to the documentation at
https://qemu-project.gitlab.io/qemu/system/i386/xen.html




Re: [PULL 00/62] i386, misc changes for QEMU 8.0 soft freeze

2023-03-03 Thread David Woodhouse
On Fri, 2023-03-03 at 11:03 +, Peter Maydell wrote:
> 
> Applied, thanks.
> 
> Please update the changelog at https://wiki.qemu.org/ChangeLog/8.0
> for any user-visible changes.

Can I have a wiki account 'dwmw2' with which to do so, please?




[PATCH 1/2] tests/avocado: Add Fedora 34 distro, including kernel/initrd checksums

2023-03-03 Thread David Woodhouse
From: David Woodhouse 

The kernel in Fedora 31 doesn't support 'xen_no_vector_callback' on
its command line, so add a slightly newer version as a prelude to
enabling avocado tests for Xen guests.

Signed-off-by: David Woodhouse 
---
 tests/avocado/avocado_qemu/__init__.py | 27 ++
 1 file changed, 27 insertions(+)

diff --git a/tests/avocado/avocado_qemu/__init__.py 
b/tests/avocado/avocado_qemu/__init__.py
index a313e88c07..49e414e267 100644
--- a/tests/avocado/avocado_qemu/__init__.py
+++ b/tests/avocado/avocado_qemu/__init__.py
@@ -485,6 +485,23 @@ class LinuxDistro:
   ' console=tty0'),
  },
 },
+'34': {
+'x86_64':
+{'checksum': ('b9b621b26725ba95442d9a56cbaa0547'
+  '84e0779a9522ec6eafff07c6e6f717ea'),
+ 'pxeboot_url': ('https://archives.fedoraproject.org/'
+ 'pub/archive/fedora/linux/releases/34/'
+ 'Everything/x86_64/os/images/pxeboot/'),
+ 'kernel_hash': ('085fc6e47f2e3a271b591f3e56739ca9'
+ '4c16718837a5f431ab95468e1e95f9eb'),
+ 'initrd_hash': ('d6cd2e03e8188eed6c896fd65ff05f81'
+ '2c4c1c8777d630b5909e9a1a4627e337'),
+ 'kernel_params': ('root=UUID=386769a3-cfa5-47c8-8797-'
+   'd5ec58c9cb6c ro no_timer_check '
+   'net.ifnames=0 console=tty1 '
+   'console=ttyS0,115200n8'),
+},
+},
 }
 }
 
@@ -513,6 +530,16 @@ def pxeboot_url(self):
 """Gets the repository url where pxeboot files can be found"""
 return self._info.get('pxeboot_url', None)
 
+@property
+def kernel_hash(self):
+"""Gets checksum of the pxeboot kernel image"""
+return self._info.get('kernel_hash', None)
+
+@property
+def initrd_hash(self):
+"""Gets checksum of the pxeboot initrd image"""
+return self._info.get('initrd_hash', None)
+
 @property
 def default_kernel_params(self):
 """Gets the default kernel parameters"""
-- 
2.39.0




[PATCH 0/2] tests/avocado: Test Xen guest support under KVM

2023-03-03 Thread David Woodhouse
Add avocado tests to boot Xen guests in various interesting modes:
 • MSI delivered via PIRQ.
 • MSI delivered directly with vAPIC.
 • Event channel interrupt to I/O APIC.
 • Event channel interrupt to legacy PIC.

Using AHCI disk for now so this should work with the basic platform
support that's already in Paolo's pull request. After phase 2 of the
Xen support is merged, we can switch it to use xen-disk instead.

The warnings about not being able to validate the kernel and initrd
images made me sad, so I added hashes of those to the distro structure.

David Woodhouse (2):
  tests/avocado: Add Fedora 34 distro, including kernel/initrd checksums
  tests/avocado: Test Xen guest support under KVM

 tests/avocado/avocado_qemu/__init__.py |  27 
 tests/avocado/xen_guest.py | 113 +
 2 files changed, 140 insertions(+)





[PATCH 2/2] tests/avocado: Test Xen guest support under KVM

2023-03-03 Thread David Woodhouse
From: David Woodhouse 

Signed-off-by: David Woodhouse 
---
 tests/avocado/xen_guest.py | 113 +
 1 file changed, 113 insertions(+)
 create mode 100644 tests/avocado/xen_guest.py

diff --git a/tests/avocado/xen_guest.py b/tests/avocado/xen_guest.py
new file mode 100644
index 00..111ef3ca08
--- /dev/null
+++ b/tests/avocado/xen_guest.py
@@ -0,0 +1,113 @@
+# Xen guest functional tests
+#
+# Copyright © 2021 Red Hat, Inc.
+# Copyright © 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Author:
+#  David Woodhouse 
+#
+# This work is licensed under the terms of the GNU GPL, version 2 or
+# later.  See the COPYING file in the top-level directory.
+import os
+
+from avocado import skipIf
+from avocado_qemu import LinuxTest
+
+@skipIf(os.getenv('GITLAB_CI'), 'Running on GitLab')
+class XenGuest(LinuxTest):
+    """
+    :avocado: tags=arch:x86_64
+    :avocado: tags=distro:fedora
+    :avocado: tags=distro_version:34
+    :avocado: tags=machine:q35
+    :avocado: tags=accel:kvm
+    :avocado: tags=xen_guest
+    """
+
+    kernel_path = None
+    initrd_path = None
+    kernel_params = None
+
+    def set_up_boot(self):
+        path = self.download_boot()
+        self.vm.add_args('-drive', 'file=%s,if=none,id=drv0' % path)
+        if False: # Soon...
+            self.vm.add_args('-device', 'xen-disk,drive=drv0,vdev=xvda')
+        else:
+            self.vm.add_args('-device', 'ahci,id=ahci')
+            self.vm.add_args('-device', 'ide-hd,drive=drv0,bus=ahci.0')
+
+    def setUp(self):
+        super(XenGuest, self).setUp(None, 'virtio-net-pci')
+
+    def common_vm_setup(self, custom_kernel=None):
+        self.require_accelerator("kvm")
+        self.vm.add_args("-accel", "kvm,xen-version=0x4000a,kernel-irqchip=split")
+
+        if custom_kernel is None:
+            return
+
+        kernel_url = self.distro.pxeboot_url + 'vmlinuz'
+        initrd_url = self.distro.pxeboot_url + 'initrd.img'
+        self.kernel_path = self.fetch_asset(kernel_url, algorithm='sha256',
+                                            asset_hash=self.distro.kernel_hash)
+        self.initrd_path = self.fetch_asset(initrd_url, algorithm='sha256',
+                                            asset_hash=self.distro.initrd_hash)
+
+    def run_and_check(self):
+        if self.kernel_path:
+            self.vm.add_args('-kernel', self.kernel_path,
+                             '-append', self.kernel_params,
+                             '-initrd', self.initrd_path)
+        self.launch_and_wait()
+        self.ssh_command('cat /proc/cmdline')
+        self.ssh_command('dmesg | grep -e "Grant table initialized"')
+
+    def test_xen_guest(self):
+        """
+        :avocado: tags=xen_guest
+        """
+
+        self.common_vm_setup(True)
+
+        self.kernel_params = (self.distro.default_kernel_params +
+                              ' xen_emul_unplug=ide-disks')
+        self.run_and_check()
+        self.ssh_command('grep xen-pirq.*msi /proc/interrupts')
+
+    def test_xen_guest_vapic(self):
+        """
+        :avocado: tags=xen_guest_vapic
+        """
+
+        self.common_vm_setup(True)
+        self.vm.add_args('-cpu', 'host,+xen-vapic')
+        self.kernel_params = (self.distro.default_kernel_params +
+                              ' xen_emul_unplug=ide-disks')
+        self.run_and_check()
+        self.ssh_command('grep xen-pirq /proc/interrupts')
+        self.ssh_command('grep PCI-MSI /proc/interrupts')
+
+    def test_xen_guest_novector(self):
+        """
+        :avocado: tags=xen_guest_novector
+        """
+
+        self.common_vm_setup(True)
+        self.kernel_params = (self.distro.default_kernel_params +
+                              ' xen_emul_unplug=ide-disks' +
+                              ' xen_no_vector_callback')
+        self.run_and_check()
+        self.ssh_command('grep xen-platform-pci /proc/interrupts')
+
+    def test_xen_guest_novector_noapic(self):
+        """
+        :avocado: tags=xen_guest_novector_noapic
+        """
+
+        self.common_vm_setup(True)
+        self.kernel_params = (self.distro.default_kernel_params +
+                              ' xen_emul_unplug=ide-disks' +
+                              ' xen_no_vector_callback noapic')
+        self.run_and_check()
+        self.ssh_command('grep xen-platform-pci /proc/interrupts')
-- 
2.39.0




Re: [PATCH v3 00/18] hw/ide: Untangle ISA/PCI abuses of ide_init_ioport()

2023-03-02 Thread David Woodhouse



On 2 March 2023 22:40:40 GMT, "Philippe Mathieu-Daudé"  
wrote:
>Since v2: rebased
>
>I'm posting this series as it to not block Bernhard's PIIX
>cleanup work. I don't have code change planned, but eventually
>reword / improve commit descriptions.
>
>Tested commit after commit to be sure it is bisectable. Sadly
>this was before Zoltan & Thomas report a problem with commit
>bb98e0f59c ("hw/isa/vt82c686: Remove intermediate IRQ forwarder").

However much I stare at the partial revert which fixes it, I just cannot 
believe that the change could make any difference at all. There's got to be 
something weird going on there.

I was going to ask if the level mode for the PIT made any difference, but this 
is the output IRQ from the PIT to the CPU itself so I don't see how it would.

I would like to see a report with tracing from pic_update_irq, the CPU interrupt 
"handler" and the intermediate IRQ handler, both with the intermediate present 
and without it, so the two can be compared.



[RFC PATCH v1 14/25] hw/xen: Move xenstore_store_pv_console_info to xen_console.c

2023-03-02 Thread David Woodhouse
From: David Woodhouse 

There's no need for this to be in the Xen accel code, and as we want to
use the Xen console support with KVM-emulated Xen we'll want to have a
platform-agnostic version of it. Make it use GString to build up the
path while we're at it.

Signed-off-by: David Woodhouse 
---
 accel/xen/xen-all.c   | 61 ---
 hw/char/xen_console.c | 45 +--
 include/hw/xen/xen.h  |  2 --
 3 files changed, 43 insertions(+), 65 deletions(-)

diff --git a/accel/xen/xen-all.c b/accel/xen/xen-all.c
index 425216230f..2d51c41e40 100644
--- a/accel/xen/xen-all.c
+++ b/accel/xen/xen-all.c
@@ -29,67 +29,6 @@ xc_interface *xen_xc;
 xenforeignmemory_handle *xen_fmem;
 xendevicemodel_handle *xen_dmod;
 
-static int store_dev_info(int domid, Chardev *cs, const char *string)
-{
-struct xs_handle *xs = NULL;
-char *path = NULL;
-char *newpath = NULL;
-char *pts = NULL;
-int ret = -1;
-
-/* Only continue if we're talking to a pty. */
-if (!CHARDEV_IS_PTY(cs)) {
-return 0;
-}
-pts = cs->filename + 4;
-
-/* We now have everything we need to set the xenstore entry. */
-xs = xs_open(0);
-if (xs == NULL) {
-fprintf(stderr, "Could not contact XenStore\n");
-goto out;
-}
-
-path = xs_get_domain_path(xs, domid);
-if (path == NULL) {
-fprintf(stderr, "xs_get_domain_path() error\n");
-goto out;
-}
-newpath = realloc(path, (strlen(path) + strlen(string) +
-strlen("/tty") + 1));
-if (newpath == NULL) {
-fprintf(stderr, "realloc error\n");
-goto out;
-}
-path = newpath;
-
-strcat(path, string);
-strcat(path, "/tty");
-if (!xs_write(xs, XBT_NULL, path, pts, strlen(pts))) {
-fprintf(stderr, "xs_write for '%s' fail", string);
-goto out;
-}
-ret = 0;
-
-out:
-free(path);
-xs_close(xs);
-
-return ret;
-}
-
-void xenstore_store_pv_console_info(int i, Chardev *chr)
-{
-if (i == 0) {
-store_dev_info(xen_domid, chr, "/console");
-} else {
-char buf[32];
-snprintf(buf, sizeof(buf), "/device/console/%d", i);
-store_dev_info(xen_domid, chr, buf);
-}
-}
-
-
 static void xenstore_record_dm_state(const char *state)
 {
 struct xs_handle *xs;
diff --git a/hw/char/xen_console.c b/hw/char/xen_console.c
index ad8638a86d..c7a19c0e7c 100644
--- a/hw/char/xen_console.c
+++ b/hw/char/xen_console.c
@@ -173,6 +173,48 @@ static void xencons_send(struct XenConsole *con)
 
 /*  */
 
+static int store_con_info(struct XenConsole *con)
+{
+Chardev *cs = qemu_chr_fe_get_driver(&con->chr);
+char *pts = NULL;
+char *dom_path;
+GString *path;
+int ret = -1;
+
+/* Only continue if we're talking to a pty. */
+if (!CHARDEV_IS_PTY(cs)) {
+return 0;
+}
+pts = cs->filename + 4;
+
+dom_path = qemu_xen_xs_get_domain_path(xenstore, xen_domid);
+if (!dom_path) {
+return 0;
+}
+
+path = g_string_new(dom_path);
+free(dom_path);
+
+if (con->xendev.dev) {
+g_string_append_printf(path, "/device/console/%d", con->xendev.dev);
+} else {
+g_string_append(path, "/console");
+}
+g_string_append(path, "/tty");
+
+if (xenstore_write_str(con->console, path->str, pts)) {
+fprintf(stderr, "xenstore_write_str for '%s' fail", path->str);
+goto out;
+}
+ret = 0;
+
+out:
+g_string_free(path, true);
+free(path);
+
+return ret;
+}
+
 static int con_init(struct XenLegacyDevice *xendev)
 {
 struct XenConsole *con = container_of(xendev, struct XenConsole, xendev);
@@ -215,8 +257,7 @@ static int con_init(struct XenLegacyDevice *xendev)
  &error_abort);
 }
 
-xenstore_store_pv_console_info(con->xendev.dev,
-   qemu_chr_fe_get_driver(&con->chr));
+store_con_info(con);
 
 out:
 g_free(type);
diff --git a/include/hw/xen/xen.h b/include/hw/xen/xen.h
index 03983939f9..56b1c2a827 100644
--- a/include/hw/xen/xen.h
+++ b/include/hw/xen/xen.h
@@ -39,8 +39,6 @@ int xen_is_pirq_msi(uint32_t msi_data);
 
 qemu_irq *xen_interrupt_controller_init(void);
 
-void xenstore_store_pv_console_info(int i, Chardev *chr);
-
 void xen_register_framebuffer(struct MemoryRegion *mr);
 
 #endif /* QEMU_HW_XEN_H */
-- 
2.39.0




[RFC PATCH v1 25/25] i386/xen: Initialize Xen backends from pc_basic_device_init() for emulation

2023-03-02 Thread David Woodhouse
From: David Woodhouse 

Now that all the work is done to enable the PV backends to work without
actual Xen, instantiate the bus from pc_basic_device_init() for emulated
mode.

This allows us finally to launch an emulated Xen guest with PV disk.

   qemu-system-x86_64 -serial mon:stdio -M q35 -cpu host -display none \
 -m 1G -smp 2 -accel kvm,xen-version=0x4000a,kernel-irqchip=split \
 -kernel bzImage -append "console=ttyS0 root=/dev/xvda1" \
 -drive file=/var/lib/libvirt/images/fedora28.qcow2,if=none,id=disk \
 -device xen-disk,drive=disk,vdev=xvda

If we use -M pc instead of q35, we can even add an IDE disk and boot a
guest image normally through grub. But q35 gives us AHCI and that isn't
unplugged by the Xen magic, so the guest ends up seeing "both" disks.

Signed-off-by: David Woodhouse 
---
 hw/i386/pc.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index fd17ce7a94..3fe028c86c 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -102,6 +102,11 @@
 #include "trace.h"
 #include CONFIG_DEVICES
 
+#ifdef CONFIG_XEN_EMU
+#include "hw/xen/xen-legacy-backend.h"
+#include "hw/xen/xen-bus.h"
+#endif
+
 /*
  * Helper for setting model-id for CPU models that changed model-id
  * depending on QEMU versions up to QEMU 2.4.
@@ -1318,6 +1323,8 @@ void pc_basic_device_init(struct PCMachineState *pcms,
 if (pcms->bus) {
 pci_create_simple(pcms->bus, -1, "xen-platform");
 }
+xen_bus_init();
+xen_be_init();
 }
 #endif
 
-- 
2.39.0




[RFC PATCH v1 23/25] hw/xen: Map guest XENSTORE_PFN grant in emulated Xenstore

2023-03-02 Thread David Woodhouse
From: David Woodhouse 

Signed-off-by: David Woodhouse 
---
 hw/i386/kvm/xen_xenstore.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/hw/i386/kvm/xen_xenstore.c b/hw/i386/kvm/xen_xenstore.c
index 028f80499e..f9b7387024 100644
--- a/hw/i386/kvm/xen_xenstore.c
+++ b/hw/i386/kvm/xen_xenstore.c
@@ -21,6 +21,7 @@
 
 #include "hw/sysbus.h"
 #include "hw/xen/xen.h"
+#include "hw/xen/xen_backend_ops.h"
 #include "xen_overlay.h"
 #include "xen_evtchn.h"
 #include "xen_xenstore.h"
@@ -34,6 +35,7 @@
 
 #include "hw/xen/interface/io/xs_wire.h"
 #include "hw/xen/interface/event_channel.h"
+#include "hw/xen/interface/grant_table.h"
 
 #define TYPE_XEN_XENSTORE "xen-xenstore"
 OBJECT_DECLARE_SIMPLE_TYPE(XenXenstoreState, XEN_XENSTORE)
@@ -66,6 +68,9 @@ struct XenXenstoreState {
 
 uint8_t *impl_state;
 uint32_t impl_state_size;
+
+struct xengntdev_handle *gt;
+void *granted_xs;
 };
 
 struct XenXenstoreState *xen_xenstore_singleton;
@@ -1452,6 +1457,17 @@ int xen_xenstore_reset(void)
 }
 s->be_port = err;
 
+/*
+ * We don't actually access the guest's page through the grant, because
+ * this isn't real Xen, and we can just use the page we gave it in the
+ * first place. Map the grant anyway, mostly for cosmetic purposes so
+ * it *looks* like it's in use in the guest-visible grant table.
+ */
+s->gt = qemu_xen_gnttab_open();
+uint32_t xs_gntref = GNTTAB_RESERVED_XENSTORE;
+s->granted_xs = qemu_xen_gnttab_map_refs(s->gt, 1, xen_domid, &xs_gntref,
+ PROT_READ | PROT_WRITE);
+
 return 0;
 }
 
-- 
2.39.0




[RFC PATCH v1 21/25] hw/xen: Add emulated implementation of grant table operations

2023-03-02 Thread David Woodhouse
From: David Woodhouse 

This is limited to mapping a single grant at a time, because under Xen the
pages are mapped *contiguously* into qemu's address space, and that's very
hard to do when those pages actually come from anonymous mappings in qemu
in the first place.

Eventually perhaps we can look at using shared mappings of actual objects
for system RAM, and then we can make new mappings of the same backing
store (be it deleted files, shmem, whatever). But for now let's stick to
a page at a time.
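
As a rough illustration (not part of this patch), with the emulated gnttab a
PV backend therefore maps and unmaps exactly one ring page at a time through
the redirected helpers, e.g. via the legacy-backend wrappers used elsewhere
in this series; ring_ref here stands for whatever ref the frontend advertised:

    void *ring = xen_be_map_grant_ref(xendev, ring_ref,
                                      PROT_READ | PROT_WRITE);
    if (!ring) {
        return -1;
    }
    /* ... use the single shared page ... */
    xen_be_unmap_grant_ref(xendev, ring, ring_ref);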

Signed-off-by: David Woodhouse 
---
 hw/i386/kvm/xen_gnttab.c | 299 ++-
 1 file changed, 296 insertions(+), 3 deletions(-)

diff --git a/hw/i386/kvm/xen_gnttab.c b/hw/i386/kvm/xen_gnttab.c
index 1e691ded32..2bf91d36c0 100644
--- a/hw/i386/kvm/xen_gnttab.c
+++ b/hw/i386/kvm/xen_gnttab.c
@@ -22,6 +22,7 @@
 
 #include "hw/sysbus.h"
 #include "hw/xen/xen.h"
+#include "hw/xen/xen_backend_ops.h"
 #include "xen_overlay.h"
 #include "xen_gnttab.h"
 
@@ -34,11 +35,10 @@
 #define TYPE_XEN_GNTTAB "xen-gnttab"
 OBJECT_DECLARE_SIMPLE_TYPE(XenGnttabState, XEN_GNTTAB)
 
-#define XEN_PAGE_SHIFT 12
-#define XEN_PAGE_SIZE (1ULL << XEN_PAGE_SHIFT)
-
 #define ENTRIES_PER_FRAME_V1 (XEN_PAGE_SIZE / sizeof(grant_entry_v1_t))
 
+static struct gnttab_backend_ops emu_gnttab_backend_ops;
+
 struct XenGnttabState {
 /*< private >*/
 SysBusDevice busdev;
@@ -57,6 +57,8 @@ struct XenGnttabState {
 MemoryRegion gnt_frames;
 MemoryRegion *gnt_aliases;
 uint64_t *gnt_frame_gpas;
+
+uint8_t *map_track;
 };
 
 struct XenGnttabState *xen_gnttab_singleton;
@@ -88,9 +90,15 @@ static void xen_gnttab_realize(DeviceState *dev, Error 
**errp)
 s->gnt_frame_gpas[i] = INVALID_GPA;
 }
 
+s->entries.v1[GNTTAB_RESERVED_XENSTORE].flags = GTF_permit_access;
+s->entries.v1[GNTTAB_RESERVED_XENSTORE].frame = XEN_SPECIAL_PFN(XENSTORE);
 qemu_mutex_init(&s->gnt_lock);
 
 xen_gnttab_singleton = s;
+
+s->map_track = g_new0(uint8_t, s->max_frames * ENTRIES_PER_FRAME_V1);
+
+xen_gnttab_ops = &emu_gnttab_backend_ops;
 }
 
 static int xen_gnttab_post_load(void *opaque, int version_id)
@@ -230,3 +238,288 @@ int xen_gnttab_query_size_op(struct gnttab_query_size 
*size)
 size->max_nr_frames = s->max_frames;
 return 0;
 }
+
+/* Track per-open refs, to allow close() to clean up. */
+struct active_ref {
+MemoryRegionSection mrs;
+void *virtaddr;
+uint32_t refcnt;
+int prot;
+};
+
+static void gnt_unref(XenGnttabState *s, grant_ref_t ref,
+  MemoryRegionSection *mrs, int prot)
+{
+if (mrs && mrs->mr) {
+if (prot & PROT_WRITE) {
+memory_region_set_dirty(mrs->mr, mrs->offset_within_region,
+XEN_PAGE_SIZE);
+}
+memory_region_unref(mrs->mr);
+mrs->mr = NULL;
+}
+assert(s->map_track[ref] != 0);
+
+if (--s->map_track[ref] == 0) {
+grant_entry_v1_t *gnt_p = &s->entries.v1[ref];
+qatomic_and(&gnt_p->flags, (uint16_t)~(GTF_reading | GTF_writing));
+}
+}
+
+static uint64_t gnt_ref(XenGnttabState *s, grant_ref_t ref, int prot)
+{
+uint16_t mask = GTF_type_mask | GTF_sub_page;
+grant_entry_v1_t gnt, *gnt_p;
+int retries = 0;
+
+if (ref >= s->max_frames * ENTRIES_PER_FRAME_V1 ||
+s->map_track[ref] == UINT8_MAX) {
+return INVALID_GPA;
+}
+
+if (prot & PROT_WRITE) {
+mask |= GTF_readonly;
+}
+
+gnt_p = &s->entries.v1[ref];
+
+/*
+ * The guest can legitimately be changing the GTF_readonly flag. Allow
+ * that, but don't let a malicious guest cause a livelock.
+ */
+for (retries = 0; retries < 5; retries++) {
+uint16_t new_flags;
+
+/* Read the entry before an atomic operation on its flags */
+gnt = *(volatile grant_entry_v1_t *)gnt_p;
+
+if ((gnt.flags & mask) != GTF_permit_access ||
+gnt.domid != DOMID_QEMU) {
+return INVALID_GPA;
+}
+
+new_flags = gnt.flags | GTF_reading;
+if (prot & PROT_WRITE) {
+new_flags |= GTF_writing;
+}
+
+if (qatomic_cmpxchg(&gnt_p->flags, gnt.flags, new_flags) == gnt.flags) 
{
+return (uint64_t)gnt.frame << XEN_PAGE_SHIFT;
+}
+}
+
+return INVALID_GPA;
+}
+
+struct xengntdev_handle {
+GHashTable *active_maps;
+};
+
+static int xen_be_gnttab_set_max_grants(struct xengntdev_handle *xgt,
+uint32_t nr_grants)
+{
+return 0;
+}
+
+static void *xen_be_gnttab_map_refs(struct xengntdev_handle *xgt,
+uint32_t count, uint32_t domid,
+uint32_t *refs, int prot)
+{
+XenGnt

[RFC PATCH v1 16/25] hw/xen: Rename xen_common.h to xen_native.h

2023-03-02 Thread David Woodhouse
From: David Woodhouse 

This header is now only for native Xen code, not PV backends that may be
used in Xen emulation. Since the toolstack libraries may depend on the
specific version of Xen headers that they pull in (and will set the
__XEN_TOOLS__ macro to enable internal definitions that they depend on),
the rule is that xen_native.h (and thus the toolstack library headers)
must be included *before* any of the headers in include/hw/xen/interface.
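
A minimal sketch of the resulting include order for a file that needs both
(file names purely illustrative):

    /*
     * xen_native.h first: it pulls in the toolstack library headers, which
     * set __XEN_TOOLS__ and bring their own version of the Xen ABI headers.
     */
    #include "hw/xen/xen_native.h"

    /* Only after that may the plain ABI headers follow. */
    #include "hw/xen/interface/io/ring.h"
    #include "hw/xen/interface/grant_table.h"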

Signed-off-by: David Woodhouse 
---
 accel/xen/xen-all.c   |  1 +
 hw/9pfs/xen-9p-backend.c  |  1 +
 hw/block/dataplane/xen-block.c|  3 ++-
 hw/block/xen-block.c  |  1 -
 hw/i386/pc_piix.c |  4 ++--
 hw/i386/xen/xen-hvm.c | 11 +-
 hw/i386/xen/xen-mapcache.c|  2 +-
 hw/i386/xen/xen_platform.c|  7 +++---
 hw/xen/trace-events   |  2 +-
 hw/xen/xen-operations.c   |  2 +-
 hw/xen/xen_pt.c   |  2 +-
 hw/xen/xen_pt.h   |  2 +-
 hw/xen/xen_pt_config_init.c   |  2 +-
 hw/xen/xen_pt_msi.c   |  4 ++--
 include/hw/xen/xen.h  | 22 ---
 include/hw/xen/{xen_common.h => xen_native.h} | 10 ++---
 include/hw/xen/xen_pvdev.h|  3 ++-
 17 files changed, 47 insertions(+), 32 deletions(-)
 rename include/hw/xen/{xen_common.h => xen_native.h} (98%)

diff --git a/accel/xen/xen-all.c b/accel/xen/xen-all.c
index 2d51c41e40..00221e23c5 100644
--- a/accel/xen/xen-all.c
+++ b/accel/xen/xen-all.c
@@ -12,6 +12,7 @@
 #include "qemu/error-report.h"
 #include "qemu/module.h"
 #include "qapi/error.h"
+#include "hw/xen/xen_native.h"
 #include "hw/xen/xen-legacy-backend.h"
 #include "hw/xen/xen_pt.h"
 #include "chardev/char.h"
diff --git a/hw/9pfs/xen-9p-backend.c b/hw/9pfs/xen-9p-backend.c
index d8bb0e847c..74f3a05f88 100644
--- a/hw/9pfs/xen-9p-backend.c
+++ b/hw/9pfs/xen-9p-backend.c
@@ -22,6 +22,7 @@
 #include "qemu/config-file.h"
 #include "qemu/main-loop.h"
 #include "qemu/option.h"
+#include "qemu/iov.h"
 #include "fsdev/qemu-fsdev.h"
 
 #define VERSIONS "1"
diff --git a/hw/block/dataplane/xen-block.c b/hw/block/dataplane/xen-block.c
index 8322a1de82..734da42ea7 100644
--- a/hw/block/dataplane/xen-block.c
+++ b/hw/block/dataplane/xen-block.c
@@ -23,8 +23,9 @@
 #include "qemu/main-loop.h"
 #include "qemu/memalign.h"
 #include "qapi/error.h"
-#include "hw/xen/xen_common.h"
+#include "hw/xen/xen.h"
 #include "hw/block/xen_blkif.h"
+#include "hw/xen/interface/io/ring.h"
 #include "sysemu/block-backend.h"
 #include "sysemu/iothread.h"
 #include "xen-block.h"
diff --git a/hw/block/xen-block.c b/hw/block/xen-block.c
index 345b284d70..87299615e3 100644
--- a/hw/block/xen-block.c
+++ b/hw/block/xen-block.c
@@ -19,7 +19,6 @@
 #include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qstring.h"
 #include "qom/object_interfaces.h"
-#include "hw/xen/xen_common.h"
 #include "hw/block/xen_blkif.h"
 #include "hw/qdev-properties.h"
 #include "hw/xen/xen-block.h"
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 126b6c11df..e0f768b664 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -47,8 +47,6 @@
 #include "hw/kvm/clock.h"
 #include "hw/sysbus.h"
 #include "hw/i2c/smbus_eeprom.h"
-#include "hw/xen/xen-x86.h"
-#include "hw/xen/xen.h"
 #include "exec/memory.h"
 #include "hw/acpi/acpi.h"
 #include "hw/acpi/piix4.h"
@@ -60,6 +58,8 @@
 #include 
 #include "hw/xen/xen_pt.h"
 #endif
+#include "hw/xen/xen-x86.h"
+#include "hw/xen/xen.h"
 #include "migration/global_state.h"
 #include "migration/misc.h"
 #include "sysemu/numa.h"
diff --git a/hw/i386/xen/xen-hvm.c b/hw/i386/xen/xen-hvm.c
index cb1d24f592..56641a550e 100644
--- a/hw/i386/xen/xen-hvm.c
+++ b/hw/i386/xen/xen-hvm.c
@@ -18,7 +18,7 @@
 #include "hw/irq.h"
 #include "hw/hw.h"
 #include "hw/i386/apic-msidef.h"
-#include "hw/xen/xen_common.h"
+#include "hw/xen/xen_native.h"
 #include "hw/xen/xen-legacy-backend.h"
 #include "hw/xen/xen-bus.h"
 #include "hw/xen/xen-x86.h"
@@ -52,10 +52,11 @@ static bool xen_in_migration;
 
 /* Compatibility with older version */
 
-/* This allows QEMU to build on a system that has Xen 4.5 or earlier
- * installed.  This here (not in hw/xen/xen_common.h) because xen/hvm/ioreq.h
- * needs to be included before this block and hw

[RFC PATCH v1 06/25] hw/xen: Implement XenStore permissions

2023-03-02 Thread David Woodhouse
From: Paul Durrant 

Store perms as a GList of strings, check permissions.
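
As a small sketch of the encoding used here (the diff below has the real call
sites), each permission is a letter plus a domid, with the first entry
conventionally naming the node's owner; for example, a node owned by dom0
that domid 1 may read might carry:

    GList *perms = NULL;

    perms = g_list_append(perms, xs_perm_as_string(XS_PERM_NONE, 0)); /* "n0" */
    perms = g_list_append(perms, xs_perm_as_string(XS_PERM_READ, 1)); /* "r1" */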

Signed-off-by: Paul Durrant 
Signed-off-by: David Woodhouse 
---
 hw/i386/kvm/xen_xenstore.c  |   2 +-
 hw/i386/kvm/xenstore_impl.c | 259 +---
 hw/i386/kvm/xenstore_impl.h |   8 +-
 tests/unit/test-xs-node.c   |  27 +++-
 4 files changed, 275 insertions(+), 21 deletions(-)

diff --git a/hw/i386/kvm/xen_xenstore.c b/hw/i386/kvm/xen_xenstore.c
index 64d8f1a38f..3b409e3817 100644
--- a/hw/i386/kvm/xen_xenstore.c
+++ b/hw/i386/kvm/xen_xenstore.c
@@ -98,7 +98,7 @@ static void xen_xenstore_realize(DeviceState *dev, Error 
**errp)
 aio_set_fd_handler(qemu_get_aio_context(), xen_be_evtchn_fd(s->eh), true,
xen_xenstore_event, NULL, NULL, NULL, s);
 
-s->impl = xs_impl_create();
+s->impl = xs_impl_create(xen_domid);
 }
 
 static bool xen_xenstore_is_needed(void *opaque)
diff --git a/hw/i386/kvm/xenstore_impl.c b/hw/i386/kvm/xenstore_impl.c
index 380f8003ec..7988bde88f 100644
--- a/hw/i386/kvm/xenstore_impl.c
+++ b/hw/i386/kvm/xenstore_impl.c
@@ -12,6 +12,8 @@
 #include "qemu/osdep.h"
 #include "qom/object.h"
 
+#include "hw/xen/xen.h"
+
 #include "xen_xenstore.h"
 #include "xenstore_impl.h"
 
@@ -30,6 +32,7 @@
 typedef struct XsNode {
 uint32_t ref;
 GByteArray *content;
+GList *perms;
 GHashTable *children;
 uint64_t gencnt;
 bool deleted_in_tx;
@@ -133,6 +136,9 @@ static inline void xs_node_unref(XsNode *n)
 if (n->content) {
 g_byte_array_unref(n->content);
 }
+if (n->perms) {
+g_list_free_full(n->perms, g_free);
+}
 if (n->children) {
 g_hash_table_unref(n->children);
 }
@@ -144,8 +150,51 @@ static inline void xs_node_unref(XsNode *n)
 g_free(n);
 }
 
+char *xs_perm_as_string(unsigned int perm, unsigned int domid)
+{
+char letter;
+
+switch (perm) {
+case XS_PERM_READ | XS_PERM_WRITE:
+letter = 'b';
+break;
+case XS_PERM_READ:
+letter = 'r';
+break;
+case XS_PERM_WRITE:
+letter = 'w';
+break;
+case XS_PERM_NONE:
+default:
+letter = 'n';
+break;
+}
+
+return g_strdup_printf("%c%u", letter, domid);
+}
+
+static gpointer do_perm_copy(gconstpointer src, gpointer user_data)
+{
+return g_strdup(src);
+}
+
+static XsNode *xs_node_create(const char *name, GList *perms)
+{
+XsNode *n = xs_node_new();
+
+#ifdef XS_NODE_UNIT_TEST
+if (name) {
+n->name = g_strdup(name);
+}
+#endif
+
+n->perms = g_list_copy_deep(perms, do_perm_copy, NULL);
+
+return n;
+}
+
 /* For copying from one hash table to another using g_hash_table_foreach() */
-static void do_insert(gpointer key, gpointer value, gpointer user_data)
+static void do_child_insert(gpointer key, gpointer value, gpointer user_data)
 {
 g_hash_table_insert(user_data, g_strdup(key), xs_node_ref(value));
 }
@@ -162,12 +211,16 @@ static XsNode *xs_node_copy(XsNode *old)
 }
 #endif
 
+assert(old);
 if (old->children) {
 n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
 (GDestroyNotify)xs_node_unref);
-g_hash_table_foreach(old->children, do_insert, n->children);
+g_hash_table_foreach(old->children, do_child_insert, n->children);
 }
-if (old && old->content) {
+if (old->perms) {
+n->perms = g_list_copy_deep(old->perms, do_perm_copy, NULL);
+}
+if (old->content) {
 n->content = g_byte_array_ref(old->content);
 }
 return n;
@@ -383,6 +436,9 @@ static XsNode *xs_node_copy_deleted(XsNode *old, struct 
walk_op *op)
 op->op_opaque2 = n->children;
 g_hash_table_foreach(old->children, copy_deleted_recurse, op);
 }
+if (old->perms) {
+n->perms = g_list_copy_deep(old->perms, do_perm_copy, NULL);
+}
 n->deleted_in_tx = true;
 /* If it gets resurrected we only fire a watch if it lost its content */
 if (old->content) {
@@ -417,6 +473,104 @@ static int xs_node_rm(XsNode **n, struct walk_op *op)
 return 0;
 }
 
+static int xs_node_get_perms(XsNode **n, struct walk_op *op)
+{
+GList **perms = op->op_opaque;
+
+assert(op->inplace);
+assert(*n);
+
+*perms = g_list_copy_deep((*n)->perms, do_perm_copy, NULL);
+return 0;
+}
+
+static void parse_perm(const char *perm, char *letter, unsigned int *dom_id)
+{
+unsigned int n = sscanf(perm, "%c%u", letter, dom_id);
+
+assert(n == 2);
+}
+
+static bool can_access(unsigned int dom_id, GList *perms, const char *letters)
+{
+unsigned int i, n;
+char perm_letter;
+unsigned int perm_dom_id;
+bool access;
+
+if (dom_id == 0) {
+return true;

[RFC PATCH v1 09/25] hw/xen: Add evtchn operations to allow redirection to internal emulation

2023-03-02 Thread David Woodhouse
From: David Woodhouse 

The existing implementation calling into the real libxenevtchn moves to
a new file hw/xen/xen-operations.c, and is called via a function table
which in a subsequent commit will also be able to invoke the emulated
event channel support.
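
As a rough sketch of what a backend sees after the change (domid, remote_port,
the fd callback and its opaque pointer are all placeholders, and error
handling is elided):

    struct xenevtchn_handle *eh = qemu_xen_evtchn_open();
    int local_port = qemu_xen_evtchn_bind_interdomain(eh, domid, remote_port);

    qemu_set_fd_handler(qemu_xen_evtchn_fd(eh), event_handler, NULL, opaque);
    qemu_xen_evtchn_notify(eh, local_port);    /* kick the other end */
    /* ... */
    qemu_xen_evtchn_unbind(eh, local_port);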

Signed-off-by: David Woodhouse 
Signed-off-by: Paul Durrant 
---
 hw/9pfs/xen-9p-backend.c|  24 +++---
 hw/i386/xen/xen-hvm.c   |  27 ---
 hw/xen/meson.build  |   1 +
 hw/xen/xen-bus.c|  22 +++---
 hw/xen/xen-legacy-backend.c |   8 +-
 hw/xen/xen-operations.c |  71 +
 hw/xen/xen_pvdev.c  |  12 +--
 include/hw/xen/xen-bus.h|   1 +
 include/hw/xen/xen-legacy-backend.h |   1 +
 include/hw/xen/xen_backend_ops.h| 118 
 include/hw/xen/xen_common.h |  12 ---
 include/hw/xen/xen_pvdev.h  |   1 +
 softmmu/globals.c   |   1 +
 13 files changed, 242 insertions(+), 57 deletions(-)
 create mode 100644 hw/xen/xen-operations.c
 create mode 100644 include/hw/xen/xen_backend_ops.h

diff --git a/hw/9pfs/xen-9p-backend.c b/hw/9pfs/xen-9p-backend.c
index 65c4979c3c..864bdaf952 100644
--- a/hw/9pfs/xen-9p-backend.c
+++ b/hw/9pfs/xen-9p-backend.c
@@ -241,7 +241,7 @@ static void xen_9pfs_push_and_notify(V9fsPDU *pdu)
 xen_wmb();
 
 ring->inprogress = false;
-xenevtchn_notify(ring->evtchndev, ring->local_port);
+qemu_xen_evtchn_notify(ring->evtchndev, ring->local_port);
 
 qemu_bh_schedule(ring->bh);
 }
@@ -324,8 +324,8 @@ static void xen_9pfs_evtchn_event(void *opaque)
 Xen9pfsRing *ring = opaque;
 evtchn_port_t port;
 
-port = xenevtchn_pending(ring->evtchndev);
-xenevtchn_unmask(ring->evtchndev, port);
+port = qemu_xen_evtchn_pending(ring->evtchndev);
+qemu_xen_evtchn_unmask(ring->evtchndev, port);
 
 qemu_bh_schedule(ring->bh);
 }
@@ -337,10 +337,10 @@ static void xen_9pfs_disconnect(struct XenLegacyDevice 
*xendev)
 
 for (i = 0; i < xen_9pdev->num_rings; i++) {
 if (xen_9pdev->rings[i].evtchndev != NULL) {
-qemu_set_fd_handler(xenevtchn_fd(xen_9pdev->rings[i].evtchndev),
-NULL, NULL, NULL);
-xenevtchn_unbind(xen_9pdev->rings[i].evtchndev,
- xen_9pdev->rings[i].local_port);
+
qemu_set_fd_handler(qemu_xen_evtchn_fd(xen_9pdev->rings[i].evtchndev),
+NULL, NULL, NULL);
+qemu_xen_evtchn_unbind(xen_9pdev->rings[i].evtchndev,
+   xen_9pdev->rings[i].local_port);
 xen_9pdev->rings[i].evtchndev = NULL;
 }
 }
@@ -447,12 +447,12 @@ static int xen_9pfs_connect(struct XenLegacyDevice 
*xendev)
 xen_9pdev->rings[i].inprogress = false;
 
 
-xen_9pdev->rings[i].evtchndev = xenevtchn_open(NULL, 0);
+xen_9pdev->rings[i].evtchndev = qemu_xen_evtchn_open();
 if (xen_9pdev->rings[i].evtchndev == NULL) {
 goto out;
 }
-qemu_set_cloexec(xenevtchn_fd(xen_9pdev->rings[i].evtchndev));
-xen_9pdev->rings[i].local_port = xenevtchn_bind_interdomain
+qemu_set_cloexec(qemu_xen_evtchn_fd(xen_9pdev->rings[i].evtchndev));
+xen_9pdev->rings[i].local_port = qemu_xen_evtchn_bind_interdomain
 (xen_9pdev->rings[i].evtchndev,
  xendev->dom,
  xen_9pdev->rings[i].evtchn);
@@ -463,8 +463,8 @@ static int xen_9pfs_connect(struct XenLegacyDevice *xendev)
 goto out;
 }
 xen_pv_printf(xendev, 2, "bind evtchn port %d\n", xendev->local_port);
-qemu_set_fd_handler(xenevtchn_fd(xen_9pdev->rings[i].evtchndev),
-xen_9pfs_evtchn_event, NULL, &xen_9pdev->rings[i]);
+qemu_set_fd_handler(qemu_xen_evtchn_fd(xen_9pdev->rings[i].evtchndev),
+xen_9pfs_evtchn_event, NULL, &xen_9pdev->rings[i]);
 }
 
 xen_9pdev->security_model = xenstore_read_be_str(xendev, "security_model");
diff --git a/hw/i386/xen/xen-hvm.c b/hw/i386/xen/xen-hvm.c
index e5a1dd19f4..cb1d24f592 100644
--- a/hw/i386/xen/xen-hvm.c
+++ b/hw/i386/xen/xen-hvm.c
@@ -761,7 +761,7 @@ static ioreq_t *cpu_get_ioreq(XenIOState *state)
 int i;
 evtchn_port_t port;
 
-port = xenevtchn_pending(state->xce_handle);
+port = qemu_xen_evtchn_pending(state->xce_handle);
 if (port == state->bufioreq_local_port) {
 timer_mod(state->buffered_io_timer,
 BUFFER_IO_MAX_DELAY + qemu_clock_get_ms(QEMU_CLOCK_REALTIME));
@@ -780,7 +780,7 @@ static ioreq_t *cpu_get_ioreq(XenIOState *state)
 }
 
 /* unmask the wanted port again */
- 

[RFC PATCH v1 24/25] hw/xen: Implement soft reset for emulated gnttab

2023-03-02 Thread David Woodhouse
From: David Woodhouse 

This is only part of it; we will also need to get the PV back end drivers
to tear down their own mappings (or do it for them, but they kind of need
to stop using the pointers too).

Some more work on the actual PV back ends and xen-bus code is going to be
needed to really make soft reset and migration fully functional, and this
part is the basis for that.

Signed-off-by: David Woodhouse 
---
 hw/i386/kvm/xen_gnttab.c  | 26 --
 hw/i386/kvm/xen_gnttab.h  |  1 +
 target/i386/kvm/xen-emu.c |  5 +
 3 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/hw/i386/kvm/xen_gnttab.c b/hw/i386/kvm/xen_gnttab.c
index 2bf91d36c0..21c30e3659 100644
--- a/hw/i386/kvm/xen_gnttab.c
+++ b/hw/i386/kvm/xen_gnttab.c
@@ -72,13 +72,11 @@ static void xen_gnttab_realize(DeviceState *dev, Error 
**errp)
 error_setg(errp, "Xen grant table support is for Xen emulation");
 return;
 }
-s->nr_frames = 0;
 s->max_frames = kvm_xen_get_gnttab_max_frames();
 memory_region_init_ram(&s->gnt_frames, OBJECT(dev), "xen:grant_table",
XEN_PAGE_SIZE * s->max_frames, &error_abort);
 memory_region_set_enabled(&s->gnt_frames, true);
 s->entries.v1 = memory_region_get_ram_ptr(&s->gnt_frames);
-memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames);
 
 /* Create individual page-sizes aliases for overlays */
 s->gnt_aliases = (void *)g_new0(MemoryRegion, s->max_frames);
@@ -90,8 +88,11 @@ static void xen_gnttab_realize(DeviceState *dev, Error 
**errp)
 s->gnt_frame_gpas[i] = INVALID_GPA;
 }
 
+s->nr_frames = 0;
+memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames);
 s->entries.v1[GNTTAB_RESERVED_XENSTORE].flags = GTF_permit_access;
 s->entries.v1[GNTTAB_RESERVED_XENSTORE].frame = XEN_SPECIAL_PFN(XENSTORE);
+
 qemu_mutex_init(&s->gnt_lock);
 
 xen_gnttab_singleton = s;
@@ -523,3 +524,24 @@ static struct gnttab_backend_ops emu_gnttab_backend_ops = {
 .unmap = xen_be_gnttab_unmap,
 };
 
+int xen_gnttab_reset(void)
+{
+XenGnttabState *s = xen_gnttab_singleton;
+
+if (!s) {
+return -ENOTSUP;
+}
+
+QEMU_LOCK_GUARD(&s->gnt_lock);
+
+s->nr_frames = 0;
+
+memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames);
+
+s->entries.v1[GNTTAB_RESERVED_XENSTORE].flags = GTF_permit_access;
+s->entries.v1[GNTTAB_RESERVED_XENSTORE].frame = XEN_SPECIAL_PFN(XENSTORE);
+
+memset(s->map_track, 0, s->max_frames * ENTRIES_PER_FRAME_V1);
+
+return 0;
+}
diff --git a/hw/i386/kvm/xen_gnttab.h b/hw/i386/kvm/xen_gnttab.h
index 3bdbe96191..ee215239b0 100644
--- a/hw/i386/kvm/xen_gnttab.h
+++ b/hw/i386/kvm/xen_gnttab.h
@@ -13,6 +13,7 @@
 #define QEMU_XEN_GNTTAB_H
 
 void xen_gnttab_create(void);
+int xen_gnttab_reset(void);
 int xen_gnttab_map_page(uint64_t idx, uint64_t gfn);
 
 struct gnttab_set_version;
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index bad3131d08..0bb6c601c9 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -1406,6 +1406,11 @@ int kvm_xen_soft_reset(void)
 return err;
 }
 
+err = xen_gnttab_reset();
+if (err) {
+return err;
+}
+
 err = xen_xenstore_reset();
 if (err) {
 return err;
-- 
2.39.0




[RFC PATCH v1 12/25] hw/xen: Add foreignmem operations to allow redirection to internal emulation

2023-03-02 Thread David Woodhouse
From: David Woodhouse 

Signed-off-by: David Woodhouse 
Signed-off-by: Paul Durrant 
---
 hw/char/xen_console.c|  8 +++---
 hw/display/xenfb.c   | 20 +++---
 hw/xen/xen-operations.c  | 45 
 include/hw/xen/xen_backend_ops.h | 26 ++
 include/hw/xen/xen_common.h  | 13 -
 softmmu/globals.c|  1 +
 tests/unit/test-xs-node.c|  1 +
 7 files changed, 88 insertions(+), 26 deletions(-)

diff --git a/hw/char/xen_console.c b/hw/char/xen_console.c
index 19ad6c946a..e9cef3e1ef 100644
--- a/hw/char/xen_console.c
+++ b/hw/char/xen_console.c
@@ -237,9 +237,9 @@ static int con_initialise(struct XenLegacyDevice *xendev)
 
 if (!xendev->dev) {
 xen_pfn_t mfn = con->ring_ref;
-con->sring = xenforeignmemory_map(xen_fmem, con->xendev.dom,
-  PROT_READ | PROT_WRITE,
-  1, &mfn, NULL);
+con->sring = qemu_xen_foreignmem_map(con->xendev.dom, NULL,
+ PROT_READ | PROT_WRITE,
+ 1, &mfn, NULL);
 } else {
 con->sring = xen_be_map_grant_ref(xendev, con->ring_ref,
   PROT_READ | PROT_WRITE);
@@ -269,7 +269,7 @@ static void con_disconnect(struct XenLegacyDevice *xendev)
 
 if (con->sring) {
 if (!xendev->dev) {
-xenforeignmemory_unmap(xen_fmem, con->sring, 1);
+qemu_xen_foreignmem_unmap(con->sring, 1);
 } else {
 xen_be_unmap_grant_ref(xendev, con->sring, con->ring_ref);
 }
diff --git a/hw/display/xenfb.c b/hw/display/xenfb.c
index 260eb38a76..2c4016fcbd 100644
--- a/hw/display/xenfb.c
+++ b/hw/display/xenfb.c
@@ -98,8 +98,9 @@ static int common_bind(struct common *c)
 if (xenstore_read_fe_int(&c->xendev, "event-channel", 
&c->xendev.remote_port) == -1)
 return -1;
 
-c->page = xenforeignmemory_map(xen_fmem, c->xendev.dom,
-   PROT_READ | PROT_WRITE, 1, &mfn, NULL);
+c->page = qemu_xen_foreignmem_map(c->xendev.dom, NULL,
+  PROT_READ | PROT_WRITE, 1, &mfn,
+  NULL);
 if (c->page == NULL)
 return -1;
 
@@ -115,7 +116,7 @@ static void common_unbind(struct common *c)
 {
 xen_pv_unbind_evtchn(&c->xendev);
 if (c->page) {
-xenforeignmemory_unmap(xen_fmem, c->page, 1);
+qemu_xen_foreignmem_unmap(c->page, 1);
 c->page = NULL;
 }
 }
@@ -500,15 +501,16 @@ static int xenfb_map_fb(struct XenFB *xenfb)
 fbmfns = g_new0(xen_pfn_t, xenfb->fbpages);
 
 xenfb_copy_mfns(mode, n_fbdirs, pgmfns, pd);
-map = xenforeignmemory_map(xen_fmem, xenfb->c.xendev.dom,
-   PROT_READ, n_fbdirs, pgmfns, NULL);
+map = qemu_xen_foreignmem_map(xenfb->c.xendev.dom, NULL, PROT_READ,
+  n_fbdirs, pgmfns, NULL);
 if (map == NULL)
 goto out;
 xenfb_copy_mfns(mode, xenfb->fbpages, fbmfns, map);
-xenforeignmemory_unmap(xen_fmem, map, n_fbdirs);
+qemu_xen_foreignmem_unmap(map, n_fbdirs);
 
-xenfb->pixels = xenforeignmemory_map(xen_fmem, xenfb->c.xendev.dom,
-PROT_READ, xenfb->fbpages, fbmfns, NULL);
+xenfb->pixels = qemu_xen_foreignmem_map(xenfb->c.xendev.dom, NULL,
+PROT_READ, xenfb->fbpages,
+fbmfns, NULL);
 if (xenfb->pixels == NULL)
 goto out;
 
@@ -927,7 +929,7 @@ static void fb_disconnect(struct XenLegacyDevice *xendev)
  *   Replacing the framebuffer with anonymous shared memory
  *   instead.  This releases the guest pages and keeps qemu happy.
  */
-xenforeignmemory_unmap(xen_fmem, fb->pixels, fb->fbpages);
+qemu_xen_foreignmem_unmap(fb->pixels, fb->fbpages);
 fb->pixels = mmap(fb->pixels, fb->fbpages * XC_PAGE_SIZE,
   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON,
   -1, 0);
diff --git a/hw/xen/xen-operations.c b/hw/xen/xen-operations.c
index 73dabac8e5..61e56a7abe 100644
--- a/hw/xen/xen-operations.c
+++ b/hw/xen/xen-operations.c
@@ -22,6 +22,7 @@
  */
 #undef XC_WANT_COMPAT_EVTCHN_API
 #undef XC_WANT_COMPAT_GNTTAB_API
+#undef XC_WANT_COMPAT_MAP_FOREIGN_API
 
 #include 
 
@@ -56,10 +57,13 @@ typedef xc_gnttab xengnttab_handle;
 #define xengnttab_map_domain_grant_refs(h, c, d, r, p) \
 xc_gnttab_map_domain_grant_refs(h, c, d, r, p)
 
+typedef xc_interface xenforeignmemory_handle;
+
 #else /* CONFIG_XEN_CTRL_INTERFACE_VERSION >= 40701 */
 
 #include 
 #include 
+#include 
 
 #endif
 
@@ -218,6 +222,46 @@

[RFC PATCH v1 22/25] hw/xen: Add emulated implementation of XenStore operations

2023-03-02 Thread David Woodhouse
From: David Woodhouse 

Now that we have an internal implementation of XenStore, we can populate
the xenstore_backend_ops to allow PV backends to talk to it.

Watches can't be processed with immediate callbacks because that would
call back into XenBus code recursively. Defer them to a QEMUBH to be run
as appropriate from the main loop. We use a QEMUBH per XS handle, and it
walks all the watches (there shouldn't be many per handle) to fire any
which have pending events. We *could* have done it differently but this
allows us to use the same struct watch_event as we have for the guest
side, and keeps things relatively simple.
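
Very roughly, the per-handle bottom half just walks the watches and drains
their event lists; this sketch assumes the xs_watch_fn callback takes the
opaque pointer and the path (the field names follow the structs added in
this patch):

    static void xs_be_watch_bh(void *opaque)
    {
        struct qemu_xs_handle *h = opaque;
        GList *l;

        for (l = h->watches; l; l = l->next) {
            struct qemu_xs_watch *w = l->data;

            while (w->events) {
                struct watch_event *ev = w->events->data;

                w->fn(w->opaque, ev->path); /* assumed (opaque, path) signature */

                w->events = g_list_remove(w->events, ev);
                free_watch_event(ev);
            }
        }
    }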

Signed-off-by: David Woodhouse 
---
 hw/i386/kvm/xen_xenstore.c | 273 -
 1 file changed, 269 insertions(+), 4 deletions(-)

diff --git a/hw/i386/kvm/xen_xenstore.c b/hw/i386/kvm/xen_xenstore.c
index bab40d1a04..028f80499e 100644
--- a/hw/i386/kvm/xen_xenstore.c
+++ b/hw/i386/kvm/xen_xenstore.c
@@ -49,7 +49,7 @@ struct XenXenstoreState {
 /*< public >*/
 
 XenstoreImplState *impl;
-GList *watch_events;
+GList *watch_events; /* for the guest */
 
 MemoryRegion xenstore_page;
 struct xenstore_domain_interface *xs;
@@ -73,6 +73,8 @@ struct XenXenstoreState *xen_xenstore_singleton;
 static void xen_xenstore_event(void *opaque);
 static void fire_watch_cb(void *opaque, const char *path, const char *token);
 
+static struct xenstore_backend_ops emu_xenstore_backend_ops;
+
 static void G_GNUC_PRINTF (4, 5) relpath_printf(XenXenstoreState *s,
 GList *perms,
 const char *relpath,
@@ -169,6 +171,8 @@ static void xen_xenstore_realize(DeviceState *dev, Error 
**errp)
 relpath_printf(s, perms, "feature", "%s", "");
 
 g_list_free_full(perms, g_free);
+
+xen_xenstore_ops = &emu_xenstore_backend_ops;
 }
 
 static bool xen_xenstore_is_needed(void *opaque)
@@ -1305,6 +1309,15 @@ struct watch_event {
 char *token;
 };
 
+static void free_watch_event(struct watch_event *ev)
+{
+if (ev) {
+g_free(ev->path);
+g_free(ev->token);
+g_free(ev);
+}
+}
+
 static void queue_watch(XenXenstoreState *s, const char *path,
 const char *token)
 {
@@ -1351,9 +1364,7 @@ static void process_watch_events(XenXenstoreState *s)
 deliver_watch(s, ev->path, ev->token);
 
 s->watch_events = g_list_remove(s->watch_events, ev);
-g_free(ev->path);
-g_free(ev->token);
-g_free(ev);
+free_watch_event(ev);
 }
 
 static void xen_xenstore_event(void *opaque)
@@ -1443,3 +1454,257 @@ int xen_xenstore_reset(void)
 
 return 0;
 }
+
+struct qemu_xs_handle {
+XenstoreImplState *impl;
+GList *watches;
+QEMUBH *watch_bh;
+};
+
+struct qemu_xs_watch {
+struct qemu_xs_handle *h;
+char *path;
+xs_watch_fn fn;
+void *opaque;
+GList *events;
+};
+
+static char *xs_be_get_domain_path(struct qemu_xs_handle *h, unsigned int 
domid)
+{
+return g_strdup_printf("/local/domain/%u", domid);
+}
+
+static char **xs_be_directory(struct qemu_xs_handle *h, xs_transaction_t t,
+  const char *path, unsigned int *num)
+{
+GList *items = NULL, *l;
+unsigned int i = 0;
+char **items_ret;
+int err;
+
+err = xs_impl_directory(h->impl, DOMID_QEMU, t, path, NULL, &items);
+if (err) {
+errno = err;
+return NULL;
+}
+
+items_ret = g_new0(char *, g_list_length(items) + 1);
+*num = 0;
+for (l = items; l; l = l->next) {
+items_ret[i++] = l->data;
+(*num)++;
+}
+g_list_free(items);
+return items_ret;
+}
+
+static void *xs_be_read(struct qemu_xs_handle *h, xs_transaction_t t,
+const char *path, unsigned int *len)
+{
+GByteArray *data = g_byte_array_new();
+bool free_segment = false;
+int err;
+
+err = xs_impl_read(h->impl, DOMID_QEMU, t, path, data);
+if (err) {
+free_segment = true;
+errno = err;
+} else {
+if (len) {
+*len = data->len;
+}
+/* The xen-bus-helper code expects to get NUL terminated string! */
+g_byte_array_append(data, (void *)"", 1);
+}
+
+return g_byte_array_free(data, free_segment);
+}
+
+static bool xs_be_write(struct qemu_xs_handle *h, xs_transaction_t t,
+const char *path, const void *data, unsigned int len)
+{
+GByteArray *gdata = g_byte_array_new();
+int err;
+
+g_byte_array_append(gdata, data, len);
+err = xs_impl_write(h->impl, DOMID_QEMU, t, path, gdata);
+g_byte_array_unref(gdata);
+if (err) {
+errno = err;
+return false;
+}
+return true;
+}
+
+static bool xs_be_create(struct qemu_xs_handle *h, xs_transaction_t t,
+ unsigned int own

[RFC PATCH v1 19/25] hw/xen: Only advertise ring-page-order for xen-block if gnttab supports it

2023-03-02 Thread David Woodhouse
From: David Woodhouse 

When emulating Xen, multi-page grants are distinctly non-trivial and we
have elected not to support them for the time being. Don't advertise
them to the guest.

Signed-off-by: David Woodhouse 
---
 hw/block/xen-block.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/hw/block/xen-block.c b/hw/block/xen-block.c
index 87299615e3..f5a744589d 100644
--- a/hw/block/xen-block.c
+++ b/hw/block/xen-block.c
@@ -83,7 +83,8 @@ static void xen_block_connect(XenDevice *xendev, Error **errp)
 g_free(ring_ref);
 return;
 }
-} else if (order <= blockdev->props.max_ring_page_order) {
+} else if (qemu_xen_gnttab_can_map_multi() &&
+   order <= blockdev->props.max_ring_page_order) {
 unsigned int i;
 
 nr_ring_ref = 1 << order;
@@ -255,8 +256,12 @@ static void xen_block_realize(XenDevice *xendev, Error 
**errp)
 }
 
 xen_device_backend_printf(xendev, "feature-flush-cache", "%u", 1);
-xen_device_backend_printf(xendev, "max-ring-page-order", "%u",
-  blockdev->props.max_ring_page_order);
+
+if (qemu_xen_gnttab_can_map_multi()) {
+xen_device_backend_printf(xendev, "max-ring-page-order", "%u",
+  blockdev->props.max_ring_page_order);
+}
+
 xen_device_backend_printf(xendev, "info", "%u", blockdev->info);
 
 xen_device_frontend_printf(xendev, "virtual-device", "%lu",
-- 
2.39.0




[RFC PATCH v1 13/25] hw/xen: Add xenstore operations to allow redirection to internal emulation

2023-03-02 Thread David Woodhouse
From: Paul Durrant 

Signed-off-by: Paul Durrant 
Signed-off-by: David Woodhouse 
---
 accel/xen/xen-all.c |  11 +-
 hw/char/xen_console.c   |   2 +-
 hw/i386/kvm/xen_xenstore.c  |   3 -
 hw/i386/kvm/xenstore_impl.h |   8 +-
 hw/xen/xen-bus-helper.c |  62 +++
 hw/xen/xen-bus.c| 261 
 hw/xen/xen-legacy-backend.c | 119 +++--
 hw/xen/xen-operations.c | 198 +
 hw/xen/xen_devconfig.c  |   4 +-
 hw/xen/xen_pt_graphics.c|   1 -
 hw/xen/xen_pvdev.c  |  49 +-
 include/hw/xen/xen-bus-helper.h |  26 +--
 include/hw/xen/xen-bus.h|  17 +-
 include/hw/xen/xen-legacy-backend.h |   6 +-
 include/hw/xen/xen_backend_ops.h| 163 +
 include/hw/xen/xen_common.h |   1 -
 include/hw/xen/xen_pvdev.h  |   2 +-
 softmmu/globals.c   |   1 +
 18 files changed, 525 insertions(+), 409 deletions(-)

diff --git a/accel/xen/xen-all.c b/accel/xen/xen-all.c
index e85e4aeba5..425216230f 100644
--- a/accel/xen/xen-all.c
+++ b/accel/xen/xen-all.c
@@ -90,12 +90,15 @@ void xenstore_store_pv_console_info(int i, Chardev *chr)
 }
 
 
-static void xenstore_record_dm_state(struct xs_handle *xs, const char *state)
+static void xenstore_record_dm_state(const char *state)
 {
+struct xs_handle *xs;
 char path[50];
 
+/* We now have everything we need to set the xenstore entry. */
+xs = xs_open(0);
 if (xs == NULL) {
-error_report("xenstore connection not initialized");
+fprintf(stderr, "Could not contact XenStore\n");
 exit(1);
 }
 
@@ -109,6 +112,8 @@ static void xenstore_record_dm_state(struct xs_handle *xs, 
const char *state)
 error_report("error recording dm state");
 exit(1);
 }
+
+xs_close(xs);
 }
 
 
@@ -117,7 +122,7 @@ static void xen_change_state_handler(void *opaque, bool 
running,
 {
 if (running) {
 /* record state running */
-xenstore_record_dm_state(xenstore, "running");
+xenstore_record_dm_state("running");
 }
 }
 
diff --git a/hw/char/xen_console.c b/hw/char/xen_console.c
index e9cef3e1ef..ad8638a86d 100644
--- a/hw/char/xen_console.c
+++ b/hw/char/xen_console.c
@@ -181,7 +181,7 @@ static int con_init(struct XenLegacyDevice *xendev)
 const char *output;
 
 /* setup */
-dom = xs_get_domain_path(xenstore, con->xendev.dom);
+dom = qemu_xen_xs_get_domain_path(xenstore, con->xendev.dom);
 if (!xendev->dev) {
 snprintf(con->console, sizeof(con->console), "%s/console", dom);
 } else {
diff --git a/hw/i386/kvm/xen_xenstore.c b/hw/i386/kvm/xen_xenstore.c
index 5a8e38aae7..bab40d1a04 100644
--- a/hw/i386/kvm/xen_xenstore.c
+++ b/hw/i386/kvm/xen_xenstore.c
@@ -38,9 +38,6 @@
 #define TYPE_XEN_XENSTORE "xen-xenstore"
 OBJECT_DECLARE_SIMPLE_TYPE(XenXenstoreState, XEN_XENSTORE)
 
-#define XEN_PAGE_SHIFT 12
-#define XEN_PAGE_SIZE (1ULL << XEN_PAGE_SHIFT)
-
 #define ENTRIES_PER_FRAME_V1 (XEN_PAGE_SIZE / sizeof(grant_entry_v1_t))
 #define ENTRIES_PER_FRAME_V2 (XEN_PAGE_SIZE / sizeof(grant_entry_v2_t))
 
diff --git a/hw/i386/kvm/xenstore_impl.h b/hw/i386/kvm/xenstore_impl.h
index bbe2391e2e..0df2a91aae 100644
--- a/hw/i386/kvm/xenstore_impl.h
+++ b/hw/i386/kvm/xenstore_impl.h
@@ -12,13 +12,7 @@
 #ifndef QEMU_XENSTORE_IMPL_H
 #define QEMU_XENSTORE_IMPL_H
 
-typedef uint32_t xs_transaction_t;
-
-#define XBT_NULL 0
-
-#define XS_PERM_NONE  0x00
-#define XS_PERM_READ  0x01
-#define XS_PERM_WRITE 0x02
+#include "hw/xen/xen_backend_ops.h"
 
 typedef struct XenstoreImplState XenstoreImplState;
 
diff --git a/hw/xen/xen-bus-helper.c b/hw/xen/xen-bus-helper.c
index 5a1e12b374..b2b2cc9c5d 100644
--- a/hw/xen/xen-bus-helper.c
+++ b/hw/xen/xen-bus-helper.c
@@ -10,6 +10,7 @@
 #include "hw/xen/xen-bus.h"
 #include "hw/xen/xen-bus-helper.h"
 #include "qapi/error.h"
+#include "trace.h"
 
 #include 
 
@@ -46,34 +47,28 @@ const char *xs_strstate(enum xenbus_state state)
 return "INVALID";
 }
 
-void xs_node_create(struct xs_handle *xsh, xs_transaction_t tid,
-const char *node, struct xs_permissions perms[],
-unsigned int nr_perms, Error **errp)
+void xs_node_create(struct qemu_xs_handle *h, xs_transaction_t tid,
+const char *node, unsigned int owner, unsigned int domid,
+unsigned int perms, Error **errp)
 {
 trace_xs_node_create(node);
 
-if (!xs_write(xsh, tid, node, "", 0)) {
+if (!qemu_xen_xs_create(h, tid, owner, domid, perms, node)) {
 error_setg_errno(errp, errno, "failed to create node '%s'", node);
-return;
-}
-
-if (!xs_set_permissions(xsh, tid, node, perms

[RFC PATCH v1 20/25] hw/xen: Hook up emulated implementation for event channel operations

2023-03-02 Thread David Woodhouse
From: David Woodhouse 

We provided the backend-facing evtchn functions very early on as part of
the core Xen platform support, since things like timers and xenstore need
to use them.

By what may or may not be an astonishing coincidence, those functions
just *happen* all to have exactly the right function prototypes to slot
into the evtchn_backend_ops table and be called by the PV backends.

Signed-off-by: David Woodhouse 
---
 hw/i386/kvm/xen_evtchn.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index 886fbf6b3b..98a7b85047 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -34,6 +34,7 @@
 #include "hw/pci/msi.h"
 #include "hw/pci/msix.h"
 #include "hw/irq.h"
+#include "hw/xen/xen_backend_ops.h"
 
 #include "xen_evtchn.h"
 #include "xen_overlay.h"
@@ -278,6 +279,17 @@ static const TypeInfo xen_evtchn_info = {
 .class_init= xen_evtchn_class_init,
 };
 
+static struct evtchn_backend_ops emu_evtchn_backend_ops = {
+.open = xen_be_evtchn_open,
+.bind_interdomain = xen_be_evtchn_bind_interdomain,
+.unbind = xen_be_evtchn_unbind,
+.close = xen_be_evtchn_close,
+.get_fd = xen_be_evtchn_fd,
+.notify = xen_be_evtchn_notify,
+.unmask = xen_be_evtchn_unmask,
+.pending = xen_be_evtchn_pending,
+};
+
 static void gsi_assert_bh(void *opaque)
 {
 struct vcpu_info *vi = kvm_xen_get_vcpu_info_hva(0);
@@ -318,6 +330,9 @@ void xen_evtchn_create(void)
 s->nr_pirq_inuse_words = DIV_ROUND_UP(s->nr_pirqs, 64);
 s->pirq_inuse_bitmap = g_new0(uint64_t, s->nr_pirq_inuse_words);
 s->pirq = g_new0(struct pirq_info, s->nr_pirqs);
+
+/* Set event channel functions for backend drivers to use */
+xen_evtchn_ops = &emu_evtchn_backend_ops;
 }
 
 void xen_evtchn_connect_gsis(qemu_irq *system_gsis)
-- 
2.39.0




[RFC PATCH v1 07/25] hw/xen: Implement core serialize/deserialize methods for xenstore_impl

2023-03-02 Thread David Woodhouse
From: David Woodhouse 

In fact I think we want to only serialize the contents of the domain's
path in /local/domain/${domid} and leave the rest to be recreated? Will
defer to Paul for that.

Signed-off-by: David Woodhouse 
---
 hw/i386/kvm/xen_xenstore.c  |  25 +-
 hw/i386/kvm/xenstore_impl.c | 574 +++-
 hw/i386/kvm/xenstore_impl.h |   5 +
 tests/unit/test-xs-node.c   | 236 ++-
 4 files changed, 824 insertions(+), 16 deletions(-)

diff --git a/hw/i386/kvm/xen_xenstore.c b/hw/i386/kvm/xen_xenstore.c
index 3b409e3817..1b1358ad4c 100644
--- a/hw/i386/kvm/xen_xenstore.c
+++ b/hw/i386/kvm/xen_xenstore.c
@@ -66,6 +66,9 @@ struct XenXenstoreState {
 evtchn_port_t guest_port;
 evtchn_port_t be_port;
 struct xenevtchn_handle *eh;
+
+uint8_t *impl_state;
+uint32_t impl_state_size;
 };
 
 struct XenXenstoreState *xen_xenstore_singleton;
@@ -109,16 +112,26 @@ static bool xen_xenstore_is_needed(void *opaque)
 static int xen_xenstore_pre_save(void *opaque)
 {
 XenXenstoreState *s = opaque;
+GByteArray *save;
 
 if (s->eh) {
 s->guest_port = xen_be_evtchn_get_guest_port(s->eh);
 }
+
+g_free(s->impl_state);
+save = xs_impl_serialize(s->impl);
+s->impl_state = save->data;
+s->impl_state_size = save->len;
+g_byte_array_free(save, false);
+
 return 0;
 }
 
 static int xen_xenstore_post_load(void *opaque, int ver)
 {
 XenXenstoreState *s = opaque;
+GByteArray *save;
+int ret;
 
 /*
  * As qemu/dom0, rebind to the guest's port. The Windows drivers may
@@ -135,7 +148,13 @@ static int xen_xenstore_post_load(void *opaque, int ver)
 }
 s->be_port = be_port;
 }
-return 0;
+
+save = g_byte_array_new_take(s->impl_state, s->impl_state_size);
+s->impl_state = NULL;
+s->impl_state_size = 0;
+
+ret = xs_impl_deserialize(s->impl, save, xen_domid, fire_watch_cb, s);
+return ret;
 }
 
 static const VMStateDescription xen_xenstore_vmstate = {
@@ -155,6 +174,10 @@ static const VMStateDescription xen_xenstore_vmstate = {
 VMSTATE_BOOL(rsp_pending, XenXenstoreState),
 VMSTATE_UINT32(guest_port, XenXenstoreState),
 VMSTATE_BOOL(fatal_error, XenXenstoreState),
+VMSTATE_UINT32(impl_state_size, XenXenstoreState),
+VMSTATE_VARRAY_UINT32_ALLOC(impl_state, XenXenstoreState,
+impl_state_size, 0,
+vmstate_info_uint8, uint8_t),
 VMSTATE_END_OF_LIST()
 }
 };
diff --git a/hw/i386/kvm/xenstore_impl.c b/hw/i386/kvm/xenstore_impl.c
index 7988bde88f..82e7ae06f5 100644
--- a/hw/i386/kvm/xenstore_impl.c
+++ b/hw/i386/kvm/xenstore_impl.c
@@ -37,6 +37,7 @@ typedef struct XsNode {
 uint64_t gencnt;
 bool deleted_in_tx;
 bool modified_in_tx;
+unsigned int serialized_tx;
 #ifdef XS_NODE_UNIT_TEST
 gchar *name; /* debug only */
 #endif
@@ -68,6 +69,7 @@ struct XenstoreImplState {
 unsigned int nr_domu_transactions;
 unsigned int root_tx;
 unsigned int last_tx;
+bool serialized;
 };
 
 
@@ -1156,8 +1158,10 @@ int xs_impl_set_perms(XenstoreImplState *s, unsigned int 
dom_id,
 return xs_node_walk(n, &op);
 }
 
-int xs_impl_watch(XenstoreImplState *s, unsigned int dom_id, const char *path,
-  const char *token, xs_impl_watch_fn fn, void *opaque)
+static int do_xs_impl_watch(XenstoreImplState *s, unsigned int dom_id,
+const char *path, const char *token,
+xs_impl_watch_fn fn, void *opaque)
+
 {
 char abspath[XENSTORE_ABS_PATH_MAX + 1];
 XsWatch *w, *l;
@@ -1200,12 +1204,22 @@ int xs_impl_watch(XenstoreImplState *s, unsigned int 
dom_id, const char *path,
 s->nr_domu_watches++;
 }
 
-/* A new watch should fire immediately */
-fn(opaque, path, token);
-
 return 0;
 }
 
+int xs_impl_watch(XenstoreImplState *s, unsigned int dom_id, const char *path,
+  const char *token, xs_impl_watch_fn fn, void *opaque)
+{
+int ret = do_xs_impl_watch(s, dom_id, path, token, fn, opaque);
+
+if (!ret) {
+/* A new watch should fire immediately */
+fn(opaque, path, token);
+}
+
+return ret;
+}
+
 static XsWatch *free_watch(XenstoreImplState *s, XsWatch *w)
 {
 XsWatch *next = w->next;
@@ -1361,3 +1375,553 @@ XenstoreImplState *xs_impl_create(unsigned int dom_id)
 s->root_tx = s->last_tx = 1;
 return s;
 }
+
+
+static void clear_serialized_tx(gpointer key, gpointer value, gpointer opaque)
+{
+XsNode *n = value;
+
+n->serialized_tx = XBT_NULL;
+if (n->children) {
+g_hash_table_foreach(n->children, clear_serialized_tx, NULL);
+}
+}
+
+static void clear_tx_serialized_tx(gpointer key, gpointer value,
+   gpointer opaque)
+{
+XsTransaction *t = value;
+
+clear_s

[RFC PATCH v1 10/25] hw/xen: Add gnttab operations to allow redirection to internal emulation

2023-03-02 Thread David Woodhouse
From: David Woodhouse 

Move the existing code using libxengnttab to xen-operations.c and allow
the operations to be redirected so that we can add emulation of grant
table mapping for backend drivers.

In emulation, mapping more than one grant ref to be virtually contiguous
would be fairly difficult. The best way to do it might be to make the
ram_block mappings actually backed by a file (shmem or a deleted file,
perhaps) so that we can have multiple *shared* mappings of it. But that
would be fairly intrusive.

Making the backend drivers cope with page *lists* instead of expecting
the mapping to be contiguous is also non-trivial, since some structures
would actually *cross* page boundaries (e.g. the 32-bit blkif responses
which are 12 bytes).

So for now, we'll support only single-page mappings in emulation. Add a
XEN_GNTTAB_OP_FEATURE_MAP_MULTIPLE flag to indicate that the native Xen
implementation *does* support multi-page maps, and a helper function to
query it.
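
A backend that wants a multi-page ring is therefore expected to check the
helper first; a minimal sketch (the xen-block patch later in this series does
essentially this when deciding whether to write max-ring-page-order):

    if (qemu_xen_gnttab_can_map_multi()) {
        /* Native Xen: multi-page rings are fine to advertise. */
    } else {
        /* Emulated gnttab: stick to a single shared ring page. */
    }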

Signed-off-by: David Woodhouse 
Signed-off-by: Paul Durrant 
---
 hw/xen/xen-bus.c| 112 ++--
 hw/xen/xen-legacy-backend.c | 125 ++
 hw/xen/xen-operations.c | 157 
 hw/xen/xen_pvdev.c  |   2 +-
 include/hw/xen/xen-bus.h|   3 +-
 include/hw/xen/xen-legacy-backend.h |  13 +--
 include/hw/xen/xen_backend_ops.h| 100 ++
 include/hw/xen/xen_common.h |  39 ---
 softmmu/globals.c   |   1 +
 9 files changed, 280 insertions(+), 272 deletions(-)

diff --git a/hw/xen/xen-bus.c b/hw/xen/xen-bus.c
index d0b1ae93da..b247e86f28 100644
--- a/hw/xen/xen-bus.c
+++ b/hw/xen/xen-bus.c
@@ -947,7 +947,7 @@ static void xen_device_frontend_destroy(XenDevice *xendev)
 void xen_device_set_max_grant_refs(XenDevice *xendev, unsigned int nr_refs,
Error **errp)
 {
-if (xengnttab_set_max_grants(xendev->xgth, nr_refs)) {
+if (qemu_xen_gnttab_set_max_grants(xendev->xgth, nr_refs)) {
 error_setg_errno(errp, errno, "xengnttab_set_max_grants failed");
 }
 }
@@ -956,9 +956,8 @@ void *xen_device_map_grant_refs(XenDevice *xendev, uint32_t 
*refs,
 unsigned int nr_refs, int prot,
 Error **errp)
 {
-void *map = xengnttab_map_domain_grant_refs(xendev->xgth, nr_refs,
-xendev->frontend_id, refs,
-prot);
+void *map = qemu_xen_gnttab_map_refs(xendev->xgth, nr_refs,
+ xendev->frontend_id, refs, prot);
 
 if (!map) {
 error_setg_errno(errp, errno,
@@ -971,109 +970,17 @@ void *xen_device_map_grant_refs(XenDevice *xendev, 
uint32_t *refs,
 void xen_device_unmap_grant_refs(XenDevice *xendev, void *map,
  unsigned int nr_refs, Error **errp)
 {
-if (xengnttab_unmap(xendev->xgth, map, nr_refs)) {
+if (qemu_xen_gnttab_unmap(xendev->xgth, map, nr_refs)) {
 error_setg_errno(errp, errno, "xengnttab_unmap failed");
 }
 }
 
-static void compat_copy_grant_refs(XenDevice *xendev, bool to_domain,
-   XenDeviceGrantCopySegment segs[],
-   unsigned int nr_segs, Error **errp)
-{
-uint32_t *refs = g_new(uint32_t, nr_segs);
-int prot = to_domain ? PROT_WRITE : PROT_READ;
-void *map;
-unsigned int i;
-
-for (i = 0; i < nr_segs; i++) {
-XenDeviceGrantCopySegment *seg = &segs[i];
-
-refs[i] = to_domain ? seg->dest.foreign.ref :
-seg->source.foreign.ref;
-}
-
-map = xengnttab_map_domain_grant_refs(xendev->xgth, nr_segs,
-  xendev->frontend_id, refs,
-  prot);
-if (!map) {
-error_setg_errno(errp, errno,
- "xengnttab_map_domain_grant_refs failed");
-goto done;
-}
-
-for (i = 0; i < nr_segs; i++) {
-XenDeviceGrantCopySegment *seg = &segs[i];
-void *page = map + (i * XC_PAGE_SIZE);
-
-if (to_domain) {
-memcpy(page + seg->dest.foreign.offset, seg->source.virt,
-   seg->len);
-} else {
-memcpy(seg->dest.virt, page + seg->source.foreign.offset,
-   seg->len);
-}
-}
-
-if (xengnttab_unmap(xendev->xgth, map, nr_segs)) {
-error_setg_errno(errp, errno, "xengnttab_unmap failed");
-}
-
-done:
-g_free(refs);
-}
-
 void xen_device_copy_grant_refs(XenDevice *xendev, bool to_domain,
 XenDeviceGrantCopySegment segs[],
 unsigned int nr_segs, Error **errp)
 {
-xengnttab_grant_copy_segment_t *x

[RFC PATCH v1 11/25] hw/xen: Pass grant ref to gnttab unmap operation

2023-03-02 Thread David Woodhouse
From: David Woodhouse 

The previous commit introduced redirectable gnttab operations fairly
much like-for-like, with the exception of the extra arguments to the
->open() call which were always NULL/0 anyway.

This *changes* the arguments to the ->unmap() operation to include the
original ref# that was mapped. Under real Xen it isn't necessary; all we
need to do from QEMU is munmap(), then the kernel will release the grant,
and Xen does the tracking/refcounting for the guest.

When we have emulated grant tables though, we need to do all that for
ourselves. So let's have the back ends keep track of what they mapped
and pass it in to the ->unmap() method for us.

Signed-off-by: David Woodhouse 
---
 hw/9pfs/xen-9p-backend.c|  7 ---
 hw/block/dataplane/xen-block.c  |  1 +
 hw/char/xen_console.c   |  2 +-
 hw/net/xen_nic.c| 13 -
 hw/usb/xen-usb.c| 21 -
 hw/xen/xen-bus.c|  4 ++--
 hw/xen/xen-legacy-backend.c |  4 ++--
 hw/xen/xen-operations.c |  9 -
 include/hw/xen/xen-bus.h|  2 +-
 include/hw/xen/xen-legacy-backend.h |  6 +++---
 include/hw/xen/xen_backend_ops.h|  7 ---
 11 files changed, 50 insertions(+), 26 deletions(-)

diff --git a/hw/9pfs/xen-9p-backend.c b/hw/9pfs/xen-9p-backend.c
index 864bdaf952..d8bb0e847c 100644
--- a/hw/9pfs/xen-9p-backend.c
+++ b/hw/9pfs/xen-9p-backend.c
@@ -359,12 +359,13 @@ static int xen_9pfs_free(struct XenLegacyDevice *xendev)
 if (xen_9pdev->rings[i].data != NULL) {
 xen_be_unmap_grant_refs(&xen_9pdev->xendev,
 xen_9pdev->rings[i].data,
+xen_9pdev->rings[i].intf->ref,
 (1 << xen_9pdev->rings[i].ring_order));
 }
 if (xen_9pdev->rings[i].intf != NULL) {
-xen_be_unmap_grant_refs(&xen_9pdev->xendev,
-xen_9pdev->rings[i].intf,
-1);
+xen_be_unmap_grant_ref(&xen_9pdev->xendev,
+   xen_9pdev->rings[i].intf,
+   xen_9pdev->rings[i].ref);
 }
 if (xen_9pdev->rings[i].bh != NULL) {
 qemu_bh_delete(xen_9pdev->rings[i].bh);
diff --git a/hw/block/dataplane/xen-block.c b/hw/block/dataplane/xen-block.c
index 2785b9e849..e55b713002 100644
--- a/hw/block/dataplane/xen-block.c
+++ b/hw/block/dataplane/xen-block.c
@@ -705,6 +705,7 @@ void xen_block_dataplane_stop(XenBlockDataPlane *dataplane)
 Error *local_err = NULL;
 
 xen_device_unmap_grant_refs(xendev, dataplane->sring,
+dataplane->ring_ref,
 dataplane->nr_ring_ref, &local_err);
 dataplane->sring = NULL;
 
diff --git a/hw/char/xen_console.c b/hw/char/xen_console.c
index 63153dfde4..19ad6c946a 100644
--- a/hw/char/xen_console.c
+++ b/hw/char/xen_console.c
@@ -271,7 +271,7 @@ static void con_disconnect(struct XenLegacyDevice *xendev)
 if (!xendev->dev) {
 xenforeignmemory_unmap(xen_fmem, con->sring, 1);
 } else {
-xen_be_unmap_grant_ref(xendev, con->sring);
+xen_be_unmap_grant_ref(xendev, con->sring, con->ring_ref);
 }
 con->sring = NULL;
 }
diff --git a/hw/net/xen_nic.c b/hw/net/xen_nic.c
index 7d92c2d022..166d03787d 100644
--- a/hw/net/xen_nic.c
+++ b/hw/net/xen_nic.c
@@ -181,7 +181,7 @@ static void net_tx_packets(struct XenNetDev *netdev)
 qemu_send_packet(qemu_get_queue(netdev->nic),
  page + txreq.offset, txreq.size);
 }
-xen_be_unmap_grant_ref(&netdev->xendev, page);
+xen_be_unmap_grant_ref(&netdev->xendev, page, txreq.gref);
 net_tx_response(netdev, &txreq, NETIF_RSP_OKAY);
 }
 if (!netdev->tx_work) {
@@ -261,7 +261,7 @@ static ssize_t net_rx_packet(NetClientState *nc, const 
uint8_t *buf, size_t size
 return -1;
 }
 memcpy(page + NET_IP_ALIGN, buf, size);
-xen_be_unmap_grant_ref(&netdev->xendev, page);
+xen_be_unmap_grant_ref(&netdev->xendev, page, rxreq.gref);
 net_rx_response(netdev, &rxreq, NETIF_RSP_OKAY, NET_IP_ALIGN, size, 0);
 
 return size;
@@ -343,7 +343,8 @@ static int net_connect(struct XenLegacyDevice *xendev)
netdev->rx_ring_ref,
PROT_READ | PROT_WRITE);
 if (!netdev->rxs) {
-xen_be_unmap_grant_ref(&netdev->xendev, netdev->txs);
+xen_be_unmap_grant_ref(&netdev->xendev, netdev->txs,
+   netdev->tx_ring_ref);
  

[RFC PATCH v1 00/25] Enable PV backends with Xen/KVM emulation

2023-03-02 Thread David Woodhouse
Now that the basic platform support is hopefully on the cusp of being 
merged, here's phase 2 which wires up the XenBus and PV back ends.

It starts with a basic single-tenant internal implementation of a 
XenStore, with a copy-on-write tree, watches, transactions, quotas.

Then we introduce operations tables for the grant table, event channel,
foreignmem and xenstore operations so that in addition to using the Xen
libraries for those, QEMU can use its internal emulated versions.
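
As a minimal sketch of that pattern (the names below are invented purely
for illustration, not the real entries in xen_backend_ops.h): one table of
function pointers, with either the Xen-library-backed or the emulated
implementation installed at startup.

    /* Invented names, purely to illustrate the operations-table idea. */
    #include <stdbool.h>
    #include <stdio.h>

    struct demo_evtchn_ops {
        int (*notify)(unsigned int port);
    };

    static int notify_via_libxen(unsigned int port)
    {
        printf("port %u -> libxenevtchn\n", port);
        return 0;
    }

    static int notify_via_emulation(unsigned int port)
    {
        printf("port %u -> built-in emulation\n", port);
        return 0;
    }

    static const struct demo_evtchn_ops libxen_ops = { notify_via_libxen };
    static const struct demo_evtchn_ops emu_ops = { notify_via_emulation };

    int main(void)
    {
        bool emulated = true;   /* e.g. chosen by -accel kvm,xen-version=... */
        const struct demo_evtchn_ops *ops = emulated ? &emu_ops : &libxen_ops;

        return ops->notify(3);
    }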

A little bit of cleaning up of header files, and we can enable the build
of xen-bus in the CONFIG_XEN_EMU build, and run a Xen guest with an
actual PV disk...

   qemu-system-x86_64 -serial mon:stdio -M q35 -display none -m 1G -smp 2 \
  -accel kvm,xen-version=0x4000e,kernel-irqchip=split \
  -kernel bzImage -append "console=ttyS0 root=/dev/xvda1 selinux=0" \
  -drive file=/var/lib/libvirt/images/fedora28.qcow2,if=none,id=disk \
  -device xen-disk,drive=disk,vdev=xvda

The main thing that isn't working here is migration. I've implemented it 
for the internal xenstore and the unit tests exercise it, but the 
existing PV back ends don't support it, perhaps partly because support
for guest-transparent live migration isn't upstream in Xen yet.
So the disk doesn't come back correctly after migration. I'm content 
with that for 8.0 though.

The other pre-existing constraint is that only the block back end has
yet been ported to the "new" XenBus infrastructure, and is actually
capable of creating its own backend nodes. Again, I can live with
that for 8.0. Maybe this will motivate us to finally get round to
converting the rest off XenLegacyBackend and killing it.

We also don't have a simple way to perform grant mapping of multiple
guest pages to contiguous addresses, as we can under real Xen. So we
don't advertise max-ring-page-order for xen-disk in the emulated mode.
Fixing that — if we actually want to — would probably require mapping
RAM from an actual backing store object, so that it can be mapped again
at a different location for the PV back end to see.

David Woodhouse (21):
  hw/xen: Add xenstore wire implementation and implementation stubs
  hw/xen: Add basic XenStore tree walk and write/read/directory support
  hw/xen: Implement XenStore watches
  hw/xen: Implement XenStore transactions
  hw/xen: Watches on XenStore transactions
  hw/xen: Implement core serialize/deserialize methods for xenstore_impl
  hw/xen: Add evtchn operations to allow redirection to internal emulation
  hw/xen: Add gnttab operations to allow redirection to internal emulation
  hw/xen: Pass grant ref to gnttab unmap operation
  hw/xen: Add foreignmem operations to allow redirection to internal 
emulation
  hw/xen: Move xenstore_store_pv_console_info to xen_console.c
  hw/xen: Use XEN_PAGE_SIZE in PV backend drivers
  hw/xen: Rename xen_common.h to xen_native.h
  hw/xen: Build PV backend drivers for CONFIG_XEN_BUS
  hw/xen: Only advertise ring-page-order for xen-block if gnttab supports it
  hw/xen: Hook up emulated implementation for event channel operations
  hw/xen: Add emulated implementation of grant table operations
  hw/xen: Add emulated implementation of XenStore operations
  hw/xen: Map guest XENSTORE_PFN grant in emulated Xenstore
  hw/xen: Implement soft reset for emulated gnttab
  i386/xen: Initialize Xen backends from pc_basic_device_init() for 
emulation

Paul Durrant (4):
  hw/xen: Implement XenStore permissions
  hw/xen: Create initial XenStore nodes
  hw/xen: Add xenstore operations to allow redirection to internal emulation
  hw/xen: Avoid crash when backend watch fires too early

 accel/xen/xen-all.c   |   69 +-
 hw/9pfs/meson.build   |2 +-
 hw/9pfs/xen-9p-backend.c  |   32 +-
 hw/block/dataplane/meson.build|2 +-
 hw/block/dataplane/xen-block.c|   12 +-
 hw/block/meson.build  |2 +-
 hw/block/xen-block.c  |   12 +-
 hw/char/meson.build   |2 +-
 hw/char/xen_console.c |   57 +-
 hw/display/meson.build|2 +-
 hw/display/xenfb.c|   32 +-
 hw/i386/kvm/meson.build   |1 +
 hw/i386/kvm/trace-events  |   15 +
 hw/i386/kvm/xen_evtchn.c  |   15 +
 hw/i386/kvm/xen_gnttab.c  |  325 -
 hw/i386/kvm/xen_gnttab.h  |1 +
 hw/i386/kvm/xen_xenstore.c| 1250 +++-
 hw/i386/kvm/xenstore_impl.c   | 1927 +
 hw/i386/kvm/xenstore_impl.h   |   63 +
 hw/i386/pc.c  |7 +
 hw/i386/pc_pi

[RFC PATCH v1 02/25] hw/xen: Add basic XenStore tree walk and write/read/directory support

2023-03-02 Thread David Woodhouse
From: David Woodhouse 

This is a fairly simple implementation of a copy-on-write tree.

The node walk function starts off at the root, with 'inplace == true'.
If it ever encounters a node with a refcount greater than one (including
the root node), then that node is shared with other trees, and cannot
be modified in place, so the inplace flag is cleared and we copy on
write from there on down.

Xenstore write has 'mkdir -p' semantics and will create the intermediate
nodes if they don't already exist, so in that case we flip the inplace
flag back to true as we populate the newly-created nodes.

We put a copy of the absolute path into the buffer in the struct walk_op,
with *two* NUL terminators at the end. As xs_node_walk() goes down the
tree, it replaces the next '/' separator with a NUL so that it can use
the 'child name' in place. The next recursion down then puts the '/'
back and repeats the exercise for the next path element... if it doesn't
hit that *second* NUL termination which indicates the true end of the
path.
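
A self-contained sketch of that double-NUL walk (simplified from the real
xs_node_walk(), which does the same thing as part of its recursive walk of
the node tree):

    #include <stdio.h>
    #include <string.h>

    static void walk_element(char *elem)
    {
        size_t len = strlen(elem);

        printf("element: '%s'\n", elem);

        if (elem[len + 1]) {
            /* More path follows: NUL-terminate the next element in place. */
            char *next = elem + len + 1;
            char *slash = strchr(next, '/');

            if (slash) {
                *slash = '\0';
            }
            walk_element(next);
            if (slash) {
                *slash = '/';   /* Put the separator back for the caller. */
            }
        }
    }

    int main(void)
    {
        /*
         * The path is stored with *two* NUL terminators; the first element
         * has already been NUL-terminated before the walk starts.
         */
        char buf[] = "local\0domain/1/device\0";

        walk_element(buf);
        return 0;
    }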

Signed-off-by: David Woodhouse 
---
 hw/i386/kvm/xenstore_impl.c | 527 +++-
 tests/unit/meson.build  |   1 +
 tests/unit/test-xs-node.c   | 197 ++
 3 files changed, 718 insertions(+), 7 deletions(-)
 create mode 100644 tests/unit/test-xs-node.c

diff --git a/hw/i386/kvm/xenstore_impl.c b/hw/i386/kvm/xenstore_impl.c
index 31dbc98fe0..9e10a31bea 100644
--- a/hw/i386/kvm/xenstore_impl.c
+++ b/hw/i386/kvm/xenstore_impl.c
@@ -10,13 +10,470 @@
  */
 
 #include "qemu/osdep.h"
+#include "qom/object.h"
 
 #include "xen_xenstore.h"
 #include "xenstore_impl.h"
 
+#include "hw/xen/interface/io/xs_wire.h"
+
+#define XS_MAX_WATCHES  128
+#define XS_MAX_DOMAIN_NODES 1000
+#define XS_MAX_NODE_SIZE2048
+#define XS_MAX_TRANSACTIONS 10
+#define XS_MAX_PERMS_PER_NODE   5
+
+#define XS_VALID_CHARS "abcdefghijklmnopqrstuvwxyz" \
+   "ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
+   "0123456789-/_"
+
+typedef struct XsNode {
+uint32_t ref;
+GByteArray *content;
+GHashTable *children;
+uint64_t gencnt;
+#ifdef XS_NODE_UNIT_TEST
+gchar *name; /* debug only */
+#endif
+} XsNode;
+
 struct XenstoreImplState {
+XsNode *root;
+unsigned int nr_nodes;
 };
 
+static inline XsNode *xs_node_new(void)
+{
+XsNode *n = g_new0(XsNode, 1);
+n->ref = 1;
+
+#ifdef XS_NODE_UNIT_TEST
+nr_xs_nodes++;
+xs_node_list = g_list_prepend(xs_node_list, n);
+#endif
+return n;
+}
+
+static inline XsNode *xs_node_ref(XsNode *n)
+{
+/* With just 10 transactions, it can never get anywhere near this. */
+g_assert(n->ref < INT_MAX);
+
+g_assert(n->ref);
+n->ref++;
+return n;
+}
+
+static inline void xs_node_unref(XsNode *n)
+{
+if (!n) {
+return;
+}
+g_assert(n->ref);
+if (--n->ref) {
+return;
+}
+
+if (n->content) {
+g_byte_array_unref(n->content);
+}
+if (n->children) {
+g_hash_table_unref(n->children);
+}
+#ifdef XS_NODE_UNIT_TEST
+g_free(n->name);
+nr_xs_nodes--;
+xs_node_list = g_list_remove(xs_node_list, n);
+#endif
+g_free(n);
+}
+
+/* For copying from one hash table to another using g_hash_table_foreach() */
+static void do_insert(gpointer key, gpointer value, gpointer user_data)
+{
+g_hash_table_insert(user_data, g_strdup(key), xs_node_ref(value));
+}
+
+static XsNode *xs_node_copy(XsNode *old)
+{
+XsNode *n = xs_node_new();
+
+n->gencnt = old->gencnt;
+if (old->children) {
+n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
+(GDestroyNotify)xs_node_unref);
+g_hash_table_foreach(old->children, do_insert, n->children);
+}
+if (old && old->content) {
+n->content = g_byte_array_ref(old->content);
+}
+return n;
+}
+
+/* Returns true if it made a change to the hash table */
+static bool xs_node_add_child(XsNode *n, const char *path_elem, XsNode *child)
+{
+assert(!strchr(path_elem, '/'));
+
+if (!child) {
+assert(n->children);
+return g_hash_table_remove(n->children, path_elem);
+}
+
+#ifdef XS_NODE_UNIT_TEST
+g_free(child->name);
+child->name = g_strdup(path_elem);
+#endif
+if (!n->children) {
+n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
+(GDestroyNotify)xs_node_unref);
+}
+
+/*
+ * The documentation for g_hash_table_insert() says that it "returns a
+ * boolean value to indicate whether the newly added value was already
+ * in the hash table or not."
+ *
+ * It could perhaps be clearer that 

[RFC PATCH v1 01/25] hw/xen: Add xenstore wire implementation and implementation stubs

2023-03-02 Thread David Woodhouse
From: David Woodhouse 

This implements the basic wire protocol for the XenStore commands, punting
all the actual implementation to xs_impl_* functions which all just return
errors for now.
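
The stub pattern is roughly this (the signature is illustrative, not the
actual prototype from xenstore_impl.h):

    #include <errno.h>

    /*
     * Illustrative stub: every xs_impl_*() operation simply fails, and the
     * wire-protocol layer turns the returned errno into an XS_ERROR reply.
     */
    static int xs_impl_read_stub(unsigned int dom_id, unsigned int tx_id,
                                 const char *path, void *data_out)
    {
        (void)dom_id;
        (void)tx_id;
        (void)path;
        (void)data_out;

        return ENOENT;
    }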

Signed-off-by: David Woodhouse 
---
 hw/i386/kvm/meson.build |   1 +
 hw/i386/kvm/trace-events|  15 +
 hw/i386/kvm/xen_xenstore.c  | 871 +++-
 hw/i386/kvm/xenstore_impl.c | 117 +
 hw/i386/kvm/xenstore_impl.h |  58 +++
 5 files changed, 1054 insertions(+), 8 deletions(-)
 create mode 100644 hw/i386/kvm/xenstore_impl.c
 create mode 100644 hw/i386/kvm/xenstore_impl.h

diff --git a/hw/i386/kvm/meson.build b/hw/i386/kvm/meson.build
index 82dd6ae7c6..6621ba5cd7 100644
--- a/hw/i386/kvm/meson.build
+++ b/hw/i386/kvm/meson.build
@@ -9,6 +9,7 @@ i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files(
   'xen_evtchn.c',
   'xen_gnttab.c',
   'xen_xenstore.c',
+  'xenstore_impl.c',
   ))
 
 i386_ss.add_all(when: 'CONFIG_KVM', if_true: i386_kvm_ss)
diff --git a/hw/i386/kvm/trace-events b/hw/i386/kvm/trace-events
index b83c3eb965..e4c82de6f3 100644
--- a/hw/i386/kvm/trace-events
+++ b/hw/i386/kvm/trace-events
@@ -3,3 +3,18 @@ kvm_xen_unmap_pirq(int pirq, int gsi) "pirq %d gsi %d"
 kvm_xen_get_free_pirq(int pirq, int type) "pirq %d type %d"
 kvm_xen_bind_pirq(int pirq, int port) "pirq %d port %d"
 kvm_xen_unmask_pirq(int pirq, char *dev, int vector) "pirq %d dev %s vector %d"
+xenstore_error(unsigned int id, unsigned int tx_id, const char *err) "req %u 
tx %u err %s"
+xenstore_read(unsigned int tx_id, const char *path) "tx %u path %s"
+xenstore_write(unsigned int tx_id, const char *path) "tx %u path %s"
+xenstore_mkdir(unsigned int tx_id, const char *path) "tx %u path %s"
+xenstore_directory(unsigned int tx_id, const char *path) "tx %u path %s"
+xenstore_directory_part(unsigned int tx_id, const char *path, unsigned int 
offset) "tx %u path %s offset %u"
+xenstore_transaction_start(unsigned int new_tx) "new_tx %u"
+xenstore_transaction_end(unsigned int tx_id, bool commit) "tx %u commit %d"
+xenstore_rm(unsigned int tx_id, const char *path) "tx %u path %s"
+xenstore_get_perms(unsigned int tx_id, const char *path) "tx %u path %s"
+xenstore_set_perms(unsigned int tx_id, const char *path) "tx %u path %s"
+xenstore_watch(const char *path, const char *token) "path %s token %s"
+xenstore_unwatch(const char *path, const char *token) "path %s token %s"
+xenstore_reset_watches(void) ""
+xenstore_watch_event(const char *path, const char *token) "path %s token %s"
diff --git a/hw/i386/kvm/xen_xenstore.c b/hw/i386/kvm/xen_xenstore.c
index 14193ef3f9..64d8f1a38f 100644
--- a/hw/i386/kvm/xen_xenstore.c
+++ b/hw/i386/kvm/xen_xenstore.c
@@ -28,6 +28,10 @@
 #include "sysemu/kvm.h"
 #include "sysemu/kvm_xen.h"
 
+#include "trace.h"
+
+#include "xenstore_impl.h"
+
 #include "hw/xen/interface/io/xs_wire.h"
 #include "hw/xen/interface/event_channel.h"
 
@@ -47,6 +51,9 @@ struct XenXenstoreState {
 SysBusDevice busdev;
 /*< public >*/
 
+XenstoreImplState *impl;
+GList *watch_events;
+
 MemoryRegion xenstore_page;
 struct xenstore_domain_interface *xs;
 uint8_t req_data[XENSTORE_HEADER_SIZE + XENSTORE_PAYLOAD_MAX];
@@ -64,6 +71,7 @@ struct XenXenstoreState {
 struct XenXenstoreState *xen_xenstore_singleton;
 
 static void xen_xenstore_event(void *opaque);
+static void fire_watch_cb(void *opaque, const char *path, const char *token);
 
 static void xen_xenstore_realize(DeviceState *dev, Error **errp)
 {
@@ -89,6 +97,8 @@ static void xen_xenstore_realize(DeviceState *dev, Error 
**errp)
 }
 aio_set_fd_handler(qemu_get_aio_context(), xen_be_evtchn_fd(s->eh), true,
xen_xenstore_event, NULL, NULL, NULL, s);
+
+s->impl = xs_impl_create();
 }
 
 static bool xen_xenstore_is_needed(void *opaque)
@@ -213,20 +223,761 @@ static void reset_rsp(XenXenstoreState *s)
 s->rsp_offset = 0;
 }
 
+static void xs_error(XenXenstoreState *s, unsigned int id,
+ xs_transaction_t tx_id, int errnum)
+{
+struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data;
+const char *errstr = NULL;
+
+for (unsigned int i = 0; i < ARRAY_SIZE(xsd_errors); i++) {
+struct xsd_errors *xsd_error = &xsd_errors[i];
+
+if (xsd_error->errnum == errnum) {
+errstr = xsd_error->errstring;
+break;
+}
+}
+assert(errstr);
+
+trace_xenstore_error(id, tx_id, errstr);
+
+rsp->type = XS_ERROR;
+rsp->req_id = id;
+rsp->tx_id = tx_id;
+rsp->len = (uint32_t)strlen(errstr) + 1;
+
+memcpy(&rsp[1], errstr, rsp->len);
+}
+
+static void xs_ok

[RFC PATCH v1 04/25] hw/xen: Implement XenStore transactions

2023-03-02 Thread David Woodhouse
From: David Woodhouse 

Given that the whole thing supported copy on write from the beginning,
transactions end up being fairly simple. On starting a transaction, just
take a ref of the existing root; swap it back in on a successful commit.

The main tree has a transaction ID too, and we keep a record of the last
transaction ID given out. If the main tree is ever modified when its ID isn't
already the latest, it gets a new transaction ID.

A commit can only succeed if the main tree hasn't moved on since it was
forked. Strictly speaking, the XenStore protocol allows a transaction to
succeed as long as nothing *it* read or wrote has changed in the interim,
but no implementations do that; *any* change is sufficient to abort a
transaction.

This does not yet fire watches on the changed nodes on a commit. That bit
is more fun and will come in a follow-on commit.
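
The commit rule boils down to something like the sketch below, using the
struct fields added in this patch (simplified; the real commit path does
more bookkeeping and, after a later patch, also fires watches):

    /* Simplified sketch of the commit check described above. */
    static int transaction_commit_sketch(XenstoreImplState *s, XsTransaction *tx)
    {
        if (tx->base_tx != s->root_tx) {
            return EAGAIN;              /* The main tree moved on: abort. */
        }

        /* Swap the transaction's (possibly modified) root into place. */
        xs_node_unref(s->root);
        s->root = tx->root;
        tx->root = NULL;
        s->root_tx = tx->tx_id;
        s->nr_nodes = tx->nr_nodes;

        return 0;
    }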

Signed-off-by: David Woodhouse 
---
 hw/i386/kvm/xenstore_impl.c | 150 ++--
 tests/unit/test-xs-node.c   | 118 
 2 files changed, 262 insertions(+), 6 deletions(-)

diff --git a/hw/i386/kvm/xenstore_impl.c b/hw/i386/kvm/xenstore_impl.c
index 2e464af93a..e5074ab1ec 100644
--- a/hw/i386/kvm/xenstore_impl.c
+++ b/hw/i386/kvm/xenstore_impl.c
@@ -46,13 +46,56 @@ typedef struct XsWatch {
 int rel_prefix;
 } XsWatch;
 
+typedef struct XsTransaction {
+XsNode *root;
+unsigned int nr_nodes;
+unsigned int base_tx;
+unsigned int tx_id;
+unsigned int dom_id;
+} XsTransaction;
+
 struct XenstoreImplState {
 XsNode *root;
 unsigned int nr_nodes;
 GHashTable *watches;
 unsigned int nr_domu_watches;
+GHashTable *transactions;
+unsigned int nr_domu_transactions;
+unsigned int root_tx;
+unsigned int last_tx;
 };
 
+
+static void nobble_tx(gpointer key, gpointer value, gpointer user_data)
+{
+unsigned int *new_tx_id = user_data;
+XsTransaction *tx = value;
+
+if (tx->base_tx == *new_tx_id) {
+/* Transactions based on XBT_NULL will always fail */
+tx->base_tx = XBT_NULL;
+}
+}
+
+static inline unsigned int next_tx(struct XenstoreImplState *s)
+{
+unsigned int tx_id;
+
+/* Find the next TX id which isn't either XBT_NULL or in use. */
+do {
+tx_id = ++s->last_tx;
+} while (tx_id == XBT_NULL || tx_id == s->root_tx ||
+ g_hash_table_lookup(s->transactions, GINT_TO_POINTER(tx_id)));
+
+/*
+ * It is vanishingly unlikely, but ensure that no outstanding transaction
+ * is based on the (previous incarnation of the) newly-allocated TX id.
+ */
+g_hash_table_foreach(s->transactions, nobble_tx, &tx_id);
+
+return tx_id;
+}
+
 static inline XsNode *xs_node_new(void)
 {
 XsNode *n = g_new0(XsNode, 1);
@@ -159,6 +202,7 @@ struct walk_op {
 
 GList *watches;
 unsigned int dom_id;
+unsigned int tx_id;
 
 /* The number of nodes which will exist in the tree if this op succeeds. */
 unsigned int new_nr_nodes;
@@ -176,6 +220,7 @@ struct walk_op {
 bool inplace;
 bool mutating;
 bool create_dirs;
+bool in_transaction;
 };
 
 static void fire_watches(struct walk_op *op, bool parents)
@@ -183,7 +228,7 @@ static void fire_watches(struct walk_op *op, bool parents)
 GList *l = NULL;
 XsWatch *w;
 
-if (!op->mutating) {
+if (!op->mutating || op->in_transaction) {
 return;
 }
 
@@ -450,10 +495,23 @@ static int xs_node_walk(XsNode **n, struct walk_op *op)
 assert(!op->watches);
 /*
  * On completing the recursion back up the path walk and reaching the
- * top, assign the new node count if the operation was successful.
+ * top, assign the new node count if the operation was successful. If
+ * the main tree was changed, bump its tx ID so that outstanding
+ * transactions correctly fail. But don't bump it every time; only
+ * if it makes a difference.
  */
 if (!err && op->mutating) {
-op->s->nr_nodes = op->new_nr_nodes;
+if (!op->in_transaction) {
+if (op->s->root_tx != op->s->last_tx) {
+op->s->root_tx = next_tx(op->s);
+}
+op->s->nr_nodes = op->new_nr_nodes;
+} else {
+XsTransaction *tx = g_hash_table_lookup(op->s->transactions,
+
GINT_TO_POINTER(op->tx_id));
+assert(tx);
+tx->nr_nodes = op->new_nr_nodes;
+}
 }
 }
 return err;
@@ -535,14 +593,23 @@ static int init_walk_op(XenstoreImplState *s, struct 
walk_op *op,
 op->inplace = true;
 op->mutating = false;
 op->create_dirs = false;
+op->in_transaction = false;
 op->dom_id = dom_id;
+op->tx_id = tx_id;
 op->s

[RFC PATCH v1 03/25] hw/xen: Implement XenStore watches

2023-03-02 Thread David Woodhouse
From: David Woodhouse 

Starts out fairly simple: a hash table of watches based on the path.

Except there can be multiple watches on the same path, so the watch ends
up being a simple linked list, and the head of that list is in the hash
table. Which makes removal a bit of a PITA but it's not so bad; we just
special-case "I had to remove the head of the list and now I have to
replace it in / remove it from the hash table". And if we don't remove
the head, it's a simple linked-list operation.

We do need to fire watches on *deleted* nodes, so instead of just a simple
xs_node_unref() on the topmost victim, we need to recurse down and fire
watches on them all.
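
The removal special case described above looks roughly like this
(simplified: the sketch assumes the hash table owns only its keys, and it
omits freeing the watch and adjusting the per-domain watch count):

    /* Simplified sketch of removing one watch from the per-path list. */
    static void watch_remove_sketch(GHashTable *watches, const char *path,
                                    XsWatch *victim)
    {
        XsWatch *head = g_hash_table_lookup(watches, path);

        if (victim == head) {
            if (head->next) {
                /* New head: replace the value stored against this path. */
                g_hash_table_replace(watches, g_strdup(path), head->next);
            } else {
                g_hash_table_remove(watches, path);
            }
        } else {
            XsWatch *prev = head;

            while (prev->next != victim) {
                prev = prev->next;
            }
            prev->next = victim->next;  /* Plain linked-list unlink. */
        }
    }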

Signed-off-by: David Woodhouse 
---
 hw/i386/kvm/xenstore_impl.c | 253 +---
 tests/unit/test-xs-node.c   |  85 
 2 files changed, 323 insertions(+), 15 deletions(-)

diff --git a/hw/i386/kvm/xenstore_impl.c b/hw/i386/kvm/xenstore_impl.c
index 9e10a31bea..2e464af93a 100644
--- a/hw/i386/kvm/xenstore_impl.c
+++ b/hw/i386/kvm/xenstore_impl.c
@@ -37,9 +37,20 @@ typedef struct XsNode {
 #endif
 } XsNode;
 
+typedef struct XsWatch {
+struct XsWatch *next;
+xs_impl_watch_fn *cb;
+void *cb_opaque;
+char *token;
+unsigned int dom_id;
+int rel_prefix;
+} XsWatch;
+
 struct XenstoreImplState {
 XsNode *root;
 unsigned int nr_nodes;
+GHashTable *watches;
+unsigned int nr_domu_watches;
 };
 
 static inline XsNode *xs_node_new(void)
@@ -146,6 +157,7 @@ struct walk_op {
 void *op_opaque;
 void *op_opaque2;
 
+GList *watches;
 unsigned int dom_id;
 
 /* The number of nodes which will exist in the tree if this op succeeds. */
@@ -166,6 +178,35 @@ struct walk_op {
 bool create_dirs;
 };
 
+static void fire_watches(struct walk_op *op, bool parents)
+{
+GList *l = NULL;
+XsWatch *w;
+
+if (!op->mutating) {
+return;
+}
+
+if (parents) {
+l = op->watches;
+}
+
+w = g_hash_table_lookup(op->s->watches, op->path);
+while (w || l) {
+if (!w) {
+/* Fire the parent nodes from 'op' if asked to */
+w = l->data;
+l = l->next;
+continue;
+}
+
+assert(strlen(op->path) > w->rel_prefix);
+w->cb(w->cb_opaque, op->path + w->rel_prefix, w->token);
+
+w = w->next;
+}
+}
+
 static int xs_node_add_content(XsNode **n, struct walk_op *op)
 {
 GByteArray *data = op->op_opaque;
@@ -213,6 +254,8 @@ static int xs_node_get_content(XsNode **n, struct walk_op 
*op)
 static int node_rm_recurse(gpointer key, gpointer value, gpointer user_data)
 {
 struct walk_op *op = user_data;
+int path_len = strlen(op->path);
+int key_len = strlen(key);
 XsNode *n = value;
 bool this_inplace = op->inplace;
 
@@ -220,11 +263,22 @@ static int node_rm_recurse(gpointer key, gpointer value, 
gpointer user_data)
 op->inplace = 0;
 }
 
+assert(key_len + path_len + 2 <= sizeof(op->path));
+op->path[path_len] = '/';
+memcpy(op->path + path_len + 1, key, key_len + 1);
+
 if (n->children) {
 g_hash_table_foreach_remove(n->children, node_rm_recurse, op);
 }
 op->new_nr_nodes--;
 
+/*
+ * Fire watches on *this* node but not the parents because they are
+ * going to be deleted too, so the watch will fire for them anyway.
+ */
+fire_watches(op, false);
+op->path[path_len] = '\0';
+
 /*
  * Actually deleting the child here is just an optimisation; if we
  * don't then the final unref on the topmost victim will just have
@@ -238,7 +292,7 @@ static int xs_node_rm(XsNode **n, struct walk_op *op)
 {
 bool this_inplace = op->inplace;
 
-/* Keep count of the nodes in the subtree which gets deleted. */
+/* Fire watches for, and count, nodes in the subtree which get deleted */
 if ((*n)->children) {
 g_hash_table_foreach_remove((*n)->children, node_rm_recurse, op);
 }
@@ -269,9 +323,11 @@ static int xs_node_walk(XsNode **n, struct walk_op *op)
 XsNode *old = *n, *child = NULL;
 bool stole_child = false;
 bool this_inplace;
+XsWatch *watch;
 int err;
 
 namelen = strlen(op->path);
+watch = g_hash_table_lookup(op->s->watches, op->path);
 
 /* Is there a child, or do we hit the double-NUL termination? */
 if (op->path[namelen + 1]) {
@@ -292,6 +348,9 @@ static int xs_node_walk(XsNode **n, struct walk_op *op)
 if (!child_name) {
 /* This is the actual node on which the operation shall be performed */
 err = op->op_fn(n, op);
+if (!err) {
+fire_watches(op, true);
+}
 goto out;
 }
 
@@ -333,11 +392,24 @@ static int xs_node_walk(XsNode **n, struct walk_op *op)
 goto out;
 }
 
+/*
+ * If the

[RFC PATCH v1 17/25] hw/xen: Build PV backend drivers for CONFIG_XEN_BUS

2023-03-02 Thread David Woodhouse
From: David Woodhouse 

Now that we have the redirectable Xen backend operations we can build the
PV backends even without the Xen libraries.

Signed-off-by: David Woodhouse 
---
 hw/9pfs/meson.build| 2 +-
 hw/block/dataplane/meson.build | 2 +-
 hw/block/meson.build   | 2 +-
 hw/char/meson.build| 2 +-
 hw/display/meson.build | 2 +-
 hw/usb/meson.build | 2 +-
 hw/xen/meson.build | 5 -
 7 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/hw/9pfs/meson.build b/hw/9pfs/meson.build
index 12443b6ad5..fd37b7a02d 100644
--- a/hw/9pfs/meson.build
+++ b/hw/9pfs/meson.build
@@ -15,7 +15,7 @@ fs_ss.add(files(
 ))
 fs_ss.add(when: 'CONFIG_LINUX', if_true: files('9p-util-linux.c'))
 fs_ss.add(when: 'CONFIG_DARWIN', if_true: files('9p-util-darwin.c'))
-fs_ss.add(when: 'CONFIG_XEN', if_true: files('xen-9p-backend.c'))
+fs_ss.add(when: 'CONFIG_XEN_BUS', if_true: files('xen-9p-backend.c'))
 softmmu_ss.add_all(when: 'CONFIG_FSDEV_9P', if_true: fs_ss)
 
 specific_ss.add(when: 'CONFIG_VIRTIO_9P', if_true: files('virtio-9p-device.c'))
diff --git a/hw/block/dataplane/meson.build b/hw/block/dataplane/meson.build
index 12c6a264f1..78d7ac1a11 100644
--- a/hw/block/dataplane/meson.build
+++ b/hw/block/dataplane/meson.build
@@ -1,2 +1,2 @@
 specific_ss.add(when: 'CONFIG_VIRTIO_BLK', if_true: files('virtio-blk.c'))
-specific_ss.add(when: 'CONFIG_XEN', if_true: files('xen-block.c'))
+specific_ss.add(when: 'CONFIG_XEN_BUS', if_true: files('xen-block.c'))
diff --git a/hw/block/meson.build b/hw/block/meson.build
index b434d5654c..cc2a75cc50 100644
--- a/hw/block/meson.build
+++ b/hw/block/meson.build
@@ -14,7 +14,7 @@ softmmu_ss.add(when: 'CONFIG_PFLASH_CFI02', if_true: 
files('pflash_cfi02.c'))
 softmmu_ss.add(when: 'CONFIG_SSI_M25P80', if_true: files('m25p80.c'))
 softmmu_ss.add(when: 'CONFIG_SSI_M25P80', if_true: files('m25p80_sfdp.c'))
 softmmu_ss.add(when: 'CONFIG_SWIM', if_true: files('swim.c'))
-softmmu_ss.add(when: 'CONFIG_XEN', if_true: files('xen-block.c'))
+softmmu_ss.add(when: 'CONFIG_XEN_BUS', if_true: files('xen-block.c'))
 softmmu_ss.add(when: 'CONFIG_TC58128', if_true: files('tc58128.c'))
 
 specific_ss.add(when: 'CONFIG_VIRTIO_BLK', if_true: files('virtio-blk.c', 
'virtio-blk-common.c'))
diff --git a/hw/char/meson.build b/hw/char/meson.build
index 7b594f51b8..e02c60dd54 100644
--- a/hw/char/meson.build
+++ b/hw/char/meson.build
@@ -18,7 +18,7 @@ softmmu_ss.add(when: 'CONFIG_SERIAL_PCI', if_true: 
files('serial-pci.c'))
 softmmu_ss.add(when: 'CONFIG_SERIAL_PCI_MULTI', if_true: 
files('serial-pci-multi.c'))
 softmmu_ss.add(when: 'CONFIG_SHAKTI_UART', if_true: files('shakti_uart.c'))
 softmmu_ss.add(when: 'CONFIG_VIRTIO_SERIAL', if_true: 
files('virtio-console.c'))
-softmmu_ss.add(when: 'CONFIG_XEN', if_true: files('xen_console.c'))
+softmmu_ss.add(when: 'CONFIG_XEN_BUS', if_true: files('xen_console.c'))
 softmmu_ss.add(when: 'CONFIG_XILINX', if_true: files('xilinx_uartlite.c'))
 
 softmmu_ss.add(when: 'CONFIG_AVR_USART', if_true: files('avr_usart.c'))
diff --git a/hw/display/meson.build b/hw/display/meson.build
index f470179122..4191694380 100644
--- a/hw/display/meson.build
+++ b/hw/display/meson.build
@@ -14,7 +14,7 @@ softmmu_ss.add(when: 'CONFIG_PL110', if_true: 
files('pl110.c'))
 softmmu_ss.add(when: 'CONFIG_SII9022', if_true: files('sii9022.c'))
 softmmu_ss.add(when: 'CONFIG_SSD0303', if_true: files('ssd0303.c'))
 softmmu_ss.add(when: 'CONFIG_SSD0323', if_true: files('ssd0323.c'))
-softmmu_ss.add(when: 'CONFIG_XEN', if_true: files('xenfb.c'))
+softmmu_ss.add(when: 'CONFIG_XEN_BUS', if_true: files('xenfb.c'))
 
 softmmu_ss.add(when: 'CONFIG_VGA_PCI', if_true: files('vga-pci.c'))
 softmmu_ss.add(when: 'CONFIG_VGA_ISA', if_true: files('vga-isa.c'))
diff --git a/hw/usb/meson.build b/hw/usb/meson.build
index bdf34cbd3e..599dc24f0d 100644
--- a/hw/usb/meson.build
+++ b/hw/usb/meson.build
@@ -84,6 +84,6 @@ if libusb.found()
   hw_usb_modules += {'host': usbhost_ss}
 endif
 
-softmmu_ss.add(when: ['CONFIG_USB', 'CONFIG_XEN', libusb], if_true: 
files('xen-usb.c'))
+softmmu_ss.add(when: ['CONFIG_USB', 'CONFIG_XEN_BUS', libusb], if_true: 
files('xen-usb.c'))
 
 modules += { 'hw-usb': hw_usb_modules }
diff --git a/hw/xen/meson.build b/hw/xen/meson.build
index f195bbd25c..19c6aabc7c 100644
--- a/hw/xen/meson.build
+++ b/hw/xen/meson.build
@@ -1,10 +1,13 @@
-softmmu_ss.add(when: ['CONFIG_XEN', xen], if_true: files(
+softmmu_ss.add(when: ['CONFIG_XEN_BUS'], if_true: files(
   'xen-backend.c',
   'xen-bus-helper.c',
   'xen-bus.c',
   'xen-legacy-backend.c',
   'xen_devconfig.c',
   'xen_pvdev.c',
+))
+
+softmmu_ss.add(when: ['CONFIG_XEN', xen], if_true: files(
   'xen-operations.c',
 ))
 
-- 
2.39.0




[RFC PATCH v1 18/25] hw/xen: Avoid crash when backend watch fires too early

2023-03-02 Thread David Woodhouse
From: Paul Durrant 

The xen-block code ends up calling aio_poll() through blkconf_geometry(),
which means we see watch events during the indirect call to
xendev_class->realize() in xen_device_realize(). Unfortunately this call
is made before populating the initial frontend and backend device nodes
in xenstore and hence xen_block_frontend_changed() (which is called from
a watch event) fails to read the frontend's 'state' node, and hence
believes the device is being torn down. This in turn sets the backend
state to XenbusStateClosed and causes the device to be deleted before it
is fully set up, leading to the crash.
By simply moving the call to xendev_class->realize() after the initial
xenstore nodes are populated, this sorry state of affairs is avoided.

Reported-by: David Woodhouse 
Signed-off-by: Paul Durrant 
Signed-off-by: David Woodhouse 
---
 hw/xen/xen-bus.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/hw/xen/xen-bus.c b/hw/xen/xen-bus.c
index 9fe54967d4..c59850b1de 100644
--- a/hw/xen/xen-bus.c
+++ b/hw/xen/xen-bus.c
@@ -1034,13 +1034,6 @@ static void xen_device_realize(DeviceState *dev, Error 
**errp)
 goto unrealize;
 }
 
-if (xendev_class->realize) {
-xendev_class->realize(xendev, errp);
-if (*errp) {
-goto unrealize;
-}
-}
-
 xen_device_backend_printf(xendev, "frontend", "%s",
   xendev->frontend_path);
 xen_device_backend_printf(xendev, "frontend-id", "%u",
@@ -1059,6 +1052,13 @@ static void xen_device_realize(DeviceState *dev, Error 
**errp)
 xen_device_frontend_set_state(xendev, XenbusStateInitialising, true);
 }
 
+if (xendev_class->realize) {
+xendev_class->realize(xendev, errp);
+if (*errp) {
+goto unrealize;
+}
+}
+
 xendev->exit.notify = xen_device_exit;
 qemu_add_exit_notifier(&xendev->exit);
 return;
-- 
2.39.0




[RFC PATCH v1 05/25] hw/xen: Watches on XenStore transactions

2023-03-02 Thread David Woodhouse
From: David Woodhouse 

Firing watches on the nodes that still exist is relatively easy; just
walk the tree and look at the nodes with refcount of one.

Firing watches on *deleted* nodes is more fun. We add 'modified_in_tx'
and 'deleted_in_tx' flags to each node. Nodes with those flags cannot
be shared, as they will always be unique to the transaction in which
they were created.

When xs_node_walk would need to *create* a node as scaffolding and it
encounters a deleted_in_tx node, it can resurrect it simply by clearing
its deleted_in_tx flag. If that node originally had any *data*, they're
gone, and the modified_in_tx flag will have been set when it was first
deleted.

We then attempt to send appropriate watches when the transaction is
committed, properly delete the deleted_in_tx nodes, and remove the
modified_in_tx flag from the others.

Signed-off-by: David Woodhouse 
---
 hw/i386/kvm/xenstore_impl.c | 151 ++-
 tests/unit/test-xs-node.c   | 231 +++-
 2 files changed, 380 insertions(+), 2 deletions(-)

diff --git a/hw/i386/kvm/xenstore_impl.c b/hw/i386/kvm/xenstore_impl.c
index e5074ab1ec..380f8003ec 100644
--- a/hw/i386/kvm/xenstore_impl.c
+++ b/hw/i386/kvm/xenstore_impl.c
@@ -32,6 +32,8 @@ typedef struct XsNode {
 GByteArray *content;
 GHashTable *children;
 uint64_t gencnt;
+bool deleted_in_tx;
+bool modified_in_tx;
 #ifdef XS_NODE_UNIT_TEST
 gchar *name; /* debug only */
 #endif
@@ -153,6 +155,13 @@ static XsNode *xs_node_copy(XsNode *old)
 XsNode *n = xs_node_new();
 
 n->gencnt = old->gencnt;
+
+#ifdef XS_NODE_UNIT_TEST
+if (n->name) {
+n->name = g_strdup(old->name);
+}
+#endif
+
 if (old->children) {
 n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
 (GDestroyNotify)xs_node_unref);
@@ -221,6 +230,9 @@ struct walk_op {
 bool mutating;
 bool create_dirs;
 bool in_transaction;
+
+/* Tracking during recursion so we know which is first. */
+bool deleted_in_tx;
 };
 
 static void fire_watches(struct walk_op *op, bool parents)
@@ -277,6 +289,9 @@ static int xs_node_add_content(XsNode **n, struct walk_op 
*op)
 g_byte_array_unref((*n)->content);
 }
 (*n)->content = g_byte_array_ref(data);
+if (op->tx_id != XBT_NULL) {
+(*n)->modified_in_tx = true;
+}
 return 0;
 }
 
@@ -333,10 +348,62 @@ static int node_rm_recurse(gpointer key, gpointer value, 
gpointer user_data)
 return this_inplace;
 }
 
+static XsNode *xs_node_copy_deleted(XsNode *old, struct walk_op *op);
+static void copy_deleted_recurse(gpointer key, gpointer value,
+ gpointer user_data)
+{
+struct walk_op *op = user_data;
+GHashTable *siblings = op->op_opaque2;
+XsNode *n = xs_node_copy_deleted(value, op);
+
+/*
+ * Reinsert the deleted_in_tx copy of the node into the parent's
+ * 'children' hash table. Having stashed it from op->op_opaque2
+ * before the recursive call to xs_node_copy_deleted() scribbled
+ * over it.
+ */
+g_hash_table_insert(siblings, g_strdup(key), n);
+}
+
+static XsNode *xs_node_copy_deleted(XsNode *old, struct walk_op *op)
+{
+XsNode *n = xs_node_new();
+
+n->gencnt = old->gencnt;
+
+#ifdef XS_NODE_UNIT_TEST
+if (old->name) {
+n->name = g_strdup(old->name);
+}
+#endif
+
+if (old->children) {
+n->children = g_hash_table_new_full(g_str_hash, g_str_equal, g_free,
+(GDestroyNotify)xs_node_unref);
+op->op_opaque2 = n->children;
+g_hash_table_foreach(old->children, copy_deleted_recurse, op);
+}
+n->deleted_in_tx = true;
+/* If it gets resurrected we only fire a watch if it lost its content */
+if (old->content) {
+n->modified_in_tx = true;
+}
+op->new_nr_nodes--;
+return n;
+}
+
 static int xs_node_rm(XsNode **n, struct walk_op *op)
 {
 bool this_inplace = op->inplace;
 
+if (op->tx_id != XBT_NULL) {
+/* It's not trivial to do inplace handling for this one */
+XsNode *old = *n;
+*n = xs_node_copy_deleted(old, op);
+xs_node_unref(old);
+return 0;
+}
+
 /* Fire watches for, and count, nodes in the subtree which get deleted */
 if ((*n)->children) {
 g_hash_table_foreach_remove((*n)->children, node_rm_recurse, op);
@@ -408,6 +475,10 @@ static int xs_node_walk(XsNode **n, struct walk_op *op)
 }
 
 if (child) {
+if (child->deleted_in_tx) {
+assert(child->ref == 1);
+/* Cannot actually set child->deleted_in_tx = false until later */
+}
 xs_node_ref(child);
 /*
  * Now we own it too. But if we can modify inplace, that&

[RFC PATCH v1 08/25] hw/xen: Create initial XenStore nodes

2023-03-02 Thread David Woodhouse
From: Paul Durrant 

Signed-off-by: Paul Durrant 
Signed-off-by: David Woodhouse 
---
 hw/i386/kvm/xen_xenstore.c | 70 ++
 1 file changed, 70 insertions(+)

diff --git a/hw/i386/kvm/xen_xenstore.c b/hw/i386/kvm/xen_xenstore.c
index 1b1358ad4c..5a8e38aae7 100644
--- a/hw/i386/kvm/xen_xenstore.c
+++ b/hw/i386/kvm/xen_xenstore.c
@@ -76,9 +76,39 @@ struct XenXenstoreState *xen_xenstore_singleton;
 static void xen_xenstore_event(void *opaque);
 static void fire_watch_cb(void *opaque, const char *path, const char *token);
 
+static void G_GNUC_PRINTF (4, 5) relpath_printf(XenXenstoreState *s,
+GList *perms,
+const char *relpath,
+const char *fmt, ...)
+{
+gchar *abspath;
+gchar *value;
+va_list args;
+GByteArray *data;
+int err;
+
+abspath = g_strdup_printf("/local/domain/%u/%s", xen_domid, relpath);
+va_start(args, fmt);
+value = g_strdup_vprintf(fmt, args);
+va_end(args);
+
+data = g_byte_array_new_take((void *)value, strlen(value));
+
+err = xs_impl_write(s->impl, DOMID_QEMU, XBT_NULL, abspath, data);
+assert(!err);
+
+g_byte_array_unref(data);
+
+err = xs_impl_set_perms(s->impl, DOMID_QEMU, XBT_NULL, abspath, perms);
+assert(!err);
+
+g_free(abspath);
+}
+
 static void xen_xenstore_realize(DeviceState *dev, Error **errp)
 {
 XenXenstoreState *s = XEN_XENSTORE(dev);
+GList *perms;
 
 if (xen_mode != XEN_EMULATE) {
 error_setg(errp, "Xen xenstore support is for Xen emulation");
@@ -102,6 +132,46 @@ static void xen_xenstore_realize(DeviceState *dev, Error 
**errp)
xen_xenstore_event, NULL, NULL, NULL, s);
 
 s->impl = xs_impl_create(xen_domid);
+
+/* Populate the default nodes */
+
+/* Nodes owned by 'dom0' but readable by the guest */
+perms = g_list_append(NULL, xs_perm_as_string(XS_PERM_NONE, DOMID_QEMU));
+perms = g_list_append(perms, xs_perm_as_string(XS_PERM_READ, xen_domid));
+
+relpath_printf(s, perms, "", "%s", "");
+
+relpath_printf(s, perms, "domid", "%u", xen_domid);
+
+relpath_printf(s, perms, "control/platform-feature-xs_reset_watches", 
"%u", 1);
+relpath_printf(s, perms, 
"control/platform-feature-multiprocessor-suspend", "%u", 1);
+
+relpath_printf(s, perms, "platform/acpi", "%u", 1);
+relpath_printf(s, perms, "platform/acpi_s3", "%u", 1);
+relpath_printf(s, perms, "platform/acpi_s4", "%u", 1);
+relpath_printf(s, perms, "platform/acpi_laptop_slate", "%u", 0);
+
+g_list_free_full(perms, g_free);
+
+/* Nodes owned by the guest */
+perms = g_list_append(NULL, xs_perm_as_string(XS_PERM_NONE, xen_domid));
+
+relpath_printf(s, perms, "attr", "%s", "");
+
+relpath_printf(s, perms, "control/shutdown", "%s", "");
+relpath_printf(s, perms, "control/feature-poweroff", "%u", 1);
+relpath_printf(s, perms, "control/feature-reboot", "%u", 1);
+relpath_printf(s, perms, "control/feature-suspend", "%u", 1);
+relpath_printf(s, perms, "control/feature-s3", "%u", 1);
+relpath_printf(s, perms, "control/feature-s4", "%u", 1);
+
+relpath_printf(s, perms, "data", "%s", "");
+relpath_printf(s, perms, "device", "%s", "");
+relpath_printf(s, perms, "drivers", "%s", "");
+relpath_printf(s, perms, "error", "%s", "");
+relpath_printf(s, perms, "feature", "%s", "");
+
+g_list_free_full(perms, g_free);
 }
 
 static bool xen_xenstore_is_needed(void *opaque)
-- 
2.39.0




[RFC PATCH v1 15/25] hw/xen: Use XEN_PAGE_SIZE in PV backend drivers

2023-03-02 Thread David Woodhouse
From: David Woodhouse 

XC_PAGE_SIZE comes from the actual Xen libraries, while XEN_PAGE_SIZE is
provided by QEMU itself in xen_backend_ops.h. For backends which may be
built for emulation mode, use the latter.

Signed-off-by: David Woodhouse 
---
 hw/block/dataplane/xen-block.c |  8 
 hw/display/xenfb.c | 12 ++--
 hw/net/xen_nic.c   | 12 ++--
 hw/usb/xen-usb.c   |  8 
 4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/hw/block/dataplane/xen-block.c b/hw/block/dataplane/xen-block.c
index e55b713002..8322a1de82 100644
--- a/hw/block/dataplane/xen-block.c
+++ b/hw/block/dataplane/xen-block.c
@@ -101,9 +101,9 @@ static XenBlockRequest 
*xen_block_start_request(XenBlockDataPlane *dataplane)
  * re-use requests, allocate the memory once here. It will be freed
  * xen_block_dataplane_destroy() when the request list is freed.
  */
-request->buf = qemu_memalign(XC_PAGE_SIZE,
+request->buf = qemu_memalign(XEN_PAGE_SIZE,
  BLKIF_MAX_SEGMENTS_PER_REQUEST *
- XC_PAGE_SIZE);
+ XEN_PAGE_SIZE);
 dataplane->requests_total++;
 qemu_iovec_init(&request->v, 1);
 } else {
@@ -185,7 +185,7 @@ static int xen_block_parse_request(XenBlockRequest *request)
 goto err;
 }
 if (request->req.seg[i].last_sect * dataplane->sector_size >=
-XC_PAGE_SIZE) {
+XEN_PAGE_SIZE) {
 error_report("error: page crossing");
 goto err;
 }
@@ -740,7 +740,7 @@ void xen_block_dataplane_start(XenBlockDataPlane *dataplane,
 
 dataplane->protocol = protocol;
 
-ring_size = XC_PAGE_SIZE * dataplane->nr_ring_ref;
+ring_size = XEN_PAGE_SIZE * dataplane->nr_ring_ref;
 switch (dataplane->protocol) {
 case BLKIF_PROTOCOL_NATIVE:
 {
diff --git a/hw/display/xenfb.c b/hw/display/xenfb.c
index 2c4016fcbd..0074a9b6f8 100644
--- a/hw/display/xenfb.c
+++ b/hw/display/xenfb.c
@@ -489,13 +489,13 @@ static int xenfb_map_fb(struct XenFB *xenfb)
 }
 
 if (xenfb->pixels) {
-munmap(xenfb->pixels, xenfb->fbpages * XC_PAGE_SIZE);
+munmap(xenfb->pixels, xenfb->fbpages * XEN_PAGE_SIZE);
 xenfb->pixels = NULL;
 }
 
-xenfb->fbpages = DIV_ROUND_UP(xenfb->fb_len, XC_PAGE_SIZE);
+xenfb->fbpages = DIV_ROUND_UP(xenfb->fb_len, XEN_PAGE_SIZE);
 n_fbdirs = xenfb->fbpages * mode / 8;
-n_fbdirs = DIV_ROUND_UP(n_fbdirs, XC_PAGE_SIZE);
+n_fbdirs = DIV_ROUND_UP(n_fbdirs, XEN_PAGE_SIZE);
 
 pgmfns = g_new0(xen_pfn_t, n_fbdirs);
 fbmfns = g_new0(xen_pfn_t, xenfb->fbpages);
@@ -528,8 +528,8 @@ static int xenfb_configure_fb(struct XenFB *xenfb, size_t 
fb_len_lim,
 {
 size_t mfn_sz = sizeof_field(struct xenfb_page, pd[0]);
 size_t pd_len = sizeof_field(struct xenfb_page, pd) / mfn_sz;
-size_t fb_pages = pd_len * XC_PAGE_SIZE / mfn_sz;
-size_t fb_len_max = fb_pages * XC_PAGE_SIZE;
+size_t fb_pages = pd_len * XEN_PAGE_SIZE / mfn_sz;
+size_t fb_len_max = fb_pages * XEN_PAGE_SIZE;
 int max_width, max_height;
 
 if (fb_len_lim > fb_len_max) {
@@ -930,7 +930,7 @@ static void fb_disconnect(struct XenLegacyDevice *xendev)
  *   instead.  This releases the guest pages and keeps qemu happy.
  */
 qemu_xen_foreignmem_unmap(fb->pixels, fb->fbpages);
-fb->pixels = mmap(fb->pixels, fb->fbpages * XC_PAGE_SIZE,
+fb->pixels = mmap(fb->pixels, fb->fbpages * XEN_PAGE_SIZE,
   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON,
   -1, 0);
 if (fb->pixels == MAP_FAILED) {
diff --git a/hw/net/xen_nic.c b/hw/net/xen_nic.c
index 166d03787d..9bbf6599fc 100644
--- a/hw/net/xen_nic.c
+++ b/hw/net/xen_nic.c
@@ -145,7 +145,7 @@ static void net_tx_packets(struct XenNetDev *netdev)
 continue;
 }
 
-if ((txreq.offset + txreq.size) > XC_PAGE_SIZE) {
+if ((txreq.offset + txreq.size) > XEN_PAGE_SIZE) {
 xen_pv_printf(&netdev->xendev, 0, "error: page crossing\n");
 net_tx_error(netdev, &txreq, rc);
 continue;
@@ -171,7 +171,7 @@ static void net_tx_packets(struct XenNetDev *netdev)
 if (txreq.flags & NETTXF_csum_blank) {
 /* have read-only mapping -> can't fill checksum in-place */
 if (!tmpbuf) {
-tmpbuf = g_malloc(XC_PAGE_SIZE);
+tmpbuf = g_malloc(XEN_PAGE_SIZE);
 }
 memcpy(tmpbuf, page + txreq.offset, txreq.size);
 net_checksum_calculate(tmpbuf, txreq.size, CSUM_ALL);
@@ -243,9 +243,9 @@ static ssize_t net_rx_packet(NetClientState *nc, const 
u

Re: [PATCH] hw/intc/i8259: Implement legacy LTIM Edge/Level Bank Select

2023-03-02 Thread David Woodhouse
On Thu, 2023-03-02 at 09:06 +, David Woodhouse wrote:
> Back in the mists of time, before IBM PS/2 came along with MCA and added
> per-pin level control in the ELCR register, the i8259 had a chip-wide
> level-mode control in bit 3 of ICW1.

Actually... I think MCA might have been level triggered system-wide,
and still used this LTIM bit. Per-pin control via ELCR probably came in
with EISA?




[PATCH] hw/intc/i8259: Implement legacy LTIM Edge/Level Bank Select

2023-03-02 Thread David Woodhouse
Back in the mists of time, before IBM PS/2 came along with MCA and added
per-pin level control in the ELCR register, the i8259 had a chip-wide
level-mode control in bit 3 of ICW1.

Even in the PIIX3 datasheet from 1996 this is documented as 'This bit is
disabled', but apparently MorphOS is using it in the version of the
i8259 which is in the Pegasos2 board as part of the vt82c686 chipset.

It's easy enough to implement, and I think it's harmless enough to do so
unconditionally.

Signed-off-by: David Woodhouse 
---
 hw/intc/i8259.c | 10 --
 hw/intc/i8259_common.c  | 24 +++-
 include/hw/isa/i8259_internal.h |  1 +
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/hw/intc/i8259.c b/hw/intc/i8259.c
index 17910f3bcb..bbae2d87f4 100644
--- a/hw/intc/i8259.c
+++ b/hw/intc/i8259.c
@@ -133,7 +133,7 @@ static void pic_set_irq(void *opaque, int irq, int level)
 }
 #endif
 
-if (s->elcr & mask) {
+if (s->ltim || (s->elcr & mask)) {
 /* level triggered */
 if (level) {
 s->irr |= mask;
@@ -167,7 +167,7 @@ static void pic_intack(PICCommonState *s, int irq)
 s->isr |= (1 << irq);
 }
 /* We don't clear a level sensitive interrupt here */
-if (!(s->elcr & (1 << irq))) {
+if (!s->ltim && !(s->elcr & (1 << irq))) {
 s->irr &= ~(1 << irq);
 }
 pic_update_irq(s);
@@ -224,6 +224,7 @@ static void pic_reset(DeviceState *dev)
 PICCommonState *s = PIC_COMMON(dev);
 
 s->elcr = 0;
+s->ltim = 0;
 pic_init_reset(s);
 }
 
@@ -243,10 +244,7 @@ static void pic_ioport_write(void *opaque, hwaddr addr64,
 s->init_state = 1;
 s->init4 = val & 1;
 s->single_mode = val & 2;
-if (val & 0x08) {
-qemu_log_mask(LOG_UNIMP,
-  "i8259: level sensitive irq not supported\n");
-}
+s->ltim = val & 8;
 } else if (val & 0x08) {
 if (val & 0x04) {
 s->poll = 1;
diff --git a/hw/intc/i8259_common.c b/hw/intc/i8259_common.c
index af2e4a2241..c931dc2d07 100644
--- a/hw/intc/i8259_common.c
+++ b/hw/intc/i8259_common.c
@@ -51,7 +51,7 @@ void pic_reset_common(PICCommonState *s)
 s->special_fully_nested_mode = 0;
 s->init4 = 0;
 s->single_mode = 0;
-/* Note: ELCR is not reset */
+/* Note: ELCR and LTIM are not reset */
 }
 
 static int pic_dispatch_pre_save(void *opaque)
@@ -144,6 +144,24 @@ static void pic_print_info(InterruptStatsProvider *obj, 
Monitor *mon)
s->special_fully_nested_mode);
 }
 
+static bool ltim_state_needed(void *opaque)
+{
+PICCommonState *s = PIC_COMMON(opaque);
+
+return !!s->ltim;
+}
+
+static const VMStateDescription vmstate_pic_ltim = {
+.name = "i8259/ltim",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = ltim_state_needed,
+.fields = (VMStateField[]) {
+VMSTATE_UINT8(ltim, PICCommonState),
+VMSTATE_END_OF_LIST()
+}
+};
+
 static const VMStateDescription vmstate_pic_common = {
 .name = "i8259",
 .version_id = 1,
@@ -168,6 +186,10 @@ static const VMStateDescription vmstate_pic_common = {
 VMSTATE_UINT8(single_mode, PICCommonState),
 VMSTATE_UINT8(elcr, PICCommonState),
 VMSTATE_END_OF_LIST()
+},
+.subsections = (const VMStateDescription*[]) {
+&vmstate_pic_ltim,
+NULL
 }
 };
 
diff --git a/include/hw/isa/i8259_internal.h b/include/hw/isa/i8259_internal.h
index 155b098452..f9dcc4163e 100644
--- a/include/hw/isa/i8259_internal.h
+++ b/include/hw/isa/i8259_internal.h
@@ -61,6 +61,7 @@ struct PICCommonState {
 uint8_t single_mode; /* true if slave pic is not initialized */
 uint8_t elcr; /* PIIX edge/trigger selection*/
 uint8_t elcr_mask;
+uint8_t ltim; /* Edge/Level Bank Select (pre-PIIX, chip-wide) */
 qemu_irq int_out[1];
 uint32_t master; /* reflects /SP input pin */
 uint32_t iobase;
-- 
2.39.0




Re: [PATCH v5 5/7] hw/isa/vt82c686: Work around missing level sensitive irq in i8259 model

2023-03-02 Thread David Woodhouse
On Wed, 2023-03-01 at 23:47 +0100, BALATON Zoltan wrote:
> On Wed, 1 Mar 2023, David Woodhouse wrote:
> > On Wed, 2023-03-01 at 19:01 +0100, BALATON Zoltan wrote:
> > > 
> > > > It isn't a *correct* fix without a little bit more typing, but does
> > > > this make it work?
> > > > 
> > > > diff --git a/hw/intc/i8259.c b/hw/intc/i8259.c
> > > > index 17910f3bcb..36ebcff025 100644
> > > > --- a/hw/intc/i8259.c
> > > > +++ b/hw/intc/i8259.c
> > > > @@ -246,6 +246,7 @@ static void pic_ioport_write(void *opaque, hwaddr 
> > > > addr64,
> > > >  if (val & 0x08) {
> > > >  qemu_log_mask(LOG_UNIMP,
> > > >    "i8259: level sensitive irq not 
> > > > supported\n");
> > > > +    s->elcr = 0xff;
> > > 
> > > This works too. I guess the log can be then removed too. Could you submit
> > > a proper patch or you want me to do that so we can drop the workaround for
> > > it? Thanks for looking into it.
> > 
> > 
> > Happy for you to do the rest of the typing ... :)
> 
> I don't mind the typing but this is quite a bit more involved than I 
> expected. You've lost me at the vmstate stuff which I don't quite know how 
> to change or test. If these were stored as bits in an ICW1 register as 
> described by the docs I've looked at now then no change in migration would 
> be needed but this isn't how it seems to be in QEMU so I give up on that 
> in this case. Could you please do the typing then?

Yeah, it is a bit weird that we store the ICW1 byte in *separate*
elements of persistent state, each of *them* a uint8_t which holds only
a single bit of ICW1:

s->init4 = val & 1;
s->single_mode = val & 2;
s->ltim = val & 8;

Still, it's a pattern that's easy enough to follow.

As for the rest of the typing, I'd basically done the bulk of it
already when showing how to adjust the existing (s->elcr&mask) checks;
there was only one more of those to type.

And then the vmstate part is basically just a cut and paste of the bit
in docs/devel/migration.rst which tells you exactly how to do it.

Patch follows. It builds, but I'll let you do the actual testing,
including migration to/from the new version, checking with
scripts/analyze-migration.py that the ltim is there when it should be,
and isn't when it shouldn't, and any other review feedback.










Re: [PATCH v5 5/7] hw/isa/vt82c686: Work around missing level sensitive irq in i8259 model

2023-03-01 Thread David Woodhouse
On Wed, 2023-03-01 at 19:01 +0100, BALATON Zoltan wrote:
> 
> > It isn't a *correct* fix without a little bit more typing, but does
> > this make it work?
> > 
> > diff --git a/hw/intc/i8259.c b/hw/intc/i8259.c
> > index 17910f3bcb..36ebcff025 100644
> > --- a/hw/intc/i8259.c
> > +++ b/hw/intc/i8259.c
> > @@ -246,6 +246,7 @@ static void pic_ioport_write(void *opaque, hwaddr 
> > addr64,
> >  if (val & 0x08) {
> >  qemu_log_mask(LOG_UNIMP,
> >    "i8259: level sensitive irq not 
> > supported\n");
> > +    s->elcr = 0xff;
> 
> This works too. I guess the log can be then removed too. Could you submit 
> a proper patch or you want me to do that so we can drop the workaround for 
> it? Thanks for looking into it.


Happy for you to do the rest of the typing ... :)

So, *ideally* I think you need to introduce a new field in the
PICCommonState which records the status of the LTIM bit. And fix up the
vmstate_pic_common in hw/intc/i8259_common.c to save and restore that
(with versioning for upgrade/downgrade).

Then you find those places which currently check the bit for the
specific pin in s->elcr, and make them something like:

--- a/hw/intc/i8259.c
+++ b/hw/intc/i8259.c
@@ -133,7 +133,7 @@ static void pic_set_irq(void *opaque, int irq, int level)
 }
 #endif
 
-if (s->elcr & mask) {
+if (s->ltim || (s->elcr & mask)) {
 /* level triggered */
 if (level) {
 s->irr |= mask;

It *might* be that you should make the LTIM behaviour optional, so that
only certain incarnations of the i8259 actually get it at all and it
*wouldn't* take effect if a guest tried to set it, which is what the
PIIX3 datasheet implies. But I suspect we can get away without that.





[PATCH v15 32/60] hw/xen: Implement EVTCHNOP_bind_virq

2023-03-01 Thread David Woodhouse
From: David Woodhouse 

Add the array of virq ports to each vCPU so that we can deliver timers,
debug ports, etc. Global virqs are allocated against vCPU 0 initially,
but can be migrated to other vCPUs (when we implement that).

The kernel needs to know about VIRQ_TIMER in order to accelerate timers,
so tell it via KVM_XEN_VCPU_ATTR_TYPE_TIMER. Also save/restore the value
of the singleshot timer across migration, as the kernel will handle the
hypercalls automatically now.

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/i386/kvm/xen_evtchn.c  | 85 
 hw/i386/kvm/xen_evtchn.h  |  2 +
 include/sysemu/kvm_xen.h  |  1 +
 target/i386/cpu.h |  4 ++
 target/i386/kvm/xen-emu.c | 91 +++
 target/i386/machine.c |  2 +
 6 files changed, 185 insertions(+)

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index 54707f1f9f..a3202d39ab 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -244,6 +244,11 @@ static bool valid_port(evtchn_port_t port)
 }
 }
 
+static bool valid_vcpu(uint32_t vcpu)
+{
+return !!qemu_get_cpu(vcpu);
+}
+
 int xen_evtchn_status_op(struct evtchn_status *status)
 {
 XenEvtchnState *s = xen_evtchn_singleton;
@@ -496,6 +501,43 @@ static void free_port(XenEvtchnState *s, evtchn_port_t 
port)
 clear_port_pending(s, port);
 }
 
+static int allocate_port(XenEvtchnState *s, uint32_t vcpu, uint16_t type,
+ uint16_t val, evtchn_port_t *port)
+{
+evtchn_port_t p = 1;
+
+for (p = 1; valid_port(p); p++) {
+if (s->port_table[p].type == EVTCHNSTAT_closed) {
+s->port_table[p].vcpu = vcpu;
+s->port_table[p].type = type;
+s->port_table[p].type_val = val;
+
+*port = p;
+
+if (s->nr_ports < p + 1) {
+s->nr_ports = p + 1;
+}
+
+return 0;
+}
+}
+return -ENOSPC;
+}
+
+static bool virq_is_global(uint32_t virq)
+{
+switch (virq) {
+case VIRQ_TIMER:
+case VIRQ_DEBUG:
+case VIRQ_XENOPROF:
+case VIRQ_XENPMU:
+return false;
+
+default:
+return true;
+}
+}
+
 static int close_port(XenEvtchnState *s, evtchn_port_t port)
 {
 XenEvtchnPort *p = &s->port_table[port];
@@ -504,6 +546,11 @@ static int close_port(XenEvtchnState *s, evtchn_port_t 
port)
 case EVTCHNSTAT_closed:
 return -ENOENT;
 
+case EVTCHNSTAT_virq:
+kvm_xen_set_vcpu_virq(virq_is_global(p->type_val) ? 0 : p->vcpu,
+  p->type_val, 0);
+break;
+
 default:
 break;
 }
@@ -555,3 +602,41 @@ int xen_evtchn_unmask_op(struct evtchn_unmask *unmask)
 
 return ret;
 }
+
+int xen_evtchn_bind_virq_op(struct evtchn_bind_virq *virq)
+{
+XenEvtchnState *s = xen_evtchn_singleton;
+int ret;
+
+if (!s) {
+return -ENOTSUP;
+}
+
+if (virq->virq >= NR_VIRQS) {
+return -EINVAL;
+}
+
+/* Global VIRQ must be allocated on vCPU0 first */
+if (virq_is_global(virq->virq) && virq->vcpu != 0) {
+return -EINVAL;
+}
+
+if (!valid_vcpu(virq->vcpu)) {
+return -ENOENT;
+}
+
+qemu_mutex_lock(&s->port_lock);
+
+ret = allocate_port(s, virq->vcpu, EVTCHNSTAT_virq, virq->virq,
+&virq->port);
+if (!ret) {
+ret = kvm_xen_set_vcpu_virq(virq->vcpu, virq->virq, virq->port);
+if (ret) {
+free_port(s, virq->port);
+}
+}
+
+qemu_mutex_unlock(&s->port_lock);
+
+return ret;
+}
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index 69c6b0d743..0ea13dda3a 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -18,8 +18,10 @@ int xen_evtchn_set_callback_param(uint64_t param);
 struct evtchn_status;
 struct evtchn_close;
 struct evtchn_unmask;
+struct evtchn_bind_virq;
 int xen_evtchn_status_op(struct evtchn_status *status);
 int xen_evtchn_close_op(struct evtchn_close *close);
 int xen_evtchn_unmask_op(struct evtchn_unmask *unmask);
+int xen_evtchn_bind_virq_op(struct evtchn_bind_virq *virq);
 
 #endif /* QEMU_XEN_EVTCHN_H */
diff --git a/include/sysemu/kvm_xen.h b/include/sysemu/kvm_xen.h
index 644c7d889c..fbb7414eb7 100644
--- a/include/sysemu/kvm_xen.h
+++ b/include/sysemu/kvm_xen.h
@@ -23,6 +23,7 @@ int kvm_xen_soft_reset(void);
 uint32_t kvm_xen_get_caps(void);
 void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id);
 void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type);
+int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port);
 
 #define kvm_xen_has_cap(cap) (!!(kvm_xen_get_caps() &   \
  KVM_XEN_HVM_CONFIG_ ## cap))
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 1c7603221d..4b70257db5 100644
--- a/target/i386/c

[PATCH v15 47/60] i386/xen: handle PV timer hypercalls

2023-03-01 Thread David Woodhouse
From: Joao Martins 

Introduce support for one-shot and periodic modes of Xen PV timers,
whereby timer interrupts come through a special virq event channel
with deadlines being set through:

1) set_timer_op hypercall (only oneshot)
2) vcpu_op hypercall for {set,stop}_{singleshot,periodic}_timer
hypercalls
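
For context, a guest arms these along roughly the following lines (a sketch
using the Linux hypercall wrapper names; now_ns, delta_ns and cpu are
placeholders, and error handling is trimmed):

    /*
     * Deadlines are absolute Xen system time in nanoseconds; expiry is
     * delivered as a VIRQ_TIMER event on the bound event channel.
     */
    struct vcpu_set_singleshot_timer single = {
        .timeout_abs_ns = now_ns + delta_ns,
        .flags = 0,
    };

    if (HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single) < 0) {
        /* Older fallback: set_timer_op() programs a one-shot deadline too. */
        HYPERVISOR_set_timer_op(now_ns + delta_ns);
    }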

Signed-off-by: Joao Martins 
Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/i386/kvm/xen_evtchn.c  |  31 +
 hw/i386/kvm/xen_evtchn.h  |   2 +
 target/i386/cpu.h |   5 +
 target/i386/kvm/xen-emu.c | 271 +-
 target/i386/machine.c |   1 +
 5 files changed, 308 insertions(+), 2 deletions(-)

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index f39b751ff9..71ed480189 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -1222,6 +1222,37 @@ int xen_evtchn_send_op(struct evtchn_send *send)
 return ret;
 }
 
+int xen_evtchn_set_port(uint16_t port)
+{
+XenEvtchnState *s = xen_evtchn_singleton;
+XenEvtchnPort *p;
+int ret = -EINVAL;
+
+if (!s) {
+return -ENOTSUP;
+}
+
+if (!valid_port(port)) {
+return -EINVAL;
+}
+
+qemu_mutex_lock(&s->port_lock);
+
+p = &s->port_table[port];
+
+/* QEMU has no business sending to anything but these */
+if (p->type == EVTCHNSTAT_virq ||
+(p->type == EVTCHNSTAT_interdomain &&
+ (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU))) {
+set_port_pending(s, port);
+ret = 0;
+}
+
+qemu_mutex_unlock(&s->port_lock);
+
+return ret;
+}
+
 EvtchnInfoList *qmp_xen_event_list(Error **errp)
 {
 XenEvtchnState *s = xen_evtchn_singleton;
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index b03c3108bc..24611478b8 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -20,6 +20,8 @@ int xen_evtchn_set_callback_param(uint64_t param);
 void xen_evtchn_connect_gsis(qemu_irq *system_gsis);
 void xen_evtchn_set_callback_level(int level);
 
+int xen_evtchn_set_port(uint16_t port);
+
 struct evtchn_status;
 struct evtchn_close;
 struct evtchn_unmask;
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 7227a8ec08..d243e290d3 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -26,6 +26,7 @@
 #include "exec/cpu-defs.h"
 #include "qapi/qapi-types-common.h"
 #include "qemu/cpu-float.h"
+#include "qemu/timer.h"
 
 #define XEN_NR_VIRQS 24
 
@@ -1811,6 +1812,10 @@ typedef struct CPUArchState {
 bool xen_callback_asserted;
 uint16_t xen_virq[XEN_NR_VIRQS];
 uint64_t xen_singleshot_timer_ns;
+QEMUTimer *xen_singleshot_timer;
+uint64_t xen_periodic_timer_period;
+QEMUTimer *xen_periodic_timer;
+QemuMutex xen_timers_lock;
 #endif
 #if defined(CONFIG_HVF)
 HVFX86LazyFlags hvf_lflags;
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index 3b46cab1da..c210ff9d91 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -38,6 +38,9 @@
 
 #include "xen-compat.h"
 
+static void xen_vcpu_singleshot_timer_event(void *opaque);
+static void xen_vcpu_periodic_timer_event(void *opaque);
+
 #ifdef TARGET_X86_64
 #define hypercall_compat32(longmode) (!(longmode))
 #else
@@ -201,6 +204,23 @@ int kvm_xen_init_vcpu(CPUState *cs)
 env->xen_vcpu_time_info_gpa = INVALID_GPA;
 env->xen_vcpu_runstate_gpa = INVALID_GPA;
 
+qemu_mutex_init(&env->xen_timers_lock);
+env->xen_singleshot_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
+ xen_vcpu_singleshot_timer_event,
+ cpu);
+if (!env->xen_singleshot_timer) {
+return -ENOMEM;
+}
+env->xen_singleshot_timer->opaque = cs;
+
+env->xen_periodic_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
+   xen_vcpu_periodic_timer_event,
+   cpu);
+if (!env->xen_periodic_timer) {
+return -ENOMEM;
+}
+env->xen_periodic_timer->opaque = cs;
+
 return 0;
 }
 
@@ -232,7 +252,8 @@ static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit 
*exit, X86CPU *cpu,
  1 << XENFEAT_writable_descriptor_tables |
  1 << XENFEAT_auto_translated_physmap |
  1 << XENFEAT_supervisor_mode_kernel |
- 1 << XENFEAT_hvm_callback_vector;
+ 1 << XENFEAT_hvm_callback_vector |
+ 1 << XENFEAT_hvm_safe_pvclock;
 }
 
 err = kvm_copy_to_gva(CPU(cpu), arg, &fi, sizeof(fi));
@@ -878,13 +899,208 @@ static int vcpuop_register_runstate_info(CPUState *cs, 
CPUState *target,
 return 0;
 }
 
+static uint64_t kvm_get_current_ns(void)
+{
+struct kvm_clock_data data;
+int ret;
+
+ 

[PATCH v15 16/60] i386/xen: manage and save/restore Xen guest long_mode setting

2023-03-01 Thread David Woodhouse
From: David Woodhouse 

Xen will "latch" the guest's 32-bit or 64-bit ("long mode") setting when
the guest writes the MSR to fill in the hypercall page, or when the guest
sets the event channel callback in HVM_PARAM_CALLBACK_IRQ.

KVM handles the former and sets the kernel's long_mode flag accordingly.
The latter will be handled in userspace. Keep them in sync by noticing
when a hypercall is made in a mode that doesn't match qemu's idea of
the guest mode, and resyncing from the kernel. Do that same sync right
before serialization too, in case the guest has set the hypercall page
but hasn't yet made a system call.
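
The check on the hypercall path amounts to something like this (a sketch of
the idea, using the xen_is_long_mode()/xen_sync_long_mode() helpers added
in this patch):

    /* If the mode of the incoming hypercall disagrees with our cached
     * long_mode flag, refetch the kernel's latched value. */
    if (exit->u.hcall.longmode != xen_is_long_mode()) {
        xen_sync_long_mode();
    }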

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/i386/kvm/xen_overlay.c | 62 +++
 hw/i386/kvm/xen_overlay.h |  4 +++
 target/i386/kvm/xen-emu.c | 12 
 3 files changed, 78 insertions(+)

diff --git a/hw/i386/kvm/xen_overlay.c b/hw/i386/kvm/xen_overlay.c
index a2441e2b4e..8685d87959 100644
--- a/hw/i386/kvm/xen_overlay.c
+++ b/hw/i386/kvm/xen_overlay.c
@@ -44,6 +44,7 @@ struct XenOverlayState {
 MemoryRegion shinfo_mem;
 void *shinfo_ptr;
 uint64_t shinfo_gpa;
+bool long_mode;
 };
 
 struct XenOverlayState *xen_overlay_singleton;
@@ -96,9 +97,21 @@ static void xen_overlay_realize(DeviceState *dev, Error 
**errp)
 
 s->shinfo_ptr = memory_region_get_ram_ptr(&s->shinfo_mem);
 s->shinfo_gpa = INVALID_GPA;
+s->long_mode = false;
 memset(s->shinfo_ptr, 0, XEN_PAGE_SIZE);
 }
 
+static int xen_overlay_pre_save(void *opaque)
+{
+/*
+ * Fetch the kernel's idea of long_mode to avoid the race condition
+ * where the guest has set the hypercall page up in 64-bit mode but
+ * not yet made a hypercall by the time migration happens, so qemu
+ * hasn't yet noticed.
+ */
+return xen_sync_long_mode();
+}
+
 static int xen_overlay_post_load(void *opaque, int version_id)
 {
 XenOverlayState *s = opaque;
@@ -107,6 +120,9 @@ static int xen_overlay_post_load(void *opaque, int 
version_id)
 xen_overlay_do_map_page(&s->shinfo_mem, s->shinfo_gpa);
 xen_overlay_set_be_shinfo(s->shinfo_gpa >> XEN_PAGE_SHIFT);
 }
+if (s->long_mode) {
+xen_set_long_mode(true);
+}
 
 return 0;
 }
@@ -121,9 +137,11 @@ static const VMStateDescription xen_overlay_vmstate = {
 .version_id = 1,
 .minimum_version_id = 1,
 .needed = xen_overlay_is_needed,
+.pre_save = xen_overlay_pre_save,
 .post_load = xen_overlay_post_load,
 .fields = (VMStateField[]) {
 VMSTATE_UINT64(shinfo_gpa, XenOverlayState),
+VMSTATE_BOOL(long_mode, XenOverlayState),
 VMSTATE_END_OF_LIST()
 }
 };
@@ -208,3 +226,47 @@ void *xen_overlay_get_shinfo_ptr(void)
 
 return s->shinfo_ptr;
 }
+
+int xen_sync_long_mode(void)
+{
+int ret;
+struct kvm_xen_hvm_attr xa = {
+.type = KVM_XEN_ATTR_TYPE_LONG_MODE,
+};
+
+if (!xen_overlay_singleton) {
+return -ENOENT;
+}
+
+ret = kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_GET_ATTR, &xa);
+if (!ret) {
+xen_overlay_singleton->long_mode = xa.u.long_mode;
+}
+
+return ret;
+}
+
+int xen_set_long_mode(bool long_mode)
+{
+int ret;
+struct kvm_xen_hvm_attr xa = {
+.type = KVM_XEN_ATTR_TYPE_LONG_MODE,
+.u.long_mode = long_mode,
+};
+
+if (!xen_overlay_singleton) {
+return -ENOENT;
+}
+
+ret = kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &xa);
+if (!ret) {
+xen_overlay_singleton->long_mode = xa.u.long_mode;
+}
+
+return ret;
+}
+
+bool xen_is_long_mode(void)
+{
+return xen_overlay_singleton && xen_overlay_singleton->long_mode;
+}
diff --git a/hw/i386/kvm/xen_overlay.h b/hw/i386/kvm/xen_overlay.h
index 00cff05bb0..5c46a0b036 100644
--- a/hw/i386/kvm/xen_overlay.h
+++ b/hw/i386/kvm/xen_overlay.h
@@ -17,4 +17,8 @@ void xen_overlay_create(void);
 int xen_overlay_map_shinfo_page(uint64_t gpa);
 void *xen_overlay_get_shinfo_ptr(void);
 
+int xen_sync_long_mode(void);
+int xen_set_long_mode(bool long_mode);
+bool xen_is_long_mode(void);
+
 #endif /* QEMU_XEN_OVERLAY_H */
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index ebea27caf6..be6d85f2cb 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -20,6 +20,8 @@
 #include "trace.h"
 #include "sysemu/runstate.h"
 
+#include "hw/i386/kvm/xen_overlay.h"
+
 #include "hw/xen/interface/version.h"
 #include "hw/xen/interface/sched.h"
 
@@ -282,6 +284,16 @@ int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit 
*exit)
 return -1;
 }
 
+/*
+ * The kernel latches the guest 32/64 mode when the MSR is used to fill
+ * the hypercall page. So if we see a hypercall in a mode that doesn't
+ * match our own idea of the guest mode, fetc

[PATCH v15 38/60] hw/xen: Implement EVTCHNOP_reset

2023-03-01 Thread David Woodhouse
From: David Woodhouse 

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/i386/kvm/xen_evtchn.c  | 30 ++
 hw/i386/kvm/xen_evtchn.h  |  3 +++
 target/i386/kvm/xen-emu.c | 17 +
 3 files changed, 50 insertions(+)

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index cbaf4f535a..c804a315e0 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -12,6 +12,7 @@
 #include "qemu/osdep.h"
 #include "qemu/host-utils.h"
 #include "qemu/module.h"
+#include "qemu/lockable.h"
 #include "qemu/main-loop.h"
 #include "qemu/log.h"
 #include "qapi/error.h"
@@ -747,6 +748,35 @@ static int close_port(XenEvtchnState *s, evtchn_port_t 
port)
 return 0;
 }
 
+int xen_evtchn_soft_reset(void)
+{
+XenEvtchnState *s = xen_evtchn_singleton;
+int i;
+
+if (!s) {
+return -ENOTSUP;
+}
+
+assert(qemu_mutex_iothread_locked());
+
+QEMU_LOCK_GUARD(&s->port_lock);
+
+for (i = 0; i < s->nr_ports; i++) {
+close_port(s, i);
+}
+
+return 0;
+}
+
+int xen_evtchn_reset_op(struct evtchn_reset *reset)
+{
+if (reset->dom != DOMID_SELF && reset->dom != xen_domid) {
+return -ESRCH;
+}
+
+return xen_evtchn_soft_reset();
+}
+
 int xen_evtchn_close_op(struct evtchn_close *close)
 {
 XenEvtchnState *s = xen_evtchn_singleton;
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index 486b031c82..5d3e03553f 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -13,6 +13,7 @@
 #define QEMU_XEN_EVTCHN_H
 
 void xen_evtchn_create(void);
+int xen_evtchn_soft_reset(void);
 int xen_evtchn_set_callback_param(uint64_t param);
 
 struct evtchn_status;
@@ -24,6 +25,7 @@ struct evtchn_send;
 struct evtchn_alloc_unbound;
 struct evtchn_bind_interdomain;
 struct evtchn_bind_vcpu;
+struct evtchn_reset;
 int xen_evtchn_status_op(struct evtchn_status *status);
 int xen_evtchn_close_op(struct evtchn_close *close);
 int xen_evtchn_unmask_op(struct evtchn_unmask *unmask);
@@ -33,5 +35,6 @@ int xen_evtchn_send_op(struct evtchn_send *send);
 int xen_evtchn_alloc_unbound_op(struct evtchn_alloc_unbound *alloc);
 int xen_evtchn_bind_interdomain_op(struct evtchn_bind_interdomain 
*interdomain);
 int xen_evtchn_bind_vcpu_op(struct evtchn_bind_vcpu *vcpu);
+int xen_evtchn_reset_op(struct evtchn_reset *reset);
 
 #endif /* QEMU_XEN_EVTCHN_H */
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index 8dc0d320f5..821629f077 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -964,6 +964,18 @@ static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit 
*exit, X86CPU *cpu,
 err = xen_evtchn_bind_vcpu_op(&vcpu);
 break;
 }
+case EVTCHNOP_reset: {
+struct evtchn_reset reset;
+
+qemu_build_assert(sizeof(reset) == 2);
+if (kvm_copy_from_gva(cs, arg, &reset, sizeof(reset))) {
+err = -EFAULT;
+break;
+}
+
+err = xen_evtchn_reset_op(&reset);
+break;
+}
 default:
 return false;
 }
@@ -981,6 +993,11 @@ int kvm_xen_soft_reset(void)
 
 trace_kvm_xen_soft_reset();
 
+err = xen_evtchn_soft_reset();
+if (err) {
+return err;
+}
+
 /*
  * Zero is the reset/startup state for HVM_PARAM_CALLBACK_IRQ. Strictly,
  * it maps to HVM_PARAM_CALLBACK_TYPE_GSI with GSI#0, but Xen refuses to
-- 
2.39.0




[PATCH v15 14/60] xen: Permit --xen-domid argument when accel is KVM

2023-03-01 Thread David Woodhouse
From: Paul Durrant 

Signed-off-by: Paul Durrant 
Signed-off-by: David Woodhouse 
---
 softmmu/vl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/softmmu/vl.c b/softmmu/vl.c
index cebe8d9452..3340f63c37 100644
--- a/softmmu/vl.c
+++ b/softmmu/vl.c
@@ -3360,7 +3360,7 @@ void qemu_init(int argc, char **argv)
 has_defaults = 0;
 break;
 case QEMU_OPTION_xen_domid:
-if (!(accel_find("xen"))) {
+if (!(accel_find("xen")) && !(accel_find("kvm"))) {
 error_report("Option not supported for this target");
 exit(1);
 }
-- 
2.39.0




[PATCH v15 41/60] hw/xen: Support HVM_PARAM_CALLBACK_TYPE_PCI_INTX callback

2023-03-01 Thread David Woodhouse
From: David Woodhouse 

The guest is permitted to specify an arbitrary domain/bus/device/function
and INTX pin from which the callback IRQ shall appear to have come.
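
For reference, the param value the guest writes to HVM_PARAM_CALLBACK_IRQ is
laid out roughly as follows (worked example only; see set_callback_pci_intx()
in the diff for the authoritative decode):

    /* HVM_PARAM_CALLBACK_TYPE_PCI_INTX (type 1, in bits 63:56):
     *   bits 47:32  PCI domain (segment)
     *   bits 31:16  bus
     *   bits 15:8   devfn
     *   bits  1:0   INTx pin (0 = INTA .. 3 = INTD)
     *
     * e.g. 0000:00:03.0 pin INTA:  devfn = (3 << 3) | 0 = 0x18
     *   param = (1ULL << 56) | (0x18 << 8)  ==  0x0100000000001800
     */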

In QEMU we can only easily do this for devices that actually exist, and
even that requires us "knowing" that it's a PCMachine in order to find
the PCI root bus — although that's OK really because it's always true.

We also don't get to get notified of INTX routing changes, because we
can't do that as a passive observer; if we try to register a notifier
it will overwrite any existing notifier callback on the device.

But in practice, guests using PCI_INTX will only ever use pin A on the
Xen platform device, and won't swizzle the INTX routing after they set
it up. So this is just fine.

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/i386/kvm/xen_evtchn.c  | 80 ---
 target/i386/kvm/xen-emu.c | 34 +
 2 files changed, 100 insertions(+), 14 deletions(-)

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index 6b0bdba65d..f39b751ff9 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -28,6 +28,8 @@
 #include "hw/sysbus.h"
 #include "hw/xen/xen.h"
 #include "hw/i386/x86.h"
+#include "hw/i386/pc.h"
+#include "hw/pci/pci.h"
 #include "hw/irq.h"
 
 #include "xen_evtchn.h"
@@ -101,6 +103,7 @@ struct XenEvtchnState {
 
 uint64_t callback_param;
 bool evtchn_in_kernel;
+uint32_t callback_gsi;
 
 QEMUBH *gsi_bh;
 
@@ -217,11 +220,41 @@ static void xen_evtchn_register_types(void)
 
 type_init(xen_evtchn_register_types)
 
+static int set_callback_pci_intx(XenEvtchnState *s, uint64_t param)
+{
+PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
+uint8_t pin = param & 3;
+uint8_t devfn = (param >> 8) & 0xff;
+uint16_t bus = (param >> 16) & 0xffff;
+uint16_t domain = (param >> 32) & 0xffff;
+PCIDevice *pdev;
+PCIINTxRoute r;
+
+if (domain || !pcms) {
+return 0;
+}
+
+pdev = pci_find_device(pcms->bus, bus, devfn);
+if (!pdev) {
+return 0;
+}
+
+r = pci_device_route_intx_to_irq(pdev, pin);
+if (r.mode != PCI_INTX_ENABLED) {
+return 0;
+}
+
+/*
+ * Hm, can we be notified of INTX routing changes? Not without
+ * *owning* the device and being allowed to overwrite its own
+ * ->intx_routing_notifier, AFAICT. So let's not.
+ */
+return r.irq;
+}
+
 void xen_evtchn_set_callback_level(int level)
 {
 XenEvtchnState *s = xen_evtchn_singleton;
-uint32_t param;
-
 if (!s) {
 return;
 }
@@ -260,18 +293,12 @@ void xen_evtchn_set_callback_level(int level)
 return;
 }
 
-param = (uint32_t)s->callback_param;
-
-switch (s->callback_param >> CALLBACK_VIA_TYPE_SHIFT) {
-case HVM_PARAM_CALLBACK_TYPE_GSI:
-if (param < IOAPIC_NUM_PINS) {
-qemu_set_irq(s->gsis[param], level);
-if (level) {
-/* Ensure the vCPU polls for deassertion */
-kvm_xen_set_callback_asserted();
-}
+if (s->callback_gsi && s->callback_gsi < IOAPIC_NUM_PINS) {
+qemu_set_irq(s->gsis[s->callback_gsi], level);
+if (level) {
+/* Ensure the vCPU polls for deassertion */
+kvm_xen_set_callback_asserted();
 }
-break;
 }
 }
 
@@ -283,15 +310,22 @@ int xen_evtchn_set_callback_param(uint64_t param)
 .u.vector = 0,
 };
 bool in_kernel = false;
+uint32_t gsi = 0;
+int type = param >> CALLBACK_VIA_TYPE_SHIFT;
 int ret;
 
 if (!s) {
 return -ENOTSUP;
 }
 
+/*
+ * We need the BQL because set_callback_pci_intx() may call into PCI code,
+ * and because we may need to manipulate the old and new GSI levels.
+ */
+assert(qemu_mutex_iothread_locked());
 qemu_mutex_lock(&s->port_lock);
 
-switch (param >> CALLBACK_VIA_TYPE_SHIFT) {
+switch (type) {
 case HVM_PARAM_CALLBACK_TYPE_VECTOR: {
 xa.u.vector = (uint8_t)param,
 
@@ -299,10 +333,17 @@ int xen_evtchn_set_callback_param(uint64_t param)
 if (!ret && kvm_xen_has_cap(EVTCHN_SEND)) {
 in_kernel = true;
 }
+gsi = 0;
 break;
 }
 
+case HVM_PARAM_CALLBACK_TYPE_PCI_INTX:
+gsi = set_callback_pci_intx(s, param);
+ret = gsi ? 0 : -EINVAL;
+break;
+
 case HVM_PARAM_CALLBACK_TYPE_GSI:
+gsi = (uint32_t)param;
 ret = 0;
 break;
 
@@ -320,6 +361,17 @@ int xen_evtchn_set_callback_param(uint64_t param)
 }
 s->callback_param = param;
 s->evtchn_in_kernel = in_kernel;
+
+if (gsi != s->callback_gsi) {
+struct vcpu_info *vi = kvm_

[PATCH v15 42/60] kvm/i386: Add xen-gnttab-max-frames property

2023-03-01 Thread David Woodhouse
From: David Woodhouse 

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 accel/kvm/kvm-all.c   |  1 +
 include/sysemu/kvm_int.h  |  1 +
 include/sysemu/kvm_xen.h  |  1 +
 target/i386/kvm/kvm.c | 34 ++
 target/i386/kvm/xen-emu.c |  6 ++
 5 files changed, 43 insertions(+)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 511d3eb9a0..3d8e400bbf 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -3704,6 +3704,7 @@ static void kvm_accel_instance_init(Object *obj)
 s->notify_vmexit = NOTIFY_VMEXIT_OPTION_RUN;
 s->notify_window = 0;
 s->xen_version = 0;
+s->xen_gnttab_max_frames = 64;
 }
 
 /**
diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
index 7f945bc763..39ce4d36f6 100644
--- a/include/sysemu/kvm_int.h
+++ b/include/sysemu/kvm_int.h
@@ -120,6 +120,7 @@ struct KVMState
 uint32_t notify_window;
 uint32_t xen_version;
 uint32_t xen_caps;
+uint16_t xen_gnttab_max_frames;
 };
 
 void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
diff --git a/include/sysemu/kvm_xen.h b/include/sysemu/kvm_xen.h
index 2b20030281..49afa0eb9e 100644
--- a/include/sysemu/kvm_xen.h
+++ b/include/sysemu/kvm_xen.h
@@ -25,6 +25,7 @@ void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id);
 void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type);
 void kvm_xen_set_callback_asserted(void);
 int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port);
+uint16_t kvm_xen_get_gnttab_max_frames(void);
 
 #define kvm_xen_has_cap(cap) (!!(kvm_xen_get_caps() &   \
  KVM_XEN_HVM_CONFIG_ ## cap))
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 3c3795506d..a73c49aabb 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -5880,6 +5880,33 @@ static void kvm_arch_set_xen_version(Object *obj, 
Visitor *v,
 }
 }
 
+static void kvm_arch_get_xen_gnttab_max_frames(Object *obj, Visitor *v,
+   const char *name, void *opaque,
+   Error **errp)
+{
+KVMState *s = KVM_STATE(obj);
+uint16_t value = s->xen_gnttab_max_frames;
+
+visit_type_uint16(v, name, &value, errp);
+}
+
+static void kvm_arch_set_xen_gnttab_max_frames(Object *obj, Visitor *v,
+   const char *name, void *opaque,
+   Error **errp)
+{
+KVMState *s = KVM_STATE(obj);
+Error *error = NULL;
+uint16_t value;
+
+visit_type_uint16(v, name, &value, &error);
+if (error) {
+error_propagate(errp, error);
+return;
+}
+
+s->xen_gnttab_max_frames = value;
+}
+
 void kvm_arch_accel_class_init(ObjectClass *oc)
 {
 object_class_property_add_enum(oc, "notify-vmexit", "NotifyVMexitOption",
@@ -5905,6 +5932,13 @@ void kvm_arch_accel_class_init(ObjectClass *oc)
   "Xen version to be emulated "
   "(in XENVER_version form "
   "e.g. 0x4000a for 4.10)");
+
+object_class_property_add(oc, "xen-gnttab-max-frames", "uint16",
+  kvm_arch_get_xen_gnttab_max_frames,
+  kvm_arch_set_xen_gnttab_max_frames,
+  NULL, NULL);
+object_class_property_set_description(oc, "xen-gnttab-max-frames",
+  "Maximum number of grant table 
frames");
 }
 
 void kvm_set_max_apic_id(uint32_t max_apic_id)
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index 9e22c9fa02..46be631726 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -1238,6 +1238,12 @@ int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit 
*exit)
 return 0;
 }
 
+uint16_t kvm_xen_get_gnttab_max_frames(void)
+{
+KVMState *s = KVM_STATE(current_accel());
+return s->xen_gnttab_max_frames;
+}
+
 int kvm_put_xen_state(CPUState *cs)
 {
 X86CPU *cpu = X86_CPU(cs);
-- 
2.39.0




[PATCH v15 36/60] hw/xen: Implement EVTCHNOP_bind_interdomain

2023-03-01 Thread David Woodhouse
From: David Woodhouse 

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/i386/kvm/xen_evtchn.c  | 78 +++
 hw/i386/kvm/xen_evtchn.h  |  2 +
 target/i386/kvm/xen-emu.c | 16 
 3 files changed, 96 insertions(+)

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index df25bf0806..e2e36d94f6 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -722,6 +722,23 @@ static int close_port(XenEvtchnState *s, evtchn_port_t 
port)
 }
 break;
 
+case EVTCHNSTAT_interdomain:
+if (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU) {
+/* Not yet implemented. This can't happen! */
+} else {
+/* Loopback interdomain */
+XenEvtchnPort *rp = &s->port_table[p->type_val];
+if (!valid_port(p->type_val) || rp->type_val != port ||
+rp->type != EVTCHNSTAT_interdomain) {
+error_report("Inconsistent state for interdomain unbind");
+} else {
+/* Set the other end back to unbound */
+rp->type = EVTCHNSTAT_unbound;
+rp->type_val = 0;
+}
+}
+break;
+
 default:
 break;
 }
@@ -837,6 +854,67 @@ int xen_evtchn_bind_ipi_op(struct evtchn_bind_ipi *ipi)
 return ret;
 }
 
+int xen_evtchn_bind_interdomain_op(struct evtchn_bind_interdomain *interdomain)
+{
+XenEvtchnState *s = xen_evtchn_singleton;
+uint16_t type_val;
+int ret;
+
+if (!s) {
+return -ENOTSUP;
+}
+
+if (interdomain->remote_dom == DOMID_QEMU) {
+type_val = PORT_INFO_TYPEVAL_REMOTE_QEMU;
+} else if (interdomain->remote_dom == DOMID_SELF ||
+   interdomain->remote_dom == xen_domid) {
+type_val = 0;
+} else {
+return -ESRCH;
+}
+
+if (!valid_port(interdomain->remote_port)) {
+return -EINVAL;
+}
+
+qemu_mutex_lock(&s->port_lock);
+
+/* The newly allocated port starts out as unbound */
+ret = allocate_port(s, 0, EVTCHNSTAT_unbound, type_val,
+&interdomain->local_port);
+if (ret) {
+goto out;
+}
+
+if (interdomain->remote_dom == DOMID_QEMU) {
+/* We haven't hooked up QEMU's PV drivers to this yet */
+ret = -ENOSYS;
+} else {
+/* Loopback */
+XenEvtchnPort *rp = &s->port_table[interdomain->remote_port];
+XenEvtchnPort *lp = &s->port_table[interdomain->local_port];
+
+if (rp->type == EVTCHNSTAT_unbound && rp->type_val == 0) {
+/* It's a match! */
+rp->type = EVTCHNSTAT_interdomain;
+rp->type_val = interdomain->local_port;
+
+lp->type = EVTCHNSTAT_interdomain;
+lp->type_val = interdomain->remote_port;
+} else {
+ret = -EINVAL;
+}
+}
+
+if (ret) {
+free_port(s, interdomain->local_port);
+}
+ out:
+qemu_mutex_unlock(&s->port_lock);
+
+return ret;
+
+}
 int xen_evtchn_alloc_unbound_op(struct evtchn_alloc_unbound *alloc)
 {
 XenEvtchnState *s = xen_evtchn_singleton;
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index fc080138e3..1ebc7580eb 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -22,6 +22,7 @@ struct evtchn_bind_virq;
 struct evtchn_bind_ipi;
 struct evtchn_send;
 struct evtchn_alloc_unbound;
+struct evtchn_bind_interdomain;
 int xen_evtchn_status_op(struct evtchn_status *status);
 int xen_evtchn_close_op(struct evtchn_close *close);
 int xen_evtchn_unmask_op(struct evtchn_unmask *unmask);
@@ -29,5 +30,6 @@ int xen_evtchn_bind_virq_op(struct evtchn_bind_virq *virq);
 int xen_evtchn_bind_ipi_op(struct evtchn_bind_ipi *ipi);
 int xen_evtchn_send_op(struct evtchn_send *send);
 int xen_evtchn_alloc_unbound_op(struct evtchn_alloc_unbound *alloc);
+int xen_evtchn_bind_interdomain_op(struct evtchn_bind_interdomain 
*interdomain);
 
 #endif /* QEMU_XEN_EVTCHN_H */
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index ac04387a6a..1aff6b1042 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -936,6 +936,22 @@ static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit 
*exit, X86CPU *cpu,
 }
 break;
 }
+case EVTCHNOP_bind_interdomain: {
+struct evtchn_bind_interdomain interdomain;
+
+qemu_build_assert(sizeof(interdomain) == 12);
+if (kvm_copy_from_gva(cs, arg, &interdomain, sizeof(interdomain))) {
+err = -EFAULT;
+break;
+}
+
+err = xen_evtchn_bind_interdomain_op(&interdomain);
+if (!err &&
+kvm_copy_to_gva(cs, arg, &interdomain, sizeof(interdomain))) {
+err = -EFAULT;
+}
+break;
+}
 default:
 return false;
 }
-- 
2.39.0




Re: [PATCH v5 5/7] hw/isa/vt82c686: Work around missing level sensitive irq in i8259 model

2023-03-01 Thread David Woodhouse
On Wed, 2023-03-01 at 14:18 +0100, BALATON Zoltan wrote:
> > Are you sure the PIC ELCR is actually set for the lines you're having
> > trouble with? Is that something the Pegasos SmartFirmware would have
> > done, and MorphOS is expecting to inherit but isn't actually setting up
> > for itself?
> 
> No, it works with other guests like Linux and AmigaOS that use PIC as set 
> up by the firmware but MorphOS tries to use it in level sensitive mode and 
> likely has an IRQ handler which expects this to work. This is where I've 
> debugged it and came to this workaround:
> 
> https://lists.nongnu.org/archive/html/qemu-ppc/2023-02/msg00403.html
> 
> When booting MorphOS with -d unimp I see these logs:
> 
> i8259: level sensitive irq not supported
> i8259: level sensitive irq not supported
> 
> which is I guess when it tries to set it for both PICs. (If you want to 
> try this MorphOS iso is downloadable and instructions how to boot it is 
> here: http://zero.eik.bme.hu/~balaton/qemu/amiga/#morphos


Wow. Even looking at the PIIX3 datasheet from 1996, that 'Edge/Level
Bank Select (LTIM)' bit was documented as 'This bit is disabled. Its
function is replaced by the Edge/Level Triggered Control (ELCR)
Registers.'

We've been able to set the edge/level on a per-pin basis ever since the
ELCR was introduced with the IBM PS/2, I think.

It isn't a *correct* fix without a little bit more typing, but does
this make it work?

diff --git a/hw/intc/i8259.c b/hw/intc/i8259.c
index 17910f3bcb..36ebcff025 100644
--- a/hw/intc/i8259.c
+++ b/hw/intc/i8259.c
@@ -246,6 +246,7 @@ static void pic_ioport_write(void *opaque, hwaddr addr64,
 if (val & 0x08) {
 qemu_log_mask(LOG_UNIMP,
   "i8259: level sensitive irq not supported\n");
+s->elcr = 0xff;
 }
 } else if (val & 0x08) {
 if (val & 0x04) {







[PATCH v15 02/60] xen: add CONFIG_XEN_BUS and CONFIG_XEN_EMU options for Xen emulation

2023-03-01 Thread David Woodhouse
From: David Woodhouse 

The XEN_EMU option will cover core Xen support in target/, which exists
only for x86 with KVM today but could theoretically also be implemented
on Arm/Aarch64 and with TCG or other accelerators (if anyone wants to
run the gauntlet of struct layout compatibility, errno mapping, and the
rest of that fun).

It will also cover the support for architecture-independent grant table
and event channel support which will be added in hw/i386/kvm/ (on the
basis that the non-KVM support is very theoretical and making it not use
KVM directly seems like gratuitous overengineering at this point).

The XEN_BUS option is for the xenfv platform support, which will now be
used both by XEN_EMU and by real Xen.

The XEN option remains dependent on the Xen runtime libraries, and covers
support for real Xen. Some code which currently resides under CONFIG_XEN
will be moving to CONFIG_XEN_BUS over time as the direct dependencies on
Xen runtime libraries are eliminated. The Xen PCI platform device will
also reside under CONFIG_XEN_BUS.

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/Kconfig  | 1 +
 hw/i386/Kconfig | 5 +
 hw/xen/Kconfig  | 3 +++
 meson.build | 1 +
 4 files changed, 10 insertions(+)
 create mode 100644 hw/xen/Kconfig

diff --git a/hw/Kconfig b/hw/Kconfig
index 38233bbb0f..ba62ff6417 100644
--- a/hw/Kconfig
+++ b/hw/Kconfig
@@ -41,6 +41,7 @@ source tpm/Kconfig
 source usb/Kconfig
 source virtio/Kconfig
 source vfio/Kconfig
+source xen/Kconfig
 source watchdog/Kconfig
 
 # arch Kconfig
diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig
index 9fbfe748b5..d40802d83f 100644
--- a/hw/i386/Kconfig
+++ b/hw/i386/Kconfig
@@ -136,3 +136,8 @@ config VMPORT
 config VMMOUSE
 bool
 depends on VMPORT
+
+config XEN_EMU
+bool
+default y
+depends on KVM && (I386 || X86_64)
diff --git a/hw/xen/Kconfig b/hw/xen/Kconfig
new file mode 100644
index 0000000000..3467efb986
--- /dev/null
+++ b/hw/xen/Kconfig
@@ -0,0 +1,3 @@
+config XEN_BUS
+bool
+default y if (XEN || XEN_EMU)
diff --git a/meson.build b/meson.build
index 77d2ae87e4..71a14e6b3d 100644
--- a/meson.build
+++ b/meson.build
@@ -3881,6 +3881,7 @@ if have_system
   if xen.found()
 summary_info += {'xen ctrl version':  xen.version()}
   endif
+  summary_info += {'Xen emulation': config_all.has_key('CONFIG_XEN_EMU')}
 endif
 summary_info += {'TCG support':   config_all.has_key('CONFIG_TCG')}
 if config_all.has_key('CONFIG_TCG')
-- 
2.39.0




[PATCH v15 49/60] i386/xen: handle HVMOP_get_param

2023-03-01 Thread David Woodhouse
From: Joao Martins 

This is used to fetch the xenstore PFN and port to be used
by the guest. These are preallocated by the toolstack; the
guest just reads them and uses them straight away.
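
On the guest side this corresponds to something like the following (a sketch
using the public Xen interface, not code from this patch):

    struct xen_hvm_param p = {
        .domid = DOMID_SELF,
        .index = HVM_PARAM_STORE_PFN,
    };
    if (HYPERVISOR_hvm_op(HVMOP_get_param, &p) == 0) {
        /* p.value now holds the frame number of the xenstore ring */
    }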

Signed-off-by: Joao Martins 
Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 target/i386/kvm/xen-emu.c | 39 +++
 1 file changed, 39 insertions(+)

diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index 75bcf7b630..d2c88ef0d9 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -765,6 +765,42 @@ out:
 return true;
 }
 
+static bool handle_get_param(struct kvm_xen_exit *exit, X86CPU *cpu,
+ uint64_t arg)
+{
+CPUState *cs = CPU(cpu);
+struct xen_hvm_param hp;
+int err = 0;
+
+/* No need for 32/64 compat handling */
+qemu_build_assert(sizeof(hp) == 16);
+
+if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
+err = -EFAULT;
+goto out;
+}
+
+if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
+err = -ESRCH;
+goto out;
+}
+
+switch (hp.index) {
+case HVM_PARAM_STORE_PFN:
+hp.value = XEN_SPECIAL_PFN(XENSTORE);
+break;
+default:
+return false;
+}
+
+if (kvm_copy_to_gva(cs, arg, &hp, sizeof(hp))) {
+err = -EFAULT;
+}
+out:
+exit->u.hcall.result = err;
+return true;
+}
+
 static int kvm_xen_hcall_evtchn_upcall_vector(struct kvm_xen_exit *exit,
   X86CPU *cpu, uint64_t arg)
 {
@@ -809,6 +845,9 @@ static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, 
X86CPU *cpu,
 case HVMOP_set_param:
 return handle_set_param(exit, cpu, arg);
 
+case HVMOP_get_param:
+return handle_get_param(exit, cpu, arg);
+
 default:
 return false;
 }
-- 
2.39.0




[PATCH v15 24/60] i386/xen: implement HYPERVISOR_event_channel_op

2023-03-01 Thread David Woodhouse
From: Joao Martins 

Signed-off-by: Joao Martins 
[dwmw2: Ditch event_channel_op_compat which was never available to HVM guests]
Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 target/i386/kvm/xen-emu.c | 25 +
 1 file changed, 25 insertions(+)

diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index c0a8e4c34e..ac143c05a4 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -28,6 +28,7 @@
 #include "hw/xen/interface/memory.h"
 #include "hw/xen/interface/hvm/hvm_op.h"
 #include "hw/xen/interface/vcpu.h"
+#include "hw/xen/interface/event_channel.h"
 
 #include "xen-compat.h"
 
@@ -588,6 +589,27 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_xen_exit 
*exit, X86CPU *cpu,
 return true;
 }
 
+static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit *exit,
+int cmd, uint64_t arg)
+{
+int err = -ENOSYS;
+
+switch (cmd) {
+case EVTCHNOP_init_control:
+case EVTCHNOP_expand_array:
+case EVTCHNOP_set_priority:
+/* We do not support FIFO channels at this point */
+err = -ENOSYS;
+break;
+
+default:
+return false;
+}
+
+exit->u.hcall.result = err;
+return true;
+}
+
 int kvm_xen_soft_reset(void)
 {
 CPUState *cpu;
@@ -694,6 +716,9 @@ static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct 
kvm_xen_exit *exit)
 case __HYPERVISOR_sched_op:
 return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0],
   exit->u.hcall.params[1]);
+case __HYPERVISOR_event_channel_op:
+return kvm_xen_hcall_evtchn_op(exit, exit->u.hcall.params[0],
+   exit->u.hcall.params[1]);
 case __HYPERVISOR_vcpu_op:
 return kvm_xen_hcall_vcpu_op(exit, cpu,
  exit->u.hcall.params[0],
-- 
2.39.0




[PATCH v15 57/60] hw/xen: Support MSI mapping to PIRQ

2023-03-01 Thread David Woodhouse
From: David Woodhouse 

The way that Xen handles MSI PIRQs is kind of awful.

There is a special MSI message which targets a PIRQ. The vector in the
low bits of data must be zero. The low 8 bits of the PIRQ# are in the
destination ID field, the extended destination ID field is unused, and
instead the high bits of the PIRQ# are in the high 32 bits of the address.
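
In other words the message looks roughly like this (worked example, not taken
from the patch):

    /* addr[63:32] = pirq >> 8        high bits of the PIRQ#
     * addr[31:20] = 0xfee            usual MSI address prefix
     * addr[19:12] = pirq & 0xff      "destination ID" carries the low bits
     * data[7:0]   = 0                vector must be zero
     *
     * e.g. PIRQ 0x123 is signalled with addr 0x1fee23000, data 0.
     */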

Using the high bits of the address means that we can't intercept and
translate these messages in kvm_send_msi(), because they won't be caught
by the APIC — addresses like 0x1000fee46000 aren't in the APIC's range.

So we catch them in pci_msi_trigger() instead, and deliver the event
channel directly.

That isn't even the worst part. The worst part is that Xen snoops on
writes to devices' MSI vectors while they are *masked*. When an MSI
message is written which looks like it targets a PIRQ, it remembers
the device and vector for later.

When the guest makes a hypercall to bind that PIRQ# (snooped from a
marked MSI vector) to an event channel port, Xen *unmasks* that MSI
vector on the device. Xen guests using PIRQ delivery of MSI don't
ever actually unmask the MSI for themselves.

Now that this is working we can finally enable XENFEAT_hvm_pirqs and
let the guest use it all.

Tested with passthrough igb and emulated e1000e + AHCI.

   CPU0   CPU1
  0: 65  0   IO-APIC   2-edge  timer
  1:  0 14  xen-pirq   1-ioapic-edge  i8042
  4:  0846  xen-pirq   4-ioapic-edge  ttyS0
  8:  1  0  xen-pirq   8-ioapic-edge  rtc0
  9:  0  0  xen-pirq   9-ioapic-level  acpi
 12:257  0  xen-pirq  12-ioapic-edge  i8042
 24:   9600  0  xen-percpu-virq  timer0
 25:   2758  0  xen-percpu-ipi   resched0
 26:  0  0  xen-percpu-ipi   callfunc0
 27:  0  0  xen-percpu-virq  debug0
 28:   1526  0  xen-percpu-ipi   callfuncsingle0
 29:  0  0  xen-percpu-ipi   spinlock0
 30:  0   8608  xen-percpu-virq  timer1
 31:  0874  xen-percpu-ipi   resched1
 32:  0  0  xen-percpu-ipi   callfunc1
 33:  0  0  xen-percpu-virq  debug1
 34:  0   1617  xen-percpu-ipi   callfuncsingle1
 35:  0  0  xen-percpu-ipi   spinlock1
 36:  8  0   xen-dyn-event xenbus
 37:  0   6046  xen-pirq-msi   ahci[0000:00:03.0]
 38:  1  0  xen-pirq-msi-x ens4
 39:  0 73  xen-pirq-msi-x ens4-rx-0
 40: 14  0  xen-pirq-msi-x ens4-rx-1
 41:  0 32  xen-pirq-msi-x ens4-tx-0
 42: 47  0  xen-pirq-msi-x     ens4-tx-1

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/i386/kvm/trace-events   |   1 +
 hw/i386/kvm/xen-stubs.c|  16 +++
 hw/i386/kvm/xen_evtchn.c   | 262 -
 hw/i386/kvm/xen_evtchn.h   |   8 ++
 hw/pci/msi.c   |  11 ++
 hw/pci/msix.c  |   9 ++
 hw/pci/pci.c   |  17 +++
 include/hw/pci/msi.h   |   1 +
 target/i386/kvm/kvm.c  |  19 ++-
 target/i386/kvm/kvm_i386.h |   2 +
 target/i386/kvm/xen-emu.c  |   3 +-
 11 files changed, 339 insertions(+), 10 deletions(-)

diff --git a/hw/i386/kvm/trace-events b/hw/i386/kvm/trace-events
index 04e60c5bb8..b83c3eb965 100644
--- a/hw/i386/kvm/trace-events
+++ b/hw/i386/kvm/trace-events
@@ -2,3 +2,4 @@ kvm_xen_map_pirq(int pirq, int gsi) "pirq %d gsi %d"
 kvm_xen_unmap_pirq(int pirq, int gsi) "pirq %d gsi %d"
 kvm_xen_get_free_pirq(int pirq, int type) "pirq %d type %d"
 kvm_xen_bind_pirq(int pirq, int port) "pirq %d port %d"
+kvm_xen_unmask_pirq(int pirq, char *dev, int vector) "pirq %d dev %s vector %d"
diff --git a/hw/i386/kvm/xen-stubs.c b/hw/i386/kvm/xen-stubs.c
index 720590aedd..ae406e0b02 100644
--- a/hw/i386/kvm/xen-stubs.c
+++ b/hw/i386/kvm/xen-stubs.c
@@ -14,6 +14,22 @@
 #include "qapi/error.h"
 #include "qapi/qapi-commands-misc-target.h"
 
+#include "xen_evtchn.h"
+
+void xen_evtchn_snoop_msi(PCIDevice *dev, bool is_msix, unsigned int vector,
+  uint64_t addr, uint32_t data, bool is_masked)
+{
+}
+
+void xen_evtchn_remove_pci_device(PCIDevice *dev)
+{
+}
+
+bool xen_evtchn_deliver_pirq_msi(uint64_t address, uint32_t data)
+{
+return false;
+}
+
 #ifdef TARGET_I386
 EvtchnInfoList *qmp_xen_event_list(Error **errp)
 {
diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index f2c4b43871..69c0204d4f 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -31,6 +31,8 @@
 #include "hw/i386/x86.h"
 #include "hw/i386/pc.h"
 #include "hw/pci/pci.h"
+#includ

[PATCH v15 25/60] i386/xen: implement HVMOP_set_evtchn_upcall_vector

2023-03-01 Thread David Woodhouse
From: Ankur Arora 

The HVMOP_set_evtchn_upcall_vector hypercall sets the per-vCPU upcall
vector, to be delivered to the local APIC just like an MSI (with an EOI).

This takes precedence over the system-wide delivery method set by the
HVMOP_set_param hypercall with HVM_PARAM_CALLBACK_IRQ. It's used by
Windows and Xen (PV shim) guests but normally not by Linux.

Signed-off-by: Ankur Arora 
Signed-off-by: Joao Martins 
[dwmw2: Rework for upstream kernel changes and split from HVMOP_set_param]
Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 target/i386/cpu.h|  1 +
 target/i386/kvm/trace-events |  1 +
 target/i386/kvm/xen-emu.c| 84 ++--
 target/i386/machine.c|  1 +
 4 files changed, 84 insertions(+), 3 deletions(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 5613df6d75..e882c4e251 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1803,6 +1803,7 @@ typedef struct CPUArchState {
 uint64_t xen_vcpu_info_default_gpa;
 uint64_t xen_vcpu_time_info_gpa;
 uint64_t xen_vcpu_runstate_gpa;
+uint8_t xen_vcpu_callback_vector;
 #endif
 #if defined(CONFIG_HVF)
 HVFX86LazyFlags hvf_lflags;
diff --git a/target/i386/kvm/trace-events b/target/i386/kvm/trace-events
index a840e0333d..b365a8e8e2 100644
--- a/target/i386/kvm/trace-events
+++ b/target/i386/kvm/trace-events
@@ -11,3 +11,4 @@ kvm_xen_hypercall(int cpu, uint8_t cpl, uint64_t input, 
uint64_t a0, uint64_t a1
 kvm_xen_soft_reset(void) ""
 kvm_xen_set_shared_info(uint64_t gfn) "shared info at gfn 0x%" PRIx64
 kvm_xen_set_vcpu_attr(int cpu, int type, uint64_t gpa) "vcpu attr cpu %d type 
%d gpa 0x%" PRIx64
+kvm_xen_set_vcpu_callback(int cpu, int vector) "callback vcpu %d vector %d"
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index ac143c05a4..e9a4422d93 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -27,6 +27,7 @@
 #include "hw/xen/interface/sched.h"
 #include "hw/xen/interface/memory.h"
 #include "hw/xen/interface/hvm/hvm_op.h"
+#include "hw/xen/interface/hvm/params.h"
 #include "hw/xen/interface/vcpu.h"
 #include "hw/xen/interface/event_channel.h"
 
@@ -193,7 +194,8 @@ static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit 
*exit, X86CPU *cpu,
 fi.submap |= 1 << XENFEAT_writable_page_tables |
  1 << XENFEAT_writable_descriptor_tables |
  1 << XENFEAT_auto_translated_physmap |
- 1 << XENFEAT_supervisor_mode_kernel;
+ 1 << XENFEAT_supervisor_mode_kernel |
+ 1 << XENFEAT_hvm_callback_vector;
 }
 
 err = kvm_copy_to_gva(CPU(cpu), arg, &fi, sizeof(fi));
@@ -220,6 +222,31 @@ static int kvm_xen_set_vcpu_attr(CPUState *cs, uint16_t 
type, uint64_t gpa)
 return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xhsi);
 }
 
+static int kvm_xen_set_vcpu_callback_vector(CPUState *cs)
+{
+uint8_t vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
+struct kvm_xen_vcpu_attr xva;
+
+xva.type = KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR;
+xva.u.vector = vector;
+
+trace_kvm_xen_set_vcpu_callback(cs->cpu_index, vector);
+
+return kvm_vcpu_ioctl(cs, KVM_XEN_HVM_SET_ATTR, &xva);
+}
+
+static void do_set_vcpu_callback_vector(CPUState *cs, run_on_cpu_data data)
+{
+X86CPU *cpu = X86_CPU(cs);
+CPUX86State *env = &cpu->env;
+
+env->xen_vcpu_callback_vector = data.host_int;
+
+if (kvm_xen_has_cap(EVTCHN_SEND)) {
+kvm_xen_set_vcpu_callback_vector(cs);
+}
+}
+
 static void do_set_vcpu_info_default_gpa(CPUState *cs, run_on_cpu_data data)
 {
 X86CPU *cpu = X86_CPU(cs);
@@ -276,12 +303,16 @@ static void do_vcpu_soft_reset(CPUState *cs, 
run_on_cpu_data data)
 env->xen_vcpu_info_default_gpa = INVALID_GPA;
 env->xen_vcpu_time_info_gpa = INVALID_GPA;
 env->xen_vcpu_runstate_gpa = INVALID_GPA;
+env->xen_vcpu_callback_vector = 0;
 
 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, INVALID_GPA);
 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
   INVALID_GPA);
 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
   INVALID_GPA);
+if (kvm_xen_has_cap(EVTCHN_SEND)) {
+kvm_xen_set_vcpu_callback_vector(cs);
+}
 
 }
 
@@ -458,17 +489,53 @@ static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit 
*exit, X86CPU *cpu,
 return true;
 }
 
+static int kvm_xen_hcall_evtchn_upcall_vector(struct kvm_xen_exit *exit,
+  X86CPU *cpu, uint64_t arg)
+{
+struct xen_hvm_evtchn_upcall_vector up;
+CPUState *target_cs;
+
+/* No need for 32/64 compat handling */
+qemu_build_assert(sizeof(up) =

[PATCH v15 56/60] hw/xen: Support GSI mapping to PIRQ

2023-03-01 Thread David Woodhouse
From: David Woodhouse 

If I advertise XENFEAT_hvm_pirqs then a guest now boots successfully as
long as I tell it 'pci=nomsi'.

[root@localhost ~]# cat /proc/interrupts
   CPU0
  0: 52   IO-APIC   2-edge  timer
  1: 16  xen-pirq   1-ioapic-edge  i8042
  4:   1534  xen-pirq   4-ioapic-edge  ttyS0
  8:  1  xen-pirq   8-ioapic-edge  rtc0
  9:  0  xen-pirq   9-ioapic-level  acpi
 11:   5648  xen-pirq  11-ioapic-level  ahci[0000:00:04.0]
 12:257  xen-pirq  12-ioapic-edge  i8042
...

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/i386/kvm/xen_evtchn.c | 58 +++-
 hw/i386/kvm/xen_evtchn.h |  2 ++
 hw/i386/x86.c| 16 +++
 3 files changed, 75 insertions(+), 1 deletion(-)

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index 59eea63272..f2c4b43871 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -148,6 +148,9 @@ struct XenEvtchnState {
 /* GSI → PIRQ mapping (serialized) */
 uint16_t gsi_pirq[IOAPIC_NUM_PINS];
 
+/* Per-GSI assertion state (serialized) */
+uint32_t pirq_gsi_set;
+
 /* Per-PIRQ information (rebuilt on migration) */
 struct pirq_info *pirq;
 };
@@ -246,6 +249,7 @@ static const VMStateDescription xen_evtchn_vmstate = {
 VMSTATE_VARRAY_UINT16_ALLOC(pirq_inuse_bitmap, XenEvtchnState,
 nr_pirq_inuse_words, 0,
 vmstate_info_uint64, uint64_t),
+VMSTATE_UINT32(pirq_gsi_set, XenEvtchnState),
 VMSTATE_END_OF_LIST()
 }
 };
@@ -1510,6 +1514,51 @@ static int allocate_pirq(XenEvtchnState *s, int type, 
int gsi)
 return pirq;
 }
 
+bool xen_evtchn_set_gsi(int gsi, int level)
+{
+XenEvtchnState *s = xen_evtchn_singleton;
+int pirq;
+
+assert(qemu_mutex_iothread_locked());
+
+if (!s || gsi < 0 || gsi > IOAPIC_NUM_PINS) {
+return false;
+}
+
+/*
+ * Check that that it *isn't* the event channel GSI, and thus
+ * that we are not recursing and it's safe to take s->port_lock.
+ *
+ * Locking aside, it's perfectly sane to bail out early for that
+ * special case, as it would make no sense for the event channel
+ * GSI to be routed back to event channels, when the delivery
+ * method is to raise the GSI... that recursion wouldn't *just*
+ * be a locking issue.
+ */
+if (gsi && gsi == s->callback_gsi) {
+return false;
+}
+
+QEMU_LOCK_GUARD(&s->port_lock);
+
+pirq = s->gsi_pirq[gsi];
+if (!pirq) {
+return false;
+}
+
+if (level) {
+int port = s->pirq[pirq].port;
+
+s->pirq_gsi_set |= (1U << gsi);
+if (port) {
+set_port_pending(s, port);
+}
+} else {
+s->pirq_gsi_set &= ~(1U << gsi);
+}
+return true;
+}
+
 int xen_physdev_map_pirq(struct physdev_map_pirq *map)
 {
 XenEvtchnState *s = xen_evtchn_singleton;
@@ -1621,7 +1670,14 @@ int xen_physdev_eoi_pirq(struct physdev_eoi *eoi)
 return -EINVAL;
 }
 
-/* XX: Reassert a level IRQ if needed */
+/* Reassert a level IRQ if needed */
+if (s->pirq_gsi_set & (1U << gsi)) {
+int port = s->pirq[pirq].port;
+if (port) {
+set_port_pending(s, port);
+}
+}
+
 return 0;
 }
 
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index a7383f760c..95400b7fbf 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -24,6 +24,8 @@ void xen_evtchn_set_callback_level(int level);
 
 int xen_evtchn_set_port(uint16_t port);
 
+bool xen_evtchn_set_gsi(int gsi, int level);
+
 /*
  * These functions mirror the libxenevtchn library API, providing the QEMU
  * backend side of "interdomain" event channels.
diff --git a/hw/i386/x86.c b/hw/i386/x86.c
index c44846f47b..a56b10b2fb 100644
--- a/hw/i386/x86.c
+++ b/hw/i386/x86.c
@@ -61,6 +61,11 @@
 #include CONFIG_DEVICES
 #include "kvm/kvm_i386.h"
 
+#ifdef CONFIG_XEN_EMU
+#include "hw/xen/xen.h"
+#include "hw/i386/kvm/xen_evtchn.h"
+#endif
+
 /* Physical Address of PVH entry point read from kernel ELF NOTE */
 static size_t pvh_start_addr;
 
@@ -610,6 +615,17 @@ void gsi_handler(void *opaque, int n, int level)
 }
 /* fall through */
 case ISA_NUM_IRQS ... IOAPIC_NUM_PINS - 1:
+#ifdef CONFIG_XEN_EMU
+/*
+ * Xen delivers the GSI to the Legacy PIC (not that Legacy PIC
+ * routing actually works properly under Xen). And then to
+ * *either* the PIRQ handling or the I/OAPIC depending on
+ * whether the former wants it.
+ */
+if (xen_mode == XEN_EMULATE && xen_evtchn_set_gsi(n, level)) {
+break;
+}
+#endif
 qemu_set_irq(s->ioapic_irq[n], level);
 break;
 case IO_APIC_SECONDARY_IRQBASE
-- 
2.39.0




[PATCH v15 11/60] i386/xen: implement HYPERVISOR_sched_op, SCHEDOP_shutdown

2023-03-01 Thread David Woodhouse
From: Joao Martins 

It allows the guest to shut itself down via hypercall with any of the 3 reasons:
  1) self-reboot
  2) shutdown
  3) crash

Implementing the SCHEDOP_shutdown sub-op lets us handle crashes gracefully rather
than leading to triple faults if it remains unimplemented.

In addition, the SHUTDOWN_soft_reset reason is used for kexec, to reset
Xen shared pages and other enlightenments and leave a clean slate for the
new kernel without the hypervisor helpfully writing information at
unexpected addresses.
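
For reference, the guest-side call being handled is roughly the following
(a sketch using the public Xen interface, not code from this patch):

    struct sched_shutdown shutdown = {
        .reason = SHUTDOWN_poweroff,   /* or SHUTDOWN_reboot, SHUTDOWN_crash, ... */
    };
    HYPERVISOR_sched_op(SCHEDOP_shutdown, &shutdown);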

Signed-off-by: Joao Martins 
[dwmw2: Ditch sched_op_compat which was never available for HVM guests,
Add SCHEDOP_soft_reset]
Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 include/sysemu/kvm_xen.h |  1 +
 target/i386/kvm/trace-events |  1 +
 target/i386/kvm/xen-emu.c| 75 
 3 files changed, 77 insertions(+)

diff --git a/include/sysemu/kvm_xen.h b/include/sysemu/kvm_xen.h
index 296533f2d5..5dffcc0542 100644
--- a/include/sysemu/kvm_xen.h
+++ b/include/sysemu/kvm_xen.h
@@ -12,6 +12,7 @@
 #ifndef QEMU_SYSEMU_KVM_XEN_H
 #define QEMU_SYSEMU_KVM_XEN_H
 
+int kvm_xen_soft_reset(void);
 uint32_t kvm_xen_get_caps(void);
 
 #define kvm_xen_has_cap(cap) (!!(kvm_xen_get_caps() &   \
diff --git a/target/i386/kvm/trace-events b/target/i386/kvm/trace-events
index cd6f842b1f..bb732e1da8 100644
--- a/target/i386/kvm/trace-events
+++ b/target/i386/kvm/trace-events
@@ -8,3 +8,4 @@ kvm_x86_update_msi_routes(int num) "Updated %d MSI routes"
 
 # xen-emu.c
 kvm_xen_hypercall(int cpu, uint8_t cpl, uint64_t input, uint64_t a0, uint64_t 
a1, uint64_t a2, uint64_t ret) "xen_hypercall: cpu %d cpl %d input %" PRIu64 " 
a0 0x%" PRIx64 " a1 0x%" PRIx64 " a2 0x%" PRIx64" ret 0x%" PRIx64
+kvm_xen_soft_reset(void) ""
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index 56b80a7880..4ed833656f 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -11,14 +11,17 @@
 
 #include "qemu/osdep.h"
 #include "qemu/log.h"
+#include "qemu/main-loop.h"
 #include "sysemu/kvm_int.h"
 #include "sysemu/kvm_xen.h"
 #include "kvm/kvm_i386.h"
 #include "exec/address-spaces.h"
 #include "xen-emu.h"
 #include "trace.h"
+#include "sysemu/runstate.h"
 
 #include "hw/xen/interface/version.h"
+#include "hw/xen/interface/sched.h"
 
 static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz,
   bool is_write)
@@ -170,6 +173,75 @@ static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit 
*exit, X86CPU *cpu,
 return true;
 }
 
+int kvm_xen_soft_reset(void)
+{
+assert(qemu_mutex_iothread_locked());
+
+trace_kvm_xen_soft_reset();
+
+/* Nothing to reset... yet. */
+return 0;
+}
+
+static int schedop_shutdown(CPUState *cs, uint64_t arg)
+{
+struct sched_shutdown shutdown;
+int ret = 0;
+
+/* No need for 32/64 compat handling */
+qemu_build_assert(sizeof(shutdown) == 4);
+
+if (kvm_copy_from_gva(cs, arg, &shutdown, sizeof(shutdown))) {
+return -EFAULT;
+}
+
+switch (shutdown.reason) {
+case SHUTDOWN_crash:
+cpu_dump_state(cs, stderr, CPU_DUMP_CODE);
+qemu_system_guest_panicked(NULL);
+break;
+
+case SHUTDOWN_reboot:
+qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
+break;
+
+case SHUTDOWN_poweroff:
+qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
+break;
+
+case SHUTDOWN_soft_reset:
+qemu_mutex_lock_iothread();
+ret = kvm_xen_soft_reset();
+qemu_mutex_unlock_iothread();
+break;
+
+default:
+ret = -EINVAL;
+break;
+}
+
+return ret;
+}
+
+static bool kvm_xen_hcall_sched_op(struct kvm_xen_exit *exit, X86CPU *cpu,
+   int cmd, uint64_t arg)
+{
+CPUState *cs = CPU(cpu);
+int err = -ENOSYS;
+
+switch (cmd) {
+case SCHEDOP_shutdown:
+err = schedop_shutdown(cs, arg);
+break;
+
+default:
+return false;
+}
+
+exit->u.hcall.result = err;
+return true;
+}
+
 static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
 {
 uint16_t code = exit->u.hcall.input;
@@ -180,6 +252,9 @@ static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct 
kvm_xen_exit *exit)
 }
 
 switch (code) {
+case __HYPERVISOR_sched_op:
+return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0],
+  exit->u.hcall.params[1]);
 case __HYPERVISOR_xen_version:
 return kvm_xen_hcall_xen_version(exit, cpu, exit->u.hcall.params[0],
  exit->u.hcall.params[1]);
-- 
2.39.0




[PATCH v15 31/60] hw/xen: Implement EVTCHNOP_unmask

2023-03-01 Thread David Woodhouse
From: David Woodhouse 

This finally comes with a mechanism for actually injecting events into
the guest vCPU, with all the atomic-test-and-set that's involved in
setting the bit in the shinfo, then the index in the vcpu_info, and
injecting either the lapic vector as MSI, or letting KVM inject the
bare vector.

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/i386/kvm/xen_evtchn.c  | 175 ++
 hw/i386/kvm/xen_evtchn.h  |   2 +
 target/i386/kvm/xen-emu.c |  12 +++
 3 files changed, 189 insertions(+)

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index c57d36b492..54707f1f9f 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -224,6 +224,13 @@ int xen_evtchn_set_callback_param(uint64_t param)
 return ret;
 }
 
+static void inject_callback(XenEvtchnState *s, uint32_t vcpu)
+{
+int type = s->callback_param >> CALLBACK_VIA_TYPE_SHIFT;
+
+kvm_xen_inject_vcpu_callback_vector(vcpu, type);
+}
+
 static bool valid_port(evtchn_port_t port)
 {
 if (!port) {
@@ -294,6 +301,152 @@ int xen_evtchn_status_op(struct evtchn_status *status)
 return 0;
 }
 
+/*
+ * Never thought I'd hear myself say this, but C++ templates would be
+ * kind of nice here.
+ *
+ * template static int do_unmask_port(T *shinfo, ...);
+ */
+static int do_unmask_port_lm(XenEvtchnState *s, evtchn_port_t port,
+ bool do_unmask, struct shared_info *shinfo,
+ struct vcpu_info *vcpu_info)
+{
+const int bits_per_word = BITS_PER_BYTE * 
sizeof(shinfo->evtchn_pending[0]);
+typeof(shinfo->evtchn_pending[0]) mask;
+int idx = port / bits_per_word;
+int offset = port % bits_per_word;
+
+mask = 1UL << offset;
+
+if (idx >= bits_per_word) {
+return -EINVAL;
+}
+
+if (do_unmask) {
+/*
+ * If this is a true unmask operation, clear the mask bit. If
+ * it was already unmasked, we have nothing further to do.
+ */
+if (!((qatomic_fetch_and(&shinfo->evtchn_mask[idx], ~mask) & mask))) {
+return 0;
+}
+} else {
+/*
+ * This is a pseudo-unmask for affinity changes. We don't
+ * change the mask bit, and if it's *masked* we have nothing
+ * else to do.
+ */
+if (qatomic_fetch_or(&shinfo->evtchn_mask[idx], 0) & mask) {
+return 0;
+}
+}
+
+/* If the event was not pending, we're done. */
+if (!(qatomic_fetch_or(&shinfo->evtchn_pending[idx], 0) & mask)) {
+return 0;
+}
+
+/* Now on to the vcpu_info evtchn_pending_sel index... */
+mask = 1UL << idx;
+
+/* If a port in this word was already pending for this vCPU, all done. */
+if (qatomic_fetch_or(&vcpu_info->evtchn_pending_sel, mask) & mask) {
+return 0;
+}
+
+/* Set evtchn_upcall_pending for this vCPU */
+if (qatomic_fetch_or(&vcpu_info->evtchn_upcall_pending, 1)) {
+return 0;
+}
+
+inject_callback(s, s->port_table[port].vcpu);
+
+return 0;
+}
+
+static int do_unmask_port_compat(XenEvtchnState *s, evtchn_port_t port,
+ bool do_unmask,
+ struct compat_shared_info *shinfo,
+ struct compat_vcpu_info *vcpu_info)
+{
+const int bits_per_word = BITS_PER_BYTE * 
sizeof(shinfo->evtchn_pending[0]);
+typeof(shinfo->evtchn_pending[0]) mask;
+int idx = port / bits_per_word;
+int offset = port % bits_per_word;
+
+mask = 1UL << offset;
+
+if (idx >= bits_per_word) {
+return -EINVAL;
+}
+
+if (do_unmask) {
+/*
+ * If this is a true unmask operation, clear the mask bit. If
+ * it was already unmasked, we have nothing further to do.
+ */
+if (!((qatomic_fetch_and(&shinfo->evtchn_mask[idx], ~mask) & mask))) {
+return 0;
+}
+} else {
+/*
+ * This is a pseudo-unmask for affinity changes. We don't
+ * change the mask bit, and if it's *masked* we have nothing
+ * else to do.
+ */
+if (qatomic_fetch_or(&shinfo->evtchn_mask[idx], 0) & mask) {
+return 0;
+}
+}
+
+/* If the event was not pending, we're done. */
+if (!(qatomic_fetch_or(&shinfo->evtchn_pending[idx], 0) & mask)) {
+return 0;
+}
+
+/* Now on to the vcpu_info evtchn_pending_sel index... */
+mask = 1UL << idx;
+
+/* If a port in this word was already pending for this vCPU, all done. */
+if (qatomic_fetch_or(&vcpu_info->evtchn_pending_sel, mask) & mask) {
+return 0;
+}
+
+/* Set evtchn_upcall_pending for this vCPU */
+if (qatomic_fetch_or(&vcpu_info->evtchn_upca

[PATCH v15 53/60] hw/xen: Automatically add xen-platform PCI device for emulated Xen guests

2023-03-01 Thread David Woodhouse
From: David Woodhouse 

It isn't strictly mandatory but Linux guests at least will only map
their grant tables over the dummy BAR that it provides, and don't have
sufficient wit to map them in any other unused part of their guest
address space. So include it by default for minimal surprise factor.

As I come to document "how to run a Xen guest in QEMU", this means one
fewer thing to tell the user about, according to the mantra of "if it
needs documenting, fix it first, then document what remains".

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/i386/pc.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index f4a08cc23f..fd17ce7a94 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1315,6 +1315,9 @@ void pc_basic_device_init(struct PCMachineState *pcms,
 #ifdef CONFIG_XEN_EMU
 if (xen_mode == XEN_EMULATE) {
 xen_evtchn_connect_gsis(gsi);
+if (pcms->bus) {
+pci_create_simple(pcms->bus, -1, "xen-platform");
+}
 }
 #endif
 
-- 
2.39.0




[PATCH v15 58/60] kvm/i386: Add xen-evtchn-max-pirq property

2023-03-01 Thread David Woodhouse
From: David Woodhouse 

The default number of PIRQs is set to 256 to avoid issues with 32-bit MSI
devices. Allow it to be increased if the user desires.
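
For example (an illustrative command line; the xen-version value here is just
an example):

    qemu-system-x86_64 -accel kvm,xen-version=0x4000e,xen-evtchn-max-pirq=1024 ...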

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 accel/kvm/kvm-all.c   |  1 +
 hw/i386/kvm/xen_evtchn.c  | 21 +++--
 include/sysemu/kvm_int.h  |  1 +
 include/sysemu/kvm_xen.h  |  1 +
 target/i386/kvm/kvm.c | 34 ++
 target/i386/kvm/xen-emu.c |  6 ++
 6 files changed, 54 insertions(+), 10 deletions(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 3d8e400bbf..f2a6ea6a68 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -3705,6 +3705,7 @@ static void kvm_accel_instance_init(Object *obj)
 s->notify_window = 0;
 s->xen_version = 0;
 s->xen_gnttab_max_frames = 64;
+s->xen_evtchn_max_pirq = 256;
 }
 
 /**
diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index 69c0204d4f..886fbf6b3b 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -302,17 +302,18 @@ void xen_evtchn_create(void)
 }
 
 /*
- * We could parameterise the number of PIRQs available if needed,
- * but for now limit it to 256. The Xen scheme for encoding PIRQ#
- * into an MSI message is not compatible with 32-bit MSI, as it
- * puts the high bits of the PIRQ# into the high bits of the MSI
- * message address, instead of using the Extended Destination ID
- * in address bits 4-11 which perhaps would have been a better
- * choice. So to keep life simple, just stick with 256 as the
- * default, which conveniently doesn't need to set anything
- * outside the low 32 bits of the address.
+ * The Xen scheme for encoding PIRQ# into an MSI message is not
+ * compatible with 32-bit MSI, as it puts the high bits of the
+ * PIRQ# into the high bits of the MSI message address, instead of
+ * using the Extended Destination ID in address bits 4-11 which
+ * perhaps would have been a better choice.
+ *
+ * To keep life simple, kvm_accel_instance_init() initialises the
+ * default to 256, which conveniently doesn't need to set anything
+ * outside the low 32 bits of the address. It can be increased by
+ * setting the xen-evtchn-max-pirq property.
  */
-s->nr_pirqs = 256;
+s->nr_pirqs = kvm_xen_get_evtchn_max_pirq();
 
 s->nr_pirq_inuse_words = DIV_ROUND_UP(s->nr_pirqs, 64);
 s->pirq_inuse_bitmap = g_new0(uint64_t, s->nr_pirq_inuse_words);
diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
index 39ce4d36f6..a641c974ea 100644
--- a/include/sysemu/kvm_int.h
+++ b/include/sysemu/kvm_int.h
@@ -121,6 +121,7 @@ struct KVMState
 uint32_t xen_version;
 uint32_t xen_caps;
 uint16_t xen_gnttab_max_frames;
+uint16_t xen_evtchn_max_pirq;
 };
 
 void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
diff --git a/include/sysemu/kvm_xen.h b/include/sysemu/kvm_xen.h
index b2aafaf7ab..595abfbe40 100644
--- a/include/sysemu/kvm_xen.h
+++ b/include/sysemu/kvm_xen.h
@@ -26,6 +26,7 @@ void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, 
int type);
 void kvm_xen_set_callback_asserted(void);
 int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port);
 uint16_t kvm_xen_get_gnttab_max_frames(void);
+uint16_t kvm_xen_get_evtchn_max_pirq(void);
 
 #define kvm_xen_has_cap(cap) (!!(kvm_xen_get_caps() &   \
  KVM_XEN_HVM_CONFIG_ ## cap))
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index d390137f02..1aef54f87e 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -5922,6 +5922,33 @@ static void kvm_arch_set_xen_gnttab_max_frames(Object 
*obj, Visitor *v,
 s->xen_gnttab_max_frames = value;
 }
 
+static void kvm_arch_get_xen_evtchn_max_pirq(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+KVMState *s = KVM_STATE(obj);
+uint16_t value = s->xen_evtchn_max_pirq;
+
+visit_type_uint16(v, name, &value, errp);
+}
+
+static void kvm_arch_set_xen_evtchn_max_pirq(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+KVMState *s = KVM_STATE(obj);
+Error *error = NULL;
+uint16_t value;
+
+visit_type_uint16(v, name, &value, &error);
+if (error) {
+error_propagate(errp, error);
+return;
+}
+
+s->xen_evtchn_max_pirq = value;
+}
+
 void kvm_arch_accel_class_init(ObjectClass *oc)
 {
 object_class_property_add_enum(oc, "notify-vmexit", "NotifyVMexitOption",
@@ -5954,6 +5981,13 @@ void kvm_arch_accel_class_init(ObjectClass *oc)
   NULL, NULL);
 object_class_property_set_descrip

[PATCH v15 20/60] i386/xen: implement HYPERVISOR_vcpu_op

2023-03-01 Thread David Woodhouse
From: Joao Martins 

This handles the case where the guest tries to register a vcpu_info.
Since vcpu_info placement is optional in the minimum ABI, we can
simply fail with -ENOSYS for now.
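For context, the guest-side sequence being rejected looks roughly like the
sketch below (Linux-style helpers such as virt_to_gfn() and per_cpu() are
illustrative, not part of this patch); a well-behaved guest falls back to the
vcpu_info array embedded in shared_info when the hypercall fails:

    /* Guest-side sketch: try to place a dedicated vcpu_info. */
    struct vcpu_register_vcpu_info info = {
        .mfn    = virt_to_gfn(vcpup),      /* frame containing vcpu_info */
        .offset = offset_in_page(vcpup),   /* offset within that frame   */
    };

    if (HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info) < 0) {
        /* Placement is optional: fall back to the shared_info slot. */
        per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
    }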

Signed-off-by: Joao Martins 
Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 target/i386/kvm/xen-emu.c | 25 +
 1 file changed, 25 insertions(+)

diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index 4002b1b797..e5ae0a9a38 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -27,6 +27,7 @@
 #include "hw/xen/interface/sched.h"
 #include "hw/xen/interface/memory.h"
 #include "hw/xen/interface/hvm/hvm_op.h"
+#include "hw/xen/interface/vcpu.h"
 
 #include "xen-compat.h"
 
@@ -363,6 +364,25 @@ static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu,
 }
 }
 
+static bool kvm_xen_hcall_vcpu_op(struct kvm_xen_exit *exit, X86CPU *cpu,
+  int cmd, int vcpu_id, uint64_t arg)
+{
+int err;
+
+switch (cmd) {
+case VCPUOP_register_vcpu_info:
+/* no vcpu info placement for now */
+err = -ENOSYS;
+break;
+
+default:
+return false;
+}
+
+exit->u.hcall.result = err;
+return true;
+}
+
 int kvm_xen_soft_reset(void)
 {
 int err;
@@ -464,6 +484,11 @@ static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
 case __HYPERVISOR_sched_op:
 return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0],
   exit->u.hcall.params[1]);
+case __HYPERVISOR_vcpu_op:
+return kvm_xen_hcall_vcpu_op(exit, cpu,
+ exit->u.hcall.params[0],
+ exit->u.hcall.params[1],
+ exit->u.hcall.params[2]);
 case __HYPERVISOR_hvm_op:
 return kvm_xen_hcall_hvm_op(exit, cpu, exit->u.hcall.params[0],
 exit->u.hcall.params[1]);
-- 
2.39.0




[PATCH v15 52/60] hw/xen: Add basic ring handling to xenstore

2023-03-01 Thread David Woodhouse
From: David Woodhouse 

Extract requests, return ENOSYS to all of them. This is enough to allow
older Linux guests to boot, as they need *something* back but it doesn't
matter much what.

A full implementation of a single-tenant internal XenStore copy-on-write
tree with transactions and watches is waiting in the wings to be sent in
a subsequent round of patches along with hooking up the actual PV disk
back end in qemu, but this is enough to get guests booting for now.
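For reference, each request and response on the ring is a fixed header
followed by ->len bytes of payload. The layout below mirrors the xsd_sockmsg
definition in Xen's io/xs_wire.h and is shown only for illustration;
XENSTORE_HEADER_SIZE in the code below is the size of this header:

    #include <stdint.h>

    struct xsd_sockmsg {
        uint32_t type;    /* XS_READ, XS_WRITE, ...; XS_ERROR in error replies */
        uint32_t req_id;  /* request identifier, echoed back in the response   */
        uint32_t tx_id;   /* transaction id, 0 if not part of a transaction    */
        uint32_t len;     /* length of the payload following this header       */
    };

So the ENOSYS reply generated by process_req() below is simply a header with
type == XS_ERROR and the NUL-terminated string "ENOSYS" as its payload.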

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/i386/kvm/xen_xenstore.c | 254 -
 1 file changed, 251 insertions(+), 3 deletions(-)

diff --git a/hw/i386/kvm/xen_xenstore.c b/hw/i386/kvm/xen_xenstore.c
index e8abddae57..14193ef3f9 100644
--- a/hw/i386/kvm/xen_xenstore.c
+++ b/hw/i386/kvm/xen_xenstore.c
@@ -192,18 +192,266 @@ uint16_t xen_xenstore_get_port(void)
 return s->guest_port;
 }
 
+static bool req_pending(XenXenstoreState *s)
+{
+struct xsd_sockmsg *req = (struct xsd_sockmsg *)s->req_data;
+
+return s->req_offset == XENSTORE_HEADER_SIZE + req->len;
+}
+
+static void reset_req(XenXenstoreState *s)
+{
+memset(s->req_data, 0, sizeof(s->req_data));
+s->req_offset = 0;
+}
+
+static void reset_rsp(XenXenstoreState *s)
+{
+s->rsp_pending = false;
+
+memset(s->rsp_data, 0, sizeof(s->rsp_data));
+s->rsp_offset = 0;
+}
+
+static void process_req(XenXenstoreState *s)
+{
+struct xsd_sockmsg *req = (struct xsd_sockmsg *)s->req_data;
+struct xsd_sockmsg *rsp = (struct xsd_sockmsg *)s->rsp_data;
+const char enosys[] = "ENOSYS";
+
+assert(req_pending(s));
+assert(!s->rsp_pending);
+
+rsp->type = XS_ERROR;
+rsp->req_id = req->req_id;
+rsp->tx_id = req->tx_id;
+rsp->len = sizeof(enosys);
+memcpy((void *)&rsp[1], enosys, sizeof(enosys));
+
+s->rsp_pending = true;
+reset_req(s);
+}
+
+static unsigned int copy_from_ring(XenXenstoreState *s, uint8_t *ptr,
+   unsigned int len)
+{
+if (!len) {
+return 0;
+}
+
+XENSTORE_RING_IDX prod = qatomic_read(&s->xs->req_prod);
+XENSTORE_RING_IDX cons = qatomic_read(&s->xs->req_cons);
+unsigned int copied = 0;
+
+/* Ensure the ring contents don't cross the req_prod access. */
+smp_rmb();
+
+while (len) {
+unsigned int avail = prod - cons;
+unsigned int offset = MASK_XENSTORE_IDX(cons);
+unsigned int copylen = avail;
+
+if (avail > XENSTORE_RING_SIZE) {
+error_report("XenStore ring handling error");
+s->fatal_error = true;
+break;
+} else if (avail == 0) {
+break;
+}
+
+if (copylen > len) {
+copylen = len;
+}
+if (copylen > XENSTORE_RING_SIZE - offset) {
+copylen = XENSTORE_RING_SIZE - offset;
+}
+
+memcpy(ptr, &s->xs->req[offset], copylen);
+copied += copylen;
+
+ptr += copylen;
+len -= copylen;
+
+cons += copylen;
+}
+
+/*
+ * Not sure this ever mattered except on Alpha, but this barrier
+ * is to ensure that the update to req_cons is globally visible
+ * only after we have consumed all the data from the ring, and we
+ * don't end up seeing data written to the ring *after* the other
+ * end sees the update and writes more to the ring. Xen's own
+ * xenstored has the same barrier here (although with no comment
+ * at all, obviously, because it's Xen code).
+ */
+smp_mb();
+
+qatomic_set(&s->xs->req_cons, cons);
+
+return copied;
+}
+
+static unsigned int copy_to_ring(XenXenstoreState *s, uint8_t *ptr,
+ unsigned int len)
+{
+if (!len) {
+return 0;
+}
+
+XENSTORE_RING_IDX cons = qatomic_read(&s->xs->rsp_cons);
+XENSTORE_RING_IDX prod = qatomic_read(&s->xs->rsp_prod);
+unsigned int copied = 0;
+
+/*
+ * This matches the barrier in copy_to_ring() (or the guest's
+ * equivalent) between writing the data to the ring and updating
+ * rsp_prod. It protects against the pathological case (which
+ * again I think never happened except on Alpha) where our
+ * subsequent writes to the ring could *cross* the read of
+ * rsp_cons and the guest could see the new data when it was
+ * intending to read the old.
+ */
+smp_mb();
+
+while (len) {
+unsigned int avail = cons + XENSTORE_RING_SIZE - prod;
+unsigned int offset = MASK_XENSTORE_IDX(prod);
+unsigned int copylen = len;
+
+if (avail > XENSTORE_RING_SIZE) {
+error_report("XenStore ring handling error");
+s->fatal_error = true;
+break;
+} else if (avail == 0

[PATCH v15 17/60] i386/xen: implement HYPERVISOR_memory_op

2023-03-01 Thread David Woodhouse
From: Joao Martins 

Specifically XENMEM_add_to_physmap with space XENMAPSPACE_shared_info to
allow the guest to set its shared_info page.
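For context, the guest-side call being serviced looks roughly like the sketch
below (Linux-style, shown only for illustration; gfn stands for whatever guest
frame number the guest has chosen for its shared_info page):

    /* Guest-side sketch: ask for shared_info to be mapped at gfn. */
    struct xen_add_to_physmap xatp = {
        .domid = DOMID_SELF,
        .space = XENMAPSPACE_shared_info,
        .idx   = 0,
        .gpfn  = gfn,
    };

    if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) {
        /* The guest typically cannot continue without shared_info. */
    }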

Signed-off-by: Joao Martins 
[dwmw2: Use the xen_overlay device, add compat support]
Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 target/i386/kvm/trace-events |   1 +
 target/i386/kvm/xen-compat.h |  27 
 target/i386/kvm/xen-emu.c| 116 ++-
 3 files changed, 143 insertions(+), 1 deletion(-)
 create mode 100644 target/i386/kvm/xen-compat.h

diff --git a/target/i386/kvm/trace-events b/target/i386/kvm/trace-events
index bb732e1da8..8e9f269f56 100644
--- a/target/i386/kvm/trace-events
+++ b/target/i386/kvm/trace-events
@@ -9,3 +9,4 @@ kvm_x86_update_msi_routes(int num) "Updated %d MSI routes"
 # xen-emu.c
 kvm_xen_hypercall(int cpu, uint8_t cpl, uint64_t input, uint64_t a0, uint64_t a1, uint64_t a2, uint64_t ret) "xen_hypercall: cpu %d cpl %d input %" PRIu64 " a0 0x%" PRIx64 " a1 0x%" PRIx64 " a2 0x%" PRIx64" ret 0x%" PRIx64
 kvm_xen_soft_reset(void) ""
+kvm_xen_set_shared_info(uint64_t gfn) "shared info at gfn 0x%" PRIx64
diff --git a/target/i386/kvm/xen-compat.h b/target/i386/kvm/xen-compat.h
new file mode 100644
index 00..2d852e2a28
--- /dev/null
+++ b/target/i386/kvm/xen-compat.h
@@ -0,0 +1,27 @@
+/*
+ * Xen HVM emulation support in KVM
+ *
+ * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_I386_KVM_XEN_COMPAT_H
+#define QEMU_I386_KVM_XEN_COMPAT_H
+
+#include "hw/xen/interface/memory.h"
+
+typedef uint32_t compat_pfn_t;
+typedef uint32_t compat_ulong_t;
+
+struct compat_xen_add_to_physmap {
+domid_t domid;
+uint16_t size;
+unsigned int space;
+compat_ulong_t idx;
+compat_pfn_t gpfn;
+};
+
+#endif /* QEMU_I386_KVM_XEN_COMPAT_H */
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index be6d85f2cb..5d79827128 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -12,6 +12,7 @@
 #include "qemu/osdep.h"
 #include "qemu/log.h"
 #include "qemu/main-loop.h"
+#include "hw/xen/xen.h"
 #include "sysemu/kvm_int.h"
 #include "sysemu/kvm_xen.h"
 #include "kvm/kvm_i386.h"
@@ -24,6 +25,15 @@
 
 #include "hw/xen/interface/version.h"
 #include "hw/xen/interface/sched.h"
+#include "hw/xen/interface/memory.h"
+
+#include "xen-compat.h"
+
+#ifdef TARGET_X86_64
+#define hypercall_compat32(longmode) (!(longmode))
+#else
+#define hypercall_compat32(longmode) (false)
+#endif
 
 static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz,
   bool is_write)
@@ -175,13 +185,114 @@ static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit *exit, X86CPU *cpu,
 return true;
 }
 
+static int xen_set_shared_info(uint64_t gfn)
+{
+uint64_t gpa = gfn << TARGET_PAGE_BITS;
+int err;
+
+QEMU_IOTHREAD_LOCK_GUARD();
+
+/*
+ * The xen_overlay device tells KVM about it too, since it had to
+ * do that on migration load anyway (unless we're going to jump
+ * through lots of hoops to maintain the fiction that this isn't
+ * KVM-specific).
+ */
+err = xen_overlay_map_shinfo_page(gpa);
+if (err) {
+return err;
+}
+
+trace_kvm_xen_set_shared_info(gfn);
+
+return err;
+}
+
+static int add_to_physmap_one(uint32_t space, uint64_t idx, uint64_t gfn)
+{
+switch (space) {
+case XENMAPSPACE_shared_info:
+if (idx > 0) {
+return -EINVAL;
+}
+return xen_set_shared_info(gfn);
+
+case XENMAPSPACE_grant_table:
+case XENMAPSPACE_gmfn:
+case XENMAPSPACE_gmfn_range:
+return -ENOTSUP;
+
+case XENMAPSPACE_gmfn_foreign:
+case XENMAPSPACE_dev_mmio:
+return -EPERM;
+
+default:
+return -EINVAL;
+}
+}
+
+static int do_add_to_physmap(struct kvm_xen_exit *exit, X86CPU *cpu,
+ uint64_t arg)
+{
+struct xen_add_to_physmap xatp;
+CPUState *cs = CPU(cpu);
+
+if (hypercall_compat32(exit->u.hcall.longmode)) {
+struct compat_xen_add_to_physmap xatp32;
+
+qemu_build_assert(sizeof(struct compat_xen_add_to_physmap) == 16);
+if (kvm_copy_from_gva(cs, arg, &xatp32, sizeof(xatp32))) {
+return -EFAULT;
+}
+xatp.domid = xatp32.domid;
+xatp.size = xatp32.size;
+xatp.space = xatp32.space;
+xatp.idx = xatp32.idx;
+xatp.gpfn = xatp32.gpfn;
+} else {
+if (kvm_copy_from_gva(cs, arg, &xatp, sizeof(xatp))) {
+return -EFAULT;
+}
+}
+
+if (xat

[PATCH v15 46/60] hw/xen: Implement GNTTABOP_query_size

2023-03-01 Thread David Woodhouse
From: David Woodhouse 

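For context, a guest issues this operation to discover how many grant-table
frames it currently has and how many it may grow to; a guest-side sketch
(illustrative only, not part of this patch):

    struct gnttab_query_size query = {
        .dom = DOMID_SELF,
    };

    if (HYPERVISOR_grant_table_op(GNTTABOP_query_size, &query, 1) == 0 &&
        query.status == GNTST_okay) {
        /* query.nr_frames and query.max_nr_frames are now valid. */
    }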
Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/i386/kvm/xen_gnttab.c  | 19 +++
 hw/i386/kvm/xen_gnttab.h  |  2 ++
 target/i386/kvm/xen-emu.c | 16 +++-
 3 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/hw/i386/kvm/xen_gnttab.c b/hw/i386/kvm/xen_gnttab.c
index b54a94e2bd..1e691ded32 100644
--- a/hw/i386/kvm/xen_gnttab.c
+++ b/hw/i386/kvm/xen_gnttab.c
@@ -211,3 +211,22 @@ int xen_gnttab_get_version_op(struct gnttab_get_version *get)
 get->version = 1;
 return 0;
 }
+
+int xen_gnttab_query_size_op(struct gnttab_query_size *size)
+{
+XenGnttabState *s = xen_gnttab_singleton;
+
+if (!s) {
+return -ENOTSUP;
+}
+
+if (size->dom != DOMID_SELF && size->dom != xen_domid) {
+size->status = GNTST_bad_domain;
+return 0;
+}
+
+size->status = GNTST_okay;
+size->nr_frames = s->nr_frames;
+size->max_nr_frames = s->max_frames;
+return 0;
+}
diff --git a/hw/i386/kvm/xen_gnttab.h b/hw/i386/kvm/xen_gnttab.h
index 79579677ba..3bdbe96191 100644
--- a/hw/i386/kvm/xen_gnttab.h
+++ b/hw/i386/kvm/xen_gnttab.h
@@ -17,7 +17,9 @@ int xen_gnttab_map_page(uint64_t idx, uint64_t gfn);
 
 struct gnttab_set_version;
 struct gnttab_get_version;
+struct gnttab_query_size;
 int xen_gnttab_set_version_op(struct gnttab_set_version *set);
 int xen_gnttab_get_version_op(struct gnttab_get_version *get);
+int xen_gnttab_query_size_op(struct gnttab_query_size *size);
 
 #endif /* QEMU_XEN_GNTTAB_H */
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index d49b6117f1..3b46cab1da 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -1207,7 +1207,21 @@ static bool kvm_xen_hcall_gnttab_op(struct kvm_xen_exit *exit, X86CPU *cpu,
 }
 break;
 }
-case GNTTABOP_query_size:
+case GNTTABOP_query_size: {
+struct gnttab_query_size size;
+
+qemu_build_assert(sizeof(size) == 16);
+if (kvm_copy_from_gva(cs, arg, &size, sizeof(size))) {
+err = -EFAULT;
+break;
+}
+
+err = xen_gnttab_query_size_op(&size);
+if (!err && kvm_copy_to_gva(cs, arg, &size, sizeof(size))) {
+err = -EFAULT;
+}
+break;
+}
 case GNTTABOP_setup_table:
 case GNTTABOP_copy:
 case GNTTABOP_map_grant_ref:
-- 
2.39.0



