Allocate and hook up a "shared info page" that provides a matrix of pending events and some other information, like the hypervisor timecounter. I'd like to keep the debugging function xen_print_info_page around for a while.
OK? --- sys/dev/pv/xen.c | 82 ++++++++++++ sys/dev/pv/xenreg.h | 368 ++++++++++++++++++++++++++++++++++++++++++++++++++++ sys/dev/pv/xenvar.h | 2 + 3 files changed, 452 insertions(+) diff --git sys/dev/pv/xen.c sys/dev/pv/xen.c index f1198d1..e8aeeb8 100644 --- sys/dev/pv/xen.c +++ sys/dev/pv/xen.c @@ -35,10 +35,11 @@ struct xen_softc *xen_sc; void xen_find_base(struct xen_softc *); int xen_init_hypercall(struct xen_softc *); int xen_getversion(struct xen_softc *); int xen_getfeatures(struct xen_softc *); +int xen_init_info_page(struct xen_softc *); int xen_match(struct device *, void *, void *); void xen_attach(struct device *, struct device *, void *); void xen_resume(struct device *); int xen_activate(struct device *, int); @@ -84,10 +85,13 @@ xen_attach(struct device *parent, struct device *self, void *aux) if (xen_getversion(sc)) return; if (xen_getfeatures(sc)) return; + + if (xen_init_info_page(sc)) + return; } void xen_resume(struct device *self) { @@ -336,5 +340,83 @@ xen_getfeatures(struct xen_softc *sc) printf("%s: features %b\n", sc->sc_dev.dv_xname, sc->sc_features, "\20\014DOM0\013PIRQ\012PVCLOCK\011CBVEC\010GNTFLAGS\007HMA" "\006PTUPD\005PAE4G\004SUPERVISOR\003AUTOPMAP\002WDT\001WPT"); return (0); } + +#ifdef XEN_DEBUG +void +xen_print_info_page(void) +{ + struct xen_softc *sc = xen_sc; + struct shared_info *s = sc->sc_ipg; + struct vcpu_info *v; + int i; + + membar_sync(); + for (i = 0; i < XEN_LEGACY_MAX_VCPUS; i++) { + v = &s->vcpu_info[i]; + if (!v->evtchn_upcall_pending && !v->evtchn_upcall_mask && + !v->evtchn_pending_sel && !v->time.version && + !v->time.tsc_timestamp && !v->time.system_time && + !v->time.tsc_to_system_mul && !v->time.tsc_shift) + continue; + printf("vcpu%d:\n" + " upcall_pending=%02x upcall_mask=%02x pending_sel=%#lx\n" + " time version=%u tsc=%llu system=%llu\n" + " time mul=%u shift=%d\n" + , i, v->evtchn_upcall_pending, v->evtchn_upcall_mask, + v->evtchn_pending_sel, v->time.version, + v->time.tsc_timestamp, 
v->time.system_time, + v->time.tsc_to_system_mul, v->time.tsc_shift); + } + printf("pending events: "); + for (i = 0; i < nitems(s->evtchn_pending); i++) { + if (s->evtchn_pending[i] == 0) + continue; + printf(" %d:%#lx", i, s->evtchn_pending[i]); + } + printf("\nmasked events: "); + for (i = 0; i < nitems(s->evtchn_mask); i++) { + if (s->evtchn_mask[i] == 0xffffffffffffffffULL) + continue; + printf(" %d:%#lx", i, s->evtchn_mask[i]); + } + printf("\nwc ver=%u sec=%u nsec=%u\n", s->wc_version, s->wc_sec, + s->wc_nsec); + printf("arch maxpfn=%lu framelist=%lu nmi=%lu\n", s->arch.max_pfn, + s->arch.pfn_to_mfn_frame_list, s->arch.nmi_reason); +} +#endif /* XEN_DEBUG */ + +int +xen_init_info_page(struct xen_softc *sc) +{ + struct xen_add_to_physmap xatp; + paddr_t pa; + + sc->sc_ipg = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT | M_ZERO); + if (sc->sc_ipg == NULL) { + printf("%s: failed to allocate shared info page\n", + sc->sc_dev.dv_xname); + return (-1); + } + if (!pmap_extract(pmap_kernel(), (vaddr_t)sc->sc_ipg, &pa)) { + printf("%s: shared info page PA extraction failed\n", + sc->sc_dev.dv_xname); + free(sc->sc_ipg, M_DEVBUF, PAGE_SIZE); + return (-1); + } + xatp.domid = DOMID_SELF; + xatp.idx = 0; + xatp.space = XENMAPSPACE_shared_info; + xatp.gpfn = atop(pa); + if (xen_hypercall(sc, memory_op, 2, XENMEM_add_to_physmap, &xatp)) { + printf("%s: failed to register shared info page\n", + sc->sc_dev.dv_xname); + free(sc->sc_ipg, M_DEVBUF, PAGE_SIZE); + return (-1); + } + DPRINTF("%s: shared info page at va %p pa %#lx\n", sc->sc_dev.dv_xname, + sc->sc_ipg, pa); + return (0); +} diff --git sys/dev/pv/xenreg.h sys/dev/pv/xenreg.h index 3f646d3..ec45722 100644 --- sys/dev/pv/xenreg.h +++ sys/dev/pv/xenreg.h @@ -85,20 +85,339 @@ # error "Not implemented" #endif #define CPUID_OFFSET_XEN_HYPERCALL 0x2 +#if defined(__i386__) || defined(__amd64__) +struct arch_vcpu_info { + unsigned long cr2; + unsigned long pad; +} __packed; + +typedef unsigned long xen_pfn_t; +typedef unsigned long 
xen_ulong_t; + +/* Maximum number of virtual CPUs in legacy multi-processor guests. */ +#define XEN_LEGACY_MAX_VCPUS 32 + +struct arch_shared_info { + unsigned long max_pfn; /* max pfn that appears in table */ + /* + * Frame containing list of mfns containing list of mfns containing p2m. + */ + xen_pfn_t pfn_to_mfn_frame_list; + unsigned long nmi_reason; + uint64_t pad[32]; +} __packed; +#else +#error "Not implemented" +#endif /* __i386__ || __amd64__ */ + /* * interface/xen.h */ typedef uint16_t domid_t; /* DOMID_SELF is used in certain contexts to refer to oneself. */ #define DOMID_SELF (0x7FF0U) /* + * Event channel endpoints per domain: + * 1024 if a long is 32 bits; 4096 if a long is 64 bits. + */ +#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64) + +struct vcpu_time_info { + /* + * Updates to the following values are preceded and followed by an + * increment of 'version'. The guest can therefore detect updates by + * looking for changes to 'version'. If the least-significant bit of + * the version number is set then an update is in progress and the + * guest must wait to read a consistent set of values. + * + * The correct way to interact with the version number is similar to + * Linux's seqlock: see the implementations of read_seqbegin and + * read_seqretry. + */ + uint32_t version; + uint32_t pad0; + uint64_t tsc_timestamp; /* TSC at last update of time vals. */ + uint64_t system_time; /* Time, in nanosecs, since boot. */ + /* + * Current system time: + * system_time + + * ((((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul) >> 32) + * CPU frequency (Hz): + * ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift + */ + uint32_t tsc_to_system_mul; + int8_t tsc_shift; + int8_t pad1[3]; +} __packed; /* 32 bytes */ + +struct vcpu_info { + /* + * 'evtchn_upcall_pending' is written non-zero by Xen to indicate + * a pending notification for a particular VCPU. 
It is then cleared + * by the guest OS /before/ checking for pending work, thus avoiding + * a set-and-check race. Note that the mask is only accessed by Xen + * on the CPU that is currently hosting the VCPU. This means that the + * pending and mask flags can be updated by the guest without special + * synchronisation (i.e., no need for the x86 LOCK prefix). + * This may seem suboptimal because if the pending flag is set by + * a different CPU then an IPI may be scheduled even when the mask + * is set. However, note: + * 1. The task of 'interrupt holdoff' is covered by the per-event- + * channel mask bits. A 'noisy' event that is continually being + * triggered can be masked at source at this very precise + * granularity. + * 2. The main purpose of the per-VCPU mask is therefore to restrict + * reentrant execution: whether for concurrency control, or to + * prevent unbounded stack usage. Whatever the purpose, we expect + * that the mask will be asserted only for short periods at a time, + * and so the likelihood of a 'spurious' IPI is suitably small. + * The mask is read before making an event upcall to the guest: a + * non-zero mask therefore guarantees that the VCPU will not receive + * an upcall activation. The mask is cleared when the VCPU requests + * to block: this avoids wakeup-waiting races. + */ + uint8_t evtchn_upcall_pending; + uint8_t pad1[3]; + uint8_t evtchn_upcall_mask; + uint8_t pad2[3]; + unsigned long evtchn_pending_sel; + struct arch_vcpu_info arch; + struct vcpu_time_info time; +} __packed; /* 64 bytes (x86) */ + +/* + * Xen/kernel shared data -- pointer provided in start_info. + * + * This structure is defined to be both smaller than a page, and the only data + * on the shared page, but may vary in actual size even within compatible Xen + * versions; guests should not rely on the size of this structure remaining + * constant. 
+ */ +struct shared_info { + struct vcpu_info vcpu_info[XEN_LEGACY_MAX_VCPUS]; + + /* + * A domain can create "event channels" on which it can send and + * receive asynchronous event notifications. There are three classes + * of event that are delivered by this mechanism: + * 1. Bi-directional inter- and intra-domain connections. Domains + * must arrange out-of-band to set up a connection (usually by + * allocating an unbound 'listener' port and advertising that via + * a storage service such as xenstore). + * 2. Physical interrupts. A domain with suitable hardware-access + * privileges can bind an event-channel port to a physical + * interrupt source. + * 3. Virtual interrupts ('events'). A domain can bind an event + * channel port to a virtual interrupt source, such as the + * virtual-timer device or the emergency console. + * + * Event channels are addressed by a "port index". Each channel is + * associated with two bits of information: + * 1. PENDING -- notifies the domain that there is a pending + * notification to be processed. This bit is cleared by the guest. + * 2. MASK -- if this bit is clear then a 0->1 transition of PENDING + * will cause an asynchronous upcall to be scheduled. This bit is + * only updated by the guest. It is read-only within Xen. If a + * channel becomes pending while the channel is masked then the + * 'edge' is lost (i.e., when the channel is unmasked, the guest + * must manually handle pending notifications as no upcall will be + * scheduled by Xen). + * + * To expedite scanning of pending notifications, any 0->1 pending + * transition on an unmasked channel causes a corresponding bit in a + * per-vcpu selector word to be set. Each bit in the selector covers a + * 'C long' in the PENDING bitfield array. + */ + unsigned long evtchn_pending[sizeof(unsigned long) * 8]; + unsigned long evtchn_mask[sizeof(unsigned long) * 8]; + + /* + * Wallclock time: updated only by control software. 
Guests should + * base their gettimeofday() syscall on this wallclock-base value. + */ + uint32_t wc_version; /* Version counter: see vcpu_time_info_t. */ + uint32_t wc_sec; /* Secs 00:00:00 UTC, Jan 1, 1970. */ + uint32_t wc_nsec; /* Nsecs 00:00:00 UTC, Jan 1, 1970. */ + + struct arch_shared_info arch; +} __packed; + + +/* + * interface/hvm/hvm_op.h + */ + +/* Get/set subcommands: extra argument == pointer to xen_hvm_param struct. */ +#define HVMOP_set_param 0 +#define HVMOP_get_param 1 +struct xen_hvm_param { + domid_t domid; /* IN */ + uint32_t index; /* IN */ + uint64_t value; /* IN/OUT */ +}; + +/* + * Parameter space for HVMOP_{set,get}_param. + */ + +/* + * How should CPU0 event-channel notifications be delivered? + * val[63:56] == 0: val[55:0] is a delivery GSI (Global System Interrupt). + * val[63:56] == 1: val[55:0] is a delivery PCI INTx line, as follows: + * Domain = val[47:32], Bus = val[31:16], + * DevFn = val[15: 8], IntX = val[ 1: 0] + * val[63:56] == 2: val[7:0] is a vector number, check for + * XENFEAT_hvm_callback_vector to know if this delivery + * method is available. + * If val == 0 then CPU0 event-channel notifications are not delivered. + */ +#define HVM_PARAM_CALLBACK_IRQ 0 + +/* + * These are not used by Xen. They are here for convenience of HVM-guest + * xenbus implementations. + */ +#define HVM_PARAM_STORE_PFN 1 +#define HVM_PARAM_STORE_EVTCHN 2 + +#define HVM_PARAM_PAE_ENABLED 4 + +#define HVM_PARAM_IOREQ_PFN 5 + +#define HVM_PARAM_BUFIOREQ_PFN 6 +#define HVM_PARAM_BUFIOREQ_EVTCHN 26 + +/* + * Set mode for virtual timers (currently x86 only): + * delay_for_missed_ticks (default): + * Do not advance a vcpu's time beyond the correct delivery time for + * interrupts that have been missed due to preemption. Deliver missed + * interrupts when the vcpu is rescheduled and advance the vcpu's virtual + * time stepwise for each one. 
+ * no_delay_for_missed_ticks: + * As above, missed interrupts are delivered, but guest time always tracks + * wallclock (i.e., real) time while doing so. + * no_missed_ticks_pending: + * No missed interrupts are held pending. Instead, to ensure ticks are + * delivered at some non-zero rate, if we detect missed ticks then the + * internal tick alarm is not disabled if the VCPU is preempted during the + * next tick period. + * one_missed_tick_pending: + * Missed interrupts are collapsed together and delivered as one 'late tick'. + * Guest time always tracks wallclock (i.e., real) time. + */ +#define HVM_PARAM_TIMER_MODE 10 +#define HVMPTM_delay_for_missed_ticks 0 +#define HVMPTM_no_delay_for_missed_ticks 1 +#define HVMPTM_no_missed_ticks_pending 2 +#define HVMPTM_one_missed_tick_pending 3 + +/* Boolean: Enable virtual HPET (high-precision event timer)? (x86-only) */ +#define HVM_PARAM_HPET_ENABLED 11 + +/* Identity-map page directory used by Intel EPT when CR0.PG=0. */ +#define HVM_PARAM_IDENT_PT 12 + +/* Device Model domain, defaults to 0. */ +#define HVM_PARAM_DM_DOMAIN 13 + +/* ACPI S state: currently support S0 and S3 on x86. */ +#define HVM_PARAM_ACPI_S_STATE 14 + +/* TSS used on Intel when CR0.PE=0. */ +#define HVM_PARAM_VM86_TSS 15 + +/* Boolean: Enable aligning all periodic vpts to reduce interrupts */ +#define HVM_PARAM_VPT_ALIGN 16 + +/* Console debug shared memory ring and event channel */ +#define HVM_PARAM_CONSOLE_PFN 17 +#define HVM_PARAM_CONSOLE_EVTCHN 18 + +/* + * Select location of ACPI PM1a and TMR control blocks. 
Currently two locations + * are supported, specified by version 0 or 1 in this parameter: + * - 0: default, use the old addresses + * PM1A_EVT == 0x1f40; PM1A_CNT == 0x1f44; PM_TMR == 0x1f48 + * - 1: use the new default qemu addresses + * PM1A_EVT == 0xb000; PM1A_CNT == 0xb004; PM_TMR == 0xb008 + * You can find these address definitions in <hvm/ioreq.h> + */ +#define HVM_PARAM_ACPI_IOPORTS_LOCATION 19 + +/* Enable blocking memory events, async or sync (pause vcpu until response) + * onchangeonly indicates messages only on a change of value */ +#define HVM_PARAM_MEMORY_EVENT_CR0 20 +#define HVM_PARAM_MEMORY_EVENT_CR3 21 +#define HVM_PARAM_MEMORY_EVENT_CR4 22 +#define HVM_PARAM_MEMORY_EVENT_INT3 23 +#define HVM_PARAM_MEMORY_EVENT_SINGLE_STEP 25 + +#define HVMPME_MODE_MASK (3 << 0) +#define HVMPME_mode_disabled 0 +#define HVMPME_mode_async 1 +#define HVMPME_mode_sync 2 +#define HVMPME_onchangeonly (1 << 2) + +/* Boolean: Enable nestedhvm (hvm only) */ +#define HVM_PARAM_NESTEDHVM 24 + +/* Params for the mem event rings */ +#define HVM_PARAM_PAGING_RING_PFN 27 +#define HVM_PARAM_ACCESS_RING_PFN 28 +#define HVM_PARAM_SHARING_RING_PFN 29 + +#define HVM_NR_PARAMS 30 + +/** The callback method types for Hypervisor event delivery to our domain. */ +enum { + HVM_CB_TYPE_GSI, + HVM_CB_TYPE_PCI_INTX, + HVM_CB_TYPE_VECTOR, + HVM_CB_TYPE_MASK = 0xFF, + HVM_CB_TYPE_SHIFT = 56 +}; + +/** Format for specifying a GSI type callback. */ +enum { + HVM_CB_GSI_GSI_MASK = 0xFFFFFFFF, + HVM_CB_GSI_GSI_SHIFT = 0 +}; +#define HVM_CALLBACK_GSI(gsi) \ + (((uint64_t)HVM_CB_TYPE_GSI << HVM_CB_TYPE_SHIFT) | \ + ((gsi) & HVM_CB_GSI_GSI_MASK) << HVM_CB_GSI_GSI_SHIFT) + +/** Format for specifying a virtual PCI interrupt line GSI style callback. 
*/ +enum { + HVM_CB_PCI_INTX_INTPIN_MASK = 0x3, + HVM_CB_PCI_INTX_INTPIN_SHIFT = 0, + HVM_CB_PCI_INTX_SLOT_MASK = 0x1F, + HVM_CB_PCI_INTX_SLOT_SHIFT = 11, +}; +#define HVM_CALLBACK_PCI_INTX(slot, pin) \ + (((uint64_t)HVM_CB_TYPE_PCI_INTX << HVM_CB_TYPE_SHIFT) | \ + (((slot) & HVM_CB_PCI_INTX_SLOT_MASK) << HVM_CB_PCI_INTX_SLOT_SHIFT) | \ + (((pin) & HVM_CB_PCI_INTX_INTPIN_MASK) << HVM_CB_PCI_INTX_INTPIN_SHIFT)) + +/** Format for specifying a direct IDT vector injection style callback. */ +enum { + HVM_CB_VECTOR_VECTOR_MASK = 0xFFFFFFFF, + HVM_CB_VECTOR_VECTOR_SHIFT = 0 +}; +#define HVM_CALLBACK_VECTOR(vector) \ + (((uint64_t)HVM_CB_TYPE_VECTOR << HVM_CB_TYPE_SHIFT) | \ + (((vector) & HVM_CB_GSI_GSI_MASK) << HVM_CB_GSI_GSI_SHIFT)) + + + +/* * interface/features.h * * Feature flags, reported by XENVER_get_features. */ @@ -142,10 +461,59 @@ typedef uint16_t domid_t; /* x86: pirq can be used by HVM guests */ #define XENFEAT_hvm_pirqs 10 /* operation as Dom0 is supported */ #define XENFEAT_dom0 11 +/* + * interface/memory.h + * + * Memory reservation and information. + */ + +/* + * Increase or decrease the specified domain's memory reservation. + * Returns the number of extents successfully allocated or freed. + * arg == addr of struct xen_memory_reservation. + */ +#define XENMEM_increase_reservation 0 +#define XENMEM_decrease_reservation 1 +#define XENMEM_populate_physmap 6 + +#define XENMAPSPACE_shared_info 0 /* shared info page */ +#define XENMAPSPACE_grant_table 1 /* grant table page */ +#define XENMAPSPACE_gmfn 2 /* GMFN */ +#define XENMAPSPACE_gmfn_range 3 /* GMFN range */ +#define XENMAPSPACE_gmfn_foreign 4 /* GMFN from another domain */ + +/* + * Sets the GPFN at which a particular page appears in the specified guest's + * pseudophysical address space. + * arg == addr of xen_add_to_physmap_t. + */ +#define XENMEM_add_to_physmap 7 +struct xen_add_to_physmap { + /* Which domain to change the mapping for. 
*/ + domid_t domid; + + /* Number of pages to go through for gmfn_range */ + uint16_t size; + + /* Source mapping space. */ +#define XENMAPSPACE_shared_info 0 /* shared info page */ +#define XENMAPSPACE_grant_table 1 /* grant table page */ +#define XENMAPSPACE_gmfn 2 /* GMFN */ +#define XENMAPSPACE_gmfn_range 3 /* GMFN range */ + unsigned int space; + +#define XENMAPIDX_grant_table_status 0x80000000 + + /* Index into source mapping space. */ + xen_ulong_t idx; + + /* GPFN where the source mapping page should appear. */ + xen_pfn_t gpfn; +}; /* * interface/version.h * * Xen version, type, and compile information. diff --git sys/dev/pv/xenvar.h sys/dev/pv/xenvar.h index 3fcfc96..b5db26d 100644 --- sys/dev/pv/xenvar.h +++ sys/dev/pv/xenvar.h @@ -29,10 +29,12 @@ struct xen_softc { struct device sc_dev; uint32_t sc_base; void *sc_hc; uint32_t sc_features; #define XENFEAT_CBVEC (1<<8) + + struct shared_info *sc_ipg; /* HYPERVISOR_shared_info */ }; extern struct xen_softc *xen_sc; /* -- 2.6.3