[RFC PATCH v3 3/3] tools/misc: Add xen-vcpus-stats tool

2023-10-31 Thread Matias Ezequiel Vara Larsen
Add a demonstration tool that uses the stats_table resource to
query vcpus' RUNSTATE_running counter for a DomU.

Signed-off-by: Matias Ezequiel Vara Larsen 
---
Changes in v3:
- use memory layout as discussed at
  https://lists.xenproject.org/archives/html/xen-devel/2023-03/msg00383.html
- use virt_*()
- issue xenforeignmemory_close()

Changes in v2:
- use period instead of frec
- rely on version to ensure reading is coherent 

Changes in v1:
- change the name of the tool to xen-vcpus-stats
- set command line parameters in the same order that are passed
- remove header libs.h
- build by default
- remove errno, strerror, "\n", and indentation
- use errx when errno is not needed
- address better the number of pages requested and error msgs
- use the shared_vcpustatspage_t structure
- use the correct frame id when requesting the resource
---
 tools/misc/Makefile  |   6 ++
 tools/misc/xen-vcpus-stats.c | 132 +++
 2 files changed, 138 insertions(+)
 create mode 100644 tools/misc/xen-vcpus-stats.c

diff --git a/tools/misc/Makefile b/tools/misc/Makefile
index 8b9558b93f..9c7cb50483 100644
--- a/tools/misc/Makefile
+++ b/tools/misc/Makefile
@@ -51,6 +51,7 @@ TARGETS_COPY += xenpvnetboot
 
 # Everything which needs to be built
 TARGETS_BUILD := $(filter-out $(TARGETS_COPY),$(TARGETS_ALL))
+TARGETS_BUILD += xen-vcpus-stats
 
 # ... including build-only targets
 TARGETS_BUILD += $(TARGETS_BUILD-y)
@@ -139,4 +140,9 @@ xencov: xencov.o
 xen-ucode: xen-ucode.o
$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(APPEND_LDFLAGS)
 
+xen-vcpus-stats.o: CFLAGS += $(CFLAGS_libxenforeignmemory)
+
+xen-vcpus-stats: xen-vcpus-stats.o
+	$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(LDLIBS_libxenforeignmemory) $(APPEND_LDFLAGS)
+
 -include $(DEPS_INCLUDE)
diff --git a/tools/misc/xen-vcpus-stats.c b/tools/misc/xen-vcpus-stats.c
new file mode 100644
index 00..f277b6ce8f
--- /dev/null
+++ b/tools/misc/xen-vcpus-stats.c
@@ -0,0 +1,132 @@
+#include <err.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+#include <xenctrl.h>
+#include <xenforeignmemory.h>
+#include <xen/vcpu.h>
+
+/*
+ * Note that virt_*() is used when ordering is required between the hypervisor
+ * and the tool domain. This tool is meant to be arch-agnostic, so add the
+ * corresponding barrier for each architecture.
+ */
+#if defined(__x86_64__)
+#define barrier() asm volatile("" ::: "memory")
+#define virt_rmb() barrier()
+#elif defined(__aarch64__)
+#define dmb(opt) asm volatile("dmb " #opt : : : "memory")
+#define virt_rmb() dmb(ishld)
+#else
+#error Please fill in barrier macros
+#endif
+
+static sig_atomic_t interrupted;
+static void close_handler(int signum)
+{
+interrupted = 1;
+}
+
+int main(int argc, char **argv)
+{
+xenforeignmemory_handle *fh;
+xenforeignmemory_resource_handle *res;
+size_t size;
+int rc, domid, period, vcpu;
+xen_vcpu_shmemstats_t *info_shmem;
+xen_shared_vcpustats_t *info;
+struct sigaction act;
+uint32_t seq;
+uint64_t value;
+
+if ( argc != 4 )
+{
+fprintf(stderr, "Usage: %s   \n", argv[0]);
+return 1;
+}
+
+domid = atoi(argv[1]);
+vcpu = atoi(argv[2]);
+period = atoi(argv[3]);
+
+act.sa_handler = close_handler;
+act.sa_flags = 0;
+sigemptyset(&act.sa_mask);
+sigaction(SIGHUP,  , NULL);
+sigaction(SIGTERM, , NULL);
+sigaction(SIGINT,  , NULL);
+sigaction(SIGALRM, , NULL);
+
+fh = xenforeignmemory_open(NULL, 0);
+
+if ( !fh )
+err(1, "xenforeignmemory_open");
+
+rc = xenforeignmemory_resource_size(
+fh, domid, XENMEM_resource_stats_table,
+XENMEM_resource_stats_table_id_vcpustats, &size);
+
+if ( rc )
+err(1, "Fail: Get size");
+
+res = xenforeignmemory_map_resource(
+fh, domid, XENMEM_resource_stats_table,
+XENMEM_resource_stats_table_id_vcpustats, 0, size >> XC_PAGE_SHIFT,
+(void **)&info_shmem, PROT_READ, 0);
+
+if ( !res )
+err(1, "Fail: Map");
+
+if ( info_shmem->magic != VCPU_STATS_MAGIC )
+{
+fprintf(stderr, "Wrong magic number\n");
+return 1;
+}
+
+if ( offsetof(struct vcpu_stats, runstate_running_time) >= info_shmem->size )
+{
+fprintf(stderr, "The counter is not produced\n");
+return 1;
+}
+
+info = (xen_shared_vcpustats_t*)((void*)info_shmem
+ + info_shmem->offset
+ + info_shmem->stride * vcpu);
+
+if ( info->runstate_running_time & ((uint64_t)1 << 63) )
+{
+fprintf(stderr, "The counter is inactived or has overflowed\n");
+return 1;
+}
+
+while ( !interrupted )
+{
+sleep(period);
+do {
+seq = info->seq;
+virt_rmb();
+value = info->runstate_running_time;
+virt_rmb();
+} while ( (seq & 1) || (seq != info->seq) );
+printf("RUNSTATE_running time: %" PRIu64 "\n", value);
+}
+
+rc = xenforeignmemory_unmap_resource(fh, res);
+if ( rc )
+err(1, "Fail: Unmap");
+
+rc = xenforeignmemory_close(fh);
+if ( rc )
+err(1, "Fail: Close");
+
+return 0;
+}
[RFC PATCH v3 2/3] x86/mm: Do not validate/devalidate PGT_none type

2023-10-31 Thread Matias Ezequiel Vara Larsen
This commit prevents PGT_none pages from being validated/devalidated. This is
required for the use-case in which a guest-agnostic resource is allocated. In
this case, these pages may be accessible by an "owning" PV domain. To lock the
pages from the guest's pov, they are required to be marked PGT_none with a
single type ref. In particular, this commit makes the stats_table resource
type use PGT_none during get_page_and_type().
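
As an illustration (a sketch based on the v2 discussion quoted later in this
thread, not the exact patch hunk), the guest-agnostic allocation path can then
lock each page r/o from the guest's pov along these lines:

    pg = alloc_domheap_page(d, MEMF_no_refcount);
    if ( !pg )
        return -ENOMEM;

    /* A single PGT_none type ref: no validation, and the guest can't write. */
    if ( unlikely(!get_page_and_type(pg, d, PGT_none)) )
    {
        put_page_alloc_ref(pg);
        return -ENODATA;
    }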

Signed-off-by: Matias Ezequiel Vara Larsen 
---
 xen/arch/x86/mm.c   | 3 ++-
 xen/common/memory.c | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 5812321cae..d2f311abe4 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -2787,6 +2787,7 @@ static int _put_page_type(struct page_info *page, unsigned int flags,
 {
 case 0:
 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
+ unlikely((nx & PGT_type_mask) >= PGT_l1_page_table) &&
  likely(nx & (PGT_validated|PGT_partial)) )
 {
 int rc;
@@ -3072,7 +3073,7 @@ static int _get_page_type(struct page_info *page, unsigned long type,
  *
  * per validate_page(), non-atomic updates are fine here.
  */
-if ( type == PGT_writable_page || type == PGT_shared_page )
+if ( type == PGT_writable_page || type == PGT_shared_page || type == PGT_none )
 page->u.inuse.type_info |= PGT_validated;
 else
 {
diff --git a/xen/common/memory.c b/xen/common/memory.c
index 2acac40c63..e26ba21d75 100644
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -1254,7 +1254,7 @@ static int stats_vcpu_alloc_mfn(struct domain *d)
 
 for ( i = 0; i < nr_frames; i++ )
 {
-if ( unlikely(!get_page_and_type(&pg[i], d, PGT_writable_page)) )
+if ( unlikely(!get_page_and_type(&pg[i], d, PGT_none)) )
 /*
  * The domain can't possibly know about this page yet, so failure
  * here is a clear indication of something fishy going on.
-- 
2.34.1




[RFC PATCH v3 1/3] xen/memory : Add a stats_table resource type

2023-10-31 Thread Matias Ezequiel Vara Larsen
This commit proposes a new mechanism to query the RUNSTATE_running counter for
a given vcpu from a dom0 userspace application, exposing that counter through
the acquire_resource interface. For this purpose, the commit adds a new
resource named XENMEM_resource_stats_table and a type-specific resource named
XENMEM_resource_stats_table_id_vcpustats. This type-specific resource aims at
exposing per-VCPU counters.

The current mechanism relies on XEN_DOMCTL_getvcpuinfo, which holds a single
global domctl_lock for the entire hypercall and iterates over every vcpu in the
system for every update, thus impacting operations that share that lock. This
commit instead exposes the vcpu RUNSTATE_running counter via the
xenforeignmemory interface, avoiding both the hypercall and the lock.

For that purpose, a new resource type named XENMEM_resource_stats_table is
added. In particular, the type-specific resource
XENMEM_resource_stats_table_id_vcpustats is added to host per-VCPU counters.
These counters are mapped in frames. The allocation of these frames only
happens if the resource is requested. The number of allocated frames is equal
to the domain's max_vcpus. Frames are released after the domain is destroyed.

To expose this information to a consumer, two structures are required:
1. vcpu_shmem_stats
2. vcpu_stats[]

The memory layout has been discussed at [1]. The layout looks like:

struct vcpu_shmem_stats {
#define VCPU_STATS_MAGIC 0xaabbccdd
uint32_t magic;
uint32_t offset;  // roundup(sizeof(vcpu_shmem_stats), cacheline_size)
uint32_t size;// sizeof(vcpu_stats)
uint32_t stride;  // roundup(sizeof(vcpu_stats), cacheline_size)
};

struct vcpu_stats {
/*
 * If the least-significant bit of the seq number is set then an update
 * is in progress and the consumer must wait to read a consistent set of
 * values. This mechanism is similar to Linux's seqlock.
 */
uint32_t seq;
uint32_t _pad;
/*
 * If the most-significant bit of a counter is set then the counter
 * is inactive and the consumer must ignore its value. Note that this
 * could also indicate that the counter has overflowed.
 */
uint64_t stats_a; // e.g., runstate_running_time
uint64_t stats_b; // e.g.,
uint64_t stats_c; // e.g.,
...
};

All padding fields shall be marked as "inactive". The consumer can't
distinguish inactive from overflowed. Also, the consumer shall always
verify before reading that:

  offsetof(struct vcpu_stats, stats_y) < size.

in case the consumer knows about a counter, e.g., stats_y, that Xen does not
know about.
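
For illustration, a minimal consumer-side sketch (assuming the mapping base is
in `shmem`, a vcpu index `i`, and the read barrier `virt_rmb()` used by the
accompanying tool; `hdr`, `s`, and `val` are names chosen here) would locate
and read a counter like this:

    struct vcpu_shmem_stats *hdr = shmem;
    struct vcpu_stats *s;
    uint32_t seq;
    uint64_t val;

    if ( hdr->magic != VCPU_STATS_MAGIC )
        return;                                 /* unknown layout */

    if ( offsetof(struct vcpu_stats, stats_a) >= hdr->size )
        return;                                 /* counter not produced */

    s = (struct vcpu_stats *)((uint8_t *)shmem + hdr->offset + i * hdr->stride);

    do {                                        /* seqlock-style retry loop */
        seq = s->seq;
        virt_rmb();
        val = s->stats_a;
        virt_rmb();
    } while ( (seq & 1) || seq != s->seq );

    if ( !(val & (1ULL << 63)) )
        ;                                       /* val is active and valid */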

The vcpu_shmem_stats structure exposes a magic number, the size of the
vcpu_stats structure, the offset at which the vcpu_stats structures begin, and
the stride for each instance. The addresses of the vcpu_shmem_stats and
vcpu_stats instances are cache-line aligned to prevent cache ping-pong when
accessing per-vcpu elements. In the vcpu_stats structure, the most-significant
bit of a counter is set to indicate that either the counter has overflowed or
it is inactive, so the consumer must ignore it.

Note that updating a vcpu's counters happens in a hot path; thus, in this
commit, copying only happens if it is specifically required.

Note that the resource is extensible in two ways. First, the structure
vcpu_stats can be extended with new per-vcpu counters. To do so, new per-vcpu
counters shall be added after the last element of the structure definition to
not break existing consumers. Second, new type-specific resources can be added
to host a different sort of counters.

[1]
https://lists.xenproject.org/archives/html/xen-devel/2023-03/msg00383.html

Signed-off-by: Matias Ezequiel Vara Larsen 
---
Changes in v3:
- allow hosting an arbitrary number of vcpus
- release resource during domain_kill() since it is guest-type
  independent and arch agnostic
- rework stats_table_max_frames()
- use memory layout as discussed at
  https://lists.xenproject.org/archives/html/xen-devel/2023-03/msg00383.html

Changes in v2:
- rework to ensure that guest reads a coherent value by using a version
  number in the vcpu_stats structure
- add version to the vcpu_stats structure

Changes in v1:
- rework the allocation and releasing of the frames
- use the zero frame for per-vcpu counters that are listed as an array
- allocate vcpu stats frames only when the resource is requested
- rewrite commit message
- add the vcpu_stats structure to keep per-vcpu counters
- add the shared_vcpustatspage to keep an array of per-vcpu counters for a
  given domain
- declare the structures in a public header 
- define the vcpustats_page in the domain structure
---
 xen/common/domain.c |   1 +
 xen/common/memory.c | 167 
 xen/common/sched/core.c |  20 +
 xen/include/public/memory.h |   3 +
 xen/include/public/vcpu.h   |  27 ++
 xen/include/xen/mm.h|   2 +
 xen/include/xen/sched.h |   5 ++

[RFC PATCH v3 0/3] Add a new acquire resource to query vcpu statistics

2023-10-31 Thread Matias Ezequiel Vara Larsen
Hello all and apologies for the delay in sending v3,

the purpose of this RFC is to get feedback about a new acquire resource that
exposes vcpu statistics for a given domain. The current mechanism to get those
statistics is by querying the hypervisor. This mechanism relies on a hypercall
and holds the domctl spinlock during its execution. When a pv tool like xcp-rrdd
periodically samples these counters, it ends up affecting other paths that share
that spinlock. By using acquire resources, the pv tool only requires a few
hypercalls to set up the shared memory region; samples are then obtained without
issuing any further hypercall. The original idea was suggested by Andrew Cooper,
with whom I have been discussing how to implement the current PoC. You can find
the RFC patch series at [1]. The series is rebased on top of stable-4.16.

The current series includes a simple pv tool that shows how this new interface
is used. This tool maps the counter and periodically samples it.

Any feedback/help would be appreciated.

Thanks, Matias.

[1] https://github.com/MatiasVara/xen/commits/feature_vcpu_stats

Changes in v3:
- use memory layout discussed at
  https://lists.xenproject.org/archives/html/xen-devel/2023-03/msg00383.html
 
Changes in v2:
- rework to ensure that consumer fetches consistent data

Changes in v1:
- rework how the resource is allocated and released
- rework when the resource is allocated that happens only when the resource is
  requested 
- rework the structure shared between the tool and Xen to make it extensible to
  new counters and declare it in a public header

There are still the following questions:
   - the resource shall be released when there are no more readers; otherwise we
     keep updating it during a hot path

Matias Ezequiel Vara Larsen (3):
  xen/memory : Add a stats_table resource type
  x86/mm: Do not validate/devalidate PGT_none type
  tools/misc: Add xen-vcpus-stats tool

 tools/misc/Makefile  |   6 ++
 tools/misc/xen-vcpus-stats.c | 132 +++
 xen/arch/x86/mm.c|   3 +-
 xen/common/domain.c  |   1 +
 xen/common/memory.c  | 167 +++
 xen/common/sched/core.c  |  20 +
 xen/include/public/memory.h  |   3 +
 xen/include/public/vcpu.h|  27 ++
 xen/include/xen/mm.h |   2 +
 xen/include/xen/sched.h  |   5 ++
 10 files changed, 365 insertions(+), 1 deletion(-)
 create mode 100644 tools/misc/xen-vcpus-stats.c

-- 
2.34.1




Re: [RFC PATCH v2 2/2] tools/misc: Add xen-vcpus-stats tool

2023-03-17 Thread Matias Ezequiel Vara Larsen
On Thu, Feb 23, 2023 at 08:31:29PM +, Julien Grall wrote:
> Hi,
> 
> On 23/02/2023 16:01, Andrew Cooper wrote:
> > On 07/10/2022 1:39 pm, Matias Ezequiel Vara Larsen wrote:
> > 
> > A couple of observations, all unrelated to the stats themselves.
> > 
> > Although overall, I'm not entirely certain that a tool like this is
> > going to be very helpful after initial development.  Something to
> > consider would be to alter libxenstat to use this new interface?
> > 
> > > diff --git a/tools/misc/Makefile b/tools/misc/Makefile
> > > index 2b683819d4..837e4b50da 100644
> > > --- a/tools/misc/Makefile
> > > +++ b/tools/misc/Makefile
> > > @@ -49,6 +49,7 @@ TARGETS_COPY += xenpvnetboot
> > > 
> > > # Everything which needs to be built
> > > TARGETS_BUILD := $(filter-out $(TARGETS_COPY),$(TARGETS_ALL))
> > > +TARGETS_BUILD += xen-vcpus-stats
> > 
> > This patch is whitespace corrupted.  If at all possible, you need to see
> > about getting `git send-email` working to send patches with, as it deals
> > with most of the whitespace problems for you.
> > 
> > I'm afraid you can't simply copy the patch text into an email and send that.
> > 
> > > 
> > > # ... including build-only targets
> > > TARGETS_BUILD-$(CONFIG_X86)    += xen-vmtrace
> > > @@ -135,4 +136,9 @@ xencov: xencov.o
> > > xen-ucode: xen-ucode.o
> > >  $(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(APPEND_LDFLAGS)
> > > 
> > > +xen-vcpus-stats.o: CFLAGS += $(CFLAGS_libxenforeginmemory)
> > > +
> > > +xen-vcpus-stats: xen-vcpus-stats.o
> > > +    $(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl)
> > > $(LDLIBS_libxenforeignmemory) $(APPEND_LDFLAGS)
> > > +
> > > -include $(DEPS_INCLUDE)
> > > diff --git a/tools/misc/xen-vcpus-stats.c b/tools/misc/xen-vcpus-stats.c
> > > new file mode 100644
> > > index 00..29d0efb124
> > > --- /dev/null
> > > +++ b/tools/misc/xen-vcpus-stats.c
> > > @@ -0,0 +1,87 @@
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +
> > > +#include 
> > > +#include 
> > > +#include 
> > > +
> > > +#define rmb()   asm volatile("lfence":::"memory")
> > 
> > This is rmb(), but rmb() isn't what you want.
> > 
> > You want smp_rmb(), which is
> > 
> > #define smp_rmb() asm volatile ("" ::: "memory")
> 
> From the generic PoV, I find smp_rmb() a bit misleading because it is not
> clear in this context whether we are referring to the SMP-ness of the
> hypervisor or the tools domain.
> 
> If the latter, then technically it could be uniprocessor domain and one
> could argue that for Arm it could be downgraded to just a compiler barrier.
> 
> AFAICT, this would not be the case here because we are getting data from
> Xen. So we always need a "dmb ish".
> 
> So, I would suggest to name it virt_*() (to match Linux's naming).
> 
> Also, is this tool meant to be arch-agnostic? If so, then we need to
> introduce the proper barrier for the other arch.
> 
Thanks Julien for the comment. Is `xen_rmb()` meant for that?

Matias



Re: API/ABIs: Re: [RFC PATCH v2 0/2] Add a new acquire resource to query vcpu statistics

2023-03-14 Thread Matias Ezequiel Vara Larsen
On Fri, Mar 10, 2023 at 12:34:33PM +0100, Jan Beulich wrote:
> On 10.03.2023 11:58, Matias Ezequiel Vara Larsen wrote:
> > Oh, I see, thanks for the clarification. To summarise, these are the current
> > options:
> > 1. Use a "uint64_t" field thus limiting the number of counters to 64. The
> > current vcpu_runstate_info structure is limited to 4 counters though, one 
> > for
> > each RUNSTATE_*. 
> > 2. Use a dynamic array but this makes harder to use the interface.
> > 3. Eliminate stats_active and set to ~0 the actual stats value to mark 
> > inactive
> > counters. This requires adding a "nr_stats" field to know how many counters 
> > are.
> 
> While nr_stats can indeed be seen as a generalization of the earlier
> stats_active, I think it is possible to get away without, as long as
> padding fields also are filled with the "inactive" marker.
> 

Understood.

> > Also, this requires to make sure to saturate at 2^^64-2.
> 
> Thinking of it - considering overflowed counters inactive looks like a
> reasonable model to me as well (which would mean saturating at 2^^64-1).
> 
> > I might miss some details here but these are the options to evaluate. 
> > 
> > I would go with a variation of 1) by using two uint64_t, i.e., up to 128 
> > vcpu's
> > counters, which I think it would be enough. I may be wrong.
> 
> Well, to me it doesn't matter whether it's 32, 64, or 128 - my concern
> is with any kind of inherent upper bound. Using 128 right away might
> look excessive, just like 32 might look too little. Hence my desire to
> get away without any built-in upper bound. IOW I continue to favor 3,
> irrespective of the presence or absence of nr_stats.
> 
I see. The layout for option 3 would look like:

struct vcpu_shmem_stats {
#define VCPU_STATS_MAGIC 0xaabbccdd
uint32_t magic;
uint32_t offset;  // roundup(sizeof(vcpu_shmem_stats), cacheline_size)
uint32_t size;// sizeof(vcpu_stats)
uint32_t stride;  // roundup(sizeof(vcpu_stats), cacheline_size)
};

struct vcpu_stats {
/*
 * If the least-significant bit of the seq number is set then an update
 * is in progress and the consumer must wait to read a consistent set of
 * values. This mechanism is similar to Linux's seqlock.
 */
uint32_t seq;
uint32_t _pad;
/*
 * If the most-significant bit of a counter is set then the counter
 * is inactive and the consumer must ignore its value. Note that this
 * could also indicate that the counter has overflowed.
 */
uint64_t stats_a; // e.g., runstate_running_time
...
};

All padding fields shall be marked as "inactive". The consumer can't
distinguish inactive from overflowed. Also, the consumer shall always verify
before reading that:

offsetof(struct vcpu_stats, stats_y) < size. 

in case the consumer knows about a counter, e.g., stats_y, that Xen does not
know about.
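
In consumer code, that guard would look roughly like this (hdr and s being
hypothetical names for the mapped header and the per-vcpu entry):

    if ( offsetof(struct vcpu_stats, stats_y) < hdr->size )
        value = s->stats_y;   /* this Xen produces the counter */
    else
        ;                     /* this Xen predates stats_y; skip it */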

Matias 



Re: API/ABIs: Re: [RFC PATCH v2 0/2] Add a new acquire resource to query vcpu statistics

2023-03-10 Thread Matias Ezequiel Vara Larsen
On Thu, Mar 09, 2023 at 12:50:18PM +0100, Jan Beulich wrote:
> On 09.03.2023 11:38, Matias Ezequiel Vara Larsen wrote:
> > On Wed, Mar 08, 2023 at 03:16:05PM +0100, Jan Beulich wrote:
> >> On 08.03.2023 12:54, Matias Ezequiel Vara Larsen wrote:
> >>> On Tue, Mar 07, 2023 at 11:12:00AM +0100, Jan Beulich wrote:
> >>>> On 06.03.2023 15:23, Matias Ezequiel Vara Larsen wrote:
> >>>>> - Xen shall use the "stats_active" field to indicate what it is 
> >>>>> producing. In
> >>>>>   this field, reserved bits shall be 0. This shall allow us to agree on 
> >>>>> the
> >>>>> layout even when producer and consumer are compiled with different 
> >>>>> headers.
> >>>>
> >>>> I wonder how well such a bitfield is going to scale. It provides for
> >>>> only 32 (maybe 64) counters. Of course this may seem a lot right now,
> >>>> but you never know how quickly something like this can grow. Plus
> >>>> with ...
> >>>>
> >>>
> >>> Would it make sense to define it like this?:
> >>>
> >>> struct vcpu_shmem_stats {
> >>> #define STATS_A (1u << 0)
> >>> ...
> >>> #define VCPU_STATS_MAGIC 0xaabbccdd
> >>>  uint32_t magic;
> >>>  uint32_t offset;  // roundup(sizeof(vcpu_shmem_stats) + 
> >>> sizeof(uint32_t) * nr_stats, cacheline_size)
> >>>  uint32_t size;// sizeof(vcpu_stats)
> >>>  uint32_t stride;  // roundup(sizeof(vcpu_stats), cacheline_size)
> >>>  uint32_t nr_stats; // size of stats_active in uint32_t
> >>>  uint32_t stats_active[XEN_FLEX_ARRAY_DIM];
> >>>  ...
> >>> };
> >>
> > 
> > The use of stats_active[] is meant to have a bitmap that could scale thus 
> > not
> > limiting the number of counters in the vcpu_stat structure to 32 or 64. I 
> > can't
> > see other way to have an unlimited number of counters though.
> > 
> >> Possibly, but this would make it harder to use the interface. An 
> >> alternative
> >> might be to specify that an actual stats value set to ~0 marks an inactive
> >> element. Since these are 64-bit counters, with today's and tomorrow's
> >> computers we won't be at risk of a counter reaching a value of 2^^64-1, I
> >> guess. And even if there was one where such a risk could not be excluded
> >> (e.g. because of using pretty large increments), one would merely need to
> >> make sure to saturate at 2^^64-2. Plus at such a point one would need to
> >> consider anyway to switch to 128-bit fields, as neither saturated nor
> >> wrapped values are of much use to consumers.
> >>
> > 
> > If I understand well, this use-case is in case an element in the 
> > stats_active
> > bitmap becomes inactive, i.e., it is set to "0" in stats_active[]. You are
> > proposing to set to ~0 the actual stats value to mark an inactive element. I
> > may be missing something here but would not be enough to set to "0" the
> > corresponding stats_active[] bit? 
> 
> The suggestion was to eliminate the need for stats_active[].
> 
Oh, I see, thanks for the clarification. To summarise, these are the current
options:
1. Use a "uint64_t" field thus limiting the number of counters to 64. The
current vcpu_runstate_info structure is limited to 4 counters though, one for
each RUNSTATE_*. 
2. Use a dynamic array but this makes harder to use the interface.
3. Eliminate stats_active and set the actual stats value to ~0 to mark inactive
counters. This requires adding a "nr_stats" field to know how many counters there
are, and making sure the counters saturate at 2^^64-2.

I might be missing some details here, but these are the options to evaluate.

I would go with a variation of 1) by using two uint64_t, i.e., up to 128 vcpu
counters, which I think would be enough. I may be wrong.

Thoughts?
 
Matias 



Re: API/ABIs: Re: [RFC PATCH v2 0/2] Add a new acquire resource to query vcpu statistics

2023-03-09 Thread Matias Ezequiel Vara Larsen
On Wed, Mar 08, 2023 at 03:16:05PM +0100, Jan Beulich wrote:
> On 08.03.2023 12:54, Matias Ezequiel Vara Larsen wrote:
> > On Tue, Mar 07, 2023 at 11:12:00AM +0100, Jan Beulich wrote:
> >> On 06.03.2023 15:23, Matias Ezequiel Vara Larsen wrote:
> >>> - Xen shall use the "stats_active" field to indicate what it is 
> >>> producing. In
> >>>   this field, reserved bits shall be 0. This shall allow us to agree on 
> >>> the
> >>> layout even when producer and consumer are compiled with different 
> >>> headers.
> >>
> >> I wonder how well such a bitfield is going to scale. It provides for
> >> only 32 (maybe 64) counters. Of course this may seem a lot right now,
> >> but you never know how quickly something like this can grow. Plus
> >> with ...
> >>
> > 
> > Would it make sense to define it like this?:
> > 
> > struct vcpu_shmem_stats {
> > #define STATS_A (1u << 0)
> > ...
> > #define VCPU_STATS_MAGIC 0xaabbccdd
> >  uint32_t magic;
> >  uint32_t offset;  // roundup(sizeof(vcpu_shmem_stats) + 
> > sizeof(uint32_t) * nr_stats, cacheline_size)
> >  uint32_t size;// sizeof(vcpu_stats)
> >  uint32_t stride;  // roundup(sizeof(vcpu_stats), cacheline_size)
> >  uint32_t nr_stats; // size of stats_active in uint32_t
> >  uint32_t stats_active[XEN_FLEX_ARRAY_DIM];
> >  ...
> > };
> 

The use of stats_active[] is meant to provide a bitmap that can scale, thus not
limiting the number of counters in the vcpu_stats structure to 32 or 64. I can't
see another way to have an unlimited number of counters though.

> Possibly, but this would make it harder to use the interface. An alternative
> might be to specify that an actual stats value set to ~0 marks an inactive
> element. Since these are 64-bit counters, with today's and tomorrow's
> computers we won't be at risk of a counter reaching a value of 2^^64-1, I
> guess. And even if there was one where such a risk could not be excluded
> (e.g. because of using pretty large increments), one would merely need to
> make sure to saturate at 2^^64-2. Plus at such a point one would need to
> consider anyway to switch to 128-bit fields, as neither saturated nor
> wrapped values are of much use to consumers.
> 

If I understand well, this use-case is when an element in the stats_active
bitmap becomes inactive, i.e., it is set to "0" in stats_active[]. You are
proposing to set the actual stats value to ~0 to mark an inactive element. I
may be missing something here, but would it not be enough to set the
corresponding stats_active[] bit to "0"?

> >>> - In the vcpu_stats structure, new fields can only ever be appended.
> >>
> >> ... this rule the only ambiguity that could arise to consumers when
> >> no active flags existed would be that they can't tell "event never
> >> occurred" from "hypervisor doesn't know about this anymore".
> > 
> > Do you mean how the consumer can figure out if either 1) Xen does not know 
> > yet
> > about some flag or 2) the flag has been deprecated? I think 2) is the case 
> > that
> > Andrew mention in which the magic number could be used as an ABI version to
> > break backwards compatibility.
> 
> No, an inactive field wouldn't break the ABI. An ABI change would be if
> such an inactive field was actually removed from the array. Or if e.g.,
> as per above, we needed to switch to 128-bit counters.

Got it, Thanks.

Matias



Re: [RFC PATCH v2 1/2] xen/memory : Add a stats_table resource type

2023-03-09 Thread Matias Ezequiel Vara Larsen
On Tue, Mar 07, 2023 at 05:55:26PM +0100, Jan Beulich wrote:
> On 07.03.2023 15:44, Matias Ezequiel Vara Larsen wrote:
> > On Thu, Feb 23, 2023 at 01:42:08PM +0100, Jan Beulich wrote:
> >> On 23.02.2023 13:16, Matias Ezequiel Vara Larsen wrote:
> >>> On Fri, Feb 17, 2023 at 03:10:53PM +0100, Jan Beulich wrote:
> >>>> On 17.02.2023 10:29, Matias Ezequiel Vara Larsen wrote:
> >>>>> On Fri, Feb 17, 2023 at 09:57:43AM +0100, Jan Beulich wrote:
> >>>>>> On 17.02.2023 09:50, Matias Ezequiel Vara Larsen wrote:
> >>>>>>> On Wed, Dec 14, 2022 at 08:56:57AM +0100, Jan Beulich wrote:
> >>>>>>>> On 14.12.2022 08:29, Jan Beulich wrote:
> >>>>>>>>> On 07.10.2022 14:39, Matias Ezequiel Vara Larsen wrote:
> >>>>>>>>>> +static int stats_vcpu_alloc_mfn(struct domain *d)
> >>>>>>>>>> +{
> >>>>>>>>>> +struct page_info *pg;
> >>>>>>>>>> +
> >>>>>>>>>> +pg = alloc_domheap_page(d, MEMF_no_refcount);
> >>>>>>>>>
> >>>>>>>>> The ioreq and vmtrace resources are also allocated this way, but 
> >>>>>>>>> they're
> >>>>>>>>> HVM-specific. The one here being supposed to be VM-type 
> >>>>>>>>> independent, I'm
> >>>>>>>>> afraid such pages will be accessible by an "owning" PV domain (it'll
> >>>>>>>>> need to guess the MFN, but that's no excuse).
> >>>>>>>>
> >>>>>>>> Which might be tolerable if it then can't write to the page. That 
> >>>>>>>> would
> >>>>>>>> require "locking" the page r/o (from guest pov), which ought to be
> >>>>>>>> possible by leveraging a variant of what share_xen_page_with_guest()
> >>>>>>>> does: It marks pages PGT_none with a single type ref. This would mean
> >>>>>>>> ...
> >>>>>>>>
> >>>>>>>>>> +if ( !pg )
> >>>>>>>>>> +return -ENOMEM;
> >>>>>>>>>> +
> >>>>>>>>>> +if ( !get_page_and_type(pg, d, PGT_writable_page) ) {
> >>>>>>>>
> >>>>>>>> ... using PGT_none here. Afaict this _should_ work, but we have no
> >>>>>>>> precedent of doing so in the tree, and I may be overlooking something
> >>>>>>>> which prevents that from working.
> >>>>>>>>
> >>>>>>>
> >>>>>>> I do not fully understand this. I checked share_xen_page_with_guest() 
> >>>>>>> and I
> >>>>>>> think you're talking about doing something like this for each 
> >>>>>>> allocated page to
> >>>>>>> set them ro from a pv guest pov:
> >>>>>>>
> >>>>>>> pg->u.inuse.type_info = PGT_none;
> >>>>>>> pg->u.inuse.type_info |= PGT_validated | 1;
> >>>>>>> page_set_owner(page, d); // not sure if this is needed
> >>>>>>>
> >>>>>>> Then, I should use PGT_none instead of PGT_writable_page in
> >>>>>>> get_page_and_type(). Am I right?
> >>>>>>
> >>>>>> No, if at all possible you should avoid open-coding anything. As said,
> >>>>>> simply passing PGT_none to get_page_and_type() ought to work (again, as
> >>>>>> said, unless I'm overlooking something). share_xen_page_with_guest()
> >>>>>> can do what it does because the page isn't owned yet. For a page with
> >>>>>> owner you may not fiddle with type_info in such an open-coded manner.
> >>>>>>
> >>>>>
> >>>>> Thanks. I got the following bug when passing PGT_none:
> >>>>>
> >>>>> (XEN) Bad type in validate_page 0 t=0001 c=8042
> >>>>> (XEN) Xen BUG at mm.c:2643
> >>>>
> >>>> The caller of the function needs to avoid the call not only for writable
> >>>> and shared pages, but also for this new case of PGT_none.
> >>>
> >>> Thanks. If I understand correctly, _get_page_type() needs to avoid to call
> >>> validate_page() when type = PGT_none.
> >>
> >> Yes.
> >>
> >>> For the writable and shared pages, this
> >>> is avoided by setting nx |= PGT_validated. Am I right?
> >>
> >> Well, no, I wouldn't describe it like that. The two (soon three) types not
> >> requiring validation simply set the flag without calling validate_page().
> >>
> > 
> > I see, thanks. I added the corresponding check at _get_page_type() to set 
> > the
> > flag without calling validate_page() for the PGT_none type. I think I am
> > missing something when I am releasing the pages. I am triggering the 
> > following
> > BUG() when issuing put_page_and_type():
> >  
> > (XEN) Xen BUG at mm.c:2698
> > 
> > This is at devalidate_page(). I guess the call to devalidate_page() shall be
> > also avoided.
> 
> Well, yes, symmetry requires a change there as well. Here it's indirect:
> You want to avoid the call to _put_final_page_type(). That's enclosed by
> (nx & PGT_type_mask) <= PGT_l4_page_table, which happens to be true for
> PGT_none as well. There may be more instances of such a comparison, so
> it'll be necessary to find them and check whether they may now also be
> reached with PGT_none (looks like a comparison against PGT_root_page_table
> in _get_page_type() is also affected, albeit in a largely benign way).
> 
Thanks. I could not find any other instance of that comparison except those
you mention. I'll add a new patch to the series to add support for
PGT_none.

Matias 



Re: API/ABIs: Re: [RFC PATCH v2 0/2] Add a new acquire resource to query vcpu statistics

2023-03-08 Thread Matias Ezequiel Vara Larsen
On Tue, Mar 07, 2023 at 11:12:00AM +0100, Jan Beulich wrote:
> On 06.03.2023 15:23, Matias Ezequiel Vara Larsen wrote:
> > Regarding your email, I have the following comments:
> > 
> > - I still do not know how to choose the value of cacheline_size. I 
> > understand
> > this value shall be between the actual cacheline_size and PAGE_SIZE. A value
> > that could match most architectures could be 256 bytes.
> 
> This isn't a concern anymore when offset and stride are stored in the
> header. It was a concern when trying to come up with a layout where
> these value were to be inferred (or known a priori).
> 

Oh, I see. Cacheline_size shall be decided at compilation time for a given
arch, e.g., SMP_CACHE_BYTES.

> > - Xen shall use the "stats_active" field to indicate what it is producing. 
> > In
> >   this field, reserved bits shall be 0. This shall allow us to agree on the
> > layout even when producer and consumer are compiled with different headers.
> 
> I wonder how well such a bitfield is going to scale. It provides for
> only 32 (maybe 64) counters. Of course this may seem a lot right now,
> but you never know how quickly something like this can grow. Plus
> with ...
> 

Would it make sense to define it like this?:

struct vcpu_shmem_stats {
#define STATS_A (1u << 0)
...
#define VCPU_STATS_MAGIC 0xaabbccdd
 uint32_t magic;
 uint32_t offset;  // roundup(sizeof(vcpu_shmem_stats) + sizeof(uint32_t) * nr_stats, cacheline_size)
 uint32_t size;// sizeof(vcpu_stats)
 uint32_t stride;  // roundup(sizeof(vcpu_stats), cacheline_size)
 uint32_t nr_stats; // size of stats_active in uint32_t
 uint32_t stats_active[XEN_FLEX_ARRAY_DIM];
 ...
};
 
> > - In the vcpu_stats structure, new fields can only ever be appended.
> 
> ... this rule the only ambiguity that could arise to consumers when
> no active flags existed would be that they can't tell "event never
> occurred" from "hypervisor doesn't know about this anymore".

Do you mean how the consumer can figure out whether 1) Xen does not yet know
about some flag, or 2) the flag has been deprecated? I think 2) is the case that
Andrew mentioned, in which the magic number could be used as an ABI version to
break backwards compatibility.

Thanks, Matias. 



Re: [RFC PATCH v2 1/2] xen/memory : Add a stats_table resource type

2023-03-07 Thread Matias Ezequiel Vara Larsen
On Thu, Feb 23, 2023 at 01:42:08PM +0100, Jan Beulich wrote:
> On 23.02.2023 13:16, Matias Ezequiel Vara Larsen wrote:
> > On Fri, Feb 17, 2023 at 03:10:53PM +0100, Jan Beulich wrote:
> >> On 17.02.2023 10:29, Matias Ezequiel Vara Larsen wrote:
> >>> On Fri, Feb 17, 2023 at 09:57:43AM +0100, Jan Beulich wrote:
> >>>> On 17.02.2023 09:50, Matias Ezequiel Vara Larsen wrote:
> >>>>> On Wed, Dec 14, 2022 at 08:56:57AM +0100, Jan Beulich wrote:
> >>>>>> On 14.12.2022 08:29, Jan Beulich wrote:
> >>>>>>> On 07.10.2022 14:39, Matias Ezequiel Vara Larsen wrote:
> >>>>>>>> +static int stats_vcpu_alloc_mfn(struct domain *d)
> >>>>>>>> +{
> >>>>>>>> +struct page_info *pg;
> >>>>>>>> +
> >>>>>>>> +pg = alloc_domheap_page(d, MEMF_no_refcount);
> >>>>>>>
> >>>>>>> The ioreq and vmtrace resources are also allocated this way, but 
> >>>>>>> they're
> >>>>>>> HVM-specific. The one here being supposed to be VM-type independent, 
> >>>>>>> I'm
> >>>>>>> afraid such pages will be accessible by an "owning" PV domain (it'll
> >>>>>>> need to guess the MFN, but that's no excuse).
> >>>>>>
> >>>>>> Which might be tolerable if it then can't write to the page. That would
> >>>>>> require "locking" the page r/o (from guest pov), which ought to be
> >>>>>> possible by leveraging a variant of what share_xen_page_with_guest()
> >>>>>> does: It marks pages PGT_none with a single type ref. This would mean
> >>>>>> ...
> >>>>>>
> >>>>>>>> +if ( !pg )
> >>>>>>>> +return -ENOMEM;
> >>>>>>>> +
> >>>>>>>> +if ( !get_page_and_type(pg, d, PGT_writable_page) ) {
> >>>>>>
> >>>>>> ... using PGT_none here. Afaict this _should_ work, but we have no
> >>>>>> precedent of doing so in the tree, and I may be overlooking something
> >>>>>> which prevents that from working.
> >>>>>>
> >>>>>
> >>>>> I do not fully understand this. I checked share_xen_page_with_guest() 
> >>>>> and I
> >>>>> think you're talking about doing something like this for each allocated 
> >>>>> page to
> >>>>> set them ro from a pv guest pov:
> >>>>>
> >>>>> pg->u.inuse.type_info = PGT_none;
> >>>>> pg->u.inuse.type_info |= PGT_validated | 1;
> >>>>> page_set_owner(page, d); // not sure if this is needed
> >>>>>
> >>>>> Then, I should use PGT_none instead of PGT_writable_page in
> >>>>> get_page_and_type(). Am I right?
> >>>>
> >>>> No, if at all possible you should avoid open-coding anything. As said,
> >>>> simply passing PGT_none to get_page_and_type() ought to work (again, as
> >>>> said, unless I'm overlooking something). share_xen_page_with_guest()
> >>>> can do what it does because the page isn't owned yet. For a page with
> >>>> owner you may not fiddle with type_info in such an open-coded manner.
> >>>>
> >>>
> >>> Thanks. I got the following bug when passing PGT_none:
> >>>
> >>> (XEN) Bad type in validate_page 0 t=0001 c=8042
> >>> (XEN) Xen BUG at mm.c:2643
> >>
> >> The caller of the function needs to avoid the call not only for writable
> >> and shared pages, but also for this new case of PGT_none.
> > 
> > Thanks. If I understand correctly, _get_page_type() needs to avoid to call
> > validate_page() when type = PGT_none.
> 
> Yes.
> 
> > For the writable and shared pages, this
> > is avoided by setting nx |= PGT_validated. Am I right?
> 
> Well, no, I wouldn't describe it like that. The two (soon three) types not
> requiring validation simply set the flag without calling validate_page().
> 

I see, thanks. I added the corresponding check at _get_page_type() to set the
flag without calling validate_page() for the PGT_none type. I think I am
missing something when I am releasing the pages. I am triggering the following
BUG() when issuing put_page_and_type():
 
(XEN) Xen BUG at mm.c:2698

This is at devalidate_page(). I guess the call to devalidate_page() shall be
also avoided. I was wondering if put_page_and_type() is required in this case.

Matias



Re: API/ABIs: Re: [RFC PATCH v2 0/2] Add a new acquire resource to query vcpu statistics

2023-03-06 Thread Matias Ezequiel Vara Larsen
Hello Andrew, and thanks for the comments; please find my comments below.

On Thu, Feb 23, 2023 at 07:56:28PM +, Andrew Cooper wrote:
> A discussion about forward extensible APIs and ABIs here.
> 
> First, its important to say that this should be "domain stats" and not
> just "vcpu stats".  This is easy to account for now, but hard to
> retrofit later.
> 
> For the shared memory, we have a minimum granularity of PAGE_SIZE (4k
> for now, but this will change in due course on non-x86 architectures),
> and a resource-agnostic way of determining the resource size (as a
> multiple of the page size).
> 
> But there are other things we need to consider:
> 
> 1) To be sensibly extendable, there needs to be some metadata, and the
> domain stats is clearly going to be a different shape to the vcpu stats.
> 
> 2) We also want to give Xen some flexibility to allocate memory in a
> suitable manner for the system.
> 
> 3) Xen and the userspace consuming this interface will likely be built
> from different versions of the header.  This needs to inter-operate with
> the common subset of functionality.
> 
> 
> So what we want, at least to describe the shape, is something like this:
> 
> struct dom_shmem_stats {
>     uint32_t dom_size; // sizeof(dom)
>     uint32_t vcpu_offs;
>     uint32_t vcpu_size; // sizeof(vcpu)
>     uint32_t vcpu_stride;
>     ...
> };
> 
> struct vcpu_shmem_stats {
>     ...
> };
> 
> where the data layout shall be that there is one dom structure starting
> at 0, and an array of vcpu_stride objects starting at vcpu_offset.
> 
> Obviously, some invariants apply.  vcpu_offs >= dom_size, and
> vcpu_stride >= vcpu_size.  The total size of the mapping must be larger
> than vcpu_offs + vcpus * vcpu_stride  (and no, I intentionally don't
> care about the rounding at the end because it doesn't change things in
> practice.)
> 

Would it make sense to use a different type-specific resource identifier for
each "stat"?, e.g., XENMEM_resource_stats_table_id_vcpustats,
XENMEM_resource_stats_table_id_domstats and so on. The size of each of these
type-specific resources would be queried by using
`xenforeignmemory_resource_size()`. The mapping would be done by using
`xenforeignmemory_map_resource()`.

The metadata to represent the XENMEM_resource_stats_table_id_vcpustats
resource could be:

struct vcpu_shmem_stats {
#define STATS_A (1u << 0)
...
#define VCPU_STATS_MAGIC 0xaabbccdd
     uint32_t magic;
     uint32_t stats_active;
     uint32_t offset;  // roundup(sizeof(vcpu_shmem_stats), cacheline_size)
     uint32_t size;    // sizeof(vcpu_stats)
     uint32_t stride;  // roundup(sizeof(vcpu_stats), cacheline_size)
 uint32_t _pad;
     ...
};

struct vcpu_stats {
    /*
     * If the least-significant bit of the seq number is set then an update
     * is in progress and the consumer must wait to read a consistent set of
     * values. This mechanism is similar to Linux's seqlock.
     */
    uint32_t seq;
    uint32_t _pad;
    uint64_t stats_a; // e.g., runstate_running_time
    ...
};

The data layout shall be that there is one vcpu_shmem_stats structure starting
at 0, and an array of stride-spaced objects, i.e., vcpu_stats structures,
starting at offset. The invariants would be:
* 1. offset >= sizeof(struct vcpu_shmem_stats)
* 2. stride >= size
* 3. the total size of the mapping in frames, which is the value returned by
  resource_size(), must be larger than (offs + vcpus * stride).
* 4. Xen must not produce any data outside of [base, stride) for vcpu data.
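
As a worked example (assuming a 64-byte cacheline): sizeof(vcpu_shmem_stats)
above is 24, so offset = 64; sizeof(vcpu_stats) is 16, so stride = 64. Vcpu 3's
entry then starts at byte 64 + 3 * 64 = 256, and invariant 3 requires the
mapping to span at least 64 + vcpus * 64 bytes.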

The steps to add a new counter B would be:
1. append the new field at vcpu_stats structure.
2. define the bit in stats_active, i.e., #define STATS_B (1u << 1)
3. advertise STATS_B
I may be missing some steps here but that would be the overall process.
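
A purely illustrative sketch of steps 1 and 2 (field and bit names are
hypothetical):

    struct vcpu_stats {
        uint32_t seq;
        uint32_t _pad;
        uint64_t stats_a;   // e.g., runstate_running_time
        uint64_t stats_b;   // new counter, appended after the last element
    };

    #define STATS_B (1u << 1)   // advertised in stats_active[0]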

Regarding your email, I have the following comments:

- I still do not know how to choose the value of cacheline_size. I understand
this value shall be between the actual cacheline_size and PAGE_SIZE. A value
that could match most architectures could be 256 bytes.

- Xen shall use the "stats_active" field to indicate what it is producing. In
  this field, reserved bits shall be 0. This shall allow us to agree on the
layout even when producer and consumer are compiled with different headers.

- In the vcpu_stats structure, new fields can only ever be appended.

- The magic field shall act as a sanity check but also as an ABI version in case
we decide to break backward-compatibility.

> A very simple layout, packing the data as closely as reasonable, might be:
> 
> vcpu_offs = roundup(sizeof(dom), cacheline_size)
> vcpu_stride = roundup(sizeof(vcpu), cacheline_size);
> 
> but Xen might have reason to use some other rounding.  As the dom or
> vcpu size approaches a page, then Xen will want to change allocation
> scheme to use page size for both, and not vmap the lot as one
> consecutive region.
> 
> 
> 
> For the stats data itself, there wants to be some indication of what
> data Xen is producing, so userspace can 

Re: [RFC PATCH v2 2/2] tools/misc: Add xen-vcpus-stats tool

2023-02-24 Thread Matias Ezequiel Vara Larsen
Hello Andrew and thanks for the comments,

On Thu, Feb 23, 2023 at 04:01:09PM +, Andrew Cooper wrote:
> On 07/10/2022 1:39 pm, Matias Ezequiel Vara Larsen wrote:
> 
> A couple of observations, all unrelated to the stats themselves.
> 
> Although overall, I'm not entirely certain that a tool like this is
> going to be very helpful after initial development.  Something to
> consider would be to alter libxenstat to use this new interface?
> 

Yes. We discussed this in a design session at the summit. I could not move
forward in that direction yet, but it is the right way to go. I use this tool
only to play with the interface, and I could just remove it from the RFC in the
next versions.

> > diff --git a/tools/misc/Makefile b/tools/misc/Makefile
> > index 2b683819d4..837e4b50da 100644
> > --- a/tools/misc/Makefile
> > +++ b/tools/misc/Makefile
> > @@ -49,6 +49,7 @@ TARGETS_COPY += xenpvnetboot
> >
> > # Everything which needs to be built
> > TARGETS_BUILD := $(filter-out $(TARGETS_COPY),$(TARGETS_ALL))
> > +TARGETS_BUILD += xen-vcpus-stats
> 
> This patch is whitespace corrupted.  If at all possible, you need to see
> about getting `git send-email` working to send patches with, as it deals
> with most of the whitespace problems for you.
> 
> I'm afraid you can't simply copy the patch text into an email and send that.
> 

I am using `git send-email` to send patches. I may have missed some flag.
I'll double-check. 

> >
> > # ... including build-only targets
> > TARGETS_BUILD-$(CONFIG_X86)    += xen-vmtrace
> > @@ -135,4 +136,9 @@ xencov: xencov.o
> > xen-ucode: xen-ucode.o
> > $(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(APPEND_LDFLAGS)
> >
> > +xen-vcpus-stats.o: CFLAGS += $(CFLAGS_libxenforeginmemory)
> > +
> > +xen-vcpus-stats: xen-vcpus-stats.o
> > +    $(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl)
> > $(LDLIBS_libxenforeignmemory) $(APPEND_LDFLAGS)
> > +
> > -include $(DEPS_INCLUDE)
> > diff --git a/tools/misc/xen-vcpus-stats.c b/tools/misc/xen-vcpus-stats.c
> > new file mode 100644
> > index 00..29d0efb124
> > --- /dev/null
> > +++ b/tools/misc/xen-vcpus-stats.c
> > @@ -0,0 +1,87 @@
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +
> > +#include 
> > +#include 
> > +#include 
> > +
> > +#define rmb()   asm volatile("lfence":::"memory")
> 
> This is rmb(), but rmb() isn't what you want.
> 
> You want smp_rmb(), which is
> 
> #define smp_rmb() asm volatile ("" ::: "memory")
> 
> 
> I'm surprised we haven't got this in a common location, considering how
> often it goes wrong.  (Doesn't help that there's plenty of buggy
> examples to copy, even in xen.git)
> 

Got it. I'll rework it in the next version. For inspiration, I used the code
at arch/x86/kernel/pvclock.c:pvclock_read_wallclock().

> > +
> > +static sig_atomic_t interrupted;
> > +static void close_handler(int signum)
> > +{
> > +    interrupted = 1;
> > +}
> > +
> > +int main(int argc, char **argv)
> > +{
> > +    xenforeignmemory_handle *fh;
> > +    xenforeignmemory_resource_handle *res;
> > +    size_t size;
> > +    int rc, domid, period, vcpu;
> > +    shared_vcpustatspage_t * info;
> 
> shared_vcpustatspage_t *info;
> 
> no space after the *.
> 
> But you also cannot have a single structure describing that.  I'll reply
> to the cover letter discussing ABIs.

I am reading it and I will comment on this soon. 

> 
> > +    struct sigaction act;
> > +    uint32_t version;
> > +    uint64_t value;
> > +
> > +    if (argc != 4 ) {
> 
> { on a new line.
> 
> > +    fprintf(stderr, "Usage: %s   \n", argv[0]);
> > +    return 1;
> > +    }
> > +
> > +    domid = atoi(argv[1]);
> > +    vcpu = atoi(argv[2]);
> > +    period = atoi(argv[3]);
> > +
> > +    act.sa_handler = close_handler;
> > +    act.sa_flags = 0;
> > +    sigemptyset(_mask);
> > +    sigaction(SIGHUP,  , NULL);
> > +    sigaction(SIGTERM, , NULL);
> > +    sigaction(SIGINT,  , NULL);
> > +    sigaction(SIGALRM, , NULL);
> > +
> > +    fh = xenforeignmemory_open(NULL, 0);
> > +
> > +    if ( !fh )
> > +    err(1, "xenforeignmemory_open");
> > +
> > +    rc = xenforeignmemory_resource_size(
> > +    fh, domid, XENMEM_resource_stats_table,
> > +    0, &size);
> > +
> > +    if (

Re: [RFC PATCH v2 1/2] xen/memory : Add a stats_table resource type

2023-02-23 Thread Matias Ezequiel Vara Larsen
On Fri, Feb 17, 2023 at 03:10:53PM +0100, Jan Beulich wrote:
> On 17.02.2023 10:29, Matias Ezequiel Vara Larsen wrote:
> > On Fri, Feb 17, 2023 at 09:57:43AM +0100, Jan Beulich wrote:
> >> On 17.02.2023 09:50, Matias Ezequiel Vara Larsen wrote:
> >>> On Wed, Dec 14, 2022 at 08:56:57AM +0100, Jan Beulich wrote:
> >>>> On 14.12.2022 08:29, Jan Beulich wrote:
> >>>>> On 07.10.2022 14:39, Matias Ezequiel Vara Larsen wrote:
> >>>>>> +static int stats_vcpu_alloc_mfn(struct domain *d)
> >>>>>> +{
> >>>>>> +struct page_info *pg;
> >>>>>> +
> >>>>>> +pg = alloc_domheap_page(d, MEMF_no_refcount);
> >>>>>
> >>>>> The ioreq and vmtrace resources are also allocated this way, but they're
> >>>>> HVM-specific. The one here being supposed to be VM-type independent, I'm
> >>>>> afraid such pages will be accessible by an "owning" PV domain (it'll
> >>>>> need to guess the MFN, but that's no excuse).
> >>>>
> >>>> Which might be tolerable if it then can't write to the page. That would
> >>>> require "locking" the page r/o (from guest pov), which ought to be
> >>>> possible by leveraging a variant of what share_xen_page_with_guest()
> >>>> does: It marks pages PGT_none with a single type ref. This would mean
> >>>> ...
> >>>>
> >>>>>> +if ( !pg )
> >>>>>> +return -ENOMEM;
> >>>>>> +
> >>>>>> +if ( !get_page_and_type(pg, d, PGT_writable_page) ) {
> >>>>
> >>>> ... using PGT_none here. Afaict this _should_ work, but we have no
> >>>> precedent of doing so in the tree, and I may be overlooking something
> >>>> which prevents that from working.
> >>>>
> >>>
> >>> I do not fully understand this. I checked share_xen_page_with_guest() and 
> >>> I
> >>> think you're talking about doing something like this for each allocated 
> >>> page to
> >>> set them ro from a pv guest pov:
> >>>
> >>> pg->u.inuse.type_info = PGT_none;
> >>> pg->u.inuse.type_info |= PGT_validated | 1;
> >>> page_set_owner(page, d); // not sure if this is needed
> >>>
> >>> Then, I should use PGT_none instead of PGT_writable_page in
> >>> get_page_and_type(). Am I right?
> >>
> >> No, if at all possible you should avoid open-coding anything. As said,
> >> simply passing PGT_none to get_page_and_type() ought to work (again, as
> >> said, unless I'm overlooking something). share_xen_page_with_guest()
> >> can do what it does because the page isn't owned yet. For a page with
> >> owner you may not fiddle with type_info in such an open-coded manner.
> >>
> > 
> > Thanks. I got the following bug when passing PGT_none:
> > 
> > (XEN) Bad type in validate_page 0 t=0001 c=8042
> > (XEN) Xen BUG at mm.c:2643
> 
> The caller of the function needs to avoid the call not only for writable
> and shared pages, but also for this new case of PGT_none.

Thanks. If I understand correctly, _get_page_type() needs to avoid calling
validate_page() when type = PGT_none. For the writable and shared pages, this
is avoided by setting nx |= PGT_validated. Am I right?

Matias



Re: [RFC PATCH v2 1/2] xen/memory : Add a stats_table resource type

2023-02-20 Thread Matias Ezequiel Vara Larsen
On Thu, Feb 16, 2023 at 04:15:29PM +0100, Jan Beulich wrote:
> On 16.02.2023 16:07, Matias Ezequiel Vara Larsen wrote:
> > On Wed, Dec 14, 2022 at 08:29:53AM +0100, Jan Beulich wrote:
> >> On 07.10.2022 14:39, Matias Ezequiel Vara Larsen wrote:
> >>> @@ -287,6 +289,20 @@ static inline void vcpu_runstate_change(
> >>>  }
> >>>  
> >>>  v->runstate.state = new_state;
> >>> +
> >>> +vcpustats_va = (shared_vcpustatspage_t*)d->vcpustats_page.va;
> >>> +if ( vcpustats_va )
> >>> +{
> >>> + vcpustats_va->vcpu_info[v->vcpu_id].version =
> >>> + version_update_begin(vcpustats_va->vcpu_info[v->vcpu_id].version);
> >>> +smp_wmb();
> >>> +
> >>> +memcpy(&vcpustats_va->vcpu_info[v->vcpu_id].runstate_running_time,
> >>> +   >runstate.time[RUNSTATE_running],
> >>> +   sizeof(v->runstate.time[RUNSTATE_running]));
> >>> +smp_wmb();
> >>> +vcpustats_va->vcpu_info[v->vcpu_id].version =
> >>> +
> >>> version_update_end(vcpustats_va->vcpu_info[v->vcpu_id].version);
> >>> +}
> >>
> >> A further aspect to consider here is cache line ping-pong. I think the
> >> per-vCPU elements of the array want to be big enough to not share a
> >> cache line. The interface being generic this presents some challenge
> >> in determining what the supposed size is to be. However, taking into
> >> account the extensibility question, maybe the route to take is to
> >> simply settle on a power-of-2 value somewhere between x86'es and Arm's
> >> cache line sizes and the pretty common page size of 4k, e.g. 512 bytes
> >> or 1k?
> >>
> > 
> > I do not know how to address this. I was thinking to align each vcpu_stats
> > instance to a multiple of the cache-line. I would pick up the first multiple
> > that is bigger to the size of the vcpu_stats structure. For example, 
> > currently
> > the structure is 16 bytes so I would align each instance in a frame to 64
> > bytes. Would it make sense? 
> 
> Well, 64 may be an option, but I gave higher numbers for a reason. One thing
> I don't know is what common cache line sizes are on Arm or e.g. RISC-V.

Thanks. I found that structures that require cache-alignment are defined with
"__cacheline_aligned", which uses L1_CACHE_BYTES. For example, on x86, this
aligns to 128 bytes. What is the reason to use a higher value like 512 bytes or
1k?

Thanks, Matias. 



Re: [RFC PATCH v2 1/2] xen/memory : Add a stats_table resource type

2023-02-17 Thread Matias Ezequiel Vara Larsen
On Fri, Feb 17, 2023 at 09:57:43AM +0100, Jan Beulich wrote:
> On 17.02.2023 09:50, Matias Ezequiel Vara Larsen wrote:
> > On Wed, Dec 14, 2022 at 08:56:57AM +0100, Jan Beulich wrote:
> >> On 14.12.2022 08:29, Jan Beulich wrote:
> >>> On 07.10.2022 14:39, Matias Ezequiel Vara Larsen wrote:
> >>>> +static int stats_vcpu_alloc_mfn(struct domain *d)
> >>>> +{
> >>>> +struct page_info *pg;
> >>>> +
> >>>> +pg = alloc_domheap_page(d, MEMF_no_refcount);
> >>>
> >>> The ioreq and vmtrace resources are also allocated this way, but they're
> >>> HVM-specific. The one here being supposed to be VM-type independent, I'm
> >>> afraid such pages will be accessible by an "owning" PV domain (it'll
> >>> need to guess the MFN, but that's no excuse).
> >>
> >> Which might be tolerable if it then can't write to the page. That would
> >> require "locking" the page r/o (from guest pov), which ought to be
> >> possible by leveraging a variant of what share_xen_page_with_guest()
> >> does: It marks pages PGT_none with a single type ref. This would mean
> >> ...
> >>
> >>>> +if ( !pg )
> >>>> +return -ENOMEM;
> >>>> +
> >>>> +if ( !get_page_and_type(pg, d, PGT_writable_page) ) {
> >>
> >> ... using PGT_none here. Afaict this _should_ work, but we have no
> >> precedent of doing so in the tree, and I may be overlooking something
> >> which prevents that from working.
> >>
> > 
> > I do not fully understand this. I checked share_xen_page_with_guest() and I
> > think you're talking about doing something like this for each allocated 
> > page to
> > set them ro from a pv guest pov:
> > 
> > pg->u.inuse.type_info = PGT_none;
> > pg->u.inuse.type_info |= PGT_validated | 1;
> > page_set_owner(page, d); // not sure if this is needed
> > 
> > Then, I should use PGT_none instead of PGT_writable_page in
> > get_page_and_type(). Am I right?
> 
> No, if at all possible you should avoid open-coding anything. As said,
> simply passing PGT_none to get_page_and_type() ought to work (again, as
> said, unless I'm overlooking something). share_xen_page_with_guest()
> can do what it does because the page isn't owned yet. For a page with
> owner you may not fiddle with type_info in such an open-coded manner.
> 

Thanks. I got the following bug when passing PGT_none:

(XEN) Bad type in validate_page 0 t=0001 c=8042
(XEN) Xen BUG at mm.c:2643

I did not investigate yet why this has happened.

Matias



Re: [RFC PATCH v2 1/2] xen/memory : Add a stats_table resource type

2023-02-17 Thread Matias Ezequiel Vara Larsen
On Wed, Dec 14, 2022 at 08:56:57AM +0100, Jan Beulich wrote:
> On 14.12.2022 08:29, Jan Beulich wrote:
> > On 07.10.2022 14:39, Matias Ezequiel Vara Larsen wrote:
> >> +static int stats_vcpu_alloc_mfn(struct domain *d)
> >> +{
> >> +struct page_info *pg;
> >> +
> >> +pg = alloc_domheap_page(d, MEMF_no_refcount);
> > 
> > The ioreq and vmtrace resources are also allocated this way, but they're
> > HVM-specific. The one here being supposed to be VM-type independent, I'm
> > afraid such pages will be accessible by an "owning" PV domain (it'll
> > need to guess the MFN, but that's no excuse).
> 
> Which might be tolerable if it then can't write to the page. That would
> require "locking" the page r/o (from guest pov), which ought to be
> possible by leveraging a variant of what share_xen_page_with_guest()
> does: It marks pages PGT_none with a single type ref. This would mean
> ...
> 
> >> +if ( !pg )
> >> +return -ENOMEM;
> >> +
> >> +if ( !get_page_and_type(pg, d, PGT_writable_page) ) {
> 
> ... using PGT_none here. Afaict this _should_ work, but we have no
> precedent of doing so in the tree, and I may be overlooking something
> which prevents that from working.
> 

I do not fully understand this. I checked share_xen_page_with_guest() and I
think you're talking about doing something like this for each allocated page to
set them read-only from a PV guest's point of view:

pg->u.inuse.type_info = PGT_none;
pg->u.inuse.type_info |= PGT_validated | 1;
page_set_owner(page, d); // not sure if this is needed

Then, I should use PGT_none instead of PGT_writable_page in
get_page_and_type(). Am I right?

Matias



Re: [RFC PATCH v2 1/2] xen/memory : Add a stats_table resource type

2023-02-16 Thread Matias Ezequiel Vara Larsen
On Wed, Dec 14, 2022 at 08:29:53AM +0100, Jan Beulich wrote:
> On 07.10.2022 14:39, Matias Ezequiel Vara Larsen wrote:
> > --- a/xen/common/memory.c
> > +++ b/xen/common/memory.c
> > @@ -1078,6 +1078,12 @@ unsigned int ioreq_server_max_frames(const struct 
> > domain *d)
> >   return nr;
> >  }
> >  
> > +unsigned int stats_table_max_frames(const struct domain *d)
> > +{
> > +/* One frame per 512 vcpus. */
> > +return 1;
> > +}
> 
> Beyond my earlier comment (and irrespective of this needing changing
> anyway): I guess this "512" was not updated to match the now larger
> size of struct vcpu_stats?

In the next series, I am calculating the number of frames by:

nr = DIV_ROUND_UP(d->max_vcpus * sizeof(struct vcpu_stats), PAGE_SIZE);
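
As a stand-alone illustration of that rounding (a hedged user-space sketch;
DIV_ROUND_UP is re-defined locally and the 16-byte element size matches the
current struct vcpu_stats):

#include <stdio.h>

#define PAGE_SIZE 4096u
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
    unsigned int max_vcpus[] = { 1, 256, 257, 4096 };

    /* One 4k frame holds 256 16-byte vcpu_stats entries. */
    for ( unsigned int i = 0; i < 4; i++ )
        printf("max_vcpus=%u -> frames=%u\n", max_vcpus[i],
               DIV_ROUND_UP(max_vcpus[i] * 16u, PAGE_SIZE));
    return 0;
}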

> 
> > +static int stats_vcpu_alloc_mfn(struct domain *d)
> > +{
> > +struct page_info *pg;
> > +
> > +pg = alloc_domheap_page(d, MEMF_no_refcount);
> 
> The ioreq and vmtrace resources are also allocated this way, but they're
> HVM-specific. The one here being supposed to be VM-type independent, I'm
> afraid such pages will be accessible by an "owning" PV domain (it'll
> need to guess the MFN, but that's no excuse).
> 
> > +if ( !pg )
> > +return -ENOMEM;
> > +
> > +if ( !get_page_and_type(pg, d, PGT_writable_page) ) {
> > +put_page_alloc_ref(pg);
> > +return -ENODATA;
> > +}
> > +
> > +d->vcpustats_page.va = __map_domain_page_global(pg);
> > +if ( !d->vcpustats_page.va )
> > +goto fail;
> > +
> > +d->vcpustats_page.pg = pg;
> > +clear_page(d->vcpustats_page.va);
> 
> Beyond my earlier comment: I think that by the time the surrounding
> hypercall returns the page ought to contain valid data. Otherwise I
> see no way for the consumer to know from which point on the data is
> going to be valid.
> 
> > @@ -287,6 +289,20 @@ static inline void vcpu_runstate_change(
> >  }
> >  
> >  v->runstate.state = new_state;
> > +
> > +vcpustats_va = (shared_vcpustatspage_t*)d->vcpustats_page.va;
> > +if ( vcpustats_va )
> > +{
> > +   vcpustats_va->vcpu_info[v->vcpu_id].version =
> > +   version_update_begin(vcpustats_va->vcpu_info[v->vcpu_id].version);
> > +smp_wmb();
> > +memcpy(&vcpustats_va->vcpu_info[v->vcpu_id].runstate_running_time,
> > +   &v->runstate.time[RUNSTATE_running],
> > +   sizeof(v->runstate.time[RUNSTATE_running]));
> > +smp_wmb();
> > +vcpustats_va->vcpu_info[v->vcpu_id].version =
> > +version_update_end(vcpustats_va->vcpu_info[v->vcpu_id].version);
> > +}
> 
> A further aspect to consider here is cache line ping-pong. I think the
> per-vCPU elements of the array want to be big enough to not share a
> cache line. The interface being generic this presents some challenge
> in determining what the supposed size is to be. However, taking into
> account the extensibility question, maybe the route to take is to
> simply settle on a power-of-2 value somewhere between x86'es and Arm's
> cache line sizes and the pretty common page size of 4k, e.g. 512 bytes
> or 1k?
> 

I do not know how to address this. I was thinking of aligning each vcpu_stats
instance to a multiple of the cache-line size, picking the first multiple that
is larger than the size of the vcpu_stats structure. For example, the structure
is currently 16 bytes, so I would align each instance in a frame to 64 bytes.
Would that make sense?
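
For illustration only, a hypothetical padded layout (the 64 below would become
512 or 1k if we settle on the value Jan suggests; pad1 is my invention, not
part of the series):

struct vcpu_stats {
    uint32_t version;
    uint32_t pad0;
    uint64_t runstate_running_time;
    /* Pad the element to a full 64-byte cache line so that two vCPUs
     * never share a line; sizeof(struct vcpu_stats) becomes 64. */
    uint8_t pad1[64 - 2 * sizeof(uint32_t) - sizeof(uint64_t)];
};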

> > --- a/xen/include/public/vcpu.h
> > +++ b/xen/include/public/vcpu.h
> > @@ -235,6 +235,22 @@ struct vcpu_register_time_memory_area {
> >  typedef struct vcpu_register_time_memory_area 
> > vcpu_register_time_memory_area_t;
> >  DEFINE_XEN_GUEST_HANDLE(vcpu_register_time_memory_area_t);
> >  
> > +struct vcpu_stats{
> > +/* If the least-significant bit of the version number is set then an 
> > update
> > + * is in progress and the guest must wait to read a consistent set of 
> > values
> > + * This mechanism is similar to Linux's seqlock.
> > + */
> > +uint32_t version;
> > +uint32_t pad0;
> > +uint64_t runstate_running_time;
> > +};
> > +
> > +struct shared_vcpustatspage {
> > +struct vcpu_stats vcpu_info[1];
> > +};
> > +
> > +typedef struct shared_vcpustatspage shared_vcpustatspage_t;
> 
> For new additions please avoid further name space issues: All types
> and alike want to be prefixed by "xen_".
>

Should I name it "xen_shared_vcpustatspage_t" instead?

Matias 



Re: [RFC PATCH v2 1/2] xen/memory : Add a stats_table resource type

2023-02-16 Thread Matias Ezequiel Vara Larsen
Hello Jan and thanks for your comments. I addressed most of them, but I still
have some questions; please find them below:

On Tue, Dec 13, 2022 at 06:02:55PM +0100, Jan Beulich wrote:
> On 07.10.2022 14:39, Matias Ezequiel Vara Larsen wrote:
> > This commit proposes a new mechanism to query the RUNSTATE_running counter 
> > for
> > a given vcpu from a dom0 userspace application. This commit proposes to 
> > expose
> > that counter by using the acquire_resource interface. The current mechanism
> > relies on the XEN_DOMCTL_getvcpuinfo and holds a single global domctl_lock 
> > for
> > the entire hypercall; and iterate over every vcpu in the system for every
> > update thus impacting operations that share that lock.
> > 
> > This commit proposes to expose vcpu RUNSTATE_running via the
> > xenforeignmemory interface thus preventing to issue the hypercall and 
> > holding
> > the lock. For that purpose, a new resource type named stats_table is added. 
> > The
> > first frame of this resource stores per-vcpu counters. The frame has one 
> > entry
> > of type struct vcpu_stats per vcpu. The allocation of this frame only 
> > happens
> > if the resource is requested. The frame is released after the domain is
> > destroyed.
> > 
> > Note that the updating of this counter is in a hot path, thus, in this 
> > commit,
> > copying only happens if it is specifically required.
> > 
> > Note that the exposed structure is extensible in two ways. First, the 
> > structure
> > vcpu_stats can be extended with new per-vcpu counters while it fits in a 
> > frame.
> 
> I'm afraid I don't see how this is "extensible". I would recommend that
> you outline for yourself how a change would look like to actually add
> such a 2nd counter. While doing that keep in mind that whatever changes
> you make may not break existing consumers.
> 
> It's also not clear what you mean with "fits in a frame": struct
> shared_vcpustatspage is a container for an array with a single element.
> I may guess (looking at just the public interface) that this really is
> meant to be a flexible array (and hence should be marked as such - see
> other uses of XEN_FLEX_ARRAY_DIM in the public headers). Yet if that's
> the case, then a single page already won't suffice for a domain with
> sufficiently many vCPU-s.
> 

I tackled this by using "d->max_vcpus" to calculate the number of required
frames to allocate for a given guest. Also, I added a new type-specific
resource named XENMEM_resource_stats_table_id_vcpustats to host per-vcpu
counters. I completely forgot the "id" in the previous series.

> > Second, new frames can be added in case new counters are required.
> 
> Are you talking of "new counters" here which aren't "new per-vcpu
> counters"? Or else what's the difference from the 1st way?

Yes, I was talking about that sort of counter. In the next series, such
counters could be added by adding a new type-specific resource id.

> 
> > --- a/xen/arch/x86/hvm/hvm.c
> > +++ b/xen/arch/x86/hvm/hvm.c
> > @@ -741,6 +741,8 @@ void hvm_domain_relinquish_resources(struct domain *d)
> >  
> >  ioreq_server_destroy_all(d);
> >  
> > +stats_free_vcpu_mfn(d);
> 
> How come this lives here? Surely this new feature should be not only
> guest-type independent, but also arch-agnostic? Clearly you putting
> the new data in struct domain (and not struct arch_domain or yet
> deeper in the hierarchy) indicates you may have been meaning to make
> it so.
> 

The whole feature is meant to be guest-type independent and also arch-agnostic.
Would it be better to put it in xen/common/domain.c:domain_kill()?
 
> > --- a/xen/common/memory.c
> > +++ b/xen/common/memory.c
> > @@ -1078,6 +1078,12 @@ unsigned int ioreq_server_max_frames(const struct 
> > domain *d)
> >  return nr;
> >  }
> >  
> > +unsigned int stats_table_max_frames(const struct domain *d)
> > +{
> > +/* One frame per 512 vcpus. */
> > +return 1;
> > +}
> 
> As alluded to earlier already - 1 isn't going to be suitable for
> arbitrary size domains. (Yes, HVM domains are presently limited to
> 128 vCPU-s, but as per above this shouldn't be a HVM-only feature.)
>

I am going to use "d->max_vcpus" to calculate the number of required frames for
per-vcpu counters for a given guest.
 
> > @@ -1162,6 +1171,88 @@ static int acquire_vmtrace_buf(
> >  return nr_frames;
> >  }
> >  
> > +void stats_free_vcpu_mfn(struct domain * d)
> > +{
> > +struct page_info *pg = d->vcpustats_page.pg;

[RFC PATCH v2 1/2] xen/memory : Add a stats_table resource type

2022-10-07 Thread Matias Ezequiel Vara Larsen
This commit proposes a new mechanism to query the RUNSTATE_running counter for
a given vcpu from a dom0 userspace application. This commit proposes to expose
that counter by using the acquire_resource interface. The current mechanism
relies on XEN_DOMCTL_getvcpuinfo, holds a single global domctl_lock for
the entire hypercall, and iterates over every vcpu in the system for every
update, thus impacting operations that share that lock.

This commit proposes to expose vcpu RUNSTATE_running via the
xenforeignmemory interface, thus avoiding both the hypercall and the lock.
For that purpose, a new resource type named stats_table is added. The
first frame of this resource stores per-vcpu counters. The frame has one entry
of type struct vcpu_stats per vcpu. The allocation of this frame only happens
if the resource is requested. The frame is released after the domain is
destroyed.

Note that this counter is updated in a hot path; thus, in this commit, copying
only happens if the resource has actually been requested.

Note that the exposed structure is extensible in two ways. First, the structure
vcpu_stats can be extended with new per-vcpu counters while it fits in a frame.
Second, new frames can be added in case new counters are required.

Signed-off-by: Matias Ezequiel Vara Larsen 
---
Changes in v2:
- rework to ensure that guest reads a coherent value by using a version
  number in the vcpu_stats structure
- add version to the vcpu_stats structure

Changes in v1:
- rework the allocation and releasing of the frames
- use the zero frame for per-vcpu counters that are listed as an array
- allocate vcpu stats frames only when the resource is requested
- rewrite commit message
- add the vcpu_stats structure to keep per-vcpu counters
- add the shared_vcpustatspage to keep an array of per-vcpu counters for a
  given domain
- declare the structures in a public header 
- define the vcpustats_page in the domain structure
---
 xen/arch/x86/hvm/hvm.c  |  2 +
 xen/common/memory.c | 94 +
 xen/common/sched/core.c | 16 +++
 xen/include/public/memory.h |  3 ++
 xen/include/public/vcpu.h   | 16 +++
 xen/include/xen/mm.h|  2 +
 xen/include/xen/sched.h |  5 ++
 7 files changed, 138 insertions(+)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index ddd001a6ad..1ef6cb5ff0 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -741,6 +741,8 @@ void hvm_domain_relinquish_resources(struct domain *d)
 
     ioreq_server_destroy_all(d);
 
+    stats_free_vcpu_mfn(d);
+
     msixtbl_pt_cleanup(d);
 
     /* Stop all asynchronous timer actions. */
diff --git a/xen/common/memory.c b/xen/common/memory.c
index 297b98a562..749486d5d4 100644
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -1078,6 +1078,12 @@ unsigned int ioreq_server_max_frames(const struct domain *d)
     return nr;
 }
 
+unsigned int stats_table_max_frames(const struct domain *d)
+{
+    /* One frame per 512 vcpus. */
+    return 1;
+}
+
 /*
  * Return 0 on any kind of error.  Caller converts to -EINVAL.
  *
@@ -1099,6 +1105,9 @@ static unsigned int resource_max_frames(const struct domain *d,
     case XENMEM_resource_vmtrace_buf:
         return d->vmtrace_size >> PAGE_SHIFT;
 
+    case XENMEM_resource_stats_table:
+        return stats_table_max_frames(d);
+
     default:
         return -EOPNOTSUPP;
     }
@@ -1162,6 +1171,88 @@ static int acquire_vmtrace_buf(
     return nr_frames;
 }
 
+void stats_free_vcpu_mfn(struct domain * d)
+{
+    struct page_info *pg = d->vcpustats_page.pg;
+
+    if ( !pg )
+        return;
+
+    d->vcpustats_page.pg = NULL;
+
+    if ( d->vcpustats_page.va )
+        unmap_domain_page_global(d->vcpustats_page.va);
+
+    d->vcpustats_page.va = NULL;
+
+    put_page_alloc_ref(pg);
+    put_page_and_type(pg);
+}
+
+static int stats_vcpu_alloc_mfn(struct domain *d)
+{
+    struct page_info *pg;
+
+    pg = alloc_domheap_page(d, MEMF_no_refcount);
+
+    if ( !pg )
+        return -ENOMEM;
+
+    if ( !get_page_and_type(pg, d, PGT_writable_page) ) {
+        put_page_alloc_ref(pg);
+        return -ENODATA;
+    }
+
+    d->vcpustats_page.va = __map_domain_page_global(pg);
+    if ( !d->vcpustats_page.va )
+        goto fail;
+
+    d->vcpustats_page.pg = pg;
+    clear_page(d->vcpustats_page.va);
+    return 1;
+
+fail:
+    put_page_alloc_ref(pg);
+    put_page_and_type(pg);
+
+    return -ENOMEM;
+}
+
+static int acquire_stats_table(struct domain *d,
+                               unsigned int id,
+                               unsigned int frame,
+                               unsigned int nr_frames,
+                               xen_pfn_t mfn_list[])
+{
+    mfn_t mfn;
+    int rc;
+    unsigned int i;
+
+    if ( !d )
+        return -ENOENT;
+
+    for ( i = 0; i < nr_frames; i++ )
+    {
+        switch ( i )
+        {
+        case XENMEM_resource_stats_frame_vcpustats:

[RFC PATCH v2 2/2] tools/misc: Add xen-vcpus-stats tool

2022-10-07 Thread Matias Ezequiel Vara Larsen
Add a demonstration tool that uses the stats_table resource to
query vcpus' RUNSTATE_running counter for a DomU.

Signed-off-by: Matias Ezequiel Vara Larsen 
---
Changes in v2:
- use period instead of frec
- rely on version to ensure reading is coherent 

Changes in v1:
- change the name of the tool to xen-vcpus-stats
- set command line parameters in the same order that are passed
- remove header libs.h
- build by default
- remove errno, strerror, "\n", and indentation
- use errx when errno is not needed
- address better the number of pages requested and error msgs
- use the shared_vcpustatspage_t structure
- use the correct frame id when requesting the resource
---
 tools/misc/Makefile  |  6 +++
 tools/misc/xen-vcpus-stats.c | 87 
 2 files changed, 93 insertions(+)
 create mode 100644 tools/misc/xen-vcpus-stats.c

diff --git a/tools/misc/Makefile b/tools/misc/Makefile
index 2b683819d4..837e4b50da 100644
--- a/tools/misc/Makefile
+++ b/tools/misc/Makefile
@@ -49,6 +49,7 @@ TARGETS_COPY += xenpvnetboot
 
 # Everything which needs to be built
 TARGETS_BUILD := $(filter-out $(TARGETS_COPY),$(TARGETS_ALL))
+TARGETS_BUILD += xen-vcpus-stats
 
 # ... including build-only targets
 TARGETS_BUILD-$(CONFIG_X86)+= xen-vmtrace
@@ -135,4 +136,9 @@ xencov: xencov.o
 xen-ucode: xen-ucode.o
$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(APPEND_LDFLAGS)
 
+xen-vcpus-stats.o: CFLAGS += $(CFLAGS_libxenforeignmemory)
+
+xen-vcpus-stats: xen-vcpus-stats.o
+	$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(LDLIBS_libxenforeignmemory) $(APPEND_LDFLAGS)
+
 -include $(DEPS_INCLUDE)
diff --git a/tools/misc/xen-vcpus-stats.c b/tools/misc/xen-vcpus-stats.c
new file mode 100644
index 00..29d0efb124
--- /dev/null
+++ b/tools/misc/xen-vcpus-stats.c
@@ -0,0 +1,87 @@
+#include <err.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include <xenctrl.h>
+#include <xenforeignmemory.h>
+#include <xen/vcpu.h>
+
+#define rmb()   asm volatile("lfence":::"memory")
+
+static sig_atomic_t interrupted;
+static void close_handler(int signum)
+{
+    interrupted = 1;
+}
+
+int main(int argc, char **argv)
+{
+    xenforeignmemory_handle *fh;
+    xenforeignmemory_resource_handle *res;
+    size_t size;
+    int rc, domid, period, vcpu;
+    shared_vcpustatspage_t *info;
+    struct sigaction act;
+    uint32_t version;
+    uint64_t value;
+
+    if ( argc != 4 ) {
+        fprintf(stderr, "Usage: %s <domid> <vcpu> <period>\n", argv[0]);
+        return 1;
+    }
+
+    domid = atoi(argv[1]);
+    vcpu = atoi(argv[2]);
+    period = atoi(argv[3]);
+
+    act.sa_handler = close_handler;
+    act.sa_flags = 0;
+    sigemptyset(&act.sa_mask);
+    sigaction(SIGHUP,  &act, NULL);
+    sigaction(SIGTERM, &act, NULL);
+    sigaction(SIGINT,  &act, NULL);
+    sigaction(SIGALRM, &act, NULL);
+
+    fh = xenforeignmemory_open(NULL, 0);
+
+    if ( !fh )
+        err(1, "xenforeignmemory_open");
+
+    rc = xenforeignmemory_resource_size(
+        fh, domid, XENMEM_resource_stats_table,
+        0, &size);
+
+    if ( rc )
+        err(1, "Fail: Get size");
+
+    res = xenforeignmemory_map_resource(
+        fh, domid, XENMEM_resource_stats_table,
+        0, XENMEM_resource_stats_frame_vcpustats, size >> XC_PAGE_SHIFT,
+        (void **)&info, PROT_READ, 0);
+
+    if ( !res )
+        err(1, "Fail: Map");
+
+    while ( !interrupted ) {
+        sleep(period);
+        do {
+            version = info->vcpu_info[vcpu].version;
+            rmb();
+            value = info->vcpu_info[vcpu].runstate_running_time;
+            rmb();
+        } while ( (info->vcpu_info[vcpu].version & 1) ||
+                  (version != info->vcpu_info[vcpu].version) );
+        printf("running_vcpu_time[%d]: %ld\n", vcpu, value);
+    }
+
+    rc = xenforeignmemory_unmap_resource(fh, res);
+    if ( rc )
+        err(1, "Fail: Unmap");
+
+    return 0;
+}
-- 
2.25.1




[RFC PATCH v2 0/2] Add a new acquire resource to query vcpu statistics

2022-10-07 Thread Matias Ezequiel Vara Larsen
Hello all,

The purpose of this RFC is to get feedback about a new acquire resource that
exposes vcpu statistics for a given domain. The current mechanism to get those
statistics is by querying the hypervisor. This mechanism relies on a hypercall
and holds the domctl spinlock during its execution. When a pv tool like xcp-rrdd
periodically samples these counters, it ends up affecting other paths that share
that spinlock. By using acquire resources, the pv tool only requires a few
hypercalls to set up the shared memory region, and samples are then obtained
without issuing any further hypercalls. The original idea was suggested by
Andrew Cooper, with whom I have been discussing how to implement the current
PoC. You can find the RFC patch series at [1]. The series is rebased on top of
stable-4.15.

I am currently a bit blocked on 1) what to expose and 2) how to expose it. For
1), I decided to expose what xcp-rrdd is querying, e.g., XEN_DOMCTL_getvcpuinfo.
More precisely, xcp-rrdd gets runstate.time[RUNSTATE_running]. This is a uint64_t
counter. However, the time spent in other states may be interesting too.
Regarding 2), I am not sure if simply using an array of uint64_t is enough or if
a different interface should be exposed. The remaining question is when to get
new values. For the moment, I am updating this counter during
vcpu_runstate_change().
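
For context, the producer side currently looks like this (condensed from the
hunk in patch 1; version_update_begin()/version_update_end() flip the low bit
of the version around the copy so a reader can detect a torn update):

    vcpustats_va->vcpu_info[v->vcpu_id].version =
        version_update_begin(vcpustats_va->vcpu_info[v->vcpu_id].version);
    smp_wmb();
    memcpy(&vcpustats_va->vcpu_info[v->vcpu_id].runstate_running_time,
           &v->runstate.time[RUNSTATE_running],
           sizeof(v->runstate.time[RUNSTATE_running]));
    smp_wmb();
    vcpustats_va->vcpu_info[v->vcpu_id].version =
        version_update_end(vcpustats_va->vcpu_info[v->vcpu_id].version);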

The current series includes a simple pv tool that shows how this new interface
is used. This tool maps the counter and periodically samples it.

Any feedback/help would be appreciated.

Thanks, Matias.

[1] https://github.com/MatiasVara/xen/tree/feature_stats

Changes in v2:
- rework to ensure that consumer fetches consistent data

Changes in v1:
- rework how the resource is allocated and released
- rework the allocation so that it happens only when the resource is requested
- rework the structure shared between the tool and Xen to make it extensible to
  new counters and declare it in a public header

There are still the following questions:
   - resource shall be released when there are no more readers, otherwise we
 keep updating it during a hot path
   - one frame can host up to 512 vcpus. Should I check against this limit when
 updating? Should it be possible to allocate more than one frame for vcpu
 counters?

Matias Ezequiel Vara Larsen (2):
  xen/memory : Add a stats_table resource type
  tools/misc: Add xen-vcpus-stats tool

 tools/misc/Makefile  |  6 +++
 tools/misc/xen-vcpus-stats.c | 87 +
 xen/arch/x86/hvm/hvm.c   |  2 +
 xen/common/memory.c  | 94 
 xen/common/sched/core.c  | 16 ++
 xen/include/public/memory.h  |  3 ++
 xen/include/public/vcpu.h| 16 ++
 xen/include/xen/mm.h |  2 +
 xen/include/xen/sched.h  |  5 ++
 9 files changed, 231 insertions(+)
 create mode 100644 tools/misc/xen-vcpus-stats.c

-- 
2.25.1




Re: [RFC PATCH v1 0/2] Add a new acquire resource to query vcpu statistics

2022-09-30 Thread Matias Ezequiel Vara Larsen
On Thu, Sep 29, 2022 at 05:55:50PM +0200, Jan Beulich wrote:
> On 24.08.2022 15:27, Matias Ezequiel Vara Larsen wrote:
> > The purpose of this RFC is to get feedback about a new acquire resource that
> > exposes vcpu statistics for a given domain. The current mechanism to get 
> > those
> > statistics is by querying the hypervisor. This mechanism relies on a 
> > hypercall
> > and holds the domctl spinlock during its execution. When a pv tool like 
> > xcp-rrdd
> > periodically samples these counters, it ends up affecting other paths that 
> > share
> > that spinlock. By using acquire resources, the pv tool only requires a few
> > hypercalls to set the shared memory region and samples are got without 
> > issuing
> > any other hypercall. The original idea has been suggested by Andrew Cooper 
> > to
> > which I have been discussing about how to implement the current PoC. You can
> > find the RFC patch series at [1]. The series is rebased on top of 
> > stable-4.15.
> > 
> > I am currently a bit blocked on 1) what to expose and 2) how to expose it. 
> > For
> > 1), I decided to expose what xcp-rrdd is querying, e.g., 
> > XEN_DOMCTL_getvcpuinfo.
> > More precisely, xcp-rrd gets runstate.time[RUNSTATE_running]. This is a 
> > uint64_t
> > counter. However, the time spent in other states may be interesting too.
> > Regarding 2), I am not sure if simply using an array of uint64_t is enough 
> > or if
> > a different interface should be exposed. The remaining question is when to 
> > get
> > new values. For the moment, I am updating this counter during
> > vcpu_runstate_change().
> > 
> > The current series includes a simple pv tool that shows how this new 
> > interface is
> > used. This tool maps the counter and periodically samples it.
> > 
> > Any feedback/help would be appreciated.
> 
> Before looking more closely - was there perhaps kind-of-review feedback
> during the summit, which would make it more reasonable to look through
> this once a v2 has appeared?
> 

Yes, there was. I will submit v2 based on the feedback from the summit. Thanks
for pointing it out.

Matias 



Design session "Using the new VCPU counters in XAPI"

2022-09-23 Thread Matias Ezequiel Vara Larsen
Participants: Pau, Edwin, Matias

- the semantics of the new interface for querying registers is now async; this
would require an initialization phase in xcp-rrdd before querying the counters.
- why is this
https://github.com/xenserver/xen.pg/blob/XS-8.2.x/master/mixed-domain-runstates.patch
not upstreamed?
- we need to implement a protocol to ensure consistency between the producer
(hypervisor) and the consumer (user-space)
- we can get inspiration from xen/arch/x86/time.c:__update_vcpu_system_time(),
which relies on version_update_begin() and version_update_end()
- get inspiration from the comment at public/xen.h in the struct
vcpu_time_info{} in which a version number is used
- get inspiration from arch/x86/time.c:read_xen_timer()
- we need something in user-space that relies on the same mechanism, i.e., gets
a consistent value if the version number is the same before and after reading
the structure (see the sketch below)
- add bindings for acquire resource; see
https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/c_stubs/xenctrlext_stubs.c#L460
- also get inspiration from 2529c85
- we could not find the user-space implementation of this protocol; xenctrl
should implement it somewhere
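
A minimal sketch of that user-space reader side, assuming the shared structure
carries a uint32_t version field and using a plain compiler barrier as a
stand-in for the architecture-specific read barrier:

#include <stdint.h>

/* Placeholder barrier; a real consumer needs the arch-specific rmb. */
#define rmb() __asm__ __volatile__("" ::: "memory")

/* Retry until the version is even (no update in flight) and unchanged
 * across the read, as Linux-style seqlocks do. */
static uint64_t read_consistent(const volatile uint32_t *version,
                                const volatile uint64_t *counter)
{
    uint32_t v;
    uint64_t val;

    do {
        v = *version;
        rmb();
        val = *counter;
        rmb();
    } while ( (v & 1) || v != *version );

    return val;
}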



[RFC PATCH v1] xen/docs: Document acquire resource interface

2022-08-29 Thread Matias Ezequiel Vara Larsen
This commit creates a new doc to document the acquire resource interface. This
is a reference document.

Signed-off-by: Matias Ezequiel Vara Larsen 
---
Changes in v1:
- correct documentation about how mfns are allocated
- correct documentation about how mfns are released
- use the wording tool instead of pv tool
- fix typos
---
 .../acquire_resource_reference.rst| 338 ++
 docs/hypervisor-guide/index.rst   |   2 +
 2 files changed, 340 insertions(+)
 create mode 100644 docs/hypervisor-guide/acquire_resource_reference.rst

diff --git a/docs/hypervisor-guide/acquire_resource_reference.rst 
b/docs/hypervisor-guide/acquire_resource_reference.rst
new file mode 100644
index 00..d1989d2fd4
--- /dev/null
+++ b/docs/hypervisor-guide/acquire_resource_reference.rst
@@ -0,0 +1,338 @@
+.. SPDX-License-Identifier: CC-BY-4.0
+
+Acquire resource reference
+==========================
+
+Acquire resource allows you to share a resource between Xen and dom0.
+Resources are generally represented by pages that are mapped into user-space.
+These pages are accessed by Xen and belong to a domU. This document describes
+the API to build tools to access these resources. The document also describes
+the software components required to create and expose a domain's resource. This
+is not a tutorial or a how-to guide. It merely describes the machinery that is
+already described in the code itself.
+
+.. warning::
+
+    The code in this document may already be out of date, however it may
+    be enough to illustrate how the acquire resource interface works.
+
+
+Tool API
+--------
+
+This section describes the API to map a resource in a user-space tool in dom0.
+The API is based on the following functions:
+
+* xenforeignmemory_open()
+
+* xenforeignmemory_resource_size()
+
+* xenforeignmemory_map_resource()
+
+* xenforeignmemory_unmap_resource()
+
+The ``xenforeignmemory_open()`` function gets the handler that is used by the
+rest of the functions:
+
+.. code-block:: c
+
+   fh = xenforeignmemory_open(NULL, 0);
+
+The ``xenforeignmemory_resource_size()`` function gets the size of the resource.
+For example, in the following code, we get the size of the
+``XENMEM_RESOURCE_VMTRACE_BUF``:
+
+.. code-block:: c
+
+    rc = xenforeignmemory_resource_size(fh, domid, XENMEM_resource_vmtrace_buf,
+                                        vcpu, &size);
+
+The size of the resource is returned in ``size`` in bytes.
+
+The ``xenforeignmemory_map_resource()`` function maps a domain's resource. The
+function is declared as follows:
+
+.. code-block:: c
+
+    xenforeignmemory_resource_handle *xenforeignmemory_map_resource(
+        xenforeignmemory_handle *fmem, domid_t domid, unsigned int type,
+        unsigned int id, unsigned long frame, unsigned long nr_frames,
+        void **paddr, int prot, int flags);
+
+The size of the mapping is given in number of frames. For example, **QEMU**
+uses it to map the ioreq server pages shared between the domain and QEMU:
+
+.. code-block:: c
+
+    fres = xenforeignmemory_map_resource(xen_fmem, xen_domid,
+                                         XENMEM_resource_ioreq_server,
+                                         state->ioservid, 0, 2, &addr,
+                                         PROT_READ | PROT_WRITE, 0);
+
+
+The third parameter corresponds to the resource that we request from the
+domain, e.g., ``XENMEM_resource_ioreq_server``. The seventh parameter returns
+the address of the mapped resource through a pointer-to-pointer.
+
+Finally, the ``xenforeignmemory_unmap_resource()`` function unmaps the region:
+
+.. code-block:: c
+    :caption: tools/misc/xen-vmtrace.c
+
+    if ( fres && xenforeignmemory_unmap_resource(fh, fres) )
+        perror("xenforeignmemory_unmap_resource()");
+
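+Putting the four calls together, a minimal end-to-end sketch (error handling
+elided; ``XENMEM_resource_vmtrace_buf`` and the free-standing ``domid`` and
+``vcpu`` variables are merely example arguments) could look like:
+
+.. code-block:: c
+
+    xenforeignmemory_handle *fh = xenforeignmemory_open(NULL, 0);
+    xenforeignmemory_resource_handle *res;
+    size_t size;
+    void *addr = NULL;
+
+    xenforeignmemory_resource_size(fh, domid, XENMEM_resource_vmtrace_buf,
+                                   vcpu, &size);
+    res = xenforeignmemory_map_resource(fh, domid, XENMEM_resource_vmtrace_buf,
+                                        vcpu, 0, size >> XC_PAGE_SHIFT,
+                                        &addr, PROT_READ, 0);
+    /* ... consume the buffer through addr ... */
+    xenforeignmemory_unmap_resource(fh, res);
+    xenforeignmemory_close(fh);
+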
+Exposing a domain's resource
+----------------------------
+
+In this section, we describe how to build a new resource and expose it to dom0.
+Resources are defined in ``xen/include/public/memory.h``. In Xen-4.16, there
+are three resources:
+
+.. code-block:: c
+    :caption: xen/include/public/memory.h
+
+    #define XENMEM_resource_ioreq_server 0
+    #define XENMEM_resource_grant_table 1
+    #define XENMEM_resource_vmtrace_buf 2
+
+The ``resource_max_frames()`` function returns the size of a resource. The
+resource may provide a handler to get the size. This is the definition of the
+``resource_max_frames()`` function:
+
+.. code-block:: c
+    :linenos:
+    :caption: xen/common/memory.c
+
+    static unsigned int resource_max_frames(const struct domain *d,
+                                            unsigned int type, unsigned int id)
+    {
+        switch ( type )
+        {
+        case XENMEM_resource_grant_table:
+            return gnttab_resource_max_frames(d, id);
+
+        case XENMEM_resource_ioreq_server:
+            return ioreq_server_max_frames(d);
+
+        case XENMEM_resource_vmtrace_buf:
+            return d->vmtrace_size >> PAGE_SHIFT;
+
+        default:
+            return -EOPNOTSUPP;
+        }
+    }
+
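+As a worked example of extending this switch, the vcpu-stats series elsewhere
+in this thread adds a new case along these lines (sketch, subject to review):
+
+.. code-block:: c
+
+    case XENMEM_resource_stats_table:
+        return stats_table_max_frames(d);
+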
+The ``_acquire_resource()`` function invokes the corresponding handler that 
m

[RFC PATCH v1 2/2] tools/misc: Add xen-vcpus-stats tool

2022-08-24 Thread Matias Ezequiel Vara Larsen
Add a demonstration tool that uses the stats_table resource to
query vcpus' RUNSTATE_running counter for a DomU.

Signed-off-by: Matias Ezequiel Vara Larsen 
---
Changes in v1:
- change the name of the tool to xen-vcpus-stats
- set command line parameters in the same order that are passed
- remove header libs.h
- build by default
- remove errno, strerror, "\n", and indentation
- use errx when errno is not needed
- address better the number of pages requested and error msgs
- use the shared_vcpustatspage_t structure
- use the correct frame id when requesting the resource
---
 tools/misc/Makefile  |  6 +++
 tools/misc/xen-vcpus-stats.c | 76 
 2 files changed, 82 insertions(+)
 create mode 100644 tools/misc/xen-vcpus-stats.c

diff --git a/tools/misc/Makefile b/tools/misc/Makefile
index 2b683819d4..837e4b50da 100644
--- a/tools/misc/Makefile
+++ b/tools/misc/Makefile
@@ -49,6 +49,7 @@ TARGETS_COPY += xenpvnetboot
 
 # Everything which needs to be built
 TARGETS_BUILD := $(filter-out $(TARGETS_COPY),$(TARGETS_ALL))
+TARGETS_BUILD += xen-vcpus-stats
 
 # ... including build-only targets
 TARGETS_BUILD-$(CONFIG_X86)+= xen-vmtrace
@@ -135,4 +136,9 @@ xencov: xencov.o
 xen-ucode: xen-ucode.o
$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(APPEND_LDFLAGS)
 
+xen-vcpus-stats.o: CFLAGS += $(CFLAGS_libxenforeignmemory)
+
+xen-vcpus-stats: xen-vcpus-stats.o
+	$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(LDLIBS_libxenforeignmemory) $(APPEND_LDFLAGS)
+
 -include $(DEPS_INCLUDE)
diff --git a/tools/misc/xen-vcpus-stats.c b/tools/misc/xen-vcpus-stats.c
new file mode 100644
index 00..d56d1493e4
--- /dev/null
+++ b/tools/misc/xen-vcpus-stats.c
@@ -0,0 +1,76 @@
+#include <err.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include <xenctrl.h>
+#include <xenforeignmemory.h>
+#include <xen/vcpu.h>
+
+static sig_atomic_t interrupted;
+static void close_handler(int signum)
+{
+    interrupted = 1;
+}
+
+int main(int argc, char **argv)
+{
+    xenforeignmemory_handle *fh;
+    xenforeignmemory_resource_handle *res;
+    size_t size;
+    int rc, domid, frec, vcpu;
+    shared_vcpustatspage_t *info;
+    struct sigaction act;
+
+    if ( argc != 4 ) {
+        fprintf(stderr, "Usage: %s <domid> <vcpu> <frec>\n", argv[0]);
+        return 1;
+    }
+
+    domid = atoi(argv[1]);
+    vcpu = atoi(argv[2]);
+    frec = atoi(argv[3]);
+
+    act.sa_handler = close_handler;
+    act.sa_flags = 0;
+    sigemptyset(&act.sa_mask);
+    sigaction(SIGHUP,  &act, NULL);
+    sigaction(SIGTERM, &act, NULL);
+    sigaction(SIGINT,  &act, NULL);
+    sigaction(SIGALRM, &act, NULL);
+
+    fh = xenforeignmemory_open(NULL, 0);
+
+    if ( !fh )
+        err(1, "xenforeignmemory_open");
+
+    rc = xenforeignmemory_resource_size(
+        fh, domid, XENMEM_resource_stats_table,
+        0, &size);
+
+    if ( rc )
+        err(1, "Fail: Get size");
+
+    res = xenforeignmemory_map_resource(
+        fh, domid, XENMEM_resource_stats_table,
+        0, XENMEM_resource_stats_frame_vcpustats, size >> XC_PAGE_SHIFT,
+        (void **)&info, PROT_READ, 0);
+
+    if ( !res )
+        err(1, "Fail: Map");
+
+    while ( !interrupted ) {
+        sleep(frec);
+        printf("running cpu_time: %ld\n",
+               info->vcpu_info[vcpu].runstate_running_time);
+    }
+
+    rc = xenforeignmemory_unmap_resource(fh, res);
+    if ( rc )
+        err(1, "Fail: Unmap");
+
+    return 0;
+}
-- 
2.25.1




[RFC PATCH v1 1/2] xen/memory : Add a stats_table resource type

2022-08-24 Thread Matias Ezequiel Vara Larsen
This commit proposes a new mechanism to query the RUNSTATE_running counter for
a given vcpu from a dom0 userspace application. This commit proposes to expose
that counter by using the acquire_resource interface. The current mechanism
relies on XEN_DOMCTL_getvcpuinfo, holds a single global domctl_lock for
the entire hypercall, and iterates over every vcpu in the system for every
update, thus impacting operations that share that lock.

This commit proposes to expose vcpu RUNSTATE_running via the
xenforeignmemory interface, thus avoiding both the hypercall and the lock.
For that purpose, a new resource type named stats_table is added. The
first frame of this resource stores per-vcpu counters. The frame has one entry
of type struct vcpu_stats per vcpu. The allocation of this frame only happens
if the resource is requested. The frame is released after the domain is
destroyed.

Note that this counter is updated in a hot path; thus, in this commit, copying
only happens if the resource has actually been requested.

Note that the exposed structure is extensible in two ways. First, the structure
vcpu_stats can be extended with new per-vcpu counters while it fits in a frame.
Second, new frames can be added in case new counters are required.

Signed-off-by: Matias Ezequiel Vara Larsen 
---
Changes in v1:
- rework the allocation and releasing of the frames
- use the zero frame for per-vcpu counters that are listed as an array
- allocate vcpu stats frames only when the resource is requested
- rewrite commit message
- add the vcpu_stats structure to keep per-vcpu counters
- add the shared_vcpustatspage to keep an array of per-vcpu counters for a
  given domain
- declare the structures in a public header 
- define the vcpustats_page in the domain structure
---
 xen/arch/x86/hvm/hvm.c  |  2 +
 xen/common/memory.c | 94 +
 xen/common/sched/core.c |  7 +++
 xen/include/public/memory.h |  3 ++
 xen/include/public/vcpu.h   | 10 
 xen/include/xen/mm.h|  2 +
 xen/include/xen/sched.h |  5 ++
 7 files changed, 123 insertions(+)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index ddd001a6ad..1ef6cb5ff0 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -741,6 +741,8 @@ void hvm_domain_relinquish_resources(struct domain *d)
 
     ioreq_server_destroy_all(d);
 
+    stats_free_vcpu_mfn(d);
+
     msixtbl_pt_cleanup(d);
 
     /* Stop all asynchronous timer actions. */
diff --git a/xen/common/memory.c b/xen/common/memory.c
index 297b98a562..749486d5d4 100644
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -1078,6 +1078,12 @@ unsigned int ioreq_server_max_frames(const struct domain *d)
     return nr;
 }
 
+unsigned int stats_table_max_frames(const struct domain *d)
+{
+    /* One frame per 512 vcpus. */
+    return 1;
+}
+
 /*
  * Return 0 on any kind of error.  Caller converts to -EINVAL.
  *
@@ -1099,6 +1105,9 @@ static unsigned int resource_max_frames(const struct domain *d,
     case XENMEM_resource_vmtrace_buf:
         return d->vmtrace_size >> PAGE_SHIFT;
 
+    case XENMEM_resource_stats_table:
+        return stats_table_max_frames(d);
+
     default:
         return -EOPNOTSUPP;
     }
@@ -1162,6 +1171,88 @@ static int acquire_vmtrace_buf(
     return nr_frames;
 }
 
+void stats_free_vcpu_mfn(struct domain * d)
+{
+    struct page_info *pg = d->vcpustats_page.pg;
+
+    if ( !pg )
+        return;
+
+    d->vcpustats_page.pg = NULL;
+
+    if ( d->vcpustats_page.va )
+        unmap_domain_page_global(d->vcpustats_page.va);
+
+    d->vcpustats_page.va = NULL;
+
+    put_page_alloc_ref(pg);
+    put_page_and_type(pg);
+}
+
+static int stats_vcpu_alloc_mfn(struct domain *d)
+{
+    struct page_info *pg;
+
+    pg = alloc_domheap_page(d, MEMF_no_refcount);
+
+    if ( !pg )
+        return -ENOMEM;
+
+    if ( !get_page_and_type(pg, d, PGT_writable_page) ) {
+        put_page_alloc_ref(pg);
+        return -ENODATA;
+    }
+
+    d->vcpustats_page.va = __map_domain_page_global(pg);
+    if ( !d->vcpustats_page.va )
+        goto fail;
+
+    d->vcpustats_page.pg = pg;
+    clear_page(d->vcpustats_page.va);
+    return 1;
+
+fail:
+    put_page_alloc_ref(pg);
+    put_page_and_type(pg);
+
+    return -ENOMEM;
+}
+
+static int acquire_stats_table(struct domain *d,
+                               unsigned int id,
+                               unsigned int frame,
+                               unsigned int nr_frames,
+                               xen_pfn_t mfn_list[])
+{
+    mfn_t mfn;
+    int rc;
+    unsigned int i;
+
+    if ( !d )
+        return -ENOENT;
+
+    for ( i = 0; i < nr_frames; i++ )
+    {
+        switch ( i )
+        {
+        case XENMEM_resource_stats_frame_vcpustats:
+            if ( !d->vcpustats_page.pg ) {
+                rc = stats_vcpu_alloc_mfn(d);
+                if ( rc < 1 )
+                    return rc;
+

[RFC PATCH v1 0/2] Add a new acquire resource to query vcpu statistics

2022-08-24 Thread Matias Ezequiel Vara Larsen
Hello all,

The purpose of this RFC is to get feedback about a new acquire resource that
exposes vcpu statistics for a given domain. The current mechanism to get those
statistics is by querying the hypervisor. This mechanism relies on a hypercall
and holds the domctl spinlock during its execution. When a pv tool like xcp-rrdd
periodically samples these counters, it ends up affecting other paths that share
that spinlock. By using acquire resources, the pv tool only requires a few
hypercalls to set the shared memory region and samples are got without issuing
any other hypercall. The original idea has been suggested by Andrew Cooper to
which I have been discussing about how to implement the current PoC. You can
find the RFC patch series at [1]. The series is rebased on top of stable-4.15.

I am currently a bit blocked on 1) what to expose and 2) how to expose it. For
1), I decided to expose what xcp-rrdd is querying, e.g., XEN_DOMCTL_getvcpuinfo.
More precisely, xcp-rrdd gets runstate.time[RUNSTATE_running]. This is a uint64_t
counter. However, the time spent in other states may be interesting too.
Regarding 2), I am not sure if simply using an array of uint64_t is enough or if
a different interface should be exposed. The remaining question is when to get
new values. For the moment, I am updating this counter during
vcpu_runstate_change().

The current series includes a simple pv tool that shows how this new interface
is used. This tool maps the counter and periodically samples it.

Any feedback/help would be appreciated.

Thanks, Matias.

[1] https://github.com/MatiasVara/xen/tree/feature_stats

Changes in v1:
- rework how the resource is allocated and released
- rework the allocation so that it happens only when the resource is requested
- rework the structure shared between the tool and Xen to make it extensible to
  new counters and declare it in a public header

There are still the following questions:
   - consumer may fetch inconsistent data
   - resource shall be released when there are no more readers, otherwise we
 keep updating it during a hot path
   - one frame can host up to 512 vcpus. Should I check against this limit when
 updating? Should it be possible to allocate more than one frame for vcpu
 counters?

Matias Ezequiel Vara Larsen (2):
  xen/memory : Add a stats_table resource type
  tools/misc: Add xen-vcpus-stats tool

 tools/misc/Makefile  |  6 +++
 tools/misc/xen-vcpus-stats.c | 76 +
 xen/arch/x86/hvm/hvm.c   |  2 +
 xen/common/memory.c  | 94 
 xen/common/sched/core.c  |  7 +++
 xen/include/public/memory.h  |  3 ++
 xen/include/public/vcpu.h| 10 
 xen/include/xen/mm.h |  2 +
 xen/include/xen/sched.h  |  5 ++
 9 files changed, 205 insertions(+)
 create mode 100644 tools/misc/xen-vcpus-stats.c

-- 
2.25.1




Re: [RFC PATCH 1/2] xen/memory : Add stats_table resource type

2022-06-29 Thread Matias Ezequiel Vara Larsen
On Fri, Jun 17, 2022 at 08:54:34PM +, George Dunlap wrote:
> Preface: It looks like this may be one of your first hypervisor patches.  So 
> thank you!  FYI there’s a lot that needs fixing up here; please don’t read a 
> tone of annoyance into it, I’m just trying to tell you what needs fixing in 
> the most efficient manner. :-)
> 

Thanks for the comments :) I have addressed some of them already in the
response to the cover letter.

> > On 17 May 2022, at 15:33, Matias Ezequiel Vara Larsen 
> >  wrote:
> > 
> > Allow to map vcpu stats using acquire_resource().
> 
> This needs a lot more expansion in terms of what it is that you’re doing in 
> this patch and why.
> 

Got it. I'll improve the commit message in v1.

> > 
> > Signed-off-by: Matias Ezequiel Vara Larsen 
> > ---
> > xen/common/domain.c | 42 +
> > xen/common/memory.c | 29 +
> > xen/common/sched/core.c |  5 +
> > xen/include/public/memory.h |  1 +
> > xen/include/xen/sched.h |  5 +
> > 5 files changed, 82 insertions(+)
> > 
> > diff --git a/xen/common/domain.c b/xen/common/domain.c
> > index 17cc32fde3..ddd9f88874 100644
> > --- a/xen/common/domain.c
> > +++ b/xen/common/domain.c
> > @@ -132,6 +132,42 @@ static void vcpu_info_reset(struct vcpu *v)
> > v->vcpu_info_mfn = INVALID_MFN;
> > }
> > 
> > +static void stats_free_buffer(struct vcpu * v)
> > +{
> > +struct page_info *pg = v->stats.pg;
> > +
> > +if ( !pg )
> > +return;
> > +
> > +v->stats.va = NULL;
> > +
> > +if ( v->stats.va )
> > +unmap_domain_page_global(v->stats.va);
> > +
> > +v->stats.va = NULL;
> 
> Looks like you meant to delete the first `va->stats.va = NULL` but forgot?
> 

Apologies for this, I completely missed.

> > +
> > +free_domheap_page(pg);
> 
> Pretty sure this will crash.
> 
> Unfortunately page allocation and freeing is somewhat complicated and 
> requires a bit of boilerplate.  You can look at the vmtrace_alloc_buffer() 
> code for a template, but the general sequence you want is as follows:
> 
> * On the allocate side:
> 
> 1. Allocate the page
> 
>pg = alloc_domheap_page(d, MEMF_no_refcount);
> 
> This will allocate a page with the PGC_allocated bit set and a single 
> reference count.  (If you pass a page with PGC_allocated set to 
> free_domheap_page(), it will crash; which is why I said the above.)
> 
> 2.  Grab a general reference count for the vcpu struct’s reference to it; as 
> well as a writable type count, so that it doesn’t get used as a special page
> 
> if ( !get_page_and_type(pg, d, PGT_writable_page) ) {
>   put_page_alloc_ref(pg);
>   /* failure path */
> }
> 
> * On the free side, don’t call free_domheap_pages() directly.  Rather, drop 
> the allocation, then drop your own type count, thus:
> 
> v->stats.va = NULL;
> 
> put_page_alloc_ref(pg);
> put_page_and_type(pg);
> 
> The issue here is that we can’t free the page until all references have 
> dropped; and the whole point of this exercise is to allow guest userspace in 
> dom0 to gain a reference to the page.  We can’t actually free the page until 
> *all* references are gone, including the userspace one in dom0.  The 
> put_page() which brings the reference count to 0 will automatically free the 
> page.
> 

Thanks for the explanation. For some reason, this is not crashing. I will
reimplement the allocation and release paths, and then update the documentation
that I proposed at
https://lists.xenproject.org/archives/html/xen-devel/2022-05/msg01963.html. The
idea of this reference document is to guide someone that would like to export a
new resource by relying on the acquire-resource interface.
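
For the record, a condensed sketch of the pairing described above (names are
the existing Xen MM API, mirroring vmtrace_alloc_buffer(); this is my reading
of the sequence, not code from the series):

/* Allocation: the page carries PGC_allocated plus one writable type ref. */
pg = alloc_domheap_page(d, MEMF_no_refcount);
if ( !pg )
    return -ENOMEM;
if ( !get_page_and_type(pg, d, PGT_writable_page) )
{
    put_page_alloc_ref(pg);
    return -ENODATA;
}

/* Free: drop both references; the page is only really freed once every
 * reference, including dom0's mapping, is gone. */
v->stats.va = NULL;
put_page_alloc_ref(pg);
put_page_and_type(pg);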

> 
> > +}
> > +
> > +static int stats_alloc_buffer(struct vcpu *v)
> > +{
> > +struct domain *d = v->domain;
> > +struct page_info *pg;
> > +
> > +pg = alloc_domheap_page(d, MEMF_no_refcount);
> > +
> > +if ( !pg )
> > +return -ENOMEM;
> > +
> > +v->stats.va = __map_domain_page_global(pg);
> > +if ( !v->stats.va )
> > +return -ENOMEM;
> > +
> > +v->stats.pg = pg;
> > +clear_page(v->stats.va);
> > +return 0;
> > +}
> 
> The other thing to say about this is that the memory is being allocated 
> unconditionally, even if nobody is planning to read it.  The vast majority of 
> Xen users will not be running xcp-rrd, so it will be pointless overhead.
>

Re: [RFC PATCH 0/2] Add a new acquire resource to query vcpu statistics

2022-06-24 Thread Matias Ezequiel Vara Larsen
Hello George and thanks for the review! You will find my comments below.

On Fri, Jun 17, 2022 at 07:54:32PM +, George Dunlap wrote:
> 
> 
> > On 17 May 2022, at 15:33, Matias Ezequiel Vara Larsen 
> >  wrote:
> > 
> > Hello all,
> > 
> > The purpose of this RFC is to get feedback about a new acquire resource that
> > exposes vcpu statistics for a given domain. The current mechanism to get 
> > those
> > statistics is by querying the hypervisor. This mechanism relies on a 
> > hypercall
> > and holds the domctl spinlock during its execution. When a pv tool like 
> > xcp-rrdd
> > periodically samples these counters, it ends up affecting other paths that 
> > share
> > that spinlock. By using acquire resources, the pv tool only requires a few
> > hypercalls to set the shared memory region and samples are got without 
> > issuing
> > any other hypercall. The original idea has been suggested by Andrew Cooper 
> > to
> > which I have been discussing about how to implement the current PoC. You can
> > find the RFC patch series at [1]. The series is rebased on top of 
> > stable-4.15.
> > 
> > I am currently a bit blocked on 1) what to expose and 2) how to expose it. 
> > For
> > 1), I decided to expose what xcp-rrdd is querying, e.g., 
> > XEN_DOMCTL_getvcpuinfo.
> > More precisely, xcp-rrd gets runstate.time[RUNSTATE_running]. This is a 
> > uint64_t
> > counter. However, the time spent in other states may be interesting too.
> > Regarding 2), I am not sure if simply using an array of uint64_t is enough 
> > or if
> > a different interface should be exposed. The remaining question is when to 
> > get
> > new values. For the moment, I am updating this counter during
> > vcpu_runstate_change().
> > 
> > The current series includes a simple pv tool that shows how this new 
> > interface is
> > used. This tool maps the counter and periodically samples it.
> > 
> > Any feedback/help would be appreciated.
> 
> Hey Matias,
> 
> Sorry it’s taken so long to get back to you.  My day-to-day job has shifted 
> away from technical things to community management; this has been on my radar 
> but I never made time to dig into it.
> 
> There are some missing details I’ve had to try to piece together about the 
> situation, so let me sketch things out a bit further and see if I understand 
> the situation:
> 
> * xcp-rrd currently wants (at minimum) to record 
> runstate.time[RUNSTATE_running] for each vcpu.  Currently that means calling 
> XEN_DOMCTL_getvcpuinfo, which has to hold a single global domctl_lock (!) for 
> the entire hypercall; and of course must be iterated over every vcpu in the 
> system for every update.
> 

For example, in xcp-ng, xcp-rrdd samples all the VCPUs of the system every 5
seconds. Also, xcp-rrdd queries other counters like XEN_DOMCTL_getdomaininfo.

Out of curiosity, do you know of any benchmark to measure the impact of this
querying? My guess is that the time of domctl-based operations would increase
with the number of VCPUs. In such a scenario, I suppose that the time to query
all vcpus increases with the number of vcpus, thus holding the domctl_lock
longer. However, this would only be observable in a large enough system.
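
As a rough way to test that guess, something like the following sketch could
time the domctl path (xc_vcpu_getinfo() is libxc's wrapper around
XEN_DOMCTL_getvcpuinfo; the hard-coded domid/vcpu are just for illustration):

#include <stdio.h>
#include <time.h>
#include <xenctrl.h>

int main(void)
{
    xc_interface *xch = xc_interface_open(NULL, NULL, 0);
    xc_vcpuinfo_t info;
    struct timespec t0, t1;
    const int iters = 100000, domid = 1, vcpu = 0;

    /* Time back-to-back XEN_DOMCTL_getvcpuinfo calls for one vcpu. */
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for ( int i = 0; i < iters; i++ )
        xc_vcpu_getinfo(xch, domid, vcpu, &info);
    clock_gettime(CLOCK_MONOTONIC, &t1);

    printf("avg ns/call: %.0f\n",
           ((t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec)) / iters);
    xc_interface_close(xch);
    return 0;
}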

> * VCPUOP_get_runstate_info copies out a vcpu_runstate_info struct, which 
> contains information on the other runstates.  Additionally, 
> VCPUOP_register_runstate_memory_area already does something similar to what 
> you want: it passes a virtual address to Xen, which Xen maps, and copies 
> information about the various vcpus into (in update_runstate_area()).
> 
> * However, the above assumes a domain of “current->domain”: That is a domain 
> can call VCPUOP_get_runstate_info on one of its own vcpus, but dom0 cannot 
> call it to get information about the vcpus of other domains.
> 
> * Additionally, VCPUOP_register_runstate_memory_area registers by *virtual 
> address*; this is actually problematic even for guest kernels looking at 
> their own vcpus; but would be completely inappropriate for a dom0 userspace 
> application, which is what you’re looking at.
> 

I just learned about VCPUOP_register_runstate_memory_area a few days ago. I did
not know that it is only for current->domain. Thanks for pointing it out.

> Your solution is to expose things via the xenforeignmemory interface instead, 
> modelled after the vmtrace_buf functionality.
> 
> Does that all sound right?
> 

That's correct. I used vmtrace_buf functionality for inspiration.

> I think at a high level that’s probably the right way to go.
> 
> As you say, my default would be to expose similar information 

Re: [RFC PATCH 2/2] tools/misc: Add xen-stats tool

2022-06-20 Thread Matias Ezequiel Vara Larsen
Hello Anthony, 

On Fri, Jun 03, 2022 at 01:08:20PM +0200, Matias Ezequiel Vara Larsen wrote:
> Hello Anthony and thanks for your comments. I addressed them below:
> 
> On Tue, May 31, 2022 at 12:16:02PM +0100, Anthony PERARD wrote:
> > Hi Matias,
> > 
> > On Tue, May 17, 2022 at 04:33:15PM +0200, Matias Ezequiel Vara Larsen wrote:
> > > Add a demostration tool that uses the stats_table resource to
> > > query vcpu time for a DomU.
> > > 
> > > Signed-off-by: Matias Ezequiel Vara Larsen 
> > > ---
> > > diff --git a/tools/misc/Makefile b/tools/misc/Makefile
> > > index 2b683819d4..b510e3aceb 100644
> > > --- a/tools/misc/Makefile
> > > +++ b/tools/misc/Makefile
> > > @@ -135,4 +135,9 @@ xencov: xencov.o
> > >  xen-ucode: xen-ucode.o
> > >   $(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(APPEND_LDFLAGS)
> > >  
> > > +xen-stats.o: CFLAGS += $(CFLAGS_libxenforeginmemory)
> > > +
> > > +xen-stats: xen-stats.o
> > 
> > The tools seems to be only about vcpu, maybe `xen-stats` is a bit too
> > generic. Would `xen-vcpus-stats`, or maybe something with `time` would
> > be better?
> 
> Do you think `xen-vcpus-stats` would be good enough?
> 

I will pick up `xen-vcpus-stats` for v1 if you are not against it.

Thanks,

Matias

> > Also, is it a tools that could be useful enough to be installed by
> > default? Should we at least build it by default so it doesn't rot? (By
> > adding it to only $(TARGETS).)
> 
> I will add to build this tool by default in the next patches' version.
>  
> > > + $(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) 
> > > $(LDLIBS_libxenforeignmemory) $(APPEND_LDFLAGS)
> > > +
> > >  -include $(DEPS_INCLUDE)
> > > diff --git a/tools/misc/xen-stats.c b/tools/misc/xen-stats.c
> > > new file mode 100644
> > > index 00..5d4a3239cc
> > > --- /dev/null
> > > +++ b/tools/misc/xen-stats.c
> > > @@ -0,0 +1,83 @@
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +#include 
> > > +
> > > +#include 
> > 
> > It seems overkill to use this header when the tool only use
> > xenforeignmemory interface. But I don't know how to replace
> > XC_PAGE_SHIFT, so I guess that's ok.
> > 
> > > +#include 
> > > +#include 
> > 
> > What do you use this headers for? Is it left over?
> 
> `xenforeignmemory.h` is used for `xenforeignmemory_*` functions.
> `xen-tools/libs.h` is left over so I will remove it in next version.
> 
> > > +static sig_atomic_t interrupted;
> > > +static void close_handler(int signum)
> > > +{
> > > +interrupted = 1;
> > > +}
> > > +
> > > +int main(int argc, char **argv)
> > > +{
> > > +xenforeignmemory_handle *fh;
> > > +xenforeignmemory_resource_handle *res;
> > > +size_t size;
> > > +int rc, nr_frames, domid, frec, vcpu;
> > > +uint64_t * info;
> > > +struct sigaction act;
> > > +
> > > +if (argc != 4 ) {
> > > +fprintf(stderr, "Usage: %s <domid> <vcpu> <frec>\n", argv[0]);
> > > +return 1;
> > > +}
> > > +
> > > +// TODO: this depends on the resource
> > > +nr_frames = 1;
> > > +
> > > +domid = atoi(argv[1]);
> > > +frec = atoi(argv[3]);
> > > +vcpu = atoi(argv[2]);
> > 
> > Can you swap the last two line? I think it is better if the order is the 
> > same
> > as on the command line.
> 
> Yes, I can.
> 
> > > +
> > > +act.sa_handler = close_handler;
> > > +act.sa_flags = 0;
> > > +sigemptyset(&act.sa_mask);
> > > +sigaction(SIGHUP,  &act, NULL);
> > > +sigaction(SIGTERM, &act, NULL);
> > > +sigaction(SIGINT,  &act, NULL);
> > > +sigaction(SIGALRM, &act, NULL);
> > > +
> > > +fh = xenforeignmemory_open(NULL, 0);
> > > +
> > > +if ( !fh )
> > > +err(1, "xenforeignmemory_open");
> > > +
> > > +rc = xenforeignmemory_resource_size(
> > > +fh, domid, XENMEM_resource_stats_table,
> > > +vcpu, &size);
> > > +
> > > +if ( rc )
> > > +err(1, "Fail: Get size: %d - %s\n", errno, strerror(errno));
> > 
> > It seems that err() already does print strerror(), and add a "\n", so
> > why print it again? Also, if we have strerror(), what the point of
> > printing "errno"?
> 
> I will remove errno, strerror(errno), and the extra "\n".
> 
> > Also, I'm not sure the extra indentation in the error message is really
> > useful, but that doesn't really matter.
> 
> I will remove the indentation.
> 
> > > +
> > > +if ( (size >> XC_PAGE_SHIFT) != nr_frames )
> > > +err(1, "Fail: Get size: expected %u frames, got %zu\n",
> > > +nr_frames, size >> XC_PAGE_SHIFT);
> > 
> > err() prints strerror(errno), maybe errx() is better here. 
> 
> I will use errx().
> 
> Thanks,
>  
> > 
> > Thanks,
> > 
> > -- 
> > Anthony PERARD



Re: [RFC PATCH 2/2] tools/misc: Add xen-stats tool

2022-06-03 Thread Matias Ezequiel Vara Larsen
Hello Anthony and thanks for your comments. I addressed them below:

On Tue, May 31, 2022 at 12:16:02PM +0100, Anthony PERARD wrote:
> Hi Matias,
> 
> On Tue, May 17, 2022 at 04:33:15PM +0200, Matias Ezequiel Vara Larsen wrote:
> > Add a demostration tool that uses the stats_table resource to
> > query vcpu time for a DomU.
> > 
> > Signed-off-by: Matias Ezequiel Vara Larsen 
> > ---
> > diff --git a/tools/misc/Makefile b/tools/misc/Makefile
> > index 2b683819d4..b510e3aceb 100644
> > --- a/tools/misc/Makefile
> > +++ b/tools/misc/Makefile
> > @@ -135,4 +135,9 @@ xencov: xencov.o
> >  xen-ucode: xen-ucode.o
> > $(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(APPEND_LDFLAGS)
> >  
> > +xen-stats.o: CFLAGS += $(CFLAGS_libxenforeginmemory)
> > +
> > +xen-stats: xen-stats.o
> 
> The tools seems to be only about vcpu, maybe `xen-stats` is a bit too
> generic. Would `xen-vcpus-stats`, or maybe something with `time` would
> be better?

Do you think `xen-vcpus-stats` would be good enough?

> Also, is it a tools that could be useful enough to be installed by
> default? Should we at least build it by default so it doesn't rot? (By
> adding it to only $(TARGETS).)

I will add to build this tool by default in the next patches' version.
 
> > +   $(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) 
> > $(LDLIBS_libxenforeignmemory) $(APPEND_LDFLAGS)
> > +
> >  -include $(DEPS_INCLUDE)
> > diff --git a/tools/misc/xen-stats.c b/tools/misc/xen-stats.c
> > new file mode 100644
> > index 00..5d4a3239cc
> > --- /dev/null
> > +++ b/tools/misc/xen-stats.c
> > @@ -0,0 +1,83 @@
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +
> > +#include 
> 
> It seems overkill to use this header when the tool only use
> xenforeignmemory interface. But I don't know how to replace
> XC_PAGE_SHIFT, so I guess that's ok.
> 
> > +#include 
> > +#include 
> 
> What do you use this headers for? Is it left over?

`xenforeignmemory.h` is used for `xenforeignmemory_*` functions.
`xen-tools/libs.h` is left over so I will remove it in next version.

> > +static sig_atomic_t interrupted;
> > +static void close_handler(int signum)
> > +{
> > +interrupted = 1;
> > +}
> > +
> > +int main(int argc, char **argv)
> > +{
> > +xenforeignmemory_handle *fh;
> > +xenforeignmemory_resource_handle *res;
> > +size_t size;
> > +int rc, nr_frames, domid, frec, vcpu;
> > +uint64_t * info;
> > +struct sigaction act;
> > +
> > +if (argc != 4 ) {
> > +fprintf(stderr, "Usage: %s <domid> <vcpu> <frec>\n", argv[0]);
> > +return 1;
> > +}
> > +
> > +// TODO: this depends on the resource
> > +nr_frames = 1;
> > +
> > +domid = atoi(argv[1]);
> > +frec = atoi(argv[3]);
> > +vcpu = atoi(argv[2]);
> 
> Can you swap the last two line? I think it is better if the order is the same
> as on the command line.

Yes, I can.

> > +
> > +act.sa_handler = close_handler;
> > +act.sa_flags = 0;
> > +sigemptyset(&act.sa_mask);
> > +sigaction(SIGHUP,  &act, NULL);
> > +sigaction(SIGTERM, &act, NULL);
> > +sigaction(SIGINT,  &act, NULL);
> > +sigaction(SIGALRM, &act, NULL);
> > +
> > +fh = xenforeignmemory_open(NULL, 0);
> > +
> > +if ( !fh )
> > +err(1, "xenforeignmemory_open");
> > +
> > +rc = xenforeignmemory_resource_size(
> > +fh, domid, XENMEM_resource_stats_table,
> > +vcpu, &size);
> > +
> > +if ( rc )
> > +err(1, "Fail: Get size: %d - %s\n", errno, strerror(errno));
> 
> It seems that err() already prints strerror() and adds a "\n", so
> why print it again? Also, if we have strerror(), what's the point of
> printing errno?

I will remove errno, strerror(errno), and the extra "\n".

> Also, I'm not sure the extra indentation in the error message is really
> useful, but that doesn't really matter.

I will remove the indentation.

> > +
> > +if ( (size >> XC_PAGE_SHIFT) != nr_frames )
> > +err(1, "Fail: Get size: expected %u frames, got %zu\n",
> > +nr_frames, size >> XC_PAGE_SHIFT);
> 
> err() prints strerror(errno), maybe errx() is better here. 

I will use errx().
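
For the record, the difference is just whether strerror(errno) gets
appended to the message. A sketch (not part of the patch):

    if ( rc )
        err(1, "Fail: Get size");   /* appends ": <strerror(errno)>" */

    if ( (size >> XC_PAGE_SHIFT) != nr_frames )
        errx(1, "Fail: Get size: expected %u frames, got %zu",
             nr_frames, size >> XC_PAGE_SHIFT);  /* errno not used */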

Thanks,
 
> 
> Thanks,
> 
> -- 
> Anthony PERARD



[RFC PATCH] xen/docs: Document acquire resource interface

2022-05-24 Thread Matias Ezequiel Vara Larsen
This commit creates a new doc to document the acquire resource interface. This
is a reference document.

Signed-off-by: Matias Ezequiel Vara Larsen 
---
RFC: The current document still contains TODOs. I am not really sure why
different resources are implemented differently. I would like to understand it
better so I can document it and then easily build new resources. I structured
the document in two sections but I am not sure if that is the right way to do
it.

---
 .../acquire_resource_reference.rst| 337 ++
 docs/hypervisor-guide/index.rst   |   2 +
 2 files changed, 339 insertions(+)
 create mode 100644 docs/hypervisor-guide/acquire_resource_reference.rst

diff --git a/docs/hypervisor-guide/acquire_resource_reference.rst b/docs/hypervisor-guide/acquire_resource_reference.rst
new file mode 100644
index 00..a9944aae1d
--- /dev/null
+++ b/docs/hypervisor-guide/acquire_resource_reference.rst
@@ -0,0 +1,337 @@
+.. SPDX-License-Identifier: CC-BY-4.0
+
+Acquire resource reference
+==========================
+
+The acquire resource interface allows you to share a resource between a domain
+and a dom0 pv tool. Resources are generally represented by pages that are
+mapped into the pv tool's memory space. These pages are accessed by Xen and may
+or may not be accessed by the DomU itself. This document describes the API used
+to build pv tools. It also describes the software components required to create
+and expose a domain's resource. This is not a tutorial or a how-to guide; it
+merely describes the machinery that already exists in the code.
+
+.. warning::
+
+The code in this document may already be out of date; however, it should
+be enough to illustrate how the acquire resource interface works.
+
+
+PV tool API
+-----------
+
+This section describes the API used to map a resource from a pv tool. The API
+is based on the following functions:
+
+* xenforeignmemory_open()
+
+* xenforeignmemory_resource_size()
+
+* xenforeignmemory_map_resource()
+
+* xenforeignmemory_unmap_resource()
+
+The ``xenforeignmemory_open()`` function gets the handle that is used by the
+rest of the functions:
+
+.. code-block:: c
+
+   fh = xenforeignmemory_open(NULL, 0);
+
+The ``xenforeignmemory_resource_size()`` function gets the size of the resource.
+For example, in the following code, we get the size of the
+``XENMEM_resource_vmtrace_buf`` resource:
+
+.. code-block:: c
+
+rc = xenforeignmemory_resource_size(fh, domid, XENMEM_resource_vmtrace_buf,
+vcpu, &size);
+
+The size of the resource is returned in ``size`` in bytes.
+
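+A caller typically converts this byte count into a number of frames before
+mapping. For example (a sketch; ``XC_PAGE_SHIFT`` comes from ``xenctrl.h``):
+
+.. code-block:: c
+
+nr_frames = size >> XC_PAGE_SHIFT;
+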
+The ``xenforeignmemory_map_resource()`` function maps a domain's resource. The
+function is declared as follows:
+
+.. code-block:: c
+
+xenforeignmemory_resource_handle *xenforeignmemory_map_resource(
+xenforeignmemory_handle *fmem, domid_t domid, unsigned int type,
+unsigned int id, unsigned long frame, unsigned long nr_frames,
+void **paddr, int prot, int flags);
+
+The size of the resource is given as a number of frames, not bytes. For
+example, **QEMU** uses this function to map the ioreq server pages shared
+between the domain and QEMU:
+
+.. code-block:: c
+
+fres = xenforeignmemory_map_resource(xen_fmem, xen_domid,
+XENMEM_resource_ioreq_server, state->ioservid, 0, 2, &addr,
+PROT_READ | PROT_WRITE, 0);
+
+
+The third parameter corresponds to the resource that we request from the
+domain, e.g., ``XENMEM_resource_ioreq_server``. The seventh parameter is a
+pointer through which the address of the mapped resource is returned.
+
+Finally, the ``xenforeignmemory_unmap_resource()`` function unmaps the region:
+
+.. code-block:: c
+:caption: tools/misc/xen-vmtrace.c
+
+if ( fres && xenforeignmemory_unmap_resource(fh, fres) )
+perror("xenforeignmemory_unmap_resource()");
+
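+Putting the four calls together, a minimal pv tool looks roughly as follows.
+This is a sketch only: error handling is trimmed, and ``domid``, ``type`` and
+``id`` stand in for a concrete domain and resource:
+
+.. code-block:: c
+
+xenforeignmemory_handle *fh = xenforeignmemory_open(NULL, 0);
+xenforeignmemory_resource_handle *res;
+size_t size;
+void *addr = NULL;
+
+/* query the size in bytes, then map the resource read-only */
+xenforeignmemory_resource_size(fh, domid, type, id, &size);
+res = xenforeignmemory_map_resource(fh, domid, type, id, 0,
+size >> XC_PAGE_SHIFT, &addr, PROT_READ, 0);
+
+/* ... read the shared pages through addr ... */
+
+xenforeignmemory_unmap_resource(fh, res);
+xenforeignmemory_close(fh);
+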
+Sharing a resource with a pv tool
+---------------------------------
+
+In this section, we describe how to build a new resource and share it with a pv
+tool. Resources are defined in ``xen/include/public/memory.h``. In Xen-4.16,
+there are three resources:
+
+.. code-block:: c
+:caption: xen/include/public/memory.h
+
+#define XENMEM_resource_ioreq_server 0
+#define XENMEM_resource_grant_table 1
+#define XENMEM_resource_vmtrace_buf 2
+
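+Adding a new resource type starts with defining a new identifier here; for
+example, a (hypothetical) per-vcpu statistics resource could be:
+
+.. code-block:: c
+
+#define XENMEM_resource_stats_table 3
+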
+The ``resource_max_frames()`` function returns the size of a resource. Each
+resource type may provide its own handler to compute the size. This is the
+definition of the ``resource_max_frames()`` function:
+
+.. code-block:: c
+:linenos:
+:caption: xen/common/memory.c
+
+static unsigned int resource_max_frames(const struct domain *d,
+unsigned int type, unsigned int id)
+{
+switch ( type )
+{
+case XENMEM_resource_grant_table:
+return gnttab_resource_max_frames(d, id);
+
+case XENMEM_resource_ioreq_server:
+return ioreq_server_max_frames(d);
+
+case XENMEM_resource_vmtrace_buf:
+return d->vmtrace_size >> PAGE_SHIFT;

[RFC PATCH 1/2] xen/memory : Add stats_table resource type

2022-05-17 Thread Matias Ezequiel Vara Larsen
Allow mapping vcpu stats using acquire_resource().

Signed-off-by: Matias Ezequiel Vara Larsen 
---
 xen/common/domain.c | 42 +
 xen/common/memory.c | 29 +
 xen/common/sched/core.c |  5 +
 xen/include/public/memory.h |  1 +
 xen/include/xen/sched.h |  5 +
 5 files changed, 82 insertions(+)

diff --git a/xen/common/domain.c b/xen/common/domain.c
index 17cc32fde3..ddd9f88874 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -132,6 +132,42 @@ static void vcpu_info_reset(struct vcpu *v)
 v->vcpu_info_mfn = INVALID_MFN;
 }
 
+static void stats_free_buffer(struct vcpu * v)
+{
+struct page_info *pg = v->stats.pg;
+
+if ( !pg )
+return;
+
+if ( v->stats.va )
+unmap_domain_page_global(v->stats.va);
+
+v->stats.va = NULL;
+
+free_domheap_page(pg);
+}
+
+static int stats_alloc_buffer(struct vcpu *v)
+{
+struct domain *d = v->domain;
+struct page_info *pg;
+
+pg = alloc_domheap_page(d, MEMF_no_refcount);
+
+if ( !pg )
+return -ENOMEM;
+
+v->stats.va = __map_domain_page_global(pg);
+if ( !v->stats.va )
+{
+free_domheap_page(pg);
+return -ENOMEM;
+}
+
+v->stats.pg = pg;
+clear_page(v->stats.va);
+return 0;
+}
+
 static void vmtrace_free_buffer(struct vcpu *v)
 {
 const struct domain *d = v->domain;
@@ -203,6 +239,9 @@ static int vmtrace_alloc_buffer(struct vcpu *v)
  */
 static int vcpu_teardown(struct vcpu *v)
 {
+
+stats_free_buffer(v);
+
 vmtrace_free_buffer(v);
 
 return 0;
@@ -269,6 +308,9 @@ struct vcpu *vcpu_create(struct domain *d, unsigned int vcpu_id)
 if ( vmtrace_alloc_buffer(v) != 0 )
 goto fail_wq;
 
+if ( stats_alloc_buffer(v) != 0 )
+goto fail_wq;
+
 if ( arch_vcpu_create(v) != 0 )
 goto fail_sched;
 
diff --git a/xen/common/memory.c b/xen/common/memory.c
index 297b98a562..39de6d9d05 100644
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -1099,6 +1099,10 @@ static unsigned int resource_max_frames(const struct domain *d,
 case XENMEM_resource_vmtrace_buf:
 return d->vmtrace_size >> PAGE_SHIFT;
 
+// WIP: to figure out the correct size of the resource
+case XENMEM_resource_stats_table:
+return 1;
+
 default:
 return -EOPNOTSUPP;
 }
@@ -1162,6 +1166,28 @@ static int acquire_vmtrace_buf(
 return nr_frames;
 }
 
+static int acquire_stats_table(struct domain *d,
+unsigned int id,
+unsigned int frame,
+unsigned int nr_frames,
+xen_pfn_t mfn_list[])
+{
+const struct vcpu *v = domain_vcpu(d, id);
+mfn_t mfn;
+
+if ( !v )
+return -ENOENT;
+
+if ( !v->stats.pg )
+return -EINVAL;
+
+mfn = page_to_mfn(v->stats.pg);
+mfn_list[0] = mfn_x(mfn);
+
+printk("acquire_perf_table: id: %d, nr_frames: %d, %p, domainid: %d\n", 
id, nr_frames, v->stats.pg, d->domain_id);
+return 1;
+}
+
 /*
  * Returns -errno on error, or positive in the range [1, nr_frames] on
 * success.  Returning less than nr_frames constitutes a request for a
@@ -1182,6 +1208,9 @@ static int _acquire_resource(
 case XENMEM_resource_vmtrace_buf:
 return acquire_vmtrace_buf(d, id, frame, nr_frames, mfn_list);
 
+case XENMEM_resource_stats_table:
+return acquire_stats_table(d, id, frame, nr_frames, mfn_list);
+
 default:
 return -EOPNOTSUPP;
 }
diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
index 8f4b1ca10d..2a8b534977 100644
--- a/xen/common/sched/core.c
+++ b/xen/common/sched/core.c
@@ -264,6 +264,7 @@ static inline void vcpu_runstate_change(
 {
 s_time_t delta;
 struct sched_unit *unit = v->sched_unit;
+uint64_t * runstate;
 
 ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock));
 if ( v->runstate.state == new_state )
@@ -287,6 +288,10 @@ static inline void vcpu_runstate_change(
 }
 
 v->runstate.state = new_state;
+
+// WIP: use a different interface
+runstate = (uint64_t*)v->stats.va;
+memcpy(runstate, &v->runstate.time[0], sizeof(v->runstate.time[0]));
 }
 
 void sched_guest_idle(void (*idle) (void), unsigned int cpu)
diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
index 50e73eef98..752fd0be0f 100644
--- a/xen/include/public/memory.h
+++ b/xen/include/public/memory.h
@@ -626,6 +626,7 @@ struct xen_mem_acquire_resource {
 #define XENMEM_resource_ioreq_server 0
 #define XENMEM_resource_grant_table 1
 #define XENMEM_resource_vmtrace_buf 2
+#define XENMEM_resource_stats_table 3
 
 /*
  * IN - a type-specific resource identifier, which must be zero
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 5485d08afb..bc99adea7e 100644
--- a/xen/include/xen/sched.h

[RFC PATCH 0/2] Add a new acquire resource to query vcpu statistics

2022-05-17 Thread Matias Ezequiel Vara Larsen
Hello all,

The purpose of this RFC is to get feedback about a new acquire resource that
exposes vcpu statistics for a given domain. The current mechanism to get those
statistics is by querying the hypervisor. This mechanism relies on a hypercall
and holds the domctl spinlock during its execution. When a pv tool like xcp-rrdd
periodically samples these counters, it ends up affecting other paths that share
that spinlock. By using acquire resources, the pv tool only requires a few
hypercalls to set up the shared memory region; samples are then read without
issuing any further hypercalls. The original idea was suggested by Andrew
Cooper, with whom I have been discussing how to implement the current PoC. You
can find the RFC patch series at [1]. The series is rebased on top of
stable-4.15.

I am currently a bit blocked on 1) what to expose and 2) how to expose it. For
1), I decided to expose what xcp-rrdd is querying, i.e., XEN_DOMCTL_getvcpuinfo.
More precisely, xcp-rrdd gets runstate.time[RUNSTATE_running]. This is a uint64_t
counter. However, the time spent in other states may be interesting too.
Regarding 2), I am not sure if simply using an array of uint64_t is enough or if
a different interface should be exposed. The remaining question is when to get
new values. For the moment, I am updating this counter during
vcpu_runstate_change().
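
One direction I am considering for 2), sketched below (not implemented in
this series), is to prefix the counters with a version field that the
hypervisor increments around every update, so the tool can detect and retry
torn reads. The layout and names here are hypothetical:

    struct vcpu_stats {
        uint32_t version;          /* odd while an update is in progress */
        uint32_t pad;
        uint64_t runstate_running; /* runstate.time[RUNSTATE_running] */
    };

    /* reader side, in the pv tool; s points at the mapped page */
    volatile struct vcpu_stats *s = map;
    uint32_t v1, v2;
    uint64_t running;

    do {
        v1 = s->version;
        /* a read barrier is needed here in a real implementation */
        running = s->runstate_running;
        v2 = s->version;
    } while ( v1 != v2 || (v1 & 1) );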

The current series includes a simple pv tool that shows how this new interface
is used. This tool maps the counter and periodically samples it.

Any feedback/help would be appreciated.

Thanks, Matias.

[1] https://github.com/MatiasVara/xen/tree/feature_stats

Matias Ezequiel Vara Larsen (2):
  xen/memory : Add stats_table resource type
  tools/misc: Add xen-stats tool

 tools/misc/Makefile |  5 +++
 tools/misc/xen-stats.c  | 83 +
 xen/common/domain.c | 42 +++
 xen/common/memory.c | 29 +
 xen/common/sched/core.c |  5 +++
 xen/include/public/memory.h |  1 +
 xen/include/xen/sched.h |  5 +++
 7 files changed, 170 insertions(+)
 create mode 100644 tools/misc/xen-stats.c

-- 
2.25.1




[RFC PATCH 2/2] tools/misc: Add xen-stats tool

2022-05-17 Thread Matias Ezequiel Vara Larsen
Add a demonstration tool that uses the stats_table resource to
query vcpu time for a DomU.

Signed-off-by: Matias Ezequiel Vara Larsen 
---
 tools/misc/Makefile|  5 +++
 tools/misc/xen-stats.c | 83 ++
 2 files changed, 88 insertions(+)
 create mode 100644 tools/misc/xen-stats.c

diff --git a/tools/misc/Makefile b/tools/misc/Makefile
index 2b683819d4..b510e3aceb 100644
--- a/tools/misc/Makefile
+++ b/tools/misc/Makefile
@@ -135,4 +135,9 @@ xencov: xencov.o
 xen-ucode: xen-ucode.o
$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(APPEND_LDFLAGS)
 
+xen-stats.o: CFLAGS += $(CFLAGS_libxenforeignmemory)
+
+xen-stats: xen-stats.o
+   $(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(LDLIBS_libxenforeignmemory) $(APPEND_LDFLAGS)
+
 -include $(DEPS_INCLUDE)
diff --git a/tools/misc/xen-stats.c b/tools/misc/xen-stats.c
new file mode 100644
index 00..5d4a3239cc
--- /dev/null
+++ b/tools/misc/xen-stats.c
@@ -0,0 +1,83 @@
+#include <err.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <signal.h>
+#include <unistd.h>
+
+#include <xenctrl.h>
+#include <xenforeignmemory.h>
+#include <xen-tools/libs.h>
+
+static sig_atomic_t interrupted;
+static void close_handler(int signum)
+{
+interrupted = 1;
+}
+
+int main(int argc, char **argv)
+{
+xenforeignmemory_handle *fh;
+xenforeignmemory_resource_handle *res;
+size_t size;
+int rc, nr_frames, domid, frec, vcpu;
+uint64_t * info;
+struct sigaction act;
+
+if (argc != 4 ) {
+fprintf(stderr, "Usage: %s <domid> <vcpu> <frec>\n", argv[0]);
+return 1;
+}
+
+// TODO: this depends on the resource
+nr_frames = 1;
+
+domid = atoi(argv[1]);
+frec = atoi(argv[3]);
+vcpu = atoi(argv[2]);
+
+act.sa_handler = close_handler;
+act.sa_flags = 0;
+sigemptyset(&act.sa_mask);
+sigaction(SIGHUP,  &act, NULL);
+sigaction(SIGTERM, &act, NULL);
+sigaction(SIGINT,  &act, NULL);
+sigaction(SIGALRM, &act, NULL);
+
+fh = xenforeignmemory_open(NULL, 0);
+
+if ( !fh )
+err(1, "xenforeignmemory_open");
+
+rc = xenforeignmemory_resource_size(
+fh, domid, XENMEM_resource_stats_table,
+vcpu, &size);
+
+if ( rc )
+err(1, "Fail: Get size: %d - %s\n", errno, strerror(errno));
+
+if ( (size >> XC_PAGE_SHIFT) != nr_frames )
+err(1, "Fail: Get size: expected %u frames, got %zu\n",
+nr_frames, size >> XC_PAGE_SHIFT);
+
+res = xenforeignmemory_map_resource(
+fh, domid, XENMEM_resource_stats_table,
+vcpu, 0, size >> XC_PAGE_SHIFT,
+(void **)&info, PROT_READ, 0);
+
+if ( !res )
+err(1, "Fail: Map %d - %s\n", errno, strerror(errno));
+
+while ( !interrupted ) {
+sleep(frec);
+printf("running cpu_time: %ld\n", *info);
+}
+
+rc = xenforeignmemory_unmap_resource(fh, res);
+if ( rc )
+err(1, "Fail: Unmap %d - %s\n", errno, strerror(errno));
+
+return 0;
+}
-- 
2.25.1