Re: [Xen-devel] [PATCH v3 1/2] efi: Fix allocation problems if ExitBootServices() fails

2015-06-02 Thread Ian Campbell
On Mon, 2015-06-01 at 14:20 -0700, Roy Franz wrote:
 On Mon, Jun 1, 2015 at 4:24 AM, Ian Campbell ian.campb...@citrix.com wrote:
  On Mon, 2015-06-01 at 12:10 +0100, Jan Beulich wrote:
   On 01.06.15 at 12:17, ross.lagerw...@citrix.com wrote:
   If calling ExitBootServices() fails, the required memory map size may
   have increased. When initially allocating the memory map, allocate a
   slightly larger buffer (by an arbitrary 8 entries) to fix this.
  
   The ARM code path was already allocating a larger buffer than required,
   so this moves the code to be common for all architectures.
  
   This was seen on the following machine when using the iscsidxe UEFI
   driver. The machine would consistently fail the first call to
   ExitBootServices().
   System Information
   Manufacturer: Supermicro
   Product Name: X10SLE-F/HF
   BIOS Information
   Vendor: American Megatrends Inc.
   Version: 2.00
   Release Date: 04/24/2014
  
   Signed-off-by: Ross Lagerwall ross.lagerw...@citrix.com
 
  Provided ARM folks are happy with the reduced increase,
 
  Hi Roy,
 
  This patch[0] turns a +PAGE_SIZE in efi_arch_allocate_mmap_buffer into a
  8 * efi_mdesc_size in the common code.
 
  The +PAGE_SIZE came from [1] so I think it is as arbitrary as the
  +8*sizeof here.
 
  IOW this change looks ok to me, what do you think?
 
 Yeah, this should be fine.  Most EFI allocations have page-size
 granularity within the firmware,
 so there wasn't much point doing something smaller.  I haven't
 actually used firmware that
 changed the memmap size on ExitBootServices, so that size was not
 based on any actual
 firmware's behavior.  The x86 allocations are done differently and are
 more size constrained,
 so a smaller value should be fine for common code.
 
 Roy
 
 Reviewed-by: Roy Franz roy.fr...@linaro.org

Thanks. On that basis:

Acked-by: Ian Campbell ian.campb...@citrix.com


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v4] efi: Avoid calling boot services after ExitBootServices()

2015-06-02 Thread Ross Lagerwall
After the first call to ExitBootServices(), avoid calling any boot
services by setting efi_bs to NULL and halting in blexit().

Signed-off-by: Ross Lagerwall ross.lagerw...@citrix.com
---

* Separated halt into an arch hook.
* Applies on top of the first patch from v3.
* Tested on x86, not sure if the ARM version is correct.

 xen/arch/arm/efi/efi-boot.h |  5 +
 xen/arch/x86/efi/efi-boot.h |  7 +++
 xen/common/efi/boot.c   | 13 ++---
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/xen/arch/arm/efi/efi-boot.h b/xen/arch/arm/efi/efi-boot.h
index 3297f27..47efdfc 100644
--- a/xen/arch/arm/efi/efi-boot.h
+++ b/xen/arch/arm/efi/efi-boot.h
@@ -522,6 +522,11 @@ static void __init efi_arch_blexit(void)
 efi_bs-FreePool(memmap);
 }
 
+static void __init efi_arch_halt(void)
+{
+stop_cpu();
+}
+
 static void __init efi_arch_load_addr_check(EFI_LOADED_IMAGE *loaded_image)
 {
 if ( (unsigned long)loaded_image-ImageBase  ((1  12) - 1) )
diff --git a/xen/arch/x86/efi/efi-boot.h b/xen/arch/x86/efi/efi-boot.h
index cd14c19..9f41793 100644
--- a/xen/arch/x86/efi/efi-boot.h
+++ b/xen/arch/x86/efi/efi-boot.h
@@ -614,6 +614,13 @@ static void __init efi_arch_blexit(void)
 efi_bs-FreePages(ucode.addr, PFN_UP(ucode.size));
 }
 
+static void __init efi_arch_halt(void)
+{
+local_irq_disable();
+for ( ; ; )
+halt();
+}
+
 static void __init efi_arch_load_addr_check(EFI_LOADED_IMAGE *loaded_image)
 {
 xen_phys_start = (UINTN)loaded_image-ImageBase;
diff --git a/xen/common/efi/boot.c b/xen/common/efi/boot.c
index 60c1b8d..4b816f2 100644
--- a/xen/common/efi/boot.c
+++ b/xen/common/efi/boot.c
@@ -216,6 +216,9 @@ static void __init noreturn blexit(const CHAR16 *str)
 PrintStr((CHAR16 *)str);
 PrintStr(newline);
 
+if ( !efi_bs )
+efi_arch_halt();
+
 if ( cfg.addr )
 efi_bs-FreePages(cfg.addr, PFN_UP(cfg.size));
 if ( kernel.addr )
@@ -1063,8 +1066,10 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE 
*SystemTable)
 for ( retry = 0; ; retry = 1 )
 {
 efi_memmap_size = map_alloc_size;
-status = efi_bs-GetMemoryMap(efi_memmap_size, efi_memmap, map_key,
-  efi_mdesc_size, mdesc_ver);
+status = SystemTable-BootServices-GetMemoryMap(efi_memmap_size,
+ efi_memmap, map_key,
+ efi_mdesc_size,
+ mdesc_ver);
 if ( EFI_ERROR(status) )
 PrintErrMesg(LCannot obtain memory map, status);
 
@@ -1073,7 +1078,9 @@ efi_start(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE 
*SystemTable)
 
 efi_arch_pre_exit_boot();
 
-status = efi_bs-ExitBootServices(ImageHandle, map_key);
+status = SystemTable-BootServices-ExitBootServices(ImageHandle,
+ map_key);
+efi_bs = NULL;
 if ( status != EFI_INVALID_PARAMETER || retry )
 break;
 }
-- 
2.1.0


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v1 COLO Pre 04/12] tools/libxl: introduce a new API libxl__domain_restore() to load qemu state

2015-06-02 Thread Yang Hongyang



On 06/02/2015 05:38 PM, Wen Congyang wrote:

On 06/02/2015 05:26 PM, Yang Hongyang wrote:

[...]


+int libxl__qmp_restore(libxl__gc *gc, int domid, const char *state_file)
+{
+libxl__json_object *args = NULL;
+
+qmp_parameters_add_string(gc, args, filename, state_file);
+
+return qmp_run_command(gc, domid, xen-load-devices-state, args,
+   NULL, NULL);


IIRC, this is a new qmp command. Post the patch for qemu together?


Sure, will post it to qemu upstream.



Thanks
Wen Congyang


+}
+
  static int qmp_change(libxl__gc *gc, libxl__qmp_handler *qmp,
char *device, char *target, char *arg)
  {



.



--
Thanks,
Yang.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH RFC v1] Modified RTDS scheduler to use an event-driven model instead of polling.

2015-06-02 Thread Dagaen Golomb
To do this, we create a new list that holds, for each
vcpu, the earliest future time at which it may need to be
rescheduled. The scheduler chooses the lowest time off of this
list and waits until the specified time instead of running every
1 ms as it did before.

Signed-off-by: Dagaen Golomb dgol...@seas.upenn.edu
Signed-off-by: Meng Xu men...@cis.upenn.edu
---
 xen/common/sched_rt.c |  319 ++---
 1 file changed, 222 insertions(+), 97 deletions(-)

diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
index 7c39a9e..25f0458 100644
--- a/xen/common/sched_rt.c
+++ b/xen/common/sched_rt.c
@@ -4,6 +4,7 @@
  *
  * by Sisu Xi, 2013, Washington University in Saint Louis
  * and Meng Xu, 2014, University of Pennsylvania
+ * and Dagaen Golomb, 2015, University of Pennsylvania
  *
  * based on the code of credit Scheduler
  */
@@ -134,6 +135,8 @@ struct rt_private {
 struct list_head sdom;  /* list of availalbe domains, used for dump */
 struct list_head runq;  /* ordered list of runnable vcpus */
 struct list_head depletedq; /* unordered list of depleted vcpus */
+struct list_head timerq;/* ascending list of next required scheduling 
+   decision */
 cpumask_t tickled;  /* cpus been tickled */
 };
 
@@ -143,6 +146,7 @@ struct rt_private {
 struct rt_vcpu {
 struct list_head q_elem;/* on the runq/depletedq list */
 struct list_head sdom_elem; /* on the domain VCPU list */
+struct list_head t_elem;/* on the timerq */
 
 /* Up-pointers */
 struct rt_dom *sdom;
@@ -156,6 +160,7 @@ struct rt_vcpu {
 s_time_t cur_budget;/* current budget */
 s_time_t last_start;/* last start time */
 s_time_t cur_deadline;  /* current deadline for EDF */
+s_time_t next_sched_needed; /* next time to make scheduling decision */
 
 unsigned flags; /* mark __RTDS_scheduled, etc.. */
 };
@@ -197,6 +202,11 @@ static inline struct list_head *rt_depletedq(const struct 
scheduler *ops)
 return rt_priv(ops)-depletedq;
 }
 
+static inline struct list_head *rt_timerq(const struct scheduler *ops)
+{
+return rt_priv(ops)-timerq;
+}
+
 /*
  * Queue helper functions for runq and depletedq
  */
@@ -212,6 +222,11 @@ __q_elem(struct list_head *elem)
 return list_entry(elem, struct rt_vcpu, q_elem);
 }
 
+static struct rt_vcpu * __t_elem(struct list_head *elem)
+{
+return list_entry(elem, struct rt_vcpu, t_elem);
+}
+
 /*
  * Debug related code, dump vcpu/cpu information
  */
@@ -231,7 +246,8 @@ rt_dump_vcpu(const struct scheduler *ops, const struct 
rt_vcpu *svc)
 
 cpumask_scnprintf(cpustr, sizeof(cpustr), svc-vcpu-cpu_hard_affinity);
 printk([%5d.%-2u] cpu %u, (%PRI_stime, %PRI_stime),
-cur_b=%PRI_stime cur_d=%PRI_stime last_start=%PRI_stime\n
+cur_b=%PRI_stime cur_d=%PRI_stime last_start=%PRI_stime
+next_sched=%PRI_stime\n
 \t\t onQ=%d runnable=%d cpu_hard_affinity=%s ,
 svc-vcpu-domain-domain_id,
 svc-vcpu-vcpu_id,
@@ -241,6 +257,7 @@ rt_dump_vcpu(const struct scheduler *ops, const struct 
rt_vcpu *svc)
 svc-cur_budget,
 svc-cur_deadline,
 svc-last_start,
+svc-next_sched_needed,
 __vcpu_on_q(svc),
 vcpu_runnable(svc-vcpu),
 cpustr);
@@ -264,7 +281,7 @@ rt_dump_pcpu(const struct scheduler *ops, int cpu)
 static void
 rt_dump(const struct scheduler *ops)
 {
-struct list_head *iter_sdom, *iter_svc, *runq, *depletedq, *iter;
+struct list_head *iter_sdom, *iter_svc, *runq, *depletedq, *timerq, *iter;
 struct rt_private *prv = rt_priv(ops);
 struct rt_vcpu *svc;
 struct rt_dom *sdom;
@@ -277,6 +294,7 @@ rt_dump(const struct scheduler *ops)
 
 runq = rt_runq(ops);
 depletedq = rt_depletedq(ops);
+timerq = rt_timerq(ops);
 
 printk(Global RunQueue info:\n);
 list_for_each( iter, runq )
@@ -292,6 +310,14 @@ rt_dump(const struct scheduler *ops)
 rt_dump_vcpu(ops, svc);
 }
 
+printk(Global TimerQueue info:\n);
+list_for_each( iter, timerq )
+{
+svc = __t_elem(iter);
+printk(\tvcpu %d next_sched=%PRI_stime\n, svc-vcpu-vcpu_id, 
+  svc-next_sched_needed);
+}
+
 printk(Domain info:\n);
 list_for_each( iter_sdom, prv-sdom )
 {
@@ -361,6 +387,12 @@ __q_remove(struct rt_vcpu *svc)
 list_del_init(svc-q_elem);
 }
 
+static inline void __t_remove(struct rt_vcpu *svc)
+{
+   if( !list_empty(svc-t_elem) )
+   list_del_init(svc-t_elem);
+}
+
 /*
  * Insert svc with budget in RunQ according to EDF:
  * vcpus with smaller deadlines go first.
@@ -395,6 +427,72 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu 
*svc)
 }
 
 /*
+ * Insert svc into the timerq, maintaining ascending order by 
next_sched_needed.
+ */
+static void __timerq_insert(const 

Re: [Xen-devel] Xen/arm: Virtual ITS command queue handling

2015-06-02 Thread Ian Campbell
On Mon, 2015-06-01 at 15:57 -0700, Manish Jaggi wrote:

  Anyway, the general shape of this plan seems plausible enough.
 Could you modify the http://xenbits.xen.org/people/ianc/vits/draftC.html(5 
 vITS to pITS mapping) based on this approach

I'm updating things as I go and feed back will be relected in the next
draft.


   -5- domU is booted with a single virtual its node in device tree. Front 
   end driver  attaches this its as msi-parent
   -6- When domU accesses for ITS are trapped in Xen, using the helper 
   function say
   get_phys_its_for_guest(guest_id, guest_sbdf, /*[out]*/its_ptr *its)
   
   its can be retrieved.
   AFAIK this is numa safe.
 2) When PCI device is assigned to DomU, how does domU choose
  vITS to send commands.  AFAIK, the BDF of assigned device
  is different from actual BDF in DomU.
AIUI this is described in the firmware tables.

e.g. in DT via the msi-parent phandle on the PCI root complex or
individual device.

Is there an assumption here that a single PCI root bridge is associated
with a single ITS block? Or can different devices on a PCI bus use
different ITS blocks?

Ian.


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel
  
 



___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [xen-unstable test] 57712: regressions - trouble: broken/fail/pass

2015-06-02 Thread Andrew Cooper
On 02/06/15 03:27, osstest service user wrote:
 flight 57712 xen-unstable real [real]
 http://logs.test-lab.xenproject.org/osstest/logs/57712/

 Regressions :-(

 Tests which did not succeed and are blocking,
 including tests which could not be run:
  test-armhf-armhf-xl-cubietruck 15 guest-start/debian.repeat fail REGR. vs. 
 57419

This looks new, but definitely not an issue caused by the changesets
listed below.

Jun  1 09:32:57.097721 [  957.892456] [ cut here ]
Jun  1 09:32:58.078515 [  957.892528] kernel BUG at 
drivers/xen/grant-table.c:923!
Jun  1 09:32:58.083826 [  957.892556] Internal error: Oops - BUG: 0 [#1] SMP ARM
Jun  1 09:32:58.089020 [  957.892584] Modules linked in: xen_gntalloc bridge 
stp ipv6 llc brcmfmac brcmutil cfg80211
Jun  1 09:32:58.097371 [  957.892674] CPU: 1 PID: 16698 Comm: vif3.0-q0-guest 
Not tainted 3.16.7-ckt4+ #1
Jun  1 09:32:58.104772 [  957.892761] task: d838c900 ti: d2396000 task.ti: 
d2396000
Jun  1 09:32:58.110234 [  957.892815] PC is at gnttab_batch_copy+0xd4/0xe0
Jun  1 09:32:58.114884 [  957.892843] LR is at gnttab_batch_copy+0x1c/0xe0
Jun  1 09:32:58.119492 [  957.892870] pc : [c04ab434]lr : [c04ab37c]
psr: a013
Jun  1 09:32:58.125861 [  957.892870] sp : d2397eb0  ip : deadbeef  fp : 
d2397f3c
Jun  1 09:32:58.131121 [  957.892925] r10: 0001  r9 :   r8 : 
0005
Jun  1 09:32:58.136517 [  957.892953] r7 : 0001  r6 : 0001  r5 : 
  r4 : e1e83d30
Jun  1 09:32:58.143015 [  957.892983] r3 : 0001  r2 : deadbeef  r1 : 
deadbeef  r0 : fff2
Jun  1 09:32:58.149650 [  957.893015] Flags: NzCv  IRQs on  FIQs on  Mode 
SVC_32  ISA ARM  Segment kernel
Jun  1 09:32:58.156990 [  957.893050] Control: 10c5387d  Table: 7b27c06a  DAC: 
0015
Jun  1 09:32:58.162765 [  957.893080] Process vif3.0-q0-guest (pid: 16698, 
stack limit = 0xd2396248)
Jun  1 09:32:58.169775 [  957.893110] Stack: (0xd2397eb0 to 0xd2398000)
Jun  1 09:32:58.174140 [  957.893138] 7ea0: 
0001   e1e7b000
Jun  1 09:32:58.182400 [  957.893177] 7ec0: 0001 c05d6acc 0062 0302 
d2397f3c   0010
Jun  1 09:32:58.190645 [  957.893216] 7ee0: c0b64c70 435f4741  0005 
 d2397f20 e1e83cfc e1e83d30
Jun  1 09:32:58.198896 [  957.893253] 7f00: 0001  0001  
e1e83d30 e1eae530 0062 0205
Jun  1 09:32:58.207138 [  957.893291] 7f20: d8100840 d8100840 0001  
d2397f2c d2397f34 d2397f34 
Jun  1 09:32:58.215368 [  957.893329] 7f40:  d891c940  e1e7b000 
c05d65f4   
Jun  1 09:32:58.223551 [  957.893382] 7f60:  c0264048 0100  
00f8 e1e7b000  
Jun  1 09:32:58.231770 [  957.893428] 7f80: d2397f80 d2397f80   
d2397f90 d2397f90 d2397fac d891c940
Jun  1 09:32:58.240014 [  957.893467] 7fa0: c0263f7c   c020f038 
   
Jun  1 09:32:58.248268 [  957.893506] 7fc0:     
   
Jun  1 09:32:58.256517 [  957.893543] 7fe0:     
0013   
Jun  1 09:32:58.264784 [  957.893605] [c04ab434] (gnttab_batch_copy) from 
[c05d6acc] (xenvif_kthread_guest_rx+0x4d8/0xbc0)
Jun  1 09:32:58.273889 [  957.893659] [c05d6acc] (xenvif_kthread_guest_rx) 
from [c0264048] (kthread+0xcc/0xe8)
Jun  1 09:32:58.282006 [  957.893704] [c0264048] (kthread) from [c020f038] 
(ret_from_fork+0x14/0x3c)
Jun  1 09:32:58.289393 [  957.893746] Code: 0ae5 eaed e8bd80f8 e7f001f2 
(e7f001f2)
Jun  1 09:32:58.295500 [  957.893796] ---[ end trace fb4f074e7680a077 ]---

~Andrew

  build-armhf-xsm   4 host-build-prep  fail in 57644 REGR. vs. 
 57419

 Tests which are failing intermittently (not blocking):
  test-armhf-armhf-xl-cubietruck 3 host-install(3) broken in 57644 pass in 
 57712
  test-armhf-armhf-xl-sedf  3 host-install(3)   broken pass in 
 57644
  test-amd64-amd64-xl-qemuu-win7-amd64 9 windows-install fail in 57644 pass in 
 57712
  test-amd64-i386-xl-qemuu-win7-amd64  9 windows-install  fail pass in 
 57644

 Regressions which are regarded as allowable (not blocking):
  test-amd64-amd64-libvirt-xsm 11 guest-start   fail REGR. vs. 
 57419
  test-armhf-armhf-libvirt 11 guest-start   fail in 57644 like 
 57419
  test-amd64-i386-xl-qemuu-win7-amd64 16 guest-stop fail in 57644 like 
 57419
  test-amd64-i386-libvirt  11 guest-start  fail   like 
 57419
  test-amd64-i386-libvirt-xsm  11 guest-start  fail   like 
 57419
  test-amd64-amd64-libvirt 11 guest-start  fail   like 
 57419
  test-amd64-amd64-xl-qemut-win7-amd64 16 guest-stop fail like 
 57419

 Tests which did not succeed, but are not blocking:
  test-armhf-armhf-libvirt-xsm  1 build-check(1)blocked in 57644 
 

Re: [Xen-devel] [PATCH] nested EPT: fix the handling of nested EPT.

2015-06-02 Thread Jan Beulich
 On 02.06.15 at 19:41, liang.z...@intel.com wrote:
 @@ -1074,6 +1075,9 @@ void ept_sync_domain(struct p2m_domain *p2m)
  if ( !paging_mode_hap(d) || !d-vcpu || !d-vcpu[0] )
  return;
  
 +if ( nestedhvm_enabled(d) )
 +p2m_flush_nestedp2m(d);
 +
  ASSERT(local_irq_is_enabled());
  
  /*

Looks plausible, but I think the addition would better go after the
ASSERT().

Jan


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v8 01/13] x86: add socket_cpumask

2015-06-02 Thread Chao Peng
On Tue, Jun 02, 2015 at 07:57:55AM +0100, Jan Beulich wrote:
  On 02.06.15 at 08:35, chao.p.p...@linux.intel.com wrote:
  On Fri, May 29, 2015 at 09:52:03AM +0100, Jan Beulich wrote:
   On 29.05.15 at 10:28, chao.p.p...@linux.intel.com wrote:
   On Fri, May 29, 2015 at 09:01:53AM +0100, Jan Beulich wrote:
On 29.05.15 at 04:35, chao.p.p...@linux.intel.com wrote:
On Thu, May 28, 2015 at 01:38:05PM +0100, Jan Beulich wrote:
 On 21.05.15 at 10:41, chao.p.p...@linux.intel.com wrote:
 --- a/xen/arch/x86/mpparse.c
 +++ b/xen/arch/x86/mpparse.c
 @@ -87,6 +87,18 @@ void __init set_nr_cpu_ids(unsigned int 
 max_cpus)
  #endif
  }
  
 +void __init set_nr_sockets(void)
 +{
 +unsigned int cpus = bitmap_weight(phys_cpu_present_map.mask,
 +  boot_cpu_data.x86_max_cores 
 *
 +  
 boot_cpu_data.x86_num_siblings);
 +
 +if ( cpus == 0 )
 +cpus = 1;
 +
 +nr_sockets = DIV_ROUND_UP(num_processors + disabled_cpus, 
 cpus);
 +}

Is there a reason why this can't just be added to the end of the
immediately preceding set_nr_cpu_ids()?

You mean the declaration or invocation? If the former I have no 
special
reason for it (e.g. I can change it).
   
   Neither - I just don't see the need for a new function.
   
   In which case the invocation of set_nr_cpu_ids() should move to the
   place where now set_nr_sockets() is invoked, to make sure
   boot_cpu_data.x86_max_cores/x86_num_siblings available, which may not be
   your expectation.
  
  Ah, in which case this _is_ the explanation, albeit only provided the
  use of the two boot_cpu_data fields has to remain (which I had put
  under question). And if these have to remain, couldn't this be done
  in a presmp initcall instead of an explicitly called function?
  
  presmp is too late. nr_sockets will get used in smp_prepare_cpus()
  before calling set_cpu_sibling_map for cpu 0.
 
 Okay. In which case - why not calculate the value there?

Okay, then I just need move the invocation of set_nr_sockets() from
__start_xen() to smp_prepare_cpus().

Chao

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] ARM64: XEN Domu not booting with the qemu qcow AARCH64 Ubuntu 15.04 disk

2015-06-02 Thread Sanjeev Pandita
All,

I am pretty new to xen . I am trying to boot DOMU with qemu qcow AARCH64
Ubuntu 15.04 disk on Xen but I am getting the errors which link to
/usr/local/lib/xen/bin/qemu-system-i386.
Since I am working on aarch64 system the
/usr/local/lib/xen/bin/qemu-system-i386 bin might not be present or might
not work as expected.

Please let me know how to make the Qemu qcow image work on Xen.
Attached are the DomU boot log and config file.

Thanks,
San

-- 
CONFIDENTIALITY NOTICE: This e-mail message, including any attachments, 
is for the sole use of the intended recipient(s) and contains information
that is confidential and proprietary to Applied Micro Circuits Corporation 
or its subsidiaries. 
It is to be used solely for the purpose of furthering the parties' business 
relationship. 
All unauthorized review, use, disclosure or distribution is prohibited. 
If you are not the intended recipient, please contact the sender by reply 
e-mail 
and destroy all copies of the original message.


config.log
Description: Binary data


DomU_booting.log
Description: Binary data
___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v1 COLO Pre 04/12] tools/libxl: introduce a new API libxl__domain_restore() to load qemu state

2015-06-02 Thread Wen Congyang
On 06/02/2015 05:26 PM, Yang Hongyang wrote:
 Secondary vm is running in colo mode. So we will do
 the following things again and again:
 1. suspend both primay vm and secondary vm
 2. sync the state
 3. resume both primary vm and secondary vm
 We will send qemu's state each time in step2, and
 slave's qemu should read it each time before resuming
 secondary vm. Introduce a new API libxl__domain_restore()
 to do it. This API should be called before resuming
 secondary vm.
 
 Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com
 Signed-off-by: Wen Congyang we...@cn.fujitsu.com
 ---
  tools/libxl/Makefile|  3 +-
  tools/libxl/libxl_dom_restore.c | 76 
 +
  tools/libxl/libxl_internal.h|  4 +++
  tools/libxl/libxl_qmp.c | 10 ++
  4 files changed, 92 insertions(+), 1 deletion(-)
  create mode 100644 tools/libxl/libxl_dom_restore.c
 
 diff --git a/tools/libxl/Makefile b/tools/libxl/Makefile
 index a87ee04..39ce836 100644
 --- a/tools/libxl/Makefile
 +++ b/tools/libxl/Makefile
 @@ -96,7 +96,8 @@ LIBXL_OBJS = flexarray.o libxl.o libxl_create.o libxl_dm.o 
 libxl_pci.o \
   libxl_json.o libxl_aoutils.o libxl_numa.o libxl_vnuma.o 
 \
   libxl_save_callout.o _libxl_save_msgs_callout.o \
   libxl_qmp.o libxl_event.o libxl_fork.o 
 libxl_dom_suspend.o \
 - libxl_toolstack.o libxl_dom_save.o $(LIBXL_OBJS-y)
 + libxl_toolstack.o libxl_dom_save.o libxl_dom_restore.o \
 + $(LIBXL_OBJS-y)
  LIBXL_OBJS += libxl_genid.o
  LIBXL_OBJS += _libxl_types.o libxl_flask.o _libxl_types_internal.o
  
 diff --git a/tools/libxl/libxl_dom_restore.c b/tools/libxl/libxl_dom_restore.c
 new file mode 100644
 index 000..df15ece
 --- /dev/null
 +++ b/tools/libxl/libxl_dom_restore.c
 @@ -0,0 +1,76 @@
 +/*
 + * Copyright (C) 2015 FUJITSU LIMITED
 + * Author Yang Hongyang yan...@cn.fujitsu.com
 + *Wen congyang we...@cn.fujitsu.com
 + *
 + * This program is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU Lesser General Public License as published
 + * by the Free Software Foundation; version 2 and later. with the special
 + * exception on linking described in file LICENSE.
 + *
 + * This program is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 + * GNU Lesser General Public License for more details.
 + */
 +
 +#include libxl_osdeps.h /* must come before any other headers */
 +
 +#include libxl_internal.h
 +
 +/*- main code for restoring, in order of execution -*/
 +
 +int libxl__domain_restore(libxl__gc *gc, uint32_t domid)
 +{
 +int rc = 0;
 +
 +libxl_domain_type type = libxl__domain_type(gc, domid);
 +if (type != LIBXL_DOMAIN_TYPE_HVM) {
 +rc = ERROR_FAIL;
 +goto out;
 +}
 +
 +rc = libxl__domain_restore_device_model(gc, domid);
 +if (rc)
 +LOG(ERROR, failed to restore device mode for domain %u:%d,
 +domid, rc);
 +out:
 +return rc;
 +}
 +
 +int libxl__domain_restore_device_model(libxl__gc *gc, uint32_t domid)
 +{
 +char *state_file;
 +int rc;
 +
 +switch (libxl__device_model_version_running(gc, domid)) {
 +case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL:
 +/* not supported now */
 +rc = ERROR_INVAL;
 +break;
 +case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN:
 +/*
 + * This function may be called too many times for the same gc,
 + * so we use NOGC, and free the memory before return to avoid
 + * OOM.
 + */
 +state_file = libxl__sprintf(NOGC,
 +XC_DEVICE_MODEL_RESTORE_FILE.%d,
 +domid);
 +rc = libxl__qmp_restore(gc, domid, state_file);
 +free(state_file);
 +break;
 +default:
 +rc = ERROR_INVAL;
 +}
 +
 +return rc;
 +}
 +
 +/*
 + * Local variables:
 + * mode: C
 + * c-basic-offset: 4
 + * indent-tabs-mode: nil
 + * End:
 + */
 diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
 index 4bab0de..71728ff 100644
 --- a/tools/libxl/libxl_internal.h
 +++ b/tools/libxl/libxl_internal.h
 @@ -1022,6 +1022,7 @@ _hidden int libxl__domain_rename(libxl__gc *gc, 
 uint32_t domid,
  
  _hidden int libxl__toolstack_restore(uint32_t domid, const uint8_t *buf,
   uint32_t size, void *data);
 +_hidden int libxl__domain_restore_device_model(libxl__gc *gc, uint32_t 
 domid);
  _hidden int libxl__domain_resume_device_model(libxl__gc *gc, uint32_t domid);
  
  _hidden const char *libxl__userdata_path(libxl__gc *gc, uint32_t domid,
 @@ -1039,6 +1040,7 @@ _hidden int libxl__userdata_store(libxl__gc *gc, 
 uint32_t domid,
const char 

Re: [Xen-devel] ARM64: XEN Domu not booting with the qemu qcow AARCH64 Ubuntu 15.04 disk

2015-06-02 Thread Stefan Bader
On 02.06.2015 09:40, Sanjeev Pandita wrote:
 All,
 
 I am pretty new to xen . I am trying to boot DOMU with qemu qcow AARCH64
 Ubuntu 15.04 disk on Xen but I am getting the errors which link to
 /usr/local/lib/xen/bin/qemu-system-i386.
 Since I am working on aarch64 system the
 /usr/local/lib/xen/bin/qemu-system-i386 bin might not be present or might
 not work as expected.

Because I am lacking hardware and feedback, the arm64 packaging is a rather
theoretical exercise. At least for armhf I thought qemu-system-x86 was a
dependency. That binary should provide x86 emulation on arm64, the same as one
could install qemu for other arches on x86.
Have you tried to install qemu-system-x86 manually?

-Stefan

 
 Please let me know how to make the Qemu qcow image work on Xen.
 Attached are the DomU boot log and config file.
 
 Thanks,
 San
 
 
 
 ___
 Xen-devel mailing list
 Xen-devel@lists.xen.org
 http://lists.xen.org/xen-devel
 




signature.asc
Description: OpenPGP digital signature
___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v1 COLO Pre 08/12] tools/libxl: Update libxl_save_msgs_gen.pl to support return data from xl to xc

2015-06-02 Thread Yang Hongyang
From: Wen Congyang we...@cn.fujitsu.com

 Currently, all callbacks return an integer value or void. We cannot
 return some data to xc via callback. Update libxl_save_msgs_gen.pl
 to support this case.

Signed-off-by: Wen Congyang we...@cn.fujitsu.com
---
 tools/libxl/libxl_internal.h   |  3 ++
 tools/libxl/libxl_save_callout.c   | 31 ++
 tools/libxl/libxl_save_helper.c| 17 ++
 tools/libxl/libxl_save_msgs_gen.pl | 65 ++
 4 files changed, 109 insertions(+), 7 deletions(-)

diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index e8357fc..8b60fef 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -3179,6 +3179,9 @@ _hidden void libxl__xc_domain_save_done(libxl__egc*, void 
*dss_void,
  * When they are ready to indicate completion, they call this. */
 void libxl__xc_domain_saverestore_async_callback_done(libxl__egc *egc,
libxl__save_helper_state *shs, int return_value);
+void libxl__xc_domain_saverestore_async_callback_done_with_data(libxl__egc 
*egc,
+   libxl__save_helper_state *shs,
+   const void *data, uint64_t size);
 
 
 _hidden void libxl__domain_suspend_common_switch_qemu_logdirty
diff --git a/tools/libxl/libxl_save_callout.c b/tools/libxl/libxl_save_callout.c
index cd342b9..5c691eb 100644
--- a/tools/libxl/libxl_save_callout.c
+++ b/tools/libxl/libxl_save_callout.c
@@ -145,6 +145,15 @@ void 
libxl__xc_domain_saverestore_async_callback_done(libxl__egc *egc,
 shs-egc = 0;
 }
 
+void libxl__xc_domain_saverestore_async_callback_done_with_data(libxl__egc 
*egc,
+   libxl__save_helper_state *shs,
+   const void *data, uint64_t size)
+{
+shs-egc = egc;
+libxl__srm_callout_sendreply_data(data, size, shs);
+shs-egc = 0;
+}
+
 /*- helper execution -*/
 
 static void run_helper(libxl__egc *egc, libxl__save_helper_state *shs,
@@ -370,6 +379,28 @@ void libxl__srm_callout_sendreply(int r, void *user)
 helper_failed(egc, shs, ERROR_FAIL);
 }
 
+void libxl__srm_callout_sendreply_data(const void *data, uint64_t size, void 
*user)
+{
+libxl__save_helper_state *shs = user;
+libxl__egc *egc = shs-egc;
+STATE_AO_GC(shs-ao);
+int errnoval;
+
+errnoval = libxl_write_exactly(CTX, libxl__carefd_fd(shs-pipes[0]),
+   size, sizeof(size), shs-stdin_what,
+   callback return data length);
+if (errnoval)
+goto out;
+
+errnoval = libxl_write_exactly(CTX, libxl__carefd_fd(shs-pipes[0]),
+   data, size, shs-stdin_what,
+   callback return data);
+
+out:
+if (errnoval)
+helper_failed(egc, shs, ERROR_FAIL);
+}
+
 void libxl__srm_callout_callback_log(uint32_t level, uint32_t errnoval,
   const char *context, const char *formatted, void *user)
 {
diff --git a/tools/libxl/libxl_save_helper.c b/tools/libxl/libxl_save_helper.c
index 74826a1..44c5807 100644
--- a/tools/libxl/libxl_save_helper.c
+++ b/tools/libxl/libxl_save_helper.c
@@ -155,6 +155,23 @@ int helper_getreply(void *user)
 return v;
 }
 
+uint8_t *helper_getreply_data(void *user)
+{
+uint64_t size;
+int r = read_exactly(0, size, sizeof(size));
+uint8_t *data;
+
+if (r = 0)
+exit(-2);
+
+data = helper_allocbuf(size, user);
+r = read_exactly(0, data, size);
+if (r = 0)
+exit(-2);
+
+return data;
+}
+
 /*- other callbacks -*/
 
 static int toolstack_save_fd;
diff --git a/tools/libxl/libxl_save_msgs_gen.pl 
b/tools/libxl/libxl_save_msgs_gen.pl
index 6b4b65e..41ee000 100755
--- a/tools/libxl/libxl_save_msgs_gen.pl
+++ b/tools/libxl/libxl_save_msgs_gen.pl
@@ -15,6 +15,7 @@ our @msgs = (
 # and its null-ness needs to be passed through to the helper's xc
 #   W  - needs a return value; callback is synchronous
 #   A  - needs a return value; callback is asynchronous
+#   B  - return value is an pointer
 [  1, 'sr', log,   [qw(uint32_t level
  uint32_t errnoval
  STRING context
@@ -99,23 +100,28 @@ our $libxl = libxl__srm;
 our $callback = ${libxl}_callout_callback;
 our $receiveds = ${libxl}_callout_received;
 our $sendreply = ${libxl}_callout_sendreply;
+our $sendreply_data = ${libxl}_callout_sendreply_data;
 our $getcallbacks = ${libxl}_callout_get_callbacks;
 our $enumcallbacks = ${libxl}_callout_enumcallbacks;
 sub cbtype ($) { ${libxl}_.$_[0]._autogen_callbacks; };
 
 f_decl($sendreply, 'callout', 'void', (int r, void *user));
+f_decl($sendreply_data, 'callout', 'void',
+   (const void *data, uint64_t size, void *user));
 
 our $helper = helper;
 our $encode = ${helper}_stub;
 our $allocbuf = ${helper}_allocbuf;
 our 

[Xen-devel] [PATCH v1 COLO Pre 10/12] tools/libxl: rename remus device to checkpoint device

2015-06-02 Thread Yang Hongyang
This patch is auto generated by the following commands:
 1. git mv tools/libxl/libxl_remus_device.c 
tools/libxl/libxl_checkpoint_device.c
 2. perl -pi -e 's/libxl_remus_device/libxl_checkpoint_device/g' 
tools/libxl/Makefile
 3. perl -pi -e 's/\blibxl__remus_devices/libxl__checkpoint_devices/g' 
tools/libxl/*.[ch]
 4. perl -pi -e 's/\blibxl__remus_device\b/libxl__checkpoint_device/g' 
tools/libxl/*.[ch]
 5. perl -pi -e 
's/\blibxl__remus_device_instance_ops\b/libxl__checkpoint_device_instance_ops/g'
 tools/libxl/*.[ch]
 6. perl -pi -e 's/\blibxl__remus_callback\b/libxl__checkpoint_callback/g' 
tools/libxl/*.[ch]
 7. perl -pi -e 's/\bremus_device_init\b/checkpoint_device_init/g' 
tools/libxl/*.[ch]
 8. perl -pi -e 's/\bremus_devices_setup\b/checkpoint_devices_setup/g' 
tools/libxl/*.[ch]
 9. perl -pi -e 's/\bdefine_remus_checkpoint_api\b/define_checkpoint_api/g' 
tools/libxl/*.[ch]
10. perl -pi -e 's/\brds\b/cds/g' tools/libxl/*.[ch]
11. perl -pi -e 's/REMUS_DEVICE/CHECKPOINT_DEVICE/g' tools/libxl/*.[ch] 
tools/libxl/*.idl
12. perl -pi -e 's/REMUS_DEVOPS/CHECKPOINT_DEVOPS/g' tools/libxl/*.[ch] 
tools/libxl/*.idl
13. perl -pi -e 's/\bremus\b/checkpoint/g' 
tools/libxl/libxl_checkpoint_device.[ch]
14. perl -pi -e 's/\bremus device/checkpoint device/g' 
tools/libxl/libxl_internal.h
15. perl -pi -e 's/\bRemus device/checkpoint device/g' 
tools/libxl/libxl_internal.h
16. perl -pi -e 's/\bremus abstract/checkpoint abstract/g' 
tools/libxl/libxl_internal.h
17. perl -pi -e 's/\bremus invocation/checkpoint invocation/g' 
tools/libxl/libxl_internal.h
18. perl -pi -e 's/\blibxl__remus_device_\(/libxl__checkpoint_device_(/g' 
tools/libxl/libxl_internal.h

Signed-off-by: Wen Congyang we...@cn.fujitsu.com
Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com
---
 tools/libxl/Makefile  |   2 +-
 tools/libxl/libxl_checkpoint_device.c | 327 ++
 tools/libxl/libxl_internal.h  | 112 ++--
 tools/libxl/libxl_netbuffer.c | 108 +--
 tools/libxl/libxl_nonetbuffer.c   |  10 +-
 tools/libxl/libxl_remus.c |  78 
 tools/libxl/libxl_remus_device.c  | 327 --
 tools/libxl/libxl_remus_disk_drbd.c   |  52 +++---
 tools/libxl/libxl_types.idl   |   4 +-
 9 files changed, 510 insertions(+), 510 deletions(-)
 create mode 100644 tools/libxl/libxl_checkpoint_device.c
 delete mode 100644 tools/libxl/libxl_remus_device.c

diff --git a/tools/libxl/Makefile b/tools/libxl/Makefile
index 39ce836..a8a0709 100644
--- a/tools/libxl/Makefile
+++ b/tools/libxl/Makefile
@@ -56,7 +56,7 @@ else
 LIBXL_OBJS-y += libxl_nonetbuffer.o
 endif
 
-LIBXL_OBJS-y += libxl_remus.o libxl_remus_device.o libxl_remus_disk_drbd.o
+LIBXL_OBJS-y += libxl_remus.o libxl_checkpoint_device.o libxl_remus_disk_drbd.o
 
 LIBXL_OBJS-$(CONFIG_X86) += libxl_cpuid.o libxl_x86.o libxl_psr.o
 LIBXL_OBJS-$(CONFIG_ARM) += libxl_nocpuid.o libxl_arm.o libxl_libfdt_compat.o
diff --git a/tools/libxl/libxl_checkpoint_device.c 
b/tools/libxl/libxl_checkpoint_device.c
new file mode 100644
index 000..109cd23
--- /dev/null
+++ b/tools/libxl/libxl_checkpoint_device.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright (C) 2014 FUJITSU LIMITED
+ * Author: Yang Hongyang yan...@cn.fujitsu.com
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#include "libxl_osdeps.h" /* must come before any other headers */
+
+#include "libxl_internal.h"
+
+extern const libxl__checkpoint_device_instance_ops remus_device_nic;
+extern const libxl__checkpoint_device_instance_ops remus_device_drbd_disk;
+static const libxl__checkpoint_device_instance_ops *remus_ops[] = {
+    &remus_device_nic,
+    &remus_device_drbd_disk,
+    NULL,
+};
+
+/*- helper functions -*/
+
+static int init_device_subkind(libxl__checkpoint_devices_state *cds)
+{
+    /* init device subkind-specific state in the libxl ctx */
+    int rc;
+    STATE_AO_GC(cds->ao);
+
+    if (libxl__netbuffer_enabled(gc)) {
+        rc = init_subkind_nic(cds);
+        if (rc) goto out;
+    }
+
+    rc = init_subkind_drbd_disk(cds);
+    if (rc) goto out;
+
+    rc = 0;
+out:
+    return rc;
+}
+
+static void cleanup_device_subkind(libxl__checkpoint_devices_state *cds)
+{
+    /* cleanup device subkind-specific state in the libxl ctx */
+    STATE_AO_GC(cds->ao);
+
+    if (libxl__netbuffer_enabled(gc))
+        cleanup_subkind_nic(cds);
+
+    cleanup_subkind_drbd_disk(cds);
+}
+
+/*- setup() and teardown() -*/
+
+/* callbacks */
+

[Xen-devel] [PATCH v1 COLO Pre 03/12] tools/libxc: export xc_bitops.h

2015-06-02 Thread Yang Hongyang
When we are under COLO, we will send dirty page bitmap info from
secondary to primary at every checkpoint. So we need to get/test
the dirty page bitmap. We just expose xc_bitops.h for libxl use.

NOTE:
  Need to make clean and rerun configure to get it compiled.

Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com
---
 tools/libxc/include/xc_bitops.h | 76 +
 tools/libxc/xc_bitops.h | 76 -
 2 files changed, 76 insertions(+), 76 deletions(-)
 create mode 100644 tools/libxc/include/xc_bitops.h
 delete mode 100644 tools/libxc/xc_bitops.h

diff --git a/tools/libxc/include/xc_bitops.h b/tools/libxc/include/xc_bitops.h
new file mode 100644
index 000..cd749f4
--- /dev/null
+++ b/tools/libxc/include/xc_bitops.h
@@ -0,0 +1,76 @@
+#ifndef XC_BITOPS_H
+#define XC_BITOPS_H 1
+
+/* bitmap operations for single threaded access */
+
+#include <stdlib.h>
+#include <string.h>
+
+#define BITS_PER_LONG (sizeof(unsigned long) * 8)
+#define ORDER_LONG (sizeof(unsigned long) == 4 ? 5 : 6)
+
+#define BITMAP_ENTRY(_nr,_bmap) ((_bmap))[(_nr)/BITS_PER_LONG]
+#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
+
+/* calculate required space for number of longs needed to hold nr_bits */
+static inline int bitmap_size(int nr_bits)
+{
+    int nr_long, nr_bytes;
+    nr_long = (nr_bits + BITS_PER_LONG - 1) >> ORDER_LONG;
+    nr_bytes = nr_long * sizeof(unsigned long);
+    return nr_bytes;
+}
+
+static inline unsigned long *bitmap_alloc(int nr_bits)
+{
+return calloc(1, bitmap_size(nr_bits));
+}
+
+static inline void bitmap_set(unsigned long *addr, int nr_bits)
+{
+memset(addr, 0xff, bitmap_size(nr_bits));
+}
+
+static inline void bitmap_clear(unsigned long *addr, int nr_bits)
+{
+memset(addr, 0, bitmap_size(nr_bits));
+}
+
+static inline int test_bit(int nr, unsigned long *addr)
+{
+    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
+}
+
+static inline void clear_bit(int nr, unsigned long *addr)
+{
+    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
+}
+
+static inline void set_bit(int nr, unsigned long *addr)
+{
+    BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
+}
+
+static inline int test_and_clear_bit(int nr, unsigned long *addr)
+{
+int oldbit = test_bit(nr, addr);
+clear_bit(nr, addr);
+return oldbit;
+}
+
+static inline int test_and_set_bit(int nr, unsigned long *addr)
+{
+int oldbit = test_bit(nr, addr);
+set_bit(nr, addr);
+return oldbit;
+}
+
+static inline void bitmap_or(unsigned long *dst, const unsigned long *other,
+                             int nr_bits)
+{
+    int i, nr_longs = (bitmap_size(nr_bits) / sizeof(unsigned long));
+    for ( i = 0; i < nr_longs; ++i )
+        dst[i] |= other[i];
+}
+
+#endif  /* XC_BITOPS_H */
diff --git a/tools/libxc/xc_bitops.h b/tools/libxc/xc_bitops.h
deleted file mode 100644
index cd749f4..000
--- a/tools/libxc/xc_bitops.h
+++ /dev/null
@@ -1,76 +0,0 @@
-#ifndef XC_BITOPS_H
-#define XC_BITOPS_H 1
-
-/* bitmap operations for single threaded access */
-
-#include <stdlib.h>
-#include <string.h>
-
-#define BITS_PER_LONG (sizeof(unsigned long) * 8)
-#define ORDER_LONG (sizeof(unsigned long) == 4 ? 5 : 6)
-
-#define BITMAP_ENTRY(_nr,_bmap) ((_bmap))[(_nr)/BITS_PER_LONG]
-#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
-
-/* calculate required space for number of longs needed to hold nr_bits */
-static inline int bitmap_size(int nr_bits)
-{
-    int nr_long, nr_bytes;
-    nr_long = (nr_bits + BITS_PER_LONG - 1) >> ORDER_LONG;
-    nr_bytes = nr_long * sizeof(unsigned long);
-    return nr_bytes;
-}
-
-static inline unsigned long *bitmap_alloc(int nr_bits)
-{
-return calloc(1, bitmap_size(nr_bits));
-}
-
-static inline void bitmap_set(unsigned long *addr, int nr_bits)
-{
-memset(addr, 0xff, bitmap_size(nr_bits));
-}
-
-static inline void bitmap_clear(unsigned long *addr, int nr_bits)
-{
-memset(addr, 0, bitmap_size(nr_bits));
-}
-
-static inline int test_bit(int nr, unsigned long *addr)
-{
-    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
-}
-
-static inline void clear_bit(int nr, unsigned long *addr)
-{
-    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
-}
-
-static inline void set_bit(int nr, unsigned long *addr)
-{
-    BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
-}
-
-static inline int test_and_clear_bit(int nr, unsigned long *addr)
-{
-int oldbit = test_bit(nr, addr);
-clear_bit(nr, addr);
-return oldbit;
-}
-
-static inline int test_and_set_bit(int nr, unsigned long *addr)
-{
-int oldbit = test_bit(nr, addr);
-set_bit(nr, addr);
-return oldbit;
-}
-
-static inline void bitmap_or(unsigned long *dst, const unsigned long *other,
-                             int nr_bits)
-{
-    int i, nr_longs = (bitmap_size(nr_bits) / sizeof(unsigned long));
-    for ( i = 0; i < nr_longs; ++i )
-        dst[i] |= other[i];
-}
-
-#endif  /* XC_BITOPS_H */
-- 
1.9.1



[Xen-devel] [PATCH v1 COLO Pre 12/12] tools/libxl: don't touch remus in checkpoint_device

2015-06-02 Thread Yang Hongyang
Checkpoint device is an abstract layer to do checkpoint.
COLO can also use it to do checkpoint. But there are
still some codes in checkpoint device which touch remus:
1. remus_ops: we use remus ops directly in checkpoint
   device. Store it in checkpoint device state.
2. concrete layer's private member: add a new structure
   remus state, and move them to remus state.
3. init/cleanup device subkind: we call (init|cleanup)_subkind_nic
   and (init|cleanup)_subkind_drbd_disk directly in checkpoint
   device. Call them before calling libxl__checkpoint_devices_setup()
   or after calling libxl__checkpoint_devices_teardown().

Signed-off-by: Wen Congyang we...@cn.fujitsu.com
Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com
---
 tools/libxl/libxl.c   |  2 +-
 tools/libxl/libxl_checkpoint_device.c | 52 ++--
 tools/libxl/libxl_dom_save.c  |  3 +-
 tools/libxl/libxl_internal.h  | 40 ++--
 tools/libxl/libxl_netbuffer.c | 51 +++-
 tools/libxl/libxl_remus.c | 90 ---
 tools/libxl/libxl_remus_disk_drbd.c   |  8 ++--
 7 files changed, 136 insertions(+), 110 deletions(-)

diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
index 8b15be6..0b37763 100644
--- a/tools/libxl/libxl.c
+++ b/tools/libxl/libxl.c
@@ -874,7 +874,7 @@ int libxl_domain_remus_start(libxl_ctx *ctx, 
libxl_domain_remus_info *info,
 assert(info);
 
 /* Point of no return */
-    libxl__remus_setup(egc, dss);
+    libxl__remus_setup(egc, &dss->rs);
 return AO_INPROGRESS;
 
  out:
diff --git a/tools/libxl/libxl_checkpoint_device.c 
b/tools/libxl/libxl_checkpoint_device.c
index 226f159..0a16dbb 100644
--- a/tools/libxl/libxl_checkpoint_device.c
+++ b/tools/libxl/libxl_checkpoint_device.c
@@ -17,46 +17,6 @@
 
 #include libxl_internal.h
 
-extern const libxl__checkpoint_device_instance_ops remus_device_nic;
-extern const libxl__checkpoint_device_instance_ops remus_device_drbd_disk;
-static const libxl__checkpoint_device_instance_ops *remus_ops[] = {
-remus_device_nic,
-remus_device_drbd_disk,
-NULL,
-};
-
-/*- helper functions -*/
-
-static int init_device_subkind(libxl__checkpoint_devices_state *cds)
-{
-/* init device subkind-specific state in the libxl ctx */
-int rc;
-STATE_AO_GC(cds-ao);
-
-if (libxl__netbuffer_enabled(gc)) {
-rc = init_subkind_nic(cds);
-if (rc) goto out;
-}
-
-rc = init_subkind_drbd_disk(cds);
-if (rc) goto out;
-
-rc = 0;
-out:
-return rc;
-}
-
-static void cleanup_device_subkind(libxl__checkpoint_devices_state *cds)
-{
-/* cleanup device subkind-specific state in the libxl ctx */
-STATE_AO_GC(cds-ao);
-
-if (libxl__netbuffer_enabled(gc))
-cleanup_subkind_nic(cds);
-
-cleanup_subkind_drbd_disk(cds);
-}
-
 /*- setup() and teardown() -*/
 
 /* callbacks */
@@ -94,14 +54,10 @@ static void checkpoint_devices_setup(libxl__egc *egc,
 void libxl__checkpoint_devices_setup(libxl__egc *egc,
  libxl__checkpoint_devices_state *cds)
 {
-int i, rc;
+int i;
 
 STATE_AO_GC(cds-ao);
 
-    rc = init_device_subkind(cds);
-    if (rc)
-        goto out;
-
     cds->num_devices = 0;
     cds->num_nics = 0;
     cds->num_disks = 0;
@@ -134,7 +90,7 @@ void libxl__checkpoint_devices_setup(libxl__egc *egc,
 return;
 
 out:
-    cds->callback(egc, cds, rc);
+    cds->callback(egc, cds, 0);
 }
 
 static void checkpoint_devices_setup(libxl__egc *egc,
@@ -172,7 +128,7 @@ static void device_setup_iterate(libxl__egc *egc, 
libxl__ao_device *aodev)
 goto out;
 
 do {
-        dev->ops = remus_ops[++dev->ops_index];
+        dev->ops = dev->cds->ops[++dev->ops_index];
 if (!dev-ops) {
 libxl_device_nic * nic = NULL;
 libxl_device_disk * disk = NULL;
@@ -271,8 +227,6 @@ static void devices_teardown_cb(libxl__egc *egc,
 cds-disks = NULL;
 cds-num_disks = 0;
 
-    cleanup_device_subkind(cds);
-
     cds->callback(egc, cds, rc);
 }
 
diff --git a/tools/libxl/libxl_dom_save.c b/tools/libxl/libxl_dom_save.c
index d34eeab..f2d3bc3 100644
--- a/tools/libxl/libxl_dom_save.c
+++ b/tools/libxl/libxl_dom_save.c
@@ -298,7 +298,6 @@ void libxl__domain_save(libxl__egc *egc, 
libxl__domain_save_state *dss)
 dss2-ao = ao;
 
     if (r_info != NULL) {
-        dss->interval = r_info->interval;
         dss->xcflags |= XCFLAGS_CHECKPOINTED;
         if (libxl_defbool_val(r_info->compression))
             dss->xcflags |= XCFLAGS_CHECKPOINT_COMPRESS;
@@ -492,7 +491,7 @@ static void domain_save_done(libxl__egc *egc,
  * from sending checkpoints. Teardown the network buffers and
  * release netlink resources.  This is an async op.
  */
-    libxl__remus_teardown(egc, dss, rc);
+    libxl__remus_teardown(egc, &dss->rs, rc);
 return;
 }
 
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 

[Xen-devel] [PATCH v1 COLO Pre 04/12] tools/libxl: introduce a new API libxl__domain_restore() to load qemu state

2015-06-02 Thread Yang Hongyang
Secondary vm is running in colo mode. So we will do
the following things again and again:
1. suspend both primay vm and secondary vm
2. sync the state
3. resume both primary vm and secondary vm
We will send qemu's state each time in step2, and
slave's qemu should read it each time before resuming
secondary vm. Introduce a new API libxl__domain_restore()
to do it. This API should be called before resuming
secondary vm.

Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com
Signed-off-by: Wen Congyang we...@cn.fujitsu.com
---
 tools/libxl/Makefile|  3 +-
 tools/libxl/libxl_dom_restore.c | 76 +
 tools/libxl/libxl_internal.h|  4 +++
 tools/libxl/libxl_qmp.c | 10 ++
 4 files changed, 92 insertions(+), 1 deletion(-)
 create mode 100644 tools/libxl/libxl_dom_restore.c

diff --git a/tools/libxl/Makefile b/tools/libxl/Makefile
index a87ee04..39ce836 100644
--- a/tools/libxl/Makefile
+++ b/tools/libxl/Makefile
@@ -96,7 +96,8 @@ LIBXL_OBJS = flexarray.o libxl.o libxl_create.o libxl_dm.o 
libxl_pci.o \
libxl_json.o libxl_aoutils.o libxl_numa.o libxl_vnuma.o 
\
libxl_save_callout.o _libxl_save_msgs_callout.o \
libxl_qmp.o libxl_event.o libxl_fork.o 
libxl_dom_suspend.o \
-   libxl_toolstack.o libxl_dom_save.o $(LIBXL_OBJS-y)
+   libxl_toolstack.o libxl_dom_save.o libxl_dom_restore.o \
+   $(LIBXL_OBJS-y)
 LIBXL_OBJS += libxl_genid.o
 LIBXL_OBJS += _libxl_types.o libxl_flask.o _libxl_types_internal.o
 
diff --git a/tools/libxl/libxl_dom_restore.c b/tools/libxl/libxl_dom_restore.c
new file mode 100644
index 000..df15ece
--- /dev/null
+++ b/tools/libxl/libxl_dom_restore.c
@@ -0,0 +1,76 @@
+/*
+ * Copyright (C) 2015 FUJITSU LIMITED
+ * Author Yang Hongyang yan...@cn.fujitsu.com
+ *Wen congyang we...@cn.fujitsu.com
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2 and later. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#include "libxl_osdeps.h" /* must come before any other headers */
+
+#include "libxl_internal.h"
+
+/*- main code for restoring, in order of execution -*/
+
+int libxl__domain_restore(libxl__gc *gc, uint32_t domid)
+{
+    int rc = 0;
+
+    libxl_domain_type type = libxl__domain_type(gc, domid);
+    if (type != LIBXL_DOMAIN_TYPE_HVM) {
+        rc = ERROR_FAIL;
+        goto out;
+    }
+
+    rc = libxl__domain_restore_device_model(gc, domid);
+    if (rc)
+        LOG(ERROR, "failed to restore device mode for domain %u:%d",
+            domid, rc);
+out:
+    return rc;
+}
+
+int libxl__domain_restore_device_model(libxl__gc *gc, uint32_t domid)
+{
+    char *state_file;
+    int rc;
+
+    switch (libxl__device_model_version_running(gc, domid)) {
+    case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL:
+        /* not supported now */
+        rc = ERROR_INVAL;
+        break;
+    case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN:
+        /*
+         * This function may be called too many times for the same gc,
+         * so we use NOGC, and free the memory before return to avoid
+         * OOM.
+         */
+        state_file = libxl__sprintf(NOGC,
+                                    XC_DEVICE_MODEL_RESTORE_FILE".%d",
+                                    domid);
+        rc = libxl__qmp_restore(gc, domid, state_file);
+        free(state_file);
+        break;
+    default:
+        rc = ERROR_INVAL;
+    }
+
+    return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 4bab0de..71728ff 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -1022,6 +1022,7 @@ _hidden int libxl__domain_rename(libxl__gc *gc, uint32_t 
domid,
 
 _hidden int libxl__toolstack_restore(uint32_t domid, const uint8_t *buf,
  uint32_t size, void *data);
+_hidden int libxl__domain_restore_device_model(libxl__gc *gc, uint32_t domid);
 _hidden int libxl__domain_resume_device_model(libxl__gc *gc, uint32_t domid);
 
 _hidden const char *libxl__userdata_path(libxl__gc *gc, uint32_t domid,
@@ -1039,6 +1040,7 @@ _hidden int libxl__userdata_store(libxl__gc *gc, uint32_t 
domid,
   const char *userdata_userid,
   const uint8_t *data, int datalen);
 
+_hidden int libxl__domain_restore(libxl__gc *gc, uint32_t domid);
 _hidden int 

[Xen-devel] [PATCH v1 COLO Pre 02/12] libxc/restore: zero ioreq page only one time

2015-06-02 Thread Yang Hongyang
ioreq page contains evtchn which will be set when we resume the
secondary vm the first time. The hypervisor will check if the
evtchn is corrupted, so we cannot zero the ioreq page more
than one time.

The ioreq-state is always STATE_IOREQ_NONE after the vm is
suspended, so it is OK if we only zero it one time.

Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com
Signed-off-by: Wen congyang we...@cn.fujitsu.com
CC: Andrew Cooper andrew.coop...@citrix.com
---
 tools/libxc/xc_sr_restore_x86_hvm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/libxc/xc_sr_restore_x86_hvm.c 
b/tools/libxc/xc_sr_restore_x86_hvm.c
index 6f5af0e..06177e0 100644
--- a/tools/libxc/xc_sr_restore_x86_hvm.c
+++ b/tools/libxc/xc_sr_restore_x86_hvm.c
@@ -78,7 +78,8 @@ static int handle_hvm_params(struct xc_sr_context *ctx,
 break;
 case HVM_PARAM_IOREQ_PFN:
 case HVM_PARAM_BUFIOREQ_PFN:
-        xc_clear_domain_page(xch, ctx->domid, entry->value);
+        if ( !ctx->restore.buffer_all_records )
+            xc_clear_domain_page(xch, ctx->domid, entry->value);
 break;
 }
 
-- 
1.9.1


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v1 COLO Pre 11/12] tools/libxl: adjust the indentation

2015-06-02 Thread Yang Hongyang
This is just tidying up after the previous automatic renaming.

Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com
Signed-off-by: Wen Congyang we...@cn.fujitsu.com
---
 tools/libxl/libxl_checkpoint_device.c | 21 +++--
 tools/libxl/libxl_internal.h  | 19 +++
 tools/libxl/libxl_remus.c |  9 ++---
 3 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/tools/libxl/libxl_checkpoint_device.c 
b/tools/libxl/libxl_checkpoint_device.c
index 109cd23..226f159 100644
--- a/tools/libxl/libxl_checkpoint_device.c
+++ b/tools/libxl/libxl_checkpoint_device.c
@@ -73,9 +73,9 @@ static void devices_teardown_cb(libxl__egc *egc,
 /* checkpoint device setup and teardown */
 
 static libxl__checkpoint_device* checkpoint_device_init(libxl__egc *egc,
-  libxl__checkpoint_devices_state 
*cds,
-  libxl__device_kind kind,
-  void *libxl_dev)
+libxl__checkpoint_devices_state *cds,
+libxl__device_kind kind,
+void *libxl_dev)
 {
 libxl__checkpoint_device *dev = NULL;
 
@@ -89,9 +89,10 @@ static libxl__checkpoint_device* 
checkpoint_device_init(libxl__egc *egc,
 }
 
 static void checkpoint_devices_setup(libxl__egc *egc,
-libxl__checkpoint_devices_state *cds);
+ libxl__checkpoint_devices_state *cds);
 
-void libxl__checkpoint_devices_setup(libxl__egc *egc, 
libxl__checkpoint_devices_state *cds)
+void libxl__checkpoint_devices_setup(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds)
 {
 int i, rc;
 
@@ -137,7 +138,7 @@ out:
 }
 
 static void checkpoint_devices_setup(libxl__egc *egc,
-libxl__checkpoint_devices_state *cds)
+ libxl__checkpoint_devices_state *cds)
 {
 int i, rc;
 
@@ -285,12 +286,12 @@ static void devices_checkpoint_cb(libxl__egc *egc,
 
 /* API implementations */
 
-#define define_checkpoint_api(api)\
-void libxl__checkpoint_devices_##api(libxl__egc *egc,\
-libxl__checkpoint_devices_state *cds)\
+#define define_checkpoint_api(api)  \
+void libxl__checkpoint_devices_##api(libxl__egc *egc,   \
+libxl__checkpoint_devices_state *cds)   \
 {   \
 int i;  \
-libxl__checkpoint_device *dev;   \
+libxl__checkpoint_device *dev;  \
 \
 STATE_AO_GC(cds-ao);   \
 \
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index e37d4dd..ecff232 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -2672,7 +2672,8 @@ typedef struct libxl__save_helper_state {
  * Each device type needs to implement the interfaces specified in
  * the libxl__checkpoint_device_instance_ops if it wishes to support Remus.
  *
- * The high-level control flow through the checkpoint device layer is shown 
below:
+ * The high-level control flow through the checkpoint device layer is shown
+ * below:
  *
  * xl remus
  *  |-  libxl_domain_remus_start
@@ -2733,7 +2734,8 @@ int 
init_subkind_drbd_disk(libxl__checkpoint_devices_state *cds);
 void cleanup_subkind_drbd_disk(libxl__checkpoint_devices_state *cds);
 
 typedef void libxl__checkpoint_callback(libxl__egc *,
-   libxl__checkpoint_devices_state *, int rc);
+libxl__checkpoint_devices_state *,
+int rc);
 
 /*
  * State associated with a checkpoint invocation, including parameters
@@ -2741,7 +2743,7 @@ typedef void libxl__checkpoint_callback(libxl__egc *,
  * save/restore machinery.
  */
 struct libxl__checkpoint_devices_state {
-/* must be set by caller of libxl__checkpoint_device_(setup|teardown) 
*/
+/*-- must be set by caller of libxl__checkpoint_device_(setup|teardown) 
--*/
 
 libxl__ao *ao;
 uint32_t domid;
@@ -2754,7 +2756,8 @@ struct libxl__checkpoint_devices_state {
 /*
  * this array is allocated before setup the checkpoint devices by the
  * checkpoint abstract layer.
- * devs may be NULL, means there's no checkpoint devices that has been set 
up.
+ * devs may be NULL, means there's no checkpoint devices that has been
+ * set up.
 

[Xen-devel] [PATCH v1 COLO Pre 06/12] tools/libxl: Update libxl__domain_unpause() to support qemu-xen

2015-06-02 Thread Yang Hongyang
Currently, libxl__domain_unpause() only supports
qemu-xen-traditional. Update it to support qemu-xen.

Signed-off-by: Wen Congyang we...@cn.fujitsu.com
Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com
---
 tools/libxl/libxl.c | 42 +-
 1 file changed, 33 insertions(+), 9 deletions(-)

diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
index 98f2a4f..b960984 100644
--- a/tools/libxl/libxl.c
+++ b/tools/libxl/libxl.c
@@ -966,10 +966,37 @@ out:
 return AO_INPROGRESS;
 }
 
-int libxl__domain_unpause(libxl__gc *gc, uint32_t domid)
+static int libxl__domain_unpause_device_model(libxl__gc *gc, uint32_t domid)
 {
 char *path;
 char *state;
+
+    switch (libxl__device_model_version_running(gc, domid)) {
+    case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL: {
+        uint32_t dm_domid = libxl_get_stubdom_id(CTX, domid);
+
+        path = libxl__device_model_xs_path(gc, dm_domid, domid, "/state");
+        state = libxl__xs_read(gc, XBT_NULL, path);
+        if (state != NULL && !strcmp(state, "paused")) {
+            libxl__qemu_traditional_cmd(gc, domid, "continue");
+            libxl__wait_for_device_model_deprecated(gc, domid, "running",
+                                                    NULL, NULL, NULL);
+        }
+        break;
+    }
+    case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN:
+        if (libxl__qmp_resume(gc, domid))
+            return ERROR_FAIL;
+        break;
+    default:
+        return ERROR_INVAL;
+    }
+
+    return 0;
+}
+
+int libxl__domain_unpause(libxl__gc *gc, uint32_t domid)
+{
 int ret, rc = 0;
 
 libxl_domain_type type = libxl__domain_type(gc, domid);
@@ -979,14 +1006,11 @@ int libxl__domain_unpause(libxl__gc *gc, uint32_t domid)
 }
 
 if (type == LIBXL_DOMAIN_TYPE_HVM) {
-uint32_t dm_domid = libxl_get_stubdom_id(CTX, domid);
-
-path = libxl__device_model_xs_path(gc, dm_domid, domid, /state);
-state = libxl__xs_read(gc, XBT_NULL, path);
-if (state != NULL  !strcmp(state, paused)) {
-libxl__qemu_traditional_cmd(gc, domid, continue);
-libxl__wait_for_device_model_deprecated(gc, domid, running,
- NULL, NULL, NULL);
+        rc = libxl__domain_unpause_device_model(gc, domid);
+        if (rc < 0) {
+            LOG(ERROR, "failed to unpause device model for domain %u:%d",
+                domid, rc);
+            goto out;
 }
 }
 
-- 
1.9.1


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v1 COLO Pre 05/12] tools/libxl: Introduce a new internal API libxl__domain_unpause()

2015-06-02 Thread Yang Hongyang
From: Wen Congyang we...@cn.fujitsu.com

The guest is paused after libxl_domain_create_restore().
Secondary vm is running in colo mode. So we need to unpause
the guest. The current API libxl_domain_unpause() is
not an internal API. Introduce a new API to support it.
No functional change.

Signed-off-by: Wen Congyang we...@cn.fujitsu.com
Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com
---
 tools/libxl/libxl.c  | 20 ++--
 tools/libxl/libxl_internal.h |  1 +
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
index 5b58312..98f2a4f 100644
--- a/tools/libxl/libxl.c
+++ b/tools/libxl/libxl.c
@@ -966,9 +966,8 @@ out:
 return AO_INPROGRESS;
 }
 
-int libxl_domain_unpause(libxl_ctx *ctx, uint32_t domid)
+int libxl__domain_unpause(libxl__gc *gc, uint32_t domid)
 {
-GC_INIT(ctx);
 char *path;
 char *state;
 int ret, rc = 0;
@@ -980,7 +979,7 @@ int libxl_domain_unpause(libxl_ctx *ctx, uint32_t domid)
 }
 
 if (type == LIBXL_DOMAIN_TYPE_HVM) {
-        uint32_t dm_domid = libxl_get_stubdom_id(ctx, domid);
+        uint32_t dm_domid = libxl_get_stubdom_id(CTX, domid);
 
         path = libxl__device_model_xs_path(gc, dm_domid, domid, "/state");
         state = libxl__xs_read(gc, XBT_NULL, path);
@@ -990,12 +989,21 @@ int libxl_domain_unpause(libxl_ctx *ctx, uint32_t domid)
  NULL, NULL, NULL);
 }
 }
-    ret = xc_domain_unpause(ctx->xch, domid);
-    if (ret<0) {
-        LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "unpausing domain %d", domid);
+
+    ret = xc_domain_unpause(CTX->xch, domid);
+    if (ret < 0) {
+        LIBXL__LOG_ERRNO(CTX, LIBXL__LOG_ERROR, "unpausing domain %d", domid);
 rc = ERROR_FAIL;
 }
  out:
+return rc;
+}
+
+int libxl_domain_unpause(libxl_ctx *ctx, uint32_t domid)
+{
+GC_INIT(ctx);
+int rc = libxl__domain_unpause(gc, domid);
+
 GC_FREE;
 return rc;
 }
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 71728ff..b9c93aa 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -1043,6 +1043,7 @@ _hidden int libxl__userdata_store(libxl__gc *gc, uint32_t 
domid,
 _hidden int libxl__domain_restore(libxl__gc *gc, uint32_t domid);
 _hidden int libxl__domain_resume(libxl__gc *gc, uint32_t domid,
  int suspend_cancel);
+_hidden int libxl__domain_unpause(libxl__gc *gc, uint32_t domid);
 
 /* returns 0 or 1, or a libxl error code */
 _hidden int libxl__domain_pvcontrol_available(libxl__gc *gc, uint32_t domid);
-- 
1.9.1


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v1 COLO Pre 00/12] Prerequisite patches for COLO

2015-06-02 Thread Yang Hongyang
This patchset is Prerequisite for COLO feature. For what COLO is, refer
to http://wiki.xen.org/wiki/COLO_-_Coarse_Grain_Lock_Stepping

This patchset is based on:
[PATCH v1 0/5] Misc cleanups for libxl 
http://lists.xenproject.org/archives/html/xen-devel/2015-05/msg02591.html

and is taken from previous sent RFC v5 COLO patches.

You can also get the patchset from:
https://github.com/macrosheep/xen/tree/pre-colo-v1

Wen Congyang (4):
  tools/libxc: support to resume uncooperative HVM guests
  tools/libxl: Introduce a new internal API libxl__domain_unpause()
  tools/libxl: Update libxl_save_msgs_gen.pl to support return data from
xl to xc
  tools/libxl: Add back channel to allow migration target send data back

Yang Hongyang (8):
  libxc/restore: zero ioreq page only one time
  tools/libxc: export xc_bitops.h
  tools/libxl: introduce a new API libxl__domain_restore() to load qemu
state
  tools/libxl: Update libxl__domain_unpause() to support qemu-xen
  tools/libxl: introduce libxl__domain_common_switch_qemu_logdirty()
  tools/libxl: rename remus device to checkpoint device
  tools/libxl: adjust the indentation
  tools/libxl: don't touch remus in checkpoint_device

 tools/libxc/include/xc_bitops.h   |  76 
 tools/libxc/xc_bitops.h   |  76 
 tools/libxc/xc_resume.c   |  22 ++-
 tools/libxc/xc_sr_restore_x86_hvm.c   |   3 +-
 tools/libxl/Makefile  |   5 +-
 tools/libxl/libxl.c   |  62 +--
 tools/libxl/libxl_checkpoint_device.c | 282 +
 tools/libxl/libxl_create.c|  14 +-
 tools/libxl/libxl_dom_restore.c   |  76 
 tools/libxl/libxl_dom_save.c  |  81 +
 tools/libxl/libxl_internal.h  | 171 ++
 tools/libxl/libxl_netbuffer.c | 117 ++--
 tools/libxl/libxl_nonetbuffer.c   |  10 +-
 tools/libxl/libxl_qmp.c   |  10 ++
 tools/libxl/libxl_remus.c | 145 ++-
 tools/libxl/libxl_remus_device.c  | 327 --
 tools/libxl/libxl_remus_disk_drbd.c   |  56 +++---
 tools/libxl/libxl_save_callout.c  |  31 
 tools/libxl/libxl_save_helper.c   |  17 ++
 tools/libxl/libxl_save_msgs_gen.pl|  65 ++-
 tools/libxl/libxl_types.idl   |  11 +-
 tools/libxl/xl_cmdimpl.c  |   7 +
 22 files changed, 991 insertions(+), 673 deletions(-)
 create mode 100644 tools/libxc/include/xc_bitops.h
 delete mode 100644 tools/libxc/xc_bitops.h
 create mode 100644 tools/libxl/libxl_checkpoint_device.c
 create mode 100644 tools/libxl/libxl_dom_restore.c
 delete mode 100644 tools/libxl/libxl_remus_device.c

-- 
1.9.1


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v1 COLO Pre 07/12] tools/libxl: introduce libxl__domain_common_switch_qemu_logdirty()

2015-06-02 Thread Yang Hongyang
Secondary vm is running in colo mode, we need to send
secondary vm's dirty page information to master at checkpoint,
so we have to enable qemu logdirty on secondary.

libxl__domain_suspend_common_switch_qemu_logdirty() is to enable
qemu logdirty. But it uses domain_save_state, and calls
libxl__xc_domain_saverestore_async_callback_done()
before exits. This can not be used for secondary vm.

Update libxl__domain_suspend_common_switch_qemu_logdirty() to
introduce a new API libxl__domain_common_switch_qemu_logdirty().
This API only uses libxl__logdirty_switch, and calls
lds-callback before exits.

Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com
Signed-off-by: Wen Congyang we...@cn.fujitsu.com
---
 tools/libxl/libxl_dom_save.c | 78 ++--
 tools/libxl/libxl_internal.h |  8 +
 2 files changed, 54 insertions(+), 32 deletions(-)

diff --git a/tools/libxl/libxl_dom_save.c b/tools/libxl/libxl_dom_save.c
index c98f1dd..d34eeab 100644
--- a/tools/libxl/libxl_dom_save.c
+++ b/tools/libxl/libxl_dom_save.c
@@ -32,7 +32,7 @@ static void switch_logdirty_timeout(libxl__egc *egc, 
libxl__ev_time *ev,
 static void switch_logdirty_xswatch(libxl__egc *egc, libxl__ev_xswatch*,
 const char *watch_path, const char *event_path);
 static void switch_logdirty_done(libxl__egc *egc,
- libxl__domain_save_state *dss, int ok);
+ libxl__logdirty_switch *lds, int ok);
 
 static void logdirty_init(libxl__logdirty_switch *lds)
 {
@@ -42,13 +42,10 @@ static void logdirty_init(libxl__logdirty_switch *lds)
 }
 
 static void domain_suspend_switch_qemu_xen_traditional_logdirty
-                               (int domid, unsigned enable,
-                                libxl__save_helper_state *shs)
+                               (libxl__egc *egc, int domid, unsigned enable,
+                                libxl__logdirty_switch *lds)
 {
-    libxl__egc *egc = shs->egc;
-    libxl__domain_save_state *dss = CONTAINER_OF(shs, *dss, shs);
-    libxl__logdirty_switch *lds = &dss->logdirty;
-    STATE_AO_GC(dss->ao);
+    STATE_AO_GC(lds->ao);
 int rc;
 xs_transaction_t t = 0;
 const char *got;
@@ -110,64 +107,81 @@ static void 
domain_suspend_switch_qemu_xen_traditional_logdirty
  out:
 LOG(ERROR,logdirty switch failed (rc=%d), aborting suspend,rc);
 libxl__xs_transaction_abort(gc, t);
-switch_logdirty_done(egc,dss,-1);
+switch_logdirty_done(egc,lds,-1);
 }
 
 static void domain_suspend_switch_qemu_xen_logdirty
-   (int domid, unsigned enable,
-libxl__save_helper_state *shs)
+   (libxl__egc *egc, int domid, unsigned enable,
+libxl__logdirty_switch *lds)
 {
-libxl__egc *egc = shs-egc;
-libxl__domain_save_state *dss = CONTAINER_OF(shs, *dss, shs);
-STATE_AO_GC(dss-ao);
+STATE_AO_GC(lds-ao);
 int rc;
 
 rc = libxl__qmp_set_global_dirty_log(gc, domid, enable);
 if (!rc) {
-libxl__xc_domain_saverestore_async_callback_done(egc, shs, 0);
+lds-callback(egc, lds, 0);
 } else {
 LOG(ERROR,logdirty switch failed (rc=%d), aborting suspend,rc);
-libxl__xc_domain_saverestore_async_callback_done(egc, shs, -1);
+lds-callback(egc, lds, -1);
 }
 }
 
+static void libxl__domain_suspend_switch_qemu_logdirty_done
+(libxl__egc *egc, libxl__logdirty_switch *lds, int rc)
+{
+libxl__domain_save_state *dss = CONTAINER_OF(lds, *dss, logdirty);
+
+libxl__xc_domain_saverestore_async_callback_done(egc, dss-shs, rc);
+}
+
 void libxl__domain_suspend_common_switch_qemu_logdirty
(int domid, unsigned enable, void *user)
 {
 libxl__save_helper_state *shs = user;
 libxl__egc *egc = shs-egc;
 libxl__domain_save_state *dss = CONTAINER_OF(shs, *dss, shs);
-STATE_AO_GC(dss-ao);
+
+/* convenience aliases */
+libxl__logdirty_switch *const lds = dss-logdirty;
+
+lds-callback = libxl__domain_suspend_switch_qemu_logdirty_done;
+libxl__domain_common_switch_qemu_logdirty(egc, domid, enable, lds);
+}
+
+void libxl__domain_common_switch_qemu_logdirty(libxl__egc *egc,
+   int domid, unsigned enable,
+   libxl__logdirty_switch *lds)
+{
+STATE_AO_GC(lds-ao);
 
 switch (libxl__device_model_version_running(gc, domid)) {
 case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL:
-domain_suspend_switch_qemu_xen_traditional_logdirty(domid, enable, 
shs);
+domain_suspend_switch_qemu_xen_traditional_logdirty(egc, domid, enable,
+lds);
 break;
 case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN:
-domain_suspend_switch_qemu_xen_logdirty(domid, enable, shs);
+

[Xen-devel] [PATCH v1 COLO Pre 01/12] tools/libxc: support to resume uncooperative HVM guests

2015-06-02 Thread Yang Hongyang
From: Wen Congyang we...@cn.fujitsu.com

For PVHVM, the hypercall return code is 0, and it can be resumed
in a new domain context.
We suspend PVHVM and resume it like this:
1. suspend it via evtchn
2. modify the return code to 1
3. the guest knows that the suspend is cancelled, and we will use the
   fast path to resume it.

Under COLO, we will update the guest's state(modify memory, cpu's registers,
device status...). In this case, we cannot use the fast path to resume it.
Keep the return code 0, and use a slow path to resume the guest. We have
updated the guest state, so we call it a new domain context.

For HVM, the hypercall is a NOP.

Signed-off-by: Wen Congyang we...@cn.fujitsu.com
Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com
---
 tools/libxc/xc_resume.c | 22 ++
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/tools/libxc/xc_resume.c b/tools/libxc/xc_resume.c
index e67bebd..bd82334 100644
--- a/tools/libxc/xc_resume.c
+++ b/tools/libxc/xc_resume.c
@@ -109,6 +109,23 @@ static int xc_domain_resume_cooperative(xc_interface *xch, 
uint32_t domid)
 return do_domctl(xch, domctl);
 }
 
+static int xc_domain_resume_hvm(xc_interface *xch, uint32_t domid)
+{
+DECLARE_DOMCTL;
+
+/*
+ * If it is PVHVM, the hypercall return code is 0, because this
+ * is not a fast path resume, we do not modify_returncode as in
+ * xc_domain_resume_cooperative.
+ * (resuming it in a new domain context)
+ *
+ * If it is a HVM, the hypercall is a NOP.
+ */
+domctl.cmd = XEN_DOMCTL_resumedomain;
+domctl.domain = domid;
+return do_domctl(xch, domctl);
+}
+
 static int xc_domain_resume_any(xc_interface *xch, uint32_t domid)
 {
 DECLARE_DOMCTL;
@@ -138,10 +155,7 @@ static int xc_domain_resume_any(xc_interface *xch, 
uint32_t domid)
  */
 #if defined(__i386__) || defined(__x86_64__)
 if ( info.hvm )
-{
-ERROR(Cannot resume uncooperative HVM guests);
-return rc;
-}
+return xc_domain_resume_hvm(xch, domid);
 
 if ( xc_domain_get_guest_width(xch, domid, dinfo-guest_width) != 0 )
 {
-- 
1.9.1


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v1 COLO Pre 09/12] tools/libxl: Add back channel to allow migration target send data back

2015-06-02 Thread Yang Hongyang
From: Wen Congyang we...@cn.fujitsu.com

In colo mode, the slave needs to send data to the master, but the io_fd
can only be written on the master, and only read on the slave.
Save recv_fd in domain_suspend_state, and send_fd in
domain_create_state.

Signed-off-by: Wen Congyang we...@cn.fujitsu.com
Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com
---
 tools/libxl/libxl.c  |  2 +-
 tools/libxl/libxl_create.c   | 14 ++
 tools/libxl/libxl_internal.h |  2 ++
 tools/libxl/libxl_types.idl  |  7 +++
 tools/libxl/xl_cmdimpl.c |  7 +++
 5 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
index b960984..8b15be6 100644
--- a/tools/libxl/libxl.c
+++ b/tools/libxl/libxl.c
@@ -865,7 +865,7 @@ int libxl_domain_remus_start(libxl_ctx *ctx, 
libxl_domain_remus_info *info,
 dss-callback = remus_failover_cb;
 dss-domid = domid;
 dss-fd = send_fd;
-/* TODO do something with recv_fd */
+dss-recv_fd = recv_fd;
 dss-type = type;
 dss-live = 1;
 dss-debug = 0;
diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index 86384d2..bd8149c 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -1577,8 +1577,8 @@ static void domain_create_cb(libxl__egc *egc,
  int rc, uint32_t domid);
 
 static int do_domain_create(libxl_ctx *ctx, libxl_domain_config *d_config,
-uint32_t *domid,
-int restore_fd, int checkpointed_stream,
+uint32_t *domid, int restore_fd,
+int send_fd, int checkpointed_stream,
 const libxl_asyncop_how *ao_how,
 const libxl_asyncprogress_how *aop_console_how)
 {
@@ -1591,6 +1591,7 @@ static int do_domain_create(libxl_ctx *ctx, 
libxl_domain_config *d_config,
 libxl_domain_config_init(cdcs-dcs.guest_config_saved);
 libxl_domain_config_copy(ctx, cdcs-dcs.guest_config_saved, d_config);
 cdcs-dcs.restore_fd = restore_fd;
+cdcs-dcs.send_fd = send_fd;
 cdcs-dcs.callback = domain_create_cb;
 cdcs-dcs.checkpointed_stream = checkpointed_stream;
 libxl__ao_progress_gethow(cdcs-dcs.aop_console_how, aop_console_how);
@@ -1619,7 +1620,7 @@ int libxl_domain_create_new(libxl_ctx *ctx, 
libxl_domain_config *d_config,
 const libxl_asyncop_how *ao_how,
 const libxl_asyncprogress_how *aop_console_how)
 {
-return do_domain_create(ctx, d_config, domid, -1, 0,
+return do_domain_create(ctx, d_config, domid, -1, -1, 0,
 ao_how, aop_console_how);
 }
 
@@ -1629,7 +1630,12 @@ int libxl_domain_create_restore(libxl_ctx *ctx, 
libxl_domain_config *d_config,
 const libxl_asyncop_how *ao_how,
 const libxl_asyncprogress_how *aop_console_how)
 {
-return do_domain_create(ctx, d_config, domid, restore_fd,
+int send_fd = -1;
+
+if (params-checkpointed_stream == LIBXL_CHECKPOINTED_STREAM_COLO)
+send_fd = params-send_fd;
+
+return do_domain_create(ctx, d_config, domid, restore_fd, send_fd,
 params-checkpointed_stream, ao_how, 
aop_console_how);
 }
 
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 8b60fef..9cd976f 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -2873,6 +2873,7 @@ struct libxl__domain_save_state {
 
 uint32_t domid;
 int fd;
+int recv_fd;
 libxl_domain_type type;
 int live;
 int debug;
@@ -3142,6 +3143,7 @@ struct libxl__domain_create_state {
 libxl_domain_config *guest_config;
 libxl_domain_config guest_config_saved; /* vanilla config */
 int restore_fd;
+int send_fd;
 libxl__domain_create_cb *callback;
 libxl_asyncprogress_how aop_console_how;
 /* private to domain_create */
diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
index 23f27d4..8a3d7ba 100644
--- a/tools/libxl/libxl_types.idl
+++ b/tools/libxl/libxl_types.idl
@@ -198,6 +198,12 @@ libxl_viridian_enlightenment = 
Enumeration(viridian_enlightenment, [
 (3, reference_tsc),
 ])
 
+libxl_checkpointed_stream = Enumeration(checkpointed_stream, [
+(0, NONE),
+(1, REMUS),
+(2, COLO),
+], init_val = 0)
+
 #
 # Complex libxl types
 #
@@ -346,6 +352,7 @@ libxl_domain_create_info = Struct(domain_create_info,[
 
 libxl_domain_restore_params = Struct(domain_restore_params, [
 (checkpointed_stream, integer),
+(send_fd, integer),
 ])
 
 libxl_domain_sched_params = Struct(domain_sched_params,[
diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
index c858068..adfadd1 100644
--- a/tools/libxl/xl_cmdimpl.c
+++ b/tools/libxl/xl_cmdimpl.c
@@ -154,6 +154,7 @@ struct domain_create {
 const char *extra_config; /* extra config string */
 const char 

Re: [Xen-devel] [PATCH v8 01/13] x86: add socket_cpumask

2015-06-02 Thread Jan Beulich
 On 02.06.15 at 08:35, chao.p.p...@linux.intel.com wrote:
 On Fri, May 29, 2015 at 09:52:03AM +0100, Jan Beulich wrote:
  On 29.05.15 at 10:28, chao.p.p...@linux.intel.com wrote:
  On Fri, May 29, 2015 at 09:01:53AM +0100, Jan Beulich wrote:
   On 29.05.15 at 04:35, chao.p.p...@linux.intel.com wrote:
   On Thu, May 28, 2015 at 01:38:05PM +0100, Jan Beulich wrote:
On 21.05.15 at 10:41, chao.p.p...@linux.intel.com wrote:
--- a/xen/arch/x86/mpparse.c
+++ b/xen/arch/x86/mpparse.c
@@ -87,6 +87,18 @@ void __init set_nr_cpu_ids(unsigned int max_cpus)
 #endif
 }
 
+void __init set_nr_sockets(void)
+{
+unsigned int cpus = bitmap_weight(phys_cpu_present_map.mask,
+  boot_cpu_data.x86_max_cores *
+  
boot_cpu_data.x86_num_siblings);
+
+if ( cpus == 0 )
+cpus = 1;
+
+nr_sockets = DIV_ROUND_UP(num_processors + disabled_cpus, cpus);
+}
   
   Is there a reason why this can't just be added to the end of the
   immediately preceding set_nr_cpu_ids()?
   
   You mean the declaration or invocation? If the former I have no special
   reason for it (e.g. I can change it).
  
  Neither - I just don't see the need for a new function.
  
  In which case the invocation of set_nr_cpu_ids() should move to the
  place where now set_nr_sockets() is invoked, to make sure
  boot_cpu_data.x86_max_cores/x86_num_siblings available, which may not be
  your expectation.
 
 Ah, in which case this _is_ the explanation, albeit only provided the
 use of the two boot_cpu_data fields has to remain (which I had put
 under question). And if these have to remain, couldn't this be done
 in a presmp initcall instead of an explicitly called function?
 
 presmp is too late. nr_sockets will get used in smp_prepare_cpus()
 before calling set_cpu_sibling_map for cpu 0.

Okay. In which case - why not calculate the value there?

Jan


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [RESEND] nested EPT: fix the handling of nested EPT.

2015-06-02 Thread Liang Li
If the host EPT entry is changed, the nested EPT should be updated.
The current code does not do this, and it's wrong.

Reported-by: Tim Deegan t...@xen.org
Signed-off-by: Liang Li liang.z...@intel.com
Signed-off-by: Yang Zhang yang.z.zh...@intel.com
---
 xen/arch/x86/mm/p2m-ept.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index 5133eb6..26293a0 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -26,6 +26,7 @@
 #include asm/p2m.h
 #include asm/hvm/vmx/vmx.h
 #include asm/hvm/vmx/vmcs.h
+#include asm/hvm/nestedhvm.h
 #include xen/iommu.h
 #include asm/mtrr.h
 #include asm/hvm/cacheattr.h
@@ -1076,6 +1077,9 @@ void ept_sync_domain(struct p2m_domain *p2m)
 
 ASSERT(local_irq_is_enabled());
 
+if ( nestedhvm_enabled(d) )
+p2m_flush_nestedp2m(d);
+
 /*
  * Flush active cpus synchronously. Flush others the next time this domain
  * is scheduled onto them. We accept the race of other CPUs adding to
-- 
1.9.1


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [linux-next test] 57731: regressions - trouble: blocked/broken/fail/pass

2015-06-02 Thread osstest service user
flight 57731 linux-next real [real]
http://logs.test-lab.xenproject.org/osstest/logs/57731/

Regressions :-(

Tests which did not succeed and are blocking,
including tests which could not be run:
 test-amd64-i386-xl-qemut-debianhvm-amd64  6 xen-boot  fail REGR. vs. 57516
 test-armhf-armhf-xl-xsm   9 debian-installfail REGR. vs. 57516
 build-armhf-libvirt   4 host-build-prep   fail REGR. vs. 57516

Regressions which are regarded as allowable (not blocking):
 test-amd64-amd64-libvirt 11 guest-start  fail   like 57516
 test-amd64-i386-freebsd10-amd64  9 freebsd-install fail like 57516
 test-amd64-i386-freebsd10-i386  9 freebsd-install  fail like 57516
 test-amd64-i386-xl-qemut-win7-amd64 16 guest-stop  fail like 57516
 test-amd64-i386-xl-qemuu-win7-amd64 16 guest-stop  fail like 57516
 test-amd64-amd64-xl-qemuu-win7-amd64 16 guest-stop fail like 57516

Tests which did not succeed, but are not blocking:
 test-armhf-armhf-libvirt-xsm  1 build-check(1)   blocked  n/a
 test-armhf-armhf-libvirt  1 build-check(1)   blocked  n/a
 test-amd64-i386-xl-xsm   14 guest-localmigrate   fail   never pass
 test-amd64-amd64-xl-pvh-amd  11 guest-start  fail   never pass
 test-amd64-amd64-xl-pvh-intel 11 guest-start  fail  never pass
 test-amd64-amd64-xl-xsm  14 guest-localmigrate   fail   never pass
 test-amd64-i386-xl-qemut-debianhvm-amd64-xsm 12 guest-localmigrate fail never 
pass
 test-amd64-amd64-xl-qemut-debianhvm-amd64-xsm 12 guest-localmigrate fail never 
pass
 test-amd64-amd64-xl-qemuu-debianhvm-amd64-xsm 12 guest-localmigrate fail never 
pass
 test-amd64-i386-libvirt-xsm  12 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt-xsm 12 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt  12 migrate-support-checkfail   never pass
 test-amd64-i386-xl-qemuu-debianhvm-amd64-xsm 12 guest-localmigrate fail never 
pass
 test-armhf-armhf-xl-arndale  12 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  12 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  12 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-cubietruck 12 migrate-support-checkfail never pass
 test-amd64-amd64-xl-qemut-win7-amd64 16 guest-stop fail never pass
 test-armhf-armhf-xl-sedf-pin 12 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-sedf 12 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-multivcpu 12 migrate-support-checkfail  never pass

version targeted for testing:
 linux7732a9817fb01002bde7615066e86c156fb5a31b
baseline version:
 linuxace6a22a9fbcdba0ccf190f97b82a79ef2f44aeb

jobs:
 build-amd64-xsm  pass
 build-armhf-xsm  pass
 build-i386-xsm   pass
 build-amd64  pass
 build-armhf  pass
 build-i386   pass
 build-amd64-libvirt  pass
 build-armhf-libvirt  broken  
 build-i386-libvirt   pass
 build-amd64-pvopspass
 build-armhf-pvopspass
 build-i386-pvops pass
 build-amd64-rumpuserxen  pass
 build-i386-rumpuserxen   pass
 test-amd64-amd64-xl  pass
 test-armhf-armhf-xl  pass
 test-amd64-i386-xl   pass
 test-amd64-amd64-xl-qemut-debianhvm-amd64-xsmfail
 test-amd64-i386-xl-qemut-debianhvm-amd64-xsm fail
 test-amd64-amd64-xl-qemuu-debianhvm-amd64-xsmfail
 test-amd64-i386-xl-qemuu-debianhvm-amd64-xsm fail
 test-amd64-amd64-libvirt-xsm pass
 test-armhf-armhf-libvirt-xsm blocked 
 test-amd64-i386-libvirt-xsm  pass
 test-amd64-amd64-xl-xsm  fail
 test-armhf-armhf-xl-xsm  fail
 test-amd64-i386-xl-xsm   fail
 test-amd64-amd64-xl-pvh-amd  fail
 test-amd64-i386-qemut-rhel6hvm-amd   pass
 test-amd64-i386-qemuu-rhel6hvm-amd  

Re: [Xen-devel] [Draft C] Xen on ARM vITS Handling

2015-06-02 Thread Ian Campbell
On Mon, 2015-06-01 at 16:29 +0100, Julien Grall wrote:
 On 01/06/15 14:12, Ian Campbell wrote:
  On Fri, 2015-05-29 at 14:40 +0100, Julien Grall wrote:
  Hi Ian,
 
 Hi Ian,
 
  NIT: You used my Linaro email which I think is de-activated now :).
  
  I keep finding new address books with that address  in them!
  
  ## ITS Translation Table
 
  Message signalled interrupts are translated into an LPI via an ITS
  translation table which must be configured for each device which can
  generate an MSI.
 
  I'm not sure what is the ITS Table Table. Did you mean Interrupt
  Translation Table?
  
  I don't think I wrote Table Table anywhere.
 
 Sorry I meant ITS translation table
 
  I'm referring to the tables which are established by e.g. the MAPD
  command and friends, e.g. the thing shown in 4.9.12 Notional ITS Table
  Structure.
 
 On previous paragraph you are referring particularly to Interrupt
 Translation Table. This is the only table that is configured per device.

I'm afraid I'm still not getting your point. Please quote the exact text
which you think is wrong and if possible suggest an alternative.

 [..]
 
  XXX there are other aspects to virtualising the ITS (LPI collection
  management, assignment of LPI ranges to guests, device
  management). However these are not currently considered here. XXX
  Should they be/do they need to be?
 
  I think we began to cover these aspect with the section command 
  emulation.
  
  Some aspects, yes. I went with:
  
  There are other aspects to virtualising the ITS (LPI collection
  management, assignment of LPI ranges to guests, device
  management). However these are only considered here to the extent
  needed for describing the vITS emulation.
  
  XXX In the context of virtualised device ids this may not be the case,
  e.g. we can arrange for (mostly) contiguous device ids and we know the
  bound is significantly lower than 2^32
 
  Well, the deviceID is computed from the BDF and some DMA alias. As the
  algorithm can't be tweaked, it's very likely that we will have
  non-contiguous Device ID. See pci_for_each_dma_alias in Linux
  (drivers/pci/search.c).
  
  The implication here is that deviceID is fixed in hardware and is used
  by driver domain software in contexts where we do not get the
  opportunity to translate is that right? What contexts are those?
 
 No, the driver domain software will always use a virtual DeviceID (based
 on the vBDF and other things). The problem I wanted to raise is how to
 translate back the vDeviceID to a physical deviceID/BDF.

Right, so this goes back to my original point, which is that if we
completely control the translation from vDeviceID to pDeviceID/BDF then
the vDeviceId space need not be sparse and need not utilise the entire
2^32 space, at least for domU uses.

  Note that the BDF is also something which we could in principal
  virtualise (we already do for domU). Perhaps that is infeasible for dom0
  though?
 
 For DOM0 the virtual BDF is equal to the physical BDF. So the both
 deviceID (physical and virtual) will be the same.
 
 We may decide to do vBDF == pBDF for guest too in order to simplify the
 code.

It seems to me that choosing vBDF such that the vDeviceId space is to
our liking would be a good idea.

  That gives me two thoughts.
  
  The first is that although device identifiers are not necessarily
  contiguous, they are generally at least grouped and not allocated at
  random through the 2^32 options. For example a PCI Host bridge typically
  has a range of device ids associated with it and each device has a
  device id derived from that.
 
 Usually it's one per (device, function).

Yes, but my point is that they are generally grouped by bus. The bus is
assigned a (contiguous) range and individual (device,function)= device
id mappings are based on a formula applied to the base address.

i.e. for a given PCI bus the device ids are in the range 1000..1000+N,
not N random number selected from the 2^32 space.

 
  
  I'm not sure if we can leverage that into a more useful data structure
  than an R-B tree, or for example to arrange for the R-B to allow for the
  translation of a device within a span into the parent span and from
  there do the lookup. Specifically when looking up a device ID
  corresponding to a PCI device we could arrange to find the PCI host
  bridge and find the actual device from there. This would keep the RB
  tree much smaller and therefore perhaps quicker? Of course that depends
  on what the lookup from PCI host bridge to a device looked like.
 
 I'm not sure why you are speaking about PCI host bridge. AFAIK, the
 guest doesn't have a physical host bridge.

It has a virtual one provided by the pciif/pcifront+back thing. Any PCI
bus is behind some sort of host bridge, whether physical, virtual or
notional.

 Although, this is an optimization that we can think about it later. The
 R-B will already be fast enough for a first implementation. My main
 point 

Re: [Xen-devel] [Patch v3 27/36] x86, irq: Use access helper irq_data_get_affinity_mask()

2015-06-02 Thread Thomas Gleixner
On Mon, 1 Jun 2015, Jiang Liu wrote:

 diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
 index 9b62f690b0ff..dfa3a5f5b3d3 100644
 --- a/arch/x86/kernel/apic/vector.c
 +++ b/arch/x86/kernel/apic/vector.c
 @@ -494,9 +494,8 @@ static int apic_set_affinity(struct irq_data *irq_data,
  
   err = assign_irq_vector(irq, data, dest);
   if (err) {
 - struct irq_data *top = irq_get_irq_data(irq);
 -
 - if (assign_irq_vector(irq, data, top-affinity))
 + if (assign_irq_vector(irq, data,
 +   irq_data_get_affinity_mask(irq_data)))

Does this patch work w/o moving the affinity mask to common data? I
doubt so, as you remove the retrieval of 'top'.

Thanks,

tglx

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [xen-4.3-testing test] 57749: regressions - trouble: blocked/broken/fail/pass

2015-06-02 Thread osstest service user
flight 57749 xen-4.3-testing real [real]
http://logs.test-lab.xenproject.org/osstest/logs/57749/

Regressions :-(

Tests which did not succeed and are blocking,
including tests which could not be run:
 build-armhf-libvirt  3 host-install(3) broken in 57680 REGR. vs. 53768
 test-amd64-i386-xend-qemut-winxpsp3 16 guest-stop fail REGR. vs. 53768

Tests which are failing intermittently (not blocking):
 test-armhf-armhf-xl-sedf-pin  3 host-install(3)   broken pass in 57680

Regressions which are regarded as allowable (not blocking):
 test-amd64-i386-xl-qemuu-winxpsp3-vcpus1 15 guest-localmigrate/x10 fail in 
57680 like 53023
 test-amd64-amd64-xl-qemuu-win7-amd64 16 guest-stop fail like 53023

Tests which did not succeed, but are not blocking:
 test-armhf-armhf-libvirt  1 build-check(1)blocked in 57680 n/a
 test-amd64-amd64-rumpuserxen-amd64  1 build-check(1)   blocked n/a
 test-amd64-i386-rumpuserxen-i386  1 build-check(1)   blocked  n/a
 test-armhf-armhf-xl-sedf-pin  6 xen-boot  fail in 57680 never pass
 test-amd64-i386-xl-qemuu-ovmf-amd64  9 debian-hvm-install  fail never pass
 test-amd64-amd64-xl-qemuu-ovmf-amd64  9 debian-hvm-install fail never pass
 build-i386-rumpuserxen6 xen-buildfail   never pass
 build-amd64-rumpuserxen   6 xen-buildfail   never pass
 test-armhf-armhf-xl-multivcpu  6 xen-boot fail  never pass
 test-armhf-armhf-xl   6 xen-boot fail   never pass
 test-armhf-armhf-xl-sedf  6 xen-boot fail   never pass
 test-amd64-i386-libvirt  12 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit2   6 xen-boot fail   never pass
 test-amd64-amd64-libvirt 12 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-cubietruck  6 xen-boot fail never pass
 test-amd64-i386-xl-qemut-win7-amd64 16 guest-stop  fail never pass
 test-amd64-amd64-xl-qemut-win7-amd64 16 guest-stop fail never pass
 test-amd64-i386-xl-qemuu-win7-amd64 16 guest-stop  fail never pass
 test-armhf-armhf-xl-arndale   6 xen-boot fail   never pass
 test-armhf-armhf-libvirt  6 xen-boot fail   never pass

version targeted for testing:
 xen  e580a92dd53dbba62518e17a3f4ebd57b626926c
baseline version:
 xen  58db71c5cdd48126b9380c230dc5b61554bad7d8


People who touched revisions under test:
  Ian Jackson ian.jack...@eu.citrix.com
  Petr Matousek pmato...@redhat.com


jobs:
 build-amd64  pass
 build-armhf  pass
 build-i386   pass
 build-amd64-libvirt  pass
 build-armhf-libvirt  pass
 build-i386-libvirt   pass
 build-amd64-pvopspass
 build-armhf-pvopspass
 build-i386-pvops pass
 build-amd64-rumpuserxen  fail
 build-i386-rumpuserxen   fail
 test-amd64-amd64-xl  pass
 test-armhf-armhf-xl  fail
 test-amd64-i386-xl   pass
 test-amd64-i386-qemut-rhel6hvm-amd   pass
 test-amd64-i386-qemuu-rhel6hvm-amd   pass
 test-amd64-amd64-xl-qemut-debianhvm-amd64pass
 test-amd64-i386-xl-qemut-debianhvm-amd64 pass
 test-amd64-amd64-xl-qemuu-debianhvm-amd64pass
 test-amd64-i386-xl-qemuu-debianhvm-amd64 pass
 test-amd64-i386-freebsd10-amd64  pass
 test-amd64-amd64-xl-qemuu-ovmf-amd64 fail
 test-amd64-i386-xl-qemuu-ovmf-amd64  fail
 test-amd64-amd64-rumpuserxen-amd64   blocked 
 test-amd64-amd64-xl-qemut-win7-amd64 fail
 test-amd64-i386-xl-qemut-win7-amd64  fail
 test-amd64-amd64-xl-qemuu-win7-amd64 fail
 test-amd64-i386-xl-qemuu-win7-amd64  fail
 test-armhf-armhf-xl-arndale  fail
 test-amd64-amd64-xl-credit2  pass
 test-armhf-armhf-xl-credit2  

[Xen-devel] [PATCH v7 1/4] pci: add PCI_SBDF and PCI_SEG macros

2015-06-02 Thread elena . ufimtseva
From: Elena Ufimtseva elena.ufimts...@oracle.com

Signed-off-by: Elena Ufimtseva elena.ufimts...@oracle.com
---
 xen/include/xen/pci.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index 3908146..414106a 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -33,6 +33,8 @@
 #define PCI_DEVFN2(bdf) ((bdf)  0xff)
 #define PCI_BDF(b,d,f)  b)  0xff)  8) | PCI_DEVFN(d,f))
 #define PCI_BDF2(b,df)  b)  0xff)  8) | ((df)  0xff))
+#define PCI_SBDF(s,b,d,f) s)  0x)  16) | PCI_BDF(b,d,f))
+#define PCI_SEG(sbdf) (((sbdf)  16)  0x)
 
 struct pci_dev_info {
 bool_t is_extfn;
-- 
2.1.3


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v7 0/4] iommu: add rmrr Xen command line option

2015-06-02 Thread elena . ufimtseva
From: Elena Ufimtseva elena.ufimts...@oracle.com

v7 of rmrr comman line patches.
Thank you for comments on v6.
  
Add Xen command line option rmrr to specify RMRR
regions for devices that are not defined in ACPI thus   
causing IO Page Fault while booting dom0 in PVH mode.   
These additional regions will be added to the list of   
RMRR regions parsed from ACPI.  

Changes in v7:  
 - make sure RMRR ranges are being checked correctly;
 - don't interrupt RMRR checking if one of the checks fails; instead
 continue to the next RMRR;
 - make rmrr variable names more obvious;   
 - fix debug output formatting to match type of rmrr range; 
 - fix typos in rmrr command line document and in comments; 

Changes in v6:  
 - make __parse_pci return correct result and error codes;  
 - move add_extra_rmrr  
 - previous patch was missing RMRR addresses in range check, add it here;   
 - add overlap check and range boundaries check;
 - moved extra rmrr structure definition to dmar.c; 
 - change def_seg in __parse_pci type from int to bool_t;   
 - change name for extra rmrr range to reflect they hold now pfns;  

Changes in v5:  
 - make parse_pci a wrapper and add __parse_pci with additional def_seg param   
   to identify if segment was specified;
 - make possible not to define segment for each device within same rmrr;
 - limit number of pages for one RMRR by 16;
 - run mfn_valid check for every address in RMRR range; 
 - add PCI_SBDF macro;  
 - remove list for extra rmrrs as they are kept in static array;

Changes in v4 after comments by Jan Beulich:
 - keep sbdf per device instead of bdf and one segment per RMRR when parsing 
and compare later;
 - add check for segment values and make sure they are same for one RMRR;   
 - move RMRR parameters checks and add error messages if RMRRs are incorrect;   
 - make relevant variables and functions static;
 - mention requirement for segment values in rmrr documentation;  

Changes in v3:  
 - use ';' instead of '#' in command line and add proper notes for grub ';' 
 special treatment; 

Changes in v2:  
 - move rmrr parser to dmar.c and make it custom_param; 
 - change of rmrr command line oprion format; since adding multiple device  
 per range support needs to utilize more special characters and offered from
 the previous review ';' is not supported, '[' ']' are reserved, ':' and used 
in pci
 format, range and devices are separated by '#'; Suggestions are welcome;   
 - added support for multiple devices per range;
 - moved adding misc RMRRs before ACPI RMRR parsing;
 - make parser fail if pci device is specified incorrectly;

Elena Ufimtseva (4):
  pci: add PCI_SBDF and PCI_SEG macros
  iommu VT-d: separate rmrr addition function
  pci: add wrapper for parse_pci
  iommu: add rmrr Xen command line option for extra rmrrs

 docs/misc/xen-command-line.markdown |  12 ++
 xen/drivers/passthrough/vtd/dmar.c  | 313 +---
 xen/drivers/pci/pci.c   |  11 ++
 xen/include/xen/pci.h   |   5 +
 4 files changed, 279 insertions(+), 62 deletions(-)

-- 
2.1.3


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v3] dmar: device scope mem leak fix

2015-06-02 Thread elena . ufimtseva
From: Elena Ufimtseva elena.ufimts...@oracle.com

Third attempt to incorporate memory leak fix.
Thanks for comment on v2.

Release memory allocated for scope.devices when disabling
dmar units. Also set device count after memory allocation when
device scope parsing.

Changes in v3:
 - make freeing memory for scope devices and zeroing device counter
 a function and use it;
 - make sure parse_one_rmrr has memory leak fix in this patch;
 - make sure ret values are not lost in acpi_parse_one_drhd;

Changes in v2:
 - release memory for devices scope on error paths in acpi_parse_one_drhd
 and acpi_parse_one_atsr and set the count to zero;

Signed-off-by: Elena Ufimtseva elena.ufimts...@oracle.com
---
 xen/drivers/passthrough/vtd/dmar.c | 32 +---
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/xen/drivers/passthrough/vtd/dmar.c 
b/xen/drivers/passthrough/vtd/dmar.c
index 2b07be9..a675bf7 100644
--- a/xen/drivers/passthrough/vtd/dmar.c
+++ b/xen/drivers/passthrough/vtd/dmar.c
@@ -81,6 +81,12 @@ static int __init acpi_register_rmrr_unit(struct 
acpi_rmrr_unit *rmrr)
 return 0;
 }
 
+static void scope_devices_free(struct dmar_scope *scope)
+{
+scope-devices_cnt = 0;
+xfree(scope-devices);
+}
+
 static void __init disable_all_dmar_units(void)
 {
 struct acpi_drhd_unit *drhd, *_drhd;
@@ -90,16 +96,19 @@ static void __init disable_all_dmar_units(void)
 list_for_each_entry_safe ( drhd, _drhd, acpi_drhd_units, list )
 {
 list_del(drhd-list);
+scope_devices_free(drhd-scope);
 xfree(drhd);
 }
 list_for_each_entry_safe ( rmrr, _rmrr, acpi_rmrr_units, list )
 {
 list_del(rmrr-list);
+scope_devices_free(rmrr-scope);
 xfree(rmrr);
 }
 list_for_each_entry_safe ( atsr, _atsr, acpi_atsr_units, list )
 {
 list_del(atsr-list);
+scope_devices_free(atsr-scope);
 xfree(atsr);
 }
 }
@@ -318,13 +327,13 @@ static int __init acpi_parse_dev_scope(
 if ( (cnt = scope_device_count(start, end))  0 )
 return cnt;
 
-scope-devices_cnt = cnt;
 if ( cnt  0 )
 {
 scope-devices = xzalloc_array(u16, cnt);
 if ( !scope-devices )
 return -ENOMEM;
 }
+scope-devices_cnt = cnt;
 
 while ( start  end )
 {
@@ -427,7 +436,7 @@ static int __init acpi_parse_dev_scope(
 
  out:
 if ( ret )
-xfree(scope-devices);
+scope_devices_free(scope);
 
 return ret;
 }
@@ -476,11 +485,6 @@ acpi_parse_one_drhd(struct acpi_dmar_header *header)
 if ( ret )
 goto out;
 
-dev_scope_start = (void *)(drhd + 1);
-dev_scope_end = ((void *)drhd) + header-length;
-ret = acpi_parse_dev_scope(dev_scope_start, dev_scope_end,
-   dmaru-scope, DMAR_TYPE, drhd-segment);
-
 if ( dmaru-include_all )
 {
 if ( iommu_verbose )
@@ -495,7 +499,13 @@ acpi_parse_one_drhd(struct acpi_dmar_header *header)
 if ( drhd-segment == 0 )
 include_all = 1;
 }
+if ( ret )
+goto out;
 
+    dev_scope_start = (void *)(drhd + 1);
+    dev_scope_end = ((void *)drhd) + header->length;
+    ret = acpi_parse_dev_scope(dev_scope_start, dev_scope_end,
+                               &dmaru->scope, DMAR_TYPE, drhd->segment);
 if ( ret )
 goto out;
 else if ( force_iommu || dmaru-include_all )
@@ -542,6 +552,7 @@ acpi_parse_one_drhd(struct acpi_dmar_header *header)
   Workaround BIOS bug: ignore the DRHD due to all 
 devices under its scope are not PCI discoverable!\n);
 
+scope_devices_free(dmaru-scope);
 iommu_free(dmaru);
 xfree(dmaru);
 }
@@ -552,6 +563,7 @@ acpi_parse_one_drhd(struct acpi_dmar_header *header)
 its scope are not PCI discoverable! Pls try option 
 iommu=force or iommu=workaround_bios_bug if you 
 really want VT-d\n);
+scope_devices_free(dmaru-scope);
 ret = -EINVAL;
 }
 }
@@ -565,6 +577,7 @@ out:
 iommu_free(dmaru);
 xfree(dmaru);
 }
+
 return ret;
 }
 
@@ -658,6 +671,7 @@ acpi_parse_one_rmrr(struct acpi_dmar_header *header)
   Ignore the RMRR (%PRIx64, %PRIx64) due to 
 devices under its scope are not PCI discoverable!\n,
 rmrru-base_address, rmrru-end_address);
+scope_devices_free(rmrru-scope);
 xfree(rmrru);
 }
 else if ( base_addr  end_addr )
@@ -665,6 +679,7 @@ acpi_parse_one_rmrr(struct acpi_dmar_header *header)
 dprintk(XENLOG_WARNING VTDPREFIX,
   The RMRR (%PRIx64, %PRIx64) is incorrect!\n,
 rmrru-base_address, rmrru-end_address);
+scope_devices_free(rmrru-scope);
 xfree(rmrru);
 ret = -EFAULT;
 }
@@ -727,7 +742,10 @@ 

[Xen-devel] [seabios test] 57755: tolerable FAIL - PUSHED

2015-06-02 Thread osstest service user
flight 57755 seabios real [real]
http://logs.test-lab.xenproject.org/osstest/logs/57755/

Failures :-/ but no regressions.

Tests which did not succeed, but are not blocking:
 test-amd64-i386-xl-qemuu-debianhvm-amd64-xsm 12 guest-localmigrate fail never 
pass
 test-amd64-amd64-xl-qemuu-debianhvm-amd64-xsm 12 guest-localmigrate fail never 
pass
 test-amd64-amd64-xl-qemuu-win7-amd64 16 guest-stop fail never pass
 test-amd64-i386-xl-qemuu-win7-amd64 16 guest-stop  fail never pass

version targeted for testing:
 seabios  2aff1c10953bfb2f17b0702eb9e2962e1c78f3c9
baseline version:
 seabios  67643955c7467781c28c4da1669775d7564dc74a


People who touched revisions under test:
  Kevin O'Connor ke...@koconnor.net
  Stefan Berger stef...@linux.vnet.ibm.com
  Vladimir Serbinenko phco...@gmail.com


jobs:
 build-amd64-xsm  pass
 build-i386-xsm   pass
 build-amd64  pass
 build-i386   pass
 build-amd64-libvirt  pass
 build-i386-libvirt   pass
 build-amd64-pvopspass
 build-i386-pvops pass
 test-amd64-amd64-xl-qemuu-debianhvm-amd64-xsmfail
 test-amd64-i386-xl-qemuu-debianhvm-amd64-xsm fail
 test-amd64-i386-qemuu-rhel6hvm-amd   pass
 test-amd64-amd64-xl-qemuu-debianhvm-amd64pass
 test-amd64-i386-xl-qemuu-debianhvm-amd64 pass
 test-amd64-amd64-xl-qemuu-ovmf-amd64 pass
 test-amd64-i386-xl-qemuu-ovmf-amd64  pass
 test-amd64-amd64-xl-qemuu-win7-amd64 fail
 test-amd64-i386-xl-qemuu-win7-amd64  fail
 test-amd64-i386-qemuu-rhel6hvm-intel pass
 test-amd64-i386-xl-qemuu-winxpsp3-vcpus1 pass
 test-amd64-amd64-xl-qemuu-winxpsp3   pass
 test-amd64-i386-xl-qemuu-winxpsp3pass



sg-report-flight on osstest.test-lab.xenproject.org
logs: /home/logs/logs
images: /home/logs/images

Logs, config files, etc. are available at
http://logs.test-lab.xenproject.org/osstest/logs

Test harness code can be found at
http://xenbits.xen.org/gitweb?p=osstest.git;a=summary


Pushing revision :

+ branch=seabios
+ revision=2aff1c10953bfb2f17b0702eb9e2962e1c78f3c9
+ . cri-lock-repos
++ . cri-common
+++ . cri-getconfig
+++ umask 002
+++ getconfig Repos
+++ perl -e '
use Osstest;
readglobalconfig();
print $c{Repos} or die $!;
'
++ repos=/home/osstest/repos
++ repos_lock=/home/osstest/repos/lock
++ '[' x '!=' x/home/osstest/repos/lock ']'
++ OSSTEST_REPOS_LOCK_LOCKED=/home/osstest/repos/lock
++ exec with-lock-ex -w /home/osstest/repos/lock ./ap-push seabios 
2aff1c10953bfb2f17b0702eb9e2962e1c78f3c9
+ branch=seabios
+ revision=2aff1c10953bfb2f17b0702eb9e2962e1c78f3c9
+ . cri-lock-repos
++ . cri-common
+++ . cri-getconfig
+++ umask 002
+++ getconfig Repos
+++ perl -e '
use Osstest;
readglobalconfig();
print $c{Repos} or die $!;
'
++ repos=/home/osstest/repos
++ repos_lock=/home/osstest/repos/lock
++ '[' x/home/osstest/repos/lock '!=' x/home/osstest/repos/lock ']'
+ . cri-common
++ . cri-getconfig
++ umask 002
+ select_xenbranch
+ case $branch in
+ tree=seabios
+ xenbranch=xen-unstable
+ '[' xseabios = xlinux ']'
+ linuxbranch=
+ '[' x = x ']'
+ qemuubranch=qemu-upstream-unstable
+ : tested/2.6.39.x
+ . ap-common
++ : osst...@xenbits.xen.org
+++ getconfig OsstestUpstream
+++ perl -e '
use Osstest;
readglobalconfig();
print $c{OsstestUpstream} or die $!;
'
++ :
++ : git://xenbits.xen.org/xen.git
++ : osst...@xenbits.xen.org:/home/xen/git/xen.git
++ : git://xenbits.xen.org/staging/qemu-xen-unstable.git
++ : git://git.kernel.org
++ : git://git.kernel.org/pub/scm/linux/kernel/git
++ : git
++ : git://xenbits.xen.org/libvirt.git
++ : osst...@xenbits.xen.org:/home/xen/git/libvirt.git
++ : git://xenbits.xen.org/libvirt.git
++ : git://xenbits.xen.org/rumpuser-xen.git
++ : git
++ : git://xenbits.xen.org/rumpuser-xen.git
++ : osst...@xenbits.xen.org:/home/xen/git/rumpuser-xen.git
+++ besteffort_repo https://github.com/rumpkernel/rumpkernel-netbsd-src
+++ local repo=https://github.com/rumpkernel/rumpkernel-netbsd-src
+++ cached_repo 

Re: [Xen-devel] [PATCHv2] frontswap: allow multiple backends

2015-06-02 Thread Andrew Morton
On Mon,  1 Jun 2015 10:22:24 -0400 Dan Streetman ddstr...@ieee.org wrote:

 Change frontswap single pointer to a singly linked list of frontswap
 implementations.  Update Xen tmem implementation as register no longer
 returns anything.
 
 Frontswap only keeps track of a single implementation; any implementation
 that registers second (or later) will replace the previously registered
 implementation, and gets a pointer to the previous implementation that
 the new implementation is expected to pass all frontswap functions to
 if it can't handle the function itself.  However that method doesn't
 really make much sense, as passing that work on to every implementation
 adds unnecessary work to implementations; instead, frontswap should
 simply keep a list of all registered implementations and try each
 implementation for any function.  Most importantly, neither of the
 two currently existing frontswap implementations in the kernel actually
 do anything with any previous frontswap implementation that they
 replace when registering.
 
 This allows frontswap to successfully manage multiple implementations
 by keeping a list of them all.
 
 ...

 -struct frontswap_ops *frontswap_register_ops(struct frontswap_ops *ops)
 +void frontswap_register_ops(struct frontswap_ops *ops)
  {
 - struct frontswap_ops *old = frontswap_ops;
 - int i;
 -
 - for (i = 0; i  MAX_SWAPFILES; i++) {
 - if (test_and_clear_bit(i, need_init)) {
 - struct swap_info_struct *sis = swap_info[i];
 - /* __frontswap_init _should_ have set it! */
 - if (!sis-frontswap_map)
 - return ERR_PTR(-EINVAL);
 - ops-init(i);
 - }
 + DECLARE_BITMAP(a, MAX_SWAPFILES);
 + DECLARE_BITMAP(b, MAX_SWAPFILES);
 + struct swap_info_struct *si;
 + unsigned int i;
 +
 + spin_lock(swap_lock);
 + plist_for_each_entry(si, swap_active_head, list) {
 + if (!WARN_ON(!si-frontswap_map))
 + set_bit(si-type, a);

umm, DECLARE_BITMAP() doesn't initialise the storage.  Either this
patch wasn't tested very well or you should buy me a lottery ticket!

   }
 - /*
 -  * We MUST have frontswap_ops set _after_ the frontswap_init's
 -  * have been called. Otherwise __frontswap_store might fail. Hence
 -  * the barrier to make sure compiler does not re-order us.
 + spin_unlock(swap_lock);
 +
 + /* the new ops needs to know the currently active swap devices */
 + for_each_set_bit(i, a, MAX_SWAPFILES)
 + ops-init(i);
 +
 + /* setting frontswap_ops must happen after the ops-init() calls
 +  * above; cmpxchg implies smp_mb() which will ensure the init is
 +  * complete at this point
 +  */

Like this, please:

/*
 * Setting ...

and sentences start with capital letters ;)



___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v7 4/4] iommu: add rmrr Xen command line option for extra rmrrs

2015-06-02 Thread elena . ufimtseva
From: Elena Ufimtseva elena.ufimts...@oracle.com

From: Elena Ufimtseva elena.ufimts...@oracle.com

On some platforms RMRR regions may be not specified
in ACPI and thus will not be mapped 1:1 in dom0. This
causes IO Page Faults and prevents dom0 from booting
in PVH mode.
New Xen command line option rmrr allows to specify
such devices and memory regions. These regions are added
to the list of RMRR defined in ACPI if the device
is present in system. As a result, additional RMRRs will
be mapped 1:1 in dom0 with correct permissions.

Mentioned above problems were discovered during PVH work with
ThinkCentre M and Dell 5600T. No official documentation
was found so far in regards to what devices and why cause this.
Experiments show that ThinkCentre M USB devices with enabled
debug port generate DMA read transactions to the regions of
memory marked reserved in host e820 map.
For Dell 5600T the device and faulting addresses are not found yet.

For detailed history of the discussion please check following threads:
http://lists.Xen.org/archives/html/xen-devel/2015-02/msg01724.html
http://lists.Xen.org/archives/html/xen-devel/2015-01/msg02513.html

Format for rmrr Xen command line option:
rmrr=start-end=[s1]bdf1[,[s1]bdf2[,...]];start-end=[s2]bdf1[,[s2]bdf2[,...]]
If grub2 used and multiple ranges are specified, ';' should be
quoted/escaped, refer to grub2 manual for more information.

Signed-off-by: Elena Ufimtseva elena.ufimts...@oracle.com
---
 docs/misc/xen-command-line.markdown |  12 +++
 xen/drivers/passthrough/vtd/dmar.c  | 183 +++-
 2 files changed, 194 insertions(+), 1 deletion(-)

diff --git a/docs/misc/xen-command-line.markdown 
b/docs/misc/xen-command-line.markdown
index 4889e27..d2f0668 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -1185,6 +1185,18 @@ Specify the host reboot method.
 'efi' instructs Xen to reboot using the EFI reboot call (in EFI mode by
  default it will use that method first).
 
+### rmrr
+ '= 
start-end=[s1]bdf1[,[s1]bdf2[,...]];start-end=[s2]bdf1[,[s2]bdf2[,...]]
+
+Define RMRRs units that are missing from ACPI table along with device they
+belong to and use them for 1:1 mapping. End addresses can be omitted and one
+page will be mapped. The ranges are inclusive when start and end are specified.
+If segment of the first device is not specified, segment zero will be used.
+If other segments are not specified, first device segment will be used.
+If segments are specified for every device and not equal, an error will be 
reported.
+Note: grub2 requires to escape or use quotations if special characters are 
used,
+namely ';', refer to the grub2 documentation if multiple ranges are specified.
+
 ### ro-hpet
  `= boolean`
 
diff --git a/xen/drivers/passthrough/vtd/dmar.c 
b/xen/drivers/passthrough/vtd/dmar.c
index 5d78a37..857373f 100644
--- a/xen/drivers/passthrough/vtd/dmar.c
+++ b/xen/drivers/passthrough/vtd/dmar.c
@@ -869,6 +869,120 @@ out:
 return ret;
 }
 
+#define MAX_EXTRA_RMRR_PAGES 16
+#define MAX_EXTRA_RMRR 10
+
+/* RMRR units derived from command line rmrr option */
+#define MAX_EXTRA_RMRR_DEV 20
+struct extra_rmrr_unit {
+struct list_head list;
+unsigned long base_pfn, end_pfn;
+u16dev_count;
+u32sbdf[MAX_EXTRA_RMRR_DEV];
+};
+static __initdata unsigned int nr_rmrr;
+static struct __initdata extra_rmrr_unit extra_rmrr_units[MAX_EXTRA_RMRR];
+
+#define PRI_RMRRL "[%lx - %lx]"
+static void __init add_extra_rmrr(void)
+{
+struct acpi_rmrr_unit *acpi_rmrr;
+unsigned int dev, seg, i, j;
+unsigned long pfn;
+
+    for ( i = 0; i < nr_rmrr; i++ )
+    {
+        if ( extra_rmrr_units[i].base_pfn > extra_rmrr_units[i].end_pfn )
+        {
+            printk(XENLOG_ERR VTDPREFIX
+                   "Start pfn > end pfn for RMRR range "PRI_RMRRL"\n",
+                   extra_rmrr_units[i].base_pfn, extra_rmrr_units[i].end_pfn);
+continue;
+}
+
+        if ( extra_rmrr_units[i].end_pfn - extra_rmrr_units[i].base_pfn >= MAX_EXTRA_RMRR_PAGES )
+        {
+            printk(XENLOG_ERR VTDPREFIX
+                   "RMRR range exceeds %s pages "PRI_RMRRL"\n",
+                   __stringify(MAX_EXTRA_RMRR_PAGES),
+                   extra_rmrr_units[i].base_pfn, extra_rmrr_units[i].end_pfn);
+continue;
+}
+
+        for ( j = 0; j < nr_rmrr; j++ )
+        {
+            if ( i != j && extra_rmrr_units[i].base_pfn <= extra_rmrr_units[j].end_pfn &&
+                 extra_rmrr_units[j].base_pfn <= extra_rmrr_units[i].end_pfn )
+            {
+                printk(XENLOG_ERR VTDPREFIX
+                       "Overlapping RMRRs "PRI_RMRRL" and "PRI_RMRRL"\n",
+                       extra_rmrr_units[i].base_pfn,
+                       extra_rmrr_units[i].end_pfn,
+                       extra_rmrr_units[j].base_pfn,
+                       extra_rmrr_units[j].end_pfn);
+                break;
+            }
+        }
+        /* Broke out of the overlap loop check, continue with next rmrr. */
+        if ( j < nr_rmrr )
+   

[Xen-devel] [PATCH v7 2/4] iommu VT-d: separate rmrr addition function

2015-06-02 Thread elena . ufimtseva
From: Elena Ufimtseva elena.ufimts...@oracle.com

In preparation for auxiliary RMRR data provided on Xen
command line, make RMRR adding a separate function.
Also free memory for rmrr device scope in error path.

Signed-off-by: Elena Ufimtseva elena.ufimts...@oracle.com
---
 xen/drivers/passthrough/vtd/dmar.c | 130 -
 1 file changed, 69 insertions(+), 61 deletions(-)

diff --git a/xen/drivers/passthrough/vtd/dmar.c 
b/xen/drivers/passthrough/vtd/dmar.c
index a675bf7..5d78a37 100644
--- a/xen/drivers/passthrough/vtd/dmar.c
+++ b/xen/drivers/passthrough/vtd/dmar.c
@@ -581,6 +581,72 @@ out:
 return ret;
 }
 
+static int register_one_rmrr(struct acpi_rmrr_unit *rmrru)
+{
+bool_t ignore = 0;
+unsigned int i = 0;
+int ret = 0;
+
+/* Skip checking if segment is not accessible yet. */
+    if ( !pci_known_segment(rmrru->segment) )
+    {
+        dprintk(XENLOG_WARNING VTDPREFIX, "UNKNOWN Prefix! %04x",
+                rmrru->segment);
+        i = UINT_MAX;
+    }
+
+    for ( ; i < rmrru->scope.devices_cnt; i++ )
+    {
+        u8 b = PCI_BUS(rmrru->scope.devices[i]);
+        u8 d = PCI_SLOT(rmrru->scope.devices[i]);
+        u8 f = PCI_FUNC(rmrru->scope.devices[i]);
+
+if ( pci_device_detect(rmrru-segment, b, d, f) == 0 )
+{
+dprintk(XENLOG_WARNING VTDPREFIX,
+ Non-existent device (%04x:%02x:%02x.%u) is reported
+ in RMRR (%PRIx64, %PRIx64)'s scope!\n,
+rmrru-segment, b, d, f,
+rmrru-base_address, rmrru-end_address);
+ignore = 1;
+}
+else
+{
+ignore = 0;
+break;
+}
+}
+
+if ( ignore )
+{
+dprintk(XENLOG_WARNING VTDPREFIX,
+  Ignore the RMRR (%PRIx64, %PRIx64) due to 
+devices under its scope are not PCI discoverable!\n,
+rmrru-base_address, rmrru-end_address);
+scope_devices_free(rmrru-scope);
+xfree(rmrru);
+}
+else if ( rmrru-base_address  rmrru-end_address )
+{
+dprintk(XENLOG_WARNING VTDPREFIX,
+  The RMRR (%PRIx64, %PRIx64) is incorrect!\n,
+rmrru-base_address, rmrru-end_address);
+scope_devices_free(rmrru-scope);
+xfree(rmrru);
+ret = -EFAULT;
+}
+else
+{
+if ( iommu_verbose )
+dprintk(VTDPREFIX,
+  RMRR region: base_addr %PRIx64
+ end_address %PRIx64\n,
+rmrru-base_address, rmrru-end_address);
+acpi_register_rmrr_unit(rmrru);
+}
+
+return ret;
+}
+
 static int __init
 acpi_parse_one_rmrr(struct acpi_dmar_header *header)
 {
@@ -631,68 +697,10 @@ acpi_parse_one_rmrr(struct acpi_dmar_header *header)
 ret = acpi_parse_dev_scope(dev_scope_start, dev_scope_end,
rmrru-scope, RMRR_TYPE, rmrr-segment);
 
-if ( ret || (rmrru-scope.devices_cnt == 0) )
-xfree(rmrru);
+if ( !ret  (rmrru-scope.devices_cnt != 0) )
+register_one_rmrr(rmrru);
 else
-{
-u8 b, d, f;
-bool_t ignore = 0;
-unsigned int i = 0;
-
-/* Skip checking if segment is not accessible yet. */
-if ( !pci_known_segment(rmrr-segment) )
-i = UINT_MAX;
-
-for ( ; i  rmrru-scope.devices_cnt; i++ )
-{
-b = PCI_BUS(rmrru-scope.devices[i]);
-d = PCI_SLOT(rmrru-scope.devices[i]);
-f = PCI_FUNC(rmrru-scope.devices[i]);
-
-if ( !pci_device_detect(rmrr-segment, b, d, f) )
-{
-dprintk(XENLOG_WARNING VTDPREFIX,
- Non-existent device (%04x:%02x:%02x.%u) is reported
- in RMRR (%PRIx64, %PRIx64)'s scope!\n,
-rmrr-segment, b, d, f,
-rmrru-base_address, rmrru-end_address);
-ignore = 1;
-}
-else
-{
-ignore = 0;
-break;
-}
-}
-
-if ( ignore )
-{
-dprintk(XENLOG_WARNING VTDPREFIX,
-  Ignore the RMRR (%PRIx64, %PRIx64) due to 
-devices under its scope are not PCI discoverable!\n,
-rmrru-base_address, rmrru-end_address);
-scope_devices_free(rmrru-scope);
-xfree(rmrru);
-}
-else if ( base_addr  end_addr )
-{
-dprintk(XENLOG_WARNING VTDPREFIX,
-  The RMRR (%PRIx64, %PRIx64) is incorrect!\n,
-rmrru-base_address, rmrru-end_address);
-scope_devices_free(rmrru-scope);
-xfree(rmrru);
-ret = -EFAULT;
-}
-else
-{
-if ( iommu_verbose )
-dprintk(VTDPREFIX,
-  RMRR region: base_addr %PRIx64
- end_address %PRIx64\n,
-   

Re: [Xen-devel] RFC: QEMU bumping memory limit and domain restore

2015-06-02 Thread Yang Hongyang



On 06/02/2015 11:49 PM, Ian Campbell wrote:

On Tue, 2015-06-02 at 15:08 +0100, Wei Liu wrote:
[...]

So here is a proof of concept patch to record and honour that value
during migration.  A new field is added in IDL. Note that we don't
provide xl level config option for it and mandate it to be default value
during domain creation. This is to prevent libxl user from using it to
avoid unforeseen repercussions.

[...]

This field is mandated to be default value during guest creation to
avoid unforeseen repercussions. It's only honour when restoring a guest.


IMHO this means that the libxl API/IDL is the wrong place for this
value. Only user and/or application serviceable parts belong in the API.

So while I agree that this value need to be communicated across a
migration, the JSON blob is not the right mechanism for doing so. IOW if
you go down this general path I think you need a new
field/record/whatever in the migration protocol at some layer or other
(if not libxc then at the libxl layer).

To my mind this actual state vs user configured state is more akin
to the sorts of things which is in the hypervisor save blob or something
like that (nb: This is not a suggestion that it should go there).

IIRC Don also outlined another case, which is
 xl create -p
 xl migrate
 xl unpause


Actually this is what COLO do. On primary, we must start using -p then
migrate to ensure the disk is consistent.



Which might need more thought if any bumping can happen after the
migrate i.e. on unpause?


.



--
Thanks,
Yang.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCHv3] frontswap: allow multiple backends

2015-06-02 Thread Andrew Morton
On Tue,  2 Jun 2015 18:08:46 -0400 Dan Streetman ddstr...@ieee.org wrote:

 Change frontswap single pointer to a singly linked list of frontswap
 implementations.  Update Xen tmem implementation as register no longer
 returns anything.
 
 Frontswap only keeps track of a single implementation; any implementation
 that registers second (or later) will replace the previously registered
 implementation, and gets a pointer to the previous implementation that
 the new implementation is expected to pass all frontswap functions to
 if it can't handle the function itself.  However that method doesn't
 really make much sense, as passing that work on to every implementation
 adds unnecessary work to implementations; instead, frontswap should
 simply keep a list of all registered implementations and try each
 implementation for any function.  Most importantly, neither of the
 two currently existing frontswap implementations in the kernel actually
 do anything with any previous frontswap implementation that they
 replace when registering.
 
 This allows frontswap to successfully manage multiple implementations
 by keeping a list of them all.
 

offtopic trivia: this

--- a/mm/frontswap.c~frontswap-allow-multiple-backends-fix
+++ a/mm/frontswap.c
@@ -111,14 +111,11 @@ static inline void inc_frontswap_invalid
  */
 void frontswap_register_ops(struct frontswap_ops *ops)
 {
-   DECLARE_BITMAP(a, MAX_SWAPFILES);
-   DECLARE_BITMAP(b, MAX_SWAPFILES);
+   DECLARE_BITMAP(a, MAX_SWAPFILES) = { };
+   DECLARE_BITMAP(b, MAX_SWAPFILES) = { };
struct swap_info_struct *si;
unsigned int i;
 
-   bitmap_zero(a, MAX_SWAPFILES);
-   bitmap_zero(b, MAX_SWAPFILES);
-
spin_lock(swap_lock);
plist_for_each_entry(si, swap_active_head, list) {
if (!WARN_ON(!si-frontswap_map))

saves 64 bytes of text with my gcc.


It shouldn't be open-coded here, but a new macro in bitmap.h could be
useful, assuming it's a win for other sizes of bitmaps.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCHv2] frontswap: allow multiple backends

2015-06-02 Thread Dan Streetman
On Tue, Jun 2, 2015 at 5:06 PM, Andrew Morton a...@linux-foundation.org wrote:
 On Mon,  1 Jun 2015 10:22:24 -0400 Dan Streetman ddstr...@ieee.org wrote:

 Change frontswap single pointer to a singly linked list of frontswap
 implementations.  Update Xen tmem implementation as register no longer
 returns anything.

 Frontswap only keeps track of a single implementation; any implementation
 that registers second (or later) will replace the previously registered
 implementation, and gets a pointer to the previous implementation that
 the new implementation is expected to pass all frontswap functions to
 if it can't handle the function itself.  However that method doesn't
 really make much sense, as passing that work on to every implementation
 adds unnecessary work to implementations; instead, frontswap should
 simply keep a list of all registered implementations and try each
 implementation for any function.  Most importantly, neither of the
 two currently existing frontswap implementations in the kernel actually
 do anything with any previous frontswap implementation that they
 replace when registering.

 This allows frontswap to successfully manage multiple implementations
 by keeping a list of them all.

 ...

 -struct frontswap_ops *frontswap_register_ops(struct frontswap_ops *ops)
 +void frontswap_register_ops(struct frontswap_ops *ops)
  {
 - struct frontswap_ops *old = frontswap_ops;
 - int i;
 -
 - for (i = 0; i  MAX_SWAPFILES; i++) {
 - if (test_and_clear_bit(i, need_init)) {
 - struct swap_info_struct *sis = swap_info[i];
 - /* __frontswap_init _should_ have set it! */
 - if (!sis-frontswap_map)
 - return ERR_PTR(-EINVAL);
 - ops-init(i);
 - }
 + DECLARE_BITMAP(a, MAX_SWAPFILES);
 + DECLARE_BITMAP(b, MAX_SWAPFILES);
 + struct swap_info_struct *si;
 + unsigned int i;
 +
 + spin_lock(swap_lock);
 + plist_for_each_entry(si, swap_active_head, list) {
 + if (!WARN_ON(!si-frontswap_map))
 + set_bit(si-type, a);

 umm, DECLARE_BITMAP() doesn't initialise the storage.  Either this
 patch wasn't tested very well or you should buy me a lottery ticket!

Doh!  I'll fix and resend.

I did test it, too, but zswap doesn't care if the swap device actually
exists, it just alloc's a tree for whatever it's told.  So likely it
was allocing some extra trees there :)


   }
 - /*
 -  * We MUST have frontswap_ops set _after_ the frontswap_init's
 -  * have been called. Otherwise __frontswap_store might fail. Hence
 -  * the barrier to make sure compiler does not re-order us.
 + spin_unlock(swap_lock);
 +
 + /* the new ops needs to know the currently active swap devices */
 + for_each_set_bit(i, a, MAX_SWAPFILES)
 + ops-init(i);
 +
 + /* setting frontswap_ops must happen after the ops-init() calls
 +  * above; cmpxchg implies smp_mb() which will ensure the init is
 +  * complete at this point
 +  */

 Like this, please:

 /*
  * Setting ...

 and sentences start with capital letters ;)

okay, okay :-)




___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCHv3] frontswap: allow multiple backends

2015-06-02 Thread Dan Streetman
Change frontswap single pointer to a singly linked list of frontswap
implementations.  Update Xen tmem implementation as register no longer
returns anything.

Frontswap only keeps track of a single implementation; any implementation
that registers second (or later) will replace the previously registered
implementation, and gets a pointer to the previous implementation that
the new implementation is expected to pass all frontswap functions to
if it can't handle the function itself.  However that method doesn't
really make much sense, as passing that work on to every implementation
adds unnecessary work to implementations; instead, frontswap should
simply keep a list of all registered implementations and try each
implementation for any function.  Most importantly, neither of the
two currently existing frontswap implementations in the kernel actually
do anything with any previous frontswap implementation that they
replace when registering.

This allows frontswap to successfully manage multiple implementations
by keeping a list of them all.

Signed-off-by: Dan Streetman ddstr...@ieee.org
---
Changes since v2: 
  -initialize bitmaps in frontswap_register_ops
  -fix comment capitalization

 drivers/xen/tmem.c|   8 +-
 include/linux/frontswap.h |  14 +--
 mm/frontswap.c| 215 --
 3 files changed, 139 insertions(+), 98 deletions(-)

diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c
index c4211a3..d88f367 100644
--- a/drivers/xen/tmem.c
+++ b/drivers/xen/tmem.c
@@ -381,15 +381,9 @@ static int __init xen_tmem_init(void)
 #ifdef CONFIG_FRONTSWAP
if (tmem_enabled  frontswap) {
char *s = ;
-   struct frontswap_ops *old_ops;
 
tmem_frontswap_poolid = -1;
-   old_ops = frontswap_register_ops(tmem_frontswap_ops);
-   if (IS_ERR(old_ops) || old_ops) {
-   if (IS_ERR(old_ops))
-   return PTR_ERR(old_ops);
-   s =  (WARNING: frontswap_ops overridden);
-   }
+   frontswap_register_ops(tmem_frontswap_ops);
pr_info(frontswap enabled, RAM provided by Xen Transcendent 
Memory%s\n,
s);
}
diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h
index 8293262..e65ef95 100644
--- a/include/linux/frontswap.h
+++ b/include/linux/frontswap.h
@@ -6,16 +6,16 @@
 #include linux/bitops.h
 
 struct frontswap_ops {
-   void (*init)(unsigned);
-   int (*store)(unsigned, pgoff_t, struct page *);
-   int (*load)(unsigned, pgoff_t, struct page *);
-   void (*invalidate_page)(unsigned, pgoff_t);
-   void (*invalidate_area)(unsigned);
+   void (*init)(unsigned); /* this swap type was just swapon'ed */
+   int (*store)(unsigned, pgoff_t, struct page *); /* store a page */
+   int (*load)(unsigned, pgoff_t, struct page *); /* load a page */
+   void (*invalidate_page)(unsigned, pgoff_t); /* page no longer needed */
+   void (*invalidate_area)(unsigned); /* swap type just swapoff'ed */
+   struct frontswap_ops *next; /* private pointer to next ops */
 };
 
 extern bool frontswap_enabled;
-extern struct frontswap_ops *
-   frontswap_register_ops(struct frontswap_ops *ops);
+extern void frontswap_register_ops(struct frontswap_ops *ops);
 extern void frontswap_shrink(unsigned long);
 extern unsigned long frontswap_curr_pages(void);
 extern void frontswap_writethrough(bool);
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 8d82809..27a9924 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -21,11 +21,16 @@
 #include linux/swapfile.h
 
 /*
- * frontswap_ops is set by frontswap_register_ops to contain the pointers
- * to the frontswap backend implementation functions.
+ * frontswap_ops are added by frontswap_register_ops, and provide the
+ * frontswap backend implementation functions.  Multiple implementations
+ * may be registered, but implementations can never deregister.  This
+ * is a simple singly-linked list of all registered implementations.
  */
 static struct frontswap_ops *frontswap_ops __read_mostly;
 
+#define for_each_frontswap_ops(ops)\
+   for ((ops) = frontswap_ops; (ops); (ops) = (ops)-next)
+
 /*
  * If enabled, frontswap_store will return failure even on success.  As
  * a result, the swap subsystem will always write the page to swap, in
@@ -79,15 +84,6 @@ static inline void inc_frontswap_invalidates(void) { }
  * on all frontswap functions to not call the backend until the backend
  * has registered.
  *
- * Specifically when no backend is registered (nobody called
- * frontswap_register_ops) all calls to frontswap_init (which is done via
- * swapon - enable_swap_info - frontswap_init) are registered and remembered
- * (via the setting of need_init bitmap) but fail to create tmem_pools. When a
- * backend registers with frontswap at some later point the previous
- * calls to 

Re: [Xen-devel] [PATCH v1 4/5] tools/libxl: move toolstack code into libxl_toolstack.c

2015-06-02 Thread Yang Hongyang



On 06/03/2015 12:22 AM, Yang Hongyang wrote:



On 06/02/2015 10:48 PM, Ian Campbell wrote:

On Wed, 2015-05-20 at 18:01 +0800, Yang Hongyang wrote:

move toolstack code into libxl_toolstack.c


It's not clear to me what toolstack code is here, the whole of libxl
and xl is toolstack code.

Is the code being moved stuff to do with adding toolstack state to the
save stream? Perhaps libxl_{suspend,save}_toolstack.c? Or could this not
go in the libxl_dom_suspend.c you just created?


My thought was libxl_dom_suspend.c only contains code that do vm suspend
stuff(not save). So compare to move it to dom_suspend.c, I prefer not
move it or use libxl_{suspend,save}_toolstack.c. Maybe leave the code
not moved?


I think I'm going to move this code into libxl_dom_save.c, by merge this
patch with the next patch.






.





--
Thanks,
Yang.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [xen-4.2-testing test] 57781: regressions - FAIL

2015-06-02 Thread osstest service user
flight 57781 xen-4.2-testing real [real]
http://logs.test-lab.xenproject.org/osstest/logs/57781/

Regressions :-(

Tests which did not succeed and are blocking,
including tests which could not be run:
 test-amd64-i386-xend-winxpsp3 16 guest-stop   fail REGR. vs. 53018
 test-amd64-i386-xend-qemut-winxpsp3 16 guest-stop fail REGR. vs. 53018

Tests which are failing intermittently (not blocking):
 test-amd64-i386-xl-qemuu-win7-amd64 15 guest-localmigrate/x10 fail in 57697 
pass in 57781
 test-amd64-i386-rhel6hvm-amd 12 guest-start/redhat.repeat   fail pass in 57697

Regressions which are regarded as allowable (not blocking):
 test-i386-i386-xl-qemuu-winxpsp3 15 guest-localmigrate/x10 fail in 57697 like 
53018
 test-amd64-i386-xl-qemut-win7-amd64 16 guest-stop  fail like 53018
 test-amd64-amd64-xl-win7-amd64 16 guest-stop   fail like 53018
 test-amd64-amd64-xl-qemut-win7-amd64 16 guest-stop fail like 53018
 test-amd64-i386-xl-win7-amd64 16 guest-stop   fail  like 53018

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-rumpuserxen-amd64  1 build-check(1)   blocked n/a
 test-i386-i386-rumpuserxen-i386  1 build-check(1)   blocked  n/a
 test-amd64-i386-rumpuserxen-i386  1 build-check(1)   blocked  n/a
 test-amd64-amd64-xl-qemuu-ovmf-amd64  9 debian-hvm-install fail never pass
 test-amd64-i386-xl-qemuu-ovmf-amd64  9 debian-hvm-install  fail never pass
 build-amd64-rumpuserxen   5 rumpuserxen-buildfail   never pass
 build-i386-rumpuserxen5 rumpuserxen-buildfail   never pass
 test-amd64-amd64-libvirt 12 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt  12 migrate-support-checkfail   never pass
 test-i386-i386-libvirt   12 migrate-support-checkfail   never pass
 test-amd64-amd64-xl-qemuu-win7-amd64 16 guest-stop fail never pass
 test-amd64-i386-xl-qemuu-win7-amd64 16 guest-stop  fail never pass

version targeted for testing:
 xen  63aeca00c805fa1c47d9f7b1978e83e41ab482d4
baseline version:
 xen  7e527e2ab6c95ef84035d02e9e50b956a0d469c9


People who touched revisions under test:
  Ian Jackson ian.jack...@eu.citrix.com
  Petr Matousek pmato...@redhat.com


jobs:
 build-amd64  pass
 build-i386   pass
 build-amd64-libvirt  pass
 build-i386-libvirt   pass
 build-amd64-pvopspass
 build-i386-pvops pass
 build-amd64-rumpuserxen  fail
 build-i386-rumpuserxen   fail
 test-amd64-amd64-xl  pass
 test-amd64-i386-xl   pass
 test-i386-i386-xlpass
 test-amd64-i386-rhel6hvm-amd fail
 test-amd64-i386-qemut-rhel6hvm-amd   pass
 test-amd64-i386-qemuu-rhel6hvm-amd   pass
 test-amd64-amd64-xl-qemut-debianhvm-amd64pass
 test-amd64-i386-xl-qemut-debianhvm-amd64 pass
 test-amd64-amd64-xl-qemuu-debianhvm-amd64pass
 test-amd64-i386-xl-qemuu-debianhvm-amd64 pass
 test-amd64-i386-qemuu-freebsd10-amd64pass
 test-amd64-amd64-xl-qemuu-ovmf-amd64 fail
 test-amd64-i386-xl-qemuu-ovmf-amd64  fail
 test-amd64-amd64-rumpuserxen-amd64   blocked 
 test-amd64-amd64-xl-qemut-win7-amd64 fail
 test-amd64-i386-xl-qemut-win7-amd64  fail
 test-amd64-amd64-xl-qemuu-win7-amd64 fail
 test-amd64-i386-xl-qemuu-win7-amd64  fail
 test-amd64-amd64-xl-win7-amd64   fail
 test-amd64-i386-xl-win7-amd64fail
 test-amd64-amd64-xl-credit2  pass
 test-i386-i386-xl-credit2pass
 test-amd64-i386-qemuu-freebsd10-i386 pass
 test-amd64-i386-rumpuserxen-i386 blocked 
 test-i386-i386-rumpuserxen-i386  blocked 
 test-amd64-i386-rhel6hvm-intel   pass
 test-amd64-i386-qemut-rhel6hvm-intel pass
 

Re: [Xen-devel] [RFC][v2][PATCH 08/14] tools: extend xc_assign_device() to support rdm reservation policy

2015-06-02 Thread Chen, Tiejun

On 2015/6/3 0:36, Wei Liu wrote:

On Fri, May 22, 2015 at 05:35:08PM +0800, Tiejun Chen wrote:

This patch passes rdm reservation policy to xc_assign_device() so the policy
is checked when assigning devices to a VM.

Signed-off-by: Tiejun Chen tiejun.c...@intel.com
---
  tools/libxc/include/xenctrl.h   |  3 ++-
  tools/libxc/xc_domain.c |  4 +++-
  tools/libxl/libxl_pci.c | 11 ++-
  tools/libxl/xl_cmdimpl.c| 23 +++
  tools/libxl/xl_cmdtable.c   |  2 +-


Where is document for the new options you added to xl pci commands?


Looks I'm missing to describe something specific to pci-attach?

diff --git a/docs/man/xl.pod.1 b/docs/man/xl.pod.1
index 4eb929d..2ebfd54 100644
--- a/docs/man/xl.pod.1
+++ b/docs/man/xl.pod.1
@@ -1368,10 +1368,15 @@ it will also attempt to re-bind the device to 
its original driver, making it

 usable by Domain 0 again.  If the device is not bound to pciback, it will
 return success.

-=item Bpci-attach Idomain-id IBDF
+=item Bpci-attach Idomain-id IBDF Irdm policy

 Hot-plug a new pass-through pci device to the specified domain.
 BBDF is the PCI Bus/Device/Function of the physical device to 
pass-through.
+Brdm policy is about how to handle conflict between reserving 
reserved device
+memory and guest address space. strict means an unsolved conflict 
leads to

+immediate VM crash, while relaxed allows VM moving forward with a warning
+message thrown out. Here strict is default.
+

 =item Bpci-detach [I-f] Idomain-id IBDF




BTW you might want to consider rearrange patches in this series so that


Yes, this is really what I intend to do.


you keep the tree bisectable.


Overall, I can separate this series as several parts,

#1. Introduce our policy configuration on tools side
#2. Interact with Hypervisor to get rdm info
#3. Implement our policy with rdm info on tool side
#4. Make hvmloader to align our policy

If you already see something obviously wrong, let me know.




  tools/ocaml/libs/xc/xenctrl_stubs.c | 18 ++
  tools/python/xen/lowlevel/xc/xc.c   | 29 +++--
  xen/drivers/passthrough/pci.c   |  3 ++-
  8 files changed, 70 insertions(+), 23 deletions(-)

diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index 5f84a62..2a447b9 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -2078,7 +2078,8 @@ int xc_hvm_destroy_ioreq_server(xc_interface *xch,
  /* HVM guest pass-through */
  int xc_assign_device(xc_interface *xch,
   uint32_t domid,
- uint32_t machine_sbdf);
+ uint32_t machine_sbdf,
+ uint32_t flag);

  int xc_get_device_group(xc_interface *xch,
   uint32_t domid,
diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c
index c17a5a8..9761e5a 100644
--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -1704,7 +1704,8 @@ int xc_domain_setdebugging(xc_interface *xch,
  int xc_assign_device(
  xc_interface *xch,
  uint32_t domid,
-uint32_t machine_sbdf)
+uint32_t machine_sbdf,
+uint32_t flag)
  {
  DECLARE_DOMCTL;

@@ -1712,6 +1713,7 @@ int xc_assign_device(
  domctl.domain = domid;
  domctl.u.assign_device.dev = XEN_DOMCTL_DEV_PCI;
  domctl.u.assign_device.u.pci.machine_sbdf = machine_sbdf;
+domctl.u.assign_device.flag = flag;

  return do_domctl(xch, domctl);
  }
diff --git a/tools/libxl/libxl_pci.c b/tools/libxl/libxl_pci.c
index 07e84f2..ac70edc 100644
--- a/tools/libxl/libxl_pci.c
+++ b/tools/libxl/libxl_pci.c
@@ -894,6 +894,7 @@ static int do_pci_add(libxl__gc *gc, uint32_t domid, 
libxl_device_pci *pcidev, i
  FILE *f;
  unsigned long long start, end, flags, size;
  int irq, i, rc, hvm = 0;
+uint32_t flag;

  if (type == LIBXL_DOMAIN_TYPE_INVALID)
  return ERROR_FAIL;
@@ -987,7 +988,15 @@ static int do_pci_add(libxl__gc *gc, uint32_t domid, 
libxl_device_pci *pcidev, i

  out:
  if (!libxl_is_stubdom(ctx, domid, NULL)) {
-rc = xc_assign_device(ctx-xch, domid, pcidev_encode_bdf(pcidev));
+if (pcidev-rdm_reserve == LIBXL_RDM_RESERVE_FLAG_RELAXED) {
+flag = XEN_DOMCTL_DEV_RDM_RELAXED;
+} else if (pcidev-rdm_reserve == LIBXL_RDM_RESERVE_FLAG_STRICT) {
+flag = XEN_DOMCTL_DEV_RDM_STRICT;
+} else {
+LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, unkwon rdm check flag.);


unknown

Couldn't continue reviewing because I don't know the expected behaviour.
But the changes look mostly mechanical.



I want to make this assignment fail, so I return ERROR_FAIL


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [RFC][v2][PATCH 01/14] tools: introduce some new parameters to set rdm policy

2015-06-02 Thread Chen, Tiejun

+=item Brdm= RDM_RESERVE_STRING 


Stray space after before and after RDM_RESERVE_STRING.


Sure,

=item Brdm=RDM_RESERVE_STRING




+
+(HVM/x86 only) Specifies the information about Reserved Device Memory (RDM),
+which is necessary to enable robust device passthrough usage. One example of


Delete usage.


Okay.




+RDM is reported through ACPI Reserved Memory Region Reporting (RMRR)
+structure on x86 platform.
+
+BRDM_RESERVE_STRING has the form C[KEY=VALUE,KEY=VALUE,... where:
+
+=over 4
+
+=item BKEY=VALUE
+
+Possible BKEYs are:
+
+=over 4
+
+=item Btype=STRING
+
+Currently we just have two types:
+
+host means all reserved device memory on this platform should be reserved
+in this VM's pfn space. This global RDM parameter allows user to specify


PFN is Xen internal terminology. Do you mean guest address space? Note
that the reader is system administrators who might not know / want to
know Xen internals.


Sure.




+reserved regions explicitly. And using host to include all reserved regions
+reported on this platform which is good to handle hotplug scenario. In the
+future this parameter may be further extended to allow specifying random
+regions, e.g. even those belonging to another platform as a preparation


Extending how? What's your envisaged syntax for those random regions?


We didn't go into details while discussing that design. Maybe we can do 
something like this,


rdm=type=host,reserve=strict,rdm_add=size[KMG][@offset[KMG]],size[KMG][@offset[KMG]],...


Should you want to reserve more, an array is more useful. Could you


Yeah.


provide some examples?


But we may have an alternative approach to this, as I noticed some people 
are trying to deliver some patches about setting the RMRR region via the 
Xen command line. So I would also like to check this possibility when we 
can step forward.





+for live migration with passthrough devices.
+
+none means we have nothing to do all reserved regions and ignore all 
policies,
+so guest work as before.
+
+=over 4
+
+=item Breserve=STRING
+
+Conflict may be detected when reserving reserved device memory in gfn space.


GFN is a Xen internal terminology. Maybe you should use guest address
space?

Nonetheless the terminology throughout this document should be
consistent.


Sure, so I will do this,

s/pfn/guest address space/g

s/gfn/guest address space/g




+strict means an unsolved conflict leads to immediate VM crash, while
+relaxed allows VM moving forward with a warning message thrown out. relaxed
+is default.
+
+Note this may be overrided by another sub item, rdm_reserve, in pci device.
+


overridden by rdm_reserve option in PCI device configuration.


Okay.




  =item Bpci=[ PCI_SPEC_STRING, PCI_SPEC_STRING, ... ]

  Specifies the host PCI devices to passthrough to this guest. Each 
BPCI_SPEC_STRING
@@ -707,6 +750,20 @@ dom0 without confirmation.  Please use with care.
  D0-D3hot power management states for the PCI device. False (0) by
  default.

+=item Brdm_reserv=STRING
+
+(HVM/x86 only) Specifies the information about Reserved Device Memory (RDM),
+which is necessary to enable robust device passthrough usage. One example of


Delete usage.


+RDM is reported through ACPI Reserved Memory Region Reporting (RMRR)
+structure on x86 platform.
+
+Conflict may be detected when reserving reserved device memory in gfn space.
+strict means an unsolved conflict leads to immediate VM crash, while
+relaxed allows VM moving forward with a warning message thrown out. strict
+is default.
+


Actually these two paragraphs are the same as before. You can just point
readers to previous sections instead of copying them here.


So instead,

(HVM/x86 only) This is the same as the reserve option above, but specific
to a given device; strict is the default here.




+Note this would override global Brdm option.
+
  =back

  =back
diff --git a/docs/misc/vtd.txt b/docs/misc/vtd.txt
index 9af0e99..7d63c47 100644
--- a/docs/misc/vtd.txt
+++ b/docs/misc/vtd.txt
@@ -111,6 +111,30 @@ in the config file:
  To override for a specific device:
pci = [ '01:00.0,msitranslate=0', '03:00.0' ]

+RDM, 'reserved device memory', for PCI Device Passthrough
+-
+
+There are some devices the BIOS controls, for e.g. USB devices to perform
+PS2 emulation. The regions of memory used for these devices are marked
+reserved in the e820 map. When we turn on DMA translation, DMA to those
+regions will fail. Hence BIOS uses RMRR to specify these regions along with
+devices that need to access these regions. OS is expected to setup
+identity mappings for these regions for these devices to access these regions.
+
+While creating a VM we should reserve them in advance, and avoid any conflicts.
+So we introduce user configurable parameters to specify RDM resource and
+according policies,
+
+To enable this globally, add rdm in the config file:
+
+rdm = type=host, reserve=relaxed   (default policy is relaxed)
+
+Or just for a specific device:

Re: [Xen-devel] [Patch v3 27/36] x86, irq: Use access helper irq_data_get_affinity_mask()

2015-06-02 Thread Jiang Liu
On 2015/6/3 3:19, Thomas Gleixner wrote:
 On Mon, 1 Jun 2015, Jiang Liu wrote:
 
 diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
 index 9b62f690b0ff..dfa3a5f5b3d3 100644
 --- a/arch/x86/kernel/apic/vector.c
 +++ b/arch/x86/kernel/apic/vector.c
 @@ -494,9 +494,8 @@ static int apic_set_affinity(struct irq_data *irq_data,
  
  err = assign_irq_vector(irq, data, dest);
  if (err) {
 -struct irq_data *top = irq_get_irq_data(irq);
 -
 -if (assign_irq_vector(irq, data, top-affinity))
 +if (assign_irq_vector(irq, data,
 +  irq_data_get_affinity_mask(irq_data)))
 
 Does this patch work w/o moving the affinity mask to common data? I
 doubt so, as you remove the retrieval of 'top'.
Hi Thomas,
This piece of code should be moved into [31/36], otherwise
it will break bisecting. I will redo this patch and [31/36] to
support bisecting.
Thanks!
Gerry

 
 Thanks,
 
   tglx
 

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [RFC][v2][PATCH 04/14] tools/libxl: detect and avoid conflicts with RDM

2015-06-02 Thread Chen, Tiejun

On 2015/6/3 0:29, Wei Liu wrote:

On Fri, May 22, 2015 at 05:35:04PM +0800, Tiejun Chen wrote:

While building a VM, HVM domain builder provides struct hvm_info_table{}
to help hvmloader. Currently it includes two fields to construct guest
e820 table by hvmloader, low_mem_pgend and high_mem_pgend. So we should
check them to fix any conflict with RAM.

RMRR can reside in address space beyond 4G theoretically, but we never
see this in real world. So in order to avoid breaking highmem layout
we don't solve highmem conflict. Note this means highmem rmrr could still
be supported if no conflict.

But in the case of lowmem, RMRR probably scatter the whole RAM space.
Especially multiple RMRR entries would worsen this to lead a complicated
memory layout. And then its hard to extend hvm_info_table{} to work
hvmloader out. So here we're trying to figure out a simple solution to
avoid breaking existing layout. So when a conflict occurs,

 #1. Above a predefined boundary (default 2G)
 - move lowmem_end below reserved region to solve conflict;

 #2. Below a predefined boundary (default 2G)
 - Check strict/relaxed policy.
 strict policy leads to fail libxl. Note when both policies
 are specified on a given region, 'strict' is always preferred.
 relaxed policy issue a warning message and also mask this entry 
INVALID
 to indicate we shouldn't expose this entry to hvmloader.

Note this predefined boundary can be changes with the parameter
rdm_mem_boundary in .cfg file.

Signed-off-by: Tiejun Chen tiejun.c...@intel.com
---


It would be better you write down what you changed in this version after
--- marker.

What we normally do is


libxl: implement FOO

FOO is needed because ...

Signed-off-by: Wei Liu wei.l...@citrix.com
---
changes in vN:
  * bar - baz
  * more comments
---

The stuff between two --- will be automatically discarded when
committing.


I knew about this rule.

Actually I already mentioned this change in patch #00,

v2:

* Instead of that fixed predefined rdm memory boundary, we'd like to
  introduce a parameter, rdm_mem_boundary, to set this threshold value.
...

So I didn't explain this again separately; sorry for the inconvenience.




  docs/man/xl.cfg.pod.5  |  21 
  tools/libxc/include/xenguest.h |   1 +
  tools/libxc/xc_hvm_build_x86.c |  25 ++--
  tools/libxl/libxl_create.c |   2 +-
  tools/libxl/libxl_dm.c | 253 +
  tools/libxl/libxl_dom.c|  27 -
  tools/libxl/libxl_internal.h   |  11 +-
  tools/libxl/libxl_types.idl|   8 ++
  tools/libxl/xl_cmdimpl.c   |   3 +
  9 files changed, 337 insertions(+), 14 deletions(-)

diff --git a/docs/man/xl.cfg.pod.5 b/docs/man/xl.cfg.pod.5
index 12c34c4..80e3930 100644
--- a/docs/man/xl.cfg.pod.5
+++ b/docs/man/xl.cfg.pod.5
@@ -764,6 +764,27 @@ is default.

  Note this would override global Brdm option.

+=item Brdm_mem_boundary=MBYTES
+
+Number of megabytes to set a boundary for checking rdm conflict.
+
+When RDM conflicts with RAM, RDM probably scatter the whole RAM space.
+Especially multiple RMRR entries would worsen this to lead a complicated
+memory layout. So here we're trying to figure out a simple solution to
+avoid breaking existing layout. So when a conflict occurs,
+
+#1. Above a predefined boundary
+- move lowmem_end below reserved region to solve conflict;
+
+#2. Below a predefined boundary
+- Check strict/relaxed policy.
+strict policy leads to fail libxl. Note when both policies
+are specified on a given region, 'strict' is always preferred.
+relaxed policy issue a warning message and also mask this entry 
INVALID
+to indicate we shouldn't expose this entry to hvmloader.
+
+Her the default is 2G.


Typo her.


s/her/here



I get the idea. I will leave grammar / syntax check to native speakers.


Sure :)




+
  =back

  =back
diff --git a/tools/libxc/include/xenguest.h b/tools/libxc/include/xenguest.h
index 7581263..4cb7e9f 100644
--- a/tools/libxc/include/xenguest.h
+++ b/tools/libxc/include/xenguest.h
@@ -234,6 +234,7 @@ struct xc_hvm_firmware_module {
  };

  struct xc_hvm_build_args {
+uint64_t lowmem_size;/* All low memory size in bytes. */


You might find this value unnecessary with my patch to consolidate
memory layout generation in libxl?


I also noticed this from your patch. And as I replied to you online, I 
will rebase my patches once yours are acked. So at this point, yes, this 
should be gone when you introduce lowmem_end.





  uint64_t mem_size;   /* Memory size in bytes. */
  uint64_t mem_target; /* Memory target in bytes. */
  uint64_t mmio_size;  /* Size of the MMIO hole in bytes. */
diff --git a/tools/libxc/xc_hvm_build_x86.c b/tools/libxc/xc_hvm_build_x86.c
index e45ae4a..9a1567a 100644
--- a/tools/libxc/xc_hvm_build_x86.c
+++ b/tools/libxc/xc_hvm_build_x86.c
@@ -21,6 +21,7 @@
  

[Xen-devel] [Patch v3 27/36] x86, irq: Use access helper irq_data_get_affinity_mask()

2015-06-02 Thread Jiang Liu
Use access helper irq_data_get_affinity_mask() to hide implementation
details of struct irq_desc.

Signed-off-by: Jiang Liu jiang@linux.intel.com
---
Hi Thomas,
This version changes the patch to correctly support bisecting.
Thanks!
Gerry
---
 arch/x86/kernel/apic/io_apic.c   |2 +-
 arch/x86/kernel/apic/vector.c|3 ++-
 arch/x86/kernel/irq.c|5 +++--
 drivers/xen/events/events_base.c |4 ++--
 4 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 845dc0df2002..09921de4210f 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2541,7 +2541,7 @@ void __init setup_ioapic_dest(void)
 * Honour affinities which have been set in early boot
 */
if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata))
-   mask = idata-affinity;
+   mask = irq_data_get_affinity_mask(idata);
else
mask = apic-target_cpus();
 
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 9b62f690b0ff..7ad911ea4f56 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -496,7 +496,8 @@ static int apic_set_affinity(struct irq_data *irq_data,
if (err) {
struct irq_data *top = irq_get_irq_data(irq);
 
-   if (assign_irq_vector(irq, data, top-affinity))
+   if (assign_irq_vector(irq, data,
+ irq_data_get_affinity_mask(top)))
pr_err(Failed to recover vector for irq %d\n, irq);
return err;
}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 7e10c8b4b318..37685e37550c 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -342,7 +342,8 @@ int check_irq_vectors_for_cpu_disable(void)
continue;
 
data = irq_desc_get_irq_data(desc);
-   cpumask_copy(affinity_new, data-affinity);
+   cpumask_copy(affinity_new,
+irq_data_get_affinity_mask(data));
cpumask_clear_cpu(this_cpu, affinity_new);
 
/* Do not count inactive or per-cpu irqs. */
@@ -420,7 +421,7 @@ void fixup_irqs(void)
raw_spin_lock(desc-lock);
 
data = irq_desc_get_irq_data(desc);
-   affinity = data-affinity;
+   affinity = irq_data_get_affinity_mask(data);
if (!irq_has_action(irq) || irqd_is_per_cpu(data) ||
cpumask_subset(affinity, cpu_online_mask)) {
raw_spin_unlock(desc-lock);
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index 2b8553bd8715..d00e0be8e9ea 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -336,7 +336,7 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned 
int cpu)
 
BUG_ON(irq == -1);
 #ifdef CONFIG_SMP
-   cpumask_copy(irq_get_irq_data(irq)-affinity, cpumask_of(cpu));
+   cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(cpu));
 #endif
xen_evtchn_port_bind_to_cpu(info, cpu);
 
@@ -373,7 +373,7 @@ static void xen_irq_init(unsigned irq)
struct irq_info *info;
 #ifdef CONFIG_SMP
/* By default all event channels notify CPU#0. */
-   cpumask_copy(irq_get_irq_data(irq)-affinity, cpumask_of(0));
+   cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(0));
 #endif
 
info = kzalloc(sizeof(*info), GFP_KERNEL);
-- 
1.7.10.4


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v9 12/13] tools: add tools support for Intel CAT

2015-06-02 Thread Chao Peng
This is the xc/xl changes to support Intel Cache Allocation
Technology(CAT).

'xl psr-hwinfo' is updated to show CAT info and two new commands
for CAT are introduced:
- xl psr-cat-cbm-set [-s socket] domain cbm
  Set cache capacity bitmasks(CBM) for a domain.
- xl psr-cat-show domain
  Show CAT domain information.

Examples:
[root@vmm-psr vmm]# xl psr-hwinfo --cat
Cache Allocation Technology (CAT):
Socket ID   : 0
L3 Cache: 12288KB
Maximum COS : 15
CBM length  : 12
Default CBM : 0xfff

[root@vmm-psr vmm]# xl psr-cat-cbm-set 0 0xff

[root@vmm-psr vmm]# xl psr-cat-show
Socket ID   : 0
L3 Cache: 12288KB
Default CBM : 0xfff
   ID NAME CBM
0 Domain-00xff

Signed-off-by: Chao Peng chao.p.p...@linux.intel.com
Reviewed-by: Dario Faggioli dario.faggi...@citrix.com
Acked-by: Ian Campbell ian.campb...@citrix.com
---
Changes in v7:
* Add PSR head1 level section and change CMT/CAT as its subsections for xl man 
page.
* Other minor document changes.
Changes in v6:
* Merge xl psr-cmt/cat-hwinfo = xl psr-hwinfo.
* Add function header to explain the 'target' parameter.
* Use bitmap instead of TARGETS_ALL.
* Remove the need to store the return value form libxc.
* Minor document/commit msg adjustment.
Changes in v5:
* Add psr-cat-hwinfo.
* Add libxl_psr_cat_info_list_free.
* malloc = libxl__malloc
* Other comments from Ian/Wei.
Changes in v4:
* Add example output in commit message.
* Make libxl__count_physical_sockets private to libxl_psr.c.
* Set errno in several error cases.
* Change libxl_psr_cat_get_l3_info to return all sockets information.
* Remove unused libxl_domain_info call.
Changes in v3:
* Add manpage.
* libxl_psr_cat_set/get_domain_data = libxl_psr_cat_set/get_cbm.
* Move libxl_count_physical_sockets into seperate patch.
* Support LIBXL_PSR_TARGET_ALL for libxl_psr_cat_set_cbm.
* Clean up the print codes.
---
 docs/man/xl.pod.1 |  75 +++--
 tools/libxc/include/xenctrl.h |  15 +++
 tools/libxc/xc_psr.c  |  76 ++
 tools/libxl/libxl.h   |  35 +++
 tools/libxl/libxl_psr.c   | 143 +++--
 tools/libxl/libxl_types.idl   |  10 ++
 tools/libxl/xl.h  |   4 +
 tools/libxl/xl_cmdimpl.c  | 237 --
 tools/libxl/xl_cmdtable.c |  18 +++-
 9 files changed, 584 insertions(+), 29 deletions(-)

diff --git a/docs/man/xl.pod.1 b/docs/man/xl.pod.1
index cebec46..d77ce77 100644
--- a/docs/man/xl.pod.1
+++ b/docs/man/xl.pod.1
@@ -1484,28 +1484,52 @@ policy. Loading new security policy will reset runtime 
changes to device labels.
 
 =back
 
-=head1 CACHE MONITORING TECHNOLOGY
+=head1 PLATFORM SHARED RESOURCE MONITORING/CONTROL
+
+Intel Haswell and later server platforms offer shared resource monitoring
+and control technologies. The availability of these technologies and the
+hardware capabilities can be shown with Bpsr-hwinfo.
+
+=over 4
+
+=item Bpsr-hwinfo [IOPTIONS]
+
+Show Platform Shared Resource (PSR) hardware information.
+
+BOPTIONS
+
+=over 4
+
+=item B-m, B--cmt
+
+Show Cache Monitoring Technology (CMT) hardware information.
+
+=item B-a, B--cat
+
+Show Cache Allocation Technology (CAT) hardware information.
+
+=back
+
+=back
+
+=head2 CACHE MONITORING TECHNOLOGY
 
 Intel Haswell and later server platforms offer monitoring capability in each
 logical processor to measure specific platform shared resource metric, for
-example, L3 cache occupancy. In Xen implementation, the monitoring granularity
-is domain level. To monitor a specific domain, just attach the domain id with
-the monitoring service. When the domain doesn't need to be monitored any more,
-detach the domain id from the monitoring service.
+example, L3 cache occupancy. In the Xen implementation, the monitoring
+granularity is domain level. To monitor a specific domain, just attach the
+domain id with the monitoring service. When the domain doesn't need to be
+monitored any more, detach the domain id from the monitoring service.
 
 Intel Broadwell and later server platforms also offer total/local memory
 bandwidth monitoring. Xen supports per-domain monitoring for these two
 additional monitoring types. Both memory bandwidth monitoring and L3 cache
 occupancy monitoring share the same set of underlying monitoring service. Once
-a domain is attached to the monitoring service, monitoring data can be showed
+a domain is attached to the monitoring service, monitoring data can be shown
 for any of these monitoring types.
 
 =over 4
 
-=item Bpsr-hwinfo
-
-Show CMT hardware information.
-
 =item Bpsr-cmt-attach [Idomain-id]
 
 attach: Attach the platform shared resource monitoring service to a domain.
@@ -1536,6 +1560,37 @@ ignored:
 
 =back
 
+=head2 CACHE ALLOCATION TECHNOLOGY
+
+Intel Broadwell and later server platforms offer capabilities to configure and
+make use of the Cache Allocation Technology (CAT) mechanisms, which enable 

[Xen-devel] [PATCH v9 02/13] x86: detect and initialize Intel CAT feature

2015-06-02 Thread Chao Peng
Detect Intel Cache Allocation Technology(CAT) feature and store the
cpuid information for later use. Currently only L3 cache allocation is
supported. The L3 CAT features may vary among sockets so per-socket
feature information is stored. The initialization can happen either at
boot time or when CPU(s) is hot plugged after booting.

Signed-off-by: Chao Peng chao.p.p...@linux.intel.com
Reviewed-by: Andrew Cooper andrew.coop...@citrix.com
---
Changes in v9:
* Add __read_mostly for opt_cos_max.
* Add check for cpuid_level.
* Add priority for cpu notifier.
Changes in v8:
* Remove cat_socket_init_bitmap and rename cat_socket_enable_bitmap.
* Ensure opt_cos_max is not too small.
* Use CPU_DEAD instead of CPU_DYING.
* indentation fix.
Changes in v7:
* Clear the init/enable flag when a socket going offline.
* Reorder the statements in init_psr_cat.
Changes in v6:
* Introduce cat_socket_init(_enable)_bitmap.
Changes in v5:
* Add cos_max boot option.
Changes in v4:
* check X86_FEATURE_CAT available before doing initialization.
Changes in v3:
* Remove num_sockets boot option instead calculate it at boot time.
* Name hardcoded CAT cpuid leaf as PSR_CPUID_LEVEL_CAT.
Changes in v2:
* socket_num = num_sockets and fix several documentaion issues.
* refactor boot line parameters parsing into standlone patch.
* set opt_num_sockets = NR_CPUS when opt_num_sockets  NR_CPUS.
* replace CPU_ONLINE with CPU_STARTING and integrate that into scheduling
  improvement patch.
* reimplement get_max_socket() with cpu_to_socket();
* cbm is still uint64 as there is a path forward for supporting long masks.
---
 docs/misc/xen-command-line.markdown |  15 +-
 xen/arch/x86/psr.c  | 100 ++--
 xen/include/asm-x86/cpufeature.h|   1 +
 xen/include/asm-x86/psr.h   |   3 ++
 4 files changed, 114 insertions(+), 5 deletions(-)

diff --git a/docs/misc/xen-command-line.markdown 
b/docs/misc/xen-command-line.markdown
index 4889e27..28a09a8 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -1137,9 +1137,9 @@ This option can be specified more than once (up to 8 
times at present).
  `= integer`
 
 ### psr (Intel)
- `= List of ( cmt:boolean | rmid_max:integer )`
+ `= List of ( cmt:boolean | rmid_max:integer | cat:boolean | 
cos_max:integer )`
 
- Default: `psr=cmt:0,rmid_max:255`
+ Default: `psr=cmt:0,rmid_max:255,cat:0,cos_max:255`
 
 Platform Shared Resource(PSR) Services.  Intel Haswell and later server
 platforms offer information about the sharing of resources.
@@ -1149,6 +1149,12 @@ Monitoring ID(RMID) is used to bind the domain to 
corresponding shared
 resource.  RMID is a hardware-provided layer of abstraction between software
 and logical processors.
 
+To use the PSR cache allocation service for a certain domain, a capacity
+bitmasks(CBM) is used to bind the domain to corresponding shared resource.
+CBM represents cache capacity and indicates the degree of overlap and isolation
+between domains. In hypervisor a Class of Service(COS) ID is allocated for each
+unique CBM.
+
 The following resources are available:
 
 * Cache Monitoring Technology (Haswell and later).  Information regarding the
@@ -1159,6 +1165,11 @@ The following resources are available:
   total/local memory bandwidth. Follow the same options with Cache Monitoring
   Technology.
 
+* Cache Alllocation Technology (Broadwell and later).  Information regarding
+  the cache allocation.
+  * `cat` instructs Xen to enable/disable Cache Allocation Technology.
+  * `cos_max` indicates the max value for COS ID.
+
 ### reboot
  `= t[riple] | k[bd] | a[cpi] | p[ci] | P[ower] | e[fi] | n[o] [, [w]arm | 
  [c]old]`
 
diff --git a/xen/arch/x86/psr.c b/xen/arch/x86/psr.c
index 2490d22..cf6ae06 100644
--- a/xen/arch/x86/psr.c
+++ b/xen/arch/x86/psr.c
@@ -19,14 +19,25 @@
 #include asm/psr.h
 
 #define PSR_CMT(10)
+#define PSR_CAT(11)
+
+struct psr_cat_socket_info {
+unsigned int cbm_len;
+unsigned int cos_max;
+};
 
 struct psr_assoc {
 uint64_t val;
 };
 
 struct psr_cmt *__read_mostly psr_cmt;
+
+static unsigned long *__read_mostly cat_socket_enable;
+static struct psr_cat_socket_info *__read_mostly cat_socket_info;
+
 static unsigned int __initdata opt_psr;
 static unsigned int __initdata opt_rmid_max = 255;
+static unsigned int __read_mostly opt_cos_max = 255;
 static uint64_t rmid_mask;
 static DEFINE_PER_CPU(struct psr_assoc, psr_assoc);
 
@@ -63,10 +74,14 @@ static void __init parse_psr_param(char *s)
 *val_str++ = '\0';
 
 parse_psr_bool(s, val_str, cmt, PSR_CMT);
+parse_psr_bool(s, val_str, cat, PSR_CAT);
 
 if ( val_str  !strcmp(s, rmid_max) )
 opt_rmid_max = simple_strtoul(val_str, NULL, 0);
 
+if ( val_str  !strcmp(s, cos_max) )
+opt_cos_max = simple_strtoul(val_str, NULL, 0);
+
 s = ss + 1;
 } while ( ss );
 }
@@ -194,22 +209,98 @@ void psr_ctxt_switch_to(struct domain *d)
 

[Xen-devel] [PATCH v9 08/13] xsm: add CAT related xsm policies

2015-06-02 Thread Chao Peng
Add xsm policies for Cache Allocation Technology(CAT) related hypercalls
to restrict the functions visibility to control domain only.

Signed-off-by: Chao Peng chao.p.p...@linux.intel.com
Acked-by:  Daniel De Graaf dgde...@tycho.nsa.gov
---
 tools/flask/policy/policy/modules/xen/xen.if | 2 +-
 tools/flask/policy/policy/modules/xen/xen.te | 4 +++-
 xen/xsm/flask/hooks.c| 6 ++
 xen/xsm/flask/policy/access_vectors  | 4 
 4 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/tools/flask/policy/policy/modules/xen/xen.if 
b/tools/flask/policy/policy/modules/xen/xen.if
index 620d151..aa5eb72 100644
--- a/tools/flask/policy/policy/modules/xen/xen.if
+++ b/tools/flask/policy/policy/modules/xen/xen.if
@@ -51,7 +51,7 @@ define(`create_domain_common', `
getaffinity setaffinity setvcpuextstate };
allow $1 $2:domain2 { set_cpuid settsc setscheduler setclaim
set_max_evtchn set_vnumainfo get_vnumainfo cacheflush
-   psr_cmt_op };
+   psr_cmt_op psr_cat_op };
allow $1 $2:security check_context;
allow $1 $2:shadow enable;
allow $1 $2:mmu { map_read map_write adjust memorymap physmap pinpage 
mmuext_op updatemp };
diff --git a/tools/flask/policy/policy/modules/xen/xen.te 
b/tools/flask/policy/policy/modules/xen/xen.te
index ce70639..c9ecbc4 100644
--- a/tools/flask/policy/policy/modules/xen/xen.te
+++ b/tools/flask/policy/policy/modules/xen/xen.te
@@ -67,6 +67,7 @@ allow dom0_t xen_t:xen {
 allow dom0_t xen_t:xen2 {
 resource_op
 psr_cmt_op
+psr_cat_op
 };
 allow dom0_t xen_t:mmu memorymap;
 
@@ -80,7 +81,8 @@ allow dom0_t dom0_t:domain {
getpodtarget setpodtarget set_misc_info set_virq_handler
 };
 allow dom0_t dom0_t:domain2 {
-   set_cpuid gettsc settsc setscheduler set_max_evtchn set_vnumainfo 
get_vnumainfo psr_cmt_op
+   set_cpuid gettsc settsc setscheduler set_max_evtchn set_vnumainfo
+   get_vnumainfo psr_cmt_op psr_cat_op
 };
 allow dom0_t dom0_t:resource { add remove };
 
diff --git a/xen/xsm/flask/hooks.c b/xen/xsm/flask/hooks.c
index 6e37d29..317f50f 100644
--- a/xen/xsm/flask/hooks.c
+++ b/xen/xsm/flask/hooks.c
@@ -735,6 +735,9 @@ static int flask_domctl(struct domain *d, int cmd)
 case XEN_DOMCTL_psr_cmt_op:
 return current_has_perm(d, SECCLASS_DOMAIN2, DOMAIN2__PSR_CMT_OP);
 
+case XEN_DOMCTL_psr_cat_op:
+return current_has_perm(d, SECCLASS_DOMAIN2, DOMAIN2__PSR_CAT_OP);
+
 default:
 printk(flask_domctl: Unknown op %d\n, cmd);
 return -EPERM;
@@ -794,6 +797,9 @@ static int flask_sysctl(int cmd)
 case XEN_SYSCTL_psr_cmt_op:
 return avc_current_has_perm(SECINITSID_XEN, SECCLASS_XEN2,
 XEN2__PSR_CMT_OP, NULL);
+case XEN_SYSCTL_psr_cat_op:
+return avc_current_has_perm(SECINITSID_XEN, SECCLASS_XEN2,
+XEN2__PSR_CAT_OP, NULL);
 
 default:
 printk(flask_sysctl: Unknown op %d\n, cmd);
diff --git a/xen/xsm/flask/policy/access_vectors 
b/xen/xsm/flask/policy/access_vectors
index 68284d5..e1a11b2 100644
--- a/xen/xsm/flask/policy/access_vectors
+++ b/xen/xsm/flask/policy/access_vectors
@@ -85,6 +85,8 @@ class xen2
 resource_op
 # XEN_SYSCTL_psr_cmt_op
 psr_cmt_op
+# XEN_SYSCTL_psr_cat_op
+psr_cat_op
 }
 
 # Classes domain and domain2 consist of operations that a domain performs on
@@ -230,6 +232,8 @@ class domain2
 mem_paging
 # XENMEM_sharing_op
 mem_sharing
+# XEN_DOMCTL_psr_cat_op
+psr_cat_op
 }
 
 # Similar to class domain, but primarily contains domctls related to HVM 
domains
-- 
1.9.1


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v9 13/13] docs: add xl-psr.markdown

2015-06-02 Thread Chao Peng
Add document to introduce basic concepts and terms in PSR family
technologies and the xl interfaces.

Signed-off-by: Chao Peng chao.p.p...@linux.intel.com
Acked-by: Ian Campbell ian.campb...@citrix.com
---
Changes in v7:
* Correct 'xl psr-hwinfo'.
Changes in v6:
* Address comments from Ian.
Changes in v5:
* Address comments from Andrew/Ian.
---
 docs/man/xl.pod.1 |   7 ++-
 docs/misc/xl-psr.markdown | 133 ++
 2 files changed, 139 insertions(+), 1 deletion(-)
 create mode 100644 docs/misc/xl-psr.markdown

diff --git a/docs/man/xl.pod.1 b/docs/man/xl.pod.1
index d77ce77..45600e8 100644
--- a/docs/man/xl.pod.1
+++ b/docs/man/xl.pod.1
@@ -1490,6 +1490,9 @@ Intel Haswell and later server platforms offer shared 
resource monitoring
 and control technologies. The availability of these technologies and the
 hardware capabilities can be shown with Bpsr-hwinfo.
 
+See Lhttp://xenbits.xen.org/docs/unstable/misc/xl-psr.html for more
+information.
+
 =over 4
 
 =item Bpsr-hwinfo [IOPTIONS]
@@ -1573,7 +1576,8 @@ on VM basis. To enforce cache on a specific domain, just 
set capacity bitmasks
 
 =item Bpsr-cat-cbm-set [IOPTIONS] Idomain-id Icbm
 
-Set cache capacity bitmasks(CBM) for a domain.
+Set cache capacity bitmasks(CBM) for a domain. For how to specify Icbm
+please refer to Lhttp://xenbits.xen.org/docs/unstable/misc/xl-psr.html.
 
 BOPTIONS
 
@@ -1614,6 +1618,7 @@ And the following documents on the xen.org website:
 Lhttp://xenbits.xen.org/docs/unstable/misc/xl-network-configuration.html
 Lhttp://xenbits.xen.org/docs/unstable/misc/xl-disk-configuration.txt
 Lhttp://xenbits.xen.org/docs/unstable/misc/xsm-flask.txt
+Lhttp://xenbits.xen.org/docs/unstable/misc/xl-psr.html
 
 For systems that don't automatically bring CPU online:
 
diff --git a/docs/misc/xl-psr.markdown b/docs/misc/xl-psr.markdown
new file mode 100644
index 000..3545912
--- /dev/null
+++ b/docs/misc/xl-psr.markdown
@@ -0,0 +1,133 @@
+# Intel Platform Shared Resource Monitoring/Control in xl
+
+This document introduces Intel Platform Shared Resource Monitoring/Control
+technologies, their basic concepts and the xl interfaces.
+
+## Cache Monitoring Technology (CMT)
+
+Cache Monitoring Technology (CMT) is a new feature available on Intel Haswell
+and later server platforms that allows an OS or Hypervisor/VMM to determine
+the usage of cache (currently only L3 cache supported) by applications running
+on the platform. A Resource Monitoring ID (RMID) is the abstraction of the
+application(s) that will be monitored for its cache usage. The CMT hardware
+tracks cache utilization of memory accesses according to the RMID and reports
+monitored data via a counter register.
+
+For more detailed information please refer to Intel SDM chapter
+17.14 - Platform Shared Resource Monitoring: Cache Monitoring Technology.
+
+In Xen's implementation, each domain in the system can be assigned a RMID
+independently, while RMID=0 is reserved for monitoring domains that don't
+have CMT service attached. RMID is opaque for xl/libxl and is only used in
+hypervisor.
+
+### xl interfaces
+
+A domain is assigned a RMID implicitly by attaching it to CMT service:
+
+`xl psr-cmt-attach domid`
+
+After that, cache usage for the domain can be shown by:
+
+`xl psr-cmt-show cache-occupancy domid`
+
+Once monitoring is not needed any more, the domain can be detached from the
+CMT service by:
+
+`xl psr-cmt-detach domid`
+
+An attach may fail because no free RMID is available. In such a case, unused
+RMID(s) can be freed by detaching corresponding domains from CMT service.
+
+Maximum RMID and supported monitor types in the system can be obtained by:
+
+`xl psr-hwinfo --cmt`
+
+## Memory Bandwidth Monitoring (MBM)
+
+Memory Bandwidth Monitoring(MBM) is a new hardware feature available on Intel
+Broadwell and later server platforms which builds on the CMT infrastructure to
+allow monitoring of system memory bandwidth. It introduces two new monitoring
+event types to monitor system total/local memory bandwidth. The same RMID can
+be used to monitor both cache usage and memory bandwidth at the same time.
+
+For more detailed information please refer to Intel SDM chapter
+17.14 - Platform Shared Resource Monitoring: Cache Monitoring Technology.
+
+In Xen's implementation, MBM shares the same set of underlying monitoring
+service with CMT and can be used to monitor memory bandwidth on a per domain
+basis.
+
+The xl interfaces are the same as those of CMT. The difference is that the
+monitor type is the corresponding memory monitoring type (local-mem-bandwidth/
+total-mem-bandwidth instead of cache-occupancy). E.g. after a `xl 
psr-cmt-attach`:
+
+`xl psr-cmt-show local-mem-bandwidth domid`
+
+`xl psr-cmt-show total-mem-bandwidth domid`
+
+## Cache Allocation Technology (CAT)
+
+Cache Allocation Technology (CAT) is a new feature available on Intel
+Broadwell and later server platforms that allows an OS or Hypervisor/VMM to
+partition cache allocation 

[Xen-devel] [PATCH v9 06/13] x86: dynamically get/set CBM for a domain

2015-06-02 Thread Chao Peng
For CAT, COS is maintained in hypervisor only while CBM is exposed to
user space directly to allow getting/setting domain's cache capacity.
For each specified CBM, the hypervisor will either use an existing COS which
has the same CBM or allocate a new one if the same CBM is not found. If
the allocation fails because not enough COS are available, then an error is
returned. The getting/setting are always operated on a specified socket.
For multiple sockets system, the interface may be called several times.

Signed-off-by: Chao Peng chao.p.p...@linux.intel.com
Reviewed-by: Dario Faggioli dario.faggi...@citrix.com
Acked-by: Jan Beulich jbeul...@suse.com
---
Changes in v9:
* Initialize 'info' explicitly so that the compiler would not complain.
* Simplify the code and remove multiple return points.
* Remove confusing comment for 'target'.
* Add an additional check for cbm to make sure at least one bit is set (which is 
required).
Changes in v8:
* Add likely for 'socket  nr_sockets' in get_socket_cpu.
Changes in v7:
* find = found in psr_set_l3_cbm().
Changes in v6:
* Correct spin_lock scope.
Changes in v5:
* Add spin_lock to protect cbm_map.
---
 xen/arch/x86/domctl.c   |  20 ++
 xen/arch/x86/psr.c  | 139 
 xen/include/asm-x86/msr-index.h |   1 +
 xen/include/asm-x86/psr.h   |   2 +
 xen/include/public/domctl.h |  12 
 5 files changed, 174 insertions(+)

diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index e9f76d0..84baec0 100644
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -1130,6 +1130,26 @@ long arch_do_domctl(
 }
 break;
 
+case XEN_DOMCTL_psr_cat_op:
+switch ( domctl-u.psr_cat_op.cmd )
+{
+case XEN_DOMCTL_PSR_CAT_OP_SET_L3_CBM:
+ret = psr_set_l3_cbm(d, domctl-u.psr_cat_op.target,
+ domctl-u.psr_cat_op.data);
+break;
+
+case XEN_DOMCTL_PSR_CAT_OP_GET_L3_CBM:
+ret = psr_get_l3_cbm(d, domctl-u.psr_cat_op.target,
+ domctl-u.psr_cat_op.data);
+copyback = 1;
+break;
+
+default:
+ret = -EOPNOTSUPP;
+break;
+}
+break;
+
 default:
 ret = iommu_do_domctl(domctl, d, u_domctl);
 break;
diff --git a/xen/arch/x86/psr.c b/xen/arch/x86/psr.c
index 10c4cdd..fce5bca 100644
--- a/xen/arch/x86/psr.c
+++ b/xen/arch/x86/psr.c
@@ -48,6 +48,14 @@ static unsigned int __read_mostly opt_cos_max = 255;
 static uint64_t rmid_mask;
 static DEFINE_PER_CPU(struct psr_assoc, psr_assoc);
 
+static unsigned int get_socket_cpu(unsigned int socket)
+{
+if ( likely(socket  nr_sockets) )
+return cpumask_any(socket_cpumask[socket]);
+
+return nr_cpu_ids;
+}
+
 static void __init parse_psr_bool(char *s, char *value, char *feature,
   unsigned int mask)
 {
@@ -248,6 +256,137 @@ int psr_get_cat_l3_info(unsigned int socket, uint32_t 
*cbm_len,
 return ret;
 }
 
+int psr_get_l3_cbm(struct domain *d, unsigned int socket, uint64_t *cbm)
+{
+struct psr_cat_socket_info *info = NULL;
+int ret = get_cat_socket_info(socket, info);
+
+if ( !ret )
+*cbm = info-cos_to_cbm[d-arch.psr_cos_ids[socket]].cbm;
+
+return ret;
+}
+
+static bool_t psr_check_cbm(unsigned int cbm_len, uint64_t cbm)
+{
+unsigned int first_bit, zero_bit;
+
+/* Set bits should only in the range of [0, cbm_len). */
+if ( cbm  (~0ull  cbm_len) )
+return 0;
+
+/* At least one bit need to be set. */
+if ( hweight_long(cbm)  1 )
+return 0;
+
+first_bit = find_first_bit(cbm, cbm_len);
+zero_bit = find_next_zero_bit(cbm, cbm_len, first_bit);
+
+/* Set bits should be contiguous. */
+if ( zero_bit  cbm_len 
+ find_next_bit(cbm, cbm_len, zero_bit)  cbm_len )
+return 0;
+
+return 1;
+}
+
+struct cos_cbm_info
+{
+unsigned int cos;
+uint64_t cbm;
+};
+
+static void do_write_l3_cbm(void *data)
+{
+struct cos_cbm_info *info = data;
+
+wrmsrl(MSR_IA32_PSR_L3_MASK(info-cos), info-cbm);
+}
+
+static int write_l3_cbm(unsigned int socket, unsigned int cos, uint64_t cbm)
+{
+struct cos_cbm_info info = { .cos = cos, .cbm = cbm };
+
+if ( socket == cpu_to_socket(smp_processor_id()) )
+do_write_l3_cbm(info);
+else
+{
+unsigned int cpu = get_socket_cpu(socket);
+
+if ( cpu = nr_cpu_ids )
+return -EBADSLT;
+on_selected_cpus(cpumask_of(cpu), do_write_l3_cbm, info, 1);
+}
+
+return 0;
+}
+
+int psr_set_l3_cbm(struct domain *d, unsigned int socket, uint64_t cbm)
+{
+unsigned int old_cos, cos;
+struct psr_cat_cbm *map, *found = NULL;
+struct psr_cat_socket_info *info = NULL;
+int ret = get_cat_socket_info(socket, info);
+
+if ( ret )
+return ret;
+
+if ( !psr_check_cbm(info-cbm_len, cbm) )
+return -EINVAL;
+
+old_cos = 

[Xen-devel] [PATCH v9 01/13] x86: add socket_cpumask

2015-06-02 Thread Chao Peng
Maintain socket_cpumask which contains all the HT and core siblings
in the same socket.

Signed-off-by: Chao Peng chao.p.p...@linux.intel.com
---
Changes in v9:
* Add comments for set_nr_sockets.
* Move set_nr_sockets() invocation from __start_xen() to smp_prepare_cpus().
Changes in v8:
* Remove total_cpus and retrofit the algorithm for calculating nr_sockets.
* Change per-socket cpumask allocation as on demand.
* socket_to_cpumask = socket_cpumask.
Changes in v7:
* Introduce total_cpus to calculate nr_sockets.
* Minor code sequence improvement in set_cpu_sibling_map.
* Improve comments for nr_sockets.
---
 xen/arch/x86/mpparse.c| 17 +
 xen/arch/x86/smpboot.c| 26 +-
 xen/include/asm-x86/smp.h | 11 +++
 3 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/xen/arch/x86/mpparse.c b/xen/arch/x86/mpparse.c
index 003c56e..8609f4a 100644
--- a/xen/arch/x86/mpparse.c
+++ b/xen/arch/x86/mpparse.c
@@ -87,6 +87,23 @@ void __init set_nr_cpu_ids(unsigned int max_cpus)
 #endif
 }
 
+void __init set_nr_sockets(void)
+{
+/*
+ * Count the actual cpus in the socket 0 and use it to calculate nr_sockets
+ * so that the latter will be always = the actual socket number in the
+ * system even when APIC IDs from MP table are too sparse.
+ */
+unsigned int cpus = bitmap_weight(phys_cpu_present_map.mask,
+  boot_cpu_data.x86_max_cores *
+  boot_cpu_data.x86_num_siblings);
+
+if ( cpus == 0 )
+cpus = 1;
+
+nr_sockets = DIV_ROUND_UP(num_processors + disabled_cpus, cpus);
+}
+
 /*
  * Intel MP BIOS table parsing routines:
  */
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index 2289284..e75bbd3 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -60,6 +60,9 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_mask);
 cpumask_t cpu_online_map __read_mostly;
 EXPORT_SYMBOL(cpu_online_map);
 
+unsigned int __read_mostly nr_sockets;
+cpumask_var_t *__read_mostly socket_cpumask;
+
 struct cpuinfo_x86 cpu_data[NR_CPUS];
 
 u32 x86_cpu_to_apicid[NR_CPUS] __read_mostly =
@@ -245,6 +248,8 @@ static void set_cpu_sibling_map(int cpu)
 
 cpumask_set_cpu(cpu, cpu_sibling_setup_map);
 
+cpumask_set_cpu(cpu, socket_cpumask[cpu_to_socket(cpu)]);
+
 if ( c[cpu].x86_num_siblings  1 )
 {
 for_each_cpu ( i, cpu_sibling_setup_map )
@@ -649,7 +654,13 @@ void cpu_exit_clear(unsigned int cpu)
 
 static void cpu_smpboot_free(unsigned int cpu)
 {
-unsigned int order;
+unsigned int order, socket = cpu_to_socket(cpu);
+
+if ( cpumask_empty(socket_cpumask[socket]) )
+{
+free_cpumask_var(socket_cpumask[socket]);
+socket_cpumask[socket] = NULL;
+}
 
 free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
 free_cpumask_var(per_cpu(cpu_core_mask, cpu));
@@ -694,6 +705,7 @@ static int cpu_smpboot_alloc(unsigned int cpu)
 nodeid_t node = cpu_to_node(cpu);
 struct desc_struct *gdt;
 unsigned long stub_page;
+unsigned int socket = cpu_to_socket(cpu);
 
 if ( node != NUMA_NO_NODE )
 memflags = MEMF_node(node);
@@ -736,6 +748,10 @@ static int cpu_smpboot_alloc(unsigned int cpu)
 goto oom;
 per_cpu(stubs.addr, cpu) = stub_page + STUB_BUF_CPU_OFFS(cpu);
 
+if ( !socket_cpumask[socket] 
+ !zalloc_cpumask_var(socket_cpumask + socket) )
+goto oom;
+
 if ( zalloc_cpumask_var(per_cpu(cpu_sibling_mask, cpu)) 
  zalloc_cpumask_var(per_cpu(cpu_core_mask, cpu)) )
 return 0;
@@ -786,6 +802,12 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 
 stack_base[0] = stack_start;
 
+set_nr_sockets();
+
+socket_cpumask = xzalloc_array(cpumask_var_t, nr_sockets);
+if ( !socket_cpumask || !zalloc_cpumask_var(socket_cpumask) )
+panic(No memory for socket CPU siblings map);
+
 if ( !zalloc_cpumask_var(per_cpu(cpu_sibling_mask, 0)) ||
  !zalloc_cpumask_var(per_cpu(cpu_core_mask, 0)) )
 panic(No memory for boot CPU sibling/core maps);
@@ -851,6 +873,8 @@ remove_siblinginfo(int cpu)
 int sibling;
 struct cpuinfo_x86 *c = cpu_data;
 
+cpumask_clear_cpu(cpu, socket_cpumask[cpu_to_socket(cpu)]);
+
 for_each_cpu ( sibling, per_cpu(cpu_core_mask, cpu) )
 {
 cpumask_clear_cpu(cpu, per_cpu(cpu_core_mask, sibling));
diff --git a/xen/include/asm-x86/smp.h b/xen/include/asm-x86/smp.h
index 67518cf..e594062 100644
--- a/xen/include/asm-x86/smp.h
+++ b/xen/include/asm-x86/smp.h
@@ -58,6 +58,17 @@ int hard_smp_processor_id(void);
 
 void __stop_this_cpu(void);
 
+/*
+ * The value may be greater than the actual socket number in the system and
+ * is required not to change from the initial startup.
+ */
+extern unsigned int nr_sockets;
+
+void set_nr_sockets(void);
+
+/* Representing HT and core siblings in each socket. */
+extern cpumask_var_t *socket_cpumask;
+
 #endif /* !__ASSEMBLY__ */
 

[Xen-devel] [PATCH v9 03/13] x86: maintain COS to CBM mapping for each socket

2015-06-02 Thread Chao Peng
For each socket, a COS to CBM mapping structure is maintained for each
COS. The mapping is indexed by COS and the value is the corresponding
CBM. Different VMs may use the same CBM, a reference count is used to
indicate if the CBM is available.

Signed-off-by: Chao Peng chao.p.p...@linux.intel.com
---
Changes in v9:
* Allocate cos_to_cbm with opt_cos_max instead the actual cos_max from cpuid.
* Move CAT initialization code back to CPU_STARTING.
* Correct initialization logic for boot cpu.
Changes in v8:
* Move the memory allocation and CAT initialization code to CPU_UP_PREPARE.
* Add memory freeing code in CPU_DEAD path.
Changes in v5:
* rename cos_cbm_map to cos_to_cbm.
---
 xen/arch/x86/psr.c | 65 --
 1 file changed, 58 insertions(+), 7 deletions(-)

diff --git a/xen/arch/x86/psr.c b/xen/arch/x86/psr.c
index cf6ae06..2388121 100644
--- a/xen/arch/x86/psr.c
+++ b/xen/arch/x86/psr.c
@@ -21,9 +21,15 @@
 #define PSR_CMT(10)
 #define PSR_CAT(11)
 
+struct psr_cat_cbm {
+uint64_t cbm;
+unsigned int ref;
+};
+
 struct psr_cat_socket_info {
 unsigned int cbm_len;
 unsigned int cos_max;
+struct psr_cat_cbm *cos_to_cbm;
 };
 
 struct psr_assoc {
@@ -209,6 +215,23 @@ void psr_ctxt_switch_to(struct domain *d)
 }
 }
 
+static int cat_cpu_prepare(unsigned int cpu)
+{
+struct psr_cat_socket_info *info;
+
+if ( !cat_socket_info )
+return 0;
+
+info = cat_socket_info + cpu_to_socket(cpu);
+if ( info-cos_to_cbm )
+return 0;
+else
+{
+info-cos_to_cbm = xzalloc_array(struct psr_cat_cbm, opt_cos_max + 
1UL);
+return info-cos_to_cbm ? 0 : -ENOMEM;
+}
+}
+
 static void cat_cpu_init(void)
 {
 unsigned int eax, ebx, ecx, edx;
@@ -232,6 +255,9 @@ static void cat_cpu_init(void)
 info-cbm_len = (eax  0x1f) + 1;
 info-cos_max = min(opt_cos_max, edx  0x);
 
+/* cos=0 is reserved as default cbm(all ones). */
+info-cos_to_cbm[0].cbm = (1ull  info-cbm_len) - 1;
+
 set_bit(socket, cat_socket_enable);
 printk(XENLOG_INFO CAT: enabled on socket %u, cos_max:%u, 
cbm_len:%u\n,
socket, info-cos_max, info-cbm_len);
@@ -243,7 +269,24 @@ static void cat_cpu_fini(unsigned int cpu)
 unsigned int socket = cpu_to_socket(cpu);
 
 if ( !socket_cpumask[socket] || cpumask_empty(socket_cpumask[socket]) )
+{
+struct psr_cat_socket_info *info = cat_socket_info + socket;
+
+if ( info-cos_to_cbm )
+{
+xfree(info-cos_to_cbm);
+info-cos_to_cbm = NULL;
+}
 clear_bit(socket, cat_socket_enable);
+}
+}
+
+static void __init psr_cat_free(void)
+{
+xfree(cat_socket_enable);
+cat_socket_enable = NULL;
+xfree(cat_socket_info);
+cat_socket_info = NULL;
 }
 
 static void __init init_psr_cat(void)
@@ -258,12 +301,12 @@ static void __init init_psr_cat(void)
 cat_socket_info = xzalloc_array(struct psr_cat_socket_info, nr_sockets);
 
 if ( !cat_socket_enable || !cat_socket_info )
-{
-xfree(cat_socket_enable);
-cat_socket_enable = NULL;
-xfree(cat_socket_info);
-cat_socket_info = NULL;
-}
+psr_cat_free();
+}
+
+static int psr_cpu_prepare(unsigned int cpu)
+{
+return cat_cpu_prepare(cpu);
 }
 
 static void psr_cpu_init(void)
@@ -283,19 +326,24 @@ static void psr_cpu_fini(unsigned int cpu)
 static int cpu_callback(
 struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
+int rc = 0;
 unsigned int cpu = (unsigned long)hcpu;
 
 switch ( action )
 {
+case CPU_UP_PREPARE:
+rc = psr_cpu_prepare(cpu);
+break;
 case CPU_STARTING:
 psr_cpu_init();
 break;
+case CPU_UP_CANCELED:
 case CPU_DEAD:
 psr_cpu_fini(cpu);
 break;
 }
 
-return NOTIFY_DONE;
+return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
 }
 
 static struct notifier_block cpu_nfb = {
@@ -311,6 +359,9 @@ static int __init psr_presmp_init(void)
 if ( opt_psr  PSR_CAT )
 init_psr_cat();
 
+if ( psr_cpu_prepare(0) )
+psr_cat_free();
+
 psr_cpu_init();
 if ( psr_cmt_enabled() || cat_socket_info )
 register_cpu_notifier(cpu_nfb);
-- 
1.9.1


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v9 04/13] x86: add COS information for each domain

2015-06-02 Thread Chao Peng
In Xen's implementation, the CAT enforcement granularity is per domain.
Due to the length of CBM and the number of COS may be socket-different,
each domain has COS ID for each socket. The domain get COS=0 by default
and at runtime its COS is then allocated dynamically when user specifies
a CBM for the domain.

Signed-off-by: Chao Peng chao.p.p...@linux.intel.com
Reviewed-by: Andrew Cooper andrew.coop...@citrix.com
---
Changes in v6:
* Add spinlock for cos_to_cbm.
---
 xen/arch/x86/domain.c|  6 +-
 xen/arch/x86/psr.c   | 49 
 xen/include/asm-x86/domain.h |  5 -
 xen/include/asm-x86/psr.h|  3 +++
 4 files changed, 61 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index db073a6..0c4c43b 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -616,6 +616,9 @@ int arch_domain_create(struct domain *d, unsigned int 
domcr_flags,
 /* 64-bit PV guest by default. */
 d-arch.is_32bit_pv = d-arch.has_32bit_shinfo = 0;
 
+if ( (rc = psr_domain_init(d)) != 0 )
+goto fail;
+
 /* initialize default tsc behavior in case tools don't */
 tsc_set_info(d, TSC_MODE_DEFAULT, 0UL, 0, 0);
 spin_lock_init(d-arch.vtsc_lock);
@@ -634,6 +637,7 @@ int arch_domain_create(struct domain *d, unsigned int 
domcr_flags,
 free_perdomain_mappings(d);
 if ( is_pv_domain(d) )
 free_xenheap_page(d-arch.pv_domain.gdt_ldt_l1tab);
+psr_domain_free(d);
 return rc;
 }
 
@@ -657,7 +661,7 @@ void arch_domain_destroy(struct domain *d)
 free_xenheap_page(d-shared_info);
 cleanup_domain_irq_mapping(d);
 
-psr_free_rmid(d);
+psr_domain_free(d);
 }
 
 void arch_domain_shutdown(struct domain *d)
diff --git a/xen/arch/x86/psr.c b/xen/arch/x86/psr.c
index 2388121..bbb2485 100644
--- a/xen/arch/x86/psr.c
+++ b/xen/arch/x86/psr.c
@@ -30,6 +30,7 @@ struct psr_cat_socket_info {
 unsigned int cbm_len;
 unsigned int cos_max;
 struct psr_cat_cbm *cos_to_cbm;
+spinlock_t cbm_lock;
 };
 
 struct psr_assoc {
@@ -215,6 +216,52 @@ void psr_ctxt_switch_to(struct domain *d)
 }
 }
 
+/* Called with domain lock held, no extra lock needed for 'psr_cos_ids' */
+static void psr_free_cos(struct domain *d)
+{
+unsigned int socket;
+unsigned int cos;
+struct psr_cat_socket_info *info;
+
+if( !d-arch.psr_cos_ids )
+return;
+
+for ( socket = 0; socket  nr_sockets; socket++ )
+{
+if ( !test_bit(socket, cat_socket_enable) )
+continue;
+
+if ( (cos = d-arch.psr_cos_ids[socket]) == 0 )
+continue;
+
+info = cat_socket_info + socket;
+spin_lock(info-cbm_lock);
+info-cos_to_cbm[cos].ref--;
+spin_unlock(info-cbm_lock);
+}
+
+xfree(d-arch.psr_cos_ids);
+d-arch.psr_cos_ids = NULL;
+}
+
+int psr_domain_init(struct domain *d)
+{
+if ( cat_socket_info )
+{
+d-arch.psr_cos_ids = xzalloc_array(unsigned int, nr_sockets);
+if ( !d-arch.psr_cos_ids )
+return -ENOMEM;
+}
+
+return 0;
+}
+
+void psr_domain_free(struct domain *d)
+{
+psr_free_rmid(d);
+psr_free_cos(d);
+}
+
 static int cat_cpu_prepare(unsigned int cpu)
 {
 struct psr_cat_socket_info *info;
@@ -258,6 +305,8 @@ static void cat_cpu_init(void)
 /* cos=0 is reserved as default cbm(all ones). */
 info-cos_to_cbm[0].cbm = (1ull  info-cbm_len) - 1;
 
+spin_lock_init(info-cbm_lock);
+
 set_bit(socket, cat_socket_enable);
 printk(XENLOG_INFO CAT: enabled on socket %u, cos_max:%u, 
cbm_len:%u\n,
socket, info-cos_max, info-cbm_len);
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 45b5283..fee50a1 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -333,7 +333,10 @@ struct arch_domain
 struct e820entry *e820;
 unsigned int nr_e820;
 
-unsigned int psr_rmid; /* RMID assigned to the domain for CMT */
+/* RMID assigned to the domain for CMT */
+unsigned int psr_rmid;
+/* COS assigned to the domain for each socket */
+unsigned int *psr_cos_ids;
 
 /* Shared page for notifying that explicit PIRQ EOI is required. */
 unsigned long *pirq_eoi_map;
diff --git a/xen/include/asm-x86/psr.h b/xen/include/asm-x86/psr.h
index bdda111..1023d5f 100644
--- a/xen/include/asm-x86/psr.h
+++ b/xen/include/asm-x86/psr.h
@@ -51,6 +51,9 @@ int psr_alloc_rmid(struct domain *d);
 void psr_free_rmid(struct domain *d);
 void psr_ctxt_switch_to(struct domain *d);
 
+int psr_domain_init(struct domain *d);
+void psr_domain_free(struct domain *d);
+
 #endif /* __ASM_PSR_H__ */
 
 /*
-- 
1.9.1


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v9 05/13] x86: expose CBM length and COS number information

2015-06-02 Thread Chao Peng
General CAT information, such as the maximum COS and the CBM length, is exposed
to user space by a SYSCTL hypercall, to help user space construct the CBM.

Signed-off-by: Chao Peng chao.p.p...@linux.intel.com
Reviewed-by: Andrew Cooper andrew.coop...@citrix.com
---
Changes in v9:
* Initialize 'info' explicitly so that the compiler would not complain.
* Simplify the code and remove multiple return points.
* Remove confusing comment for 'target'.
Changes in v7:
* Copyback psr_cat_op only for XEN_SYSCTL_PSR_CAT_get_l3_info.
---
 xen/arch/x86/psr.c  | 32 
 xen/arch/x86/sysctl.c   | 18 ++
 xen/include/asm-x86/psr.h   |  3 +++
 xen/include/public/sysctl.h | 16 
 4 files changed, 69 insertions(+)

diff --git a/xen/arch/x86/psr.c b/xen/arch/x86/psr.c
index bbb2485..10c4cdd 100644
--- a/xen/arch/x86/psr.c
+++ b/xen/arch/x86/psr.c
@@ -216,6 +216,38 @@ void psr_ctxt_switch_to(struct domain *d)
 }
 }
 
+static int get_cat_socket_info(unsigned int socket,
+   struct psr_cat_socket_info **info)
+{
+if ( !cat_socket_info )
+return -ENODEV;
+
+if ( socket = nr_sockets )
+return -EBADSLT;
+
+if ( !test_bit(socket, cat_socket_enable) )
+return -ENOENT;
+
+*info = cat_socket_info + socket;
+
+return 0;
+}
+
+int psr_get_cat_l3_info(unsigned int socket, uint32_t *cbm_len,
+uint32_t *cos_max)
+{
+struct psr_cat_socket_info *info = NULL;
+int ret = get_cat_socket_info(socket, info);
+
+if ( !ret )
+{
+*cbm_len = info-cbm_len;
+*cos_max = info-cos_max;
+}
+
+return ret;
+}
+
 /* Called with domain lock held, no extra lock needed for 'psr_cos_ids' */
 static void psr_free_cos(struct domain *d)
 {
diff --git a/xen/arch/x86/sysctl.c b/xen/arch/x86/sysctl.c
index 611a291..f36b52f 100644
--- a/xen/arch/x86/sysctl.c
+++ b/xen/arch/x86/sysctl.c
@@ -171,6 +171,24 @@ long arch_do_sysctl(
 
 break;
 
+case XEN_SYSCTL_psr_cat_op:
+switch ( sysctl-u.psr_cat_op.cmd )
+{
+case XEN_SYSCTL_PSR_CAT_get_l3_info:
+ret = psr_get_cat_l3_info(sysctl-u.psr_cat_op.target,
+  sysctl-u.psr_cat_op.u.l3_info.cbm_len,
+  sysctl-u.psr_cat_op.u.l3_info.cos_max);
+
+if ( !ret  __copy_field_to_guest(u_sysctl, sysctl, u.psr_cat_op) 
)
+ret = -EFAULT;
+
+break;
+default:
+ret = -EOPNOTSUPP;
+break;
+}
+break;
+
 default:
 ret = -ENOSYS;
 break;
diff --git a/xen/include/asm-x86/psr.h b/xen/include/asm-x86/psr.h
index 1023d5f..d364e8c 100644
--- a/xen/include/asm-x86/psr.h
+++ b/xen/include/asm-x86/psr.h
@@ -51,6 +51,9 @@ int psr_alloc_rmid(struct domain *d);
 void psr_free_rmid(struct domain *d);
 void psr_ctxt_switch_to(struct domain *d);
 
+int psr_get_cat_l3_info(unsigned int socket, uint32_t *cbm_len,
+uint32_t *cos_max);
+
 int psr_domain_init(struct domain *d);
 void psr_domain_free(struct domain *d);
 
diff --git a/xen/include/public/sysctl.h b/xen/include/public/sysctl.h
index 0cf9277..cd544c0 100644
--- a/xen/include/public/sysctl.h
+++ b/xen/include/public/sysctl.h
@@ -694,6 +694,20 @@ struct xen_sysctl_pcitopoinfo {
 typedef struct xen_sysctl_pcitopoinfo xen_sysctl_pcitopoinfo_t;
 DEFINE_XEN_GUEST_HANDLE(xen_sysctl_pcitopoinfo_t);
 
+#define XEN_SYSCTL_PSR_CAT_get_l3_info   0
+struct xen_sysctl_psr_cat_op {
+uint32_t cmd;   /* IN: XEN_SYSCTL_PSR_CAT_* */
+uint32_t target;/* IN */
+union {
+struct {
+uint32_t cbm_len;   /* OUT: CBM length */
+uint32_t cos_max;   /* OUT: Maximum COS */
+} l3_info;
+} u;
+};
+typedef struct xen_sysctl_psr_cat_op xen_sysctl_psr_cat_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_psr_cat_op_t);
+
 struct xen_sysctl {
 uint32_t cmd;
 #define XEN_SYSCTL_readconsole1
@@ -717,6 +731,7 @@ struct xen_sysctl {
 #define XEN_SYSCTL_coverage_op   20
 #define XEN_SYSCTL_psr_cmt_op21
 #define XEN_SYSCTL_pcitopoinfo   22
+#define XEN_SYSCTL_psr_cat_op23
 uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */
 union {
 struct xen_sysctl_readconsole   readconsole;
@@ -740,6 +755,7 @@ struct xen_sysctl {
 struct xen_sysctl_scheduler_op  scheduler_op;
 struct xen_sysctl_coverage_op   coverage_op;
 struct xen_sysctl_psr_cmt_oppsr_cmt_op;
+struct xen_sysctl_psr_cat_oppsr_cat_op;
 uint8_t pad[128];
 } u;
 };
-- 
1.9.1


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v9 00/13] enable Cache Allocation Technology (CAT) for VMs

2015-06-02 Thread Chao Peng
Changes in v9:
Address comments from Jan, mainly:
* Move set_nr_sockets() invocation from __start_xen() to smp_prepare_cpus().
* Add check for cpuid_level.
* Add priority for cpu notifier.
* Allocate cos_to_cbm with opt_cos_max instead of the actual cos_max from cpuid.
* Move CAT initialization code back to CPU_STARTING.
* Initialize 'info' explicitly so that the compiler would not complain.
* Add an additional check for cbm to make sure at least one bit is set (which is 
required).
Changes in v8:
Address comments from Jan, mainly:
* Remove total_cpus and retrofit the algorithm for calculating nr_sockets.
* Change per-socket cpumask allocation as on demand.
* Remove cat_socket_init_bitmap and rename cat_socket_enable_bitmap.
* Ensure opt_cos_max is not too small.
* Use the right notification for memory allocation/freeing.
Changes in v7:
Address comments from Jan/Ian, mainly:
* Introduce total_cpus to calculate nr_sockets.
* Clear the init/enable flag when a socket going offline.
* Reorder the statements in init_psr_cat.
* Copyback psr_cat_op only for XEN_SYSCTL_PSR_CAT_get_l3_info.
* Broadcast LIBXL_HAVE_SOCKET_BITMAP_ALLOC.
* Add PSR head1 level section and change CMT/CAT as its subsections for xl man 
page.
Changes in v6:
Address comments from Andrew/Dario/Ian, mainly:
* Introduce cat_socket_init(_enable)_bitmap.
* Merge xl psr-cmt/cat-hwinfo = xl psr-hwinfo.
* Add function header to explain the 'target' parameter.
* Use bitmap instead of TARGETS_ALL.
* Document fix.
Changes in v5:
* Address comments from Andrew and Ian(Detail in patch).
* Add socket_to_cpumask.
* Add xl psr-cmt/cat-hwinfo.
* Add some libxl CMT enhancement.
Changes in v4:
* Address comments from Andrew and Ian(Detail in patch).
* Split COS/CBM management patch into 4 small patches.
* Add documentation xl-psr.markdown.
Changes in v3:
* Address comments from Jan and Ian(Detail in patch).
* Add xl sample output in cover letter.
Changes in v2:
* Address comments from Konrad and Jan(Detail in patch):
* Make all cat unrelated changes into the preparation patches. 

This patch series enables the new Cache Allocation Technology (CAT) feature
found in Intel Broadwell and later server platforms. In Xen's implementation,
CAT is used to control cache allocation on a per-VM basis.

Detail hardware spec can be found in section 17.15 of the Intel SDM [1].
The design for XEN can be found at [2].

patch1: preparation.
patch2-8:   real work for CAT.
patch9-10:  enhancement for CMT.
patch11:libxl prepareation
patch12:tools side work for CAT.
patch13:xl document for CMT/MBM/CAT.

[1] Intel SDM 
(http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf)
[2] CAT design for XEN( 
http://lists.xen.org/archives/html/xen-devel/2014-12/msg01382.html)


Chao Peng (13):
  x86: add socket_cpumask
  x86: detect and initialize Intel CAT feature
  x86: maintain COS to CBM mapping for each socket
  x86: add COS information for each domain
  x86: expose CBM length and COS number information
  x86: dynamically get/set CBM for a domain
  x86: add scheduling support for Intel CAT
  xsm: add CAT related xsm policies
  tools/libxl: minor name changes for CMT commands
  tools/libxl: add command to show PSR hardware info
  tools/libxl: introduce some socket helpers
  tools: add tools support for Intel CAT
  docs: add xl-psr.markdown

 docs/man/xl.pod.1|  76 -
 docs/misc/xen-command-line.markdown  |  15 +-
 docs/misc/xl-psr.markdown| 133 +
 tools/flask/policy/policy/modules/xen/xen.if |   2 +-
 tools/flask/policy/policy/modules/xen/xen.te |   4 +-
 tools/libxc/include/xenctrl.h|  15 +
 tools/libxc/xc_psr.c |  76 +
 tools/libxl/libxl.h  |  42 +++
 tools/libxl/libxl_internal.h |   2 +
 tools/libxl/libxl_psr.c  | 143 +-
 tools/libxl/libxl_types.idl  |  10 +
 tools/libxl/libxl_utils.c|  46 
 tools/libxl/libxl_utils.h|   2 +
 tools/libxl/xl.h |   5 +
 tools/libxl/xl_cmdimpl.c | 262 +-
 tools/libxl/xl_cmdtable.c|  27 +-
 xen/arch/x86/domain.c|   6 +-
 xen/arch/x86/domctl.c|  20 ++
 xen/arch/x86/mpparse.c   |  17 ++
 xen/arch/x86/psr.c   | 396 ++-
 xen/arch/x86/smpboot.c   |  26 +-
 xen/arch/x86/sysctl.c|  18 ++
 xen/include/asm-x86/cpufeature.h |   1 +
 xen/include/asm-x86/domain.h |   5 +-
 xen/include/asm-x86/msr-index.h  |   1 +
 xen/include/asm-x86/psr.h|  11 +
 xen/include/asm-x86/smp.h|  11 +
 xen/include/public/domctl.h  |  12 +
 

[Xen-devel] [PATCH v9 07/13] x86: add scheduling support for Intel CAT

2015-06-02 Thread Chao Peng
On context switch, write the domain's Class of Service (COS) to MSR
IA32_PQR_ASSOC, to notify hardware to use the new COS.

For performance reasons, the COS mask for the current CPU is also cached in
the local per-CPU variable.

Signed-off-by: Chao Peng chao.p.p...@linux.intel.com
Acked-by: Jan Beulich jbeul...@suse.com
---
Changes in v5:
* Remove the need to cache socket.
Changes in v2:
* merge common scheduling changes into scheduling improvement patch.
* use readable expr for psra-cos_mask.
---
 xen/arch/x86/psr.c | 23 ++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/xen/arch/x86/psr.c b/xen/arch/x86/psr.c
index fce5bca..daa3c9c 100644
--- a/xen/arch/x86/psr.c
+++ b/xen/arch/x86/psr.c
@@ -35,6 +35,7 @@ struct psr_cat_socket_info {
 
 struct psr_assoc {
 uint64_t val;
+uint64_t cos_mask;
 };
 
 struct psr_cmt *__read_mostly psr_cmt;
@@ -200,7 +201,16 @@ static inline void psr_assoc_init(void)
 {
 struct psr_assoc *psra = this_cpu(psr_assoc);
 
-if ( psr_cmt_enabled() )
+if ( cat_socket_info )
+{
+unsigned int socket = cpu_to_socket(smp_processor_id());
+
+if ( test_bit(socket, cat_socket_enable) )
+psra-cos_mask = ((1ull  get_count_order(
+ cat_socket_info[socket].cos_max)) - 1)  32;
+}
+
+if ( psr_cmt_enabled() || psra-cos_mask )
 rdmsrl(MSR_IA32_PSR_ASSOC, psra-val);
 }
 
@@ -209,6 +219,12 @@ static inline void psr_assoc_rmid(uint64_t *reg, unsigned 
int rmid)
 *reg = (*reg  ~rmid_mask) | (rmid  rmid_mask);
 }
 
+static inline void psr_assoc_cos(uint64_t *reg, unsigned int cos,
+ uint64_t cos_mask)
+{
+*reg = (*reg  ~cos_mask) | (((uint64_t)cos  32)  cos_mask);
+}
+
 void psr_ctxt_switch_to(struct domain *d)
 {
 struct psr_assoc *psra = this_cpu(psr_assoc);
@@ -217,6 +233,11 @@ void psr_ctxt_switch_to(struct domain *d)
 if ( psr_cmt_enabled() )
 psr_assoc_rmid(reg, d-arch.psr_rmid);
 
+if ( psra-cos_mask )
+psr_assoc_cos(reg, d-arch.psr_cos_ids ?
+  d-arch.psr_cos_ids[cpu_to_socket(smp_processor_id())] :
+  0, psra-cos_mask);
+
 if ( reg != psra-val )
 {
 wrmsrl(MSR_IA32_PSR_ASSOC, reg);
-- 
1.9.1


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v9 11/13] tools/libxl: introduce some socket helpers

2015-06-02 Thread Chao Peng
Add libxl_socket_bitmap_alloc() to allow allocating a socket specific
libxl_bitmap (as it is for cpu/node bitmap).

Internal function libxl__count_physical_sockets() is introduced together
to get the socket count when the size of bitmap is not specified.

Signed-off-by: Chao Peng chao.p.p...@linux.intel.com
Acked-by: Ian Campbell ian.campb...@citrix.com
---
Changes in v7:
* Broadcast LIBXL_HAVE_SOCKET_BITMAP_ALLOC
---
 tools/libxl/libxl.h  |  7 +++
 tools/libxl/libxl_internal.h |  2 ++
 tools/libxl/libxl_utils.c| 46 
 tools/libxl/libxl_utils.h|  2 ++
 4 files changed, 57 insertions(+)

diff --git a/tools/libxl/libxl.h b/tools/libxl/libxl.h
index 0a7913b..13e7a8c 100644
--- a/tools/libxl/libxl.h
+++ b/tools/libxl/libxl.h
@@ -775,6 +775,13 @@ void libxl_mac_copy(libxl_ctx *ctx, libxl_mac *dst, 
libxl_mac *src);
  */
 #define LIBXL_HAVE_PCITOPOLOGY 1
 
+/*
+ * LIBXL_HAVE_SOCKET_BITMAP_ALLOC
+ *
+ * If this is defined, then libxl_socket_bitmap_alloc exists.
+ */
+#define LIBXL_HAVE_SOCKET_BITMAP_ALLOC 1
+
 typedef char **libxl_string_list;
 void libxl_string_list_dispose(libxl_string_list *sl);
 int libxl_string_list_length(const libxl_string_list *sl);
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index bb3a5c7..696e4a0 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -3702,6 +3702,8 @@ static inline void libxl__update_config_vtpm(libxl__gc 
*gc,
  */
 void libxl__bitmap_copy_best_effort(libxl__gc *gc, libxl_bitmap *dptr,
 const libxl_bitmap *sptr);
+
+int libxl__count_physical_sockets(libxl__gc *gc, int *sockets);
 #endif
 
 /*
diff --git a/tools/libxl/libxl_utils.c b/tools/libxl/libxl_utils.c
index f6be2d7..bfc9699 100644
--- a/tools/libxl/libxl_utils.c
+++ b/tools/libxl/libxl_utils.c
@@ -840,6 +840,52 @@ int libxl_node_bitmap_alloc(libxl_ctx *ctx, libxl_bitmap 
*nodemap,
 return rc;
 }
 
+int libxl__count_physical_sockets(libxl__gc *gc, int *sockets)
+{
+int rc;
+libxl_physinfo info;
+
+libxl_physinfo_init(info);
+
+rc = libxl_get_physinfo(CTX, info);
+if (rc)
+return rc;
+
+*sockets = info.nr_cpus / info.threads_per_core
+/ info.cores_per_socket;
+
+libxl_physinfo_dispose(info);
+return 0;
+}
+
+int libxl_socket_bitmap_alloc(libxl_ctx *ctx, libxl_bitmap *socketmap,
+  int max_sockets)
+{
+GC_INIT(ctx);
+int rc = 0;
+
+if (max_sockets  0) {
+rc = ERROR_INVAL;
+LOG(ERROR, invalid number of sockets provided);
+goto out;
+}
+
+if (max_sockets == 0) {
+rc = libxl__count_physical_sockets(gc, max_sockets);
+if (rc) {
+LOGE(ERROR, failed to get system socket count);
+goto out;
+}
+}
+/* This can't fail: no need to check and log */
+libxl_bitmap_alloc(ctx, socketmap, max_sockets);
+
+ out:
+GC_FREE;
+return rc;
+
+}
+
 int libxl_nodemap_to_cpumap(libxl_ctx *ctx,
 const libxl_bitmap *nodemap,
 libxl_bitmap *cpumap)
diff --git a/tools/libxl/libxl_utils.h b/tools/libxl/libxl_utils.h
index 1c1761d..82340ec 100644
--- a/tools/libxl/libxl_utils.h
+++ b/tools/libxl/libxl_utils.h
@@ -141,6 +141,8 @@ static inline int libxl_bitmap_equal(const libxl_bitmap *ba,
 int libxl_cpu_bitmap_alloc(libxl_ctx *ctx, libxl_bitmap *cpumap, int max_cpus);
 int libxl_node_bitmap_alloc(libxl_ctx *ctx, libxl_bitmap *nodemap,
 int max_nodes);
+int libxl_socket_bitmap_alloc(libxl_ctx *ctx, libxl_bitmap *socketmap,
+  int max_sockets);
 
 /* Populate cpumap with the cpus spanned by the nodes in nodemap */
 int libxl_nodemap_to_cpumap(libxl_ctx *ctx,
-- 
1.9.1


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v9 10/13] tools/libxl: add command to show PSR hardware info

2015-06-02 Thread Chao Peng
Add a dedicated command to show hardware information.

[root@vmm-psr]xl psr-hwinfo
Cache Monitoring Technology (CMT):
Enabled : 1
Total RMID  : 63
Supported monitor types:
cache-occupancy
total-mem-bandwidth
local-mem-bandwidth

Signed-off-by: Chao Peng chao.p.p...@linux.intel.com
Reviewed-by: Dario Faggioli dario.faggi...@citrix.com
Acked-by: Ian Campbell ian.campb...@citrix.com
---
Changes in v6:
* Add SWITCH_FOREACH_OPT to make '-h' work.
---
 docs/man/xl.pod.1 |  4 
 tools/libxl/xl.h  |  1 +
 tools/libxl/xl_cmdimpl.c  | 41 +
 tools/libxl/xl_cmdtable.c |  5 +
 4 files changed, 51 insertions(+)

diff --git a/docs/man/xl.pod.1 b/docs/man/xl.pod.1
index 4eb929d..cebec46 100644
--- a/docs/man/xl.pod.1
+++ b/docs/man/xl.pod.1
@@ -1502,6 +1502,10 @@ for any of these monitoring types.
 
 =over 4
 
+=item Bpsr-hwinfo
+
+Show CMT hardware information.
+
 =item Bpsr-cmt-attach [Idomain-id]
 
 attach: Attach the platform shared resource monitoring service to a domain.
diff --git a/tools/libxl/xl.h b/tools/libxl/xl.h
index 5bc138c..7b56449 100644
--- a/tools/libxl/xl.h
+++ b/tools/libxl/xl.h
@@ -113,6 +113,7 @@ int main_remus(int argc, char **argv);
 #endif
 int main_devd(int argc, char **argv);
 #ifdef LIBXL_HAVE_PSR_CMT
+int main_psr_hwinfo(int argc, char **argv);
 int main_psr_cmt_attach(int argc, char **argv);
 int main_psr_cmt_detach(int argc, char **argv);
 int main_psr_cmt_show(int argc, char **argv);
diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
index fbc69ab..e76a154 100644
--- a/tools/libxl/xl_cmdimpl.c
+++ b/tools/libxl/xl_cmdimpl.c
@@ -8080,6 +8080,36 @@ out:
 }
 
 #ifdef LIBXL_HAVE_PSR_CMT
+static int psr_cmt_hwinfo(void)
+{
+int rc;
+int enabled;
+uint32_t total_rmid;
+
+printf(Cache Monitoring Technology (CMT):\n);
+
+enabled = libxl_psr_cmt_enabled(ctx);
+printf(%-16s: %s\n, Enabled, enabled ? 1 : 0);
+if (!enabled)
+return 0;
+
+rc = libxl_psr_cmt_get_total_rmid(ctx, total_rmid);
+if (rc) {
+fprintf(stderr, Failed to get max RMID value\n);
+return rc;
+}
+printf(%-16s: %u\n, Total RMID, total_rmid);
+
+printf(Supported monitor types:\n);
+if (libxl_psr_cmt_type_supported(ctx, LIBXL_PSR_CMT_TYPE_CACHE_OCCUPANCY))
+printf(cache-occupancy\n);
+if (libxl_psr_cmt_type_supported(ctx, LIBXL_PSR_CMT_TYPE_TOTAL_MEM_COUNT))
+printf(total-mem-bandwidth\n);
+if (libxl_psr_cmt_type_supported(ctx, LIBXL_PSR_CMT_TYPE_LOCAL_MEM_COUNT))
+printf(local-mem-bandwidth\n);
+
+return rc;
+}
 
 #define MBM_SAMPLE_RETRY_MAX 4
 static int psr_cmt_get_mem_bandwidth(uint32_t domid,
@@ -8246,6 +8276,17 @@ static int psr_cmt_show(libxl_psr_cmt_type type, 
uint32_t domid)
 return 0;
 }
 
+int main_psr_hwinfo(int argc, char **argv)
+{
+int opt;
+
+SWITCH_FOREACH_OPT(opt, , NULL, psr-hwinfo, 0) {
+/* No options */
+}
+
+return psr_cmt_hwinfo();
+}
+
 int main_psr_cmt_attach(int argc, char **argv)
 {
 uint32_t domid;
diff --git a/tools/libxl/xl_cmdtable.c b/tools/libxl/xl_cmdtable.c
index 12899d1..77a37c5 100644
--- a/tools/libxl/xl_cmdtable.c
+++ b/tools/libxl/xl_cmdtable.c
@@ -525,6 +525,11 @@ struct cmd_spec cmd_table[] = {
   -F  Run in the foreground,
 },
 #ifdef LIBXL_HAVE_PSR_CMT
+{ psr-hwinfo,
+  main_psr_hwinfo, 0, 1,
+  Show hardware information for Platform Shared Resource,
+  ,
+},
 { psr-cmt-attach,
   main_psr_cmt_attach, 0, 1,
   Attach Cache Monitoring Technology service to a domain,
-- 
1.9.1


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH 1/3] drivers: xen-blkback: delay pending_req allocation to connect_ring

2015-06-02 Thread Bob Liu
This is a pre-patch for multi-page ring feature.
In connect_ring, we can know exactly how many pages are used for the shared
ring, delay pending_req allocation here so that we won't waste too much memory.

Signed-off-by: Bob Liu bob@oracle.com
---
 drivers/block/xen-blkback/common.h |2 +-
 drivers/block/xen-blkback/xenbus.c |   82 +---
 2 files changed, 39 insertions(+), 45 deletions(-)

diff --git a/drivers/block/xen-blkback/common.h 
b/drivers/block/xen-blkback/common.h
index f620b5d..043f13b 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -248,7 +248,7 @@ struct backend_info;
 #define PERSISTENT_GNT_WAS_ACTIVE  1
 
 /* Number of requests that we can fit in a ring */
-#define XEN_BLKIF_REQS 32
+#define XEN_BLKIF_REQS_PER_PAGE32
 
 struct persistent_gnt {
struct page *page;
diff --git a/drivers/block/xen-blkback/xenbus.c 
b/drivers/block/xen-blkback/xenbus.c
index 6ab69ad..c212d41 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -124,8 +124,6 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
 static struct xen_blkif *xen_blkif_alloc(domid_t domid)
 {
struct xen_blkif *blkif;
-   struct pending_req *req, *n;
-   int i, j;
 
BUILD_BUG_ON(MAX_INDIRECT_PAGES  BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
 
@@ -151,51 +149,11 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
 
INIT_LIST_HEAD(blkif-pending_free);
INIT_WORK(blkif-free_work, xen_blkif_deferred_free);
-
-   for (i = 0; i  XEN_BLKIF_REQS; i++) {
-   req = kzalloc(sizeof(*req), GFP_KERNEL);
-   if (!req)
-   goto fail;
-   list_add_tail(req-free_list,
- blkif-pending_free);
-   for (j = 0; j  MAX_INDIRECT_SEGMENTS; j++) {
-   req-segments[j] = kzalloc(sizeof(*req-segments[0]),
-  GFP_KERNEL);
-   if (!req-segments[j])
-   goto fail;
-   }
-   for (j = 0; j  MAX_INDIRECT_PAGES; j++) {
-   req-indirect_pages[j] = 
kzalloc(sizeof(*req-indirect_pages[0]),
-GFP_KERNEL);
-   if (!req-indirect_pages[j])
-   goto fail;
-   }
-   }
spin_lock_init(blkif-pending_free_lock);
init_waitqueue_head(blkif-pending_free_wq);
init_waitqueue_head(blkif-shutdown_wq);
 
return blkif;
-
-fail:
-   list_for_each_entry_safe(req, n, blkif-pending_free, free_list) {
-   list_del(req-free_list);
-   for (j = 0; j  MAX_INDIRECT_SEGMENTS; j++) {
-   if (!req-segments[j])
-   break;
-   kfree(req-segments[j]);
-   }
-   for (j = 0; j  MAX_INDIRECT_PAGES; j++) {
-   if (!req-indirect_pages[j])
-   break;
-   kfree(req-indirect_pages[j]);
-   }
-   kfree(req);
-   }
-
-   kmem_cache_free(xen_blkif_cachep, blkif);
-
-   return ERR_PTR(-ENOMEM);
 }
 
 static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
@@ -312,7 +270,7 @@ static void xen_blkif_free(struct xen_blkif *blkif)
i++;
}
 
-   WARN_ON(i != XEN_BLKIF_REQS);
+   WARN_ON(i != XEN_BLKIF_REQS_PER_PAGE);
 
kmem_cache_free(xen_blkif_cachep, blkif);
 }
@@ -864,7 +822,8 @@ static int connect_ring(struct backend_info *be)
unsigned int evtchn;
unsigned int pers_grants;
char protocol[64] = ;
-   int err;
+   struct pending_req *req, *n;
+   int err, i, j;
 
pr_debug(%s %s\n, __func__, dev-otherend);
 
@@ -905,6 +864,24 @@ static int connect_ring(struct backend_info *be)
ring_ref, evtchn, be-blkif-blk_protocol, protocol,
pers_grants ? persistent grants : );
 
+   for (i = 0; i  XEN_BLKIF_REQS_PER_PAGE; i++) {
+   req = kzalloc(sizeof(*req), GFP_KERNEL);
+   if (!req)
+   goto fail;
+   list_add_tail(req-free_list, be-blkif-pending_free);
+   for (j = 0; j  MAX_INDIRECT_SEGMENTS; j++) {
+   req-segments[j] = kzalloc(sizeof(*req-segments[0]), 
GFP_KERNEL);
+   if (!req-segments[j])
+   goto fail;
+   }
+   for (j = 0; j  MAX_INDIRECT_PAGES; j++) {
+   req-indirect_pages[j] = 
kzalloc(sizeof(*req-indirect_pages[0]),
+GFP_KERNEL);
+   if (!req-indirect_pages[j])
+   goto fail;
+  

[Xen-devel] [PATCH 2/3] driver: xen-blkfront: move talk_to_blkback to a more suitable place

2015-06-02 Thread Bob Liu
The major responsibility of talk_to_blkback() is allocate and initialize
the request ring and write the ring info to xenstore.
But this work should be done after backend entered 'XenbusStateInitWait' as
defined in the protocol file.
See xen/include/public/io/blkif.h in XEN git tree:
FrontBack
==
XenbusStateInitialising  XenbusStateInitialising
 o Query virtual device   o Query backend device identification
   properties.  data.
 o Setup OS device instance.  o Open and validate backend device.
  o Publish backend features and
transport parameters.
 |
 |
 V
 XenbusStateInitWait

o Query backend features and
  transport parameters.
o Allocate and initialize the
  request ring.

There is no problem with this yet, but it is a violation of the design and
furthermore it would not allow frontend/backend to negotiate 'multi-page'
and 'multi-queue' features.

Changes in v2:
 - Re-write the commit message to be more clear.

Signed-off-by: Bob Liu bob@oracle.com
Acked-by: Roger Pau Monné roger@citrix.com
---
 drivers/block/xen-blkfront.c |   14 ++
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2c61cf8..88e23fd 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1430,13 +1430,6 @@ static int blkfront_probe(struct xenbus_device *dev,
info-handle = simple_strtoul(strrchr(dev-nodename, '/')+1, NULL, 0);
dev_set_drvdata(dev-dev, info);
 
-   err = talk_to_blkback(dev, info);
-   if (err) {
-   kfree(info);
-   dev_set_drvdata(dev-dev, NULL);
-   return err;
-   }
-
return 0;
 }
 
@@ -1906,8 +1899,13 @@ static void blkback_changed(struct xenbus_device *dev,
dev_dbg(dev-dev, blkfront:blkback_changed to state %d.\n, 
backend_state);
 
switch (backend_state) {
-   case XenbusStateInitialising:
case XenbusStateInitWait:
+   if (talk_to_blkback(dev, info)) {
+   kfree(info);
+   dev_set_drvdata(dev-dev, NULL);
+   break;
+   }
+   case XenbusStateInitialising:
case XenbusStateInitialised:
case XenbusStateReconfiguring:
case XenbusStateReconfigured:
-- 
1.7.10.4


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH 3/3] xen/block: add multi-page ring support

2015-06-02 Thread Bob Liu
Extend xen/block to support multi-page ring, so that more requests can be
issued by using more than one pages as the request ring between blkfront
and backend.
As a result, the performance can get improved significantly.

We got some impressive improvements on our highend iscsi storage cluster
backend. If using 64 pages as the ring, the IOPS increased about 15 times
for the throughput testing and more than doubled for the latency testing.

The reason was the limit on outstanding requests is 32 if use only one-page
ring, but in our case the iscsi lun was spread across about 100 physical
drives, 32 was really not enough to keep them busy.

Changes in v2:
 - Rebased to 4.0-rc6.
 - Document on how multi-page ring feature working to linux io/blkif.h.

Changes in v3:
 - Remove changes to linux io/blkif.h and follow the protocol defined
   in io/blkif.h of XEN tree.
 - Rebased to 4.1-rc3

Changes in v4:
 - Turn to use 'ring-page-order' and 'max-ring-page-order'.
 - A few comments from Roger.

Changes in v5:
 - Clarify with 4k granularity to comment
 - Address more comments from Roger

Signed-off-by: Bob Liu bob@oracle.com
---
 drivers/block/xen-blkback/blkback.c |   13 
 drivers/block/xen-blkback/common.h  |2 +
 drivers/block/xen-blkback/xenbus.c  |   89 +--
 drivers/block/xen-blkfront.c|  135 +--
 4 files changed, 180 insertions(+), 59 deletions(-)

diff --git a/drivers/block/xen-blkback/blkback.c 
b/drivers/block/xen-blkback/blkback.c
index 713fc9f..2126842 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -84,6 +84,13 @@ MODULE_PARM_DESC(max_persistent_grants,
  Maximum number of grants to map persistently);
 
 /*
+ * Maximum order of pages to be used for the shared ring between front and
+ * backend, 4KB page granularity is used.
+ */
+unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_PAGE_ORDER;
+module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 
S_IRUGO);
+MODULE_PARM_DESC(max_ring_page_order, Maximum order of pages to be used for 
the shared ring);
+/*
  * The LRU mechanism to clean the lists of persistent grants needs to
  * be executed periodically. The time interval between consecutive executions
  * of the purge mechanism is set in ms.
@@ -1438,6 +1445,12 @@ static int __init xen_blkif_init(void)
if (!xen_domain())
return -ENODEV;
 
+   if (xen_blkif_max_ring_order  XENBUS_MAX_RING_PAGE_ORDER) {
+   pr_info(Invalid max_ring_order (%d), will use default max: 
%d.\n,
+   xen_blkif_max_ring_order, XENBUS_MAX_RING_PAGE_ORDER);
+   xen_blkif_max_ring_order = XENBUS_MAX_RING_PAGE_ORDER;
+   }
+
rc = xen_blkif_interface_init();
if (rc)
goto failed_init;
diff --git a/drivers/block/xen-blkback/common.h 
b/drivers/block/xen-blkback/common.h
index 043f13b..8ccc49d 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -44,6 +44,7 @@
 #include xen/interface/io/blkif.h
 #include xen/interface/io/protocols.h
 
+extern unsigned int xen_blkif_max_ring_order;
 /*
  * This is the maximum number of segments that would be allowed in indirect
  * requests. This value will also be passed to the frontend.
@@ -320,6 +321,7 @@ struct xen_blkif {
struct work_struct  free_work;
/* Thread shutdown wait queue. */
wait_queue_head_t   shutdown_wq;
+   unsigned int nr_ring_pages;
 };
 
 struct seg_buf {
diff --git a/drivers/block/xen-blkback/xenbus.c 
b/drivers/block/xen-blkback/xenbus.c
index c212d41..deb3f00 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -25,6 +25,7 @@
 
 /* Enlarge the array size in order to fully show blkback name. */
 #define BLKBACK_NAME_LEN (20)
+#define RINGREF_NAME_LEN (20)
 
 struct backend_info {
struct xenbus_device*dev;
@@ -156,8 +157,8 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
return blkif;
 }
 
-static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
-unsigned int evtchn)
+static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
+unsigned int nr_grefs, unsigned int evtchn)
 {
int err;
 
@@ -165,7 +166,7 @@ static int xen_blkif_map(struct xen_blkif *blkif, 
grant_ref_t gref,
if (blkif-irq)
return 0;
 
-   err = xenbus_map_ring_valloc(blkif-be-dev, gref, 1,
+   err = xenbus_map_ring_valloc(blkif-be-dev, gref, nr_grefs,
 blkif-blk_ring);
if (err  0)
return err;
@@ -175,21 +176,21 @@ static int xen_blkif_map(struct xen_blkif *blkif, 
grant_ref_t gref,
{
struct blkif_sring *sring;
sring = (struct blkif_sring *)blkif-blk_ring;
-   BACK_RING_INIT(blkif-blk_rings.native, sring, 

Re: [Xen-devel] ARM64: XEN Domu not booting with the qemu qcow AARCH64 Ubuntu 15.04 disk

2015-06-02 Thread Stefan Bader
On 02.06.2015 12:35, Stefano Stabellini wrote:
 On Tue, 2 Jun 2015, Stefano Stabellini wrote:
 On Tue, 2 Jun 2015, Stefan Bader wrote:
 On 02.06.2015 09:40, Sanjeev Pandita wrote:
 All,

 I am pretty new to xen . I am trying to boot DOMU with qemu qcow AARCH64
 Ubuntu 15.04 disk on Xen but I am getting the errors which link to
 /usr/local/lib/xen/bin/qemu-system-i386.
 Since I am working on aarch64 system the
 /usr/local/lib/xen/bin/qemu-system-i386 bin might not be present or might
 not work as expected.

 Because I am lacking hardware and feedback, the arm64 packaging is a rather
 theoretical exercise. At least for armhf I thought qemu-system-x86 was a
 dependency. That binary should provide x86 emulation on arm64, the same as 
 one
 could install qemu for other arches on x86.
 Have you tried to install qemu-system-x86 manually?

 Hi Stefan,

 On arm and arm64 Xen still needs a qemu-system-i386 binary, just to
 provide the PV backends in userspace (disk, console, etc.).
 Unfortunately the output binary is still named qemu-system-i386. I
 know that the name is misleading, but fixing it is not trivial: it
 requires disentangling code in QEMU in non trivial ways.
 
 Just to be clear, qemu-system-i386 for ARM is the output of a QEMU build
 on ARM with ./configure --enable-xen --target-list=i386-softmmu. It
 could do x86 emulation, but it does not when used on Xen.
 

Hi Stefano,

so for Debian and Ubuntu we moved to use the standard qemu binary which is build
with xen enabled. This works on x86, but I could not verify correctness for any
arm port (due to lack of hw).

-Stefan



signature.asc
Description: OpenPGP digital signature
___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v1 COLO Pre 03/12] tools/libxc: export xc_bitops.h

2015-06-02 Thread Andrew Cooper
On 02/06/15 10:26, Yang Hongyang wrote:
 When we are under COLO, we will send dirty page bitmap info from
 secondary to primary at every checkpoint. So we need to get/test
 the dirty page bitmap. We just expose xc_bitops.h for libxl use.

 NOTE:
   Need to make clean and rerun configure to get it compiled.

 Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com

I like this change, but lets take the opportunity to fix some of the
issues in it.

 ---
  tools/libxc/include/xc_bitops.h | 76 
 +
  tools/libxc/xc_bitops.h | 76 
 -
  2 files changed, 76 insertions(+), 76 deletions(-)
  create mode 100644 tools/libxc/include/xc_bitops.h
  delete mode 100644 tools/libxc/xc_bitops.h

 diff --git a/tools/libxc/include/xc_bitops.h b/tools/libxc/include/xc_bitops.h
 new file mode 100644
 index 000..cd749f4
 --- /dev/null
 +++ b/tools/libxc/include/xc_bitops.h
 @@ -0,0 +1,76 @@
 +#ifndef XC_BITOPS_H
 +#define XC_BITOPS_H 1

No need for a 1 here

 +
 +/* bitmap operations for single threaded access */
 +
 +#include stdlib.h
 +#include string.h
 +
 +#define BITS_PER_LONG (sizeof(unsigned long) * 8)

All defines like this need XC_ prefixes, and CHAR_BIT should be used in
preference to 8.

 +#define ORDER_LONG (sizeof(unsigned long) == 4 ? 5 : 6)

This name is misleading, as it is in terms of bits not bytes. 
XC_BITMAP_SHIFT perhaps?

 +
 +#define BITMAP_ENTRY(_nr,_bmap) ((_bmap))[(_nr)/BITS_PER_LONG]
 +#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)

I would recommend dropping these and open coding the few cases below. 
It would be far more clear.

 +
 +/* calculate required space for number of longs needed to hold nr_bits */
 +static inline int bitmap_size(int nr_bits)

int has been inappropriate everywhere in this file.  unsigned long
please (or settle on unsigned int everywhere)

 +{
 +int nr_long, nr_bytes;
 +nr_long = (nr_bits + BITS_PER_LONG - 1)  ORDER_LONG;

This calculation can overflow.

(nr_bits  ORDER_LONG) + !!(nr_bits % BITS_PER_LONG)

 +nr_bytes = nr_long * sizeof(unsigned long);
 +return nr_bytes;
 +}
 +
 +static inline unsigned long *bitmap_alloc(int nr_bits)
 +{
 +return calloc(1, bitmap_size(nr_bits));
 +}
 +
 +static inline void bitmap_set(unsigned long *addr, int nr_bits)
 +{
 +memset(addr, 0xff, bitmap_size(nr_bits));
 +}
 +
 +static inline void bitmap_clear(unsigned long *addr, int nr_bits)
 +{
 +memset(addr, 0, bitmap_size(nr_bits));
 +}
 +
 +static inline int test_bit(int nr, unsigned long *addr)

const *addr, as this is a read-only operation.

 +{
 +return (BITMAP_ENTRY(nr, addr)  BITMAP_SHIFT(nr))  1;
 +}
 +
 +static inline void clear_bit(int nr, unsigned long *addr)
 +{
 +BITMAP_ENTRY(nr, addr) = ~(1UL  BITMAP_SHIFT(nr));
 +}
 +
 +static inline void set_bit(int nr, unsigned long *addr)
 +{
 +BITMAP_ENTRY(nr, addr) |= (1UL  BITMAP_SHIFT(nr));
 +}

It would be nice to be consistent on whether the bitmap pointer or the
bit is the first parameter.  Perhaps a second cleanup patch which makes
this consistent and adjusts all current callers.

~Andrew

 +
 +static inline int test_and_clear_bit(int nr, unsigned long *addr)
 +{
 +int oldbit = test_bit(nr, addr);
 +clear_bit(nr, addr);
 +return oldbit;
 +}
 +
 +static inline int test_and_set_bit(int nr, unsigned long *addr)
 +{
 +int oldbit = test_bit(nr, addr);
 +set_bit(nr, addr);
 +return oldbit;
 +}
 +
 +static inline void bitmap_or(unsigned long *dst, const unsigned long *other,
 + int nr_bits)
 +{
 +int i, nr_longs = (bitmap_size(nr_bits) / sizeof(unsigned long));
 +for ( i = 0; i  nr_longs; ++i )
 +dst[i] |= other[i];
 +}
 +
 +#endif  /* XC_BITOPS_H */
 diff --git a/tools/libxc/xc_bitops.h b/tools/libxc/xc_bitops.h
 deleted file mode 100644
 index cd749f4..000
 --- a/tools/libxc/xc_bitops.h
 +++ /dev/null
 @@ -1,76 +0,0 @@
 -#ifndef XC_BITOPS_H
 -#define XC_BITOPS_H 1
 -
 -/* bitmap operations for single threaded access */
 -
 -#include stdlib.h
 -#include string.h
 -
 -#define BITS_PER_LONG (sizeof(unsigned long) * 8)
 -#define ORDER_LONG (sizeof(unsigned long) == 4 ? 5 : 6)
 -
 -#define BITMAP_ENTRY(_nr,_bmap) ((_bmap))[(_nr)/BITS_PER_LONG]
 -#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
 -
 -/* calculate required space for number of longs needed to hold nr_bits */
 -static inline int bitmap_size(int nr_bits)
 -{
 -int nr_long, nr_bytes;
 -nr_long = (nr_bits + BITS_PER_LONG - 1)  ORDER_LONG;
 -nr_bytes = nr_long * sizeof(unsigned long);
 -return nr_bytes;
 -}
 -
 -static inline unsigned long *bitmap_alloc(int nr_bits)
 -{
 -return calloc(1, bitmap_size(nr_bits));
 -}
 -
 -static inline void bitmap_set(unsigned long *addr, int nr_bits)
 -{
 -memset(addr, 0xff, bitmap_size(nr_bits));
 -}
 -
 -static inline void bitmap_clear(unsigned long *addr, int nr_bits)
 -{
 -memset(addr, 0, bitmap_size(nr_bits));
 -}
 -
 -static 

Re: [Xen-devel] tcp: refine TSO autosizing causes performance regression on Xen

2015-06-02 Thread Wei Liu
Hi Eric

Sorry for coming late to the discussion.

On Thu, Apr 16, 2015 at 05:42:16AM -0700, Eric Dumazet wrote:
 On Thu, 2015-04-16 at 11:01 +0100, George Dunlap wrote:
 
  He suggested that after he'd been prodded by 4 more e-mails in which two
  of us guessed what he was trying to get at.  That's what I was
  complaining about.
 
 My big complain is that I suggested to test to double the sysctl, which
 gave good results.
 

Do I understand correctly that it's acceptable to you to double the size
of the buffer? If so I will send a patch to do that.

Wei.

 Then you provided a patch using a 8x factor. How does that sound ?
 
 Next time I ask a raise, I should try a 8x factor as well, who knows,
 it might be accepted.
 
 

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [Draft C] Xen on ARM vITS Handling

2015-06-02 Thread Ian Campbell
On Tue, 2015-06-02 at 11:46 +0100, Julien Grall wrote:
 Hi Ian,
 
 On 01/06/15 14:36, Ian Campbell wrote:
  On Fri, 2015-05-29 at 15:06 +0100, Julien Grall wrote:
  Hi Vijay,
 
  On 27/05/15 17:44, Vijay Kilari wrote:
  ## Command Translation
 
  Of the existing GICv3 ITS commands, `MAPC`, `MAPD`, `MAPVI`/`MAPI` are
  potentially time consuming commands as these commands creates entry in
  the Xen ITS structures, which are used to validate other ITS commands.
 
  `INVALL` and `SYNC` are global and potentially disruptive to other
  guests and so need consideration.
 
  All other ITS command like `MOVI`, `DISCARD`, `INV`, `INT`, `CLEAR`
  just validate and generate physical command.
 
  ### `MAPC` command translation
 
  Format: `MAPC vCID, vTA`
 
 -  The GITS_TYPER.PAtype is emulated as 0. Hence vTA is always 
  represents
vcpu number. Hence vTA is validated against physical Collection
  IDs by querying
ITS driver and corresponding Physical Collection ID is retrieved.
 -  Each vITS will have cid_map (struct cid_mapping) which holds 
  mapping of
 
  Why do you speak about each vITS? The emulation is only related to one
  vITS and not shared...
 
  And each vITS will have a cid_map, which is used. This seems like a
  reasonable way to express this concept in the context.
 
 This is rather strange when everything in the command emulation is per-vits.

I'm afraid you are going to have to say more explicitly what you find
strange here.

  Perhaps there is a need to include discussion of some of the secondary
  data structures alongside the defintion `cits_cq`. In which case we
  could talk about its associated `cid_map` and things.
 
Virtual Collection ID(vCID), Virtual Target address(vTA) and
Physical Collection ID (pCID).
If vCID entry already exists in cid_map, then that particular
  mapping is updated with
the new pCID and vTA else new entry is made in cid_map
 
  When you move a collection, you also have to make sure that all the
  interrupts associated to it will be delivered to the new target.
 
  I'm not sure what you are suggesting for that...
 
  This is going to be rather painful I fear.
 
 -  MAPC pCID, pTA physical ITS command is generated
 
  We should not send any MAPC command to the physical ITS. The collection
  is already mapped during Xen boot and the guest should not be able to
  move the physical collection (they are shared between all the guests and
  Xen).
 
  This needs discussion in the background section, to describe the
  physical setup which the virtual stuff can make assumption of.
 
 I don't think this is a background section. The physical number of
 collection is limited (the mandatory number of collections is nr_cpus +
 1). Those collection will likely be shared between Xen and the different
 guests.

Right, and this needs to be explained in the document as an assumption
upon which other things can draw, so that the document is (so far as
possible) a coherent whole...

 If we let the guest moving the physical collection we will also move all
 the interrupts which is wrong.

... and therefore things like this would become apparent.

  - `MAPC pCID, pTA` physical ITS command is generated
 
  ### `MAPD` Command translation
 
  Format: `MAPD device, Valid, ITT IPA, ITT Size`
 
  `MAPD` is sent with `Valid` bit set if device needs to be added and reset
  when device is removed.
 
  If `Valid` bit is set:
 
  - Allocate memory for `its_device` struct
  - Validate ITT IPA  ITT size and update its_device struct
  - Find number of vectors(nrvecs) for this device by querying PCI
helper function
  - Allocate nrvecs number of LPI XXX nrvecs is a function of `ITT Size`?
  - Allocate memory for `struct vlpi_map` for this device. This
`vlpi_map` holds mapping of Virtual LPI to Physical LPI and ID.
  - Find physical ITS node with which this device is associated
  - Call `p2m_lookup` on ITT IPA addr and get physical ITT address
  - Validate ITT Size
  - Generate/format physical ITS command: `MAPD, ITT PA, ITT Size`
 
  Here the overhead is with memory allocation for `its_device` and 
  `vlpi_map`
 
  XXX Suggestion was to preallocate some of those at device passthrough
  setup time?
 
  If Validation bit is set:
 - Query its_device tree and get its_device structure for this device.
 - (XXX: If pci device is hidden from dom0, does this device is added
 with PHYSDEVOP_pci_device_add hypercall?)
 - If device does not exists return
 - If device exists in RB-tree then
- Validate ITT IPA  ITT size and update its_device struct
 
  To validate the ITT size you need to know the number of interrupt ID.
 
  Please could you get into the habit of making concrete suggestions for
  changes to the text. I've no idea what change I should make based on
  this observation. If not concrete suggestions please try and make the
  implications of what you are saying clear.
 
 The size of the ITT is based on the 

Re: [Xen-devel] [PATCH] libxl: Don't insert PCI device into xenstore for HVM guests

2015-06-02 Thread Malcolm Crossley
On 01/06/15 18:55, Konrad Rzeszutek Wilk wrote:
 On Mon, Jun 01, 2015 at 05:03:14PM +0100, Malcolm Crossley wrote:
 On 01/06/15 16:43, Ross Lagerwall wrote:
 On 06/01/2015 04:26 PM, Konrad Rzeszutek Wilk wrote:
 On Fri, May 29, 2015 at 08:59:45AM +0100, Ross Lagerwall wrote:
 When doing passthrough of a PCI device for an HVM guest, don't insert
 the device into xenstore, otherwise pciback attempts to use it which
 conflicts with QEMU.

 How does it conflict?

 It doesn't work with repeated use. See below.


 This manifests itself such that the first time a device is passed to a
 domain, it succeeds. Subsequent attempts fail unless the device is
 unbound from pciback or the machine rebooted.

 Can you be more specific please? What are the issues? Why does it
 fail?

 Without this patch, if a device (e.g. a GPU) is bound to pciback and
 then passed through to a guest using xl pci-attach, it appears in the
 guest and works fine. If the guest is rebooted, and the device is again
 passed through with xl pci-attach, it appears in the guest as before but
 does not work. In Windows, it gets something like Error Code 43 and on
 Linux, the Nouveau driver fails to initialize the device (with error -22
 or something). The only way to get the device to work again is to reboot
 the host or unbind and rebind it to pciback.

 With this patch, it works as expected. The device is bound to pciback
 and works after being passed through, even after the VM is rebooted.


 There are certain things that pciback does to prepare an PCI device
 which QEMU also does. Some of them - such as saving the configuration
 registers (And then restoring them after the device has been detached) -
 is something that QEMU does not do.


 I really have no idea what the correct thing to do is, but the current
 code with qemu-trad doesn't seem to work (for me).

 The pciback pci_stub.c implements the pciback.hide and the device reset
 logic.

 The rest of pciback implements the pciback xenbus device which PV guests
 need in order to map/unmap MSI interrupts and access PCI config space.

 QEMU emulates and handles the MSI interrupt capabilities and PCI config
 space directly.
 
 Right..

 This is why a pciback xenbus device should not be created for
 passthrough PCI device being handled by QEMU.
 
 To me that sounds that we should not have PV drivers because QEMU
 emulates IDE or network devices.

That is different. We first boot with QEMU handling the devices and then
we explicitly unplug QEMU's handling of IDE and network devices.

That handover protocol does not currently exist for PCI passthrough
devices so we have to chose one mechanism or the other to manage the
passed through PCI device at boot time. Otherwise a HVM guest could load
pcifront and cause all kinds of chaos with interrupt management or
outbound MMIO window management.

 
 The crux here is that none of the operations that pciback performs
 should affect QEMU or guests. But it does - so there is a bug.

I agree there is a bug but should we try to fix it based upon my
comments above?
 
 I would like to understand which ones do it so I can fix in
 pciback - as it might be also be a problem with PV.
 
 Unless... are you by any chance using extra patches on top of the
 native pciback?

We do have extra patches but they only allow us to do an SBR on PCI
devices which require it. The failure listed above occurs on devices
with device specific resets (e.g. FLR,D3) as well so those extra patches
aren't being used.

 

 Malcolm


 Regards



___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] ARM64: XEN Domu not booting with the qemu qcow AARCH64 Ubuntu 15.04 disk

2015-06-02 Thread Stefano Stabellini
On Tue, 2 Jun 2015, Stefan Bader wrote:
 On 02.06.2015 09:40, Sanjeev Pandita wrote:
  All,
  
  I am pretty new to xen . I am trying to boot DOMU with qemu qcow AARCH64
  Ubuntu 15.04 disk on Xen but I am getting the errors which link to
  /usr/local/lib/xen/bin/qemu-system-i386.
  Since I am working on aarch64 system the
  /usr/local/lib/xen/bin/qemu-system-i386 bin might not be present or might
  not work as expected.
 
 Because I am lacking hardware and feedback, the arm64 packaging is a rather
 theoretical exercise. At least for armhf I thought qemu-system-x86 was a
 dependency. That binary should provide x86 emulation on arm64, the same as one
 could install qemu for other arches on x86.
 Have you tried to install qemu-system-x86 manually?

Hi Stefan,

On arm and arm64 Xen still needs a qemu-system-i386 binary, just to
provide the PV backends in userspace (disk, console, etc.).
Unfortunately the output binary is still named qemu-system-i386. I
know that the name is misleading, but fixing it is not trivial: it
requires disentangling code in QEMU in non trivial ways.

Cheers,

Stefano

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] ARM64: XEN Domu not booting with the qemu qcow AARCH64 Ubuntu 15.04 disk

2015-06-02 Thread Stefano Stabellini
On Tue, 2 Jun 2015, Stefan Bader wrote:
 On 02.06.2015 12:35, Stefano Stabellini wrote:
  On Tue, 2 Jun 2015, Stefano Stabellini wrote:
  On Tue, 2 Jun 2015, Stefan Bader wrote:
  On 02.06.2015 09:40, Sanjeev Pandita wrote:
  All,
 
  I am pretty new to xen . I am trying to boot DOMU with qemu qcow AARCH64
  Ubuntu 15.04 disk on Xen but I am getting the errors which link to
  /usr/local/lib/xen/bin/qemu-system-i386.
  Since I am working on aarch64 system the
  /usr/local/lib/xen/bin/qemu-system-i386 bin might not be present or might
  not work as expected.
 
  Because I am lacking hardware and feedback, the arm64 packaging is a 
  rather
  theoretical exercise. At least for armhf I thought qemu-system-x86 was a
  dependency. That binary should provide x86 emulation on arm64, the same 
  as one
  could install qemu for other arches on x86.
  Have you tried to install qemu-system-x86 manually?
 
  Hi Stefan,
 
  On arm and arm64 Xen still needs a qemu-system-i386 binary, just to
  provide the PV backends in userspace (disk, console, etc.).
  Unfortunately the output binary is still named qemu-system-i386. I
  know that the name is misleading, but fixing it is not trivial: it
  requires disentangling code in QEMU in non trivial ways.
  
  Just to be clear, qemu-system-i386 for ARM is the output of a QEMU build
  on ARM with ./configure --enable-xen --target-list=i386-softmmu. It
  could do x86 emulation, but it does not when used on Xen.
  
 
 Hi Stefano,
 
 so for Debian and Ubuntu we moved to use the standard qemu binary which is 
 build
 with xen enabled. This works on x86, but I could not verify correctness for 
 any
 arm port (due to lack of hw).

OK, from what you say it should work

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [Draft C] Xen on ARM vITS Handling

2015-06-02 Thread Julien Grall

Hi Ian,

On 01/06/15 14:36, Ian Campbell wrote:

On Fri, 2015-05-29 at 15:06 +0100, Julien Grall wrote:

Hi Vijay,

On 27/05/15 17:44, Vijay Kilari wrote:

## Command Translation

Of the existing GICv3 ITS commands, `MAPC`, `MAPD`, `MAPVI`/`MAPI` are
potentially time consuming commands as these commands creates entry in
the Xen ITS structures, which are used to validate other ITS commands.

`INVALL` and `SYNC` are global and potentially disruptive to other
guests and so need consideration.

All other ITS command like `MOVI`, `DISCARD`, `INV`, `INT`, `CLEAR`
just validate and generate physical command.

### `MAPC` command translation

Format: `MAPC vCID, vTA`


   -  The GITS_TYPER.PAtype is emulated as 0. Hence vTA always represents
  vcpu number. Hence vTA is validated against physical Collection
IDs by querying
  ITS driver and corresponding Physical Collection ID is retrieved.
   -  Each vITS will have cid_map (struct cid_mapping) which holds mapping of


Why do you speak about each vITS? The emulation is only related to one
vITS and not shared...


And each vITS will have a cid_map, which is used. This seems like a
reasonable way to express this concept in the context.


This is rather strange when everything in the command emulation is per-vits.


Perhaps there is a need to include discussion of some of the secondary
data structures alongside the defintion `cits_cq`. In which case we
could talk about its associated `cid_map` and things.


  Virtual Collection ID(vCID), Virtual Target address(vTA) and
  Physical Collection ID (pCID).
  If vCID entry already exists in cid_map, then that particular
mapping is updated with
  the new pCID and vTA else new entry is made in cid_map


When you move a collection, you also have to make sure that all the
interrupts associated to it will be delivered to the new target.

I'm not sure what you are suggesting for that...


This is going to be rather painful I fear.


   -  MAPC pCID, pTA physical ITS command is generated


We should not send any MAPC command to the physical ITS. The collection
is already mapped during Xen boot and the guest should not be able to
move the physical collection (they are shared between all the guests and
Xen).


This needs discussion in the background section, to describe the
physical setup which the virtual stuff can make assumption of.


I don't think this is a background section. The physical number of
collection is limited (the mandatory number of collections is nr_cpus +
1). Those collection will likely be shared between Xen and the different
guests.

If we let the guest moving the physical collection we will also move all
the interrupts which is wrong.




   Here there is no overhead, the cid_map entries are preallocated
with size of nr_cpus
   in the platform.


As said the number of collection should be at least nr_cpus + 1.


FWIW I read this as with size appropriate for nr_cpus, which leaves
the +1 as implicit. I added the +1 nevertheless.


I wanted to make clear. His implementation was only considering nr_cpus
collections.




- `MAPC pCID, pTA` physical ITS command is generated

### `MAPD` Command translation

Format: `MAPD device, Valid, ITT IPA, ITT Size`

`MAPD` is sent with `Valid` bit set if device needs to be added and reset
when device is removed.

If `Valid` bit is set:

- Allocate memory for `its_device` struct
- Validate ITT IPA & ITT size and update its_device struct
- Find number of vectors(nrvecs) for this device by querying PCI
  helper function
- Allocate nrvecs number of LPI XXX nrvecs is a function of `ITT Size`?
- Allocate memory for `struct vlpi_map` for this device. This
  `vlpi_map` holds mapping of Virtual LPI to Physical LPI and ID.
- Find physical ITS node with which this device is associated
- Call `p2m_lookup` on ITT IPA addr and get physical ITT address
- Validate ITT Size
- Generate/format physical ITS command: `MAPD, ITT PA, ITT Size`

Here the overhead is with memory allocation for `its_device` and `vlpi_map`

XXX Suggestion was to preallocate some of those at device passthrough
setup time?


If Validation bit is set:
   - Query its_device tree and get its_device structure for this device.
   - (XXX: If the pci device is hidden from dom0, is this device added
   with the PHYSDEVOP_pci_device_add hypercall?)
   - If the device does not exist, return
   - If device exists in RB-tree then
   - Validate ITT IPA & ITT size and update its_device struct


To validate the ITT size you need to know the number of interrupt ID.


Please could you get into the habit of making concrete suggestions for
changes to the text. I've no idea what change I should make based on
this observation. If not concrete suggestions please try and make the
implications of what you are saying clear.


The size of the ITT is based on the number of Interrupt supported by the
device.

The only way to validate the size getting the number of Interrupt
before. i.e

- Find the number of MSI for 

Re: [Xen-devel] [PATCH v1 COLO Pre 02/12] libxc/restore: zero ioreq page only one time

2015-06-02 Thread Wen Congyang
On 06/02/2015 06:16 PM, Andrew Cooper wrote:
 On 02/06/15 10:26, Yang Hongyang wrote:
 ioreq page contains evtchn which will be set when we resume the
 secondary vm the first time. The hypervisor will check if the
 evtchn is corrupted, so we cannot zero the ioreq page more
 than one time.

 The ioreq-state is always STATE_IOREQ_NONE after the vm is
 suspended, so it is OK if we only zero it one time.

 Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com
 Signed-off-by: Wen congyang we...@cn.fujitsu.com
 CC: Andrew Cooper andrew.coop...@citrix.com
 
 Is the qemu process for the secondary running at this point?  If so,
 this is very much unsafe.

No, we restore the secondary vm while it has been suspended.
The problem is that the ioreq page contains an evtchn which is
used by the hypervisor to notify qemu. Before migration finishes,
we can clear it because the evtchn is invalid, and qemu will
allocate it and save it in the ioreq page later.

Thanks
Wen Congyang

 
 ~Andrew
 
 ---
  tools/libxc/xc_sr_restore_x86_hvm.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

 diff --git a/tools/libxc/xc_sr_restore_x86_hvm.c 
 b/tools/libxc/xc_sr_restore_x86_hvm.c
 index 6f5af0e..06177e0 100644
 --- a/tools/libxc/xc_sr_restore_x86_hvm.c
 +++ b/tools/libxc/xc_sr_restore_x86_hvm.c
 @@ -78,7 +78,8 @@ static int handle_hvm_params(struct xc_sr_context *ctx,
  break;
  case HVM_PARAM_IOREQ_PFN:
  case HVM_PARAM_BUFIOREQ_PFN:
 -xc_clear_domain_page(xch, ctx-domid, entry-value);
 +if ( !ctx-restore.buffer_all_records )
 +xc_clear_domain_page(xch, ctx-domid, entry-value);
  break;
  }
  
 
 .
 


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH 0/4] x86/xen Several unassociated fixes

2015-06-02 Thread Andrew Cooper
While investigating a separate issue on Broadwell hardware, we encountered a
cascade crash, with 3 indepent issues.  For anyone interested, the full
backtrace was:

(XEN) Xen SMAP violation
(XEN) [ Xen-4.5.0-xs101665-d  x86_64  debug=y  Not tainted ]
(XEN) CPU:15
(XEN) RIP:e008:[82d08018c12f] memcpy+0x17/0x1b
(XEN) RFLAGS: 00010202   CONTEXT: hypervisor (d0v0)
(XEN) rax: 7ffe632f6eb8   rbx: 830286d1a000   rcx: 0004
(XEN) rdx: 0004   rsi: 820040054dd8   rdi: 7ffe632f6eb8
(XEN) rbp: 83043cbc7c48   rsp: 83043cbc7c48   r8:  f060011802af
(XEN) r9:  002c   r10: 82d08024e0e0   r11: 0282
(XEN) r12: 0004   r13: 002508f6   r14: 
(XEN) r15: 820040054dd8   cr0: 8005003b   cr4: 003126f0
(XEN) cr3: 00043c02b000   cr2: 7ffe632f6eb8
(XEN) ds:    es:    fs:    gs:    ss: e010   cs: e008
(XEN) Xen stack trace from rsp=83043cbc7c48:
(XEN)83043cbc7ce8 82d0801619e6 83043cbc 83043cbc7c78
(XEN)83043cbc7cb0 83043cbc7cb4  83043cbc7cac
(XEN) 7ffe632f6eb8 0004 ec83fdd8
(XEN)82d1 00858f5d 83043cbc7d08 006091e0
(XEN) 006091e0 83043cbc7e38 830286d1a000
(XEN)83043cbc7da8 82d080163494 4000 83043cbc
(XEN)83043cbc7d18 82e010ac11e0 0001 880106a0a150
(XEN)0001 83043c57c000 82e010ac11e0 0001
(XEN)83043cbc7e58 82d08018229a 82d08018dca8 82d080349e58
(XEN)82d080349e50  0202 830286d1a000
(XEN) 006091e0  
(XEN)83043cbc7ef8 82d080106760 8300784f 00027ff0
(XEN)82d0 880106a0a980  
(XEN) 83007b7d6000 8300784f 00031fd4c88a1167
(XEN)3cbc7e28 000f 00858f5d 88003ffb9788
(XEN)82d08018cd97 8300784f0208 000a03e8 0059
(XEN) ec83fdd8 7ffe632f6eb8 0004
(XEN)0004   
(XEN)   
(XEN)   
(XEN) Xen call trace:
(XEN)[82d08018c12f] memcpy+0x17/0x1b
(XEN)[82d0801619e6] dbg_rw_mem+0x2f6/0x360
(XEN)[82d080163494] arch_do_domctl+0x19c0/0x25f4
(XEN)[82d080106760] do_domctl+0x1b4b/0x1edb
(XEN)[82d080233fcb] syscall_enter+0xeb/0x145
(XEN) 
(XEN) Faulting linear address: 7ffe632f6eb8
(XEN) Pagetable walk from 7ffe632f6eb8:
(XEN)  L4[0x0ff] = 00084ed00067 000312ff
(XEN)  L3[0x1f9] = 00040b104067 00104513
(XEN)  L2[0x119] = 00050f511067 0010457c 
(XEN)  L1[0x0f6] = 80087d665167 00101dcc
(XEN) 
(XEN) 
(XEN) Panic on CPU 15:
(XEN) FATAL TRAP: vector = 14 (page fault)
(XEN) [error_code=0003] 
(XEN) 
(XEN) 
(XEN) Reboot in five seconds...
(XEN) Executing kexec image on cpu15
(XEN) Assertion 'local_irq_is_enabled()' failed at smp.c:223
(XEN) [ Xen-4.5.0-xs101665-d  x86_64  debug=y  Not tainted ]
(XEN) CPU:15
(XEN) RIP:e008:[82d08018a0d3] flush_area_mask+0x7/0x134
(XEN) RFLAGS: 00050046   CONTEXT: hypervisor (d0v0)
(XEN) rax: 00040046   rbx: 82e008b2faa0   rcx: 
(XEN) rdx: 0100   rsi:    rdi: 83043cbc78c0
(XEN) rbp: 83043cbc7918   rsp: 83043cbc78a0   r8:  
(XEN) r9:  0038   r10: 0040   r11: 82d080310ba0
(XEN) r12: 82d0803492c0   r13: 225692e4   r14: 83043cbc78c0
(XEN) r15: 00c0   cr0: 8005003b   cr4: 003126f0
(XEN) cr3: 00043c02b000   cr2: 7ffe632f6eb8
(XEN) ds:    es:    fs:    gs:    ss: e010   cs: e008
(XEN) Xen stack trace from rsp=83043cbc78a0:
(XEN)82d08011eb63  00150028 82d080299c20
(XEN)d7f55de9 00f7  
(XEN)0003 83043cbc78f8  0028
(XEN)  000ff000 83043cbc7958
(XEN)82d08011f7c6 83043cbc7940  82cfffb74000
(XEN)00082cfffb74 83043c57c001 000ff000 83043cbc7978
(XEN)82d08011f8ab 830078693fe8 830078693fe8 83043cbc7988
(XEN)82d080178e08 83043cbc79b8 82d08017926b 830078693fe8
(XEN)0001 00082cfffb74 83043c57c000 

[Xen-devel] [PATCH 4/4] x86/memcpy: Reduce code size

2015-06-02 Thread Andrew Cooper
'n % BYTES_PER_LONG' is at most 7, and doesn't need a 64bit register mov.

Signed-off-by: Andrew Cooper andrew.coop...@citrix.com
CC: Jan Beulich jbeul...@suse.com

---
Admittedly very trivial, but no need to be wasteful
---
 xen/arch/x86/string.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xen/arch/x86/string.c b/xen/arch/x86/string.c
index 3af0ea8..043ae66 100644
--- a/xen/arch/x86/string.c
+++ b/xen/arch/x86/string.c
@@ -15,7 +15,7 @@ void *memcpy(void *dest, const void *src, size_t n)
 
 asm volatile (
rep ; movs__OS ; 
-   mov %4,%3; 
+   mov %k4,%k3  ; 
rep ; movsb
 : =c (d0), =D (d1), =S (d2)
 : 0 (n/BYTES_PER_LONG), r (n%BYTES_PER_LONG), 1 (dest), 2 (src)
-- 
1.7.10.4


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH 1/4] x86/apic: Disable the LAPIC later in smp_send_stop()

2015-06-02 Thread Andrew Cooper
__stop_this_cpu() may reset the LAPIC mode back from x2apic to xapic, but will
leave x2apic_enabled alone.  This may cause disconnect_bsp_APIC() in
disable_IO_APIC() to suffer a #GP fault.

Disabling the LAPIC can safely be deferred to being the last action.

Signed-off-by: Andrew Cooper andrew.coop...@citrix.com
CC: Jan Beulich jbeul...@suse.com

---

I still think that x2apic_enabled is not an appropriate predicate
for apic_read/write() to use.  Currently LAPIC mode is inherently a per-pcpu
property rather than a global property, and can result in all kinds of fun
depending on the exact nature of the crash.  In this example, had the original
crash attempt got further before reentering, x2apic_enabled would have already
changed, and smp_call_function() higher would have failed to IPI the other
cpus (by trying to drive the LAPIC in xapic mode when it was actually in
x2apic mode).
---
 xen/arch/x86/smp.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xen/arch/x86/smp.c b/xen/arch/x86/smp.c
index 06a833e..8caa0bc 100644
--- a/xen/arch/x86/smp.c
+++ b/xen/arch/x86/smp.c
@@ -311,9 +311,9 @@ void smp_send_stop(void)
 mdelay(1);
 
 local_irq_disable();
-__stop_this_cpu();
 disable_IO_APIC();
 hpet_disable();
+__stop_this_cpu();
 local_irq_enable();
 }
 
-- 
1.7.10.4


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH 2/4] xen/crash: Don't use set_fixmap() in the crash path

2015-06-02 Thread Andrew Cooper
Experimentally, this can result in memory allocation, and in particular a
failed assertion that interrupts are enabled when performing a TLB flush.

  (XEN) Assertion 'local_irq_is_enabled()' failed at smp.c:223
  snip
  (XEN) [82d08018a0d3] flush_area_mask+0x7/0x134
  (XEN) [82d08011f7c6] alloc_domheap_pages+0xa9/0x12a
  (XEN) [82d08011f8ab] alloc_xenheap_pages+0x64/0xdb
  (XEN) [82d080178e08] alloc_xen_pagetable+0x1c/0xa0
  (XEN) [82d08017926b] virt_to_xen_l1e+0x38/0x1be
  (XEN) [82d080179bff] map_pages_to_xen+0x80e/0xfd9
  (XEN) [82d080185a23] __set_fixmap+0x2c/0x2e
  (XEN) [82d0801a6fd4] machine_crash_shutdown+0x186/0x2b2
  (XEN) [82d0801172bb] kexec_crash+0x3f/0x5b
  (XEN) [82d0801479b7] panic+0x100/0x118
  (XEN) [82d08019002b] set_guest_machinecheck_trapbounce+0/0x6d
  (XEN) [82d080195c15] do_page_fault+0x40b/0x541
  (XEN) [82d0802345e0] handle_exception_saved+0x2e/0x6c

Instead, use the directmap mapping which are writable and involve far less
complexity than set_fixmap()

Signed-off-by: Andrew Cooper andrew.coop...@citrix.com
CC: Jan Beulich jbeul...@suse.com
---
 xen/arch/x86/crash.c |9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/xen/arch/x86/crash.c b/xen/arch/x86/crash.c
index eb7be9c..501e18e 100644
--- a/xen/arch/x86/crash.c
+++ b/xen/arch/x86/crash.c
@@ -140,13 +140,10 @@ static void nmi_shootdown_cpus(void)
  * Ideally would be:
  *   exception_table[TRAP_nmi] = do_nmi_crash;
  *
- * but the exception_table is read only.  Borrow an unused fixmap entry
- * to construct a writable mapping.
+ * but the exception_table is read only.  Access it via its directmap
+ * mappings.
  */
-set_fixmap(FIX_TBOOT_MAP_ADDRESS, __pa(exception_table[TRAP_nmi]));
-write_atomic((unsigned long *)
- (fix_to_virt(FIX_TBOOT_MAP_ADDRESS) +
-  ((unsigned long)exception_table[TRAP_nmi]  ~PAGE_MASK)),
+write_atomic((unsigned long*)__va(__pa(exception_table[TRAP_nmi])),
  (unsigned long)do_nmi_crash);
 
 /* Ensure the new callback function is set before sending out the NMI. */
-- 
1.7.10.4


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v1 1/5] libxl/save: Refactor libxl__domain_suspend_state

2015-06-02 Thread Ian Campbell
On Wed, 2015-05-20 at 18:01 +0800, Yang Hongyang wrote:
 @@ -1762,16 +1762,18 @@ static void libxl__domain_suspend_callback(void *data)
  {
  libxl__save_helper_state *shs = data;
  libxl__egc *egc = shs-egc;
 -libxl__domain_suspend_state *dss = CONTAINER_OF(shs, *dss, shs);
 +libxl__domain_save_state *dss = CONTAINER_OF(shs, *dss, shs);
 +libxl__domain_suspend_state *dss2 = dss-dss;

With dss now being ambiguously save vs suspend I don't think adding a 2
suffix to one of the usages is the right answer.

I think in contexts where you are dealing with both that *save_state and
*suspend_state are the way to go for the local variables. I'm afraid
this will make the change noisier, sorry.

I'm afraid I think that the dss member of struct
libxl__domain_save_state will therefore also need to be called
suspend_state too.

I think we can tolerate using dss in contexts where there is only one of
the two structs in active use, just to avoid even more noise.

Alternatively if there is another name for either save or suspend
which doesn't start with an s (or conflict with some other type) perhaps
we could go with that. I can't think of one off hand.

Another name might also help because the semantic difference between
suspend and save is something I have to think about every time. Is there
a split along live/dead lines which we could use here perhaps?


  static void domain_suspend_callback_common_done(libxl__egc *egc,
  libxl__domain_suspend_state *dss, int ok)
  {
 -libxl__xc_domain_saverestore_async_callback_done(egc, dss-shs, ok);
 +libxl__domain_save_state *dsvs = CONTAINER_OF(dss, *dsvs, dss);

I suppose dsvs is a bit better then dss2. Maybe that's the answer, if
used consistently.

Ian.


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v1 2/5] tools/libxl: move domain suspend codes into a separate file

2015-06-02 Thread Ian Campbell
On Wed, 2015-05-20 at 18:01 +0800, Yang Hongyang wrote:
 move domain suspend codes into a separate file libxl_dom_suspend.c

Just code.

 Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com

Acked-by: Ian Campbell ian.campb...@citrix.com




___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] Xen Security Advisory 130 (CVE-2015-4105) - Guest triggerable qemu MSI-X pass-through error messages

2015-06-02 Thread Xen . org security team
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Xen Security Advisory CVE-2015-4105 / XSA-130
  version 2

 Guest triggerable qemu MSI-X pass-through error messages

UPDATES IN VERSION 2


Public release.

CVE assigned.

ISSUE DESCRIPTION
=

Device model code dealing with guest PCI MSI-X interrupt management
activities logs messages on certain (supposedly) invalid guest
operations.

IMPACT
==

A buggy or malicious guest repeatedly invoking such operations may
result in the host disk to fill up, possibly leading to a Denial of
Service.

VULNERABLE SYSTEMS
==

Xen versions 3.3 and onwards are vulnerable due to supporting PCI
pass-through.

Only x86 systems are vulnerable.  ARM systems are not vulnerable.

Only HVM guests with their device model run in Dom0 can take advantage
of this vulnerability.

Only HVM guests which have been granted access to physical PCI devices
(`PCI passthrough') can take advantage of this vulnerability.

Furthermore, the vulnerability is only applicable when the
passed-through PCI devices are MSI-X capable.  (Many modern devices
are.)

MITIGATION
==

This issue can be avoided by not assigning MSI-X capable PCI devices to
untrusted HVM guests.

This issue can also be avoided by only using PV guests.

It can also be avoided by configuring HVM guests with their device
model run in a separate (stub) domain.  (When using xl, this can be
requested with device_model_stubdomain_override=1 in the domain
configuration file.)

CREDITS
===

This issue was discovered by Jan Beulich of SUSE.

RESOLUTION
==

Applying the appropriate attached patch resolves this issue.

xsa130-qemuu.patch   qemu-upstream-unstable, Xen 4.5.x, Xen 4.4.x
xsa130-qemuu-4.3.patch   Xen 4.3.x
xsa130-qemut.patch   qemu-xen-unstable, Xen 4.5.x, Xen 4.4.x, Xen 
4.3.x, Xen 4.2.x

$ sha256sum xsa130*.patch
0ed6d75b6758c43a3042994f5127cb69d03796acf5c4d305d7ec2486500753da  
xsa130-qemut.patch
fd6e835e945c2eee197f9e18501aeefb6e1d33a714f6ce66c16481d5aca8fcd0  
xsa130-qemuu-4.3.patch
87fb70041d1fe9c997461c4a6fdaf9157667ec2eff7c77b8db6ee8f9d730753d  
xsa130-qemuu.patch
$

DEPLOYMENT DURING EMBARGO
=

Deployment of the patches and/or mitigations described above (or
others which are substantially similar) is permitted during the
embargo, even on public-facing systems with untrusted guest users and
administrators.

But: Distribution of updated software is prohibited (except to other
members of the predisclosure list).

Predisclosure list members who wish to deploy significantly different
patches and/or mitigations, please contact the Xen Project Security
Team.

(Note: this during-embargo deployment notice is retained in
post-embargo publicly released Xen Project advisories, even though it
is then no longer applicable.  This is to enable the community to have
oversight of the Xen Project Security Team's decisionmaking.)

For more information about permissible uses of embargoed information,
consult the Xen Project community's agreed Security Policy:
  http://www.xenproject.org/security-policy.html
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.12 (GNU/Linux)

iQEcBAEBAgAGBQJVbbdWAAoJEIP+FMlX6CvZhyIH/3VkV4yhOpHsYzHEdkcikqTP
w3KzOsPqtZs2++XWN48Ewpt1Dy12vLkq65hljfvHj9AIWmB0qgWXNC51lkkIFffT
KgcNuUbuJkyy+hNk7K/OWblXbehTrSIAWkl13xKymIQYiS+UN8TYp9kM7QIFkYh2
GGJlCzTljnxeKFZY0z7uW6OKnZzBkdcGmRS5tyH+cqikfAEDSGaV7ffSC0mukd0/
LrTodM+0+8C40znDAyjUiz91YfGXyXtTTEKvmPzdhiv9Fsp3FZ6kMkSGPhcAUUUh
WJmP23QXwm1Tt0qZn9wp1w1DmgihkDoS9Jdw/as29qSCNE2UrfsXaPghGkujQTU=
=KrtH
-END PGP SIGNATURE-


xsa130-qemut.patch
Description: Binary data


xsa130-qemuu-4.3.patch
Description: Binary data


xsa130-qemuu.patch
Description: Binary data
___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] RFC: QEMU bumping memory limit and domain restore

2015-06-02 Thread Wei Liu
Previous discussion at [0].

For the benefit of discussion, we refer to max_memkb inside hypervisor
as hv_max_memkb (name subject to improvement). That's the maximum number
of memory a domain can use.

Libxl doesn't know the hv_max_memkb a domain needs prior to QEMU start-up
because of optional ROMs etc.

Libxl doesn't know the hv_max_memkb even after QEMU start-up, because
there is no mechanism to communicate between QEMU and libxl. This is an
area that needs improvement, we've encountered problems in this area
before.

QEMU calls xc_domain_setmaxmem to increase hv_max_memkb by N pages. Those
pages are only accounted in hypervisor. During migration, libxl
(currently) doesn't extract that value from hypervisor.

So now the problem is on the remote end:

1. Libxl indicates domain needs X pages.
2. Domain actually needs X + N pages.
3. Remote end tries to write N more pages and fail.

This behaviour currently doesn't affect normal migration (that you
transfer libxl JSON to remote, construct a domain, then start QEMU)
because QEMU won't bump hv_max_memkb again. This is by design and
reflected in QEMU code.

This behaviour affects COLO and becomes a bug in that case, because
secondary VM's QEMU doesn't go through the same start-of-day
initialisation (Hongyang, correct me if I'm wrong), i.e. no bumping
hv_max_memkb inside QEMU.

Andrew plans to embed JSON inside migration v2 and COLO is based on
migration v2. The bug is fixed if JSON is correct in the first place.

As COLO is not yet upstream, so this bug is not a blocker for 4.6. But
it should be fixed for the benefit of COLO.

So here is a proof of concept patch to record and honour that value
during migration.  A new field is added in IDL. Note that we don't
provide xl level config option for it and mandate it to be default value
during domain creation. This is to prevent libxl user from using it to
avoid unforeseen repercussions.

This patch is compiled test only. If we agree this is the way to go I
will test and submit a proper patch.

Wei.

[0] 1428941353-18673-1-git-send-email-dsl...@verizon.com

---8---
From ab9dc179ea4ee26eb88f61f8dad36dd01b63bb6b Mon Sep 17 00:00:00 2001
From: Wei Liu wei.l...@citrix.com
Date: Tue, 2 Jun 2015 14:53:20 +0100
Subject: [PATCH] libxl: record and honour hv_max_memkb

The new field hv_max_memkb in IDL is used to record max_memkb inside
hypervisor. That reflects the maximum memory a guest can ask for.

This field is mandated to be the default value during guest creation to
avoid unforeseen repercussions. It's only honoured when restoring a guest.

(XXX compiled test only at this stage)

Signed-off-by: Wei Liu wei.l...@citrix.com
---
 tools/libxl/libxl.c | 17 +
 tools/libxl/libxl_create.c  |  6 ++
 tools/libxl/libxl_dom.c |  9 +++--
 tools/libxl/libxl_types.idl |  1 +
 4 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
index 9117b01..72fec8b 100644
--- a/tools/libxl/libxl.c
+++ b/tools/libxl/libxl.c
@@ -6614,6 +6614,7 @@ int libxl_retrieve_domain_configuration(libxl_ctx *ctx, 
uint32_t domid,
 GC_INIT(ctx);
 int rc;
 libxl__domain_userdata_lock *lock = NULL;
+uint64_t hv_max_memkb;
 
 CTX_LOCK;
 
@@ -6654,6 +6655,7 @@ int libxl_retrieve_domain_configuration(libxl_ctx *ctx, 
uint32_t domid,
 }
 libxl_uuid_copy(ctx, d_config-c_info.uuid, info.uuid);
 libxl_dominfo_dispose(info);
+hv_max_memkb = info.max_memkb; /* store and use later */
 }
 
 /* Memory limits:
@@ -6661,17 +6663,15 @@ int libxl_retrieve_domain_configuration(libxl_ctx *ctx, 
uint32_t domid,
  * Currently there are three memory limits:
  *  1. target in xenstore (originally memory= in config file)
  *  2. static-max in xenstore (originally maxmem= in config file)
- *  3. max_memkb in hypervisor
- *
- * The third one is not visible and currently managed by
- * toolstack. In order to rebuild a domain we only need to have
- * target and static-max.
+ *  3. max_memkb in hypervisor (corresponds to hv_max_memkb in
+ * idl, not visible to xl level)
  */
 {
-uint32_t target_memkb = 0, max_memkb = 0;
+uint32_t target_memkb = 0, static_max_memkb = 0;
 
 /* target */
-rc = libxl__get_memory_target(gc, domid, target_memkb, max_memkb);
+rc = libxl__get_memory_target(gc, domid, target_memkb,
+  static_max_memkb);
 if (rc) {
 LOG(ERROR, fail to get memory target for domain %d, domid);
 goto out;
@@ -6683,7 +6683,8 @@ int libxl_retrieve_domain_configuration(libxl_ctx *ctx, 
uint32_t domid,
 d_config-b_info.target_memkb = target_memkb +
 d_config-b_info.video_memkb;
 
-d_config-b_info.max_memkb = max_memkb;
+d_config-b_info.max_memkb = static_max_memkb;
+d_config-b_info.hv_max_memkb = hv_max_memkb;
 }
 
 /* Devices: disk, nic, 

Re: [Xen-devel] ARM64: XEN Domu not booting with the qemu qcow AARCH64 Ubuntu 15.04 disk

2015-06-02 Thread Stefano Stabellini
On Tue, 2 Jun 2015, Sanjeev Pandita wrote:
 From: Stefan Bader [mailto:stefan.ba...@canonical.com]
 Sent: Tuesday, June 02, 2015 1:52 PM
 To: Sanjeev Pandita; xen-devel@lists.xen.org
 Cc: ian.campb...@citrix.com; Pranavkumar Sawargaonkar;
 stefano.stabell...@eu.citrix.com
 Subject: Re: [Xen-devel] ARM64: XEN Domu not booting with the qemu qcow
 AARCH64 Ubuntu 15.04 disk
 
 On 02.06.2015 09:40, Sanjeev Pandita wrote:
  All,
 
  I am pretty new to xen . I am trying to boot DOMU with qemu qcow
  AARCH64 Ubuntu 15.04 disk on Xen but I am getting the errors which
  link to /usr/local/lib/xen/bin/qemu-system-i386.
  Since I am working on aarch64 system the
  /usr/local/lib/xen/bin/qemu-system-i386 bin might not be present or
  might not work as expected.
 
 Because I am lacking hardware and feedback, the arm64 packaging is a
 rather theoretical exercise. At least for armhf I thought qemu-system-x86
 was a dependency. That binary should provide x86 emulation on arm64, the
 same as one could install qemu for other arches on x86.
 Have you tried to install qemu-system-x86 manually?
 
 -Stefan
 
 
  Please let me know how to make the Qemu qcow image work on Xen.
  Attached are the DomU boot log and config file.
 
  Thanks,
  San
 
 Thanks for your inputs, I have installed the qemu-system-i386 but my DomU
 booting is still crashing with following short logs. Am I missing anything
 ?
 
 Kernel Crash logs:
 
 xenbus_probe_frontend: Waiting for devices to initialise:
 25s...20s...15s...10s...5s...0s...
 235s...230s...225s...220s...215s...210s...205s...200s...195s...190s...185s
 ...180s...175s...170s...165s...160s...155s...150s...145s...140s...135s...1
 30s...125s...120s...115s...110s...105s...100s...95s...90s...85s...80s...75
 s...70s...65s...60s...55s...50s...45s...40s...35s...30s...25s...20s...15s.
 ..10s...5s...0s...
 
 xenbus_probe_frontend: Timeout connecting to device: device/vbd/51712
 (local state 3, remote state 2)
 console [netcon0] enabled
 netconsole: network logging started
 drivers/rtc/hctosys.c: unable to open rtc device (rtc0)
 VFS: Cannot open root device xvda or unknown-block(0,0): error -6
 Please append a correct root= boot option; here are the available
 partitions:
 Kernel panic - not syncing: VFS: Unable to mount root fs on
 unknown-block(0,0)
 CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.19.8 #5
 Hardware name: XENVM-4.6 (DT)
 Call trace:
 [ffc8a0dc] dump_backtrace+0x0/0x124
 [ffc8a210] show_stack+0x10/0x1c
 [ffc000657d88] dump_stack+0x80/0xc4
 [ffc000656f04] panic+0xe0/0x220
 [ffc00087eea8] mount_block_root+0x1a4/0x24c
 [ffc00087f19c] mount_root+0x110/0x130
 [ffc00087f328] prepare_namespace+0x16c/0x1b8
 [ffc00087eb44] kernel_init_freeable+0x1c4/0x1ec
 [ffc00065481c] kernel_init+0xc/0xd8
 ---[ end Kernel panic - not syncing: VFS: Unable to mount root fs on
 unknown-block(0,0)

It looks like the backend (QEMU) hasn't been initialized properly.
Could you please post the output of xenstore-ls? Also could you run ps
aux|grep qemu to check whether QEMU was spawned correctly?


 
 My config file  :
 
 linux:/mnt/xen # cat vm1
 name = vm1
 uuid = 3fb78ba6-8182-484c-acf7-8faba9773f68
 disk = [ 'tap:qcow:/mnt/xen/vivid-server-cloudimg-arm64-disk1.img,xvda,w'
 ]
 memory = 512
 kernel = ./Image
 extra = root=/dev/xvda rw console=hvc0 earlyprintk=xen

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH] libxl: Don't insert PCI device into xenstore for HVM guests

2015-06-02 Thread Konrad Rzeszutek Wilk
On Tue, Jun 02, 2015 at 11:06:26AM +0100, Malcolm Crossley wrote:
 On 01/06/15 18:55, Konrad Rzeszutek Wilk wrote:
  On Mon, Jun 01, 2015 at 05:03:14PM +0100, Malcolm Crossley wrote:
  On 01/06/15 16:43, Ross Lagerwall wrote:
  On 06/01/2015 04:26 PM, Konrad Rzeszutek Wilk wrote:
  On Fri, May 29, 2015 at 08:59:45AM +0100, Ross Lagerwall wrote:
  When doing passthrough of a PCI device for an HVM guest, don't insert
  the device into xenstore, otherwise pciback attempts to use it which
  conflicts with QEMU.
 
  How does it conflict?
 
  It doesn't work with repeated use. See below.
 
 
  This manifests itself such that the first time a device is passed to a
  domain, it succeeds. Subsequent attempts fail unless the device is
  unbound from pciback or the machine rebooted.
 
  Can you be more specific please? What are the issues? Why does it
  fail?
 
  Without this patch, if a device (e.g. a GPU) is bound to pciback and
  then passed through to a guest using xl pci-attach, it appears in the
  guest and works fine. If the guest is rebooted, and the device is again
  passed through with xl pci-attach, it appears in the guest as before but
  does not work. In Windows, it gets something like Error Code 43 and on
  Linux, the Nouveau driver fails to initialize the device (with error -22
  or something). The only way to get the device to work again is to reboot
  the host or unbind and rebind it to pciback.
 
  With this patch, it works as expected. The device is bound to pciback
  and works after being passed through, even after the VM is rebooted.
 
 
  There are certain things that pciback does to prepare an PCI device
  which QEMU also does. Some of them - such as saving the configuration
  registers (And then restoring them after the device has been detached) -
  is something that QEMU does not do.
 
 
  I really have no idea what the correct thing to do is, but the current
  code with qemu-trad doesn't seem to work (for me).

I think I know what the problem is. Do you by any chance have the XSA133-addenum
patch in? If not could you apply it and tell me if it works?

 
  The pciback pci_stub.c implements the pciback.hide and the device reset
  logic.
 
  The rest of pciback implements the pciback xenbus device which PV guests
  need in order to map/unmap MSI interrupts and access PCI config space.
 
  QEMU emulates and handles the MSI interrupt capabilities and PCI config
  space directly.
  
  Right..
 
  This is why a pciback xenbus device should not be created for
  passthrough PCI device being handled by QEMU.
  
  To me that sounds that we should not have PV drivers because QEMU
  emulates IDE or network devices.
 
 That is different. We first boot with QEMU handling the devices and then
 we explictly unplug QEMU's handling of IDE and network devices.
 
 That handover protocol does not currently exist for PCI passthrough
 devices so we have to chose one mechanism or the other to manage the
 passed through PCI device at boot time. Otherwise a HVM guest could load
 pcifront and causes all kinds of chaos with interrupt management or
 outbound MMIO window management.

Which would be fun! :-)
 
  
  The crux here is that none of the operations that pciback performs
  should affect QEMU or guests. But it does - so there is a bug.
 
 I agree there is a bug but should we try to fix it based upon my
 comments above?

I am still thinking about it. I do like certain things that pciback
does as part of it being notified that a device is to be used by
a guest and performing the configuration save/reset (see
pcistub_put_pci_dev in the pciback).

If somehow that can still be done by libxl (or QEMU) via SysFS
that would be good.

Just to clarify:
 - I concur with you that having xen-pcifront loaded in HVM
   guest and doing odd things behind QEMU is not good.
 - I like the fact that xen-pciback does a bunch of safety
   things with the PCI device to prepare it for a guest.
 - Currently these 'safety things'  are done when you
   'unbind' or 'bind' the device to pciback.
 - Or when the guest is shutdown and via XenBus we are told
   and can do the 'safety things'. This is the crux - if there
   is a way to do this via SysFS this would be super.

   Or perhaps xenpciback can figure out that the guest is HVM
   and ignore any XenBus actions?

  
  I would like to understand which ones do it so I can fix in
  pciback - as it might be also be a problem with PV.
  
  Unless... are you by any chance using extra patches on top of the
  native pciback?
 
 We do have extra patches but they only allow us to do a SBR on PCI
 devices which require it. The failure listed above occurs on devices
 with device specific resets (e.g. FLR,D3) as well so those extra patches
 aren't being used.
 
  
 
  Malcolm
 
 
  Regards
 
 

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v1 4/5] tools/libxl: move toolstack code into libxl_toolstack.c

2015-06-02 Thread Ian Campbell
On Wed, 2015-05-20 at 18:01 +0800, Yang Hongyang wrote:
 move toolstack code into libxl_toolstack.c

It's not clear to me what toolstack code is here, the whole of libxl
and xl is toolstack code.

Is the code being moved stuff to do with adding toolstack state to the
save stream? Perhaps libxl_{suspend,save}_toolstack.c? Or could this not
go in the libxl_dom_suspend.c you just created?



___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] Xen Security Advisory 131 (CVE-2015-4106) - Unmediated PCI register access in qemu

2015-06-02 Thread Xen . org security team
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Xen Security Advisory CVE-2015-4106 / XSA-131
  version 3

Unmediated PCI register access in qemu

UPDATES IN VERSION 3


Public release.

CVE assigned.

ISSUE DESCRIPTION
=

Qemu allows guests to not only read, but also write all parts of the
PCI config space (but not extended config space) of passed through PCI
devices not explicitly dealt with for (partial) emulation purposes.

IMPACT
==

Since the effect depends on the specific purpose of the config
space field, it's not possible to give a general statement about the
exact impact on the host or other guests.  Privilege escalation, host
crash (Denial of Service), and leaked information all cannot be
excluded.

VULNERABLE SYSTEMS
==

Xen versions 3.3 and onwards are vulnerable due to supporting PCI
pass-through.

Only x86 systems are vulnerable.  ARM systems are not vulnerable.

Only HVM guests with their device model run in Dom0 can take advantage
of this vulnerability.

Only HVM guests which have been granted access to physical PCI devices
(`PCI passthrough') can take advantage of this vulnerability.

MITIGATION
==

This issue can be avoided by not assigning PCI devices to untrusted HVM
guests.

This issue can also be avoided by only using PV guests.

It can also be avoided by configuring HVM guests with their device
model run in a separate (stub) domain.  (When using xl, this can be
requested with device_model_stubdomain_override=1 in the domain
configuration file.)

CREDITS
===

This issue was discovered by Jan Beulich of SUSE.

RESOLUTION
==

Applying the appropriate attached patch resolves this issue.

xsa131-qemuu-$n.patch   qemu-upstream-unstable, Xen 4.5.x, Xen 4.4.x
xsa131-qemuu-4.4-1.patchXen 4.4.x replacement for xsa131-qemuu-1.patch
xsa131-qemuu-4.3-$n.patch   Xen 4.3.x
xsa131-qemut-$n.patch   qemu-xen-unstable, Xen 4.5.x, Xen 4.4.x, Xen 
4.3.x, Xen 4.2.x
xsa131-qemut-4.2-1.patchXen 4.2.x replacement for xsa131-qemut-1.patch

$ sha256sum xsa131*.patch
2ff4aa092247ff0911d837adc5f4de1ffa8ed32a39eaea9b0bfc4a40b7921b06  
xsa131-qemut-1.patch
dafa524374d890e517d4e2600a594064b55af645172422b9e81a64b5f4a64575  
xsa131-qemut-2.patch
b37d3e22ce4410bf0db87217c60a543f0143a23ab0652f1746bd5fe17dbadd70  
xsa131-qemut-3.patch
b5f0882717129142f11297a62b2ed826da94ce5ed42f6b2ea60f9101b652aed9  
xsa131-qemut-4.2-1.patch
3bfc58b6288bafb4c2039265be32c6bd9e048b63a4cae279ead3ec1154af9abe  
xsa131-qemut-4.patch
60c44b63d2c7bd7e12631db7fd05622d782e1a5ccd7dfa17a1671b36b5ff7bee  
xsa131-qemut-5.patch
8f2a9c4333155fac670ad3a932703051ce8a47f4f6d3a067458e5ab49da7e93a  
xsa131-qemut-6.patch
ed4facfa80b2ab7ecfc9b232878d3f4d54ad93214c75f4b4af71c8f07a1d04c4  
xsa131-qemut-7.patch
d400d03ae792699fec9a54bbb6b08c2f5523427ef8af85b0c5ede497ba87f61c  
xsa131-qemut-8.patch
7a7f294303a8bcf9a316e3e6b8a0511dac3e92dbf7e373b21c94b97835c03f2f  
xsa131-qemuu-1.patch
dc72bd4993fdcea3dc98d18f314da3ac1c7e73e0b99dac325b0e59d0229f67e5  
xsa131-qemuu-2.patch
61524a47fd29406ba9a2983ea9cb59e45a56d716d65d78689177d9c8e95f76e6  
xsa131-qemuu-3.patch
21493c5db68115d97a6aecf1159ee05023b59545627d7f03d7fdaa238bb3bd27  
xsa131-qemuu-4.3-1.patch
5828647db6f090ce6c7ea20f90331008f2a0bba18b3a3a371f2ba9054871a7cb  
xsa131-qemuu-4.3-2.patch
eab05df32e8a7c729cc52affd28b109a8f75cabb8fd4027934059d303b2232fa  
xsa131-qemuu-4.3-3.patch
8dc95a2a8a45d851476b938e4cab2e65d87b8dc28c721949824ce900552ba489  
xsa131-qemuu-4.3-4.patch
7a358fba18ae9c0dde1134564151a97c8e6d6f5982ac74c450f81d2ed8e9d540  
xsa131-qemuu-4.3-5.patch
fcb77a8d2adde1daf03f8faeb6e92788b2727f5b11563b6f770c74251b0964a4  
xsa131-qemuu-4.3-6.patch
79933b2744e7b69c4eb23f3974d242e2592cb4553be115a4aec1c6e30e7564cf  
xsa131-qemuu-4.3-7.patch
bb4021a36a9f36dc0082cfd42869adc737ec4afea92ac1100f0971118174b58c  
xsa131-qemuu-4.3-8.patch
f70516fa38a3d2e0cf906c41e3b7dfd7cf998c9189b232dac20633c7b0d1ab8b  
xsa131-qemuu-4.4-1.patch
041c82a341755bcbab18f834a0fccf9c031674d956958092cbfa5e64f05b6318  
xsa131-qemuu-4.patch
91aeb9c0d3e9a251faf12840e0519a342cfb7e35af3fea429bedb452182fae47  
xsa131-qemuu-5.patch
60482fe37fd405032b92de85ed5d333c210c85662b1645016dce2f0053aa6ec0  
xsa131-qemuu-6.patch
05fc2e614620449e52a056ce6e5f4033970ade22fde623e3b789fc57b3e4143e  
xsa131-qemuu-7.patch
358849d7c0dff29bf96f49e56d00c4d7bd4c8d0c71c122a7b3655e10f45cb53b  
xsa131-qemuu-8.patch
$

DEPLOYMENT DURING EMBARGO
=

Deployment of patches or mitigations is NOT permitted (except on
systems used and administered only by organisations which are members
of the Xen Project Security Issues Predisclosure List).  Specifically,
deployment on public cloud systems is NOT permitted.

This is because the altered PCI config space access behavior is visible
to guests.

Deployment is permitted only AFTER the embargo ends.

(Note: this during-embargo deployment notice is 

[Xen-devel] [PATCH 3/4] x86/debugger: Use copy_to/from_guest() in dbg_rw_guest_mem()

2015-06-02 Thread Andrew Cooper
Using gdbsx on Broadwell systems suffers a SMAP violation because
dbg_rw_guest_mem() uses memcpy() with a userspace pointer.

The functions dbg_rw_mem() and dbg_rw_guest_mem() have been updated to pass
'void * __user' pointers which indicates their nature clearly.

Signed-off-by: Andrew Cooper andrew.coop...@citrix.com
CC: Jan Beulich jbeul...@suse.com

---
After these changes, 'gdbsx -c' works as well as it did before (i.e. not at
all, for the stack trace), but doesn't take Xen down with it.

There are other issues in this area, and XEN_DOMCTL_gdbsx_guestmemio is
certainly not fit yet for removal from the XSA-77 exclusion list.
---
 xen/arch/x86/debug.c   |   45 +++-
 xen/arch/x86/domctl.c  |   14 ++---
 xen/include/asm-x86/debugger.h |7 +++
 3 files changed, 36 insertions(+), 30 deletions(-)

diff --git a/xen/arch/x86/debug.c b/xen/arch/x86/debug.c
index 435bd40..801dcf2 100644
--- a/xen/arch/x86/debug.c
+++ b/xen/arch/x86/debug.c
@@ -41,6 +41,9 @@
 #define DBGP2(...) ((void)0)
 #endif
 
+typedef unsigned long dbgva_t;
+typedef unsigned char dbgbyte_t;
+
 /* Returns: mfn for the given (hvm guest) vaddr */
 static unsigned long 
 dbg_hvm_va2mfn(dbgva_t vaddr, struct domain *dp, int toaddr,
@@ -154,13 +157,14 @@
 }
 
 /* Returns: number of bytes remaining to be copied */
-static int
-dbg_rw_guest_mem(dbgva_t addr, dbgbyte_t *buf, int len, struct domain *dp, 
- int toaddr, uint64_t pgd3)
+unsigned int dbg_rw_guest_mem(struct domain *dp, void * __user gaddr,
+  void * __user buf, unsigned int len,
+  bool_t toaddr, uint64_t pgd3)
 {
 while ( len  0 )
 {
 char *va;
+unsigned long addr = (unsigned long)gaddr;
 unsigned long mfn, gfn = INVALID_GFN, pagecnt;
 
 pagecnt = min_t(long, PAGE_SIZE - (addr  ~PAGE_MASK), len);
@@ -176,12 +180,12 @@
 
 if ( toaddr )
 {
-memcpy(va, buf, pagecnt);/* va = buf */
+copy_from_user(va, buf, pagecnt);/* va = buf */
 paging_mark_dirty(dp, mfn);
 }
 else
 {
-memcpy(buf, va, pagecnt);/* buf = va */
+copy_to_user(buf, va, pagecnt);/* buf = va */
 }
 
 unmap_domain_page(va);
@@ -203,27 +207,30 @@
  * pgd3: value of init_mm.pgd[3] in guest. see above.
  * Returns: number of bytes remaining to be copied. 
  */
-int
-dbg_rw_mem(dbgva_t addr, dbgbyte_t *buf, int len, domid_t domid, int toaddr,
-   uint64_t pgd3)
+unsigned int dbg_rw_mem(void * __user addr, void * __user buf,
+unsigned int len, domid_t domid, bool_t toaddr,
+uint64_t pgd3)
 {
-struct domain *dp = get_domain_by_id(domid);
-int hyp = (domid == DOMID_IDLE);
+DBGP2(gmem:addr:%lx buf:%p len:$%u domid:%d toaddr:%x\n,
+  addr, buf, len, domid, toaddr);
 
-DBGP2(gmem:addr:%lx buf:%p len:$%d domid:%x toaddr:%x dp:%p\n, 
-  addr, buf, len, domid, toaddr, dp);
-if ( hyp )
+if ( domid == DOMID_IDLE )
 {
 if ( toaddr )
-len = __copy_to_user((void *)addr, buf, len);
+len = __copy_to_user(addr, buf, len);
 else
-len = __copy_from_user(buf, (void *)addr, len);
+len = __copy_from_user(buf, addr, len);
 }
-else if ( dp )
+else
 {
-if ( !dp-is_dying )   /* make sure guest is still there */
-len= dbg_rw_guest_mem(addr, buf, len, dp, toaddr, pgd3);
-put_domain(dp);
+struct domain *d = get_domain_by_id(domid);
+
+if ( d )
+{
+if ( !d-is_dying )
+len = dbg_rw_guest_mem(d, addr, buf, len, toaddr, pgd3);
+put_domain(d);
+}
 }
 
 DBGP2(gmem:exit:len:$%d\n, len);
diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index e9f76d0..1d3854f 100644
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -37,14 +37,14 @@
 #include asm/debugger.h
 #include asm/psr.h
 
-static int gdbsx_guest_mem_io(
-domid_t domid, struct xen_domctl_gdbsx_memio *iop)
+static int gdbsx_guest_mem_io(domid_t domid, struct xen_domctl_gdbsx_memio 
*iop)
 {
-ulong l_uva = (ulong)iop-uva;
-iop-remain = dbg_rw_mem(
-(dbgva_t)iop-gva, (dbgbyte_t *)l_uva, iop-len, domid,
-iop-gwr, iop-pgd3val);
-return (iop-remain ? -EFAULT : 0);
+void * __user gva = (void *)iop-gva, * __user uva = (void *)iop-uva;
+
+iop-remain = dbg_rw_mem(gva, uva, iop-len, domid,
+ !!iop-gwr, iop-pgd3val);
+
+return iop-remain ? -EFAULT : 0;
 }
 
 #define MAX_IOPORTS 0x1
diff --git a/xen/include/asm-x86/debugger.h b/xen/include/asm-x86/debugger.h
index 0408bec..33f4700 100644
--- a/xen/include/asm-x86/debugger.h
+++ b/xen/include/asm-x86/debugger.h
@@ -82,9 +82,8 @@ static inline int debugger_trap_entry(
 return 0;
 }
 
-typedef 

[Xen-devel] Xen Security Advisory 129 (CVE-2015-4104) - PCI MSI mask bits inadvertently exposed to guests

2015-06-02 Thread Xen . org security team
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Xen Security Advisory CVE-2015-4104 / XSA-129
  version 2

PCI MSI mask bits inadvertently exposed to guests

UPDATES IN VERSION 2


Public release.

CVE assigned.

ISSUE DESCRIPTION
=

The mask bits optionally available in the PCI MSI capability structure
are used by the hypervisor to occasionally suppress interrupt
delivery.  Unprivileged guests were, however, nevertheless allowed
direct control of these bits.

IMPACT
==

Interrupts may be observed by Xen at unexpected times, which may lead
to a host crash and therefore a Denial of Service.

VULNERABLE SYSTEMS
==

Xen versions 3.3 and onwards are vulnerable due to supporting PCI
pass-through.

Only x86 systems are vulnerable.  ARM systems are not vulnerable.

Only HVM guests with their device model run in Dom0 can take advantage
of this vulnerability.

Only HVM guests which have been granted access to physical PCI devices
(`PCI passthrough') can take advantage of this vulnerability.

Furthermore, the vulnerability is only applicable when the
passed-through PCI devices are MSI-capable.  (Most modern devices
are.)

MITIGATION
==

This issue can be avoided by not assigning MSI capable PCI devices to
untrusted HVM guests.

This issue can also be avoided by only using PV guests.

It can also be avoided by configuring HVM guests with their device
model run in a separate (stub) domain.  (When using xl, this can be
requested with device_model_stubdomain_override=1 in the domain
configuration file.)

CREDITS
===

This issue was discovered by Jan Beulich of SUSE.

RESOLUTION
==

Applying the appropriate attached patch resolves this issue.

xsa129-qemuu.patch   qemu-upstream-unstable, Xen 4.5.x, Xen 4.4.x
xsa129-qemuu-4.3.patch   Xen 4.3.x
xsa129-qemut.patch   qemu-xen-unstable, Xen 4.5.x, Xen 4.4.x, Xen 
4.3.x, Xen 4.2.x

$ sha256sum xsa129*.patch
3c6b5a085eec3a528b18207ca65222300911fd25501a9ffaffa76a5d85d23992  
xsa129-qemut.patch
314808fbaa97d06bc4bb6cb6644dca1ae2da55534661c662c6e442d5b91e6061  
xsa129-qemuu-4.3.patch
9f0658e197c539306118723d63b468d09fe3a1d9f9364f6d06e53b7be8268bdc  
xsa129-qemuu.patch
$

DEPLOYMENT DURING EMBARGO
=

Deployment of patches or mitigations is NOT permitted (except on
systems used and administered only by organisations which are members
of the Xen Project Security Issues Predisclosure List).  Specifically,
deployment on public cloud systems is NOT permitted.

This is because the altered PCI config space access behavior is visible
to guests.

Deployment is permitted only AFTER the embargo ends.

(Note: this during-embargo deployment notice is retained in
post-embargo publicly released Xen Project advisories, even though it
is then no longer applicable.  This is to enable the community to have
oversight of the Xen Project Security Team's decisionmaking.)

For more information about permissible uses of embargoed information,
consult the Xen Project community's agreed Security Policy:
  http://www.xenproject.org/security-policy.html
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.12 (GNU/Linux)

iQEcBAEBAgAGBQJVbbdRAAoJEIP+FMlX6CvZe+4H/RcQcEsggqHg5eK/9yowQV1c
erLWwpP18+v1pSRKqC+In/snL4g6H1DiC7ezwEbyQzOA8GGgiikTHqyTyFATvEHN
hCwMgYW4ZYcR/euqJ7kgi7q368+39sM6ZzEnKCwr4GUeWLtBh+6ABeih5XlfjyfS
0HWuw+NBkT7IcIR/KaQwa17or3fZ2cZKq1NU4EksFjuD+ucMS7a4sPs1SztoSbXc
Qf5TZn0XsDWoAodX/EmI4xRubpKL6Ae6noOCkBDelssvwzIhR1rZfFL8qALy+axf
vb4le4Woy7USkWssOURSvkY8iMio25qvwGFxORzI9x4ImMU+XC+r6QSCLER202Q=
=VQRQ
-END PGP SIGNATURE-


xsa129-qemut.patch
Description: Binary data


xsa129-qemuu-4.3.patch
Description: Binary data


xsa129-qemuu.patch
Description: Binary data
___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] Xen Security Advisory 128 (CVE-2015-4103) - Potential unintended writes to host MSI message data field via qemu

2015-06-02 Thread Xen . org security team
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Xen Security Advisory CVE-2015-4103 / XSA-128
  version 2

Potential unintended writes to host MSI message data field via qemu

UPDATES IN VERSION 2


Public release.

CVE assigned.

ISSUE DESCRIPTION
=

Logic is in place to avoid writes to certain host config space fields
when the guest must nevertheless be able to access their virtual
counterparts.  A bug in how this logic deals with accesses spanning
multiple fields allows the guest to write to the host MSI message data
field.

While generally the writes write back the values previously read,
their value in config space may have got changed by the host between
the qemu read and write.  In such a case host side interrupt handling
could become confused, possibly losing interrupts or allowing spurious
interrupt injection into other guests.

IMPACT
==

Certain untrusted guest administrators may be able to confuse host
side interrupt handling, leading to a Denial of Service.

VULNERABLE SYSTEMS
==

Xen versions 3.3 and onwards are vulnerable due to supporting PCI
pass-through.

Only x86 systems are vulnerable.  ARM systems are not vulnerable.

Only HVM guests with their device model run in Dom0 can take advantage
of this vulnerability.

Only HVM guests which have been granted access to physical PCI devices
(`PCI passthrough') can take advantage of this vulnerability.

Furthermore, the vulnerability is only applicable when the
passed-through PCI devices are MSI-capable.  (Most modern devices
are.)

MITIGATION
==

This issue can be avoided by not assigning MSI capable PCI devices to
untrusted HVM guests.

This issue can also be avoided by only using PV guests.

It can also be avoided by configuring HVM guests with their device
model run in a separate (stub) domain.  (When using xl, this can be
requested with device_model_stubdomain_override=1 in the domain
configuration file.)

CREDITS
===

This issue was discovered by Jan Beulich of SUSE.

RESOLUTION
==

Applying the appropriate attached patch resolves this issue.

xsa128-qemuu.patch   qemu-upstream-unstable, Xen 4.5.x, Xen 4.4.x
xsa128-qemuu-4.3.patch   Xen 4.3.x
xsa128-qemut.patch   qemu-xen-unstable, Xen 4.5.x, Xen 4.4.x, Xen 
4.3.x, Xen 4.2.x

$ sha256sum xsa128*.patch
68b85a4c7d531d343d7fac2e92dbec3677bc2e4a83de75d78d7f605a2fc8ad3f  
xsa128-qemut.patch
2ec657a6f22cac922854548c9d83698656ab7a36634ad05de7f14439cc4405bc  
xsa128-qemuu-4.3.patch
104cf2e2816d253cc1eca3084f6ea9b6007f7773a88bda245bab00539e08b359  
xsa128-qemuu.patch
$

DEPLOYMENT DURING EMBARGO
=

Deployment of the patches and/or mitigations described above (or
others which are substantially similar) is permitted during the
embargo, even on public-facing systems with untrusted guest users and
administrators.

But: Distribution of updated software is prohibited (except to other
members of the predisclosure list).

Predisclosure list members who wish to deploy significantly different
patches and/or mitigations, please contact the Xen Project Security
Team.

(Note: this during-embargo deployment notice is retained in
post-embargo publicly released Xen Project advisories, even though it
is then no longer applicable.  This is to enable the community to have
oversight of the Xen Project Security Team's decisionmaking.)

For more information about permissible uses of embargoed information,
consult the Xen Project community's agreed Security Policy:
  http://www.xenproject.org/security-policy.html
-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.12 (GNU/Linux)

iQEcBAEBAgAGBQJVbbdOAAoJEIP+FMlX6CvZEPUIAIti0HdxCX4JNy5MKqNFxHRB
KtGibssSaoGcPmkhLDqtOQ+8BwTUe/owezKlX799Jf0Jqn1bVXejCLyh0e6cyauq
pPoyQd+zblIpTFw3ByqVzicLajmVfY5v8yGGBAnSpuvfVEd3K5qWZCvFx+rEJ4AB
JI8jQdMAn2oFGtLbYDysRUpSjg/OtqIC6o3a4yfVnPDcduPq9XFpnxcdHHVfrklS
SeY1MGLbJtrNzya+zX1GZxFh5kuZnF/qSY3o60LF+2ZpK9nyH8toX1flvW9lXa86
9r1zxgy6qE1iWOHo4E1HjlK3lUUqW0XgkB/3zj+2LtX1uTwOhPtATn5/Neje0GY=
=4I3/
-END PGP SIGNATURE-


xsa128-qemut.patch
Description: Binary data


xsa128-qemuu-4.3.patch
Description: Binary data


xsa128-qemuu.patch
Description: Binary data
___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v1 3/5] tools/libxl: move remus codes into libxl_remus.c

2015-06-02 Thread Ian Campbell
On Wed, 2015-05-20 at 18:01 +0800, Yang Hongyang wrote:
 move remus codes into libxl_remus.c

code

Apart from dropping some static and adding some prototypes to the
header this is purely motion, correct? (I assume this about the last one
too).
 diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
 index b2eeb89..c3d70eb 100644
 --- a/tools/libxl/libxl_internal.h
 +++ b/tools/libxl/libxl_internal.h
 @@ -2815,6 +2815,7 @@ _hidden void libxl__remus_devices_commit(libxl__egc 
 *egc,
   libxl__remus_devices_state *rds);
  _hidden int libxl__netbuffer_enabled(libxl__gc *gc);
  
 +

Please avoid spurious whitespace changes.

  /*- Domain suspend (save) state structure -*/
  
  typedef struct libxl__domain_suspend_state libxl__domain_suspend_state;
 @@ -3197,6 +3198,16 @@ void libxl__domain_suspend_callback(void *data);
  void libxl__domain_suspend(libxl__egc *egc,
 libxl__domain_suspend_state *dss);
  
 +/* Remus callbacks for save */
 +void libxl__remus_domain_suspend_callback(void *data);
 +void libxl__remus_domain_resume_callback(void *data);
 +void libxl__remus_domain_checkpoint_callback(void *data);
 +/* Remus setup and teardown*/
 +void libxl__remus_setup(libxl__egc *egc, libxl__domain_save_state *dss);
 +void libxl__remus_teardown(libxl__egc *egc,
 +   libxl__domain_save_state *dss,
 +   int rc);

Please mark these all _hidden.



___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v1 COLO Pre 02/12] libxc/restore: zero ioreq page only one time

2015-06-02 Thread Andrew Cooper
On 02/06/15 10:26, Yang Hongyang wrote:
 ioreq page contains evtchn which will be set when we resume the
 secondary vm the first time. The hypervisor will check if the
 evtchn is corrupted, so we cannot zero the ioreq page more
 than one time.

 The ioreq-state is always STATE_IOREQ_NONE after the vm is
 suspended, so it is OK if we only zero it one time.

 Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com
 Signed-off-by: Wen congyang we...@cn.fujitsu.com
 CC: Andrew Cooper andrew.coop...@citrix.com

Is the qemu process for the secondary running at this point?  If so,
this is very much unsafe.

~Andrew

 ---
  tools/libxc/xc_sr_restore_x86_hvm.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

 diff --git a/tools/libxc/xc_sr_restore_x86_hvm.c 
 b/tools/libxc/xc_sr_restore_x86_hvm.c
 index 6f5af0e..06177e0 100644
 --- a/tools/libxc/xc_sr_restore_x86_hvm.c
 +++ b/tools/libxc/xc_sr_restore_x86_hvm.c
 @@ -78,7 +78,8 @@ static int handle_hvm_params(struct xc_sr_context *ctx,
  break;
  case HVM_PARAM_IOREQ_PFN:
  case HVM_PARAM_BUFIOREQ_PFN:
 -xc_clear_domain_page(xch, ctx-domid, entry-value);
 +if ( !ctx-restore.buffer_all_records )
 +xc_clear_domain_page(xch, ctx-domid, entry-value);
  break;
  }
  


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [xen-unstable test] 56759: regressions - FAIL

2015-06-02 Thread Jan Beulich
 On 29.05.15 at 18:32, ian.campb...@citrix.com wrote:
 On Wed, 2015-05-27 at 17:04 +0100, Ian Campbell wrote:
 Looking at the netback side though it seems like netback_remove is
 switching to state=Closed _before_ it calls kobject_uevent(...,
 KOBJ_OFFLINE) and it is this which generates the call to netback_uevent
 which tries and fails to read script and produces the error message.
 
 I've just sent out a patch which fixes this issue, although I am still
 at a loss to explain why we have only started seeing this now and only
 under such specific circumstances.
 
 I'm still slightly concerned that perhaps the new spinlock stuff has
 some sort of bad behaviour either on arndale specifically or more
 generally for ARM systems which has pushed this particular case over the
 edge.
 
 I did run some benchmarks (hackbench+fio on arndale domU and hackbench
 on midway dom0) with and without the ticket locks and the results were
 close enough that I'm basically not too worried that there is something
 wrong with the ticket locks on ARM.
 
 It still niggles somewhat not to have a good theory about why this
 change had this seemingly random effect, but I've not got any good ideas
 for avenues to explore and I've got other things to do so I think I'll
 leave it at that.

So should we then re-instate the ticket lock patches?

Jan


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] ARM64: XEN Domu not booting with the qemu qcow AARCH64 Ubuntu 15.04 disk

2015-06-02 Thread Stefano Stabellini
On Tue, 2 Jun 2015, Stefano Stabellini wrote:
 On Tue, 2 Jun 2015, Stefan Bader wrote:
  On 02.06.2015 09:40, Sanjeev Pandita wrote:
   All,
   
   I am pretty new to xen . I am trying to boot DOMU with qemu qcow AARCH64
   Ubuntu 15.04 disk on Xen but I am getting the errors which link to
   /usr/local/lib/xen/bin/qemu-system-i386.
   Since I am working on aarch64 system the
   /usr/local/lib/xen/bin/qemu-system-i386 bin might not be present or might
   not work as expected.
  
  Because I am lacking hardware and feedback, the arm64 packaging is a rather
  theoretical exercise. At least for armhf I thought qemu-system-x86 was a
  dependency. That binary should provide x86 emulation on arm64, the same as 
  one
  could install qemu for other arches on x86.
  Have you tried to install qemu-system-x86 manually?
 
 Hi Stefan,
 
 On arm and arm64 Xen still needs a qemu-system-i386 binary, just to
 provide the PV backends in userspace (disk, console, etc.).
 Unfortunately the output binary is still named qemu-system-i386. I
 know that the name is misleading, but fixing it is not trivial: it
 requires disentangling code in QEMU in non trivial ways.

Just to be clear, qemu-system-i386 for ARM is the output of a QEMU build
on ARM with ./configure --enable-xen --target-list=i386-softmmu. It
could do x86 emulation, but it does not when used on Xen.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] RFC: QEMU bumping memory limit and domain restore

2015-06-02 Thread Wei Liu
I fat-fingered Andrew's email address. Really CC him this time.

On Tue, Jun 02, 2015 at 03:05:07PM +0100, Wei Liu wrote:
 Previous discussion at [0].
 
 For the benefit of discussion, we refer to max_memkb inside hypervisor
 as hv_max_memkb (name subject to improvement). That's the maximum number
 of memory a domain can use.
 
 Libxl doesn't know the hv_max_memkb a domain needs prior to QEMU start-up
 because of optional ROMs etc.
 
 Libxl doesn't know the hv_max_memkb even after QEMU start-up, because
 there is no mechanism to communicate between QEMU and libxl. This is an
 area that needs improvement, we've encountered problems in this area
 before.
 
 QEMU calls xc_domain_setmaxmem to increase hv_max_memkb by N pages. Those
 pages are only accounted in hypervisor. During migration, libxl
 (currently) doesn't extract that value from hypervisor.
 
 So now the problem is on the remote end:
 
 1. Libxl indicates domain needs X pages.
 2. Domain actually needs X + N pages.
 3. Remote end tries to write N more pages and fail.
 
 This behaviour currently doesn't affect normal migration (that you
 transfer libxl JSON to remote, construct a domain, then start QEMU)
 because QEMU won't bump hv_max_memkb again. This is by design and
 reflected in QEMU code.
 
 This behaviour affects COLO and becomes a bug in that case, because
 secondary VM's QEMU doesn't go through the same start-of-day
 initialisation (Hongyang, correct me if I'm wrong), i.e. no bumping
 hv_max_memkb inside QEMU.
 
 Andrew plans to embed JSON inside migration v2 and COLO is based on
 migration v2. The bug is fixed if JSON is correct in the first place.
 
 As COLO is not yet upstream, so this bug is not a blocker for 4.6. But
 it should be fixed for the benefit of COLO.
 
 So here is a proof of concept patch to record and honour that value
 during migration.  A new field is added in IDL. Note that we don't
 provide xl level config option for it and mandate it to be default value
 during domain creation. This is to prevent libxl user from using it to
 avoid unforeseen repercussions.
 
 This patch is compiled test only. If we agree this is the way to go I
 will test and submit a proper patch.
 
 Wei.
 
 [0] 1428941353-18673-1-git-send-email-dsl...@verizon.com
 
 ---8---
 From ab9dc179ea4ee26eb88f61f8dad36dd01b63bb6b Mon Sep 17 00:00:00 2001
 From: Wei Liu wei.l...@citrix.com
 Date: Tue, 2 Jun 2015 14:53:20 +0100
 Subject: [PATCH] libxl: record and honour hv_max_memkb
 
 The new field hv_max_memkb in IDL is used to record max_memkb inside
 hypervisor. That reflects the maximum memory a guest can ask for.
 
 This field is mandated to be default value during guest creation to
 avoid unforeseen repercussions. It's only honoured when restoring a guest.
 
 (XXX compiled test only at this stage)
 
 Signed-off-by: Wei Liu wei.l...@citrix.com
 ---
  tools/libxl/libxl.c | 17 +
  tools/libxl/libxl_create.c  |  6 ++
  tools/libxl/libxl_dom.c |  9 +++--
  tools/libxl/libxl_types.idl |  1 +
  4 files changed, 23 insertions(+), 10 deletions(-)
 
 diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
 index 9117b01..72fec8b 100644
 --- a/tools/libxl/libxl.c
 +++ b/tools/libxl/libxl.c
 @@ -6614,6 +6614,7 @@ int libxl_retrieve_domain_configuration(libxl_ctx *ctx, 
 uint32_t domid,
  GC_INIT(ctx);
  int rc;
  libxl__domain_userdata_lock *lock = NULL;
 +uint64_t hv_max_memkb;
  
  CTX_LOCK;
  
 @@ -6654,6 +6655,7 @@ int libxl_retrieve_domain_configuration(libxl_ctx *ctx, 
 uint32_t domid,
  }
  libxl_uuid_copy(ctx, d_config-c_info.uuid, info.uuid);
  libxl_dominfo_dispose(info);
 +hv_max_memkb = info.max_memkb; /* store and use later */
  }
  
  /* Memory limits:
 @@ -6661,17 +6663,15 @@ int libxl_retrieve_domain_configuration(libxl_ctx 
 *ctx, uint32_t domid,
   * Currently there are three memory limits:
   *  1. target in xenstore (originally memory= in config file)
   *  2. static-max in xenstore (originally maxmem= in config file)
 - *  3. max_memkb in hypervisor
 - *
 - * The third one is not visible and currently managed by
 - * toolstack. In order to rebuild a domain we only need to have
 - * target and static-max.
 + *  3. max_memkb in hypervisor (corresponds to hv_max_memkb in
 + * idl, not visible to xl level)
   */
  {
 -uint32_t target_memkb = 0, max_memkb = 0;
 +uint32_t target_memkb = 0, static_max_memkb = 0;
  
  /* target */
 -rc = libxl__get_memory_target(gc, domid, target_memkb, max_memkb);
 +rc = libxl__get_memory_target(gc, domid, target_memkb,
 +  static_max_memkb);
  if (rc) {
  LOG(ERROR, fail to get memory target for domain %d, domid);
  goto out;
 @@ -6683,7 +6683,8 @@ int libxl_retrieve_domain_configuration(libxl_ctx *ctx, 
 uint32_t domid,
  d_config-b_info.target_memkb = 

Re: [Xen-devel] [PATCH v2 for Xen 4.6 3/4] libxl: enabling XL to set per-VCPU parameters of a domain for RTDS scheduler

2015-06-02 Thread Chong Li
On Tue, Jun 2, 2015 at 7:53 AM, George Dunlap
george.dun...@eu.citrix.com wrote:
 On 05/26/2015 01:09 AM, Chong Li wrote:
 Add libxl_vcpu_sched_params_get/set and sched_rtds_vcpu_get/set functions to 
 support
 per-VCPU settings for RTDS scheduler.

 Add a new data structure (libxl_vcpu_sched_params) to help per-VCPU settings.

 Signed-off-by: Chong Li chong...@wustl.edu
 Signed-off-by: Meng Xu men...@cis.upenn.edu
 Signed-off-by: Sisu Xi xis...@gmail.com

 This doesn't apply cleanly for me anymore -- can you refresh and resend?

 Thanks,
  -George


Yes, sure. I'm working on the comments that have already been posted
in this series and will send out a new version soon.

Chong
 ---
  tools/libxl/libxl.c | 189 
 ++--
  tools/libxl/libxl.h |  19 +
  tools/libxl/libxl_types.idl |  11 +++
  3 files changed, 196 insertions(+), 23 deletions(-)

 diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
 index feb3aa9..169901a 100644
 --- a/tools/libxl/libxl.c
 +++ b/tools/libxl/libxl.c
 @@ -5797,6 +5797,120 @@ static int sched_sedf_domain_set(libxl__gc *gc, 
 uint32_t domid,
  return 0;
  }

 +static int sched_rtds_validate_params(libxl__gc *gc, uint32_t period,
 + uint32_t budget, uint32_t *sdom_period,
 + uint32_t *sdom_budget)
 +{
 +if (period != LIBXL_DOMAIN_SCHED_PARAM_PERIOD_DEFAULT) {
 +if (period  1) {
 +LOG(ERROR, VCPU period is not set or out of range, 
 +   valid values are larger than 1);
 +return ERROR_INVAL;
 +}
 +*sdom_period = period;
 +}
 +
 +if (budget != LIBXL_DOMAIN_SCHED_PARAM_BUDGET_DEFAULT) {
 +if (budget  1) {
 +LOG(ERROR, VCPU budget is not set or out of range, 
 +   valid values are larger than 1);
 +return ERROR_INVAL;
 +}
 +*sdom_budget = budget;
 +}
 +
  +if (budget > period) {
 +LOG(ERROR, VCPU budget is larger than VCPU period, 
 +   VCPU budget should be no larger than VCPU period);
 +return ERROR_INVAL;
 +}
 +
 +return 0;
 +}
 +
 +static int sched_rtds_vcpu_get(libxl__gc *gc, uint32_t domid,
 +   libxl_vcpu_sched_params *scinfo)
 +{
 +uint16_t num_vcpus;
 +int rc, i;
 +xc_dominfo_t info;
 +
 +rc = xc_domain_getinfo(CTX-xch, domid, 1, info);
 +if (rc  0) {
 +LOGE(ERROR, getting domain info);
 +return ERROR_FAIL;
 +}
 +num_vcpus = info.max_vcpu_id + 1;
 +
 +struct xen_domctl_sched_rtds_params  *sdom = libxl__malloc(NOGC,
 +sizeof(struct xen_domctl_sched_rtds_params) * num_vcpus);
 +rc = xc_sched_rtds_vcpu_get(CTX-xch, domid, sdom, num_vcpus);
 +if (rc != 0) {
 +LOGE(ERROR, getting vcpu sched rtds);
 +return ERROR_FAIL;
 +}
 +
 +libxl_vcpu_sched_params_init(scinfo);
 +
 +scinfo-sched = LIBXL_SCHEDULER_RTDS;
 +scinfo-num_vcpus = num_vcpus;
 +scinfo-vcpus = (libxl_rtds_vcpu *)
 +libxl__malloc(NOGC, sizeof(libxl_rtds_vcpu) * num_vcpus);
 +for(i = 0; i  num_vcpus; i++) {
 +scinfo-vcpus[i].period = sdom[i].period;
 +scinfo-vcpus[i].budget = sdom[i].budget;
 +}
 +
 +return 0;
 +}
 +
 +static int sched_rtds_vcpu_set(libxl__gc *gc, uint32_t domid,
 +   const libxl_vcpu_sched_params *scinfo)
 +{
 +int rc;
 +int i;
 +uint16_t num_vcpus;
 +int vcpuid;
 +uint32_t budget, period;
 +xc_dominfo_t info;
 +
 +rc = xc_domain_getinfo(CTX-xch, domid, 1, info);
 +if (rc  0) {
 +LOGE(ERROR, getting domain info);
 +return ERROR_FAIL;
 +}
 +num_vcpus = info.max_vcpu_id + 1;
 +
 +struct xen_domctl_sched_rtds_params  *sdom =
 +libxl__malloc(NOGC, scinfo-num_vcpus);
 +for (i = 0; i  scinfo-num_vcpus; i++) {
 +vcpuid = scinfo-vcpus[i].vcpuid;
 +budget = scinfo-vcpus[i].budget;
 +period = scinfo-vcpus[i].period;
 +if (vcpuid  0 || vcpuid = num_vcpus) {
 +LOG(ERROR, VCPU index is out of range, 
 +   valid values are within range from 0 to %d,
 +   num_vcpus);
 +return ERROR_INVAL;
 +}
 +sdom[i].vcpuid = vcpuid;
 +
 +rc = sched_rtds_validate_params(gc, period, budget,
 +sdom[i].period, sdom[i].budget);
 +if (rc == ERROR_INVAL)
 +return rc;
 +}
 +
 +rc = xc_sched_rtds_vcpu_set(CTX-xch, domid,
 +sdom, scinfo-num_vcpus);
 +if (rc != 0) {
 +LOGE(ERROR, setting vcpu sched rtds);
 +return ERROR_FAIL;
 +}
 +
 +return rc;
 +}
 +
  static int sched_rtds_domain_get(libxl__gc *gc, uint32_t domid,
 libxl_domain_sched_params *scinfo)
  {
 @@ -5830,29 +5944,10 @@ static int sched_rtds_domain_set(libxl__gc 

Re: [Xen-devel] [PATCH v1 2/5] tools/libxl: move domain suspend codes into a separate file

2015-06-02 Thread Ian Campbell
On Wed, 2015-05-20 at 18:01 +0800, Yang Hongyang wrote:
 diff --git a/tools/libxl/libxl_internal.h
 b/tools/libxl/libxl_internal.h
 index f86fc89..b2eeb89 100644
 --- a/tools/libxl/libxl_internal.h
 +++ b/tools/libxl/libxl_internal.h
 @@ -3191,6 +3191,12 @@ _hidden void
 libxl__domain_save_device_model(libxl__egc *egc,
  
  _hidden const char *libxl__device_model_savefile(libxl__gc *gc,
 uint32_t domid);
  
 +void libxl__domain_suspend_callback(void *data);
 +
 +/* calls dss-callback_common_done when done */
 +void libxl__domain_suspend(libxl__egc *egc,
 +   libxl__domain_suspend_state *dss);

Please mark these internal functions _hidden like the other examples in
this file.

With that the ack I just sent before I notice this stands.

Ian.



___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v2] x86/HVM: Avoid cache flush operations during hvm_load

2015-06-02 Thread Jan Beulich
 On 02.06.15 at 14:47, ross.lagerw...@citrix.com wrote:
 +void arch_hvm_load_post(struct domain *d)
 +{
 +/* Re-enable cache flushes and flush the cache. */
 +this_cpu(memory_type_changed_ignore) = 0;
 +memory_type_changed(d);
 +}

Does this really need to be done unconditionally? I.e. couldn't this be
a tristate, with memory_type_changed() e.g. flipping its sign when it
gets actually called?

Also, are we certain that memory_type_changed() will never be
called from asynchronous code (i.e. this doesn't introduce a latent,
hard to debug issue)?

Jan


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCHv10 3/4] gnttab: make the grant table lock a read-write lock

2015-06-02 Thread David Vrabel
On 29/05/15 09:31, Jan Beulich wrote:
 On 28.05.15 at 18:09, dvra...@cantab.net wrote:
 On 28/05/15 15:55, Jan Beulich wrote:
 On 26.05.15 at 20:00, david.vra...@citrix.com wrote:
 @@ -254,23 +254,23 @@ double_gt_lock(struct grant_table *lgt, struct 
 grant_table *rgt)
  {
  if ( lgt  rgt )
  {
 -spin_lock(lgt-lock);
 -spin_lock(rgt-lock);
 +write_lock(lgt-lock);
 +write_lock(rgt-lock);
  }
  else
  {
  if ( lgt != rgt )
 -spin_lock(rgt-lock);
 -spin_lock(lgt-lock);
 +write_lock(rgt-lock);
 +write_lock(lgt-lock);
  }
  }

 So I looked at the two uses of double_gt_lock() again: in both cases
 only a read lock is needed on rgt (which is also the natural thing to
 expect: we aren't supposed to modify the remote domain's grant
 table in any way here). Albeit that's contradicting ...

 See comment below.

 @@ -568,10 +568,10 @@ static void mapcount(
  *wrc = *rdc = 0;
  
  /*
 - * Must have the remote domain's grant table lock while counting
 - * its active entries.
 + * Must have the remote domain's grant table write lock while
 + * counting its active entries.
   */
 -ASSERT(spin_is_locked(rd-grant_table-lock));
 +ASSERT(rw_is_write_locked(rd-grant_table-lock));

 ... this: Why would we need to hold the write lock here? We're
 not changing anything in rd-grant_table.

 @@ -837,12 +838,22 @@ __gnttab_map_grant_ref(
  
  TRACE_1D(TRC_MEM_PAGE_GRANT_MAP, op-dom);
  
 +/*
 + * All maptrack entry users check mt-flags first before using the
 + * other fields so just ensure the flags field is stored last.
 + *
 + * However, if gnttab_need_iommu_mapping() then this would race
 + * with a concurrent mapcount() call (on an unmap, for example)
 + * and a lock is required.
 + */
  mt = maptrack_entry(lgt, handle);
  mt-domid = op-dom;
  mt-ref   = op-ref;
 -mt-flags = op-flags;
 +wmb();
 +write_atomic(mt-flags, op-flags);
 Further, why are only races against mapcount()
 a problem, but not such against __gnttab_unmap_common() as a
 whole? I.e. what's the locking around the op-map-flags and
 op-map-domid accesses below good for? Or, alternatively, isn't
 this an indication of a problem with the previous patch splitting off
 the maptrack lock (effectively leaving some map track entry
 accesses without any guard)?

 The double_gt_lock() takes both write locks, thus does not race with
 __gnttab_unmap_common clearing the flag on the maptrack entry which is
 done while holding the remote read lock.
 
 The maptrack entries are items of the local domain, i.e. the state
 of the remote domain's lock shouldn't matter there at all. Anything
 else would be extremely counterintuitive and hence prone to future
 breakage. With that the earlier two comments (above) remain un-
 addressed too.

mapcount() looks at the active entries of the remote domain and hence
these cannot change while counting, thus the write lock is required.

I cannot see how to do what you ask.

 @@ -2645,7 +2663,7 @@ __gnttab_swap_grant_ref(grant_ref_t ref_a, 
 grant_ref_t ref_b)
  struct active_grant_entry *act_b = NULL;
  s16 rc = GNTST_okay;
  
 -spin_lock(gt-lock);
 +write_lock(gt-lock);
  
  /* Bounds check on the grant refs */
  if ( unlikely(ref_a = nr_grant_entries(d-grant_table)))
 @@ -2689,7 +2707,7 @@ out:
  active_entry_release(act_b);
  if ( act_a != NULL )
  active_entry_release(act_a);
 -spin_unlock(gt-lock);
 +write_unlock(gt-lock);

 It would seem to me that these could be dropped when the per-active-
 entry locks get introduced.

 I'm not sure what you want dropped here?  We require the write lock here
 because we're taking two active entries at once.
 
 Ah, right. But couldn't the write lock then be dropped as soon as the
 two active entries got locked?

No, because at least the read lock is required for the subsequent
gt-gt_version check.  If the write lock was dropped and the read lock
acquired we would get the active entry and read lock ordering wrong.

David

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v2] x86/HVM: Avoid cache flush operations during hvm_load

2015-06-02 Thread Andrew Cooper
On 02/06/15 13:47, Ross Lagerwall wrote:
 An MTRR record is processed for each vCPU during hvm_load. Each MTRR
 record sets several mtrrs, each of which flushes the cache on all pCPUs.
 This can take some time and trip the watchdog for HVM guests with many
 CPUs.

 To fix this, introduce a flag which prevents flushing the cache on x86
 while loading the restore records and instead does a single cache flush
 at the end of hvm_load.

 This reduces the time to restore an HVM guest with 32 vCPUs by about 5
 seconds on an Intel Xeon CPU E7-2870.

 Signed-off-by: Ross Lagerwall ross.lagerw...@citrix.com
 ---

 In v2: Code moved into arch hooks since it's x86 specific.

  xen/arch/x86/hvm/mtrr.c|  5 +
  xen/arch/x86/hvm/save.c| 10 ++
  xen/common/hvm/save.c  | 15 ++-
  xen/include/asm-x86/mtrr.h |  9 +
  xen/include/xen/hvm/save.h |  1 +
  5 files changed, 35 insertions(+), 5 deletions(-)

 diff --git a/xen/arch/x86/hvm/mtrr.c b/xen/arch/x86/hvm/mtrr.c
 index a69ee62..f21b367 100644
 --- a/xen/arch/x86/hvm/mtrr.c
 +++ b/xen/arch/x86/hvm/mtrr.c
 @@ -65,6 +65,8 @@ static const uint8_t 
 mm_type_tbl[MTRR_NUM_TYPES][PAT_TYPE_NUMS] = {
  #undef RS
  };
  
 +DEFINE_PER_CPU(bool_t, memory_type_changed_ignore);

Thinking about this, this should be memory_type_changed_defer and
there should be a second bool memory_type_changed_wanted.

This way, we don't suffer a memory_type_changed() in
arch_hvm_load_post() if there were no MTRR records loaded; we support
partial loads of state via this path as well.

~Andrew

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v2] x86/HVM: Avoid cache flush operations during hvm_load

2015-06-02 Thread Ross Lagerwall
An MTRR record is processed for each vCPU during hvm_load. Each MTRR
record sets several mtrrs, each of which flushes the cache on all pCPUs.
This can take some time and trip the watchdog for HVM guests with many
CPUs.

To fix this, introduce a flag which prevents flushing the cache on x86
while loading the restore records and instead does a single cache flush
at the end of hvm_load.

This reduces the time to restore an HVM guest with 32 vCPUs by about 5
seconds on an Intel Xeon CPU E7-2870.

Signed-off-by: Ross Lagerwall ross.lagerw...@citrix.com
---

In v2: Code moved into arch hooks since it's x86 specific.

 xen/arch/x86/hvm/mtrr.c|  5 +
 xen/arch/x86/hvm/save.c| 10 ++
 xen/common/hvm/save.c  | 15 ++-
 xen/include/asm-x86/mtrr.h |  9 +
 xen/include/xen/hvm/save.h |  1 +
 5 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/xen/arch/x86/hvm/mtrr.c b/xen/arch/x86/hvm/mtrr.c
index a69ee62..f21b367 100644
--- a/xen/arch/x86/hvm/mtrr.c
+++ b/xen/arch/x86/hvm/mtrr.c
@@ -65,6 +65,8 @@ static const uint8_t 
mm_type_tbl[MTRR_NUM_TYPES][PAT_TYPE_NUMS] = {
 #undef RS
 };
 
+DEFINE_PER_CPU(bool_t, memory_type_changed_ignore);
+
 /*
  * Reverse lookup table, to find a pat type according to MTRR and effective
  * memory type. This table is dynamically generated.
@@ -789,6 +791,9 @@ HVM_REGISTER_SAVE_RESTORE(MTRR, hvm_save_mtrr_msr, 
hvm_load_mtrr_msr,
 
 void memory_type_changed(struct domain *d)
 {
+if ( this_cpu(memory_type_changed_ignore) )
+return;
+
 if ( need_iommu(d)  d-vcpu  d-vcpu[0] )
 {
 p2m_memory_type_changed(d);
diff --git a/xen/arch/x86/hvm/save.c b/xen/arch/x86/hvm/save.c
index 61f780d..b6325d0 100644
--- a/xen/arch/x86/hvm/save.c
+++ b/xen/arch/x86/hvm/save.c
@@ -76,9 +76,19 @@ int arch_hvm_load(struct domain *d, struct hvm_save_header 
*hdr)
 /* VGA state is not saved/restored, so we nobble the cache. */
 d-arch.hvm_domain.stdvga.cache = 0;
 
+/* Prevent cache flushes until after all restore records. */
+this_cpu(memory_type_changed_ignore) = 1;
+
 return 0;
 }
 
+void arch_hvm_load_post(struct domain *d)
+{
+/* Re-enable cache flushes and flush the cache. */
+this_cpu(memory_type_changed_ignore) = 0;
+memory_type_changed(d);
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/common/hvm/save.c b/xen/common/hvm/save.c
index da6e668..40ab9c0 100644
--- a/xen/common/hvm/save.c
+++ b/xen/common/hvm/save.c
@@ -206,6 +206,7 @@ int hvm_load(struct domain *d, hvm_domain_context_t *h)
 struct hvm_save_descriptor *desc;
 hvm_load_handler handler;
 struct vcpu *v;
+int ret = 0;
 
 if ( d-is_dying )
 return -EINVAL;
@@ -230,13 +231,14 @@ int hvm_load(struct domain *d, hvm_domain_context_t *h)
 printk(XENLOG_G_ERR
HVM%d restore: save did not end with a null entry\n,
d-domain_id);
-return -1;
+ret = -1;
+break;
 }
 
 /* Read the typecode of the next entry  and check for the end-marker */
 desc = (struct hvm_save_descriptor *)(h-data[h-cur]);
 if ( desc-typecode == 0 )
-return 0; 
+break;
 
 /* Find the handler for this entry */
 if ( (desc-typecode  HVM_SAVE_CODE_MAX) ||
@@ -244,7 +246,8 @@ int hvm_load(struct domain *d, hvm_domain_context_t *h)
 {
 printk(XENLOG_G_ERR HVM%d restore: unknown entry typecode %u\n,
d-domain_id, desc-typecode);
-return -1;
+ret = -1;
+break;
 }
 
 /* Load the entry */
@@ -254,11 +257,13 @@ int hvm_load(struct domain *d, hvm_domain_context_t *h)
 {
 printk(XENLOG_G_ERR HVM%d restore: failed to load entry %u/%u\n,
d-domain_id, desc-typecode, desc-instance);
-return -1;
+ret = -1;
+break;
 }
 }
 
-/* Not reached */
+arch_hvm_load_post(d);
+return ret;
 }
 
 int _hvm_init_entry(struct hvm_domain_context *h,
diff --git a/xen/include/asm-x86/mtrr.h b/xen/include/asm-x86/mtrr.h
index 0569db6..9df87cd 100644
--- a/xen/include/asm-x86/mtrr.h
+++ b/xen/include/asm-x86/mtrr.h
@@ -60,6 +60,15 @@ struct mtrr_state {
 };
 extern struct mtrr_state mtrr_state;
 
+/*
+ * The purpose of the memory_type_changed_ignore cpu flag is to
+ * avoid unnecessary cache flushes when doing multiple memory type
+ * operations that may flush the cache. Code can set this flag, do
+ * several memory type operations, clear the flag and then call
+ * memory_type_changed() to flush the cache at the end.
+ */
+DECLARE_PER_CPU(bool_t, memory_type_changed_ignore);
+
 extern void mtrr_save_fixed_ranges(void *);
 extern void mtrr_save_state(void);
 extern int mtrr_add(unsigned long base, unsigned long size,
diff --git a/xen/include/xen/hvm/save.h b/xen/include/xen/hvm/save.h
index ae6f0bb..815780b 100644
--- 

Re: [Xen-devel] [PATCH v2 for Xen 4.6 3/4] libxl: enabling XL to set per-VCPU parameters of a domain for RTDS scheduler

2015-06-02 Thread George Dunlap
On 05/26/2015 01:09 AM, Chong Li wrote:
 Add libxl_vcpu_sched_params_get/set and sched_rtds_vcpu_get/set functions to 
 support
 per-VCPU settings for RTDS scheduler.
 
 Add a new data structure (libxl_vcpu_sched_params) to help per-VCPU settings.
 
 Signed-off-by: Chong Li chong...@wustl.edu
 Signed-off-by: Meng Xu men...@cis.upenn.edu
 Signed-off-by: Sisu Xi xis...@gmail.com

This doesn't apply cleanly for me anymore -- can you refresh and resend?

Thanks,
 -George

 ---
  tools/libxl/libxl.c | 189 
 ++--
  tools/libxl/libxl.h |  19 +
  tools/libxl/libxl_types.idl |  11 +++
  3 files changed, 196 insertions(+), 23 deletions(-)
 
 diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
 index feb3aa9..169901a 100644
 --- a/tools/libxl/libxl.c
 +++ b/tools/libxl/libxl.c
 @@ -5797,6 +5797,120 @@ static int sched_sedf_domain_set(libxl__gc *gc, 
 uint32_t domid,
  return 0;
  }
  
 +static int sched_rtds_validate_params(libxl__gc *gc, uint32_t period,
 + uint32_t budget, uint32_t *sdom_period,
 + uint32_t *sdom_budget)
 +{
 +if (period != LIBXL_DOMAIN_SCHED_PARAM_PERIOD_DEFAULT) {
 +if (period  1) {
 +LOG(ERROR, VCPU period is not set or out of range, 
 +   valid values are larger than 1);
 +return ERROR_INVAL;
 +}
 +*sdom_period = period;
 +}
 +
 +if (budget != LIBXL_DOMAIN_SCHED_PARAM_BUDGET_DEFAULT) {
 +if (budget  1) {
 +LOG(ERROR, VCPU budget is not set or out of range, 
 +   valid values are larger than 1);
 +return ERROR_INVAL;
 +}
 +*sdom_budget = budget;
 +}
 +
  +if (budget > period) {
 +LOG(ERROR, VCPU budget is larger than VCPU period, 
 +   VCPU budget should be no larger than VCPU period);
 +return ERROR_INVAL;
 +}
 +
 +return 0;
 +}
 +
 +static int sched_rtds_vcpu_get(libxl__gc *gc, uint32_t domid,
 +   libxl_vcpu_sched_params *scinfo)
 +{
 +uint16_t num_vcpus;
 +int rc, i;
 +xc_dominfo_t info;
 +
 +rc = xc_domain_getinfo(CTX-xch, domid, 1, info);
 +if (rc  0) {
 +LOGE(ERROR, getting domain info);
 +return ERROR_FAIL;
 +}
 +num_vcpus = info.max_vcpu_id + 1;
 +
 +struct xen_domctl_sched_rtds_params  *sdom = libxl__malloc(NOGC,
 +sizeof(struct xen_domctl_sched_rtds_params) * num_vcpus);
 +rc = xc_sched_rtds_vcpu_get(CTX-xch, domid, sdom, num_vcpus);
 +if (rc != 0) {
 +LOGE(ERROR, getting vcpu sched rtds);
 +return ERROR_FAIL;
 +}
 +
 +libxl_vcpu_sched_params_init(scinfo);
 +
 +scinfo-sched = LIBXL_SCHEDULER_RTDS;
 +scinfo-num_vcpus = num_vcpus;
 +scinfo-vcpus = (libxl_rtds_vcpu *)
 +libxl__malloc(NOGC, sizeof(libxl_rtds_vcpu) * num_vcpus);
 +for(i = 0; i  num_vcpus; i++) {
 +scinfo-vcpus[i].period = sdom[i].period;
 +scinfo-vcpus[i].budget = sdom[i].budget;
 +}
 +
 +return 0;
 +}
 +
 +static int sched_rtds_vcpu_set(libxl__gc *gc, uint32_t domid,
 +   const libxl_vcpu_sched_params *scinfo)
 +{
 +int rc;
 +int i;
 +uint16_t num_vcpus;
 +int vcpuid;
 +uint32_t budget, period;
 +xc_dominfo_t info;
 +
 +rc = xc_domain_getinfo(CTX-xch, domid, 1, info);
 +if (rc  0) {
 +LOGE(ERROR, getting domain info);
 +return ERROR_FAIL;
 +}
 +num_vcpus = info.max_vcpu_id + 1;
 +
 +struct xen_domctl_sched_rtds_params  *sdom =
 +libxl__malloc(NOGC, scinfo-num_vcpus);
 +for (i = 0; i  scinfo-num_vcpus; i++) {
 +vcpuid = scinfo-vcpus[i].vcpuid;
 +budget = scinfo-vcpus[i].budget;
 +period = scinfo-vcpus[i].period;
 +if (vcpuid  0 || vcpuid = num_vcpus) {
 +LOG(ERROR, VCPU index is out of range, 
 +   valid values are within range from 0 to %d,
 +   num_vcpus);
 +return ERROR_INVAL;
 +}
 +sdom[i].vcpuid = vcpuid;
 +
 +rc = sched_rtds_validate_params(gc, period, budget,
 +sdom[i].period, sdom[i].budget);
 +if (rc == ERROR_INVAL)
 +return rc;
 +}
 +
 +rc = xc_sched_rtds_vcpu_set(CTX-xch, domid,
 +sdom, scinfo-num_vcpus);
 +if (rc != 0) {
 +LOGE(ERROR, setting vcpu sched rtds);
 +return ERROR_FAIL;
 +}
 +
 +return rc;
 +}
 +
  static int sched_rtds_domain_get(libxl__gc *gc, uint32_t domid,
 libxl_domain_sched_params *scinfo)
  {
 @@ -5830,29 +5944,10 @@ static int sched_rtds_domain_set(libxl__gc *gc, 
 uint32_t domid,
  return ERROR_FAIL;
  }
  
 -if (scinfo-period != LIBXL_DOMAIN_SCHED_PARAM_PERIOD_DEFAULT) {
 -if (scinfo-period  1) {
 -LOG(ERROR, VCPU period is not 

  1   2   >