[PATCH] x86/xen: make xen_pvmmu_arch_setup() static

2020-04-07 Thread Jason Yan
Fix the following sparse warning:

arch/x86/xen/setup.c:998:12: warning: symbol 'xen_pvmmu_arch_setup' was not
declared. Should it be static?

Reported-by: Hulk Robot 
Signed-off-by: Jason Yan 
---
 arch/x86/xen/setup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 33b0e20df7fc..1a2d8a50dac4 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -985,7 +985,7 @@ void xen_enable_syscall(void)
 #endif /* CONFIG_X86_64 */
 }
 
-void __init xen_pvmmu_arch_setup(void)
+static void __init xen_pvmmu_arch_setup(void)
 {
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
HYPERVISOR_vm_assist(VMASST_CMD_enable, 
VMASST_TYPE_writable_pagetables);
-- 
2.17.2




[RFC PATCH 22/26] kvm/paravirt: Encapsulate KVM pv switching logic

2020-04-07 Thread Ankur Arora
KVM pv-ops are dependent on KVM features/hints which are exposed
via CPUID. Decouple the probing and the enabling of these ops from
__init so they can be called post-init as well.
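
For illustration only (the function below is hypothetical and not part of
this patch): because probing and staging now live inside the kvm_pv_*()
helpers, the same calls can be reissued after init, for example from a
re-probe path such as the one added later in this series.

    /* Hypothetical post-init caller; name and message are illustrative. */
    static void kvm_reprobe_pv_ops(void)
    {
            bool staged = false;

            staged |= kvm_pv_io_delay();
            staged |= kvm_pv_steal_clock();
            staged |= kvm_pv_tlb();

            if (staged)
                    pr_info("KVM: re-staged pv-ops after a feature change\n");
    }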

Signed-off-by: Ankur Arora 
---
 arch/x86/Kconfig  |  1 +
 arch/x86/kernel/kvm.c | 87 ++-
 2 files changed, 63 insertions(+), 25 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 605619938f08..e0629558b6b5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -809,6 +809,7 @@ config KVM_GUEST
depends on PARAVIRT
select PARAVIRT_CLOCK
select ARCH_CPUIDLE_HALTPOLL
+   select PARAVIRT_RUNTIME
default y
---help---
  This option enables various optimizations for running under the KVM
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index e56d263159d7..31f5ecfd3907 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -262,12 +263,20 @@ do_async_page_fault(struct pt_regs *regs, unsigned long 
error_code, unsigned lon
 }
 NOKPROBE_SYMBOL(do_async_page_fault);
 
+static bool kvm_pv_io_delay(void)
+{
+   bool cond = kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY);
+
+   paravirt_stage_alt(cond, cpu.io_delay, kvm_io_delay);
+
+   return cond;
+}
+
 static void __init paravirt_ops_setup(void)
 {
pv_info.name = "KVM";
 
-   if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
-   pv_ops.cpu.io_delay = kvm_io_delay;
+   kvm_pv_io_delay();
 
 #ifdef CONFIG_X86_IO_APIC
no_timer_check = 1;
@@ -432,6 +441,15 @@ static bool pv_tlb_flush_supported(void)
kvm_para_has_feature(KVM_FEATURE_STEAL_TIME));
 }
 
+static bool kvm_pv_steal_clock(void)
+{
+   bool cond = kvm_para_has_feature(KVM_FEATURE_STEAL_TIME);
+
+   paravirt_stage_alt(cond, time.steal_clock, kvm_steal_clock);
+
+   return cond;
+}
+
 static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
 
 #ifdef CONFIG_SMP
@@ -624,6 +642,17 @@ static void kvm_flush_tlb_others(const struct cpumask 
*cpumask,
native_flush_tlb_others(flushmask, info);
 }
 
+static bool kvm_pv_tlb(void)
+{
+   bool cond = pv_tlb_flush_supported();
+
+   paravirt_stage_alt(cond, mmu.flush_tlb_others,
+  kvm_flush_tlb_others);
+   paravirt_stage_alt(cond, mmu.tlb_remove_table,
+  tlb_remove_table);
+   return cond;
+}
+
 static void __init kvm_guest_init(void)
 {
int i;
@@ -635,16 +664,11 @@ static void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
x86_init.irqs.trap_init = kvm_apf_trap_init;
 
-   if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+   if (kvm_pv_steal_clock())
has_steal_clock = 1;
-   pv_ops.time.steal_clock = kvm_steal_clock;
-   }
 
-   if (pv_tlb_flush_supported()) {
-   pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others;
-   pv_ops.mmu.tlb_remove_table = tlb_remove_table;
+   if (kvm_pv_tlb())
pr_info("KVM setup pv remote TLB flush\n");
-   }
 
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
apic_set_eoi_write(kvm_guest_apic_eoi_write);
@@ -849,33 +873,46 @@ asm(
 
 #endif
 
+static inline bool kvm_para_lock_ops(void)
+{
+   /* Does host kernel support KVM_FEATURE_PV_UNHALT? */
+   return kvm_para_has_feature(KVM_FEATURE_PV_UNHALT) &&
+   !kvm_para_has_hint(KVM_HINTS_REALTIME);
+}
+
+static bool kvm_pv_spinlock(void)
+{
+   bool cond = kvm_para_lock_ops();
+   bool preempt_cond = cond &&
+   kvm_para_has_feature(KVM_FEATURE_STEAL_TIME);
+
+   paravirt_stage_alt(cond, lock.queued_spin_lock_slowpath,
+  __pv_queued_spin_lock_slowpath);
+   paravirt_stage_alt(cond, lock.queued_spin_unlock.func,
+  PV_CALLEE_SAVE(__pv_queued_spin_unlock).func);
+   paravirt_stage_alt(cond, lock.wait, kvm_wait);
+   paravirt_stage_alt(cond, lock.kick, kvm_kick_cpu);
+
+   paravirt_stage_alt(preempt_cond,
+  lock.vcpu_is_preempted.func,
+  PV_CALLEE_SAVE(__kvm_vcpu_is_preempted).func);
+   return cond;
+}
+
 /*
  * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
  */
 void __init kvm_spinlock_init(void)
 {
-   /* Does host kernel support KVM_FEATURE_PV_UNHALT? */
-   if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
-   return;
-
-   if (kvm_para_has_hint(KVM_HINTS_REALTIME))
-   return;
 
/* Don't use the pvqspinlock code if there is only 1 vCPU. */
if (num_possible_cpus() == 1)
return;
 
-   __pv_init_lock_hash();
-   pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
-   

[RFC PATCH 24/26] x86/kvm: Support dynamic CPUID hints

2020-04-07 Thread Ankur Arora
A change in the state of a KVM hint like KVM_HINTS_REALTIME can have a
significant performance impact. Given that the hint might not be
stable across the lifetime of a guest, dynamic hints allow the host
to inform the guest when the hint changes.

Do this via the KVM CPUID leaf, reporting active hints in %ecx.  If the
guest has registered a callback via MSR_KVM_HINT_VECTOR, it is notified
of a hint change by means of a callback triggered via the vcpu ioctl
KVM_CALLBACK.
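
For illustration, a guest-side sketch of how the dynamic hint word in %ecx
could be read. It mirrors the existing kvm_arch_para_hints(), which reads
%edx; the helper name matches what a later patch in this series uses, but
the body here is an approximation, not the patch itself.

    static unsigned int kvm_arch_para_active_hints(void)
    {
            /* %ecx of the KVM CPUID features leaf: currently active hints. */
            return cpuid_ecx(kvm_cpuid_base() | KVM_CPUID_FEATURES);
    }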

Signed-off-by: Ankur Arora 
---
The callback vector is currently tied in with the hint notification
and can (should) be made more generic such that we could deliver
arbitrary callbacks on it.

One use might be for TSC frequency switching notifications support for
emulated Hyper-V guests.

---
 Documentation/virt/kvm/api.rst   | 17 
 Documentation/virt/kvm/cpuid.rst |  9 +--
 arch/x86/include/asm/kvm_host.h  |  6 +
 arch/x86/include/uapi/asm/kvm_para.h |  2 ++
 arch/x86/kvm/cpuid.c |  3 ++-
 arch/x86/kvm/x86.c   | 39 
 include/uapi/linux/kvm.h |  4 +++
 7 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index efbbe570aa9b..40a9b22d6979 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -4690,6 +4690,17 @@ KVM_PV_VM_VERIFY
   Verify the integrity of the unpacked image. Only if this succeeds,
   KVM is allowed to start protected VCPUs.
 
+4.126 KVM_CALLBACK
+------------------
+
+:Capability: KVM_CAP_CALLBACK
+:Architectures: x86
+:Type: vcpu ioctl
+:Parameters: none
+:Returns: 0 on success, -1 on error
+
+Queues a callback on the guest's vcpu if a callback has been registered.
+
 
 5. The kvm_run structure
 
@@ -6109,3 +6120,9 @@ KVM can therefore start protected VMs.
 This capability governs the KVM_S390_PV_COMMAND ioctl and the
 KVM_MP_STATE_LOAD MP_STATE. KVM_SET_MP_STATE can fail for protected
 guests when the state change is invalid.
+
+8.24 KVM_CAP_CALLBACK
+
+Architectures: x86_64
+
+This capability indicates that the ioctl KVM_CALLBACK is available.
diff --git a/Documentation/virt/kvm/cpuid.rst b/Documentation/virt/kvm/cpuid.rst
index 01b081f6e7ea..5a997c9e74c0 100644
--- a/Documentation/virt/kvm/cpuid.rst
+++ b/Documentation/virt/kvm/cpuid.rst
@@ -86,6 +86,9 @@ KVM_FEATURE_PV_SCHED_YIELD        13          guest checks this feature bit
                                               before using paravirtualized
                                               sched yield.
 
+KVM_FEATURE_DYNAMIC_HINTS         14          guest handles feature hints
+                                              changing under it.
+
 KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24         host will warn if no guest-side
                                               per-cpu warps are expected in
                                               kvmclock
@@ -93,9 +96,11 @@ KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24         host will warn if no guest-side
 
 ::
 
-  edx = an OR'ed group of (1 << flag)
+  ecx, edx = an OR'ed group of (1 << flag)
 
-Where ``flag`` here is defined as below:
+Where the ``flag`` in ecx is currently applicable hints, and ``flag`` in
+edx is the union of all hints ever provided to the guest, both drawn from
+the set listed below:
 
 ==  =
 flag   valuemeaning
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 42a2d0d3984a..4f061550274d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -723,6 +723,8 @@ struct kvm_vcpu_arch {
bool nmi_injected;/* Trying to inject an NMI this entry */
bool smi_pending;/* SMI queued after currently running handler */
 
+   bool callback_pending;  /* Callback queued after running handler */
+
struct kvm_mtrr mtrr_state;
u64 pat;
 
@@ -982,6 +984,10 @@ struct kvm_arch {
 
struct kvm_pmu_event_filter *pmu_event_filter;
struct task_struct *nx_lpage_recovery_thread;
+
+   struct {
+   u8 vector;
+   } callback;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/include/uapi/asm/kvm_para.h 
b/arch/x86/include/uapi/asm/kvm_para.h
index 2a8e0b6b9805..bf016e232f2f 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -31,6 +31,7 @@
 #define KVM_FEATURE_PV_SEND_IPI11
 #define KVM_FEATURE_POLL_CONTROL   12
 #define KVM_FEATURE_PV_SCHED_YIELD 13
+#define KVM_FEATURE_DYNAMIC_HINTS  14
 
 #define KVM_HINTS_REALTIME  0
 
@@ -50,6 +51,7 @@
 #define MSR_KVM_STEAL_TIME  0x4b564d03
 #define MSR_KVM_PV_EOI_EN  0x4b564d04
 #define MSR_KVM_POLL_CONTROL   0x4b564d05
+#define MSR_KVM_HINT_VECTOR0x4b564d06
 
 struct kvm_steal_time {
__u64 steal;
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 

[RFC PATCH 13/26] x86/alternatives: Split __text_poke()

2020-04-07 Thread Ankur Arora
Separate __text_poke() into map, memcpy and unmap portions:
__text_poke_map(), __text_do_poke() and __text_poke_unmap().

Do this to separate the non-reentrant bits from the reentrant
__text_do_poke(). __text_poke_map()/_unmap() modify poking_mm,
poking_addr and do the pte-mapping and thus are non-reentrant.

This allows __text_do_poke() to be safely called from an INT3
context with __text_poke_map()/_unmap() being called at the
start and the end of the patching of a call-site instead of
doing that for each stage of the three patching stages.
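
A sketch of the intended calling pattern (the INT3-first staging shown here
is how later patches in the series use it; offsets and variables are
illustrative): map once, issue one or more reentrant pokes, unmap once.

    local_irq_save(flags);
    __text_poke_map(addr, len, &prev_mm, &ptep);          /* not reentrant */

    /* Reentrant; safe even if an INT3/NMI context also needs to poke. */
    __text_do_poke(offset_in_page(addr), &int3, INT3_INSN_SIZE);
    __text_do_poke(offset_in_page(addr) + INT3_INSN_SIZE,
                   opcode + INT3_INSN_SIZE, len - INT3_INSN_SIZE);
    __text_do_poke(offset_in_page(addr), opcode, INT3_INSN_SIZE);

    __text_poke_unmap(addr, opcode, len, &prev_mm, ptep); /* not reentrant */
    local_irq_restore(flags);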

Signed-off-by: Ankur Arora 
---
 arch/x86/kernel/alternative.c | 46 +--
 1 file changed, 33 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 0344e49a4ade..337aad8c2521 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -805,13 +805,12 @@ void __init_or_module text_poke_early(void *addr, const 
void *opcode,
 __ro_after_init struct mm_struct *poking_mm;
 __ro_after_init unsigned long poking_addr;
 
-static void __text_poke(void *addr, const void *opcode, size_t len)
+static void __text_poke_map(void *addr, size_t len,
+   temp_mm_state_t *prev_mm, pte_t **ptep)
 {
bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
struct page *pages[2] = {NULL};
-   temp_mm_state_t prev;
-   unsigned long flags;
-   pte_t pte, *ptep;
+   pte_t pte;
pgprot_t pgprot;
 
/*
@@ -836,8 +835,6 @@ static void __text_poke(void *addr, const void *opcode, 
size_t len)
 */
BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
 
-   local_irq_save(flags);
-
/*
 * Map the page without the global bit, as TLB flushing is done with
 * flush_tlb_mm_range(), which is intended for non-global PTEs.
@@ -849,30 +846,42 @@ static void __text_poke(void *addr, const void *opcode, 
size_t len)
 * unlocked. This does mean that we need to be careful that no other
 * context (ex. INT3 handler) is simultaneously writing to this pte.
 */
-   ptep = __get_unlocked_pte(poking_mm, poking_addr);
+   *ptep = __get_unlocked_pte(poking_mm, poking_addr);
/*
 * This must not fail; preallocated in poking_init().
 */
-   VM_BUG_ON(!ptep);
+   VM_BUG_ON(!*ptep);
 
pte = mk_pte(pages[0], pgprot);
-   set_pte_at(poking_mm, poking_addr, ptep, pte);
+   set_pte_at(poking_mm, poking_addr, *ptep, pte);
 
if (cross_page_boundary) {
pte = mk_pte(pages[1], pgprot);
-   set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
+   set_pte_at(poking_mm, poking_addr + PAGE_SIZE, *ptep + 1, pte);
}
 
/*
 * Loading the temporary mm behaves as a compiler barrier, which
 * guarantees that the PTE will be set at the time memcpy() is done.
 */
-   prev = use_temporary_mm(poking_mm);
+   *prev_mm = use_temporary_mm(poking_mm);
+}
 
+/*
+ * Do the actual poke. Needs to be re-entrant as this can be called
+ * via INT3 context as well.
+ */
+static void __text_do_poke(unsigned long offset, const void *opcode, size_t 
len)
+{
kasan_disable_current();
-   memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
+   memcpy((u8 *)poking_addr + offset, opcode, len);
kasan_enable_current();
+}
 
+static void __text_poke_unmap(void *addr, const void *opcode, size_t len,
+ temp_mm_state_t *prev_mm, pte_t *ptep)
+{
+   bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
/*
 * Ensure that the PTE is only cleared after the instructions of memcpy
 * were issued by using a compiler barrier.
@@ -888,7 +897,7 @@ static void __text_poke(void *addr, const void *opcode, 
size_t len)
 * instruction that already allows the core to see the updated version.
 * Xen-PV is assumed to serialize execution in a similar manner.
 */
-   unuse_temporary_mm(prev);
+   unuse_temporary_mm(*prev_mm);
 
/*
 * Flushing the TLB might involve IPIs, which would require enabled
@@ -903,7 +912,18 @@ static void __text_poke(void *addr, const void *opcode, 
size_t len)
 * fundamentally screwy; there's nothing we can really do about that.
 */
BUG_ON(memcmp(addr, opcode, len));
+}
 
+static void __text_poke(void *addr, const void *opcode, size_t len)
+{
+   temp_mm_state_t prev_mm;
+   unsigned long flags;
+   pte_t *ptep;
+
+   local_irq_save(flags);
+   __text_poke_map(addr, len, &prev_mm, &ptep);
+   __text_do_poke(offset_in_page(addr), opcode, len);
+   __text_poke_unmap(addr, opcode, len, &prev_mm, ptep);
local_irq_restore(flags);
 }
 
-- 
2.20.1




[RFC PATCH 03/26] x86/paravirt: PVRTOP macros for PARAVIRT_RUNTIME

2020-04-07 Thread Ankur Arora
Define PVRT* macros which can be used to put pv-ops in
.parainstructions.runtime.
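
As an illustration only (this wrapper is not part of this patch), a pv-op
call wrapper built with these macros lands its patch site in
.parainstructions.runtime rather than .parainstructions, so it stays
patchable after init:

    /* Hypothetical runtime-patchable variant of pv_queued_spin_unlock(). */
    static inline void pvrt_queued_spin_unlock(struct qspinlock *lock)
    {
            PVRTOP_VCALLEE1(lock.queued_spin_unlock, lock);
    }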

Signed-off-by: Ankur Arora 
---
 arch/x86/include/asm/paravirt_types.h | 49 +++
 1 file changed, 49 insertions(+)

diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index 00e4a062ca10..f1153f53c529 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -337,6 +337,12 @@ struct paravirt_patch_template {
 extern struct pv_info pv_info;
 extern struct paravirt_patch_template pv_ops;
 
+#ifdef CONFIG_PARAVIRT_RUNTIME
+#define PVRT_SUFFIX ".runtime"
+#else
+#define PVRT_SUFFIX ""
+#endif
+
 /* Sub-section for .parainstructions */
 #define PV_SUFFIX ""
 
@@ -693,6 +699,49 @@ int paravirt_disable_iospace(void);
 #define PVOP_VCALL4(op, arg1, arg2, arg3, arg4)
\
_PVOP_VCALL4(PV_SUFFIX, op, arg1, arg2, arg3, arg4)
 
+/*
+ * PVRTOP macros for .parainstructions.runtime
+ */
+#define PVRTOP_CALL0(rettype, op)  \
+   _PVOP_CALL0(PVRT_SUFFIX, rettype, op)
+#define PVRTOP_VCALL0(op)  \
+   _PVOP_VCALL0(PVRT_SUFFIX, op)
+
+#define PVRTOP_CALLEE0(rettype, op)\
+   _PVOP_CALLEE0(PVRT_SUFFIX, rettype, op)
+#define PVRTOP_VCALLEE0(op)\
+   _PVOP_VCALLEE0(PVRT_SUFFIX, op)
+
+#define PVRTOP_CALL1(rettype, op, arg1)
\
+   _PVOP_CALL1(PVRT_SUFFIX, rettype, op, arg1)
+#define PVRTOP_VCALL1(op, arg1)
\
+   _PVOP_VCALL1(PVRT_SUFFIX, op, arg1)
+
+#define PVRTOP_CALLEE1(rettype, op, arg1)  \
+   _PVOP_CALLEE1(PVRT_SUFFIX, rettype, op, arg1)
+#define PVRTOP_VCALLEE1(op, arg1)  \
+   _PVOP_VCALLEE1(PVRT_SUFFIX, op, arg1)
+
+#define PVRTOP_CALL2(rettype, op, arg1, arg2)  \
+   _PVOP_CALL2(PVRT_SUFFIX, rettype, op, arg1, arg2)
+#define PVRTOP_VCALL2(op, arg1, arg2)  \
+   _PVOP_VCALL2(PVRT_SUFFIX, op, arg1, arg2)
+
+#define PVRTOP_CALLEE2(rettype, op, arg1, arg2)
\
+   _PVOP_CALLEE2(PVRT_SUFFIX, rettype, op, arg1, arg2)
+#define PVRTOP_VCALLEE2(op, arg1, arg2)
\
+   _PVOP_VCALLEE2(PVRT_SUFFIX, op, arg1, arg2)
+
+#define PVRTOP_CALL3(rettype, op, arg1, arg2, arg3)\
+   _PVOP_CALL3(PVRT_SUFFIX, rettype, op, arg1, arg2, arg3)
+#define PVRTOP_VCALL3(op, arg1, arg2, arg3)\
+   _PVOP_VCALL3(PVRT_SUFFIX, op, arg1, arg2, arg3)
+
+#define PVRTOP_CALL4(rettype, op, arg1, arg2, arg3, arg4)  \
+   _PVOP_CALL4(PVRT_SUFFIX, rettype, op, arg1, arg2, arg3, arg4)
+#define PVRTOP_VCALL4(op, arg1, arg2, arg3, arg4)  \
+   _PVOP_VCALL4(PVRT_SUFFIX, op, arg1, arg2, arg3, arg4)
+
 /* Lazy mode for batching updates / context switch */
 enum paravirt_lazy_mode {
PARAVIRT_LAZY_NONE,
-- 
2.20.1




[RFC PATCH 04/26] x86/alternatives: Refactor alternatives_smp_module*

2020-04-07 Thread Ankur Arora
Refactor alternatives_smp_module* logic to make it available for
holding generic late patching state.

Most of the changes are to pull the module handling logic out
from CONFIG_SMP. In addition, alternatives_smp_module_add() is now
called unconditionally and the decision whether to patch for UP is
made there.
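
For context, the core-kernel registration in alternative_instructions()
presumably ends up looking roughly like this after the refactor (simplified;
the UP-or-not decision now happens inside alternatives_smp_module_add()):

    alternatives_smp_module_add(NULL, "core kernel",
                                __smp_locks, __smp_locks_end,
                                _text, _etext);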

Signed-off-by: Ankur Arora 
---
 arch/x86/include/asm/alternative.h | 13 ++-
 arch/x86/kernel/alternative.c  | 55 --
 2 files changed, 32 insertions(+), 36 deletions(-)

diff --git a/arch/x86/include/asm/alternative.h 
b/arch/x86/include/asm/alternative.h
index 13adca37c99a..8235bbb746d9 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -75,24 +75,15 @@ extern void apply_alternatives(struct alt_instr *start, 
struct alt_instr *end);
 
 struct module;
 
-#ifdef CONFIG_SMP
 extern void alternatives_smp_module_add(struct module *mod, char *name,
void *locks, void *locks_end,
void *text, void *text_end);
 extern void alternatives_smp_module_del(struct module *mod);
-extern void alternatives_enable_smp(void);
 extern int alternatives_text_reserved(void *start, void *end);
-extern bool skip_smp_alternatives;
+#ifdef CONFIG_SMP
+extern void alternatives_enable_smp(void);
 #else
-static inline void alternatives_smp_module_add(struct module *mod, char *name,
-  void *locks, void *locks_end,
-  void *text, void *text_end) {}
-static inline void alternatives_smp_module_del(struct module *mod) {}
 static inline void alternatives_enable_smp(void) {}
-static inline int alternatives_text_reserved(void *start, void *end)
-{
-   return 0;
-}
 #endif /* CONFIG_SMP */
 
 #define b_replacement(num) "664"#num
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index fdfda1375f82..32aa1ddf441d 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -470,6 +470,13 @@ static void alternatives_smp_unlock(const s32 *start, 
const s32 *end,
}
 }
 
+static bool uniproc_patched;   /* protected by text_mutex */
+#else  /* !CONFIG_SMP */
+#define uniproc_patched false
+static inline void alternatives_smp_unlock(const s32 *start, const s32 *end,
+  u8 *text, u8 *text_end) { }
+#endif /* CONFIG_SMP */
+
 struct smp_alt_module {
/* what is this ??? */
struct module   *mod;
@@ -486,7 +493,6 @@ struct smp_alt_module {
struct list_head next;
 };
 static LIST_HEAD(smp_alt_modules);
-static bool uniproc_patched = false;   /* protected by text_mutex */
 
 void __init_or_module alternatives_smp_module_add(struct module *mod,
  char *name,
@@ -495,23 +501,27 @@ void __init_or_module alternatives_smp_module_add(struct 
module *mod,
 {
struct smp_alt_module *smp;
 
-   mutex_lock(&text_mutex);
+#ifdef CONFIG_SMP
+   /* Patch to UP if other cpus not imminent. */
+   if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1))
+   uniproc_patched = true;
+#endif
if (!uniproc_patched)
-   goto unlock;
+   return;
 
-   if (num_possible_cpus() == 1)
-   /* Don't bother remembering, we'll never have to undo it. */
-   goto smp_unlock;
+   mutex_lock(&text_mutex);
 
-   smp = kzalloc(sizeof(*smp), GFP_KERNEL);
-   if (NULL == smp)
-   /* we'll run the (safe but slow) SMP code then ... */
-   goto unlock;
+   smp = kzalloc(sizeof(*smp), GFP_KERNEL | __GFP_NOFAIL);
 
smp->mod= mod;
smp->name   = name;
-   smp->locks  = locks;
-   smp->locks_end  = locks_end;
+
+   if (num_possible_cpus() != 1 || uniproc_patched) {
+   /* Remember only if we'll need to undo it. */
+   smp->locks  = locks;
+   smp->locks_end  = locks_end;
+   }
+
smp->text   = text;
smp->text_end   = text_end;
DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
@@ -519,9 +529,9 @@ void __init_or_module alternatives_smp_module_add(struct 
module *mod,
smp->text, smp->text_end, smp->name);
 
 list_add_tail(&smp->next, &smp_alt_modules);
-smp_unlock:
-   alternatives_smp_unlock(locks, locks_end, text, text_end);
-unlock:
+
+   if (uniproc_patched)
+   alternatives_smp_unlock(locks, locks_end, text, text_end);
 mutex_unlock(&text_mutex);
 }
 
@@ -540,6 +550,7 @@ void __init_or_module alternatives_smp_module_del(struct 
module *mod)
 mutex_unlock(&text_mutex);
 }
 
+#ifdef CONFIG_SMP
 void alternatives_enable_smp(void)
 {
struct smp_alt_module *mod;
@@ -561,6 +572,7 @@ void alternatives_enable_smp(void)
}
 mutex_unlock(&text_mutex);
 }
+#endif /* CONFIG_SMP */
 
 /*
  * Return 1 

[RFC PATCH 17/26] x86/alternatives: Add patching logic in text_poke_site()

2020-04-07 Thread Ankur Arora
Add actual poking and pipeline sync logic in poke_sync(). This is called
from text_poke_site().

The patching logic is similar to that in text_poke_bp_batch() where we
patch the first byte with an INT3, which serves as a barrier, then patch
the remaining bytes and then come back and fixup the first byte.

The first and the last steps are single byte writes and are thus
atomic, and the second step is protected because the INT3 serves
as a barrier.

Between each of these steps is a global pipeline sync which ensures that
remote pipelines flush out any stale opcodes that they might have cached.
This is driven from poke_sync() where the primary introduces a sync_core()
on secondary CPUs for every PATCH_SYNC_* state change. The corresponding
loop on the secondary executes in text_poke_sync_site().

Note that breakpoints are not handled yet.

 CPU0CPUx
 

 patch_worker()  patch_worker()

   /* Traversal, insn-gen */   text_poke_sync_finish()
   tps.patch_worker()/* wait until:
 /* = paravirt_worker() */*  tps->state == PATCH_DONE
  */
  /* for each patch-site */
 generate_paravirt()
   runtime_patch()
 text_poke_site()text_poke_sync_site()
poke_sync()   /* for each:
  __text_do_poke() *  PATCH_SYNC_[012] */
  sync_one()   sync_one()
  ack()ack()
  wait_for_acks()

   ... ...

   smp_store_release(&tps->state, PATCH_DONE)
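
Condensed from the description above and from context visible in later
patches, the per-site sequence on the primary is roughly the following (the
PATCH_SYNC_2/PATCH_SYNC_DONE steps are inferred, not quoted):

    unsigned char int3 = INT3_INSN_OPCODE;

    /* 1) INT3 on the first byte; all CPUs sync_core() and ack. */
    poke_sync(tps, PATCH_SYNC_0, offset, &int3, INT3_INSN_SIZE);

    /* 2) Remaining bytes, written behind the INT3 barrier. */
    poke_sync(tps, PATCH_SYNC_1, offset + INT3_INSN_SIZE,
              tp->text + INT3_INSN_SIZE, tp->native.len - INT3_INSN_SIZE);

    /* 3) Fix up the first byte. */
    poke_sync(tps, PATCH_SYNC_2, offset, tp->text, INT3_INSN_SIZE);

    /* Mark the site done; no bytes to write for this transition. */
    poke_sync(tps, PATCH_SYNC_DONE, 0, NULL, 0);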

Signed-off-by: Ankur Arora 
---
 arch/x86/kernel/alternative.c | 103 +++---
 1 file changed, 95 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 1c5acdc4f349..7fdaae9edbf0 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1441,27 +1441,57 @@ struct text_poke_state {
 
 static struct text_poke_state text_poke_state;
 
+static void wait_for_acks(struct text_poke_state *tps)
+{
+   int cpu = smp_processor_id();
+
+   cpumask_set_cpu(cpu, &tps->sync_ack_map);
+
+   /* Wait until all CPUs are known to have observed the state change. */
+   while (cpumask_weight(&tps->sync_ack_map) < tps->num_acks)
+   cpu_relax();
+}
+
 /**
- * poke_sync() - transitions to the specified state.
+ * poke_sync() - carries out one poke-step for a single site and
+ * transitions to the specified state.
+ * Called with the target populated in poking_mm and poking_addr.
  *
  * @tps - struct text_poke_state *
  * @state - one of PATCH_SYNC_* states
  * @offset - offset to be patched
  * @insns - insns to write
  * @len - length of insn sequence
+ *
+ * Returns after all CPUs have observed the state change and called
+ * sync_core().
  */
 static void poke_sync(struct text_poke_state *tps, int state, int offset,
  const char *insns, int len)
 {
+   if (len)
+   __text_do_poke(offset, insns, len);
/*
-* STUB: no patching or synchronization, just go through the
-* motions.
+* Stores to tps.sync_ack_map are ordered with
+* smp_load_acquire(tps->state) in text_poke_sync_site()
+* so we can safely clear the cpumask.
 */
smp_store_release(&tps->state, state);
+
+   cpumask_clear(&tps->sync_ack_map);
+
+   /*
+* Introduce a synchronizing instruction in local and remote insn
+* streams. This flushes any stale cached uops from CPU pipelines.
+*/
+   sync_one();
+
+   wait_for_acks(tps);
 }
 
 /**
  * text_poke_site() - called on the primary to patch a single call site.
+ * The interlocking sync work on the secondary is done in 
text_poke_sync_site().
  *
  * Called in thread context with tps->state == PATCH_SYNC_DONE where it
  * takes tps->state through different PATCH_SYNC_* states, returning
@@ -1514,6 +1544,43 @@ static void __maybe_unused text_poke_site(struct 
text_poke_state *tps,
  &prev_mm, ptep);
 }
 
+/**
+ * text_poke_sync_site() -- called to synchronize the CPU pipeline
+ * on secondary CPUs for each patch site.
+ *
+ * Called in thread context with tps->state == PATCH_SYNC_0.
+ *
+ * Returns after having observed tps->state == PATCH_SYNC_DONE.
+ */
+static void text_poke_sync_site(struct text_poke_state *tps)
+{
+   int cpu = smp_processor_id();
+   int prevstate = -1;
+   int acked;
+
+   /*
+* In thread context we arrive here expecting tps->state to move
+* in-order from PATCH_SYNC_{0 -> 1 -> 2} -> PATCH_SYNC_DONE.
+*/
+   do {
+   /*
+* Wait until there's some work for us to do.
+*/
+   smp_cond_load_acquire(&tps->state,
+ 

[RFC PATCH 19/26] x86/alternatives: NMI safe runtime patching

2020-04-07 Thread Ankur Arora
Runtime patching can deadlock with multiple simultaneous NMIs.
This can happen while patching inter-dependent pv-ops which are
used in the NMI path (ex. pv_lock_ops):

 CPU0   CPUx
    

 patch_worker() patch_worker()

   /* Traversal, insn-gen */  text_poke_sync_finish()
   tps.patch_worker()   /* wait until:
 /* = paravirt_worker() */   *  tps->state == PATCH_DONE
 */
   /* start-patching:lock.spin_unlock */
  generate_paravirt()
runtime_patch()

  text_poke_site()  text_poke_sync_site()
poke_sync()  /* for state in:
  __text_do_poke()*  PATCH_SYNC_[012]
  ==NMI== */
 ==NMI==
 tries-to-acquire:nmi_lock   acquires:nmi_lock
 tries-to-release:nmi_lock
 ==BP==
 text_poke_sync_site()

  /* waiting-for:nmi_lock *//* waiting-for:patched-spin_unlock() */

A similar deadlock exists if two secondary CPUs get an NMI as well.

Fix this by patching NMI-unsafe ops in an NMI context. Given that the
NMI entry code ensures that NMIs do not nest, we are guaranteed that
this can be done atomically.

We do this by registering a local NMI handler (text_poke_nmi()) and
triggering a local NMI on the primary (via patch_worker_nmi()) which
then calls the same worker (tps->patch_worker()) as in thread-context.

On the secondary, we continue with the pipeline sync loop (via
text_poke_sync_finish()) in thread-context; however, if there is an
NMI on the secondary, we call text_poke_sync_finish() in the handler
which continues the work that was being done in thread-context.

Also note that text_poke_nmi() always executes first so we know that
it takes priority over any arbitrary code executing in the installed
NMI handlers.

 CPU0CPUx
 

 patch_worker(nmi=true)  patch_worker(nmi=true)

   patch_worker_nmi() -> triggers NMI   text_poke_sync_finish()
   /* wait for return from NMI */ /* wait until:
...*  tps->state == PATCH_DONE
   */

   smp_store_release(&tps->state,
 PATCH_DONE)
  /* for each patch-site */

  text_poke_sync_site()
 CPU0-NMI /* for each:
   *  PATCH_SYNC_[012]
   */
 text_poke_nmi()sync_one()
   /* Traversal, insn-gen */ack()
   tps.patch_worker()
   /* = paravirt_worker() */...

   /* for each patch-site */

 generate_paravirt()
   runtime_patch()

 text_poke_site()
   poke_sync()
 __text_do_poke()
 sync_one()
 ack()
 wait_for_acks()

  ...
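
A rough sketch of the self-NMI mechanism described above. The function
names come from the commit message; the bodies, the NMI_FLAG_FIRST usage
and the return values are assumptions, since the diff below is truncated.

    static int text_poke_nmi(unsigned int cmd, struct pt_regs *regs)
    {
            struct text_poke_state *tps = &text_poke_state;

            if (smp_processor_id() == tps->primary_cpu) {
                    /* Primary: run the whole patching worker from NMI. */
                    tps->patch_worker(tps);
                    return NMI_HANDLED;
            }

            /* Secondary: keep the sync state machine moving, then let any
             * other installed NMI handlers run. */
            text_poke_sync_finish(tps);
            return NMI_DONE;
    }

    static void patch_worker_nmi(void)
    {
            /* Registered first so it takes priority over other handlers. */
            register_nmi_handler(NMI_LOCAL, text_poke_nmi, NMI_FLAG_FIRST,
                                 "text_poke_nmi");
            apic->send_IPI_self(NMI_VECTOR);
            unregister_nmi_handler(NMI_LOCAL, "text_poke_nmi");
    }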


Signed-off-by: Ankur Arora 
---
 arch/x86/include/asm/text-patching.h |   2 +-
 arch/x86/kernel/alternative.c| 120 ++-
 2 files changed, 117 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/text-patching.h 
b/arch/x86/include/asm/text-patching.h
index e86709a8287e..9ba329bf9479 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -22,7 +22,7 @@ static inline void apply_paravirt(struct paravirt_patch_site 
*start,
 #define __parainstructions_runtime NULL
 #define __parainstructions_runtime_end NULL
 #else
-int paravirt_runtime_patch(void);
+int paravirt_runtime_patch(bool nmi);
 #endif
 
 /*
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index c68d940356a2..385c3e6ea925 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1442,6 +1442,14 @@ struct text_poke_state {
 
unsigned int primary_cpu; /* CPU doing the patching. */
unsigned int num_acks; /* Number of Acks needed. */
+
+   /*
+* To synchronize with the NMI handler.
+*/
+   atomic_t nmi_work;
+
+   /* Ensure this is patched atomically against NMIs. */
+   bool nmi_context;
 };
 
 static struct text_poke_state text_poke_state;
@@ -1715,6 +1723,7 @@ static void poke_int3_native(struct pt_regs *regs,
  * on secondary CPUs for all patch sites.
  *
  * Called in thread context with tps->state == PATCH_SYNC_DONE.
+ * Also might be called from NMI context with an arbitrary tps->state.
  * Returns with tps->state == PATCH_DONE.
  */
 static void text_poke_sync_finish(struct text_poke_state *tps)
@@ -1741,6 +1750,12 @@ static void text_poke_sync_finish(struct text_poke_state 
*tps)
  

[RFC PATCH 25/26] x86/kvm: Guest support for dynamic hints

2020-04-07 Thread Ankur Arora
If the hypervisor supports KVM_FEATURE_DYNAMIC_HINTS, then register a
callback vector (currently chosen to be HYPERVISOR_CALLBACK_VECTOR.)
The callback triggers on a change in the active hints, which are
exported via KVM CPUID in %ecx.

Trigger re-evaluation of KVM_HINTS based on change in their active
status.
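
For reference, a sketch (assumed, mirroring the existing kvm_para_has_hint())
of the active-hint test that the pv-op probes below switch to:

    static inline bool kvm_para_has_active_hint(unsigned int feature)
    {
            /* The %ecx word of the KVM CPUID features leaf: active hints. */
            return !!(kvm_arch_para_active_hints() & (1UL << feature));
    }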

Signed-off-by: Ankur Arora 
---
 arch/x86/Kconfig|  1 +
 arch/x86/entry/entry_64.S   |  5 +++
 arch/x86/include/asm/kvm_para.h |  7 
 arch/x86/kernel/kvm.c   | 58 ++---
 include/asm-generic/kvm_para.h  |  4 +++
 include/linux/kvm_para.h|  5 +++
 6 files changed, 76 insertions(+), 4 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e0629558b6b5..23b239d184fc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -810,6 +810,7 @@ config KVM_GUEST
select PARAVIRT_CLOCK
select ARCH_CPUIDLE_HALTPOLL
select PARAVIRT_RUNTIME
+   select X86_HV_CALLBACK_VECTOR
default y
---help---
  This option enables various optimizations for running under the KVM
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 0e9504fabe52..96b2a243c54f 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1190,6 +1190,11 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
acrn_hv_callback_vector acrn_hv_vector_handler
 #endif
 
+#if IS_ENABLED(CONFIG_KVM_GUEST)
+apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
+   kvm_callback_vector kvm_do_callback
+#endif
+
 idtentry debug			do_debug		has_error_code=0	paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET
 idtentry int3			do_int3			has_error_code=0	create_gap=1
 idtentry stack_segment		do_stack_segment	has_error_code=1
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 9b4df6eaa11a..5a7ca5639c2e 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -88,11 +88,13 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned 
long p1,
 bool kvm_para_available(void);
 unsigned int kvm_arch_para_features(void);
 unsigned int kvm_arch_para_hints(void);
+unsigned int kvm_arch_para_active_hints(void);
 void kvm_async_pf_task_wait(u32 token, int interrupt_kernel);
 void kvm_async_pf_task_wake(u32 token);
 u32 kvm_read_and_reset_pf_reason(void);
 extern void kvm_disable_steal_time(void);
 void do_async_page_fault(struct pt_regs *regs, unsigned long error_code, 
unsigned long address);
+void kvm_callback_vector(struct pt_regs *regs);
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 void __init kvm_spinlock_init(void);
@@ -121,6 +123,11 @@ static inline unsigned int kvm_arch_para_hints(void)
return 0;
 }
 
+static inline unsigned int kvm_arch_para_active_hints(void)
+{
+   return 0;
+}
+
 static inline u32 kvm_read_and_reset_pf_reason(void)
 {
return 0;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 1cb7eab805a6..163b7a7ec5f9 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -25,6 +25,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -438,7 +440,7 @@ static void __init sev_map_percpu_data(void)
 static bool pv_tlb_flush_supported(void)
 {
return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
-   !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
+   !kvm_para_has_active_hint(KVM_HINTS_REALTIME) &&
kvm_para_has_feature(KVM_FEATURE_STEAL_TIME));
 }
 
@@ -463,7 +465,7 @@ static bool pv_ipi_supported(void)
 static bool pv_sched_yield_supported(void)
 {
return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) &&
-   !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
+   !kvm_para_has_active_hint(KVM_HINTS_REALTIME) &&
kvm_para_has_feature(KVM_FEATURE_STEAL_TIME));
 }
 
@@ -568,7 +570,7 @@ static void kvm_smp_send_call_func_ipi(const struct cpumask 
*mask)
 static void __init kvm_smp_prepare_cpus(unsigned int max_cpus)
 {
native_smp_prepare_cpus(max_cpus);
-   if (kvm_para_has_hint(KVM_HINTS_REALTIME))
+   if (kvm_para_has_active_hint(KVM_HINTS_REALTIME))
static_branch_disable(&virt_spin_lock_key);
 }
 
@@ -654,6 +656,13 @@ static bool kvm_pv_tlb(void)
return cond;
 }
 
+#ifdef CONFIG_PARAVIRT_RUNTIME
+static bool has_dynamic_hint;
+static void __init kvm_register_callback_vector(void);
+#else
+#define has_dynamic_hint false
+#endif /* CONFIG_PARAVIRT_RUNTIME */
+
 static void __init kvm_guest_init(void)
 {
int i;
@@ -674,6 +683,12 @@ static void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
apic_set_eoi_write(kvm_guest_apic_eoi_write);
 
+   if (IS_ENABLED(CONFIG_PARAVIRT_RUNTIME) &&
+   kvm_para_has_feature(KVM_FEATURE_DYNAMIC_HINTS)) {
+   kvm_register_callback_vector();
+   

[RFC PATCH 26/26] x86/kvm: Add hint change notifier for KVM_HINT_REALTIME

2020-04-07 Thread Ankur Arora
Add a blocking notifier that triggers when the host sends a hint
change notification.
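
A hypothetical consumer of the new notifier (handler name and message are
illustrative, not part of this patch):

    static int realtime_hint_changed(struct notifier_block *nb,
                                     unsigned long action, void *data)
    {
            pr_info("KVM_HINTS_REALTIME now %sactive\n",
                    kvm_para_has_active_hint(KVM_HINTS_REALTIME) ? "" : "in");
            return NOTIFY_OK;
    }

    static struct notifier_block realtime_hint_nb = {
            .notifier_call = realtime_hint_changed,
    };

    /* ... at init time ... */
    kvm_realtime_notifier_register(&realtime_hint_nb);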

Suggested-by: Joao Martins 
Signed-off-by: Ankur Arora 
---
 arch/x86/include/asm/kvm_para.h | 10 ++
 arch/x86/kernel/kvm.c   | 16 
 include/asm-generic/kvm_para.h  |  8 
 3 files changed, 34 insertions(+)

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 5a7ca5639c2e..54c3c7a3225e 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -2,6 +2,7 @@
 #ifndef _ASM_X86_KVM_PARA_H
 #define _ASM_X86_KVM_PARA_H
 
+#include 
 #include 
 #include 
 #include 
@@ -96,6 +97,9 @@ extern void kvm_disable_steal_time(void);
 void do_async_page_fault(struct pt_regs *regs, unsigned long error_code, 
unsigned long address);
 void kvm_callback_vector(struct pt_regs *regs);
 
+void kvm_realtime_notifier_register(struct notifier_block *nb);
+void kvm_realtime_notifier_unregister(struct notifier_block *nb);
+
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 void __init kvm_spinlock_init(void);
 #else /* !CONFIG_PARAVIRT_SPINLOCKS */
@@ -137,6 +141,14 @@ static inline void kvm_disable_steal_time(void)
 {
return;
 }
+
+static inline void kvm_realtime_notifier_register(struct notifier_block *nb)
+{
+}
+
+static inline void kvm_realtime_notifier_unregister(struct notifier_block *nb)
+{
+}
 #endif
 
 #endif /* _ASM_X86_KVM_PARA_H */
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 163b7a7ec5f9..35ba4a837027 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -951,6 +951,20 @@ void __init kvm_spinlock_init(void)
 static inline bool kvm_pv_spinlock(void) { return false; }
 #endif /* CONFIG_PARAVIRT_SPINLOCKS */
 
+static BLOCKING_NOTIFIER_HEAD(realtime_notifier);
+
+void kvm_realtime_notifier_register(struct notifier_block *nb)
+{
+   blocking_notifier_chain_register(&realtime_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(kvm_realtime_notifier_register);
+
+void kvm_realtime_notifier_unregister(struct notifier_block *nb)
+{
+   blocking_notifier_chain_unregister(&realtime_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(kvm_realtime_notifier_unregister);
+
 #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
 
 static void kvm_disable_host_haltpoll(void *i)
@@ -1004,6 +1018,8 @@ void kvm_trigger_reprobe_cpuid(struct work_struct *work)
paravirt_runtime_patch(true);
 
mutex_unlock(&text_mutex);
+
+   blocking_notifier_call_chain(&realtime_notifier, 0, NULL);
 }
 
 static DECLARE_WORK(trigger_reprobe, kvm_trigger_reprobe_cpuid);
diff --git a/include/asm-generic/kvm_para.h b/include/asm-generic/kvm_para.h
index 4a575299ad62..d443531b49ac 100644
--- a/include/asm-generic/kvm_para.h
+++ b/include/asm-generic/kvm_para.h
@@ -33,4 +33,12 @@ static inline bool kvm_para_available(void)
return false;
 }
 
+static inline void kvm_realtime_notifier_register(struct notifier_block *nb)
+{
+}
+
+static inline void kvm_realtime_notifier_unregister(struct notifier_block *nb)
+{
+}
+
 #endif
-- 
2.20.1




[RFC PATCH 18/26] x86/alternatives: Handle BP in non-emulated text poking

2020-04-07 Thread Ankur Arora
Handle breakpoints if we hit an INT3 either by way of an NMI while
patching a site in the NMI handling path, or if we are patching text
in text_poke_site() (executes on the primary), or in the pipeline sync
path in text_poke_sync_site() (executes on secondary CPUs.)
(The last two are not expected to happen, but see below.)

The handling on the primary CPU is to update the insn stream locally
such that we can return to the primary patching loop but not force
the secondary CPUs to execute sync_core().

From my reading of the Intel spec and the thread which laid down the
INT3 approach, https://lore.kernel.org/lkml/4b4d02b8.5020...@zytor.com,
skipping the sync_core() would mean that remote pipelines -- if they
have relevant uops cached -- would not see the updated instruction and
would continue to execute stale uops.

This is safe because the primary eventually gets back to the patching
loop in text_poke_site() and resumes the state-machine, re-writing
some of the insn sequences just written in the BP handling and forcing
the secondary CPUs to execute sync_core().

The handling on the secondary is to call text_poke_sync_site() just as
in thread-context; this includes acking the patch states so that the
primary can continue making forward progress. This can be called in a
re-entrant fashion.

Note that this does mean that we cannot handle any patches in
text_poke_sync_site() itself since that would end up being called
recursively in the BP handler.

Control flow diagram with the BP handler:

 CPU0-BP CPUx-BP
 --- ---

 poke_int3_native()  poke_int3_native()
   __text_do_poke()text_poke_sync_site()
   sync_one()   /* for state in:
 *  [PATCH_SYNC_y.._SYNC_DONE) */
 sync_one()
 ack()


 CPU0CPUx
 

 patch_worker()  patch_worker()

   /* Traversal, insn-gen */   text_poke_sync_finish()
   tps.patch_worker()/* wait until:
 /* = paravirt_worker() */*  tps->state == PATCH_DONE
  */
 /* for each patch-site */
 generate_paravirt()
   runtime_patch()
 text_poke_site()text_poke_sync_site()
poke_sync()   /* for state in:
  __text_do_poke() *  [PATCH_SYNC_0..PATCH_SYNC_y]
  sync_one()   */
  ack()sync_one()
  wait_for_acks()  ack()

   ... ...

   smp_store_release(&tps->state, PATCH_DONE)
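
A condensed sketch of what the description implies for the native BP path;
the real poke_int3_native() is in the (truncated) diff below, and details
such as the instruction-pointer fixup are assumptions here.

    static void poke_int3_native(struct pt_regs *regs,
                                 struct text_poke_loc *tp)
    {
            struct text_poke_state *tps = &text_poke_state;

            if (smp_processor_id() == tps->primary_cpu) {
                    /* Finish the write locally; no remote sync_core(). */
                    __text_do_poke(offset_in_page(text_poke_addr(tp)),
                                   tp->text, tp->native.len);
            } else {
                    /* Ack the sync states so the primary keeps moving. */
                    text_poke_sync_site(tps);
            }

            /* Resume at the (now locally rewritten) instruction. */
            regs->ip -= INT3_INSN_SIZE;
    }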

Signed-off-by: Ankur Arora 
---
 arch/x86/kernel/alternative.c | 145 --
 1 file changed, 137 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 7fdaae9edbf0..c68d940356a2 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1055,6 +1055,8 @@ static int notrace patch_cmp(const void *key, const void 
*elt)
 }
 NOKPROBE_SYMBOL(patch_cmp);
 
+static void poke_int3_native(struct pt_regs *regs,
+struct text_poke_loc *tp);
 int notrace poke_int3_handler(struct pt_regs *regs)
 {
struct bp_patching_desc *desc;
@@ -1099,8 +1101,11 @@ int notrace poke_int3_handler(struct pt_regs *regs)
goto out_put;
}
 
-   if (desc->native)
-   BUG();
+   if (desc->native) {
+   poke_int3_native(regs, tp);
+   ret = 1; /* handled */
+   goto out_put;
+   }
 
len = text_opcode_size(tp->emulated.opcode);
ip += len;
@@ -1469,8 +1474,15 @@ static void wait_for_acks(struct text_poke_state *tps)
 static void poke_sync(struct text_poke_state *tps, int state, int offset,
  const char *insns, int len)
 {
-   if (len)
+   if (len) {
+   /*
+* Note that we could hit a BP right after patching memory
+* below. This could happen before the state change further
+* down. The primary BP handler allows us to make
+* forward-progress in that case.
+*/
__text_do_poke(offset, insns, len);
+   }
/*
 * Stores to tps.sync_ack_map are ordered with
 * smp_load_acquire(tps->state) in text_poke_sync_site()
@@ -1504,11 +1516,22 @@ static void __maybe_unused text_poke_site(struct 
text_poke_state *tps,
temp_mm_state_t prev_mm;
pte_t *ptep;
int offset;
+   struct bp_patching_desc desc = {
+   .vec = tp,
+   .nr_entries = 1,
+   .native = true,
+   .refs = 

[RFC PATCH 16/26] x86/alternatives: Add paravirt patching at runtime

2020-04-07 Thread Ankur Arora
Add paravirt_patch_runtime() which uses text_poke_late() to patch
paravirt sites.

Also add paravirt_worker(), which drives the insn generation via
generate_paravirt() (which in turn uses runtime_patch() to generate the
appropriate native or paravirt insn sequences) and then calls
text_poke_site() to do the actual poking.

 CPU0CPUx
 

 patch_worker()  patch_worker()

   /* Traversal, insn-gen */   text_poke_sync_finish()
   tps.patch_worker()
 /* = paravirt_worker() */ /*
* wait until:
 /* for each patch-site */  *  tps->state == PATCH_DONE
 generate_paravirt()*/
   runtime_patch()
 text_poke_site()
   poke_sync()

   ... ...

   smp_store_release(&tps->state, PATCH_DONE)
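
A usage sketch, simplified from the KVM worker added later in the series
(the later-added nmi argument to paravirt_runtime_patch() is omitted here):
callers stage the desired pv-ops and then have the staged set patched into
the live kernel text.

    mutex_lock(&text_mutex);

    paravirt_stage_zero();          /* reset the staging area           */
    kvm_pv_tlb();                   /* stage ops whose conditions hold  */
    paravirt_runtime_patch();       /* poke the staged ops into text    */

    mutex_unlock(&text_mutex);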

Signed-off-by: Ankur Arora 
---
 arch/x86/include/asm/text-patching.h |  2 +
 arch/x86/kernel/alternative.c| 98 +++-
 2 files changed, 99 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/text-patching.h 
b/arch/x86/include/asm/text-patching.h
index c4b2814f2f9d..e86709a8287e 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -21,6 +21,8 @@ static inline void apply_paravirt(struct paravirt_patch_site 
*start,
 #ifndef CONFIG_PARAVIRT_RUNTIME
 #define __parainstructions_runtime NULL
 #define __parainstructions_runtime_end NULL
+#else
+int paravirt_runtime_patch(void);
 #endif
 
 /*
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 452d4081eded..1c5acdc4f349 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1463,7 +1463,9 @@ static void poke_sync(struct text_poke_state *tps, int 
state, int offset,
 /**
  * text_poke_site() - called on the primary to patch a single call site.
  *
- * Returns after switching tps->state to PATCH_SYNC_DONE.
+ * Called in thread context with tps->state == PATCH_SYNC_DONE where it
+ * takes tps->state through different PATCH_SYNC_* states, returning
+ * after having switched the tps->state back to PATCH_SYNC_DONE.
  */
 static void __maybe_unused text_poke_site(struct text_poke_state *tps,
  struct text_poke_loc *tp)
@@ -1598,6 +1600,16 @@ static int __maybe_unused text_poke_late(patch_worker_t 
worker, void *stage)
return ret;
 }
 
+/*
+ * Check if this address is still in scope of this module's .text section.
+ */
+static bool __maybe_unused stale_address(struct alt_module *am, u8 *p)
+{
+   if (p < am->text || p >= am->text_end)
+   return true;
+   return false;
+}
+
 #ifdef CONFIG_PARAVIRT_RUNTIME
 struct paravirt_stage_entry {
void *dest; /* pv_op destination */
@@ -1654,4 +1666,88 @@ void text_poke_pv_stage_zero(void)
lockdep_assert_held(&text_mutex);
pv_stage.count = 0;
 }
+
+/**
+ * generate_paravirt - fill up the insn sequence for a pv-op.
+ *
+ * @tp - address of struct text_poke_loc
+ * @op - the pv-op entry for this location
+ * @site - patch site (kernel or module text)
+ */
+static void generate_paravirt(struct text_poke_loc *tp,
+ struct paravirt_stage_entry *op,
+ struct paravirt_patch_site *site)
+{
+   unsigned int used;
+
+   BUG_ON(site->len > POKE_MAX_OPCODE_SIZE);
+
+   text_poke_loc_init(tp, site->instr, site->instr, site->len, NULL, true);
+
+   /*
+* Paravirt patches can patch calls (ex. mmu.tlb_flush),
+* callee_saves(ex. queued_spin_unlock).
+*
+* runtime_patch() calls native_patch(), or paravirt_patch()
+* based on the destination.
+*/
+   used = runtime_patch(site->type, (void *)tp->text, op->dest,
+(unsigned long)site->instr, site->len);
+
+   /* No good way to recover. */
+   BUG_ON(used < 0);
+
+   /* Pad the rest with nops */
+   add_nops((void *)tp->text + used, site->len - used);
+}
+
+/**
+ * paravirt_worker - generate the paravirt patching
+ * insns and calls text_poke_site() to do the actual patching.
+ */
+static void paravirt_worker(struct text_poke_state *tps)
+{
+   struct paravirt_patch_site *site;
+   struct paravirt_stage *stage = tps->stage;
+   struct paravirt_stage_entry *op = &stage->ops[0];
+   struct alt_module *am;
+   struct text_poke_loc tp;
+   int i;
+
+   list_for_each_entry(am, tps->head, next) {
+   for (site = am->para; site < am->para_end; site++) {
+   if (stale_address(am, site->instr))
+   continue;
+
+   for (i = 0;  i < stage->count; i++) {
+   if (op[i].type != site->type)
+   continue;
+
+   generate_paravirt(&tp, &op[i], site);

[RFC PATCH 23/26] x86/kvm: Add worker to trigger runtime patching

2020-04-07 Thread Ankur Arora
Make __pv_init_lock_hash() conditional on paravirt spinlocks either
being enabled now (via kvm_pv_spinlock()) or possibly getting enabled
later (runtime patching via CONFIG_PARAVIRT_RUNTIME).

Also add a handler for CPUID reprobe which can trigger this patching.
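
For reference, the handler is meant to be driven from a workqueue; a later
patch declares the work item, and the (hypothetical) notification path then
only needs something like:

    static DECLARE_WORK(trigger_reprobe, kvm_trigger_reprobe_cpuid);

    /* e.g. from the hint-change callback: */
    schedule_work(&trigger_reprobe);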

Signed-off-by: Ankur Arora 
---
 arch/x86/kernel/kvm.c | 34 +-
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 31f5ecfd3907..1cb7eab805a6 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static int kvmapf = 1;
 
@@ -909,12 +910,15 @@ void __init kvm_spinlock_init(void)
if (num_possible_cpus() == 1)
return;
 
-   if (!kvm_pv_spinlock())
-   return;
-
-   __pv_init_lock_hash();
+   /*
+* Allocate if pv_spinlocks are enabled or if we might
+* end up patching them in later.
+*/
+   if (kvm_pv_spinlock() || IS_ENABLED(CONFIG_PARAVIRT_RUNTIME))
+   __pv_init_lock_hash();
 }
-
+#else  /* !CONFIG_PARAVIRT_SPINLOCKS */
+static inline bool kvm_pv_spinlock(void) { return false; }
 #endif /* CONFIG_PARAVIRT_SPINLOCKS */
 
 #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
@@ -952,3 +956,23 @@ void arch_haltpoll_disable(unsigned int cpu)
 }
 EXPORT_SYMBOL_GPL(arch_haltpoll_disable);
 #endif
+
+#ifdef CONFIG_PARAVIRT_RUNTIME
+void kvm_trigger_reprobe_cpuid(struct work_struct *work)
+{
+   mutex_lock(&text_mutex);
+
+   paravirt_stage_zero();
+
+   kvm_pv_steal_clock();
+   kvm_pv_tlb();
+   paravirt_runtime_patch(false);
+
+   paravirt_stage_zero();
+
+   kvm_pv_spinlock();
+   paravirt_runtime_patch(true);
+
+   mutex_unlock(&text_mutex);
+}
+#endif /* CONFIG_PARAVIRT_RUNTIME */
-- 
2.20.1




[RFC PATCH 12/26] x86/alternatives: Use __get_unlocked_pte() in text_poke()

2020-04-07 Thread Ankur Arora
text_poke() uses get_locked_pte() to map poking_addr. However, this
introduces a dependency on locking code which precludes using
text_poke() to modify qspinlock primitives.

Accesses to this pte (and poking_addr) are protected by text_mutex
so we can safely switch to __get_unlocked_pte() here. Note that
we do need to be careful that we do not try to modify the poking_addr
from multiple contexts simultaneously (ex. INT3 or NMI context.)

Signed-off-by: Ankur Arora 
---
 arch/x86/kernel/alternative.c |  9 -
 include/linux/mm.h| 16 ++--
 mm/memory.c   |  9 ++---
 3 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 8c79a3dc5e72..0344e49a4ade 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -812,7 +812,6 @@ static void __text_poke(void *addr, const void *opcode, 
size_t len)
temp_mm_state_t prev;
unsigned long flags;
pte_t pte, *ptep;
-   spinlock_t *ptl;
pgprot_t pgprot;
 
/*
@@ -846,10 +845,11 @@ static void __text_poke(void *addr, const void *opcode, 
size_t len)
pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
 
/*
-* The lock is not really needed, but this allows to avoid open-coding.
+* text_poke() might be used to poke spinlock primitives so do this
+* unlocked. This does mean that we need to be careful that no other
+* context (ex. INT3 handler) is simultaneously writing to this pte.
 */
-   ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
-
+   ptep = __get_unlocked_pte(poking_mm, poking_addr);
/*
 * This must not fail; preallocated in poking_init().
 */
@@ -904,7 +904,6 @@ static void __text_poke(void *addr, const void *opcode, 
size_t len)
 */
BUG_ON(memcmp(addr, opcode, len));
 
-   pte_unmap_unlock(ptep, ptl);
local_irq_restore(flags);
 }
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7dd5c4ccbf85..d4a652c2e269 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1895,8 +1895,20 @@ static inline int pte_devmap(pte_t pte)
 
 int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
 
-extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
-  spinlock_t **ptl);
+pte_t *__get_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl);
+
+static inline pte_t *__get_unlocked_pte(struct mm_struct *mm,
+   unsigned long addr)
+{
+   return __get_pte(mm, addr, NULL);
+}
+
+static inline pte_t *__get_locked_pte(struct mm_struct *mm,
+ unsigned long addr, spinlock_t **ptl)
+{
+   return __get_pte(mm, addr, ptl);
+}
+
 static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
spinlock_t **ptl)
 {
diff --git a/mm/memory.c b/mm/memory.c
index 586271f3efc6..7acfe9512084 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1407,8 +1407,8 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned 
long address,
 }
 EXPORT_SYMBOL_GPL(zap_vma_ptes);
 
-pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
-   spinlock_t **ptl)
+pte_t *__get_pte(struct mm_struct *mm, unsigned long addr,
+spinlock_t **ptl)
 {
pgd_t *pgd;
p4d_t *p4d;
@@ -1427,7 +1427,10 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned 
long addr,
return NULL;
 
VM_BUG_ON(pmd_trans_huge(*pmd));
-   return pte_alloc_map_lock(mm, pmd, addr, ptl);
+   if (likely(ptl))
+   return pte_alloc_map_lock(mm, pmd, addr, ptl);
+   else
+   return pte_alloc_map(mm, pmd, addr);
 }
 
 /*
-- 
2.20.1




[RFC PATCH 14/26] x86/alternatives: Handle native insns in text_poke_loc*()

2020-04-07 Thread Ankur Arora
Intended to handle scenarios where we might want to patch arbitrary
instructions (ex. inlined opcodes in pv_lock_ops.)

Users for native mode (as opposed to emulated) are introduced in
later patches.
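
A short usage sketch of the native mode, shown for illustration (this is
essentially how the paravirt insn generation later in the series sets up a
location):

    struct text_poke_loc tp;

    /*
     * native=true: 'insns' is treated as an opaque byte sequence of 'len'
     * bytes; no emulation metadata is derived from it.
     */
    text_poke_loc_init(&tp, addr, insns, len, NULL, true);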

Signed-off-by: Ankur Arora 
---
 arch/x86/include/asm/text-patching.h |  4 +-
 arch/x86/kernel/alternative.c| 61 
 2 files changed, 45 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/text-patching.h 
b/arch/x86/include/asm/text-patching.h
index 04778c2bc34e..c4b2814f2f9d 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -25,10 +25,10 @@ static inline void apply_paravirt(struct 
paravirt_patch_site *start,
 
 /*
  * Currently, the max observed size in the kernel code is
- * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5.
+ * NOP7 for indirect call, which is 7.
  * Raise it if needed.
  */
-#define POKE_MAX_OPCODE_SIZE   5
+#define POKE_MAX_OPCODE_SIZE   7
 
 extern void text_poke_early(void *addr, const void *opcode, size_t len);
 
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 337aad8c2521..004fe86f463f 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -981,8 +981,15 @@ void text_poke_sync(void)
 
 struct text_poke_loc {
s32 rel_addr; /* addr := _stext + rel_addr */
-   s32 rel32;
-   u8 opcode;
+   union {
+   struct {
+   s32 rel32;
+   u8 opcode;
+   } emulated;
+   struct {
+   u8 len;
+   } native;
+   };
const u8 text[POKE_MAX_OPCODE_SIZE];
 };
 
@@ -990,6 +997,7 @@ struct bp_patching_desc {
struct text_poke_loc *vec;
int nr_entries;
atomic_t refs;
+   bool native;
 };
 
 static struct bp_patching_desc *bp_desc;
@@ -1071,10 +1079,13 @@ int notrace poke_int3_handler(struct pt_regs *regs)
goto out_put;
}
 
-   len = text_opcode_size(tp->opcode);
+   if (desc->native)
+   BUG();
+
+   len = text_opcode_size(tp->emulated.opcode);
ip += len;
 
-   switch (tp->opcode) {
+   switch (tp->emulated.opcode) {
case INT3_INSN_OPCODE:
/*
 * Someone poked an explicit INT3, they'll want to handle it,
@@ -1083,12 +1094,12 @@ int notrace poke_int3_handler(struct pt_regs *regs)
goto out_put;
 
case CALL_INSN_OPCODE:
-   int3_emulate_call(regs, (long)ip + tp->rel32);
+   int3_emulate_call(regs, (long)ip + tp->emulated.rel32);
break;
 
case JMP32_INSN_OPCODE:
case JMP8_INSN_OPCODE:
-   int3_emulate_jmp(regs, (long)ip + tp->rel32);
+   int3_emulate_jmp(regs, (long)ip + tp->emulated.rel32);
break;
 
default:
@@ -1134,6 +1145,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, 
unsigned int nr_entries
.vec = tp,
.nr_entries = nr_entries,
.refs = ATOMIC_INIT(1),
+   .native = false,
};
unsigned char int3 = INT3_INSN_OPCODE;
unsigned int i;
@@ -1161,7 +1173,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, 
unsigned int nr_entries
 * Second step: update all but the first byte of the patched range.
 */
for (do_sync = 0, i = 0; i < nr_entries; i++) {
-   int len = text_opcode_size(tp[i].opcode);
+   int len = text_opcode_size(tp[i].emulated.opcode);
 
if (len - INT3_INSN_SIZE > 0) {
text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
@@ -1205,11 +1217,25 @@ static void text_poke_bp_batch(struct text_poke_loc 
*tp, unsigned int nr_entries
 }
 
 static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
-  const void *opcode, size_t len, const void 
*emulate)
+  const void *opcode, size_t len,
+  const void *emulate, bool native)
 {
struct insn insn;
 
+   memset((void *)tp, 0, sizeof(*tp));
memcpy((void *)tp->text, opcode, len);
+
+   tp->rel_addr = addr - (void *)_stext;
+
+   /*
+* Native mode: when we might be poking
+* arbitrary (perhaps) multiple instructions.
+*/
+   if (native) {
+   tp->native.len = (u8)len;
+   return;
+   }
+
if (!emulate)
emulate = opcode;
 
@@ -1219,31 +1245,30 @@ static void text_poke_loc_init(struct text_poke_loc 
*tp, void *addr,
BUG_ON(!insn_complete(&insn));
BUG_ON(len != insn.length);
 
-   tp->rel_addr = addr - (void *)_stext;
-   tp->opcode = insn.opcode.bytes[0];
+   tp->emulated.opcode = insn.opcode.bytes[0];
 
-   switch (tp->opcode) {
+   switch (tp->emulated.opcode) {
case INT3_INSN_OPCODE:
  

[RFC PATCH 21/26] x86/alternatives: Paravirt runtime selftest

2020-04-07 Thread Ankur Arora
Add a selftest that triggers paravirt_runtime_patch() which
toggles between the paravirt and native pv_lock_ops.

The selftest also registers an NMI handler, which exercises the
patched pv-ops via spin-lock operations. These are triggered via
artificially sent NMIs.

And last, introduce patch sites in the primary and secondary
patching code which are hit during the patching process.
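
A sketch of the kind of NMI-context exercise the selftest performs (names
and shape are illustrative; the actual code is in pv_selftest.c below,
which is largely truncated here):

    static DEFINE_SPINLOCK(selftest_lock);

    static int pv_selftest_nmi(unsigned int cmd, struct pt_regs *regs)
    {
            /* Run the (possibly mid-patch) pv lock ops from NMI context. */
            spin_lock(&selftest_lock);
            spin_unlock(&selftest_lock);

            return NMI_HANDLED;
    }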

Signed-off-by: Ankur Arora 
---
 arch/x86/Kconfig.debug|  13 ++
 arch/x86/kernel/Makefile  |   1 +
 arch/x86/kernel/alternative.c |  20 +++
 arch/x86/kernel/kvm.c |   4 +-
 arch/x86/kernel/pv_selftest.c | 264 ++
 arch/x86/kernel/pv_selftest.h |  15 ++
 6 files changed, 315 insertions(+), 2 deletions(-)
 create mode 100644 arch/x86/kernel/pv_selftest.c
 create mode 100644 arch/x86/kernel/pv_selftest.h

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 2e74690b028a..82a8e3fa68c7 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -252,6 +252,19 @@ config X86_DEBUG_FPU
 
  If unsure, say N.
 
+config DEBUG_PARAVIRT_SELFTEST
+   bool "Enable paravirt runtime selftest"
+   depends on PARAVIRT
+   depends on PARAVIRT_RUNTIME
+   depends on PARAVIRT_SPINLOCKS
+   depends on KVM_GUEST
+   help
+ This option enables sanity testing of the runtime paravirtualized
+ patching code. Triggered via debugfs.
+
+ Might help diagnose patching problems in different
+ configurations and loads.
+
 config PUNIT_ATOM_DEBUG
tristate "ATOM Punit debug driver"
depends on PCI
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index ba89cabe5fcf..ed3c93681f12 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -114,6 +114,7 @@ obj-$(CONFIG_APB_TIMER) += apb_timer.o
 
 obj-$(CONFIG_AMD_NB)   += amd_nb.o
 obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
+obj-$(CONFIG_DEBUG_PARAVIRT_SELFTEST) += pv_selftest.o
 
 obj-$(CONFIG_KVM_GUEST)+= kvm.o kvmclock.o
 obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch.o
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 385c3e6ea925..26407d7a54db 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -26,6 +26,7 @@
 #include 
 #include 
 #include 
+#include "pv_selftest.h"
 
 int __read_mostly alternatives_patched;
 
@@ -1549,6 +1550,12 @@ static void __maybe_unused text_poke_site(struct text_poke_state *tps,
 */
poke_sync(tps, PATCH_SYNC_0, offset, , INT3_INSN_SIZE);
 
+   /*
+* We have an INT3 in place; execute a contrived selftest that
+* has an insn sequence that is under patching.
+*/
+   pv_selftest_primary();
+
/* Poke remaining */
poke_sync(tps, PATCH_SYNC_1, offset + INT3_INSN_SIZE,
  tp->text + INT3_INSN_SIZE, tp->native.len - INT3_INSN_SIZE);
@@ -1634,6 +1641,19 @@ static void text_poke_sync_site(struct text_poke_state *tps)
smp_cond_load_acquire(>state,
  prevstate != VAL);
 
+   /*
+* Send an NMI to one of the other CPUs.
+*/
+   pv_selftest_send_nmi();
+
+   /*
+* We have an INT3 in place; execute a contrived selftest that
+* has an insn sequence that is under patching.
+*
+* Note that this function is also called from BP fixup but
+* is just an NOP when called from there.
+*/
+   pv_selftest_secondary();
prevstate = READ_ONCE(tps->state);
 
/*
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 6efe0410fb72..e56d263159d7 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -779,7 +779,7 @@ arch_initcall(kvm_alloc_cpumask);
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 
 /* Kick a cpu by its apicid. Used to wake up a halted vcpu */
-static void kvm_kick_cpu(int cpu)
+void kvm_kick_cpu(int cpu)
 {
int apicid;
unsigned long flags = 0;
@@ -790,7 +790,7 @@ static void kvm_kick_cpu(int cpu)
 
 #include 
 
-static void kvm_wait(u8 *ptr, u8 val)
+void kvm_wait(u8 *ptr, u8 val)
 {
unsigned long flags;
 
diff --git a/arch/x86/kernel/pv_selftest.c b/arch/x86/kernel/pv_selftest.c
new file mode 100644
index ..e522f444bd6e
--- /dev/null
+++ b/arch/x86/kernel/pv_selftest.c
@@ -0,0 +1,264 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "pv_selftest.h"
+
+static int nmi_selftest;
+static bool cond_state;
+
+#define SELFTEST_PARAVIRT  1
+static int test_mode;
+
+/*
+ * Mark this and the following functions __always_inline to ensure
+ * we generate multiple patch sites that can be hit independently
+ * in 

[RFC PATCH 15/26] x86/alternatives: Non-emulated text poking

2020-04-07 Thread Ankur Arora
Patching at runtime needs to handle interdependent pv-ops: as an example,
lock.queued_lock_slowpath(), lock.queued_lock_unlock() and the other
pv_lock_ops are paired and so need to be updated atomically. This is
difficult with emulation because non-patching CPUs could be executing in
critical sections.
(We could apply INT3 everywhere first and then use RCU to force a
barrier but given that spinlocks are everywhere, it still might mean a
lot of time in emulation.)

Second, locking operations can be called from interrupt handlers which
means we cannot trivially use IPIs to introduce a pipeline sync step on
non-patching CPUs.

Third, some pv-ops can be inlined and so we would need to emulate a
broader set of operations than CALL, JMP, NOP*.

Introduce the core state-machine with the actual poking and pipeline
sync stubbed out. This executes via stop_machine() with the primary CPU
carrying out a text_poke_bp() style three-staged algorithm.

The control flow diagram below shows CPU0 as the primary which does the
patching, while the rest of the CPUs (CPUx) execute the sync loop in
text_poke_sync_finish().

 CPU0   CPUx
    

 patch_worker() patch_worker()

   /* Traversal, insn-gen */  text_poke_sync_finish()
   tps.patch_worker() /*
   * wait until:
 /* for each patch-site */ *  tps->state == PATCH_DONE
 text_poke_site()  */
   poke_sync()

   ... ...

   smp_store_release(>state, PATCH_DONE)

Commits further on flesh out the rest of the code.
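
To make the CPUx loop concrete, here is a minimal sketch (not the
actual implementation; it assumes, as the diagram shows, that
text_poke_state carries a 'state' field updated by the primary with
smp_store_release()):

  static void text_poke_sync_finish(struct text_poke_state *tps)
  {
          for (;;) {
                  enum patch_state state = READ_ONCE(tps->state);

                  /* Primary has patched all sites; we are done. */
                  if (state == PATCH_DONE)
                          break;

                  /*
                   * In one of the PATCH_SYNC_* states: wait for the
                   * next transition, then resync the pipeline so the
                   * freshly written bytes are observed.
                   */
                  smp_cond_load_acquire(&tps->state, VAL != state);
                  sync_one();
          }
  }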

Signed-off-by: Ankur Arora 
---
sync_one() uses the following for pipeline synchronization:

+   if (in_nmi())
+   cpuid_eax(1);
+   else
+   sync_core();

The if (in_nmi()) clause is meant to be executed from NMI contexts.
Reading through past LKML discussions, cpuid_eax() is probably a
bad choice -- at least insofar as Xen PV is concerned. What
would be a good primitive to use instead?

Also, given that we do handle the nested NMI case, does it make sense
to just use native_iret() (via sync_core()) in NMI contexts as well?

---
 arch/x86/kernel/alternative.c | 247 ++
 1 file changed, 247 insertions(+)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 004fe86f463f..452d4081eded 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -979,6 +979,26 @@ void text_poke_sync(void)
on_each_cpu(do_sync_core, NULL, 1);
 }
 
+static void __maybe_unused sync_one(void)
+{
+   /*
+* We might be executing in NMI context, and so cannot use
+* IRET as a synchronizing instruction.
+*
+* We could use native_write_cr2() but that is not guaranteed
+* to work on Xen-PV -- it is emulated by Xen and might not
+* execute an iret (or similar synchronizing instruction)
+* internally.
+*
+* cpuid() would trap as well. Unclear if that's a solution
+* either.
+*/
+   if (in_nmi())
+   cpuid_eax(1);
+   else
+   sync_core();
+}
+
 struct text_poke_loc {
s32 rel_addr; /* addr := _stext + rel_addr */
union {
@@ -1351,6 +1371,233 @@ void __ref text_poke_bp(void *addr, const void *opcode, 
size_t len, const void *
text_poke_bp_batch(, 1);
 }
 
+struct text_poke_state;
+typedef void (*patch_worker_t)(struct text_poke_state *tps);
+
+/*
+ *+---possible-BP--+
+ *||
+ * +--write-INT3--+   +--suffix--+   +-insn-prefix-+
+ */   | _/   |__/  |
+ *   /v' v v
+ * PATCH_SYNC_0PATCH_SYNC_1PATCH_SYNC_2   *PATCH_SYNC_DONE*
+ *   \|`> 
PATCH_DONE
+ *`--<-<-<-<--+
+ *
+ * We start in state PATCH_SYNC_DONE and loop through PATCH_SYNC_* states
+ * to end at PATCH_DONE. The primary drives these in text_poke_site()
+ * with patch_worker() making the final transition to PATCH_DONE.
+ * All transitions but the last iteration need to be globally observed.
+ *
+ * On secondary CPUs, text_poke_sync_finish() waits in a cpu_relax()
+ * loop waiting for a transition to PATCH_SYNC_0 at which point it would
+ * start observing transitions until PATCH_SYNC_DONE.
 * Eventually the primary moves to PATCH_DONE and the secondary CPUs finish.
+ */
+enum patch_state {
+   /*
+* Add an artificial state that we can do a bitwise operation
+* over all the PATCH_SYNC_* states.
+*/
+   PATCH_SYNC_x = 4,
+   PATCH_SYNC_0 = PATCH_SYNC_x | 0,/* Serialize INT3 */
+   PATCH_SYNC_1 = PATCH_SYNC_x | 1,/* 

[RFC PATCH 20/26] x86/paravirt: Enable pv-spinlocks in runtime_patch()

2020-04-07 Thread Ankur Arora
Enable runtime patching of paravirt spinlocks. These can be trivially
enabled because pv_lock_ops are never preemptible -- preemption is
disabled at entry to spin_lock*().

Note that a particular CPU instance might get preempted in the host but
because runtime_patching() is called via stop_machine(), the migration
thread would flush out any kernel threads preempted in the host.

Signed-off-by: Ankur Arora 
---
 arch/x86/include/asm/paravirt.h  | 10 +-
 arch/x86/kernel/paravirt_patch.c | 12 
 kernel/locking/lock_events.c |  2 +-
 3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 694d8daf4983..cb3d0a91c060 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -642,27 +642,27 @@ static inline void __set_fixmap(unsigned /* enum 
fixed_addresses */ idx,
 static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock 
*lock,
u32 val)
 {
-   PVOP_VCALL2(lock.queued_spin_lock_slowpath, lock, val);
+   PVRTOP_VCALL2(lock.queued_spin_lock_slowpath, lock, val);
 }
 
 static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock)
 {
-   PVOP_VCALLEE1(lock.queued_spin_unlock, lock);
+   PVRTOP_VCALLEE1(lock.queued_spin_unlock, lock);
 }
 
 static __always_inline void pv_wait(u8 *ptr, u8 val)
 {
-   PVOP_VCALL2(lock.wait, ptr, val);
+   PVRTOP_VCALL2(lock.wait, ptr, val);
 }
 
 static __always_inline void pv_kick(int cpu)
 {
-   PVOP_VCALL1(lock.kick, cpu);
+   PVRTOP_VCALL1(lock.kick, cpu);
 }
 
 static __always_inline bool pv_vcpu_is_preempted(long cpu)
 {
-   return PVOP_CALLEE1(bool, lock.vcpu_is_preempted, cpu);
+   return PVRTOP_CALLEE1(bool, lock.vcpu_is_preempted, cpu);
 }
 
 void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock);
diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c
index 3eb8c0e720b4..3f8606f2811c 100644
--- a/arch/x86/kernel/paravirt_patch.c
+++ b/arch/x86/kernel/paravirt_patch.c
@@ -152,6 +152,18 @@ int runtime_patch(u8 type, void *insn_buff, void *op,
 
/* Nothing whitelisted for now. */
switch (type) {
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+   /*
+* Preemption is always disabled in the lifetime of a spinlock
+* (whether held or while waiting to acquire.)
+*/
+   case PARAVIRT_PATCH(lock.queued_spin_lock_slowpath):
+   case PARAVIRT_PATCH(lock.queued_spin_unlock):
+   case PARAVIRT_PATCH(lock.wait):
+   case PARAVIRT_PATCH(lock.kick):
+   case PARAVIRT_PATCH(lock.vcpu_is_preempted):
+   break;
+#endif
default:
pr_warn("type=%d unsuitable for runtime-patching\n", type);
return -EINVAL;
diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c
index fa2c2f951c6b..c3057e82e6f9 100644
--- a/kernel/locking/lock_events.c
+++ b/kernel/locking/lock_events.c
@@ -115,7 +115,7 @@ static const struct file_operations fops_lockevent = {
.llseek = default_llseek,
 };
 
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && !defined(CONFIG_PARAVIRT_RUNTIME)
 #include 
 
 static bool __init skip_lockevent(const char *name)
-- 
2.20.1




[RFC PATCH 02/26] x86/paravirt: Allow paravirt patching post-init

2020-04-07 Thread Ankur Arora
Paravirt-ops are patched at init to convert indirect calls into
direct calls and in some cases, to inline the target at the call-site.
This is done by way of PVOP* macros which save the call-site
information via compile time annotations.

Pull this state out into .parainstructions.runtime for some pv-ops such
that they can be used for runtime patching.

Signed-off-by: Ankur Arora 
---
 arch/x86/Kconfig  | 12 
 arch/x86/include/asm/paravirt_types.h |  5 +
 arch/x86/include/asm/text-patching.h  |  5 +
 arch/x86/kernel/alternative.c |  2 ++
 arch/x86/kernel/module.c  | 10 +-
 arch/x86/kernel/vmlinux.lds.S | 16 
 include/asm-generic/vmlinux.lds.h |  8 
 7 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1edf788d301c..605619938f08 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -764,6 +764,18 @@ config PARAVIRT
  over full virtualization.  However, when run without a hypervisor
  the kernel is theoretically slower and slightly larger.
 
+config PARAVIRT_RUNTIME
+   bool "Enable paravirtualized ops to be patched at runtime"
+   depends on PARAVIRT
+   help
+ Enable the paravirtualized guest kernel to switch pv-ops based on
+ changed host conditions, potentially improving performance
+ significantly.
+
+ This would increase the memory footprint of the running kernel
+ slightly (depending mostly on whether lock and unlock are inlined
+ or not.)
+
 config PARAVIRT_XXL
bool
 
diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index 37e8f27a3b9d..00e4a062ca10 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -723,6 +723,11 @@ struct paravirt_patch_site {
 extern struct paravirt_patch_site __parainstructions[],
__parainstructions_end[];
 
+#ifdef CONFIG_PARAVIRT_RUNTIME
+extern struct paravirt_patch_site __parainstructions_runtime[],
+   __parainstructions_runtime_end[];
+#endif
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_X86_PARAVIRT_TYPES_H */
diff --git a/arch/x86/include/asm/text-patching.h 
b/arch/x86/include/asm/text-patching.h
index 67315fa3956a..e2ef241c261e 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -18,6 +18,11 @@ static inline void apply_paravirt(struct paravirt_patch_site 
*start,
 #define __parainstructions_end NULL
 #endif
 
+#ifndef CONFIG_PARAVIRT_RUNTIME
+#define __parainstructions_runtime NULL
+#define __parainstructions_runtime_end NULL
+#endif
+
 /*
  * Currently, the max observed size in the kernel code is
  * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5.
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 7867dfb3963e..fdfda1375f82 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -740,6 +740,8 @@ void __init alternative_instructions(void)
 #endif
 
apply_paravirt(__parainstructions, __parainstructions_end);
+   apply_paravirt(__parainstructions_runtime,
+  __parainstructions_runtime_end);
 
restart_nmi();
alternatives_patched = 1;
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index d5c72cb877b3..658ea60ce324 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -222,7 +222,7 @@ int module_finalize(const Elf_Ehdr *hdr,
struct module *me)
 {
const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
-   *para = NULL, *orc = NULL, *orc_ip = NULL;
+   *para = NULL, *para_run = NULL, *orc = NULL, *orc_ip = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
 
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -234,6 +234,9 @@ int module_finalize(const Elf_Ehdr *hdr,
locks = s;
if (!strcmp(".parainstructions", secstrings + s->sh_name))
para = s;
+   if (!strcmp(".parainstructions.runtime",
+   secstrings + s->sh_name))
+   para_run = s;
if (!strcmp(".orc_unwind", secstrings + s->sh_name))
orc = s;
if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name))
@@ -257,6 +260,11 @@ int module_finalize(const Elf_Ehdr *hdr,
void *pseg = (void *)para->sh_addr;
apply_paravirt(pseg, pseg + para->sh_size);
}
+   if (para_run) {
+   void *pseg = (void *)para_run->sh_addr;
+
+   apply_paravirt(pseg, pseg + para_run->sh_size);
+   }
 
/* make jump label nops */
jump_label_apply_nops(me);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 1bf7e312361f..7f5b8f6ab96e 100644
--- 

[RFC PATCH 10/26] x86/paravirt: Add primitives to stage pv-ops

2020-04-07 Thread Ankur Arora
Add paravirt_stage_alt() which conditionally selects between a paravirt
or native pv-op and then stages it for later patching.
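
As a usage illustration (the helper below is hypothetical; the step
that actually pokes the staged sites comes later in the series), a
caller holding text_mutex stages either the paravirt or the native
flavour of an op -- here the KVM wait/kick implementations -- and
leaves the commit to the patching machinery:

  static void stage_pv_wait_kick(bool use_pv)
  {
          lockdep_assert_held(&text_mutex);

          paravirt_stage_zero();
          paravirt_stage_alt(use_pv, lock.wait, kvm_wait);
          paravirt_stage_alt(use_pv, lock.kick, kvm_kick_cpu);

          /* A follow-up step patches all staged sites atomically. */
  }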

Signed-off-by: Ankur Arora 
---
 arch/x86/include/asm/paravirt_types.h |  6 +++
 arch/x86/include/asm/text-patching.h  |  3 ++
 arch/x86/kernel/alternative.c | 58 +++
 3 files changed, 67 insertions(+)

diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index 3b9f6c105397..0c4ca7ad719c 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -350,6 +350,12 @@ extern struct paravirt_patch_template native_pv_ops;
 #define PARAVIRT_PATCH(x)  \
(offsetof(struct paravirt_patch_template, x) / sizeof(void *))
 
+#define paravirt_stage_alt(do_stage, op, opfn) \
+   (text_poke_pv_stage(PARAVIRT_PATCH(op), \
+   (do_stage) ? (opfn) : (native_pv_ops.op)))
+
+#define paravirt_stage_zero() text_poke_pv_stage_zero()
+
 /*
  * Neat trick to map patch type back to the call within the
  * corresponding structure.
diff --git a/arch/x86/include/asm/text-patching.h 
b/arch/x86/include/asm/text-patching.h
index e2ef241c261e..706e61e6967d 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -55,6 +55,9 @@ extern void text_poke_bp(void *addr, const void *opcode, 
size_t len, const void
 extern void text_poke_queue(void *addr, const void *opcode, size_t len, const 
void *emulate);
 extern void text_poke_finish(void);
 
+bool text_poke_pv_stage(u8 type, void *opfn);
+void text_poke_pv_stage_zero(void);
+
 #define INT3_INSN_SIZE 1
 #define INT3_INSN_OPCODE   0xCC
 
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 8189ac21624c..0c335af9ee28 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1307,3 +1307,61 @@ void __ref text_poke_bp(void *addr, const void *opcode, 
size_t len, const void *
text_poke_loc_init(, addr, opcode, len, emulate);
text_poke_bp_batch(, 1);
 }
+
+#ifdef CONFIG_PARAVIRT_RUNTIME
+struct paravirt_stage_entry {
+   void *dest; /* pv_op destination */
+   u8 type;/* pv_op type */
+};
+
+/*
+ * We don't anticipate many pv-ops being written at runtime.
+ */
+#define PARAVIRT_STAGE_MAX 8
+struct paravirt_stage {
+   struct paravirt_stage_entry ops[PARAVIRT_STAGE_MAX];
+   u32 count;
+};
+
+/* Protected by text_mutex */
+static struct paravirt_stage pv_stage;
+
+/**
+ * text_poke_pv_stage - Stage paravirt-op for poking.
+ * @type: pv-op type
+ * @opfn: destination of the pv-op
+ *
+ * Return: staging status.
+ */
+bool text_poke_pv_stage(u8 type, void *opfn)
+{
+   if (system_state == SYSTEM_BOOTING) { /* Passthrough */
+   PARAVIRT_PATCH_OP(pv_ops, type) = (long)opfn;
+   goto out;
+   }
+
+   lockdep_assert_held(_mutex);
+
+   if (PARAVIRT_PATCH_OP(pv_ops, type) == (long)opfn)
+   goto out;
+
+   if (pv_stage.count >= PARAVIRT_STAGE_MAX)
+   goto out;
+
+   pv_stage.ops[pv_stage.count].type = type;
+   pv_stage.ops[pv_stage.count].dest = opfn;
+
+   pv_stage.count++;
+
+   return true;
+out:
+   return false;
+}
+
+void text_poke_pv_stage_zero(void)
+{
+   lockdep_assert_held(_mutex);
+   pv_stage.count = 0;
+}
+#endif /* CONFIG_PARAVIRT_RUNTIME */
-- 
2.20.1




[RFC PATCH 06/26] x86/alternatives: Remove stale symbols

2020-04-07 Thread Ankur Arora
__start_parainstructions and __stop_parainstructions aren't
defined, remove them.

Signed-off-by: Ankur Arora 
---
 arch/x86/kernel/alternative.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 4157f848b537..09e4ee0e09a2 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -623,8 +623,6 @@ void __init_or_module apply_paravirt(struct 
paravirt_patch_site *start,
text_poke_early(p->instr, insn_buff, p->len);
}
 }
-extern struct paravirt_patch_site __start_parainstructions[],
-   __stop_parainstructions[];
 #endif /* CONFIG_PARAVIRT */
 
 /*
-- 
2.20.1




[RFC PATCH 05/26] x86/alternatives: Rename alternatives_smp*, smp_alt_module

2020-04-07 Thread Ankur Arora
Rename alternatives_smp_module_*() and smp_alt_module to reflect
their new purpose.

Signed-off-by: Ankur Arora 
---
 arch/x86/include/asm/alternative.h | 10 +++---
 arch/x86/kernel/alternative.c  | 54 +++---
 arch/x86/kernel/module.c   |  8 ++---
 3 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/arch/x86/include/asm/alternative.h 
b/arch/x86/include/asm/alternative.h
index 8235bbb746d9..db91a7731d87 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -75,11 +75,11 @@ extern void apply_alternatives(struct alt_instr *start, 
struct alt_instr *end);
 
 struct module;
 
-extern void alternatives_smp_module_add(struct module *mod, char *name,
-   void *locks, void *locks_end,
-   void *text, void *text_end);
-extern void alternatives_smp_module_del(struct module *mod);
-extern int alternatives_text_reserved(void *start, void *end);
+void alternatives_module_add(struct module *mod, char *name,
+void *locks, void *locks_end,
+void *text, void *text_end);
+void alternatives_module_del(struct module *mod);
+int alternatives_text_reserved(void *start, void *end);
 #ifdef CONFIG_SMP
 extern void alternatives_enable_smp(void);
 #else
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 32aa1ddf441d..4157f848b537 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -477,7 +477,7 @@ static inline void alternatives_smp_unlock(const s32 
*start, const s32 *end,
   u8 *text, u8 *text_end) { }
 #endif /* CONFIG_SMP */
 
-struct smp_alt_module {
+struct alt_module {
/* what is this ??? */
struct module   *mod;
char*name;
@@ -492,14 +492,14 @@ struct smp_alt_module {
 
struct list_head next;
 };
-static LIST_HEAD(smp_alt_modules);
 
-void __init_or_module alternatives_smp_module_add(struct module *mod,
- char *name,
- void *locks, void *locks_end,
- void *text,  void *text_end)
+static LIST_HEAD(alt_modules);
+
+void __init_or_module alternatives_module_add(struct module *mod, char *name,
+ void *locks, void *locks_end,
+ void *text,  void *text_end)
 {
-   struct smp_alt_module *smp;
+   struct alt_module *alt;
 
 #ifdef CONFIG_SMP
/* Patch to UP if other cpus not imminent. */
@@ -511,36 +511,36 @@ void __init_or_module alternatives_smp_module_add(struct 
module *mod,
 
mutex_lock(_mutex);
 
-   smp = kzalloc(sizeof(*smp), GFP_KERNEL | __GFP_NOFAIL);
+   alt = kzalloc(sizeof(*alt), GFP_KERNEL | __GFP_NOFAIL);
 
-   smp->mod= mod;
-   smp->name   = name;
+   alt->mod= mod;
+   alt->name   = name;
 
if (num_possible_cpus() != 1 || uniproc_patched) {
/* Remember only if we'll need to undo it. */
-   smp->locks  = locks;
-   smp->locks_end  = locks_end;
+   alt->locks  = locks;
+   alt->locks_end  = locks_end;
}
 
-   smp->text   = text;
-   smp->text_end   = text_end;
+   alt->text   = text;
+   alt->text_end   = text_end;
DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
-   smp->locks, smp->locks_end,
-   smp->text, smp->text_end, smp->name);
+   alt->locks, alt->locks_end,
+   alt->text, alt->text_end, alt->name);
 
-   list_add_tail(>next, _alt_modules);
+   list_add_tail(>next, _modules);
 
if (uniproc_patched)
alternatives_smp_unlock(locks, locks_end, text, text_end);
mutex_unlock(_mutex);
 }
 
-void __init_or_module alternatives_smp_module_del(struct module *mod)
+void __init_or_module alternatives_module_del(struct module *mod)
 {
-   struct smp_alt_module *item;
+   struct alt_module *item;
 
mutex_lock(_mutex);
-   list_for_each_entry(item, _alt_modules, next) {
+   list_for_each_entry(item, _modules, next) {
if (mod != item->mod)
continue;
list_del(>next);
@@ -553,7 +553,7 @@ void __init_or_module alternatives_smp_module_del(struct 
module *mod)
 #ifdef CONFIG_SMP
 void alternatives_enable_smp(void)
 {
-   struct smp_alt_module *mod;
+   struct alt_module *mod;
 
/* Why bother if there are no other CPUs? */
BUG_ON(num_possible_cpus() == 1);
@@ -565,7 +565,7 @@ void alternatives_enable_smp(void)
BUG_ON(num_online_cpus() != 1);
clear_cpu_cap(_cpu_data, X86_FEATURE_UP);
clear_cpu_cap(_data(0), X86_FEATURE_UP);
-   

[RFC PATCH 09/26] x86/paravirt: Add runtime_patch()

2020-04-07 Thread Ankur Arora
runtime_patch() generates insn sequences for patching supported pv_ops.
It does this by calling paravirt_patch_default() or native_patch()
depending on whether the target is a paravirt or native pv-op.

In addition, runtime_patch() also whitelists pv-ops that are safe to
patch at runtime.

The static conditions that need to be satisfied to patch safely:
 - Insn sequences under replacement need to execute without preemption.
   This is meant to avoid scenarios where a call-site (ex.
   lock.vcpu_is_preempted) switches between the following sequences:

  lock.vcpu_is_preempted = __raw_callee_save___kvm_vcpu_is_preempted
0: e8 31 e6 ff ff       callq  0xe636
4: 66 90                xchg   %ax,%ax          # NOP2

  lock.vcpu_is_preempted = __raw_callee_save___native_vcpu_is_preempted
0: 31 c0                xor    %eax,%eax
2: 0f 1f 44 00 00       nopl   0x0(%rax)        # NOP5

   If kvm_vcpu_is_preempted() were preemptible, then, post patching
   we would return to address 4 above, which is in the middle of an
   instruction for native_vcpu_is_preempted().

   Even if this were to be made safe (ex. by changing the NOP2 to be a
   prefix instead of a suffix), it would still not be enough -- since
   we do not want any code from the switched out pv-op to be executing
   after the pv-op has been switched out.

 - Entered only at the beginning: this allows us to use text_poke()
   which uses INT3 as a barrier.

We don't store the address inside any call-sites so the second
condition can be assumed.

Guaranteeing the first condition boils down to stating that any pv-op
being patched cannot be present/referenced from any call-stack in the
system. pv-ops that are not obviously non-preemptible need to be
enclosed in preempt_disable_runtime_patch()/preempt_enable_runtime_patch().

This should be sufficient because runtime_patch() itself is called from
a stop_machine() context which would be enough to flush out any
non-preemptible sequences.

Note that preemption in the host is okay: stop_machine() would unwind
any pv-ops sleeping in the host.
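
For reference, a minimal sketch of what the
preempt_disable_runtime_patch()/preempt_enable_runtime_patch() pair
could reduce to (an assumption about the preempt.h change, not the
actual hunk): plain preempt_disable()/preempt_enable() when runtime
patching is configured, and no-ops otherwise.

  #ifdef CONFIG_PARAVIRT_RUNTIME
  #define preempt_disable_runtime_patch()         preempt_disable()
  #define preempt_enable_runtime_patch()          preempt_enable()
  #else
  #define preempt_disable_runtime_patch()         do { } while (0)
  #define preempt_enable_runtime_patch()          do { } while (0)
  #endif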

Signed-off-by: Ankur Arora 
---
 arch/x86/include/asm/paravirt_types.h |  8 +
 arch/x86/kernel/paravirt.c|  6 +---
 arch/x86/kernel/paravirt_patch.c  | 49 +++
 include/linux/preempt.h   | 17 ++
 4 files changed, 75 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index bc935eec7ec6..3b9f6c105397 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -350,6 +350,12 @@ extern struct paravirt_patch_template native_pv_ops;
 #define PARAVIRT_PATCH(x)  \
(offsetof(struct paravirt_patch_template, x) / sizeof(void *))
 
+/*
+ * Neat trick to map patch type back to the call within the
+ * corresponding structure.
+ */
+#define PARAVIRT_PATCH_OP(ops, type) (*(long *)(&((long **)&(ops))[type]))
+
 #define paravirt_type(op)  \
[paravirt_typenum] "i" (PARAVIRT_PATCH(op)),\
[paravirt_opptr] "i" (&(pv_ops.op))
@@ -383,6 +389,8 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff, 
unsigned long addr, un
 unsigned paravirt_patch_insns(void *insn_buff, unsigned len, const char 
*start, const char *end);
 
 unsigned native_patch(u8 type, void *insn_buff, unsigned long addr, unsigned 
len);
+int runtime_patch(u8 type, void *insn_buff, void *op, unsigned long addr,
+ unsigned int len);
 
 int paravirt_disable_iospace(void);
 
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 8c511cc4d4f4..c4128436b05a 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -117,11 +117,7 @@ void __init native_pv_lock_init(void)
 unsigned paravirt_patch_default(u8 type, void *insn_buff,
unsigned long addr, unsigned len)
 {
-   /*
-* Neat trick to map patch type back to the call within the
-* corresponding structure.
-*/
-   void *opfunc = *((void **)_ops + type);
+   void *opfunc = (void *)PARAVIRT_PATCH_OP(pv_ops, type);
unsigned ret;
 
if (opfunc == NULL)
diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c
index 3eff63c090d2..3eb8c0e720b4 100644
--- a/arch/x86/kernel/paravirt_patch.c
+++ b/arch/x86/kernel/paravirt_patch.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include 
+#include 
 
 #include 
 #include 
@@ -124,3 +125,51 @@ unsigned int native_patch(u8 type, void *insn_buff, 
unsigned long addr,
 
return paravirt_patch_default(type, insn_buff, addr, len);
 }
+
+#ifdef CONFIG_PARAVIRT_RUNTIME
+/**
+ * runtime_patch - Generate patching code for a native/paravirt op
+ * @type: op type to generate code for
+ * @insn_buff: destination buffer
+ * @op: op target
+ * @addr: call site address
+ * @len: length of insn_buff
+ *

[RFC PATCH 01/26] x86/paravirt: Specify subsection in PVOP macros

2020-04-07 Thread Ankur Arora
Allow PVOP macros to specify a subsection such that _paravirt_alt() can
optionally put sites in .parainstructions.*.

Signed-off-by: Ankur Arora 
---
 arch/x86/include/asm/paravirt_types.h | 158 +-
 1 file changed, 102 insertions(+), 56 deletions(-)

diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index 732f62e04ddb..37e8f27a3b9d 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -337,6 +337,9 @@ struct paravirt_patch_template {
 extern struct pv_info pv_info;
 extern struct paravirt_patch_template pv_ops;
 
+/* Sub-section for .parainstructions */
+#define PV_SUFFIX ""
+
 #define PARAVIRT_PATCH(x)  \
(offsetof(struct paravirt_patch_template, x) / sizeof(void *))
 
@@ -350,9 +353,9 @@ extern struct paravirt_patch_template pv_ops;
  * Generate some code, and mark it as patchable by the
  * apply_paravirt() alternate instruction patcher.
  */
-#define _paravirt_alt(insn_string, type, clobber)  \
+#define _paravirt_alt(sec, insn_string, type, clobber) \
"771:\n\t" insn_string "\n" "772:\n"\
-   ".pushsection .parainstructions,\"a\"\n"\
+   ".pushsection .parainstructions" sec ",\"a\"\n" \
_ASM_ALIGN "\n" \
_ASM_PTR " 771b\n"  \
"  .byte " type "\n"\
@@ -361,8 +364,9 @@ extern struct paravirt_patch_template pv_ops;
".popsection\n"
 
 /* Generate patchable code, with the default asm parameters. */
-#define paravirt_alt(insn_string)  \
-   _paravirt_alt(insn_string, "%c[paravirt_typenum]", 
"%c[paravirt_clobber]")
+#define paravirt_alt(sec, insn_string) \
+   _paravirt_alt(sec, insn_string, "%c[paravirt_typenum]", \
+ "%c[paravirt_clobber]")
 
 /* Simple instruction patching code. */
 #define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t"
@@ -414,7 +418,7 @@ int paravirt_disable_iospace(void);
  * unfortunately, are quite a bit (r8 - r11)
  *
  * The call instruction itself is marked by placing its start address
- * and size into the .parainstructions section, so that
+ * and size into the .parainstructions* sections, so that
  * apply_paravirt() in arch/i386/kernel/alternative.c can do the
  * appropriate patching under the control of the backend pv_init_ops
  * implementation.
@@ -512,7 +516,7 @@ int paravirt_disable_iospace(void);
})
 
 
-#define PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr,
\
+#define PVOP_CALL(sec, rettype, op, clbr, call_clbr, extra_clbr,   \
  pre, post, ...)   \
({  \
rettype __ret;  \
@@ -522,7 +526,7 @@ int paravirt_disable_iospace(void);
/* since this condition will never hold */  \
if (sizeof(rettype) > sizeof(unsigned long)) {  \
asm volatile(pre\
-paravirt_alt(PARAVIRT_CALL)\
+paravirt_alt(sec, PARAVIRT_CALL)   \
 post   \
 : call_clbr, ASM_CALL_CONSTRAINT   \
 : paravirt_type(op),   \
@@ -532,7 +536,7 @@ int paravirt_disable_iospace(void);
__ret = (rettype)u64)__edx) << 32) | __eax); \
} else {\
asm volatile(pre\
-paravirt_alt(PARAVIRT_CALL)\
+paravirt_alt(sec, PARAVIRT_CALL)   \
 post   \
 : call_clbr, ASM_CALL_CONSTRAINT   \
 : paravirt_type(op),   \
@@ -544,22 +548,22 @@ int paravirt_disable_iospace(void);
__ret;  \
})
 
-#define __PVOP_CALL(rettype, op, pre, post, ...)   \
-   PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS,\
+#define __PVOP_CALL(sec, rettype, op, pre, post, ...)  \
+   PVOP_CALL(sec, rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS,   \
  EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__)
 
-#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \
-   PVOP_CALL(rettype, op.func, CLBR_RET_REG,   \
+#define __PVOP_CALLEESAVE(sec, 

[RFC PATCH 00/26] Runtime paravirt patching

2020-04-07 Thread Ankur Arora
A KVM host (or another hypervisor) might advertise paravirtualized
features and optimization hints (ex KVM_HINTS_REALTIME) which might
become stale over the lifetime of the guest. For instance, the
host might go from being undersubscribed to being oversubscribed
(or the other way round) and it would make sense for the guest
to switch pv-ops based on that.

This locktorture splat that I saw on the guest while testing this is
indicative of the problem:

  [ 1136.461522] watchdog: BUG: soft lockup - CPU#8 stuck for 22s! 
[lock_torture_wr:12865]
  [ 1136.461542] CPU: 8 PID: 12865 Comm: lock_torture_wr Tainted: G W L 
5.4.0-rc7+ #77
  [ 1136.461546] RIP: 0010:native_queued_spin_lock_slowpath+0x15/0x220

(Caused by an oversubscribed host but using mismatched native pv_lock_ops
on the guest.)

This series addresses the problem by doing paravirt switching at runtime.

We keep an interesting subset of pv-ops (pv_lock_ops only for now,
but PV-TLB ops are also good candidates) in .parainstructions.runtime,
while discarding the .parainstructions as usual at init. This is then
used for switching back and forth between native and paravirt mode.
([1] lists some representative numbers of the increased memory
footprint.)

Mechanism: the patching itself is done using stop_machine(). That is
not ideal -- text_poke_stop_machine() was replaced with INT3+emulation
via text_poke_bp(), but I'm using this to address two issues:
 1) emulation in text_poke() can only easily handle a small set
 of instructions and this is problematic for inlined pv-ops (and see
 a possible alternatives use-case below.)
 2) paravirt patching might have inter-dependent ops (ex.
 lock.queued_lock_slowpath, lock.queued_lock_unlock are paired and
 need to be updated atomically.)

The alternative use-case is a runtime version of apply_alternatives()
(not posted with this patch-set) that can be used for some safe subset
of X86_FEATUREs. This could be useful in conjunction with the ongoing
late microcode loading work that Mihai Carabas and others have been
working on.

Also, there are points of similarity with the ongoing static_call work
which does rewriting of indirect calls. The difference here is that
we need to switch a group of calls atomically and given that
some of them can be inlined, need to handle a wider variety of opcodes.

To patch safely we need to satisfy these constraints:

 - No references to insn sequences under replacement on any kernel stack
   once replacement is in progress. Without this constraint we might end
   up returning to an address that is in the middle of an instruction.

 - handle inter-dependent ops: as above, lock.queued_lock_unlock(),
   lock.queued_lock_slowpath() and the rest of the pv_lock_ops are
   a good example.

 - handle a broader set of insns than CALL and JMP: some pv-ops end up
   getting inlined. Alternatives can contain arbitrary instructions.

 - locking operations can be called from interrupt handlers which means
   we cannot trivially use IPIs for flushing.

Handling these necessitates that target pv-ops not be preemptible.
Once that is a given (for safety these need to be explicitly whitelisted
in runtime_patch()), use a state-machine with the primary CPU doing the
patching and secondary CPUs in a sync_core() loop. 

In case we hit an INT3/BP (in NMI or thread-context) we make forward
progress by continuing the patching instead of emulating.

One remaining issue is inter-dependent pv-ops which are also executed in
the NMI handler -- patching can potentially deadlock in case of multiple
NMIs. Handle these by pushing some of this work in the NMI handler where
we know it will be uninterrupted.

There are four main sets of patches in this series:

 1. PV-ops management (patches 1-10, 20): mostly infrastructure and
 refactoring pieces to make paravirt patching usable at runtime. For the
 most part scoped under CONFIG_PARAVIRT_RUNTIME.

 Patches 1-7 persist part of the parainstructions in memory:
  "x86/paravirt: Specify subsection in PVOP macros"
  "x86/paravirt: Allow paravirt patching post-init"
  "x86/paravirt: PVRTOP macros for PARAVIRT_RUNTIME"
  "x86/alternatives: Refactor alternatives_smp_module*
  "x86/alternatives: Rename alternatives_smp*, smp_alt_module
  "x86/alternatives: Remove stale symbols
  "x86/paravirt: Persist .parainstructions.runtime"

 Patches 8-10 develop the interfaces to safely switch pv-ops:
  "x86/paravirt: Stash native pv-ops"
  "x86/paravirt: Add runtime_patch()"
  "x86/paravirt: Add primitives to stage pv-ops"

 Patch 20 enables switching of pv_lock_ops:
  "x86/paravirt: Enable pv-spinlocks in runtime_patch()"

 2. Non-emulated text poking (patches 11-19)

 Patches 11-13 are mostly refactoring to split __text_poke() into map,
 unmap and poke/memcpy phases with the poke portion being re-entrant
  "x86/alternatives: Remove return value of text_poke*()"
  "x86/alternatives: Use __get_unlocked_pte() in text_poke()"
  "x86/alternatives: Split __text_poke()"

 Patches 15, 17 add the 

[RFC PATCH 07/26] x86/paravirt: Persist .parainstructions.runtime

2020-04-07 Thread Ankur Arora
Persist .parainstructions.runtime in memory. We will use it to
patch paravirt-ops at runtime.

The extra memory footprint depends on chosen config options but the
inlined queued_spin_unlock() presents an edge case:

 $ objdump -h vmlinux|grep .parainstructions
 Idx Name                       Size      VMA       LMA       File-off  Algn
  27 .parainstructions          0001013c  82895000  02895000  01c95000  2**3
  28 .parainstructions.runtime  cd2c      828a5140  028a5140  01ca5140  2**3

(The added footprint is the size of the .parainstructions.runtime
section.)

  $ size vmlinux
      text       data        bss        dec       hex  filename
  13726196   12302814   14094336   40123346   2643bd2  vmlinux

Signed-off-by: Ankur Arora 
---
 arch/x86/include/asm/alternative.h |  1 +
 arch/x86/kernel/alternative.c  | 16 +++-
 arch/x86/kernel/module.c   | 28 +++-
 3 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/alternative.h 
b/arch/x86/include/asm/alternative.h
index db91a7731d87..d19546c14ff6 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -76,6 +76,7 @@ extern void apply_alternatives(struct alt_instr *start, 
struct alt_instr *end);
 struct module;
 
 void alternatives_module_add(struct module *mod, char *name,
+void *para, void *para_end,
 void *locks, void *locks_end,
 void *text, void *text_end);
 void alternatives_module_del(struct module *mod);
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 09e4ee0e09a2..8189ac21624c 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -482,6 +482,12 @@ struct alt_module {
struct module   *mod;
char*name;
 
+#ifdef CONFIG_PARAVIRT_RUNTIME
+   /* ptrs to paravirt sites */
+   struct paravirt_patch_site *para;
+   struct paravirt_patch_site *para_end;
+#endif
+
/* ptrs to lock prefixes */
const s32   *locks;
const s32   *locks_end;
@@ -496,6 +502,7 @@ struct alt_module {
 static LIST_HEAD(alt_modules);
 
 void __init_or_module alternatives_module_add(struct module *mod, char *name,
+ void *para, void *para_end,
  void *locks, void *locks_end,
  void *text,  void *text_end)
 {
@@ -506,7 +513,7 @@ void __init_or_module alternatives_module_add(struct module 
*mod, char *name,
if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1))
uniproc_patched = true;
 #endif
-   if (!uniproc_patched)
+   if (!IS_ENABLED(CONFIG_PARAVIRT_RUNTIME) && !uniproc_patched)
return;
 
mutex_lock(_mutex);
@@ -516,6 +523,11 @@ void __init_or_module alternatives_module_add(struct 
module *mod, char *name,
alt->mod= mod;
alt->name   = name;
 
+#ifdef CONFIG_PARAVIRT_RUNTIME
+   alt->para   = para;
+   alt->para_end   = para_end;
+#endif
+
if (num_possible_cpus() != 1 || uniproc_patched) {
/* Remember only if we'll need to undo it. */
alt->locks  = locks;
@@ -733,6 +745,8 @@ void __init alternative_instructions(void)
apply_alternatives(__alt_instructions, __alt_instructions_end);
 
alternatives_module_add(NULL, "core kernel",
+   __parainstructions_runtime,
+   __parainstructions_runtime_end,
__smp_locks, __smp_locks_end,
_text, _etext);
 
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index fc3d35198b09..7b2632184c11 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -248,12 +248,30 @@ int module_finalize(const Elf_Ehdr *hdr,
void *aseg = (void *)alt->sh_addr;
apply_alternatives(aseg, aseg + alt->sh_size);
}
-   if (locks && text) {
-   void *lseg = (void *)locks->sh_addr;
-   void *tseg = (void *)text->sh_addr;
+   if (para_run || (locks && text)) {
+   void *pseg, *pseg_end;
+   void *lseg, *lseg_end;
+   void *tseg, *tseg_end;
+
+   pseg = pseg_end = NULL;
+   lseg = lseg_end = NULL;
+   tseg = tseg_end = NULL;
+   if (para_run) {
+   pseg = (void *)para_run->sh_addr;
+   pseg_end = pseg + para_run->sh_size;
+   }
+
+   if (locks && text) {
+   tseg = (void *)text->sh_addr;
+   tseg_end = tseg + text->sh_size;
+
+   lseg = (void *)locks->sh_addr;
+   

[RFC PATCH 08/26] x86/paravirt: Stash native pv-ops

2020-04-07 Thread Ankur Arora
Introduce native_pv_ops, where we stash the pv_ops structure before
hypervisor-specific hooks have modified it.

native_pv_ops gets used when switching between paravirt and native
pv-ops at runtime.

Signed-off-by: Ankur Arora 
---
 arch/x86/include/asm/paravirt_types.h |  4 
 arch/x86/kernel/paravirt.c| 10 ++
 arch/x86/kernel/setup.c   |  2 ++
 3 files changed, 16 insertions(+)

diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index f1153f53c529..bc935eec7ec6 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -339,6 +339,7 @@ extern struct paravirt_patch_template pv_ops;
 
 #ifdef CONFIG_PARAVIRT_RUNTIME
 #define PVRT_SUFFIX ".runtime"
+extern struct paravirt_patch_template native_pv_ops;
 #else
 #define PVRT_SUFFIX ""
 #endif
@@ -775,6 +776,9 @@ extern struct paravirt_patch_site __parainstructions[],
 #ifdef CONFIG_PARAVIRT_RUNTIME
 extern struct paravirt_patch_site __parainstructions_runtime[],
__parainstructions_runtime_end[];
+void paravirt_ops_init(void);
+#else
+static inline void paravirt_ops_init(void) { }
 #endif
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c131ba4e70ef..8c511cc4d4f4 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -458,5 +458,15 @@ NOKPROBE_SYMBOL(native_set_debugreg);
 NOKPROBE_SYMBOL(native_load_idt);
 #endif
 
+#ifdef CONFIG_PARAVIRT_RUNTIME
+__ro_after_init struct paravirt_patch_template native_pv_ops;
+
+void __init paravirt_ops_init(void)
+{
+   native_pv_ops = pv_ops;
+}
+EXPORT_SYMBOL(native_pv_ops);
+#endif
+
 EXPORT_SYMBOL(pv_ops);
 EXPORT_SYMBOL_GPL(pv_info);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index e6b545047f38..2746a6a78fe7 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -43,6 +43,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * max_low_pfn_mapped: highest directly mapped pfn < 4 GB
@@ -831,6 +832,7 @@ void __init setup_arch(char **cmdline_p)
boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS;
 #endif
 
+   paravirt_ops_init();
/*
 * If we have OLPC OFW, we might end up relocating the fixmap due to
 * reserve_top(), so do this before touching the ioremap area.
-- 
2.20.1




[RFC PATCH 11/26] x86/alternatives: Remove return value of text_poke*()

2020-04-07 Thread Ankur Arora
Various text_poke() variants don't return a useful value. Remove it.

Signed-off-by: Ankur Arora 
---
 arch/x86/include/asm/text-patching.h |  4 ++--
 arch/x86/kernel/alternative.c| 11 +--
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/text-patching.h 
b/arch/x86/include/asm/text-patching.h
index 706e61e6967d..04778c2bc34e 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -46,9 +46,9 @@ extern void text_poke_early(void *addr, const void *opcode, 
size_t len);
  * On the local CPU you need to be protected against NMI or MCE handlers seeing
  * an inconsistent instruction while you patch.
  */
-extern void *text_poke(void *addr, const void *opcode, size_t len);
+extern void text_poke(void *addr, const void *opcode, size_t len);
 extern void text_poke_sync(void);
-extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
+extern void text_poke_kgdb(void *addr, const void *opcode, size_t len);
 extern int poke_int3_handler(struct pt_regs *regs);
 extern void text_poke_bp(void *addr, const void *opcode, size_t len, const 
void *emulate);
 
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 0c335af9ee28..8c79a3dc5e72 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -805,7 +805,7 @@ void __init_or_module text_poke_early(void *addr, const 
void *opcode,
 __ro_after_init struct mm_struct *poking_mm;
 __ro_after_init unsigned long poking_addr;
 
-static void *__text_poke(void *addr, const void *opcode, size_t len)
+static void __text_poke(void *addr, const void *opcode, size_t len)
 {
bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
struct page *pages[2] = {NULL};
@@ -906,7 +906,6 @@ static void *__text_poke(void *addr, const void *opcode, 
size_t len)
 
pte_unmap_unlock(ptep, ptl);
local_irq_restore(flags);
-   return addr;
 }
 
 /**
@@ -925,11 +924,11 @@ static void *__text_poke(void *addr, const void *opcode, 
size_t len)
  * by registering a module notifier, and ordering module removal and patching
  * trough a mutex.
  */
-void *text_poke(void *addr, const void *opcode, size_t len)
+void text_poke(void *addr, const void *opcode, size_t len)
 {
lockdep_assert_held(_mutex);
 
-   return __text_poke(addr, opcode, len);
+   __text_poke(addr, opcode, len);
 }
 
 /**
@@ -946,9 +945,9 @@ void *text_poke(void *addr, const void *opcode, size_t len)
  * Context: should only be used by kgdb, which ensures no other core is 
running,
  * despite the fact it does not hold the text_mutex.
  */
-void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
+void text_poke_kgdb(void *addr, const void *opcode, size_t len)
 {
-   return __text_poke(addr, opcode, len);
+   __text_poke(addr, opcode, len);
 }
 
 static void do_sync_core(void *info)
-- 
2.20.1




[qemu-mainline test] 149487: regressions - FAIL

2020-04-07 Thread osstest service owner
flight 149487 qemu-mainline real [real]
http://logs.test-lab.xenproject.org/osstest/logs/149487/

Regressions :-(

Tests which did not succeed and are blocking,
including tests which could not be run:
 test-amd64-amd64-xl-qemuu-debianhvm-amd64 10 debian-hvm-install fail REGR. vs. 
144861
 test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 10 debian-hvm-install fail 
REGR. vs. 144861
 test-amd64-amd64-xl-qemuu-win7-amd64 10 windows-install  fail REGR. vs. 144861
 test-amd64-i386-xl-qemuu-debianhvm-i386-xsm 10 debian-hvm-install fail REGR. 
vs. 144861
 test-amd64-i386-qemuu-rhel6hvm-intel 10 redhat-install   fail REGR. vs. 144861
 test-amd64-i386-xl-qemuu-debianhvm-amd64 10 debian-hvm-install fail REGR. vs. 
144861
 test-amd64-i386-libvirt-qemuu-debianhvm-amd64-xsm 10 debian-hvm-install fail 
REGR. vs. 144861
 test-amd64-i386-xl-qemuu-ws16-amd64 10 windows-install   fail REGR. vs. 144861
 test-amd64-amd64-xl-qemuu-debianhvm-i386-xsm 10 debian-hvm-install fail REGR. 
vs. 144861
 test-amd64-amd64-xl-qemuu-ws16-amd64 10 windows-install  fail REGR. vs. 144861
 test-amd64-amd64-xl-qemuu-ovmf-amd64 10 debian-hvm-install fail REGR. vs. 
144861
 test-amd64-amd64-qemuu-nested-amd 10 debian-hvm-install  fail REGR. vs. 144861
 test-amd64-i386-qemuu-rhel6hvm-amd 10 redhat-install fail REGR. vs. 144861
 test-amd64-amd64-libvirt 12 guest-start  fail REGR. vs. 144861
 test-amd64-i386-libvirt-pair 21 guest-start/debian   fail REGR. vs. 144861
 test-amd64-amd64-libvirt-xsm 12 guest-start  fail REGR. vs. 144861
 test-amd64-i386-libvirt  12 guest-start  fail REGR. vs. 144861
 test-amd64-i386-xl-qemuu-debianhvm-amd64-shadow 10 debian-hvm-install fail 
REGR. vs. 144861
 test-amd64-amd64-libvirt-pair 21 guest-start/debian  fail REGR. vs. 144861
 test-amd64-amd64-qemuu-nested-intel 10 debian-hvm-install fail REGR. vs. 144861
 test-amd64-amd64-xl-qemuu-debianhvm-amd64-shadow 10 debian-hvm-install fail 
REGR. vs. 144861
 test-amd64-i386-xl-qemuu-ovmf-amd64 10 debian-hvm-install fail REGR. vs. 144861
 test-amd64-i386-libvirt-xsm  12 guest-start  fail REGR. vs. 144861
 test-armhf-armhf-libvirt 12 guest-start  fail REGR. vs. 144861
 test-arm64-arm64-libvirt-xsm 12 guest-start  fail REGR. vs. 144861
 test-amd64-amd64-libvirt-vhd 10 debian-di-installfail REGR. vs. 144861
 test-amd64-amd64-xl-qcow210 debian-di-installfail REGR. vs. 144861
 test-armhf-armhf-xl-vhd  10 debian-di-installfail REGR. vs. 144861
 test-armhf-armhf-libvirt-raw 10 debian-di-installfail REGR. vs. 144861
 test-amd64-i386-xl-qemuu-win7-amd64 10 windows-install   fail REGR. vs. 144861

Tests which are failing intermittently (not blocking):
 test-amd64-amd64-xl-rtds   16 guest-localmigrate fail in 149475 pass in 149487
 test-armhf-armhf-xl-rtds 16 guest-start/debian.repeat  fail pass in 149475

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-dom0pvh-xl-intel 18 guest-localmigrate/x10 fail in 149475 
baseline untested
 test-amd64-amd64-xl-rtds 18 guest-localmigrate/x10   fail  like 144861
 test-amd64-i386-xl-pvshim12 guest-start  fail   never pass
 test-arm64-arm64-xl-seattle  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-multivcpu 13 migrate-support-checkfail  never pass
 test-armhf-armhf-xl-multivcpu 14 saverestore-support-checkfail  never pass
 test-armhf-armhf-xl  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 14 saverestore-support-checkfail   never pass
 

[linux-linus test] 149480: regressions - FAIL

2020-04-07 Thread osstest service owner
flight 149480 linux-linus real [real]
http://logs.test-lab.xenproject.org/osstest/logs/149480/

Regressions :-(

Tests which did not succeed and are blocking,
including tests which could not be run:
 test-amd64-i386-xl-pvshim 7 xen-boot fail REGR. vs. 149238
 test-amd64-i386-xl-qemuu-ws16-amd64  7 xen-boot  fail REGR. vs. 149238
 test-amd64-i386-examine   8 reboot   fail REGR. vs. 149238
 test-amd64-i386-freebsd10-amd64  7 xen-boot  fail REGR. vs. 149238
 test-amd64-i386-xl-qemut-debianhvm-amd64  7 xen-boot fail REGR. vs. 149238
 test-amd64-i386-xl-qemuu-ovmf-amd64  7 xen-boot  fail REGR. vs. 149238
 test-amd64-i386-xl-shadow 7 xen-boot fail REGR. vs. 149238
 test-amd64-i386-xl-qemuu-debianhvm-i386-xsm  7 xen-boot  fail REGR. vs. 149238
 test-amd64-i386-libvirt-pair 10 xen-boot/src_hostfail REGR. vs. 149238
 test-amd64-i386-libvirt-pair 11 xen-boot/dst_hostfail REGR. vs. 149238
 test-amd64-i386-xl-qemuu-debianhvm-amd64  7 xen-boot fail REGR. vs. 149238
 test-amd64-i386-qemut-rhel6hvm-intel  7 xen-boot fail REGR. vs. 149238
 test-amd64-i386-xl-qemut-ws16-amd64  7 xen-boot  fail REGR. vs. 149238
 test-amd64-i386-xl-qemuu-debianhvm-amd64-shadow 7 xen-boot fail REGR. vs. 
149238
 test-amd64-i386-libvirt-xsm   7 xen-boot fail REGR. vs. 149238
 test-amd64-i386-libvirt   7 xen-boot fail REGR. vs. 149238
 test-amd64-i386-qemuu-rhel6hvm-amd  7 xen-boot   fail REGR. vs. 149238
 test-amd64-i386-xl-qemuu-dmrestrict-amd64-dmrestrict 7 xen-boot fail REGR. vs. 
149238
 test-amd64-i386-xl-qemut-debianhvm-i386-xsm  7 xen-boot  fail REGR. vs. 149238
 test-amd64-i386-xl-qemut-stubdom-debianhvm-amd64-xsm 7 xen-boot fail REGR. vs. 
149238
 test-amd64-i386-xl7 xen-boot fail REGR. vs. 149238
 test-amd64-i386-xl-raw7 xen-boot fail REGR. vs. 149238
 test-amd64-i386-qemut-rhel6hvm-amd  7 xen-boot   fail REGR. vs. 149238
 test-amd64-i386-libvirt-qemuu-debianhvm-amd64-xsm 7 xen-boot fail REGR. vs. 
149238
 test-amd64-i386-xl-xsm7 xen-boot fail REGR. vs. 149238
 test-amd64-i386-qemuu-rhel6hvm-intel  7 xen-boot fail REGR. vs. 149238
 test-amd64-i386-freebsd10-i386  7 xen-boot   fail REGR. vs. 149238
 test-amd64-i386-xl-qemuu-win7-amd64  7 xen-boot  fail REGR. vs. 149238
 test-amd64-i386-xl-qemut-win7-amd64  7 xen-boot  fail REGR. vs. 149238
 test-amd64-i386-pair 10 xen-boot/src_hostfail REGR. vs. 149238
 test-amd64-i386-pair 11 xen-boot/dst_hostfail REGR. vs. 149238

Regressions which are regarded as allowable (not blocking):
 test-amd64-amd64-xl-rtds 16 guest-localmigrate   fail REGR. vs. 149238
 test-armhf-armhf-xl-rtds 12 guest-start  fail REGR. vs. 149238

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-dom0pvh-xl-intel 15 guest-saverestorefail like 149238
 test-amd64-amd64-xl-qemut-win7-amd64 17 guest-stopfail like 149238
 test-amd64-amd64-xl-qemuu-win7-amd64 17 guest-stopfail like 149238
 test-amd64-amd64-xl-qemut-ws16-amd64 17 guest-stopfail like 149238
 test-armhf-armhf-libvirt 14 saverestore-support-checkfail  like 149238
 test-armhf-armhf-libvirt-raw 13 saverestore-support-checkfail  like 149238
 test-amd64-amd64-xl-qemuu-ws16-amd64 17 guest-stopfail like 149238
 test-amd64-amd64-libvirt-xsm 13 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt 13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  14 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 11 migrate-support-check 
fail never pass
 test-arm64-arm64-xl-credit2  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 13 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  14 saverestore-support-checkfail   never pass
 test-amd64-amd64-qemuu-nested-amd 17 debian-hvm-install/l1/l2  fail never pass
 

[ovmf test] 149497: all pass - PUSHED

2020-04-07 Thread osstest service owner
flight 149497 ovmf real [real]
http://logs.test-lab.xenproject.org/osstest/logs/149497/

Perfect :-)
All tests in this flight passed as required
version targeted for testing:
 ovmf 9bb1f080c45f7253f9270662d55865a8718cebc8
baseline version:
 ovmf aab6a9c9aebb1f6614e72e98bdf6b5af93a43527

Last test of basis   149485  2020-04-07 08:40:39 Z0 days
Testing same since   149497  2020-04-07 19:10:25 Z0 days1 attempts


People who touched revisions under test:
  Michael Kubacki 
  Sean Brogan 
  Shenglei Zhang 

jobs:
 build-amd64-xsm  pass
 build-i386-xsm   pass
 build-amd64  pass
 build-i386   pass
 build-amd64-libvirt  pass
 build-i386-libvirt   pass
 build-amd64-pvopspass
 build-i386-pvops pass
 test-amd64-amd64-xl-qemuu-ovmf-amd64 pass
 test-amd64-i386-xl-qemuu-ovmf-amd64  pass



sg-report-flight on osstest.test-lab.xenproject.org
logs: /home/logs/logs
images: /home/logs/images

Logs, config files, etc. are available at
http://logs.test-lab.xenproject.org/osstest/logs

Explanation of these reports, and of osstest in general, is at
http://xenbits.xen.org/gitweb/?p=osstest.git;a=blob;f=README.email;hb=master
http://xenbits.xen.org/gitweb/?p=osstest.git;a=blob;f=README;hb=master

Test harness code can be found at
http://xenbits.xen.org/gitweb?p=osstest.git;a=summary


Pushing revision :

To xenbits.xen.org:/home/xen/git/osstest/ovmf.git
   aab6a9c9ae..9bb1f080c4  9bb1f080c45f7253f9270662d55865a8718cebc8 -> 
xen-tested-master



[xen-unstable-smoke test] 149499: tolerable all pass - PUSHED

2020-04-07 Thread osstest service owner
flight 149499 xen-unstable-smoke real [real]
http://logs.test-lab.xenproject.org/osstest/logs/149499/

Failures :-/ but no regressions.

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-libvirt 13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  14 saverestore-support-checkfail   never pass

version targeted for testing:
 xen  e013e8514389b739153016349e49f5a78e34ddf0
baseline version:
 xen  990b6e38d93c6e60f9d81e8b71ddfd209fca00bd

Last test of basis   149401  2020-04-03 20:00:45 Z4 days
Testing same since   149499  2020-04-07 21:00:41 Z0 days1 attempts


People who touched revisions under test:
  Juergen Gross 
  Wei Liu 

jobs:
 build-arm64-xsm  pass
 build-amd64  pass
 build-armhf  pass
 build-amd64-libvirt  pass
 test-armhf-armhf-xl  pass
 test-arm64-arm64-xl-xsm  pass
 test-amd64-amd64-xl-qemuu-debianhvm-amd64pass
 test-amd64-amd64-libvirt pass



sg-report-flight on osstest.test-lab.xenproject.org
logs: /home/logs/logs
images: /home/logs/images

Logs, config files, etc. are available at
http://logs.test-lab.xenproject.org/osstest/logs

Explanation of these reports, and of osstest in general, is at
http://xenbits.xen.org/gitweb/?p=osstest.git;a=blob;f=README.email;hb=master
http://xenbits.xen.org/gitweb/?p=osstest.git;a=blob;f=README;hb=master

Test harness code can be found at
http://xenbits.xen.org/gitweb?p=osstest.git;a=summary


Pushing revision :

To xenbits.xen.org:/home/xen/git/xen.git
   990b6e38d9..e013e85143  e013e8514389b739153016349e49f5a78e34ddf0 -> smoke



Re: [XEN PATCH] libxc/migration: Abort migration on precopy policy request

2020-04-07 Thread Andrew Cooper
On 07/04/2020 23:06, Panyakin, Andrew wrote:
> On 4/7/20 10:22 PM, Wei Liu wrote:
>>> +PERROR("Abort precopy loop");
>>> +rc = -1;
>>> +goto out;
>> There is no need to have "goto out" here.
> I was considering two more examples of "goto out" in a branch right before 
> the label:
> - send_domain_memory_nonlive,
> - send_domain_memory_live.
>
> Isn't it done this way to simplify the function extension: you won't need to 
> add "goto out" to previous branch when adding new code?

I'd recommend leaving the "goto out" like this.  Less effort for the next
person editing this code to think about.

>> These can be fixed easily while committing, so no need to resend yet. I
>> will give other people a chance to comment.

None of the copy policy was done well.  If Amazon have a usecase then
let's put it in.  (Talking of - I wonder why XenServer's usecase hasn't
tripped over this...  This was put in to help negotiate two live
streams at once, but this is an error case which surely ought to trigger.)

~Andrew



Re: [PULL 0/3] xen queue for 5.0

2020-04-07 Thread Peter Maydell
On Tue, 7 Apr 2020 at 16:22, Anthony PERARD  wrote:
>
> The following changes since commit 8f0d25c464a1989d606f7b988d07b1147dfcde33:
>
>   Merge remote-tracking branch 
> 'remotes/philmd-gitlab/tags/acceptance-fixes-20200407' into staging 
> (2020-04-07 15:10:11 +0100)
>
> are available in the Git repository at:
>
>   https://xenbits.xen.org/git-http/people/aperard/qemu-dm.git 
> tags/pull-xen-20200407
>
> for you to fetch changes up to 758af9cfabfb000eb00e42b9738e655b18fdd812:
>
>   MAINTAINERS: Add xen-usb.c to Xen section (2020-04-07 16:13:26 +0100)
>
> 
> Xen queue for QEMU 5.0
>
> - Fix for xen-block.
> - A fix for a Coverity false positive in xen-usb.
> - Update MAINTAINERS to add xen-usb.c to Xen section.
>

Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/5.0
for any user-visible changes.

-- PMM



Re: [XEN PATCH] libxc/migration: Abort migration on precopy policy request

2020-04-07 Thread Wei Liu
On Tue, Apr 07, 2020 at 02:52:22PM +, Andrew Panyakin wrote:
> libxc defines XGS_POLICY_ABORT for precopy policy to signal that migration
> should be aborted (eg. if the estimated pause time is too huge for the
> instance). Default simple precopy policy never returns that, but it could be
> overridden with a custom one.
> 

Right. I think this is a real problem.

> Signed-off-by: Andrew Panyakin 
> ---
>  tools/libxc/xc_sr_save.c | 6 ++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/tools/libxc/xc_sr_save.c b/tools/libxc/xc_sr_save.c
> index fa736a311f..507274ce22 100644
> --- a/tools/libxc/xc_sr_save.c
> +++ b/tools/libxc/xc_sr_save.c
> @@ -560,6 +560,12 @@ static int send_memory_live(struct xc_sr_context *ctx)
>  
>  }
>  
> +if ( policy_decision == XGS_POLICY_ABORT ) {

The { should be on a new line.

> +PERROR("Abort precopy loop");
> +rc = -1;
> +goto out;

There is no need to have "goto out" here.

These can be fixed easily while committing, so no need to resend yet. I
will give other people a chance to comment.

Wei.

> +}
> +
>   out:
>  xc_set_progress_prefix(xch, NULL);
>  free(progress_str);
> -- 
> 2.16.6
> 
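
For illustration only (not part of this patch): a caller-supplied precopy
policy that actually returns XGS_POLICY_ABORT could look roughly like the
sketch below. The struct precopy_stats field names and the pass-by-value
callback signature are assumptions, not something taken from this series.

#include <xenguest.h>

/* Hedged sketch: abort the migration if the guest is still dirtying
 * memory heavily after several precopy iterations, rather than pay an
 * unbounded stop-and-copy pause.  Field names are assumed. */
static int my_precopy_policy(struct precopy_stats stats, void *data)
{
    if ( stats.iteration >= 5 && stats.dirty_count > 100000 )
        return XGS_POLICY_ABORT;           /* estimated pause too long */

    if ( stats.iteration >= 5 )
        return XGS_POLICY_STOP_AND_COPY;   /* stop iterating, suspend now */

    return XGS_POLICY_CONTINUE_PRECOPY;    /* keep doing live iterations */
}

Such a callback would be installed by the caller of xc_domain_save() through
the save callbacks (the "custom one" the commit message mentions); the patch
above is what makes the ABORT return value actually terminate the precopy
loop.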



Re: [PATCH] config: use mini-os master for unstable

2020-04-07 Thread Wei Liu
On Tue, Apr 07, 2020 at 03:48:31PM +0200, Juergen Gross wrote:
> We haven't used mini-os master for about 2 years now due to a stubdom
> test failing [1]. Booting a guest with mini-os master used for building
> stubdom didn't reveal any problem, so use master for unstable in order
> to let OSStest find any problems not showing up in the local test.
> 
> [1]: https://lists.xen.org/archives/html/minios-devel/2018-04/msg00015.html
> 
> Signed-off-by: Juergen Gross 

Acked-by: Wei Liu 

And applied.



Re: [PATCH v2] xen/arm: implement GICD_I[S/C]ACTIVER reads

2020-04-07 Thread Julien Grall

Hi George,

On 07/04/2020 19:16, George Dunlap wrote:




On Apr 7, 2020, at 5:50 PM, Julien Grall  wrote:



On 07/04/2020 17:16, George Dunlap wrote:

On Apr 6, 2020, at 7:47 PM, Julien Grall  wrote:

On 06/04/2020 18:58, George Dunlap wrote:

On Apr 3, 2020, at 9:27 PM, Julien Grall  wrote:

On Fri, 3 Apr 2020 at 20:41, Stefano Stabellini  wrote:


On Thu, 2 Apr 2020, Julien Grall wrote:

As we discussed on Tuesday, the cost for other vCPUs is only going to be a
trap to the hypervisor and then back again. The cost is likely smaller than
receiving and forwarding an interrupt.

You actually agreed on this analysis. So can you enlighten me as to why
receiving an interrupt is a not problem for latency but this is?


My answer was that the difference is that an operating system can
disable interrupts, but it cannot disable receiving this special IPI.


An OS can *only* disable its own interrupts, yet interrupts will still
be received by Xen even if the interrupts are masked at the processor
(e.g local_irq_disable()).

You would need to disable interrupts one by one as the GIC level (use
ICENABLER) in order to not receive any interrupts. Yet, Xen may still
receive interrupts for operational purposes (e.g serial, console,
maintainance IRQ...). So trap will happen.

I think Stefano’s assertion is that the users he has in mind will be 
configuring the system such that RT workloads get a minimum number of 
hypervisor-related interrupts possible.  On a 4-core system, you could  have 
non-RT workloads running on cores 0-1, and RT workloads running with the NULL 
scheduler on cores 2-3.  In such a system, you’d obviously arrange that serial 
and maintenance IRQs are delivered to cores 0-1.

Well maintenance IRQs are per pCPU so you can't route to another one...

But, I think you missed my point that local_irq_disable() from the guest will 
not prevent the hypervisor to receive interrupts *even* the one routed to the 
vCPU itself. They will just not be delivered to the guest context until 
local_irq_enable() is called.

My understanding, from Stefano was that what his customers are concerned about 
is the time between the time a physical IRQ is delivered to the guest and the 
time the guest OS can respond appropriately.  The key thing here isn’t 
necessarily speed, but predictability — system designers need to know that, 
with a high probability, their interrupt routines will complete within X amount 
of cycles.
Further interrupts delivered to a guest are not a problem in this scenario, if 
the guest can disable them until the critical IRQ has been handled.


You keep saying a guest can disable interrupts, but it can't do it via 
local_irq_disable(). So what method are you thinking? Disabling at the GIC 
level? That is involving traps and most likely not going to help with 
predictability...


So you’ll have to forgive me for making educated guesses here, as I’m trying to 
collect all the information.  On x86, if you use device pass-through on a 
system with a virtualized APIC and posted interrupts, then when when the device 
generates interrupts, those are delivered directly to the guest without 
involvement of Xen; and when the guest disables interrupts in the vAPIC, those 
interrupts will be disabled, and be delivered when the guest re-enables 
interrupts.  Given what Stefano said about disabling interrupts, I assumed that 
ARM had the same sort of functionality.  Is that not the case?


Posted interrupts are only present in GICv4 and onwards. GICv4 only 
supports direct injections for LPIs (e.g MSIs) and GICv4.1 added support 
for direct injection of SGIs (aka IPIs).


Xen on Arm does not support any posted interrupts at all and, I don't 
believe Stefano has HW (at least in production) using GICv4.





Xen-related IPIs, however, could potentially cause a problem if not mitigated. 
Consider a guest where vcpu 1 loops over the register, while vcpu 2 is handling 
a latency-critical IRQ.  A naive implementation might send an IPI every time 
vcpu 1 does a read, spamming vcpu 2 with dozens of IPIs.  Then an IRQ routine 
which reliably finishes well within the required time normally suddenly 
overruns and causes an issue.


I never suggested the naive implementation would be perfect. That's why I said 
it can be optimized...


It didn’t seem to me that you understood what Stefano’s concerns were; so I was 
trying to explain the situation he is trying to avoid (as well as making sure 
that I had a clear understanding myself).  The reason I said “a naive 
implementation” was to make clear that I knew that’s not what you were 
suggesting. :-)


I know Stefano's concerns, sorry it was not clear enough :).




I don’t know what maintenance IRQs are, but if they only happen intermittently, 
it’s possible that you’d never get more than a single one in a latency-critical 
IRQ routine; and as such, the variatibility in execution time (jitter) wouldn’t 
be an issue in practice.  But every time you add a new unblockable IPI, 

[ovmf test] 149485: all pass - PUSHED

2020-04-07 Thread osstest service owner
flight 149485 ovmf real [real]
http://logs.test-lab.xenproject.org/osstest/logs/149485/

Perfect :-)
All tests in this flight passed as required
version targeted for testing:
 ovmf aab6a9c9aebb1f6614e72e98bdf6b5af93a43527
baseline version:
 ovmf 48f0e94921d83b8204f1025412e071b000394f04

Last test of basis   149477  2020-04-07 01:39:26 Z0 days
Testing same since   149485  2020-04-07 08:40:39 Z0 days1 attempts


People who touched revisions under test:
  Ard Biesheuvel 
  Jiewen Yao 
  Liming Gao 

jobs:
 build-amd64-xsm  pass
 build-i386-xsm   pass
 build-amd64  pass
 build-i386   pass
 build-amd64-libvirt  pass
 build-i386-libvirt   pass
 build-amd64-pvopspass
 build-i386-pvops pass
 test-amd64-amd64-xl-qemuu-ovmf-amd64 pass
 test-amd64-i386-xl-qemuu-ovmf-amd64  pass



sg-report-flight on osstest.test-lab.xenproject.org
logs: /home/logs/logs
images: /home/logs/images

Logs, config files, etc. are available at
http://logs.test-lab.xenproject.org/osstest/logs

Explanation of these reports, and of osstest in general, is at
http://xenbits.xen.org/gitweb/?p=osstest.git;a=blob;f=README.email;hb=master
http://xenbits.xen.org/gitweb/?p=osstest.git;a=blob;f=README;hb=master

Test harness code can be found at
http://xenbits.xen.org/gitweb?p=osstest.git;a=summary


Pushing revision :

To xenbits.xen.org:/home/xen/git/osstest/ovmf.git
   48f0e94921..aab6a9c9ae  aab6a9c9aebb1f6614e72e98bdf6b5af93a43527 -> 
xen-tested-master



Re: [PATCH v2] xen/arm: implement GICD_I[S/C]ACTIVER reads

2020-04-07 Thread George Dunlap


> On Apr 7, 2020, at 5:50 PM, Julien Grall  wrote:
> 
> 
> 
> On 07/04/2020 17:16, George Dunlap wrote:
>>> On Apr 6, 2020, at 7:47 PM, Julien Grall  wrote:
>>> 
>>> On 06/04/2020 18:58, George Dunlap wrote:
> On Apr 3, 2020, at 9:27 PM, Julien Grall  
> wrote:
> 
> On Fri, 3 Apr 2020 at 20:41, Stefano Stabellini  
> wrote:
>> 
>> On Thu, 2 Apr 2020, Julien Grall wrote:
>>> As we discussed on Tuesday, the cost for other vCPUs is only going to 
>>> be a
>>> trap to the hypervisor and then back again. The cost is likely smaller 
>>> than
>>> receiving and forwarding an interrupt.
>>> 
>>> You actually agreed on this analysis. So can you enlighten me as to why
>>> receiving an interrupt is a not problem for latency but this is?
>> 
>> My answer was that the difference is that an operating system can
>> disable interrupts, but it cannot disable receiving this special IPI.
> 
> An OS can *only* disable its own interrupts, yet interrupts will still
> be received by Xen even if the interrupts are masked at the processor
> (e.g local_irq_disable()).
> 
> You would need to disable interrupts one by one as the GIC level (use
> ICENABLER) in order to not receive any interrupts. Yet, Xen may still
> receive interrupts for operational purposes (e.g serial, console,
> maintainance IRQ...). So trap will happen.
 I think Stefano’s assertion is that the users he has in mind will be 
 configuring the system such that RT workloads get a minimum number of 
 hypervisor-related interrupts possible.  On a 4-core system, you could  
 have non-RT workloads running on cores 0-1, and RT workloads running with 
 the NULL scheduler on cores 2-3.  In such a system, you’d obviously 
 arrange that serial and maintenance IRQs are delivered to cores 0-1.
>>> Well maintenance IRQs are per pCPU so you can't route to another one...
>>> 
>>> But, I think you missed my point that local_irq_disable() from the guest 
>>> will not prevent the hypervisor to receive interrupts *even* the one routed 
>>> to the vCPU itself. They will just not be delivered to the guest context 
>>> until local_irq_enable() is called.
>> My understanding, from Stefano was that what his customers are concerned 
>> about is the time between the time a physical IRQ is delivered to the guest 
>> and the time the guest OS can respond appropriately.  The key thing here 
>> isn’t necessarily speed, but predictability — system designers need to know 
>> that, with a high probability, their interrupt routines will complete within 
>> X amount of cycles.
>> Further interrupts delivered to a guest are not a problem in this scenario, 
>> if the guest can disable them until the critical IRQ has been handled.
> 
> You keep saying a guest can disable interrupts, but it can't do it via 
> local_irq_disable(). So what method are you thinking? Disabling at the GIC 
> level? That is involving traps and most likely not going to help with 
> predictability...

So you’ll have to forgive me for making educated guesses here, as I’m trying to 
collect all the information.  On x86, if you use device pass-through on a 
system with a virtualized APIC and posted interrupts, then when when the device 
generates interrupts, those are delivered directly to the guest without 
involvement of Xen; and when the guest disables interrupts in the vAPIC, those 
interrupts will be disabled, and be delivered when the guest re-enables 
interrupts.  Given what Stefano said about disabling interrupts, I assumed that 
ARM had the same sort of functionality.  Is that not the case?

>> Xen-related IPIs, however, could potentially cause a problem if not 
>> mitigated. Consider a guest where vcpu 1 loops over the register, while vcpu 
>> 2 is handling a latency-critical IRQ.  A naive implementation might send an 
>> IPI every time vcpu 1 does a read, spamming vcpu 2 with dozens of IPIs.  
>> Then an IRQ routine which reliably finishes well within the required time 
>> normally suddenly overruns and causes an issue.
> 
> I never suggested the naive implementation would be perfect. That's why I 
> said it can be optimized...

It didn’t seem to me that you understood what Stefano’s concerns were; so I was 
trying to explain the situation he is trying to avoid (as well as making sure 
that I had a clear understanding myself).  The reason I said “a naive 
implementation” was to make clear that I knew that’s not what you were 
suggesting. :-)

>> I don’t know what maintenance IRQs are, but if they only happen 
>> intermittently, it’s possible that you’d never get more than a single one in 
>> a latency-critical IRQ routine; and as such, the variatibility in execution 
>> time (jitter) wouldn’t be an issue in practice.  But every time you add a 
>> new unblockable IPI, you increase this jitter; particularly if this 
>> unblockable IPI might be repeated an arbitrary number of times.
>> 

[PATCH v2 5/5] tools/libxc: make use of domain context SHARED_INFO record...

2020-04-07 Thread Paul Durrant
... in the save/restore code.

This patch replaces direct mapping of the shared_info_frame (retrieved
using XEN_DOMCTL_getdomaininfo) with save/load of the domain context
SHARED_INFO record.

No modifications are made to the definition of the migration stream at
this point. Subsequent patches will define a record in the libxc domain
image format for passing domain context and convert the save/restore code
to use that.

Signed-off-by: Paul Durrant 
---
Cc: Ian Jackson 
Cc: Wei Liu 

v2:
 - Re-based (now making use of DOMAIN_SAVE_FLAG_IGNORE)
---
 tools/libxc/xc_sr_common.h |  7 +++-
 tools/libxc/xc_sr_common_x86.c | 59 ++
 tools/libxc/xc_sr_common_x86.h |  4 ++
 tools/libxc/xc_sr_common_x86_pv.c  | 53 +++
 tools/libxc/xc_sr_common_x86_pv.h  |  3 ++
 tools/libxc/xc_sr_restore_x86_pv.c | 40 
 tools/libxc/xc_sr_save_x86_pv.c| 26 ++---
 tools/libxc/xg_save_restore.h  |  1 +
 8 files changed, 144 insertions(+), 49 deletions(-)

diff --git a/tools/libxc/xc_sr_common.h b/tools/libxc/xc_sr_common.h
index 5dd51ccb15..db6519cdcc 100644
--- a/tools/libxc/xc_sr_common.h
+++ b/tools/libxc/xc_sr_common.h
@@ -287,6 +287,11 @@ struct xc_sr_context
 {
 struct /* x86 */
 {
+struct {
+void *buffer;
+unsigned int len;
+} domain_context;
+
 struct /* x86 PV guest. */
 {
 /* 4 or 8; 32 or 64 bit domain */
@@ -314,7 +319,7 @@ struct xc_sr_context
 /* The guest pfns containing the p2m leaves */
 xen_pfn_t *p2m_pfns;
 
-/* Read-only mapping of guests shared info page */
+/* Pointer to shared_info (located in context buffer) */
 shared_info_any_t *shinfo;
 
 /* p2m generation count for verifying validity of local p2m. */
diff --git a/tools/libxc/xc_sr_common_x86.c b/tools/libxc/xc_sr_common_x86.c
index 011684df97..e87dc0f0f3 100644
--- a/tools/libxc/xc_sr_common_x86.c
+++ b/tools/libxc/xc_sr_common_x86.c
@@ -42,6 +42,65 @@ int handle_x86_tsc_info(struct xc_sr_context *ctx, struct 
xc_sr_record *rec)
 return 0;
 }
 
+int x86_get_context(struct xc_sr_context *ctx)
+{
+xc_interface *xch = ctx->xch;
+size_t len = 0;
+int rc;
+
+if ( ctx->x86.domain_context.buffer )
+{
+ERROR("Domain context already present");
+return -1;
+}
+
+rc = xc_domain_getcontext(xch, ctx->domid, NULL, &len);
+if ( rc < 0 )
+{
+PERROR("Unable to get size of domain context");
+return -1;
+}
+
+ctx->x86.domain_context.buffer = malloc(len);
+if ( ctx->x86.domain_context.buffer == NULL )
+{
+PERROR("Unable to allocate memory for domain context");
+return -1;
+}
+
+rc = xc_domain_getcontext(xch, ctx->domid,
+  ctx->x86.domain_context.buffer, &len);
+if ( rc < 0 )
+{
+PERROR("Unable to get domain context");
+return -1;
+}
+
+ctx->x86.domain_context.len = len;
+
+return 0;
+}
+
+int x86_set_context(struct xc_sr_context *ctx)
+{
+xc_interface *xch = ctx->xch;
+
+if ( !ctx->x86.domain_context.buffer )
+{
+ERROR("Domain context not present");
+return -1;
+}
+
+return xc_domain_setcontext(xch, ctx->domid,
+ctx->x86.domain_context.buffer,
+ctx->x86.domain_context.len);
+}
+
+void x86_cleanup(struct xc_sr_context *ctx)
+{
+free(ctx->x86.domain_context.buffer);
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/tools/libxc/xc_sr_common_x86.h b/tools/libxc/xc_sr_common_x86.h
index ebc4355bd1..501c9e52ba 100644
--- a/tools/libxc/xc_sr_common_x86.h
+++ b/tools/libxc/xc_sr_common_x86.h
@@ -14,6 +14,10 @@ int write_x86_tsc_info(struct xc_sr_context *ctx);
  */
 int handle_x86_tsc_info(struct xc_sr_context *ctx, struct xc_sr_record *rec);
 
+int x86_get_context(struct xc_sr_context *ctx);
+int x86_set_context(struct xc_sr_context *ctx);
+void x86_cleanup(struct xc_sr_context *ctx);
+
 #endif
 /*
  * Local variables:
diff --git a/tools/libxc/xc_sr_common_x86_pv.c 
b/tools/libxc/xc_sr_common_x86_pv.c
index d3d425cb82..7354fd6052 100644
--- a/tools/libxc/xc_sr_common_x86_pv.c
+++ b/tools/libxc/xc_sr_common_x86_pv.c
@@ -182,6 +182,59 @@ int x86_pv_map_m2p(struct xc_sr_context *ctx)
 return rc;
 }
 
+int x86_pv_get_shinfo(struct xc_sr_context *ctx)
+{
+unsigned int off = 0;
+struct domain_save_descriptor *desc;
+int rc;
+
+rc = x86_get_context(ctx);
+if ( rc )
+return rc;
+
+do {
+if ( ctx->x86.domain_context.len - off < sizeof(*desc) )
+return -1;
+
+desc = ctx->x86.domain_context.buffer + off;
+off += sizeof(*desc);
+
+switch (desc->typecode)
+{
+case DOMAIN_SAVE_CODE(SHARED_INFO):
+{
+

[PATCH v2 2/5] xen/common/domctl: introduce XEN_DOMCTL_get/setdomaincontext

2020-04-07 Thread Paul Durrant
These domctls provide a mechanism to get and set domain context from
the toolstack.

Signed-off-by: Paul Durrant 
---
Cc: Daniel De Graaf 
Cc: Ian Jackson 
Cc: Wei Liu 
Cc: Andrew Cooper 
Cc: George Dunlap 
Cc: Jan Beulich 
Cc: Julien Grall 
Cc: Stefano Stabellini 

v2:
 - drop mask parameter
 - const-ify some more buffers
---
 tools/flask/policy/modules/xen.if   |   4 +-
 tools/libxc/include/xenctrl.h   |   5 ++
 tools/libxc/xc_domain.c |  54 +
 xen/common/domctl.c | 117 
 xen/include/public/domctl.h |  44 ++-
 xen/xsm/flask/hooks.c   |   6 ++
 xen/xsm/flask/policy/access_vectors |   4 +
 7 files changed, 231 insertions(+), 3 deletions(-)

diff --git a/tools/flask/policy/modules/xen.if 
b/tools/flask/policy/modules/xen.if
index 8eb2293a52..2bc9db4f64 100644
--- a/tools/flask/policy/modules/xen.if
+++ b/tools/flask/policy/modules/xen.if
@@ -53,7 +53,7 @@ define(`create_domain_common', `
allow $1 $2:domain2 { set_cpu_policy settsc setscheduler setclaim
set_vnumainfo get_vnumainfo cacheflush
psr_cmt_op psr_alloc soft_reset
-   resource_map get_cpu_policy };
+   resource_map get_cpu_policy setcontext };
allow $1 $2:security check_context;
allow $1 $2:shadow enable;
allow $1 $2:mmu { map_read map_write adjust memorymap physmap pinpage 
mmuext_op updatemp };
@@ -97,7 +97,7 @@ define(`migrate_domain_out', `
allow $1 $2:hvm { gethvmc getparam };
allow $1 $2:mmu { stat pageinfo map_read };
allow $1 $2:domain { getaddrsize getvcpucontext pause destroy };
-   allow $1 $2:domain2 gettsc;
+   allow $1 $2:domain2 { gettsc getcontext };
allow $1 $2:shadow { enable disable logdirty };
 ')
 
diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index 58fa931de1..06ca8e9a74 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -867,6 +867,11 @@ int xc_domain_hvm_setcontext(xc_interface *xch,
  uint8_t *hvm_ctxt,
  uint32_t size);
 
+int xc_domain_getcontext(xc_interface *xch, uint32_t domid,
+ void *ctxt_buf, size_t *size);
+int xc_domain_setcontext(xc_interface *xch, uint32_t domid,
+ const void *ctxt_buf, size_t size);
+
 /**
  * This function will return guest IO ABI protocol
  *
diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c
index 71829c2bce..212d1489dd 100644
--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -537,6 +537,60 @@ int xc_domain_hvm_setcontext(xc_interface *xch,
 return ret;
 }
 
+int xc_domain_getcontext(xc_interface *xch, uint32_t domid,
+ void *ctxt_buf, size_t *size)
+{
+int ret;
+DECLARE_DOMCTL;
+DECLARE_HYPERCALL_BOUNCE(ctxt_buf, *size, XC_HYPERCALL_BUFFER_BOUNCE_OUT);
+
+if ( xc_hypercall_bounce_pre(xch, ctxt_buf) )
+return -1;
+
+domctl.cmd = XEN_DOMCTL_getdomaincontext;
+domctl.domain = domid;
+domctl.u.getdomaincontext.size = *size;
+set_xen_guest_handle(domctl.u.setdomaincontext.buffer, ctxt_buf);
+
+ret = do_domctl(xch, &domctl);
+
+xc_hypercall_bounce_post(xch, ctxt_buf);
+
+if ( ret )
+return ret;
+
+*size = domctl.u.getdomaincontext.size;
+if ( *size != domctl.u.getdomaincontext.size )
+{
+errno = EOVERFLOW;
+return -1;
+}
+
+return 0;
+}
+
+int xc_domain_setcontext(xc_interface *xch, uint32_t domid,
+ const void *ctxt_buf, size_t size)
+{
+int ret;
+DECLARE_DOMCTL;
+DECLARE_HYPERCALL_BOUNCE_IN(ctxt_buf, size);
+
+if ( xc_hypercall_bounce_pre(xch, ctxt_buf) )
+return -1;
+
+domctl.cmd = XEN_DOMCTL_setdomaincontext;
+domctl.domain = domid;
+domctl.u.setdomaincontext.size = size;
+set_xen_guest_handle(domctl.u.setdomaincontext.buffer, ctxt_buf);
+
+ret = do_domctl(xch, &domctl);
+
+xc_hypercall_bounce_post(xch, ctxt_buf);
+
+return ret;
+}
+
 int xc_vcpu_getcontext(xc_interface *xch,
uint32_t domid,
uint32_t vcpu,
diff --git a/xen/common/domctl.c b/xen/common/domctl.c
index a69b3b59a8..2e5c6a46d9 100644
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -358,6 +359,113 @@ static struct vnuma_info *vnuma_init(const struct 
xen_domctl_vnuma *uinfo,
 return ERR_PTR(ret);
 }
 
+struct domctl_context
+{
+void *buffer;
+size_t len;
+size_t cur;
+};
+
+static int accumulate_size(void *priv, const void *data, size_t len)
+{
+struct domctl_context *c = priv;
+
+if ( c->len + len < c->len )
+return -EOVERFLOW;
+
+c->len += len;
+
+return 0;
+}
+
+static int save_data(void *priv, const void *data, size_t 
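
As a usage note (not part of the posted patch): xc_domain_getcontext()
follows the usual libxc two-call pattern - query the required size with a
NULL buffer, allocate, then fetch - which is how xen-domctx and the libxc
save code elsewhere in this series call it. A minimal, illustrative caller,
with error handling abbreviated:

#include <stdlib.h>
#include <xenctrl.h>

/* Illustrative only: fetch a domain's context blob with the new
 * xc_domain_getcontext() wrapper.  Returns a malloc()ed buffer (caller
 * frees) and its length, or NULL on any failure. */
static void *fetch_domain_context(xc_interface *xch, uint32_t domid,
                                  size_t *len_out)
{
    size_t len = 0;
    void *buf;

    /* First call with a NULL buffer just reports the required size. */
    if ( xc_domain_getcontext(xch, domid, NULL, &len) < 0 )
        return NULL;

    buf = malloc(len);
    if ( !buf )
        return NULL;

    /* Second call copies the context records into the buffer. */
    if ( xc_domain_getcontext(xch, domid, buf, &len) < 0 )
    {
        free(buf);
        return NULL;
    }

    *len_out = len;
    return buf;
}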

[PATCH v2 4/5] common/domain: add a domain context record for shared_info...

2020-04-07 Thread Paul Durrant
... and update xen-domctx to dump some information describing the record.

Signed-off-by: Paul Durrant 
---
Cc: Ian Jackson 
Cc: Wei Liu 
Cc: Andrew Cooper 
Cc: George Dunlap 
Cc: Jan Beulich 
Cc: Julien Grall 
Cc: Stefano Stabellini 

v2:
 - Drop the header change to define a 'Xen' page size and instead use a
   variable length struct now that the framework makes this is feasible
 - Guard use of 'has_32bit_shinfo' in common code with CONFIG_COMPAT
---
 tools/misc/xen-domctx.c   | 11 ++
 xen/common/domain.c   | 81 +++
 xen/include/public/save.h | 10 -
 3 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/tools/misc/xen-domctx.c b/tools/misc/xen-domctx.c
index d663522a8b..a8d3922321 100644
--- a/tools/misc/xen-domctx.c
+++ b/tools/misc/xen-domctx.c
@@ -59,6 +59,16 @@ static void dump_header(struct domain_save_descriptor *desc)
 off += desc->length;
 }
 
+static void dump_shared_info(struct domain_save_descriptor *desc)
+{
+DOMAIN_SAVE_TYPE(SHARED_INFO) s;
+READ(s);
+printf("SHARED_INFO: field_width %u buffer size: %lu\n",
+   s.field_width, desc->length - sizeof(s));
+
+off += desc->length;
+}
+
 static void dump_end(struct domain_save_descriptor *desc)
 {
 DOMAIN_SAVE_TYPE(END) e;
@@ -125,6 +135,7 @@ int main(int argc, char **argv)
 switch (desc.typecode)
 {
 case DOMAIN_SAVE_CODE(HEADER): dump_header(&desc); break;
+case DOMAIN_SAVE_CODE(SHARED_INFO): dump_shared_info(&desc); break;
 case DOMAIN_SAVE_CODE(END): dump_end(&desc); return 0;
 default:
 printf("Unknown type %u: skipping\n", desc.typecode);
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 3dcd73f67c..8b72462e07 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -33,6 +33,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1646,6 +1647,86 @@ int continue_hypercall_on_cpu(
 return 0;
 }
 
+static int save_shared_info(const struct vcpu *v, struct domain_context *c,
+bool dry_run)
+{
+struct domain *d = v->domain;
+struct domain_shared_info_context ctxt = {};
+size_t hdr_size = offsetof(typeof(ctxt), buffer);
+size_t size = hdr_size + PAGE_SIZE;
+int rc;
+
+rc = DOMAIN_SAVE_BEGIN(SHARED_INFO, c, v, size);
+if ( rc )
+return rc;
+
+if ( !dry_run )
+ctxt.field_width =
+#ifdef CONFIG_COMPAT
+has_32bit_shinfo(d) ? 4 :
+#endif
+8;
+
+rc = domain_save_data(c, &ctxt, hdr_size);
+if ( rc )
+return rc;
+
+rc = domain_save_data(c, d->shared_info, PAGE_SIZE);
+if ( rc )
+return rc;
+
+return domain_save_end(c);
+}
+
+static int load_shared_info(struct vcpu *v, struct domain_context *c)
+{
+struct domain *d = v->domain;
+struct domain_shared_info_context ctxt = {};
+size_t hdr_size = offsetof(typeof(ctxt), buffer);
+size_t size = hdr_size + PAGE_SIZE;
+unsigned int i;
+int rc;
+
+rc = DOMAIN_LOAD_BEGIN(SHARED_INFO, c, v, size, true);
+if ( rc )
+return rc;
+
+rc = domain_load_data(c, &ctxt, hdr_size);
+if ( rc )
+return rc;
+
+for ( i = 0; i < ARRAY_SIZE(ctxt.pad); i++ )
+if ( ctxt.pad[i] )
+return -EINVAL;
+
+switch ( ctxt.field_width )
+{
+#ifdef CONFIG_COMPAT
+case 4:
+d->arch.has_32bit_shinfo = 1;
+break;
+#endif
+case 8:
+#ifdef CONFIG_COMPAT
+d->arch.has_32bit_shinfo = 0;
+#endif
+break;
+
+default:
+rc = -EINVAL;
+break;
+}
+
+rc = domain_load_data(c, d->shared_info, PAGE_SIZE);
+if ( rc )
+return rc;
+
+return domain_load_end(c);
+}
+
+DOMAIN_REGISTER_SAVE_RESTORE(SHARED_INFO, false, save_shared_info,
+ load_shared_info);
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/include/public/save.h b/xen/include/public/save.h
index 7e5f8752bd..ed994a8765 100644
--- a/xen/include/public/save.h
+++ b/xen/include/public/save.h
@@ -79,6 +79,14 @@ struct domain_save_header {
 };
 DECLARE_DOMAIN_SAVE_TYPE(HEADER, 1, struct domain_save_header);
 
-#define DOMAIN_SAVE_CODE_MAX 1
+struct domain_shared_info_context {
+uint8_t field_width;
+uint8_t pad[7];
+uint8_t buffer[]; /* Implementation specific size */
+};
+
+DECLARE_DOMAIN_SAVE_TYPE(SHARED_INFO, 2, struct domain_shared_info_context);
+
+#define DOMAIN_SAVE_CODE_MAX 2
 
 #endif /* __XEN_PUBLIC_SAVE_H__ */
-- 
2.20.1




[PATCH v2 1/5] xen/common: introduce a new framework for save/restore of 'domain' context

2020-04-07 Thread Paul Durrant
To allow enlightened HVM guests (i.e. those that have PV drivers) to be
migrated without their co-operation it will be necessary to transfer 'PV'
state such as event channel state, grant entry state, etc.

Currently there is a framework (entered via the hvm_save/load() functions)
that allows a domain's 'HVM' (architectural) state to be transferred but
'PV' state is also common with pure PV guests and so this framework is not
really suitable.

This patch adds the new public header and low level implementation of a new
common framework, entered via the domain_save/load() functions. Subsequent
patches will introduce other parts of the framework, and code that will
make use of it within the current version of the libxc migration stream.

This patch also marks the HVM-only framework as deprecated in favour of the
new framework.

Signed-off-by: Paul Durrant 
---
Cc: Andrew Cooper 
Cc: George Dunlap 
Cc: Ian Jackson 
Cc: Jan Beulich 
Cc: Julien Grall 
Cc: Stefano Stabellini 
Cc: Wei Liu 
Cc: Volodymyr Babchuk 
Cc: "Roger Pau Monné" 

v2:
 - Allow multi-stage save/load to avoid the need to double-buffer
 - Get rid of the masks and add an 'ignore' flag instead
 - Create copy function union to preserve const save buffer
 - Deprecate HVM-only framework
---
 xen/common/Makefile|   1 +
 xen/common/save.c  | 329 +
 xen/include/public/arch-arm/hvm/save.h |   5 +
 xen/include/public/arch-x86/hvm/save.h |   5 +
 xen/include/public/save.h  |  84 +++
 xen/include/xen/save.h | 152 
 6 files changed, 576 insertions(+)
 create mode 100644 xen/common/save.c
 create mode 100644 xen/include/public/save.h
 create mode 100644 xen/include/xen/save.h

diff --git a/xen/common/Makefile b/xen/common/Makefile
index e8cde65370..90553ba5d7 100644
--- a/xen/common/Makefile
+++ b/xen/common/Makefile
@@ -37,6 +37,7 @@ obj-y += radix-tree.o
 obj-y += rbtree.o
 obj-y += rcupdate.o
 obj-y += rwlock.o
+obj-y += save.o
 obj-y += shutdown.o
 obj-y += softirq.o
 obj-y += sort.o
diff --git a/xen/common/save.c b/xen/common/save.c
new file mode 100644
index 00..6cdac3785b
--- /dev/null
+++ b/xen/common/save.c
@@ -0,0 +1,329 @@
+/*
+ * save.c: Save and restore PV guest state common to all domain types.
+ *
+ * Copyright Amazon.com Inc. or its affiliates.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; If not, see .
+ */
+
+#include <xen/save.h>
+
+union domain_copy_entry {
+domain_write_entry write;
+domain_read_entry read;
+};
+
+struct domain_context {
+bool log;
+struct domain_save_descriptor desc;
+size_t data_len;
+union domain_copy_entry copy;
+void *priv;
+};
+
+static struct {
+const char *name;
+bool per_vcpu;
+domain_save_handler save;
+domain_load_handler load;
+} handlers[DOMAIN_SAVE_CODE_MAX + 1];
+
+void __init domain_register_save_type(unsigned int tc, const char *name,
+  bool per_vcpu,
+  domain_save_handler save,
+  domain_load_handler load)
+{
+BUG_ON(tc > ARRAY_SIZE(handlers));
+
+ASSERT(!handlers[tc].save);
+ASSERT(!handlers[tc].load);
+
+handlers[tc].name = name;
+handlers[tc].per_vcpu = per_vcpu;
+handlers[tc].save = save;
+handlers[tc].load = load;
+}
+
+int domain_save_begin(struct domain_context *c, unsigned int tc,
+  const char *name, const struct vcpu *v, size_t len)
+{
+int rc;
+
+if ( c->log )
+gdprintk(XENLOG_INFO, "%pv save: %s (%lu)\n", v, name,
+ (unsigned long)len);
+
+BUG_ON(tc != c->desc.typecode);
+BUG_ON(v->vcpu_id != c->desc.vcpu_id);
+
+ASSERT(!c->data_len);
+c->data_len = c->desc.length = len;
+
+rc = c->copy.write(c->priv, &c->desc, sizeof(c->desc));
+if ( rc )
+return rc;
+
+c->desc.length = 0;
+
+return 0;
+}
+
+int domain_save_data(struct domain_context *c, const void *src, size_t len)
+{
+if ( c->desc.length + len > c->data_len )
+return -ENOSPC;
+
+c->desc.length += len;
+
+return c->copy.write(c->priv, src, len);
+}
+
+int domain_save_end(struct domain_context *c)
+{
+/*
+ * If desc.length does not match the length specified in
+ * domain_save_begin(), there should have been more data.
+ */
+if ( c->desc.length != c->data_len )
+return -EIO;
+
+

[PATCH v2 3/5] tools/misc: add xen-domctx to present domain context

2020-04-07 Thread Paul Durrant
This tool is analogous to 'xen-hvmctx' which presents HVM context.
Subsequent patches will add 'dump' functions when new records are
introduced.

Signed-off-by: Paul Durrant 
---
Cc: Ian Jackson 
Cc: Wei Liu 

v2:
 - Change name from 'xen-ctx' to 'xen-domctx'
---
 .gitignore  |   1 +
 tools/misc/Makefile |   4 ++
 tools/misc/xen-domctx.c | 145 
 3 files changed, 150 insertions(+)
 create mode 100644 tools/misc/xen-domctx.c

diff --git a/.gitignore b/.gitignore
index 4ca679ddbc..744c0529bb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -208,6 +208,7 @@ tools/misc/xen_cpuperf
 tools/misc/xen-cpuid
 tools/misc/xen-detect
 tools/misc/xen-diag
+tools/misc/xen-domctx
 tools/misc/xen-tmem-list-parse
 tools/misc/xen-livepatch
 tools/misc/xenperf
diff --git a/tools/misc/Makefile b/tools/misc/Makefile
index 63947bfadc..ef25524354 100644
--- a/tools/misc/Makefile
+++ b/tools/misc/Makefile
@@ -30,6 +30,7 @@ INSTALL_SBIN   += xenpm
 INSTALL_SBIN   += xenwatchdogd
 INSTALL_SBIN   += xen-livepatch
 INSTALL_SBIN   += xen-diag
+INSTALL_SBIN   += xen-domctx
 INSTALL_SBIN += $(INSTALL_SBIN-y)
 
 # Everything to be installed in a private bin/
@@ -108,6 +109,9 @@ xen-livepatch: xen-livepatch.o
 xen-diag: xen-diag.o
$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(APPEND_LDFLAGS)
 
+xen-domctx: xen-domctx.o
+   $(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(APPEND_LDFLAGS)
+
 xen-lowmemd: xen-lowmemd.o
$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenevtchn) $(LDLIBS_libxenctrl) 
$(LDLIBS_libxenstore) $(APPEND_LDFLAGS)
 
diff --git a/tools/misc/xen-domctx.c b/tools/misc/xen-domctx.c
new file mode 100644
index 00..d663522a8b
--- /dev/null
+++ b/tools/misc/xen-domctx.c
@@ -0,0 +1,145 @@
+/*
+ * xen-domctx.c
+ *
+ * Print out domain save records in a human-readable way.
+ *
+ * Copyright Amazon.com Inc. or its affiliates.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+static void *buf = NULL;
+static size_t len, off;
+
+#define READ(_x) do {   \
+if ( len - off < sizeof (_x) )  \
+{   \
+fprintf(stderr, \
+"Error: need another %lu bytes, only %lu available\n",  \
+sizeof(_x), len - off); \
+exit(1);\
+}   \
+memcpy(&(_x), buf + off, sizeof (_x));  \
+} while (0)
+
+static void dump_header(struct domain_save_descriptor *desc)
+{
+DOMAIN_SAVE_TYPE(HEADER) h;
+READ(h);
+printf("HEADER: magic %#x, version %u\n",
+   h.magic, h.version);
+
+off += desc->length;
+}
+
+static void dump_end(struct domain_save_descriptor *desc)
+{
+DOMAIN_SAVE_TYPE(END) e;
+READ(e);
+printf("END\n");
+}
+
+int main(int argc, char **argv)
+{
+uint32_t domid;
+unsigned int entry;
+xc_interface *xch;
+int rc;
+
+if ( argc != 2 || !argv[1] || (rc = atoi(argv[1])) < 0 )
+{
+fprintf(stderr, "usage: %s \n", argv[0]);
+exit(1);
+}
+domid = rc;
+
+xch = xc_interface_open(0,0,0);
+if ( !xch )
+{
+fprintf(stderr, "Error: can't open libxc handle\n");
+exit(1);
+}
+
+rc = xc_domain_getcontext(xch, domid, NULL, &len);
+if ( rc < 0 )
+{
+fprintf(stderr, "Error: can't get record length for dom %u: %s\n",
+domid, strerror(errno));
+exit(1);
+}
+
+buf = malloc(len);
+if ( !buf )
+   

Re: [PATCH] MAINTAINERS: Remove all S: entries

2020-04-07 Thread Julien Grall

Hi,

On 07/04/2020 17:15, Ian Jackson wrote:

Feature support status is tracked in SUPPORT.md nowadays.


I don't think we track everything the same way. At least...



Signed-off-by: Ian Jackson 
---
  MAINTAINERS | 60 
  1 file changed, 60 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 8a4c869704..e784169d1b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -69,16 +69,6 @@ Descriptions of section entries:
L: Mailing list that is relevant to this area
W: Web-page with status/info
T: SCM tree type and location.  Type is one of: git, hg, quilt, stgit.
-   S: Status, one of the following:
-  Supported:   Someone is actually paid to look after this.
-  Maintained:  Someone actually looks after it.


... we don't have a way to track code that are maintained but no-one is 
paid.


IHMO, this is an important to track as a contributor should not expect 
the same level of support between someone that is paid for it and 
someone that is not.


Cheers,

--
Julien Grall



[PATCH v2 0/5] domain context infrastructure

2020-04-07 Thread Paul Durrant
From: Paul Durrant 

Paul Durrant (5):
  xen/common: introduce a new framework for save/restore of 'domain'
context
  xen/common/domctl: introduce XEN_DOMCTL_get/setdomaincontext
  tools/misc: add xen-domctx to present domain context
  common/domain: add a domain context record for shared_info...
  tools/libxc: make use of domain context SHARED_INFO record...

 .gitignore |   1 +
 tools/flask/policy/modules/xen.if  |   4 +-
 tools/libxc/include/xenctrl.h  |   5 +
 tools/libxc/xc_domain.c|  54 
 tools/libxc/xc_sr_common.h |   7 +-
 tools/libxc/xc_sr_common_x86.c |  59 +
 tools/libxc/xc_sr_common_x86.h |   4 +
 tools/libxc/xc_sr_common_x86_pv.c  |  53 
 tools/libxc/xc_sr_common_x86_pv.h  |   3 +
 tools/libxc/xc_sr_restore_x86_pv.c |  40 ++-
 tools/libxc/xc_sr_save_x86_pv.c|  26 +-
 tools/libxc/xg_save_restore.h  |   1 +
 tools/misc/Makefile|   4 +
 tools/misc/xen-domctx.c| 156 
 xen/common/Makefile|   1 +
 xen/common/domain.c|  81 ++
 xen/common/domctl.c| 117 +
 xen/common/save.c  | 329 +
 xen/include/public/arch-arm/hvm/save.h |   5 +
 xen/include/public/arch-x86/hvm/save.h |   5 +
 xen/include/public/domctl.h|  44 +++-
 xen/include/public/save.h  |  92 +++
 xen/include/xen/save.h | 152 
 xen/xsm/flask/hooks.c  |   6 +
 xen/xsm/flask/policy/access_vectors|   4 +
 25 files changed, 1201 insertions(+), 52 deletions(-)
 create mode 100644 tools/misc/xen-domctx.c
 create mode 100644 xen/common/save.c
 create mode 100644 xen/include/public/save.h
 create mode 100644 xen/include/xen/save.h
---
Cc: Andrew Cooper 
Cc: Daniel De Graaf 
Cc: George Dunlap 
Cc: Ian Jackson 
Cc: Jan Beulich 
Cc: Julien Grall 
Cc: "Roger Pau Monné" 
Cc: Stefano Stabellini 
Cc: Volodymyr Babchuk 
Cc: Wei Liu 
-- 
2.20.1




[xen-unstable test] 149478: tolerable FAIL

2020-04-07 Thread osstest service owner
flight 149478 xen-unstable real [real]
http://logs.test-lab.xenproject.org/osstest/logs/149478/

Failures :-/ but no regressions.

Tests which are failing intermittently (not blocking):
 test-amd64-amd64-dom0pvh-xl-intel 17 guest-saverestore.2   fail pass in 149451
 test-amd64-amd64-examine  4 memdisk-try-append fail pass in 149451
 test-amd64-amd64-dom0pvh-xl-amd 20 guest-start/debian.repeat fail pass in 
149451

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-xl-rtds 16 guest-localmigrate   fail  like 149451
 test-armhf-armhf-libvirt 14 saverestore-support-checkfail  like 149451
 test-amd64-amd64-xl-qemuu-ws16-amd64 17 guest-stopfail like 149451
 test-amd64-i386-xl-qemut-win7-amd64 17 guest-stop fail like 149451
 test-amd64-amd64-xl-qemut-win7-amd64 17 guest-stopfail like 149451
 test-amd64-i386-xl-qemuu-win7-amd64 17 guest-stop fail like 149451
 test-armhf-armhf-libvirt-raw 13 saverestore-support-checkfail  like 149451
 test-amd64-amd64-xl-qemut-ws16-amd64 17 guest-stopfail like 149451
 test-amd64-i386-xl-qemuu-ws16-amd64 17 guest-stop fail like 149451
 test-amd64-amd64-xl-qemuu-win7-amd64 17 guest-stopfail like 149451
 test-amd64-i386-libvirt-xsm  13 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt-xsm 13 migrate-support-checkfail   never pass
 test-amd64-i386-xl-pvshim12 guest-start  fail   never pass
 test-amd64-amd64-libvirt 13 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  14 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 11 migrate-support-check 
fail never pass
 test-amd64-i386-libvirt-qemuu-debianhvm-amd64-xsm 11 migrate-support-check 
fail never pass
 test-amd64-amd64-qemuu-nested-amd 17 debian-hvm-install/l1/l2  fail never pass
 test-arm64-arm64-xl-credit1  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 13 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  14 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt-vhd 12 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-cubietruck 13 migrate-support-checkfail never pass
 test-armhf-armhf-xl-cubietruck 14 saverestore-support-checkfail never pass
 test-armhf-armhf-libvirt 13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 14 saverestore-support-checkfail   never pass
 test-amd64-i386-xl-qemut-ws16-amd64 17 guest-stop  fail never pass
 test-armhf-armhf-xl-vhd  12 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-vhd  13 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-multivcpu 13 migrate-support-checkfail  never pass
 test-armhf-armhf-xl-multivcpu 14 saverestore-support-checkfail  never pass
 test-armhf-armhf-xl-credit1  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-libvirt-raw 12 migrate-support-checkfail   never pass

version targeted for testing:
 xen  990b6e38d93c6e60f9d81e8b71ddfd209fca00bd
baseline version:
 xen  990b6e38d93c6e60f9d81e8b71ddfd209fca00bd

Last test of basis   149478  2020-04-07 01:51:22 Z0 days
Testing same since  (not found) 0 attempts

jobs:
 

Re: [PATCH v2] xen/arm: implement GICD_I[S/C]ACTIVER reads

2020-04-07 Thread Julien Grall




On 07/04/2020 17:50, Julien Grall wrote:



On 07/04/2020 17:16, George Dunlap wrote:




On Apr 6, 2020, at 7:47 PM, Julien Grall  wrote:

On 06/04/2020 18:58, George Dunlap wrote:
On Apr 3, 2020, at 9:27 PM, Julien Grall 
 wrote:


On Fri, 3 Apr 2020 at 20:41, Stefano Stabellini 
 wrote:


On Thu, 2 Apr 2020, Julien Grall wrote:
As we discussed on Tuesday, the cost for other vCPUs is only 
going to be a
trap to the hypervisor and then back again. The cost is likely 
smaller than

receiving and forwarding an interrupt.

You actually agreed on this analysis. So can you enlighten me as 
to why

receiving an interrupt is a not problem for latency but this is?


My answer was that the difference is that an operating system can
disable interrupts, but it cannot disable receiving this special IPI.


An OS can *only* disable its own interrupts, yet interrupts will still
be received by Xen even if the interrupts are masked at the processor
(e.g local_irq_disable()).

You would need to disable interrupts one by one as the GIC level (use
ICENABLER) in order to not receive any interrupts. Yet, Xen may still
receive interrupts for operational purposes (e.g serial, console,
maintainance IRQ...). So trap will happen.
I think Stefano’s assertion is that the users he has in mind will be 
configuring the system such that RT workloads get a minimum number 
of hypervisor-related interrupts possible.  On a 4-core system, you 
could  have non-RT workloads running on cores 0-1, and RT workloads 
running with the NULL scheduler on cores 2-3.  In such a system, 
you’d obviously arrange that serial and maintenance IRQs are 
delivered to cores 0-1.

Well maintenance IRQs are per pCPU so you can't route to another one...

But, I think you missed my point that local_irq_disable() from the 
guest will not prevent the hypervisor to receive interrupts *even* 
the one routed to the vCPU itself. They will just not be delivered to 
the guest context until local_irq_enable() is called.


My understanding, from Stefano was that what his customers are 
concerned about is the time between the time a physical IRQ is 
delivered to the guest and the time the guest OS can respond 
appropriately.  The key thing here isn’t necessarily speed, but 
predictability — system designers need to know that, with a high 
probability, their interrupt routines will complete within X amount of 
cycles.


Further interrupts delivered to a guest are not a problem in this 
scenario, if the guest can disable them until the critical IRQ has 
been handled.


You keep saying a guest can disable interrupts, but it can't do it via 
local_irq_disable(). So what method are you thinking? Disabling at the 
GIC level? That is involving traps and most likely not going to help 
with predictability...


Just to clear I meant interrupts to be received by Xen including the one 
routed to that vCPU.


Cheers,

--
Julien Grall



Re: [PATCH v2] xen/arm: implement GICD_I[S/C]ACTIVER reads

2020-04-07 Thread Julien Grall




On 07/04/2020 17:25, Bertrand Marquis wrote:
This is possible to do on a per interrupt basis or when all interrupts 
in LR registers have all been handled (maintenance interrupt when there 
is nothing left to handle in LR registers, used to fire other interrupts 
if we have more pending interrupts then number of LR registers).


Maybe a solution making sure we get a maintenance interrupts once all 
interrupts in LR registers have been handled could be a good mitigation ?


The chance of having multiple interrupts inflight is fairly limited. So 
effectively, you will trap for every interrupts.


Cheers,

--
Julien Grall



Re: [PATCH v2] xen/arm: implement GICD_I[S/C]ACTIVER reads

2020-04-07 Thread Julien Grall




On 07/04/2020 17:16, George Dunlap wrote:




On Apr 6, 2020, at 7:47 PM, Julien Grall  wrote:

On 06/04/2020 18:58, George Dunlap wrote:

On Apr 3, 2020, at 9:27 PM, Julien Grall  wrote:

On Fri, 3 Apr 2020 at 20:41, Stefano Stabellini  wrote:


On Thu, 2 Apr 2020, Julien Grall wrote:

As we discussed on Tuesday, the cost for other vCPUs is only going to be a
trap to the hypervisor and then back again. The cost is likely smaller than
receiving and forwarding an interrupt.

You actually agreed on this analysis. So can you enlighten me as to why
receiving an interrupt is a not problem for latency but this is?


My answer was that the difference is that an operating system can
disable interrupts, but it cannot disable receiving this special IPI.


An OS can *only* disable its own interrupts, yet interrupts will still
be received by Xen even if the interrupts are masked at the processor
(e.g local_irq_disable()).

You would need to disable interrupts one by one as the GIC level (use
ICENABLER) in order to not receive any interrupts. Yet, Xen may still
receive interrupts for operational purposes (e.g serial, console,
maintainance IRQ...). So trap will happen.

I think Stefano’s assertion is that the users he has in mind will be 
configuring the system such that RT workloads get a minimum number of 
hypervisor-related interrupts possible.  On a 4-core system, you could  have 
non-RT workloads running on cores 0-1, and RT workloads running with the NULL 
scheduler on cores 2-3.  In such a system, you’d obviously arrange that serial 
and maintenance IRQs are delivered to cores 0-1.

Well maintenance IRQs are per pCPU so you can't route to another one...

But, I think you missed my point that local_irq_disable() from the guest will 
not prevent the hypervisor to receive interrupts *even* the one routed to the 
vCPU itself. They will just not be delivered to the guest context until 
local_irq_enable() is called.


My understanding, from Stefano was that what his customers are concerned about 
is the time between the time a physical IRQ is delivered to the guest and the 
time the guest OS can respond appropriately.  The key thing here isn’t 
necessarily speed, but predictability — system designers need to know that, 
with a high probability, their interrupt routines will complete within X amount 
of cycles.

Further interrupts delivered to a guest are not a problem in this scenario, if 
the guest can disable them until the critical IRQ has been handled.


You keep saying a guest can disable interrupts, but it can't do it via 
local_irq_disable(). So what method are you thinking? Disabling at the 
GIC level? That is involving traps and most likely not going to help 
with predictability...




Xen-related IPIs, however, could potentially cause a problem if not mitigated.  
Consider a guest where vcpu 1 loops over the register, while vcpu 2 is handling 
a latency-critical IRQ.  A naive implementation might send an IPI every time 
vcpu 1 does a read, spamming vcpu 2 with dozens of IPIs.  Then an IRQ routine 
which reliably finishes well within the required time normally suddenly 
overruns and causes an issue.


I never suggested the naive implementation would be perfect. That's why 
I said it can be optimized...




I don’t know what maintenance IRQs are, but if they only happen intermittently, 
it’s possible that you’d never get more than a single one in a latency-critical 
IRQ routine; and as such, the variatibility in execution time (jitter) wouldn’t 
be an issue in practice.  But every time you add a new unblockable IPI, you 
increase this jitter; particularly if this unblockable IPI might be repeated an 
arbitrary number of times.

(Stefano, let me know if I’ve misunderstood something.)

So stepping back a moment, here’s all the possible ideas that I think have been 
discussed (or are there implicitly) so far.

1. [Default] Do nothing; guests using this register continue crashing

2. Make the I?ACTIVER registers RZWI.

3. Make I?ACTIVER return the most recent known value; i.e. KVM’s current 
behavior (as far as we understand it)

4. Use a simple IPI with do_noop to update I?ACTIVER

4a.  Use an IPI, but come up with clever tricks to avoid interrupting guests 
handling IRQs.

5. Trap to Xen on guest EOI, so that we know when the

6. Some clever paravirtualized option


7. Use an IPI if we are confident the interrupts may be active.



Obviously nobody wants #1, and #3 is clearly not really an option now either.

#2 is not great, but it’s simple and quick to implement for now.  Julien, I’m 
not sure your position on this one: You rejected the idea back in v1 of this 
patch series, but seemed to refer to it again earlier in this thread.

#4 is relatively quick to implement a “dumb” version, but such a “dumb” version 
has a high risk of causing unacceptable jitter (or at least, so Stefano 
believes).

#4a or #6 are further potential lines to explore, but would require a lot of 
additional design to get working 
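
To make option 4 and the jitter concern concrete, here is a purely
illustrative sketch in plain C with stubbed helpers; none of these names are
real Xen symbols. It shows why the naive form interrupts every running vCPU
on every guest read of the register:

#include <stdbool.h>
#include <stdint.h>

#define NR_VCPUS 4

/* Hypothetical per-vCPU state: whether it is running and the last-synced
 * active bits for one 32-interrupt ISACTIVER bank. */
struct vcpu_state {
    bool running;
    uint32_t active_bits;
};

/* Stub standing in for "send an IPI and wait for the vCPU to exit so its
 * list registers are written back to memory". */
static void kick_and_sync(struct vcpu_state *v)
{
    (void)v;
}

/* Naive option-4 emulation of a GICD_ISACTIVER<n> read: every running vCPU
 * is interrupted on every read, so a guest polling this register spams the
 * other vCPUs with IPIs. */
static uint32_t read_isactiver(struct vcpu_state vcpus[NR_VCPUS])
{
    uint32_t val = 0;

    for ( unsigned int i = 0; i < NR_VCPUS; i++ )
    {
        if ( vcpus[i].running )
            kick_and_sync(&vcpus[i]);
        val |= vcpus[i].active_bits;
    }

    return val;
}

An optimised variant would only kick vCPUs that might plausibly have one of
the interrupts in question active, or would rate-limit the kicks - which is
essentially what #4a and #7 above are about.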

Re: [PATCH v2] xen/arm: implement GICD_I[S/C]ACTIVER reads

2020-04-07 Thread Bertrand Marquis


On 7 Apr 2020, at 17:16, George Dunlap 
mailto:george.dun...@citrix.com>> wrote:



On Apr 6, 2020, at 7:47 PM, Julien Grall 
mailto:jul...@xen.org>> wrote:

On 06/04/2020 18:58, George Dunlap wrote:
On Apr 3, 2020, at 9:27 PM, Julien Grall 
mailto:julien.grall@gmail.com>> wrote:

On Fri, 3 Apr 2020 at 20:41, Stefano Stabellini 
mailto:sstabell...@kernel.org>> wrote:

On Thu, 2 Apr 2020, Julien Grall wrote:
As we discussed on Tuesday, the cost for other vCPUs is only going to be a
trap to the hypervisor and then back again. The cost is likely smaller than
receiving and forwarding an interrupt.

You actually agreed on this analysis. So can you enlighten me as to why
receiving an interrupt is a not problem for latency but this is?

My answer was that the difference is that an operating system can
disable interrupts, but it cannot disable receiving this special IPI.

An OS can *only* disable its own interrupts, yet interrupts will still
be received by Xen even if the interrupts are masked at the processor
(e.g local_irq_disable()).

You would need to disable interrupts one by one as the GIC level (use
ICENABLER) in order to not receive any interrupts. Yet, Xen may still
receive interrupts for operational purposes (e.g serial, console,
maintainance IRQ...). So trap will happen.
I think Stefano’s assertion is that the users he has in mind will be 
configuring the system such that RT workloads get a minimum number of 
hypervisor-related interrupts possible.  On a 4-core system, you could  have 
non-RT workloads running on cores 0-1, and RT workloads running with the NULL 
scheduler on cores 2-3.  In such a system, you’d obviously arrange that serial 
and maintenance IRQs are delivered to cores 0-1.
Well, maintenance IRQs are per-pCPU, so you can't route them to another one...

But, I think you missed my point that local_irq_disable() from the guest will 
not prevent the hypervisor from receiving interrupts, *even* the ones routed to 
the vCPU itself. They will just not be delivered to the guest context until 
local_irq_enable() is called.

My understanding, from Stefano, was that what his customers are concerned about 
is the time between the time a physical IRQ is delivered to the guest and the 
time the guest OS can respond appropriately.  The key thing here isn’t 
necessarily speed, but predictability — system designers need to know that, 
with a high probability, their interrupt routines will complete within X amount 
of cycles.

Further interrupts delivered to a guest are not a problem in this scenario, if 
the guest can disable them until the critical IRQ has been handled.

Xen-related IPIs, however, could potentially cause a problem if not mitigated.  
Consider a guest where vcpu 1 loops over the register, while vcpu 2 is handling 
a latency-critical IRQ.  A naive implementation might send an IPI every time 
vcpu 1 does a read, spamming vcpu 2 with dozens of IPIs.  Then an IRQ routine 
which normally finishes reliably well within the required time suddenly 
overruns and causes an issue.

I don’t know what maintenance IRQs are, but if they only happen intermittently, 
it’s possible that you’d never get more than a single one in a latency-critical 
IRQ routine; and as such, the variability in execution time (jitter) wouldn’t 
be an issue in practice.  But every time you add a new unblockable IPI, you 
increase this jitter; particularly if this unblockable IPI might be repeated an 
arbitrary number of times.

(Stefano, let me know if I’ve misunderstood something.)

So stepping back a moment, here are all the possible ideas that I think have been 
discussed (or are implicitly there) so far.

1. [Default] Do nothing; guests using this register continue crashing

2. Make the I?ACTIVER registers RZWI.

3. Make I?ACTIVER return the most recent known value; i.e. KVM’s current 
behavior (as far as we understand it)

4. Use a simple IPI with do_noop to update I?ACTIVER

4a.  Use an IPI, but come up with clever tricks to avoid interrupting guests 
handling IRQs.

5. Trap to Xen on guest EOI, so that we know when the interrupt is no longer active.

This is possible to do on a per-interrupt basis or when all interrupts in the 
LR registers have been handled (a maintenance interrupt fires when there is 
nothing left to handle in the LR registers, and is used to inject further 
interrupts if we have more pending interrupts than LR registers).

Maybe a solution making sure we get a maintenance interrupt once all 
interrupts in the LR registers have been handled could be a good mitigation?

This would avoid having to sync the other cores but would make sure the time 
until we clean up active interrupts is bounded, so the poller could be sure to 
get the information at some acceptable point in time.


6. Some clever paravirtualized option

Obviously nobody wants #1, and #3 is clearly not really an option now either.

#2 is not great, but it’s simple and quick to implement for now.  Julien, I’m 
not sure your position on this one: You rejected the idea back in v1 of this 
patch series, but seemed to refer to it again earlier in this thread.

Re: [PATCH 5/5] x86_64/mm: map and unmap page tables in destroy_m2p_mapping

2020-04-07 Thread Hongyan Xia
On Wed, 2020-04-01 at 14:40 +0200, Jan Beulich wrote:
> On 23.03.2020 10:41, Hongyan Xia wrote:
> > @@ -297,26 +298,33 @@ static void destroy_m2p_mapping(struct
> > mem_hotadd_info *info)
> >  continue;
> >  }
> >  
> > -l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
> > +l2_ro_mpt =
> > map_l2t_from_l3e(l3_ro_mpt[l3_table_offset(va)]);
> >  if (!(l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) &
> > _PAGE_PRESENT))
> >  {
> >  i = ( i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
> >  (1UL << (L2_PAGETABLE_SHIFT - 3)) ;
> > +UNMAP_DOMAIN_PAGE(l2_ro_mpt);
> >  continue;
> >  }
> >  
> >  pt_pfn = l2e_get_pfn(l2_ro_mpt[l2_table_offset(va)]);
> >  if ( hotadd_mem_valid(pt_pfn, info) )
> >  {
> > +l2_pgentry_t *l2t;
> > +
> >  destroy_xen_mappings(rwva, rwva + (1UL <<
> > L2_PAGETABLE_SHIFT));
> >  
> > -l2_ro_mpt =
> > l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
> > -l2e_write(_ro_mpt[l2_table_offset(va)],
> > l2e_empty());
> > +l2t =
> > map_l2t_from_l3e(l3_ro_mpt[l3_table_offset(va)]);
> 
> Why a 2nd mapping of the same L3 entry that you've already mapped
> into l2_ro_mpt?
> > +l2e_write([l2_table_offset(va)], l2e_empty());
> > +UNMAP_DOMAIN_PAGE(l2t);
> 
> If this then weren't to go away, it should again be the lowercase
> variant imo, as the variable's scope ends here.

Hmm, I don't see a reason why l2_ro_mpt needs to be mapped again either
(and don't see why it was re-derived in the original code), so yes, I
think the map and unmap can just be dropped. Will revise.

Hongyan




Re: [PATCH v2] xen/arm: implement GICD_I[S/C]ACTIVER reads

2020-04-07 Thread George Dunlap


> On Apr 6, 2020, at 7:47 PM, Julien Grall  wrote:
> 
> On 06/04/2020 18:58, George Dunlap wrote:
>>> On Apr 3, 2020, at 9:27 PM, Julien Grall  wrote:
>>> 
>>> On Fri, 3 Apr 2020 at 20:41, Stefano Stabellini  
>>> wrote:
 
 On Thu, 2 Apr 2020, Julien Grall wrote:
> As we discussed on Tuesday, the cost for other vCPUs is only going to be a
> trap to the hypervisor and then back again. The cost is likely smaller 
> than
> receiving and forwarding an interrupt.
> 
> You actually agreed on this analysis. So can you enlighten me as to why
> receiving an interrupt is not a problem for latency but this is?
 
 My answer was that the difference is that an operating system can
 disable interrupts, but it cannot disable receiving this special IPI.
>>> 
>>> An OS can *only* disable its own interrupts, yet interrupts will still
>>> be received by Xen even if the interrupts are masked at the processor
>>> (e.g. local_irq_disable()).
>>> 
>>> You would need to disable interrupts one by one at the GIC level (using
>>> ICENABLER) in order to not receive any interrupts. Yet, Xen may still
>>> receive interrupts for operational purposes (e.g. serial, console,
>>> maintenance IRQ...). So traps will happen.
>> I think Stefano’s assertion is that the users he has in mind will be 
>> configuring the system such that RT workloads get a minimum number of 
>> hypervisor-related interrupts possible.  On a 4-core system, you could have 
>> non-RT workloads running on cores 0-1, and RT workloads running with the 
>> NULL scheduler on cores 2-3.  In such a system, you’d obviously arrange that 
>> serial and maintenance IRQs are delivered to cores 0-1.
> Well, maintenance IRQs are per-pCPU, so you can't route them to another one...
> 
> But, I think you missed my point that local_irq_disable() from the guest will 
> not prevent the hypervisor from receiving interrupts, *even* the ones routed to 
> the vCPU itself. They will just not be delivered to the guest context until 
> local_irq_enable() is called.

My understanding, from Stefano, was that what his customers are concerned about 
is the time between the time a physical IRQ is delivered to the guest and the 
time the guest OS can respond appropriately.  The key thing here isn’t 
necessarily speed, but predictability — system designers need to know that, 
with a high probability, their interrupt routines will complete within X amount 
of cycles.

Further interrupts delivered to a guest are not a problem in this scenario, if 
the guest can disable them until the critical IRQ has been handled.

Xen-related IPIs, however, could potentially cause a problem if not mitigated.  
Consider a guest where vcpu 1 loops over the register, while vcpu 2 is handling 
a latency-critical IRQ.  A naive implementation might send an IPI every time 
vcpu 1 does a read, spamming vcpu 2 with dozens of IPIs.  Then an IRQ routine 
which normally finishes reliably well within the required time suddenly 
overruns and causes an issue.

I don’t know what maintenance IRQs are, but if they only happen intermittently, 
it’s possible that you’d never get more than a single one in a latency-critical 
IRQ routine; and as such, the variability in execution time (jitter) wouldn’t 
be an issue in practice.  But every time you add a new unblockable IPI, you 
increase this jitter; particularly if this unblockable IPI might be repeated an 
arbitrary number of times.

(Stefano, let me know if I’ve misunderstood something.)

So stepping back a moment, here are all the possible ideas that I think have been 
discussed (or are implicitly there) so far.

1. [Default] Do nothing; guests using this register continue crashing

2. Make the I?ACTIVER registers RZWI.

3. Make I?ACTIVER return the most recent known value; i.e. KVM’s current 
behavior (as far as we understand it)

4. Use a simple IPI with do_noop to update I?ACTIVER

4a.  Use an IPI, but come up with clever tricks to avoid interrupting guests 
handling IRQs.

5. Trap to Xen on guest EOI, so that we know when the interrupt is no longer active.

6. Some clever paravirtualized option

Obviously nobody wants #1, and #3 is clearly not really an option now either.

#2 is not great, but it’s simple and quick to implement for now.  Julien, I’m 
not sure your position on this one: You rejected the idea back in v1 of this 
patch series, but seemed to refer to it again earlier in this thread.

#4 is relatively quick to implement a “dumb” version, but such a “dumb” version 
has a high risk of causing unacceptable jitter (or at least, so Stefano 
believes).

#4a or #6 are further potential lines to explore, but would require a lot of 
additional design to get working right.

I think if I understand Stefano’s PoV, then #5 would actually be acceptable — 
the overall amount of time spent in the hypervisor would probably be greater, 
but it would be bounded and predictable: once someone got their IRQ handler 
working reliably, it would likely continue to work.

It sounds like #5 might 

[PATCH] MAINTAINERS: Remove all S: entries

2020-04-07 Thread Ian Jackson
Feature support status is tracked in SUPPORT.md nowadays.

Signed-off-by: Ian Jackson 
---
 MAINTAINERS | 60 
 1 file changed, 60 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 8a4c869704..e784169d1b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -69,16 +69,6 @@ Descriptions of section entries:
L: Mailing list that is relevant to this area
W: Web-page with status/info
T: SCM tree type and location.  Type is one of: git, hg, quilt, stgit.
-   S: Status, one of the following:
-  Supported:   Someone is actually paid to look after this.
-  Maintained:  Someone actually looks after it.
-  Odd Fixes:   It has a maintainer but they don't have time to do
-   much other than throw the odd patch in. See below..
-  Orphan:  No current maintainer [but maybe you could take the
-   role as you write your new code].
-  Obsolete:Old code. Something tagged obsolete generally means
-   it has been replaced by a better system and you
-   should be using that.
F: Files and directories with wildcard patterns.
   A trailing slash includes all files and subdirectory files.
   F:   drivers/net/all files in and below drivers/net
@@ -194,7 +184,6 @@ Maintainers List (try to look for most precise areas first)
 
 ACPI
 M: Jan Beulich 
-S: Supported
 F: xen/arch/x86/acpi/
 F: xen/drivers/acpi/
 F: xen/include/acpi/
@@ -203,19 +192,16 @@ F:tools/libacpi/
 AMD IOMMU
 M: Jan Beulich 
 M: Andrew Cooper 
-S: Maintained
 F: xen/drivers/passthrough/amd/
 
 AMD SVM
 M: Jan Beulich 
 M: Andrew Cooper 
-S: Supported
 F: xen/arch/x86/hvm/svm/
 F: xen/arch/x86/cpu/vpmu_amd.c
 
 ARGO
 M: Christopher Clark 
-S: Maintained
 F: xen/include/public/argo.h
 F: xen/include/xen/argo.h
 F: xen/common/argo.c
@@ -223,7 +209,6 @@ F:  xen/common/argo.c
 ARINC653 SCHEDULER
 M: Josh Whitehead 
 M: Stewart Hildebrand 
-S: Supported
 L: xen-de...@dornerworks.com
 F: xen/common/sched/arinc653.c
 F: tools/libxc/xc_arinc653.c
@@ -232,7 +217,6 @@ ARM (W/ VIRTUALISATION EXTENSIONS) ARCHITECTURE
 M: Stefano Stabellini 
 M: Julien Grall 
 R: Volodymyr Babchuk 
-S: Supported
 L: xen-devel@lists.xenproject.org
 F: docs/misc/arm/
 F: xen/arch/arm/
@@ -252,14 +236,12 @@ F:xen/include/public/arch-arm.h
 Change Log
 M: Paul Durrant 
 R: Community Manager 
-S: Maintained
 F: CHANGELOG.md
 
 Continuous Integration (CI)
 M: Doug Goldstein 
 W: https://gitlab.com/xen-project/xen
 W: https://travis-ci.org/xen-project/xen
-S: Supported
 F: .gitlab-ci.yml
 F: .travis.yml
 F: automation/
@@ -268,13 +250,11 @@ F:scripts/travis-build
 CPU POOLS
 M: Juergen Gross 
 M: Dario Faggioli 
-S: Supported
 F: xen/common/sched/cpupool.c
 
 DEVICE TREE
 M: Stefano Stabellini 
 M: Julien Grall 
-S: Supported
 F: xen/common/libfdt/
 F: xen/common/device_tree.c
 F: xen/include/xen/libfdt/
@@ -283,7 +263,6 @@ F:  xen/drivers/passthrough/device_tree.c
 
 EFI
 M: Jan Beulich 
-S: Supported
 F: xen/arch/x86/efi/
 F: xen/common/efi/
 F: xen/include/efi/
@@ -292,30 +271,25 @@ F:xen/include/asm-x86/x86_*/efi*.h
 
 GDBSX DEBUGGER
 M: Elena Ufimtseva 
-S: Supported
 F: xen/arch/x86/debug.c
 F: tools/debugger/gdbsx/
 
 GOLANG BINDINGS
 M: George Dunlap 
-S: Maintained
 F: tools/golang
 
 INTEL(R) TRUSTED EXECUTION TECHNOLOGY (TXT)
 R: Lukasz Hawrylko 
-S: Odd Fixes
 F: xen/arch/x86/tboot.c
 F: xen/include/asm-x86/tboot.h
 
 INTEL(R) VT FOR DIRECTED I/O (VT-D)
 M: Kevin Tian 
-S: Supported
 F: xen/drivers/passthrough/vtd/
 
 INTEL(R) VT FOR X86 (VT-X)
 M: Jun Nakajima 
 M: Kevin Tian 
-S: Supported
 F: xen/arch/x86/hvm/vmx/
 F: xen/arch/x86/mm/p2m-ept.c
 F: xen/include/asm-x86/hvm/vmx/
@@ -324,7 +298,6 @@ F:  xen/arch/x86/cpu/vpmu_intel.c
 IOMMU VENDOR INDEPENDENT CODE
 M: Jan Beulich 
 M: Paul Durrant 
-S: Supported
 F: xen/drivers/passthrough/
 X: xen/drivers/passthrough/amd/
 X: xen/drivers/passthrough/arm/
@@ -334,18 +307,15 @@ F:xen/include/xen/iommu.h
 
 KCONFIG
 M: Doug Goldstein 
-S: Supported
 F: docs/misc/kconfig{,-language}.txt
 F: xen/tools/kconfig/
 
 KDD DEBUGGER
 M: Tim Deegan 
-S: Odd Fixes
 F: tools/debugger/kdd/
 
 KEXEC
 M: Andrew Cooper 
-S: Supported
 F: xen/common/{kexec,kimage}.c
 F: xen/include/{kexec,kimage}.h
 F: xen/arch/x86/machine_kexec.c
@@ -355,14 +325,12 @@ LIBXENLIGHT
 M: Ian Jackson 
 M: Wei Liu 
 M: Anthony PERARD 
-S: Supported
 F: tools/libxl/
 F: tools/xl/
 
 LIVEPATCH
 M: 

Re: [PATCH] tools/xl: Remove the filelock when building VM if autoballooning is off

2020-04-07 Thread Ian Jackson
Dmitry Isaykin writes ("[PATCH] tools/xl: Remove the filelock when building VM 
if autoballooning is off"):
> The presence of this filelock does not allow building several VMs at the same
> time. This filelock was added to prevent other xl instances from using memory
> freed for the currently building VM in autoballoon mode.
> 
> Signed-off-by: Dmitry Isaykin 

Reviewed-by: Ian Jackson 

This was deferred due to the Xen 4.13 freeze.  I found it in a todo
list of mine.  I think it should be committed and I will do so soon
unless someone objects.

Sorry for the delay, Dmitry!

Regards,
Ian.

>  tools/xl/xl_vmcontrol.c | 14 +-
>  1 file changed, 9 insertions(+), 5 deletions(-)
> 
> diff --git a/tools/xl/xl_vmcontrol.c b/tools/xl/xl_vmcontrol.c
> index a1d633795c..2b42bb487d 100644
> --- a/tools/xl/xl_vmcontrol.c
> +++ b/tools/xl/xl_vmcontrol.c
> @@ -873,9 +873,11 @@ int create_domain(struct domain_create *dom_info)
>  start:
>  assert(domid == INVALID_DOMID);
>  
> -rc = acquire_lock();
> -if (rc < 0)
> -goto error_out;
> +if (autoballoon) {
> +rc = acquire_lock();
> +if (rc < 0)
> +goto error_out;
> +}
>  
>  if (domid_soft_reset == INVALID_DOMID) {
>  if (!freemem(domid, _config.b_info)) {
> @@ -935,7 +937,8 @@ start:
>  if ( ret )
>  goto error_out;
>  
> -release_lock();
> +if (autoballoon)
> +release_lock();
>  
>  if (restore_fd_to_close >= 0) {
>  if (close(restore_fd_to_close))
> @@ -1109,7 +1112,8 @@ start:
>  }
>  
>  error_out:
> -release_lock();
> +if (autoballoon)
> +release_lock();
>  if (libxl_domid_valid_guest(domid)) {
>  libxl_domain_destroy(ctx, domid, 0);
>  domid = INVALID_DOMID;
> -- 
> 2.17.1
> 



[PULL 3/3] MAINTAINERS: Add xen-usb.c to Xen section

2020-04-07 Thread Anthony PERARD
Signed-off-by: Anthony PERARD 
Acked-by: Paul Durrant 
Message-Id: <20200406165043.1447837-1-anthony.per...@citrix.com>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 9d156d73b31e..839959f7e4ac 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -440,6 +440,7 @@ F: hw/9pfs/xen-9p*
 F: hw/char/xen_console.c
 F: hw/display/xenfb.c
 F: hw/net/xen_nic.c
+F: hw/usb/xen-usb.c
 F: hw/block/xen*
 F: hw/block/dataplane/xen*
 F: hw/xen/
-- 
Anthony PERARD




[PULL 2/3] xen-block: Fix uninitialized variable

2020-04-07 Thread Anthony PERARD
Since 7f5d9b206d1e ("object-add: don't create return value if
failed"), qmp_object_add() doesn't write any value to 'ret_data', which
is thus left with random data. Then qobject_unref() fails and calls abort().

Fix by initialising 'ret_data' properly.

Fixes: 5f07c4d60d09 ("qapi: Flatten object-add")
Signed-off-by: Anthony PERARD 
Reviewed-by: Markus Armbruster 
Message-Id: <20200406164207.1446817-1-anthony.per...@citrix.com>
---
 hw/block/xen-block.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/block/xen-block.c b/hw/block/xen-block.c
index 07bb32e22b51..99cb4c67cb09 100644
--- a/hw/block/xen-block.c
+++ b/hw/block/xen-block.c
@@ -860,7 +860,7 @@ static XenBlockIOThread *xen_block_iothread_create(const 
char *id,
 XenBlockIOThread *iothread = g_new(XenBlockIOThread, 1);
 Error *local_err = NULL;
 QDict *opts;
-QObject *ret_data;
+QObject *ret_data = NULL;
 
 iothread->id = g_strdup(id);
 
-- 
Anthony PERARD




[PULL 1/3] hw/usb/xen-usb.c: Pass struct usbback_req* to usbback_packet_complete()

2020-04-07 Thread Anthony PERARD
From: Peter Maydell 

The function usbback_packet_complete() currently takes a USBPacket*,
which must be a pointer to the packet field within a struct
usbback_req; the function uses container_of() to get the struct
usbback_req* given the USBPacket*.

This is unnecessarily confusing (and in particular it confuses the
Coverity Scan analysis, resulting in the false positive CID 1421919
where it thinks that we write off the end of the structure). Since
both callsites already have the pointer to the struct usbback_req,
just pass that in directly.

Signed-off-by: Peter Maydell 
Acked-by: Gerd Hoffmann 
Reviewed-by: Anthony PERARD 
Message-Id: <20200323164318.26567-1-peter.mayd...@linaro.org>
Signed-off-by: Anthony PERARD 
---
 hw/usb/xen-usb.c | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/hw/usb/xen-usb.c b/hw/usb/xen-usb.c
index 1fc2f32ce93d..961190d0f78c 100644
--- a/hw/usb/xen-usb.c
+++ b/hw/usb/xen-usb.c
@@ -347,13 +347,11 @@ static int32_t usbback_xlat_status(int status)
 return -ESHUTDOWN;
 }
 
-static void usbback_packet_complete(USBPacket *packet)
+static void usbback_packet_complete(struct usbback_req *usbback_req)
 {
-struct usbback_req *usbback_req;
+USBPacket *packet = _req->packet;
 int32_t status;
 
-usbback_req = container_of(packet, struct usbback_req, packet);
-
 QTAILQ_REMOVE(_req->stub->submit_q, usbback_req, q);
 
 status = usbback_xlat_status(packet->status);
@@ -566,7 +564,7 @@ static void usbback_dispatch(struct usbback_req 
*usbback_req)
 
 usb_handle_packet(usbback_req->stub->dev, _req->packet);
 if (usbback_req->packet.status != USB_RET_ASYNC) {
-usbback_packet_complete(_req->packet);
+usbback_packet_complete(usbback_req);
 }
 return;
 
@@ -993,7 +991,7 @@ static void xen_bus_complete(USBPort *port, USBPacket 
*packet)
 
 usbif = usbback_req->usbif;
 TR_REQ(>xendev, "\n");
-usbback_packet_complete(packet);
+usbback_packet_complete(usbback_req);
 }
 
 static USBPortOps xen_usb_port_ops = {
-- 
Anthony PERARD




[PULL 0/3] xen queue for 5.0

2020-04-07 Thread Anthony PERARD
The following changes since commit 8f0d25c464a1989d606f7b988d07b1147dfcde33:

  Merge remote-tracking branch 
'remotes/philmd-gitlab/tags/acceptance-fixes-20200407' into staging (2020-04-07 
15:10:11 +0100)

are available in the Git repository at:

  https://xenbits.xen.org/git-http/people/aperard/qemu-dm.git 
tags/pull-xen-20200407

for you to fetch changes up to 758af9cfabfb000eb00e42b9738e655b18fdd812:

  MAINTAINERS: Add xen-usb.c to Xen section (2020-04-07 16:13:26 +0100)


Xen queue for QEMU 5.0

- Fix for xen-block.
- A fix for a Coverity false positive in xen-usb.
- Update MAINTAINERS to add xen-usb.c to Xen section.


Anthony PERARD (2):
  xen-block: Fix uninitialized variable
  MAINTAINERS: Add xen-usb.c to Xen section

Peter Maydell (1):
  hw/usb/xen-usb.c: Pass struct usbback_req* to usbback_packet_complete()

 MAINTAINERS  |  1 +
 hw/block/xen-block.c |  2 +-
 hw/usb/xen-usb.c | 10 --
 3 files changed, 6 insertions(+), 7 deletions(-)



Re: [PATCH 3/5] x86_64/mm: map and unmap page tables in share_hotadd_m2p_table

2020-04-07 Thread Jan Beulich
On 07.04.2020 17:11, Hongyan Xia wrote:
> On Wed, 2020-04-01 at 14:29 +0200, Jan Beulich wrote:
>> On 23.03.2020 10:41, Hongyan Xia wrote:
>>> --- a/xen/include/asm-x86/page.h
>>> +++ b/xen/include/asm-x86/page.h
>>> @@ -196,6 +196,24 @@ static inline l4_pgentry_t
>>> l4e_from_paddr(paddr_t pa, unsigned int flags)
>>>  #define map_l2t_from_l3e(x)(l2_pgentry_t
>>> *)map_domain_page(l3e_get_mfn(x))
>>>  #define map_l3t_from_l4e(x)(l3_pgentry_t
>>> *)map_domain_page(l4e_get_mfn(x))
>>>  
>>> +#define l1e_from_l2e(l2e, off) ({   \
>>> +l1_pgentry_t *l1t = map_l1t_from_l2e(l2e);  \
>>> +l1_pgentry_t l1e = l1t[off];\
>>> +UNMAP_DOMAIN_PAGE(l1t); \
>>> +l1e; })
>>> +
>>> +#define l2e_from_l3e(l3e, off) ({   \
>>> +l2_pgentry_t *l2t = map_l2t_from_l3e(l3e);  \
>>> +l2_pgentry_t l2e = l2t[off];\
>>> +UNMAP_DOMAIN_PAGE(l2t); \
>>> +l2e; })
>>> +
>>> +#define l3e_from_l4e(l4e, off) ({   \
>>> +l3_pgentry_t *l3t = map_l3t_from_l4e(l4e);  \
>>> +l3_pgentry_t l3e = l3t[off];\
>>> +UNMAP_DOMAIN_PAGE(l3t); \
>>> +l3e; })
>>
>> There's a reason these are macros rather than inline functions,
>> I assume? (This reason would be nice to be stated in the
>> description.)
> 
> Actually no specific reasons, just to keep them as macros to be
> consistent with other helpers above. Fairly trivial to convert them
> into inline functions if this is desired.

Where possible this would be the preferred route for new helpers.

>> I don't see why you use UNMAP_DOMAIN_PAGE() rather than
>> unmap_domain_page() here - the variables in question go out of
>> scope right afterwards, so the storing of NULL into them is
>> pointless (and very likely to be eliminated by the compiler
>> anyway).
> 
> My intention is to just avoid using the lower case one altogether, so
> that one day when we need to expand any function or macros we do not
> need to look back at the code and worry about whether any lower case
> ones need to be converted to upper case (sometimes for large functions
> this may not be trivial), and the compiler can deal with the extra
> nullification anyway.

I don't agree with this goal; perhaps others on Cc have an opinion?

Jan



Re: [PATCH 3/5] x86_64/mm: map and unmap page tables in share_hotadd_m2p_table

2020-04-07 Thread Hongyan Xia
On Wed, 2020-04-01 at 14:29 +0200, Jan Beulich wrote:
> On 23.03.2020 10:41, Hongyan Xia wrote:
> > --- a/xen/include/asm-x86/page.h
> > +++ b/xen/include/asm-x86/page.h
> > @@ -196,6 +196,24 @@ static inline l4_pgentry_t
> > l4e_from_paddr(paddr_t pa, unsigned int flags)
> >  #define map_l2t_from_l3e(x)(l2_pgentry_t
> > *)map_domain_page(l3e_get_mfn(x))
> >  #define map_l3t_from_l4e(x)(l3_pgentry_t
> > *)map_domain_page(l4e_get_mfn(x))
> >  
> > +#define l1e_from_l2e(l2e, off) ({   \
> > +l1_pgentry_t *l1t = map_l1t_from_l2e(l2e);  \
> > +l1_pgentry_t l1e = l1t[off];\
> > +UNMAP_DOMAIN_PAGE(l1t); \
> > +l1e; })
> > +
> > +#define l2e_from_l3e(l3e, off) ({   \
> > +l2_pgentry_t *l2t = map_l2t_from_l3e(l3e);  \
> > +l2_pgentry_t l2e = l2t[off];\
> > +UNMAP_DOMAIN_PAGE(l2t); \
> > +l2e; })
> > +
> > +#define l3e_from_l4e(l4e, off) ({   \
> > +l3_pgentry_t *l3t = map_l3t_from_l4e(l4e);  \
> > +l3_pgentry_t l3e = l3t[off];\
> > +UNMAP_DOMAIN_PAGE(l3t); \
> > +l3e; })
> 
> There's a reason these are macros rather than inline functions,
> I assume? (This reason would be nice to be stated in the
> description.)

Actually no specific reasons, just to keep them as macros to be
consistent with other helpers above. Fairly trivial to convert them
into inline functions if this is desired.
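
For illustration, a minimal sketch of what the inline form of one of these
helpers might look like (purely a sketch, not part of the posted series; it
reuses the existing map_l2t_from_l3e() and unmap_domain_page() helpers):

static inline l2_pgentry_t l2e_from_l3e(l3_pgentry_t l3e, unsigned int off)
{
    /* Map the L2 table referenced by the L3 entry and read one entry... */
    const l2_pgentry_t *l2t = map_l2t_from_l3e(l3e);
    l2_pgentry_t l2e = l2t[off];

    /* ... then drop the transient mapping before returning the entry. */
    unmap_domain_page(l2t);

    return l2e;
}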

> I don't see why you use UNMAP_DOMAIN_PAGE() rather than
> unmap_domain_page() here - the variables in question go out of
> scope right afterwards, so the storing of NULL into them is
> pointless (and very likely to be eliminated by the compiler
> anyway).

My intention is to just avoid using the lower case one altogether, so
that one day when we need to expand any function or macros we do not
need to look back at the code and worry about whether any lower case
ones need to be converted to upper case (sometimes for large functions
this may not be trivial), and the compiler can deal with the extra
nullification anyway.
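
(For reference, the upper-case helper being discussed is presumably along the
lines of the sketch below; it is shown only to illustrate the extra NULL-ing
that the lower-case unmap_domain_page() does not do, and is not quoted from
the series:)

#define UNMAP_DOMAIN_PAGE(p) do {   \
    unmap_domain_page(p);           \
    (p) = NULL;                     \
} while ( false )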

> Finally the pointers should each be "to const", as you're
> only reading through them.

Good point. Will const qualify in next revision.

Thanks,
Hongyan




Re: Live migration and PV device handling

2020-04-07 Thread Tamas K Lengyel
On Tue, Apr 7, 2020 at 1:57 AM Paul Durrant  wrote:
>
> > -Original Message-
> > From: Xen-devel  On Behalf Of Tamas 
> > K Lengyel
> > Sent: 06 April 2020 18:31
> > To: Andrew Cooper 
> > Cc: Xen-devel ; Anastassios Nanos 
> > 
> > Subject: Re: Live migration and PV device handling
> >
> > On Mon, Apr 6, 2020 at 11:24 AM Andrew Cooper  
> > wrote:
> > >
> > > On 06/04/2020 18:16, Tamas K Lengyel wrote:
> > > > On Fri, Apr 3, 2020 at 6:44 AM Andrew Cooper 
> > > >  wrote:
> > > >> On 03/04/2020 13:32, Anastassios Nanos wrote:
> > > >>> Hi all,
> > > >>>
> > > >>> I am trying to understand how live-migration happens in xen. I am
> > > >>> looking in the HVM guest case and I have dug into the relevant parts
> > > >>> of the toolstack and the hypervisor regarding memory, vCPU context
> > > >>> etc.
> > > >>>
> > > >>> In particular, I am interested in how PV device migration happens. I
> > > >>> assume that the guest is not aware of any suspend/resume operations
> > > >>> being done
> > > >> Sadly, this assumption is not correct.  HVM guests with PV drivers
> > > >> currently have to be aware in exactly the same way as PV guests.
> > > >>
> > > >> Work is in progress to try and address this.  See
> > > >> https://xenbits.xen.org/gitweb/?p=xen.git;a=commitdiff;h=775a02452ddf3a6889690de90b1a94eb29c3c732
> > > >> (sorry - for some reason that doc isn't being rendered properly in
> > > >> https://xenbits.xen.org/docs/ )
> > > > That proposal is very interesting - first time it came across my radar
> > > > - but I dislike the idea that domain IDs need to be preserved for
> > > > uncooperative migration to work.
> > >
> > > The above restriction is necessary to work with existing guests, which
> > > is an implementation requirement of the folks driving the work.
> > >
> > > > Ideally I would be able to take
> > > > advantage of the same plumbing to perform forking of VMs with PV
> > > > drivers where preserving the domain id is impossible since its still
> > > > in use.
> > >
> > > We would of course like to make changes to remove the above restriction
> > > in the longterm.  The problem is that it is not a trivial thing to fix.
> > > Various things were discussed in Chicago, but I don't recall if any of
> > > the plans made their way onto xen-devel.
> >
> > Yea I imagine trying to get this to work with existing PV drivers is
> > not possible in any other way.
>
> No, as the doc says, the domid forms part of the protocol, hence being 
> visible to the guest, and the guest may sample and use the value when making 
> certain hypercalls (only some enforce use of DOMID_SELF). Thus faking it 
> without risking a guest crash is going to be difficult.
>
> > But if we can update the PV driver code
> > such that in the longterm it can work without preserving the domain
> > ID, that would be worthwhile.
> >
>
> I think that ship has sailed. It would probably be simpler and cheaper to 
> just get virtio working with Xen.

That would certainly make sense to me. That would reduce the
maintenance overhead considerably if we all converged on a single
standard.

Tamas



Re: [PATCH 0/5] use new API for Xen page tables

2020-04-07 Thread Hongyan Xia
On Mon, 2020-04-06 at 13:03 +0200, Jan Beulich wrote:
> On 06.04.2020 10:27, Hongyan Xia wrote:
> > Ping.
> 
> Does this somehow imply you didn't get my replies sent on the 1st?
> 
> Jan

Apologies. Somehow your replies ended up in a separate thread. There is
a problem with my email client.

Hongyan




[PATCH] config: use mini-os master for unstable

2020-04-07 Thread Juergen Gross
We haven't used mini-os master for about 2 years now due to a stubdom
test failing [1]. Booting a guest with mini-os master used for building
stubdom didn't reveal any problem, so use master for unstable in order
to let OSStest find any problems not showing up in the local test.

[1]: https://lists.xen.org/archives/html/minios-devel/2018-04/msg00015.html

Signed-off-by: Juergen Gross 
---
 Config.mk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Config.mk b/Config.mk
index dc6e7d03df..0f303c79b2 100644
--- a/Config.mk
+++ b/Config.mk
@@ -245,7 +245,7 @@ MINIOS_UPSTREAM_URL ?= git://xenbits.xen.org/mini-os.git
 endif
 OVMF_UPSTREAM_REVISION ?= 20d2e5a125e34fc8501026613a71549b2a1a3e54
 QEMU_UPSTREAM_REVISION ?= master
-MINIOS_UPSTREAM_REVISION ?= 0b4b7897e08b967a09bed2028a79fabff82342dd
+MINIOS_UPSTREAM_REVISION ?= master
 
 SEABIOS_UPSTREAM_REVISION ?= rel-1.13.0
 
-- 
2.16.4




Re: [PATCH] x86/PoD: correct ordering of checks in p2m_pod_zero_check()

2020-04-07 Thread Andrew Cooper
On 07/04/2020 12:07, Jan Beulich wrote:
> Commit 0537d246f8db ("mm: add 'is_special_page' inline function...")
> moved the is_special_page() checks first in its respective changes to
> PoD code. While this is fine for p2m_pod_zero_check_superpage(), the
> validity of the MFN is inferred in both cases from the p2m_is_ram()
> check, which therefore also needs to come first in this 2nd instance.
>
> Take the opportunity and address latent UB here as well - transform
> the MFN into struct page_info * only after having established that
> this is a valid page.
>
> Signed-off-by: Jan Beulich 

Acked-by: Andrew Cooper 



Re: [PATCH] x86/PoD: correct ordering of checks in p2m_pod_zero_check()

2020-04-07 Thread Jan Beulich
On 07.04.2020 14:39, Paul Durrant wrote:
>> -Original Message-
>> From: Jan Beulich 
>> Sent: 07 April 2020 12:08
>> To: xen-devel@lists.xenproject.org
>> Cc: Andrew Cooper ; Roger Pau Monné 
>> ; Wei Liu
>> ; Paul Durrant 
>> Subject: [PATCH] x86/PoD: correct ordering of checks in p2m_pod_zero_check()
>>
>> Commit 0537d246f8db ("mm: add 'is_special_page' inline function...")
>> moved the is_special_page() checks first in its respective changes to
>> PoD code. While this is fine for p2m_pod_zero_check_superpage(), the
>> validity of the MFN is inferred in both cases from the p2m_is_ram()
>> check, which therefore also needs to come first in this 2nd instance.
>>
>> Take the opportunity and address latent UB here as well - transform
>> the MFN into struct page_info * only after having established that
>> this is a valid page.
>>
>> Signed-off-by: Jan Beulich 
> 
> Reviewed-by: Paul Durrant 

Thanks.

> ...with a suggestion below
> 
>> ---
>> I will admit that this was build tested only. I did observe the crash
>> late yesterday while in the office, but got around to analyzing it only
>> today, where I'm again restricted in what I can reasonably test.
>>
>> --- a/xen/arch/x86/mm/p2m-pod.c
>> +++ b/xen/arch/x86/mm/p2m-pod.c
>> @@ -877,23 +877,25 @@ p2m_pod_zero_check(struct p2m_domain *p2
>>  for ( i = 0; i < count; i++ )
>>  {
>>  p2m_access_t a;
>> -struct page_info *pg;
>>
>>  mfns[i] = p2m->get_entry(p2m, gfns[i], types + i, ,
>>   0, NULL, NULL);
>> -pg = mfn_to_page(mfns[i]);
>>
>>  /*
>>   * If this is ram, and not a pagetable or a special page, and
>>   * probably not mapped elsewhere, map it; otherwise, skip.
>>   */
>> -if ( !is_special_page(pg) && p2m_is_ram(types[i]) &&
>> - (pg->count_info & PGC_allocated) &&
>> - !(pg->count_info & PGC_page_table) &&
>> - ((pg->count_info & PGC_count_mask) <= max_ref) )
>> -map[i] = map_domain_page(mfns[i]);
>> -else
>> -map[i] = NULL;
>> +map[i] = NULL;
>> +if ( p2m_is_ram(types[i]) )
>> +{
>> +const struct page_info *pg = mfn_to_page(mfns[i]);
> 
> Perhaps have local scope stack variable for count_info too?

I'd view this as useful only if ...

>> +
>> +if ( !is_special_page(pg) &&

... this could then also be made make use of it.

Jan



RE: [PATCH] x86/PoD: correct ordering of checks in p2m_pod_zero_check()

2020-04-07 Thread Paul Durrant
> -Original Message-
> From: Jan Beulich 
> Sent: 07 April 2020 12:08
> To: xen-devel@lists.xenproject.org
> Cc: Andrew Cooper ; Roger Pau Monné 
> ; Wei Liu
> ; Paul Durrant 
> Subject: [PATCH] x86/PoD: correct ordering of checks in p2m_pod_zero_check()
> 
> Commit 0537d246f8db ("mm: add 'is_special_page' inline function...")
> moved the is_special_page() checks first in its respective changes to
> PoD code. While this is fine for p2m_pod_zero_check_superpage(), the
> validity of the MFN is inferred in both cases from the p2m_is_ram()
> check, which therefore also needs to come first in this 2nd instance.
> 
> Take the opportunity and address latent UB here as well - transform
> the MFN into struct page_info * only after having established that
> this is a valid page.
> 
> Signed-off-by: Jan Beulich 

Reviewed-by: Paul Durrant 

...with a suggestion below

> ---
> I will admit that this was build tested only. I did observe the crash
> late yesterday while in the office, but got around to analyzing it only
> today, where I'm again restricted in what I can reasonably test.
> 
> --- a/xen/arch/x86/mm/p2m-pod.c
> +++ b/xen/arch/x86/mm/p2m-pod.c
> @@ -877,23 +877,25 @@ p2m_pod_zero_check(struct p2m_domain *p2
>  for ( i = 0; i < count; i++ )
>  {
>  p2m_access_t a;
> -struct page_info *pg;
> 
>  mfns[i] = p2m->get_entry(p2m, gfns[i], types + i, ,
>   0, NULL, NULL);
> -pg = mfn_to_page(mfns[i]);
> 
>  /*
>   * If this is ram, and not a pagetable or a special page, and
>   * probably not mapped elsewhere, map it; otherwise, skip.
>   */
> -if ( !is_special_page(pg) && p2m_is_ram(types[i]) &&
> - (pg->count_info & PGC_allocated) &&
> - !(pg->count_info & PGC_page_table) &&
> - ((pg->count_info & PGC_count_mask) <= max_ref) )
> -map[i] = map_domain_page(mfns[i]);
> -else
> -map[i] = NULL;
> +map[i] = NULL;
> +if ( p2m_is_ram(types[i]) )
> +{
> +const struct page_info *pg = mfn_to_page(mfns[i]);

Perhaps have local scope stack variable for count_info too?

> +
> +if ( !is_special_page(pg) &&
> + (pg->count_info & PGC_allocated) &&
> + !(pg->count_info & PGC_page_table) &&
> + ((pg->count_info & PGC_count_mask) <= max_ref) )
> +map[i] = map_domain_page(mfns[i]);
> +}
>  }
> 
>  /*




Re: [PATCH v2 for-5.0] xen-block: Fix double qlist remove and request leak

2020-04-07 Thread Max Reitz
On 06.04.20 16:02, Anthony PERARD wrote:
> Commit a31ca6801c02 ("qemu/queue.h: clear linked list pointers on
> remove") revealed that a request was removed twice from a list, once
> in xen_block_finish_request() and a second time in
> xen_block_release_request() when both functions are called from
> xen_block_complete_aio(). But also, the `requests_inflight' counter is
> decreased twice, and thus becomes negative.
> 
> This is a bug that was introduced in bfd0d6366043, where a `finished'
> list was removed.
> 
> That commit also introduced a request leak in xen_block_do_aio():
> that function calls xen_block_finish_request() but the request is
> never released after that.
> 
> To fix both issues, we make two changes:
> - we squash finish_request() and release_request() together, as we want
>   to remove a request from the 'inflight' list and add it to the 'freelist'.
> - before releasing a request, we need to let the other end know the
>   result, thus we should call xen_block_send_response() before
>   releasing the request.
> 
> The first change fixes the double QLIST_REMOVE() as we remove the extra
> call. The second change makes the leak go away because if we want to
> call finish_request(), we need to call a function that does all of
> finish, send response, and release.
> 
> Fixes: bfd0d6366043 ("xen-block: improve response latency")
> Signed-off-by: Anthony PERARD 
> ---
>  hw/block/dataplane/xen-block.c | 48 --
>  1 file changed, 16 insertions(+), 32 deletions(-)

I’m going to send a pull request today anyway, so I hope you won’t mind
and let me take this patch to my branch (with Paul’s suggestions
incorporated):

https://git.xanclic.moe/XanClic/qemu/commits/branch/block

Max





[PATCH] x86/PoD: correct ordering of checks in p2m_pod_zero_check()

2020-04-07 Thread Jan Beulich
Commit 0537d246f8db ("mm: add 'is_special_page' inline function...")
moved the is_special_page() checks first in its respective changes to
PoD code. While this is fine for p2m_pod_zero_check_superpage(), the
validity of the MFN is inferred in both cases from the p2m_is_ram()
check, which therefore also needs to come first in this 2nd instance.

Take the opportunity and address latent UB here as well - transform
the MFN into struct page_info * only after having established that
this is a valid page.

Signed-off-by: Jan Beulich 
---
I will admit that this was build tested only. I did observe the crash
late yesterday while in the office, but got around to analyzing it only
today, where I'm again restricted in what I can reasonably test.

--- a/xen/arch/x86/mm/p2m-pod.c
+++ b/xen/arch/x86/mm/p2m-pod.c
@@ -877,23 +877,25 @@ p2m_pod_zero_check(struct p2m_domain *p2
 for ( i = 0; i < count; i++ )
 {
 p2m_access_t a;
-struct page_info *pg;
 
 mfns[i] = p2m->get_entry(p2m, gfns[i], types + i, ,
  0, NULL, NULL);
-pg = mfn_to_page(mfns[i]);
 
 /*
  * If this is ram, and not a pagetable or a special page, and
  * probably not mapped elsewhere, map it; otherwise, skip.
  */
-if ( !is_special_page(pg) && p2m_is_ram(types[i]) &&
- (pg->count_info & PGC_allocated) &&
- !(pg->count_info & PGC_page_table) &&
- ((pg->count_info & PGC_count_mask) <= max_ref) )
-map[i] = map_domain_page(mfns[i]);
-else
-map[i] = NULL;
+map[i] = NULL;
+if ( p2m_is_ram(types[i]) )
+{
+const struct page_info *pg = mfn_to_page(mfns[i]);
+
+if ( !is_special_page(pg) &&
+ (pg->count_info & PGC_allocated) &&
+ !(pg->count_info & PGC_page_table) &&
+ ((pg->count_info & PGC_count_mask) <= max_ref) )
+map[i] = map_domain_page(mfns[i]);
+}
 }
 
 /*



Re: [PATCH 0/3] Cleanup IOREQ server on exit

2020-04-07 Thread Maximilian Heyne

Could someone please have a look at this patch? It solves an actual issue:
Try soft-reset with qemu-xen-traditional and it will fail.

On 3/13/20 1:33 PM, Maximilian Heyne wrote:

Following up on commit 9c0eed61 ("qemu-trad: stop using the default IOREQ
server"), clean up the IOREQ server on exit. This fixes a bug with soft-reset
that shows up as "bind interdomain ioctl error 22" because the event channels
were not closed at the soft-reset and can't be bound again.

For this I used the exit notifiers from QEMU that I backported together with the
required generic notifier lists.

Anthony Liguori (1):
   Add support for generic notifier lists

Gerd Hoffmann (1):
   Add exit notifiers.

Maximilian Heyne (1):
   xen: cleanup IOREQ server on exit

  Makefile|  1 +
  hw/xen_machine_fv.c | 11 +++
  notify.c| 39 +++
  notify.h| 43 +++
  sys-queue.h |  5 +
  sysemu.h|  5 +
  vl.c| 20 
  7 files changed, 124 insertions(+)
  create mode 100644 notify.c
  create mode 100644 notify.h





Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879




Re: [PATCH 7/7] xen/guest_access: Fix coding style in xen/guest_access.h

2020-04-07 Thread Julien Grall




On 07/04/2020 09:17, Jan Beulich wrote:

On 04.04.2020 15:10, Julien Grall wrote:

From: Julien Grall 

 * Add space before and after operator
 * Align \
 * Format comments


To be honest, despite the reason you give for the split in patch 6,
I'd rather see code movement like this to do formatting adjustments
right away.


Thank you for thinking I can move code and also modify coding style at 
the same time :).


However I know mistakes can be easily made and may not be caught during 
review (possibly leading to an XSA). So I tend to adhere to the KISS 
principle.



But if you're convinced the split is better, I'm not
meaning to insist.


I will keep the code as-is unless someone else think it is bad idea.

Cheers,

--
Julien Grall



Re: [PATCH v2] xen/guest_access: Harden *copy_to_guest_offset() to prevent const dest operand

2020-04-07 Thread Jan Beulich
On 07.04.2020 11:01, Julien Grall wrote:
> Hi Jan,
> 
> On 06/04/2020 12:01, Jan Beulich wrote:
>> On 04.04.2020 15:06, Julien Grall wrote:
>>> From: Julien Grall 
>>>
>>> At the moment, *copy_to_guest_offset() will allow the hypervisor to copy
>>> data to guest handle marked const.
>>>
>>> Thankfully, no users of the helper will do that. Rather than hoping this
>>> can be caught during review, harden copy_to_guest_offset() so the build
>>> will fail if such users are introduced.
>>>
>>> There is no easy way to check whether a const is NULL in C99. The
>>> approach used is to introduce an unused variable that is non-const and
>>> assign the handle. If the handle were const, this would fail at build
>>> because without an explicit cast, it is not possible to assign a const
>>> variable to a non-const variable.
>>>
>>> Suggested-by: Jan Beulich 
>>> Signed-off-by: Julien Grall 
>>
>> I'm not convinced it is a good idea to add (recurring) comments
>> like you do - there are more aspects of these macros that would
>> be worth commenting on, and commenting on some but not all may
>> give the wrong impression of all subtleties being covered (also
>> for others).
> 
> I thought you would say that, but I don't think I am the best
> person to describe all the other subtleties of the code. Yet I
> didn't want to not comment the oddity of using a maybe_unused
> variable.

Well, to me the "__maybe_unused" is telling enough.

>> In any event I'd like to ask that each header gain such a
>> comment only once, with the other being a tiny reference to the
>> one "complete" instance.
> 
> I am not entirely sure how this would look like. We would need
> to rely on _t having the same meaning across all the headers.
> This is quite easy to miss during review, so my preference
> still sticks to multiple comments.

Well, I did say "each header" exactly because of this.

> Although I can reduce the size of the comment to one on top
> of the definition of _t. Something like: "Check if the handler
> is not const".

Ah yes, that would seem better (with s/handler/handle/ of course).

Jan



Re: [PATCH v2] xen/guest_access: Harden *copy_to_guest_offset() to prevent const dest operand

2020-04-07 Thread Julien Grall

Hi Jan,

On 06/04/2020 12:01, Jan Beulich wrote:

On 04.04.2020 15:06, Julien Grall wrote:

From: Julien Grall 

At the moment, *copy_to_guest_offset() will allow the hypervisor to copy
data to guest handle marked const.

Thankfully, no users of the helper will do that. Rather than hoping this
can be caught during review, harden copy_to_guest_offset() so the build
will fail if such users are introduced.

There is no easy way to check whether a const is NULL in C99. The
approach used is to introduce an unused variable that is non-const and
assign the handle. If the handle were const, this would fail at build
because without an explicit cast, it is not possible to assign a const
variable to a non-const variable.

Suggested-by: Jan Beulich 
Signed-off-by: Julien Grall 


I'm not convinced it is a good idea to add (recurring) comments
like you do - there are more aspects of these macros that would
be worth commenting on, and commenting on some but not all may
give the wrong impression of all subtleties being covered (also
for others).


I thought you would say that, but I don't think I am the best person to 
describe all the other subtleties of the code. Yet I didn't want to not 
comment the oddity of using a maybe_unused variable.




In any event I'd like to ask that each header gain such a
comment only once, with the other being a tiny reference to the
one "complete" instance.


I am not entirely sure how this would look like. We would need to rely 
on _t having the same meaning across all the headers. This is quite easy 
to miss during review, so my preference still sticks to multiple comments.


Although I can reduce the size of the comment to one on top of the 
definition of _t. Something like: "Check if the handler is not const".
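
Purely as an illustration of the principle behind that "_t" check (a sketch,
not the exact macro from the patch; it assumes Xen's __maybe_unused annotation
and a guest handle whose pointer member is named p): assigning the handle's
pointer to a plain void * without a cast discards the const qualifier, which a
-Werror build rejects whenever the handle's element type is const-qualified.

#define check_handle_is_writable(hnd) ({                \
    /* Check if the handle is not const. */             \
    void *_t __maybe_unused = (hnd).p;                  \
    (void)_t;                                           \
})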


Cheers,

--
Julien Grall



[qemu-mainline test] 149475: regressions - FAIL

2020-04-07 Thread osstest service owner
flight 149475 qemu-mainline real [real]
http://logs.test-lab.xenproject.org/osstest/logs/149475/

Regressions :-(

Tests which did not succeed and are blocking,
including tests which could not be run:
 test-amd64-amd64-xl-qemuu-debianhvm-amd64 10 debian-hvm-install fail REGR. vs. 
144861
 test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 10 debian-hvm-install fail 
REGR. vs. 144861
 test-amd64-amd64-xl-qemuu-win7-amd64 10 windows-install  fail REGR. vs. 144861
 test-amd64-i386-xl-qemuu-debianhvm-i386-xsm 10 debian-hvm-install fail REGR. 
vs. 144861
 test-amd64-i386-qemuu-rhel6hvm-intel 10 redhat-install   fail REGR. vs. 144861
 test-amd64-amd64-xl-qemuu-debianhvm-i386-xsm 10 debian-hvm-install fail REGR. 
vs. 144861
 test-amd64-i386-xl-qemuu-debianhvm-amd64 10 debian-hvm-install fail REGR. vs. 
144861
 test-amd64-i386-libvirt-qemuu-debianhvm-amd64-xsm 10 debian-hvm-install fail 
REGR. vs. 144861
 test-amd64-i386-xl-qemuu-ws16-amd64 10 windows-install   fail REGR. vs. 144861
 test-amd64-amd64-xl-qemuu-ws16-amd64 10 windows-install  fail REGR. vs. 144861
 test-amd64-amd64-xl-qemuu-ovmf-amd64 10 debian-hvm-install fail REGR. vs. 
144861
 test-amd64-amd64-qemuu-nested-amd 10 debian-hvm-install  fail REGR. vs. 144861
 test-amd64-i386-qemuu-rhel6hvm-amd 10 redhat-install fail REGR. vs. 144861
 test-amd64-amd64-libvirt 12 guest-start  fail REGR. vs. 144861
 test-amd64-i386-libvirt-pair 21 guest-start/debian   fail REGR. vs. 144861
 test-amd64-amd64-libvirt-xsm 12 guest-start  fail REGR. vs. 144861
 test-amd64-i386-libvirt  12 guest-start  fail REGR. vs. 144861
 test-amd64-i386-xl-qemuu-debianhvm-amd64-shadow 10 debian-hvm-install fail 
REGR. vs. 144861
 test-amd64-amd64-libvirt-pair 21 guest-start/debian  fail REGR. vs. 144861
 test-amd64-amd64-qemuu-nested-intel 10 debian-hvm-install fail REGR. vs. 144861
 test-amd64-amd64-xl-qemuu-debianhvm-amd64-shadow 10 debian-hvm-install fail 
REGR. vs. 144861
 test-amd64-i386-xl-qemuu-ovmf-amd64 10 debian-hvm-install fail REGR. vs. 144861
 test-amd64-i386-libvirt-xsm  12 guest-start  fail REGR. vs. 144861
 test-arm64-arm64-libvirt-xsm 12 guest-start  fail REGR. vs. 144861
 test-armhf-armhf-libvirt 12 guest-start  fail REGR. vs. 144861
 test-amd64-amd64-libvirt-vhd 10 debian-di-installfail REGR. vs. 144861
 test-amd64-amd64-xl-qcow210 debian-di-installfail REGR. vs. 144861
 test-armhf-armhf-xl-vhd  10 debian-di-installfail REGR. vs. 144861
 test-armhf-armhf-libvirt-raw 10 debian-di-installfail REGR. vs. 144861
 test-amd64-i386-xl-qemuu-win7-amd64 10 windows-install   fail REGR. vs. 144861

Regressions which are regarded as allowable (not blocking):
 test-amd64-amd64-xl-rtds 16 guest-localmigrate   fail REGR. vs. 144861

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-dom0pvh-xl-intel 18 guest-localmigrate/x10 fail baseline 
untested
 test-amd64-i386-xl-pvshim12 guest-start  fail   never pass
 test-arm64-arm64-xl-seattle  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-multivcpu 13 migrate-support-checkfail  never pass
 test-armhf-armhf-xl-multivcpu 14 saverestore-support-checkfail  never pass
 test-armhf-armhf-xl  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  14 saverestore-support-checkfail   never pass
 

Re: [XEN PATCH] hvmloader: Enable MMIO and I/O decode, after all resource allocation

2020-04-07 Thread Jan Beulich
On 07.04.2020 10:25, Roger Pau Monné wrote:
> IMO forcefully disabling decodings (and enabling them afterwards if
> already enabled) and leaving bus mastering as-is would be better.

Limiting this to the "already enabled" case may not be enough:
Devices needed for guest booting may need them to be enabled
even if prior to BAR assignment they were turned off. That's
what a BIOS would typically do, too. I've seen quite a few
BIOS setups actually where one could choose whether the BIOS
would do so just for what it recognized as boot devices, or
for all of them.

Jan



Re: [XEN PATCH] hvmloader: Enable MMIO and I/O decode, after all resource allocation

2020-04-07 Thread Jan Beulich
On 07.04.2020 10:14, Paul Durrant wrote:
>> -Original Message-
>> From: Roger Pau Monné 
>> Sent: 07 April 2020 09:07
>> To: p...@xen.org
>> Cc: 'Harsha Shamsundara Havanur' ; 
>> xen-devel@lists.xenproject.org; 'Wei Liu'
>> ; 'Andrew Cooper' ; 'Ian Jackson' 
>> ;
>> 'Jan Beulich' 
>> Subject: Re: [XEN PATCH] hvmloader: Enable MMIO and I/O decode, after all 
>> resource allocation
>>
>> On Tue, Apr 07, 2020 at 08:48:42AM +0100, Paul Durrant wrote:
 -Original Message-
 From: Xen-devel  On Behalf Of 
 Harsha Shamsundara Havanur
 Sent: 06 April 2020 18:47
 To: xen-devel@lists.xenproject.org
 Cc: Wei Liu ; Andrew Cooper ; Ian 
 Jackson
 ; Jan Beulich ; Harsha 
 Shamsundara Havanur
 ; Roger Pau Monné 
 Subject: [XEN PATCH] hvmloader: Enable MMIO and I/O decode, after all 
 resource allocation

 It was observed that PCI MMIO and/or IO BARs were programmed with
 BUS master, memory and I/O decodes (bits 0,1 and 2 of PCI COMMAND
 register) enabled, during PCI setup phase. This resulted in
 spurious and premature bus transactions as soon as the lower bar of
 the 64 bit bar is programmed. It is highly recommended that BARs be
 programmed whilst decode bits are cleared to avoid spurious bus
 transactions.

>>>
>>> It's not so much spurious transactions that are the issue. I think 
>>> "spurious and premature bus
>> transactions" should be replaced with "incorrect mappings being created".
>>>
>>> I believe the PCI spec says all three bits should be clear after reset 
>>> anyway, and BAR programming
>> whilst decodes are enabled causes problems for emulators such as QEMU which 
>> need to create and destroy
>> mappings between the gaddr being programming into the virtual BAR and the 
>> maddr programmed into the
>> physical BAR.
>>> Specifically the case we see is that a 64-bit memory BAR is programmed by 
>>> writing the lower half and
>> then the upper half. After the first write the BAR is mapped to an address 
>> under 4G that happens to
>> contain RAM, which is displaced by the mapping. After the second write the 
>> BAR is re-mapped to the
>> intended location but the RAM displaced by the other mapping is not 
>> restored. The OS then continues to
>> boot and function until at some point it happens to try to use that RAM at 
>> which point it suffers a
>> page fault and crashes. It was only by noticing that the faulting address 
>> lay within the transient BAR
>> mapping that we figured out what was happening.
>>
>> In order to fix this isn't it enough to just disable memory and IO
>> decodes, leaving bus mastering as it is?
>>
>> I assume there is (or was) some reason why bus master is enabled by
>> hvmloader in the first place?
>>
> 
> I admit it is a while since I went mining for the reason BME
> was being set but IIRC the commit that added the original code
> did not state why it was done.

I can second this observation, but this is not a reason to drop
the enabling again without suitable justification. If there are
babbling devices, perhaps those should be the ones quirked rather than,
as you did suggest in reply to Andrew, the ones which require it
enabled? (If such a requirement indeed exists, I assume they
would get handed to hvmloader with the bit already set, in
which case a middle ground may be to simply leave the bit
alone rather than force-enabling or force-disabling it. But
again the commit adding the enabling would stand against this,
because it likely was done for a reason even if that reason is
not stated in the commit.)

Jan



Re: [XEN PATCH] hvmloader: Enable MMIO and I/O decode, after all resource allocation

2020-04-07 Thread Roger Pau Monné
On Tue, Apr 07, 2020 at 09:14:53AM +0100, Paul Durrant wrote:
> > -Original Message-
> > From: Roger Pau Monné 
> > Sent: 07 April 2020 09:07
> > To: p...@xen.org
> > Cc: 'Harsha Shamsundara Havanur' ; 
> > xen-devel@lists.xenproject.org; 'Wei Liu'
> > ; 'Andrew Cooper' ; 'Ian Jackson' 
> > ;
> > 'Jan Beulich' 
> > Subject: Re: [XEN PATCH] hvmloader: Enable MMIO and I/O decode, after all 
> > resource allocation
> > 
> > On Tue, Apr 07, 2020 at 08:48:42AM +0100, Paul Durrant wrote:
> > > > -Original Message-
> > > > From: Xen-devel  On Behalf Of 
> > > > Harsha Shamsundara Havanur
> > > > Sent: 06 April 2020 18:47
> > > > To: xen-devel@lists.xenproject.org
> > > > Cc: Wei Liu ; Andrew Cooper ; 
> > > > Ian Jackson
> > > > ; Jan Beulich ; Harsha 
> > > > Shamsundara Havanur
> > > > ; Roger Pau Monné 
> > > > Subject: [XEN PATCH] hvmloader: Enable MMIO and I/O decode, after all 
> > > > resource allocation
> > > >
> > > > It was observed that PCI MMIO and/or IO BARs were programmed with
> > > > BUS master, memory and I/O decodes (bits 0,1 and 2 of PCI COMMAND
> > > > register) enabled, during PCI setup phase. This resulted in
> > > > spurious and premature bus transactions as soon as the lower bar of
> > > > the 64 bit bar is programmed. It is highly recommended that BARs be
> > > > programmed whilst decode bits are cleared to avoid spurious bus
> > > > transactions.
> > > >
> > >
> > > It's not so much spurious transactions that are the issue. I think 
> > > "spurious and premature bus
> > transactions" should be replaced with "incorrect mappings being created".
> > >
> > > I believe the PCI spec says all three bits should be clear after reset 
> > > anyway, and BAR programming
> > whilst decodes are enabled causes problems for emulators such as QEMU which 
> > need to create and destroy
> > mappings between the gaddr being programming into the virtual BAR and the 
> > maddr programmed into the
> > physical BAR.
> > > Specifically the case we see is that a 64-bit memory BAR is programmed by 
> > > writing the lower half and
> > then the upper half. After the first write the BAR is mapped to an address 
> > under 4G that happens to
> > contain RAM, which is displaced by the mapping. After the second write the 
> > BAR is re-mapped to the
> > intended location but the RAM displaced by the other mapping is not 
> > restored. The OS then continues to
> > boot and function until at some point it happens to try to use that RAM at 
> > which point it suffers a
> > page fault and crashes. It was only by noticing that the faulting address 
> > lay within the transient BAR
> > mapping that we figured out what was happening.
> > 
> > In order to fix this isn't it enough to just disable memory and IO
> > decodes, leaving bus mastering as it is?
> > 
> > I assume there is (or was) some reason why bus master is enabled by
> > hvmloader in the first place?
> > 
> 
> I admit it is a while since I went mining for the reason BME was being set 
> but IIRC the commit that added the original code did not state why it was 
> done.
> 
> In the past I have run with local hacks disabling it whilst playing with GPU 
> pass-through and not noticed it causing any problems. There is an argument to 
> say that hvmloader should perhaps leave it alone but there is no good reason 
> I can think of why it ought to be enabling it.

IMO forcefully disabling decodings (and enabling them afterwards if
already enabled) and leaving bus mastering as-is would be better.
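
A minimal sketch of that approach, assuming hvmloader's pci_readw()/pci_writew()
accessors and the PCI_COMMAND_* constants already used in pci.c (illustrative
only, not the patch under review):

static void program_bars_decodes_masked(unsigned int devfn)
{
    uint16_t cmd = pci_readw(devfn, PCI_COMMAND);
    uint16_t decodes = cmd & (PCI_COMMAND_MEMORY | PCI_COMMAND_IO);

    /* Mask decodes while the BARs are (re)written; leave BME as it is. */
    pci_writew(devfn, PCI_COMMAND, cmd & ~decodes);

    /* ... size and program all BARs for this devfn here ... */

    /* Re-enable only what was already enabled when the device was found. */
    if ( decodes )
        pci_writew(devfn, PCI_COMMAND,
                   pci_readw(devfn, PCI_COMMAND) | decodes);
}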

Thanks, Roger.



Re: [PATCH 7/7] xen/guest_access: Fix coding style in xen/guest_access.h

2020-04-07 Thread Jan Beulich
On 04.04.2020 15:10, Julien Grall wrote:
> From: Julien Grall 
> 
> * Add space before and after operator
> * Align \
> * Format comments

To be honest, despite the reason you give for the split in patch 6,
I'd rather see code movement like this to do formatting adjustments
right away. But if you're convinced the split is better, I'm not
meaning to insist.

Jan



Re: [PATCH 6/7] xen/guest_access: Consolidate guest access helpers in xen/guest_access.h

2020-04-07 Thread Jan Beulich
On 04.04.2020 15:10, Julien Grall wrote:
> From: Julien Grall 
> 
> Most of the helpers to access guest memory are implemented the same way
> on Arm and x86. The only differences are:
> - guest_handle_{from, to}_param(): while on x86 XEN_GUEST_HANDLE()
>   and XEN_GUEST_HANDLE_PARAM() are the same, they are not on Arm. It
>   is still fine to use the Arm implementation on x86.
> - __clear_guest_offset(): Interestingly, the prototype does not match
>   between x86 and Arm. However, the Arm one is bogus, so the x86
>   implementation can be used.
> - guest_handle{,_subrange}_okay(): They validly differ
>   because Arm only supports auto-translated guests and therefore
>   handles are always valid.

While I'm fine in principle with such consolidation, I'm afraid I
really need to ask for some historical background to be added
here. It may very well be that there's a reason for the separation
(likely to be found in the removed ia64 or ppc ports), which may
then provide a hint at why future ports may want to have these
separated. If such reasons exist, I'd prefer to avoid the back and
forth between headers. What we could do in such a case is borrow
Linux's asm-generic/ concept, and move the "typical"
implementation there. (And of course if there were no noticeable
reasons for the split, the change as it is would be fine in
general; saying so without having looked at the details of it,
yet).
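
For reference, the asm-generic/ concept referred to here, sketched with
purely hypothetical names (not actual Xen or Linux headers): the arch
header overrides what it needs, and the generic header supplies defaults
for everything else.

/* include/asm-generic/demo_access.h (hypothetical) */
#ifndef ARCH_HAS_DEMO_HANDLE_OKAY
/* Default suits auto-translated guests: handles are always valid. */
#define demo_handle_okay(hnd, nr) 1
#endif

/* include/asm-x86/demo_access.h (hypothetical) */
#define ARCH_HAS_DEMO_HANDLE_OKAY
#define demo_handle_okay(hnd, nr) demo_array_access_ok(hnd, nr)
#include <asm-generic/demo_access.h>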

Jan



RE: [XEN PATCH] hvmloader: Enable MMIO and I/O decode, after all resource allocation

2020-04-07 Thread Paul Durrant
> -Original Message-
> From: Roger Pau Monné 
> Sent: 07 April 2020 09:07
> To: p...@xen.org
> Cc: 'Harsha Shamsundara Havanur' ; 
> xen-devel@lists.xenproject.org; 'Wei Liu'
> ; 'Andrew Cooper' ; 'Ian Jackson' 
> ;
> 'Jan Beulich' 
> Subject: Re: [XEN PATCH] hvmloader: Enable MMIO and I/O decode, after all 
> resource allocation
> 
> On Tue, Apr 07, 2020 at 08:48:42AM +0100, Paul Durrant wrote:
> > > -Original Message-
> > > From: Xen-devel  On Behalf Of 
> > > Harsha Shamsundara Havanur
> > > Sent: 06 April 2020 18:47
> > > To: xen-devel@lists.xenproject.org
> > > Cc: Wei Liu ; Andrew Cooper ; 
> > > Ian Jackson
> > > ; Jan Beulich ; Harsha 
> > > Shamsundara Havanur
> > > ; Roger Pau Monné 
> > > Subject: [XEN PATCH] hvmloader: Enable MMIO and I/O decode, after all 
> > > resource allocation
> > >
> > > It was observed that PCI MMIO and/or IO BARs were programmed with
> > > BUS master, memory and I/O decodes (bits 0,1 and 2 of PCI COMMAND
> > > register) enabled, during PCI setup phase. This resulted in
> > > spurious and premature bus transactions as soon as the lower bar of
> > > the 64 bit bar is programmed. It is highly recommended that BARs be
> > > programmed whilst decode bits are cleared to avoid spurious bus
> > > transactions.
> > >
> >
> > It's not so much spurious transactions that are the issue. I think 
> > "spurious and premature bus
> transactions" should be replaced with "incorrect mappings being created".
> >
> > I believe the PCI spec says all three bits should be clear after reset 
> > anyway, and BAR programming
> whilst decodes are enabled causes problems for emulators such as QEMU which 
> need to create and destroy
> mappings between the gaddr being programmed into the virtual BAR and the 
> maddr programmed into the
> physical BAR.
> > Specifically the case we see is that a 64-bit memory BAR is programmed by 
> > writing the lower half and
> then the upper half. After the first write the BAR is mapped to an address 
> under 4G that happens to
> contain RAM, which is displaced by the mapping. After the second write the 
> BAR is re-mapped to the
> intended location but the RAM displaced by the other mapping is not restored. 
> The OS then continues to
> boot and function until at some point it happens to try to use that RAM at 
> which point it suffers a
> page fault and crashes. It was only by noticing that the faulting address lay 
> within the transient BAR
> mapping that we figured out what was happening.
> 
> In order to fix this isn't it enough to just disable memory and IO
> decodes, leaving bus mastering as it is?
> 
> I assume there is (or was) some reason why bus master is enabled by
> hvmloader in the first place?
> 

I admit it is a while since I went mining for the reason BME was being set but 
IIRC the commit that added the original code did not state why it was done.

In the past I have run with local hacks disabling it whilst playing with GPU 
pass-through and not noticed it causing any problems. There is an argument to 
say that hvmloader should perhaps leave it alone but there is no good reason I 
can think of why it ought to be enabling it.

  Paul

> Thanks, Roger.




Re: [XEN PATCH] hvmloader: Enable MMIO and I/O decode, after all resource allocation

2020-04-07 Thread Roger Pau Monné
On Tue, Apr 07, 2020 at 08:48:42AM +0100, Paul Durrant wrote:
> > -Original Message-
> > From: Xen-devel  On Behalf Of 
> > Harsha Shamsundara Havanur
> > Sent: 06 April 2020 18:47
> > To: xen-devel@lists.xenproject.org
> > Cc: Wei Liu ; Andrew Cooper ; Ian 
> > Jackson
> > ; Jan Beulich ; Harsha 
> > Shamsundara Havanur
> > ; Roger Pau Monné 
> > Subject: [XEN PATCH] hvmloader: Enable MMIO and I/O decode, after all 
> > resource allocation
> > 
> > It was observed that PCI MMIO and/or IO BARs were programmed with
> > BUS master, memory and I/O decodes (bits 0,1 and 2 of PCI COMMAND
> > register) enabled, during PCI setup phase. This resulted in
> > spurious and premature bus transactions as soon as the lower bar of
> > the 64 bit bar is programmed. It is highly recommended that BARs be
> > programmed whilst decode bits are cleared to avoid spurious bus
> > transactions.
> > 
> 
> It's not so much spurious transactions that are the issue. I think "spurious 
> and premature bus transactions" should be replaced with "incorrect mappings 
> being created".
> 
> I believe the PCI spec says all three bits should be clear after reset 
> anyway, and BAR programming whilst decodes are enabled causes problems for 
> emulators such as QEMU which need to create and destroy mappings between the 
> gaddr being programmed into the virtual BAR and the maddr programmed into 
> the physical BAR.
> Specifically the case we see is that a 64-bit memory BAR is programmed by 
> writing the lower half and then the upper half. After the first write the BAR 
> is mapped to an address under 4G that happens to contain RAM, which is 
> displaced by the mapping. After the second write the BAR is re-mapped to the 
> intended location but the RAM displaced by the other mapping is not restored. 
> The OS then continues to boot and function until at some point it happens to 
> try to use that RAM at which point it suffers a page fault and crashes. It 
> was only by noticing that the faulting address lay within the transient BAR 
> mapping that we figured out what was happening.

In order to fix this isn't it enough to just disable memory and IO
decodes, leaving bus mastering as it is?

I assume there is (or was) some reason why bus master is enabled by
hvmloader in the first place?

Thanks, Roger.



Re: [PATCH 1/7] xen/guest_access: Add missing emacs magics

2020-04-07 Thread Jan Beulich
On 04.04.2020 15:10, Julien Grall wrote:
> From: Julien Grall 
> 
> Add missing emacs magics for xen/guest_access.h and
> asm-x86/guest_access.h.
> 
> Signed-off-by: Julien Grall 

I don't think these are "missing"; as per ./CODING_STYLE they're
permitted, but not required (and I continue to question why one
form of such a comment should be preferred over possible other
forms other editors may support). Nevertheless, as this is in
line with what we have elsewhere:

Acked-by: Jan Beulich 

Jan



RE: Live migration and PV device handling

2020-04-07 Thread Paul Durrant
> -Original Message-
> From: Xen-devel  On Behalf Of Tamas K 
> Lengyel
> Sent: 06 April 2020 18:31
> To: Andrew Cooper 
> Cc: Xen-devel ; Anastassios Nanos 
> 
> Subject: Re: Live migration and PV device handling
> 
> On Mon, Apr 6, 2020 at 11:24 AM Andrew Cooper  
> wrote:
> >
> > On 06/04/2020 18:16, Tamas K Lengyel wrote:
> > > On Fri, Apr 3, 2020 at 6:44 AM Andrew Cooper  
> > > wrote:
> > >> On 03/04/2020 13:32, Anastassios Nanos wrote:
> > >>> Hi all,
> > >>>
> > >>> I am trying to understand how live-migration happens in xen. I am
> > >>> looking in the HVM guest case and I have dug into the relevant parts
> > >>> of the toolstack and the hypervisor regarding memory, vCPU context
> > >>> etc.
> > >>>
> > >>> In particular, I am interested in how PV device migration happens. I
> > >>> assume that the guest is not aware of any suspend/resume operations
> > >>> being done
> > >> Sadly, this assumption is not correct.  HVM guests with PV drivers
> > >> currently have to be aware in exactly the same way as PV guests.
> > >>
> > >> Work is in progress to try and address this.  See
> > >> https://xenbits.xen.org/gitweb/?p=xen.git;a=commitdiff;h=775a02452ddf3a6889690de90b1a94eb29c3c732
> > >> (sorry - for some reason that doc isn't being rendered properly in
> > >> https://xenbits.xen.org/docs/ )
> > > That proposal is very interesting - first time it came across my radar
> > > - but I dislike the idea that domain IDs need to be preserved for
> > > uncooperative migration to work.
> >
> > The above restriction is necessary to work with existing guests, which
> > is an implementation requirement of the folks driving the work.
> >
> > > Ideally I would be able to take
> > > advantage of the same plumbing to perform forking of VMs with PV
> > > drivers where preserving the domain id is impossible since it's still
> > > in use.
> >
> > We would of course like to make changes to remove the above restriction
> > in the longterm.  The problem is that it is not a trivial thing to fix.
> > Various things were discussed in Chicago, but I don't recall if any of
> > the plans made their way onto xen-devel.
> 
> Yea I imagine trying to get this to work with existing PV drivers is
> not possible in any other way.

No, as the doc says, the domid forms part of the protocol, hence being visible 
to the guest, and the guest may sample and use the value when making certain 
hypercalls (only some enforce use of DOMID_SELF). Thus faking it without 
risking a guest crash is going to be difficult.

> But if we can update the PV driver code
> such that in the longterm it can work without preserving the domain
> ID, that would be worthwhile.
> 

I think that ship has sailed. It would probably be simpler and cheaper to just 
get virtio working with Xen.

  Paul





Re: [XEN PATCH] hvmloader: Enable MMIO and I/O decode, after all resource allocation

2020-04-07 Thread Jan Beulich
On 06.04.2020 19:46, Harsha Shamsundara Havanur wrote:
> --- a/tools/firmware/hvmloader/pci.c
> +++ b/tools/firmware/hvmloader/pci.c
> @@ -84,6 +84,7 @@ void pci_setup(void)
>  uint32_t vga_devfn = 256;
>  uint16_t class, vendor_id, device_id;
>  unsigned int bar, pin, link, isa_irq;
> +uint8_t pci_devfn_decode_type[256] = {};

256 bytes of new stack space consumption looks like quite a lot to me.
Did you consider using two 256-bit bitmaps instead, totaling only
an extra 64 bytes of stack space?
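
A sketch of how that bitmap alternative could look (an assumption for
illustration only, not tested code): two 256-bit maps, one per decode
type, with the same final pass as in the patch.

uint32_t devfn_mem_decode[256 / 32] = {};
uint32_t devfn_io_decode[256 / 32] = {};
unsigned int devfn;

#define DEVFN_SET(map, devfn)  ((map)[(devfn) / 32] |= 1u << ((devfn) % 32))
#define DEVFN_TEST(map, devfn) ((map)[(devfn) / 32] & (1u << ((devfn) % 32)))

/* While programming BARs, record the needed decode per devfn with
 * DEVFN_SET(devfn_mem_decode, devfn) or DEVFN_SET(devfn_io_decode, devfn). */

/* Final pass, mirroring the loop added by the patch: */
for ( devfn = 0; devfn < 256; devfn++ )
{
    uint16_t decode =
        (DEVFN_TEST(devfn_mem_decode, devfn) ? PCI_COMMAND_MEMORY : 0) |
        (DEVFN_TEST(devfn_io_decode, devfn)  ? PCI_COMMAND_IO : 0);

    if ( decode )
        pci_writew(devfn, PCI_COMMAND,
                   pci_readw(devfn, PCI_COMMAND) | decode);
}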

> It was observed that PCI MMIO and/or IO BARs were programmed with
> BUS master, memory and I/O decodes (bits 0,1 and 2 of PCI COMMAND
> register) enabled, during PCI setup phase. This resulted in
> spurious and premature bus transactions as soon as the lower bar of
> the 64 bit bar is programmed. It is highly recommended that BARs be
> programmed whilst decode bits are cleared to avoid spurious bus
> transactions.
> 
> This patch addresses the issue by deferring enablement of memory and
> I/O decode in command register until all the resources, like interrupts
> I/O and/or MMIO BARs for all the PCI device functions are programmed.
> PCI bus memory and I/O space is enabled in command register after
> all the resources like interrupts, I/O and/or MMIO BARs are
> programmed for all valid device functions. PCI BUS MASTER is kept
> disabled in the bootloader as this needs to be enabled by the guest
> OS driver once it initializes and takes control of the device.

Identifying the commit that introduced the issue, even if very old,
would be nice (bbfa22f8c9ca). From looking at current code I first
got the impression that it might have been a result of splitting a
loop, as the main issue is that the bits get set after a first
BAR got programmed, without considering that there might be more.
However, the original commit looks to have assumed that there's
at most one BAR of each type per device (which may have been true
back then for just the few emulated devices there were).

> @@ -289,9 +290,14 @@ void pci_setup(void)
> devfn>>3, devfn&7, 'A'+pin-1, isa_irq);
>  }
>  
> -/* Enable bus mastering. */
> +/*
> + * Disable bus mastering, memory and I/O space, which is typical 
> device
> + * reset state. It is recommended that BAR programming be done whilst
> + * decode bits are cleared to avoid spurious DMAs and bus 
> transactions.
> + * Bus master should be enabled by guest driver when it deems fit.
> + */
>  cmd = pci_readw(devfn, PCI_COMMAND);
> -cmd |= PCI_COMMAND_MASTER;
> +cmd &= ~(PCI_COMMAND_MASTER | PCI_COMMAND_MEMORY | PCI_COMMAND_IO);
>  pci_writew(devfn, PCI_COMMAND, cmd);
>  }

I agree with Andrew that there doesn't look to be a reason to
fiddle with bus mastering here. This is all the more so since you don't
re-enable it later.

I'm also curious whether there are actually devices getting
handed to the domain (and hence hvmloader) with the memory
and/or I/O decode bits set. This would look to be wrong to me,
and would perhaps want fixing wherever they get set prematurely.
(This isn't to say that, to be on the safe side, we couldn't
also re-clear them here. Maybe there should be a warning issued
if either bit is set?)
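
A rough sketch of that "re-clear and warn" idea (an assumption only; it
relies on hvmloader's printf() plus the PCI accessors already used in
pci.c):

cmd = pci_readw(devfn, PCI_COMMAND);
if ( cmd & (PCI_COMMAND_MEMORY | PCI_COMMAND_IO) )
{
    /* The device arrived with decodes already enabled: warn and clear. */
    printf("pci dev %02x:%x: decodes unexpectedly enabled (cmd %04x)\n",
           devfn >> 3, devfn & 7, cmd);
    pci_writew(devfn, PCI_COMMAND,
               cmd & ~(PCI_COMMAND_MEMORY | PCI_COMMAND_IO));
}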

Jan



RE: [XEN PATCH] hvmloader: Enable MMIO and I/O decode, after all resource allocation

2020-04-07 Thread Paul Durrant
> -Original Message-
> From: Xen-devel  On Behalf Of Harsha 
> Shamsundara Havanur
> Sent: 06 April 2020 18:47
> To: xen-devel@lists.xenproject.org
> Cc: Wei Liu ; Andrew Cooper ; Ian 
> Jackson
> ; Jan Beulich ; Harsha 
> Shamsundara Havanur
> ; Roger Pau Monné 
> Subject: [XEN PATCH] hvmloader: Enable MMIO and I/O decode, after all 
> resource allocation
> 
> It was observed that PCI MMIO and/or IO BARs were programmed with
> BUS master, memory and I/O decodes (bits 0,1 and 2 of PCI COMMAND
> register) enabled, during PCI setup phase. This resulted in
> spurious and premature bus transactions as soon as the lower bar of
> the 64 bit bar is programmed. It is highly recommended that BARs be
> programmed whilst decode bits are cleared to avoid spurious bus
> transactions.
> 

It's not so much spurious transactions that are the issue. I think "spurious 
and premature bus transactions" should be replaced with "incorrect mappings 
being created".

I believe the PCI spec says all three bits should be clear after reset anyway, 
and BAR programming whilst decodes are enabled causes problems for emulators 
such as QEMU which need to create and destroy mappings between the gaddr being 
programmed into the virtual BAR and the maddr programmed into the physical BAR.
Specifically the case we see is that a 64-bit memory BAR is programmed by 
writing the lower half and then the upper half. After the first write the BAR 
is mapped to an address under 4G that happens to contain RAM, which is 
displaced by the mapping. After the second write the BAR is re-mapped to the 
intended location but the RAM displaced by the other mapping is not restored. 
The OS then continues to boot and function until at some point it happens to 
try to use that RAM at which point it suffers a page fault and crashes. It was 
only by noticing that the faulting address lay within the transient BAR mapping 
that we figured out what was happening.
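
To make that failure mode concrete, a sketch of the safe ordering for a
64-bit memory BAR (illustrative only; pci_writel() and the bar_reg /
bar_data / bar_data_upper names are assumed to match hvmloader's pci.c):

uint16_t cmd = pci_readw(devfn, PCI_COMMAND);

/* Keep memory decode off while the two halves are written... */
pci_writew(devfn, PCI_COMMAND, cmd & ~PCI_COMMAND_MEMORY);

pci_writel(devfn, bar_reg, bar_data);            /* low 32 bits */
pci_writel(devfn, bar_reg + 4, bar_data_upper);  /* high 32 bits */

/* ...and only re-enable it once the full 64-bit address is in place,
 * so the device model never maps the transient low-half-only address. */
pci_writew(devfn, PCI_COMMAND, cmd | PCI_COMMAND_MEMORY);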

> This patch addresses the issue by deferring enablement of memory and
> I/O decode in command register until all the resources, like interrupts
> I/O and/or MMIO BARs for all the PCI device functions are programmed.
> PCI bus memory and I/O space is enabled in command register after
> all the resources like interrupts, I/O and/or MMIO BARs are
> programmed for all valid device functions. PCI BUS MASTER is kept
> disabled in the bootloader as this needs to be enabled by the guest
> OS driver once it initializes and takes control of the device.
> 
> Signed-off-by: Harsha Shamsundara Havanur 
> Ack-by: Paul Durrant 

With the comment fixed as I suggest, you can replace this with:

Reviewed-by: Paul Durrant 

> ---
>  tools/firmware/hvmloader/pci.c | 24 +++-
>  1 file changed, 19 insertions(+), 5 deletions(-)
> 
> diff --git a/tools/firmware/hvmloader/pci.c b/tools/firmware/hvmloader/pci.c
> index 0b708bf578..0f31866453 100644
> --- a/tools/firmware/hvmloader/pci.c
> +++ b/tools/firmware/hvmloader/pci.c
> @@ -84,6 +84,7 @@ void pci_setup(void)
>  uint32_t vga_devfn = 256;
>  uint16_t class, vendor_id, device_id;
>  unsigned int bar, pin, link, isa_irq;
> +uint8_t pci_devfn_decode_type[256] = {};
> 
>  /* Resources assignable to PCI devices via BARs. */
>  struct resource {
> @@ -289,9 +290,14 @@ void pci_setup(void)
> devfn>>3, devfn&7, 'A'+pin-1, isa_irq);
>  }
> 
> -/* Enable bus mastering. */
> +/*
> + * Disable bus mastering, memory and I/O space, which is typical 
> device
> + * reset state. It is recommended that BAR programming be done whilst
> + * decode bits are cleared to avoid spurious DMAs and bus 
> transactions.
> + * Bus master should be enabled by guest driver when it deems fit.
> + */
>  cmd = pci_readw(devfn, PCI_COMMAND);
> -cmd |= PCI_COMMAND_MASTER;
> +cmd &= ~(PCI_COMMAND_MASTER | PCI_COMMAND_MEMORY | PCI_COMMAND_IO);
>  pci_writew(devfn, PCI_COMMAND, cmd);
>  }
> 
> @@ -503,10 +509,9 @@ void pci_setup(void)
>  if ( (bar_reg == PCI_ROM_ADDRESS) ||
>   ((bar_data & PCI_BASE_ADDRESS_SPACE) ==
>PCI_BASE_ADDRESS_SPACE_MEMORY) )
> -cmd |= PCI_COMMAND_MEMORY;
> +pci_devfn_decode_type[devfn] |= PCI_COMMAND_MEMORY;
>  else
> -cmd |= PCI_COMMAND_IO;
> -pci_writew(devfn, PCI_COMMAND, cmd);
> +pci_devfn_decode_type[devfn] |= PCI_COMMAND_IO;
>  }
> 
>  if ( pci_hi_mem_start )
> @@ -530,6 +535,15 @@ void pci_setup(void)
>  cmd |= PCI_COMMAND_IO;
>  pci_writew(vga_devfn, PCI_COMMAND, cmd);
>  }
> +
> +/* Enable memory and I/O space. */
> +for ( devfn = 0; devfn < 256; devfn++ )
> +if ( pci_devfn_decode_type[devfn] )
> +{
> +cmd = pci_readw(devfn, PCI_COMMAND);
> +cmd |= pci_devfn_decode_type[devfn];
> +pci_writew(devfn, PCI_COMMAND, cmd);
> +  

RE: [XEN PATCH] hvmloader: Enable MMIO and I/O decode, after all resource allocation

2020-04-07 Thread Paul Durrant
> -Original Message-
> From: Xen-devel  On Behalf Of Andrew 
> Cooper
> Sent: 06 April 2020 19:07
> To: Harsha Shamsundara Havanur ; 
> xen-devel@lists.xenproject.org
> Cc: Ian Jackson ; Wei Liu ; Jan 
> Beulich ;
> Roger Pau Monné 
> Subject: Re: [XEN PATCH] hvmloader: Enable MMIO and I/O decode, after all 
> resource allocation
> 
> On 06/04/2020 18:46, Harsha Shamsundara Havanur wrote:
> > It was observed that PCI MMIO and/or IO BARs were programmed with
> > BUS master, memory and I/O decodes (bits 0,1 and 2 of PCI COMMAND
> > register) enabled, during PCI setup phase. This resulted in
> > spurious and premature bus transactions as soon as the lower bar of
> > the 64 bit bar is programmed. It is highly recommended that BARs be
> > programmed whilst decode bits are cleared to avoid spurious bus
> > transactions.
> 
> What kinds of spurious transactions?
> 
> Keeping memory and I/O decoding disabled until the BARs are set up is a
> no-brainer, but busmastering is a more complicated subject.  Therefore,
> it would be helpful to know exactly what you've seen in the way of
> spurious transactions.
> 

I think you know of some GPU h/w that doesn't necessarily stop DMAing after an 
FLR. There is no reason why hvmloader, or anything else until the function 
driver loads, needs BME to be on. As you say mem and io decodes are 
no-brainers, yet without this patch hvmloader enables them after the first BAR 
on the device is programmed, thus causing much fun for device models when the 
subsequent BARs are programmed.

> >
> > This patch addresses the issue by deferring enablement of memory and
> > I/O decode in command register until all the resources, like interrupts
> > I/O and/or MMIO BARs for all the PCI device functions are programmed.
> > PCI bus memory and I/O space is enabled in command register after
> > all the resources like interrupts, I/O and/or MMIO BARs are
> > programmed for all valid device functions. PCI BUS MASTER is kept
> > disabled in the bootloader as this needs to be enabled by the guest
> > OS driver once it initializes and takes control of the device.
> 
> Has this been tested with an Intel integrated graphics card?  These have
> a habit of hitting a platform reset line if busmaster is ever disabled.
> 

No, we don't have suitable h/w for that AFAIK. If that is the case then we 
ought to quirk it.

> A lot of this will depend on what Qemu does behind the scenes, and
> whether disabling busmastering gets reflected in the real device.
> 

When I last looked at upstream QEMU, modifications to the BME bit were echoed 
through.

> >
> > Signed-off-by: Harsha Shamsundara Havanur 
> > Ack-by: Paul Durrant 
> 
> Acked-by

This was a little premature as I have not yet looked at the re-based code.

  Paul

> 
> ~Andrew





[ovmf test] 149477: all pass - PUSHED

2020-04-07 Thread osstest service owner
flight 149477 ovmf real [real]
http://logs.test-lab.xenproject.org/osstest/logs/149477/

Perfect :-)
All tests in this flight passed as required
version targeted for testing:
 ovmf 48f0e94921d83b8204f1025412e071b000394f04
baseline version:
 ovmf ee026ea78b0e32a9ffbaf0040afe91de8ae2179c

Last test of basis   149462  2020-04-06 12:09:20 Z0 days
Testing same since   149477  2020-04-07 01:39:26 Z0 days1 attempts


People who touched revisions under test:
  Michael Kubacki 
  Sean Brogan 

jobs:
 build-amd64-xsm  pass
 build-i386-xsm   pass
 build-amd64  pass
 build-i386   pass
 build-amd64-libvirt  pass
 build-i386-libvirt   pass
 build-amd64-pvopspass
 build-i386-pvops pass
 test-amd64-amd64-xl-qemuu-ovmf-amd64 pass
 test-amd64-i386-xl-qemuu-ovmf-amd64  pass



sg-report-flight on osstest.test-lab.xenproject.org
logs: /home/logs/logs
images: /home/logs/images

Logs, config files, etc. are available at
http://logs.test-lab.xenproject.org/osstest/logs

Explanation of these reports, and of osstest in general, is at
http://xenbits.xen.org/gitweb/?p=osstest.git;a=blob;f=README.email;hb=master
http://xenbits.xen.org/gitweb/?p=osstest.git;a=blob;f=README;hb=master

Test harness code can be found at
http://xenbits.xen.org/gitweb?p=osstest.git;a=summary


Pushing revision :

To xenbits.xen.org:/home/xen/git/osstest/ovmf.git
   ee026ea78b..48f0e94921  48f0e94921d83b8204f1025412e071b000394f04 -> xen-tested-master



[libvirt test] 149482: regressions - FAIL

2020-04-07 Thread osstest service owner
flight 149482 libvirt real [real]
http://logs.test-lab.xenproject.org/osstest/logs/149482/

Regressions :-(

Tests which did not succeed and are blocking,
including tests which could not be run:
 build-amd64-libvirt   6 libvirt-buildfail REGR. vs. 146182
 build-i386-libvirt6 libvirt-buildfail REGR. vs. 146182
 build-arm64-libvirt   6 libvirt-buildfail REGR. vs. 146182
 build-armhf-libvirt   6 libvirt-buildfail REGR. vs. 146182

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 1 build-check(1) blocked n/a
 test-amd64-amd64-libvirt  1 build-check(1)   blocked  n/a
 test-amd64-amd64-libvirt-pair  1 build-check(1)   blocked  n/a
 test-armhf-armhf-libvirt-raw  1 build-check(1)   blocked  n/a
 test-amd64-i386-libvirt-pair  1 build-check(1)   blocked  n/a
 test-amd64-i386-libvirt-xsm   1 build-check(1)   blocked  n/a
 test-amd64-amd64-libvirt-xsm  1 build-check(1)   blocked  n/a
 test-armhf-armhf-libvirt  1 build-check(1)   blocked  n/a
 test-amd64-amd64-libvirt-vhd  1 build-check(1)   blocked  n/a
 test-arm64-arm64-libvirt  1 build-check(1)   blocked  n/a
 test-amd64-i386-libvirt   1 build-check(1)   blocked  n/a
 test-amd64-i386-libvirt-qemuu-debianhvm-amd64-xsm 1 build-check(1) blocked n/a
 test-arm64-arm64-libvirt-xsm  1 build-check(1)   blocked  n/a
 test-arm64-arm64-libvirt-qcow2  1 build-check(1)   blocked  n/a

version targeted for testing:
 libvirt  88011ed280c4f946a7b8e7ffcea2335eb075de60
baseline version:
 libvirt  a1cd25b919509be2645dbe6f952d5263e0d4e4e5

Last test of basis   146182  2020-01-17 06:00:23 Z   81 days
Failing since146211  2020-01-18 04:18:52 Z   80 days   77 attempts
Testing same since   149482  2020-04-07 04:18:51 Z0 days1 attempts


People who touched revisions under test:
  Andrea Bolognani 
  Arnaud Patard 
  Boris Fiuczynski 
  Christian Ehrhardt 
  Christian Schoenebeck 
  Collin Walling 
  Daniel Henrique Barboza 
  Daniel P. Berrangé 
  Daniel Veillard 
  Dario Faggioli 
  Erik Skultety 
  Gaurav Agrawal 
  Han Han 
  Jim Fehlig 
  Jiri Denemark 
  Jonathon Jongsma 
  Julio Faracco 
  Ján Tomko 
  Laine Stump 
  Lin Ma 
  Marc-André Lureau 
  Marek Marczykowski-Górecki 
  Mauro S. M. Rodrigues 
  Michal Privoznik 
  Nikolay Shirokovskiy 
  Pavel Hrdina 
  Pavel Mores 
  Peter Krempa 
  Pino Toscano 
  Prathamesh Chavan 
  Rafael Fonseca 
  Richard W.M. Jones 
  Rikard Falkeborn 
  Ryan Moeller 
  Sahid Orentino Ferdjaoui 
  Sebastian Mitterle 
  Seeteena Thoufeek 
  Stefan Berger 
  Stefan Berger 
  Stefan Hajnoczi 
  Thomas Huth 
  Wu Qingliang 
  Your Name 
  Zhang Bo 
  zhenwei pi 
  Zhimin Feng 

jobs:
 build-amd64-xsm  pass
 build-arm64-xsm  pass
 build-i386-xsm   pass
 build-amd64  pass
 build-arm64  pass
 build-armhf  pass
 build-i386   pass
 build-amd64-libvirt  fail
 build-arm64-libvirt  fail
 build-armhf-libvirt  fail
 build-i386-libvirt   fail
 build-amd64-pvopspass
 build-arm64-pvopspass
 build-armhf-pvopspass
 build-i386-pvops pass
 test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm   blocked 
 test-amd64-i386-libvirt-qemuu-debianhvm-amd64-xsmblocked 
 test-amd64-amd64-libvirt-xsm blocked 
 test-arm64-arm64-libvirt-xsm blocked 
 test-amd64-i386-libvirt-xsm  blocked 
 test-amd64-amd64-libvirt blocked 
 test-arm64-arm64-libvirt blocked 
 test-armhf-armhf-libvirt blocked 
 test-amd64-i386-libvirt  blocked 
 test-amd64-amd64-libvirt-pairblocked 
 test-amd64-i386-libvirt-pair blocked 
 test-arm64-arm64-libvirt-qcow2   blocked 
 test-armhf-armhf-libvirt-raw blocked 
 test-amd64-amd64-libvirt-vhd