[PATCH v2] KVM: ppc64: Enable ring-based dirty memory tracking on ppc64: enable config options and implement relevant functions

2023-07-17 Thread Kautuk Consul
- Enable CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL as ppc64 is weakly
  ordered.
- Enable CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP because
  kvmppc_xive_native_set_attr() is called in the context of an ioctl
  syscall and calls kvmppc_xive_native_eq_sync() for setting the
  KVM_DEV_XIVE_EQ_SYNC attribute, which ends up calling mark_page_dirty()
  when there isn't a running vcpu. Implement
  kvm_arch_allow_write_without_running_vcpu() to always return true so
  that mark_page_dirty_in_slot() can mark the page dirty in
  memslot->dirty_bitmap in this case.
- Set KVM_DIRTY_LOG_PAGE_OFFSET for the ring buffer's physical page
  offset (a userspace usage sketch follows this list).
- Implement the kvm_arch_mmu_enable_log_dirty_pt_masked() function that
  the generic KVM code requires.
- Add a check to kvmppc_vcpu_run_hv() for whether the dirty ring is
  soft full.
- Implement the kvm_arch_flush_remote_tlbs_memslot function to support
  the CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT config option.
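
For context, a minimal sketch of how a VMM consumes this interface
(illustrative only, not part of the patch; it uses the standard UAPI names
from linux/kvm.h and omits error handling):

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/kvm.h>

/* Enable the ring on the VM fd; must happen before any vcpu is created. */
static int enable_dirty_ring(int vm_fd, __u64 ring_bytes)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_DIRTY_LOG_RING_ACQ_REL,
		.args[0] = ring_bytes,	/* ring size in bytes, power of two */
	};

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}

/* After a vcpu is created, map its ring of struct kvm_dirty_gfn entries
 * at the page offset this patch defines for ppc64. */
static struct kvm_dirty_gfn *map_dirty_ring(int vcpu_fd, size_t ring_bytes)
{
	return mmap(NULL, ring_bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
		    vcpu_fd, sysconf(_SC_PAGESIZE) * KVM_DIRTY_LOG_PAGE_OFFSET);
}

/* Harvesting reads entries with acquire semantics on the flags field (the
 * _ACQ_REL variant), marks them harvested with release semantics, and then
 * ioctl(vm_fd, KVM_RESET_DIRTY_RINGS, 0) lets KVM reuse the slots. */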

Test Results

Testing with live migration showed an improvement of around 150-180 ms
in overall migration time with this patch.

Bare Metal P9 testing with patch:

(qemu) info migrate
globals:
store-global-state: on
only-migratable: off
send-configuration: on
send-section-footer: on
decompress-error-check: on
clear-bitmap-shift: 18
Migration status: completed
total time: 20694 ms
downtime: 73 ms
setup: 23 ms
transferred ram: 2604370 kbytes
throughput: 1033.55 mbps
remaining ram: 0 kbytes
total ram: 16777216 kbytes
duplicate: 3555398 pages
skipped: 0 pages
normal: 642026 pages
normal bytes: 2568104 kbytes
dirty sync count: 3
page size: 4 kbytes
multifd bytes: 0 kbytes
pages-per-second: 32455
precopy ram: 2581549 kbytes
downtime ram: 22820 kbytes

Bare Metal P9 testing without patch:
---
(qemu) info migrate
globals:
store-global-state: on
only-migratable: off
send-configuration: on
send-section-footer: on
decompress-error-check: on
clear-bitmap-shift: 18
Migration status: completed
total time: 20873 ms
downtime: 62 ms
setup: 19 ms
transferred ram: 2612900 kbytes
throughput: 1027.83 mbps
remaining ram: 0 kbytes
total ram: 16777216 kbytes
duplicate: 3553329 pages
skipped: 0 pages
normal: 644159 pages
normal bytes: 2576636 kbytes
dirty sync count: 4
page size: 4 kbytes
multifd bytes: 0 kbytes
pages-per-second: 88297
precopy ram: 2603645 kbytes
downtime ram: 9254 kbytes

Signed-off-by: Kautuk Consul 
---
 Documentation/virt/kvm/api.rst  |  2 +-
 arch/powerpc/include/uapi/asm/kvm.h |  2 ++
 arch/powerpc/kvm/Kconfig|  2 ++
 arch/powerpc/kvm/book3s.c   | 46 +
 arch/powerpc/kvm/book3s_hv.c|  3 ++
 include/linux/kvm_dirty_ring.h  |  5 
 virt/kvm/dirty_ring.c   |  1 +
 7 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index c0ddd3035462..84c180ccd178 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -8114,7 +8114,7 @@ regardless of what has actually been exposed through the 
CPUID leaf.
 8.29 KVM_CAP_DIRTY_LOG_RING/KVM_CAP_DIRTY_LOG_RING_ACQ_REL
 --
 
-:Architectures: x86, arm64
+:Architectures: x86, arm64, ppc64
 :Parameters: args[0] - size of the dirty log ring
 
 KVM is capable of tracking dirty memory using ring buffers that are
diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
b/arch/powerpc/include/uapi/asm/kvm.h
index 9f18fa090f1f..f722309ed7fb 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -33,6 +33,8 @@
 /* Not always available, but if it is, this is the correct offset.  */
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 
+#define KVM_DIRTY_LOG_PAGE_OFFSET 64
+
 struct kvm_regs {
__u64 pc;
__u64 cr;
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 902611954200..c93354ec3bd5 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -26,6 +26,8 @@ config KVM
select IRQ_BYPASS_MANAGER
select HAVE_KVM_IRQ_BYPASS
select INTERVAL_TREE
+   select HAVE_KVM_DIRTY_RING_ACQ_REL
+   select NEED_KVM_DIRTY_RING_WITH_BITMAP
 
 config KVM_BOOK3S_HANDLER
bool
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 686d8d9eda3e..01aa4fe2c424 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -32,6 +32,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "book3s.h"
 #include "trace.h"
@@ -1070,6 +1071,51 @@ int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned 
irqchip, unsigned pin)
 
 #endif /* CONFIG_KVM_XICS */
 
+/*
+ * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
+ * dirty pages.
+ *
+ * It write protects selected pages to enable dirty logging for them.
+ */
+void kvm_arch_mmu_enable_log_dirty_pt

Re: [PATCH] KVM: ppc64: Enable ring-based dirty memory tracking

2023-07-16 Thread Kautuk Consul
Hi Jordan,

On 2023-07-06 14:15:13, Jordan Niethe wrote:
> 
> 
> On 8/6/23 10:34 pm, Kautuk Consul wrote:
> 
> Need at least a little context in the commit message itself:
> 
> "Enable ring-based dirty memory tracking on ppc64:"
Sure will take this in the v2 patch.
> 
> > - Enable CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL as ppc64 is weakly
> >ordered.
> > - Enable CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP because the
> >kvmppc_xive_native_set_attr is called in the context of an ioctl
> >syscall and will call kvmppc_xive_native_eq_sync for setting the
> >KVM_DEV_XIVE_EQ_SYNC attribute which will call mark_dirty_page()
> >when there isn't a running vcpu. Implemented the
> >kvm_arch_allow_write_without_running_vcpu to always return true
> >to allow mark_page_dirty_in_slot to mark the page dirty in the
> >memslot->dirty_bitmap in this case.
> 
> Should kvm_arch_allow_write_without_running_vcpu() only return true in the
> context of kvmppc_xive_native_eq_sync()?
Not required. kvm_arch_allow_write_without_running_vcpu() is only used
to avoid the WARN_ON_ONCE in mark_page_dirty_in_slot(). The
memslot->dirty_bitmap in mark_page_dirty_in_slot() is only used when
KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP is set and the vcpu returned by
kvm_get_running_vcpu() is NULL, which happens only when
kvmppc_xive_native_eq_sync() is called via the ioctl syscall I
mentioned.
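
To make that concrete, the generic code's decision looks roughly like this
(paraphrased from virt/kvm/kvm_main.c; simplified, details can differ
between kernel versions):

void mark_page_dirty_in_slot(struct kvm *kvm,
			     const struct kvm_memory_slot *memslot, gfn_t gfn)
{
	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();

	/*
	 * The arch hook only decides whether a vcpu-less writer (here the
	 * XIVE eq_sync ioctl path) trips the WARN_ON_ONCE or is let through.
	 */
	if (WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm)))
		return;

	if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
		unsigned long rel_gfn = gfn - memslot->base_gfn;

		if (kvm->dirty_ring_size && vcpu) {
			/* Normal case: push (slot, rel_gfn) onto the vcpu's ring. */
		} else if (memslot->dirty_bitmap) {
			/* No running vcpu: fall back to the per-slot bitmap. */
			set_bit_le(rel_gfn, memslot->dirty_bitmap);
		}
	}
}

So returning true unconditionally only affects the vcpu-less case, which on
ppc64 is reached from the ioctl path above.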

> > +   *ptep = __pte(pte_val(*ptep) & ~(_PAGE_WRITE));
> On rpt I think you'd need to use kvmppc_radix_update_pte()?
Sure. I'll add a check for radix_enabled() and call
kvmppc_radix_update_pte() or a similar function in the v2 patch for this.
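
A rough sketch of what the v2 loop could look like, inside the same loop as
in the patch above (illustrative only: kvm_is_radix() stands in for the
radix_enabled() check mentioned above, and kvmppc_radix_update_pte() is
assumed to have the signature it has in book3s_64_mmu_radix.c, where it is
currently static, so the real patch would need to expose it or use an
equivalent helper; the HPT case is not shown):

	while (start < end) {
		pte_t *ptep;
		unsigned int shift;

		ptep = find_kvm_secondary_pte(kvm, start, &shift);
		if (ptep && pte_present(*ptep) && pte_write(*ptep)) {
			if (kvm_is_radix(kvm))
				/* Atomically clear _PAGE_WRITE instead of a raw store. */
				kvmppc_radix_update_pte(kvm, ptep, _PAGE_WRITE, 0,
							start, shift);
		}
		start += PAGE_SIZE;
	}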


Re: [PATCH] KVM: ppc64: Enable ring-based dirty memory tracking

2023-07-02 Thread Kautuk Consul
Hi Everyone,

On 2023-06-08 08:34:48, Kautuk Consul wrote:
> - Enable CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL as ppc64 is weakly
>   ordered.
> - Enable CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP because the
>   kvmppc_xive_native_set_attr is called in the context of an ioctl
>   syscall and will call kvmppc_xive_native_eq_sync for setting the
>   KVM_DEV_XIVE_EQ_SYNC attribute which will call mark_dirty_page()
>   when there isn't a running vcpu. Implemented the
>   kvm_arch_allow_write_without_running_vcpu to always return true
>   to allow mark_page_dirty_in_slot to mark the page dirty in the
>   memslot->dirty_bitmap in this case.
> - Set KVM_DIRTY_LOG_PAGE_OFFSET for the ring buffer's physical page
>   offset.
> - Implement the kvm_arch_mmu_enable_log_dirty_pt_masked function required
>   for the generic KVM code to call.
> - Add a check to kvmppc_vcpu_run_hv for checking whether the dirty
>   ring is soft full.
> - Implement the kvm_arch_flush_remote_tlbs_memslot function to support
>   the CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT config option.
> 
> On testing with live migration it was found that there is around
> 150-180 ms improvment in overall migration time with this patch.
> 
> Signed-off-by: Kautuk Consul 
Can someone review this ?


Re: [PATCH] KVM: ppc64: Enable ring-based dirty memory tracking

2023-06-18 Thread Kautuk Consul
Hi Nick/Gavin/Everyone,

On 2023-06-08 08:34:48, Kautuk Consul wrote:
> - Enable CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL as ppc64 is weakly
>   ordered.
> - Enable CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP because the
>   kvmppc_xive_native_set_attr is called in the context of an ioctl
>   syscall and will call kvmppc_xive_native_eq_sync for setting the
>   KVM_DEV_XIVE_EQ_SYNC attribute which will call mark_dirty_page()
>   when there isn't a running vcpu. Implemented the
>   kvm_arch_allow_write_without_running_vcpu to always return true
>   to allow mark_page_dirty_in_slot to mark the page dirty in the
>   memslot->dirty_bitmap in this case.
> - Set KVM_DIRTY_LOG_PAGE_OFFSET for the ring buffer's physical page
>   offset.
> - Implement the kvm_arch_mmu_enable_log_dirty_pt_masked function required
>   for the generic KVM code to call.
> - Add a check to kvmppc_vcpu_run_hv for checking whether the dirty
>   ring is soft full.
> - Implement the kvm_arch_flush_remote_tlbs_memslot function to support
>   the CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT config option.
> 
> On testing with live migration it was found that there is around
> 150-180 ms improvment in overall migration time with this patch.
> 
> Signed-off-by: Kautuk Consul 
> ---
>  Documentation/virt/kvm/api.rst  |  2 +-
>  arch/powerpc/include/uapi/asm/kvm.h |  2 ++
>  arch/powerpc/kvm/Kconfig|  2 ++
>  arch/powerpc/kvm/book3s_64_mmu_hv.c | 42 +
>  arch/powerpc/kvm/book3s_hv.c|  3 +++
>  include/linux/kvm_dirty_ring.h  |  5 
>  6 files changed, 55 insertions(+), 1 deletion(-)
> 
Any review comments on this ?
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index add067793b90..ce1ebc513bae 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -8114,7 +8114,7 @@ regardless of what has actually been exposed through 
> the CPUID leaf.
>  8.29 KVM_CAP_DIRTY_LOG_RING/KVM_CAP_DIRTY_LOG_RING_ACQ_REL
>  --
> 
> -:Architectures: x86, arm64
> +:Architectures: x86, arm64, ppc64
>  :Parameters: args[0] - size of the dirty log ring
> 
>  KVM is capable of tracking dirty memory using ring buffers that are
> diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
> b/arch/powerpc/include/uapi/asm/kvm.h
> index 9f18fa090f1f..f722309ed7fb 100644
> --- a/arch/powerpc/include/uapi/asm/kvm.h
> +++ b/arch/powerpc/include/uapi/asm/kvm.h
> @@ -33,6 +33,8 @@
>  /* Not always available, but if it is, this is the correct offset.  */
>  #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
> 
> +#define KVM_DIRTY_LOG_PAGE_OFFSET 64
> +
>  struct kvm_regs {
>   __u64 pc;
>   __u64 cr;
> diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
> index 902611954200..c93354ec3bd5 100644
> --- a/arch/powerpc/kvm/Kconfig
> +++ b/arch/powerpc/kvm/Kconfig
> @@ -26,6 +26,8 @@ config KVM
>   select IRQ_BYPASS_MANAGER
>   select HAVE_KVM_IRQ_BYPASS
>   select INTERVAL_TREE
> + select HAVE_KVM_DIRTY_RING_ACQ_REL
> + select NEED_KVM_DIRTY_RING_WITH_BITMAP
> 
>  config KVM_BOOK3S_HANDLER
>   bool
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c 
> b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> index 7f765d5ad436..c92e8022e017 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> @@ -2147,3 +2147,45 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
> 
>   vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
>  }
> +
> +/*
> + * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for 
> selected
> + * dirty pages.
> + *
> + * It write protects selected pages to enable dirty logging for them.
> + */
> +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
> +  struct kvm_memory_slot *slot,
> +  gfn_t gfn_offset,
> +  unsigned long mask)
> +{
> + phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
> + phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
> + phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
> +
> + while (start < end) {
> + pte_t *ptep;
> + unsigned int shift;
> +
> > + ptep = find_kvm_secondary_pte(kvm, start, &shift);
> +
> + *ptep = __pte(pte_val(*ptep) & ~(_PAGE_WRITE));
> +
> + start += PAGE_SIZE;
> + }
> +}
> +
> +#ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
> +bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm)
> +{
> + 

[PATCH] KVM: ppc64: Enable ring-based dirty memory tracking

2023-06-08 Thread Kautuk Consul
- Enable CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL as ppc64 is weakly
  ordered.
- Enable CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP because the
  kvmppc_xive_native_set_attr is called in the context of an ioctl
  syscall and will call kvmppc_xive_native_eq_sync for setting the
  KVM_DEV_XIVE_EQ_SYNC attribute which will call mark_dirty_page()
  when there isn't a running vcpu. Implemented the
  kvm_arch_allow_write_without_running_vcpu to always return true
  to allow mark_page_dirty_in_slot to mark the page dirty in the
  memslot->dirty_bitmap in this case.
- Set KVM_DIRTY_LOG_PAGE_OFFSET for the ring buffer's physical page
  offset.
- Implement the kvm_arch_mmu_enable_log_dirty_pt_masked function required
  for the generic KVM code to call.
- Add a check to kvmppc_vcpu_run_hv for checking whether the dirty
  ring is soft full.
- Implement the kvm_arch_flush_remote_tlbs_memslot function to support
  the CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT config option.

On testing with live migration it was found that there is around
150-180 ms improvement in overall migration time with this patch.

Signed-off-by: Kautuk Consul 
---
 Documentation/virt/kvm/api.rst  |  2 +-
 arch/powerpc/include/uapi/asm/kvm.h |  2 ++
 arch/powerpc/kvm/Kconfig|  2 ++
 arch/powerpc/kvm/book3s_64_mmu_hv.c | 42 +
 arch/powerpc/kvm/book3s_hv.c|  3 +++
 include/linux/kvm_dirty_ring.h  |  5 
 6 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index add067793b90..ce1ebc513bae 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -8114,7 +8114,7 @@ regardless of what has actually been exposed through the 
CPUID leaf.
 8.29 KVM_CAP_DIRTY_LOG_RING/KVM_CAP_DIRTY_LOG_RING_ACQ_REL
 --
 
-:Architectures: x86, arm64
+:Architectures: x86, arm64, ppc64
 :Parameters: args[0] - size of the dirty log ring
 
 KVM is capable of tracking dirty memory using ring buffers that are
diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
b/arch/powerpc/include/uapi/asm/kvm.h
index 9f18fa090f1f..f722309ed7fb 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -33,6 +33,8 @@
 /* Not always available, but if it is, this is the correct offset.  */
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 
+#define KVM_DIRTY_LOG_PAGE_OFFSET 64
+
 struct kvm_regs {
__u64 pc;
__u64 cr;
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 902611954200..c93354ec3bd5 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -26,6 +26,8 @@ config KVM
select IRQ_BYPASS_MANAGER
select HAVE_KVM_IRQ_BYPASS
select INTERVAL_TREE
+   select HAVE_KVM_DIRTY_RING_ACQ_REL
+   select NEED_KVM_DIRTY_RING_WITH_BITMAP
 
 config KVM_BOOK3S_HANDLER
bool
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c 
b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 7f765d5ad436..c92e8022e017 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -2147,3 +2147,45 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
 
vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
 }
+
+/*
+ * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
+ * dirty pages.
+ *
+ * It write protects selected pages to enable dirty logging for them.
+ */
+void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
+struct kvm_memory_slot *slot,
+gfn_t gfn_offset,
+unsigned long mask)
+{
+   phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
+   phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
+   phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
+
+   while (start < end) {
+   pte_t *ptep;
+   unsigned int shift;
+
+   ptep = find_kvm_secondary_pte(kvm, start, &shift);
+
+   *ptep = __pte(pte_val(*ptep) & ~(_PAGE_WRITE));
+
+   start += PAGE_SIZE;
+   }
+}
+
+#ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
+bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm)
+{
+   return true;
+}
+#endif
+
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
+   const struct kvm_memory_slot *memslot)
+{
+   kvm_flush_remote_tlbs(kvm);
+}
+#endif
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 130bafdb1430..1d1264ea72c4 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4804,6 +4804,9 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
return -EINTR;
}
 
+   if (kvm_dirty_ring_check_request(vcpu))
+  

Re: [PATCH] KVM: PPC: BOOK3S: book3s_hv_nested.c: improve branch prediction for k.alloc

2023-04-19 Thread Kautuk Consul
On 2023-04-12 12:34:13, Kautuk Consul wrote:
> Hi,
> 
> On 2023-04-11 16:35:10, Michael Ellerman wrote:
> > Kautuk Consul  writes:
> > > On 2023-04-07 09:01:29, Sean Christopherson wrote:
> > >> On Fri, Apr 07, 2023, Bagas Sanjaya wrote:
> > >> > On Fri, Apr 07, 2023 at 05:31:47AM -0400, Kautuk Consul wrote:
> > >> > > I used the unlikely() macro on the return values of the k.alloc
> > >> > > calls and found that it changes the code generation a bit.
> > >> > > Optimize all return paths of k.alloc calls by improving
> > >> > > branch prediction on return value of k.alloc.
> > >> 
> > >> Nit, this is improving code generation, not branch prediction.
> > > Sorry my mistake.
> > >> 
> > >> > What about below?
> > >> > 
> > >> > "Improve branch prediction on kmalloc() and kzalloc() call by using
> > >> > unlikely() macro to optimize their return paths."
> > >> 
> > >> Another nit, using unlikely() doesn't necessarily provide a measurable 
> > >> optimization.
> > >> As above, it does often improve code generation for the happy path, but 
> > >> that doesn't
> > >> always equate to improved performance, e.g. if the CPU can easily 
> > >> predict the branch
> > >> and/or there is no impact on the cache footprint.
> > 
> > > I see. I will submit a v2 of the patch with a better and more accurate
> > > description. Does anyone else have any comments before I do so ?
> >  
> > In general I think unlikely should be saved for cases where either the
> > compiler is generating terrible code, or the likelyness of the condition
> > might be surprising to a human reader.
> > 
> > eg. if you had some code that does a NULL check and it's *expected* that
> > the value is NULL, then wrapping that check in likely() actually adds
> > information for a human reader.
> > 
> > Also please don't use unlikely in init paths or other cold paths, it
> > clutters the code (only slightly but a little) and that's not worth the
> > possible tiny benefit for code that only runs once or infrequently.
> > 
> > I would expect the compilers to do the right thing in all
> > these cases without the unlikely. But if you can demonstrate that they
> > meaningfully improve the code generation with a before/after
> > dissassembly then I'd be interested.
> Just FYI, the last email by kautuk.consul...@gmail.com was by me.
> That last email contains a diff file attachment which compares 2 files:
> before my changes and after my changes.
> This diff file shows a lot of changes in code generation. Im assuming
> all those changes are made by the compiler towards optimizing all return
> paths to k.alloc calls.
> Kindly review and comment.
Any comments on the numerous code generation changes as shown by the
files I attached to this mail chain ? Sorry I don't have concrete
figures of any type to prove that this leads to any measurable performance
improvements. I am just assuming that the compiler's modified code
generation (due to the use of the unlikely macro) would be optimal.

Thanks.
> > cheers


Re: [PATCH] KVM: PPC: BOOK3S: book3s_hv_nested.c: improve branch prediction for k.alloc

2023-04-12 Thread Kautuk Consul
Hi,

On 2023-04-11 16:35:10, Michael Ellerman wrote:
> Kautuk Consul  writes:
> > On 2023-04-07 09:01:29, Sean Christopherson wrote:
> >> On Fri, Apr 07, 2023, Bagas Sanjaya wrote:
> >> > On Fri, Apr 07, 2023 at 05:31:47AM -0400, Kautuk Consul wrote:
> >> > > I used the unlikely() macro on the return values of the k.alloc
> >> > > calls and found that it changes the code generation a bit.
> >> > > Optimize all return paths of k.alloc calls by improving
> >> > > branch prediction on return value of k.alloc.
> >> 
> >> Nit, this is improving code generation, not branch prediction.
> > Sorry my mistake.
> >> 
> >> > What about below?
> >> > 
> >> > "Improve branch prediction on kmalloc() and kzalloc() call by using
> >> > unlikely() macro to optimize their return paths."
> >> 
> >> Another nit, using unlikely() doesn't necessarily provide a measurable 
> >> optimization.
> >> As above, it does often improve code generation for the happy path, but 
> >> that doesn't
> >> always equate to improved performance, e.g. if the CPU can easily predict 
> >> the branch
> >> and/or there is no impact on the cache footprint.
> 
> > I see. I will submit a v2 of the patch with a better and more accurate
> > description. Does anyone else have any comments before I do so ?
>  
> In general I think unlikely should be saved for cases where either the
> compiler is generating terrible code, or the likelyness of the condition
> might be surprising to a human reader.
> 
> eg. if you had some code that does a NULL check and it's *expected* that
> the value is NULL, then wrapping that check in likely() actually adds
> information for a human reader.
> 
> Also please don't use unlikely in init paths or other cold paths, it
> clutters the code (only slightly but a little) and that's not worth the
> possible tiny benefit for code that only runs once or infrequently.
> 
> I would expect the compilers to do the right thing in all
> these cases without the unlikely. But if you can demonstrate that they
> meaningfully improve the code generation with a before/after
> dissassembly then I'd be interested.
Just FYI, the last email by kautuk.consul...@gmail.com was by me.
That last email contains a diff file attachment which compares 2 files:
before my changes and after my changes.
This diff file shows a lot of changes in code generation. I'm assuming
all those changes are made by the compiler to optimize all return
paths of the k.alloc calls.
Kindly review and comment.
> cheers


Re: [PATCH] KVM: PPC: BOOK3S: book3s_hv_nested.c: improve branch prediction for k.alloc

2023-04-10 Thread Kautuk Consul
On 2023-04-07 09:01:29, Sean Christopherson wrote:
> On Fri, Apr 07, 2023, Bagas Sanjaya wrote:
> > On Fri, Apr 07, 2023 at 05:31:47AM -0400, Kautuk Consul wrote:
> > > I used the unlikely() macro on the return values of the k.alloc
> > > calls and found that it changes the code generation a bit.
> > > Optimize all return paths of k.alloc calls by improving
> > > branch prediction on return value of k.alloc.
> 
> Nit, this is improving code generation, not branch prediction.
Sorry my mistake.
> 
> > What about below?
> > 
> > "Improve branch prediction on kmalloc() and kzalloc() call by using
> > unlikely() macro to optimize their return paths."
> 
> Another nit, using unlikely() doesn't necessarily provide a measurable 
> optimization.
> As above, it does often improve code generation for the happy path, but that 
> doesn't
> always equate to improved performance, e.g. if the CPU can easily predict the 
> branch
> and/or there is no impact on the cache footprint.
I see. I will submit a v2 of the patch with a better and more accurate
description. Does anyone else have any comments before I do so ?
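
For reference, the annotation itself is just the kernel's wrapper around
GCC's __builtin_expect() (include/linux/compiler.h, branch-profiling
variants omitted):

# define likely(x)	__builtin_expect(!!(x), 1)
# define unlikely(x)	__builtin_expect(!!(x), 0)

It only tells the compiler which side of the branch to lay out as the
fall-through path; whether that is measurable is exactly the question
raised above.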


[PATCH] KVM: PPC: BOOK3S: book3s_hv_nested.c: improve branch prediction for k.alloc

2023-04-07 Thread Kautuk Consul
I used the unlikely() macro on the return values of the k.alloc
calls and found that it changes the code generation a bit.
Optimize all return paths of k.alloc calls by improving
branch prediction on return value of k.alloc.

Signed-off-by: Kautuk Consul 
---
 arch/powerpc/kvm/book3s_hv_nested.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_nested.c 
b/arch/powerpc/kvm/book3s_hv_nested.c
index 5a64a1341e6f..dbf2dd073e1f 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -446,7 +446,7 @@ long kvmhv_nested_init(void)
ptb_order = 12;
pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order,
   GFP_KERNEL);
-   if (!pseries_partition_tb) {
+   if (unlikely(!pseries_partition_tb)) {
pr_err("kvm-hv: failed to allocated nested partition table\n");
return -ENOMEM;
}
@@ -575,7 +575,7 @@ long kvmhv_copy_tofrom_guest_nested(struct kvm_vcpu *vcpu)
return H_PARAMETER;
 
buf = kzalloc(n, GFP_KERNEL | __GFP_NOWARN);
-   if (!buf)
+   if (unlikely(!buf))
return H_NO_MEM;
 
gp = kvmhv_get_nested(vcpu->kvm, l1_lpid, false);
@@ -689,7 +689,7 @@ static struct kvm_nested_guest *kvmhv_alloc_nested(struct 
kvm *kvm, unsigned int
long shadow_lpid;
 
gp = kzalloc(sizeof(*gp), GFP_KERNEL);
-   if (!gp)
+   if (unlikely(!gp))
return NULL;
gp->l1_host = kvm;
gp->l1_lpid = lpid;
@@ -1633,7 +1633,7 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu 
*vcpu,
/* 4. Insert the pte into our shadow_pgtable */
 
n_rmap = kzalloc(sizeof(*n_rmap), GFP_KERNEL);
-   if (!n_rmap)
+   if (unlikely(!n_rmap))
return RESUME_GUEST; /* Let the guest try again */
n_rmap->rmap = (n_gpa & RMAP_NESTED_GPA_MASK) |
(((unsigned long) gp->l1_lpid) << RMAP_NESTED_LPID_SHIFT);
-- 
2.39.2



Re: [PATCH] arch/powerpc/kvm: kvmppc_core_vcpu_create_hv: check for kzalloc failure

2023-03-29 Thread Kautuk Consul
On 2023-03-30 10:59:19, Michael Ellerman wrote:
> Kautuk Consul  writes:
> > On 2023-03-28 23:02:09, Michael Ellerman wrote:
> >> Kautuk Consul  writes:
> >> > On 2023-03-28 15:44:02, Kautuk Consul wrote:
> >> >> On 2023-03-28 20:44:48, Michael Ellerman wrote:
> >> >> > Kautuk Consul  writes:
> >> >> > > kvmppc_vcore_create() might not be able to allocate memory through
> >> >> > > kzalloc. In that case the kvm->arch.online_vcores shouldn't be
> >> >> > > incremented.
> >> >> > 
> >> >> > I agree that looks wrong.
> >> >> > 
> >> >> > Have you tried to test what goes wrong if it fails? It looks like it
> >> >> > will break the LPCR update, which likely will cause the guest to crash
> >> >> > horribly.
> >> > Also, are you referring to the code in kvmppc_update_lpcr()?
> >> > That code will not crash as it checks for the vc before trying to
> >> > dereference it.
> >> 
> >> Yeah that's what I was looking at. I didn't mean it would crash, but
> >> that it would bail out early when it sees a NULL vcore, leaving other
> >> vcores with the wrong LPCR value.
> >> 
> >> But as you say it doesn't happen because qemu quits on the first ENOMEM.
> >> 
> >> And regardless if qemu does something that means the guest is broken
> >> that's just a qemu bug, no big deal as far as the kernel is concerned.
> 
> > But there could be another user-mode application other than qemu that
> > actually tries to create a vcpu after it gets a -ENOMEM for another
> > vcpu. Shouldn't the kernel be independent of qemu?
> 
> Yes, the kernel is independent of qemu.
> 
> On P8 we had kvmtool, and there's several other VMMs these days, though
> most don't support Power.
> 
> I didn't mean qemu specifically above. If any VMM continues blindly
> after getting ENOMEM back from the KVM API then that's a bug in that
> VMM.
> 
> >> > But the following 2 places that utilize the arch.online_vcores will have
> >> > problems in logic if the usermode test-case doesn't pull down the
> >> > kvm context after the -ENOMEM vcpu allocation failure:
> >> > book3s_hv.c:3030:   if (!kvm->arch.online_vcores) {
> >> > book3s_hv_rm_mmu.c:44:  if (kvm->arch.online_vcores == 1 && 
> >> > local_paca->kvm_hstate.kvm_vcpu)
> >> 
> >> OK. Both of those look harmless to the host.
> 
> > Harmless to the host in terms of a crash, not in terms of behavior.
> > For example in the case of kvmhv_set_smt_mode:
> > If we got a kzalloc failure once (and online_vcores was wrongly 
> > incremented), 
> > then if kvmhv_set_smt_mode() is called after that then it would be
> > not be setting the arch.smt_mode and arch.emul_smt_mode correctly and it
> > would be wrongly returning with -EBUSY instead of 0.
> 
> But again that bug only affects that VM, which the VMM should have
> terminated when it got the ENOMEM back.
> 
> It's definitely a bug that we increment online_vcores incorrectly, but
> it only affects that VM, and a correctly operating VMM will terminate
> the VM anyway because of the ENOMEM.
Okay, I understand. I have tried to contribute to other mailing lists
before, and they were very particular about correcting code that was
doing something wrong (just by code review), irrespective of whether it
would actually result in a bug/crash or misbehaviour. I guess maintainers
look at the generic part of the kernel differently than arch- or
device-specific kernel/driver code.
> 
> cheers


Re: [PATCH] arch/powerpc/kvm: kvmppc_core_vcpu_create_hv: check for kzalloc failure

2023-03-28 Thread Kautuk Consul
On 2023-03-28 23:02:09, Michael Ellerman wrote:
> Kautuk Consul  writes:
> > On 2023-03-28 15:44:02, Kautuk Consul wrote:
> >> On 2023-03-28 20:44:48, Michael Ellerman wrote:
> >> > Kautuk Consul  writes:
> >> > > kvmppc_vcore_create() might not be able to allocate memory through
> >> > > kzalloc. In that case the kvm->arch.online_vcores shouldn't be
> >> > > incremented.
> >> > 
> >> > I agree that looks wrong.
> >> > 
> >> > Have you tried to test what goes wrong if it fails? It looks like it
> >> > will break the LPCR update, which likely will cause the guest to crash
> >> > horribly.
> > Also, are you referring to the code in kvmppc_update_lpcr()?
> > That code will not crash as it checks for the vc before trying to
> > dereference it.
> 
> Yeah that's what I was looking at. I didn't mean it would crash, but
> that it would bail out early when it sees a NULL vcore, leaving other
> vcores with the wrong LPCR value.
> 
> But as you say it doesn't happen because qemu quits on the first ENOMEM.
> 
> And regardless if qemu does something that means the guest is broken
> that's just a qemu bug, no big deal as far as the kernel is concerned.
But there could be another user-mode application other than qemu that
actually tries to create a vcpu after it gets a -ENOMEM for another
vcpu. Shouldn't the kernel be independent of qemu?
> 
> > But the following 2 places that utilize the arch.online_vcores will have
> > problems in logic if the usermode test-case doesn't pull down the
> > kvm context after the -ENOMEM vcpu allocation failure:
> > book3s_hv.c:3030:   if (!kvm->arch.online_vcores) {
> > book3s_hv_rm_mmu.c:44:  if (kvm->arch.online_vcores == 1 && 
> > local_paca->kvm_hstate.kvm_vcpu)
> 
> OK. Both of those look harmless to the host.
Harmless to the host in terms of a crash, not in terms of behavior.
For example, in the case of kvmhv_set_smt_mode(): if we got a kzalloc
failure once (and online_vcores was wrongly incremented), then any
later call to kvmhv_set_smt_mode() would not set arch.smt_mode and
arch.emul_smt_mode correctly and would wrongly return -EBUSY instead
of 0 (see the sketch below).
Isn't that incorrect with respect to the intent of the code?
I agree that applications like qemu might not do that, but don't we
need some integrity with respect to the intent and use of these
variables? What about good code and logic quality?
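
For reference, the check being described has roughly this shape
(paraphrased from the book3s_hv.c grep earlier in this thread; simplified,
locking omitted, not verbatim kernel code; smt_mode/esmt are the
function's parameters):

	err = -EBUSY;
	if (!kvm->arch.online_vcores) {
		/* A leaked increment from a failed kvmppc_vcore_create()
		 * keeps us stuck on -EBUSY here even with no real vcores. */
		kvm->arch.smt_mode = smt_mode;
		kvm->arch.emul_smt_mode = esmt;
		err = 0;
	}
	return err;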
> 
> If we find a case where a misbehaving qemu can crash the host then we
> need to be a bit more careful and treat it at least as a
> denial-of-service bug. But looks like this is not one of those.
> 
> cheers
beers


Re: [PATCH] arch/powerpc/kvm: kvmppc_core_vcpu_create_hv: check for kzalloc failure

2023-03-28 Thread Kautuk Consul
On 2023-03-28 15:44:02, Kautuk Consul wrote:
> On 2023-03-28 20:44:48, Michael Ellerman wrote:
> > Kautuk Consul  writes:
> > > kvmppc_vcore_create() might not be able to allocate memory through
> > > kzalloc. In that case the kvm->arch.online_vcores shouldn't be
> > > incremented.
> > 
> > I agree that looks wrong.
> > 
> > Have you tried to test what goes wrong if it fails? It looks like it
> > will break the LPCR update, which likely will cause the guest to crash
> > horribly.
Also, are you referring to the code in kvmppc_update_lpcr()?
That code will not crash as it checks for the vc before trying to
dereference it.
But the following 2 places that utilize the arch.online_vcores will have
problems in logic if the usermode test-case doesn't pull down the
kvm context after the -ENOMEM vcpu allocation failure:
book3s_hv.c:3030:   if (!kvm->arch.online_vcores) {
book3s_hv_rm_mmu.c:44:  if (kvm->arch.online_vcores == 1 && 
local_paca->kvm_hstate.kvm_vcpu)

> Not sure about LPCR update, but with and without the patch qemu exits
> and so the kvm context is pulled down fine.
> > 
> > You could use CONFIG_FAIL_SLAB and fail-nth etc. to fail just one
> > allocation for a guest. Or probably easier to just hack the code to fail
> > the 4th time it's called using a static counter.
> I am using live debug and I set the r3 return value to 0x0 after the
> call to kzalloc.
> > 
> > Doesn't really matter but could be interesting.
> With and without this patch qemu quits with:
> qemu-system-ppc64: kvm_init_vcpu: kvm_get_vcpu failed (0): Cannot allocate 
> memory
> 
> That's because qemu will shut down when any vcpu is not able
> to be allocated.
> > 
> > > Add a check for kzalloc failure and return with -ENOMEM from
> > > kvmppc_core_vcpu_create_hv().
> > >
> > > Signed-off-by: Kautuk Consul 
> > > ---
> > >  arch/powerpc/kvm/book3s_hv.c | 10 +++---
> > >  1 file changed, 7 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> > > index 6ba68dd6190b..e29ee755c920 100644
> > > --- a/arch/powerpc/kvm/book3s_hv.c
> > > +++ b/arch/powerpc/kvm/book3s_hv.c
> > > @@ -2968,13 +2968,17 @@ static int kvmppc_core_vcpu_create_hv(struct 
> > > kvm_vcpu *vcpu)
> > >   pr_devel("KVM: collision on id %u", id);
> > >   vcore = NULL;
> > >   } else if (!vcore) {
> > > + vcore = kvmppc_vcore_create(kvm,
> > > + id & ~(kvm->arch.smt_mode - 1));
> > 
> > That line doesn't need to be wrapped, we allow 90 columns.
> > 
> > > + if (unlikely(!vcore)) {
> > > + mutex_unlock(&kvm->lock);
> > > + return -ENOMEM;
> > > + }
> > 
> > Rather than introducing a new return point here, I think it would be
> > preferable to use the existing !vcore case below.
> > 
> > >   /*
> > >* Take mmu_setup_lock for mutual exclusion
> > >* with kvmppc_update_lpcr().
> > >*/
> > > - err = -ENOMEM;
> > > - vcore = kvmppc_vcore_create(kvm,
> > > - id & ~(kvm->arch.smt_mode - 1));
> > 
> > So leave that as is (maybe move the comment down).
> > 
> > And wrap the below in:
> > 
> >  +  if (vcore) {
> > 
> > >   mutex_lock(&kvm->arch.mmu_setup_lock);
> > >   kvm->arch.vcores[core] = vcore;
> > >   kvm->arch.online_vcores++;
> > 
> > mutex_unlock(&kvm->arch.mmu_setup_lock);
> >  +  }
> > }
> > }
> > 
> > Meaning the vcore == NULL case will fall through to here and return via
> > this existing path:
> > 
> > mutex_unlock(&kvm->lock);
> > 
> > if (!vcore)
> > return err;
> > 
> > 
> > cheers


Re: [PATCH] arch/powerpc/kvm: kvmppc_core_vcpu_create_hv: check for kzalloc failure

2023-03-28 Thread Kautuk Consul
On 2023-03-28 20:44:48, Michael Ellerman wrote:
> Kautuk Consul  writes:
> > kvmppc_vcore_create() might not be able to allocate memory through
> > kzalloc. In that case the kvm->arch.online_vcores shouldn't be
> > incremented.
> 
> I agree that looks wrong.
> 
> Have you tried to test what goes wrong if it fails? It looks like it
> will break the LPCR update, which likely will cause the guest to crash
> horribly.
Not sure about LPCR update, but with and without the patch qemu exits
and so the kvm context is pulled down fine.
> 
> You could use CONFIG_FAIL_SLAB and fail-nth etc. to fail just one
> allocation for a guest. Or probably easier to just hack the code to fail
> the 4th time it's called using a static counter.
I am using live debug and I set the r3 return value to 0x0 after the
call to kzalloc.
> 
> Doesn't really matter but could be interesting.
With and without this patch qemu quits with:
qemu-system-ppc64: kvm_init_vcpu: kvm_get_vcpu failed (0): Cannot allocate 
memory

That's because qemu will shut down when any vcpu is not able
to be allocated.
> 
> > Add a check for kzalloc failure and return with -ENOMEM from
> > kvmppc_core_vcpu_create_hv().
> >
> > Signed-off-by: Kautuk Consul 
> > ---
> >  arch/powerpc/kvm/book3s_hv.c | 10 +++---
> >  1 file changed, 7 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> > index 6ba68dd6190b..e29ee755c920 100644
> > --- a/arch/powerpc/kvm/book3s_hv.c
> > +++ b/arch/powerpc/kvm/book3s_hv.c
> > @@ -2968,13 +2968,17 @@ static int kvmppc_core_vcpu_create_hv(struct 
> > kvm_vcpu *vcpu)
> > pr_devel("KVM: collision on id %u", id);
> > vcore = NULL;
> > } else if (!vcore) {
> > +   vcore = kvmppc_vcore_create(kvm,
> > +   id & ~(kvm->arch.smt_mode - 1));
> 
> That line doesn't need to be wrapped, we allow 90 columns.
> 
> > +   if (unlikely(!vcore)) {
> > +   mutex_unlock(&kvm->lock);
> > +   return -ENOMEM;
> > +   }
> 
> Rather than introducing a new return point here, I think it would be
> preferable to use the existing !vcore case below.
> 
> > /*
> >  * Take mmu_setup_lock for mutual exclusion
> >  * with kvmppc_update_lpcr().
> >  */
> > -   err = -ENOMEM;
> > -   vcore = kvmppc_vcore_create(kvm,
> > -   id & ~(kvm->arch.smt_mode - 1));
> 
> So leave that as is (maybe move the comment down).
> 
> And wrap the below in:
> 
>  +  if (vcore) {
> 
> > mutex_lock(&kvm->arch.mmu_setup_lock);
> > kvm->arch.vcores[core] = vcore;
> > kvm->arch.online_vcores++;
>   
>   mutex_unlock(&kvm->arch.mmu_setup_lock);
>  +  }
>   }
>   }
> 
> Meaning the vcore == NULL case will fall through to here and return via
> this existing path:
> 
>   mutex_unlock(&kvm->lock);
> 
>   if (!vcore)
>   return err;
> 
> 
> cheers
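
Putting Michael's suggestion above together, the resulting shape would be
roughly (assembled from the hunks quoted above; illustrative, untested):

	} else if (!vcore) {
		/*
		 * Take mmu_setup_lock for mutual exclusion
		 * with kvmppc_update_lpcr().
		 */
		err = -ENOMEM;
		vcore = kvmppc_vcore_create(kvm, id & ~(kvm->arch.smt_mode - 1));
		if (vcore) {
			mutex_lock(&kvm->arch.mmu_setup_lock);
			kvm->arch.vcores[core] = vcore;
			kvm->arch.online_vcores++;
			mutex_unlock(&kvm->arch.mmu_setup_lock);
		}
	}

	mutex_unlock(&kvm->lock);

	if (!vcore)
		return err;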


Re: [PATCH] arch/powerpc/kvm: kvmppc_core_vcpu_create_hv: check for kzalloc failure

2023-03-27 Thread Kautuk Consul
Hi,
On 2023-03-23 03:47:18, Kautuk Consul wrote:
> kvmppc_vcore_create() might not be able to allocate memory through
> kzalloc. In that case the kvm->arch.online_vcores shouldn't be
> incremented.
> Add a check for kzalloc failure and return with -ENOMEM from
> kvmppc_core_vcpu_create_hv().
Does anyone want to review this?
It's been a few days of silence.
By the way, I will have to post a v2 anyway, as I now know that
the commit description needs to be different. But I'll wait
for review comments on the code before I do so.
> 
> Signed-off-by: Kautuk Consul 
> ---
>  arch/powerpc/kvm/book3s_hv.c | 10 +++---
>  1 file changed, 7 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 6ba68dd6190b..e29ee755c920 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -2968,13 +2968,17 @@ static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu 
> *vcpu)
>   pr_devel("KVM: collision on id %u", id);
>   vcore = NULL;
>   } else if (!vcore) {
> + vcore = kvmppc_vcore_create(kvm,
> + id & ~(kvm->arch.smt_mode - 1));
> + if (unlikely(!vcore)) {
> + mutex_unlock(&kvm->lock);
> + return -ENOMEM;
> + }
> +
>   /*
>* Take mmu_setup_lock for mutual exclusion
>* with kvmppc_update_lpcr().
>*/
> - err = -ENOMEM;
> - vcore = kvmppc_vcore_create(kvm,
> - id & ~(kvm->arch.smt_mode - 1));
> mutex_lock(&kvm->arch.mmu_setup_lock);
>   kvm->arch.vcores[core] = vcore;
>   kvm->arch.online_vcores++;
> -- 
> 2.39.2
> 


[PATCH v5] KVM: PPC: Book3S HV: kvmppc_hv_entry: remove .global scope

2023-03-27 Thread Kautuk Consul
kvmppc_hv_entry isn't called from anywhere other than
book3s_hv_rmhandlers.S itself. Removing .global scope for
this function and annotating it with SYM_CODE_START_LOCAL
and SYM_CODE_END.

Signed-off-by: Kautuk Consul 
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index acf80915f406..0a9781192b86 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -502,8 +502,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
  **
  */
 
-.global kvmppc_hv_entry
-kvmppc_hv_entry:
+SYM_CODE_START_LOCAL(kvmppc_hv_entry)
 
/* Required state:
 *
@@ -940,6 +939,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
ld  r4, VCPU_GPR(R4)(r4)
HRFI_TO_GUEST
b   .
+SYM_CODE_END(kvmppc_hv_entry)
 
 secondary_too_late:
li  r12, 0
-- 
2.39.2



Re: [PATCH v4] arch/powerpc/kvm: kvmppc_hv_entry: remove .global scope

2023-03-27 Thread Kautuk Consul
On 2023-03-27 20:30:02, Nicholas Piggin wrote:
> On Mon Mar 27, 2023 at 8:04 PM AEST, Kautuk Consul wrote:
> > kvmppc_hv_entry isn't called from anywhere other than
> > book3s_hv_rmhandlers.S itself. Removing .global scope for
> > this function and annotating it with SYM_CODE_START_LOCAL
> > and SYM_CODE_END.
> 
> Does removing .global introduce the objtool warning, or was it already
> present? Just trying to understand if this is two changes or one (not
> that it really matters, maybe just for the purpose of the changelog).
Yes. Just removing .global introduces the objtool warning.
> 
> Since the patch only touches KVM, subject should follow arch/powerpc/kvm
> convention, which is not the same as the rest of arch/powerpc. KVM: PPC:
> Book3S HV:
Okay. Do you need me to send another patch in "KVM: PPC: Book3S HV:"
format ?
> 
> Otherwise seems okay
> 
> Reviewed-by: Nicholas Piggin 
Thank you!
> 
> Thanks,
> Nick
> 
> >
> > Signed-off-by: Kautuk Consul 
> > ---
> >  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> > b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > index acf80915f406..0a9781192b86 100644
> > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > @@ -502,8 +502,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
> >   * 
> >*
> >   
> > */
> >  
> > -.global kvmppc_hv_entry
> > -kvmppc_hv_entry:
> > +SYM_CODE_START_LOCAL(kvmppc_hv_entry)
> >  
> > /* Required state:
> >  *
> > @@ -940,6 +939,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
> > ld  r4, VCPU_GPR(R4)(r4)
> > HRFI_TO_GUEST
> > b   .
> > +SYM_CODE_END(kvmppc_hv_entry)
> >  
> >  secondary_too_late:
> > li  r12, 0
> > -- 
> > 2.39.2
> 


Re: [PATCH v3 1/2] arch/powerpc/kvm: kvmppc_hv_entry: remove .global scope

2023-03-27 Thread Kautuk Consul
On 2023-03-27 15:25:24, Kautuk Consul wrote:
> On 2023-03-27 19:51:34, Nicholas Piggin wrote:
> > On Mon Mar 27, 2023 at 7:34 PM AEST, Kautuk Consul wrote:
> > > On 2023-03-27 14:58:03, Kautuk Consul wrote:
> > > > On 2023-03-27 19:19:37, Nicholas Piggin wrote:
> > > > > On Thu Mar 16, 2023 at 3:10 PM AEST, Kautuk Consul wrote:
> > > > > > kvmppc_hv_entry isn't called from anywhere other than
> > > > > > book3s_hv_rmhandlers.S itself. Removing .global scope for
> > > > > > this function and annotating it with SYM_INNER_LABEL.
> > > > > >
> > > > > > Signed-off-by: Kautuk Consul 
> > > > > > ---
> > > > > >  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 3 +--
> > > > > >  1 file changed, 1 insertion(+), 2 deletions(-)
> > > > > >
> > > > > > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> > > > > > b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > > > > > index acf80915f406..b81ba4ee0521 100644
> > > > > > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > > > > > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > > > > > @@ -502,8 +502,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
> > > > > >   * 
> > > > > >*
> > > > > >   
> > > > > > */
> > > > > >  
> > > > > > -.global kvmppc_hv_entry
> > > > > 
> > > > > I think this is okay.
> > > > > 
> > > > > > -kvmppc_hv_entry:
> > > > > > +SYM_INNER_LABEL(kvmppc_hv_entry, SYM_L_LOCAL)
> > > > > 
> > > > > The documentation for SYM_INNER_LABEL says it for labels inside a SYM
> > > > > function block, is that a problem? This is a function but doesn't have
> > > > > C calling convention, so asm annotation docs say that it should use
> > > > > SYM_CODE_START_LOCAL?
> > > > That is correct. Will create a v4 patch for this and send it.
> > > But using SYM_CODE_START_LOCAL again causes a warning in the build
> > > (which we were trying to avoid):
> > > arch/powerpc/kvm/book3s_hv_rmhandlers.o: warning: objtool: .text+0x48: 
> > > unannotated intra-function call
> > 
> > Are you using SYM_FUNC_END as well? Looks like you need that to
> > annotate the type properly. It should be the same as SYM_INNER_LABEL
> > in the end AFAIKS.
> 
> What about SYM_CODE_START_LOCAL and SYM_CODE_END ?
> This seems to work fine for me without any build warnings from objtool.
Sent a v4 with this fix. Thanks.
> > 
> > > > > 
> > > > > BTW. why don't our _GLOBAL() macros use these SYM annotations? I 
> > > > > haven't
> > > > > really looked into them.
> > > > Not sure. Was mostly just concentrating on the kvmppc_hv_entry code.
> > 
> > Looks like it's because we have a .type @function annotation in those
> > already. Not sure if we should end up converting all that over to use
> > the SYM annotations or if it's okay to leave it as is.
> > 
> > Thanks,
> > Nick


[PATCH v4] arch/powerpc/kvm: kvmppc_hv_entry: remove .global scope

2023-03-27 Thread Kautuk Consul
kvmppc_hv_entry isn't called from anywhere other than
book3s_hv_rmhandlers.S itself. Removing .global scope for
this function and annotating it with SYM_CODE_START_LOCAL
and SYM_CODE_END.

Signed-off-by: Kautuk Consul 
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index acf80915f406..0a9781192b86 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -502,8 +502,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
  **
  */
 
-.global kvmppc_hv_entry
-kvmppc_hv_entry:
+SYM_CODE_START_LOCAL(kvmppc_hv_entry)
 
/* Required state:
 *
@@ -940,6 +939,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
ld  r4, VCPU_GPR(R4)(r4)
HRFI_TO_GUEST
b   .
+SYM_CODE_END(kvmppc_hv_entry)
 
 secondary_too_late:
li  r12, 0
-- 
2.39.2



Re: [PATCH v3 1/2] arch/powerpc/kvm: kvmppc_hv_entry: remove .global scope

2023-03-27 Thread Kautuk Consul
On 2023-03-27 19:51:34, Nicholas Piggin wrote:
> On Mon Mar 27, 2023 at 7:34 PM AEST, Kautuk Consul wrote:
> > On 2023-03-27 14:58:03, Kautuk Consul wrote:
> > > On 2023-03-27 19:19:37, Nicholas Piggin wrote:
> > > > On Thu Mar 16, 2023 at 3:10 PM AEST, Kautuk Consul wrote:
> > > > > kvmppc_hv_entry isn't called from anywhere other than
> > > > > book3s_hv_rmhandlers.S itself. Removing .global scope for
> > > > > this function and annotating it with SYM_INNER_LABEL.
> > > > >
> > > > > Signed-off-by: Kautuk Consul 
> > > > > ---
> > > > >  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 3 +--
> > > > >  1 file changed, 1 insertion(+), 2 deletions(-)
> > > > >
> > > > > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> > > > > b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > > > > index acf80915f406..b81ba4ee0521 100644
> > > > > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > > > > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > > > > @@ -502,8 +502,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
> > > > >   *   
> > > > >  *
> > > > >   
> > > > > */
> > > > >  
> > > > > -.global kvmppc_hv_entry
> > > > 
> > > > I think this is okay.
> > > > 
> > > > > -kvmppc_hv_entry:
> > > > > +SYM_INNER_LABEL(kvmppc_hv_entry, SYM_L_LOCAL)
> > > > 
> > > > The documentation for SYM_INNER_LABEL says it for labels inside a SYM
> > > > function block, is that a problem? This is a function but doesn't have
> > > > C calling convention, so asm annotation docs say that it should use
> > > > SYM_CODE_START_LOCAL?
> > > That is correct. Will create a v4 patch for this and send it.
> > But using SYM_CODE_START_LOCAL again causes a warning in the build
> > (which we were trying to avoid):
> > arch/powerpc/kvm/book3s_hv_rmhandlers.o: warning: objtool: .text+0x48: 
> > unannotated intra-function call
> 
> Are you using SYM_FUNC_END as well? Looks like you need that to
> annotate the type properly. It should be the same as SYM_INNER_LABEL
> in the end AFAIKS.

What about SYM_CODE_START_LOCAL and SYM_CODE_END ?
This seems to work fine for me without any build warnings from objtool.
> 
> > > > 
> > > > BTW. why don't our _GLOBAL() macros use these SYM annotations? I haven't
> > > > really looked into them.
> > > Not sure. Was mostly just concentrating on the kvmppc_hv_entry code.
> 
> Looks like it's because we have a .type @function annotation in those
> already. Not sure if we should end up converting all that over to use
> the SYM annotations or if it's okay to leave it as is.
> 
> Thanks,
> Nick


Re: [PATCH v3 1/2] arch/powerpc/kvm: kvmppc_hv_entry: remove .global scope

2023-03-27 Thread Kautuk Consul
On 2023-03-27 15:04:38, Kautuk Consul wrote:
> On 2023-03-27 14:58:03, Kautuk Consul wrote:
> > On 2023-03-27 19:19:37, Nicholas Piggin wrote:
> > > On Thu Mar 16, 2023 at 3:10 PM AEST, Kautuk Consul wrote:
> > > > kvmppc_hv_entry isn't called from anywhere other than
> > > > book3s_hv_rmhandlers.S itself. Removing .global scope for
> > > > this function and annotating it with SYM_INNER_LABEL.
> > > >
> > > > Signed-off-by: Kautuk Consul 
> > > > ---
> > > >  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 3 +--
> > > >  1 file changed, 1 insertion(+), 2 deletions(-)
> > > >
> > > > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> > > > b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > > > index acf80915f406..b81ba4ee0521 100644
> > > > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > > > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > > > @@ -502,8 +502,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
> > > >   * 
> > > >*
> > > >   
> > > > */
> > > >  
> > > > -.global kvmppc_hv_entry
> > > 
> > > I think this is okay.
> > > 
> > > > -kvmppc_hv_entry:
> > > > +SYM_INNER_LABEL(kvmppc_hv_entry, SYM_L_LOCAL)
> > > 
> > > The documentation for SYM_INNER_LABEL says it for labels inside a SYM
> > > function block, is that a problem? This is a function but doesn't have
> > > C calling convention, so asm annotation docs say that it should use
> > > SYM_CODE_START_LOCAL?
> > That is correct. Will create a v4 patch for this and send it.
> But using SYM_CODE_START_LOCAL again causes a warning in the build
> (which we were trying to avoid):
> arch/powerpc/kvm/book3s_hv_rmhandlers.o: warning: objtool: .text+0x48: 
> unannotated intra-function call
For that matter, even SYM_INNER_LABEL shows that warning. When I tested
it on my setup it seemed to work fine. I'll investigate and try to come
up with a solution.
> > > 
> > > BTW. why don't our _GLOBAL() macros use these SYM annotations? I haven't
> > > really looked into them.
> > Not sure. Was mostly just concentrating on the kvmppc_hv_entry code.
> > > 
> > > Thanks,
> > > Nick


Re: [PATCH v3 1/2] arch/powerpc/kvm: kvmppc_hv_entry: remove .global scope

2023-03-27 Thread Kautuk Consul
On 2023-03-27 14:58:03, Kautuk Consul wrote:
> On 2023-03-27 19:19:37, Nicholas Piggin wrote:
> > On Thu Mar 16, 2023 at 3:10 PM AEST, Kautuk Consul wrote:
> > > kvmppc_hv_entry isn't called from anywhere other than
> > > book3s_hv_rmhandlers.S itself. Removing .global scope for
> > > this function and annotating it with SYM_INNER_LABEL.
> > >
> > > Signed-off-by: Kautuk Consul 
> > > ---
> > >  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 3 +--
> > >  1 file changed, 1 insertion(+), 2 deletions(-)
> > >
> > > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> > > b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > > index acf80915f406..b81ba4ee0521 100644
> > > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > > @@ -502,8 +502,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
> > >   *   
> > >  *
> > >   
> > > */
> > >  
> > > -.global kvmppc_hv_entry
> > 
> > I think this is okay.
> > 
> > > -kvmppc_hv_entry:
> > > +SYM_INNER_LABEL(kvmppc_hv_entry, SYM_L_LOCAL)
> > 
> > The documentation for SYM_INNER_LABEL says it for labels inside a SYM
> > function block, is that a problem? This is a function but doesn't have
> > C calling convention, so asm annotation docs say that it should use
> > SYM_CODE_START_LOCAL?
> That is correct. Will create a v4 patch for this and send it.
But using SYM_CODE_START_LOCAL again causes a warning in the build
(which we were trying to avoid):
arch/powerpc/kvm/book3s_hv_rmhandlers.o: warning: objtool: .text+0x48: 
unannotated intra-function call
> > 
> > BTW. why don't our _GLOBAL() macros use these SYM annotations? I haven't
> > really looked into them.
> Not sure. Was mostly just concentrating on the kvmppc_hv_entry code.
> > 
> > Thanks,
> > Nick


Re: [PATCH v3 1/2] arch/powerpc/kvm: kvmppc_hv_entry: remove .global scope

2023-03-27 Thread Kautuk Consul
On 2023-03-27 19:19:37, Nicholas Piggin wrote:
> On Thu Mar 16, 2023 at 3:10 PM AEST, Kautuk Consul wrote:
> > kvmppc_hv_entry isn't called from anywhere other than
> > book3s_hv_rmhandlers.S itself. Removing .global scope for
> > this function and annotating it with SYM_INNER_LABEL.
> >
> > Signed-off-by: Kautuk Consul 
> > ---
> >  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 3 +--
> >  1 file changed, 1 insertion(+), 2 deletions(-)
> >
> > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> > b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > index acf80915f406..b81ba4ee0521 100644
> > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > @@ -502,8 +502,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
> >   * 
> >*
> >   
> > */
> >  
> > -.global kvmppc_hv_entry
> 
> I think this is okay.
> 
> > -kvmppc_hv_entry:
> > +SYM_INNER_LABEL(kvmppc_hv_entry, SYM_L_LOCAL)
> 
> The documentation for SYM_INNER_LABEL says it for labels inside a SYM
> function block, is that a problem? This is a function but doesn't have
> C calling convention, so asm annotation docs say that it should use
> SYM_CODE_START_LOCAL?
That is correct. Will create a v4 patch for this and send it.
> 
> BTW. why don't our _GLOBAL() macros use these SYM annotations? I haven't
> really looked into them.
Not sure. Was mostly just concentrating on the kvmppc_hv_entry code.
> 
> Thanks,
> Nick


[PATCH] arch/powerpc/kvm: kvmppc_core_vcpu_create_hv: check for kzalloc failure

2023-03-23 Thread Kautuk Consul
kvmppc_vcore_create() might not be able to allocate memory through
kzalloc. In that case the kvm->arch.online_vcores shouldn't be
incremented.
Add a check for kzalloc failure and return with -ENOMEM from
kvmppc_core_vcpu_create_hv().

Signed-off-by: Kautuk Consul 
---
 arch/powerpc/kvm/book3s_hv.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6ba68dd6190b..e29ee755c920 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2968,13 +2968,17 @@ static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu 
*vcpu)
pr_devel("KVM: collision on id %u", id);
vcore = NULL;
} else if (!vcore) {
+   vcore = kvmppc_vcore_create(kvm,
+   id & ~(kvm->arch.smt_mode - 1));
+   if (unlikely(!vcore)) {
+   mutex_unlock(&kvm->lock);
+   return -ENOMEM;
+   }
+
/*
 * Take mmu_setup_lock for mutual exclusion
 * with kvmppc_update_lpcr().
 */
-   err = -ENOMEM;
-   vcore = kvmppc_vcore_create(kvm,
-   id & ~(kvm->arch.smt_mode - 1));
mutex_lock(&kvm->arch.mmu_setup_lock);
kvm->arch.vcores[core] = vcore;
kvm->arch.online_vcores++;
-- 
2.39.2



Re: [PATCH v3 2/2] arch/powerpc/kvm: kvmppc_hv_entry: remove r4 argument

2023-03-22 Thread Kautuk Consul
Hi,

On 2023-03-22 23:17:35, Michael Ellerman wrote:
> Kautuk Consul  writes:
> > kvmppc_hv_entry is called from only 2 locations within
> > book3s_hv_rmhandlers.S. Both of those locations set r4
> > as HSTATE_KVM_VCPU(r13) before calling kvmppc_hv_entry.
> > So, shift the r4 load instruction to kvmppc_hv_entry and
> > thus modify the calling convention of this function.
> >
> > Signed-off-by: Kautuk Consul 
> > ---
> >  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 11 ++-
> >  1 file changed, 6 insertions(+), 5 deletions(-)
> >
> > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> > b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > index b81ba4ee0521..b61f0b2c677b 100644
> > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > @@ -85,7 +85,7 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
> > RFI_TO_KERNEL
> >  
> >  kvmppc_call_hv_entry:
> > -   ld  r4, HSTATE_KVM_VCPU(r13)
> > +   /* Enter guest. */
> > bl  kvmppc_hv_entry
> >  
> > /* Back from guest - restore host state and return to caller */
> > @@ -352,9 +352,7 @@ kvm_secondary_got_guest:
> > mtspr   SPRN_LDBAR, r0
> > isync
> >  63:
> > -   /* Order load of vcpu after load of vcore */
> > -   lwsync
> > -   ld  r4, HSTATE_KVM_VCPU(r13)
> > +   /* Enter guest. */
> > bl  kvmppc_hv_entry
> >  
> > /* Back from the guest, go back to nap */
> > @@ -506,7 +504,6 @@ SYM_INNER_LABEL(kvmppc_hv_entry, SYM_L_LOCAL)
> >  
> > /* Required state:
> >  *
> > -* R4 = vcpu pointer (or NULL)
> >  * MSR = ~IR|DR
> >  * R13 = PACA
> >  * R1 = host R1
> > @@ -524,6 +521,10 @@ SYM_INNER_LABEL(kvmppc_hv_entry, SYM_L_LOCAL)
> > li  r6, KVM_GUEST_MODE_HOST_HV
> > stb r6, HSTATE_IN_GUEST(r13)
> >  
> > +   /* Order load of vcpu after load of vcore */
> 
> Just copying the comment here doesn't work. It doesn't make sense on its
> own here, because the VCORE is loaded (again) a few lines below (536).
> So as written this comment seems backward vs the code.
> 
> The comment would need to expand to explain that the barrier is for the
> case where we came from kvm_secondary_got_guest.
> 
Agreed.
> > +   lwsync
> > +   ld  r4, HSTATE_KVM_VCPU(r13)
> > +
> >  #ifdef CONFIG_KVM_BOOK3S_HV_P8_TIMING
> > /* Store initial timestamp */
> > cmpdi   r4, 0
> 
> 
> But as Nick says I don't think it's worth investing effort in small
> tweaks to this code. The risk of introducing bugs is too high for such a
> small improvement to the code.
> 
> Thanks for trying, but I think this asm code is best left more or less
> alone unless we find actual bugs in it - or unless we can make
> substantial improvements to it, which would be rewriting in C, or at
> least converting to a fully call/return style rather than the current
> forest of labels.
> 
> I will take patch 1 though, as that's an obvious cleanup and poses no
> risk (famous last words :D).
Thanks for taking patch 1. I completely agree that it would not be wise
to introduce even small alterations to stable legacy code. But sending
patches like this and conversing with this mailing list's reviewers
is giving me a good picture of what you are open to accepting at this
stage.

Thanks again!
> 
> cheers


Re: [PATCH v3 2/2] arch/powerpc/kvm: kvmppc_hv_entry: remove r4 argument

2023-03-21 Thread Kautuk Consul
On 2023-03-21 10:24:36, Kautuk Consul wrote:
> > Is r4 there only used for CONFIG_KVM_BOOK3S_HV_P8_TIMING? Could put it
> > under there. Although you then lose the barrier if it's disabled, that
> > is okay if you're sure that's the only memory operation being ordered.
> r4 is also used by the secondary_too_late label. So I decided against
> moving it inside the CONFIG_KVM_BOOK3S_HV_P8_TIMING #ifdef as then I
> would need to again load from HSTATE_KVM_VCPU(r13) in secondary_too_late.
> > 
Sorry, forgot to mention: r4 is also used at the 10f label.
That's one more reason not to shift the r4 load into the #ifdef.
> > I'm not sure how much new work we want to put into changing this asm
> > code, since it's POWER7/8 only. I would love to move this (and the
> > other) KVM implementations to C like we did with P9. It's a pretty big
> > job though.
> Yeah. I was just reviewing this code and decided to send this small
> improvement to the mailing list. I will check with my team and ask
> them if we could move this implementation to C.
> > 
> > Thanks,
> > Nick
> > 
> > >
> > > Signed-off-by: Kautuk Consul 
> > > ---
> > >  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 11 ++-
> > >  1 file changed, 6 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> > > b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > > index b81ba4ee0521..b61f0b2c677b 100644
> > > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > > @@ -85,7 +85,7 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
> > >   RFI_TO_KERNEL
> > >  
> > >  kvmppc_call_hv_entry:
> > > - ld  r4, HSTATE_KVM_VCPU(r13)
> > > + /* Enter guest. */
> > >   bl  kvmppc_hv_entry
> > >  
> > >   /* Back from guest - restore host state and return to caller */
> > > @@ -352,9 +352,7 @@ kvm_secondary_got_guest:
> > >   mtspr   SPRN_LDBAR, r0
> > >   isync
> > >  63:
> > > - /* Order load of vcpu after load of vcore */
> > > - lwsync
> > > - ld  r4, HSTATE_KVM_VCPU(r13)
> > > + /* Enter guest. */
> > >   bl  kvmppc_hv_entry
> > >  
> > >   /* Back from the guest, go back to nap */
> > > @@ -506,7 +504,6 @@ SYM_INNER_LABEL(kvmppc_hv_entry, SYM_L_LOCAL)
> > >  
> > >   /* Required state:
> > >*
> > > -  * R4 = vcpu pointer (or NULL)
> > >* MSR = ~IR|DR
> > >* R13 = PACA
> > >* R1 = host R1
> > > @@ -524,6 +521,10 @@ SYM_INNER_LABEL(kvmppc_hv_entry, SYM_L_LOCAL)
> > >   li  r6, KVM_GUEST_MODE_HOST_HV
> > >   stb r6, HSTATE_IN_GUEST(r13)
> > >  
> > > + /* Order load of vcpu after load of vcore */
> > > + lwsync
> > > + ld  r4, HSTATE_KVM_VCPU(r13)
> > > +
> > >  #ifdef CONFIG_KVM_BOOK3S_HV_P8_TIMING
> > >   /* Store initial timestamp */
> > >   cmpdi   r4, 0
> > > -- 
> > > 2.39.2
> > 


Re: [PATCH v3 2/2] arch/powerpc/kvm: kvmppc_hv_entry: remove r4 argument

2023-03-20 Thread Kautuk Consul
Hi,

On 2023-03-21 14:15:14, Nicholas Piggin wrote:
> On Thu Mar 16, 2023 at 3:10 PM AEST, Kautuk Consul wrote:
> > kvmppc_hv_entry is called from only 2 locations within
> > book3s_hv_rmhandlers.S. Both of those locations set r4
> > as HSTATE_KVM_VCPU(r13) before calling kvmppc_hv_entry.
> > So, shift the r4 load instruction to kvmppc_hv_entry and
> > thus modify the calling convention of this function.
> 
> Is r4 there only used for CONFIG_KVM_BOOK3S_HV_P8_TIMING? Could put it
> under there. Although you then lose the barrier if it's disabled, that
> is okay if you're sure that's the only memory operation being ordered.
r4 is also used by the secondary_too_late label. So I decided against
moving it inside the CONFIG_KVM_BOOK3S_HV_P8_TIMING #ifdef as then I
would need to again load from HSTATE_KVM_VCPU(r13) in secondary_too_late.
> 
> I'm not sure how much new work we want to put into changing this asm
> code, since it's POWER7/8 only. I would love to move this (and the
> other) KVM implementations to C like we did with P9. It's a pretty big
> job though.
Yeah. I was just reviewing this code and decided to send this small
improvement to the mailing list. I will check with my team and ask
them if we could move this implementation to C.
> 
> Thanks,
> Nick
> 
> >
> > Signed-off-by: Kautuk Consul 
> > ---
> >  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 11 ++-
> >  1 file changed, 6 insertions(+), 5 deletions(-)
> >
> > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> > b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > index b81ba4ee0521..b61f0b2c677b 100644
> > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > @@ -85,7 +85,7 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
> > RFI_TO_KERNEL
> >  
> >  kvmppc_call_hv_entry:
> > -   ld  r4, HSTATE_KVM_VCPU(r13)
> > +   /* Enter guest. */
> > bl  kvmppc_hv_entry
> >  
> > /* Back from guest - restore host state and return to caller */
> > @@ -352,9 +352,7 @@ kvm_secondary_got_guest:
> > mtspr   SPRN_LDBAR, r0
> > isync
> >  63:
> > -   /* Order load of vcpu after load of vcore */
> > -   lwsync
> > -   ld  r4, HSTATE_KVM_VCPU(r13)
> > +   /* Enter guest. */
> > bl  kvmppc_hv_entry
> >  
> > /* Back from the guest, go back to nap */
> > @@ -506,7 +504,6 @@ SYM_INNER_LABEL(kvmppc_hv_entry, SYM_L_LOCAL)
> >  
> > /* Required state:
> >  *
> > -* R4 = vcpu pointer (or NULL)
> >  * MSR = ~IR|DR
> >  * R13 = PACA
> >  * R1 = host R1
> > @@ -524,6 +521,10 @@ SYM_INNER_LABEL(kvmppc_hv_entry, SYM_L_LOCAL)
> > li  r6, KVM_GUEST_MODE_HOST_HV
> > stb r6, HSTATE_IN_GUEST(r13)
> >  
> > +   /* Order load of vcpu after load of vcore */
> > +   lwsync
> > +   ld  r4, HSTATE_KVM_VCPU(r13)
> > +
> >  #ifdef CONFIG_KVM_BOOK3S_HV_P8_TIMING
> > /* Store initial timestamp */
> > cmpdi   r4, 0
> > -- 
> > 2.39.2
> 


[PATCH v3 0/2] Improving calls to kvmppc_hv_entry

2023-03-15 Thread Kautuk Consul
- remove .global scope of kvmppc_hv_entry
- remove r4 argument to kvmppc_hv_entry as it is not required

Changes since v2:
- Add the lwsync instruction before the load to r4 to order
  load of vcore before load of vcpu

Kautuk Consul (2):
  arch/powerpc/kvm: kvmppc_hv_entry: remove .global scope
  arch/powerpc/kvm: kvmppc_hv_entry: remove r4 argument

 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

-- 
2.39.2



[PATCH v3 2/2] arch/powerpc/kvm: kvmppc_hv_entry: remove r4 argument

2023-03-15 Thread Kautuk Consul
kvmppc_hv_entry is called from only 2 locations within
book3s_hv_rmhandlers.S. Both of those locations set r4
as HSTATE_KVM_VCPU(r13) before calling kvmppc_hv_entry.
So, shift the r4 load instruction to kvmppc_hv_entry and
thus modify the calling convention of this function.

Signed-off-by: Kautuk Consul 
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index b81ba4ee0521..b61f0b2c677b 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -85,7 +85,7 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
RFI_TO_KERNEL
 
 kvmppc_call_hv_entry:
-   ld  r4, HSTATE_KVM_VCPU(r13)
+   /* Enter guest. */
bl  kvmppc_hv_entry
 
/* Back from guest - restore host state and return to caller */
@@ -352,9 +352,7 @@ kvm_secondary_got_guest:
mtspr   SPRN_LDBAR, r0
isync
 63:
-   /* Order load of vcpu after load of vcore */
-   lwsync
-   ld  r4, HSTATE_KVM_VCPU(r13)
+   /* Enter guest. */
bl  kvmppc_hv_entry
 
/* Back from the guest, go back to nap */
@@ -506,7 +504,6 @@ SYM_INNER_LABEL(kvmppc_hv_entry, SYM_L_LOCAL)
 
/* Required state:
 *
-* R4 = vcpu pointer (or NULL)
 * MSR = ~IR|DR
 * R13 = PACA
 * R1 = host R1
@@ -524,6 +521,10 @@ SYM_INNER_LABEL(kvmppc_hv_entry, SYM_L_LOCAL)
li  r6, KVM_GUEST_MODE_HOST_HV
stb r6, HSTATE_IN_GUEST(r13)
 
+   /* Order load of vcpu after load of vcore */
+   lwsync
+   ld  r4, HSTATE_KVM_VCPU(r13)
+
 #ifdef CONFIG_KVM_BOOK3S_HV_P8_TIMING
/* Store initial timestamp */
cmpdi   r4, 0
-- 
2.39.2



[PATCH v3 1/2] arch/powerpc/kvm: kvmppc_hv_entry: remove .global scope

2023-03-15 Thread Kautuk Consul
kvmppc_hv_entry isn't called from anywhere other than
book3s_hv_rmhandlers.S itself. Removing .global scope for
this function and annotating it with SYM_INNER_LABEL.

Signed-off-by: Kautuk Consul 
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index acf80915f406..b81ba4ee0521 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -502,8 +502,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
  **
  */
 
-.global kvmppc_hv_entry
-kvmppc_hv_entry:
+SYM_INNER_LABEL(kvmppc_hv_entry, SYM_L_LOCAL)
 
/* Required state:
 *
-- 
2.39.2



Re: [PATCH v2 2/2] arch/powerpc/kvm: kvmppc_hv_entry: remove r4 argument

2023-03-15 Thread Kautuk Consul
Hi,

On 2023-03-16 14:39:08, Michael Ellerman wrote:
> Kautuk Consul  writes:
> > On 2023-03-15 15:48:53, Michael Ellerman wrote:
> >> Kautuk Consul  writes:
> >> > kvmppc_hv_entry is called from only 2 locations within
> >> > book3s_hv_rmhandlers.S. Both of those locations set r4
> >> > as HSTATE_KVM_VCPU(r13) before calling kvmppc_hv_entry.
> >> > So, shift the r4 load instruction to kvmppc_hv_entry and
> >> > thus modify the calling convention of this function.
> >> >
> >> > Signed-off-by: Kautuk Consul 
> >> > ---
> >> >  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 9 -
> >> >  1 file changed, 4 insertions(+), 5 deletions(-)
> >> >
> >> > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> >> > b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> >> > index b81ba4ee0521..da9a15db12fe 100644
> >> > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> >> > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> >> > @@ -85,7 +85,7 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
> >> >  RFI_TO_KERNEL
> >> >  
> >> >  kvmppc_call_hv_entry:
> >> > -ld  r4, HSTATE_KVM_VCPU(r13)
> >> > +/* Enter guest. */
> >> >  bl  kvmppc_hv_entry
> >> >  
> >> >  /* Back from guest - restore host state and return to caller */
> >> > @@ -352,9 +352,7 @@ kvm_secondary_got_guest:
> >> >  mtspr   SPRN_LDBAR, r0
> >> >  isync
> >> >  63:
> >> > -/* Order load of vcpu after load of vcore */
> >> > -lwsync
> >> 
> >> Where did this barrier go?
> >> 
> >> I don't see that it's covered by any existing barriers in
> >> kvmppc_hv_entry, and you don't add it back anywhere. 
> >
> > My concept about this is that since now the call to kvmppc_hv_entry
> > is first taken before the load to r4 shouldn't the pending load in the
> > pipeline of the HSTATE_KVM_VCORE as per the earlier comment be ordered 
> > anyway
> > before-hand ?
> 
> No.
> 
> > Or do you mean to say that pending loads may not be
> > cleared/flushed across the "bl " boundary ?
> 
> Right.
> 
> The "bl" imposes no ordering on loads before or after it.
> 
> In general nothing orders two independant loads, other than a barrier.
> 
> cheers

Okay, I will post a patch v3 with lwsync before the load to r4 in
kvmppc_hv_entry.
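
For clarity, here is a minimal C sketch of the publish/consume pattern that
the lwsync implements (hypothetical names, with the HSTATE fields modelled
as plain globals; this is not the actual KVM code):

/* Kernel-context sketch; assumes <linux/compiler.h> and <asm/barrier.h>. */
struct kvm_vcpu;
struct kvmppc_vcore;

static struct kvmppc_vcore *hstate_vcore;	/* stands in for HSTATE_KVM_VCORE */
static struct kvm_vcpu *hstate_vcpu;		/* stands in for HSTATE_KVM_VCPU  */

/* Primary thread: publish the vcpu before the vcore. */
static void publish(struct kvmppc_vcore *vc, struct kvm_vcpu *vcpu)
{
	WRITE_ONCE(hstate_vcpu, vcpu);
	smp_wmb();		/* lwsync: order store of vcpu before store of vcore */
	WRITE_ONCE(hstate_vcore, vc);
}

/* Secondary thread: seeing a non-NULL vcore must guarantee the vcpu store
 * is also visible, so the second load must be ordered after the first.
 */
static struct kvm_vcpu *consume(void)
{
	struct kvmppc_vcore *vc = READ_ONCE(hstate_vcore);

	if (!vc)
		return NULL;
	smp_rmb();		/* lwsync: order load of vcore before load of vcpu */
	return READ_ONCE(hstate_vcpu);
}

Moving the ld of r4 into kvmppc_hv_entry without the lwsync drops the
smp_rmb() half of this pattern, which is the gap Michael pointed out.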

Thanks.


Re: [PATCH v2 2/2] arch/powerpc/kvm: kvmppc_hv_entry: remove r4 argument

2023-03-14 Thread Kautuk Consul
On 2023-03-15 10:48:01, Kautuk Consul wrote:
> On 2023-03-15 15:48:53, Michael Ellerman wrote:
> > Kautuk Consul  writes:
> > > kvmppc_hv_entry is called from only 2 locations within
> > > book3s_hv_rmhandlers.S. Both of those locations set r4
> > > as HSTATE_KVM_VCPU(r13) before calling kvmppc_hv_entry.
> > > So, shift the r4 load instruction to kvmppc_hv_entry and
> > > thus modify the calling convention of this function.
> > >
> > > Signed-off-by: Kautuk Consul 
> > > ---
> > >  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 9 -
> > >  1 file changed, 4 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> > > b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > > index b81ba4ee0521..da9a15db12fe 100644
> > > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > > @@ -85,7 +85,7 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
> > >   RFI_TO_KERNEL
> > >  
> > >  kvmppc_call_hv_entry:
> > > - ld  r4, HSTATE_KVM_VCPU(r13)
> > > + /* Enter guest. */
> > >   bl  kvmppc_hv_entry
> > >  
> > >   /* Back from guest - restore host state and return to caller */
> > > @@ -352,9 +352,7 @@ kvm_secondary_got_guest:
> > >   mtspr   SPRN_LDBAR, r0
> > >   isync
> > >  63:
> > > - /* Order load of vcpu after load of vcore */
> > > - lwsync
> > 
> > Where did this barrier go?
> > 
> > I don't see that it's covered by any existing barriers in
> > kvmppc_hv_entry, and you don't add it back anywhere. 
> 
> My concept about this is that since now the call to kvmppc_hv_entry
> is first taken before the load to r4 shouldn't the pending load in the
> pipeline of the HSTATE_KVM_VCORE as per the earlier comment be ordered anyway
> before-hand ? Or do you mean to say that pending loads may not be
> cleared/flushed across the "bl" boundary ?
Sorry, I mean: " shouldn't the pending load in the pipeline (of the
HSTATE_KVM_VCORE) as per the earlier comment be ordered anyway
before-hand?"

Forgot the parentheses.
> > 
> > > - ld  r4, HSTATE_KVM_VCPU(r13)
> > > + /* Enter guest. */
> > >   bl  kvmppc_hv_entry
> > >  
> > >   /* Back from the guest, go back to nap */
> > > @@ -506,7 +504,6 @@ SYM_INNER_LABEL(kvmppc_hv_entry, SYM_L_LOCAL)
> > >  
> > >   /* Required state:
> > >*
> > > -  * R4 = vcpu pointer (or NULL)
> > >* MSR = ~IR|DR
> > >* R13 = PACA
> > >* R1 = host R1
> > > @@ -524,6 +521,8 @@ SYM_INNER_LABEL(kvmppc_hv_entry, SYM_L_LOCAL)
> > >   li  r6, KVM_GUEST_MODE_HOST_HV
> > >   stb r6, HSTATE_IN_GUEST(r13)
> > >  
> > > + ld  r4, HSTATE_KVM_VCPU(r13)
> > > +
> > >  #ifdef CONFIG_KVM_BOOK3S_HV_P8_TIMING
> > >   /* Store initial timestamp */
> > >   cmpdi   r4, 0
> > 
> > cheers


Re: [PATCH v2 2/2] arch/powerpc/kvm: kvmppc_hv_entry: remove r4 argument

2023-03-14 Thread Kautuk Consul
On 2023-03-15 15:48:53, Michael Ellerman wrote:
> Kautuk Consul  writes:
> > kvmppc_hv_entry is called from only 2 locations within
> > book3s_hv_rmhandlers.S. Both of those locations set r4
> > as HSTATE_KVM_VCPU(r13) before calling kvmppc_hv_entry.
> > So, shift the r4 load instruction to kvmppc_hv_entry and
> > thus modify the calling convention of this function.
> >
> > Signed-off-by: Kautuk Consul 
> > ---
> >  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 9 -
> >  1 file changed, 4 insertions(+), 5 deletions(-)
> >
> > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> > b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > index b81ba4ee0521..da9a15db12fe 100644
> > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > @@ -85,7 +85,7 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
> > RFI_TO_KERNEL
> >  
> >  kvmppc_call_hv_entry:
> > -   ld  r4, HSTATE_KVM_VCPU(r13)
> > +   /* Enter guest. */
> > bl  kvmppc_hv_entry
> >  
> > /* Back from guest - restore host state and return to caller */
> > @@ -352,9 +352,7 @@ kvm_secondary_got_guest:
> > mtspr   SPRN_LDBAR, r0
> > isync
> >  63:
> > -   /* Order load of vcpu after load of vcore */
> > -   lwsync
> 
> Where did this barrier go?
> 
> I don't see that it's covered by any existing barriers in
> kvmppc_hv_entry, and you don't add it back anywhere. 

My concept about this is that since now the call to kvmppc_hv_entry
is first taken before the load to r4 shouldn't the pending load in the
pipeline of the HSTATE_KVM_VCORE as per the earlier comment be ordered anyway
before-hand ? Or do you mean to say that pending loads may not be
cleared/flushed across the "bl" boundary ?
> 
> > -   ld  r4, HSTATE_KVM_VCPU(r13)
> > +   /* Enter guest. */
> > bl  kvmppc_hv_entry
> >  
> > /* Back from the guest, go back to nap */
> > @@ -506,7 +504,6 @@ SYM_INNER_LABEL(kvmppc_hv_entry, SYM_L_LOCAL)
> >  
> > /* Required state:
> >  *
> > -* R4 = vcpu pointer (or NULL)
> >  * MSR = ~IR|DR
> >  * R13 = PACA
> >  * R1 = host R1
> > @@ -524,6 +521,8 @@ SYM_INNER_LABEL(kvmppc_hv_entry, SYM_L_LOCAL)
> > li  r6, KVM_GUEST_MODE_HOST_HV
> > stb r6, HSTATE_IN_GUEST(r13)
> >  
> > +   ld  r4, HSTATE_KVM_VCPU(r13)
> > +
> >  #ifdef CONFIG_KVM_BOOK3S_HV_P8_TIMING
> > /* Store initial timestamp */
> > cmpdi   r4, 0
> 
> cheers


Re: [PATCH v2 0/2] Improving calls to kvmppc_hv_entry

2023-03-12 Thread Kautuk Consul
Hi everyone,

Anyone interested in reviewing this small patch-set ?
I tested it on P8 and it works fine.

Thanks.

On 2023-03-06 07:37:38, Kautuk Consul wrote:
> - remove .global scope of kvmppc_hv_entry
> - remove r4 argument to kvmppc_hv_entry as it is not required
> 
> Changes since v1:
> - replaced .global by SYM_INNER_LABEL for kvmpcc_hv_entry
> 
> Kautuk Consul (2):
>   arch/powerpc/kvm: kvmppc_hv_entry: remove .global scope
>   arch/powerpc/kvm: kvmppc_hv_entry: remove r4 argument
> 
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 12 +---
>  1 file changed, 5 insertions(+), 7 deletions(-)
> 
> -- 
> 2.36.1
> 


[PATCH v2 2/2] arch/powerpc/kvm: kvmppc_hv_entry: remove r4 argument

2023-03-06 Thread Kautuk Consul
kvmppc_hv_entry is called from only 2 locations within
book3s_hv_rmhandlers.S. Both of those locations set r4
as HSTATE_KVM_VCPU(r13) before calling kvmppc_hv_entry.
So, shift the r4 load instruction to kvmppc_hv_entry and
thus modify the calling convention of this function.

Signed-off-by: Kautuk Consul 
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index b81ba4ee0521..da9a15db12fe 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -85,7 +85,7 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
RFI_TO_KERNEL
 
 kvmppc_call_hv_entry:
-   ld  r4, HSTATE_KVM_VCPU(r13)
+   /* Enter guest. */
bl  kvmppc_hv_entry
 
/* Back from guest - restore host state and return to caller */
@@ -352,9 +352,7 @@ kvm_secondary_got_guest:
mtspr   SPRN_LDBAR, r0
isync
 63:
-   /* Order load of vcpu after load of vcore */
-   lwsync
-   ld  r4, HSTATE_KVM_VCPU(r13)
+   /* Enter guest. */
bl  kvmppc_hv_entry
 
/* Back from the guest, go back to nap */
@@ -506,7 +504,6 @@ SYM_INNER_LABEL(kvmppc_hv_entry, SYM_L_LOCAL)
 
/* Required state:
 *
-* R4 = vcpu pointer (or NULL)
 * MSR = ~IR|DR
 * R13 = PACA
 * R1 = host R1
@@ -524,6 +521,8 @@ SYM_INNER_LABEL(kvmppc_hv_entry, SYM_L_LOCAL)
li  r6, KVM_GUEST_MODE_HOST_HV
stb r6, HSTATE_IN_GUEST(r13)
 
+   ld  r4, HSTATE_KVM_VCPU(r13)
+
 #ifdef CONFIG_KVM_BOOK3S_HV_P8_TIMING
/* Store initial timestamp */
cmpdi   r4, 0
-- 
2.36.1



[PATCH v2 1/2] arch/powerpc/kvm: kvmppc_hv_entry: remove .global scope

2023-03-06 Thread Kautuk Consul
kvmppc_hv_entry isn't called from anywhere other than
book3s_hv_rmhandlers.S itself. Removing .global scope for
this function and annotating it with SYM_INNER_LABEL.

Signed-off-by: Kautuk Consul 
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index acf80915f406..b81ba4ee0521 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -502,8 +502,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
  **
  */
 
-.global kvmppc_hv_entry
-kvmppc_hv_entry:
+SYM_INNER_LABEL(kvmppc_hv_entry, SYM_L_LOCAL)
 
/* Required state:
 *
-- 
2.36.1



[PATCH v2 0/2] Improving calls to kvmppc_hv_entry

2023-03-06 Thread Kautuk Consul
- remove .global scope of kvmppc_hv_entry
- remove r4 argument to kvmppc_hv_entry as it is not required

Changes since v1:
- replaced .global by SYM_INNER_LABEL for kvmpcc_hv_entry

Kautuk Consul (2):
  arch/powerpc/kvm: kvmppc_hv_entry: remove .global scope
  arch/powerpc/kvm: kvmppc_hv_entry: remove r4 argument

 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 12 +---
 1 file changed, 5 insertions(+), 7 deletions(-)

-- 
2.36.1



Re: [PATCH 2/2] arch/powerpc/kvm: kvmppc_hv_entry: remove r4 argument

2023-03-06 Thread Kautuk Consul
Hi,

On 2023-02-20 10:53:55, Kautuk Consul wrote:
> kvmppc_hv_entry is called from only 2 locations within
> book3s_hv_rmhandlers.S. Both of those locations set r4
> as HSTATE_KVM_VCPU(r13) before calling kvmppc_hv_entry.
> So, shift the r4 load instruction to kvmppc_hv_entry and
> thus modify the calling convention of this function.
> 

I am posting v2 of this patch-set now.
I have tested this on POWER8 and it works fine.
Can anyone review the v2 for this patch ? 
I didn't receive any review comments for this patch.

> Signed-off-by: Kautuk Consul 
> ---
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 9 -
>  1 file changed, 4 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index 7e063fde7adc..922667b09168 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -85,7 +85,7 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
>   RFI_TO_KERNEL
> 
>  kvmppc_call_hv_entry:
> - ld  r4, HSTATE_KVM_VCPU(r13)
> + /* Enter guest. */
>   bl  kvmppc_hv_entry
> 
>   /* Back from guest - restore host state and return to caller */
> @@ -352,9 +352,7 @@ kvm_secondary_got_guest:
>   mtspr   SPRN_LDBAR, r0
>   isync
>  63:
> - /* Order load of vcpu after load of vcore */
> - lwsync
> - ld  r4, HSTATE_KVM_VCPU(r13)
> + /* Enter guest. */
>   bl  kvmppc_hv_entry
> 
>   /* Back from the guest, go back to nap */
> @@ -506,7 +504,6 @@ kvmppc_hv_entry:
> 
>   /* Required state:
>*
> -  * R4 = vcpu pointer (or NULL)
>* MSR = ~IR|DR
>* R13 = PACA
>* R1 = host R1
> @@ -524,6 +521,8 @@ kvmppc_hv_entry:
>   li  r6, KVM_GUEST_MODE_HOST_HV
>   stb r6, HSTATE_IN_GUEST(r13)
> 
> + ld  r4, HSTATE_KVM_VCPU(r13)
> +
>  #ifdef CONFIG_KVM_BOOK3S_HV_P8_TIMING
>   /* Store initial timestamp */
>   cmpdi   r4, 0
> -- 
> 2.31.1
> 


Re: [PATCH 1/2] arch/powerpc/kvm: kvmppc_hv_entry: remove .global scope

2023-03-06 Thread Kautuk Consul
On 2023-02-24 16:45:45, Sathvika Vasireddy wrote:
> On 23/02/23 10:39, Kautuk Consul wrote:
> 
> > Hi Sathvika,
> > > Just one question though. Went through the code again and I think
> > > that this place shouldn't be proper to insert a SYM_FUNC_END
> > > because we haven't entered the guest at this point and the name
> > > of the function is kvmppc_hv_entry which  I think implies that
> > > this SYM_FUNC_END should be at some place after the HRFI_TO_GUEST.
> > > 
> > > What do you think ?
> > Any updates on this ? Is there any other way to avoid this warning ?
> Hmm, to mark the end of the kvmppc_hv_entry function, I think
> SYM_FUNC_END(kvmppc_hv_entry) should be placed before the next symbol, which
> is kvmppc_got_guest() in this case.
> 
> However, if you think it needs to be put at a different place, then it does
> not make sense to have any other symbols before that. You may want to
> consider checking if other macros like SYM_INNER_LABEL() can be used.

SYM_INNER_LABEL works fine for me. I will post a v2 with this change.

Thanks! :-)
> 
> - Sathvika
> 


Re: [PATCH 1/2] arch/powerpc/kvm: kvmppc_hv_entry: remove .global scope

2023-02-22 Thread Kautuk Consul
Hi Sathvika,
> 
> Just one question though. Went through the code again and I think
> that this place shouldn't be proper to insert a SYM_FUNC_END
> because we haven't entered the guest at this point and the name
> of the function is kvmppc_hv_entry which  I think implies that
> this SYM_FUNC_END should be at some place after the HRFI_TO_GUEST.
> 
> What do you think ?
Any updates on this ? Is there any other way to avoid this warning ?


Re: [PATCH v2] arch/powerpc/include/asm/barrier.h: redefine rmb and wmb to lwsync

2023-02-22 Thread Kautuk Consul
 
> You are correct, the patch is wrong because it fails to account for IO
> accesses.

Okay, I looked at the PowerPC ISA and found:
"The memory barrier provides an ordering function for the storage accesses
caused by Load, Store,and dcbz instructions that are executed by the processor
executing the sync instruction and for which the specified storage location
is in storage that is Memory Coherence Required and is neitherWrite Through
Required nor Caching Inhibited.The applicable pairs are all pairs ai ,bj of
such accesses except those in which ai is an accesscaused by a Store or dcbz
instruction and bj is anaccess caused by a Load instruction."

Thanks for your time, Michael. Sorry for the noise.
> 
> Kautuk, I'm not sure what motivated you to look at these barriers, was
> it just the documentation you linked to?
I read the basic documentation. Now that I have access to the PowerISA
document I guess I'll go through it more thoroughly.
> 


Re: [PATCH v2] arch/powerpc/include/asm/barrier.h: redefine rmb and wmb to lwsync

2023-02-22 Thread Kautuk Consul
On 2023-02-22 20:16:10, Paul E. McKenney wrote:
> On Thu, Feb 23, 2023 at 09:31:48AM +0530, Kautuk Consul wrote:
> > On 2023-02-22 09:47:19, Paul E. McKenney wrote:
> > > On Wed, Feb 22, 2023 at 02:33:44PM +0530, Kautuk Consul wrote:
> > > > A link from ibm.com states:
> > > > "Ensures that all instructions preceding the call to __lwsync
> > > >  complete before any subsequent store instructions can be executed
> > > >  on the processor that executed the function. Also, it ensures that
> > > >  all load instructions preceding the call to __lwsync complete before
> > > >  any subsequent load instructions can be executed on the processor
> > > >  that executed the function. This allows you to synchronize between
> > > >  multiple processors with minimal performance impact, as __lwsync
> > > >  does not wait for confirmation from each processor."
> > > > 
> > > > Thats why smp_rmb() and smp_wmb() are defined to lwsync.
> > > > But this same understanding applies to parallel pipeline
> > > > execution on each PowerPC processor.
> > > > So, use the lwsync instruction for rmb() and wmb() on the PPC
> > > > architectures that support it.
> > > > 
> > > > Signed-off-by: Kautuk Consul 
> > > > ---
> > > >  arch/powerpc/include/asm/barrier.h | 7 +++
> > > >  1 file changed, 7 insertions(+)
> > > > 
> > > > diff --git a/arch/powerpc/include/asm/barrier.h 
> > > > b/arch/powerpc/include/asm/barrier.h
> > > > index b95b666f0374..e088dacc0ee8 100644
> > > > --- a/arch/powerpc/include/asm/barrier.h
> > > > +++ b/arch/powerpc/include/asm/barrier.h
> > > > @@ -36,8 +36,15 @@
> > > >   * heavy-weight sync, so smp_wmb() can be a lighter-weight eieio.
> > > >   */
> > > >  #define __mb()   __asm__ __volatile__ ("sync" : : : "memory")
> > > > +
> > > > +/* The sub-arch has lwsync. */
> > > > +#if defined(CONFIG_PPC64) || defined(CONFIG_PPC_E500MC)
> > > > +#define __rmb() __asm__ __volatile__ ("lwsync" : : : "memory")
> > > > +#define __wmb() __asm__ __volatile__ ("lwsync" : : : "memory")
> > > 
> > > Hmmm...
> > > 
> > > Does the lwsync instruction now order both cached and uncached accesses?
> > > Or have there been changes so that smp_rmb() and smp_wmb() get this
> > > definition, while rmb() and wmb() still get the sync instruction?
> > > (Not seeing this, but I could easily be missing something.)
> 
> > Upfront I don't see any documentation that states that lwsync
> > distinguishes between cached and uncached accesses.
> > That's why I requested the mailing list for test results with
> > kernel load testing.
> 
> I suggest giving the reference manual a very careful read.  I wish I
> could be more helpful, but I found that a very long time ago, and no
> longer recall exactly where it was stated.
Will do that as soon as I get an opportunity.
> 
> But maybe Michael Ellerman has a pointer?
Sure. Maybe the cached and uncached accesses in these instructions should be
spelt out more clearly for newer people like me. :-)
Thanks for your time, Paul.
> 
>   Thanx, Paul
> 
> > > > +#else
> > > >  #define __rmb()  __asm__ __volatile__ ("sync" : : : "memory")
> > > >  #define __wmb()  __asm__ __volatile__ ("sync" : : : "memory")
> > > > +#endif
> > > >  
> > > >  /* The sub-arch has lwsync */
> > > >  #if defined(CONFIG_PPC64) || defined(CONFIG_PPC_E500MC)
> > > > -- 
> > > > 2.31.1
> > > > 


Re: [PATCH v2] arch/powerpc/include/asm/barrier.h: redefine rmb and wmb to lwsync

2023-02-22 Thread Kautuk Consul
On 2023-02-23 14:51:25, Michael Ellerman wrote:
> Hi Paul,
> 
> "Paul E. McKenney"  writes:
> > On Wed, Feb 22, 2023 at 02:33:44PM +0530, Kautuk Consul wrote:
> >> A link from ibm.com states:
> >> "Ensures that all instructions preceding the call to __lwsync
> >>  complete before any subsequent store instructions can be executed
> >>  on the processor that executed the function. Also, it ensures that
> >>  all load instructions preceding the call to __lwsync complete before
> >>  any subsequent load instructions can be executed on the processor
> >>  that executed the function. This allows you to synchronize between
> >>  multiple processors with minimal performance impact, as __lwsync
> >>  does not wait for confirmation from each processor."
> >> 
> >> Thats why smp_rmb() and smp_wmb() are defined to lwsync.
> >> But this same understanding applies to parallel pipeline
> >> execution on each PowerPC processor.
> >> So, use the lwsync instruction for rmb() and wmb() on the PPC
> >> architectures that support it.
> >> 
> >> Signed-off-by: Kautuk Consul 
> >> ---
> >>  arch/powerpc/include/asm/barrier.h | 7 +++
> >>  1 file changed, 7 insertions(+)
> >> 
> >> diff --git a/arch/powerpc/include/asm/barrier.h 
> >> b/arch/powerpc/include/asm/barrier.h
> >> index b95b666f0374..e088dacc0ee8 100644
> >> --- a/arch/powerpc/include/asm/barrier.h
> >> +++ b/arch/powerpc/include/asm/barrier.h
> >> @@ -36,8 +36,15 @@
> >>   * heavy-weight sync, so smp_wmb() can be a lighter-weight eieio.
> >>   */
> >>  #define __mb()   __asm__ __volatile__ ("sync" : : : "memory")
> >> +
> >> +/* The sub-arch has lwsync. */
> >> +#if defined(CONFIG_PPC64) || defined(CONFIG_PPC_E500MC)
> >> +#define __rmb() __asm__ __volatile__ ("lwsync" : : : "memory")
> >> +#define __wmb() __asm__ __volatile__ ("lwsync" : : : "memory")
> >
> > Hmmm...
> >
> > Does the lwsync instruction now order both cached and uncached accesses?
> 
> No.
> 
> > Or have there been changes so that smp_rmb() and smp_wmb() get this
> > definition, while rmb() and wmb() still get the sync instruction?
> 
> No.
> 
> They come from:
> 
> include/asm-generic/barrier.h:#define rmb()do { kcsan_rmb(); __rmb(); 
> } while (0)
> include/asm-generic/barrier.h:#define wmb()do { kcsan_wmb(); __wmb(); 
> } while (0)
> 
> > (Not seeing this, but I could easily be missing something.)
> 
> You are correct, the patch is wrong because it fails to account for IO
> accesses.
> 
> Kautuk, I'm not sure what motivated you to look at these barriers, was
> it just the documentation you linked to?
> 
> Maybe we need some better documentation in the kernel explaining why we
> use the barriers we do?
> 
> Although there's already a pretty decent comment above the definitions,
> but maybe it needs further clarification:
> 
>   /*
>* Memory barrier.
>* The sync instruction guarantees that all memory accesses initiated
>* by this processor have been performed (with respect to all other
>* mechanisms that access memory).  The eieio instruction is a barrier
>* providing an ordering (separately) for (a) cacheable stores and (b)
>* loads and stores to non-cacheable memory (e.g. I/O devices).
>*
>* mb() prevents loads and stores being reordered across this point.
>* rmb() prevents loads being reordered across this point.
>* wmb() prevents stores being reordered across this point.
>*
>* *mb() variants without smp_ prefix must order all types of memory
>* operations with one another. sync is the only instruction sufficient
>* to do this.
>*
>* For the smp_ barriers, ordering is for cacheable memory operations
>* only. We have to use the sync instruction for smp_mb(), since lwsync
>* doesn't order loads with respect to previous stores.  Lwsync can be
>* used for smp_rmb() and smp_wmb().

Sorry, I didn't change the comment.
My point is: is the "sync is the only instruction sufficient to do this"
comment completely correct?
As I mentioned in my reply to Paul, I didn't find any documentation that
states up front (among the differences between sync and lwsync) that
lwsync distinguishes between cached and uncached accesses.
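
To make the I/O point concrete, here is a hypothetical driver sketch (my
own example, not kernel code; my_dev, my_desc and DESC_READY are made up).
A store to a descriptor in cacheable RAM followed by an MMIO doorbell
write relies on wmb() being a full sync, because lwsync does not cover
accesses to caching-inhibited (I/O) storage:

/* Assumes kernel context with <linux/io.h>, <linux/types.h> and <asm/barrier.h>. */
#define DESC_READY	1

struct my_desc {
	u32 status;
};

struct my_dev {
	void __iomem *doorbell;
};

static void ring_doorbell(struct my_dev *dev, struct my_desc *desc, u32 val)
{
	desc->status = DESC_READY;	/* store to cacheable RAM */
	wmb();				/* must be sync: lwsync would not order the
					 * cacheable store against the
					 * caching-inhibited store below */
	writel(val, dev->doorbell);	/* store to I/O (caching-inhibited) space */
}

If wmb() were lwsync here, the device could observe the doorbell before the
descriptor update, which is the non-smp_ case the existing comment is
guarding against.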
> 
> 
> cheers


Re: [PATCH v2] arch/powerpc/include/asm/barrier.h: redefine rmb and wmb to lwsync

2023-02-22 Thread Kautuk Consul
On 2023-02-22 09:47:19, Paul E. McKenney wrote:
> On Wed, Feb 22, 2023 at 02:33:44PM +0530, Kautuk Consul wrote:
> > A link from ibm.com states:
> > "Ensures that all instructions preceding the call to __lwsync
> >  complete before any subsequent store instructions can be executed
> >  on the processor that executed the function. Also, it ensures that
> >  all load instructions preceding the call to __lwsync complete before
> >  any subsequent load instructions can be executed on the processor
> >  that executed the function. This allows you to synchronize between
> >  multiple processors with minimal performance impact, as __lwsync
> >  does not wait for confirmation from each processor."
> > 
> > Thats why smp_rmb() and smp_wmb() are defined to lwsync.
> > But this same understanding applies to parallel pipeline
> > execution on each PowerPC processor.
> > So, use the lwsync instruction for rmb() and wmb() on the PPC
> > architectures that support it.
> > 
> > Signed-off-by: Kautuk Consul 
> > ---
> >  arch/powerpc/include/asm/barrier.h | 7 +++
> >  1 file changed, 7 insertions(+)
> > 
> > diff --git a/arch/powerpc/include/asm/barrier.h 
> > b/arch/powerpc/include/asm/barrier.h
> > index b95b666f0374..e088dacc0ee8 100644
> > --- a/arch/powerpc/include/asm/barrier.h
> > +++ b/arch/powerpc/include/asm/barrier.h
> > @@ -36,8 +36,15 @@
> >   * heavy-weight sync, so smp_wmb() can be a lighter-weight eieio.
> >   */
> >  #define __mb()   __asm__ __volatile__ ("sync" : : : "memory")
> > +
> > +/* The sub-arch has lwsync. */
> > +#if defined(CONFIG_PPC64) || defined(CONFIG_PPC_E500MC)
> > +#define __rmb() __asm__ __volatile__ ("lwsync" : : : "memory")
> > +#define __wmb() __asm__ __volatile__ ("lwsync" : : : "memory")
> 
> Hmmm...
> 
> Does the lwsync instruction now order both cached and uncached accesses?
> Or have there been changes so that smp_rmb() and smp_wmb() get this
> definition, while rmb() and wmb() still get the sync instruction?
> (Not seeing this, but I could easily be missing something.)
> 
>   Thanx, Paul
Upfront I don't see any documentation that states that lwsync
distinguishes between cached and uncached accesses.
That's why I requested the mailing list for test results with
kernel load testing.
> 
> > +#else
> >  #define __rmb()  __asm__ __volatile__ ("sync" : : : "memory")
> >  #define __wmb()  __asm__ __volatile__ ("sync" : : : "memory")
> > +#endif
> >  
> >  /* The sub-arch has lwsync */
> >  #if defined(CONFIG_PPC64) || defined(CONFIG_PPC_E500MC)
> > -- 
> > 2.31.1
> > 


Re: [PATCH v2] arch/powerpc/include/asm/barrier.h: redefine rmb and wmb to lwsync

2023-02-22 Thread Kautuk Consul
> >> I'd have preferred with 'asm volatile' though.
> > Sorry about that! That wasn't the intent of this patch.
> > Probably another patch series should change this manner of #defining
> > assembly.
> 
> Why adding new line wrong then have to have another patch to make them 
> right ?
> 
> When you build a new house in an old village, you first build your house 
> with old materials and then you replace everything with new material ?
Sorry Christophe. I will take care next time to adhere to new
conventions.


Re: [PATCH v2] arch/powerpc/include/asm/barrier.h: redefine rmb and wmb to lwsync

2023-02-22 Thread Kautuk Consul
On Wed, Feb 22, 2023 at 09:44:54AM +, Christophe Leroy wrote:
> 
> 
> Le 22/02/2023 à 10:30, Kautuk Consul a écrit :
> > Again, could some IBM/non-IBM employees do basic sanity kernel load
> > testing on PPC64 UP and SMP systems for this patch?
> > would deeply appreciate it! :-)
> 
> And can 'non-IBM' 'non employees' do something ? :)
Anyone can help! :-)
> 
> > 
> > Thanks again!
> > 
> 
> Did you try on QEMU ?
I am a new IBM employee, so I don't have a setup with a cross-compiler,
rootfs, etc. at the moment.
That's why I requested this from the mailing list.


Re: [PATCH v2] arch/powerpc/include/asm/barrier.h: redefine rmb and wmb to lwsync

2023-02-22 Thread Kautuk Consul
> 
> Reviewed-by: Christophe Leroy 
Thanks!
> 
> > ---
> >   arch/powerpc/include/asm/barrier.h | 7 +++
> >   1 file changed, 7 insertions(+)
> > 
> > diff --git a/arch/powerpc/include/asm/barrier.h 
> > b/arch/powerpc/include/asm/barrier.h
> > index b95b666f0374..e088dacc0ee8 100644
> > --- a/arch/powerpc/include/asm/barrier.h
> > +++ b/arch/powerpc/include/asm/barrier.h
> > @@ -36,8 +36,15 @@
> >* heavy-weight sync, so smp_wmb() can be a lighter-weight eieio.
> >*/
> >   #define __mb()   __asm__ __volatile__ ("sync" : : : "memory")
> > +
> > +/* The sub-arch has lwsync. */
> > +#if defined(CONFIG_PPC64) || defined(CONFIG_PPC_E500MC)
> > +#define __rmb() __asm__ __volatile__ ("lwsync" : : : "memory")
> > +#define __wmb() __asm__ __volatile__ ("lwsync" : : : "memory")
> 
> I'd have preferred with 'asm volatile' though.
Sorry about that! That wasn't the intent of this patch.
A separate patch series should probably change the way these assembly
#defines are written.


Re: [PATCH v2] arch/powerpc/include/asm/barrier.h: redefine rmb and wmb to lwsync

2023-02-22 Thread Kautuk Consul
Again, could some IBM/non-IBM employees do basic sanity kernel load
testing on PPC64 UP and SMP systems for this patch?
I would deeply appreciate it! :-)

Thanks again!



Re: [PATCH] arch/powerpc/include/asm/barrier.h: redefine rmb and wmb to lwsync

2023-02-22 Thread Kautuk Consul
Sorry, sent the wrong patch!
Please ignore this one.
Sending the v2 in another email.

On Wed, Feb 22, 2023 at 02:31:12PM +0530, Kautuk Consul wrote:
> A link from ibm.com states:
> "Ensures that all instructions preceding the call to __lwsync
>  complete before any subsequent store instructions can be executed
>  on the processor that executed the function. Also, it ensures that
>  all load instructions preceding the call to __lwsync complete before
>  any subsequent load instructions can be executed on the processor
>  that executed the function. This allows you to synchronize between
>  multiple processors with minimal performance impact, as __lwsync
>  does not wait for confirmation from each processor."
> 
> Thats why smp_rmb() and smp_wmb() are defined to lwsync.
> But this same understanding applies to parallel pipeline
> execution on each PowerPC processor.
> So, use the lwsync instruction for rmb() and wmb() on the PPC
> architectures that support it.
> 
> Also removed some useless spaces.
> 
> Signed-off-by: Kautuk Consul 
> ---
>  arch/powerpc/include/asm/barrier.h | 12 +---
>  1 file changed, 9 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/barrier.h 
> b/arch/powerpc/include/asm/barrier.h
> index e80b2c0e9315..553f5a5d20bd 100644
> --- a/arch/powerpc/include/asm/barrier.h
> +++ b/arch/powerpc/include/asm/barrier.h
> @@ -41,11 +41,17 @@
> 
>  /* The sub-arch has lwsync */
>  #if defined(CONFIG_PPC64) || defined(CONFIG_PPC_E500MC)
> -#define SMPWMB  LWSYNC
> +#undef rmb
> +#undef wmb
> +/* Redefine rmb() to lwsync. */
> +#define rmb()({__asm__ __volatile__ ("lwsync" : : : "memory"); })
> +/* Redefine wmb() to lwsync. */
> +#define wmb()({__asm__ __volatile__ ("lwsync" : : : "memory"); })
> +#define SMPWMB  LWSYNC
>  #elif defined(CONFIG_BOOKE)
> -#define SMPWMB  mbar
> +#define SMPWMB  mbar
>  #else
> -#define SMPWMB  eieio
> +#define SMPWMB  eieio
>  #endif
> 
>  /* clang defines this macro for a builtin, which will not work with runtime 
> patching */
> -- 
> 2.31.1
> 


[PATCH v2] arch/powerpc/include/asm/barrier.h: redefine rmb and wmb to lwsync

2023-02-22 Thread Kautuk Consul
A link from ibm.com states:
"Ensures that all instructions preceding the call to __lwsync
 complete before any subsequent store instructions can be executed
 on the processor that executed the function. Also, it ensures that
 all load instructions preceding the call to __lwsync complete before
 any subsequent load instructions can be executed on the processor
 that executed the function. This allows you to synchronize between
 multiple processors with minimal performance impact, as __lwsync
 does not wait for confirmation from each processor."

That's why smp_rmb() and smp_wmb() are defined to lwsync.
But this same understanding applies to parallel pipeline
execution on each PowerPC processor.
So, use the lwsync instruction for rmb() and wmb() on the PPC
architectures that support it.

Signed-off-by: Kautuk Consul 
---
 arch/powerpc/include/asm/barrier.h | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/arch/powerpc/include/asm/barrier.h 
b/arch/powerpc/include/asm/barrier.h
index b95b666f0374..e088dacc0ee8 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -36,8 +36,15 @@
  * heavy-weight sync, so smp_wmb() can be a lighter-weight eieio.
  */
 #define __mb()   __asm__ __volatile__ ("sync" : : : "memory")
+
+/* The sub-arch has lwsync. */
+#if defined(CONFIG_PPC64) || defined(CONFIG_PPC_E500MC)
+#define __rmb() __asm__ __volatile__ ("lwsync" : : : "memory")
+#define __wmb() __asm__ __volatile__ ("lwsync" : : : "memory")
+#else
 #define __rmb()  __asm__ __volatile__ ("sync" : : : "memory")
 #define __wmb()  __asm__ __volatile__ ("sync" : : : "memory")
+#endif
 
 /* The sub-arch has lwsync */
 #if defined(CONFIG_PPC64) || defined(CONFIG_PPC_E500MC)
-- 
2.31.1



[PATCH] arch/powerpc/include/asm/barrier.h: redefine rmb and wmb to lwsync

2023-02-22 Thread Kautuk Consul
A link from ibm.com states:
"Ensures that all instructions preceding the call to __lwsync
 complete before any subsequent store instructions can be executed
 on the processor that executed the function. Also, it ensures that
 all load instructions preceding the call to __lwsync complete before
 any subsequent load instructions can be executed on the processor
 that executed the function. This allows you to synchronize between
 multiple processors with minimal performance impact, as __lwsync
 does not wait for confirmation from each processor."

That's why smp_rmb() and smp_wmb() are defined to lwsync.
But this same understanding applies to parallel pipeline
execution on each PowerPC processor.
So, use the lwsync instruction for rmb() and wmb() on the PPC
architectures that support it.

Also removed some useless spaces.

Signed-off-by: Kautuk Consul 
---
 arch/powerpc/include/asm/barrier.h | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/barrier.h 
b/arch/powerpc/include/asm/barrier.h
index e80b2c0e9315..553f5a5d20bd 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -41,11 +41,17 @@
 
 /* The sub-arch has lwsync */
 #if defined(CONFIG_PPC64) || defined(CONFIG_PPC_E500MC)
-#define SMPWMB  LWSYNC
+#undef rmb
+#undef wmb
+/* Redefine rmb() to lwsync. */
+#define rmb()  ({__asm__ __volatile__ ("lwsync" : : : "memory"); })
+/* Redefine wmb() to lwsync. */
+#define wmb()  ({__asm__ __volatile__ ("lwsync" : : : "memory"); })
+#define SMPWMB  LWSYNC
 #elif defined(CONFIG_BOOKE)
-#define SMPWMB  mbar
+#define SMPWMB  mbar
 #else
-#define SMPWMB  eieio
+#define SMPWMB  eieio
 #endif
 
 /* clang defines this macro for a builtin, which will not work with runtime 
patching */
-- 
2.31.1



Re: [PATCH] arch/powerpc/include/asm/barrier.h: redefine rmb and wmb to lwsync

2023-02-22 Thread Kautuk Consul
 
> No, I don't mean to use the existing #ifdef/elif/else.
> 
> Define an #ifdef /#else dedicated to xmb macros.
> 
> Something like that:
> 
> @@ -35,9 +35,15 @@
>* However, on CPUs that don't support lwsync, lwsync actually maps to a
>* heavy-weight sync, so smp_wmb() can be a lighter-weight eieio.
>*/
> +#if defined(CONFIG_PPC64) || defined(CONFIG_PPC_E500MC)
> +#define __mb()   asm volatile ("lwsync" : : : "memory")
> +#define __rmb()  asm volatile ("lwsync" : : : "memory")
> +#define __wmb()  asm volatile ("lwsync" : : : "memory")
> +#else
>   #define __mb()   __asm__ __volatile__ ("sync" : : : "memory")
>   #define __rmb()  __asm__ __volatile__ ("sync" : : : "memory")
>   #define __wmb()  __asm__ __volatile__ ("sync" : : : "memory")
> +#endif
Ok. Got it. Will do.

> >> Shouldn't you also consider the same for mb() ?
> > My change wasn't meant to address newer usages of as volatile
> > #defines. I just wanted to redefine the rmb and wmb #defines
> > to lwsync.
> 
> That's my point, why not also redefine mb to lwsync ?
That would be incorrect. lwsync orders load-load, load-store and
store-store pairs, but it does not order a store followed by a later load.
mb() has to order all of those combinations, so the sync instruction is
correct for that one.
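
As a concrete illustration (a hypothetical sketch, not kernel code), the
classic store-buffering pattern shows why a store followed by a load needs
the full barrier:

/* Assumes kernel context with <linux/compiler.h> and <asm/barrier.h>.
 * x and y both start as 0; cpu0() and cpu1() run concurrently on
 * different CPUs.
 */
static int x, y;
static int r0, r1;

static void cpu0(void)
{
	WRITE_ONCE(x, 1);
	smp_mb();	/* needs sync: orders the store to x before the
			 * load of y; lwsync would not do this */
	r0 = READ_ONCE(y);
}

static void cpu1(void)
{
	WRITE_ONCE(y, 1);
	smp_mb();
	r1 = READ_ONCE(x);
}

With both barriers in place the outcome r0 == 0 && r1 == 0 is forbidden;
with lwsync instead of sync it would still be allowed.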
> 
> >>
> >> Note that your series will conflict with b6e259297a6b ("powerpc/kcsan:
> >> Memory barriers semantics") in powerpc/next tree.
> > I thought of defining the __rmb and __wmb macros but I decided against
> > it because I didn't understand kcsan completely.
> > I used the standard Linus' tree, not powerpc/next.
> > Can you tell me which branch or git repo I should make this patch on ?
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git
> 
> 'merge' branch is a merge of branches 'master', 'fixes' and 'next'.
> 
> That's the branch to use, allthough it is not always in sync with fixes 
> and next, in that case all you have to do is to locally merge 'next' and 
> 'fixes' branch until it's done remotely.
> 
> But just using 'next' branch also works most of the time.
> 
> Note that 'next' branch should already be part of linux-next so you may 
> also use linux-next if you prefer.
> 
Will send another patch for this.
Thanks, I will use the linux-next branch of this git repo.


Re: [PATCH] arch/powerpc/include/asm/barrier.h: redefine rmb and wmb to lwsync

2023-02-22 Thread Kautuk Consul
On Wed, Feb 22, 2023 at 08:28:19AM +, Christophe Leroy wrote:
> 
> 
> Le 22/02/2023 à 09:21, Kautuk Consul a écrit :
> >> On Wed, Feb 22, 2023 at 07:02:34AM +, Christophe Leroy wrote:
> >>>> +/* Redefine rmb() to lwsync. */
> >>>
> >>> WHat's the added value of this comment ? Isn't it obvious in the line
> >>> below that rmb() is being defined to lwsync ? Please avoid useless 
> >>> comments.
> >> Sure.
> > Sorry, forgot to add that I wasn't adding this useless comment.
> > Its just that checkpatch.pl complains that the memory barrier #define
> > doesn't have a comment for it.
> >>>
> 
> See https://docs.kernel.org/dev-tools/checkpatch.html, it says:
> 
> Checkpatch is not always right. Your judgement takes precedence over 
> checkpatch messages. If your code looks better with the violations, then 
> its probably best left alone.
> 
> checkpatch wants a comment for uses of memory barriers. Here I think it 
> is a false positive.
Cool. I will make the changes you mentioned.
Can you tell me which branch or git repo I should rebase this patch on ?


Re: [PATCH] arch/powerpc/include/asm/barrier.h: redefine rmb and wmb to lwsync

2023-02-22 Thread Kautuk Consul
> On Wed, Feb 22, 2023 at 07:02:34AM +, Christophe Leroy wrote:
> > > +/* Redefine rmb() to lwsync. */
> > 
> > WHat's the added value of this comment ? Isn't it obvious in the line 
> > below that rmb() is being defined to lwsync ? Please avoid useless comments.
> Sure.
Sorry, forgot to add that I wasn't adding this useless comment.
It's just that checkpatch.pl complains that the memory barrier #define
doesn't have a comment for it.
> > 


Re: [PATCH] arch/powerpc/include/asm/barrier.h: redefine rmb and wmb to lwsync

2023-02-22 Thread Kautuk Consul
On Wed, Feb 22, 2023 at 07:02:34AM +, Christophe Leroy wrote:
> 
> 
> Le 22/02/2023 à 07:01, Kautuk Consul a écrit :
> > A link from ibm.com states:
> > "Ensures that all instructions preceding the call to __lwsync
> >   complete before any subsequent store instructions can be executed
> >   on the processor that executed the function. Also, it ensures that
> >   all load instructions preceding the call to __lwsync complete before
> >   any subsequent load instructions can be executed on the processor
> >   that executed the function. This allows you to synchronize between
> >   multiple processors with minimal performance impact, as __lwsync
> >   does not wait for confirmation from each processor."
> > 
> > Thats why smp_rmb() and smp_wmb() are defined to lwsync.
> > But this same understanding applies to parallel pipeline
> > execution on each PowerPC processor.
> > So, use the lwsync instruction for rmb() and wmb() on the PPC
> > architectures that support it.
> > 
> > Also removed some useless spaces.
> > 
> > Signed-off-by: Kautuk Consul 
> > ---
> >   arch/powerpc/include/asm/barrier.h | 12 +---
> >   1 file changed, 9 insertions(+), 3 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/barrier.h 
> > b/arch/powerpc/include/asm/barrier.h
> > index e80b2c0e9315..553f5a5d20bd 100644
> > --- a/arch/powerpc/include/asm/barrier.h
> > +++ b/arch/powerpc/include/asm/barrier.h
> > @@ -41,11 +41,17 @@
> >   
> >   /* The sub-arch has lwsync */
> >   #if defined(CONFIG_PPC64) || defined(CONFIG_PPC_E500MC)
> > -#define SMPWMB  LWSYNC
> 
> This line shouldn't be changed by your patch
I mentioned it in the commit message.
But if you want I'll drop this change. I did it because the rest
of the file doesn't use these spaces.
> 
> > +#undef rmb
> > +#undef wmb
> 
> I see no point with defining something and undefining them a few lines 
> later.
> 
> Instead, why not do:
> 
> #define mb()   __asm__ __volatile__ ("sync" : : : "memory")
> 
> #if defined(CONFIG_PPC64) || defined(CONFIG_PPC_E500MC)
> #define rmb() ({__asm__ __volatile__ ("lwsync" : : : "memory"); })
> #define wmb() ({__asm__ __volatile__ ("lwsync" : : : "memory"); })
> #else
> #define rmb()  __asm__ __volatile__ ("sync" : : : "memory")
> #define wmb()  __asm__ __volatile__ ("sync" : : : "memory")
> #endif
> 
I thought of doing that earlier, but there is one more #elif
for CONFIG_BOOKE and then the #else.
That would mean three different sets of #defines for rmb and wmb,
and I wanted to avoid that.
> By the way, why put it inside a ({ }) ?
checkpatch.pl asks for ({}).
> And I think nowdays we use "asm volatile" not "__asm__ __volatile__"
I was just following what was there in the file already.
Can change this if required.
> 
> Shouldn't you also consider the same for mb() ?
My change wasn't meant to switch these #defines over to the newer
'asm volatile' usage. I just wanted to redefine the rmb and wmb #defines
to lwsync.
> 
> Note that your series will conflict with b6e259297a6b ("powerpc/kcsan: 
> Memory barriers semantics") in powerpc/next tree.
I thought of defining the __rmb and __wmb macros but decided against
it because I didn't understand kcsan completely.
I used Linus' mainline tree, not powerpc/next.
Can you tell me which branch or git repo I should base this patch on ?
> 
> > +/* Redefine rmb() to lwsync. */
> 
> WHat's the added value of this comment ? Isn't it obvious in the line 
> below that rmb() is being defined to lwsync ? Please avoid useless comments.
Sure.
> 
> > +#define rmb()  ({__asm__ __volatile__ ("lwsync" : : : "memory"); })
> > +/* Redefine wmb() to lwsync. */
> > +#define wmb()  ({__asm__ __volatile__ ("lwsync" : : : "memory"); })
> > +#define SMPWMB  LWSYNC
> >   #elif defined(CONFIG_BOOKE)
> > -#define SMPWMB  mbar
> 
> This line shouldn't be changed by your patch
> 
> > +#define SMPWMB  mbar
> >   #else
> > -#define SMPWMB  eieio
Ok. Can change my patch.
> 
> This line shouldn't be changed by your patch
> 
> > +#define SMPWMB  eieio
> >   #endif
Sure. Will retain this too.
> >   
> >   /* clang defines this macro for a builtin, which will not work with 
> > runtime patching */


Re: [PATCH] arch/powerpc/include/asm/barrier.h: redefine rmb and wmb to lwsync

2023-02-21 Thread Kautuk Consul
Hi All,

On Wed, Feb 22, 2023 at 11:31:07AM +0530, Kautuk Consul wrote:
>  /* The sub-arch has lwsync */
>  #if defined(CONFIG_PPC64) || defined(CONFIG_PPC_E500MC)
> -#define SMPWMB  LWSYNC
> +#undef rmb
> +#undef wmb
> +/* Redefine rmb() to lwsync. */
> +#define rmb()({__asm__ __volatile__ ("lwsync" : : : "memory"); })
> +/* Redefine wmb() to lwsync. */
> +#define wmb()({__asm__ __volatile__ ("lwsync" : : : "memory"); })
> +#define SMPWMB  LWSYNC
>  #elif defined(CONFIG_BOOKE)
> -#define SMPWMB  mbar
> +#define SMPWMB  mbar
>  #else
> -#define SMPWMB  eieio
> +#define SMPWMB  eieio
>  #endif

I think I am conceptually right about this patch, but I currently lack
the resources to test this out on PowerPC 64-bit servers.

I request IBM/non-IBM folks to test this patch out for:
a) functional breakage: this patch is no good if it breaks current
   kernel functionality.
b) performance impact: if nothing breaks, can anyone do some reliable
   kernel load testing on ppc64 servers to see whether there is any
   significant performance gain ?

Thanks a lot!


[PATCH] arch/powerpc/include/asm/barrier.h: redefine rmb and wmb to lwsync

2023-02-21 Thread Kautuk Consul
A link from ibm.com states:
"Ensures that all instructions preceding the call to __lwsync
 complete before any subsequent store instructions can be executed
 on the processor that executed the function. Also, it ensures that
 all load instructions preceding the call to __lwsync complete before
 any subsequent load instructions can be executed on the processor
 that executed the function. This allows you to synchronize between
 multiple processors with minimal performance impact, as __lwsync
 does not wait for confirmation from each processor."

That's why smp_rmb() and smp_wmb() are defined to lwsync.
But this same understanding applies to parallel pipeline
execution on each PowerPC processor.
So, use the lwsync instruction for rmb() and wmb() on the PPC
architectures that support it.

Also removed some useless spaces.

Signed-off-by: Kautuk Consul 
---
 arch/powerpc/include/asm/barrier.h | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/barrier.h 
b/arch/powerpc/include/asm/barrier.h
index e80b2c0e9315..553f5a5d20bd 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -41,11 +41,17 @@
 
 /* The sub-arch has lwsync */
 #if defined(CONFIG_PPC64) || defined(CONFIG_PPC_E500MC)
-#define SMPWMB  LWSYNC
+#undef rmb
+#undef wmb
+/* Redefine rmb() to lwsync. */
+#define rmb()  ({__asm__ __volatile__ ("lwsync" : : : "memory"); })
+/* Redefine wmb() to lwsync. */
+#define wmb()  ({__asm__ __volatile__ ("lwsync" : : : "memory"); })
+#define SMPWMB  LWSYNC
 #elif defined(CONFIG_BOOKE)
-#define SMPWMB  mbar
+#define SMPWMB  mbar
 #else
-#define SMPWMB  eieio
+#define SMPWMB  eieio
 #endif
 
 /* clang defines this macro for a builtin, which will not work with runtime 
patching */
-- 
2.31.1



Re: [PATCH 1/2] arch/powerpc/kvm: kvmppc_hv_entry: remove .global scope

2023-02-20 Thread Kautuk Consul
On Mon, Feb 20, 2023 at 01:41:38PM +0530, Kautuk Consul wrote:
> On Mon, Feb 20, 2023 at 01:31:40PM +0530, Sathvika Vasireddy wrote:
> > Placing SYM_FUNC_END(kvmppc_hv_entry) before kvmppc_got_guest() should do:
> > 
> > @@ -502,12 +500,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
> > * *
> > */
> > 
> > -.global kvmppc_hv_entry
> > -kvmppc_hv_entry:
> > +SYM_FUNC_START_LOCAL(kvmppc_hv_entry)
> > 
> >     /* Required state:
> >  *
> > -    * R4 = vcpu pointer (or NULL)
> >  * MSR = ~IR|DR
> >  * R13 = PACA
> >  * R1 = host R1
> > @@ -525,6 +521,8 @@ kvmppc_hv_entry:
> >     li  r6, KVM_GUEST_MODE_HOST_HV
> >     stb r6, HSTATE_IN_GUEST(r13)
> > 
> > +   ld  r4, HSTATE_KVM_VCPU(r13)
> > +
> >  #ifdef CONFIG_KVM_BOOK3S_HV_P8_TIMING
> >     /* Store initial timestamp */
> >     cmpdi   r4, 0
> > @@ -619,6 +617,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
> >     /* Do we have a guest vcpu to run? */
> >  10:    cmpdi   r4, 0
> >     beq kvmppc_primary_no_guest
> > +SYM_FUNC_END(kvmppc_hv_entry)

Just one question, though. I went through the code again, and I don't
think this is the proper place to insert the SYM_FUNC_END, because we
haven't entered the guest at this point and the function is named
kvmppc_hv_entry, which I think implies that the SYM_FUNC_END should go
somewhere after the HRFI_TO_GUEST.

What do you think?

> > +
> >  kvmppc_got_guest:
> >     /* Increment yield count if they have a VPA */
> >     ld  r3, VCPU_VPA(r4)
> > 
> 
> Thanks! Will send out a v2 after I get some response for
> PATCH 2/2 with comments.
> > 
> > Thanks,
> > Sathvika


Re: [PATCH 1/2] arch/powerpc/kvm: kvmppc_hv_entry: remove .global scope

2023-02-20 Thread Kautuk Consul
On Mon, Feb 20, 2023 at 01:31:40PM +0530, Sathvika Vasireddy wrote:
> Placing SYM_FUNC_END(kvmppc_hv_entry) before kvmppc_got_guest() should do:
> 
> @@ -502,12 +500,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
> * *
> */
> 
> -.global kvmppc_hv_entry
> -kvmppc_hv_entry:
> +SYM_FUNC_START_LOCAL(kvmppc_hv_entry)
> 
>     /* Required state:
>  *
> -    * R4 = vcpu pointer (or NULL)
>  * MSR = ~IR|DR
>  * R13 = PACA
>  * R1 = host R1
> @@ -525,6 +521,8 @@ kvmppc_hv_entry:
>     li  r6, KVM_GUEST_MODE_HOST_HV
>     stb r6, HSTATE_IN_GUEST(r13)
> 
> +   ld  r4, HSTATE_KVM_VCPU(r13)
> +
>  #ifdef CONFIG_KVM_BOOK3S_HV_P8_TIMING
>     /* Store initial timestamp */
>     cmpdi   r4, 0
> @@ -619,6 +617,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
>     /* Do we have a guest vcpu to run? */
>  10:    cmpdi   r4, 0
>     beq kvmppc_primary_no_guest
> +SYM_FUNC_END(kvmppc_hv_entry)
> +
>  kvmppc_got_guest:
>     /* Increment yield count if they have a VPA */
>     ld  r3, VCPU_VPA(r4)
> 

Thanks! Will send out a v2 after I get some response for
PATCH 2/2 with comments.
> 
> Thanks,
> Sathvika


Re: [PATCH 1/2] arch/powerpc/kvm: kvmppc_hv_entry: remove .global scope

2023-02-19 Thread Kautuk Consul
Hi Sathvika,

(Sorry didn't include list in earlier email.)

On Mon, Feb 20, 2023 at 12:35:09PM +0530, Sathvika Vasireddy wrote:
> Hi Kautuk,
> 
> On 20/02/23 10:53, Kautuk Consul wrote:
> > kvmppc_hv_entry isn't called from anywhere other than
> > book3s_hv_rmhandlers.S itself. Removing .global scope for
> > this function.
> > 
> > Signed-off-by: Kautuk Consul 
> > ---
> >   arch/powerpc/kvm/book3s_hv_rmhandlers.S | 1 -
> >   1 file changed, 1 deletion(-)
> > 
> > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> > b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > index acf80915f406..7e063fde7adc 100644
> > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > @@ -502,7 +502,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
> >*
> > *
> >
> > */
> > -.global kvmppc_hv_entry
> >   kvmppc_hv_entry:
> > /* Required state:
> I see the following objtool warning with this patch applied.
> arch/powerpc/kvm/book3s_hv_rmhandlers.o: warning: objtool: .text+0x48:
> unannotated intra-function call
> 
> Annotating kvmppc_hv_entry symbol with SYM_FUNC_START_LOCAL and SYM_FUNC_END
> macros should help fix this warning.

I'm not sure where to put the SYM_FUNC_END annotation.
Will the following do:

ld  r0, VCPU_GPR(R0)(r4)
ld  r2, VCPU_GPR(R2)(r4)
ld  r3, VCPU_GPR(R3)(r4)
ld  r4, VCPU_GPR(R4)(r4)
HRFI_TO_GUEST
b   .

SYM_FUNC_END(kvmppc_hv_entry)

secondary_too_late:
li  r12, 0

?

Thanks.

> 
> Thanks,
> Sathvika
> 
> 


[PATCH 2/2] arch/powerpc/kvm: kvmppc_hv_entry: remove r4 argument

2023-02-19 Thread Kautuk Consul
kvmppc_hv_entry is called from only two locations within
book3s_hv_rmhandlers.S. Both of those locations load r4 from
HSTATE_KVM_VCPU(r13) before calling kvmppc_hv_entry. So move
that load into kvmppc_hv_entry itself, changing the calling
convention of this function.

Signed-off-by: Kautuk Consul 
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 7e063fde7adc..922667b09168 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -85,7 +85,7 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
RFI_TO_KERNEL
 
 kvmppc_call_hv_entry:
-   ld  r4, HSTATE_KVM_VCPU(r13)
+   /* Enter guest. */
bl  kvmppc_hv_entry
 
/* Back from guest - restore host state and return to caller */
@@ -352,9 +352,7 @@ kvm_secondary_got_guest:
mtspr   SPRN_LDBAR, r0
isync
 63:
-   /* Order load of vcpu after load of vcore */
-   lwsync
-   ld  r4, HSTATE_KVM_VCPU(r13)
+   /* Enter guest. */
bl  kvmppc_hv_entry
 
/* Back from the guest, go back to nap */
@@ -506,7 +504,6 @@ kvmppc_hv_entry:
 
/* Required state:
 *
-* R4 = vcpu pointer (or NULL)
 * MSR = ~IR|DR
 * R13 = PACA
 * R1 = host R1
@@ -524,6 +521,8 @@ kvmppc_hv_entry:
li  r6, KVM_GUEST_MODE_HOST_HV
stb r6, HSTATE_IN_GUEST(r13)
 
+   ld  r4, HSTATE_KVM_VCPU(r13)
+
 #ifdef CONFIG_KVM_BOOK3S_HV_P8_TIMING
/* Store initial timestamp */
cmpdi   r4, 0
-- 
2.31.1



[PATCH 1/2] arch/powerpc/kvm: kvmppc_hv_entry: remove .global scope

2023-02-19 Thread Kautuk Consul
kvmppc_hv_entry isn't called from anywhere other than
book3s_hv_rmhandlers.S itself. Remove the .global scope from
this function.

Signed-off-by: Kautuk Consul 
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index acf80915f406..7e063fde7adc 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -502,7 +502,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
  **
  */
 
-.global kvmppc_hv_entry
 kvmppc_hv_entry:
 
/* Required state:
-- 
2.31.1



[PATCH 0/2] Improving calls to kvmppc_hv_entry

2023-02-19 Thread Kautuk Consul
- remove .global scope of kvmppc_hv_entry
- remove r4 argument to kvmppc_hv_entry as it is not required

Kautuk Consul (2):
  arch/powerpc/kvm: kvmppc_hv_entry: remove .global scope
  arch/powerpc/kvm: kvmppc_hv_entry: remove r4 argument

 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

-- 
2.31.1