Re: [PATCH V12 08/14] riscv: qspinlock: Force virt_spin_lock for KVM guests

2024-01-03 Thread Leonardo Bras
On Mon, Dec 25, 2023 at 07:58:41AM -0500, guo...@kernel.org wrote:
> From: Guo Ren 
> 
> Force virt_spin_lock to be enabled when running as a KVM guest, because
> fair locks have horrible lock 'holder' preemption issues.
> 
> Suggested-by: Leonardo Bras 
> Link: https://lkml.kernel.org/kvm/zqk9-tn2mepxl...@redhat.com/
> Signed-off-by: Guo Ren 
> Signed-off-by: Guo Ren 
> ---
>  arch/riscv/include/asm/sbi.h | 8 
>  arch/riscv/kernel/sbi.c  | 2 +-
>  arch/riscv/kernel/setup.c| 6 +-
>  3 files changed, 14 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h
> index 0892f4421bc4..8f748d9e1b85 100644
> --- a/arch/riscv/include/asm/sbi.h
> +++ b/arch/riscv/include/asm/sbi.h
> @@ -51,6 +51,13 @@ enum sbi_ext_base_fid {
>   SBI_EXT_BASE_GET_MIMPID,
>  };
>  
> +enum sbi_ext_base_impl_id {
> + SBI_EXT_BASE_IMPL_ID_BBL = 0,
> + SBI_EXT_BASE_IMPL_ID_OPENSBI,
> + SBI_EXT_BASE_IMPL_ID_XVISOR,
> + SBI_EXT_BASE_IMPL_ID_KVM,
> +};
> +
>  enum sbi_ext_time_fid {
>   SBI_EXT_TIME_SET_TIMER = 0,
>  };
> @@ -276,6 +283,7 @@ int sbi_console_getchar(void);
>  long sbi_get_mvendorid(void);
>  long sbi_get_marchid(void);
>  long sbi_get_mimpid(void);
> +long sbi_get_firmware_id(void);
>  void sbi_set_timer(uint64_t stime_value);
>  void sbi_shutdown(void);
>  void sbi_send_ipi(unsigned int cpu);
> diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c
> index 5a62ed1da453..4330aedf65fd 100644
> --- a/arch/riscv/kernel/sbi.c
> +++ b/arch/riscv/kernel/sbi.c
> @@ -543,7 +543,7 @@ static inline long sbi_get_spec_version(void)
>   return __sbi_base_ecall(SBI_EXT_BASE_GET_SPEC_VERSION);
>  }
>  
> -static inline long sbi_get_firmware_id(void)
> +long sbi_get_firmware_id(void)
>  {
>   return __sbi_base_ecall(SBI_EXT_BASE_GET_IMP_ID);
>  }
> diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
> index 0bafb9fd6ea3..e33430e9d97e 100644
> --- a/arch/riscv/kernel/setup.c
> +++ b/arch/riscv/kernel/setup.c
> @@ -281,6 +281,9 @@ DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
>  
>  static void __init virt_spin_lock_init(void)
>  {
> + if (sbi_get_firmware_id() != SBI_EXT_BASE_IMPL_ID_KVM)
> + no_virt_spin = true;
> +
>   if (no_virt_spin)
>   static_branch_disable(&virt_spin_lock_key);
>   else
> @@ -290,7 +293,8 @@ static void __init virt_spin_lock_init(void)
>  
>  static void __init riscv_spinlock_init(void)
>  {
> - if (!enable_qspinlock) {
> + if ((!enable_qspinlock) &&
> + (sbi_get_firmware_id() != SBI_EXT_BASE_IMPL_ID_KVM)) {
>   static_branch_disable(&combo_qspinlock_key);
>   pr_info("Ticket spinlock: enabled\n");
>   } else {
> -- 
> 2.40.1
> 

LGTM:
Reviewed-by: Leonardo Bras 




Re: [PATCH V12 07/14] riscv: qspinlock: Add virt_spin_lock() support for VM guest

2024-01-03 Thread Leonardo Bras
On Mon, Dec 25, 2023 at 07:58:40AM -0500, guo...@kernel.org wrote:
> From: Guo Ren 
> 
> Add a static key controlling whether virt_spin_lock() should be
> called or not. When running on bare metal set the new key to
> false.
> 
> The VM guests should fall back to a Test-and-Set spinlock,
> because fair locks have horrible lock 'holder' preemption issues.
> The virt_spin_lock_key provides a shortcut ahead of the
> queued_spin_lock_slowpath() function, allowing virt_spin_lock() to
> hijack it.
> 
> Signed-off-by: Guo Ren 
> Signed-off-by: Guo Ren 
> ---
>  .../admin-guide/kernel-parameters.txt |  4 +++
>  arch/riscv/include/asm/spinlock.h | 22 
>  arch/riscv/kernel/setup.c | 26 +++
>  3 files changed, 52 insertions(+)
> 
> diff --git a/Documentation/admin-guide/kernel-parameters.txt 
> b/Documentation/admin-guide/kernel-parameters.txt
> index 2ac9f1511774..b7794c96d91e 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -3997,6 +3997,10 @@
>   no_uaccess_flush
>   [PPC] Don't flush the L1-D cache after accessing user 
> data.
>  
> +	no_virt_spin	[RISC-V] Disable virt_spin_lock in VM guest to use
> +			native_queued_spinlock when the nopvspin option is
> +			enabled. This would help vcpu=pcpu scenarios.
> +
>   novmcoredd  [KNL,KDUMP]
>   Disable device dump. Device dump allows drivers to
>   append dump data to vmcore so you can collect driver
> diff --git a/arch/riscv/include/asm/spinlock.h 
> b/arch/riscv/include/asm/spinlock.h
> index d07643c07aae..7bbcf3d9fff0 100644
> --- a/arch/riscv/include/asm/spinlock.h
> +++ b/arch/riscv/include/asm/spinlock.h
> @@ -4,6 +4,28 @@
>  #define __ASM_RISCV_SPINLOCK_H
>  
>  #ifdef CONFIG_QUEUED_SPINLOCKS
> +/*
> + * The KVM guests fall back to a Test-and-Set spinlock, because fair locks
> + * have horrible lock 'holder' preemption issues. The virt_spin_lock_key
> + * would shortcut for the queued_spin_lock_slowpath() function that allow
> + * virt_spin_lock to hijack it.
> + */
> +DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key);
> +
> +#define virt_spin_lock virt_spin_lock
> +static inline bool virt_spin_lock(struct qspinlock *lock)
> +{
> + if (!static_branch_likely(&virt_spin_lock_key))
> + return false;
> +
> + do {
> + while (atomic_read(&lock->val) != 0)
> + cpu_relax();
> + } while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0);
> +
> + return true;
> +}
> +
>  #define _Q_PENDING_LOOPS (1 << 9)
>  #endif
>  
> diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
> index d9072a59831c..0bafb9fd6ea3 100644
> --- a/arch/riscv/kernel/setup.c
> +++ b/arch/riscv/kernel/setup.c
> @@ -27,6 +27,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -266,6 +267,27 @@ early_param("qspinlock", queued_spinlock_setup);
>  DEFINE_STATIC_KEY_TRUE(combo_qspinlock_key);
>  EXPORT_SYMBOL(combo_qspinlock_key);
>  
> +#ifdef CONFIG_QUEUED_SPINLOCKS
> +static bool no_virt_spin __ro_after_init;
> +static int __init no_virt_spin_setup(char *p)
> +{
> + no_virt_spin = true;
> +
> + return 0;
> +}
> +early_param("no_virt_spin", no_virt_spin_setup);
> +
> +DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
> +
> +static void __init virt_spin_lock_init(void)
> +{
> + if (no_virt_spin)
> + static_branch_disable(&virt_spin_lock_key);
> + else
> + pr_info("Enable virt_spin_lock\n");
> +}
> +#endif
> +
>  static void __init riscv_spinlock_init(void)
>  {
>   if (!enable_qspinlock) {
> @@ -274,6 +296,10 @@ static void __init riscv_spinlock_init(void)
>   } else {
>   pr_info("Queued spinlock: enabled\n");
>   }
> +
> +#ifdef CONFIG_QUEUED_SPINLOCKS
> + virt_spin_lock_init();
> +#endif
>  }
>  #endif
>  
> -- 
> 2.40.1
> 

LGTM:
Reviewed-by: Leonardo Bras 




Re: [PATCH V12 06/14] riscv: qspinlock: Introduce combo spinlock

2024-01-03 Thread Leonardo Bras
On Mon, Dec 25, 2023 at 07:58:39AM -0500, guo...@kernel.org wrote:
> From: Guo Ren 
> 
> The combo spinlock can support both queued and ticket spinlocks in one
> Linux image, selected at boot time via the command line. Here is the
> function size (bytes) comparison table:
> 
> TYPE  : COMBO | TICKET | QUEUED
> arch_spin_lock: 106   | 60 | 50
> arch_spin_unlock  : 54| 36 | 26
> arch_spin_trylock : 110   | 72 | 54
> arch_spin_is_locked   : 48| 34 | 20
> arch_spin_is_contended: 56| 40 | 24
> arch_spin_value_unlocked  : 48| 34 | 24
> 
> One example of the disassembled combo arch_spin_unlock:
>   <+14>:nop# detour slot
>   <+18>:fence   rw,w   --+-> queued_spin_unlock
>   <+22>:sb  zero,0(a4) --+   (2 instructions)
>   <+26>:ld  s0,8(sp)
>   <+28>:addisp,sp,16
>   <+30>:ret
>   <+32>:lw  a5,0(a4)   --+-> ticket_spin_unlock
>   <+34>:sext.w  a5,a5|   (7 instructions)
>   <+36>:fence   rw,w |
>   <+40>:addiw   a5,a5,1  |
>   <+42>:sllia5,a5,0x30   |
>   <+44>:srlia5,a5,0x30   |
>   <+46>:sh  a5,0(a4)   --+
>   <+50>:ld  s0,8(sp)
>   <+52>:addisp,sp,16
>   <+54>:ret
> The qspinlock is smaller and faster than ticket-lock when both are in
> the fast path.
> 
> The combo spinlock can provide a single compatible Linux image for
> micro-architectures that do or do not have a forward progress guarantee.
> Command line options select between qspinlock and ticket-lock; the
> default is ticket-lock.
> 
> Signed-off-by: Guo Ren 
> Signed-off-by: Guo Ren 
> ---
>  .../admin-guide/kernel-parameters.txt |  2 +
>  arch/riscv/Kconfig|  9 +++-
>  arch/riscv/include/asm/spinlock.h | 48 +++
>  arch/riscv/kernel/setup.c | 34 +
>  include/asm-generic/qspinlock.h   |  2 +
>  include/asm-generic/ticket_spinlock.h |  2 +
>  6 files changed, 96 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/admin-guide/kernel-parameters.txt 
> b/Documentation/admin-guide/kernel-parameters.txt
> index 65731b060e3f..2ac9f1511774 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -4753,6 +4753,8 @@
>   [KNL] Number of legacy pty's. Overwrites compiled-in
>   default number.
>  
> + qspinlock   [RISCV] Use native qspinlock.
> +
>   quiet   [KNL] Disable most log messages
>  
>   r128=   [HW,DRM]
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index f345df0763b2..b7673c5c0997 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -434,7 +434,7 @@ config NODES_SHIFT
>  
>  choice
>   prompt "RISC-V spinlock type"
> - default RISCV_TICKET_SPINLOCKS
> + default RISCV_COMBO_SPINLOCKS
>  
>  config RISCV_TICKET_SPINLOCKS
>   bool "Using ticket spinlock"
> @@ -446,6 +446,13 @@ config RISCV_QUEUED_SPINLOCKS
>   help
> Make sure your micro arch give cmpxchg/xchg forward progress
> guarantee. Otherwise, stay at ticket-lock.
> +
> +config RISCV_COMBO_SPINLOCKS
> + bool "Using combo spinlock"
> + depends on SMP && MMU
> + select ARCH_USE_QUEUED_SPINLOCKS
> + help
> +   Select queued spinlock or ticket-lock by cmdline.
>  endchoice
>  
>  config RISCV_ALTERNATIVE
> diff --git a/arch/riscv/include/asm/spinlock.h 
> b/arch/riscv/include/asm/spinlock.h
> index 98a3da4b1056..d07643c07aae 100644
> --- a/arch/riscv/include/asm/spinlock.h
> +++ b/arch/riscv/include/asm/spinlock.h
> @@ -7,12 +7,60 @@
>  #define _Q_PENDING_LOOPS (1 << 9)
>  #endif
>  
> +#ifdef CONFIG_RISCV_COMBO_SPINLOCKS
> +#define __no_arch_spinlock_redefine
> +#include 
> +#include 
> +#include 
> +
> +DECLARE_STATIC_KEY_TRUE(combo_qspinlock_key);
> +
> +#define COMBO_SPINLOCK_BASE_DECLARE(op)  
> \
> +static __always_inline void arch_spin_##op(arch_spinlock_t *lock)\
> +{\
> + if (static_branch_likely(&combo_qspinlock_key)) \
> + queued_spin_##op(lock); \
> + else\
> + ticket_spin_##op(lock); \
> +}
> +COMBO_SPINLOCK_BASE_DECLARE(lock)
> +COMBO_SPINLOCK_BASE_DECLARE(unlock)
> +
> +#define COMBO_SPINLOCK_IS_DECLARE(op)
> \
> +static __always_inline int arch_spin_##op(arch_spinlock_t *lock) \
> +{\
> + if (static_branch_likely(&combo_qspinlock_key)) \
> + return queued_spin_##op(lock);  \
> + else   

Re: [PATCH V12 03/14] riscv: errata: Move errata vendor func-id into vendorid_list.h

2024-01-03 Thread Leonardo Bras
On Mon, Dec 25, 2023 at 07:58:36AM -0500, guo...@kernel.org wrote:
> From: Guo Ren 
> 
> Move the errata vendor func-id definitions from errata_list.h into
> vendorid_list.h. Unifying these definitions also prepares for the
> upcoming rwonce errata implementation.
> 
> Suggested-by: Leonardo Bras 
> Link: https://lore.kernel.org/linux-riscv/zqlfj1cmq8pao...@redhat.com/
> Signed-off-by: Guo Ren 
> Signed-off-by: Guo Ren 
> ---
>  arch/riscv/include/asm/errata_list.h   | 18 --
>  arch/riscv/include/asm/vendorid_list.h | 18 ++
>  2 files changed, 18 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/riscv/include/asm/errata_list.h 
> b/arch/riscv/include/asm/errata_list.h
> index 83ed25e43553..31bbd9840e97 100644
> --- a/arch/riscv/include/asm/errata_list.h
> +++ b/arch/riscv/include/asm/errata_list.h
> @@ -11,24 +11,6 @@
>  #include 
>  #include 
>  
> -#ifdef CONFIG_ERRATA_ANDES
> -#define ERRATA_ANDESTECH_NO_IOCP 0
> -#define ERRATA_ANDESTECH_NUMBER  1
> -#endif
> -
> -#ifdef CONFIG_ERRATA_SIFIVE
> -#define  ERRATA_SIFIVE_CIP_453 0
> -#define  ERRATA_SIFIVE_CIP_1200 1
> -#define  ERRATA_SIFIVE_NUMBER 2
> -#endif
> -
> -#ifdef CONFIG_ERRATA_THEAD
> -#define  ERRATA_THEAD_PBMT 0
> -#define  ERRATA_THEAD_CMO 1
> -#define  ERRATA_THEAD_PMU 2
> -#define  ERRATA_THEAD_NUMBER 3
> -#endif
> -
>  #ifdef __ASSEMBLY__
>  
>  #define ALT_INSN_FAULT(x)\
> diff --git a/arch/riscv/include/asm/vendorid_list.h 
> b/arch/riscv/include/asm/vendorid_list.h
> index e55407ace0c3..c503373193d2 100644
> --- a/arch/riscv/include/asm/vendorid_list.h
> +++ b/arch/riscv/include/asm/vendorid_list.h
> @@ -9,4 +9,22 @@
>  #define SIFIVE_VENDOR_ID 0x489
>  #define THEAD_VENDOR_ID  0x5b7
>  
> +#ifdef CONFIG_ERRATA_ANDES
> +#define ERRATA_ANDESTECH_NO_IOCP 0
> +#define ERRATA_ANDESTECH_NUMBER  1
> +#endif
> +
> +#ifdef CONFIG_ERRATA_SIFIVE
> +#define  ERRATA_SIFIVE_CIP_453 0
> +#define  ERRATA_SIFIVE_CIP_1200 1
> +#define  ERRATA_SIFIVE_NUMBER 2
> +#endif
> +
> +#ifdef CONFIG_ERRATA_THEAD
> +#define  ERRATA_THEAD_PBMT 0
> +#define  ERRATA_THEAD_CMO 1
> +#define  ERRATA_THEAD_PMU 2
> +#define  ERRATA_THEAD_NUMBER 3
> +#endif
> +
>  #endif
> -- 
> 2.40.1
> 

LGTM:
Reviewed-by: Leonardo Bras 




Re: [PATCH 1/1] powerpc/pseries/iommu: Fix window size for direct mapping with pmem

2021-04-19 Thread Leonardo Bras
On Tue, 2021-04-20 at 15:18 +1000, Alexey Kardashevskiy wrote:
> 
> On 20/04/2021 14:54, Leonardo Bras wrote:
> > As of today, if the DDW is big enough to fit (1 << MAX_PHYSMEM_BITS) it's
> > possible to use direct DMA mapping even with pmem region.
> > 
> > But, if that happens, the window size (len) is set to
> > (MAX_PHYSMEM_BITS - page_shift) instead of MAX_PHYSMEM_BITS, causing a
> > pagesize times smaller DDW to be created, being insufficient for correct
> > usage.
> > 
> > Fix this so the correct window size is used in this case.
> 
> Good find indeed.
> 
> afaict this does not create a huge problem though as 
> query.largest_available_block is always smaller than (MAX_PHYSMEM_BITS - 
> page_shift) where it matters (phyp).
> 
> 
> Reviewed-by: Alexey Kardashevskiy 
> 

Thanks for reviewing!

Leonardo Bras



[PATCH 1/1] powerpc/pseries/iommu: Fix window size for direct mapping with pmem

2021-04-19 Thread Leonardo Bras
As of today, if the DDW is big enough to fit (1 << MAX_PHYSMEM_BITS) it's
possible to use direct DMA mapping even with pmem region.

But, if that happens, the window size (len) is set to
(MAX_PHYSMEM_BITS - page_shift) instead of MAX_PHYSMEM_BITS, causing a
pagesize times smaller DDW to be created, being insufficient for correct
usage.

Fix this so the correct window size is used in this case.

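For illustration, a minimal user-space sketch of the shift arithmetic
involved (MAX_PHYSMEM_BITS and the page shift are assumed values here,
only to show the "pagesize times smaller" effect):

#include <stdio.h>

int main(void)
{
	unsigned int max_physmem_bits = 51;	/* assumed value */
	unsigned int page_shift = 16;		/* assumed 64K IOMMU pages */

	/* len is the log2 (shift) of the requested DDW size in bytes */
	unsigned int wrong_len = max_physmem_bits - page_shift;	/* 35 -> 32 TiB window */
	unsigned int fixed_len = max_physmem_bits;			/* 51 -> 2 PiB window */

	printf("wrong: 2^%u bytes, fixed: 2^%u bytes\n", wrong_len, fixed_len);
	return 0;
}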
Fixes: bf6e2d562bbc4 ("powerpc/dma: Fallback to dma_ops when persistent memory
present")
Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 9fc5217f0c8e..836cbbe0ecc5 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -1229,7 +1229,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
if (pmem_present) {
if (query.largest_available_block >=
(1ULL << (MAX_PHYSMEM_BITS - page_shift)))
-   len = MAX_PHYSMEM_BITS - page_shift;
+   len = MAX_PHYSMEM_BITS;
else
dev_info(&dev->dev, "Skipping ibm,pmemory");
}
-- 
2.30.2



Re: [PATCH 1/1] of/pci: Add IORESOURCE_MEM_64 to resource flags for 64-bit memory addresses

2021-04-19 Thread Leonardo Bras
On Mon, 2021-04-19 at 20:39 -0500, Rob Herring wrote:
> On Mon, Apr 19, 2021 at 7:35 PM Leonardo Bras  wrote:
> > 
> > On Mon, 2021-04-19 at 10:44 -0500, Rob Herring wrote:
> > > On Fri, Apr 16, 2021 at 3:58 PM Leonardo Bras  wrote:
> > > > 
> > > > Hello Rob, thanks for this feedback!
> > > > 
> > > > On Thu, 2021-04-15 at 13:59 -0500, Rob Herring wrote:
> > > > > +PPC and PCI lists
> > > > > 
> > > > > On Thu, Apr 15, 2021 at 1:01 PM Leonardo Bras  
> > > > > wrote:
> > > > > > 
> > > > > > Many other resource flag parsers already add this flag when the 
> > > > > > input
> > > > > > has bits 24 & 25 set, so update this one to do the same.
> > > > > 
> > > > > Many others? Looks like sparc and powerpc to me.
> > > > > 
> > > > 
> > > > s390 also does that, but it looks like it comes from a device-tree.
> > > 
> > > I'm only looking at DT based platforms, and s390 doesn't use DT.
> > 
> > Correct.
> > Sorry, I somehow wrote above the opposite of what I was thinking.
> > 
> > > 
> > > > > Those would be the
> > > > > ones I worry about breaking. Sparc doesn't use of/address.c so it's
> > > > > fine. Powerpc version of the flags code was only fixed in 2019, so I
> > > > > don't think powerpc will care either.
> > > > 
> > > > In powerpc I reach this function with this stack, while configuring a
> > > > virtio-net device for a qemu/KVM pseries guest:
> > > > 
> > > > pci_process_bridge_OF_ranges+0xac/0x2d4
> > > > pSeries_discover_phbs+0xc4/0x158
> > > > discover_phbs+0x40/0x60
> > > > do_one_initcall+0x60/0x2d0
> > > > kernel_init_freeable+0x308/0x3a8
> > > > kernel_init+0x2c/0x168
> > > > ret_from_kernel_thread+0x5c/0x70
> > > > 
> > > > For this, both MMIO32 and MMIO64 resources will have flags 0x200.
> > > 
> > > Oh good, powerpc has 2 possible flags parsing functions. So in the
> > > above path, do we need to set PCI_BASE_ADDRESS_MEM_TYPE_64?
> > > 
> > > Does pci_parse_of_flags() get called in your case?
> > > 
> > 
> > It's called in some cases, but not for the device I am debugging
> > (virtio-net pci@8002000).
> > 
> > For the above device, here is an expanded stack trace:
> > 
> > of_bus_pci_get_flags() (from parser->bus->get_flags())
> > of_pci_range_parser_one() (from macro for_each_of_pci_range)
> > pci_process_bridge_OF_ranges+0xac/0x2d4
> > pSeries_discover_phbs+0xc4/0x158
> > discover_phbs+0x40/0x60
> > do_one_initcall+0x60/0x2d0
> > kernel_init_freeable+0x308/0x3a8
> > kernel_init+0x2c/0x168
> > ret_from_kernel_thread+0x5c/0x70
> > 
> > For other devices, I could also see the following stack trace:
> > ## device ethernet@8
> > 
> > pci_parse_of_flags()
> > of_create_pci_dev+0x7f0/0xa40
> > __of_scan_bus+0x248/0x320
> > pcibios_scan_phb+0x370/0x3b0
> > pcibios_init+0x8c/0x12c
> > do_one_initcall+0x60/0x2d0
> > kernel_init_freeable+0x308/0x3a8
> > kernel_init+0x2c/0x168
> > ret_from_kernel_thread+0x5c/0x70
> > 
> > Devices that get parsed with of_bus_pci_get_flags() appears first at
> > dmesg (around 0.015s in my test), while devices that get parsed by
> > pci_parse_of_flags() appears later (0.025s in my test).
> > 
> > I am not really used to this code, but having the term "discover phbs"
> > in the first trace and the term "scan phb" in the second, makes me
> > wonder if the first trace is seen on devices that are seen/described in
> > the device-tree and the second trace is seen in devices not present in
> > the device-tree and found scanning pci bus.
> 
> That was my guess as well. I think on pSeries that most PCI devices
> are in the DT whereas on Arm and other flattened DT (non OpenFirmware)
> platforms PCI devices are not in DT.
> 

It makes sense to me. 

>  Of course, for virtio devices,
> they would not be in DT in either case.

I don't get this part... in pseries it looks like virtio devices can be
in device-tree.

Oh, I think I get it... this pci@8002000 looks like a bus
(described in device-tree, so discovered), and then the devices are
inside it, getting scanned.

The virtio device gets the correct flags (from pci_parse_of_flags), but
the bus (pci@8002000) does not seem to get it correctly,
because 

Re: [PATCH 1/1] of/pci: Add IORESOURCE_MEM_64 to resource flags for 64-bit memory addresses

2021-04-19 Thread Leonardo Bras
On Mon, 2021-04-19 at 10:44 -0500, Rob Herring wrote:
> On Fri, Apr 16, 2021 at 3:58 PM Leonardo Bras  wrote:
> > 
> > Hello Rob, thanks for this feedback!
> > 
> > On Thu, 2021-04-15 at 13:59 -0500, Rob Herring wrote:
> > > +PPC and PCI lists
> > > 
> > > On Thu, Apr 15, 2021 at 1:01 PM Leonardo Bras  wrote:
> > > > 
> > > > Many other resource flag parsers already add this flag when the input
> > > > has bits 24 & 25 set, so update this one to do the same.
> > > 
> > > Many others? Looks like sparc and powerpc to me.
> > > 
> > 
> > s390 also does that, but it looks like it comes from a device-tree.
> 
> I'm only looking at DT based platforms, and s390 doesn't use DT.

Correct. 
Sorry, I somehow wrote above the opposite of what I was thinking.

> 
> > > Those would be the
> > > ones I worry about breaking. Sparc doesn't use of/address.c so it's
> > > fine. Powerpc version of the flags code was only fixed in 2019, so I
> > > don't think powerpc will care either.
> > 
> > In powerpc I reach this function with this stack, while configuring a
> > virtio-net device for a qemu/KVM pseries guest:
> > 
> > pci_process_bridge_OF_ranges+0xac/0x2d4
> > pSeries_discover_phbs+0xc4/0x158
> > discover_phbs+0x40/0x60
> > do_one_initcall+0x60/0x2d0
> > kernel_init_freeable+0x308/0x3a8
> > kernel_init+0x2c/0x168
> > ret_from_kernel_thread+0x5c/0x70
> > 
> > For this, both MMIO32 and MMIO64 resources will have flags 0x200.
> 
> Oh good, powerpc has 2 possible flags parsing functions. So in the
> above path, do we need to set PCI_BASE_ADDRESS_MEM_TYPE_64?
> 
> Does pci_parse_of_flags() get called in your case?
> 

It's called in some cases, but not for the device I am debugging
(virtio-net pci@8002000). 

For the above device, here is an expanded stack trace:

of_bus_pci_get_flags() (from parser->bus->get_flags()) 
of_pci_range_parser_one() (from macro for_each_of_pci_range)
pci_process_bridge_OF_ranges+0xac/0x2d4
pSeries_discover_phbs+0xc4/0x158
discover_phbs+0x40/0x60
do_one_initcall+0x60/0x2d0
kernel_init_freeable+0x308/0x3a8
kernel_init+0x2c/0x168
ret_from_kernel_thread+0x5c/0x70

For other devices, I could also see the following stack trace:
## device ethernet@8

pci_parse_of_flags()
of_create_pci_dev+0x7f0/0xa40
__of_scan_bus+0x248/0x320
pcibios_scan_phb+0x370/0x3b0
pcibios_init+0x8c/0x12c
do_one_initcall+0x60/0x2d0
kernel_init_freeable+0x308/0x3a8
kernel_init+0x2c/0x168
ret_from_kernel_thread+0x5c/0x70

Devices that get parsed with of_bus_pci_get_flags() appear first in
dmesg (around 0.015s in my test), while devices that get parsed by
pci_parse_of_flags() appear later (0.025s in my test).

I am not really used to this code, but having the term "discover phbs"
in the first trace and the term "scan phb" in the second makes me
wonder if the first trace is seen on devices that are described in the
device-tree, and the second on devices not present in the device-tree
and found by scanning the PCI bus.

> > > I noticed both sparc and powerpc set PCI_BASE_ADDRESS_MEM_TYPE_64 in
> > > the flags. AFAICT, that's not set anywhere outside of arch code. So
> > > never for riscv, arm and arm64 at least. That leads me to
> > > pci_std_update_resource() which is where the PCI code sets BARs and
> > > just copies the flags in PCI_BASE_ADDRESS_MEM_MASK ignoring
> > > IORESOURCE_* flags. So it seems like 64-bit is still not handled and
> > > neither is prefetch.
> > > 
> > 
> > I am not sure if you mean here:
> > a) it's ok to add IORESOURCE_MEM_64 here, because it does not affect
> > anything else, or
> > b) it should be using PCI_BASE_ADDRESS_MEM_TYPE_64
> > (or IORESOURCE_MEM_64 | PCI_BASE_ADDRESS_MEM_TYPE_64) instead, since
> > it's how it's added in powerpc/sparc, and else there is no point.
> 
> I'm wondering if a) is incomplete and PCI_BASE_ADDRESS_MEM_TYPE_64
> also needs to be set. The question is ultimately are BARs getting set
> correctly for 64-bit? It looks to me like they aren't.

I am not used to these terms, does BAR mean 'Base Address Register'?

If so, those are the addresses stored in pci->phb->mem_resources[i] and
pci->phb->mem_offset[i], printed from enable_ddw() (which takes place a
lot after discovering the device (0.17s in my run)).

resource #1 pci@8002000: start=0x20008000
end=0x2000 flags=0x200 desc=0x0 offset=0x2000
resource #2 pci@8002000: start=0x2100
end=0x21ff flags=0x200 desc=0x0 offset=0x0

The message above was printed without this patch.
With the patch, the flags for memory resource #2 get ORed with
0x00100000 (IORESOURCE_MEM_64).

Is it enough to know if BARs are correctly set for 64-bit?
If it's not, how can I check?

> 
> Rob

Thanks Rob!

Leonardo Brás



Re: [PATCH 1/1] of/pci: Add IORESOURCE_MEM_64 to resource flags for 64-bit memory addresses

2021-04-16 Thread Leonardo Bras
Hello Rob, thanks for this feedback!

On Thu, 2021-04-15 at 13:59 -0500, Rob Herring wrote:
> +PPC and PCI lists
> 
> On Thu, Apr 15, 2021 at 1:01 PM Leonardo Bras  wrote:
> > 
> > Many other resource flag parsers already add this flag when the input
> > has bits 24 & 25 set, so update this one to do the same.
> 
> Many others? Looks like sparc and powerpc to me. 
> 

s390 also does that, but it looks like it comes from a device-tree.

> Those would be the
> ones I worry about breaking. Sparc doesn't use of/address.c so it's
> fine. Powerpc version of the flags code was only fixed in 2019, so I
> don't think powerpc will care either.

In powerpc I reach this function with this stack, while configuring a
virtio-net device for a qemu/KVM pseries guest:

pci_process_bridge_OF_ranges+0xac/0x2d4
pSeries_discover_phbs+0xc4/0x158
discover_phbs+0x40/0x60
do_one_initcall+0x60/0x2d0
kernel_init_freeable+0x308/0x3a8
kernel_init+0x2c/0x168
ret_from_kernel_thread+0x5c/0x70

For this, both MMIO32 and MMIO64 resources will have flags 0x200.

> 
> I noticed both sparc and powerpc set PCI_BASE_ADDRESS_MEM_TYPE_64 in
> the flags. AFAICT, that's not set anywhere outside of arch code. So
> never for riscv, arm and arm64 at least. That leads me to
> pci_std_update_resource() which is where the PCI code sets BARs and
> just copies the flags in PCI_BASE_ADDRESS_MEM_MASK ignoring
> IORESOURCE_* flags. So it seems like 64-bit is still not handled and
> neither is prefetch.
> 

I am not sure if you mean here:
a) it's ok to add IORESOURCE_MEM_64 here, because it does not affect
anything else, or
b) it should be using PCI_BASE_ADDRESS_MEM_TYPE_64 
(or IORESOURCE_MEM_64 | PCI_BASE_ADDRESS_MEM_TYPE_64) instead, since
that's how it's added in powerpc/sparc, and otherwise there is no point.

Again, thanks for helping!

Best regards,
Leonardo Bras




[PATCH 1/1] of/pci: Add IORESOURCE_MEM_64 to resource flags for 64-bit memory addresses

2021-04-15 Thread Leonardo Bras
Many other resource flag parsers already add this flag when the input
has bits 24 & 25 set, so update this one to do the same.

Some devices (like virtio-net) have more than one memory resource
(like MMIO32 and MMIO64), and without this flag one would need to
check the address range to know which is which.

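For reference, a stand-alone sketch of how the first ("phys.hi") cell of a
PCI address encodes the space code in bits 24-25 (values taken from the
OpenFirmware PCI bus binding; the sample cells below are made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* made-up example cells: I/O, 32-bit MEM, 64-bit MEM */
	uint32_t phys_hi[] = { 0x81000000, 0x82000000, 0x83000000 };

	for (int i = 0; i < 3; i++) {
		unsigned int space = (phys_hi[i] >> 24) & 0x03;

		printf("phys.hi=0x%08x space=0x%02x -> %s\n", phys_hi[i], space,
		       space == 0x01 ? "IORESOURCE_IO" :
		       space == 0x02 ? "IORESOURCE_MEM" :
		       space == 0x03 ? "IORESOURCE_MEM | IORESOURCE_MEM_64" :
		       "config space");
	}
	return 0;
}
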
Signed-off-by: Leonardo Bras 
---
 drivers/of/address.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/of/address.c b/drivers/of/address.c
index 73ddf2540f3f..dc7147843783 100644
--- a/drivers/of/address.c
+++ b/drivers/of/address.c
@@ -116,9 +116,12 @@ static unsigned int of_bus_pci_get_flags(const __be32 
*addr)
flags |= IORESOURCE_IO;
break;
case 0x02: /* 32 bits */
-   case 0x03: /* 64 bits */
flags |= IORESOURCE_MEM;
break;
+
+   case 0x03: /* 64 bits */
+   flags |= IORESOURCE_MEM | IORESOURCE_MEM_64;
+   break;
}
if (w & 0x40000000)
flags |= IORESOURCE_PREFETCH;
-- 
2.30.2



Re: [PATCH v2 1/1] powerpc/iommu: Enable remaining IOMMU Pagesizes present in LoPAR

2021-04-13 Thread Leonardo Bras
On Mon, 2021-04-12 at 17:21 -0500, Segher Boessenkool wrote:
> On Fri, Apr 09, 2021 at 02:36:16PM +1000, Alexey Kardashevskiy wrote:
> > On 08/04/2021 19:04, Michael Ellerman wrote:
> > > > > > +#define QUERY_DDW_PGSIZE_4K    0x01
> > > > > > +#define QUERY_DDW_PGSIZE_64K   0x02
> > > > > > +#define QUERY_DDW_PGSIZE_16M   0x04
> > > > > > +#define QUERY_DDW_PGSIZE_32M   0x08
> > > > > > +#define QUERY_DDW_PGSIZE_64M   0x10
> > > > > > +#define QUERY_DDW_PGSIZE_128M  0x20
> > > > > > +#define QUERY_DDW_PGSIZE_256M  0x40
> > > > > > +#define QUERY_DDW_PGSIZE_16G   0x80
> > > > > 
> > > > > I'm not sure the #defines really gain us much vs just putting the
> > > > > literal values in the array below?
> > > > 
> > > > Then someone says "u magic values" :) I do not mind either way. 
> > > > Thanks,
> > > 
> > > Yeah that's true. But #defining them doesn't make them less magic, if
> > > you only use them in one place :)
> > 
> > Defining them with "QUERY_DDW" in the names kinda tells where they are 
> > from. Can also grep QEMU using these to see how the other side handles 
> > it. Dunno.
> 
> And *not* defining anything reduces the mental load a lot.  You can add
> a comment at the single spot you use them, explaining what this is, in a
> much better way!
> 
> Comments are *good*.
> 
> 
> Segher

Thanks for the feedback Alexey, Michael and Segher!

I have sent a v3 for this patch. 
http://patchwork.ozlabs.org/project/linuxppc-dev/patch/20210408201915.174217-1-leobra...@gmail.com/

Please let me know of your feedback in it.

Best regards,
Leonardo Bras



Re: [PATCH 3/3] powerpc/mm/hash: Avoid multiple HPT resize-downs on memory hotunplug

2021-04-08 Thread Leonardo Bras
Hello David, thanks for commenting.

On Tue, 2021-03-23 at 10:45 +1100, David Gibson wrote:
> > @@ -805,6 +808,10 @@ static int resize_hpt_for_hotplug(unsigned long 
> > new_mem_size, bool shrinking)
> >     if (shrinking) {
> > 
> > +   /* When batch removing entries, only resizes HPT at the end. */
> > +   if (atomic_read_acquire(&hpt_resize_disable))
> > +   return 0;
> > +
> 
> I'm not quite convinced by this locking.  Couldn't hpt_resize_disable
> be set after this point, but while you're still inside
> resize_hpt_for_hotplug()?  Probably better to use an explicit mutex
> (and mutex_trylock()) to make the critical sections clearer.

Sure, I can do that for v2.

> Except... do we even need the fancy mechanics to suppress the resizes
> in one place to do them elswhere.  Couldn't we just replace the
> existing resize calls with the batched ones?

How would you go about batching HPT resize-downs?
Other than the current approach, I could only think of a way that would
touch a lot of generic code, and/or duplicate some functions, as
dlpar_add_lmb() does a lot of other stuff.

> > +void hash_memory_batch_shrink_end(void)
> > +{
> > +   unsigned long newsize;
> > +
> > +   /* Re-enables HPT resize-down after hot-unplug */
> > +   atomic_set_release(&hpt_resize_disable, 0);
> > +
> > +   newsize = memblock_phys_mem_size();
> > +   /* Resize to smallest SHIFT possible */
> > +   while (resize_hpt_for_hotplug(newsize, true) == -ENOSPC) {
> > +   newsize *= 2;
> 
> As noted earlier, doing this without an explicit cap on the new hpt
> size (of the existing size) this makes me nervous. 
> 

I can add a stop in v2.

>  Less so, but doing
> the calculations on memory size, rather than explictly on HPT size /
> HPT order also seems kinda clunky.

Agree, but at this point, it would seem kind of a waste to find the
shift from newsize, then calculate (1 << shift) for each retry of
resize_hpt_for_hotplug() only to make the point that we are retrying the order
value.

But sure, if you think it looks better, I can change that. 

> > +void memory_batch_shrink_begin(void)
> > +{
> > +   if (!radix_enabled())
> > +   hash_memory_batch_shrink_begin();
> > +}
> > +
> > +void memory_batch_shrink_end(void)
> > +{
> > +   if (!radix_enabled())
> > +   hash_memory_batch_shrink_end();
> > +}
> 
> Again, these wrappers don't seem particularly useful to me.

Options would be adding 'if (!radix_enabled())' to the hotplug-memory.c
functions or to the hash_* functions, which looks kind of wrong.

> > +   memory_batch_shrink_end();
> 
> remove_by_index only removes a single LMB, so there's no real point to
> batching here.

Sure, will be fixed for v2.

> > @@ -700,6 +712,7 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add)
> >     if (lmbs_added != lmbs_to_add) {
> >     pr_err("Memory hot-add failed, removing any added LMBs\n");
> > 
> > +   memory_batch_shrink_begin();
> 
> 
> The effect of these on the memory grow path is far from clear.
> 

On hotplug, HPT is resized-up before adding LMBs.
On hotunplug, HPT is resized-down after removing LMBs.
And each one has its own mechanism to batch HPT resizes...

I can't understand exactly how using it on the hotplug fail path can be
any different than using it on hotunplug.

Can you please help me understand this?

Best regards,
Leonardo Bras



Re: [PATCH 2/3] powerpc/mm/hash: Avoid multiple HPT resize-ups on memory hotplug

2021-04-08 Thread Leonardo Bras
Hello David, thanks for the feedback!

On Mon, 2021-03-22 at 18:55 +1100, David Gibson wrote:
> > +void hash_memory_batch_expand_prepare(unsigned long newsize)
> > +{
> > +   /*
> > +* Resizing-up HPT should never fail, but there are some cases system 
> > starts with higher
> > +* SHIFT than required, and we go through the funny case of resizing 
> > HPT down while
> > +* adding memory
> > +*/
> > +
> > +   while (resize_hpt_for_hotplug(newsize, false) == -ENOSPC) {
> > +   newsize *= 2;
> > +   pr_warn("Hash collision while resizing HPT\n");
> 
> This unbounded increase in newsize makes me nervous - we should be
> bounded by the current size of the HPT at least.  In practice we
> should be fine, since the resize should always succeed by the time we
> reach our current HPT size, but that's far from obvious from this
> point in the code.

Sure, I will add bounds in v2.

> 
> And... you're doubling newsize which is a value which might not be a
> power of 2.  I'm wondering if there's an edge case where this could
> actually cause us to skip the current size and erroneously resize to
> one bigger than we have currently.

I also thought that at the start, but it seems quite reliable.
Before using this value, htab_shift_for_mem_size() will always round it
to the next power of 2.
Ex.
Any value between 0b0101 and 0b1000 will be rounded to 0b1000 for shift
calculation. If we multiply it by 2 (same as << 1), we have that
anything between 0b01010 and 0b10000 will be rounded to 0b10000.

This works just fine as long as we are multiplying. 
Division may have the behavior you expect, as 0b0101 >> 1 would become
0b010 and skip a shift.

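A user-space sketch of that argument (assuming the shift is obtained by
rounding the size up to the next power of two, which is the behavior
being relied on above):

#include <stdio.h>

/* round size up to the next power-of-two exponent (assumed model) */
static unsigned int ceil_log2(unsigned long x)
{
	unsigned int shift = 0;

	while ((1UL << shift) < x)
		shift++;
	return shift;
}

int main(void)
{
	/* doubling the size advances the rounded-up exponent by exactly one,
	 * so no candidate HPT order is skipped while retrying upwards */
	for (unsigned long size = 5; size <= 40; size += 7)
		printf("size=%2lu shift=%u, 2*size=%2lu shift=%u\n",
		       size, ceil_log2(size), 2 * size, ceil_log2(2 * size));
	return 0;
}
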
> > +void memory_batch_expand_prepare(unsigned long newsize)
> 
> This wrapper doesn't seem useful.

Yeah, it does little, but I can't just jump into hash_* functions
directly from hotplug-memory.c without even knowing whether hash
pagetables are in use (in case the suggestion was to test for
disable_radix inside hash_memory_batch*).

> 
> > +{
> > +   if (!radix_enabled())
> > +   hash_memory_batch_expand_prepare(newsize);
> > +}
> >  #endif /* CONFIG_MEMORY_HOTPLUG */
> >  
> > 
> > +   memory_batch_expand_prepare(memblock_phys_mem_size() +
> > +drmem_info->n_lmbs * drmem_lmb_size());
> 
> This doesn't look right.  memory_add_by_index() is adding a *single*
> LMB, I think using drmem_info->n_lmbs here means you're counting this
> as adding again as much memory as you already have hotplugged.

Yeah, my mistake. This makes sense.
I will change it to something like 
memblock_phys_mem_size() + drmem_lmb_size()

> > 
> > +   memory_batch_expand_prepare(memblock_phys_mem_size() + lmbs_to_add * 
> > drmem_lmb_size());
> > +
> >     for_each_drmem_lmb_in_range(lmb, start_lmb, end_lmb) {
> >     if (lmb->flags & DRCONF_MEM_ASSIGNED)
> >     continue;
> 
> I don't see memory_batch_expand_prepare() suppressing any existing HPT
> resizes.  Won't this just resize to the right size for the full add,
> then resize several times again as we perform the add?  Or.. I guess
> that will be suppressed by patch 1/3. 

Correct.

>  That's seems kinda fragile, though.

What do you mean by fragile here?
What would you suggest doing differently?

Best regards,
Leonardo Bras



Re: [PATCH 1/3] powerpc/mm/hash: Avoid resizing-down HPT on first memory hotplug

2021-04-08 Thread Leonardo Bras
Hello David, thanks for your feedback.

On Mon, 2021-03-22 at 17:49 +1100, David Gibson wrote:
> I don't love this approach.  Adding the extra flag at this level seems
> a bit inelegant, and it means we're passing up an easy opportunity to
> reduce our resource footprint on the host.

I understand, but trying to reduce the resource footprint on the host,
and mostly failing, is what causes hot-add and hot-remove to take so long.

> But... maybe we'll have to do it.  I'd like to see if we can get
> things to work well enough with just the "batching" to avoid multiple
> resize attempts first.

This batching is something I had thought a lot about.
Problem is that there are a lot of generic interfaces between memory
hotplug and actually resizing HPT. I tried a simpler approach in
patches 2 & 3, so I don't touch much stuff there.

Best regards,
Leonardo Bras






[PATCH v3 1/1] powerpc/iommu: Enable remaining IOMMU Pagesizes present in LoPAR

2021-04-08 Thread Leonardo Bras
According to LoPAR, ibm,query-pe-dma-window output named "IO Page Sizes"
will let the OS know all possible pagesizes that can be used for creating a
new DDW.

Currently Linux will only try using 3 of the 8 available options:
4K, 64K and 16M. According to LoPAR, Hypervisor may also offer 32M, 64M,
128M, 256M and 16G.

Enabling bigger pages would be interesting for direct mapping systems
with a lot of RAM, while using fewer TCE entries.

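As an illustration of the selection done below, a stand-alone version of
the lookup (the mask value passed in main() is made up):

#include <stdio.h>

/* bit i of the "IO Page Sizes" mask corresponds to shift[i], per LoPAR */
static int iommu_get_page_shift(unsigned int query_page_size)
{
	const int shift[] = { 12, 16, 24, 25, 26, 27, 28, 34 };
	int i;

	for (i = (int)(sizeof(shift) / sizeof(shift[0])) - 1; i >= 0; i--)
		if (query_page_size & (1u << i))
			return shift[i];

	return 0;	/* no supported page size */
}

int main(void)
{
	/* made-up mask: 4K (bit 0), 64K (bit 1) and 16G (bit 7) supported */
	printf("largest supported shift = %d\n", iommu_get_page_shift(0x83));
	return 0;
}
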
Signed-off-by: Leonardo Bras 
---
Changes since v2:
 - Restore 'int array & shift' strategy
 - Remove defines for RTAS "IO Page Size" output of ibm,query-pe-dma-window
 - Added/Improved comments
Link: 
http://patchwork.ozlabs.org/project/linuxppc-dev/patch/20210407195613.131140-1-leobra...@gmail.com/
Changes since v1:
- Remove page shift defines, replace by __builtin_ctzll(SZ_XXX)
- Add bit field defines for RTAS "IO Page Shift" output of 
ibm,query-pe-dma-window
- Use struct array instead of int array to be more explicit on pagesizes
Link: 
http://patchwork.ozlabs.org/project/linuxppc-dev/patch/20210322190943.715368-1-leobra...@gmail.com/
 

 arch/powerpc/platforms/pseries/iommu.c | 37 +-
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 9fc5217f0c8e..67c9953a6503 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -1099,6 +1099,33 @@ static void reset_dma_window(struct pci_dev *dev, struct 
device_node *par_dn)
 ret);
 }
 
+/* Return largest page shift based on "IO Page Sizes" output of 
ibm,query-pe-dma-window. */
+static int iommu_get_page_shift(u32 query_page_size)
+{
+   /* Supported IO page-sizes according to LoPAR */
+   const int shift[] = {
+   __builtin_ctzll(SZ_4K),   __builtin_ctzll(SZ_64K), 
__builtin_ctzll(SZ_16M),
+   __builtin_ctzll(SZ_32M),  __builtin_ctzll(SZ_64M), 
__builtin_ctzll(SZ_128M),
+   __builtin_ctzll(SZ_256M), __builtin_ctzll(SZ_16G)
+   };
+
+   int i = ARRAY_SIZE(shift) - 1;
+
+   /*
+* On LoPAR, ibm,query-pe-dma-window outputs "IO Page Sizes" using a 
bit field:
+* - bit 31 means 4k pages are supported,
+* - bit 30 means 64k pages are supported, and so on.
+* Larger pagesizes map more memory with the same amount of TCEs, so 
start probing them.
+*/
+   for (; i >= 0 ; i--) {
+   if (query_page_size & (1 << i))
+   return shift[i];
+   }
+
+   /* No valid page size found. */
+   return 0;
+}
+
 /*
  * If the PE supports dynamic dma windows, and there is space for a table
  * that can map all pages in a linear offset, then setup such a table,
@@ -1206,13 +1233,9 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
goto out_failed;
}
}
-   if (query.page_size & 4) {
-   page_shift = 24; /* 16MB */
-   } else if (query.page_size & 2) {
-   page_shift = 16; /* 64kB */
-   } else if (query.page_size & 1) {
-   page_shift = 12; /* 4kB */
-   } else {
+
+   page_shift = iommu_get_page_shift(query.page_size);
+   if (!page_shift) {
dev_dbg(&dev->dev, "no supported direct page size in mask %x",
  query.page_size);
goto out_failed;
-- 
2.30.2



Re: [PATCH v2 1/1] powerpc/iommu: Enable remaining IOMMU Pagesizes present in LoPAR

2021-04-08 Thread Leonardo Bras
On Thu, 2021-04-08 at 03:20 -0300, Leonardo Bras wrote:
> > > +#define QUERY_DDW_PGSIZE_4K  0x01
> > > +#define QUERY_DDW_PGSIZE_64K 0x02
> > > +#define QUERY_DDW_PGSIZE_16M 0x04
> > > +#define QUERY_DDW_PGSIZE_32M 0x08
> > > +#define QUERY_DDW_PGSIZE_64M 0x10
> > > +#define QUERY_DDW_PGSIZE_128M  0x20
> > > +#define QUERY_DDW_PGSIZE_256M  0x40
> > > +#define QUERY_DDW_PGSIZE_16G 0x80
> > 
> > I'm not sure the #defines really gain us much vs just putting the
> > literal values in the array below?
> 
> My v1 did not use the define approach, what do you think of that?
> http://patchwork.ozlabs.org/project/linuxppc-dev/patch/20210322190943.715368-1-leobra...@gmail.com/
> 
> 
(of course, it would be that without the pageshift defines also, using
the __builtin_ctz() approach suggested by Alexey.)



Re: [PATCH v2 1/1] powerpc/iommu: Enable remaining IOMMU Pagesizes present in LoPAR

2021-04-08 Thread Leonardo Bras
Hello Michael, thank you for this feedback!
Comments inline:

On Thu, 2021-04-08 at 15:37 +1000, Michael Ellerman wrote:
> Leonardo Bras  writes:
> > According to LoPAR, ibm,query-pe-dma-window output named "IO Page Sizes"
> > will let the OS know all possible pagesizes that can be used for creating a
> > new DDW.
> > 
> > Currently Linux will only try using 3 of the 8 available options:
> > 4K, 64K and 16M. According to LoPAR, Hypervisor may also offer 32M, 64M,
> > 128M, 256M and 16G.
> 
> Do we know of any hardware & hypervisor combination that will actually
> give us bigger pages?
> 
> > Enabling bigger pages would be interesting for direct mapping systems
> > with a lot of RAM, while using less TCE entries.
> > 
> > Signed-off-by: Leonardo Bras 
> > ---
> >  arch/powerpc/platforms/pseries/iommu.c | 49 ++
> >  1 file changed, 42 insertions(+), 7 deletions(-)
> > 
> > diff --git a/arch/powerpc/platforms/pseries/iommu.c 
> > b/arch/powerpc/platforms/pseries/iommu.c
> > index 9fc5217f0c8e..6cda1c92597d 100644
> > --- a/arch/powerpc/platforms/pseries/iommu.c
> > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > @@ -53,6 +53,20 @@ enum {
> >     DDW_EXT_QUERY_OUT_SIZE = 2
> >  };
> 
> A comment saying where the values come from would be good.

Sure, I will add the information about LoPAR.

> 
> > +#define QUERY_DDW_PGSIZE_4K    0x01
> > +#define QUERY_DDW_PGSIZE_64K   0x02
> > +#define QUERY_DDW_PGSIZE_16M   0x04
> > +#define QUERY_DDW_PGSIZE_32M   0x08
> > +#define QUERY_DDW_PGSIZE_64M   0x10
> > +#define QUERY_DDW_PGSIZE_128M  0x20
> > +#define QUERY_DDW_PGSIZE_256M  0x40
> > +#define QUERY_DDW_PGSIZE_16G   0x80
> 
> I'm not sure the #defines really gain us much vs just putting the
> literal values in the array below?

My v1 did not use the define approach, what do you think of that?
http://patchwork.ozlabs.org/project/linuxppc-dev/patch/20210322190943.715368-1-leobra...@gmail.com/

> 
> > +struct iommu_ddw_pagesize {
> > +   u32 mask;
> > +   int shift;
> > +};
> > +
> >  static struct iommu_table_group *iommu_pseries_alloc_group(int node)
> >  {
> >     struct iommu_table_group *table_group;
> > @@ -1099,6 +1113,31 @@ static void reset_dma_window(struct pci_dev *dev, 
> > struct device_node *par_dn)
> >  ret);
> >  }
> >  
> > 
> > +/* Returns page shift based on "IO Page Sizes" output at 
> > ibm,query-pe-dma-window. See LoPAR */
> > +static int iommu_get_page_shift(u32 query_page_size)
> > +{
> > +   const struct iommu_ddw_pagesize ddw_pagesize[] = {
> > +   { QUERY_DDW_PGSIZE_16G,  __builtin_ctz(SZ_16G)  },
> > +   { QUERY_DDW_PGSIZE_256M, __builtin_ctz(SZ_256M) },
> > +   { QUERY_DDW_PGSIZE_128M, __builtin_ctz(SZ_128M) },
> > +   { QUERY_DDW_PGSIZE_64M,  __builtin_ctz(SZ_64M)  },
> > +   { QUERY_DDW_PGSIZE_32M,  __builtin_ctz(SZ_32M)  },
> > +   { QUERY_DDW_PGSIZE_16M,  __builtin_ctz(SZ_16M)  },
> > +   { QUERY_DDW_PGSIZE_64K,  __builtin_ctz(SZ_64K)  },
> > +   { QUERY_DDW_PGSIZE_4K,   __builtin_ctz(SZ_4K)   }
> > +   };
> 
> 
> cheers

Best regards,
Leonardo Bras




Re: [PATCH 1/1] powerpc/iommu: Enable remaining IOMMU Pagesizes present in LoPAR

2021-04-07 Thread Leonardo Bras
Hello Alexey,

On Tue, 2021-03-23 at 18:41 +1100, Alexey Kardashevskiy wrote:
[...]
> > +#define IOMMU_PAGE_SHIFT_16G   34
> > +#define IOMMU_PAGE_SHIFT_256M  28
> > +#define IOMMU_PAGE_SHIFT_128M  27
> > +#define IOMMU_PAGE_SHIFT_64M   26
> > +#define IOMMU_PAGE_SHIFT_32M   25
> > +#define IOMMU_PAGE_SHIFT_16M   24
> > +#define IOMMU_PAGE_SHIFT_64K   16
> 
> 
> These are not very descriptive, these are just normal shifts, could be 
> as simple as __builtin_ctz(SZ_4K) (gcc will optimize this) and so on.
> 
> OTOH the PAPR page sizes need macros as they are the ones which are 
> weird and screaming for macros.
> 
> I'd steal/rework spapr_page_mask_to_query_mask() from QEMU. Thanks,
> 

Thanks for this feedback!
I just sent a v2 applying your suggestions.

Best regards,
Leonardo Bras




[PATCH v2 1/1] powerpc/iommu: Enable remaining IOMMU Pagesizes present in LoPAR

2021-04-07 Thread Leonardo Bras
According to LoPAR, ibm,query-pe-dma-window output named "IO Page Sizes"
will let the OS know all possible pagesizes that can be used for creating a
new DDW.

Currently Linux will only try using 3 of the 8 available options:
4K, 64K and 16M. According to LoPAR, Hypervisor may also offer 32M, 64M,
128M, 256M and 16G.

Enabling bigger pages would be interesting for direct mapping systems
with a lot of RAM, while using fewer TCE entries.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 49 ++
 1 file changed, 42 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 9fc5217f0c8e..6cda1c92597d 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -53,6 +53,20 @@ enum {
DDW_EXT_QUERY_OUT_SIZE = 2
 };
 
+#define QUERY_DDW_PGSIZE_4K    0x01
+#define QUERY_DDW_PGSIZE_64K   0x02
+#define QUERY_DDW_PGSIZE_16M   0x04
+#define QUERY_DDW_PGSIZE_32M   0x08
+#define QUERY_DDW_PGSIZE_64M   0x10
+#define QUERY_DDW_PGSIZE_128M  0x20
+#define QUERY_DDW_PGSIZE_256M  0x40
+#define QUERY_DDW_PGSIZE_16G   0x80
+
+struct iommu_ddw_pagesize {
+   u32 mask;
+   int shift;
+};
+
 static struct iommu_table_group *iommu_pseries_alloc_group(int node)
 {
struct iommu_table_group *table_group;
@@ -1099,6 +1113,31 @@ static void reset_dma_window(struct pci_dev *dev, struct 
device_node *par_dn)
 ret);
 }
 
+/* Returns page shift based on "IO Page Sizes" output at 
ibm,query-pe-dma-window. See LoPAR */
+static int iommu_get_page_shift(u32 query_page_size)
+{
+   const struct iommu_ddw_pagesize ddw_pagesize[] = {
+   { QUERY_DDW_PGSIZE_16G,  __builtin_ctz(SZ_16G)  },
+   { QUERY_DDW_PGSIZE_256M, __builtin_ctz(SZ_256M) },
+   { QUERY_DDW_PGSIZE_128M, __builtin_ctz(SZ_128M) },
+   { QUERY_DDW_PGSIZE_64M,  __builtin_ctz(SZ_64M)  },
+   { QUERY_DDW_PGSIZE_32M,  __builtin_ctz(SZ_32M)  },
+   { QUERY_DDW_PGSIZE_16M,  __builtin_ctz(SZ_16M)  },
+   { QUERY_DDW_PGSIZE_64K,  __builtin_ctz(SZ_64K)  },
+   { QUERY_DDW_PGSIZE_4K,   __builtin_ctz(SZ_4K)   }
+   };
+
+   int i;
+
+   for (i = 0; i < ARRAY_SIZE(ddw_pagesize); i++) {
+   if (query_page_size & ddw_pagesize[i].mask)
+   return ddw_pagesize[i].shift;
+   }
+
+   /* No valid page size found. */
+   return 0;
+}
+
 /*
  * If the PE supports dynamic dma windows, and there is space for a table
  * that can map all pages in a linear offset, then setup such a table,
@@ -1206,13 +1245,9 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
goto out_failed;
}
}
-   if (query.page_size & 4) {
-   page_shift = 24; /* 16MB */
-   } else if (query.page_size & 2) {
-   page_shift = 16; /* 64kB */
-   } else if (query.page_size & 1) {
-   page_shift = 12; /* 4kB */
-   } else {
+
+   page_shift = iommu_get_page_shift(query.page_size);
+   if (!page_shift) {
dev_dbg(&dev->dev, "no supported direct page size in mask %x",
  query.page_size);
goto out_failed;
-- 
2.30.2



[PATCH 1/1] powerpc/iommu: Enable remaining IOMMU Pagesizes present in LoPAR

2021-03-22 Thread Leonardo Bras
According to LoPAR, ibm,query-pe-dma-window output named "IO Page Sizes"
will let the OS know all possible pagesizes that can be used for creating a
new DDW.

Currently Linux will only try using 3 of the 8 available options:
4K, 64K and 16M. According to LoPAR, Hypervisor may also offer 32M, 64M,
128M, 256M and 16G.

Enabling bigger pages would be interesting for direct mapping systems
with a lot of RAM, while using fewer TCE entries.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/include/asm/iommu.h   |  8 
 arch/powerpc/platforms/pseries/iommu.c | 28 +++---
 2 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index deef7c94d7b6..c170048b7a1b 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -19,6 +19,14 @@
 #include 
 #include 
 
+#define IOMMU_PAGE_SHIFT_16G   34
+#define IOMMU_PAGE_SHIFT_256M  28
+#define IOMMU_PAGE_SHIFT_128M  27
+#define IOMMU_PAGE_SHIFT_64M   26
+#define IOMMU_PAGE_SHIFT_32M   25
+#define IOMMU_PAGE_SHIFT_16M   24
+#define IOMMU_PAGE_SHIFT_64K   16
+
 #define IOMMU_PAGE_SHIFT_4K  12
 #define IOMMU_PAGE_SIZE_4K   (ASM_CONST(1) << IOMMU_PAGE_SHIFT_4K)
 #define IOMMU_PAGE_MASK_4K   (~((1 << IOMMU_PAGE_SHIFT_4K) - 1))
diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 9fc5217f0c8e..02958e80aa91 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -1099,6 +1099,24 @@ static void reset_dma_window(struct pci_dev *dev, struct 
device_node *par_dn)
 ret);
 }
 
+/* Returns page shift based on "IO Page Sizes" output at 
ibm,query-pe-dma-window. See LoPAR */
+static int iommu_get_page_shift(u32 query_page_size)
+{
+   const int shift[] = {IOMMU_PAGE_SHIFT_4K,   IOMMU_PAGE_SHIFT_64K,  
IOMMU_PAGE_SHIFT_16M,
+IOMMU_PAGE_SHIFT_32M,  IOMMU_PAGE_SHIFT_64M,  
IOMMU_PAGE_SHIFT_128M,
+IOMMU_PAGE_SHIFT_256M, IOMMU_PAGE_SHIFT_16G};
+   int i = ARRAY_SIZE(shift) - 1;
+
+   /* Looks for the largest page size supported */
+   for (; i >= 0; i--) {
+   if (query_page_size & (1 << i))
+   return shift[i];
+   }
+
+   /* No valid page size found. */
+   return 0;
+}
+
 /*
  * If the PE supports dynamic dma windows, and there is space for a table
  * that can map all pages in a linear offset, then setup such a table,
@@ -1206,13 +1224,9 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
goto out_failed;
}
}
-   if (query.page_size & 4) {
-   page_shift = 24; /* 16MB */
-   } else if (query.page_size & 2) {
-   page_shift = 16; /* 64kB */
-   } else if (query.page_size & 1) {
-   page_shift = 12; /* 4kB */
-   } else {
+
+   page_shift = iommu_get_page_shift(query.page_size);
+   if (!page_shift) {
dev_dbg(&dev->dev, "no supported direct page size in mask %x",
  query.page_size);
goto out_failed;
-- 
2.29.2



[PATCH 1/1] powerpc/kernel/iommu: Use largepool as a last resort when !largealloc

2021-03-18 Thread Leonardo Bras
As of today, iommu_range_alloc() for !largealloc (npages <= 15) can only
use 3/4 of the available pages, given that pages in the largepool are not
available for !largealloc allocations.

This could mean some drivers not being able to fully use all the available
pages for the DMA window.

Add pages on largepool as a last resort for !largealloc, making all pages
of the DMA window available.

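A simplified user-space sketch of the fallback order this adds (the pool
layout and the pool count are assumptions for illustration, not the
kernel's exact structures):

#include <stdbool.h>
#include <stdio.h>

#define NR_SMALL_POOLS	4	/* assumed; the kernel derives this from the table size */

/* returns which pool an allocation attempt ends up trying on a given pass */
static int pick_pool(bool largealloc, int pass)
{
	if (largealloc)
		return NR_SMALL_POOLS;		/* large allocations use the large pool */
	if (pass < NR_SMALL_POOLS)
		return pass;			/* rotate through the small pools */
	return NR_SMALL_POOLS;			/* last resort: the large pool */
}

int main(void)
{
	for (int pass = 0; pass <= NR_SMALL_POOLS; pass++)
		printf("!largealloc pass %d -> pool %d\n", pass, pick_pool(false, pass));
	return 0;
}
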
Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/kernel/iommu.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 3329ef045805..ae6ad8dca605 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -255,6 +255,15 @@ static unsigned long iommu_range_alloc(struct device *dev,
pass++;
goto again;
 
+   } else if (pass == tbl->nr_pools + 1) {
+   /* Last resort: try largepool */
+   spin_unlock(&pool->lock);
+   pool = &tbl->large_pool;
+   spin_lock(&pool->lock);
+   pool->hint = pool->start;
+   pass++;
+   goto again;
+
} else {
/* Give up */
spin_unlock_irqrestore(&(pool->lock), flags);
-- 
2.29.2



[PATCH 1/1] powerpc/kernel/iommu: Align size for IOMMU_PAGE_SIZE() to save TCEs

2021-03-18 Thread Leonardo Bras
Currently both iommu_alloc_coherent() and iommu_free_coherent() align the
desired allocation size to PAGE_SIZE, and gets system pages and IOMMU
mappings (TCEs) for that value.

When IOMMU_PAGE_SIZE < PAGE_SIZE, this behavior may cause unnecessary
TCEs to be created for mapping the whole system page.

Example:
- PAGE_SIZE = 64k, IOMMU_PAGE_SIZE() = 4k
- iommu_alloc_coherent() is called for 128 bytes
- 1 system page (64k) is allocated
- 16 IOMMU pages (16 x 4k) are allocated (16 TCEs used)

It would be enough to use a single TCE for this, so 15 TCEs are
wasted in the process.

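A user-space sketch of the arithmetic above (page sizes assumed as in the
example):

#include <stdio.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
	unsigned long page_size = 64 * 1024;	/* system PAGE_SIZE, assumed 64K */
	unsigned long iommu_page_size = 4096;	/* IOMMU_PAGE_SIZE(), assumed 4K */
	unsigned long size = 128;		/* requested allocation */

	unsigned long tces_before = ALIGN_UP(size, page_size) / iommu_page_size;
	unsigned long tces_after = ALIGN_UP(size, iommu_page_size) / iommu_page_size;

	printf("TCEs used: %lu before, %lu after\n", tces_before, tces_after);
	return 0;
}
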
Update iommu_*_coherent() to make sure the size alignment happens only
for IOMMU_PAGE_SIZE() before calling iommu_alloc() and iommu_free().

Also, on iommu_range_alloc(), replace ALIGN(n, 1 << tbl->it_page_shift)
with IOMMU_PAGE_ALIGN(n, tbl), which is easier to read and does the
same.

Signed-off-by: Leonardo Bras 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/kernel/iommu.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 5b69a6a72a0e..3329ef045805 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -851,6 +851,7 @@ void *iommu_alloc_coherent(struct device *dev, struct 
iommu_table *tbl,
unsigned int order;
unsigned int nio_pages, io_order;
struct page *page;
+   size_t size_io = size;
 
size = PAGE_ALIGN(size);
order = get_order(size);
@@ -877,8 +878,9 @@ void *iommu_alloc_coherent(struct device *dev, struct 
iommu_table *tbl,
memset(ret, 0, size);
 
/* Set up tces to cover the allocated range */
-   nio_pages = size >> tbl->it_page_shift;
-   io_order = get_iommu_order(size, tbl);
+   size_io = IOMMU_PAGE_ALIGN(size_io, tbl);
+   nio_pages = size_io >> tbl->it_page_shift;
+   io_order = get_iommu_order(size_io, tbl);
mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
  mask >> tbl->it_page_shift, io_order, 0);
if (mapping == DMA_MAPPING_ERROR) {
@@ -893,10 +895,9 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t 
size,
 void *vaddr, dma_addr_t dma_handle)
 {
if (tbl) {
-   unsigned int nio_pages;
+   size_t size_io = IOMMU_PAGE_ALIGN(size, tbl);
+   unsigned int nio_pages = size_io >> tbl->it_page_shift;
 
-   size = PAGE_ALIGN(size);
-   nio_pages = size >> tbl->it_page_shift;
iommu_free(tbl, dma_handle, nio_pages);
size = PAGE_ALIGN(size);
free_pages((unsigned long)vaddr, get_order(size));
-- 
2.29.2



[PATCH 3/3] powerpc/mm/hash: Avoid multiple HPT resize-downs on memory hotunplug

2021-03-11 Thread Leonardo Bras
During memory hotunplug, after each LMB is removed, the HPT may be
resized-down if it would map a max of 4 times the current amount of memory.
(2 shifts, due to the introduced hysteresis)

It usually is not an issue, but it can take a lot of time if HPT
resizing-down fails. This happens because resize-down failures
usually repeat at each LMB removal, until there is no longer a conflict
with bolted entries, which can take a while to happen.

This can be solved by doing a single HPT resize at the end of memory
hotunplug, after all requested entries are removed.

To make this happen, it's necessary to temporarily disable all HPT
resize-downs before hotunplug, re-enable them after hotunplug ends,
and then resize the HPT down to the current memory size.

As an example, hotunplugging 256GB from a 385GB guest took 621s without
this patch, and 100s with it applied.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/include/asm/book3s/64/hash.h |  2 ++
 arch/powerpc/include/asm/sparsemem.h  |  2 ++
 arch/powerpc/mm/book3s64/hash_utils.c | 28 +++
 arch/powerpc/mm/book3s64/pgtable.c| 12 
 .../platforms/pseries/hotplug-memory.c| 16 +++
 5 files changed, 60 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
b/arch/powerpc/include/asm/book3s/64/hash.h
index 843b0a178590..f92697c107f7 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -256,6 +256,8 @@ int hash__create_section_mapping(unsigned long start, 
unsigned long end,
 int hash__remove_section_mapping(unsigned long start, unsigned long end);
 
 void hash_memory_batch_expand_prepare(unsigned long newsize);
+void hash_memory_batch_shrink_begin(void);
+void hash_memory_batch_shrink_end(void);
 
 #endif /* !__ASSEMBLY__ */
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/sparsemem.h 
b/arch/powerpc/include/asm/sparsemem.h
index 16b5f5300c84..a7a8a0d070fc 100644
--- a/arch/powerpc/include/asm/sparsemem.h
+++ b/arch/powerpc/include/asm/sparsemem.h
@@ -18,6 +18,8 @@ extern int memory_add_physaddr_to_nid(u64 start);
 #define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
 
 void memory_batch_expand_prepare(unsigned long newsize);
+void memory_batch_shrink_begin(void);
+void memory_batch_shrink_end(void);
 
 #ifdef CONFIG_NUMA
 extern int hot_add_scn_to_nid(unsigned long scn_addr);
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 1f6aa0bf27e7..e16f207de8e4 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -794,6 +794,9 @@ static unsigned long __init htab_get_table_size(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
+
+atomic_t hpt_resize_disable = ATOMIC_INIT(0);
+
 static int resize_hpt_for_hotplug(unsigned long new_mem_size, bool shrinking)
 {
unsigned target_hpt_shift;
@@ -805,6 +808,10 @@ static int resize_hpt_for_hotplug(unsigned long 
new_mem_size, bool shrinking)
 
if (shrinking) {
 
+   /* When batch removing entries, only resizes HPT at the end. */
+   if (atomic_read_acquire(&hpt_resize_disable))
+   return 0;
+
/*
 * To avoid lots of HPT resizes if memory size is fluctuating
 * across a boundary, we deliberately have some hysterisis
@@ -872,6 +879,27 @@ void hash_memory_batch_expand_prepare(unsigned long 
newsize)
pr_warn("Hash collision while resizing HPT\n");
}
 }
+
+void hash_memory_batch_shrink_begin(void)
+{
+   /* Disable HPT resize-down during hot-unplug */
+   atomic_set_release(&hpt_resize_disable, 1);
+}
+
+void hash_memory_batch_shrink_end(void)
+{
+   unsigned long newsize;
+
+   /* Re-enables HPT resize-down after hot-unplug */
+   atomic_set_release(&hpt_resize_disable, 0);
+
+   newsize = memblock_phys_mem_size();
+   /* Resize to smallest SHIFT possible */
+   while (resize_hpt_for_hotplug(newsize, true) == -ENOSPC) {
+   newsize *= 2;
+   pr_warn("Hash collision while resizing HPT\n");
+   }
+}
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 static void __init hash_init_partition_table(phys_addr_t hash_table,
diff --git a/arch/powerpc/mm/book3s64/pgtable.c 
b/arch/powerpc/mm/book3s64/pgtable.c
index f1cd8af0f67f..e01681e22e00 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -199,6 +199,18 @@ void memory_batch_expand_prepare(unsigned long newsize)
if (!radix_enabled())
hash_memory_batch_expand_prepare(newsize);
 }
+
+void memory_batch_shrink_begin(void)
+{
+   if (!radix_enabled())
+   hash_memory_batch_shrink_begin();
+}
+
+void memory_batch_shrink_end(void)
+{
+   if (!radix_enabled())
+   hash_memory_batch_shrink_end();
+}
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 void __init mmu_partition_table_init(void)
diff --git a/ar

[PATCH 2/3] powerpc/mm/hash: Avoid multiple HPT resize-ups on memory hotplug

2021-03-11 Thread Leonardo Bras
Every time a memory hotplug happens, and the memory limit crosses a 2^n
value, it may be necessary to perform HPT resizing-up, which can take
some time (over 100ms in my tests).

It usually is not an issue, but it can take some time if a lot of memory
is added to a guest with little starting memory:
adding 256GB to a 2GB guest, for example, will require 8 HPT resizes.

Perform an HPT resize before memory hotplug, updating the HPT to its
final size (assuming a successful hotplug), reducing the number of
HPT resizes to at most one per memory hotplug action.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/include/asm/book3s/64/hash.h   |  2 ++
 arch/powerpc/include/asm/sparsemem.h|  2 ++
 arch/powerpc/mm/book3s64/hash_utils.c   | 14 ++
 arch/powerpc/mm/book3s64/pgtable.c  |  6 ++
 arch/powerpc/platforms/pseries/hotplug-memory.c |  6 ++
 5 files changed, 30 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
b/arch/powerpc/include/asm/book3s/64/hash.h
index d959b0195ad9..843b0a178590 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -255,6 +255,8 @@ int hash__create_section_mapping(unsigned long start, 
unsigned long end,
 int nid, pgprot_t prot);
 int hash__remove_section_mapping(unsigned long start, unsigned long end);
 
+void hash_memory_batch_expand_prepare(unsigned long newsize);
+
 #endif /* !__ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_BOOK3S_64_HASH_H */
diff --git a/arch/powerpc/include/asm/sparsemem.h 
b/arch/powerpc/include/asm/sparsemem.h
index d072866842e4..16b5f5300c84 100644
--- a/arch/powerpc/include/asm/sparsemem.h
+++ b/arch/powerpc/include/asm/sparsemem.h
@@ -17,6 +17,8 @@ extern int remove_section_mapping(unsigned long start, 
unsigned long end);
 extern int memory_add_physaddr_to_nid(u64 start);
 #define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
 
+void memory_batch_expand_prepare(unsigned long newsize);
+
 #ifdef CONFIG_NUMA
 extern int hot_add_scn_to_nid(unsigned long scn_addr);
 #else
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index cfb3ec164f56..1f6aa0bf27e7 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -858,6 +858,20 @@ int hash__remove_section_mapping(unsigned long start, 
unsigned long end)
 
return rc;
 }
+
+void hash_memory_batch_expand_prepare(unsigned long newsize)
+{
+   /*
+* Resizing-up the HPT should never fail, but there are cases where the
+* system starts with a higher SHIFT than required, and we go through
+* the odd case of resizing the HPT down while adding memory.
+*/
+
+   while (resize_hpt_for_hotplug(newsize, false) == -ENOSPC) {
+   newsize *= 2;
+   pr_warn("Hash collision while resizing HPT\n");
+   }
+}
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 static void __init hash_init_partition_table(phys_addr_t hash_table,
diff --git a/arch/powerpc/mm/book3s64/pgtable.c 
b/arch/powerpc/mm/book3s64/pgtable.c
index 5b3a3bae21aa..f1cd8af0f67f 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -193,6 +193,12 @@ int __meminit remove_section_mapping(unsigned long start, 
unsigned long end)
 
return hash__remove_section_mapping(start, end);
 }
+
+void memory_batch_expand_prepare(unsigned long newsize)
+{
+   if (!radix_enabled())
+   hash_memory_batch_expand_prepare(newsize);
+}
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 void __init mmu_partition_table_init(void)
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 8377f1f7c78e..353c71249214 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -671,6 +671,8 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add)
if (lmbs_available < lmbs_to_add)
return -EINVAL;
 
+   memory_batch_expand_prepare(memblock_phys_mem_size() + lmbs_to_add * 
drmem_lmb_size());
+
for_each_drmem_lmb(lmb) {
if (lmb->flags & DRCONF_MEM_ASSIGNED)
continue;
@@ -734,6 +736,8 @@ static int dlpar_memory_add_by_index(u32 drc_index)
 
pr_info("Attempting to hot-add LMB, drc index %x\n", drc_index);
 
+   memory_batch_expand_prepare(memblock_phys_mem_size() +
+drmem_info->n_lmbs * drmem_lmb_size());
lmb_found = 0;
for_each_drmem_lmb(lmb) {
if (lmb->drc_index == drc_index) {
@@ -788,6 +792,8 @@ static int dlpar_memory_add_by_ic(u32 lmbs_to_add, u32 
drc_index)
if (lmbs_available < lmbs_to_add)
return -EINVAL;
 
+   memory_batch_expand_prepare(memblock_phys_mem_size() + lmbs_to_add * 
drmem_lmb_size());
+

[PATCH 0/3] powerpc/mm/hash: Time improvements for memory hot(un)plug

2021-03-11 Thread Leonardo Bras
This patchset intends to reduce time needed for processing memory
hotplug/hotunplug in hash guests.

The first one makes sure guests with a page size over 4k don't need to
go through HPT resize-downs after memory hotplug.

The second and third patches make hotplug / hotunplug perform a single
HPT resize per operation, instead of one for each shift change, or one
for each LMB in case of resize-down error.

Why wasn't the same mechanism used for both memory hotplug and hotunplug?
They have different requirements:

Memory hotplug (usually) causes HPT resize-ups, which are fine to perform
at the start of hotplug, but resize-ups should never be disabled, as
other mechanisms may try to increase memory and hit issues with an HPT
that is too small.

Memory hotunplug causes HPT resize-downs, which can be disabled (the HPT
will just remain larger for a while), but need to happen at the end of a
hotunplug operation. If we want to batch them, we need to disable
resize-downs during the operation and perform a single one at the end.
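
As a rough sketch of the intended call pattern (the surrounding function
and the LMB loop are hypothetical placeholders; only the hook names come
from patches 2 and 3):

    /* Hypothetical DLPAR remove path, simplified to show the hook points. */
    static int dlpar_memory_remove_batched(u32 lmbs_to_remove)
    {
            int rc;

            /* Patch 3: disable HPT resize-downs for the whole operation */
            memory_batch_shrink_begin();

            /* placeholder for the real LMB removal loop */
            rc = remove_lmbs(lmbs_to_remove);

            /* Patch 3: re-enable resize-downs and do a single resize-down */
            memory_batch_shrink_end();

            return rc;
    }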

Tests done with this patchset in the same machine / guest config:
Starting memory: 129GB, DIMM: 256GB
Before patchset: hotplug = 710s, hotunplug = 621s.
After patchset: hotplug = 21s, hotunplug = 100s.

Any feedback will be appreciated!
I believe the code may not be very well placed in available files,
so please give some feedback on that.

Best regards,

Leonardo Bras (3):
  powerpc/mm/hash: Avoid resizing-down HPT on first memory hotplug
  powerpc/mm/hash: Avoid multiple HPT resize-ups on memory hotplug
  powerpc/mm/hash: Avoid multiple HPT resize-downs on memory hotunplug

 arch/powerpc/include/asm/book3s/64/hash.h |  4 +
 arch/powerpc/include/asm/sparsemem.h  |  4 +
 arch/powerpc/mm/book3s64/hash_utils.c | 78 +++
 arch/powerpc/mm/book3s64/pgtable.c| 18 +
 .../platforms/pseries/hotplug-memory.c| 22 ++
 5 files changed, 111 insertions(+), 15 deletions(-)

-- 
2.29.2



[PATCH 1/3] powerpc/mm/hash: Avoid resizing-down HPT on first memory hotplug

2021-03-11 Thread Leonardo Bras
Because hypervisors may need to create HPTs without knowing the guest
page size, the smallest used page-size (4k) may be chosen, resulting in
an HPT that is possibly bigger than needed.

On a guest with bigger page sizes, the number of entries for the HPT may
be too high, causing the guest to ask for an HPT resize-down on the first
hotplug.

This becomes a problem when HPT resize-down fails, and causes the
HPT resize to be performed on every LMB added, until the HPT size is
compatible with the guest memory size, causing a major slowdown.

So, avoiding HPT resizing-down on hot-add significantly improves memory
hotplug times.

As an example, hotplugging 256GB on a 129GB guest took 710s without this
patch, and 21s with it applied.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/mm/book3s64/hash_utils.c | 36 ---
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 73b06adb6eeb..cfb3ec164f56 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -794,7 +794,7 @@ static unsigned long __init htab_get_table_size(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-static int resize_hpt_for_hotplug(unsigned long new_mem_size)
+static int resize_hpt_for_hotplug(unsigned long new_mem_size, bool shrinking)
 {
unsigned target_hpt_shift;
 
@@ -803,19 +803,25 @@ static int resize_hpt_for_hotplug(unsigned long 
new_mem_size)
 
target_hpt_shift = htab_shift_for_mem_size(new_mem_size);
 
-   /*
-* To avoid lots of HPT resizes if memory size is fluctuating
-* across a boundary, we deliberately have some hysterisis
-* here: we immediately increase the HPT size if the target
-* shift exceeds the current shift, but we won't attempt to
-* reduce unless the target shift is at least 2 below the
-* current shift
-*/
-   if (target_hpt_shift > ppc64_pft_size ||
-   target_hpt_shift < ppc64_pft_size - 1)
-   return mmu_hash_ops.resize_hpt(target_hpt_shift);
+   if (shrinking) {
 
-   return 0;
+   /*
+* To avoid lots of HPT resizes if memory size is fluctuating
+* across a boundary, we deliberately have some hysterisis
+* here: we immediately increase the HPT size if the target
+* shift exceeds the current shift, but we won't attempt to
+* reduce unless the target shift is at least 2 below the
+* current shift
+*/
+
+   if (target_hpt_shift >= ppc64_pft_size - 1)
+   return 0;
+
+   } else if (target_hpt_shift <= ppc64_pft_size) {
+   return 0;
+   }
+
+   return mmu_hash_ops.resize_hpt(target_hpt_shift);
 }
 
 int hash__create_section_mapping(unsigned long start, unsigned long end,
@@ -828,7 +834,7 @@ int hash__create_section_mapping(unsigned long start, 
unsigned long end,
return -1;
}
 
-   resize_hpt_for_hotplug(memblock_phys_mem_size());
+   resize_hpt_for_hotplug(memblock_phys_mem_size(), false);
 
rc = htab_bolt_mapping(start, end, __pa(start),
   pgprot_val(prot), mmu_linear_psize,
@@ -847,7 +853,7 @@ int hash__remove_section_mapping(unsigned long start, 
unsigned long end)
int rc = htab_remove_mapping(start, end, mmu_linear_psize,
 mmu_kernel_ssize);
 
-   if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC)
+   if (resize_hpt_for_hotplug(memblock_phys_mem_size(), true) == -ENOSPC)
pr_warn("Hash collision while resizing HPT\n");
 
return rc;
-- 
2.29.2



Re: [PATCH 1/1] kernel/smp: Split call_single_queue into 3 queues

2021-02-08 Thread Leonardo Bras
Hello Sebastian, 
Thanks for the feedback!

On Thu, 2021-01-28 at 11:33 +0100, Sebastian Andrzej Siewior wrote:
> On 2021-01-28 03:55:06 [-0300], Leonardo Bras wrote:
> > Currently, during flush_smp_call_function_queue():
> > - All items are traversed once, for inverting.
> > - The SYNC items are traversed twice.
> > - The ASYNC & IRQ_WORK items are traversed three times.
> > - The TTWU items are traversed four times.
> > 
> > Also, a lot of extra work is done to keep track of and remove the items
> > already processed in each step.
> > 
> > By using three queues, it's possible to avoid all this work, and
> > all items in the list are traversed only twice: once for inverting,
> > and once for processing.
> > 
> > In exchange, this requires 2 extra llist_del_all() calls at the beginning
> > of flush_smp_call_function_queue(), and some extra logic to decide
> > the correct queue to add the desired csd to.
> > 
> > This is not supposed to cause any change in the order the items are
> > processed, but will change the order of printing (cpu offlining)
> > to the order the items will be processed.
> > 
> > (The above traversal count ignores the cpu-offlining case, in
> > which all items would be traversed again, in both cases.)
> 
> Numbers would be good.
> 

Sure, I will try to get some time to compare performance.


>  Having three queues increases the memory foot
> print from one pointer to three but we still remain in one cache line.
> One difference your patch makes is this hunk:
> 
> > +   if (smp_add_to_queue(cpu, node))
> >     send_call_function_single_ipi(cpu);
> 
> Previously only the first addition resulted in sending an IPI. With this
> change you could send two IPIs, one for adding to two independent queues.

Yes, you are correct. 
I need to change this to look into all queues, which should only add
a few compares, given all llist_heads are in the same cacheline.
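
Something along these lines is what I have in mind for v2 (a sketch only;
the struct and field names are the ones from the patch below, and it
ignores ordering subtleties):

    /* Were all three queues of @cpu empty before this addition? */
    static __always_inline bool smp_queues_were_empty(int cpu)
    {
            struct call_multi_queue *cmq = &per_cpu(call_mq, cpu);

            return llist_empty(&cmq->sync) &&
                   llist_empty(&cmq->async_n_irq_work) &&
                   llist_empty(&cmq->ttwu);
    }

__smp_call_single_queue() would call it right before adding the csd, and
send the IPI only when it returns true.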

> 
> A quick smoke test ended up
> <idle>-0   [005] d..h1..   146.255996: 
> flush_smp_call_function_queue: A1 S2 I0 T0 X3
> 
> with the patch at the bottom of the mail. This shows that in my
> smoke test at least, the number of items in the individual list is low.

Yes, but depending on workload this list may get longer.

My patch also needs some other changes, so I will send a v2 with those
+ the proposed changes.

> Sebastian

Best regards,
Leonardo Bras



Re: [PATCH v2 1/1] powerpc/kvm: Save Timebase Offset to fix sched_clock() while running guest code.

2021-02-08 Thread Leonardo Bras
Hello Nick,

On Sat, 2021-02-06 at 13:03 +1000, Nicholas Piggin wrote:
> Excerpts from Leonardo Bras's message of February 5, 2021 5:01 pm:
> > Hey Nick, thanks for reviewing :)
> > 
> > On Fri, 2021-02-05 at 16:28 +1000, Nicholas Piggin wrote:
> > > Excerpts from Leonardo Bras's message of February 5, 2021 4:06 pm:
> > > > Before guest entry, TBU40 register is changed to reflect guest timebase.
> > > > After exiting the guest, the register is reverted to its original value.
> > > > 
> > > > If one tries to get the timestamp from host between those changes, it
> > > > will present an incorrect value.
> > > > 
> > > > An example would be trying to add a tracepoint in
> > > > kvmppc_guest_entry_inject_int(), which depending on last tracepoint
> > > > acquired could actually cause the host to crash.
> > > > 
> > > > Save the Timebase Offset to PACA and use it on sched_clock() to always
> > > > get the correct timestamp.
> > > 
> > > Ouch. Not sure how reasonable it is to half switch into guest registers 
> > > and expect to call into the wider kernel, fixing things up as we go. 
> > > What if mftb is used in other places?
> > 
> > IIUC, the CPU is not supposed to call anything as host between guest
> > entry and guest exit, except guest-related cases, like
> 
> When I say "call", I'm including tracing in that. If a function is not 
> marked as no trace, then it will call into the tracing subsystem.
> 
> > kvmppc_guest_entry_inject_int(), but anyway, if something calls mftb it
> > will still get the same value as before.
> 
> Right, so it'll be out of whack again.
> 
> > This is only supposed to change stuff that depends on sched_clock, like
> > Tracepoints, that can happen in those exceptions.
> 
> If they depend on sched_clock that's one thing. Do they definitely have 
> no dependencies on mftb from other calls?

We could change that on get_tb() or mftb() @ timebase.h, which would
have a broader reach, but would not reach any mftb from asm code.

> > > Especially as it doesn't seem like there is a reason that function _has_
> > > to be called after the timebase is switched to guest, that's just how 
> > > the code is structured.
> > 
> > Correct, but if called, like in rb routines, used by tracepoints, the
> > difference between last tb and current (lower) tb may cause the CPU to
> > trap PROGRAM exception, crashing host. 
> 
> Yes, so I agree with Michael any function that is involved when we begin 
> to switch into guest context (or have not completed switching back to 
> host going the other way) should be marked as no trace (noinstr even, 
> perhaps).

Sure, that would avoid having to get paca->tb_offset for every mftb()
called, and avoid inconsistencies when different ways to get time are
used in code.

On the other hand, it would make it very hard to debug functions like
kvmppc_guest_entry_inject_int() as I am doing right now.

> 
> > > As a local hack to work out a bug okay. If you really need it upstream 
> > > could you put it under a debug config option?
> > 
> > You mean something that is automatically selected whenever those
> > configs are enabled? 
> > 
> > CONFIG_TRACEPOINT && CONFIG_KVM_BOOK3S_HANDLER && CONFIG_PPC_BOOK3S_64
> > 
> > Or something the user need to select himself in menuconfig?
> 
> Yeah I meant a default n thing under powerpc kernel debugging somewhere.

So, IIUC all we can do is split this in 2 changes:
1 - Adding notrace to those functions
2 - Introducing a kernel debug config that reverts (1) and 'fixes' mftb

If that's correct, I have some ideas we can use. 

For debug option, should we add the offset on get_tb() or mftb()?
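
For the get_tb() variant, I am thinking of something like this (a sketch
only, assuming a new, hypothetical CONFIG_PPC_DEBUG_GUEST_TB option and
reusing the hstate field added by this patch):

    static inline u64 get_tb_debug(void)
    {
            u64 tb = mftb();

    #ifdef CONFIG_PPC_DEBUG_GUEST_TB
            /* compensate for the guest timebase offset applied on entry */
            tb -= local_paca->kvm_hstate.tb_offset;
    #endif

            return tb;
    }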

Another option would be to add this tb_offset only in the routines
used by tracing. That would probably mean having to add a function
in arch-generic code, but it is still an option.

What do you think?

> 
> Thanks,
> Nick

Thank you!
Leonardo Bras



Re: [PATCH v2 1/1] powerpc/kvm: Save Timebase Offset to fix sched_clock() while running guest code.

2021-02-05 Thread Leonardo Bras
Hello Fabiano, 
Thanks for reviewing! 
(answers inline)

On Fri, 2021-02-05 at 10:09 -0300, Fabiano Rosas wrote:
> Leonardo Bras  writes:
> 
> > Before guest entry, TBU40 register is changed to reflect guest timebase.
> > After exiting the guest, the register is reverted to its original value.
> > 
> > If one tries to get the timestamp from host between those changes, it
> > will present an incorrect value.
> > 
> > An example would be trying to add a tracepoint in
> > kvmppc_guest_entry_inject_int(), which depending on last tracepoint
> > acquired could actually cause the host to crash.
> > 
> > Save the Timebase Offset to PACA and use it on sched_clock() to always
> > get the correct timestamp.
> > 
> > Signed-off-by: Leonardo Bras 
> > Suggested-by: Paul Mackerras 
> > ---
> > Changes since v1:
> > - Subtracts offset only when CONFIG_KVM_BOOK3S_HANDLER and
> >   CONFIG_PPC_BOOK3S_64 are defined.
> > ---
> >  arch/powerpc/include/asm/kvm_book3s_asm.h | 1 +
> >  arch/powerpc/kernel/asm-offsets.c | 1 +
> >  arch/powerpc/kernel/time.c| 8 +++-
> >  arch/powerpc/kvm/book3s_hv.c  | 2 ++
> >  arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 2 ++
> >  5 files changed, 13 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h 
> > b/arch/powerpc/include/asm/kvm_book3s_asm.h
> > index 078f4648ea27..e2c12a10eed2 100644
> > --- a/arch/powerpc/include/asm/kvm_book3s_asm.h
> > +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
> > @@ -131,6 +131,7 @@ struct kvmppc_host_state {
> >     u64 cfar;
> >     u64 ppr;
> >     u64 host_fscr;
> > +   u64 tb_offset;  /* Timebase offset: keeps correct
> > timebase while on guest */
> 
> Couldn't you use the vc->tb_offset_applied for this? We have a reference
> for the vcore in the hstate already.

But it's a pointer, which means we would have to keep checking for NULL
every time we need sched_clock(). 
Potentially it would cost a cache miss for the PACA memory region that
contains vc, and another for getting the part of *vc that contains
tb_offset_applied, instead of only one for the PACA struct region that
contains tb_offset.

On the other hand, it got me thinking: If the offset is applied per
cpu, why don't we get this info only in PACA, instead of in vc?
It could be a general way to get an offset applied for any purpose and
still get the sched_clock() right. 
(Not that I have any idea of any other purpose we could use it) 

Best regards!
Leonardo Bras

> 
> >  #endif
> >  };
> > 
> > diff --git a/arch/powerpc/kernel/asm-offsets.c 
> > b/arch/powerpc/kernel/asm-offsets.c
> > index b12d7c049bfe..0beb8fdc6352 100644
> > --- a/arch/powerpc/kernel/asm-offsets.c
> > +++ b/arch/powerpc/kernel/asm-offsets.c
> > @@ -706,6 +706,7 @@ int main(void)
> >     HSTATE_FIELD(HSTATE_CFAR, cfar);
> >     HSTATE_FIELD(HSTATE_PPR, ppr);
> >     HSTATE_FIELD(HSTATE_HOST_FSCR, host_fscr);
> > +   HSTATE_FIELD(HSTATE_TB_OFFSET, tb_offset);
> >  #endif /* CONFIG_PPC_BOOK3S_64 */
> > 
> >  #else /* CONFIG_PPC_BOOK3S */
> > diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
> > index 67feb3524460..f27f0163792b 100644
> > --- a/arch/powerpc/kernel/time.c
> > +++ b/arch/powerpc/kernel/time.c
> > @@ -699,7 +699,13 @@ EXPORT_SYMBOL_GPL(tb_to_ns);
> >   */
> >  notrace unsigned long long sched_clock(void)
> >  {
> > -   return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
> > +   u64 tb = get_tb() - boot_tb;
> > +
> > +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_HANDLER)
> > +   tb -= local_paca->kvm_hstate.tb_offset;
> > +#endif
> > +
> > +   return mulhdu(tb, tb_to_ns_scale) << tb_to_ns_shift;
> >  }
> > 
> > 
> > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> > index b3731572295e..c08593c63353 100644
> > --- a/arch/powerpc/kvm/book3s_hv.c
> > +++ b/arch/powerpc/kvm/book3s_hv.c
> > @@ -3491,6 +3491,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu 
> > *vcpu, u64 time_limit,
> >     if ((tb & 0xff) < (new_tb & 0xff))
> >     mtspr(SPRN_TBU40, new_tb + 0x100);
> >     vc->tb_offset_applied = vc->tb_offset;
> > +   local_paca->kvm_hstate.tb_offset = vc->tb_offset;
> >     }
> > 
> >     if (vc->pcr)
> > @@ -3594,6 +3595,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu 
> > *vcpu, u64 time_limit,
> &

Re: [PATCH v2 1/1] powerpc/kvm: Save Timebase Offset to fix sched_clock() while running guest code.

2021-02-04 Thread Leonardo Bras
Hey Nick, thanks for reviewing :)

On Fri, 2021-02-05 at 16:28 +1000, Nicholas Piggin wrote:
> Excerpts from Leonardo Bras's message of February 5, 2021 4:06 pm:
> > Before guest entry, TBU40 register is changed to reflect guest timebase.
> > After exiting the guest, the register is reverted to its original value.
> > 
> > If one tries to get the timestamp from host between those changes, it
> > will present an incorrect value.
> > 
> > An example would be trying to add a tracepoint in
> > kvmppc_guest_entry_inject_int(), which depending on last tracepoint
> > acquired could actually cause the host to crash.
> > 
> > Save the Timebase Offset to PACA and use it on sched_clock() to always
> > get the correct timestamp.
> 
> Ouch. Not sure how reasonable it is to half switch into guest registers 
> and expect to call into the wider kernel, fixing things up as we go. 
> What if mftb is used in other places?

IIUC, the CPU is not supposed to call anything as host between guest
entry and guest exit, except guest-related cases, like
kvmppc_guest_entry_inject_int(), but anyway, if something calls mftb it
will still get the same value as before.

This is only supposed to change stuff that depends on sched_clock, like
Tracepoints, that can happen in those exceptions.


> Especially as it doesn't seem like there is a reason that function _has_
> to be called after the timebase is switched to guest, that's just how 
> the code is structured.

Correct, but if called, like in the rb (ring-buffer) routines used by
tracepoints, the difference between the last tb and the current (lower)
tb may cause the CPU to trap with a PROGRAM exception, crashing the host. 

> As a local hack to work out a bug okay. If you really need it upstream 
> could you put it under a debug config option?

You mean something that is automatically selected whenever those
configs are enabled? 

CONFIG_TRACEPOINT && CONFIG_KVM_BOOK3S_HANDLER && CONFIG_PPC_BOOK3S_64

Or something the user need to select himself in menuconfig?

> 
> Thanks,
> Nick
> 

Thank you!
Leonardo Bras

> > Signed-off-by: Leonardo Bras 
> > Suggested-by: Paul Mackerras 
> > ---
> > Changes since v1:
> > - Subtracts offset only when CONFIG_KVM_BOOK3S_HANDLER and
> >   CONFIG_PPC_BOOK3S_64 are defined.
> > ---
> >  arch/powerpc/include/asm/kvm_book3s_asm.h | 1 +
> >  arch/powerpc/kernel/asm-offsets.c | 1 +
> >  arch/powerpc/kernel/time.c| 8 +++-
> >  arch/powerpc/kvm/book3s_hv.c  | 2 ++
> >  arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 2 ++
> >  5 files changed, 13 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h 
> > b/arch/powerpc/include/asm/kvm_book3s_asm.h
> > index 078f4648ea27..e2c12a10eed2 100644
> > --- a/arch/powerpc/include/asm/kvm_book3s_asm.h
> > +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
> > @@ -131,6 +131,7 @@ struct kvmppc_host_state {
> >     u64 cfar;
> >     u64 ppr;
> >     u64 host_fscr;
> > +   u64 tb_offset;  /* Timebase offset: keeps correct timebase 
> > while on guest */
> >  #endif
> >  };
> >  
> > 
> > 
> > diff --git a/arch/powerpc/kernel/asm-offsets.c 
> > b/arch/powerpc/kernel/asm-offsets.c
> > index b12d7c049bfe..0beb8fdc6352 100644
> > --- a/arch/powerpc/kernel/asm-offsets.c
> > +++ b/arch/powerpc/kernel/asm-offsets.c
> > @@ -706,6 +706,7 @@ int main(void)
> >     HSTATE_FIELD(HSTATE_CFAR, cfar);
> >     HSTATE_FIELD(HSTATE_PPR, ppr);
> >     HSTATE_FIELD(HSTATE_HOST_FSCR, host_fscr);
> > +   HSTATE_FIELD(HSTATE_TB_OFFSET, tb_offset);
> >  #endif /* CONFIG_PPC_BOOK3S_64 */
> >  
>

[PATCH v2 1/1] powerpc/kvm: Save Timebase Offset to fix sched_clock() while running guest code.

2021-02-04 Thread Leonardo Bras
Before guest entry, the TBU40 register is changed to reflect the guest
timebase. After exiting the guest, the register is reverted to its
original value.

If one tries to get the timestamp from the host between those changes, it
will present an incorrect value.

An example would be trying to add a tracepoint in
kvmppc_guest_entry_inject_int(), which, depending on the last tracepoint
acquired, could actually cause the host to crash.

Save the Timebase Offset to PACA and use it on sched_clock() to always
get the correct timestamp.

Signed-off-by: Leonardo Bras 
Suggested-by: Paul Mackerras 
---
Changes since v1:
- Subtracts offset only when CONFIG_KVM_BOOK3S_HANDLER and
  CONFIG_PPC_BOOK3S_64 are defined.
---
 arch/powerpc/include/asm/kvm_book3s_asm.h | 1 +
 arch/powerpc/kernel/asm-offsets.c | 1 +
 arch/powerpc/kernel/time.c| 8 +++-
 arch/powerpc/kvm/book3s_hv.c  | 2 ++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 2 ++
 5 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h 
b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 078f4648ea27..e2c12a10eed2 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -131,6 +131,7 @@ struct kvmppc_host_state {
u64 cfar;
u64 ppr;
u64 host_fscr;
+   u64 tb_offset;  /* Timebase offset: keeps correct timebase 
while on guest */
 #endif
 };
 
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index b12d7c049bfe..0beb8fdc6352 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -706,6 +706,7 @@ int main(void)
HSTATE_FIELD(HSTATE_CFAR, cfar);
HSTATE_FIELD(HSTATE_PPR, ppr);
HSTATE_FIELD(HSTATE_HOST_FSCR, host_fscr);
+   HSTATE_FIELD(HSTATE_TB_OFFSET, tb_offset);
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
 #else /* CONFIG_PPC_BOOK3S */
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 67feb3524460..f27f0163792b 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -699,7 +699,13 @@ EXPORT_SYMBOL_GPL(tb_to_ns);
  */
 notrace unsigned long long sched_clock(void)
 {
-   return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
+   u64 tb = get_tb() - boot_tb;
+
+#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_HANDLER)
+   tb -= local_paca->kvm_hstate.tb_offset;
+#endif
+
+   return mulhdu(tb, tb_to_ns_scale) << tb_to_ns_shift;
 }
 
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index b3731572295e..c08593c63353 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3491,6 +3491,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu 
*vcpu, u64 time_limit,
if ((tb & 0xff) < (new_tb & 0xff))
mtspr(SPRN_TBU40, new_tb + 0x100);
vc->tb_offset_applied = vc->tb_offset;
+   local_paca->kvm_hstate.tb_offset = vc->tb_offset;
}
 
if (vc->pcr)
@@ -3594,6 +3595,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu 
*vcpu, u64 time_limit,
if ((tb & 0xff) < (new_tb & 0xff))
mtspr(SPRN_TBU40, new_tb + 0x100);
vc->tb_offset_applied = 0;
+   local_paca->kvm_hstate.tb_offset = 0;
}
 
mtspr(SPRN_HDEC, 0x7fff);
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index b73140607875..8f7a9f7f4ee6 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -632,6 +632,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
cmpdi   r8,0
beq 37f
std r8, VCORE_TB_OFFSET_APPL(r5)
+   std r8, HSTATE_TB_OFFSET(r13)
mftb    r6  /* current host timebase */
add r8,r8,r6
mtspr   SPRN_TBU40,r8   /* update upper 40 bits */
@@ -1907,6 +1908,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
beq 17f
li  r0, 0
std r0, VCORE_TB_OFFSET_APPL(r5)
+   std r0, HSTATE_TB_OFFSET(r13)
mftb    r6  /* current guest timebase */
subf    r8,r8,r6
mtspr   SPRN_TBU40,r8   /* update upper 40 bits */
-- 
2.29.2



[PATCH 1/1] powerpc/kvm: Save Timebase Offset to fix sched_clock() while running guest code.

2021-02-04 Thread Leonardo Bras
Before guest entry, the TBU40 register is changed to reflect the guest
timebase. After exiting the guest, the register is reverted to its
original value.

If one tries to get the timestamp from the host between those changes, it
will present an incorrect value.

An example would be trying to add a tracepoint in
kvmppc_guest_entry_inject_int(), which, depending on the last tracepoint
acquired, could actually cause the host to crash.

Save the Timebase Offset to PACA and use it on sched_clock() to always
get the correct timestamp.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/include/asm/kvm_book3s_asm.h | 1 +
 arch/powerpc/kernel/asm-offsets.c | 1 +
 arch/powerpc/kernel/time.c| 3 ++-
 arch/powerpc/kvm/book3s_hv.c  | 2 ++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 2 ++
 5 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h 
b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 078f4648ea27..e2c12a10eed2 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -131,6 +131,7 @@ struct kvmppc_host_state {
u64 cfar;
u64 ppr;
u64 host_fscr;
+   u64 tb_offset;  /* Timebase offset: keeps correct timebase 
while on guest */
 #endif
 };
 
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index b12d7c049bfe..0beb8fdc6352 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -706,6 +706,7 @@ int main(void)
HSTATE_FIELD(HSTATE_CFAR, cfar);
HSTATE_FIELD(HSTATE_PPR, ppr);
HSTATE_FIELD(HSTATE_HOST_FSCR, host_fscr);
+   HSTATE_FIELD(HSTATE_TB_OFFSET, tb_offset);
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
 #else /* CONFIG_PPC_BOOK3S */
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 67feb3524460..adf6648e3572 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -699,7 +699,8 @@ EXPORT_SYMBOL_GPL(tb_to_ns);
  */
 notrace unsigned long long sched_clock(void)
 {
-   return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
+   return mulhdu(get_tb() - boot_tb - local_paca->kvm_hstate.tb_offset, 
tb_to_ns_scale)
+   << tb_to_ns_shift;
 }
 
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index b3731572295e..c08593c63353 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3491,6 +3491,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu 
*vcpu, u64 time_limit,
if ((tb & 0xff) < (new_tb & 0xff))
mtspr(SPRN_TBU40, new_tb + 0x100);
vc->tb_offset_applied = vc->tb_offset;
+   local_paca->kvm_hstate.tb_offset = vc->tb_offset;
}
 
if (vc->pcr)
@@ -3594,6 +3595,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu 
*vcpu, u64 time_limit,
if ((tb & 0xff) < (new_tb & 0xff))
mtspr(SPRN_TBU40, new_tb + 0x100);
vc->tb_offset_applied = 0;
+   local_paca->kvm_hstate.tb_offset = 0;
}
 
mtspr(SPRN_HDEC, 0x7fff);
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index b73140607875..8f7a9f7f4ee6 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -632,6 +632,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
cmpdi   r8,0
beq 37f
std r8, VCORE_TB_OFFSET_APPL(r5)
+   std r8, HSTATE_TB_OFFSET(r13)
mftb    r6  /* current host timebase */
add r8,r8,r6
mtspr   SPRN_TBU40,r8   /* update upper 40 bits */
@@ -1907,6 +1908,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
beq 17f
li  r0, 0
std r0, VCORE_TB_OFFSET_APPL(r5)
+   std r0, HSTATE_TB_OFFSET(r13)
mftb    r6  /* current guest timebase */
subf    r8,r8,r6
mtspr   SPRN_TBU40,r8   /* update upper 40 bits */
-- 
2.29.2



[PATCH 1/1] kernel/smp: Split call_single_queue into 3 queues

2021-01-27 Thread Leonardo Bras
Currently, during flush_smp_call_function_queue():
- All items are traversed once, for inverting.
- The SYNC items are traversed twice.
- The ASYNC & IRQ_WORK items are traversed three times.
- The TTWU items are traversed four times.

Also, a lot of extra work is done to keep track of and remove the items
already processed in each step.

By using three queues, it's possible to avoid all this work, and
all items in the list are traversed only twice: once for inverting,
and once for processing.

In exchange, this requires 2 extra llist_del_all() calls at the beginning
of flush_smp_call_function_queue(), and some extra logic to decide
the correct queue to add the desired csd to.

This is not supposed to cause any change in the order the items are
processed, but will change the order of printing (cpu offlining)
to the order the items will be processed.

(The above traversal count ignores the cpu-offlining case, in
which all items would be traversed again, in both cases.)

Signed-off-by: Leonardo Bras 
---
 kernel/smp.c | 173 ++-
 1 file changed, 87 insertions(+), 86 deletions(-)

diff --git a/kernel/smp.c b/kernel/smp.c
index 1b6070bf97bb..67fb415873f9 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -37,7 +37,13 @@ struct call_function_data {
 
 static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data);
 
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
+struct call_multi_queue {
+   struct llist_head sync;
+   struct llist_head async_n_irq_work;
+   struct llist_head ttwu;
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_multi_queue, call_mq);
 
 static void flush_smp_call_function_queue(bool warn_cpu_offline);
 
@@ -93,8 +99,13 @@ void __init call_function_init(void)
 {
int i;
 
-   for_each_possible_cpu(i)
-   init_llist_head(&per_cpu(call_single_queue, i));
+   for_each_possible_cpu(i) {
+   struct call_multi_queue *cmq = &per_cpu(call_mq, i);
+
+   init_llist_head(&cmq->sync);
+   init_llist_head(&cmq->async_n_irq_work);
+   init_llist_head(&cmq->ttwu);
+   }
 
smpcfd_prepare_cpu(smp_processor_id());
 }
@@ -253,6 +264,31 @@ static __always_inline void csd_unlock(call_single_data_t 
*csd)
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
 
+static __always_inline bool smp_add_to_queue(int cpu, struct llist_node *node)
+{
+   struct llist_head *head;
+   call_single_data_t *csd = llist_entry(node, call_single_data_t, 
node.llist);
+   struct call_multi_queue *cmq = &per_cpu(call_mq, cpu);
+
+   switch (CSD_TYPE(csd)) {
+   case CSD_TYPE_SYNC:
+   head = &cmq->sync;
+   break;
+   case CSD_TYPE_ASYNC:
+   case CSD_TYPE_IRQ_WORK:
+   head = &cmq->async_n_irq_work;
+   break;
+   case CSD_TYPE_TTWU:
+   head = &cmq->ttwu;
+   break;
+   default:
+   pr_warn("IPI callback, unknown type %d blocked from %d\n", cpu, 
CSD_TYPE(csd));
+   return false;
+   }
+
+   return llist_add(node, head);
+}
+
 void __smp_call_single_queue(int cpu, struct llist_node *node)
 {
/*
@@ -266,7 +302,7 @@ void __smp_call_single_queue(int cpu, struct llist_node 
*node)
 * locking and barrier primitives. Generic code isn't really
 * equipped to do the right thing...
 */
-   if (llist_add(node, &per_cpu(call_single_queue, cpu)))
+   if (smp_add_to_queue(cpu, node))
send_call_function_single_ipi(cpu);
 }
 
@@ -327,124 +363,89 @@ void generic_smp_call_function_single_interrupt(void)
  * to ensure that all pending IPI callbacks are run before it goes completely
  * offline.
  *
- * Loop through the call_single_queue and run all the queued callbacks.
+ * Loop through the call_multi_queue->* and run all the queued callbacks.
  * Must be called with interrupts disabled.
  */
 static void flush_smp_call_function_queue(bool warn_cpu_offline)
 {
-   call_single_data_t *csd, *csd_next;
-   struct llist_node *entry, *prev;
-   struct llist_head *head;
+   call_single_data_t *csd;
+   struct llist_node *entry_sync, *entry_async_n_irq_work, *entry_ttwu;
+   struct call_multi_queue *cmq;
static bool warned;
 
lockdep_assert_irqs_disabled();
 
-   head = this_cpu_ptr(&call_single_queue);
-   entry = llist_del_all(head);
-   entry = llist_reverse_order(entry);
+   cmq = this_cpu_ptr(&call_mq);
+   entry_sync = llist_del_all(&cmq->sync);
+   entry_async_n_irq_work = llist_del_all(&cmq->async_n_irq_work);
+   entry_ttwu = llist_del_all(&cmq->ttwu);
+
+   entry_sync = llist_reverse_order(entry_sync);
+   entry_async_n_irq_work = llist_reverse_order(entry_async_n_irq_work);
+   entry_ttwu = llist_reverse_order(entry_ttwu);
 
/* There shouldn't be any pending callbacks on an offline C

[PATCH v2 1/1] powerpc/kvm: Fix mask size for emulated msgsndp

2020-12-08 Thread Leonardo Bras
According to ISAv3.1 and ISAv3.0b, msgsndp is described as splitting RB into:
msgtype <- (RB) 32:36
payload <- (RB) 37:63
t   <- (RB) 57:63

The current way of getting 'msgtype' and 't' is missing their MSB:
msgtype: ((arg >> 27) & 0xf) : Gets (RB) 33:36, missing bit 32
t:   (arg &= 0x3f)   : Gets (RB) 58:63, missing bit 57

Fix this by applying the correct mask.
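
For illustration, a stand-alone sketch of the field extraction with the
corrected masks (the RB value is an arbitrary example; bit positions use
the ISA's big-endian numbering, so only the field width sets the mask):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t rb = 0x0000000828000005ULL;    /* arbitrary example */

            unsigned int msgtype = (rb >> 27) & 0x1f;   /* (RB) 32:36, 5 bits */
            unsigned int t = rb & 0x7f;                 /* (RB) 57:63, 7 bits */

            printf("msgtype=%u t=%u\n", msgtype, t);    /* msgtype=5 t=5 */
            return 0;
    }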

Signed-off-by: Leonardo Bras 
---
Changes since v1:
- Commit message 's/LSB/MSB/', because ISA ordering is big-endian.

 arch/powerpc/kvm/book3s_hv.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index e3b1839fc251..5af0a429cee8 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1241,9 +1241,9 @@ static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu 
*vcpu)
switch (get_xop(inst)) {
case OP_31_XOP_MSGSNDP:
arg = kvmppc_get_gpr(vcpu, rb);
-   if (((arg >> 27) & 0xf) != PPC_DBELL_SERVER)
+   if (((arg >> 27) & 0x1f) != PPC_DBELL_SERVER)
break;
-   arg &= 0x3f;
+   arg &= 0x7f;
if (arg >= kvm->arch.emul_smt_mode)
break;
tvcpu = kvmppc_find_vcpu(kvm, vcpu->vcpu_id - thr + arg);
@@ -1256,7 +1256,7 @@ static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu 
*vcpu)
break;
case OP_31_XOP_MSGCLRP:
arg = kvmppc_get_gpr(vcpu, rb);
-   if (((arg >> 27) & 0xf) != PPC_DBELL_SERVER)
+   if (((arg >> 27) & 0x1f) != PPC_DBELL_SERVER)
break;
vcpu->arch.vcore->dpdes = 0;
vcpu->arch.doorbell_request = 0;
-- 
2.25.4



[PATCH 1/1] powerpc/kvm: Fix mask size for emulated msgsndp

2020-11-11 Thread Leonardo Bras
According to ISAv3.1 and ISAv3.0b, msgsndp is described as splitting RB into:
msgtype <- (RB) 32:36
payload <- (RB) 37:63
t   <- (RB) 57:63

The current way of getting 'msgtype' and 't' is missing their LSB:
msgtype: ((arg >> 27) & 0xf) : Gets (RB) 33:36, missing bit 32
t:   (arg &= 0x3f)   : Gets (RB) 58:63, missing bit 57

Fix this by applying the correct mask.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/kvm/book3s_hv.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index e3b1839fc251..5af0a429cee8 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1241,9 +1241,9 @@ static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu 
*vcpu)
switch (get_xop(inst)) {
case OP_31_XOP_MSGSNDP:
arg = kvmppc_get_gpr(vcpu, rb);
-   if (((arg >> 27) & 0xf) != PPC_DBELL_SERVER)
+   if (((arg >> 27) & 0x1f) != PPC_DBELL_SERVER)
break;
-   arg &= 0x3f;
+   arg &= 0x7f;
if (arg >= kvm->arch.emul_smt_mode)
break;
tvcpu = kvmppc_find_vcpu(kvm, vcpu->vcpu_id - thr + arg);
@@ -1256,7 +1256,7 @@ static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu 
*vcpu)
break;
case OP_31_XOP_MSGCLRP:
arg = kvmppc_get_gpr(vcpu, rb);
-   if (((arg >> 27) & 0xf) != PPC_DBELL_SERVER)
+   if (((arg >> 27) & 0x1f) != PPC_DBELL_SERVER)
break;
vcpu->arch.vcore->dpdes = 0;
vcpu->arch.doorbell_request = 0;
-- 
2.25.4



Re: [PATCH 0/3] Support NVIDIA Tegra-based Ouya game console

2020-09-16 Thread Leonardo Bras
On Wed, 2020-09-16 at 12:22 +, Peter Geis wrote:
> Good Day,
> 
> This series introduces upstream kernel support for the Ouya game console 
> device. Please review and apply. Thank you in advance.
> 
> Peter Geis (3):
>   ARM: tegra: Add device-tree for Ouya
>   dt-bindings: Add vendor prefix for Ouya Inc.
>   dt-bindings: ARM: tegra: Add Ouya game console
> 
>  .../devicetree/bindings/arm/tegra.yaml|3 +
>  .../devicetree/bindings/vendor-prefixes.yaml  |2 +
>  arch/arm/boot/dts/Makefile|3 +-
>  arch/arm/boot/dts/tegra30-ouya.dts| 4498 +
>  4 files changed, 4505 insertions(+), 1 deletion(-)
>  create mode 100644 arch/arm/boot/dts/tegra30-ouya.dts
> 

Hello Peter, seems like great work :)

Does this work on that device I sent you info about back on April 14th?

Best regards!




Re: [PATCH v1 02/10] powerpc/kernel/iommu: Align size for IOMMU_PAGE_SIZE on iommu_*_coherent()

2020-09-04 Thread Leonardo Bras
On Thu, 2020-09-03 at 14:41 +1000, Alexey Kardashevskiy wrote:
> I am new to this, so I am trying to understand how a memory page mapped
> > as DMA, and used for something else could be a problem.
> 
>  From the device prospective, there is PCI space and everything from 0 
> till 1<<64 is accessible and what is that mapped to - the device does 
> not know. PHB's IOMMU is the thing to notice invalid access and raise 
> EEH but PHB only knows about PCI->physical memory mapping (with IOMMU 
> pages) but nothing about the host kernel pages. Does this help? Thanks,

According to our conversation on Slack:
1- There is a problem if a hypervisor gives its VMs contiguous
memory blocks that are not aligned to IOMMU pages, because then an 
iommu_map_page() could map some memory in this VM and some memory in
another VM / process.
2- To guarantee this, we should have system pagesize >= iommu_pagesize 

One way to get (2) is by doing this in enable_ddw():
	if ((query.page_size & 4) && PAGE_SHIFT >= 24) {
		page_shift = 24; /* 16MB */
	} else if ((query.page_size & 2) && PAGE_SHIFT >= 16) {
		page_shift = 16; /* 64kB */
	} else if ((query.page_size & 1) && PAGE_SHIFT >= 12) {
		page_shift = 12; /* 4kB */
	[...]

Another way of solving this would be adding to the LoPAR documentation
that the block size of contiguous memory the hypervisor gives a VM
should always be aligned to the offered IOMMU page size.

I think the best approach would be to first send the above patch, which
is faster, and then work on adding that to the documentation, so
hypervisors guarantee this.

If this gets into the docs, we can revert the patch.

What do you think?

Best regards!



Re: [PATCH v1 09/10] powerpc/pseries/iommu: Make use of DDW even if it does not map the partition

2020-09-02 Thread Leonardo Bras
On Mon, 2020-08-31 at 14:35 +1000, Alexey Kardashevskiy wrote:
> 
> On 29/08/2020 04:36, Leonardo Bras wrote:
> > On Mon, 2020-08-24 at 15:17 +1000, Alexey Kardashevskiy wrote:
> > > On 18/08/2020 09:40, Leonardo Bras wrote:
> > > > As of today, if the biggest DDW that can be created can't map the whole
> > > > partition, it's creation is skipped and the default DMA window
> > > > "ibm,dma-window" is used instead.
> > > > 
> > > > DDW is 16x bigger than the default DMA window,
> > > 
> > > 16x only under very specific circumstances which are
> > > 1. phyp
> > > 2. sriov
> > > 3. device class in hmc (or what that priority number is in the lpar 
> > > config).
> > 
> > Yeah, missing details.
> > 
> > > > having the same amount of
> > > > pages, but increasing the page size to 64k.
> > > > Besides larger DMA window,
> > > 
> > > "Besides being larger"?
> > 
> > You are right there.
> > 
> > > > it performs better for allocations over 4k,
> > > 
> > > Better how?
> > 
> > I was thinking for allocations larger than (512 * 4k), since >2
> > hypercalls are needed here, and for 64k pages would still be just 1
> > hypercall up to (512 * 64k). 
> > But yeah, not the usual case anyway.
> 
> Yup.
> 
> 
> > > > so it would be nice to use it instead.
> > > 
> > > I'd rather say something like:
> > > ===
> > > So far we assumed we can map the guest RAM 1:1 to the bus which worked
> > > with a small number of devices. SRIOV changes it as the user can
> > > configure hundreds VFs and since phyp preallocates TCEs and does not
> > > allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
> > > per a PE to limit waste of physical pages.
> > > ===
> > 
> > I mixed this in my commit message, it looks like this:
> > 
> > ===
> > powerpc/pseries/iommu: Make use of DDW for indirect mapping
> > 
> > So far it's assumed possible to map the guest RAM 1:1 to the bus, which
> > works with a small number of devices. SRIOV changes it as the user can
> > configure hundreds VFs and since phyp preallocates TCEs and does not
> > allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
> > per a PE to limit waste of physical pages.
> > 
> > As of today, if the assumed direct mapping is not possible, DDW
> > creation is skipped and the default DMA window "ibm,dma-window" is used
> > instead.
> > 
> > The default DMA window uses 4k pages instead of 64k pages, and since
> > the amount of pages is the same,
> 
> Is the amount really the same? I thought you can prioritize some VFs
> over others (== allocate different number of TCEs). Does it really
> matter if it is the same?

In a conversation with Travis Pizel, he explained how it's supposed to
work, and I understood this:

When a VF is created, it will be assigned a capacity, like 4%, 20%, and
so on. The number of 'TCE entries' that are available to that partition
is proportional to that capacity. 

If we use the default DMA window, the IOMMU pagesize/entry will be 4k,
and if we use DDW, we will get 64k pagesize. As the number of entries
will be the same (for the same capacity), the total space that can be
addressed by the IOMMU will be 16 times bigger. This sometimes enables
direct mapping, but sometimes it's still not enough.

On Travis words :
"A low capacity VF, with less resources available, will certainly have
less DMA window capability than a high capacity VF. But, an 8GB DMA
window (with 64k pages) is still 16x larger than an 512MB window (with
4K pages).
A high capacity VF - for example, one that Leonardo has in his scenario
- will go from 8GB (using 4K pages) to 128GB (using 64K pages) - again,
16x larger - but it's obviously still possible to create a partition
that exceeds 128GB of memory in size."

> 
> 
> > making use of DDW instead of the
> > default DMA window for indirect mapping will expand in 16x the amount
> > of memory that can be mapped on DMA.
> 
> Stop saying "16x", it is not guaranteed by anything :)
> 
> 
> > The DDW created will be used for direct mapping by default. [...]
> > ===
> > 
> > What do you think?
> > 
> > > > The DDW created will be used for direct mapping by default.
> > > > If it's not available, indirect mapping will be used instead.
> > > > 
> > > > For indirect mapping, it's necessary to update the iommu_table so
> > > > iommu_alloc() can use the 

Re: [PATCH v1 08/10] powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()

2020-09-01 Thread Leonardo Bras
On Mon, 2020-08-31 at 14:34 +1000, Alexey Kardashevskiy wrote:
> 
> On 29/08/2020 01:25, Leonardo Bras wrote:
> > On Mon, 2020-08-24 at 15:07 +1000, Alexey Kardashevskiy wrote:
> > > On 18/08/2020 09:40, Leonardo Bras wrote:
> > > > Code used to create a ddw property that was previously scattered in
> > > > enable_ddw() is now gathered in ddw_property_create(), which deals with
> > > > allocation and filling the property, letting it ready for
> > > > of_property_add(), which now occurs in sequence.
> > > > 
> > > > This created an opportunity to reorganize the second part of 
> > > > enable_ddw():
> > > > 
> > > > Without this patch enable_ddw() does, in order:
> > > > kzalloc() property & members, create_ddw(), fill ddwprop inside 
> > > > property,
> > > > ddw_list_add(), do tce_setrange_multi_pSeriesLP_walk in all memory,
> > > > of_add_property().
> > > > 
> > > > With this patch enable_ddw() does, in order:
> > > > create_ddw(), ddw_property_create(), of_add_property(), ddw_list_add(),
> > > > do tce_setrange_multi_pSeriesLP_walk in all memory.
> > > > 
> > > > This change requires of_remove_property() in case anything fails after
> > > > of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk
> > > > in all memory, which looks the most expensive operation, only if
> > > > everything else succeeds.
> > > > 
> > > > Signed-off-by: Leonardo Bras 
> > > > ---
> > > >  arch/powerpc/platforms/pseries/iommu.c | 97 +++---
> > > >  1 file changed, 57 insertions(+), 40 deletions(-)
> > > > 
> > > > diff --git a/arch/powerpc/platforms/pseries/iommu.c 
> > > > b/arch/powerpc/platforms/pseries/iommu.c
> > > > index 4031127c9537..3a1ef02ad9d5 100644
> > > > --- a/arch/powerpc/platforms/pseries/iommu.c
> > > > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > > > @@ -1123,6 +1123,31 @@ static void reset_dma_window(struct pci_dev 
> > > > *dev, struct device_node *par_dn)
> > > >  ret);
> > > >  }
> > > >  
> > > > +static int ddw_property_create(struct property **ddw_win, const char 
> > > > *propname,
> > > 
> > > @propname is always the same, do you really want to pass it every time?
> > 
> > I think it reads better, like "create a ddw property with this name".
> 
> This reads as "there are at least two ddw properties".
> 
> > Also, it makes it possible to create ddw properties with other names, in
> > case we decide to create properties with different names depending on
> > the window created.
> 
> It is one window at any given moment, why call it different names... I
> get the part that it is not always "direct" anymore but still...
> 

That seems to be the case for one of the options you suggested on patch [09/10]:

>> I suspect it breaks kexec (from older kernel to this one) so you
>> either need to check for both DT names or just keep the old one.

> 
> > Also, it's probably optimized / inlined at this point.
> > Is it ok doing it like this?
> > 
> > > > +  u32 liobn, u64 dma_addr, u32 page_shift, 
> > > > u32 window_shift)
> > > > +{
> > > > +   struct dynamic_dma_window_prop *ddwprop;
> > > > +   struct property *win64;
> > > > +
> > > > +   *ddw_win = win64 = kzalloc(sizeof(*win64), GFP_KERNEL);
> > > > +   if (!win64)
> > > > +   return -ENOMEM;
> > > > +
> > > > +   win64->name = kstrdup(propname, GFP_KERNEL);
> > > 
> > > Not clear why "win64->name = DIRECT64_PROPNAME" would not work here, the
> > > generic OF code does not try kfree() it but it is probably out of scope
> > > here.
> > 
> > Yeah, I had that question too. 
> > The previous code was like that, and I was trying not to mess too much
> > with how it's done.
> > 
> > > > +   ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL);
> > > > +   win64->value = ddwprop;
> > > > +   win64->length = sizeof(*ddwprop);
> > > > +   if (!win64->name || !win64->value)
> > > > +   return -ENOMEM;
> > > 
> > > Up to 2 memory leaks here. I see the cleanup at "out_free_prop:" but
> > > still l

Re: [PATCH v1 02/10] powerpc/kernel/iommu: Align size for IOMMU_PAGE_SIZE on iommu_*_coherent()

2020-09-01 Thread Leonardo Bras
On Mon, 2020-08-31 at 10:47 +1000, Alexey Kardashevskiy wrote:
> > 
> > Maybe testing with host 64k pagesize and IOMMU 16MB pagesize in qemu
> > should be enough, is there any chance to get indirect mapping in qemu
> > like this? (DDW but with smaller DMA window available) 
> 
> You will have to hack the guest kernel to always do indirect mapping or
> hack QEMU's rtas_ibm_query_pe_dma_window() to return a small number of
> available TCEs. But you will be testing QEMU/KVM which behave quite
> differently to pHyp in this particular case.
> 

As you suggested before, building for 4k cpu pagesize should be the
best approach. It would allow testing for both pHyp and qemu scenarios.

> > > > > Because if we want the former (==support), then we'll have to align 
> > > > > the
> > > > > size up to the bigger page size when allocating/zeroing system pages,
> > > > > etc. 
> > > > 
> > > > This part I don't understand. Why do we need to align everything to the
> > > > bigger pagesize? 
> > > > 
> > > > I mean, is not that enough that the range [ret, ret + size[ is both
> > > > allocated by mm and mapped on a iommu range?
> > > > 
> > > > Suppose a iommu_alloc_coherent() of 16kB on PAGESIZE = 4k and
> > > > IOMMU_PAGE_SIZE() == 64k.
> > > > Why 4 * cpu_pages mapped by a 64k IOMMU page is not enough? 
> > > > All the space the user asked for is allocated and mapped for DMA.
> > > 
> > > The user asked to map 16K, the rest - 48K - is used for something else
> > > (may be even mapped to another device) but you are making all 64K
> > > accessible by the device which only should be able to access 16K.
> > > 
> > > In practice, if this happens, H_PUT_TCE will simply fail.
> > 
> > I have noticed mlx5 driver getting a few bytes in a buffer, and using
> > iommu_map_page(). It does map a whole page for as few bytes as the user
> 
> Whole 4K system page or whole 64K iommu page?

I tested it in 64k system page + 64k iommu page.

The 64K system page may be used for anything, and only a small portion of it
(say 128 bytes) needs to be used for DMA.
The whole page is mapped by the IOMMU, and the driver is given the
memory range it should access / modify.

> 
> > wants mapped, and the other bytes get used for something else, or just
> > mapped on another DMA page.
> > It seems to work fine.  
> 
> 
> With 4K system page and 64K IOMMU page? In practice it would take an
> effort or/and bad luck to see it crashing. Thanks,

I haven't tested it yet. On a 64k system page and 4k/64k iommu page, it
works as described above.

I am new to this, so I am trying to understand how a memory page that is
mapped for DMA and also used for something else could be a problem.

Thanks!

> 
> > > 
> > > > > Bigger pages are not the case here as I understand it.
> > > > 
> > > > I did not get this part, what do you mean?
> > > 
> > > Possible IOMMU page sizes are 4K, 64K, 2M, 16M, 256M, 1GB, and the
> > > supported set of sizes is different for P8/P9 and type of IO (PHB,
> > > NVLink/CAPI).
> > > 
> > > 
> > > > > > Update those functions to guarantee alignment with requested size
> > > > > > using IOMMU_PAGE_ALIGN() before doing iommu_alloc() / iommu_free().
> > > > > > 
> > > > > > Also, on iommu_range_alloc(), replace ALIGN(n, 1 << 
> > > > > > tbl->it_page_shift)
> > > > > > with IOMMU_PAGE_ALIGN(n, tbl), which seems easier to read.
> > > > > > 
> > > > > > Signed-off-by: Leonardo Bras 
> > > > > > ---
> > > > > >  arch/powerpc/kernel/iommu.c | 17 +
> > > > > >  1 file changed, 9 insertions(+), 8 deletions(-)
> > > > > > 
> > > > > > diff --git a/arch/powerpc/kernel/iommu.c 
> > > > > > b/arch/powerpc/kernel/iommu.c
> > > > > > index 9704f3f76e63..d7086087830f 100644
> > > > > > --- a/arch/powerpc/kernel/iommu.c
> > > > > > +++ b/arch/powerpc/kernel/iommu.c
> > > > > > @@ -237,10 +237,9 @@ static unsigned long iommu_range_alloc(struct 
> > > > > > device *dev,
> > > > > > }
> > > > > >  
> > > > > > if (dev)
> > > > > > -   boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
> > > > > > - 1 << tbl->it_page_shift);
> &

Re: [PATCH v1 01/10] powerpc/pseries/iommu: Replace hard-coded page shift

2020-09-01 Thread Leonardo Bras
On Mon, 2020-08-31 at 13:48 +1000, Alexey Kardashevskiy wrote:
> > > > Well, I created this TCE_RPN_BITS = 52 because the previous mask was a
> > > > hardcoded 40-bit mask (0xfful), for hard-coded 12-bit (4k)
> > > > pagesize, and on PAPR+/LoPAR also defines TCE as having bits 0-51
> > > > described as RPN, as described before.
> > > > 
> > > > IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figure 3.4 and 3.5.
> > > > shows system memory mapping into a TCE, and the TCE also has bits 0-51
> > > > for the RPN (52 bits). "Table 3.6. TCE Definition" also shows it.
> > > > In fact, by the looks of those figures, the RPN_MASK should always be a
> > > > 52-bit mask, and RPN = (page >> tceshift) & RPN_MASK.
> > > 
> > > I suspect the mask is there in the first place for extra protection
> > > against too big addresses going to the TCE table (or/and for virtial vs
> > > physical addresses). Using 52bit mask makes no sense for anything, you
> > > could just drop the mask and let c compiler deal with 64bit "uint" as it
> > > is basically a 4K page address anywhere in the 64bit space. Thanks,
> > 
> > Assuming 4K pages you need 52 RPN bits to cover the whole 64bit
> > physical address space. The IODA3 spec does explicitly say the upper
> > bits are optional and the implementation only needs to support enough
> > to cover up to the physical address limit, which is 56bits of P9 /
> > PHB4. If you want to validate that the address will fit inside of
> > MAX_PHYSMEM_BITS then fine, but I think that should be done as a
> > WARN_ON or similar rather than just silently masking off the bits.
> 
> We can do this and probably should anyway but I am also pretty sure we
> can just ditch the mask and have the hypervisor return an error which
> will show up in dmesg.

Ok then, ditching the mask.
Thanks!



[RFC PATCH 1/2] dma-direction: Add DMA_DIR_COMPAT() macro to test direction compatibility

2020-08-31 Thread Leonardo Bras
Given an existing mapping with 'current' direction, and a 'wanted'
direction for using that mapping, check if 'wanted' is satisfied by
'current'.

current             accepts
DMA_BIDIRECTIONAL   DMA_BIDIRECTIONAL, DMA_TO_DEVICE, DMA_FROM_DEVICE, DMA_NONE
DMA_TO_DEVICE       DMA_TO_DEVICE, DMA_NONE
DMA_FROM_DEVICE     DMA_FROM_DEVICE, DMA_NONE
DMA_NONE            DMA_NONE

This macro is useful for checking if a DMA mapping can be reused.

Signed-off-by: Leonardo Bras 
---
 include/linux/dma-direction.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/linux/dma-direction.h b/include/linux/dma-direction.h
index 9c96e30e6a0b..caf3943a21f4 100644
--- a/include/linux/dma-direction.h
+++ b/include/linux/dma-direction.h
@@ -9,4 +9,7 @@ enum dma_data_direction {
DMA_NONE = 3,
 };
 
+/* Checks if wanted direction is satisfied by current mapping direction */
+#define DMA_DIR_COMPAT(current, wanted)	(((current) & ~(wanted)) == 0)
+
 #endif
-- 
2.25.4
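
Just to illustrate the intended use (not part of the patch; the helper name and
the 'cached' parameter are made up for the example, standing for whatever
direction an existing mapping was created with):

#include <linux/dma-direction.h>

/* Illustration only: decide whether an existing mapping can serve a new request */
static bool mapping_reusable(enum dma_data_direction cached,
			     enum dma_data_direction wanted)
{
	/* e.g. a DMA_BIDIRECTIONAL mapping satisfies a DMA_TO_DEVICE request */
	return DMA_DIR_COMPAT(cached, wanted);
}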



[RFC PATCH 0/2] DMA pagecache

2020-08-31 Thread Leonardo Bras
This RFC improves the performance of indirect mapping in all tested DMA
workloads, based on an mlx5 device, ranging from 64k packets to 1-byte
packets, and from 1 thread to 64 threads.

In all workloads tested, the performance of indirect mapping gets very
close to the direct mapping case.

The whole thing is designed to have as much performance as possible, so
the impact of the pagecache is not too big.

As I am not very experienced with XArray usage, nor with lockless
algorithms, I would especially appreciate feedback on possible
failures in its usage, missing barriers, and so on.

Also, this FIFO size is just for testing purposes.
It's also very possible that it will not be a good idea on platforms
other than pseries (I have not tested them).
I can plan a bypass for those cases without much work.

Thank you!

Leonardo Bras (2):
  dma-direction: Add DMA_DIR_COMPAT() macro to test direction
compatibility
  powerpc/kernel/iommu: Introduce IOMMU DMA pagecache

 arch/powerpc/include/asm/iommu-cache.h |  31 
 arch/powerpc/include/asm/iommu.h   |   4 +
 arch/powerpc/kernel/Makefile   |   2 +-
 arch/powerpc/kernel/iommu-cache.c  | 247 +
 arch/powerpc/kernel/iommu.c|  15 +-
 include/linux/dma-direction.h  |   3 +
 6 files changed, 296 insertions(+), 6 deletions(-)
 create mode 100644 arch/powerpc/include/asm/iommu-cache.h
 create mode 100644 arch/powerpc/kernel/iommu-cache.c

-- 
2.25.4



[RFC PATCH 2/2] powerpc/kernel/iommu: Introduce IOMMU DMA pagecache

2020-08-31 Thread Leonardo Bras
In pseries, mapping a DMA page for a cpu memory range requires an
H_PUT_TCE* hypercall, and unmapping requires an H_STUFF_TCE hypercall.
When doing a lot of I/O, a thread can spend a lot of time doing such
hcalls, especially when DMA mappings don't get reused.

The purpose of this change is to introduce a mechanism similar to
a pagecache, but for reusing DMA mappings, improving performance and
avoiding multiple DMA mappings of the same cpupage.

This works based on a few current behaviors:
- Page reuse:
It's common for a userspace process to reuse the same page for several
allocations. This is probably caused by page buffering behavior coming
from libc, which is less expensive than getting pages from the kernel.

- A lot of small allocations are used:
Some workloads do a lot of allocations that do not fill a whole page,
causing multiple DMA mappings for a single page.

How it works:
- When a DMA mapping is required for a given allocation, the DMA pagecache
  is first searched for a matching mapping. When found, its refcount is
  incremented; when not found, a new mapping is created.
- Every time a new mapping is created, it's added to the DMA pagecache,
  with refcount = 1;
- When the mapping is not needed anymore (iommu_free()), the entry is
  looked up in the DMA pagecache and its refcount is decremented.
- When there are more than IOMMU_MAP_LIST_MAX entries in the
  DMA pagecache, the oldest ones are removed.

What is inside:
- 1 XArray which indexes the DMA page addresses, used for removing
  mappings and decreasing refcounts.
- 1 XArray which indexes CPU page addresses, for finding matching mappings.
  - As there can be multiple mappings (directions) for the same cpupage,
   this one keeps a llist for looking into the entries.
- Every entry (page) in the XArrays points to the mapping struct.

- The mapping struct has:
  - DMA & CPU page addresses, size (pages), direction, and refcount.
  - 1 llist used for multiple mappings at the same cpupage
  - 1 llist used for the FIFO (removing old unused entries)

- The cache struct, added to iommu_table, has:
  - Both XArrays,
  - 2 llists for the entry/removal points of the FIFO,
  - 1 Atomic Cachesize, to manage the FIFO.
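
A rough sketch of the cache-hit path described above (illustrative only, not
the actual code from the diff below; the entry struct and its field names are
made up for the example, and locking against concurrent removal is omitted):

dma_addr_t iommu_dmacache_use(struct iommu_table *tbl, void *page,
			      unsigned int npages,
			      enum dma_data_direction direction)
{
	struct dmacache *cache = &tbl->cache;
	struct cached_mapping *map;	/* hypothetical entry type */
	unsigned long cpupage = (unsigned long)page >> tbl->it_page_shift;

	/* look for a mapping that already covers this cpu page */
	map = xa_load(&cache->cpupages, cpupage);
	if (!map)
		return DMA_MAPPING_ERROR;

	/* reuse only if the cached mapping covers the size and direction */
	if (npages > map->npages || !DMA_DIR_COMPAT(map->direction, direction))
		return DMA_MAPPING_ERROR;

	refcount_inc(&map->refcount);
	return map->dma_addr + ((unsigned long)page & (IOMMU_PAGE_SIZE(tbl) - 1));
}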

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/include/asm/iommu-cache.h |  31 
 arch/powerpc/include/asm/iommu.h   |   4 +
 arch/powerpc/kernel/Makefile   |   2 +-
 arch/powerpc/kernel/iommu-cache.c  | 247 +
 arch/powerpc/kernel/iommu.c|  15 +-
 5 files changed, 293 insertions(+), 6 deletions(-)
 create mode 100644 arch/powerpc/include/asm/iommu-cache.h
 create mode 100644 arch/powerpc/kernel/iommu-cache.c

diff --git a/arch/powerpc/include/asm/iommu-cache.h 
b/arch/powerpc/include/asm/iommu-cache.h
new file mode 100644
index ..ad298a4cd9c9
--- /dev/null
+++ b/arch/powerpc/include/asm/iommu-cache.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _IOMMU_CACHE_H
+#define _IOMMU_CACHE_H
+#ifdef __KERNEL__
+
+#include 
+#include 
+#include 
+
+struct dmacache {
+   struct llist_head fifo_add;
+   struct llist_head fifo_del;
+   struct xarray cpupages;
+   struct xarray dmapages;
+   atomic64_t cachesize;
+};
+
+#include 
+
+void iommu_cache_init(struct iommu_table *tbl);
+void iommu_dmacache_add(struct iommu_table *tbl, void *page, unsigned int npages, dma_addr_t addr,
+   enum dma_data_direction direction);
+dma_addr_t iommu_dmacache_use(struct iommu_table *tbl, void *page, unsigned int npages,
+ enum dma_data_direction direction);
+void iommu_dmacache_free(struct iommu_table *tbl, dma_addr_t dma_handle, unsigned int npages);
+
+#define IOMMU_MAP_LIST_MAX 8192
+#define IOMMU_MAP_LIST_THRES   128
+
+#endif /* __KERNEL__ */
+#endif /* _IOMMU_CACHE_H */
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 2913e5c8b1f8..51a2f5503f8e 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define IOMMU_PAGE_SHIFT_4K  12
 #define IOMMU_PAGE_SIZE_4K   (ASM_CONST(1) << IOMMU_PAGE_SHIFT_4K)
@@ -114,6 +115,7 @@ struct iommu_table {
int it_nid;
unsigned long it_reserved_start; /* Start of not-DMA-able (MMIO) area */
unsigned long it_reserved_end;
+   struct dmacache cache;
 };
 
 #define IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry) \
@@ -317,6 +319,8 @@ extern void iommu_release_ownership(struct iommu_table 
*tbl);
 extern enum dma_data_direction iommu_tce_direction(unsigned long tce);
 extern unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir);
 
+void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned int 
npages);
+
 #ifdef CONFIG_PPC_CELL_NATIVE
 extern bool iommu_fixed_is_weak;
 #else
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index cbf41fb4ee89..62f6c9007

Re: [PATCH v1 06/10] powerpc/pseries/iommu: Add ddw_list_add() helper

2020-08-28 Thread Leonardo Bras
On Fri, 2020-08-28 at 11:58 +1000, Alexey Kardashevskiy wrote:
> 
> On 28/08/2020 08:11, Leonardo Bras wrote:
> > On Mon, 2020-08-24 at 13:46 +1000, Alexey Kardashevskiy wrote:
> > > >  static int find_existing_ddw_windows(void)
> > > >  {
> > > > int len;
> > > > @@ -887,18 +905,11 @@ static int find_existing_ddw_windows(void)
> > > > if (!direct64)
> > > > continue;
> > > >  
> > > > -   window = kzalloc(sizeof(*window), GFP_KERNEL);
> > > > -   if (!window || len < sizeof(struct 
> > > > dynamic_dma_window_prop)) {
> > > > +   window = ddw_list_add(pdn, direct64);
> > > > +   if (!window || len < sizeof(*direct64)) {
> > > 
> > > Since you are touching this code, it looks like the "len <
> > > sizeof(*direct64)" part should go above to "if (!direct64)".
> > 
> > Sure, makes sense.
> > It will be fixed for v2.
> > 
> > > 
> > > 
> > > > kfree(window);
> > > > remove_ddw(pdn, true);
> > > > -   continue;
> > > > }
> > > > -
> > > > -   window->device = pdn;
> > > > -   window->prop = direct64;
> > > > -   spin_lock(&direct_window_list_lock);
> > > > -   list_add(&window->list, &direct_window_list);
> > > > -   spin_unlock(&direct_window_list_lock);
> > > > }
> > > >  
> > > > return 0;
> > > > @@ -1261,7 +1272,8 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
> > > > device_node *pdn)
> > > > dev_dbg(>dev, "created tce table LIOBN 0x%x for %pOF\n",
> > > >   create.liobn, dn);
> > > >  
> > > > -   window = kzalloc(sizeof(*window), GFP_KERNEL);
> > > > +   /* Add new window to existing DDW list */
> > > 
> > > The comment seems to duplicate what the ddw_list_add name already 
> > > suggests.
> > 
> > Ok, I will remove it then.
> > 
> > > > +   window = ddw_list_add(pdn, ddwprop);
> > > > if (!window)
> > > > goto out_clear_window;
> > > >  
> > > > @@ -1280,16 +1292,14 @@ static u64 enable_ddw(struct pci_dev *dev, 
> > > > struct device_node *pdn)
> > > > goto out_free_window;
> > > > }
> > > >  
> > > > -   window->device = pdn;
> > > > -   window->prop = ddwprop;
> > > > -   spin_lock(&direct_window_list_lock);
> > > > -   list_add(&window->list, &direct_window_list);
> > > > -   spin_unlock(&direct_window_list_lock);
> > > 
> > > I'd leave these 3 lines here and in find_existing_ddw_windows() (which
> > > would make  ddw_list_add -> ddw_prop_alloc). In general you want to have
> > > less stuff to do on the failure path. kmalloc may fail and needs kfree
> > > but you can safely delay list_add (which cannot fail) and avoid having
> > > the lock help twice in the same function (one of them is hidden inside
> > > ddw_list_add).
> > > Not sure if this change is really needed after all. Thanks,
> > 
> > I understand this leads to better performance in case anything fails.
> > Also, I think list_add happening in the end is less error-prone (in
> > case the list is checked between list_add and a fail).
> 
> Performance was not in my mind at all.
> 
> I noticed you remove from a list with a lock help and it was not there
> before and there is a bunch on labels on the exit path and started
> looking for list_add() and if you do not double remove from the list.
> 
> 
> > But what if we put it at the end?
> > What is the chance of a kzalloc of 4 pointers (struct direct_window)
> > failing after walk_system_ram_range?
> 
> This is not about chances really, it is about readability. If let's say
> kmalloc failed, you just to the error exit label and simply call kfree()
> on that pointer, kfree will do nothing if it is NULL already, simple.
> list_del() does not have this simplicity.
> 
> 
> > Is it not worthy doing that for making enable_ddw() easier to
> > understand?
> 
> This is my goal here :)

Ok, it makes sense to me now. 
I tried creating ddw_list_add() to keep everything related to list-adding
in a single place, instead of splitting it around the other stuff,
but now I understand that the code may look more complex than it was
before, because the failure path grows in size.

For me it was strange creating a list entry and not list_add()ing it
right away, but maybe it's something worth getting used to, as it keeps
the failure path simpler, since list_add() can't fail.

I will try to see if the ddw_list_add() routine would become a useful
ddw_list_entry(), but if not, I will remove this patch.

Alexey, Thank you for reviewing this series!
Best regards,

Leonardo



Re: [PATCH v1 02/10] powerpc/kernel/iommu: Align size for IOMMU_PAGE_SIZE on iommu_*_coherent()

2020-08-28 Thread Leonardo Bras
On Fri, 2020-08-28 at 11:40 +1000, Alexey Kardashevskiy wrote:
> > I think it would be better to keep the code as much generic as possible
> > regarding page sizes. 
> 
> Then you need to test it. Does 4K guest even boot (it should but I would
> not bet much on it)?

Maybe testing with host 64k pagesize and IOMMU 16MB pagesize in qemu
should be enough. Is there any chance to get indirect mapping in qemu
like this? (DDW but with a smaller DMA window available)

> > > Because if we want the former (==support), then we'll have to align the
> > > size up to the bigger page size when allocating/zeroing system pages,
> > > etc. 
> > 
> > This part I don't understand. Why do we need to align everything to the
> > bigger pagesize? 
> > 
> > I mean, is not that enough that the range [ret, ret + size[ is both
> > allocated by mm and mapped on a iommu range?
> > 
> > Suppose a iommu_alloc_coherent() of 16kB on PAGESIZE = 4k and
> > IOMMU_PAGE_SIZE() == 64k.
> > Why 4 * cpu_pages mapped by a 64k IOMMU page is not enough? 
> > All the space the user asked for is allocated and mapped for DMA.
> 
> The user asked to map 16K, the rest - 48K - is used for something else
> (may be even mapped to another device) but you are making all 64K
> accessible by the device which only should be able to access 16K.
> 
> In practice, if this happens, H_PUT_TCE will simply fail.

I have noticed the mlx5 driver getting a few bytes in a buffer and using
iommu_map_page(). It does map a whole page for however few bytes the user
wants mapped, and the other bytes get used for something else, or just
mapped on another DMA page.
It seems to work fine.

> 
> 
> > > Bigger pages are not the case here as I understand it.
> > 
> > I did not get this part, what do you mean?
> 
> Possible IOMMU page sizes are 4K, 64K, 2M, 16M, 256M, 1GB, and the
> supported set of sizes is different for P8/P9 and type of IO (PHB,
> NVLink/CAPI).
> 
> 
> > > > Update those functions to guarantee alignment with requested size
> > > > using IOMMU_PAGE_ALIGN() before doing iommu_alloc() / iommu_free().
> > > > 
> > > > Also, on iommu_range_alloc(), replace ALIGN(n, 1 << tbl->it_page_shift)
> > > > with IOMMU_PAGE_ALIGN(n, tbl), which seems easier to read.
> > > > 
> > > > Signed-off-by: Leonardo Bras 
> > > > ---
> > > >  arch/powerpc/kernel/iommu.c | 17 +
> > > >  1 file changed, 9 insertions(+), 8 deletions(-)
> > > > 
> > > > diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> > > > index 9704f3f76e63..d7086087830f 100644
> > > > --- a/arch/powerpc/kernel/iommu.c
> > > > +++ b/arch/powerpc/kernel/iommu.c
> > > > @@ -237,10 +237,9 @@ static unsigned long iommu_range_alloc(struct 
> > > > device *dev,
> > > > }
> > > >  
> > > > if (dev)
> > > > -   boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
> > > > - 1 << tbl->it_page_shift);
> > > > +   boundary_size = 
> > > > IOMMU_PAGE_ALIGN(dma_get_seg_boundary(dev) + 1, tbl);
> > > 
> > > Run checkpatch.pl, should complain about a long line.
> > 
> > It's 86 columns long, which is less than the new limit of 100 columns
> > Linus announced a few weeks ago. checkpatch.pl was updated too:
> > https://www.phoronix.com/scan.php?page=news_item=Linux-Kernel-Deprecates-80-Col
> 
> Yay finally :) Thanks,

:)

> 
> 
> > > 
> > > > else
> > > > -   boundary_size = ALIGN(1UL << 32, 1 << 
> > > > tbl->it_page_shift);
> > > > +   boundary_size = IOMMU_PAGE_ALIGN(1UL << 32, tbl);
> > > > /* 4GB boundary for iseries_hv_alloc and iseries_hv_map */
> > > >  
> > > > n = iommu_area_alloc(tbl->it_map, limit, start, npages, 
> > > > tbl->it_offset,
> > > > @@ -858,6 +857,7 @@ void *iommu_alloc_coherent(struct device *dev, 
> > > > struct iommu_table *tbl,
> > > > unsigned int order;
> > > > unsigned int nio_pages, io_order;
> > > > struct page *page;
> > > > +   size_t size_io = size;
> > > >  
> > > > size = PAGE_ALIGN(size);
> > > > order = get_order(size);
> > > > @@ -884,8 +884,9 @@ void *iommu_alloc_coherent(struct device *dev, 
> > > > struct iommu_t

Re: [PATCH v1 01/10] powerpc/pseries/iommu: Replace hard-coded page shift

2020-08-28 Thread Leonardo Bras
On Fri, 2020-08-28 at 12:27 +1000, Alexey Kardashevskiy wrote:
> 
> On 28/08/2020 01:32, Leonardo Bras wrote:
> > Hello Alexey, thank you for this feedback!
> > 
> > On Sat, 2020-08-22 at 19:33 +1000, Alexey Kardashevskiy wrote:
> > > > +#define TCE_RPN_BITS   52  /* Bits 0-51 represent 
> > > > RPN on TCE */
> > > 
> > > Ditch this one and use MAX_PHYSMEM_BITS instead? I am pretty sure this
> > > is the actual limit.
> > 
> > I understand this MAX_PHYSMEM_BITS(51) comes from the maximum physical 
> > memory addressable in the machine. IIUC, it means we can access physical 
> > address up to (1ul << MAX_PHYSMEM_BITS). 
> > 
> > This 52 comes from PAPR "Table 9. TCE Definition" which defines bits
> > 0-51 as the RPN. By looking at code, I understand that it means we may 
> > input any address < (1ul << 52) to TCE.
> > 
> > In practice, MAX_PHYSMEM_BITS should be enough as of today, because I 
> > suppose we can't ever pass a physical page address over 
> > (1ul << 51), and TCE accepts up to (1ul << 52).
> > But if we ever increase MAX_PHYSMEM_BITS, it doesn't necessarily means that 
> > TCE_RPN_BITS will also be increased, so I think they are independent 
> > values. 
> > 
> > Does it make sense? Please let me know if I am missing something.
> 
> The underlying hardware is PHB3/4 about which the IODA2 Version 2.4
> 6Apr2012.pdf spec says:
> 
> "The number of most significant RPN bits implemented in the TCE is
> dependent on the max size of System Memory to be supported by the platform".
> 
> IODA3 is the same on this matter.
> 
> This is MAX_PHYSMEM_BITS and PHB itself does not have any other limits
> on top of that. So the only real limit comes from MAX_PHYSMEM_BITS and
> where TCE_RPN_BITS comes from exactly - I have no idea.

Well, I created this TCE_RPN_BITS = 52 because the previous mask was a
hardcoded 40-bit mask (0xfffffffffful) for a hard-coded 12-bit (4k)
pagesize, and PAPR+/LoPAR also defines the TCE as having bits 0-51
described as the RPN, as mentioned before.

IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5,
show system memory mapping into a TCE, and the TCE also has bits 0-51
for the RPN (52 bits). "Table 3.6. TCE Definition" also shows it.

In fact, by the looks of those figures, the RPN_MASK should always be a
52-bit mask, and RPN = (page >> tceshift) & RPN_MASK.

Maybe that's it?

> 
> 
> > > 
> > > > +#define TCE_RPN_MASK(ps)   ((1ul << (TCE_RPN_BITS - (ps))) - 1)
> > > >  #define TCE_VALID  0x800   /* TCE valid */
> > > >  #define TCE_ALLIO  0x400   /* TCE valid for all 
> > > > lpars */
> > > >  #define TCE_PCI_WRITE  0x2 /* write from PCI 
> > > > allowed */
> > > > diff --git a/arch/powerpc/platforms/pseries/iommu.c 
> > > > b/arch/powerpc/platforms/pseries/iommu.c
> > > > index e4198700ed1a..8fe23b7dff3a 100644
> > > > --- a/arch/powerpc/platforms/pseries/iommu.c
> > > > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > > > @@ -107,6 +107,9 @@ static int tce_build_pSeries(struct iommu_table 
> > > > *tbl, long index,
> > > > u64 proto_tce;
> > > > __be64 *tcep;
> > > > u64 rpn;
> > > > +   const unsigned long tceshift = tbl->it_page_shift;
> > > > +   const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl);
> > > > +   const u64 rpn_mask = TCE_RPN_MASK(tceshift);
> > > 
> > > Using IOMMU_PAGE_SIZE macro for the page size and not using
> > > IOMMU_PAGE_MASK for the mask - this incosistency makes my small brain
> > > explode :) I understand the history but man... Oh well, ok.
> > > 
> > 
> > Yeah, it feels kind of weird after two IOMMU related consts. :)
> > But sure IOMMU_PAGE_MASK() would not be useful here :)
> > 
> > And this kind of let me thinking:
> > > > +   rpn = __pa(uaddr) >> tceshift;
> > > > +   *tcep = cpu_to_be64(proto_tce | (rpn & rpn_mask) << 
> > > > tceshift);
> > Why not:
> > rpn_mask =  TCE_RPN_MASK(tceshift) << tceshift;
> 
> A mask for a page number (but not the address!) hurts my brain, masks
> are good against addresses but numbers should already have all bits
> adjusted imho, may be it is just me :-/
> 
> 
> > 
> > rpn = __pa(uaddr) & rpn_mask;
> > *tcep = cpu_to_be64(proto_tce | rpn)
> > 
> > I am usually afraid of changing stuff like this, but I think it's safe.
> > 
> > > Good, otherwise. Thanks,
> > 
> > Thank you for reviewing!
> >  
> > 
> > 



Re: [PATCH v1 09/10] powerpc/pseries/iommu: Make use of DDW even if it does not map the partition

2020-08-28 Thread Leonardo Bras
On Mon, 2020-08-24 at 15:17 +1000, Alexey Kardashevskiy wrote:
> 
> On 18/08/2020 09:40, Leonardo Bras wrote:
> > As of today, if the biggest DDW that can be created can't map the whole
> > partition, it's creation is skipped and the default DMA window
> > "ibm,dma-window" is used instead.
> > 
> > DDW is 16x bigger than the default DMA window,
> 
> 16x only under very specific circumstances which are
> 1. phyp
> 2. sriov
> 3. device class in hmc (or what that priority number is in the lpar config).

Yeah, missing details.

> > having the same amount of
> > pages, but increasing the page size to 64k.
> > Besides larger DMA window,
> 
> "Besides being larger"?

You are right there.

> 
> > it performs better for allocations over 4k,
> 
> Better how?

I was thinking of allocations larger than (512 * 4k), since more than 2
hypercalls are needed there, while for 64k pages it would still be just 1
hypercall up to (512 * 64k).
But yeah, not the usual case anyway.
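
(Rough numbers, assuming each H_PUT_TCE_INDIRECT call can populate at most
512 TCEs, i.e. one 4K page of 8-byte entries, which is my assumption here:

  512 *  4k =  2MB of DMA space mapped per hypercall with 4k IOMMU pages
  512 * 64k = 32MB of DMA space mapped per hypercall with 64k IOMMU pages)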

> 
> > so it would be nice to use it instead.
> 
> I'd rather say something like:
> ===
> So far we assumed we can map the guest RAM 1:1 to the bus which worked
> with a small number of devices. SRIOV changes it as the user can
> configure hundreds VFs and since phyp preallocates TCEs and does not
> allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
> per a PE to limit waste of physical pages.
> ===

I mixed this in my commit message, it looks like this:

===
powerpc/pseries/iommu: Make use of DDW for indirect mapping

So far it's assumed possible to map the guest RAM 1:1 to the bus, which
works with a small number of devices. SRIOV changes it as the user can
configure hundreds VFs and since phyp preallocates TCEs and does not
allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
per a PE to limit waste of physical pages.

As of today, if the assumed direct mapping is not possible, DDW
creation is skipped and the default DMA window "ibm,dma-window" is used
instead.

The default DMA window uses 4k pages instead of 64k pages, and since
the amount of pages is the same, making use of DDW instead of the
default DMA window for indirect mapping will expand by 16x the amount
of memory that can be mapped for DMA.

The DDW created will be used for direct mapping by default. [...]
===

What do you think?

> > The DDW created will be used for direct mapping by default.
> > If it's not available, indirect mapping will be used instead.
> > 
> > For indirect mapping, it's necessary to update the iommu_table so
> > iommu_alloc() can use the DDW created. For this,
> > iommu_table_update_window() is called when everything else succeeds
> > at enable_ddw().
> > 
> > Removing the default DMA window for using DDW with indirect mapping
> > is only allowed if there is no current IOMMU memory allocated in
> > the iommu_table. enable_ddw() is aborted otherwise.
> > 
> > As there will never have both direct and indirect mappings at the same
> > time, the same property name can be used for the created DDW.
> > 
> > So renaming
> > define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
> > to
> > define DMA64_PROPNAME "linux,dma64-ddr-window-info"
> > looks the right thing to do.
> 
> I know I suggested this but this does not look so good anymore as I
> suspect it breaks kexec (from older kernel to this one) so you either
> need to check for both DT names or just keep the old one. Changing the
> macro name is fine.
> 

Yeah, having 'direct' in the name doesn't really make sense if it's used
for indirect mapping. I will just add the new define instead of
replacing the old one, and check for both.
Is that ok?
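
A hypothetical helper, just to make the idea concrete (names and final form
may differ): look the window property up under either name, so windows created
by an older kernel (and left behind across kexec) under the 'direct64' name are
still found.

static const struct dynamic_dma_window_prop *ddw_find_prop(struct device_node *pdn,
							    int *len)
{
	const struct dynamic_dma_window_prop *prop;

	prop = of_get_property(pdn, DMA64_PROPNAME, len);
	if (!prop)
		prop = of_get_property(pdn, DIRECT64_PROPNAME, len);

	return prop;
}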

> 
> > To make sure the property differentiates both cases, a new u32 for flags
> > was added at the end of the property, where BIT(0) set means direct
> > mapping.
> > 
> > Signed-off-by: Leonardo Bras 
> > ---
> >  arch/powerpc/platforms/pseries/iommu.c | 108 +++--
> >  1 file changed, 84 insertions(+), 24 deletions(-)
> > 
> > diff --git a/arch/powerpc/platforms/pseries/iommu.c 
> > b/arch/powerpc/platforms/pseries/iommu.c
> > index 3a1ef02ad9d5..9544e3c91ced 100644
> > --- a/arch/powerpc/platforms/pseries/iommu.c
> > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > @@ -350,8 +350,11 @@ struct dynamic_dma_window_prop {
> > __be64  dma_base;   /* address hi,lo */
> > __be32  tce_shift;  /* ilog2(tce_page_size) */
> > __be32  window_shift;   /* ilog2(tce_window_size) */
> > +   __be32  flags;  /* DDW properties, see bellow */
> >  };
> >  
> > +#define DDW_FL

Re: [PATCH v1 08/10] powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()

2020-08-28 Thread Leonardo Bras
On Mon, 2020-08-24 at 15:07 +1000, Alexey Kardashevskiy wrote:
> 
> On 18/08/2020 09:40, Leonardo Bras wrote:
> > Code used to create a ddw property that was previously scattered in
> > enable_ddw() is now gathered in ddw_property_create(), which deals with
> > allocation and filling the property, letting it ready for
> > of_property_add(), which now occurs in sequence.
> > 
> > This created an opportunity to reorganize the second part of enable_ddw():
> > 
> > Without this patch enable_ddw() does, in order:
> > kzalloc() property & members, create_ddw(), fill ddwprop inside property,
> > ddw_list_add(), do tce_setrange_multi_pSeriesLP_walk in all memory,
> > of_add_property().
> > 
> > With this patch enable_ddw() does, in order:
> > create_ddw(), ddw_property_create(), of_add_property(), ddw_list_add(),
> > do tce_setrange_multi_pSeriesLP_walk in all memory.
> > 
> > This change requires of_remove_property() in case anything fails after
> > of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk
> > in all memory, which looks the most expensive operation, only if
> > everything else succeeds.
> > 
> > Signed-off-by: Leonardo Bras 
> > ---
> >  arch/powerpc/platforms/pseries/iommu.c | 97 +++---
> >  1 file changed, 57 insertions(+), 40 deletions(-)
> > 
> > diff --git a/arch/powerpc/platforms/pseries/iommu.c 
> > b/arch/powerpc/platforms/pseries/iommu.c
> > index 4031127c9537..3a1ef02ad9d5 100644
> > --- a/arch/powerpc/platforms/pseries/iommu.c
> > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > @@ -1123,6 +1123,31 @@ static void reset_dma_window(struct pci_dev *dev, 
> > struct device_node *par_dn)
> >  ret);
> >  }
> >  
> > +static int ddw_property_create(struct property **ddw_win, const char 
> > *propname,
> 
> @propname is always the same, do you really want to pass it every time?

I think it reads better, like "create a ddw property with this name".
Also, it makes it possible to create ddw properties with other names, in
case we decide to create properties with different names depending on
the window created.

Also, it's probably optimized / inlined at this point.
Is it ok doing it like this?

> 
> > +  u32 liobn, u64 dma_addr, u32 page_shift, u32 
> > window_shift)
> > +{
> > +   struct dynamic_dma_window_prop *ddwprop;
> > +   struct property *win64;
> > +
> > +   *ddw_win = win64 = kzalloc(sizeof(*win64), GFP_KERNEL);
> > +   if (!win64)
> > +   return -ENOMEM;
> > +
> > +   win64->name = kstrdup(propname, GFP_KERNEL);
> 
> Not clear why "win64->name = DIRECT64_PROPNAME" would not work here, the
> generic OF code does not try kfree() it but it is probably out of scope
> here.

Yeah, I had that question too. 
The previous code was like that, and I was trying not to mess too much
with how it's done.

> > +   ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL);
> > +   win64->value = ddwprop;
> > +   win64->length = sizeof(*ddwprop);
> > +   if (!win64->name || !win64->value)
> > +   return -ENOMEM;
> 
> Up to 2 memory leaks here. I see the cleanup at "out_free_prop:" but
> still looks fragile. Instead you could simply return win64 as the only
> error possible here is -ENOMEM and returning NULL is equally good.

I agree. It's better if this function has its own cleanup routine.
It will be fixed in the next version.
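
Something like this sketch (illustrative only; the final form may differ), so
the function frees everything it allocated and the caller only sees a valid
property or NULL:

static struct property *ddw_property_create(const char *propname, u32 liobn,
					    u64 dma_addr, u32 page_shift,
					    u32 window_shift)
{
	struct dynamic_dma_window_prop *ddwprop;
	struct property *win64;

	win64 = kzalloc(sizeof(*win64), GFP_KERNEL);
	if (!win64)
		return NULL;

	win64->name = kstrdup(propname, GFP_KERNEL);
	ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL);
	win64->value = ddwprop;
	win64->length = sizeof(*ddwprop);
	if (!win64->name || !win64->value) {
		/* free whatever was allocated; kfree(NULL) is a no-op */
		kfree(win64->name);
		kfree(win64->value);
		kfree(win64);
		return NULL;
	}

	ddwprop->liobn = cpu_to_be32(liobn);
	ddwprop->dma_base = cpu_to_be64(dma_addr);
	ddwprop->tce_shift = cpu_to_be32(page_shift);
	ddwprop->window_shift = cpu_to_be32(window_shift);

	return win64;
}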

> 
> 
> > +
> > +   ddwprop->liobn = cpu_to_be32(liobn);
> > +   ddwprop->dma_base = cpu_to_be64(dma_addr);
> > +   ddwprop->tce_shift = cpu_to_be32(page_shift);
> > +   ddwprop->window_shift = cpu_to_be32(window_shift);
> > +
> > +   return 0;
> > +}
> > +
> >  /*
> >   * If the PE supports dynamic dma windows, and there is space for a table
> >   * that can map all pages in a linear offset, then setup such a table,
> > @@ -1140,12 +1165,11 @@ static bool enable_ddw(struct pci_dev *dev, struct 
> > device_node *pdn)
> > struct ddw_query_response query;
> > struct ddw_create_response create;
> > int page_shift;
> > -   u64 max_addr;
> > +   u64 max_addr, win_addr;
> > struct device_node *dn;
> > u32 ddw_avail[DDW_APPLICABLE_SIZE];
> > struct direct_window *window;
> > -   struct property *win64;
> > -   struct dynamic_dma_window_prop *ddwprop;
> > +   struct property *win64 = NULL;
> > struct failed_ddw_pdn *fpdn;
> > bool default_win_removed = false;
> >  
> > @@ -1244,3

Re: [PATCH v1 07/10] powerpc/pseries/iommu: Allow DDW windows starting at 0x00

2020-08-28 Thread Leonardo Bras
On Mon, 2020-08-24 at 13:44 +1000, Alexey Kardashevskiy wrote:
> 
> > On 18/08/2020 09:40, Leonardo Bras wrote:
> > enable_ddw() currently returns the address of the DMA window, which is
> > considered invalid if has the value 0x00.
> > 
> > Also, it only considers valid an address returned from find_existing_ddw
> > if it's not 0x00.
> > 
> > Changing this behavior makes sense, given the users of enable_ddw() only
> > need to know if direct mapping is possible. It can also allow a DMA window
> > starting at 0x00 to be used.
> > 
> > This will be helpful for using a DDW with indirect mapping, as the window
> > address will be different than 0x00, but it will not map the whole
> > partition.
> > 
> > Signed-off-by: Leonardo Bras 
> > ---
> >  arch/powerpc/platforms/pseries/iommu.c | 30 --
> >  1 file changed, 14 insertions(+), 16 deletions(-)
> > 
> > diff --git a/arch/powerpc/platforms/pseries/iommu.c 
> > b/arch/powerpc/platforms/pseries/iommu.c
> > index fcdefcc0f365..4031127c9537 100644
> > --- a/arch/powerpc/platforms/pseries/iommu.c
> > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > @@ -852,24 +852,25 @@ static void remove_ddw(struct device_node *np, bool 
> > remove_prop)
> > np, ret);
> >  }
> > >  
> > -static u64 find_existing_ddw(struct device_node *pdn)
> > +static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr)
> >  {
> > struct direct_window *window;
> > const struct dynamic_dma_window_prop *direct64;
> > -   u64 dma_addr = 0;
> > +   bool found = false;
> >  
> > 	spin_lock(&direct_window_list_lock);
> > /* check if we already created a window and dupe that config if so */
> > 	list_for_each_entry(window, &direct_window_list, list) {
> > if (window->device == pdn) {
> > direct64 = window->prop;
> > -   dma_addr = be64_to_cpu(direct64->dma_base);
> > +   *dma_addr = be64_to_cpu(direct64->dma_base);
> > +   found = true;
> > break;
> > }
> > }
> > 	spin_unlock(&direct_window_list_lock);
> >  
> > -   return dma_addr;
> > +   return found;
> >  }
> >  
> >  static struct direct_window *ddw_list_add(struct device_node *pdn,
> > @@ -1131,15 +1132,15 @@ static void reset_dma_window(struct pci_dev *dev, 
> > struct device_node *par_dn)
> >   * pdn: the parent pe node with the ibm,dma_window property
> >   * Future: also check if we can remap the base window for our base page 
> > size
> >   *
> > - * returns the dma offset for use by the direct mapped DMA code.
> > + * returns true if can map all pages (direct mapping), false otherwise..
> >   */
> > -static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
> > +static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
> >  {
> > int len, ret;
> > struct ddw_query_response query;
> > struct ddw_create_response create;
> > int page_shift;
> > -   u64 dma_addr, max_addr;
> > +   u64 max_addr;
> > struct device_node *dn;
> > u32 ddw_avail[DDW_APPLICABLE_SIZE];
> > struct direct_window *window;
> > @@ -1150,8 +1151,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
> > device_node *pdn)
> >  
> > mutex_lock(_window_init_mutex);
> >  
> > -   dma_addr = find_existing_ddw(pdn);
> > -   if (dma_addr != 0)
> > +   if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset))
> > goto out_unlock;
> >  
> > /*
> > @@ -1292,7 +1292,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
> > device_node *pdn)
> > goto out_free_window;
> > }
> >  
> > -   dma_addr = be64_to_cpu(ddwprop->dma_base);
> > +   dev->dev.archdata.dma_offset = be64_to_cpu(ddwprop->dma_base);
> 
> Do not you need the same chunk in the find_existing_ddw() case above as
> well? Thanks,

The new signature of find_existing_ddw() is 
static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr)

And on enable_ddw(), we call 
find_existing_ddw(pdn, &dev->dev.archdata.dma_offset)

And inside the function we do:
*dma_addr = be64_to_cpu(direct64->dma_base);

I think it's the same as the chunk before.
Am I missing something?

> 
> 
> > goto out_unlock;
> >  
> >  out_free_window:
> > @@ -1309,6 +1309,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
> > device_node *pdn)
> > 

Re: [PATCH v1 06/10] powerpc/pseries/iommu: Add ddw_list_add() helper

2020-08-27 Thread Leonardo Bras
On Mon, 2020-08-24 at 13:46 +1000, Alexey Kardashevskiy wrote:
> >  static int find_existing_ddw_windows(void)
> >  {
> > int len;
> > @@ -887,18 +905,11 @@ static int find_existing_ddw_windows(void)
> > if (!direct64)
> > continue;
> >  
> > -   window = kzalloc(sizeof(*window), GFP_KERNEL);
> > -   if (!window || len < sizeof(struct dynamic_dma_window_prop)) {
> > +   window = ddw_list_add(pdn, direct64);
> > +   if (!window || len < sizeof(*direct64)) {
> 
> Since you are touching this code, it looks like the "len <
> sizeof(*direct64)" part should go above to "if (!direct64)".

Sure, makes sense.
It will be fixed for v2.

> 
> 
> 
> > kfree(window);
> > remove_ddw(pdn, true);
> > -   continue;
> > }
> > -
> > -   window->device = pdn;
> > -   window->prop = direct64;
> > -   spin_lock(&direct_window_list_lock);
> > -   list_add(&window->list, &direct_window_list);
> > -   spin_unlock(&direct_window_list_lock);
> > }
> >  
> > return 0;
> > @@ -1261,7 +1272,8 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
> > device_node *pdn)
> > dev_dbg(>dev, "created tce table LIOBN 0x%x for %pOF\n",
> >   create.liobn, dn);
> >  
> > -   window = kzalloc(sizeof(*window), GFP_KERNEL);
> > +   /* Add new window to existing DDW list */
> 
> The comment seems to duplicate what the ddw_list_add name already suggests.

Ok, I will remove it then.

> > +   window = ddw_list_add(pdn, ddwprop);
> > if (!window)
> > goto out_clear_window;
> >  
> > @@ -1280,16 +1292,14 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
> > device_node *pdn)
> > goto out_free_window;
> > }
> >  
> > -   window->device = pdn;
> > -   window->prop = ddwprop;
> > -   spin_lock(&direct_window_list_lock);
> > -   list_add(&window->list, &direct_window_list);
> > -   spin_unlock(&direct_window_list_lock);
> 
> I'd leave these 3 lines here and in find_existing_ddw_windows() (which
> would make  ddw_list_add -> ddw_prop_alloc). In general you want to have
> less stuff to do on the failure path. kmalloc may fail and needs kfree
> but you can safely delay list_add (which cannot fail) and avoid having
> the lock help twice in the same function (one of them is hidden inside
> ddw_list_add).
> Not sure if this change is really needed after all. Thanks,

I understand this leads to better performance in case anything fails.
Also, I think list_add happening at the end is less error-prone (in
case the list is checked between list_add and a failure).

But what if we put it at the end?
What is the chance of a kzalloc of 4 pointers (struct direct_window)
failing after walk_system_ram_range?  

Is it not worth doing that to make enable_ddw() easier to
understand?

Best regards,
Leonardo



Re: [PATCH v1 05/10] powerpc/pseries/iommu: Add iommu_pseries_alloc_table() helper

2020-08-27 Thread Leonardo Bras
On Mon, 2020-08-24 at 10:38 +1000, Alexey Kardashevskiy wrote:
> 
> On 18/08/2020 09:40, Leonardo Bras wrote:
> > Creates a helper to allow allocating a new iommu_table without the need
> > to reallocate the iommu_group.
> > 
> > This will be helpful for replacing the iommu_table for the new DMA window,
> > after we remove the old one with iommu_tce_table_put().
> > 
> > Signed-off-by: Leonardo Bras 
> > ---
> >  arch/powerpc/platforms/pseries/iommu.c | 25 ++---
> >  1 file changed, 14 insertions(+), 11 deletions(-)
> > 
> > diff --git a/arch/powerpc/platforms/pseries/iommu.c 
> > b/arch/powerpc/platforms/pseries/iommu.c
> > index 8fe23b7dff3a..39617ce0ec83 100644
> > --- a/arch/powerpc/platforms/pseries/iommu.c
> > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > @@ -53,28 +53,31 @@ enum {
> > DDW_EXT_QUERY_OUT_SIZE = 2
> >  };
> >  
> > -static struct iommu_table_group *iommu_pseries_alloc_group(int node)
> > +static struct iommu_table *iommu_pseries_alloc_table(int node)
> >  {
> > -   struct iommu_table_group *table_group;
> > struct iommu_table *tbl;
> >  
> > -   table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL,
> > -  node);
> > -   if (!table_group)
> > -   return NULL;
> > -
> > tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
> > if (!tbl)
> > -   goto free_group;
> > +   return NULL;
> >  
> > INIT_LIST_HEAD_RCU(>it_group_list);
> > kref_init(>it_kref);
> > +   return tbl;
> > +}
> >  
> > -   table_group->tables[0] = tbl;
> > +static struct iommu_table_group *iommu_pseries_alloc_group(int node)
> > +{
> > +   struct iommu_table_group *table_group;
> > +
> > +   table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node);
> 
> I'd prefer you did not make unrelated changes (sizeof(struct
> iommu_table_group) -> sizeof(*table_group)) so the diff stays shorter
> and easier to follow. You changed  sizeof(struct iommu_table_group) but
> not sizeof(struct iommu_table) and this confused me enough to spend more
> time than this straight forward change deserves.

Sorry, I will keep this in mind for future patches.
Thank you for the tip!

> 
> Not important in this case though so
> 
> Reviewed-by: Alexey Kardashevskiy 

Thank you!




Re: [PATCH v1 04/10] powerpc/kernel/iommu: Add new iommu_table_in_use() helper

2020-08-27 Thread Leonardo Bras
On Sat, 2020-08-22 at 20:34 +1000, Alexey Kardashevskiy wrote:
> > +
> > +   /*ignore reserved bit0*/
> 
> s/ignore reserved bit0/ ignore reserved bit0 /  (add spaces)

Fixed

> > +   if (tbl->it_offset == 0)
> > +   p1_start = 1;
> > +
> > +   /* Check if reserved memory is valid*/
> 
> A missing space here.

Fixed

> 
> > +   if (tbl->it_reserved_start >= tbl->it_offset &&
> > +   tbl->it_reserved_start <= (tbl->it_offset + tbl->it_size) &&
> > +   tbl->it_reserved_end   >= tbl->it_offset &&
> > +   tbl->it_reserved_end   <= (tbl->it_offset + tbl->it_size)) {
> 
> Uff. What if tbl->it_reserved_end is bigger than tbl->it_offset +
> tbl->it_size?
> 
> The reserved area is to preserve MMIO32 so it is for it_offset==0 only
> and the boundaries are checked in the only callsite, and it is unlikely
> to change soon or ever.
> 
> Rather that bothering with fixing that, may be just add (did not test):
> 
> if (WARN_ON((
> (tbl->it_reserved_start || tbl->it_reserved_end) && (it_offset != 0))
> (tbl->it_reserved_start > it_offset && tbl->it_reserved_end < it_offset
> + it_size) && (it_offset == 0)) )
>  return true;
> 
> Or simply always look for it_offset..it_reserved_start and
> it_reserved_end..it_offset+it_size and if there is no reserved area,
> initialize it_reserved_start=it_reserved_end=it_offset so the first
> it_offset..it_reserved_start becomes a no-op.

The problem here is that the values of it_reserved_{start,end} are not
necessarily valid. I mean, in iommu_table_reserve_pages() the values
are stored however they are given (bit reserving is done only if they
are valid).

Having an it_reserved_{start,end} value outside the valid range would
cause find_next_bit() to run over memory outside the bitmap.
Even if those values are < tbl->it_offset, the resulting subtraction
on an unsigned type would become a huge value and also run over memory
outside the bitmap.

But I think you are right. This is not the place to check if the
reserved values are valid. It should just trust them here.
I intend to change iommu_table_reserve_pages() to only store the
parameters in it_reserved_{start,end} if they are within the range,
and store it_offset in both of them if they are not.
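
Something along these lines (just a sketch of the idea, not the final patch;
the real function also marks the reserved bits in tbl->it_map):

static void iommu_table_reserve_pages(struct iommu_table *tbl,
				      unsigned long res_start,
				      unsigned long res_end)
{
	/*
	 * Collapse an invalid range into an empty one at it_offset, so
	 * iommu_table_in_use() can trust it_reserved_{start,end}.
	 */
	if (res_start < tbl->it_offset ||
	    res_end > tbl->it_offset + tbl->it_size ||
	    res_start > res_end)
		res_start = res_end = tbl->it_offset;

	tbl->it_reserved_start = res_start;
	tbl->it_reserved_end = res_end;

	/* ... then set the bits for [res_start, res_end) in tbl->it_map ... */
}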

What do you think?

Thanks for the feedback!
Leonardo Bras





Re: [PATCH v1 03/10] powerpc/kernel/iommu: Use largepool as a last resort when !largealloc

2020-08-27 Thread Leonardo Bras
On Sat, 2020-08-22 at 20:09 +1000, Alexey Kardashevskiy wrote:
> > +   goto again;
> > +
> 
> A nit: unnecessary new line.

I was following the pattern used above. There is a newline after every
"goto again" in this 'if'. 

> Reviewed-by: Alexey Kardashevskiy 

Thank you!




Re: [PATCH v1 02/10] powerpc/kernel/iommu: Align size for IOMMU_PAGE_SIZE on iommu_*_coherent()

2020-08-27 Thread Leonardo Bras
On Sat, 2020-08-22 at 20:07 +1000, Alexey Kardashevskiy wrote:
> 
> On 18/08/2020 09:40, Leonardo Bras wrote:
> > Both iommu_alloc_coherent() and iommu_free_coherent() assume that once
> > size is aligned to PAGE_SIZE it will be aligned to IOMMU_PAGE_SIZE.
> 
> The only case when it is not aligned is when IOMMU_PAGE_SIZE > PAGE_SIZE
> which is unlikely but not impossible, we could configure the kernel for
> 4K system pages and 64K IOMMU pages I suppose. Do we really want to do
> this here, or simply put WARN_ON(tbl->it_page_shift > PAGE_SHIFT)?

I think it would be better to keep the code as generic as possible
regarding page sizes.

> Because if we want the former (==support), then we'll have to align the
> size up to the bigger page size when allocating/zeroing system pages,
> etc. 

This part I don't understand. Why do we need to align everything to the
bigger pagesize?

I mean, isn't it enough that the range [ret, ret + size[ is both
allocated by mm and mapped on an iommu range?

Suppose an iommu_alloc_coherent() of 16kB with PAGE_SIZE = 4k and
IOMMU_PAGE_SIZE() == 64k.
Why are 4 cpu pages mapped by a 64k IOMMU page not enough?
All the space the user asked for is allocated and mapped for DMA.


> Bigger pages are not the case here as I understand it.

I did not get this part, what do you mean?

> > Update those functions to guarantee alignment with requested size
> > using IOMMU_PAGE_ALIGN() before doing iommu_alloc() / iommu_free().
> > 
> > Also, on iommu_range_alloc(), replace ALIGN(n, 1 << tbl->it_page_shift)
> > with IOMMU_PAGE_ALIGN(n, tbl), which seems easier to read.
> > 
> > Signed-off-by: Leonardo Bras 
> > ---
> >  arch/powerpc/kernel/iommu.c | 17 +
> >  1 file changed, 9 insertions(+), 8 deletions(-)
> > 
> > diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> > index 9704f3f76e63..d7086087830f 100644
> > --- a/arch/powerpc/kernel/iommu.c
> > +++ b/arch/powerpc/kernel/iommu.c
> > @@ -237,10 +237,9 @@ static unsigned long iommu_range_alloc(struct device 
> > *dev,
> > }
> >  
> > if (dev)
> > -   boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
> > - 1 << tbl->it_page_shift);
> > +   boundary_size = IOMMU_PAGE_ALIGN(dma_get_seg_boundary(dev) + 1, 
> > tbl);
> 
> Run checkpatch.pl, should complain about a long line.

It's 86 columns long, which is less than the new limit of 100 columns
Linus announced a few weeks ago. checkpatch.pl was updated too:
https://www.phoronix.com/scan.php?page=news_item&px=Linux-Kernel-Deprecates-80-Col


> 
> 
> > else
> > -   boundary_size = ALIGN(1UL << 32, 1 << tbl->it_page_shift);
> > +   boundary_size = IOMMU_PAGE_ALIGN(1UL << 32, tbl);
> > /* 4GB boundary for iseries_hv_alloc and iseries_hv_map */
> >  
> > n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset,
> > @@ -858,6 +857,7 @@ void *iommu_alloc_coherent(struct device *dev, struct 
> > iommu_table *tbl,
> > unsigned int order;
> > unsigned int nio_pages, io_order;
> > struct page *page;
> > +   size_t size_io = size;
> >  
> > size = PAGE_ALIGN(size);
> > order = get_order(size);
> > @@ -884,8 +884,9 @@ void *iommu_alloc_coherent(struct device *dev, struct 
> > iommu_table *tbl,
> > memset(ret, 0, size);
> >  
> > /* Set up tces to cover the allocated range */
> > -   nio_pages = size >> tbl->it_page_shift;
> > -   io_order = get_iommu_order(size, tbl);
> > +   size_io = IOMMU_PAGE_ALIGN(size_io, tbl);
> > +   nio_pages = size_io >> tbl->it_page_shift;
> > +   io_order = get_iommu_order(size_io, tbl);
> > mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
> >   mask >> tbl->it_page_shift, io_order, 0);
> > if (mapping == DMA_MAPPING_ERROR) {
> > @@ -900,11 +901,11 @@ void iommu_free_coherent(struct iommu_table *tbl, 
> > size_t size,
> >  void *vaddr, dma_addr_t dma_handle)
> >  {
> > if (tbl) {
> > -   unsigned int nio_pages;
> > +   size_t size_io = IOMMU_PAGE_ALIGN(size, tbl);
> > +   unsigned int nio_pages = size_io >> tbl->it_page_shift;
> >  
> > -   size = PAGE_ALIGN(size);
> > -   nio_pages = size >> tbl->it_page_shift;
> > iommu_free(tbl, dma_handle, nio_pages);
> > +
> 
> Unrelated new line.

Will be removed. Thanks!

> 
> 
> > size = PAGE_ALIGN(size);
> > free_pages((unsigned long)vaddr, get_order(size));
> > }
> > 



Re: [PATCH v1 01/10] powerpc/pseries/iommu: Replace hard-coded page shift

2020-08-27 Thread Leonardo Bras
Hello Alexey, thank you for this feedback!

On Sat, 2020-08-22 at 19:33 +1000, Alexey Kardashevskiy wrote:
> > +#define TCE_RPN_BITS   52  /* Bits 0-51 represent 
> > RPN on TCE */
> 
> Ditch this one and use MAX_PHYSMEM_BITS instead? I am pretty sure this
> is the actual limit.

I understand this MAX_PHYSMEM_BITS(51) comes from the maximum physical memory 
addressable in the machine. IIUC, it means we can access physical address up to 
(1ul << MAX_PHYSMEM_BITS). 

This 52 comes from PAPR "Table 9. TCE Definition" which defines bits
0-51 as the RPN. By looking at code, I understand that it means we may input 
any address < (1ul << 52) to TCE.

In practice, MAX_PHYSMEM_BITS should be enough as of today, because I suppose 
we can't ever pass a physical page address over 
(1ul << 51), and TCE accepts up to (1ul << 52).
But if we ever increase MAX_PHYSMEM_BITS, it doesn't necessarily mean that 
TCE_RPN_BITS will also be increased, so I think they are independent values. 

Does it make sense? Please let me know if I am missing something.

> 
> 
> > +#define TCE_RPN_MASK(ps)   ((1ul << (TCE_RPN_BITS - (ps))) - 1)
> >  #define TCE_VALID  0x800   /* TCE valid */
> >  #define TCE_ALLIO  0x400   /* TCE valid for all lpars */
> >  #define TCE_PCI_WRITE  0x2 /* write from PCI 
> > allowed */
> > diff --git a/arch/powerpc/platforms/pseries/iommu.c 
> > b/arch/powerpc/platforms/pseries/iommu.c
> > index e4198700ed1a..8fe23b7dff3a 100644
> > --- a/arch/powerpc/platforms/pseries/iommu.c
> > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > @@ -107,6 +107,9 @@ static int tce_build_pSeries(struct iommu_table *tbl, 
> > long index,
> > u64 proto_tce;
> > __be64 *tcep;
> > u64 rpn;
> > +   const unsigned long tceshift = tbl->it_page_shift;
> > +   const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl);
> > +   const u64 rpn_mask = TCE_RPN_MASK(tceshift);
> 
> Using IOMMU_PAGE_SIZE macro for the page size and not using
> IOMMU_PAGE_MASK for the mask - this incosistency makes my small brain
> explode :) I understand the history but man... Oh well, ok.
> 

Yeah, it feels kind of weird after two IOMMU related consts. :)
But sure IOMMU_PAGE_MASK() would not be useful here :)

And this kind of got me thinking:
> > +   rpn = __pa(uaddr) >> tceshift;
> > +   *tcep = cpu_to_be64(proto_tce | (rpn & rpn_mask) << tceshift);
Why not:
rpn_mask =  TCE_RPN_MASK(tceshift) << tceshift;

rpn = __pa(uaddr) & rpn_mask;
*tcep = cpu_to_be64(proto_tce | rpn)

I am usually afraid of changing stuff like this, but I think it's safe.
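
Spelling out why both forms give the same TCE (illustrative fragment in the
tce_build_pSeries() context, not a patch): both drop the low tceshift bits of
the physical address and everything above bit 51, so the RPN field ends up
identical.

/* current form: take the page number, mask it, shift it back up */
rpn = __pa(uaddr) >> tceshift;
*tcep = cpu_to_be64(proto_tce | (rpn & rpn_mask) << tceshift);

/* proposed form: pre-shift the mask and apply it to the address directly */
rpn_mask = TCE_RPN_MASK(tceshift) << tceshift;
rpn = __pa(uaddr) & rpn_mask;
*tcep = cpu_to_be64(proto_tce | rpn);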

> Good, otherwise. Thanks,

Thank you for reviewing!
 




[PATCH v1 08/10] powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()

2020-08-17 Thread Leonardo Bras
The code used to create a ddw property, previously scattered in
enable_ddw(), is now gathered in ddw_property_create(), which deals with
allocating and filling the property, leaving it ready for
of_add_property(), which now occurs in sequence.

This created an opportunity to reorganize the second part of enable_ddw():

Without this patch enable_ddw() does, in order:
kzalloc() property & members, create_ddw(), fill ddwprop inside property,
ddw_list_add(), do tce_setrange_multi_pSeriesLP_walk in all memory,
of_add_property().

With this patch enable_ddw() does, in order:
create_ddw(), ddw_property_create(), of_add_property(), ddw_list_add(),
do tce_setrange_multi_pSeriesLP_walk in all memory.

This change requires of_remove_property() in case anything fails after
of_add_property(), but we only get to do tce_setrange_multi_pSeriesLP_walk
over all memory, which looks like the most expensive operation, if
everything else succeeds.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 97 +++---
 1 file changed, 57 insertions(+), 40 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 4031127c9537..3a1ef02ad9d5 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -1123,6 +1123,31 @@ static void reset_dma_window(struct pci_dev *dev, struct 
device_node *par_dn)
 ret);
 }
 
+static int ddw_property_create(struct property **ddw_win, const char *propname,
+  u32 liobn, u64 dma_addr, u32 page_shift, u32 
window_shift)
+{
+   struct dynamic_dma_window_prop *ddwprop;
+   struct property *win64;
+
+   *ddw_win = win64 = kzalloc(sizeof(*win64), GFP_KERNEL);
+   if (!win64)
+   return -ENOMEM;
+
+   win64->name = kstrdup(propname, GFP_KERNEL);
+   ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL);
+   win64->value = ddwprop;
+   win64->length = sizeof(*ddwprop);
+   if (!win64->name || !win64->value)
+   return -ENOMEM;
+
+   ddwprop->liobn = cpu_to_be32(liobn);
+   ddwprop->dma_base = cpu_to_be64(dma_addr);
+   ddwprop->tce_shift = cpu_to_be32(page_shift);
+   ddwprop->window_shift = cpu_to_be32(window_shift);
+
+   return 0;
+}
+
 /*
  * If the PE supports dynamic dma windows, and there is space for a table
  * that can map all pages in a linear offset, then setup such a table,
@@ -1140,12 +1165,11 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
struct ddw_query_response query;
struct ddw_create_response create;
int page_shift;
-   u64 max_addr;
+   u64 max_addr, win_addr;
struct device_node *dn;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
struct direct_window *window;
-   struct property *win64;
-   struct dynamic_dma_window_prop *ddwprop;
+   struct property *win64 = NULL;
struct failed_ddw_pdn *fpdn;
bool default_win_removed = false;
 
@@ -1244,38 +1268,34 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
goto out_failed;
}
len = order_base_2(max_addr);
-   win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
-   if (!win64) {
-   dev_info(&dev->dev,
-   "couldn't allocate property for 64bit dma window\n");
+
+   ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
+   if (ret != 0)
goto out_failed;
-   }
-   win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
-   win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
-   win64->length = sizeof(*ddwprop);
-   if (!win64->name || !win64->value) {
+
+   dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n",
+   create.liobn, dn);
+
+   win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
+   ret = ddw_property_create(&win64, DIRECT64_PROPNAME, create.liobn, 
win_addr,
+ page_shift, len);
+   if (ret) {
dev_info(&dev->dev,
-   "couldn't allocate property name and value\n");
+"couldn't allocate property, property name, or 
value\n");
goto out_free_prop;
}
 
-   ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
-   if (ret != 0)
+   ret = of_add_property(pdn, win64);
+   if (ret) {
+   dev_err(&dev->dev, "unable to add dma window property for %pOF: 
%d",
+   pdn, ret);
goto out_free_prop;
-
-   ddwprop->liobn = cpu_to_be32(create.liobn);
-   ddwprop->dma_base = cpu_to_be64(((u64)create.addr_hi << 32) |
-   create.addr_lo);
-   ddwprop->tce_shift = cpu_to_be32(page_shift);
-   ddwprop->wi

[PATCH v1 09/10] powerpc/pseries/iommu: Make use of DDW even if it does not map the partition

2020-08-17 Thread Leonardo Bras
As of today, if the biggest DDW that can be created can't map the whole
partition, its creation is skipped and the default DMA window
"ibm,dma-window" is used instead.

DDW is 16x bigger than the default DMA window, having the same number of
pages, but increasing the page size to 64k.
Besides the larger DMA window, it performs better for allocations over 4k,
so it would be nice to use it instead.

The DDW created will be used for direct mapping by default.
If it's not available, indirect mapping will be used instead.

For indirect mapping, it's necessary to update the iommu_table so
iommu_alloc() can use the DDW created. For this,
iommu_table_update_window() is called when everything else succeeds
at enable_ddw().

Removing the default DMA window for using DDW with indirect mapping
is only allowed if there is no current IOMMU memory allocated in
the iommu_table. enable_ddw() is aborted otherwise.

As there will never be both direct and indirect mappings at the same
time, the same property name can be used for the created DDW.

So renaming
define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
to
define DMA64_PROPNAME "linux,dma64-ddr-window-info"
looks like the right thing to do.

To make sure the property differentiates both cases, a new u32 for flags
was added at the end of the property, where BIT(0) set means direct
mapping.
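
For illustration, a consumer of the new property can tell the two cases apart
like this (sketch, not from the patch; the variable names are mine):

const struct dynamic_dma_window_prop *prop = win64->value;
bool direct_mapping = be32_to_cpu(prop->flags) & DDW_FLAGS_DIRECT; /* BIT(0) */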

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 108 +++--
 1 file changed, 84 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 3a1ef02ad9d5..9544e3c91ced 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -350,8 +350,11 @@ struct dynamic_dma_window_prop {
__be64  dma_base;   /* address hi,lo */
__be32  tce_shift;  /* ilog2(tce_page_size) */
__be32  window_shift;   /* ilog2(tce_window_size) */
+   __be32  flags;  /* DDW properties, see bellow */
 };
 
+#define DDW_FLAGS_DIRECT   0x01
+
 struct direct_window {
struct device_node *device;
const struct dynamic_dma_window_prop *prop;
@@ -377,7 +380,7 @@ static LIST_HEAD(direct_window_list);
 static DEFINE_SPINLOCK(direct_window_list_lock);
 /* protects initializing window twice for same device */
 static DEFINE_MUTEX(direct_window_init_mutex);
-#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
+#define DMA64_PROPNAME "linux,dma64-ddr-window-info"
 
 static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
unsigned long num_pfn, const void *arg)
@@ -836,7 +839,7 @@ static void remove_ddw(struct device_node *np, bool 
remove_prop)
if (ret)
return;
 
-   win = of_find_property(np, DIRECT64_PROPNAME, NULL);
+   win = of_find_property(np, DMA64_PROPNAME, NULL);
if (!win)
return;
 
@@ -852,7 +855,7 @@ static void remove_ddw(struct device_node *np, bool 
remove_prop)
np, ret);
 }
 
-static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr)
+static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, bool 
*direct_mapping)
 {
struct direct_window *window;
const struct dynamic_dma_window_prop *direct64;
@@ -864,6 +867,7 @@ static bool find_existing_ddw(struct device_node *pdn, u64 
*dma_addr)
if (window->device == pdn) {
direct64 = window->prop;
*dma_addr = be64_to_cpu(direct64->dma_base);
+   *direct_mapping = be32_to_cpu(direct64->flags) & 
DDW_FLAGS_DIRECT;
found = true;
break;
}
@@ -901,8 +905,8 @@ static int find_existing_ddw_windows(void)
if (!firmware_has_feature(FW_FEATURE_LPAR))
return 0;
 
-   for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
-   direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
+   for_each_node_with_property(pdn, DMA64_PROPNAME) {
+   direct64 = of_get_property(pdn, DMA64_PROPNAME, &len);
if (!direct64)
continue;
 
@@ -1124,7 +1128,8 @@ static void reset_dma_window(struct pci_dev *dev, struct 
device_node *par_dn)
 }
 
 static int ddw_property_create(struct property **ddw_win, const char *propname,
-  u32 liobn, u64 dma_addr, u32 page_shift, u32 
window_shift)
+  u32 liobn, u64 dma_addr, u32 page_shift,
+  u32 window_shift, bool direct_mapping)
 {
struct dynamic_dma_window_prop *ddwprop;
struct property *win64;
@@ -1144,6 +1149,36 @@ static int ddw_property_create(struct property 
**ddw_win, const char *propname,
ddwprop->dma_base = cpu_to_be64(dma_addr);
ddwprop->tce_shif

[PATCH v1 07/10] powerpc/pseries/iommu: Allow DDW windows starting at 0x00

2020-08-17 Thread Leonardo Bras
enable_ddw() currently returns the address of the DMA window, which is
considered invalid if it has the value 0x00.

Also, it only considers an address returned from find_existing_ddw valid
if it's not 0x00.

Changing this behavior makes sense, given the users of enable_ddw() only
need to know if direct mapping is possible. It can also allow a DMA window
starting at 0x00 to be used.

This will be helpful for using a DDW with indirect mapping, as the window
address will be different than 0x00, but it will not map the whole
partition.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 30 --
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index fcdefcc0f365..4031127c9537 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -852,24 +852,25 @@ static void remove_ddw(struct device_node *np, bool 
remove_prop)
np, ret);
 }
 
-static u64 find_existing_ddw(struct device_node *pdn)
+static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr)
 {
struct direct_window *window;
const struct dynamic_dma_window_prop *direct64;
-   u64 dma_addr = 0;
+   bool found = false;
 
	spin_lock(&direct_window_list_lock);
/* check if we already created a window and dupe that config if so */
	list_for_each_entry(window, &direct_window_list, list) {
if (window->device == pdn) {
direct64 = window->prop;
-   dma_addr = be64_to_cpu(direct64->dma_base);
+   *dma_addr = be64_to_cpu(direct64->dma_base);
+   found = true;
break;
}
}
	spin_unlock(&direct_window_list_lock);
 
-   return dma_addr;
+   return found;
 }
 
 static struct direct_window *ddw_list_add(struct device_node *pdn,
@@ -1131,15 +1132,15 @@ static void reset_dma_window(struct pci_dev *dev, 
struct device_node *par_dn)
  * pdn: the parent pe node with the ibm,dma_window property
  * Future: also check if we can remap the base window for our base page size
  *
- * returns the dma offset for use by the direct mapped DMA code.
+ * returns true if can map all pages (direct mapping), false otherwise..
  */
-static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
+static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 {
int len, ret;
struct ddw_query_response query;
struct ddw_create_response create;
int page_shift;
-   u64 dma_addr, max_addr;
+   u64 max_addr;
struct device_node *dn;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
struct direct_window *window;
@@ -1150,8 +1151,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
 
	mutex_lock(&direct_window_init_mutex);
 
-   dma_addr = find_existing_ddw(pdn);
-   if (dma_addr != 0)
+   if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset))
goto out_unlock;
 
/*
@@ -1292,7 +1292,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
goto out_free_window;
}
 
-   dma_addr = be64_to_cpu(ddwprop->dma_base);
+   dev->dev.archdata.dma_offset = be64_to_cpu(ddwprop->dma_base);
goto out_unlock;
 
 out_free_window:
@@ -1309,6 +1309,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
kfree(win64->name);
kfree(win64->value);
kfree(win64);
+   win64 = NULL;
 
 out_failed:
if (default_win_removed)
@@ -1322,7 +1323,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
 
 out_unlock:
	mutex_unlock(&direct_window_init_mutex);
-   return dma_addr;
+   return win64;
 }
 
 static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
@@ -1401,11 +1402,8 @@ static bool iommu_bypass_supported_pSeriesLP(struct 
pci_dev *pdev, u64 dma_mask)
break;
}
 
-   if (pdn && PCI_DN(pdn)) {
-   pdev->dev.archdata.dma_offset = enable_ddw(pdev, pdn);
-   if (pdev->dev.archdata.dma_offset)
-   return true;
-   }
+   if (pdn && PCI_DN(pdn))
+   return enable_ddw(pdev, pdn);
 
return false;
 }
-- 
2.25.4



[PATCH v1 10/10] powerpc/pseries/iommu: Rename "direct window" to "dma window"

2020-08-17 Thread Leonardo Bras
A previous change introduced the usage of DDW as a bigger indirect DMA
mapping when the DDW available size does not map the whole partition.

As most of the code that manipulates direct mappings was reused for
indirect mappings, it's necessary to rename all names and debug/info
messages to reflect that it can be used for both kinds of mapping.

Also, defines DEFAULT_DMA_WIN as "ibm,dma-window" to document that
it's the name of the default DMA window.

Those changes are not supposed to change how the code works in any
way, just adjust naming.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 110 +
 1 file changed, 57 insertions(+), 53 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 9544e3c91ced..c1454f9cd254 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -355,7 +355,7 @@ struct dynamic_dma_window_prop {
 
 #define DDW_FLAGS_DIRECT   0x01
 
-struct direct_window {
+struct dma_win {
struct device_node *device;
const struct dynamic_dma_window_prop *prop;
struct list_head list;
@@ -375,12 +375,13 @@ struct ddw_create_response {
u32 addr_lo;
 };
 
-static LIST_HEAD(direct_window_list);
+static LIST_HEAD(dma_win_list);
 /* prevents races between memory on/offline and window creation */
-static DEFINE_SPINLOCK(direct_window_list_lock);
+static DEFINE_SPINLOCK(dma_win_list_lock);
 /* protects initializing window twice for same device */
-static DEFINE_MUTEX(direct_window_init_mutex);
+static DEFINE_MUTEX(dma_win_init_mutex);
 #define DMA64_PROPNAME "linux,dma64-ddr-window-info"
+#define DEFAULT_DMA_WIN "ibm,dma-window"
 
 static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
unsigned long num_pfn, const void *arg)
@@ -713,15 +714,18 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus 
*bus)
pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
 dn);
 
-   /* Find nearest ibm,dma-window, walking up the device tree */
+   /*
+* Find nearest ibm,dma-window (default DMA window), walking up the
+* device tree
+*/
for (pdn = dn; pdn != NULL; pdn = pdn->parent) {
-   dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+   dma_window = of_get_property(pdn, DEFAULT_DMA_WIN, NULL);
if (dma_window != NULL)
break;
}
 
if (dma_window == NULL) {
-   pr_debug("  no ibm,dma-window property !\n");
+   pr_debug("  no %s property !\n", DEFAULT_DMA_WIN);
return;
}
 
@@ -819,11 +823,11 @@ static void remove_dma_window(struct device_node *np, u32 
*ddw_avail,
 
ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);
if (ret)
-   pr_warn("%pOF: failed to remove direct window: rtas returned "
+   pr_warn("%pOF: failed to remove dma window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
else
-   pr_debug("%pOF: successfully removed direct window: rtas 
returned "
+   pr_debug("%pOF: successfully removed dma window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
 }
@@ -851,36 +855,36 @@ static void remove_ddw(struct device_node *np, bool 
remove_prop)
 
ret = of_remove_property(np, win);
if (ret)
-   pr_warn("%pOF: failed to remove direct window property: %d\n",
+   pr_warn("%pOF: failed to remove dma window property: %d\n",
np, ret);
 }
 
 static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, bool 
*direct_mapping)
 {
-   struct direct_window *window;
-   const struct dynamic_dma_window_prop *direct64;
+   struct dma_win *window;
+   const struct dynamic_dma_window_prop *dma64;
bool found = false;
 
-   spin_lock(&direct_window_list_lock);
+   spin_lock(&dma_win_list_lock);
/* check if we already created a window and dupe that config if so */
-   list_for_each_entry(window, &direct_window_list, list) {
+   list_for_each_entry(window, &dma_win_list, list) {
if (window->device == pdn) {
-   direct64 = window->prop;
-   *dma_addr = be64_to_cpu(direct64->dma_base);
-   *direct_mapping = be32_to_cpu(direct64->flags) & 
DDW_FLAGS_DIRECT;
+   dma64 = window->prop;
+  

[PATCH v1 06/10] powerpc/pseries/iommu: Add ddw_list_add() helper

2020-08-17 Thread Leonardo Bras
There are two functions adding DDW to the direct_window_list in a
similar way, so create a ddw_list_add() to avoid duplication and
simplify those functions.

Also, on enable_ddw(), add list_del() on out_free_window to allow
removing the window from list if any error occurs.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 42 --
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 39617ce0ec83..fcdefcc0f365 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -872,6 +872,24 @@ static u64 find_existing_ddw(struct device_node *pdn)
return dma_addr;
 }
 
+static struct direct_window *ddw_list_add(struct device_node *pdn,
+ const struct dynamic_dma_window_prop 
*dma64)
+{
+   struct direct_window *window;
+
+   window = kzalloc(sizeof(*window), GFP_KERNEL);
+   if (!window)
+   return NULL;
+
+   window->device = pdn;
+   window->prop = dma64;
+   spin_lock(&direct_window_list_lock);
+   list_add(&window->list, &direct_window_list);
+   spin_unlock(&direct_window_list_lock);
+
+   return window;
+}
+
 static int find_existing_ddw_windows(void)
 {
int len;
@@ -887,18 +905,11 @@ static int find_existing_ddw_windows(void)
if (!direct64)
continue;
 
-   window = kzalloc(sizeof(*window), GFP_KERNEL);
-   if (!window || len < sizeof(struct dynamic_dma_window_prop)) {
+   window = ddw_list_add(pdn, direct64);
+   if (!window || len < sizeof(*direct64)) {
kfree(window);
remove_ddw(pdn, true);
-   continue;
}
-
-   window->device = pdn;
-   window->prop = direct64;
-   spin_lock(&direct_window_list_lock);
-   list_add(&window->list, &direct_window_list);
-   spin_unlock(&direct_window_list_lock);
}
 
return 0;
@@ -1261,7 +1272,8 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
dev_dbg(>dev, "created tce table LIOBN 0x%x for %pOF\n",
  create.liobn, dn);
 
-   window = kzalloc(sizeof(*window), GFP_KERNEL);
+   /* Add new window to existing DDW list */
+   window = ddw_list_add(pdn, ddwprop);
if (!window)
goto out_clear_window;
 
@@ -1280,16 +1292,14 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
goto out_free_window;
}
 
-   window->device = pdn;
-   window->prop = ddwprop;
-   spin_lock(&direct_window_list_lock);
-   list_add(&window->list, &direct_window_list);
-   spin_unlock(&direct_window_list_lock);
-
dma_addr = be64_to_cpu(ddwprop->dma_base);
goto out_unlock;
 
 out_free_window:
+   spin_lock(&direct_window_list_lock);
+   list_del(&window->list);
+   spin_unlock(&direct_window_list_lock);
+
kfree(window);
 
 out_clear_window:
-- 
2.25.4



[PATCH v1 05/10] powerpc/pseries/iommu: Add iommu_pseries_alloc_table() helper

2020-08-17 Thread Leonardo Bras
Creates a helper to allow allocating a new iommu_table without the need
to reallocate the iommu_group.

This will be helpful for replacing the iommu_table for the new DMA window,
after we remove the old one with iommu_tce_table_put().
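
The intended use is roughly the following (my sketch, not part of this patch;
'nid' stands for the node the new table should be allocated on):

struct iommu_table *newtbl = iommu_pseries_alloc_table(nid);

if (newtbl) {
	/* drop the old table, keep the already-allocated iommu_group */
	iommu_tce_table_put(table_group->tables[0]);
	table_group->tables[0] = newtbl;
}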

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 25 ++---
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 8fe23b7dff3a..39617ce0ec83 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -53,28 +53,31 @@ enum {
DDW_EXT_QUERY_OUT_SIZE = 2
 };
 
-static struct iommu_table_group *iommu_pseries_alloc_group(int node)
+static struct iommu_table *iommu_pseries_alloc_table(int node)
 {
-   struct iommu_table_group *table_group;
struct iommu_table *tbl;
 
-   table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL,
-  node);
-   if (!table_group)
-   return NULL;
-
tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
if (!tbl)
-   goto free_group;
+   return NULL;
 
	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
	kref_init(&tbl->it_kref);
+   return tbl;
+}
 
-   table_group->tables[0] = tbl;
+static struct iommu_table_group *iommu_pseries_alloc_group(int node)
+{
+   struct iommu_table_group *table_group;
+
+   table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node);
+   if (!table_group)
+   return NULL;
 
-   return table_group;
+   table_group->tables[0] = iommu_pseries_alloc_table(node);
+   if (table_group->tables[0])
+   return table_group;
 
-free_group:
kfree(table_group);
return NULL;
 }
-- 
2.25.4



[PATCH v1 02/10] powerpc/kernel/iommu: Align size for IOMMU_PAGE_SIZE on iommu_*_coherent()

2020-08-17 Thread Leonardo Bras
Both iommu_alloc_coherent() and iommu_free_coherent() assume that once
size is aligned to PAGE_SIZE it will be aligned to IOMMU_PAGE_SIZE.

Update those functions to guarantee alignment with requested size
using IOMMU_PAGE_ALIGN() before doing iommu_alloc() / iommu_free().

Also, on iommu_range_alloc(), replace ALIGN(n, 1 << tbl->it_page_shift)
with IOMMU_PAGE_ALIGN(n, tbl), which seems easier to read.
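
A small worked example of why PAGE_ALIGN() alone is not enough (illustration
only; assumes 4K kernel pages and a 64K IOMMU page size in tbl):

size_t size   = 6000;
size_t cpu_sz = PAGE_ALIGN(size);              /* 8192: two 4K CPU pages    */
size_t io_sz  = IOMMU_PAGE_ALIGN(size, tbl);   /* 65536: one 64K IOMMU page */
unsigned int nio_pages = io_sz >> tbl->it_page_shift;   /* 1 */

With the old code, nio_pages would be 8192 >> 16 == 0, so no TCEs would be
set up for the mapping.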

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/kernel/iommu.c | 17 +
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 9704f3f76e63..d7086087830f 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -237,10 +237,9 @@ static unsigned long iommu_range_alloc(struct device *dev,
}
 
if (dev)
-   boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
- 1 << tbl->it_page_shift);
+   boundary_size = IOMMU_PAGE_ALIGN(dma_get_seg_boundary(dev) + 1, 
tbl);
else
-   boundary_size = ALIGN(1UL << 32, 1 << tbl->it_page_shift);
+   boundary_size = IOMMU_PAGE_ALIGN(1UL << 32, tbl);
/* 4GB boundary for iseries_hv_alloc and iseries_hv_map */
 
n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset,
@@ -858,6 +857,7 @@ void *iommu_alloc_coherent(struct device *dev, struct 
iommu_table *tbl,
unsigned int order;
unsigned int nio_pages, io_order;
struct page *page;
+   size_t size_io = size;
 
size = PAGE_ALIGN(size);
order = get_order(size);
@@ -884,8 +884,9 @@ void *iommu_alloc_coherent(struct device *dev, struct 
iommu_table *tbl,
memset(ret, 0, size);
 
/* Set up tces to cover the allocated range */
-   nio_pages = size >> tbl->it_page_shift;
-   io_order = get_iommu_order(size, tbl);
+   size_io = IOMMU_PAGE_ALIGN(size_io, tbl);
+   nio_pages = size_io >> tbl->it_page_shift;
+   io_order = get_iommu_order(size_io, tbl);
mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
  mask >> tbl->it_page_shift, io_order, 0);
if (mapping == DMA_MAPPING_ERROR) {
@@ -900,11 +901,11 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t 
size,
 void *vaddr, dma_addr_t dma_handle)
 {
if (tbl) {
-   unsigned int nio_pages;
+   size_t size_io = IOMMU_PAGE_ALIGN(size, tbl);
+   unsigned int nio_pages = size_io >> tbl->it_page_shift;
 
-   size = PAGE_ALIGN(size);
-   nio_pages = size >> tbl->it_page_shift;
iommu_free(tbl, dma_handle, nio_pages);
+
size = PAGE_ALIGN(size);
free_pages((unsigned long)vaddr, get_order(size));
}
-- 
2.25.4



[PATCH v1 03/10] powerpc/kernel/iommu: Use largepool as a last resort when !largealloc

2020-08-17 Thread Leonardo Bras
As of today, an iommu_range_alloc() for !largealloc (npages <= 15)
can only use 3/4 of the available pages, since pages in the
largepool are not available for !largealloc allocations.

This could mean some drivers not being able to fully use all the available
pages for the DMA window.

Add pages on largepool as a last resort for !largealloc, making all pages
of the DMA window available.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/kernel/iommu.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index d7086087830f..7f603d4e62d4 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -261,6 +261,15 @@ static unsigned long iommu_range_alloc(struct device *dev,
pass++;
goto again;
 
+   } else if (pass == tbl->nr_pools + 1) {
+   /* Last resort: try largepool */
+   spin_unlock(&pool->lock);
+   pool = &tbl->large_pool;
+   spin_lock(&pool->lock);
+   pool->hint = pool->start;
+   pass++;
+   goto again;
+
} else {
/* Give up */
spin_unlock_irqrestore(&(pool->lock), flags);
-- 
2.25.4



[PATCH v1 00/10] DDW indirect mapping

2020-08-17 Thread Leonardo Bras
This patchset must be applied on top of:
http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=194179=%2A=both

As of today, if the biggest DDW that can be created can't map the whole
partition, its creation is skipped and the default DMA window
"ibm,dma-window" is used instead.

Usually, the available DDW will be 16x bigger than the default DMA window,
as it keeps the same page count and raises the page size from 4k to 64k.
Besides the increased window size, it performs better on allocations
bigger than 4k, so it would be nice to use it instead.

Patch #1 replaces hard-coded 4K page size with a variable containing the
correct page size for the window.

Patch #2 makes sure alignment is correct in iommu_*_coherent().

Patch #3 let small allocations use largepool if there is no more space
left in the other pools, thus allowing the whole DMA window to be used by
smaller allocations.

Patch #4 introduces iommu_table_in_use(), and replaces manual bit-field
checking where it's used. It will be used for aborting enable_ddw() if
there is any current iommu allocation and we are trying single window
indirect mapping.

Patch #5 introduces iommu_pseries_alloc_table() that will be helpful
when indirect mapping needs to replace the iommu_table.

Patch #6 adds helpers for adding and removing DDWs in the list.

Patch #7 refactors enable_ddw() so it returns if direct mapping is
possible, instead of DMA offset. It helps for next patches on
indirect DMA mapping and also allows DMA windows starting at 0x00.

Patch #8 brings a new helper to simplify enable_ddw(), allowing
some reorganization for introducing indirect mapping DDW.

Patch #9:
Instead of destroying the created DDW if it doesn't map the whole
partition, make use of it instead of the default DMA window as it improves
performance. Also, update the iommu_table and re-generate the pools.

Patch #10:
Does some renaming of 'direct window' to 'dma window', given the DDW
created can now be also used in indirect mapping if direct mapping is not
available.

All patches were tested into an LPAR with an Ethernet VF:
4005:01:00.0 Ethernet controller: Mellanox Technologies MT27700 Family
[ConnectX-4 Virtual Function]

Patchset was tested with a 64GB DDW which did not map the whole
partition (128G).

Leonardo Bras (10):
  powerpc/pseries/iommu: Replace hard-coded page shift
  powerpc/kernel/iommu: Align size for IOMMU_PAGE_SIZE on
iommu_*_coherent()
  powerpc/kernel/iommu: Use largepool as a last resort when !largealloc
  powerpc/kernel/iommu: Add new iommu_table_in_use() helper
  powerpc/pseries/iommu: Add iommu_pseries_alloc_table() helper
  powerpc/pseries/iommu: Add ddw_list_add() helper
  powerpc/pseries/iommu: Allow DDW windows starting at 0x00
  powerpc/pseries/iommu: Add ddw_property_create() and refactor
enable_ddw()
  powerpc/pseries/iommu: Make use of DDW even if it does not map the
partition
  powerpc/pseries/iommu: Rename "direct window" to "dma window"

 arch/powerpc/include/asm/iommu.h   |   1 +
 arch/powerpc/include/asm/tce.h |  10 +-
 arch/powerpc/kernel/iommu.c|  88 +++---
 arch/powerpc/platforms/pseries/iommu.c | 394 -
 4 files changed, 305 insertions(+), 188 deletions(-)

-- 
2.25.4



[PATCH v1 04/10] powerpc/kernel/iommu: Add new iommu_table_in_use() helper

2020-08-17 Thread Leonardo Bras
Having a function to check if the iommu table has any allocation helps
deciding if a tbl can be reset for using a new DMA window.

It should be enough to replace all instances of !bitmap_empty(tbl...).

iommu_table_in_use() skips reserved memory, so we don't need to worry about
releasing it before testing. This causes iommu_table_release_pages() to
become unnecessary, given it is only used to remove reserved memory for
testing.
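
Usage then becomes a simple check before replacing a window (sketch,
illustration only):

if (iommu_table_in_use(tbl)) {
	/* live TCEs present - do not reset/replace this table */
	return false;
}
/* empty apart from the reserved region - safe to set up a new window */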

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/include/asm/iommu.h |  1 +
 arch/powerpc/kernel/iommu.c  | 62 ++--
 2 files changed, 37 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 5032f1593299..2913e5c8b1f8 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -154,6 +154,7 @@ extern int iommu_tce_table_put(struct iommu_table *tbl);
  */
 extern struct iommu_table *iommu_init_table(struct iommu_table *tbl,
int nid, unsigned long res_start, unsigned long res_end);
+bool iommu_table_in_use(struct iommu_table *tbl);
 
 #define IOMMU_TABLE_GROUP_MAX_TABLES   2
 
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 7f603d4e62d4..c5d5d36ab65e 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -668,21 +668,6 @@ static void iommu_table_reserve_pages(struct iommu_table 
*tbl,
set_bit(i - tbl->it_offset, tbl->it_map);
 }
 
-static void iommu_table_release_pages(struct iommu_table *tbl)
-{
-   int i;
-
-   /*
-* In case we have reserved the first bit, we should not emit
-* the warning below.
-*/
-   if (tbl->it_offset == 0)
-   clear_bit(0, tbl->it_map);
-
-   for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
-   clear_bit(i - tbl->it_offset, tbl->it_map);
-}
-
 /*
  * Build a iommu_table structure.  This contains a bit map which
  * is used to manage allocation of the tce space.
@@ -743,6 +728,38 @@ struct iommu_table *iommu_init_table(struct iommu_table 
*tbl, int nid,
return tbl;
 }
 
+bool iommu_table_in_use(struct iommu_table *tbl)
+{
+   bool in_use;
+   unsigned long p1_start = 0, p1_end, p2_start, p2_end;
+
+   /*ignore reserved bit0*/
+   if (tbl->it_offset == 0)
+   p1_start = 1;
+
+   /* Check if reserved memory is valid*/
+   if (tbl->it_reserved_start >= tbl->it_offset &&
+   tbl->it_reserved_start <= (tbl->it_offset + tbl->it_size) &&
+   tbl->it_reserved_end   >= tbl->it_offset &&
+   tbl->it_reserved_end   <= (tbl->it_offset + tbl->it_size)) {
+   p1_end = tbl->it_reserved_start - tbl->it_offset;
+   p2_start = tbl->it_reserved_end - tbl->it_offset + 1;
+   p2_end = tbl->it_size;
+   } else {
+   p1_end = tbl->it_size;
+   p2_start = 0;
+   p2_end = 0;
+   }
+
+   in_use = (find_next_bit(tbl->it_map, p1_end, p1_start) != p1_end);
+   if (in_use || p2_start == 0)
+   return in_use;
+
+   in_use = (find_next_bit(tbl->it_map, p2_end, p2_start) != p2_end);
+
+   return in_use;
+}
+
 static void iommu_table_free(struct kref *kref)
 {
unsigned long bitmap_sz;
@@ -759,10 +776,8 @@ static void iommu_table_free(struct kref *kref)
return;
}
 
-   iommu_table_release_pages(tbl);
-
/* verify that table contains no entries */
-   if (!bitmap_empty(tbl->it_map, tbl->it_size))
+   if (iommu_table_in_use(tbl))
pr_warn("%s: Unexpected TCEs\n", __func__);
 
/* calculate bitmap size in bytes */
@@ -1069,18 +1084,13 @@ int iommu_take_ownership(struct iommu_table *tbl)
for (i = 0; i < tbl->nr_pools; i++)
	spin_lock(&tbl->pools[i].lock);
 
-   iommu_table_release_pages(tbl);
-
-   if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
+   if (iommu_table_in_use(tbl)) {
pr_err("iommu_tce: it_map is not empty");
ret = -EBUSY;
-   /* Undo iommu_table_release_pages, i.e. restore bit#0, etc */
-   iommu_table_reserve_pages(tbl, tbl->it_reserved_start,
-   tbl->it_reserved_end);
-   } else {
-   memset(tbl->it_map, 0xff, sz);
}
 
+   memset(tbl->it_map, 0xff, sz);
+
for (i = 0; i < tbl->nr_pools; i++)
	spin_unlock(&tbl->pools[i].lock);
	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
-- 
2.25.4



[PATCH v1 01/10] powerpc/pseries/iommu: Replace hard-coded page shift

2020-08-17 Thread Leonardo Bras
Some functions assume IOMMU page size can only be 4K (pageshift == 12).
Update them to accept any page size passed, so we can use 64K pages.

In the process, some defines like TCE_SHIFT were made obsolete, and then
removed. TCE_RPN_MASK was updated to generate a mask according to
the pageshift used.
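
For concreteness, the new macro evaluates as follows (my own examples;
TCE_RPN_BITS is 52):

TCE_RPN_MASK(12) == (1ul << 40) - 1   /* 4K pages: same 40-bit RPN as before */
TCE_RPN_MASK(16) == (1ul << 36) - 1   /* 64K pages */
TCE_RPN_MASK(24) == (1ul << 28) - 1   /* 16M pages */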

Most places had a tbl struct, so using tbl->it_page_shift was simple.
tce_free_pSeriesLP() was a special case, since callers do not always have a
tbl struct, so adding a tceshift parameter seems like the right thing to do.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/include/asm/tce.h | 10 ++
 arch/powerpc/platforms/pseries/iommu.c | 42 --
 2 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h
index db5fc2f2262d..971cba2d87cc 100644
--- a/arch/powerpc/include/asm/tce.h
+++ b/arch/powerpc/include/asm/tce.h
@@ -19,15 +19,9 @@
 #define TCE_VB 0
 #define TCE_PCI1
 
-/* TCE page size is 4096 bytes (1 << 12) */
-
-#define TCE_SHIFT  12
-#define TCE_PAGE_SIZE  (1 << TCE_SHIFT)
-
 #define TCE_ENTRY_SIZE 8   /* each TCE is 64 bits */
-
-#define TCE_RPN_MASK   0xfful  /* 40-bit RPN (4K pages) */
-#define TCE_RPN_SHIFT  12
+#define TCE_RPN_BITS   52  /* Bits 0-51 represent RPN on 
TCE */
+#define TCE_RPN_MASK(ps)   ((1ul << (TCE_RPN_BITS - (ps))) - 1)
 #define TCE_VALID  0x800   /* TCE valid */
 #define TCE_ALLIO  0x400   /* TCE valid for all lpars */
 #define TCE_PCI_WRITE  0x2 /* write from PCI allowed */
diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index e4198700ed1a..8fe23b7dff3a 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -107,6 +107,9 @@ static int tce_build_pSeries(struct iommu_table *tbl, long 
index,
u64 proto_tce;
__be64 *tcep;
u64 rpn;
+   const unsigned long tceshift = tbl->it_page_shift;
+   const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl);
+   const u64 rpn_mask = TCE_RPN_MASK(tceshift);
 
proto_tce = TCE_PCI_READ; // Read allowed
 
@@ -117,10 +120,10 @@ static int tce_build_pSeries(struct iommu_table *tbl, 
long index,
 
while (npages--) {
/* can't move this out since we might cross MEMBLOCK boundary */
-   rpn = __pa(uaddr) >> TCE_SHIFT;
-   *tcep = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << 
TCE_RPN_SHIFT);
+   rpn = __pa(uaddr) >> tceshift;
+   *tcep = cpu_to_be64(proto_tce | (rpn & rpn_mask) << tceshift);
 
-   uaddr += TCE_PAGE_SIZE;
+   uaddr += pagesize;
tcep++;
}
return 0;
@@ -146,7 +149,7 @@ static unsigned long tce_get_pseries(struct iommu_table 
*tbl, long index)
return be64_to_cpu(*tcep);
 }
 
-static void tce_free_pSeriesLP(unsigned long liobn, long, long);
+static void tce_free_pSeriesLP(unsigned long liobn, long, long, long);
 static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);
 
 static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
@@ -159,6 +162,7 @@ static int tce_build_pSeriesLP(unsigned long liobn, long 
tcenum, long tceshift,
u64 rpn;
int ret = 0;
long tcenum_start = tcenum, npages_start = npages;
+   const u64 rpn_mask = TCE_RPN_MASK(tceshift);
 
rpn = __pa(uaddr) >> tceshift;
proto_tce = TCE_PCI_READ;
@@ -166,12 +170,12 @@ static int tce_build_pSeriesLP(unsigned long liobn, long 
tcenum, long tceshift,
proto_tce |= TCE_PCI_WRITE;
 
while (npages--) {
-   tce = proto_tce | (rpn & TCE_RPN_MASK) << tceshift;
+   tce = proto_tce | (rpn & rpn_mask) << tceshift;
rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce);
 
if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
ret = (int)rc;
-   tce_free_pSeriesLP(liobn, tcenum_start,
+   tce_free_pSeriesLP(liobn, tcenum_start, tceshift,
   (npages_start - (npages + 1)));
break;
}
@@ -205,10 +209,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table 
*tbl, long tcenum,
long tcenum_start = tcenum, npages_start = npages;
int ret = 0;
unsigned long flags;
+   const unsigned long tceshift = tbl->it_page_shift;
+   const u64 rpn_mask = TCE_RPN_MASK(tceshift);
 
if ((npages == 1) || !firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) {
return tce_build_pSeriesLP(tbl->it_index, tcenum,
-  tbl->

Re: [PATCH v5 0/4] Allow bigger 64bit window by removing default DMA window

2020-08-11 Thread Leonardo Bras
Hello Michael,

Do you suggest any change for this patchset?
Any chance it can get in this merge window?

Best regards,
Leonardo Bras

On Wed, 2020-08-05 at 00:04 -0300, Leonardo Bras wrote:
> There are some devices in which a hypervisor may only allow 1 DMA window
> to exist at a time, and in those cases, a DDW is never created to them,
> since the default DMA window keeps using this resource.
> 
> LoPAR recommends this procedure:
> 1. Remove the default DMA window,
> 2. Query for which configs the DDW can be created,
> 3. Create a DDW.
> 
> Patch #1:
> Create defines for outputs of ibm,ddw-applicable, so it's easier to
> identify them.
> 
> Patch #2:
> - After LoPAR level 2.8, there is an extension that can make
>   ibm,query-pe-dma-windows to have 6 outputs instead of 5. This changes the
>   order of the outputs, and that can cause some trouble. 
> - query_ddw() was updated to check how many outputs the 
>   ibm,query-pe-dma-windows is supposed to have, update the rtas_call() and
>   deal correctly with the outputs in both cases.
> - This patch looks somehow unrelated to the series, but it can avoid future
>   problems on DDW creation.
> 
> Patch #3 moves the window-removing code from remove_ddw() to
> remove_dma_window(), creating a way to delete any DMA window, so it can be
> used to delete the default DMA window.
> 
> Patch #4 makes use of the remove_dma_window() from patch #3 to remove the
> default DMA window before query_ddw(). It also implements a new rtas call
> to recover the default DMA window, in case anything fails after it was
> removed, and a DDW couldn't be created.
> 
> ---
> Changes since v4:
> - Removed patches 5+ in order to deal with a feature at a time
> - Remove unnecessary parentesis in patch #4
> - Changed patch #4 title from 
>   "Remove default DMA window before creating DDW"
> - Included David Dai tested-by
> - v4 link: 
> http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=190051=%2A=both
> 
> Changes since v3:
> - Introduces new patch #5, to prepare for an important change in #6
> - struct iommu_table was not being updated, so include a way to do this
>   in patch #6.
> - Improved patch #4 based in a suggestion from Alexey, to make code
>   more easily understandable
> - v3 link: 
> http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=187348=%2A=both
> 
> Changes since v2:
> - Change the way ibm,ddw-extensions is accessed, using a proper function
>   instead of doing this inline everytime it's used.
> - Remove previous patch #6, as it doesn't look like it would be useful.
> - Add new patch, for changing names from direct* to dma*, as indirect 
>   mapping can be used from now on.
> - Fix some typos, corrects some define usage.
> - v2 link: 
> http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=185433=%2A=both
> 
> Changes since v1:
> - Add defines for ibm,ddw-applicable and ibm,ddw-extensions outputs
> - Merge aux function query_ddw_out_sz() into query_ddw()
> - Merge reset_dma_window() patch (prev. #2) into remove default DMA
>   window patch (#4).
> - Keep device_node *np name instead of using pdn in remove_*()
> - Rename 'device_node *pdn' into 'parent' in new functions
> - Rename dfl_win to default_win
> - Only remove the default DMA window if there is no window available
>   in first query.
> - Check if default DMA window can be restored before removing it.
> - Fix 'unitialized use' (found by travis mpe:ci-test)
> - New patches #5 and #6
> - v1 link: 
> http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=184420=%2A=both
> 
> Special thanks for Alexey Kardashevskiy, Brian King and
> Oliver O'Halloran for the feedback provided!
> 
> 
> Leonardo Bras (4):
>   powerpc/pseries/iommu: Create defines for operations in
> ibm,ddw-applicable
>   powerpc/pseries/iommu: Update call to ibm,query-pe-dma-windows
>   powerpc/pseries/iommu: Move window-removing part of remove_ddw into
> remove_dma_window
>   powerpc/pseries/iommu: Allow bigger 64bit window by removing default
> DMA window
> 
>  arch/powerpc/platforms/pseries/iommu.c | 242 -
>  1 file changed, 195 insertions(+), 47 deletions(-)
> 



Re: [PATCH v5 0/4] Allow bigger 64bit window by removing default DMA window

2020-08-05 Thread Leonardo Bras
Travis reported successful compilation with mpe/merge:

https://travis-ci.org/github/LeoBras/linux-ppc/builds/715028857



[PATCH v5 2/4] powerpc/pseries/iommu: Update call to ibm,query-pe-dma-windows

2020-08-04 Thread Leonardo Bras
From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can make the number of
outputs from "ibm,query-pe-dma-windows" go from 5 to 6.

This change of output size is meant to expand the address size of
largest_available_block PE TCE from 32-bit to 64-bit, which ends up
shifting page_size and migration_capable.

This ends up requiring the update of
ddw_query_response->largest_available_block from u32 to u64, and manually
assigning the values from the buffer into this struct, according to
output size.

Also, a routine was created to help read the ddw extensions, as
suggested by LoPAR: first read the size of the extension array from
index 0 to check that the property exists, and then return its value.
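
For example, a caller that only cares whether an extension exists can pass a
NULL value pointer (sketch; this pattern is used later in the series):

/* is the reset-pe-dma-windows extension present? */
if (ddw_read_ext(pdn, DDW_EXT_RESET_DMA_WIN, NULL))
	goto out_failed;	/* extension missing, nothing to restore later */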

Signed-off-by: Leonardo Bras 
Tested-by: David Dai 
---
 arch/powerpc/platforms/pseries/iommu.c | 91 +++---
 1 file changed, 81 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index ac0d6376bdad..1a933c4e8bba 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -47,6 +47,12 @@ enum {
DDW_APPLICABLE_SIZE
 };
 
+enum {
+   DDW_EXT_SIZE = 0,
+   DDW_EXT_RESET_DMA_WIN = 1,
+   DDW_EXT_QUERY_OUT_SIZE = 2
+};
+
 static struct iommu_table_group *iommu_pseries_alloc_group(int node)
 {
struct iommu_table_group *table_group;
@@ -342,7 +348,7 @@ struct direct_window {
 /* Dynamic DMA Window support */
 struct ddw_query_response {
u32 windows_available;
-   u32 largest_available_block;
+   u64 largest_available_block;
u32 page_size;
u32 migration_capable;
 };
@@ -877,14 +883,62 @@ static int find_existing_ddw_windows(void)
 }
 machine_arch_initcall(pseries, find_existing_ddw_windows);
 
+/**
+ * ddw_read_ext - Get the value of an DDW extension
+ * @np:device node from which the extension value is to be 
read.
+ * @extnum:index number of the extension.
+ * @value: pointer to return value, modified when extension is available.
+ *
+ * Checks if "ibm,ddw-extensions" exists for this node, and get the value
+ * on index 'extnum'.
+ * It can be used only to check if a property exists, passing value == NULL.
+ *
+ * Returns:
+ * 0 if extension successfully read
+ * -EINVAL if the "ibm,ddw-extensions" does not exist,
+ * -ENODATA if "ibm,ddw-extensions" does not have a value, and
+ * -EOVERFLOW if "ibm,ddw-extensions" does not contain this extension.
+ */
+static inline int ddw_read_ext(const struct device_node *np, int extnum,
+  u32 *value)
+{
+   static const char propname[] = "ibm,ddw-extensions";
+   u32 count;
+   int ret;
+
+   ret = of_property_read_u32_index(np, propname, DDW_EXT_SIZE, &count);
+   if (ret)
+   return ret;
+
+   if (count < extnum)
+   return -EOVERFLOW;
+
+   if (!value)
+   value = &count;
+
+   return of_property_read_u32_index(np, propname, extnum, value);
+}
+
 static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
-   struct ddw_query_response *query)
+struct ddw_query_response *query,
+struct device_node *parent)
 {
struct device_node *dn;
struct pci_dn *pdn;
-   u32 cfg_addr;
+   u32 cfg_addr, ext_query, query_out[5];
u64 buid;
-   int ret;
+   int ret, out_sz;
+
+   /*
+* From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can rule how many
+* output parameters ibm,query-pe-dma-windows will have, ranging from
+* 5 to 6.
+*/
+   ret = ddw_read_ext(parent, DDW_EXT_QUERY_OUT_SIZE, &ext_query);
+   if (!ret && ext_query == 1)
+   out_sz = 6;
+   else
+   out_sz = 5;
 
/*
 * Get the config address and phb buid of the PE window.
@@ -897,11 +951,28 @@ static int query_ddw(struct pci_dev *dev, const u32 
*ddw_avail,
buid = pdn->phb->buid;
cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
 
-   ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, 5, (u32 *)query,
+   ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, out_sz, query_out,
cfg_addr, BUID_HI(buid), BUID_LO(buid));
-   dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x"
-   " returned %d\n", ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr,
-BUID_HI(buid), BUID_LO(buid), ret);
+   dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x returned 
%d\n",
+ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
+BUID_LO(buid), ret);
+
+   switch (out_sz) {
+   case 5:
+   query->windows_available = query_out[0];
+ 

[PATCH v5 3/4] powerpc/pseries/iommu: Move window-removing part of remove_ddw into remove_dma_window

2020-08-04 Thread Leonardo Bras
Move the window-removing part of remove_ddw into a new function
(remove_dma_window), so it can be used to remove other DMA windows.

It's useful for removing DMA windows that don't create DIRECT64_PROPNAME
property, like the default DMA window from the device, which uses
"ibm,dma-window".

Signed-off-by: Leonardo Bras 
Tested-by: David Dai 
---
 arch/powerpc/platforms/pseries/iommu.c | 45 +++---
 1 file changed, 27 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 1a933c4e8bba..4e33147825cc 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -781,25 +781,14 @@ static int __init disable_ddw_setup(char *str)
 
 early_param("disable_ddw", disable_ddw_setup);
 
-static void remove_ddw(struct device_node *np, bool remove_prop)
+static void remove_dma_window(struct device_node *np, u32 *ddw_avail,
+ struct property *win)
 {
struct dynamic_dma_window_prop *dwp;
-   struct property *win64;
-   u32 ddw_avail[DDW_APPLICABLE_SIZE];
u64 liobn;
-   int ret = 0;
-
-    &ddw_avail[0], DDW_APPLICABLE_SIZE);
-_avail[0], DDW_APPLICABLE_SIZE);
-
-   win64 = of_find_property(np, DIRECT64_PROPNAME, NULL);
-   if (!win64)
-   return;
-
-   if (ret || win64->length < sizeof(*dwp))
-   goto delprop;
+   int ret;
 
-   dwp = win64->value;
+   dwp = win->value;
liobn = (u64)be32_to_cpu(dwp->liobn);
 
/* clear the whole window, note the arg is in kernel pages */
@@ -821,10 +810,30 @@ static void remove_ddw(struct device_node *np, bool 
remove_prop)
pr_debug("%pOF: successfully removed direct window: rtas 
returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
+}
+
+static void remove_ddw(struct device_node *np, bool remove_prop)
+{
+   struct property *win;
+   u32 ddw_avail[DDW_APPLICABLE_SIZE];
+   int ret = 0;
+
+   ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
+    &ddw_avail[0], DDW_APPLICABLE_SIZE);
+   if (ret)
+   return;
+
+   win = of_find_property(np, DIRECT64_PROPNAME, NULL);
+   if (!win)
+   return;
+
+   if (win->length >= sizeof(struct dynamic_dma_window_prop))
+   remove_dma_window(np, ddw_avail, win);
+
+   if (!remove_prop)
+   return;
 
-delprop:
-   if (remove_prop)
-   ret = of_remove_property(np, win64);
+   ret = of_remove_property(np, win);
if (ret)
pr_warn("%pOF: failed to remove direct window property: %d\n",
np, ret);
-- 
2.25.4



[PATCH v5 1/4] powerpc/pseries/iommu: Create defines for operations in ibm,ddw-applicable

2020-08-04 Thread Leonardo Bras
Create defines to help handling ibm,ddw-applicable values, avoiding
confusion about the index of given operations.

Signed-off-by: Leonardo Bras 
Tested-by: David Dai 
---
 arch/powerpc/platforms/pseries/iommu.c | 43 --
 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 6d47b4a3ce39..ac0d6376bdad 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -39,6 +39,14 @@
 
 #include "pseries.h"
 
+enum {
+   DDW_QUERY_PE_DMA_WIN  = 0,
+   DDW_CREATE_PE_DMA_WIN = 1,
+   DDW_REMOVE_PE_DMA_WIN = 2,
+
+   DDW_APPLICABLE_SIZE
+};
+
 static struct iommu_table_group *iommu_pseries_alloc_group(int node)
 {
struct iommu_table_group *table_group;
@@ -771,12 +779,12 @@ static void remove_ddw(struct device_node *np, bool 
remove_prop)
 {
struct dynamic_dma_window_prop *dwp;
struct property *win64;
-   u32 ddw_avail[3];
+   u32 ddw_avail[DDW_APPLICABLE_SIZE];
u64 liobn;
int ret = 0;
 
ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
-    &ddw_avail[0], 3);
+    &ddw_avail[0], DDW_APPLICABLE_SIZE);
 
win64 = of_find_property(np, DIRECT64_PROPNAME, NULL);
if (!win64)
@@ -798,15 +806,15 @@ static void remove_ddw(struct device_node *np, bool 
remove_prop)
pr_debug("%pOF successfully cleared tces in window.\n",
 np);
 
-   ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn);
+   ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);
if (ret)
pr_warn("%pOF: failed to remove direct window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
-   np, ret, ddw_avail[2], liobn);
+   np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
else
pr_debug("%pOF: successfully removed direct window: rtas 
returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
-   np, ret, ddw_avail[2], liobn);
+   np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
 
 delprop:
if (remove_prop)
@@ -889,11 +897,11 @@ static int query_ddw(struct pci_dev *dev, const u32 
*ddw_avail,
buid = pdn->phb->buid;
cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
 
-   ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query,
- cfg_addr, BUID_HI(buid), BUID_LO(buid));
+   ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, 5, (u32 *)query,
+   cfg_addr, BUID_HI(buid), BUID_LO(buid));
	dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x"
-   " returned %d\n", ddw_avail[0], cfg_addr, BUID_HI(buid),
-   BUID_LO(buid), ret);
+   " returned %d\n", ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr,
+BUID_HI(buid), BUID_LO(buid), ret);
return ret;
 }
 
@@ -920,15 +928,16 @@ static int create_ddw(struct pci_dev *dev, const u32 
*ddw_avail,
 
do {
/* extra outputs are LIOBN and dma-addr (hi, lo) */
-   ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create,
-   cfg_addr, BUID_HI(buid), BUID_LO(buid),
-   page_shift, window_shift);
+   ret = rtas_call(ddw_avail[DDW_CREATE_PE_DMA_WIN], 5, 4,
+   (u32 *)create, cfg_addr, BUID_HI(buid),
+   BUID_LO(buid), page_shift, window_shift);
} while (rtas_busy_delay(ret));
	dev_info(&dev->dev,
"ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d "
-   "(liobn = 0x%x starting addr = %x %x)\n", ddw_avail[1],
-cfg_addr, BUID_HI(buid), BUID_LO(buid), page_shift,
-window_shift, ret, create->liobn, create->addr_hi, 
create->addr_lo);
+   "(liobn = 0x%x starting addr = %x %x)\n",
+ddw_avail[DDW_CREATE_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
+BUID_LO(buid), page_shift, window_shift, ret, create->liobn,
+create->addr_hi, create->addr_lo);
 
return ret;
 }
@@ -996,7 +1005,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
int page_shift;
u64 dma_addr, max_addr;
struct device_node *dn;
-   u32 ddw_avail[3];
+   u32 ddw_avail[DDW_APPLICABLE_SIZE];
struct direct_window *window;
struct property *win64;
struct dynamic_dma_window_prop *ddwprop;
@@ -1029,7 +1038,7 @@ static u64 enable_ddw(struct p

[PATCH v5 4/4] powerpc/pseries/iommu: Allow bigger 64bit window by removing default DMA window

2020-08-04 Thread Leonardo Bras
On LoPAR "DMA Window Manipulation Calls", it's recommended to remove the
default DMA window for the device, before attempting to configure a DDW,
in order to make the maximum resources available for the next DDW to be
created.

This is a requirement for using DDW on devices in which the hypervisor
allows only one DMA window.

If setting up a new DDW fails anywhere after the removal of this
default DMA window, it's needed to restore the default DMA window.
For this, an implementation of ibm,reset-pe-dma-windows rtas call is
needed:

Platforms supporting the DDW option starting with LoPAR level 2.7 implement
ibm,ddw-extensions. The first extension available (index 2) carries the
token for ibm,reset-pe-dma-windows rtas call, which is used to restore
the default DMA window for a device, if it has been deleted.

It does so by resetting the TCE table allocation for the PE to its
boot time value, available in the "ibm,dma-window" device tree node.
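
In rough terms, the recovery added to enable_ddw()'s failure path looks like
this (sketch only; see the hunks below for the exact code):

out_failed:
	if (default_win_removed)
		reset_dma_window(dev, pdn);	/* restore the boot-time window */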

Signed-off-by: Leonardo Bras 
Tested-by: David Dai 
---
 arch/powerpc/platforms/pseries/iommu.c | 73 +++---
 1 file changed, 66 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 4e33147825cc..e4198700ed1a 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -1066,6 +1066,38 @@ static phys_addr_t ddw_memory_hotplug_max(void)
return max_addr;
 }
 
+/*
+ * Platforms supporting the DDW option starting with LoPAR level 2.7 implement
+ * ibm,ddw-extensions, which carries the rtas token for
+ * ibm,reset-pe-dma-windows.
+ * That rtas-call can be used to restore the default DMA window for the device.
+ */
+static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
+{
+   int ret;
+   u32 cfg_addr, reset_dma_win;
+   u64 buid;
+   struct device_node *dn;
+   struct pci_dn *pdn;
+
+   ret = ddw_read_ext(par_dn, DDW_EXT_RESET_DMA_WIN, &reset_dma_win);
+   if (ret)
+   return;
+
+   dn = pci_device_to_OF_node(dev);
+   pdn = PCI_DN(dn);
+   buid = pdn->phb->buid;
+   cfg_addr = (pdn->busno << 16) | (pdn->devfn << 8);
+
+   ret = rtas_call(reset_dma_win, 3, 1, NULL, cfg_addr, BUID_HI(buid),
+   BUID_LO(buid));
+   if (ret)
+   dev_info(&dev->dev,
+"ibm,reset-pe-dma-windows(%x) %x %x %x returned %d ",
+reset_dma_win, cfg_addr, BUID_HI(buid), BUID_LO(buid),
+ret);
+}
+
 /*
  * If the PE supports dynamic dma windows, and there is space for a table
  * that can map all pages in a linear offset, then setup such a table,
@@ -1090,6 +1122,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
struct property *win64;
struct dynamic_dma_window_prop *ddwprop;
struct failed_ddw_pdn *fpdn;
+   bool default_win_removed = false;
 
	mutex_lock(&direct_window_init_mutex);
 
@@ -1133,14 +1166,38 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
if (ret != 0)
goto out_failed;
 
+   /*
+* If there is no window available, remove the default DMA window,
+* if it's present. This will make all the resources available to the
+* new DDW window.
+* If anything fails after this, we need to restore it, so also check
+* for extensions presence.
+*/
if (query.windows_available == 0) {
-   /*
-* no additional windows are available for this device.
-* We might be able to reallocate the existing window,
-* trading in for a larger page size.
-*/
-   dev_dbg(&dev->dev, "no free dynamic windows");
-   goto out_failed;
+   struct property *default_win;
+   int reset_win_ext;
+
+   default_win = of_find_property(pdn, "ibm,dma-window", NULL);
+   if (!default_win)
+   goto out_failed;
+
+   reset_win_ext = ddw_read_ext(pdn, DDW_EXT_RESET_DMA_WIN, NULL);
+   if (reset_win_ext)
+   goto out_failed;
+
+   remove_dma_window(pdn, ddw_avail, default_win);
+   default_win_removed = true;
+
+   /* Query again, to check if the window is available */
+   ret = query_ddw(dev, ddw_avail, &query, pdn);
+   if (ret != 0)
+   goto out_failed;
+
+   if (query.windows_available == 0) {
+   /* no windows are available for this device. */
+   dev_dbg(&dev->dev, "no free dynamic windows");
+   goto out_failed;
+   }
}
if (query.page_size & 4) {
page_shift = 24; /* 16MB */
@@ -1231,6 +1

[PATCH v5 0/4] Allow bigger 64bit window by removing default DMA window

2020-08-04 Thread Leonardo Bras
There are some devices in which a hypervisor may only allow 1 DMA window
to exist at a time, and in those cases, a DDW is never created to them,
since the default DMA window keeps using this resource.

LoPAR recommends this procedure:
1. Remove the default DMA window,
2. Query for which configs the DDW can be created,
3. Create a DDW.

Patch #1:
Create defines for outputs of ibm,ddw-applicable, so it's easier to
identify them.

Patch #2:
- After LoPAR level 2.8, there is an extension that can make
  ibm,query-pe-dma-windows to have 6 outputs instead of 5. This changes the
  order of the outputs, and that can cause some trouble. 
- query_ddw() was updated to check how many outputs the 
  ibm,query-pe-dma-windows is supposed to have, update the rtas_call() and
  deal correctly with the outputs in both cases.
- This patch looks somehow unrelated to the series, but it can avoid future
  problems on DDW creation.

Patch #3 moves the window-removing code from remove_ddw() to
remove_dma_window(), creating a way to delete any DMA window, so it can be
used to delete the default DMA window.

Patch #4 makes use of the remove_dma_window() from patch #3 to remove the
default DMA window before query_ddw(). It also implements a new rtas call
to recover the default DMA window, in case anything fails after it was
removed, and a DDW couldn't be created.

---
Changes since v4:
- Removed patches 5+ in order to deal with a feature at a time
- Remove unnecessary parentheses in patch #4
- Changed patch #4 title from 
  "Remove default DMA window before creating DDW"
- Included David Dai tested-by
- v4 link: 
http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=190051=%2A=both

Changes since v3:
- Introduces new patch #5, to prepare for an important change in #6
- struct iommu_table was not being updated, so include a way to do this
  in patch #6.
- Improved patch #4 based in a suggestion from Alexey, to make code
  more easily understandable
- v3 link: 
http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=187348=%2A=both

Changes since v2:
- Change the way ibm,ddw-extensions is accessed, using a proper function
  instead of doing this inline every time it's used.
- Remove previous patch #6, as it doesn't look like it would be useful.
- Add new patch, for changing names from direct* to dma*, as indirect 
  mapping can be used from now on.
- Fix some typos, corrects some define usage.
- v2 link: 
http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=185433=%2A=both

Changes since v1:
- Add defines for ibm,ddw-applicable and ibm,ddw-extensions outputs
- Merge aux function query_ddw_out_sz() into query_ddw()
- Merge reset_dma_window() patch (prev. #2) into remove default DMA
  window patch (#4).
- Keep device_node *np name instead of using pdn in remove_*()
- Rename 'device_node *pdn' into 'parent' in new functions
- Rename dfl_win to default_win
- Only remove the default DMA window if there is no window available
  in first query.
- Check if default DMA window can be restored before removing it.
- Fix 'uninitialized use' (found by travis mpe:ci-test)
- New patches #5 and #6
- v1 link: 
http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=184420=%2A=both

Special thanks to Alexey Kardashevskiy, Brian King and
Oliver O'Halloran for the feedback provided!


Leonardo Bras (4):
  powerpc/pseries/iommu: Create defines for operations in
ibm,ddw-applicable
  powerpc/pseries/iommu: Update call to ibm,query-pe-dma-windows
  powerpc/pseries/iommu: Move window-removing part of remove_ddw into
remove_dma_window
  powerpc/pseries/iommu: Allow bigger 64bit window by removing default
DMA window

 arch/powerpc/platforms/pseries/iommu.c | 242 -
 1 file changed, 195 insertions(+), 47 deletions(-)

-- 
2.25.4



Re: [PATCH v4 5/7] powerpc/iommu: Move iommu_table cleaning routine to iommu_table_clean

2020-07-22 Thread Leonardo Bras
On Tue, 2020-07-21 at 19:52 -0500, Brian King wrote:
> > 
> > As of today, there seems to be nothing like that happening in the
> > driver I am testing. 
> > I spoke to Brian King on slack, and he mentioned that at the point DDW
> > is created there should be no allocations in place.
> 
> I think there are a couple of scenarios here. One is where there is a DMA
> allocation prior to a call to set the DMA mask. Second scenario is if the
> driver makes multiple calls to set the DMA mask. I would argue that a properly
> written driver should tell the IOMMU subsystem what DMA mask it supports prior
> to allocating DMA memroy. Documentation/core-api/dma-api-howto.rst should
> describe what is legal and what is not.
> 
> It might be reasonable to declare its not allowed to allocate DMA memory
> and then later change the DMA mask and clearly call this out in the 
> documentation
> if its not already.
> 
> -Brian

Thank you for the feedback Brian!

That makes sense to me. I will try to have this in mind for the next
patchset. 
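
Just to register what that ordering looks like from the driver side, a
generic sketch (names like pdev, size and buf are placeholders, nothing
specific to the driver I am testing):

    /* in probe(): declare the supported DMA mask first ... */
    if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)))
        return -EIO;

    /* ... and only then allocate/map DMA memory */
    buf = dma_alloc_coherent(&pdev->dev, size, &dma_handle, GFP_KERNEL);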

Best regards,



Re: [PATCH v4 5/7] powerpc/iommu: Move iommu_table cleaning routine to iommu_table_clean

2020-07-22 Thread Leonardo Bras
On Wed, 2020-07-22 at 11:28 +1000, Alexey Kardashevskiy wrote:
> 
> On 22/07/2020 08:13, Leonardo Bras wrote:
> > On Tue, 2020-07-21 at 14:59 +1000, Alexey Kardashevskiy wrote:
> > > On 16/07/2020 17:16, Leonardo Bras wrote:
> > > > Move the part of iommu_table_free() that does struct iommu_table 
> > > > cleaning
> > > > into iommu_table_clean, so we can invoke it separately.
> > > > 
> > > > This new function is useful for cleaning struct iommu_table before
> > > > initializing it again with a new DMA window, without having it freed and
> > > > allocated again.
> > > > 
> > > > Signed-off-by: Leonardo Bras 
> > > > ---
> > > >  arch/powerpc/kernel/iommu.c | 30 ++
> > > >  1 file changed, 18 insertions(+), 12 deletions(-)
> > > > 
> > > > diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> > > > index 9704f3f76e63..c3242253a4e7 100644
> > > > --- a/arch/powerpc/kernel/iommu.c
> > > > +++ b/arch/powerpc/kernel/iommu.c
> > > > @@ -735,21 +735,10 @@ struct iommu_table *iommu_init_table(struct 
> > > > iommu_table *tbl, int nid,
> > > > return tbl;
> > > >  }
> > > >  
> > > > -static void iommu_table_free(struct kref *kref)
> > > > +static void iommu_table_clean(struct iommu_table *tbl)
> > > 
> > > iommu_table_free() + iommu_init_table() + set_iommu_table_base() should
> > > work too, why new helper?
> > 
> > iommu_table_free() also frees the tbl, which would require allocating it
> > again (new address) and filling it up again, unnecessarily. 
> 
> It is a new table in fact, everything is new there. You are only saving
> kfree+kzalloc which does not seem a huge win.
> 
> Also, iommu_table_update() simply assumes 64bit window by passing
> res_start=res_end=0 to iommu_init_table() which is not horribly robust
> either. Yeah, I know, iommu_init_table() is always called with zeroes in
> pseries but this is somewhat ok as those tables are from the device tree
> and those windows don't overlap with 32bit MMIO but under KVM they will
> (well, if we hack QEMU to advertise a single window).
> 
> I suggest removing iommu_pseries_table_update() from 6/7 and do
> iommu_table_free() + iommu_init_table() + set_iommu_table_base() with a
> WARN_ON(pdev->dev.archdata.dma_offset>=SZ_4G), may be even do this all
> in enable_ddw() where we know for sure if it is 1:1 mapping or just a
> big window.

Sure, I have yet to understand the full impact of this change, but I
will implement this and give it a try.
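
Just so I make sure I got the idea, a rough sketch of what I understood
(not tested; the helper that allocates the fresh table is a placeholder,
and the exact spot in enable_ddw() is still to be decided):

    WARN_ON(pdev->dev.archdata.dma_offset >= SZ_4G);

    iommu_tce_table_put(tbl);              /* ends up in iommu_table_free() */
    tbl = iommu_pseries_alloc_table(nid);  /* placeholder for a fresh table */

    tbl->it_index = liobn;
    tbl->it_offset = win_addr >> page_shift;
    tbl->it_page_shift = page_shift;
    tbl->it_size = 1 << (window_shift - page_shift);

    iommu_init_table(tbl, nid, 0, 0);
    set_iommu_table_base(&pdev->dev, tbl);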

> 
> Out of curiosity - what page sizes does pHyp advertise in "query"?

64kB (page shift 0x10)

> 
> 
> > I think it's a better approach to only change what is needed.
> > 
> > > There is also iommu_table_clear() which does a different thing so you
> > > need a better name.
> > 
> > I agree.
> > I had not noticed this other function before sending the patchset. What
> > would be a better name though? __iommu_table_free()? 
> > 
> > > Second, iommu_table_free
> > > use and it would be ok as we would only see this when hot-unplugging a
> > > PE because we always kept the default window.
> > > Btw you must be seeing these warnings now every time you create DDW with
> > > these patches as at least the first page is reserved, do not you?
> > 
> > It does not print a warning.
> > I noticed other warnings,
> 
> And what are these?

tce_freemulti_pSeriesLP: plpar_tce_stuff failed
[...]

It's regarding the change in page size. 
Some places have the tceshift hardcoded as 12, tce_freemulti_pSeriesLP
is one of them, and that is causing some errors.

I wrote a patch fixing this, and I will include it in the next series.
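
The fix itself is small. Roughly, for tce_freemulti_pSeriesLP (with the
same treatment for the other helpers that hardcode the shift):

-       rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages);
+       rc = plpar_tce_stuff((u64)tbl->it_index,
+                            (u64)tcenum << tbl->it_page_shift, 0, npages);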

> 
> > but not this one from iommu_table_free():
> > /* verify that table contains no entries */
> > if (!bitmap_empty(tbl->it_map, tbl->it_size))
> > pr_warn("%s: Unexpected TCEs\n", __func__);
> > 
> > Before that, iommu_table_release_pages(tbl) is supposed to clear the 
> > bitmap, so this only tests for a tce that is created in this short period.
> 
> iommu_table_release_pages() only clears reserved pages - page 0 (just a
> protection against NULL DMA pointers) and 32bit MMIO (these should not
> be set for 64bit window). The "%s: Unexpected TCEs\n" is what checks for
> actual mapped TCEs.
> 

Oh, I haven't noticed that. Thanks for pointing!

> > > Since we are replacing a table for a device whi

Re: [PATCH v4 5/7] powerpc/iommu: Move iommu_table cleaning routine to iommu_table_clean

2020-07-21 Thread Leonardo Bras
On Tue, 2020-07-21 at 14:59 +1000, Alexey Kardashevskiy wrote:
> 
> On 16/07/2020 17:16, Leonardo Bras wrote:
> > Move the part of iommu_table_free() that does struct iommu_table cleaning
> > into iommu_table_clean, so we can invoke it separately.
> > 
> > This new function is useful for cleaning struct iommu_table before
> > initializing it again with a new DMA window, without having it freed and
> > allocated again.
> > 
> > Signed-off-by: Leonardo Bras 
> > ---
> >  arch/powerpc/kernel/iommu.c | 30 ++
> >  1 file changed, 18 insertions(+), 12 deletions(-)
> > 
> > diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> > index 9704f3f76e63..c3242253a4e7 100644
> > --- a/arch/powerpc/kernel/iommu.c
> > +++ b/arch/powerpc/kernel/iommu.c
> > @@ -735,21 +735,10 @@ struct iommu_table *iommu_init_table(struct 
> > iommu_table *tbl, int nid,
> > return tbl;
> >  }
> >  
> > -static void iommu_table_free(struct kref *kref)
> > +static void iommu_table_clean(struct iommu_table *tbl)
> 
> iommu_table_free() + iommu_init_table() + set_iommu_table_base() should
> work too, why new helper?

iommu_table_free() also frees the tbl, which would require allocating it
again (new address) and filling it up again, unnecessarily. 
I think it's a better approach to only change what is needed.

> There is also iommu_table_clear() which does a different thing so you
> need a better name.

I agree.
I had not noticed this other function before sending the patchset. What
would be a better name though? __iommu_table_free()? 

> Second, iommu_table_free
> use and it would be ok as we would only see this when hot-unplugging a
> PE because we always kept the default window.
> Btw you must be seeing these warnings now every time you create DDW with
> these patches as at least the first page is reserved, do not you?

It does not print a warning.
I noticed other warnings, but not this one from iommu_table_free():
/* verify that table contains no entries */
if (!bitmap_empty(tbl->it_map, tbl->it_size))
pr_warn("%s: Unexpected TCEs\n", __func__);

Before that, iommu_table_release_pages(tbl) is supposed to clear the 
bitmap, so this only tests for a tce that is created in this short period.

> Since we are replacing a table for a device which is still in the
> system, we should not try messing with its DMA if it already has
> mappings so the warning should become an error preventing DDW. It is
> rather hard to trigger in practice but I could hack a driver to ask for
> 32bit DMA mask first, map few pages and then ask for 64bit DMA mask, it
> is not illegal, I think. So this needs a new helper - "bool
> iommu_table_in_use(tbl)" - to use in enable_ddw(). Or I am overthinking
> this?... Thanks,

As of today, there seems to be nothing like that happening in the
driver I am testing. 
I spoke to Brian King on slack, and he mentioned that at the point DDW
is created there should be no allocations in place.

But I suppose some driver could try to do this.

Maybe a better approach would be removing the mapping only if the
default window is removed (at the end of enable_ddw, as an else to
resetting the default DMA window), and having a way to add more
mappings to those pools. But this last part doesn't look so simple, and
it would be better to understand if it's necessary investing work in
this.

What do you think?
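
For discussion, this is roughly what I would start from for such a
helper (it only scans the allocation bitmap and skips the reserved bit 0;
the reserved 32bit MMIO region would still need to be skipped as well):

static bool iommu_table_in_use(struct iommu_table *tbl)
{
        unsigned long start = 0, end = tbl->it_size;

        /* bit 0 is reserved as a NULL-pointer guard, so skip it */
        if (tbl->it_offset == 0)
                start = 1;

        return find_next_bit(tbl->it_map, end, start) != end;
}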

Best regards,




Re: [PATCH v4 6/7] powerpc/pseries/iommu: Make use of DDW even if it does not map the partition

2020-07-21 Thread Leonardo Bras
On Thu, 2020-07-16 at 04:16 -0300, Leonardo Bras wrote:
> +static void iommu_pseries_table_update(struct pci_dev *dev,
> +  struct device_node *pdn)
> +{
> +   const struct dynamic_dma_window_prop *ddw;
> +   struct pci_dn *pci;
> +   int len;
> +
> +   ddw = of_get_property(pdn, DMA64_PROPNAME, &len);
> +   if (!ddw || len < sizeof(struct dynamic_dma_window_prop))
> +   return;
> +
> +   iommu_table_update(pci->table_group->tables[0], pci->phb->node,
> +  ddw->liobn, ddw->dma_base, ddw->tce_shift,
> +  ddw->window_shift);
> +}
> +
>  static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
>  {
> struct device_node *pdn, *dn;
> @@ -1382,6 +1403,7 @@ static bool iommu_bypass_supported_pSeriesLP(struct 
> pci_dev *pdev, u64 dma_mask)
> pdev->dev.archdata.dma_offset = enable_ddw(pdev, pdn);
> if (pdev->dev.archdata.dma_offset)
> return true;
> +   iommu_pseries_table_update(pdev, pdn);
> }
> 

Noticed a bug in this one: pci is not getting assigned. 
My bad, there must have been a merge error.

Also, I will refactor the function to make use of pdn only, as I can do
pci = PCI_DN(pdn) (I think it's better this way).
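
Something along these lines is what I have in mind for the next version
(untested sketch; I also added be*_to_cpu() conversions here since the
property values are big-endian, which I still need to double-check):

static void iommu_pseries_table_update(struct device_node *pdn)
{
        const struct dynamic_dma_window_prop *ddw;
        struct pci_dn *pci = PCI_DN(pdn);      /* the assignment missing in v4 */
        int len;

        ddw = of_get_property(pdn, DMA64_PROPNAME, &len);
        if (!ddw || len < sizeof(struct dynamic_dma_window_prop))
                return;

        iommu_table_update(pci->table_group->tables[0], pci->phb->node,
                           be32_to_cpu(ddw->liobn), be64_to_cpu(ddw->dma_base),
                           be32_to_cpu(ddw->tce_shift),
                           be32_to_cpu(ddw->window_shift));
}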

Sorry for the buggy patch.

Best regards,



[PATCH v4 7/7] powerpc/pseries/iommu: Rename "direct window" to "dma window"

2020-07-16 Thread Leonardo Bras
A previous change introduced the usage of DDW as a bigger indirect DMA
mapping when the DDW available size does not map the whole partition.

As most of the code that manipulates direct mappings was reused for
indirect mappings, it's necessary to rename all names and debug/info
messages to reflect that it can be used for both kinds of mapping.

Also, defines DEFAULT_DMA_WIN as "ibm,dma-window" to document that
it's the name of the default DMA window.

Those changes are not supposed to change how the code works in any
way, just adjust naming.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 100 +
 1 file changed, 52 insertions(+), 48 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 6e1c9d1599d1..5ca952d966a4 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -339,7 +339,7 @@ struct dynamic_dma_window_prop {
__be32  window_shift;   /* ilog2(tce_window_size) */
 };
 
-struct direct_window {
+struct dma_win {
struct device_node *device;
const struct dynamic_dma_window_prop *prop;
struct list_head list;
@@ -359,12 +359,13 @@ struct ddw_create_response {
u32 addr_lo;
 };
 
-static LIST_HEAD(direct_window_list);
+static LIST_HEAD(dma_win_list);
 /* prevents races between memory on/offline and window creation */
-static DEFINE_SPINLOCK(direct_window_list_lock);
+static DEFINE_SPINLOCK(dma_win_list_lock);
 /* protects initializing window twice for same device */
-static DEFINE_MUTEX(direct_window_init_mutex);
+static DEFINE_MUTEX(dma_win_init_mutex);
 #define DMA64_PROPNAME "linux,dma64-ddr-window-info"
+#define DEFAULT_DMA_WIN "ibm,dma-window"
 
 static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
unsigned long num_pfn, const void *arg)
@@ -697,15 +698,18 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus 
*bus)
pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
 dn);
 
-   /* Find nearest ibm,dma-window, walking up the device tree */
+   /*
+* Find nearest ibm,dma-window (default DMA window), walking up the
+* device tree
+*/
for (pdn = dn; pdn != NULL; pdn = pdn->parent) {
-   dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+   dma_window = of_get_property(pdn, DEFAULT_DMA_WIN, NULL);
if (dma_window != NULL)
break;
}
 
if (dma_window == NULL) {
-   pr_debug("  no ibm,dma-window property !\n");
+   pr_debug("  no %s property !\n", DEFAULT_DMA_WIN);
return;
}
 
@@ -803,11 +807,11 @@ static void remove_dma_window(struct device_node *np, u32 
*ddw_avail,
 
ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);
if (ret)
-   pr_warn("%pOF: failed to remove direct window: rtas returned "
+   pr_warn("%pOF: failed to remove dma window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
else
-   pr_debug("%pOF: successfully removed direct window: rtas 
returned "
+   pr_debug("%pOF: successfully removed dma window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
 }
@@ -835,26 +839,26 @@ static void remove_ddw(struct device_node *np, bool 
remove_prop)
 
ret = of_remove_property(np, win);
if (ret)
-   pr_warn("%pOF: failed to remove direct window property: %d\n",
+   pr_warn("%pOF: failed to remove dma window property: %d\n",
np, ret);
 }
 
 static u64 find_existing_ddw(struct device_node *pdn)
 {
-   struct direct_window *window;
-   const struct dynamic_dma_window_prop *direct64;
+   struct dma_win *window;
+   const struct dynamic_dma_window_prop *dma64;
u64 dma_addr = 0;
 
-   spin_lock(&direct_window_list_lock);
+   spin_lock(&dma_win_list_lock);
/* check if we already created a window and dupe that config if so */
-   list_for_each_entry(window, &direct_window_list, list) {
+   list_for_each_entry(window, &dma_win_list, list) {
if (window->device == pdn) {
-   direct64 = window->prop;
-   dma_addr = be64_to_cpu(direct64->dma_base);
+   dma64 = window->prop;
+   dma_addr = be64_to_cpu(dma64->dma_base);
break;
}
}
-   spin_unlock(_w

[PATCH v4 6/7] powerpc/pseries/iommu: Make use of DDW even if it does not map the partition

2020-07-16 Thread Leonardo Bras
As of today, if the biggest DDW that can be created can't map the whole
partition, its creation is skipped and the default DMA window
"ibm,dma-window" is used instead.

Usually this DDW is bigger than the default DMA window, and it performs
better, so it would be nice to use it instead.

The DDW created will be used for direct mapping by default.
If it's not available, indirect mapping will be used instead.

In this case, it's necessary to update the iommu_table so iommu_alloc()
can use the DDW created. For this, iommu_table_update() is called after
enable_ddw() when direct DMA is not available.

As there will never be both direct and indirect mappings at the same
time, the same property name can be used for the created DDW.

So renaming
define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
to
define DMA64_PROPNAME "linux,dma64-ddr-window-info"
looks the right thing to do.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/include/asm/iommu.h   |  3 ++
 arch/powerpc/kernel/iommu.c| 15 +
 arch/powerpc/platforms/pseries/iommu.c | 46 +++---
 3 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 5032f1593299..dc4480a9d60d 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -154,6 +154,9 @@ extern int iommu_tce_table_put(struct iommu_table *tbl);
  */
 extern struct iommu_table *iommu_init_table(struct iommu_table *tbl,
int nid, unsigned long res_start, unsigned long res_end);
+void iommu_table_update(struct iommu_table *tbl, int nid, unsigned long liobn,
+   unsigned long win_addr, unsigned long page_shift,
+   unsigned long window_shift);
 
 #define IOMMU_TABLE_GROUP_MAX_TABLES   2
 
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index c3242253a4e7..cb0cb572eb0a 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -774,6 +774,21 @@ static void iommu_table_free(struct kref *kref)
kfree(tbl);
 }
 
+void iommu_table_update(struct iommu_table *tbl, int nid, unsigned long liobn,
+   unsigned long win_addr, unsigned long page_shift,
+   unsigned long window_shift)
+{
+   iommu_table_clean(tbl);
+
+   /* Update tlb with values from ddw */
+   tbl->it_index = liobn;
+   tbl->it_offset = win_addr >> page_shift;
+   tbl->it_page_shift = page_shift;
+   tbl->it_size = 1 << (window_shift - page_shift);
+
+   iommu_init_table(tbl, nid, 0, 0);
+}
+
 struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl)
 {
if (kref_get_unless_zero(&tbl->it_kref))
diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index fc8d0555e2e9..6e1c9d1599d1 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -364,7 +364,7 @@ static LIST_HEAD(direct_window_list);
 static DEFINE_SPINLOCK(direct_window_list_lock);
 /* protects initializing window twice for same device */
 static DEFINE_MUTEX(direct_window_init_mutex);
-#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
+#define DMA64_PROPNAME "linux,dma64-ddr-window-info"
 
 static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
unsigned long num_pfn, const void *arg)
@@ -823,7 +823,7 @@ static void remove_ddw(struct device_node *np, bool 
remove_prop)
if (ret)
return;
 
-   win = of_find_property(np, DIRECT64_PROPNAME, NULL);
+   win = of_find_property(np, DMA64_PROPNAME, NULL);
if (!win)
return;
 
@@ -869,8 +869,8 @@ static int find_existing_ddw_windows(void)
if (!firmware_has_feature(FW_FEATURE_LPAR))
return 0;
 
-   for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
-   direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
+   for_each_node_with_property(pdn, DMA64_PROPNAME) {
+   direct64 = of_get_property(pdn, DMA64_PROPNAME, &len);
if (!direct64)
continue;
 
@@ -1210,23 +1210,26 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
  query.page_size);
goto out_failed;
}
+
/* verify the window * number of ptes will map the partition */
-   /* check largest block * page size > max memory hotplug addr */
max_addr = ddw_memory_hotplug_max();
if (query.largest_available_block < (max_addr >> page_shift)) {
-   dev_dbg(>dev, "can't map partition max 0x%llx with %llu "
- "%llu-sized pages\n", max_addr,  
query.largest_available_block,
- 1ULL << page_shift);
- 

[PATCH v4 5/7] powerpc/iommu: Move iommu_table cleaning routine to iommu_table_clean

2020-07-16 Thread Leonardo Bras
Move the part of iommu_table_free() that does struct iommu_table cleaning
into iommu_table_clean, so we can invoke it separately.

This new function is useful for cleaning struct iommu_table before
initializing it again with a new DMA window, without having it freed and
allocated again.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/kernel/iommu.c | 30 ++
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 9704f3f76e63..c3242253a4e7 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -735,21 +735,10 @@ struct iommu_table *iommu_init_table(struct iommu_table 
*tbl, int nid,
return tbl;
 }
 
-static void iommu_table_free(struct kref *kref)
+static void iommu_table_clean(struct iommu_table *tbl)
 {
unsigned long bitmap_sz;
unsigned int order;
-   struct iommu_table *tbl;
-
-   tbl = container_of(kref, struct iommu_table, it_kref);
-
-   if (tbl->it_ops->free)
-   tbl->it_ops->free(tbl);
-
-   if (!tbl->it_map) {
-   kfree(tbl);
-   return;
-   }
 
iommu_table_release_pages(tbl);
 
@@ -763,6 +752,23 @@ static void iommu_table_free(struct kref *kref)
/* free bitmap */
order = get_order(bitmap_sz);
free_pages((unsigned long) tbl->it_map, order);
+}
+
+static void iommu_table_free(struct kref *kref)
+{
+   struct iommu_table *tbl;
+
+   tbl = container_of(kref, struct iommu_table, it_kref);
+
+   if (tbl->it_ops->free)
+   tbl->it_ops->free(tbl);
+
+   if (!tbl->it_map) {
+   kfree(tbl);
+   return;
+   }
+
+   iommu_table_clean(tbl);
 
/* free table */
kfree(tbl);
-- 
2.25.4



[PATCH v4 3/7] powerpc/pseries/iommu: Move window-removing part of remove_ddw into remove_dma_window

2020-07-16 Thread Leonardo Bras
Move the window-removing part of remove_ddw into a new function
(remove_dma_window), so it can be used to remove other DMA windows.

It's useful for removing DMA windows that don't create DIRECT64_PROPNAME
property, like the default DMA window from the device, which uses
"ibm,dma-window".

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 45 +++---
 1 file changed, 27 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 1a933c4e8bba..4e33147825cc 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -781,25 +781,14 @@ static int __init disable_ddw_setup(char *str)
 
 early_param("disable_ddw", disable_ddw_setup);
 
-static void remove_ddw(struct device_node *np, bool remove_prop)
+static void remove_dma_window(struct device_node *np, u32 *ddw_avail,
+ struct property *win)
 {
struct dynamic_dma_window_prop *dwp;
-   struct property *win64;
-   u32 ddw_avail[DDW_APPLICABLE_SIZE];
u64 liobn;
-   int ret = 0;
-
-   ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
-    &ddw_avail[0], DDW_APPLICABLE_SIZE);
-
-   win64 = of_find_property(np, DIRECT64_PROPNAME, NULL);
-   if (!win64)
-   return;
-
-   if (ret || win64->length < sizeof(*dwp))
-   goto delprop;
+   int ret;
 
-   dwp = win64->value;
+   dwp = win->value;
liobn = (u64)be32_to_cpu(dwp->liobn);
 
/* clear the whole window, note the arg is in kernel pages */
@@ -821,10 +810,30 @@ static void remove_ddw(struct device_node *np, bool 
remove_prop)
pr_debug("%pOF: successfully removed direct window: rtas 
returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
+}
+
+static void remove_ddw(struct device_node *np, bool remove_prop)
+{
+   struct property *win;
+   u32 ddw_avail[DDW_APPLICABLE_SIZE];
+   int ret = 0;
+
+   ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
+    &ddw_avail[0], DDW_APPLICABLE_SIZE);
+   if (ret)
+   return;
+
+   win = of_find_property(np, DIRECT64_PROPNAME, NULL);
+   if (!win)
+   return;
+
+   if (win->length >= sizeof(struct dynamic_dma_window_prop))
+   remove_dma_window(np, ddw_avail, win);
+
+   if (!remove_prop)
+   return;
 
-delprop:
-   if (remove_prop)
-   ret = of_remove_property(np, win64);
+   ret = of_remove_property(np, win);
if (ret)
pr_warn("%pOF: failed to remove direct window property: %d\n",
np, ret);
-- 
2.25.4



[PATCH v4 4/7] powerpc/pseries/iommu: Remove default DMA window before creating DDW

2020-07-16 Thread Leonardo Bras
On LoPAR "DMA Window Manipulation Calls", it's recommended to remove the
default DMA window for the device, before attempting to configure a DDW,
in order to make the maximum resources available for the next DDW to be
created.

This is a requirement for using DDW on devices in which the hypervisor
allows only one DMA window.

If setting up a new DDW fails anywhere after the removal of this
default DMA window, it's needed to restore the default DMA window.
For this, an implementation of ibm,reset-pe-dma-windows rtas call is
needed:

Platforms supporting the DDW option starting with LoPAR level 2.7 implement
ibm,ddw-extensions. The first extension available (index 2) carries the
token for ibm,reset-pe-dma-windows rtas call, which is used to restore
the default DMA window for a device, if it has been deleted.

It does so by resetting the TCE table allocation for the PE to its
boot time value, available in "ibm,dma-window" device tree node.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 73 +++---
 1 file changed, 66 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 4e33147825cc..fc8d0555e2e9 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -1066,6 +1066,38 @@ static phys_addr_t ddw_memory_hotplug_max(void)
return max_addr;
 }
 
+/*
+ * Platforms supporting the DDW option starting with LoPAR level 2.7 implement
+ * ibm,ddw-extensions, which carries the rtas token for
+ * ibm,reset-pe-dma-windows.
+ * That rtas-call can be used to restore the default DMA window for the device.
+ */
+static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
+{
+   int ret;
+   u32 cfg_addr, reset_dma_win;
+   u64 buid;
+   struct device_node *dn;
+   struct pci_dn *pdn;
+
+   ret = ddw_read_ext(par_dn, DDW_EXT_RESET_DMA_WIN, &reset_dma_win);
+   if (ret)
+   return;
+
+   dn = pci_device_to_OF_node(dev);
+   pdn = PCI_DN(dn);
+   buid = pdn->phb->buid;
+   cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
+
+   ret = rtas_call(reset_dma_win, 3, 1, NULL, cfg_addr, BUID_HI(buid),
+   BUID_LO(buid));
+   if (ret)
+   dev_info(&dev->dev,
+"ibm,reset-pe-dma-windows(%x) %x %x %x returned %d ",
+reset_dma_win, cfg_addr, BUID_HI(buid), BUID_LO(buid),
+ret);
+}
+
 /*
  * If the PE supports dynamic dma windows, and there is space for a table
  * that can map all pages in a linear offset, then setup such a table,
@@ -1090,6 +1122,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
struct property *win64;
struct dynamic_dma_window_prop *ddwprop;
struct failed_ddw_pdn *fpdn;
+   bool default_win_removed = false;
 
mutex_lock(&direct_window_init_mutex);
 
@@ -1133,14 +1166,38 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
if (ret != 0)
goto out_failed;
 
+   /*
+* If there is no window available, remove the default DMA window,
+* if it's present. This will make all the resources available to the
+* new DDW window.
+* If anything fails after this, we need to restore it, so also check
+* for extensions presence.
+*/
if (query.windows_available == 0) {
-   /*
-* no additional windows are available for this device.
-* We might be able to reallocate the existing window,
-* trading in for a larger page size.
-*/
-   dev_dbg(>dev, "no free dynamic windows");
-   goto out_failed;
+   struct property *default_win;
+   int reset_win_ext;
+
+   default_win = of_find_property(pdn, "ibm,dma-window", NULL);
+   if (!default_win)
+   goto out_failed;
+
+   reset_win_ext = ddw_read_ext(pdn, DDW_EXT_RESET_DMA_WIN, NULL);
+   if (reset_win_ext)
+   goto out_failed;
+
+   remove_dma_window(pdn, ddw_avail, default_win);
+   default_win_removed = true;
+
+   /* Query again, to check if the window is available */
+   ret = query_ddw(dev, ddw_avail, &query, pdn);
+   if (ret != 0)
+   goto out_failed;
+
+   if (query.windows_available == 0) {
+   /* no windows are available for this device. */
+   dev_dbg(>dev, "no free dynamic windows");
+   goto out_failed;
+   }
}
if (query.page_size & 4) {
page_shift = 24; /* 16MB */
@@ -1231,6 +1288,8 @@ static 

[PATCH v4 0/7] Remove default DMA window before creating DDW

2020-07-16 Thread Leonardo Bras
There are some devices in which a hypervisor may only allow 1 DMA window
to exist at a time, and in those cases, a DDW is never created for them,
since the default DMA window keeps using this resource.

LoPAR recommends this procedure:
1. Remove the default DMA window,
2. Query for which configs the DDW can be created,
3. Create a DDW.

Patch #1:
Create defines for outputs of ibm,ddw-applicable, so it's easier to
identify them.

Patch #2:
- After LoPAR level 2.8, there is an extension that can make
  ibm,query-pe-dma-windows to have 6 outputs instead of 5. This changes the
  order of the outputs, and that can cause some trouble. 
- query_ddw() was updated to check how many outputs the 
  ibm,query-pe-dma-windows is supposed to have, update the rtas_call() and
  deal correctly with the outputs in both cases.
- This patch looks somewhat unrelated to the series, but it can avoid future
  problems on DDW creation.

Patch #3 moves the window-removing code from remove_ddw() to
remove_dma_window(), creating a way to delete any DMA window, so it can be
used to delete the default DMA window.

Patch #4 makes use of the remove_dma_window() from patch #3 to remove the
default DMA window before query_ddw(). It also implements a new rtas call
to recover the default DMA window, in case anything fails after it was
removed, and a DDW couldn't be created.

Patch #5 moves the part of iommu_table_free() that does struct iommu_table
cleaning into iommu_table_clean, so we can invoke it separately in
patch #6.

Patch #6:
Instead of destroying the created DDW if it doesn't map the whole
partition, make use of it instead of the default DMA window as it improves
performance. Also, update the iommu_table and re-generate the pools.

Patch #7:
Does some renaming of 'direct window' to 'dma window', given the DDW
created can now be also used in indirect mapping if direct mapping is not
available.

All patches were tested into an LPAR with an Ethernet VF:
4005:01:00.0 Ethernet controller: Mellanox Technologies MT27700 Family
[ConnectX-4 Virtual Function]

Patch #6 was tested with a 64GB DDW which did not map the whole
partition (128G). Performance improvement noticed by using the DDW instead
of the default DMA window:

64 thread write throughput: +203.0%
64 thread read throughput: +17.5%
1 thread write throughput: +20.5%
1 thread read throughput: +3.43%
Average write latency: -23.0%
Average read latency:  -2.26%

---
Changes since v3:
- Introduces new patch #5, to prepare for an important change in #6
- struct iommu_table was not being updated, so include a way to do this
  in patch #6.
- Improved patch #4 based in a suggestion from Alexey, to make code
  more easily understandable
- v3 link: 
http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=187348=%2A=both

Changes since v2:
- Change the way ibm,ddw-extensions is accessed, using a proper function
  instead of doing this inline every time it's used.
- Remove previous patch #6, as it doesn't look like it would be useful.
- Add new patch, for changing names from direct* to dma*, as indirect 
  mapping can be used from now on.
- Fix some typos, corrects some define usage.
- v2 link: 
http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=185433=%2A=both

Changes since v1:
- Add defines for ibm,ddw-applicable and ibm,ddw-extensions outputs
- Merge aux function query_ddw_out_sz() into query_ddw()
- Merge reset_dma_window() patch (prev. #2) into remove default DMA
  window patch (#4).
- Keep device_node *np name instead of using pdn in remove_*()
- Rename 'device_node *pdn' into 'parent' in new functions
- Rename dfl_win to default_win
- Only remove the default DMA window if there is no window available
  in first query.
- Check if default DMA window can be restored before removing it.
- Fix 'uninitialized use' (found by travis mpe:ci-test)
- New patches #5 and #6
- v1 link: 
http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=184420=%2A=both

Special thanks to Alexey Kardashevskiy, Brian King and
Oliver O'Halloran for the feedback provided!


Leonardo Bras (7):
  powerpc/pseries/iommu: Create defines for operations in
ibm,ddw-applicable
  powerpc/pseries/iommu: Update call to ibm,query-pe-dma-windows
  powerpc/pseries/iommu: Move window-removing part of remove_ddw into
remove_dma_window
  powerpc/pseries/iommu: Remove default DMA window before creating DDW
  powerpc/iommu: Move iommu_table cleaning routine to iommu_table_clean
  powerpc/pseries/iommu: Make use of DDW even if it does not map the
partition
  powerpc/pseries/iommu: Rename "direct window" to "dma window"

 arch/powerpc/include/asm/iommu.h   |   3 +
 arch/powerpc/kernel/iommu.c|  45 ++-
 arch/powerpc/platforms/pseries/iommu.c | 380 ++---
 3 files changed, 313 insertions(+), 115 deletions(-)

-- 
2.25.4



[PATCH v4 1/7] powerpc/pseries/iommu: Create defines for operations in ibm,ddw-applicable

2020-07-16 Thread Leonardo Bras
Create defines to help handling ibm,ddw-applicable values, avoiding
confusion about the index of given operations.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 43 --
 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 6d47b4a3ce39..ac0d6376bdad 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -39,6 +39,14 @@
 
 #include "pseries.h"
 
+enum {
+   DDW_QUERY_PE_DMA_WIN  = 0,
+   DDW_CREATE_PE_DMA_WIN = 1,
+   DDW_REMOVE_PE_DMA_WIN = 2,
+
+   DDW_APPLICABLE_SIZE
+};
+
 static struct iommu_table_group *iommu_pseries_alloc_group(int node)
 {
struct iommu_table_group *table_group;
@@ -771,12 +779,12 @@ static void remove_ddw(struct device_node *np, bool 
remove_prop)
 {
struct dynamic_dma_window_prop *dwp;
struct property *win64;
-   u32 ddw_avail[3];
+   u32 ddw_avail[DDW_APPLICABLE_SIZE];
u64 liobn;
int ret = 0;
 
ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
-    &ddw_avail[0], 3);
+    &ddw_avail[0], DDW_APPLICABLE_SIZE);
 
win64 = of_find_property(np, DIRECT64_PROPNAME, NULL);
if (!win64)
@@ -798,15 +806,15 @@ static void remove_ddw(struct device_node *np, bool 
remove_prop)
pr_debug("%pOF successfully cleared tces in window.\n",
 np);
 
-   ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn);
+   ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);
if (ret)
pr_warn("%pOF: failed to remove direct window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
-   np, ret, ddw_avail[2], liobn);
+   np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
else
pr_debug("%pOF: successfully removed direct window: rtas 
returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
-   np, ret, ddw_avail[2], liobn);
+   np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
 
 delprop:
if (remove_prop)
@@ -889,11 +897,11 @@ static int query_ddw(struct pci_dev *dev, const u32 
*ddw_avail,
buid = pdn->phb->buid;
cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
 
-   ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query,
- cfg_addr, BUID_HI(buid), BUID_LO(buid));
+   ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, 5, (u32 *)query,
+   cfg_addr, BUID_HI(buid), BUID_LO(buid));
dev_info(>dev, "ibm,query-pe-dma-windows(%x) %x %x %x"
-   " returned %d\n", ddw_avail[0], cfg_addr, BUID_HI(buid),
-   BUID_LO(buid), ret);
+   " returned %d\n", ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr,
+BUID_HI(buid), BUID_LO(buid), ret);
return ret;
 }
 
@@ -920,15 +928,16 @@ static int create_ddw(struct pci_dev *dev, const u32 
*ddw_avail,
 
do {
/* extra outputs are LIOBN and dma-addr (hi, lo) */
-   ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create,
-   cfg_addr, BUID_HI(buid), BUID_LO(buid),
-   page_shift, window_shift);
+   ret = rtas_call(ddw_avail[DDW_CREATE_PE_DMA_WIN], 5, 4,
+   (u32 *)create, cfg_addr, BUID_HI(buid),
+   BUID_LO(buid), page_shift, window_shift);
} while (rtas_busy_delay(ret));
dev_info(&dev->dev,
"ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d "
-   "(liobn = 0x%x starting addr = %x %x)\n", ddw_avail[1],
-cfg_addr, BUID_HI(buid), BUID_LO(buid), page_shift,
-window_shift, ret, create->liobn, create->addr_hi, 
create->addr_lo);
+   "(liobn = 0x%x starting addr = %x %x)\n",
+ddw_avail[DDW_CREATE_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
+BUID_LO(buid), page_shift, window_shift, ret, create->liobn,
+create->addr_hi, create->addr_lo);
 
return ret;
 }
@@ -996,7 +1005,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
int page_shift;
u64 dma_addr, max_addr;
struct device_node *dn;
-   u32 ddw_avail[3];
+   u32 ddw_avail[DDW_APPLICABLE_SIZE];
struct direct_window *window;
struct property *win64;
struct dynamic_dma_window_prop *ddwprop;
@@ -1029,7 +1038,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
 * the pr

[PATCH v4 2/7] powerpc/pseries/iommu: Update call to ibm,query-pe-dma-windows

2020-07-16 Thread Leonardo Bras
>From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can make the number of
outputs from "ibm,query-pe-dma-windows" go from 5 to 6.

This change of output size is meant to expand the address size of
largest_available_block PE TCE from 32-bit to 64-bit, which ends up
shifting page_size and migration_capable.

This ends up requiring the update of
ddw_query_response->largest_available_block from u32 to u64, and manually
assigning the values from the buffer into this struct, according to
output size.

Also, a routine was created to help read the DDW extensions, as
suggested by LoPAR: first read the size of the extension array from
index 0 (which also checks that the property exists), and then return
the value at the requested index.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 91 +++---
 1 file changed, 81 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index ac0d6376bdad..1a933c4e8bba 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -47,6 +47,12 @@ enum {
DDW_APPLICABLE_SIZE
 };
 
+enum {
+   DDW_EXT_SIZE = 0,
+   DDW_EXT_RESET_DMA_WIN = 1,
+   DDW_EXT_QUERY_OUT_SIZE = 2
+};
+
 static struct iommu_table_group *iommu_pseries_alloc_group(int node)
 {
struct iommu_table_group *table_group;
@@ -342,7 +348,7 @@ struct direct_window {
 /* Dynamic DMA Window support */
 struct ddw_query_response {
u32 windows_available;
-   u32 largest_available_block;
+   u64 largest_available_block;
u32 page_size;
u32 migration_capable;
 };
@@ -877,14 +883,62 @@ static int find_existing_ddw_windows(void)
 }
 machine_arch_initcall(pseries, find_existing_ddw_windows);
 
+/**
+ * ddw_read_ext - Get the value of a DDW extension
+ * @np:	device node from which the extension value is to be read.
+ * @extnum:	index number of the extension.
+ * @value: pointer to return value, modified when extension is available.
+ *
+ * Checks if "ibm,ddw-extensions" exists for this node, and get the value
+ * on index 'extnum'.
+ * It can be used only to check if a property exists, passing value == NULL.
+ *
+ * Returns:
+ * 0 if extension successfully read
+ * -EINVAL if the "ibm,ddw-extensions" does not exist,
+ * -ENODATA if "ibm,ddw-extensions" does not have a value, and
+ * -EOVERFLOW if "ibm,ddw-extensions" does not contain this extension.
+ */
+static inline int ddw_read_ext(const struct device_node *np, int extnum,
+  u32 *value)
+{
+   static const char propname[] = "ibm,ddw-extensions";
+   u32 count;
+   int ret;
+
+   ret = of_property_read_u32_index(np, propname, DDW_EXT_SIZE, &count);
+   if (ret)
+   return ret;
+
+   if (count < extnum)
+   return -EOVERFLOW;
+
+   if (!value)
+   value = &count;
+
+   return of_property_read_u32_index(np, propname, extnum, value);
+}
+
 static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
-   struct ddw_query_response *query)
+struct ddw_query_response *query,
+struct device_node *parent)
 {
struct device_node *dn;
struct pci_dn *pdn;
-   u32 cfg_addr;
+   u32 cfg_addr, ext_query, query_out[5];
u64 buid;
-   int ret;
+   int ret, out_sz;
+
+   /*
+* From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can rule how many
+* output parameters ibm,query-pe-dma-windows will have, ranging from
+* 5 to 6.
+*/
+   ret = ddw_read_ext(parent, DDW_EXT_QUERY_OUT_SIZE, &ext_query);
+   if (!ret && ext_query == 1)
+   out_sz = 6;
+   else
+   out_sz = 5;
 
/*
 * Get the config address and phb buid of the PE window.
@@ -897,11 +951,28 @@ static int query_ddw(struct pci_dev *dev, const u32 
*ddw_avail,
buid = pdn->phb->buid;
cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
 
-   ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, 5, (u32 *)query,
+   ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, out_sz, query_out,
cfg_addr, BUID_HI(buid), BUID_LO(buid));
-   dev_info(>dev, "ibm,query-pe-dma-windows(%x) %x %x %x"
-   " returned %d\n", ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr,
-BUID_HI(buid), BUID_LO(buid), ret);
+   dev_info(>dev, "ibm,query-pe-dma-windows(%x) %x %x %x returned 
%d\n",
+ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
+BUID_LO(buid), ret);
+
+   switch (out_sz) {
+   case 5:
+   query->windows_available = query_out[0];
+   query->largest_available_bl
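
For reference, the rest of query_ddw() unpacks the outputs roughly as in
the sketch below (field order as per the commit message above, so take
the exact indices as illustrative rather than a copy of the real hunk):

        switch (out_sz) {
        case 5:
                query->windows_available = query_out[0];
                query->largest_available_block = query_out[1];
                query->page_size = query_out[2];
                query->migration_capable = query_out[3];
                break;
        case 6:
                query->windows_available = query_out[0];
                /* 64-bit largest_available_block split over two outputs */
                query->largest_available_block = ((u64)query_out[1] << 32) |
                                                 query_out[2];
                query->page_size = query_out[3];
                query->migration_capable = query_out[4];
                break;
        }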

Re: [PATCH v3 4/6] powerpc/pseries/iommu: Remove default DMA window before creating DDW

2020-07-14 Thread Leonardo Bras
In fact, the changes relative to the last version are more complex than
the current patch itself.
Just for reference, this is how enable_ddw() is currently patched:

@@ -1087,7 +1119,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct
device_node *pdn)
struct device_node *dn;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
struct direct_window *window;
-   struct property *win64;
+   struct property *win64, *default_win = NULL;
struct dynamic_dma_window_prop *ddwprop;
struct failed_ddw_pdn *fpdn;
 
@@ -1133,14 +1165,38 @@ static u64 enable_ddw(struct pci_dev *dev,
struct device_node *pdn)
if (ret != 0)
goto out_failed;
 
+   /*
+* If there is no window available, remove the default DMA
window,
+* if it's present. This will make all the resources available
to the
+* new DDW window.
+* If anything fails after this, we need to restore it, so also
check
+* for extensions presence.
+*/
if (query.windows_available == 0) {
-   /*
-* no additional windows are available for this device.
-* We might be able to reallocate the existing window,
-* trading in for a larger page size.
-*/
-   dev_dbg(>dev, "no free dynamic windows");
-   goto out_failed;
+   int reset_win_ext;
+
+   default_win = of_find_property(pdn, "ibm,dma-window",
NULL);
+   if (!default_win)
+   goto out_failed;
+
+   reset_win_ext = ddw_read_ext(pdn,
DDW_EXT_RESET_DMA_WIN, NULL);
+   if (reset_win_ext) {
+   default_win = NULL;
+   goto out_failed;
+   }
+
+   remove_dma_window(pdn, ddw_avail, default_win);
+
+   /* Query again, to check if the window is available */
+   ret = query_ddw(dev, ddw_avail, &query, pdn);
+   if (ret != 0)
+   goto out_failed;
+
+   if (query.windows_available == 0) {
+   /* no windows are available for this device. */
+   dev_dbg(>dev, "no free dynamic windows");
+   goto out_failed;
+   }
}
if (query.page_size & 4) {
page_shift = 24; /* 16MB */
@@ -1231,6 +1287,8 @@ static u64 enable_ddw(struct pci_dev *dev, struct
device_node *pdn)
kfree(win64);
 
 out_failed:
+   if (default_win)
+   reset_dma_window(dev, pdn);
 
fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
if (!fpdn)



Re: [PATCH v3 4/6] powerpc/pseries/iommu: Remove default DMA window before creating DDW

2020-07-14 Thread Leonardo Bras
On Tue, 2020-07-14 at 14:52 +1000, Alexey Kardashevskiy wrote:
> 
> On 14/07/2020 12:40, Leonardo Bras wrote:
> > Thank you for this feedback Alexey!
> > 
> > On Mon, 2020-07-13 at 17:33 +1000, Alexey Kardashevskiy wrote:
> > > [...]
> > > > -   int len, ret;
> > > > +   int len, ret, reset_win_ext;
> > > 
> > > Make it "reset_token".
> > 
> > Oh, it's not a token here, it just checks if the reset_win extension
> > exists. The token would be returned in *value, but since we did not
> > need it here, it's not copied.
> 
> ah right, so it is a bool actually.

In fact I made it an int, as it's the return value of ddw_read_ext(),
which can return 0 on success and -error otherwise.

> > > > [...]
> > > > -out_failed:
> > > > +out_restore_defwin:
> > > > +   if (default_win && reset_win_ext == 0)
> > > 
> > > reset_win_ext potentially may be uninitialized here. Yeah I know it is
> > > tied to default_win but still.
> > 
> > I can't see it being used uninitialized here, as you said it's tied to
> > default_win. 
> 
> Where it is declared - it is not initialized so in theory it can skip
> "if (query.windows_available == 0)".

Humm, I thought doing if (default_win && reset_win_ext == 0) would
guarantee default_win to be tested before reset_win_ext is ever tested,
so I could control it using default_win. 

> 
> 
> > Could you please tell me how it can be used uninitialized here, or what
> > is bad by doing this way?
> > 
> > > After looking at this function for a few minutes, it could use some
> > > refactoring (way too many gotos)  such as:
> > 
> > Yes, I agree.
> > 
> > > 1. move (query.page_size & xx) checks before "if
> > > (query.windows_available == 0)"
> > 
> > Moving 'page_size selection' above 'checking windows available' will
> > need us to duplicate the 'page_size selection' after the new query,
> > inside the if.
> 
> page_size selection is not going to change, why?

In theory, a query after freeing the default DMA window could have a
different (bigger) page size, so we should test again.

> 
> 
> > I mean, as query will be done again, it will need to get the (new) page
> > size.
> > 
> > > 2. move "win64 = kzalloc(sizeof(struct property), GFP_KERNEL)" before
> > > "if (query.windows_available == 0)"
> > > 3. call "reset_dma_window(dev, pdn)" inside the "if
> > > (query.windows_available == 0)" branch.
> > > Then you can drop all "goto out_restore_defwin" and move default_win and
> > > reset_win_ext inside "if (query.windows_available == 0)".
> > 
> > I did all changes suggested locally and did some analysis in the
> > result:
> > 
> > I did not see a way to put default_win and reset_win_ext inside 
> > "if (query.windows_available == 0)", because if we still need a way to
> > know if the default window was removed, and if so, restore in case
> > anything ever fails ahead (like creating the node property). 
> 
> Ah, I missed that new out_restore_defwin label is between other exit
> labels. Sorry :-/
> 
> 
> > reset_win_ext = ddw_read_ext(pdn,
> > DDW_EXT_RESET_DMA_WIN, NULL);
> > -   if (reset_win_ext)
> > +   if (reset_win_ext){
> > +   default_win = NULL;
> > goto out_failed;
> > +   }
> 
> This says "if we can reset, then we fail", no?

Here ddw_read_ext() should return 0 if extension was found, and 
(-EINVAL, -ENODATA or -EOVERFLOW) otherwise.
So it should return nonzero if we can't find the extension, in which
case we should fail.

> 
> > remove_dma_window(pdn, ddw_avail, default_win);
> 
> I think you can do "default_win=NULL" here and later at
> out_restore_defwin check if it is NULL - then call reset.

Currently I initialize 'default_win = NULL', and it only changes when I
read the default DMA window. If reset is not available I restore it to
NULL, so it will be not-NULL only when we have removed the default DMA
window. 

If I make it NULL here, we either never reset the default DMA window
(as it is now "if (default_win)" ) or we may always reset it (in case
 "if (default_win == NULL)"). 

If you think it's better, I can create a bool variable like
"default_win_removed", initialized with 'false', which can be assigned
here with 'true' and test in the end if(default_win_removed) reset();
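
i.e. something along these lines (sketch only):

        bool default_win_removed = false;

        if (query.windows_available == 0) {
                /* ... find "ibm,dma-window", check the reset extension ... */
                remove_dma_window(pdn, ddw_avail, default_win);
                default_win_removed = true;
                /* ... query again ... */
        }

out_failed:
        if (default_win_removed)
                reset_dma_window(dev, pdn);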

This 

Re: [PATCH v3 4/6] powerpc/pseries/iommu: Remove default DMA window before creating DDW

2020-07-13 Thread Leonardo Bras
Thank you for this feedback Alexey!

On Mon, 2020-07-13 at 17:33 +1000, Alexey Kardashevskiy wrote:
> [...]
> > -   int len, ret;
> > +   int len, ret, reset_win_ext;
> 
> Make it "reset_token".

Oh, it's not a token here, it just checks if the reset_win extension
exists. The token would be returned in *value, but since we did not
need it here, it's not copied.

> > [...]
> > -out_failed:
> > +out_restore_defwin:
> > +   if (default_win && reset_win_ext == 0)
> 
> reset_win_ext potentially may be uninitialized here. Yeah I know it is
> tied to default_win but still.

I can't see it being used uninitialized here, as you said it's tied to
default_win. 
Could you please tell me how it can be used uninitialized here, or what
is bad by doing this way?

> After looking at this function for a few minutes, it could use some
> refactoring (way too many gotos)  such as:

Yes, I agree.

> 1. move (query.page_size & xx) checks before "if
> (query.windows_available == 0)"

Moving 'page_size selection' above 'checking windows available' will
need us to duplicate the 'page_size selection' after the new query,
inside the if.
I mean, as query will be done again, it will need to get the (new) page
size.

> 2. move "win64 = kzalloc(sizeof(struct property), GFP_KERNEL)" before
> "if (query.windows_available == 0)"

> 3. call "reset_dma_window(dev, pdn)" inside the "if
> (query.windows_available == 0)" branch.

> Then you can drop all "goto out_restore_defwin" and move default_win and
> reset_win_ext inside "if (query.windows_available == 0)".

I did all changes suggested locally and did some analysis in the
result:

I did not see a way to put default_win and reset_win_ext inside 
"if (query.windows_available == 0)", because if we still need a way to
know if the default window was removed, and if so, restore in case
anything ever fails ahead (like creating the node property). 

But from that analysis I noted it's possible to remove all the new
"goto out_restore_defwin", if we do default_win = NULL if
ddw_read_ext() fails. 

So testing only default_win should always be enough to say if the
default window was deleted, and reset_win_ext could be moved inside "if
(query.windows_available == 0)".
Also, it would avoid reset_win_ext being 'used uninitialized' and
"out_restore_defwin:" would not be needed.

Against the current patch, we would have something like this:

#

 static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 {
-   int len, ret, reset_win_ext;
+   int len, ret;
struct ddw_query_response query;
struct ddw_create_response create;
int page_shift;
@@ -1173,25 +1173,28 @@ static u64 enable_ddw(struct pci_dev *dev,
struct device_node *pdn)
 * for extensions presence.
 */
if (query.windows_available == 0) {
+   int reset_win_ext;
default_win = of_find_property(pdn, "ibm,dma-window",
NULL);
if (!default_win)
goto out_failed;
 
reset_win_ext = ddw_read_ext(pdn,
DDW_EXT_RESET_DMA_WIN, NULL);
-   if (reset_win_ext)
+   if (reset_win_ext){
+   default_win = NULL;
goto out_failed;
+   }
 
remove_dma_window(pdn, ddw_avail, default_win);
 
/* Query again, to check if the window is available */
ret = query_ddw(dev, ddw_avail, &query, pdn);
if (ret != 0)
-   goto out_restore_defwin;
+   goto out_failed;
 
if (query.windows_available == 0) {
/* no windows are available for this device. */
dev_dbg(>dev, "no free dynamic windows");
-   goto out_restore_defwin;
+   goto out_failed;
}
}
if (query.page_size & 4) {
@@ -1203,7 +1206,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct
device_node *pdn)
} else {
dev_dbg(>dev, "no supported direct page size in
mask %x",
  query.page_size);
-   goto out_restore_defwin;
+   goto out_failed;
}
/* verify the window * number of ptes will map the partition */
/* check largest block * page size > max memory hotplug addr */
@@ -1212,14 +1215,14 @@ static u64 enable_ddw(struct pci_dev *dev,
struct device_node *pdn)
dev_dbg(>dev, "can't map partition max 0x%llx with
%llu "
  "%llu-sized pages\n",
max_addr,  query.largest_available_block,
  1ULL << page_shift);
-   goto out_restore_defwin;
+   goto out_failed;
}
len = order_base_2(max_addr);
win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
if (!win64) {
dev_info(&dev->dev,
"couldn't allocate property for 64bit dma
window\n");
-   

[PATCH 1/1] KVM/PPC: Fix typo on H_DISABLE_AND_GET hcall

2020-07-06 Thread Leonardo Bras
On PAPR+ the hcall at 0x1B0 is called H_DISABLE_AND_GET, but it got
defined as H_DISABLE_AND_GETC instead.

This define was introduced with a typo in commit 
("[PATCH] powerpc: Extends HCALL interface for InfiniBand usage"), and was
later used without the typo being noticed.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/include/asm/hvcall.h| 2 +-
 arch/powerpc/kvm/trace_hv.h  | 2 +-
 tools/perf/arch/powerpc/util/book3s_hcalls.h | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/hvcall.h 
b/arch/powerpc/include/asm/hvcall.h
index e90c073e437e..d8ada9c7ec78 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -237,7 +237,7 @@
 #define H_CREATE_RPT0x1A4
 #define H_REMOVE_RPT0x1A8
 #define H_REGISTER_RPAGES   0x1AC
-#define H_DISABLE_AND_GETC  0x1B0
+#define H_DISABLE_AND_GET   0x1B0
 #define H_ERROR_DATA0x1B4
 #define H_GET_HCA_INFO  0x1B8
 #define H_GET_PERF_COUNT0x1BC
diff --git a/arch/powerpc/kvm/trace_hv.h b/arch/powerpc/kvm/trace_hv.h
index 4a61a971c34e..830a126e095d 100644
--- a/arch/powerpc/kvm/trace_hv.h
+++ b/arch/powerpc/kvm/trace_hv.h
@@ -89,7 +89,7 @@
{H_CREATE_RPT,  "H_CREATE_RPT"}, \
{H_REMOVE_RPT,  "H_REMOVE_RPT"}, \
{H_REGISTER_RPAGES, "H_REGISTER_RPAGES"}, \
-   {H_DISABLE_AND_GETC,"H_DISABLE_AND_GETC"}, \
+   {H_DISABLE_AND_GET, "H_DISABLE_AND_GET"}, \
{H_ERROR_DATA,  "H_ERROR_DATA"}, \
{H_GET_HCA_INFO,"H_GET_HCA_INFO"}, \
{H_GET_PERF_COUNT,  "H_GET_PERF_COUNT"}, \
diff --git a/tools/perf/arch/powerpc/util/book3s_hcalls.h 
b/tools/perf/arch/powerpc/util/book3s_hcalls.h
index 54cfa0530e86..488f4339b83c 100644
--- a/tools/perf/arch/powerpc/util/book3s_hcalls.h
+++ b/tools/perf/arch/powerpc/util/book3s_hcalls.h
@@ -84,7 +84,7 @@
{0x1a4, "H_CREATE_RPT"},\
{0x1a8, "H_REMOVE_RPT"},\
{0x1ac, "H_REGISTER_RPAGES"},   \
-   {0x1b0, "H_DISABLE_AND_GETC"},  \
+   {0x1b0, "H_DISABLE_AND_GET"},   \
{0x1b4, "H_ERROR_DATA"},\
{0x1b8, "H_GET_HCA_INFO"},  \
{0x1bc, "H_GET_PERF_COUNT"},\
-- 
2.25.4



Re: [PATCH v2 5/6] powerpc/pseries/iommu: Make use of DDW even if it does not map the partition

2020-07-03 Thread Leonardo Bras
On Thu, 2020-07-02 at 10:31 +1000, Alexey Kardashevskiy wrote:
> 
> On 02/07/2020 09:48, Leonardo Bras wrote:
> > On Wed, 2020-07-01 at 16:57 -0300, Leonardo Bras wrote:
> > > > It is not necessarily "direct" anymore as the name suggests, you may
> > > > want to change that. DMA64_PROPNAME, may be. Thanks,
> > > > 
> > > 
> > > Yeah, you are right.
> > > I will change this for next version, also changing the string name to
> > > reflect this.
> > > 
> > > -#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
> > > +#define DMA64_PROPNAME "linux,dma64-ddr-window-info"
> > > 
> > > Is that ok?
> > > 
> > > Thank you for helping!
> > 
> > In fact, there is a lot of places in this file where it's called direct
> > window. Should I replace everything?
> > Should it be in a separated patch?
> 
> If it looks simple and you write a nice commit log explaining all that
> and why you are not reusing the existing ibm,dma-window property (to
> provide a clue what "reset" will reset to? is there any other reason?)
> for that - sure, do it :)
> 

v3 available here:
http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=187348=%2A=both

Best regards,
Leonardo



[PATCH v3 4/6] powerpc/pseries/iommu: Remove default DMA window before creating DDW

2020-07-03 Thread Leonardo Bras
On LoPAR "DMA Window Manipulation Calls", it's recommended to remove the
default DMA window for the device, before attempting to configure a DDW,
in order to make the maximum resources available for the next DDW to be
created.

This is a requirement for using DDW on devices in which the hypervisor
allows only one DMA window.

If setting up a new DDW fails anywhere after the removal of this
default DMA window, it's needed to restore the default DMA window.
For this, an implementation of ibm,reset-pe-dma-windows rtas call is
needed:

Platforms supporting the DDW option starting with LoPAR level 2.7 implement
ibm,ddw-extensions. The first extension available (index 2) carries the
token for ibm,reset-pe-dma-windows rtas call, which is used to restore
the default DMA window for a device, if it has been deleted.

It does so by resetting the TCE table allocation for the PE to it's
boot time value, available in "ibm,dma-window" device tree node.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 83 +-
 1 file changed, 69 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 4e33147825cc..5b520ac354c6 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -1066,6 +1066,38 @@ static phys_addr_t ddw_memory_hotplug_max(void)
return max_addr;
 }
 
+/*
+ * Platforms supporting the DDW option starting with LoPAR level 2.7 implement
+ * ibm,ddw-extensions, which carries the rtas token for
+ * ibm,reset-pe-dma-windows.
+ * That rtas-call can be used to restore the default DMA window for the device.
+ */
+static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
+{
+   int ret;
+   u32 cfg_addr, reset_dma_win;
+   u64 buid;
+   struct device_node *dn;
+   struct pci_dn *pdn;
+
+   ret = ddw_read_ext(par_dn, DDW_EXT_RESET_DMA_WIN, &reset_dma_win);
+   if (ret)
+   return;
+
+   dn = pci_device_to_OF_node(dev);
+   pdn = PCI_DN(dn);
+   buid = pdn->phb->buid;
+   cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
+
+   ret = rtas_call(reset_dma_win, 3, 1, NULL, cfg_addr, BUID_HI(buid),
+   BUID_LO(buid));
+   if (ret)
+   dev_info(&dev->dev,
+"ibm,reset-pe-dma-windows(%x) %x %x %x returned %d ",
+reset_dma_win, cfg_addr, BUID_HI(buid), BUID_LO(buid),
+ret);
+}
+
 /*
  * If the PE supports dynamic dma windows, and there is space for a table
  * that can map all pages in a linear offset, then setup such a table,
@@ -1079,7 +1111,7 @@ static phys_addr_t ddw_memory_hotplug_max(void)
  */
 static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 {
-   int len, ret;
+   int len, ret, reset_win_ext;
struct ddw_query_response query;
struct ddw_create_response create;
int page_shift;
@@ -1087,7 +1119,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
struct device_node *dn;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
struct direct_window *window;
-   struct property *win64;
+   struct property *win64, *default_win = NULL;
struct dynamic_dma_window_prop *ddwprop;
struct failed_ddw_pdn *fpdn;
 
@@ -1122,7 +1154,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
if (ret)
goto out_failed;
 
-   /*
+   /*
 * Query if there is a second window of size to map the
 * whole partition.  Query returns number of windows, largest
 * block assigned to PE (partition endpoint), and two bitmasks
@@ -1133,14 +1165,34 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
if (ret != 0)
goto out_failed;
 
+   /*
+* If there is no window available, remove the default DMA window,
+* if it's present. This will make all the resources available to the
+* new DDW window.
+* If anything fails after this, we need to restore it, so also check
+* for extensions presence.
+*/
if (query.windows_available == 0) {
-   /*
-* no additional windows are available for this device.
-* We might be able to reallocate the existing window,
-* trading in for a larger page size.
-*/
-   dev_dbg(&dev->dev, "no free dynamic windows");
-   goto out_failed;
+   default_win = of_find_property(pdn, "ibm,dma-window", NULL);
+   if (!default_win)
+   goto out_failed;
+
+   reset_win_ext = ddw_read_ext(pdn, DDW_EXT_RESET_DMA_WIN, NULL);
+   if (reset_win_ext)
+   goto out_failed;
+
+   
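
The hunk is cut off above; based on the commit message, the branch presumably
continues by removing the default window and querying again, roughly along the
lines of this sketch (not the literal patch text; the label and variable names
here are assumptions):

		/* sketch of the assumed continuation, not the patch itself:
		 * remove the default window so its resources become available,
		 * then query again for a usable window */
		remove_dma_window(pdn, ddw_avail, default_win);

		ret = query_ddw(dev, ddw_avail, &query, pdn);
		if (ret != 0)
			goto out_restore_defwin;

		if (query.windows_available == 0)
			goto out_restore_defwin;
	}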

[PATCH v3 5/6] powerpc/pseries/iommu: Make use of DDW even if it does not map the partition

2020-07-03 Thread Leonardo Bras
As of today, if the biggest DDW that can be created can't map the whole
partition, its creation is skipped and the default DMA window
"ibm,dma-window" is used instead.

Usually this DDW is bigger than the default DMA window, and it performs
better, so it would be nice to use it instead.

The DDW created will be used for direct mapping by default.
If that's not possible, indirect mapping will be used instead.

As both mappings will never exist at the same time, the same property
name can be used for the created DDW.

So renaming
define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
to
define DMA64_PROPNAME "linux,dma64-ddr-window-info"
looks the right thing to do.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 38 --
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 5b520ac354c6..c652177de09c 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -364,7 +364,7 @@ static LIST_HEAD(direct_window_list);
 static DEFINE_SPINLOCK(direct_window_list_lock);
 /* protects initializing window twice for same device */
 static DEFINE_MUTEX(direct_window_init_mutex);
-#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
+#define DMA64_PROPNAME "linux,dma64-ddr-window-info"
 
 static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
unsigned long num_pfn, const void *arg)
@@ -690,7 +690,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
struct iommu_table *tbl;
struct device_node *dn, *pdn;
struct pci_dn *ppci;
-   const __be32 *dma_window = NULL;
+   const __be32 *dma_window = NULL, *alt_dma_window = NULL;
 
dn = pci_bus_to_OF_node(bus);
 
@@ -704,8 +704,13 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
break;
}
 
+   /* If there is a DDW available, use it instead */
+   alt_dma_window = of_get_property(pdn, DMA64_PROPNAME, NULL);
+   if (alt_dma_window)
+   dma_window = alt_dma_window;
+
if (dma_window == NULL) {
-   pr_debug("  no ibm,dma-window property !\n");
+   pr_debug("  no ibm,dma-window nor linux,dma64-ddr-window-info 
property !\n");
return;
}
 
@@ -823,7 +828,7 @@ static void remove_ddw(struct device_node *np, bool remove_prop)
if (ret)
return;
 
-   win = of_find_property(np, DIRECT64_PROPNAME, NULL);
+   win = of_find_property(np, DMA64_PROPNAME, NULL);
if (!win)
return;
 
@@ -869,8 +874,8 @@ static int find_existing_ddw_windows(void)
if (!firmware_has_feature(FW_FEATURE_LPAR))
return 0;
 
-   for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
-   direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
+   for_each_node_with_property(pdn, DMA64_PROPNAME) {
+   direct64 = of_get_property(pdn, DMA64_PROPNAME, &len);
if (!direct64)
continue;
 
@@ -1205,23 +1210,26 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
  query.page_size);
goto out_restore_defwin;
}
+
/* verify the window * number of ptes will map the partition */
-   /* check largest block * page size > max memory hotplug addr */
max_addr = ddw_memory_hotplug_max();
if (query.largest_available_block < (max_addr >> page_shift)) {
-   dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu "
- "%llu-sized pages\n", max_addr,  query.largest_available_block,
- 1ULL << page_shift);
-   goto out_restore_defwin;
+   dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu %llu-sized pages\n",
+   max_addr, query.largest_available_block,
+   1ULL << page_shift);
+
+   len = order_base_2(query.largest_available_block << page_shift);
+   } else {
+   len = order_base_2(max_addr);
}
-   len = order_base_2(max_addr);
+
win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
if (!win64) {
dev_info(&dev->dev,
"couldn't allocate property for 64bit dma window\n");
goto out_restore_defwin;
}
-   win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
+   win64->name = kstrdup(DMA64_PROPNAME, GFP_KERNEL);
win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
win64->length = sizeof(*ddwprop);
if (!win64->name || !win64->value) {

[PATCH v3 3/6] powerpc/pseries/iommu: Move window-removing part of remove_ddw into remove_dma_window

2020-07-03 Thread Leonardo Bras
Move the window-removing part of remove_ddw into a new function
(remove_dma_window), so it can be used to remove other DMA windows.

This is useful for removing DMA windows that are not described by the
DIRECT64_PROPNAME property, such as the device's default DMA window,
which uses "ibm,dma-window".

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 45 +++---
 1 file changed, 27 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 1a933c4e8bba..4e33147825cc 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -781,25 +781,14 @@ static int __init disable_ddw_setup(char *str)
 
 early_param("disable_ddw", disable_ddw_setup);
 
-static void remove_ddw(struct device_node *np, bool remove_prop)
+static void remove_dma_window(struct device_node *np, u32 *ddw_avail,
+ struct property *win)
 {
struct dynamic_dma_window_prop *dwp;
-   struct property *win64;
-   u32 ddw_avail[DDW_APPLICABLE_SIZE];
u64 liobn;
-   int ret = 0;
-
-   ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
-					 &ddw_avail[0], DDW_APPLICABLE_SIZE);
-
-   win64 = of_find_property(np, DIRECT64_PROPNAME, NULL);
-   if (!win64)
-   return;
-
-   if (ret || win64->length < sizeof(*dwp))
-   goto delprop;
+   int ret;
 
-   dwp = win64->value;
+   dwp = win->value;
liobn = (u64)be32_to_cpu(dwp->liobn);
 
/* clear the whole window, note the arg is in kernel pages */
@@ -821,10 +810,30 @@ static void remove_ddw(struct device_node *np, bool remove_prop)
pr_debug("%pOF: successfully removed direct window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
+}
+
+static void remove_ddw(struct device_node *np, bool remove_prop)
+{
+   struct property *win;
+   u32 ddw_avail[DDW_APPLICABLE_SIZE];
+   int ret = 0;
+
+   ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
+					 &ddw_avail[0], DDW_APPLICABLE_SIZE);
+   if (ret)
+   return;
+
+   win = of_find_property(np, DIRECT64_PROPNAME, NULL);
+   if (!win)
+   return;
+
+   if (win->length >= sizeof(struct dynamic_dma_window_prop))
+   remove_dma_window(np, ddw_avail, win);
+
+   if (!remove_prop)
+   return;
 
-delprop:
-   if (remove_prop)
-   ret = of_remove_property(np, win64);
+   ret = of_remove_property(np, win);
if (ret)
pr_warn("%pOF: failed to remove direct window property: %d\n",
np, ret);
-- 
2.25.4



[PATCH v3 6/6] powerpc/pseries/iommu: Rename "direct window" to "dma window"

2020-07-03 Thread Leonardo Bras
A previous change introduced the use of DDW as a bigger indirect DMA
mapping when the available DDW size does not map the whole partition.

As most of the code that manipulates direct mappings was reused for
indirect mappings, it's necessary to rename all names and debug/info
messages to reflect that it can be used for both kinds of mapping.

Also, define DEFAULT_DMA_WIN as "ibm,dma-window" to document that
it is the name of the default DMA window.

Those changes are not supposed to change how the code works in any
way, just adjust naming.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 101 +
 1 file changed, 53 insertions(+), 48 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index c652177de09c..070b80efc43a 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -339,7 +339,7 @@ struct dynamic_dma_window_prop {
__be32  window_shift;   /* ilog2(tce_window_size) */
 };
 
-struct direct_window {
+struct dma_win {
struct device_node *device;
const struct dynamic_dma_window_prop *prop;
struct list_head list;
@@ -359,12 +359,13 @@ struct ddw_create_response {
u32 addr_lo;
 };
 
-static LIST_HEAD(direct_window_list);
+static LIST_HEAD(dma_win_list);
 /* prevents races between memory on/offline and window creation */
-static DEFINE_SPINLOCK(direct_window_list_lock);
+static DEFINE_SPINLOCK(dma_win_list_lock);
 /* protects initializing window twice for same device */
-static DEFINE_MUTEX(direct_window_init_mutex);
+static DEFINE_MUTEX(dma_win_init_mutex);
 #define DMA64_PROPNAME "linux,dma64-ddr-window-info"
+#define DEFAULT_DMA_WIN "ibm,dma-window"
 
 static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
unsigned long num_pfn, const void *arg)
@@ -697,9 +698,12 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
 dn);
 
-   /* Find nearest ibm,dma-window, walking up the device tree */
+   /*
+* Find nearest ibm,dma-window (default DMA window), walking up the
+* device tree
+*/
for (pdn = dn; pdn != NULL; pdn = pdn->parent) {
-   dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+   dma_window = of_get_property(pdn, DEFAULT_DMA_WIN, NULL);
if (dma_window != NULL)
break;
}
@@ -710,7 +714,8 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
dma_window = alt_dma_window;
 
if (dma_window == NULL) {
-   pr_debug("  no ibm,dma-window nor linux,dma64-ddr-window-info 
property !\n");
+   pr_debug("  no %s nor %s property !\n",
+DEFAULT_DMA_WIN, DMA64_PROPNAME);
return;
}
 
@@ -808,11 +813,11 @@ static void remove_dma_window(struct device_node *np, u32 *ddw_avail,
 
ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);
if (ret)
-   pr_warn("%pOF: failed to remove direct window: rtas returned "
+   pr_warn("%pOF: failed to remove dma window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
else
-   pr_debug("%pOF: successfully removed direct window: rtas 
returned "
+   pr_debug("%pOF: successfully removed dma window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
 }
@@ -840,26 +845,26 @@ static void remove_ddw(struct device_node *np, bool remove_prop)
 
ret = of_remove_property(np, win);
if (ret)
-   pr_warn("%pOF: failed to remove direct window property: %d\n",
+   pr_warn("%pOF: failed to remove dma window property: %d\n",
np, ret);
 }
 
 static u64 find_existing_ddw(struct device_node *pdn)
 {
-   struct direct_window *window;
-   const struct dynamic_dma_window_prop *direct64;
+   struct dma_win *window;
+   const struct dynamic_dma_window_prop *dma64;
u64 dma_addr = 0;
 
-   spin_lock(&direct_window_list_lock);
+   spin_lock(&dma_win_list_lock);
 /* check if we already created a window and dupe that config if so */
-   list_for_each_entry(window, &direct_window_list, list) {
+   list_for_each_entry(window, &dma_win_list, list) {
if (window->device == pdn) {
-   direct64 = window->prop;
-   dma_addr = be64_to_cpu(di

[PATCH v3 2/6] powerpc/pseries/iommu: Update call to ibm,query-pe-dma-windows

2020-07-03 Thread Leonardo Bras
From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can make the number of
outputs from "ibm,query-pe-dma-windows" go from 5 to 6.

This change of output size is meant to expand the address size of
largest_available_block PE TCE from 32-bit to 64-bit, which ends up
shifting page_size and migration_capable.

This ends up requiring the update of
ddw_query_response->largest_available_block from u32 to u64, and manually
assigning the values from the buffer into this struct, according to
output size.

Also, a routine was created to help read the DDW extensions as
suggested by LoPAR: first read the size of the extension array from
index 0, check if the property exists, and then return its value.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 91 +++---
 1 file changed, 81 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index ac0d6376bdad..1a933c4e8bba 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -47,6 +47,12 @@ enum {
DDW_APPLICABLE_SIZE
 };
 
+enum {
+   DDW_EXT_SIZE = 0,
+   DDW_EXT_RESET_DMA_WIN = 1,
+   DDW_EXT_QUERY_OUT_SIZE = 2
+};
+
 static struct iommu_table_group *iommu_pseries_alloc_group(int node)
 {
struct iommu_table_group *table_group;
@@ -342,7 +348,7 @@ struct direct_window {
 /* Dynamic DMA Window support */
 struct ddw_query_response {
u32 windows_available;
-   u32 largest_available_block;
+   u64 largest_available_block;
u32 page_size;
u32 migration_capable;
 };
@@ -877,14 +883,62 @@ static int find_existing_ddw_windows(void)
 }
 machine_arch_initcall(pseries, find_existing_ddw_windows);
 
+/**
+ * ddw_read_ext - Get the value of a DDW extension
+ * @np:		device node from which the extension value is to be read.
+ * @extnum:	index number of the extension.
+ * @value: pointer to return value, modified when extension is available.
+ *
+ * Checks if "ibm,ddw-extensions" exists for this node, and get the value
+ * on index 'extnum'.
+ * It can be used only to check if a property exists, passing value == NULL.
+ *
+ * Returns:
+ * 0 if extension successfully read
+ * -EINVAL if the "ibm,ddw-extensions" does not exist,
+ * -ENODATA if "ibm,ddw-extensions" does not have a value, and
+ * -EOVERFLOW if "ibm,ddw-extensions" does not contain this extension.
+ */
+static inline int ddw_read_ext(const struct device_node *np, int extnum,
+  u32 *value)
+{
+   static const char propname[] = "ibm,ddw-extensions";
+   u32 count;
+   int ret;
+
+   ret = of_property_read_u32_index(np, propname, DDW_EXT_SIZE, &count);
+   if (ret)
+   return ret;
+
+   if (count < extnum)
+   return -EOVERFLOW;
+
+   if (!value)
+   value = &count;
+
+   return of_property_read_u32_index(np, propname, extnum, value);
+}
+
 static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
-   struct ddw_query_response *query)
+struct ddw_query_response *query,
+struct device_node *parent)
 {
struct device_node *dn;
struct pci_dn *pdn;
-   u32 cfg_addr;
+   u32 cfg_addr, ext_query, query_out[5];
u64 buid;
-   int ret;
+   int ret, out_sz;
+
+   /*
+* From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can rule how many
+* output parameters ibm,query-pe-dma-windows will have, ranging from
+* 5 to 6.
+*/
+   ret = ddw_read_ext(parent, DDW_EXT_QUERY_OUT_SIZE, &ext_query);
+   if (!ret && ext_query == 1)
+   out_sz = 6;
+   else
+   out_sz = 5;
 
/*
 * Get the config address and phb buid of the PE window.
@@ -897,11 +951,28 @@ static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
buid = pdn->phb->buid;
cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
 
-   ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, 5, (u32 *)query,
+   ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, out_sz, query_out,
cfg_addr, BUID_HI(buid), BUID_LO(buid));
-   dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x"
-   " returned %d\n", ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr,
-BUID_HI(buid), BUID_LO(buid), ret);
+   dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x returned %d\n",
+ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
+BUID_LO(buid), ret);
+
+   switch (out_sz) {
+   case 5:
+   query->windows_available = query_out[0];
+   query->largest_available_bl
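
The switch statement is cut off above; based on the commit message, the two
output layouts are presumably unpacked roughly as in this sketch (not the
literal patch text; the exact field order is an assumption):

	/* sketch: 5-output (legacy) vs 6-output (64-bit largest block) layout */
	switch (out_sz) {
	case 5:
		query->windows_available = query_out[0];
		query->largest_available_block = query_out[1];
		query->page_size = query_out[2];
		query->migration_capable = query_out[3];
		break;
	case 6:
		query->windows_available = query_out[0];
		query->largest_available_block = ((u64)query_out[1] << 32) |
						 query_out[2];
		query->page_size = query_out[3];
		query->migration_capable = query_out[4];
		break;
	}

	return ret;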

[PATCH v3 0/6] Remove default DMA window before creating DDW

2020-07-03 Thread Leonardo Bras
There are some devices for which a hypervisor may only allow one DMA window
to exist at a time, and in those cases a DDW is never created for them,
since the default DMA window keeps using this resource.

LoPAR recommends this procedure:
1. Remove the default DMA window,
2. Query for which configs the DDW can be created,
3. Create a DDW.
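
A minimal sketch of the resulting enable_ddw() flow (simplified pseudo-C;
declarations, page-size selection and most error handling are elided, and
the control flow and label names here are only illustrative -- the helper
names follow the patches, the rest is an assumption):

	/* sketch only -- see the patches for the real code */
	query_ddw(dev, ddw_avail, &query, pdn);

	if (query.windows_available == 0) {
		/* 1. remove the default window, but only if it can be restored */
		if (ddw_read_ext(pdn, DDW_EXT_RESET_DMA_WIN, NULL))
			goto out_failed;
		default_win = of_find_property(pdn, "ibm,dma-window", NULL);
		remove_dma_window(pdn, ddw_avail, default_win);
		default_win_removed = true;

		/* 2. query again, now that the default window's resources are free */
		query_ddw(dev, ddw_avail, &query, pdn);
		if (query.windows_available == 0)
			goto out_restore_defwin;
	}

	/* 3. create the DDW; it is used even if it can't map the whole partition */
	if (create_ddw(dev, ddw_avail, &create, page_shift, len))
		goto out_restore_defwin;

	return dma_addr;			/* success */

out_restore_defwin:
	if (default_win_removed)
		reset_dma_window(dev, pdn);	/* ibm,reset-pe-dma-windows */
out_failed:
	return 0;				/* fall back to the default window */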

Patch #1:
Create defines for outputs of ibm,ddw-applicable, so it's easier to
identify them.

Patch #2:
- After LoPAR level 2.8, there is an extension that can make
  ibm,query-pe-dma-windows have 6 outputs instead of 5. This changes the
  order of the outputs, and that can cause some trouble.
- query_ddw() was updated to check how many outputs the 
  ibm,query-pe-dma-windows is supposed to have, update the rtas_call() and
  deal correctly with the outputs in both cases.
- This patch looks somewhat unrelated to the series, but it can avoid future
  problems with DDW creation.

Patch #3 moves the window-removing code from remove_ddw() to
remove_dma_window(), creating a way to delete any DMA window, so it can be
used to delete the default DMA window.

Patch #4 makes use of the remove_dma_window() from patch #3 to remove the
default DMA window before query_ddw(). It also implements a new rtas call
to recover the default DMA window, in case anything fails after it was
removed, and a DDW couldn't be created.

Patch #5:
Instead of destroying the created DDW if it doesn't map the whole
partition, make use of it instead of the default DMA window as it improves
performance.

Patch #6:
Does some renaming of 'direct window' to 'dma window', given the DDW
created can now also be used for indirect mapping if direct mapping is
not available.

All patches were tested in an LPAR with an Ethernet VF:
4005:01:00.0 Ethernet controller: Mellanox Technologies MT27700 Family
[ConnectX-4 Virtual Function]

Patch #5 was tested with a 64GB DDW which did not map the whole
partition (128G). Performance improvement noticed by using the DDW instead
of the default DMA window:

64 thread write throughput: +203.0%
64 thread read throughput: +17.5%
1 thread write throughput: +20.5%
1 thread read throughput: +3.43%
Average write latency: -23.0%
Average read latency:  -2.26%

---
Changes since v2:
- Change the way ibm,ddw-extensions is accessed, using a proper function
  instead of doing this inline every time it's used.
- Remove previous patch #6, as it doesn't look like it would be useful.
- Add new patch, for changing names from direct* to dma*, as indirect 
  mapping can be used from now on.
- Fix some typos, corrects some define usage.
- v2 link: 
http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=185433=%2A=both

Changes since v1:
- Add defines for ibm,ddw-applicable and ibm,ddw-extensions outputs
- Merge aux function query_ddw_out_sz() into query_ddw()
- Merge reset_dma_window() patch (prev. #2) into remove default DMA
  window patch (#4).
- Keep device_node *np name instead of using pdn in remove_*()
- Rename 'device_node *pdn' into 'parent' in new functions
- Rename dfl_win to default_win
- Only remove the default DMA window if there is no window available
  in first query.
- Check if default DMA window can be restored before removing it.
- Fix 'uninitialized use' (found by travis mpe:ci-test)
- New patches #5 and #6
- v1 link: 
http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=184420=%2A=both

Special thanks to Alexey Kardashevskiy and Oliver O'Halloran for
the feedback provided!

Leonardo Bras (6):
  powerpc/pseries/iommu: Create defines for operations in
ibm,ddw-applicable
  powerpc/pseries/iommu: Update call to ibm,query-pe-dma-windows
  powerpc/pseries/iommu: Move window-removing part of remove_ddw into
remove_dma_window
  powerpc/pseries/iommu: Remove default DMA window before creating DDW
  powerpc/pseries/iommu: Make use of DDW even if it does not map the
partition
  powerpc/pseries/iommu: Rename "direct window" to "dma window"

 arch/powerpc/platforms/pseries/iommu.c | 379 ++---
 1 file changed, 269 insertions(+), 110 deletions(-)

-- 
2.25.4



[PATCH v3 1/6] powerpc/pseries/iommu: Create defines for operations in ibm,ddw-applicable

2020-07-03 Thread Leonardo Bras
Create defines to help handle ibm,ddw-applicable values, avoiding
confusion about the index of a given operation.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/platforms/pseries/iommu.c | 43 --
 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 6d47b4a3ce39..ac0d6376bdad 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -39,6 +39,14 @@
 
 #include "pseries.h"
 
+enum {
+   DDW_QUERY_PE_DMA_WIN  = 0,
+   DDW_CREATE_PE_DMA_WIN = 1,
+   DDW_REMOVE_PE_DMA_WIN = 2,
+
+   DDW_APPLICABLE_SIZE
+};
+
 static struct iommu_table_group *iommu_pseries_alloc_group(int node)
 {
struct iommu_table_group *table_group;
@@ -771,12 +779,12 @@ static void remove_ddw(struct device_node *np, bool remove_prop)
 {
struct dynamic_dma_window_prop *dwp;
struct property *win64;
-   u32 ddw_avail[3];
+   u32 ddw_avail[DDW_APPLICABLE_SIZE];
u64 liobn;
int ret = 0;
 
ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
-					 &ddw_avail[0], 3);
+					 &ddw_avail[0], DDW_APPLICABLE_SIZE);
 
win64 = of_find_property(np, DIRECT64_PROPNAME, NULL);
if (!win64)
@@ -798,15 +806,15 @@ static void remove_ddw(struct device_node *np, bool remove_prop)
pr_debug("%pOF successfully cleared tces in window.\n",
 np);
 
-   ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn);
+   ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);
if (ret)
pr_warn("%pOF: failed to remove direct window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
-   np, ret, ddw_avail[2], liobn);
+   np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
else
pr_debug("%pOF: successfully removed direct window: rtas 
returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
-   np, ret, ddw_avail[2], liobn);
+   np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
 
 delprop:
if (remove_prop)
@@ -889,11 +897,11 @@ static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
buid = pdn->phb->buid;
cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
 
-   ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query,
- cfg_addr, BUID_HI(buid), BUID_LO(buid));
+   ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, 5, (u32 *)query,
+   cfg_addr, BUID_HI(buid), BUID_LO(buid));
dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x"
-   " returned %d\n", ddw_avail[0], cfg_addr, BUID_HI(buid),
-   BUID_LO(buid), ret);
+   " returned %d\n", ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr,
+BUID_HI(buid), BUID_LO(buid), ret);
return ret;
 }
 
@@ -920,15 +928,16 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail,
 
do {
/* extra outputs are LIOBN and dma-addr (hi, lo) */
-   ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create,
-   cfg_addr, BUID_HI(buid), BUID_LO(buid),
-   page_shift, window_shift);
+   ret = rtas_call(ddw_avail[DDW_CREATE_PE_DMA_WIN], 5, 4,
+   (u32 *)create, cfg_addr, BUID_HI(buid),
+   BUID_LO(buid), page_shift, window_shift);
} while (rtas_busy_delay(ret));
dev_info(&dev->dev,
"ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d "
-   "(liobn = 0x%x starting addr = %x %x)\n", ddw_avail[1],
-cfg_addr, BUID_HI(buid), BUID_LO(buid), page_shift,
-window_shift, ret, create->liobn, create->addr_hi, create->addr_lo);
+   "(liobn = 0x%x starting addr = %x %x)\n",
+ddw_avail[DDW_CREATE_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
+BUID_LO(buid), page_shift, window_shift, ret, create->liobn,
+create->addr_hi, create->addr_lo);
 
return ret;
 }
@@ -996,7 +1005,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
int page_shift;
u64 dma_addr, max_addr;
struct device_node *dn;
-   u32 ddw_avail[3];
+   u32 ddw_avail[DDW_APPLICABLE_SIZE];
struct direct_window *window;
struct property *win64;
struct dynamic_dma_window_prop *ddwprop;
@@ -1029,7 +1038,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 * the pr

Re: [PATCH v2 5/6] powerpc/pseries/iommu: Make use of DDW even if it does not map the partition

2020-07-01 Thread Leonardo Bras
On Thu, 2020-07-02 at 10:31 +1000, Alexey Kardashevskiy wrote:
> > In fact, there is a lot of places in this file where it's called direct
> > window. Should I replace everything?
> > Should it be in a separated patch?
> 
> If it looks simple and you write a nice commit log explaining all that
> and why you are not reusing the existing ibm,dma-window property 
> for that - sure, do it :)

Nice, I will do that :)

> (to provide a clue what "reset" will reset to? is there any other
> reason?)

That's the main reason here. 

The way I perceive this, ibm,dma-window should only point to the
default DMA window, which is guaranteed to always be the same, even if
it's destroyed and re-created. So there I see no point destroying /
overwriting it.

On the other hand, I also thought about using a new node name for this
window, but it would be very troublesome and I could see no real gain.

Thanks !


