Re: [PATCH v1 3/3] swiotlb: Split up single swiotlb lock

2022-06-29 Thread Chao Gao
On Tue, Jun 28, 2022 at 03:01:34PM +0800, Chao Gao wrote:
>From: Andi Kleen 
>
>Traditionally swiotlb was not performance critical because it was only
>used for slow devices. But in some setups, like TDX confidential
>guests, all IO has to go through swiotlb. Currently swiotlb has only a
>single lock. Under high IO load with multiple CPUs this can lead to
>significant lock contention on the swiotlb lock. We've seen 20+% CPU
>time spent in locks in some extreme cases.
>
>This patch splits the swiotlb into individual areas, each with its own
>lock. Each CPU tries to allocate in its own area first. Only if that
>fails does it search other areas. On freeing, the allocation is
>returned to the area from which the memory was originally allocated.
>
>To avoid doing a full modulo in the main path, the number of swiotlb
>areas is always rounded to the next power of two. I believe that's
>not really needed anymore on modern CPUs (which have fast enough
>dividers), but it is still a good idea on older parts.
>
>The number of areas can be set using the swiotlb option. But to avoid
>every user having to set this option, the default is the number of
>available CPUs. Unfortunately, on x86 swiotlb is initialized before
>num_possible_cpus() is available, which is why a custom hook called
>from the early ACPI code is used.
>
>Signed-off-by: Andi Kleen 
>[ rebase and fix warnings of checkpatch.pl ]
>Signed-off-by: Chao Gao 

Just noticed that Tianyu already posted a variant of this patch.
Will drop this one from my series.


[PATCH v1 3/3] swiotlb: Split up single swiotlb lock

2022-06-28 Thread Chao Gao
From: Andi Kleen 

Traditionally swiotlb was not performance critical because it was only
used for slow devices. But in some setups, like TDX confidential
guests, all IO has to go through swiotlb. Currently swiotlb has only a
single lock. Under high IO load with multiple CPUs this can lead to
significant lock contention on the swiotlb lock. We've seen 20+% CPU
time spent in locks in some extreme cases.

This patch splits the swiotlb into individual areas, each with its own
lock. Each CPU tries to allocate in its own area first. Only if that
fails does it search other areas. On freeing, the allocation is
returned to the area from which the memory was originally allocated.

To avoid doing a full modulo in the main path, the number of swiotlb
areas is always rounded to the next power of two. I believe that's
not really needed anymore on modern CPUs (which have fast enough
dividers), but it is still a good idea on older parts.
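
For illustration only, a minimal sketch of the area selection (nareas,
cpu_start_area() and area_find_slots() below are placeholder names for
this sketch, not the patch's actual code; assumes <linux/smp.h>):

/*
 * nareas is rounded up to a power of two, so a CPU's home area can be
 * picked with a mask instead of a full modulo, and allocation only
 * falls back to the other areas when the home area is exhausted.
 */
static unsigned int nareas;	/* always a power of two */

static unsigned int cpu_start_area(unsigned int cpu)
{
	return cpu & (nareas - 1);	/* cheap equivalent of cpu % nareas */
}

static int alloc_from_areas(struct io_tlb_mem *mem, size_t alloc_size)
{
	unsigned int start = cpu_start_area(raw_smp_processor_id());
	unsigned int i;

	for (i = 0; i < nareas; i++) {
		unsigned int area = (start + i) & (nareas - 1);
		/* area_find_slots() stands in for the per-area search */
		int slot = area_find_slots(mem, area, alloc_size);

		if (slot >= 0)
			return slot;
	}
	return -1;
}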

The number of areas can be set using the swiotlb option. But to avoid
every user having to set this option, the default is the number of
available CPUs. Unfortunately, on x86 swiotlb is initialized before
num_possible_cpus() is available, which is why a custom hook called
from the early ACPI code is used.

Signed-off-by: Andi Kleen 
[ rebase and fix warnings of checkpatch.pl ]
Signed-off-by: Chao Gao 
---
 .../admin-guide/kernel-parameters.txt |   4 +-
 arch/x86/kernel/acpi/boot.c   |   4 +
 include/linux/swiotlb.h   |  30 +++-
 kernel/dma/swiotlb.c  | 168 --
 4 files changed, 180 insertions(+), 26 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 8090130b544b..5d46271982d5 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5869,8 +5869,10 @@
it if 0 is given (See Documentation/admin-guide/cgroup-v1/memory.rst)
 
	swiotlb=	[ARM,IA-64,PPC,MIPS,X86]
-			Format: { <int> | force | noforce }
+			Format: { <int> [,<int>] | force | noforce }
			<int> -- Number of I/O TLB slabs
+			<int> -- Second integer after comma. Number of swiotlb
+				 areas with their own lock. Must be power of 2.
			force -- force using of bounce buffers even if they
				 wouldn't be automatically used by the kernel
			noforce -- Never use bounce buffers (for debugging)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 907cc98b1938..f05e55b2d50c 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include <linux/swiotlb.h>
 
 #include 
 #include 
@@ -1131,6 +1132,9 @@ static int __init acpi_parse_madt_lapic_entries(void)
return count;
}
 
+   /* This does not take overrides into consideration */
+   swiotlb_hint_cpus(max(count, x2count));
+
x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI,
acpi_parse_x2apic_nmi, 0);
count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI,
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 012a6fde873b..cf36c5cc7584 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -69,7 +69,25 @@ struct io_tlb_slot {
 };
 
 /**
- * struct io_tlb_mem - IO TLB Memory Pool Descriptor
+ * struct io_tlb_area - IO TLB memory area descriptor
+ *
+ * This is a single area with a single lock.
+ *
+ * @used:  The number of used IO TLB block.
+ * @list:  The free list describing the number of free entries available
+ * from each index.
+ * @lock:  The lock to protect the above data structures in the map and
+ * unmap calls.
+ */
+
+struct io_tlb_area {
+   unsigned long used;
+   struct list_head free_slots;
+   spinlock_t lock;
+};
+
+/**
+ * struct io_tlb_mem - io tlb memory pool descriptor
  *
  * @start: The start address of the swiotlb memory pool. Used to do a quick
  * range check to see if the memory was in fact allocated by this
@@ -87,8 +105,6 @@ struct io_tlb_slot {
  * @index: The index to start searching in the next round.
  * @orig_addr: The original address corresponding to a mapped entry.
  * @alloc_size:Size of the allocated buffer.
- * @lock:  The lock to protect the above data structures in the map and
- * unmap calls.
  * @debugfs:   The dentry to debugfs.
  * @late_alloc:%true if allocated using the page allocator
  * @force_bounce: %true if swiotlb bouncing is forced
@@ -101,13 +117,11 @@ struct io_tlb_mem {
phys_addr_t end;
void *vaddr;
unsigned long nslabs;
-   unsigned long used;
-   struct list_

[PATCH v1 2/3] swiotlb: Allocate memory in a cache-friendly way

2022-06-28 Thread Chao Gao
Currently, swiotlb uses a global index to indicate the starting point
of the next search. The index increases from 0 to the number of slots - 1
and then wraps around. This is straightforward but not cache-friendly,
because the "oldest" slot in swiotlb tends to be used first.

Freed slots were probably accessed right before being freed, especially
in a VM's case (device backends access them in DMA_TO_DEVICE mode; the
guest accesses them in other DMA modes). Those just-freed slots may
therefore still reside in the cache, and reusing them can reduce cache
misses.

To that end, maintain a free list of slots: freed slots are inserted at
the head, and searches for free slots always start from the head.

With this optimization, network throughput of sending data from host to
guest, measured by iperf3, increases by 7%.

A bad side effect of this patch is that we can no longer use a large
stride to skip unaligned slots when there is an alignment requirement.
Currently, a large stride is used when a) the device has an alignment
requirement, in which case the stride is calculated from that
requirement; or b) the requested size is larger than PAGE_SIZE, in which
case, for x86 with a 4KB page size, the stride is set to 2.

For case a), few devices have an alignment requirement, so the impact is
limited. For case b), this patch probably leads to one additional lookup
(or more if the page size is larger than 4K); but since the "io_tlb_slot"
structs of free slots are also accessed when slots are freed, they
probably reside in the CPU cache as well, so the overhead is almost
negligible.
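
As an illustration of the free-list discipline (slot_alloc()/slot_free()
below are hypothetical helpers, not this patch's functions; the struct
fields match this patch and <linux/list.h> is assumed):

static void slot_free(struct io_tlb_mem *mem, struct io_tlb_slot *slot)
{
	/* Freed slots go to the head, so the most recently used (and
	 * most likely cache-hot) slots are handed out again first. */
	list_add(&slot->node, &mem->free_slots);
}

static struct io_tlb_slot *slot_alloc(struct io_tlb_mem *mem)
{
	struct io_tlb_slot *slot;

	if (list_empty(&mem->free_slots))
		return NULL;
	/* Searching always starts from the head. */
	slot = list_first_entry(&mem->free_slots, struct io_tlb_slot, node);
	list_del(&slot->node);
	return slot;
}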

Suggested-by: Andi Kleen 
Signed-off-by: Chao Gao 
---
 include/linux/swiotlb.h | 13 +
 kernel/dma/swiotlb.c| 63 -
 2 files changed, 32 insertions(+), 44 deletions(-)

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index c3eab237991a..012a6fde873b 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -62,6 +62,12 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys,
 #ifdef CONFIG_SWIOTLB
 extern enum swiotlb_force swiotlb_force;
 
+struct io_tlb_slot {
+   phys_addr_t orig_addr;
+   size_t alloc_size;
+   struct list_head node;
+};
+
 /**
  * struct io_tlb_mem - IO TLB Memory Pool Descriptor
  *
@@ -96,16 +102,13 @@ struct io_tlb_mem {
void *vaddr;
unsigned long nslabs;
unsigned long used;
-   unsigned int index;
+   struct list_head free_slots;
spinlock_t lock;
struct dentry *debugfs;
bool late_alloc;
bool force_bounce;
bool for_alloc;
-   struct io_tlb_slot {
-   phys_addr_t orig_addr;
-   size_t alloc_size;
-   } *slots;
+   struct io_tlb_slot *slots;
unsigned long *bitmap;
 };
 extern struct io_tlb_mem io_tlb_default_mem;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index d7f68c0af7f5..2283f3a36f0d 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -200,7 +200,7 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
mem->nslabs = nslabs;
mem->start = start;
mem->end = mem->start + bytes;
-   mem->index = 0;
+   INIT_LIST_HEAD(&mem->free_slots);
mem->late_alloc = late_alloc;
 
mem->force_bounce = swiotlb_force_bounce || (flags & SWIOTLB_FORCE);
@@ -210,6 +210,7 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
__set_bit(i, mem->bitmap);
mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
mem->slots[i].alloc_size = 0;
+   list_add_tail(&mem->slots[i].node, &mem->free_slots);
}
 
/*
@@ -484,13 +485,6 @@ static inline unsigned long get_max_slots(unsigned long boundary_mask)
return nr_slots(boundary_mask + 1);
 }
 
-static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
-{
-   if (index >= mem->nslabs)
-   return 0;
-   return index;
-}
-
 /*
  * Find a suitable number of IO TLB entries size that will fit this request and
  * allocate a buffer from that IO TLB pool.
@@ -499,29 +493,21 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
  size_t alloc_size, unsigned int alloc_align_mask)
 {
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+   struct io_tlb_slot *slot, *tmp;
unsigned long boundary_mask = dma_get_seg_boundary(dev);
dma_addr_t tbl_dma_addr =
phys_to_dma_unencrypted(dev, mem->start) & boundary_mask;
+   dma_addr_t slot_dma_addr;
unsigned long max_slots = get_max_slots(boundary_mask);
unsigned int iotlb_align_mask =
dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1);
-   unsigned int nslots = nr_slots(alloc_size), stride;
-   unsigned int index, wrap, i;
+   unsigned int nslots = nr_slot

[PATCH v1 1/3] swiotlb: Use bitmap to track free slots

2022-06-28 Thread Chao Gao
Currently, each slot tracks the number of contiguous free slots starting
from itself. This helps to quickly check whether there are enough
contiguous entries when dealing with an allocation request. But
maintaining this information can lead to some overhead. Specifically, if
a slot is allocated or freed, preceding slots may need to be updated, as
the number of contiguous free slots can change. This process may access
memory scattered over multiple cachelines.

To reduce the overhead of maintaining the number of contiguous free
entries, use a global bitmap to track free slots; each bit indicates
whether a slot is available. The number of contiguous free slots can be
calculated by counting the number of consecutive 1s in the bitmap.
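
For illustration only (io_tlb_slots_free() is a made-up helper, not this
patch's actual search loop), checking whether nslots contiguous slots
starting at a given index are free reduces to looking for a run of set
bits:

static bool io_tlb_slots_free(const unsigned long *bitmap,
			      unsigned int index, unsigned int nslots)
{
	unsigned int i;

	for (i = index; i < index + nslots; i++)
		if (!test_bit(i, bitmap))	/* set bit == slot is free */
			return false;
	return true;
}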

Tests show that the average cost of freeing slots drops by 120 cycles,
while the average cost of allocation increases by 20 cycles. Overall,
100 cycles are saved per allocation/free pair.

Signed-off-by: Chao Gao 
---
 include/linux/swiotlb.h |  6 ++--
 kernel/dma/swiotlb.c| 64 -
 2 files changed, 34 insertions(+), 36 deletions(-)

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 7ed35dd3de6e..c3eab237991a 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -78,8 +78,6 @@ extern enum swiotlb_force swiotlb_force;
  * @end. For default swiotlb, this is command line adjustable via
  * setup_io_tlb_npages.
  * @used:  The number of used IO TLB block.
- * @list:  The free list describing the number of free entries available
- * from each index.
  * @index: The index to start searching in the next round.
  * @orig_addr: The original address corresponding to a mapped entry.
  * @alloc_size:Size of the allocated buffer.
@@ -89,6 +87,8 @@ extern enum swiotlb_force swiotlb_force;
  * @late_alloc:%true if allocated using the page allocator
  * @force_bounce: %true if swiotlb bouncing is forced
  * @for_alloc:  %true if the pool is used for memory allocation
+ * @bitmap:The bitmap used to track free entries. 1 in bit X means the slot
+ * indexed by X is free.
  */
 struct io_tlb_mem {
phys_addr_t start;
@@ -105,8 +105,8 @@ struct io_tlb_mem {
struct io_tlb_slot {
phys_addr_t orig_addr;
size_t alloc_size;
-   unsigned int list;
} *slots;
+   unsigned long *bitmap;
 };
 extern struct io_tlb_mem io_tlb_default_mem;
 
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index cb50f8d38360..d7f68c0af7f5 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -207,7 +207,7 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
 
spin_lock_init(&mem->lock);
for (i = 0; i < mem->nslabs; i++) {
-   mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
+   __set_bit(i, mem->bitmap);
mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
mem->slots[i].alloc_size = 0;
}
@@ -274,6 +274,11 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
  __func__, alloc_size, PAGE_SIZE);
 
+   mem->bitmap = memblock_alloc(BITS_TO_BYTES(nslabs), SMP_CACHE_BYTES);
+   if (!mem->bitmap)
+   panic("%s: Failed to allocate %lu bytes align=0x%x\n",
+ __func__, DIV_ROUND_UP(nslabs, BITS_PER_BYTE), SMP_CACHE_BYTES);
+
swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false);
 
if (flags & SWIOTLB_VERBOSE)
@@ -337,10 +342,13 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
(PAGE_SIZE << order) >> 20);
}
 
+   mem->bitmap = bitmap_zalloc(nslabs, GFP_KERNEL);
mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
get_order(array_size(sizeof(*mem->slots), nslabs)));
-   if (!mem->slots) {
+   if (!mem->slots || !mem->bitmap) {
free_pages((unsigned long)vstart, order);
+   bitmap_free(mem->bitmap);
+   kfree(mem->slots);
return -ENOMEM;
}
 
@@ -498,7 +506,7 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
unsigned int iotlb_align_mask =
dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1);
unsigned int nslots = nr_slots(alloc_size), stride;
-   unsigned int index, wrap, count = 0, i;
+   unsigned int index, wrap, i;
unsigned int offset = swiotlb_align_offset(dev, orig_addr);
unsigned long flags;
 
@@ -514,6 +522,9 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT));
stride 

[PATCH v1 0/3] swiotlb performance optimizations

2022-06-28 Thread Chao Gao
Intent of this post:
 Seek reviews from Intel reviewers and anyone else on the list
 interested in IO performance in confidential VMs. I need some Acked-by
 or Reviewed-by tags before I can add the swiotlb maintainers to the
 to/cc list and ask them for a review.

swiotlb is now widely used by confidential VMs. This series optimizes
swiotlb to reduce cache misses and lock contention during bounce buffer
allocation/free and memory bouncing to improve IO workload performance in
confidential VMs.

Here are some FIO tests we did to demonstrate the improvement.

Test setup
----------

A normal VM with 8 vCPUs and 32GB memory; swiotlb is enabled by swiotlb=force.
100 in Host/Guest CPU utilization means 1 logical processor. FIO block size
is 4K and iodepth is 256. Note that a normal VM is used so that others who
lack the hardware needed to host confidential VMs can reproduce the results
below.

Results
-------

1 FIO job   read/write  Throughput  IOPS    Host CPU      Guest CPU
                        (MB/s)      (k)     utilization   utilization
vanilla     read        1037        253     228.48        101.92
            write       1148        280     233.28        100.96
optimized   read        1160        283     232.32        101.12
            write       1195        292     233.28        100.64

1-job FIO sequential read/write performance increases by 12% and 4%, respectively.

4 FIO jobs  read/write  Throughput  IOPS    Host CPU      Guest CPU
                        (MB/s)      (k)     utilization   utilization
vanilla     read        885         214.9   527.04        401.12
            write       868         212.1   531.84        400.64
optimized   read        2320        567     344.64        202.8
            write       1998        488     312           173.92

4-job FIO sequential read/write performance increases by 164% and 130%, respectively.

This series is based on 5.19-rc2.

Andi Kleen (1):
  swiotlb: Split up single swiotlb lock

Chao Gao (2):
  swiotlb: Use bitmap to track free slots
  swiotlb: Allocate memory in a cache-friendly way

 .../admin-guide/kernel-parameters.txt |   4 +-
 arch/x86/kernel/acpi/boot.c   |   4 +
 include/linux/swiotlb.h   |  47 +++-
 kernel/dma/swiotlb.c  | 263 +-
 4 files changed, 229 insertions(+), 89 deletions(-)

-- 
2.25.1



Re: [PATCH] dma-direct: avoid redundant memory sync for swiotlb

2022-04-12 Thread Chao Gao
On Wed, Apr 13, 2022 at 06:59:58AM +0200, Christoph Hellwig wrote:
>So for now I'd be happy with the one liner presented here, but
>eventually the whole area could use an overhaul.

Thanks. Do you want me to post a new version with the Fixes tag, or will
you take care of it?

Fixes: 55897af63091 ("dma-direct: merge swiotlb_dma_ops into the dma_direct code")


Re: [PATCH] dma-direct: avoid redundant memory sync for swiotlb

2022-04-12 Thread Chao Gao
On Tue, Apr 12, 2022 at 02:33:05PM +0100, Robin Murphy wrote:
>On 12/04/2022 12:38 pm, Chao Gao wrote:
>> When we looked into FIO performance with swiotlb enabled in a VM, we found
>> that swiotlb_bounce() is always called one more time than expected for each
>> DMA read request.
>>
>> It turns out that the bounce buffer is copied to the original DMA buffer
>> twice after the completion of a DMA request (one copy is done in
>> dma_direct_sync_single_for_cpu(), the other by swiotlb_tbl_unmap_single()).
>> But the content of the bounce buffer doesn't actually change between the
>> two rounds of copying, so one round is redundant.
>> 
>> Pass DMA_ATTR_SKIP_CPU_SYNC flag to swiotlb_tbl_unmap_single() to
>> skip the memory copy in it.
>
>It's still a little suboptimal and non-obvious to call into SWIOTLB twice
>though - even better might be for SWIOTLB to call arch_sync_dma_for_cpu() at
>the appropriate place internally,

Hi Robin,

dma_direct_sync_single_for_cpu() also calls arch_sync_dma_for_cpu_all()
and arch_dma_mark_clean() in some cases. If SWIOTLB does the sync
internally, should these two functions be called by SWIOTLB as well?

Personally, I think it might be better if swiotlb just focuses on bounce
buffer alloc/free. Adding more DMA coherence logic into swiotlb would
make it a little more complicated.

How about an open-coded version of dma_direct_sync_single_for_cpu
in dma_direct_unmap_page with swiotlb_sync_single_for_cpu replaced by
swiotlb_tbl_unmap_single?
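
Roughly, the open-coded variant I have in mind looks like the sketch
below (based on my reading of the 5.18-era
dma_direct_sync_single_for_cpu(); untested, just to show the shape of
the change):

static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
		size_t size, enum dma_data_direction dir, unsigned long attrs)
{
	phys_addr_t phys = dma_to_phys(dev, addr);

	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
		if (!dev_is_dma_coherent(dev)) {
			arch_sync_dma_for_cpu(phys, size, dir);
			arch_sync_dma_for_cpu_all();
		}
		if (dir == DMA_FROM_DEVICE)
			arch_dma_mark_clean(phys, size);
	}

	if (unlikely(is_swiotlb_buffer(dev, phys)))
		/* One bounce copy plus slot release here, instead of a
		 * separate sync followed by another copy in unmap. */
		swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
}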


Re: [PATCH] dma-direct: avoid redundant memory sync for swiotlb

2022-04-12 Thread Chao Gao
On Tue, Apr 12, 2022 at 07:38:05PM +0800, Chao Gao wrote:
>When we looked into FIO performance with swiotlb enabled in a VM, we found
>that swiotlb_bounce() is always called one more time than expected for each
>DMA read request.
>
>It turns out that the bounce buffer is copied to the original DMA buffer
>twice after the completion of a DMA request (one copy is done in
>dma_direct_sync_single_for_cpu(), the other by swiotlb_tbl_unmap_single()).
>But the content of the bounce buffer doesn't actually change between the
>two rounds of copying, so one round is redundant.
>
>Pass DMA_ATTR_SKIP_CPU_SYNC flag to swiotlb_tbl_unmap_single() to
>skip the memory copy in it.
>
>This fix increases FIO 64KB sequential read throughput in a guest with
>swiotlb=force by 5.6%.
>

Sorry. A fixes tag is missing:

Fixes: 55897af63091 ("dma-direct: merge swiotlb_dma_ops into the dma_direct code")

>Reported-by: Wang Zhaoyang1 
>Reported-by: Gao Liang 
>Signed-off-by: Chao Gao 
>Reviewed-by: Kevin Tian 
>---
> kernel/dma/direct.h | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
>diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
>index 4632b0f4f72e..8a6cd53dbe8c 100644
>--- a/kernel/dma/direct.h
>+++ b/kernel/dma/direct.h
>@@ -114,6 +114,7 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
>   dma_direct_sync_single_for_cpu(dev, addr, size, dir);
> 
>   if (unlikely(is_swiotlb_buffer(dev, phys)))
>-  swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
>+  swiotlb_tbl_unmap_single(dev, phys, size, dir,
>+   attrs | DMA_ATTR_SKIP_CPU_SYNC);
> }
> #endif /* _KERNEL_DMA_DIRECT_H */
>-- 
>2.25.1
>


[PATCH] dma-direct: avoid redundant memory sync for swiotlb

2022-04-12 Thread Chao Gao
When we looked into FIO performance with swiotlb enabled in a VM, we found
that swiotlb_bounce() is always called one more time than expected for each
DMA read request.

It turns out that the bounce buffer is copied to the original DMA buffer
twice after the completion of a DMA request (one copy is done in
dma_direct_sync_single_for_cpu(), the other by swiotlb_tbl_unmap_single()).
But the content of the bounce buffer doesn't actually change between the
two rounds of copying, so one round is redundant.

Pass DMA_ATTR_SKIP_CPU_SYNC flag to swiotlb_tbl_unmap_single() to
skip the memory copy in it.

This fix increases FIO 64KB sequential read throughput in a guest with
swiotlb=force by 5.6%.

Reported-by: Wang Zhaoyang1 
Reported-by: Gao Liang 
Signed-off-by: Chao Gao 
Reviewed-by: Kevin Tian 
---
 kernel/dma/direct.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index 4632b0f4f72e..8a6cd53dbe8c 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -114,6 +114,7 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
dma_direct_sync_single_for_cpu(dev, addr, size, dir);
 
if (unlikely(is_swiotlb_buffer(dev, phys)))
-   swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
+   swiotlb_tbl_unmap_single(dev, phys, size, dir,
+attrs | DMA_ATTR_SKIP_CPU_SYNC);
 }
 #endif /* _KERNEL_DMA_DIRECT_H */
-- 
2.25.1



Re: [PATCH] swiotlb: allocate memory in a cache-friendly way

2021-09-16 Thread Chao Gao
On Thu, Sep 16, 2021 at 11:49:39AM -0400, Konrad Rzeszutek Wilk wrote:
>On Wed, Sep 01, 2021 at 12:21:35PM +0800, Chao Gao wrote:
>> Currently, swiotlb uses a global index to indicate the starting point
>> of next search. The index increases from 0 to the number of slots - 1
>> and then wraps around. It is straightforward but not cache-friendly
>> because the "oldest" slot in swiotlb tends to be used first.
>> 
>> Freed slots are probably accessed right before being freed, especially
>> in VM's case (device backends access them in DMA_TO_DEVICE mode; guest
>> accesses them in other DMA modes). Thus those just freed slots may
>> reside in cache. Then reusing those just freed slots can reduce cache
>> misses.
>> 
>> To that end, maintain a free list of slots: freed slots are inserted at
>> the head, and searches for free slots always start from the head.
>> 
>> With this optimization, network throughput of sending data from host to
>> guest, measured by iperf3, increases by 7%.
>
>Wow, that is pretty awesome!
>
>Are there any other benchmarks that you ran that showed a negative
>performance?

TBH, yes. Recently I ran fio tests with this patch. The impact of this patch
is as follows (+ means performance improvement; - means performance regression):

1-job fio:
randread: +6.7%
randwrite: -1.6%
read: +8.2%
write: +7.4%

8-job fio:
randread: -5.5%
randwrite: -12.6%
read: -24.8%
write: -45.5%

I haven't figured out why multi-job fio tests suffer. Will post v2 once
the issue gets resolved.

Thanks
Chao


[PATCH] swiotlb: allocate memory in a cache-friendly way

2021-08-31 Thread Chao Gao
Currently, swiotlb uses a global index to indicate the starting point
of the next search. The index increases from 0 to the number of slots - 1
and then wraps around. This is straightforward but not cache-friendly,
because the "oldest" slot in swiotlb tends to be used first.

Freed slots were probably accessed right before being freed, especially
in a VM's case (device backends access them in DMA_TO_DEVICE mode; the
guest accesses them in other DMA modes). Those just-freed slots may
therefore still reside in the cache, and reusing them can reduce cache
misses.

To that end, maintain a free list of slots: freed slots are inserted at
the head, and searches for free slots always start from the head.

With this optimization, network throughput of sending data from host to
guest, measured by iperf3, increases by 7%.

A bad side effect of this patch is that we can no longer use a large
stride to skip unaligned slots when there is an alignment requirement.
Currently, a large stride is used when a) the device has an alignment
requirement, in which case the stride is calculated from that
requirement; or b) the requested size is larger than PAGE_SIZE, in which
case, for x86 with a 4KB page size, the stride is set to 2.

For case a), few devices have an alignment requirement, so the impact is
limited. For case b), this patch probably leads to one additional lookup
(or more if the page size is larger than 4K); but since the "io_tlb_slot"
structs of free slots are also accessed when slots are freed, they
probably reside in the CPU cache as well, so the overhead is almost
negligible.

Suggested-by: Andi Kleen 
Signed-off-by: Chao Gao 
---
 include/linux/swiotlb.h | 15 --
 kernel/dma/swiotlb.c| 43 +++--
 2 files changed, 20 insertions(+), 38 deletions(-)

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index b0cb2a9973f4..8cafafd218af 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -63,6 +63,13 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys,
 #ifdef CONFIG_SWIOTLB
 extern enum swiotlb_force swiotlb_force;
 
+struct io_tlb_slot {
+   phys_addr_t orig_addr;
+   size_t alloc_size;
+   unsigned int list;
+   struct list_head node;
+};
+
 /**
  * struct io_tlb_mem - IO TLB Memory Pool Descriptor
  *
@@ -93,17 +100,13 @@ struct io_tlb_mem {
phys_addr_t end;
unsigned long nslabs;
unsigned long used;
-   unsigned int index;
+   struct list_head free_slots;
spinlock_t lock;
struct dentry *debugfs;
bool late_alloc;
bool force_bounce;
bool for_alloc;
-   struct io_tlb_slot {
-   phys_addr_t orig_addr;
-   size_t alloc_size;
-   unsigned int list;
-   } *slots;
+   struct io_tlb_slot *slots;
 };
 extern struct io_tlb_mem io_tlb_default_mem;
 
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 87c40517e822..12b5b8471e54 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -184,7 +184,7 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
mem->nslabs = nslabs;
mem->start = start;
mem->end = mem->start + bytes;
-   mem->index = 0;
+   INIT_LIST_HEAD(&mem->free_slots);
mem->late_alloc = late_alloc;
 
if (swiotlb_force == SWIOTLB_FORCE)
@@ -195,6 +195,7 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
mem->slots[i].alloc_size = 0;
+   list_add_tail(&mem->slots[i].node, &mem->free_slots);
}
memset(vaddr, 0, bytes);
 }
@@ -447,13 +448,6 @@ static inline unsigned long get_max_slots(unsigned long boundary_mask)
return nr_slots(boundary_mask + 1);
 }
 
-static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
-{
-   if (index >= mem->nslabs)
-   return 0;
-   return index;
-}
-
 /*
  * Find a suitable number of IO TLB entries size that will fit this request and
  * allocate a buffer from that IO TLB pool.
@@ -462,38 +456,29 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
  size_t alloc_size)
 {
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+   struct io_tlb_slot *slot, *tmp;
unsigned long boundary_mask = dma_get_seg_boundary(dev);
dma_addr_t tbl_dma_addr =
phys_to_dma_unencrypted(dev, mem->start) & boundary_mask;
unsigned long max_slots = get_max_slots(boundary_mask);
unsigned int iotlb_align_mask =
dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1);
-   unsigned int nslots = nr_slots(alloc_size), stride;
-   unsigned int index, wrap, count = 0, i;
+   unsigned int nslots