Re: [Intel-gfx] [PATCH 5/7] drm/i915/ttm, drm/ttm: Add a generic TTM memcpy move for page-based iomem

2021-05-17 Thread Thomas Hellström



On 5/17/21 12:57 PM, Jani Nikula wrote:

On Tue, 11 May 2021, Thomas Hellström  wrote:

The internal ttm_bo_util memcpy uses vmap functionality, and while it
probably might be possible to use it for copying in- and out of
sglist represented io memory, using io_mem_reserve() / io_mem_free()
callbacks, that would cause problems with fault().
Instead, implement a method mapping page-by-page using kmap_local()
semantics. As an additional benefit we then avoid the occasional global
TLB flushes of vmap() and consuming vmap space, elimination of a critical
point of failure and with a slight change of semantics we could also push
the memcpy out async for testing and async driver develpment purposes.
Pushing out async can be done since there is no memory allocation going on
that could violate the dma_fence lockdep rules.

Note that drivers that don't want to use struct io_mapping but relies on
memremap functionality, and that don't want to use scatterlists for
VRAM may well define specialized (hopefully reusable) iterators for their
particular environment.

Cc: Christian König 
Signed-off-by: Thomas Hellström 
---
  drivers/gpu/drm/i915/Makefile |   1 +
  .../gpu/drm/i915/gem/i915_gem_ttm_bo_util.c   | 155 ++
  .../gpu/drm/i915/gem/i915_gem_ttm_bo_util.h   | 141 
  drivers/gpu/drm/ttm/ttm_bo.c  |   1 +
  4 files changed, 298 insertions(+)
  create mode 100644 drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.c
  create mode 100644 drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.h

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index cb8823570996..958ccc1edfed 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -155,6 +155,7 @@ gem-y += \
gem/i915_gem_stolen.o \
gem/i915_gem_throttle.o \
gem/i915_gem_tiling.o \
+   gem/i915_gem_ttm_bo_util.o \
gem/i915_gem_userptr.o \
gem/i915_gem_wait.o \
gem/i915_gemfs.o
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.c 
b/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.c
new file mode 100644
index ..1116d7df1461
--- /dev/null
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.c
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2021 Intel Corporation
+ */
+
+/**
+ * DOC: Usage and intentions.
+ *
+ * This file contains functionality that we might want to move into
+ * ttm_bo_util.c if there is a common interest.
+ * Currently a kmap_local only memcpy with support for page-based iomem 
regions,
+ * and fast memcpy from write-combined memory.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "i915_memcpy.h"
+
+#include "gem/i915_gem_ttm_bo_util.h"
+
+static void i915_ttm_kmap_iter_tt_kmap_local(struct i915_ttm_kmap_iter *iter,
+struct dma_buf_map *dmap,
+pgoff_t i)
+{
+   struct i915_ttm_kmap_iter_tt *iter_tt =
+   container_of(iter, typeof(*iter_tt), base);
+
+   dma_buf_map_set_vaddr(dmap, kmap_local_page(iter_tt->tt->pages[i]));
+}
+
+static void i915_ttm_kmap_iter_iomap_kmap_local(struct i915_ttm_kmap_iter 
*iter,
+   struct dma_buf_map *dmap,
+   pgoff_t i)
+{
+   struct i915_ttm_kmap_iter_iomap *iter_io =
+   container_of(iter, typeof(*iter_io), base);
+   void __iomem *addr;
+
+retry:
+   while (i >= iter_io->cache.end) {
+   iter_io->cache.sg = iter_io->cache.sg ?
+   sg_next(iter_io->cache.sg) : iter_io->st->sgl;
+   iter_io->cache.i = iter_io->cache.end;
+   iter_io->cache.end += sg_dma_len(iter_io->cache.sg) >>
+   PAGE_SHIFT;
+   iter_io->cache.offs = sg_dma_address(iter_io->cache.sg) -
+   iter_io->start;
+   }
+
+   if (i < iter_io->cache.i) {
+   iter_io->cache.end = 0;
+   iter_io->cache.sg = NULL;
+   goto retry;
+   }
+
+   addr = io_mapping_map_local_wc(iter_io->iomap, iter_io->cache.offs +
+  (((resource_size_t)i - iter_io->cache.i)
+   << PAGE_SHIFT));
+   dma_buf_map_set_vaddr_iomem(dmap, addr);
+}
+
+struct i915_ttm_kmap_iter_ops i915_ttm_kmap_iter_tt_ops = {
+   .kmap_local = i915_ttm_kmap_iter_tt_kmap_local
+};
+
+struct i915_ttm_kmap_iter_ops i915_ttm_kmap_iter_io_ops = {
+   .kmap_local =  i915_ttm_kmap_iter_iomap_kmap_local
+};
+
+static void kunmap_local_dma_buf_map(struct dma_buf_map *map)
+{
+   if (map->is_iomem)
+   io_mapping_unmap_local(map->vaddr_iomem);
+   else
+   kunmap_local(map->vaddr);
+}
+
+/**
+ * i915_ttm_move_memcpy - Helper to perform a memcpy ttm move operation.
+ * @bo: The struct ttm_buffer_object.
+ * @new_mem: The struct 

Re: [Intel-gfx] [PATCH 5/7] drm/i915/ttm, drm/ttm: Add a generic TTM memcpy move for page-based iomem

2021-05-17 Thread Jani Nikula
On Tue, 11 May 2021, Thomas Hellström  wrote:
> The internal ttm_bo_util memcpy uses vmap functionality, and while it
> probably might be possible to use it for copying in- and out of
> sglist represented io memory, using io_mem_reserve() / io_mem_free()
> callbacks, that would cause problems with fault().
> Instead, implement a method mapping page-by-page using kmap_local()
> semantics. As an additional benefit we then avoid the occasional global
> TLB flushes of vmap() and consuming vmap space, elimination of a critical
> point of failure and with a slight change of semantics we could also push
> the memcpy out async for testing and async driver develpment purposes.
> Pushing out async can be done since there is no memory allocation going on
> that could violate the dma_fence lockdep rules.
>
> Note that drivers that don't want to use struct io_mapping but relies on
> memremap functionality, and that don't want to use scatterlists for
> VRAM may well define specialized (hopefully reusable) iterators for their
> particular environment.
>
> Cc: Christian König 
> Signed-off-by: Thomas Hellström 
> ---
>  drivers/gpu/drm/i915/Makefile |   1 +
>  .../gpu/drm/i915/gem/i915_gem_ttm_bo_util.c   | 155 ++
>  .../gpu/drm/i915/gem/i915_gem_ttm_bo_util.h   | 141 
>  drivers/gpu/drm/ttm/ttm_bo.c  |   1 +
>  4 files changed, 298 insertions(+)
>  create mode 100644 drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.c
>  create mode 100644 drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.h
>
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index cb8823570996..958ccc1edfed 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -155,6 +155,7 @@ gem-y += \
>   gem/i915_gem_stolen.o \
>   gem/i915_gem_throttle.o \
>   gem/i915_gem_tiling.o \
> + gem/i915_gem_ttm_bo_util.o \
>   gem/i915_gem_userptr.o \
>   gem/i915_gem_wait.o \
>   gem/i915_gemfs.o
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.c 
> b/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.c
> new file mode 100644
> index ..1116d7df1461
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.c
> @@ -0,0 +1,155 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2021 Intel Corporation
> + */
> +
> +/**
> + * DOC: Usage and intentions.
> + *
> + * This file contains functionality that we might want to move into
> + * ttm_bo_util.c if there is a common interest.
> + * Currently a kmap_local only memcpy with support for page-based iomem 
> regions,
> + * and fast memcpy from write-combined memory.
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include "i915_memcpy.h"
> +
> +#include "gem/i915_gem_ttm_bo_util.h"
> +
> +static void i915_ttm_kmap_iter_tt_kmap_local(struct i915_ttm_kmap_iter *iter,
> +  struct dma_buf_map *dmap,
> +  pgoff_t i)
> +{
> + struct i915_ttm_kmap_iter_tt *iter_tt =
> + container_of(iter, typeof(*iter_tt), base);
> +
> + dma_buf_map_set_vaddr(dmap, kmap_local_page(iter_tt->tt->pages[i]));
> +}
> +
> +static void i915_ttm_kmap_iter_iomap_kmap_local(struct i915_ttm_kmap_iter 
> *iter,
> + struct dma_buf_map *dmap,
> + pgoff_t i)
> +{
> + struct i915_ttm_kmap_iter_iomap *iter_io =
> + container_of(iter, typeof(*iter_io), base);
> + void __iomem *addr;
> +
> +retry:
> + while (i >= iter_io->cache.end) {
> + iter_io->cache.sg = iter_io->cache.sg ?
> + sg_next(iter_io->cache.sg) : iter_io->st->sgl;
> + iter_io->cache.i = iter_io->cache.end;
> + iter_io->cache.end += sg_dma_len(iter_io->cache.sg) >>
> + PAGE_SHIFT;
> + iter_io->cache.offs = sg_dma_address(iter_io->cache.sg) -
> + iter_io->start;
> + }
> +
> + if (i < iter_io->cache.i) {
> + iter_io->cache.end = 0;
> + iter_io->cache.sg = NULL;
> + goto retry;
> + }
> +
> + addr = io_mapping_map_local_wc(iter_io->iomap, iter_io->cache.offs +
> +(((resource_size_t)i - iter_io->cache.i)
> + << PAGE_SHIFT));
> + dma_buf_map_set_vaddr_iomem(dmap, addr);
> +}
> +
> +struct i915_ttm_kmap_iter_ops i915_ttm_kmap_iter_tt_ops = {
> + .kmap_local = i915_ttm_kmap_iter_tt_kmap_local
> +};
> +
> +struct i915_ttm_kmap_iter_ops i915_ttm_kmap_iter_io_ops = {
> + .kmap_local =  i915_ttm_kmap_iter_iomap_kmap_local
> +};
> +
> +static void kunmap_local_dma_buf_map(struct dma_buf_map *map)
> +{
> + if (map->is_iomem)
> + io_mapping_unmap_local(map->vaddr_iomem);
> + else
> + kunmap_local(map->vaddr);
> +}
> +
> +/**
> + * i915_ttm_

[PATCH 5/7] drm/i915/ttm, drm/ttm: Add a generic TTM memcpy move for page-based iomem

2021-05-11 Thread Thomas Hellström
The internal ttm_bo_util memcpy uses vmap functionality, and while it
probably might be possible to use it for copying in- and out of
sglist represented io memory, using io_mem_reserve() / io_mem_free()
callbacks, that would cause problems with fault().
Instead, implement a method mapping page-by-page using kmap_local()
semantics. As an additional benefit we then avoid the occasional global
TLB flushes of vmap() and consuming vmap space, elimination of a critical
point of failure and with a slight change of semantics we could also push
the memcpy out async for testing and async driver develpment purposes.
Pushing out async can be done since there is no memory allocation going on
that could violate the dma_fence lockdep rules.

Note that drivers that don't want to use struct io_mapping but relies on
memremap functionality, and that don't want to use scatterlists for
VRAM may well define specialized (hopefully reusable) iterators for their
particular environment.

Cc: Christian König 
Signed-off-by: Thomas Hellström 
---
 drivers/gpu/drm/i915/Makefile |   1 +
 .../gpu/drm/i915/gem/i915_gem_ttm_bo_util.c   | 155 ++
 .../gpu/drm/i915/gem/i915_gem_ttm_bo_util.h   | 141 
 drivers/gpu/drm/ttm/ttm_bo.c  |   1 +
 4 files changed, 298 insertions(+)
 create mode 100644 drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.c
 create mode 100644 drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.h

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index cb8823570996..958ccc1edfed 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -155,6 +155,7 @@ gem-y += \
gem/i915_gem_stolen.o \
gem/i915_gem_throttle.o \
gem/i915_gem_tiling.o \
+   gem/i915_gem_ttm_bo_util.o \
gem/i915_gem_userptr.o \
gem/i915_gem_wait.o \
gem/i915_gemfs.o
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.c 
b/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.c
new file mode 100644
index ..1116d7df1461
--- /dev/null
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_bo_util.c
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2021 Intel Corporation
+ */
+
+/**
+ * DOC: Usage and intentions.
+ *
+ * This file contains functionality that we might want to move into
+ * ttm_bo_util.c if there is a common interest.
+ * Currently a kmap_local only memcpy with support for page-based iomem 
regions,
+ * and fast memcpy from write-combined memory.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "i915_memcpy.h"
+
+#include "gem/i915_gem_ttm_bo_util.h"
+
+static void i915_ttm_kmap_iter_tt_kmap_local(struct i915_ttm_kmap_iter *iter,
+struct dma_buf_map *dmap,
+pgoff_t i)
+{
+   struct i915_ttm_kmap_iter_tt *iter_tt =
+   container_of(iter, typeof(*iter_tt), base);
+
+   dma_buf_map_set_vaddr(dmap, kmap_local_page(iter_tt->tt->pages[i]));
+}
+
+static void i915_ttm_kmap_iter_iomap_kmap_local(struct i915_ttm_kmap_iter 
*iter,
+   struct dma_buf_map *dmap,
+   pgoff_t i)
+{
+   struct i915_ttm_kmap_iter_iomap *iter_io =
+   container_of(iter, typeof(*iter_io), base);
+   void __iomem *addr;
+
+retry:
+   while (i >= iter_io->cache.end) {
+   iter_io->cache.sg = iter_io->cache.sg ?
+   sg_next(iter_io->cache.sg) : iter_io->st->sgl;
+   iter_io->cache.i = iter_io->cache.end;
+   iter_io->cache.end += sg_dma_len(iter_io->cache.sg) >>
+   PAGE_SHIFT;
+   iter_io->cache.offs = sg_dma_address(iter_io->cache.sg) -
+   iter_io->start;
+   }
+
+   if (i < iter_io->cache.i) {
+   iter_io->cache.end = 0;
+   iter_io->cache.sg = NULL;
+   goto retry;
+   }
+
+   addr = io_mapping_map_local_wc(iter_io->iomap, iter_io->cache.offs +
+  (((resource_size_t)i - iter_io->cache.i)
+   << PAGE_SHIFT));
+   dma_buf_map_set_vaddr_iomem(dmap, addr);
+}
+
+struct i915_ttm_kmap_iter_ops i915_ttm_kmap_iter_tt_ops = {
+   .kmap_local = i915_ttm_kmap_iter_tt_kmap_local
+};
+
+struct i915_ttm_kmap_iter_ops i915_ttm_kmap_iter_io_ops = {
+   .kmap_local =  i915_ttm_kmap_iter_iomap_kmap_local
+};
+
+static void kunmap_local_dma_buf_map(struct dma_buf_map *map)
+{
+   if (map->is_iomem)
+   io_mapping_unmap_local(map->vaddr_iomem);
+   else
+   kunmap_local(map->vaddr);
+}
+
+/**
+ * i915_ttm_move_memcpy - Helper to perform a memcpy ttm move operation.
+ * @bo: The struct ttm_buffer_object.
+ * @new_mem: The struct ttm_resource we're moving to (copy destination).
+ * @new_kmap: A struct i915_ttm_kmap_iter repres