[PATCH] locking/lockdep: Fix expected depth value in __lock_release()

2015-10-19 Thread j . glisse
From: Jérôme Glisse 

In __lock_release() we remove one entry from the held-lock stack and
rebuild the hash chain by re-adding the entries above the one we just
removed. If the removed entry sat between two entries of the same
class, those two entries might be coalesced into a single entry,
which in turn means that the lockdep_depth value will not be
incremented and thus the expected lockdep_depth value after this
operation will be wrong, triggering an unjustified WARN_ONCE() at the
end of __lock_release().

This patch adjusts the expected depth value by decrementing it
whenever two entries that were previously distinct on the stack are
coalesced into a single entry.

Note that __lock_set_class() does not suffer from the same issue as
it adds a new class and thus cannot lead to coalescing of stack
entries.

Signed-off-by: Jérôme Glisse 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Sasha Levin 
---
 kernel/locking/lockdep.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 4e49cc4..cac5e21 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3428,6 +3428,8 @@ found_it:
curr->curr_chain_key = hlock->prev_chain_key;
 
for (i++; i < depth; i++) {
+   int tmp = curr->lockdep_depth;
+
hlock = curr->held_locks + i;
if (!__lock_acquire(hlock->instance,
hlock_class(hlock)->subclass, hlock->trylock,
@@ -3435,6 +3437,13 @@ found_it:
hlock->nest_lock, hlock->acquire_ip,
hlock->references, hlock->pin_count))
return 0;
+   /*
+* If nest_lock is true and the lock we just removed allows two
+* locks of the same class to be consolidated into only one
+* held_lock, then the lockdep_depth count will not increase as
+* we expect it to, so adjust the expected depth value accordingly.
+*/
+   depth -= (curr->lockdep_depth == tmp);
}
 
/*
-- 
1.8.3.1


[PATCH] IB/core: Print error when umem fails due to locked memory limit.

2015-10-15 Thread j . glisse
From: Jérôme Glisse 

It can be rather tedious to find out why userspace is failing when
the only thing the kernel reports is -ENOMEM. This adds an error
message so that users can figure out why they are getting -ENOMEM.

Signed-off-by: Jérôme Glisse 
cc: 
Cc: Haggai Eran 
Cc: Sagi Grimberg 
Cc: Shachar Raindel 
Cc: Doug Ledford 
---
 drivers/infiniband/core/umem.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 38acb3c..a66929e 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -169,6 +169,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, 
unsigned long addr,
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 
if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+   pr_err("locked memory quota exhausted (see ulimit -l)\n");
ret = -ENOMEM;
goto out;
}
-- 
1.8.3.1


[PATCH] iommu/amd: Fix amd_iommu_detect() (does not fix any issues).

2015-08-31 Thread j . glisse
From: Jérôme Glisse 

Fix amd_iommu_detect() to return a positive value on success, as
intended, and not zero. This does not change anything in practice,
as the AMD IOMMU disables swiotlb and properly associates itself
with devices even if detect() doesn't return a positive value.

Signed-off-by: Jérôme Glisse 
Cc: Joerg Roedel 
Cc: io...@lists.linux-foundation.org
---
 drivers/iommu/amd_iommu_init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index a24495e..360a451 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -2198,7 +2198,7 @@ int __init amd_iommu_detect(void)
iommu_detected = 1;
x86_init.iommu.iommu_init = amd_iommu_init;
 
-   return 0;
+   return 1;
 }
 
 /
-- 
1.8.3.1


HMM (Heterogeneous Memory Management) v8

2015-05-21 Thread j . glisse

Sorry, I had to resend because I stupidly forgot to cc the mailing
list. Please ignore the private send done before.


HMM (Heterogeneous Memory Management) is a helper layer for devices
that want to mirror a process address space into their own MMU. The
main target is GPUs, but other hardware, like network devices, can
also make use of HMM.

There are two sides to HMM. The first one is mirroring a process
address space on behalf of a device: HMM manages a secondary page
table for the device and keeps it synchronized with the CPU page
table. HMM also does the DMA mapping on behalf of the device (which
would allow new kinds of optimization further down the road (1)).

The second side is allowing process memory to be migrated to device
memory, where the device memory is not mappable by the CPU. Any CPU
access will trigger a special fault that migrates the memory back.

From a design point of view not much has changed since the last
patchset (2). Most of the changes are in small details of the API
exposed to device drivers. This version also includes device driver
changes for Mellanox hardware to use HMM as an alternative to ODP
(which provides a subset of HMM functionality specifically for RDMA
devices). The long term plan is to have HMM completely replace ODP.



Why do this?

Mirroring a process address space is mandatory with OpenCL 2.0 and
with other GPU compute APIs. OpenCL 2.0 allows different levels of
implementation and currently only the lowest 2 are supported on
Linux. To implement the highest level, where CPU and GPU accesses
can happen concurrently and are cache coherent, HMM is needed, or
something providing the same functionality, for instance through
platform hardware.

Hardware solutions such as PCIe ATS/PASID are limited to mirroring
system memory and do not provide a way to migrate memory to device
memory (which offers significantly more bandwidth, up to 10 times
faster than regular system memory with a discrete GPU, and also has
lower latency than PCIe transactions).

Current CPUs with a GPU on the same die (AMD or Intel) use ATS/PASID
and, for Intel, a special level of cache (backed by a large pool of
fast memory).

For the foreseeable future, discrete GPUs will remain relevant as
they can have a larger quantity of faster memory than integrated
GPUs.

Thus we believe HMM will allow discrete GPU memory to be leveraged
in a fashion transparent to the application, with minimum disruption
to the Linux kernel mm code. Also, HMM can work alongside hardware
solutions such as PCIe ATS/PASID (leaving the regular case to
ATS/PASID while HMM handles the migrated memory case).



Design:

Patches 1, 2, 3 and 4 augment the mmu_notifier API with new
information to more efficiently mirror CPU page table updates.

The first side of HMM, process address space mirroring, is
implemented in patches 5 through 12. This uses a secondary page
table in which HMM mirrors memory actively used by the device. HMM
does not take a reference on any of the pages; it uses the
mmu_notifier API to track changes to the CPU page table and to
update the mirror page table. All this while providing a simple API
to the device driver.
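As a rough sketch of what the driver-facing side looks like (based
on the hmm_device_ops and hmm_mirror bits from patches 5 to 12; the
foo_* names and the exact registration sequence are hypothetical):

static int foo_update(struct hmm_mirror *mirror,
                      const struct hmm_event *event)
{
        /*
         * Invalidate [start, end) in the device MMU; once we return,
         * core HMM applies new_pte = old_pte & event->pte_mask to
         * its own mirror page table.
         */
        foo_device_tlb_invalidate(mirror, event->start, event->end);
        return 0;
}

static struct hmm_device_ops foo_hmm_ops = {
        .release = foo_release,
        .update  = foo_update,
};

/*
 * One hmm_mirror per (device, process) pair; mirror->device points
 * to the driver's hmm_device (which carries foo_hmm_ops) and
 * registering it starts mirroring the current process address space.
 */
ret = hmm_mirror_register(&ctx->mirror);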

To implement this we use a "generic" page table and not a radix
tree, because we need to store more flags than the radix tree allows
and we need to store dma addresses (sizeof(dma_addr_t) > sizeof(long)
on some platforms). All this is

Patch 14 passes the new child mm struct of a parent process being
forked down the call chain. This is necessary to properly handle
fork when the parent process has migrated memory (more on that
below).

Patch 15 allows getting the current memcg against which anonymous
memory of a process should be accounted. It is useful because in HMM
we do bulk transactions on the address space and we wish to avoid
storing a pointer to the memcg for each single page. All operations
dealing with the memcg happen under the protection of the mmap
semaphore.


The second side of HMM, migration to device memory, is implemented
in patches 16 to 28. This only deals with anonymous memory. A new
special swap type is introduced. Migrated memory will have its CPU
page table entries set to this special swap entry (like the
migration entry, but unlike migration this is not a short lived
state).

The remaining patches are then sets of functions that deal with
those special entries in the various code paths that might face
them.
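As a minimal sketch of how such an entry can be recognized (using
the helpers added in patch 16; foo_* is a hypothetical caller and
locking is omitted):

static bool foo_pte_is_in_device_memory(pte_t pte)
{
        swp_entry_t entry;

        if (pte_none(pte) || pte_present(pte))
                return false;

        entry = pte_to_swp_entry(pte);
        if (!is_hmm_entry(entry))
                return false;

        /* A "locked" entry marks a migration in flight; callers wait. */
        return !is_hmm_entry_locked(entry);
}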

Memory migration requires several steps. First the memory is
unmapped from the CPU and replaced with a special "locked" entry.
The HMM locked entry is a short lived transitional state; this is to
avoid two threads fighting over a migration entry.

Once unmapped, HMM can determine what can be migrated or not by
comparing mapcount and page count. If something holds a reference,
then the page is not migrated and the CPU page table is restored.
The next step is to schedule the copy to device memory and update
the CPU page table to the regular HMM entry.

Migration back follows the same pattern: replace with the special
locked entry, then copy back, then update the CPU page table.


(1) Because HMM keeps a secondary page table which keeps track of
DMA mapping, there is room for new 

[PATCH 07/36] HMM: add per mirror page table v3.

2015-05-21 Thread j . glisse
From: Jérôme Glisse 

This patch adds the per mirror page table. It also propagates CPU
page table updates to this per mirror page table using the
mmu_notifier callbacks. All updates are contextualized with an HMM
event structure that conveys all the information needed by the
device driver to take proper action (update its own MMU to reflect
the changes and schedule the proper flushing).

Core HMM is responsible for updating the per mirror page table once
the device driver is done with its update. Most importantly, HMM
will properly propagate the HMM page table dirty bit to the
underlying page.

Changed since v1:
  - Removed unused fence code to defer it to later patches.

Changed since v2:
  - Use new bit flag helper for mirror page table manipulation.
  - Differentiate fork event with HMM_FORK from other events.
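As an illustration, a driver update() callback might look roughly
like this (a sketch only; the foo_* helpers are hypothetical, while
the event types and pte_mask semantics are the ones documented in
the hmm_device_ops comment below):

static int foo_update(struct hmm_mirror *mirror,
                      const struct hmm_event *event)
{
        switch (event->etype) {
        case HMM_MUNMAP:
        case HMM_MIGRATE:
        case HMM_WRITE_PROTECT:
                /* Invalidate or write protect [start, end) on the device. */
                foo_mmu_invalidate(mirror, event->start, event->end);
                break;
        case HMM_ISDIRTY:
                /* Set the HMM pte dirty bit for pages the device wrote. */
                foo_mmu_harvest_dirty(mirror, event->start, event->end);
                break;
        default:
                break;
        }
        /* Core HMM then applies new_pte = old_pte & event->pte_mask. */
        return 0;
}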

Signed-off-by: Jérôme Glisse 
Signed-off-by: Sherry Cheung 
Signed-off-by: Subhash Gutti 
Signed-off-by: Mark Hairgrove 
Signed-off-by: John Hubbard 
Signed-off-by: Jatin Kumar 
---
 include/linux/hmm.h |  83 
 mm/hmm.c| 221 
 2 files changed, 304 insertions(+)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 175a757..573560b 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -46,6 +46,7 @@
 #include 
 #include 
 #include 
+#include 
 
 
 struct hmm_device;
@@ -53,6 +54,39 @@ struct hmm_mirror;
 struct hmm;
 
 
+/*
+ * hmm_event - each event is described by a type associated with a struct.
+ */
+enum hmm_etype {
+   HMM_NONE = 0,
+   HMM_FORK,
+   HMM_ISDIRTY,
+   HMM_MIGRATE,
+   HMM_MUNMAP,
+   HMM_DEVICE_RFAULT,
+   HMM_DEVICE_WFAULT,
+   HMM_WRITE_PROTECT,
+};
+
+/* struct hmm_event - memory event information.
+ *
+ * @list: So HMM can keep track of all active events.
+ * @start: First address (inclusive).
+ * @end: Last address (exclusive).
+ * @pte_mask: HMM pte update mask (bit(s) that are still valid).
+ * @etype: Event type (munmap, migrate, truncate, ...).
+ * @backoff: Only meaningful for device page fault.
+ */
+struct hmm_event {
+   struct list_headlist;
+   unsigned long   start;
+   unsigned long   end;
+   dma_addr_t  pte_mask;
+   enum hmm_etype  etype;
+   boolbackoff;
+};
+
+
 /* hmm_device - Each device must register one and only one hmm_device.
  *
  * The hmm_device is the link btw HMM and each device driver.
@@ -76,6 +110,53 @@ struct hmm_device_ops {
 * callback against that mirror.
 */
void (*release)(struct hmm_mirror *mirror);
+
+   /* update() - update device mmu following an event.
+*
+* @mirror: The mirror that link process address space with the device.
+* @event: The event that triggered the update.
+* Returns: 0 on success or error code {-EIO, -ENOMEM}.
+*
+* Called to update device page table for a range of address.
+* The event type provide the nature of the update :
+*   - Range is no longer valid (munmap).
+*   - Range protection changes (mprotect, COW, ...).
+*   - Range is unmapped (swap, reclaim, page migration, ...).
+*   - Device page fault.
+*   - ...
+*
+* Though most device drivers only need to use pte_mask as it reflects
+* change that will happen to the HMM page table ie :
+*   new_pte = old_pte & event->pte_mask;
+*
+* Device driver must not update the HMM mirror page table (except the
+* dirty bit see below). Core HMM will update HMM page table after the
+* update is done.
+*
+* Note that device must be cache coherent with system memory (snooping
+* in case of PCIE devices) so there should be no need for device to
+* flush anything.
+*
+* When write protection is turned on device driver must make sure the
+* hardware will no longer be able to write to the page otherwise file
+* system corruption may occur.
+*
+* Device must properly set the dirty bit using hmm_pte_set_bit() on
+* each page entry for memory that was written by the device. If device
+* can not properly account for write access then the dirty bit must be
+* set unconditionaly so that proper write back of file backed page can
+* happen.
+*
+* Device driver must not fail lightly, any failure result in device
+* process being kill.
+*
+* Return 0 on success, error value otherwise :
+* -ENOMEM Not enough memory for performing the operation.
+* -EIOSome input/output error with the device.
+*
+* All other return value trigger warning and are transformed to -EIO.
+*/
+   int (*update)(struct hmm_mirror *mirror, const struct hmm_event *event);
 };
 
 
@@ -142,6 +223,7 @@ int hmm_device_unregister(struct hmm_device 

[PATCH 04/36] mmu_notifier: allow range invalidation to exclude a specific mmu_notifier

2015-05-21 Thread j . glisse
From: Jérôme Glisse 

This patch allows invalidating a range while excluding the call to
a specific mmu_notifier, which allows a subsystem to invalidate a
range for everyone but itself.
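For example, a subsystem that already updated its own secondary page
table can skip its own notifier while still invalidating everyone
else (a sketch; the foo_mirror structure embedding a struct
mmu_notifier is hypothetical):

static void foo_invalidate_others(struct foo_mirror *mirror,
                                  struct mm_struct *mm,
                                  struct mmu_notifier_range *range)
{
        mmu_notifier_invalidate_range_start_excluding(mm, range,
                                                      &mirror->mn);
        /* ... update the CPU page table for the range ... */
        mmu_notifier_invalidate_range_end_excluding(mm, range,
                                                    &mirror->mn);
}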

Signed-off-by: Jérôme Glisse 
---
 include/linux/mmu_notifier.h | 60 +++-
 mm/mmu_notifier.c| 16 +---
 2 files changed, 67 insertions(+), 9 deletions(-)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 283ad26..867ca06 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -294,11 +294,15 @@ extern void __mmu_notifier_invalidate_page(struct 
mm_struct *mm,
  struct page *page,
  enum mmu_event event);
 extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
- struct mmu_notifier_range 
*range);
+ struct mmu_notifier_range 
*range,
+ const struct mmu_notifier 
*exclude);
 extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
-   struct mmu_notifier_range 
*range);
+   struct mmu_notifier_range 
*range,
+   const struct mmu_notifier 
*exclude);
 extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
- unsigned long start, unsigned long end);
+   unsigned long start,
+   unsigned long end,
+   const struct mmu_notifier *exclude);
 extern bool mmu_notifier_range_is_valid(struct mm_struct *mm,
unsigned long start,
unsigned long end);
@@ -351,21 +355,46 @@ static inline void 
mmu_notifier_invalidate_range_start(struct mm_struct *mm,
   struct 
mmu_notifier_range *range)
 {
if (mm_has_notifiers(mm))
-   __mmu_notifier_invalidate_range_start(mm, range);
+   __mmu_notifier_invalidate_range_start(mm, range, NULL);
 }
 
 static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 struct mmu_notifier_range 
*range)
 {
if (mm_has_notifiers(mm))
-   __mmu_notifier_invalidate_range_end(mm, range);
+   __mmu_notifier_invalidate_range_end(mm, range, NULL);
 }
 
 static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
  unsigned long start, unsigned long end)
 {
if (mm_has_notifiers(mm))
-   __mmu_notifier_invalidate_range(mm, start, end);
+   __mmu_notifier_invalidate_range(mm, start, end, NULL);
+}
+
+static inline void mmu_notifier_invalidate_range_start_excluding(struct 
mm_struct *mm,
+   struct mmu_notifier_range 
*range,
+   const struct mmu_notifier 
*exclude)
+{
+   if (mm_has_notifiers(mm))
+   __mmu_notifier_invalidate_range_start(mm, range, exclude);
+}
+
+static inline void mmu_notifier_invalidate_range_end_excluding(struct 
mm_struct *mm,
+   struct 
mmu_notifier_range *range,
+   const struct 
mmu_notifier *exclude)
+{
+   if (mm_has_notifiers(mm))
+   __mmu_notifier_invalidate_range_end(mm, range, exclude);
+}
+
+static inline void mmu_notifier_invalidate_range_excluding(struct mm_struct 
*mm,
+   unsigned long start,
+   unsigned long end,
+   const struct mmu_notifier 
*exclude)
+{
+   if (mm_has_notifiers(mm))
+   __mmu_notifier_invalidate_range(mm, start, end, exclude);
 }
 
 static inline void mmu_notifier_mm_init(struct mm_struct *mm)
@@ -515,6 +544,25 @@ static inline void mmu_notifier_invalidate_range(struct 
mm_struct *mm,
 {
 }
 
+static inline void mmu_notifier_invalidate_range_start_excluding(struct 
mm_struct *mm,
+   struct mmu_notifier_range 
*range,
+   const struct mmu_notifier 
*exclude)
+{
+}
+
+static inline void mmu_notifier_invalidate_range_end_excluding(struct 
mm_struct *mm,
+   struct 
mmu_notifier_range *range,
+   const struct 
mmu_notifier *exclude)
+{
+}
+
+static inline void mmu_notifier_invalidate_range_excluding(struct mm_struct 
*mm,
+

[PATCH 10/36] HMM: use CPU page table during invalidation.

2015-05-21 Thread j . glisse
From: Jerome Glisse 

Once we store the dma mapping inside the secondary page table we
can no longer easily find the page backing an address. Instead, use
the CPU page table, which still has the proper information, except
for the invalidate_page() case, which is handled by using the page
passed in by the mmu_notifier layer.

Signed-off-by: Jérôme Glisse 
---
 mm/hmm.c | 51 ++-
 1 file changed, 34 insertions(+), 17 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 93d6f5e..8ec9ffa 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -50,9 +50,11 @@ static inline struct hmm_mirror *hmm_mirror_ref(struct 
hmm_mirror *mirror);
 static inline void hmm_mirror_unref(struct hmm_mirror **mirror);
 static void hmm_mirror_kill(struct hmm_mirror *mirror);
 static inline int hmm_mirror_update(struct hmm_mirror *mirror,
-   struct hmm_event *event);
+   struct hmm_event *event,
+   struct page *page);
 static void hmm_mirror_update_pt(struct hmm_mirror *mirror,
-struct hmm_event *event);
+struct hmm_event *event,
+struct page *page);
 
 
 /* hmm_event - use to track information relating to an event.
@@ -232,7 +234,9 @@ again:
}
 }
 
-static void hmm_update(struct hmm *hmm, struct hmm_event *event)
+static void hmm_update(struct hmm *hmm,
+  struct hmm_event *event,
+  struct page *page)
 {
struct hmm_mirror *mirror;
 
@@ -245,7 +249,7 @@ static void hmm_update(struct hmm *hmm, struct hmm_event 
*event)
 again:
down_read(&hmm->rwsem);
hlist_for_each_entry(mirror, &hmm->mirrors, mlist)
-   if (hmm_mirror_update(mirror, event)) {
+   if (hmm_mirror_update(mirror, event, page)) {
mirror = hmm_mirror_ref(mirror);
up_read(&hmm->rwsem);
hmm_mirror_kill(mirror);
@@ -343,9 +347,10 @@ static void hmm_mmu_mprot_to_etype(struct mm_struct *mm,
*etype = HMM_NONE;
 }
 
-static void hmm_notifier_invalidate_range_start(struct mmu_notifier *mn,
-   struct mm_struct *mm,
-   const struct mmu_notifier_range 
*range)
+static void hmm_notifier_invalidate(struct mmu_notifier *mn,
+   struct mm_struct *mm,
+   struct page *page,
+   const struct mmu_notifier_range *range)
 {
struct hmm_event event;
unsigned long start = range->start, end = range->end;
@@ -386,7 +391,14 @@ static void hmm_notifier_invalidate_range_start(struct 
mmu_notifier *mn,
 
hmm_event_init(&event, hmm, start, end, event.etype);
 
-   hmm_update(hmm, &event);
+   hmm_update(hmm, &event, page);
+}
+
+static void hmm_notifier_invalidate_range_start(struct mmu_notifier *mn,
+   struct mm_struct *mm,
+   const struct mmu_notifier_range 
*range)
+{
+   hmm_notifier_invalidate(mn, mm, NULL, range);
 }
 
 static void hmm_notifier_invalidate_page(struct mmu_notifier *mn,
@@ -400,7 +412,7 @@ static void hmm_notifier_invalidate_page(struct 
mmu_notifier *mn,
range.start = addr & PAGE_MASK;
range.end = range.start + PAGE_SIZE;
range.event = mmu_event;
-   hmm_notifier_invalidate_range_start(mn, mm, &range);
+   hmm_notifier_invalidate(mn, mm, page, &range);
 }
 
 static struct mmu_notifier_ops hmm_notifier_ops = {
@@ -551,23 +563,27 @@ static inline void hmm_mirror_unref(struct hmm_mirror 
**mirror)
 }
 
 static inline int hmm_mirror_update(struct hmm_mirror *mirror,
-   struct hmm_event *event)
+   struct hmm_event *event,
+   struct page *page)
 {
struct hmm_device *device = mirror->device;
int ret = 0;
 
ret = device->ops->update(mirror, event);
-   hmm_mirror_update_pt(mirror, event);
+   hmm_mirror_update_pt(mirror, event, page);
return ret;
 }
 
 static void hmm_mirror_update_pt(struct hmm_mirror *mirror,
-struct hmm_event *event)
+struct hmm_event *event,
+struct page *page)
 {
unsigned long addr;
struct hmm_pt_iter iter;
+   struct mm_pt_iter mm_iter;
 
hmm_pt_iter_init(&iter);
+   mm_pt_iter_init(&mm_iter, mirror->hmm->mm);
for (addr = event->start; addr != event->end;) {
unsigned long end, next;
dma_addr_t *hmm_pte;
@@ -593,10 +609,10 @@ static void hmm_mirror_update_pt(struct hmm_mirror 
*mirror,
continue;
if 

[PATCH 03/36] mmu_notifier: pass page pointer to mmu_notifier_invalidate_page()

2015-05-21 Thread j . glisse
From: Jérôme Glisse 

A listener of mm events might not have an easy way to get the
struct page behind an address invalidated with the
mmu_notifier_invalidate_page() function, as this happens after the
CPU page table has been cleared/updated. This happens for instance
if the listener is storing a dma mapping inside its secondary page
table. To avoid a complex reverse dma mapping lookup, just pass
along a pointer to the page being invalidated.
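For example, a listener that only stores a dma_addr_t per entry can
now dirty the right page without a reverse lookup (a sketch with
hypothetical foo_* names):

static void foo_invalidate_page(struct mmu_notifier *mn,
                                struct mm_struct *mm,
                                unsigned long address,
                                struct page *page,
                                enum mmu_event event)
{
        struct foo_mirror *mirror = container_of(mn, struct foo_mirror, mn);

        /*
         * The secondary table only holds a dma address, so without
         * the page pointer we would need a reverse dma -> page
         * lookup here.
         */
        if (foo_clear_entry_and_test_dirty(mirror, address))
                set_page_dirty(page);
}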

Signed-off-by: Jérôme Glisse 
---
 drivers/infiniband/core/umem_odp.c | 1 +
 drivers/iommu/amd_iommu_v2.c   | 1 +
 drivers/misc/sgi-gru/grutlbpurge.c | 1 +
 drivers/xen/gntdev.c   | 1 +
 include/linux/mmu_notifier.h   | 6 +-
 mm/mmu_notifier.c  | 3 ++-
 mm/rmap.c  | 4 ++--
 virt/kvm/kvm_main.c| 1 +
 8 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/core/umem_odp.c 
b/drivers/infiniband/core/umem_odp.c
index 8f7f845..d10dd88 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -166,6 +166,7 @@ static int invalidate_page_trampoline(struct ib_umem *item, 
u64 start,
 static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn,
 struct mm_struct *mm,
 unsigned long address,
+struct page *page,
 enum mmu_event event)
 {
struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
index 4aa4de6..de3c540 100644
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -385,6 +385,7 @@ static int mn_clear_flush_young(struct mmu_notifier *mn,
 static void mn_invalidate_page(struct mmu_notifier *mn,
   struct mm_struct *mm,
   unsigned long address,
+  struct page *page,
   enum mmu_event event)
 {
__mn_flush_page(mn, address);
diff --git a/drivers/misc/sgi-gru/grutlbpurge.c 
b/drivers/misc/sgi-gru/grutlbpurge.c
index 44b41b7..c7659b76 100644
--- a/drivers/misc/sgi-gru/grutlbpurge.c
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -250,6 +250,7 @@ static void gru_invalidate_range_end(struct mmu_notifier 
*mn,
 
 static void gru_invalidate_page(struct mmu_notifier *mn, struct mm_struct *mm,
unsigned long address,
+   struct page *page,
enum mmu_event event)
 {
struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 0e8aa12..90693ce 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -485,6 +485,7 @@ static void mn_invl_range_start(struct mmu_notifier *mn,
 static void mn_invl_page(struct mmu_notifier *mn,
 struct mm_struct *mm,
 unsigned long address,
+struct page *page,
 enum mmu_event event)
 {
struct mmu_notifier_range range;
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index ada3ed1..283ad26 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -172,6 +172,7 @@ struct mmu_notifier_ops {
void (*invalidate_page)(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long address,
+   struct page *page,
enum mmu_event event);
 
/*
@@ -290,6 +291,7 @@ extern void __mmu_notifier_change_pte(struct mm_struct *mm,
  enum mmu_event event);
 extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
  unsigned long address,
+ struct page *page,
  enum mmu_event event);
 extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
  struct mmu_notifier_range 
*range);
@@ -338,10 +340,11 @@ static inline void mmu_notifier_change_pte(struct 
mm_struct *mm,
 
 static inline void mmu_notifier_invalidate_page(struct mm_struct *mm,
unsigned long address,
+   struct page *page,
enum mmu_event event)
 {
if (mm_has_notifiers(mm))
-   __mmu_notifier_invalidate_page(mm, address, event);
+   __mmu_notifier_invalidate_page(mm, address, page, event);
 }
 
 static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm,
@@ -492,6 +495,7 @@ static inline void mmu_notifier_change_pte(struct 

[PATCH 08/36] HMM: add device page fault support v3.

2015-05-21 Thread j . glisse
From: Jérôme Glisse 

This patch adds a helper for device page faults. The device page
fault helper will fill the mirror page table using the CPU page
table, all of this synchronized with any update to the CPU page
table.

Changed since v1:
  - Add comment about directory lock.

Changed since v2:
  - Check for mirror->hmm in hmm_mirror_fault()
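The expected call pattern from a driver's fault handler is roughly
the following (a sketch; the foo_* names are hypothetical and
whether the driver fills the hmm_event directly is an assumption):

static int foo_handle_device_fault(struct foo_ctx *ctx,
                                   unsigned long addr, bool write)
{
        struct hmm_event event = {
                .start = addr & PAGE_MASK,
                .end   = (addr & PAGE_MASK) + PAGE_SIZE,
                .etype = write ? HMM_DEVICE_WFAULT : HMM_DEVICE_RFAULT,
        };
        int ret;

        /* Fill the mirror page table from the CPU page table. */
        ret = hmm_mirror_fault(&ctx->mirror, &event);
        if (ret)
                return ret;

        /* The range is now valid in the mirror; program the device MMU. */
        return foo_mmu_map_range(ctx, event.start, event.end);
}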

Signed-off-by: Jérôme Glisse 
Signed-off-by: Sherry Cheung 
Signed-off-by: Subhash Gutti 
Signed-off-by: Mark Hairgrove 
Signed-off-by: John Hubbard 
Signed-off-by: Jatin Kumar 
---
 include/linux/hmm.h |   9 ++
 mm/hmm.c| 386 +++-
 2 files changed, 394 insertions(+), 1 deletion(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 573560b..fdb1975 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -169,6 +169,10 @@ struct hmm_device_ops {
  * @rwsem: Serialize the mirror list modifications.
  * @mmu_notifier: The mmu_notifier of this mm.
  * @rcu: For delayed cleanup call from mmu_notifier.release() callback.
+ * @device_faults: List of all active device page faults.
+ * @ndevice_faults: Number of active device page faults.
+ * @wait_queue: Wait queue for event synchronization.
+ * @lock: Serialize device_faults list modification.
  *
  * For each process address space (mm_struct) there is one and only one hmm
  * struct. hmm functions will redispatch to each devices the change made to
@@ -185,6 +189,10 @@ struct hmm {
struct rw_semaphore rwsem;
struct mmu_notifier mmu_notifier;
struct rcu_head rcu;
+   struct list_headdevice_faults;
+   unsignedndevice_faults;
+   wait_queue_head_t   wait_queue;
+   spinlock_t  lock;
 };
 
 
@@ -241,6 +249,7 @@ struct hmm_mirror {
 
 int hmm_mirror_register(struct hmm_mirror *mirror);
 void hmm_mirror_unregister(struct hmm_mirror *mirror);
+int hmm_mirror_fault(struct hmm_mirror *mirror, struct hmm_event *event);
 
 
 #endif /* CONFIG_HMM */
diff --git a/mm/hmm.c b/mm/hmm.c
index 04a3743..e1aa6ca 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -63,6 +63,11 @@ static void hmm_mirror_update_pt(struct hmm_mirror *mirror,
  * help dealing with all this.
  */
 
+static inline bool hmm_event_overlap(struct hmm_event *a, struct hmm_event *b)
+{
+   return !((a->end <= b->start) || (a->start >= b->end));
+}
+
 static inline int hmm_event_init(struct hmm_event *event,
 struct hmm *hmm,
 unsigned long start,
@@ -70,7 +75,7 @@ static inline int hmm_event_init(struct hmm_event *event,
 enum hmm_etype etype)
 {
event->start = start & PAGE_MASK;
-   event->end = min(end, hmm->vm_end);
+   event->end = PAGE_ALIGN(min(end, hmm->vm_end));
if (event->start >= event->end)
return -EINVAL;
event->etype = etype;
@@ -107,6 +112,10 @@ static int hmm_init(struct hmm *hmm)
kref_init(&hmm->kref);
INIT_HLIST_HEAD(&hmm->mirrors);
init_rwsem(&hmm->rwsem);
+   INIT_LIST_HEAD(&hmm->device_faults);
+   hmm->ndevice_faults = 0;
+   init_waitqueue_head(&hmm->wait_queue);
+   spin_lock_init(&hmm->lock);
 
/* register notifier */
hmm->mmu_notifier.ops = &hmm_notifier_ops;
@@ -171,6 +180,58 @@ static inline struct hmm *hmm_unref(struct hmm *hmm)
return NULL;
 }
 
+static int hmm_device_fault_start(struct hmm *hmm, struct hmm_event *event)
+{
+   int ret = 0;
+
+   mmu_notifier_range_wait_valid(hmm->mm, event->start, event->end);
+
+   spin_lock(&hmm->lock);
+   if (mmu_notifier_range_is_valid(hmm->mm, event->start, event->end)) {
+   list_add_tail(&event->list, &hmm->device_faults);
+   hmm->ndevice_faults++;
+   event->backoff = false;
+   } else
+   ret = -EAGAIN;
+   spin_unlock(&hmm->lock);
+
+   wake_up(&hmm->wait_queue);
+
+   return ret;
+}
+
+static void hmm_device_fault_end(struct hmm *hmm, struct hmm_event *event)
+{
+   spin_lock(&hmm->lock);
+   list_del_init(&event->list);
+   hmm->ndevice_faults--;
+   spin_unlock(&hmm->lock);
+
+   wake_up(&hmm->wait_queue);
+}
+
+static void hmm_wait_device_fault(struct hmm *hmm, struct hmm_event *ievent)
+{
+   struct hmm_event *fevent;
+   unsigned long wait_for = 0;
+
+again:
+   spin_lock(&hmm->lock);
+   list_for_each_entry(fevent, &hmm->device_faults, list) {
+   if (!hmm_event_overlap(fevent, ievent))
+   continue;
+   fevent->backoff = true;
+   wait_for = hmm->ndevice_faults;
+   }
+   spin_unlock(&hmm->lock);
+
+   if (wait_for > 0) {
+   wait_event(hmm->wait_queue, wait_for != hmm->ndevice_faults);
+   wait_for = 0;
+   goto again;
+   }
+}
+
 static void hmm_update(struct hmm *hmm, struct hmm_event *event)
 {
struct hmm_mirror *mirror;
@@ -179,6 +240,8 @@ static void hmm_update(struct hmm *hmm, struct hmm_event 

[PATCH 13/36] HMM: DMA map memory on behalf of device driver.

2015-05-21 Thread j . glisse
From: Jérôme Glisse 

Do the DMA mapping on behalf of the device, as HMM is a good place
to perform this common task. Moreover, in the future we hope to add
new infrastructure that would make DMA mapping more efficient (lower
overhead per page) by leveraging the HMM data structures.

Signed-off-by: Jérôme Glisse 
---
 include/linux/hmm_pt.h |  11 +++
 mm/hmm.c   | 223 ++---
 2 files changed, 184 insertions(+), 50 deletions(-)

diff --git a/include/linux/hmm_pt.h b/include/linux/hmm_pt.h
index 330edb2..78a9073 100644
--- a/include/linux/hmm_pt.h
+++ b/include/linux/hmm_pt.h
@@ -176,6 +176,17 @@ static inline dma_addr_t hmm_pte_from_pfn(dma_addr_t pfn)
return (pfn << PAGE_SHIFT) | (1 << HMM_PTE_VALID_PFN_BIT);
 }
 
+static inline dma_addr_t hmm_pte_from_dma_addr(dma_addr_t dma_addr)
+{
+   return (dma_addr & HMM_PTE_DMA_MASK) | (1 << HMM_PTE_VALID_DMA_BIT);
+}
+
+static inline dma_addr_t hmm_pte_dma_addr(dma_addr_t pte)
+{
+   /* FIXME Use max dma addr instead of 0 ? */
+   return hmm_pte_test_valid_dma(&pte) ? (pte & HMM_PTE_DMA_MASK) : 0;
+}
+
 static inline unsigned long hmm_pte_pfn(dma_addr_t pte)
 {
return hmm_pte_test_valid_pfn(&pte) ? pte >> PAGE_SHIFT : 0;
diff --git a/mm/hmm.c b/mm/hmm.c
index 21fda9f..1533223 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -41,6 +41,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "internal.h"
 
@@ -574,6 +575,46 @@ static inline int hmm_mirror_update(struct hmm_mirror 
*mirror,
return ret;
 }
 
+static void hmm_mirror_update_pte(struct hmm_mirror *mirror,
+ struct hmm_event *event,
+ struct hmm_pt_iter *iter,
+ struct mm_pt_iter *mm_iter,
+ struct page *page,
+ dma_addr_t *hmm_pte,
+ unsigned long addr)
+{
+   bool dirty = hmm_pte_test_and_clear_dirty(hmm_pte);
+
+   if (hmm_pte_test_valid_pfn(hmm_pte)) {
+   *hmm_pte &= event->pte_mask;
+   if (!hmm_pte_test_valid_pfn(hmm_pte))
+   hmm_pt_iter_directory_unref(iter, mirror->pt.llevel);
+   goto out;
+   }
+
+   if (!hmm_pte_test_valid_dma(hmm_pte))
+   return;
+
+   if (!hmm_pte_test_valid_dma(&event->pte_mask)) {
+   struct device *dev = mirror->device->dev;
+   dma_addr_t dma_addr;
+
+   dma_addr = hmm_pte_dma_addr(*hmm_pte);
+   dma_unmap_page(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+   }
+
+   *hmm_pte &= event->pte_mask;
+   if (!hmm_pte_test_valid_dma(hmm_pte))
+   hmm_pt_iter_directory_unref(iter, mirror->pt.llevel);
+
+out:
+   if (dirty) {
+   page = page ? : mm_pt_iter_page(mm_iter, addr);
+   if (page)
+   set_page_dirty(page);
+   }
+}
+
 static void hmm_mirror_update_pt(struct hmm_mirror *mirror,
 struct hmm_event *event,
 struct page *page)
@@ -605,19 +646,9 @@ static void hmm_mirror_update_pt(struct hmm_mirror *mirror,
do {
next = hmm_pt_level_next(&mirror->pt, addr, end,
 mirror->pt.llevel);
-   if (!hmm_pte_test_valid_pfn(hmm_pte))
-   continue;
-   if (hmm_pte_test_and_clear_dirty(hmm_pte) &&
-   hmm_pte_test_write(hmm_pte)) {
-   page = page ? : mm_pt_iter_page(&mm_iter, addr);
-   if (page)
-   set_page_dirty(page);
-   page = NULL;
-   }
-   *hmm_pte &= event->pte_mask;
-   if (hmm_pte_test_valid_pfn(hmm_pte))
-   continue;
-   hmm_pt_iter_directory_unref(&iter, mirror->pt.llevel);
+   hmm_mirror_update_pte(mirror, event, &iter, &mm_iter,
+ page, hmm_pte, addr);
+   page = NULL;
} while (addr = next, hmm_pte++, addr != end);
hmm_pt_iter_directory_unlock(&iter, &mirror->pt);
}
@@ -697,12 +728,12 @@ static int hmm_mirror_fault_hpmd(struct hmm_mirror 
*mirror,
next = hmm_pt_level_next(&mirror->pt, addr, hmm_end,
 mirror->pt.llevel);
 
-   if (!hmm_pte_test_valid_pfn(&hmm_pte[i])) {
-   hmm_pte[i] = hmm_pte_from_pfn(pfn);
-   hmm_pt_iter_directory_ref(iter,
- mirror->pt.llevel);
-   }
-   BUG_ON(hmm_pte_pfn(hmm_pte[i]) != pfn);
+  

[PATCH 14/36] fork: pass the dst vma to copy_page_range() and its sub-functions.

2015-05-21 Thread j . glisse
From: Jérôme Glisse 

For HMM we will need to resort to the old way of allocating a new
page for anonymous memory when that anonymous memory has been
migrated to device memory.

This does not impact any process that does not use HMM through some
device driver. Only processes that migrate anonymous memory to
device memory with HMM will have to copy migrated pages on fork.

We do not expect this to be a common or advised thing to do, so we
resort to the simpler solution of allocating a new page. If this
kind of usage turns out to be important we will revisit ways to
achieve COW even for remote memory.

Signed-off-by: Jérôme Glisse 
---
 include/linux/mm.h |  5 +++--
 kernel/fork.c  |  2 +-
 mm/memory.c| 33 +
 3 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index cf642d9..8923532 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1083,8 +1083,9 @@ int walk_page_range(unsigned long addr, unsigned long end,
 int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk);
 void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
-int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
-   struct vm_area_struct *vma);
+int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+   struct vm_area_struct *dst_vma,
+   struct vm_area_struct *vma);
 void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows);
 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
diff --git a/kernel/fork.c b/kernel/fork.c
index 4083be7..0bd5b59 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -492,7 +492,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct 
*oldmm)
rb_parent = &tmp->vm_rb;
 
mm->map_count++;
-   retval = copy_page_range(mm, oldmm, mpnt);
+   retval = copy_page_range(mm, oldmm, tmp, mpnt);
 
if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);
diff --git a/mm/memory.c b/mm/memory.c
index 5a1131f..6497009 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -885,8 +885,10 @@ out_set_pte:
 }
 
 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-  pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
-  unsigned long addr, unsigned long end)
+ pmd_t *dst_pmd, pmd_t *src_pmd,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
 {
pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
@@ -947,9 +949,12 @@ again:
return 0;
 }
 
-static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct 
*src_mm,
-   pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
-   unsigned long addr, unsigned long end)
+static inline int copy_pmd_range(struct mm_struct *dst_mm,
+struct mm_struct *src_mm,
+pud_t *dst_pud, pud_t *src_pud,
+struct vm_area_struct *dst_vma,
+struct vm_area_struct *vma,
+unsigned long addr, unsigned long end)
 {
pmd_t *src_pmd, *dst_pmd;
unsigned long next;
@@ -974,15 +979,18 @@ static inline int copy_pmd_range(struct mm_struct 
*dst_mm, struct mm_struct *src
if (pmd_none_or_clear_bad(src_pmd))
continue;
if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
-   vma, addr, next))
+  dst_vma, vma, addr, next))
return -ENOMEM;
} while (dst_pmd++, src_pmd++, addr = next, addr != end);
return 0;
 }
 
-static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct 
*src_mm,
-   pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
-   unsigned long addr, unsigned long end)
+static inline int copy_pud_range(struct mm_struct *dst_mm,
+struct mm_struct *src_mm,
+pgd_t *dst_pgd, pgd_t *src_pgd,
+struct vm_area_struct *dst_vma,
+struct vm_area_struct *vma,
+unsigned long addr, unsigned long end)
 {
pud_t *src_pud, *dst_pud;
unsigned long next;
@@ -996,14 +1004,15 @@ static inline int copy_pud_range(struct mm_struct 
*dst_mm, struct mm_struct *src
if (pud_none_or_clear_bad(src_pud))
continue;

[PATCH 11/36] HMM: add discard range helper (to clear and free resources for a range).

2015-05-21 Thread j . glisse
From: Jérôme Glisse 

A common use case is for the device driver to stop caring about a
range of addresses long before said range is munmapped by the
userspace program. To avoid having to keep track of such ranges,
provide a helper function that will free the HMM resources for a
range of addresses.

NOTE THAT THE DEVICE DRIVER MUST MAKE SURE THE HARDWARE WILL NO
LONGER ACCESS THE RANGE BEFORE CALLING THIS HELPER !

Signed-off-by: Jérôme Glisse 
---
 include/linux/hmm.h |  3 +++
 mm/hmm.c| 24 
 2 files changed, 27 insertions(+)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index fdb1975..ec05df8 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -250,6 +250,9 @@ struct hmm_mirror {
 int hmm_mirror_register(struct hmm_mirror *mirror);
 void hmm_mirror_unregister(struct hmm_mirror *mirror);
 int hmm_mirror_fault(struct hmm_mirror *mirror, struct hmm_event *event);
+void hmm_mirror_range_discard(struct hmm_mirror *mirror,
+ unsigned long start,
+ unsigned long end);
 
 
 #endif /* CONFIG_HMM */
diff --git a/mm/hmm.c b/mm/hmm.c
index 8ec9ffa..4cab3f2 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -916,6 +916,30 @@ out:
 }
 EXPORT_SYMBOL(hmm_mirror_fault);
 
+/* hmm_mirror_range_discard() - discard a range of address.
+ *
+ * @mirror: The mirror struct.
+ * @start: Start address of the range to discard (inclusive).
+ * @end: End address of the range to discard (exclusive).
+ *
+ * Call when device driver want to stop mirroring a range of address and free
+ * any HMM resources associated with that range (including dma mapping if any).
+ *
+ * THIS FUNCTION ASSUME THAT DRIVER ALREADY STOPPED USING THE RANGE OF ADDRESS
+ * AND THUS DO NOT PERFORM ANY SYNCHRONIZATION OR UPDATE WITH THE DRIVER TO
+ * INVALIDATE SAID RANGE.
+ */
+void hmm_mirror_range_discard(struct hmm_mirror *mirror,
+ unsigned long start,
+ unsigned long end)
+{
+   struct hmm_event event;
+
+   hmm_event_init(&event, mirror->hmm, start, end, HMM_MUNMAP);
+   hmm_mirror_update_pt(mirror, &event, NULL);
+}
+EXPORT_SYMBOL(hmm_mirror_range_discard);
+
 /* hmm_mirror_register() - register mirror against current process for a 
device.
  *
  * @mirror: The mirror struct being registered.
-- 
1.9.3


[PATCH 15/36] memcg: export get_mem_cgroup_from_mm()

2015-05-21 Thread j . glisse
From: Jérôme Glisse 

Useful for HMM when trying to uncharge freshly allocated anonymous
pages after an error inside the memory migration path.
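The intended usage pattern is roughly the following (a sketch; the
foo_* helpers, and the way the reference obtained here gets dropped,
are assumptions about the surrounding code):

static void foo_account_range(struct mm_struct *mm, long nr_pages)
{
        struct mem_cgroup *memcg;

        /* Caller holds the mmap semaphore, as the HMM design requires. */
        memcg = get_mem_cgroup_from_mm(mm);
        if (!memcg)
                return;

        /* One memcg lookup for the whole bulk operation, not per page. */
        foo_charge_pages(memcg, nr_pages);

        foo_memcg_put(memcg);   /* drop the reference taken above */
}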

Signed-off-by: Jérôme Glisse 
---
 include/linux/memcontrol.h | 7 +++
 mm/memcontrol.c| 3 ++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6c89181..488748e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -93,6 +93,7 @@ bool task_in_mem_cgroup(struct task_struct *task, struct 
mem_cgroup *memcg);
 
 extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
 extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
+extern struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
 
 extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
 extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css);
@@ -275,6 +276,12 @@ static inline struct cgroup_subsys_state
return NULL;
 }
 
+
+static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
+{
+   return NULL;
+}
+
 static inline struct mem_cgroup *
 mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 14c2f20..360d9e0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -966,7 +966,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct 
*p)
return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
 }
 
-static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
+struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
struct mem_cgroup *memcg = NULL;
 
@@ -988,6 +988,7 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct 
mm_struct *mm)
rcu_read_unlock();
return memcg;
 }
+EXPORT_SYMBOL(get_mem_cgroup_from_mm);
 
 /**
  * mem_cgroup_iter - iterate over memory cgroup hierarchy
-- 
1.9.3


[PATCH 18/36] HMM: add new HMM page table flag (select flag).

2015-05-21 Thread j . glisse
From: Jérôme Glisse 

When migrating memory, the same array of HMM page table entries
might be used with several different devices. Add a new select flag
so the current device driver callback can know which entries are
selected for the device.

Signed-off-by: Jérôme Glisse 
---
 include/linux/hmm_pt.h | 6 --
 mm/hmm.c   | 5 -
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/include/linux/hmm_pt.h b/include/linux/hmm_pt.h
index 26cfe5e..36f7e00 100644
--- a/include/linux/hmm_pt.h
+++ b/include/linux/hmm_pt.h
@@ -77,8 +77,9 @@ static inline unsigned long hmm_pde_pfn(dma_addr_t pde)
 #define HMM_PTE_VALID_DEV_BIT  0
 #define HMM_PTE_VALID_DMA_BIT  1
 #define HMM_PTE_VALID_PFN_BIT  2
-#define HMM_PTE_WRITE_BIT  3
-#define HMM_PTE_DIRTY_BIT  4
+#define HMM_PTE_SELECT 3
+#define HMM_PTE_WRITE_BIT  4
+#define HMM_PTE_DIRTY_BIT  5
 /*
  * Reserve some bits for device driver private flags. Note that thus can only
  * be manipulated using the hmm_pte_*_bit() sets of helpers.
@@ -170,6 +171,7 @@ static inline bool hmm_pte_test_and_set_bit(dma_addr_t 
*ptep,
 HMM_PTE_BIT_HELPER(valid_dev, HMM_PTE_VALID_DEV_BIT)
 HMM_PTE_BIT_HELPER(valid_dma, HMM_PTE_VALID_DMA_BIT)
 HMM_PTE_BIT_HELPER(valid_pfn, HMM_PTE_VALID_PFN_BIT)
+HMM_PTE_BIT_HELPER(select, HMM_PTE_SELECT)
 HMM_PTE_BIT_HELPER(dirty, HMM_PTE_DIRTY_BIT)
 HMM_PTE_BIT_HELPER(write, HMM_PTE_WRITE_BIT)
 
diff --git a/mm/hmm.c b/mm/hmm.c
index 2143a58..761905a 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -757,6 +757,7 @@ static int hmm_mirror_fault_hpmd(struct hmm_mirror *mirror,
hmm_pte[i] = hmm_pte_from_pfn(pfn);
if (pmd_write(*pmdp))
hmm_pte_set_write(&hmm_pte[i]);
+   hmm_pte_set_select(&hmm_pte[i]);
} while (addr = next, pfn++, i++, addr != hmm_end);
hmm_pt_iter_directory_unlock(iter, &mirror->pt);
mirror_fault->addr = addr;
@@ -826,6 +827,7 @@ static int hmm_mirror_fault_pmd(pmd_t *pmdp,
hmm_pte[i] = hmm_pte_from_pfn(pte_pfn(*ptep));
if (pte_write(*ptep))
hmm_pte_set_write(&hmm_pte[i]);
+   hmm_pte_set_select(&hmm_pte[i]);
} while (addr = next, ptep++, i++, addr != hmm_end);
hmm_pt_iter_directory_unlock(iter, &mirror->pt);
pte_unmap(ptep - 1);
@@ -864,7 +866,8 @@ static int hmm_mirror_dma_map(struct hmm_mirror *mirror,
 
 again:
pte = ACCESS_ONCE(hmm_pte[i]);
-   if (!hmm_pte_test_valid_pfn(&pte)) {
+   if (!hmm_pte_test_valid_pfn(&pte) ||
+   !hmm_pte_test_select(&pte)) {
if (!hmm_pte_test_valid_dma(&pte)) {
ret = -ENOENT;
break;
-- 
1.9.3


[PATCH 19/36] HMM: handle HMM device page table entry on mirror page table fault and update.

2015-05-21 Thread j . glisse
From: Jérôme Glisse 

When faulting or updating the device page table, properly handle
the case of a device memory entry.

Signed-off-by: Jérôme Glisse 
---
 mm/hmm.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/mm/hmm.c b/mm/hmm.c
index 761905a..e4585b7 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -613,6 +613,13 @@ static void hmm_mirror_update_pte(struct hmm_mirror 
*mirror,
goto out;
}
 
+   if (hmm_pte_test_valid_dev(hmm_pte)) {
+   *hmm_pte &= event->pte_mask;
+   if (!hmm_pte_test_valid_dev(hmm_pte))
+   hmm_pt_iter_directory_unref(iter, mirror->pt.llevel);
+   return;
+   }
+
if (!hmm_pte_test_valid_dma(hmm_pte))
return;
 
@@ -813,6 +820,13 @@ static int hmm_mirror_fault_pmd(pmd_t *pmdp,
do {
next = hmm_pt_level_next(&mirror->pt, addr, hmm_end,
 mirror->pt.llevel);
+
+   if (hmm_pte_test_valid_dev(&hmm_pte[i])) {
+   if (write)
+   hmm_pte_set_write(&hmm_pte[i]);
+   continue;
+   }
+
if (!pte_present(*ptep) || (write && 
!pte_write(*ptep))) {
ret = -ENOENT;
ptep++;
-- 
1.9.3


[PATCH 16/36] HMM: add special swap filetype for memory migrated to HMM device memory.

2015-05-21 Thread j . glisse
From: Jerome Glisse 

When migrating anonymous memory from system memory to device
memory, CPU ptes are replaced with a special HMM swap entry so that
page faults, get_user_pages() (gup), fork, ... are properly
redirected to HMM helpers.

This patch only adds the new swap type entry and hooks the HMM
helper functions inside the page fault and fork code paths.
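The hook in the CPU fault path then looks roughly like this (a
simplified illustration of the kind of dispatch this patch adds to
mm/memory.c, not the actual hunk):

static int foo_handle_swap_pte(struct mm_struct *mm,
                               struct vm_area_struct *vma,
                               pmd_t *pmdp, unsigned long addr,
                               unsigned flags, pte_t orig_pte)
{
        swp_entry_t entry = pte_to_swp_entry(orig_pte);

        /* Memory lives in device memory: let HMM migrate it back. */
        if (is_hmm_entry(entry))
                return hmm_handle_cpu_fault(mm, vma, pmdp, addr,
                                            flags, orig_pte);

        return foo_handle_other_swap(mm, vma, pmdp, addr, flags, orig_pte);
}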

Signed-off-by: Jérôme Glisse 
Signed-off-by: Sherry Cheung 
Signed-off-by: Subhash Gutti 
Signed-off-by: Mark Hairgrove 
Signed-off-by: John Hubbard 
Signed-off-by: Jatin Kumar 
---
 include/linux/hmm.h | 34 ++
 include/linux/swap.h| 12 +++-
 include/linux/swapops.h | 43 ++-
 mm/hmm.c| 21 +
 mm/memory.c | 22 ++
 5 files changed, 130 insertions(+), 2 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 186f497..f243eb5 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -257,6 +257,40 @@ void hmm_mirror_range_dirty(struct hmm_mirror *mirror,
unsigned long start,
unsigned long end);
 
+int hmm_handle_cpu_fault(struct mm_struct *mm,
+   struct vm_area_struct *vma,
+   pmd_t *pmdp, unsigned long addr,
+   unsigned flags, pte_t orig_pte);
+
+int hmm_mm_fork(struct mm_struct *src_mm,
+   struct mm_struct *dst_mm,
+   struct vm_area_struct *dst_vma,
+   pmd_t *dst_pmd,
+   unsigned long start,
+   unsigned long end);
+
+#else /* CONFIG_HMM */
+
+static inline int hmm_handle_mm_fault(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ pmd_t *pmdp, unsigned long addr,
+ unsigned flags, pte_t orig_pte)
+{
+   return VM_FAULT_SIGBUS;
+}
+
+static inline int hmm_mm_fork(struct mm_struct *src_mm,
+ struct mm_struct *dst_mm,
+ struct vm_area_struct *dst_vma,
+ pmd_t *dst_pmd,
+ unsigned long start,
+ unsigned long end)
+{
+   BUG();
+   return -ENOMEM;
+}
 
 #endif /* CONFIG_HMM */
+
+
 #endif
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0428e4c..89b9dda 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -70,8 +70,18 @@ static inline int current_is_kswapd(void)
 #define SWP_HWPOISON_NUM 0
 #endif
 
+/*
+ * HMM (heterogeneous memory management) used when data is in remote memory.
+ */
+#ifdef CONFIG_HMM
+#define SWP_HMM_NUM 1
+#define SWP_HMM (MAX_SWAPFILES + SWP_MIGRATION_NUM + SWP_HWPOISON_NUM)
+#else
+#define SWP_HMM_NUM 0
+#endif
+
 #define MAX_SWAPFILES \
-   ((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
+   ((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - SWP_HMM_NUM)
 
 /*
  * Magic header for a swap area. The first part of the union is
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index cedf3d3..934359f 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -190,7 +190,7 @@ static inline int is_hwpoison_entry(swp_entry_t swp)
 }
 #endif
 
-#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION)
+#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION) || 
defined(CONFIG_HMM)
 static inline int non_swap_entry(swp_entry_t entry)
 {
return swp_type(entry) >= MAX_SWAPFILES;
@@ -202,4 +202,45 @@ static inline int non_swap_entry(swp_entry_t entry)
 }
 #endif
 
+#ifdef CONFIG_HMM
+static inline swp_entry_t make_hmm_entry(void)
+{
+   /* We do not store anything inside the CPU page table entry (pte). */
+   return swp_entry(SWP_HMM, 0);
+}
+
+static inline swp_entry_t make_hmm_entry_locked(void)
+{
+   /* We do not store anything inside the CPU page table entry (pte). */
+   return swp_entry(SWP_HMM, 1);
+}
+
+static inline swp_entry_t make_hmm_entry_poisonous(void)
+{
+   /* We do not store anything inside the CPU page table entry (pte). */
+   return swp_entry(SWP_HMM, 2);
+}
+
+static inline int is_hmm_entry(swp_entry_t entry)
+{
+   return (swp_type(entry) == SWP_HMM);
+}
+
+static inline int is_hmm_entry_locked(swp_entry_t entry)
+{
+   return (swp_type(entry) == SWP_HMM) && (swp_offset(entry) == 1);
+}
+
+static inline int is_hmm_entry_poisonous(swp_entry_t entry)
+{
+   return (swp_type(entry) == SWP_HMM) && (swp_offset(entry) == 2);
+}
+#else /* CONFIG_HMM */
+static inline int is_hmm_entry(swp_entry_t swp)
+{
+   return 0;
+}
+#endif /* CONFIG_HMM */
+
+
 #endif /* _LINUX_SWAPOPS_H */
diff --git a/mm/hmm.c b/mm/hmm.c
index 1533223..2143a58 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -423,6 +423,27 @@ static struct mmu_notifier_ops hmm_notifier_ops = {
 };
 
 

[PATCH 09/36] HMM: add mm page table iterator helpers.

2015-05-21 Thread j . glisse
From: Jérôme Glisse 

Because inside the mmu_notifier callback we do not have access to
the vma, nor do we know which lock we are holding (the mmap
semaphore or the i_mmap_lock), we can not rely on the regular page
table walk (nor do we want to, as we have to be careful not to split
huge pages).

So this patch introduces a helper to iterate over the CPU page table
content in a way that is efficient for the situation we are in,
which is: we know that none of the page table entries may vanish
from below us and thus it is safe to walk the page table.

The only added value of the iterator is that it keeps the page table
entry level mapped across calls, which fits well with the HMM mirror
page table update code.
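For example, the update path can use it like this (simplified from
the hmm_mirror_update_pt() changes in patch 10; foo_* is a
hypothetical wrapper):

static void foo_set_dirty_range(struct mm_struct *mm,
                                unsigned long start, unsigned long end)
{
        struct mm_pt_iter iter;
        unsigned long addr;

        mm_pt_iter_init(&iter, mm);
        for (addr = start; addr < end; addr += PAGE_SIZE) {
                /* Caller holds the mmap semaphore or the i_mmap_lock. */
                struct page *page = mm_pt_iter_page(&iter, addr);

                if (page)
                        set_page_dirty(page);
        }
        mm_pt_iter_fini(&iter);
}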

Signed-off-by: Jérôme Glisse 
---
 mm/hmm.c | 95 
 1 file changed, 95 insertions(+)

diff --git a/mm/hmm.c b/mm/hmm.c
index e1aa6ca..93d6f5e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -410,6 +410,101 @@ static struct mmu_notifier_ops hmm_notifier_ops = {
 };
 
 
+struct mm_pt_iter {
+   struct mm_struct*mm;
+   pte_t   *ptep;
+   unsigned long   addr;
+};
+
+static void mm_pt_iter_init(struct mm_pt_iter *pt_iter, struct mm_struct *mm)
+{
+   pt_iter->mm = mm;
+   pt_iter->ptep = NULL;
+   pt_iter->addr = -1UL;
+}
+
+static void mm_pt_iter_fini(struct mm_pt_iter *pt_iter)
+{
+   pte_unmap(pt_iter->ptep);
+   pt_iter->ptep = NULL;
+   pt_iter->addr = -1UL;
+   pt_iter->mm = NULL;
+}
+
+static inline bool mm_pt_iter_in_range(struct mm_pt_iter *pt_iter,
+  unsigned long addr)
+{
+   return (addr >= pt_iter->addr && addr < (pt_iter->addr + PMD_SIZE));
+}
+
+static struct page *mm_pt_iter_page(struct mm_pt_iter *pt_iter,
+   unsigned long addr)
+{
+   pgd_t *pgdp;
+   pud_t *pudp;
+   pmd_t *pmdp;
+
+again:
+   /*
+* What we are doing here is only valid if we hold either the mmap
+* semaphore or the i_mmap_lock of the vma->address_space the address
+* belongs to. Sadly, because we can not easily get the vma struct,
+* we can not sanity check that either of those locks is taken.
+*
+* We have to rely on people using this code knowing what they do.
+*/
+   if (mm_pt_iter_in_range(pt_iter, addr) && likely(pt_iter->ptep)) {
+   pte_t pte = *(pt_iter->ptep + pte_index(addr));
+   unsigned long pfn;
+
+   if (pte_none(pte) || !pte_present(pte))
+   return NULL;
+   if (unlikely(pte_special(pte)))
+   return NULL;
+
+   pfn = pte_pfn(pte);
+   if (is_zero_pfn(pfn))
+   return NULL;
+   return pfn_to_page(pfn);
+   }
+
+   if (pt_iter->ptep) {
+   pte_unmap(pt_iter->ptep);
+   pt_iter->ptep = NULL;
+   pt_iter->addr = -1UL;
+   }
+
+   pgdp = pgd_offset(pt_iter->mm, addr);
+   if (pgd_none_or_clear_bad(pgdp))
+   return NULL;
+   pudp = pud_offset(pgdp, addr);
+   if (pud_none_or_clear_bad(pudp))
+   return NULL;
+   pmdp = pmd_offset(pudp, addr);
+   /*
+* Because we either have the mmap semaphore or the i_mmap_lock we know
+* that pmd can not vanish from under us, thus if pmd exist then it is
+* either a huge page or a valid pmd. It might also be in the splitting
+* transitory state.
+*/
+   if (pmd_none(*pmdp) || unlikely(pmd_bad(*pmdp)))
+   return NULL;
+   if (pmd_trans_splitting(*pmdp))
+   /*
+* FIXME ideally we would wait, but we have no easy means to get
+* a hold of the vma. So for now busy loop until the splitting
+* is done.
+*/
+   goto again;
+   if (pmd_huge(*pmdp))
+   return pmd_page(*pmdp) + pte_index(addr);
+   /* Regular pmd and it can not morph. */
+   pt_iter->ptep = pte_offset_map(pmdp, addr & PMD_MASK);
+   pt_iter->addr = addr & PMD_MASK;
+   goto again;
+}
+
+
 /* hmm_mirror - per device mirroring functions.
  *
  * Each device that mirror a process has a uniq hmm_mirror struct. A process
-- 
1.9.3



[PATCH 17/36] HMM: add new HMM page table flag (valid device memory).

2015-05-21 Thread j . glisse
From: Jérôme Glisse 

For memory migrated to device we need a new type of memory entry.
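
As a rough illustration (hypothetical snippet, assuming the helpers added
below; dev_addr is whatever bus address the driver uses for its local memory):

	dma_addr_t pte = hmm_pte_from_dev_addr(dev_addr);	/* sets HMM_PTE_VALID_DEV_BIT */

	if (hmm_pte_test_valid_dev(&pte))
		dev_addr = hmm_pte_dev_addr(pte);		/* recovers the device address */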

Signed-off-by: Jérôme Glisse 
Signed-off-by: Sherry Cheung 
Signed-off-by: Subhash Gutti 
Signed-off-by: Mark Hairgrove 
Signed-off-by: John Hubbard 
Signed-off-by: Jatin Kumar 
---
 include/linux/hmm_pt.h | 24 +++-
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/include/linux/hmm_pt.h b/include/linux/hmm_pt.h
index 78a9073..26cfe5e 100644
--- a/include/linux/hmm_pt.h
+++ b/include/linux/hmm_pt.h
@@ -74,10 +74,11 @@ static inline unsigned long hmm_pde_pfn(dma_addr_t pde)
  * In the first case the device driver must ignore any pfn entry as they might
  * show as transient state while HMM is mapping the page.
  */
-#define HMM_PTE_VALID_DMA_BIT  0
-#define HMM_PTE_VALID_PFN_BIT  1
-#define HMM_PTE_WRITE_BIT  2
-#define HMM_PTE_DIRTY_BIT  3
+#define HMM_PTE_VALID_DEV_BIT  0
+#define HMM_PTE_VALID_DMA_BIT  1
+#define HMM_PTE_VALID_PFN_BIT  2
+#define HMM_PTE_WRITE_BIT  3
+#define HMM_PTE_DIRTY_BIT  4
 /*
  * Reserve some bits for device driver private flags. Note that these can only
  * be manipulated using the hmm_pte_*_bit() sets of helpers.
@@ -85,7 +86,7 @@ static inline unsigned long hmm_pde_pfn(dma_addr_t pde)
  * WARNING ONLY SET/CLEAR THOSE FLAG ON PTE ENTRY THAT HAVE THE VALID BIT SET
  * AS OTHERWISE ANY BIT SET BY THE DRIVER WILL BE OVERWRITTEN BY HMM.
  */
-#define HMM_PTE_HW_SHIFT   4
+#define HMM_PTE_HW_SHIFT   8
 
 #define HMM_PTE_PFN_MASK   (~((dma_addr_t)((1 << PAGE_SHIFT) - 1)))
 #define HMM_PTE_DMA_MASK   (~((dma_addr_t)((1 << PAGE_SHIFT) - 1)))
@@ -166,6 +167,7 @@ static inline bool hmm_pte_test_and_set_bit(dma_addr_t *ptep,
HMM_PTE_TEST_AND_CLEAR_BIT(name, bit)\
HMM_PTE_TEST_AND_SET_BIT(name, bit)
 
+HMM_PTE_BIT_HELPER(valid_dev, HMM_PTE_VALID_DEV_BIT)
 HMM_PTE_BIT_HELPER(valid_dma, HMM_PTE_VALID_DMA_BIT)
 HMM_PTE_BIT_HELPER(valid_pfn, HMM_PTE_VALID_PFN_BIT)
 HMM_PTE_BIT_HELPER(dirty, HMM_PTE_DIRTY_BIT)
@@ -176,11 +178,23 @@ static inline dma_addr_t hmm_pte_from_pfn(dma_addr_t pfn)
return (pfn << PAGE_SHIFT) | (1 << HMM_PTE_VALID_PFN_BIT);
 }
 
+static inline dma_addr_t hmm_pte_from_dev_addr(dma_addr_t dma_addr)
+{
+   return (dma_addr & HMM_PTE_DMA_MASK) | (1 << HMM_PTE_VALID_DEV_BIT);
+}
+
 static inline dma_addr_t hmm_pte_from_dma_addr(dma_addr_t dma_addr)
 {
return (dma_addr & HMM_PTE_DMA_MASK) | (1 << HMM_PTE_VALID_DMA_BIT);
 }
 
+static inline dma_addr_t hmm_pte_dev_addr(dma_addr_t pte)
+{
+   /* FIXME Use max dma addr instead of 0 ? */
+	return hmm_pte_test_valid_dev(&pte) ? (pte & HMM_PTE_DMA_MASK) :
+ (dma_addr_t)-1UL;
+}
+
 static inline dma_addr_t hmm_pte_dma_addr(dma_addr_t pte)
 {
/* FIXME Use max dma addr instead of 0 ? */
-- 
1.9.3



[PATCH 12/36] HMM: add dirty range helper (to toggle dirty bit inside mirror page table).

2015-05-21 Thread j . glisse
From: Jérôme Glisse 

The device driver must properly toggle the dirty bit inside the mirror page
table so that dirtiness is properly accounted for when the core mm code needs
to know. Provide a simple helper to toggle that bit for a range of addresses.
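
For instance, a driver that is about to tear down a mapping it has been
writing through could do something like the following (hypothetical call
site, not part of this patch):

	/* mark the whole mirrored range dirty in one go */
	hmm_mirror_range_dirty(mirror, start & PAGE_MASK, PAGE_ALIGN(end));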

Signed-off-by: Jérôme Glisse 
---
 include/linux/hmm.h |  3 +++
 mm/hmm.c| 47 +++
 2 files changed, 50 insertions(+)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index ec05df8..186f497 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -253,6 +253,9 @@ int hmm_mirror_fault(struct hmm_mirror *mirror, struct hmm_event *event);
 void hmm_mirror_range_discard(struct hmm_mirror *mirror,
  unsigned long start,
  unsigned long end);
+void hmm_mirror_range_dirty(struct hmm_mirror *mirror,
+   unsigned long start,
+   unsigned long end);
 
 
 #endif /* CONFIG_HMM */
diff --git a/mm/hmm.c b/mm/hmm.c
index 4cab3f2..21fda9f 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -940,6 +940,53 @@ void hmm_mirror_range_discard(struct hmm_mirror *mirror,
 }
 EXPORT_SYMBOL(hmm_mirror_range_discard);
 
+/* hmm_mirror_range_dirty() - toggle the dirty bit for a range of addresses.
+ *
+ * @mirror: The mirror struct.
+ * @start: Start address of the range to dirty (inclusive).
+ * @end: End address of the range to dirty (exclusive).
+ *
+ * Call when the device driver wants to toggle the dirty bit for a range of
+ * addresses. Useful when the device driver just wants to toggle the bit for a
+ * whole range without walking the mirror page table itself.
+ *
+ * Note this function does not directly dirty the page behind an address, but
+ * this will happen once the address is invalidated or discarded by the device
+ * driver or core mm code.
+ */
+void hmm_mirror_range_dirty(struct hmm_mirror *mirror,
+   unsigned long start,
+   unsigned long end)
+{
+   struct hmm_pt_iter iter;
+   unsigned long addr;
+
+	hmm_pt_iter_init(&iter);
+	for (addr = start; addr != end;) {
+		unsigned long cend, next;
+		dma_addr_t *hmm_pte;
+
+		hmm_pte = hmm_pt_iter_update(&iter, &mirror->pt, addr);
+		if (!hmm_pte) {
+			addr = hmm_pt_iter_next(&iter, &mirror->pt,
+						addr, end);
+			continue;
+		}
+		cend = hmm_pt_level_next(&mirror->pt, addr, end,
+					 mirror->pt.llevel - 1);
+		do {
+			next = hmm_pt_level_next(&mirror->pt, addr, cend,
+						 mirror->pt.llevel);
+			if (!hmm_pte_test_valid_pfn(hmm_pte) ||
+			    !hmm_pte_test_write(hmm_pte))
+				continue;
+			hmm_pte_set_dirty(hmm_pte);
+		} while (addr = next, hmm_pte++, addr != cend);
+	}
+	hmm_pt_iter_fini(&iter, &mirror->pt);
+}
+EXPORT_SYMBOL(hmm_mirror_range_dirty);
+
 /* hmm_mirror_register() - register mirror against current process for a device.
  *
  * @mirror: The mirror struct being registered.
-- 
1.9.3



[PATCH 02/36] mmu_notifier: keep track of active invalidation ranges v3

2015-05-21 Thread j . glisse
From: Jérôme Glisse 

The mmu_notifier_invalidate_range_start() and mmu_notifier_invalidate_range_end()
calls can be considered as forming an "atomic" section from the CPU page table
update point of view. Between these two functions the CPU page table content is
unreliable for the address range being invalidated.

Current users such as kvm need to know when they can trust the content of the
CPU page table. This becomes even more important for new users of the
mmu_notifier API (such as HMM or ODP).

This patch uses a structure, defined at each call site of
invalidate_range_start(), that is added to a list for the duration of the
invalidation. It adds two new helpers to query whether a range is being
invalidated or to wait for a range to become valid.

For proper synchronization, users must block new range invalidations from
inside their invalidate_range_start() callback before calling the helper
functions. Otherwise there is no guarantee that a new range invalidation will
not be added after the call to the helper function that queries for existing
ranges.
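
A later patch in this series (the HMM device fault path) uses the helpers
roughly like this; the sketch below is illustrative only, with my_driver_lock
standing in for the caller's own lock that also serializes its
invalidate_range_start() callback:

	mmu_notifier_range_wait_valid(mm, start, end);

	spin_lock(&my_driver_lock);
	if (!mmu_notifier_range_is_valid(mm, start, end)) {
		/* a new invalidation raced in, back off and retry */
		spin_unlock(&my_driver_lock);
		return -EAGAIN;
	}
	/* record the work so our own notifier callback can see it */
	spin_unlock(&my_driver_lock);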

Changed since v1:
  - Fix a possible deadlock in mmu_notifier_range_wait_valid()

Changed since v2:
  - Add the range to invalid range list before calling ->range_start().
  - Del the range from invalid range list after calling ->range_end().
  - Remove useless list initialization.

Signed-off-by: Jérôme Glisse 
Reviewed-by: Rik van Riel 
Reviewed-by: Haggai Eran 
---
 drivers/gpu/drm/i915/i915_gem_userptr.c |  9 ++--
 drivers/gpu/drm/radeon/radeon_mn.c  | 14 +++---
 drivers/infiniband/core/umem_odp.c  | 16 +++
 drivers/misc/sgi-gru/grutlbpurge.c  | 15 +++
 drivers/xen/gntdev.c| 15 ---
 fs/proc/task_mmu.c  | 11 +++--
 include/linux/mmu_notifier.h| 55 ---
 kernel/events/uprobes.c | 13 +++---
 mm/huge_memory.c| 78 ++--
 mm/hugetlb.c| 55 ---
 mm/ksm.c| 28 +---
 mm/madvise.c| 20 -
 mm/memory.c | 72 +-
 mm/migrate.c| 36 +++
 mm/mmu_notifier.c   | 79 -
 mm/mprotect.c   | 18 
 mm/mremap.c | 14 +++---
 virt/kvm/kvm_main.c | 10 ++---
 18 files changed, 302 insertions(+), 256 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c 
b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 452e9b1..80fe72a 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -131,16 +131,15 @@ restart:
 
 static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier 
*_mn,
   struct mm_struct *mm,
-  unsigned long start,
-  unsigned long end,
-  enum mmu_event event)
+						  const struct mmu_notifier_range *range)
 {
 	struct i915_mmu_notifier *mn = container_of(_mn, struct i915_mmu_notifier, mn);
struct interval_tree_node *it = NULL;
-   unsigned long next = start;
+   unsigned long next = range->start;
unsigned long serial = 0;
+   /* interval ranges are inclusive, but invalidate range is exclusive */
+   unsigned long end = range->end - 1, start = range->start;
 
-	end--; /* interval ranges are inclusive, but invalidate range is exclusive */
while (next < end) {
struct drm_i915_gem_object *obj = NULL;
 
diff --git a/drivers/gpu/drm/radeon/radeon_mn.c 
b/drivers/gpu/drm/radeon/radeon_mn.c
index 3a9615b..24898bf 100644
--- a/drivers/gpu/drm/radeon/radeon_mn.c
+++ b/drivers/gpu/drm/radeon/radeon_mn.c
@@ -112,34 +112,30 @@ static void radeon_mn_release(struct mmu_notifier *mn,
  *
  * @mn: our notifier
  * @mn: the mm this callback is about
- * @start: start of updated range
- * @end: end of updated range
+ * @range: Address range information.
  *
  * We block for all BOs between start and end to be idle and
  * unmap them by move them into system domain again.
  */
 static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
 struct mm_struct *mm,
-unsigned long start,
-unsigned long end,
-enum mmu_event event)
+					     const struct mmu_notifier_range *range)
 {
struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn);
struct interval_tree_node *it;
-
/* 

[PATCH 05/36] HMM: introduce heterogeneous memory management v3.

2015-05-21 Thread j . glisse
From: Jérôme Glisse 

This patch only introduces the core HMM functions for registering a new
mirror and stopping a mirror, as well as for HMM device registration and
unregistration.

The lifecycle of the HMM object is handled differently than that of the
mmu_notifier because, unlike mmu_notifier, there can be concurrent
calls into HMM both from mm code and from device driver code.
Moreover, the lifetime of HMM can be uncorrelated from the lifetime
of the process that is being mirrored (the GPU might take longer to
clean up).
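
As a rough sketch of how a driver is expected to hook into this (hypothetical
skeleton; only the release() callback and the mirror register/unregister calls
are taken from this series, the rest is illustrative):

	static void my_release(struct hmm_mirror *mirror)
	{
		/* last callback from HMM: stop all device work using this mirror */
	}

	static const struct hmm_device_ops my_ops = {
		.release = my_release,
	};

	/* ... at bind time, for the current process ... */
	mirror->device = &my_hmm_device;	/* my_hmm_device.ops = &my_ops */
	ret = hmm_mirror_register(mirror);

	/* ... at teardown (safe to call several times) ... */
	hmm_mirror_unregister(mirror);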

Changed since v1:
  - Updated comment of hmm_device_register().

Changed since v2:
  - Expose struct hmm for easy access to mm struct.
  - Simplify hmm_mirror_register() arguments.
  - Removed the device name.
  - Refcount the mirror struct internally to HMM, allowing to get
    rid of the srcu and making the device driver callback error
    handling simpler.
  - Safe to call hmm_mirror_unregister() several times.
  - Rework the mmu_notifier unregistration and release callback.

Signed-off-by: Jérôme Glisse 
Signed-off-by: Sherry Cheung 
Signed-off-by: Subhash Gutti 
Signed-off-by: Mark Hairgrove 
Signed-off-by: John Hubbard 
Signed-off-by: Jatin Kumar 
cc: 
---
 MAINTAINERS  |   7 +
 include/linux/hmm.h  | 164 +
 include/linux/mm.h   |  11 ++
 include/linux/mm_types.h |  14 ++
 kernel/fork.c|   2 +
 mm/Kconfig   |  15 ++
 mm/Makefile  |   1 +
 mm/hmm.c | 370 +++
 8 files changed, 584 insertions(+)
 create mode 100644 include/linux/hmm.h
 create mode 100644 mm/hmm.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 78ea7b6..2f2a2be 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4730,6 +4730,13 @@ F:   include/uapi/linux/if_hippi.h
 F: net/802/hippi.c
 F: drivers/net/hippi/
 
+HMM - Heterogeneous Memory Management
+M: Jérôme Glisse 
+L: linux...@kvack.org
+S: Maintained
+F: mm/hmm.c
+F: include/linux/hmm.h
+
 HOST AP DRIVER
 M: Jouni Malinen 
 L: hos...@shmoo.com (subscribers-only)
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
new file mode 100644
index 000..175a757
--- /dev/null
+++ b/include/linux/hmm.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright 2013 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Jérôme Glisse 
+ */
+/* This is heterogeneous memory management (HMM). In a nutshell it provides
+ * an API to mirror a process address space on a device which has its own mmu,
+ * using its own page table for the process. It supports everything except
+ * special vma.
+ *
+ * Mandatory hardware features :
+ *   - An mmu with pagetable.
+ *   - Read only flag per cpu page.
+ *   - Page fault ie hardware must stop and wait for kernel to service fault.
+ *
+ * Optional hardware features :
+ *   - Dirty bit per cpu page.
+ *   - Access bit per cpu page.
+ *
+ * The hmm code handles all the interfacing with the core kernel mm code and
+ * provides a simple API. It supports migrating system memory to device
+ * memory and handles migration back to system memory on cpu page fault.
+ *
+ * Migrated memory is considered as swapped out from the cpu and core mm code
+ * point of view.
+ */
+#ifndef _HMM_H
+#define _HMM_H
+
+#ifdef CONFIG_HMM
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
+struct hmm_device;
+struct hmm_mirror;
+struct hmm;
+
+
+/* hmm_device - Each device must register one and only one hmm_device.
+ *
+ * The hmm_device is the link btw HMM and each device driver.
+ */
+
+/* struct hmm_device_operations - HMM device operation callback
+ */
+struct hmm_device_ops {
+   /* release() - mirror must stop using the address space.
+*
+* @mirror: The mirror that links the process address space with the device.
+*
+* When this is called, the device driver must kill all device threads using
+* this mirror. Also, this callback is the last thing called by HMM and
+* HMM will not access the mirror struct after this call (ie no more
+* dereference of it, so it is safe for the device driver to free it).
+* It is called either from :
+*   - mm dying (all processes using this mm exiting).
+*   - hmm_mirror_unregister() (if no other thread holds a reference)
+*   - the outcome of some device error reported by any of the device
+*     callbacks against that mirror.
+*/
+   void (*release)(struct hmm_mirror *mirror);
+};
+
+
+/* struct hmm - 

[PATCH 06/36] HMM: add HMM page table v2.

2015-05-21 Thread j . glisse
From: Jérôme Glisse 

Heterogeneous memory management's main purpose is to mirror a process address
space. To do so it must maintain a secondary page table that is used by the
device driver to program the device or to build a device specific page table.

A radix tree can not be used to create this secondary page table because HMM
needs more flags than RADIX_TREE_MAX_TAGS (while this could be increased, we
believe HMM will require so many flags that the cost would become prohibitive
for other users of the radix tree).

Moreover the radix tree is built around long, but for HMM we need to store dma
addresses and on some platforms sizeof(dma_addr_t) > sizeof(long). Thus the
radix tree is unsuitable to fulfill HMM's requirements, hence why we introduce
this code which allows creating a page table that can grow and shrink
dynamically.

The design is very close to the CPU page table as it reuses some of its
features, such as the spinlock embedded in struct page.
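
For a feel of the entry format, the helpers added below can be combined
roughly like this (illustrative snippet only; page and its pfn are assumed to
come from the caller):

	dma_addr_t pde, pte;
	unsigned long pfn;

	pde = hmm_pde_from_pfn(page_to_pfn(page));	/* HMM_PDE_VALID | pfn */
	pfn = hmm_pde_pfn(pde);				/* recovers the pfn */

	pte = hmm_pte_from_pfn(page_to_pfn(page));	/* valid pfn entry */
	hmm_pte_set_write(&pte);			/* atomic bit helper */
	WARN_ON(!hmm_pte_test_valid_pfn(&pte));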

Changed since v1:
  - Use PAGE_SHIFT as shift value to reserve low bits for private device
    specific flags. This is to allow device drivers to use some of the
    lower bits for their own device specific purposes.
  - Add a set of helpers for atomically clearing, setting and testing bits on
    a dma_addr_t pointer. Atomicity is only useful for the dirty bit.
  - Differentiate btw DMA mapped entries and non mapped entries (pfn).
  - Split page directory entry and page table entry helpers.

Signed-off-by: Jérôme Glisse 
Signed-off-by: Sherry Cheung 
Signed-off-by: Subhash Gutti 
Signed-off-by: Mark Hairgrove 
Signed-off-by: John Hubbard 
Signed-off-by: Jatin Kumar 
---
 MAINTAINERS|   2 +
 include/linux/hmm_pt.h | 380 +++
 mm/Makefile|   2 +-
 mm/hmm_pt.c| 425 +
 4 files changed, 808 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/hmm_pt.h
 create mode 100644 mm/hmm_pt.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 2f2a2be..8cd0aa7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4736,6 +4736,8 @@ L:linux...@kvack.org
 S: Maintained
 F: mm/hmm.c
 F: include/linux/hmm.h
+F: mm/hmm_pt.c
+F: include/linux/hmm_pt.h
 
 HOST AP DRIVER
 M: Jouni Malinen 
diff --git a/include/linux/hmm_pt.h b/include/linux/hmm_pt.h
new file mode 100644
index 000..330edb2
--- /dev/null
+++ b/include/linux/hmm_pt.h
@@ -0,0 +1,380 @@
+/*
+ * Copyright 2014 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Jérôme Glisse 
+ */
+/*
+ * This provides a set of helpers for the HMM page table. See include/linux/hmm.h
+ * for a description of what HMM is.
+ *
+ * The HMM page table relies on a locking mechanism similar to the CPU page
+ * table for page table updates. It uses the spinlock embedded inside the
+ * struct page to protect changes to a page table directory, which should
+ * minimize lock contention for concurrent updates.
+ *
+ * It also provides a directory tree protection mechanism. Unlike the CPU page
+ * table there is no mmap semaphore to protect the directory tree from removal,
+ * and this is done intentionally so that concurrent removal/insertion of
+ * directories inside the tree can happen.
+ *
+ * So anyone walking down the page table must protect the directories they
+ * traverse so they are not freed by some other thread. This is done by using a
+ * reference counter for each directory. Before traversing a directory a
+ * reference is taken and once traversal is done the reference is dropped.
+ *
+ * A directory entry dereference and refcount increment of the sub-directory
+ * page must happen in a critical rcu section so that directory page removal
+ * can gracefully wait for all possible other threads that might have
+ * dereferenced the directory.
+ */
+#ifndef _HMM_PT_H
+#define _HMM_PT_H
+
+/*
+ * The HMM page table entry does not reflect any specific hardware. It is just
+ * a common entry format used internally by HMM and exposed to HMM users so
+ * they can extract information out of the HMM page table.
+ *
+ * Device drivers should only rely on the helpers and should not traverse the
+ * page table themselves.
+ */
+#define HMM_PT_MAX_LEVEL   6
+
+#define HMM_PDE_VALID_BIT  0
+#define HMM_PDE_VALID  (1 << HMM_PDE_VALID_BIT)
+#define HMM_PDE_PFN_MASK   (~((dma_addr_t)((1 << PAGE_SHIFT) - 1)))
+
+static inline dma_addr_t hmm_pde_from_pfn(dma_addr_t pfn)
+{
+   return (pfn << PAGE_SHIFT) | HMM_PDE_VALID;
+}
+
+static inline unsigned long hmm_pde_pfn(dma_addr_t pde)
+{
+   return (pde & 

[PATCH 01/36] mmu_notifier: add event information to address invalidation v7

2015-05-21 Thread j . glisse
From: Jérôme Glisse 

The event information will be useful for new users of the mmu_notifier API.
The event argument differentiates between a vma disappearing, a page
being write protected or simply a page being unmapped. This allows new
users to take different paths for different events: for instance, on unmap
the resources used to track a vma are still valid and should stay around,
while if the event says that a vma is being destroyed it means that any
resources used to track this vma can be freed.
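
A hypothetical listener could use the new argument roughly as follows (event
names as used in this series' changelog; my_free_tracking() and
my_unmap_range() are placeholders):

	static void my_invalidate_range_start(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long start,
					      unsigned long end,
					      enum mmu_event event)
	{
		if (event == MMU_MUNMAP)
			my_free_tracking(mn, start, end);	/* vma is going away */
		else
			my_unmap_range(mn, start, end);		/* range stays, pages change */
	}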

Changed since v1:
  - renamed action into event (updated commit message too).
  - simplified the event names and clarified their usage,
    also documenting what expectations the listener can have
    with respect to each event.

Changed since v2:
  - Avoid crazy name.
  - Do not move code that do not need to move.

Changed since v3:
  - Separate huge page split from mlock/munlock and softdirty.

Changed since v4:
  - Rebase (no other changes).

Changed since v5:
  - Typo fix.
  - Changed zap_page_range from MMU_MUNMAP to MMU_MIGRATE to reflect the
fact that the address range is still valid, just the pages backing it
are no longer.

Changed since v6:
  - try_to_unmap_one() only invalidate when doing migration.
  - Differentiate fork from other case.

Signed-off-by: Jérôme Glisse 
Reviewed-by: Rik van Riel 
---
 drivers/gpu/drm/i915/i915_gem_userptr.c |   3 +-
 drivers/gpu/drm/radeon/radeon_mn.c  |   3 +-
 drivers/infiniband/core/umem_odp.c  |   9 ++-
 drivers/iommu/amd_iommu_v2.c|   3 +-
 drivers/misc/sgi-gru/grutlbpurge.c  |   9 ++-
 drivers/xen/gntdev.c|   9 ++-
 fs/proc/task_mmu.c  |   6 +-
 include/linux/mmu_notifier.h| 135 ++--
 kernel/events/uprobes.c |  10 ++-
 mm/huge_memory.c|  39 ++---
 mm/hugetlb.c|  23 +++---
 mm/ksm.c|  18 +++--
 mm/madvise.c|   4 +-
 mm/memory.c |  27 ---
 mm/migrate.c|   9 ++-
 mm/mmu_notifier.c   |  28 ---
 mm/mprotect.c   |   6 +-
 mm/mremap.c |   6 +-
 mm/rmap.c   |   4 +-
 virt/kvm/kvm_main.c |  12 ++-
 20 files changed, 261 insertions(+), 102 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c 
b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 4039ede..452e9b1 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -132,7 +132,8 @@ restart:
 static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier 
*_mn,
   struct mm_struct *mm,
   unsigned long start,
-  unsigned long end)
+  unsigned long end,
+  enum mmu_event event)
 {
struct i915_mmu_notifier *mn = container_of(_mn, struct 
i915_mmu_notifier, mn);
struct interval_tree_node *it = NULL;
diff --git a/drivers/gpu/drm/radeon/radeon_mn.c 
b/drivers/gpu/drm/radeon/radeon_mn.c
index eef006c..3a9615b 100644
--- a/drivers/gpu/drm/radeon/radeon_mn.c
+++ b/drivers/gpu/drm/radeon/radeon_mn.c
@@ -121,7 +121,8 @@ static void radeon_mn_release(struct mmu_notifier *mn,
 static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
 struct mm_struct *mm,
 unsigned long start,
-unsigned long end)
+unsigned long end,
+enum mmu_event event)
 {
struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn);
struct interval_tree_node *it;
diff --git a/drivers/infiniband/core/umem_odp.c 
b/drivers/infiniband/core/umem_odp.c
index 40becdb..6ed69fa 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -165,7 +165,8 @@ static int invalidate_page_trampoline(struct ib_umem *item, 
u64 start,
 
 static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn,
 struct mm_struct *mm,
-unsigned long address)
+unsigned long address,
+enum mmu_event event)
 {
struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
 
@@ -192,7 +193,8 @@ static int invalidate_range_start_trampoline(struct ib_umem 
*item, u64 start,
 static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,

[PATCH 08/36] HMM: add device page fault support v3.

2015-05-21 Thread j . glisse
From: Jérôme Glisse jgli...@redhat.com

This patch adds helpers for device page faults. The device page fault helper
will fill the mirror page table using the CPU page table, all of this
synchronized with any update to the CPU page table.
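
From the device driver side, the entry point added here is hmm_mirror_fault();
a hypothetical fault handler might use it like this (sketch only, the exact
event setup is up to the driver):

	struct hmm_event event;
	int ret;

	event.start = faulting_addr & PAGE_MASK;
	event.end = event.start + PAGE_SIZE;
	event.etype = is_write ? HMM_DEVICE_WFAULT : HMM_DEVICE_RFAULT;

	ret = hmm_mirror_fault(mirror, &event);
	if (ret == -EAGAIN)
		return ret;	/* raced with an invalidation, caller retries */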

Changed since v1:
  - Add comment about directory lock.

Changed since v2:
  - Check for mirror->hmm in hmm_mirror_fault()

Signed-off-by: Jérôme Glisse jgli...@redhat.com
Signed-off-by: Sherry Cheung sche...@nvidia.com
Signed-off-by: Subhash Gutti sgu...@nvidia.com
Signed-off-by: Mark Hairgrove mhairgr...@nvidia.com
Signed-off-by: John Hubbard jhubb...@nvidia.com
Signed-off-by: Jatin Kumar jaku...@nvidia.com
---
 include/linux/hmm.h |   9 ++
 mm/hmm.c| 386 +++-
 2 files changed, 394 insertions(+), 1 deletion(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 573560b..fdb1975 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -169,6 +169,10 @@ struct hmm_device_ops {
  * @rwsem: Serialize the mirror list modifications.
  * @mmu_notifier: The mmu_notifier of this mm.
  * @rcu: For delayed cleanup call from mmu_notifier.release() callback.
+ * @device_faults: List of all active device page faults.
+ * @ndevice_faults: Number of active device page faults.
+ * @wait_queue: Wait queue for event synchronization.
+ * @lock: Serialize device_faults list modification.
  *
  * For each process address space (mm_struct) there is one and only one hmm
  * struct. hmm functions will redispatch to each devices the change made to
@@ -185,6 +189,10 @@ struct hmm {
struct rw_semaphore rwsem;
struct mmu_notifier mmu_notifier;
struct rcu_head rcu;
+   struct list_headdevice_faults;
+   unsignedndevice_faults;
+   wait_queue_head_t   wait_queue;
+   spinlock_t  lock;
 };
 
 
@@ -241,6 +249,7 @@ struct hmm_mirror {
 
 int hmm_mirror_register(struct hmm_mirror *mirror);
 void hmm_mirror_unregister(struct hmm_mirror *mirror);
+int hmm_mirror_fault(struct hmm_mirror *mirror, struct hmm_event *event);
 
 
 #endif /* CONFIG_HMM */
diff --git a/mm/hmm.c b/mm/hmm.c
index 04a3743..e1aa6ca 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -63,6 +63,11 @@ static void hmm_mirror_update_pt(struct hmm_mirror *mirror,
  * help dealing with all this.
  */
 
+static inline bool hmm_event_overlap(struct hmm_event *a, struct hmm_event *b)
+{
+	return !((a->end <= b->start) || (a->start >= b->end));
+}
+
 static inline int hmm_event_init(struct hmm_event *event,
 struct hmm *hmm,
 unsigned long start,
@@ -70,7 +75,7 @@ static inline int hmm_event_init(struct hmm_event *event,
 enum hmm_etype etype)
 {
	event->start = start & PAGE_MASK;
-	event->end = min(end, hmm->vm_end);
+	event->end = PAGE_ALIGN(min(end, hmm->vm_end));
	if (event->start >= event->end)
		return -EINVAL;
	event->etype = etype;
@@ -107,6 +112,10 @@ static int hmm_init(struct hmm *hmm)
	kref_init(&hmm->kref);
	INIT_HLIST_HEAD(&hmm->mirrors);
	init_rwsem(&hmm->rwsem);
+	INIT_LIST_HEAD(&hmm->device_faults);
+	hmm->ndevice_faults = 0;
+	init_waitqueue_head(&hmm->wait_queue);
+	spin_lock_init(&hmm->lock);
 
/* register notifier */
	hmm->mmu_notifier.ops = &hmm_notifier_ops;
@@ -171,6 +180,58 @@ static inline struct hmm *hmm_unref(struct hmm *hmm)
return NULL;
 }
 
+static int hmm_device_fault_start(struct hmm *hmm, struct hmm_event *event)
+{
+   int ret = 0;
+
+	mmu_notifier_range_wait_valid(hmm->mm, event->start, event->end);
+
+	spin_lock(&hmm->lock);
+	if (mmu_notifier_range_is_valid(hmm->mm, event->start, event->end)) {
+		list_add_tail(&event->list, &hmm->device_faults);
+		hmm->ndevice_faults++;
+		event->backoff = false;
+	} else
+		ret = -EAGAIN;
+	spin_unlock(&hmm->lock);
+
+	wake_up(&hmm->wait_queue);
+
+	return ret;
+}
+
+static void hmm_device_fault_end(struct hmm *hmm, struct hmm_event *event)
+{
+	spin_lock(&hmm->lock);
+	list_del_init(&event->list);
+	hmm->ndevice_faults--;
+	spin_unlock(&hmm->lock);
+
+	wake_up(&hmm->wait_queue);
+}
+
+static void hmm_wait_device_fault(struct hmm *hmm, struct hmm_event *ievent)
+{
+	struct hmm_event *fevent;
+	unsigned long wait_for = 0;
+
+again:
+	spin_lock(&hmm->lock);
+	list_for_each_entry(fevent, &hmm->device_faults, list) {
+		if (!hmm_event_overlap(fevent, ievent))
+			continue;
+		fevent->backoff = true;
+		wait_for = hmm->ndevice_faults;
+	}
+	spin_unlock(&hmm->lock);
+
+	if (wait_for > 0) {
+		wait_event(hmm->wait_queue, wait_for != hmm->ndevice_faults);
+   wait_for = 0;
+   goto again;
+   }
+}
+
 static void 

[PATCH 03/36] mmu_notifier: pass page pointer to mmu_notifier_invalidate_page()

2015-05-21 Thread j . glisse
From: Jérôme Glisse jgli...@redhat.com

A listener of mm events might not have an easy way to get the struct page
behind an address invalidated with the mmu_notifier_invalidate_page()
function, as this happens after the cpu page table has been cleared/
updated. This happens for instance if the listener is storing a dma
mapping inside its secondary page table. To avoid a complex reverse
dma mapping lookup, just pass along a pointer to the page being
invalidated.
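
With the extra argument, a hypothetical listener can drop whatever it attached
to that page without a reverse lookup (my_dma_unmap() is a placeholder for
driver-side bookkeeping):

	static void my_invalidate_page(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long address,
				       struct page *page,
				       enum mmu_event event)
	{
		/* the CPU pte is already gone, but the page is still known here */
		my_dma_unmap(mn, page, address);
	}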

Signed-off-by: Jérôme Glisse jgli...@redhat.com
---
 drivers/infiniband/core/umem_odp.c | 1 +
 drivers/iommu/amd_iommu_v2.c   | 1 +
 drivers/misc/sgi-gru/grutlbpurge.c | 1 +
 drivers/xen/gntdev.c   | 1 +
 include/linux/mmu_notifier.h   | 6 +-
 mm/mmu_notifier.c  | 3 ++-
 mm/rmap.c  | 4 ++--
 virt/kvm/kvm_main.c| 1 +
 8 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/core/umem_odp.c 
b/drivers/infiniband/core/umem_odp.c
index 8f7f845..d10dd88 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -166,6 +166,7 @@ static int invalidate_page_trampoline(struct ib_umem *item, 
u64 start,
 static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn,
 struct mm_struct *mm,
 unsigned long address,
+struct page *page,
 enum mmu_event event)
 {
struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
index 4aa4de6..de3c540 100644
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -385,6 +385,7 @@ static int mn_clear_flush_young(struct mmu_notifier *mn,
 static void mn_invalidate_page(struct mmu_notifier *mn,
   struct mm_struct *mm,
   unsigned long address,
+  struct page *page,
   enum mmu_event event)
 {
__mn_flush_page(mn, address);
diff --git a/drivers/misc/sgi-gru/grutlbpurge.c 
b/drivers/misc/sgi-gru/grutlbpurge.c
index 44b41b7..c7659b76 100644
--- a/drivers/misc/sgi-gru/grutlbpurge.c
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -250,6 +250,7 @@ static void gru_invalidate_range_end(struct mmu_notifier 
*mn,
 
 static void gru_invalidate_page(struct mmu_notifier *mn, struct mm_struct *mm,
unsigned long address,
+   struct page *page,
enum mmu_event event)
 {
struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 0e8aa12..90693ce 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -485,6 +485,7 @@ static void mn_invl_range_start(struct mmu_notifier *mn,
 static void mn_invl_page(struct mmu_notifier *mn,
 struct mm_struct *mm,
 unsigned long address,
+struct page *page,
 enum mmu_event event)
 {
struct mmu_notifier_range range;
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index ada3ed1..283ad26 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -172,6 +172,7 @@ struct mmu_notifier_ops {
void (*invalidate_page)(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long address,
+   struct page *page,
enum mmu_event event);
 
/*
@@ -290,6 +291,7 @@ extern void __mmu_notifier_change_pte(struct mm_struct *mm,
  enum mmu_event event);
 extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
  unsigned long address,
+ struct page *page,
  enum mmu_event event);
 extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
  struct mmu_notifier_range 
*range);
@@ -338,10 +340,11 @@ static inline void mmu_notifier_change_pte(struct 
mm_struct *mm,
 
 static inline void mmu_notifier_invalidate_page(struct mm_struct *mm,
unsigned long address,
+   struct page *page,
enum mmu_event event)
 {
if (mm_has_notifiers(mm))
-   __mmu_notifier_invalidate_page(mm, address, event);
+   __mmu_notifier_invalidate_page(mm, address, page, event);
 }
 
 static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm,
@@ -492,6 +495,7 @@ static inline 

[PATCH 10/36] HMM: use CPU page table during invalidation.

2015-05-21 Thread j . glisse
From: Jerome Glisse jgli...@redhat.com

Once we store the dma mapping inside the secondary page table we can
no longer easily find the page backing an address. Instead use
the cpu page table, which still has the proper information, except for
the invalidate_page() case, which is handled by using the page passed
by the mmu_notifier layer.

Signed-off-by: Jérôme Glisse jgli...@redhat.com
---
 mm/hmm.c | 51 ++-
 1 file changed, 34 insertions(+), 17 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 93d6f5e..8ec9ffa 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -50,9 +50,11 @@ static inline struct hmm_mirror *hmm_mirror_ref(struct hmm_mirror *mirror);
 static inline void hmm_mirror_unref(struct hmm_mirror **mirror);
 static void hmm_mirror_kill(struct hmm_mirror *mirror);
 static inline int hmm_mirror_update(struct hmm_mirror *mirror,
-   struct hmm_event *event);
+   struct hmm_event *event,
+   struct page *page);
 static void hmm_mirror_update_pt(struct hmm_mirror *mirror,
-struct hmm_event *event);
+struct hmm_event *event,
+struct page *page);
 
 
 /* hmm_event - use to track information relating to an event.
@@ -232,7 +234,9 @@ again:
}
 }
 
-static void hmm_update(struct hmm *hmm, struct hmm_event *event)
+static void hmm_update(struct hmm *hmm,
+  struct hmm_event *event,
+  struct page *page)
 {
struct hmm_mirror *mirror;
 
@@ -245,7 +249,7 @@ static void hmm_update(struct hmm *hmm, struct hmm_event *event)
 again:
	down_read(&hmm->rwsem);
	hlist_for_each_entry(mirror, &hmm->mirrors, mlist)
-		if (hmm_mirror_update(mirror, event)) {
+		if (hmm_mirror_update(mirror, event, page)) {
			mirror = hmm_mirror_ref(mirror);
			up_read(&hmm->rwsem);
hmm_mirror_kill(mirror);
@@ -343,9 +347,10 @@ static void hmm_mmu_mprot_to_etype(struct mm_struct *mm,
*etype = HMM_NONE;
 }
 
-static void hmm_notifier_invalidate_range_start(struct mmu_notifier *mn,
-   struct mm_struct *mm,
-						const struct mmu_notifier_range *range)
+static void hmm_notifier_invalidate(struct mmu_notifier *mn,
+   struct mm_struct *mm,
+   struct page *page,
+   const struct mmu_notifier_range *range)
 {
struct hmm_event event;
	unsigned long start = range->start, end = range->end;
@@ -386,7 +391,14 @@ static void hmm_notifier_invalidate_range_start(struct mmu_notifier *mn,
 
	hmm_event_init(&event, hmm, start, end, event.etype);
 
-	hmm_update(hmm, &event);
+	hmm_update(hmm, &event, page);
+}
+
+static void hmm_notifier_invalidate_range_start(struct mmu_notifier *mn,
+   struct mm_struct *mm,
+						const struct mmu_notifier_range *range)
+{
+   hmm_notifier_invalidate(mn, mm, NULL, range);
 }
 
 static void hmm_notifier_invalidate_page(struct mmu_notifier *mn,
@@ -400,7 +412,7 @@ static void hmm_notifier_invalidate_page(struct mmu_notifier *mn,
	range.start = addr & PAGE_MASK;
	range.end = range.start + PAGE_SIZE;
	range.event = mmu_event;
-	hmm_notifier_invalidate_range_start(mn, mm, &range);
+	hmm_notifier_invalidate(mn, mm, page, &range);
 }
 
 static struct mmu_notifier_ops hmm_notifier_ops = {
@@ -551,23 +563,27 @@ static inline void hmm_mirror_unref(struct hmm_mirror **mirror)
 }
 
 static inline int hmm_mirror_update(struct hmm_mirror *mirror,
-   struct hmm_event *event)
+   struct hmm_event *event,
+   struct page *page)
 {
	struct hmm_device *device = mirror->device;
	int ret = 0;
 
	ret = device->ops->update(mirror, event);
-	hmm_mirror_update_pt(mirror, event);
+	hmm_mirror_update_pt(mirror, event, page);
return ret;
 }
 
 static void hmm_mirror_update_pt(struct hmm_mirror *mirror,
-struct hmm_event *event)
+struct hmm_event *event,
+struct page *page)
 {
unsigned long addr;
struct hmm_pt_iter iter;
+   struct mm_pt_iter mm_iter;
 
	hmm_pt_iter_init(&iter);
+	mm_pt_iter_init(&mm_iter, mirror->hmm->mm);
	for (addr = event->start; addr != event->end;) {
unsigned long end, next;
dma_addr_t *hmm_pte;
@@ -593,10 +609,10 @@ static void hmm_mirror_update_pt(struct hmm_mirror *mirror,

[PATCH 04/36] mmu_notifier: allow range invalidation to exclude a specific mmu_notifier

2015-05-21 Thread j . glisse
From: Jérôme Glisse jgli...@redhat.com

This patch allows invalidating a range while excluding the call to a specific
mmu_notifier, which allows a subsystem to invalidate a range for everyone
but itself.
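
In other words, a subsystem that maintains its own secondary page table could
do something along these lines (hypothetical usage of the _excluding variants
added below):

	/* tell everyone except ourselves that the range is going away */
	mmu_notifier_invalidate_range_start_excluding(mm, &range, &my_mn);
	/* ... update our own secondary page table directly ... */
	mmu_notifier_invalidate_range_end_excluding(mm, &range, &my_mn);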

Signed-off-by: Jérôme Glisse jgli...@redhat.com
---
 include/linux/mmu_notifier.h | 60 +++-
 mm/mmu_notifier.c| 16 +---
 2 files changed, 67 insertions(+), 9 deletions(-)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 283ad26..867ca06 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -294,11 +294,15 @@ extern void __mmu_notifier_invalidate_page(struct 
mm_struct *mm,
  struct page *page,
  enum mmu_event event);
 extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
- struct mmu_notifier_range 
*range);
+ struct mmu_notifier_range 
*range,
+ const struct mmu_notifier 
*exclude);
 extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
-   struct mmu_notifier_range 
*range);
+   struct mmu_notifier_range 
*range,
+   const struct mmu_notifier 
*exclude);
 extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
- unsigned long start, unsigned long end);
+   unsigned long start,
+   unsigned long end,
+   const struct mmu_notifier *exclude);
 extern bool mmu_notifier_range_is_valid(struct mm_struct *mm,
unsigned long start,
unsigned long end);
@@ -351,21 +355,46 @@ static inline void 
mmu_notifier_invalidate_range_start(struct mm_struct *mm,
   struct 
mmu_notifier_range *range)
 {
if (mm_has_notifiers(mm))
-   __mmu_notifier_invalidate_range_start(mm, range);
+   __mmu_notifier_invalidate_range_start(mm, range, NULL);
 }
 
 static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 struct mmu_notifier_range 
*range)
 {
if (mm_has_notifiers(mm))
-   __mmu_notifier_invalidate_range_end(mm, range);
+   __mmu_notifier_invalidate_range_end(mm, range, NULL);
 }
 
 static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
  unsigned long start, unsigned long end)
 {
if (mm_has_notifiers(mm))
-   __mmu_notifier_invalidate_range(mm, start, end);
+   __mmu_notifier_invalidate_range(mm, start, end, NULL);
+}
+
+static inline void mmu_notifier_invalidate_range_start_excluding(struct 
mm_struct *mm,
+   struct mmu_notifier_range 
*range,
+   const struct mmu_notifier 
*exclude)
+{
+   if (mm_has_notifiers(mm))
+   __mmu_notifier_invalidate_range_start(mm, range, exclude);
+}
+
+static inline void mmu_notifier_invalidate_range_end_excluding(struct 
mm_struct *mm,
+   struct 
mmu_notifier_range *range,
+   const struct 
mmu_notifier *exclude)
+{
+   if (mm_has_notifiers(mm))
+   __mmu_notifier_invalidate_range_end(mm, range, exclude);
+}
+
+static inline void mmu_notifier_invalidate_range_excluding(struct mm_struct 
*mm,
+   unsigned long start,
+   unsigned long end,
+   const struct mmu_notifier 
*exclude)
+{
+   if (mm_has_notifiers(mm))
+   __mmu_notifier_invalidate_range(mm, start, end, exclude);
 }
 
 static inline void mmu_notifier_mm_init(struct mm_struct *mm)
@@ -515,6 +544,25 @@ static inline void mmu_notifier_invalidate_range(struct 
mm_struct *mm,
 {
 }
 
+static inline void mmu_notifier_invalidate_range_start_excluding(struct 
mm_struct *mm,
+   struct mmu_notifier_range 
*range,
+   const struct mmu_notifier 
*exclude)
+{
+}
+
+static inline void mmu_notifier_invalidate_range_end_excluding(struct 
mm_struct *mm,
+   struct 
mmu_notifier_range *range,
+   const struct 
mmu_notifier *exclude)
+{
+}
+
+static inline void 

HMM (Heterogeneous Memory Management) v8

2015-05-21 Thread j . glisse

So sorry, I had to resend because I stupidly forgot to cc the mailing list.
Ignore the private send done before.


HMM (Heterogeneous Memory Management) is a helper layer for devices
that want to mirror a process address space into their own mmu. The main
target is GPUs, but other hardware, like network devices, can also
use HMM.

There are two sides to HMM. The first one is mirroring of a process address
space on behalf of a device. HMM will manage a secondary page table
for the device and keep it synchronized with the CPU page table. HMM
also does DMA mapping on behalf of the device (which would allow new
kinds of optimization further down the road (1)).

The second side is allowing process memory to be migrated to device memory
where device memory is unmappable by the CPU. Any CPU access will
trigger a special fault that will migrate the memory back.

From a design point of view not much changed since the last patchset (2).
Most of the changes are in small details of the API exposed to device
drivers. This version also includes device driver changes for Mellanox
hardware to use HMM as an alternative to ODP (which provides a subset
of HMM functionality specifically for RDMA devices). The long term plan
is to have HMM completely replace ODP.



Why do this ?

Mirroring a process address space is mandatory with OpenCL 2.0 and
with other GPU compute APIs. OpenCL 2.0 allows different levels of
implementation and currently only the lowest 2 are supported on
Linux. To implement the highest level, where CPU and GPU access
can happen concurrently and are cache coherent, HMM is needed, or
something providing the same functionality, for instance through
platform hardware.

Hardware solutions such as PCIE ATS/PASID are limited to mirroring
system memory and do not provide a way to migrate memory to device
memory (which offers significantly more bandwidth, up to 10 times
faster than regular system memory with a discrete GPU, and also has
lower latency than PCIE transactions).

Current CPUs with the GPU on the same die (AMD or Intel) use ATS/PASID
and, for Intel, a special level of cache (backed by a large pool of
fast memory).

For the foreseeable future, discrete GPUs will remain relevant as they
can have a larger quantity of faster memory than integrated GPUs.

Thus we believe HMM will allow leveraging discrete GPU memory in
a fashion transparent to the application, with minimum disruption
to the linux kernel mm code. Also HMM can work alongside hardware
solutions such as PCIE ATS/PASID (leaving the regular case to ATS/PASID
while HMM handles the migrated memory case).



Design :

Patches 1, 2, 3 and 4 augment the mmu notifier API with new
information to more efficiently mirror CPU page table updates.

The first side of HMM, process address space mirroring, is
implemented in patches 5 through 12. This uses a secondary page
table, in which HMM mirrors memory actively used by the device.
HMM does not take a reference on any of the pages; it uses the
mmu notifier API to track changes to the CPU page table and to
update the mirror page table. All this while providing a simple
API to device drivers.

To implement this we use a generic page table and not a radix
tree because we need to store more flags than radix allows and
we need to store dma addresses (sizeof(dma_addr_t) > sizeof(long)
on some platforms). All this is

Patch 14 passes down the lane the new child mm struct of a parent
process being forked. This is necessary to properly handle fork
when the parent process has migrated memory (more on that below).

Patch 15 allows getting the current memcg against which anonymous
memory of a process should be accounted. It is useful because in
HMM we do bulk transactions on the address space and we wish to avoid
storing a pointer to the memcg for each single page. All operations
dealing with memcg happen under the protection of the mmap
semaphore.


The second side of HMM, migration to device memory, is implemented
in patches 16 to 28. This only deals with anonymous memory. A new
special swap type is introduced. Migrated memory will have its
CPU page table entry set to this special swap entry (like the
migration entry, but unlike migration this is not a short lived
state).

All the remaining patches are then sets of functions that deal with
those special entries in the various code paths that might face them.

Memory migration requires several steps. First the memory is un-
mapped from the CPU and replaced with a special locked entry; the HMM
locked entry is a short lived transitional state, this is to
avoid two threads fighting over the migration entry.

Once unmapped, HMM can determine what can be migrated or not by
comparing mapcount and page count. If something holds a reference
then the page is not migrated and the CPU page table is restored.
The next step is to schedule the copy to device memory and update
the CPU page table to a regular HMM entry.

Migration back follows the same pattern: replace with the special
locked entry, then copy back, then update the CPU page table.


(1) Because HMM keeps a secondary page table which keeps track of
DMA mapping, there is room for new 

[PATCH 07/36] HMM: add per mirror page table v3.

2015-05-21 Thread j . glisse
From: Jérôme Glisse jgli...@redhat.com

This patch adds the per mirror page table. It also propagates CPU page
table updates to this per mirror page table using the mmu_notifier callbacks.
All updates are contextualized with an HMM event structure that conveys
all the information needed by the device driver to take proper actions (update
its own mmu to reflect changes and schedule proper flushing).

Core HMM is responsible for updating the per mirror page table once
the device driver is done with its update. Most importantly HMM will
properly propagate the HMM page table dirty bit to the underlying page.
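
A minimal driver-side update() callback could therefore look roughly like this
(hypothetical sketch; my_device_flush_range() stands in for whatever TLB or
engine flush the hardware requires):

	static int my_update(struct hmm_mirror *mirror, struct hmm_event *event)
	{
		/* make the device stop using [event->start, event->end) first */
		my_device_flush_range(mirror, event->start, event->end);

		/* core HMM then applies: new_pte = old_pte & event->pte_mask */
		return 0;
	}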

Changed since v1:
  - Removed unused fence code to defer it to later patches.

Changed since v2:
  - Use new bit flag helper for mirror page table manipulation.
  - Differentiate fork event with HMM_FORK from other events.

Signed-off-by: Jérôme Glisse jgli...@redhat.com
Signed-off-by: Sherry Cheung sche...@nvidia.com
Signed-off-by: Subhash Gutti sgu...@nvidia.com
Signed-off-by: Mark Hairgrove mhairgr...@nvidia.com
Signed-off-by: John Hubbard jhubb...@nvidia.com
Signed-off-by: Jatin Kumar jaku...@nvidia.com
---
 include/linux/hmm.h |  83 
 mm/hmm.c| 221 
 2 files changed, 304 insertions(+)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 175a757..573560b 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -46,6 +46,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/workqueue.h>
 #include <linux/mman.h>
+#include <linux/hmm_pt.h>
 
 
 struct hmm_device;
@@ -53,6 +54,39 @@ struct hmm_mirror;
 struct hmm;
 
 
+/*
+ * hmm_event - each event is described by a type associated with a struct.
+ */
+enum hmm_etype {
+   HMM_NONE = 0,
+   HMM_FORK,
+   HMM_ISDIRTY,
+   HMM_MIGRATE,
+   HMM_MUNMAP,
+   HMM_DEVICE_RFAULT,
+   HMM_DEVICE_WFAULT,
+   HMM_WRITE_PROTECT,
+};
+
+/* struct hmm_event - memory event information.
+ *
+ * @list: So HMM can keep track of all active events.
+ * @start: First address (inclusive).
+ * @end: Last address (exclusive).
+ * @pte_mask: HMM pte update mask (bit(s) that are still valid).
+ * @etype: Event type (munmap, migrate, truncate, ...).
+ * @backoff: Only meaningful for device page fault.
+ */
+struct hmm_event {
+   struct list_headlist;
+   unsigned long   start;
+   unsigned long   end;
+   dma_addr_t  pte_mask;
+   enum hmm_etype  etype;
+   boolbackoff;
+};
+
+
 /* hmm_device - Each device must register one and only one hmm_device.
  *
  * The hmm_device is the link btw HMM and each device driver.
@@ -76,6 +110,53 @@ struct hmm_device_ops {
 * callback against that mirror.
 */
void (*release)(struct hmm_mirror *mirror);
+
+   /* update() - update device mmu following an event.
+*
+* @mirror: The mirror that link process address space with the device.
+* @event: The event that triggered the update.
+* Returns: 0 on success or error code {-EIO, -ENOMEM}.
+*
+* Called to update device page table for a range of address.
+* The event type provide the nature of the update :
+*   - Range is no longer valid (munmap).
+*   - Range protection changes (mprotect, COW, ...).
+*   - Range is unmapped (swap, reclaim, page migration, ...).
+*   - Device page fault.
+*   - ...
+*
+* Though most device drivers only need to use pte_mask, as it reflects
+* the change that will happen to the HMM page table, ie :
+*   new_pte = old_pte & event->pte_mask;
+*
+* Device driver must not update the HMM mirror page table (except the
+* dirty bit see below). Core HMM will update HMM page table after the
+* update is done.
+*
+* Note that device must be cache coherent with system memory (snooping
+* in case of PCIE devices) so there should be no need for device to
+* flush anything.
+*
+* When write protection is turned on device driver must make sure the
+* hardware will no longer be able to write to the page otherwise file
+* system corruption may occur.
+*
+* The device must properly set the dirty bit using hmm_pte_set_bit() on
+* each page entry for memory that was written by the device. If the device
+* can not properly account for write access then the dirty bit must be
+* set unconditionally so that proper write back of file backed pages can
+* happen.
+*
+* The device driver must not fail lightly, any failure results in the
+* device process being killed.
+*
+* Return 0 on success, error value otherwise :
+* -ENOMEM Not enough memory for performing the operation.
+* -EIOSome input/output error with the device.
+*
+* All other return value trigger warning and 


[PATCH 15/36] memcg: export get_mem_cgroup_from_mm()

2015-05-21 Thread j . glisse
From: Jérôme Glisse jgli...@redhat.com

Useful for HMM when trying to uncharge freshly allocated anonymous
pages after an error inside the memory migration path.

Signed-off-by: Jérôme Glisse jgli...@redhat.com
---
 include/linux/memcontrol.h | 7 +++
 mm/memcontrol.c| 3 ++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6c89181..488748e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -93,6 +93,7 @@ bool task_in_mem_cgroup(struct task_struct *task, struct 
mem_cgroup *memcg);
 
 extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
 extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
+extern struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
 
 extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
 extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css);
@@ -275,6 +276,12 @@ static inline struct cgroup_subsys_state
return NULL;
 }
 
+
+static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
+{
+   return NULL;
+}
+
 static inline struct mem_cgroup *
 mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 14c2f20..360d9e0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -966,7 +966,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct 
*p)
return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
 }
 
-static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
+struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
struct mem_cgroup *memcg = NULL;
 
@@ -988,6 +988,7 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct 
mm_struct *mm)
rcu_read_unlock();
return memcg;
 }
+EXPORT_SYMBOL(get_mem_cgroup_from_mm);
 
 /**
  * mem_cgroup_iter - iterate over memory cgroup hierarchy
-- 
1.9.3

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 18/36] HMM: add new HMM page table flag (select flag).

2015-05-21 Thread j . glisse
From: Jérôme Glisse jgli...@redhat.com

When migrating memory the same array of HMM page table entries might be
used with several different devices. Add a new select flag so the current
device driver callback can know which entries are selected for the device.
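
Illustrative sketch (not from the patch) of how a driver callback might
honor the flag, assuming the generated hmm_pte_test_select() helper takes
a pointer to the entry like the other bit helpers; my_device_map_entry()
is an assumed driver hook.

	static void my_mirror_commit(struct my_device *dev,
				     dma_addr_t *hmm_pte, unsigned long npages)
	{
		unsigned long i;

		for (i = 0; i < npages; i++) {
			if (!hmm_pte_test_select(&hmm_pte[i]))
				continue;	/* entry not meant for this device */
			my_device_map_entry(dev, i, hmm_pte[i]);
		}
	}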

Signed-off-by: Jérôme Glisse jgli...@redhat.com
---
 include/linux/hmm_pt.h | 6 --
 mm/hmm.c   | 5 -
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/include/linux/hmm_pt.h b/include/linux/hmm_pt.h
index 26cfe5e..36f7e00 100644
--- a/include/linux/hmm_pt.h
+++ b/include/linux/hmm_pt.h
@@ -77,8 +77,9 @@ static inline unsigned long hmm_pde_pfn(dma_addr_t pde)
 #define HMM_PTE_VALID_DEV_BIT  0
 #define HMM_PTE_VALID_DMA_BIT  1
 #define HMM_PTE_VALID_PFN_BIT  2
-#define HMM_PTE_WRITE_BIT  3
-#define HMM_PTE_DIRTY_BIT  4
+#define HMM_PTE_SELECT 3
+#define HMM_PTE_WRITE_BIT  4
+#define HMM_PTE_DIRTY_BIT  5
 /*
 * Reserve some bits for device driver private flags. Note that these can only
  * be manipulated using the hmm_pte_*_bit() sets of helpers.
@@ -170,6 +171,7 @@ static inline bool hmm_pte_test_and_set_bit(dma_addr_t 
*ptep,
 HMM_PTE_BIT_HELPER(valid_dev, HMM_PTE_VALID_DEV_BIT)
 HMM_PTE_BIT_HELPER(valid_dma, HMM_PTE_VALID_DMA_BIT)
 HMM_PTE_BIT_HELPER(valid_pfn, HMM_PTE_VALID_PFN_BIT)
+HMM_PTE_BIT_HELPER(select, HMM_PTE_SELECT)
 HMM_PTE_BIT_HELPER(dirty, HMM_PTE_DIRTY_BIT)
 HMM_PTE_BIT_HELPER(write, HMM_PTE_WRITE_BIT)
 
diff --git a/mm/hmm.c b/mm/hmm.c
index 2143a58..761905a 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -757,6 +757,7 @@ static int hmm_mirror_fault_hpmd(struct hmm_mirror *mirror,
hmm_pte[i] = hmm_pte_from_pfn(pfn);
if (pmd_write(*pmdp))
hmm_pte_set_write(hmm_pte[i]);
+   hmm_pte_set_select(hmm_pte[i]);
} while (addr = next, pfn++, i++, addr != hmm_end);
hmm_pt_iter_directory_unlock(iter, &mirror->pt);
mirror_fault->addr = addr;
@@ -826,6 +827,7 @@ static int hmm_mirror_fault_pmd(pmd_t *pmdp,
hmm_pte[i] = hmm_pte_from_pfn(pte_pfn(*ptep));
if (pte_write(*ptep))
hmm_pte_set_write(hmm_pte[i]);
+   hmm_pte_set_select(hmm_pte[i]);
} while (addr = next, ptep++, i++, addr != hmm_end);
hmm_pt_iter_directory_unlock(iter, &mirror->pt);
pte_unmap(ptep - 1);
@@ -864,7 +866,8 @@ static int hmm_mirror_dma_map(struct hmm_mirror *mirror,
 
 again:
pte = ACCESS_ONCE(hmm_pte[i]);
-   if (!hmm_pte_test_valid_pfn(pte)) {
+   if (!hmm_pte_test_valid_pfn(pte) ||
+   !hmm_pte_test_select(pte)) {
if (!hmm_pte_test_valid_dma(pte)) {
ret = -ENOENT;
break;
-- 
1.9.3



[PATCH 13/36] HMM: DMA map memory on behalf of device driver.

2015-05-21 Thread j . glisse
From: Jérôme Glisse jgli...@redhat.com

Do the DMA mapping on behalf of the device as HMM is a good place
to perform this common task. Moreover in the future we hope to
add new infrastructure that would make DMA mapping more efficient
(lower overhead per page) by leveraging HMM data structure.
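
As background, a minimal sketch of the core DMA API pattern this builds on
(not the patch's exact code): map one page for the device, check for
failure, then encode the bus address with the hmm_pte_from_dma_addr()
helper introduced below.

	static int my_map_one_page(struct device *dev, struct page *page,
				   dma_addr_t *hmm_pte)
	{
		dma_addr_t dma_addr;

		dma_addr = dma_map_page(dev, page, 0, PAGE_SIZE, DMA_BIDIRECTIONAL);
		if (dma_mapping_error(dev, dma_addr))
			return -ENOMEM;
		*hmm_pte = hmm_pte_from_dma_addr(dma_addr);
		return 0;
	}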

Signed-off-by: Jérôme Glisse jgli...@redhat.com
---
 include/linux/hmm_pt.h |  11 +++
 mm/hmm.c   | 223 ++---
 2 files changed, 184 insertions(+), 50 deletions(-)

diff --git a/include/linux/hmm_pt.h b/include/linux/hmm_pt.h
index 330edb2..78a9073 100644
--- a/include/linux/hmm_pt.h
+++ b/include/linux/hmm_pt.h
@@ -176,6 +176,17 @@ static inline dma_addr_t hmm_pte_from_pfn(dma_addr_t pfn)
return (pfn << PAGE_SHIFT) | (1 << HMM_PTE_VALID_PFN_BIT);
 }
 
+static inline dma_addr_t hmm_pte_from_dma_addr(dma_addr_t dma_addr)
+{
+   return (dma_addr & HMM_PTE_DMA_MASK) | (1 << HMM_PTE_VALID_DMA_BIT);
+}
+
+static inline dma_addr_t hmm_pte_dma_addr(dma_addr_t pte)
+{
+   /* FIXME Use max dma addr instead of 0 ? */
+   return hmm_pte_test_valid_dma(pte) ? (pte & HMM_PTE_DMA_MASK) : 0;
+}
+
 static inline unsigned long hmm_pte_pfn(dma_addr_t pte)
 {
return hmm_pte_test_valid_pfn(pte) ? pte >> PAGE_SHIFT : 0;
diff --git a/mm/hmm.c b/mm/hmm.c
index 21fda9f..1533223 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -41,6 +41,7 @@
 #include <linux/mman.h>
 #include <linux/delay.h>
 #include <linux/workqueue.h>
+#include <linux/dma-mapping.h>
 
 #include "internal.h"
 
@@ -574,6 +575,46 @@ static inline int hmm_mirror_update(struct hmm_mirror 
*mirror,
return ret;
 }
 
+static void hmm_mirror_update_pte(struct hmm_mirror *mirror,
+ struct hmm_event *event,
+ struct hmm_pt_iter *iter,
+ struct mm_pt_iter *mm_iter,
+ struct page *page,
+ dma_addr_t *hmm_pte,
+ unsigned long addr)
+{
+   bool dirty = hmm_pte_test_and_clear_dirty(hmm_pte);
+
+   if (hmm_pte_test_valid_pfn(hmm_pte)) {
+   *hmm_pte &= event->pte_mask;
+   if (!hmm_pte_test_valid_pfn(hmm_pte))
+   hmm_pt_iter_directory_unref(iter, mirror->pt.llevel);
+   goto out;
+   }
+
+   if (!hmm_pte_test_valid_dma(hmm_pte))
+   return;
+
+   if (!hmm_pte_test_valid_dma(&event->pte_mask)) {
+   struct device *dev = mirror->device->dev;
+   dma_addr_t dma_addr;
+
+   dma_addr = hmm_pte_dma_addr(*hmm_pte);
+   dma_unmap_page(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+   }
+
+   *hmm_pte &= event->pte_mask;
+   if (!hmm_pte_test_valid_dma(hmm_pte))
+   hmm_pt_iter_directory_unref(iter, mirror->pt.llevel);
+
+out:
+   if (dirty) {
+   page = page ? : mm_pt_iter_page(mm_iter, addr);
+   if (page)
+   set_page_dirty(page);
+   }
+}
+
 static void hmm_mirror_update_pt(struct hmm_mirror *mirror,
 struct hmm_event *event,
 struct page *page)
@@ -605,19 +646,9 @@ static void hmm_mirror_update_pt(struct hmm_mirror *mirror,
do {
next = hmm_pt_level_next(&mirror->pt, addr, end,
 mirror->pt.llevel);
-   if (!hmm_pte_test_valid_pfn(hmm_pte))
-   continue;
-   if (hmm_pte_test_and_clear_dirty(hmm_pte) &&
-   hmm_pte_test_write(hmm_pte)) {
-   page = page ? : mm_pt_iter_page(mm_iter, addr);
-   if (page)
-   set_page_dirty(page);
-   page = NULL;
-   }
-   *hmm_pte &= event->pte_mask;
-   if (hmm_pte_test_valid_pfn(hmm_pte))
-   continue;
-   hmm_pt_iter_directory_unref(iter, mirror->pt.llevel);
+   hmm_mirror_update_pte(mirror, event, iter, mm_iter,
+ page, hmm_pte, addr);
+   page = NULL;
} while (addr = next, hmm_pte++, addr != end);
hmm_pt_iter_directory_unlock(iter, &mirror->pt);
}
@@ -697,12 +728,12 @@ static int hmm_mirror_fault_hpmd(struct hmm_mirror 
*mirror,
-   next = hmm_pt_level_next(&mirror->pt, addr, hmm_end,
-  mirror->pt.llevel);
 
-   if (!hmm_pte_test_valid_pfn(hmm_pte[i])) {
-   hmm_pte[i] = hmm_pte_from_pfn(pfn);
-   hmm_pt_iter_directory_ref(iter,
- 

[PATCH 14/36] fork: pass the dst vma to copy_page_range() and its sub-functions.

2015-05-21 Thread j . glisse
From: Jérôme Glisse jgli...@redhat.com

For HMM we will need to resort to the old way of allocating a new page
for anonymous memory when that anonymous memory has been migrated
to device memory.

This does not impact any process that does not use HMM through some
device driver. Only processes that migrate anonymous memory to device
memory with HMM will have to copy migrated pages on fork.

We do not expect this to be a common or advised thing to do, so we
resort to the simpler solution of allocating a new page. If this kind
of usage turns out to be important we will revisit ways to achieve
COW even for remote memory.

Signed-off-by: Jérôme Glisse jgli...@redhat.com
---
 include/linux/mm.h |  5 +++--
 kernel/fork.c  |  2 +-
 mm/memory.c| 33 +
 3 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index cf642d9..8923532 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1083,8 +1083,9 @@ int walk_page_range(unsigned long addr, unsigned long end,
 int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk);
 void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
-int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
-   struct vm_area_struct *vma);
+int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+   struct vm_area_struct *dst_vma,
+   struct vm_area_struct *vma);
 void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows);
 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
diff --git a/kernel/fork.c b/kernel/fork.c
index 4083be7..0bd5b59 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -492,7 +492,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct 
*oldmm)
rb_parent = tmp-vm_rb;
 
mm-map_count++;
-   retval = copy_page_range(mm, oldmm, mpnt);
+   retval = copy_page_range(mm, oldmm, tmp, mpnt);
 
if (tmp-vm_ops  tmp-vm_ops-open)
tmp-vm_ops-open(tmp);
diff --git a/mm/memory.c b/mm/memory.c
index 5a1131f..6497009 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -885,8 +885,10 @@ out_set_pte:
 }
 
 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-  pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
-  unsigned long addr, unsigned long end)
+ pmd_t *dst_pmd, pmd_t *src_pmd,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
 {
pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
@@ -947,9 +949,12 @@ again:
return 0;
 }
 
-static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct 
*src_mm,
-   pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
-   unsigned long addr, unsigned long end)
+static inline int copy_pmd_range(struct mm_struct *dst_mm,
+struct mm_struct *src_mm,
+pud_t *dst_pud, pud_t *src_pud,
+struct vm_area_struct *dst_vma,
+struct vm_area_struct *vma,
+unsigned long addr, unsigned long end)
 {
pmd_t *src_pmd, *dst_pmd;
unsigned long next;
@@ -974,15 +979,18 @@ static inline int copy_pmd_range(struct mm_struct 
*dst_mm, struct mm_struct *src
if (pmd_none_or_clear_bad(src_pmd))
continue;
if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
-   vma, addr, next))
+  dst_vma, vma, addr, next))
return -ENOMEM;
} while (dst_pmd++, src_pmd++, addr = next, addr != end);
return 0;
 }
 
-static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct 
*src_mm,
-   pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
-   unsigned long addr, unsigned long end)
+static inline int copy_pud_range(struct mm_struct *dst_mm,
+struct mm_struct *src_mm,
+pgd_t *dst_pgd, pgd_t *src_pgd,
+struct vm_area_struct *dst_vma,
+struct vm_area_struct *vma,
+unsigned long addr, unsigned long end)
 {
pud_t *src_pud, *dst_pud;
unsigned long next;
@@ -996,14 +1004,15 @@ static inline int copy_pud_range(struct mm_struct 
*dst_mm, struct mm_struct *src
if (pud_none_or_clear_bad(src_pud))
   

[PATCH 11/36] HMM: add discard range helper (to clear and free resources for a range).

2015-05-21 Thread j . glisse
From: Jérôme Glisse jgli...@redhat.com

A common use case is for a device driver to stop caring for a range of
addresses long before said range is munmapped by the userspace program. To
avoid keeping track of such ranges, provide a helper function that will free
HMM resources for a range of addresses.

NOTE THAT THE DEVICE DRIVER MUST MAKE SURE THE HARDWARE WILL NO LONGER ACCESS
THE RANGE BEFORE CALLING THIS HELPER !
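
Illustrative driver-side sketch (not from the patch); struct my_driver and
my_dev_stop_range() are assumed, the latter guaranteeing the hardware is
done with the range before HMM resources are released:

	static void my_driver_drop_range(struct my_driver *drv,
					 unsigned long start, unsigned long end)
	{
		my_dev_stop_range(drv, start, end);	/* hardware must be idle first */
		hmm_mirror_range_discard(&drv->mirror, start, end);
	}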

Signed-off-by: Jérôme Glisse jgli...@redhat.com
---
 include/linux/hmm.h |  3 +++
 mm/hmm.c| 24 
 2 files changed, 27 insertions(+)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index fdb1975..ec05df8 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -250,6 +250,9 @@ struct hmm_mirror {
 int hmm_mirror_register(struct hmm_mirror *mirror);
 void hmm_mirror_unregister(struct hmm_mirror *mirror);
 int hmm_mirror_fault(struct hmm_mirror *mirror, struct hmm_event *event);
+void hmm_mirror_range_discard(struct hmm_mirror *mirror,
+ unsigned long start,
+ unsigned long end);
 
 
 #endif /* CONFIG_HMM */
diff --git a/mm/hmm.c b/mm/hmm.c
index 8ec9ffa..4cab3f2 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -916,6 +916,30 @@ out:
 }
 EXPORT_SYMBOL(hmm_mirror_fault);
 
+/* hmm_mirror_range_discard() - discard a range of address.
+ *
+ * @mirror: The mirror struct.
+ * @start: Start address of the range to discard (inclusive).
+ * @end: End address of the range to discard (exclusive).
+ *
+ * Call when the device driver wants to stop mirroring a range of addresses and
+ * free any HMM resources associated with that range (including dma mapping if any).
+ *
+ * THIS FUNCTION ASSUMES THAT THE DRIVER ALREADY STOPPED USING THE RANGE OF
+ * ADDRESSES AND THUS DOES NOT PERFORM ANY SYNCHRONIZATION OR UPDATE WITH THE
+ * DRIVER TO INVALIDATE SAID RANGE.
+ */
+void hmm_mirror_range_discard(struct hmm_mirror *mirror,
+ unsigned long start,
+ unsigned long end)
+{
+   struct hmm_event event;
+
+   hmm_event_init(&event, mirror->hmm, start, end, HMM_MUNMAP);
+   hmm_mirror_update_pt(mirror, &event, NULL);
+}
+EXPORT_SYMBOL(hmm_mirror_range_discard);
+
 /* hmm_mirror_register() - register mirror against current process for a 
device.
  *
  * @mirror: The mirror struct being registered.
-- 
1.9.3



[PATCH 19/36] HMM: handle HMM device page table entry on mirror page table fault and update.

2015-05-21 Thread j . glisse
From: Jérôme Glisse jgli...@redhat.com

When faulting or updating the device page table, properly handle the case of
a device memory entry.

Signed-off-by: Jérôme Glisse jgli...@redhat.com
---
 mm/hmm.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/mm/hmm.c b/mm/hmm.c
index 761905a..e4585b7 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -613,6 +613,13 @@ static void hmm_mirror_update_pte(struct hmm_mirror 
*mirror,
goto out;
}
 
+   if (hmm_pte_test_valid_dev(hmm_pte)) {
+   *hmm_pte &= event->pte_mask;
+   if (!hmm_pte_test_valid_dev(hmm_pte))
+   hmm_pt_iter_directory_unref(iter, mirror->pt.llevel);
+   return;
+   }
+
if (!hmm_pte_test_valid_dma(hmm_pte))
return;
 
@@ -813,6 +820,13 @@ static int hmm_mirror_fault_pmd(pmd_t *pmdp,
do {
next = hmm_pt_level_next(&mirror->pt, addr, hmm_end,
 mirror->pt.llevel);
+
+   if (hmm_pte_test_valid_dev(hmm_pte[i])) {
+   if (write)
+   hmm_pte_set_write(hmm_pte[i]);
+   continue;
+   }
+
if (!pte_present(*ptep) || (write && !pte_write(*ptep))) {
ret = -ENOENT;
ptep++;
-- 
1.9.3



[PATCH 16/36] HMM: add special swap filetype for memory migrated to HMM device memory.

2015-05-21 Thread j . glisse
From: Jerome Glisse jgli...@redhat.com

When migrating anonymous memory from system memory to device memory
CPU ptes are replaced with a special HMM swap entry so that page fault,
get user page (gup), fork, ... are properly redirected to HMM helpers.

This patch only adds the new swap type entry and hooks HMM helper
functions inside the page fault and fork code paths.
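
Sketch of the fault-path shape this enables (illustrative wrapper, not the
patch's exact hunk): when the pte decodes to an HMM swap entry, hand the
fault over to the HMM helper instead of the normal swap-in path.

	static int my_handle_swap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
				      pmd_t *pmdp, unsigned long addr,
				      unsigned int flags, pte_t orig_pte)
	{
		swp_entry_t entry = pte_to_swp_entry(orig_pte);

		if (is_hmm_entry(entry))
			return hmm_handle_cpu_fault(mm, vma, pmdp, addr, flags, orig_pte);
		return 0;	/* fall back to the normal swap-in path */
	}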

Signed-off-by: Jérôme Glisse jgli...@redhat.com
Signed-off-by: Sherry Cheung sche...@nvidia.com
Signed-off-by: Subhash Gutti sgu...@nvidia.com
Signed-off-by: Mark Hairgrove mhairgr...@nvidia.com
Signed-off-by: John Hubbard jhubb...@nvidia.com
Signed-off-by: Jatin Kumar jaku...@nvidia.com
---
 include/linux/hmm.h | 34 ++
 include/linux/swap.h| 12 +++-
 include/linux/swapops.h | 43 ++-
 mm/hmm.c| 21 +
 mm/memory.c | 22 ++
 5 files changed, 130 insertions(+), 2 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 186f497..f243eb5 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -257,6 +257,40 @@ void hmm_mirror_range_dirty(struct hmm_mirror *mirror,
unsigned long start,
unsigned long end);
 
+int hmm_handle_cpu_fault(struct mm_struct *mm,
+   struct vm_area_struct *vma,
+   pmd_t *pmdp, unsigned long addr,
+   unsigned flags, pte_t orig_pte);
+
+int hmm_mm_fork(struct mm_struct *src_mm,
+   struct mm_struct *dst_mm,
+   struct vm_area_struct *dst_vma,
+   pmd_t *dst_pmd,
+   unsigned long start,
+   unsigned long end);
+
+#else /* CONFIG_HMM */
+
+static inline int hmm_handle_mm_fault(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ pmd_t *pmdp, unsigned long addr,
+ unsigned flags, pte_t orig_pte)
+{
+   return VM_FAULT_SIGBUS;
+}
+
+static inline int hmm_mm_fork(struct mm_struct *src_mm,
+ struct mm_struct *dst_mm,
+ struct vm_area_struct *dst_vma,
+ pmd_t *dst_pmd,
+ unsigned long start,
+ unsigned long end)
+{
+   BUG();
+   return -ENOMEM;
+}
 
 #endif /* CONFIG_HMM */
+
+
 #endif
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0428e4c..89b9dda 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -70,8 +70,18 @@ static inline int current_is_kswapd(void)
 #define SWP_HWPOISON_NUM 0
 #endif
 
+/*
+ * HMM (heterogeneous memory management) used when data is in remote memory.
+ */
+#ifdef CONFIG_HMM
+#define SWP_HMM_NUM 1
+#define SWP_HMM    (MAX_SWAPFILES + SWP_MIGRATION_NUM + SWP_HWPOISON_NUM)
+#else
+#define SWP_HMM_NUM 0
+#endif
+
 #define MAX_SWAPFILES \
-   ((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
+   ((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - SWP_HMM_NUM)
 
 /*
  * Magic header for a swap area. The first part of the union is
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index cedf3d3..934359f 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -190,7 +190,7 @@ static inline int is_hwpoison_entry(swp_entry_t swp)
 }
 #endif
 
-#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION)
+#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION) || defined(CONFIG_HMM)
 static inline int non_swap_entry(swp_entry_t entry)
 {
return swp_type(entry) >= MAX_SWAPFILES;
@@ -202,4 +202,45 @@ static inline int non_swap_entry(swp_entry_t entry)
 }
 #endif
 
+#ifdef CONFIG_HMM
+static inline swp_entry_t make_hmm_entry(void)
+{
+   /* We do not store anything inside the CPU page table entry (pte). */
+   return swp_entry(SWP_HMM, 0);
+}
+
+static inline swp_entry_t make_hmm_entry_locked(void)
+{
+   /* We do not store anything inside the CPU page table entry (pte). */
+   return swp_entry(SWP_HMM, 1);
+}
+
+static inline swp_entry_t make_hmm_entry_poisonous(void)
+{
+   /* We do not store anything inside the CPU page table entry (pte). */
+   return swp_entry(SWP_HMM, 2);
+}
+
+static inline int is_hmm_entry(swp_entry_t entry)
+{
+   return (swp_type(entry) == SWP_HMM);
+}
+
+static inline int is_hmm_entry_locked(swp_entry_t entry)
+{
+   return (swp_type(entry) == SWP_HMM) && (swp_offset(entry) == 1);
+}
+
+static inline int is_hmm_entry_poisonous(swp_entry_t entry)
+{
+   return (swp_type(entry) == SWP_HMM) && (swp_offset(entry) == 2);
+}
+#else /* CONFIG_HMM */
+static inline int is_hmm_entry(swp_entry_t swp)
+{
+   return 0;
+}
+#endif /* CONFIG_HMM */
+
+
 #endif /* _LINUX_SWAPOPS_H */
diff --git a/mm/hmm.c b/mm/hmm.c
index 1533223..2143a58 

[PATCH 05/36] HMM: introduce heterogeneous memory management v3.

2015-05-21 Thread j . glisse
From: Jérôme Glisse jgli...@redhat.com

This patch only introduces the core HMM functions for registering a new
mirror and stopping a mirror as well as HMM device registering and
unregistering.

The lifecycle of the HMM object is handled differently than the one of
mmu_notifier because unlike mmu_notifier there can be concurrent
calls from both mm code to HMM code and/or from device driver code
to HMM code. Moreover the lifetime of HMM can be uncorrelated from the
lifetime of the process that is being mirrored (the GPU might take
longer to clean up).
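
Hedged sketch of the expected driver-side setup (the struct layout is an
assumption on top of this patch; only hmm_mirror_register(struct hmm_mirror
*mirror) itself comes from the series):

	static int my_driver_mirror_current(struct my_driver *drv)
	{
		drv->mirror.device = &drv->hmm_device;	/* assumed field */
		return hmm_mirror_register(&drv->mirror);
	}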

Changed since v1:
  - Updated comment of hmm_device_register().

Changed since v2:
  - Expose struct hmm for easy access to mm struct.
  - Simplify hmm_mirror_register() arguments.
  - Removed the device name.
  - Refcount the mirror struct internally to HMM, allowing to get
rid of the srcu and making the device driver callback error
handling simpler.
  - Safe to call hmm_mirror_unregister() several times.
  - Rework the mmu_notifier unregistration and release callback.

Signed-off-by: Jérôme Glisse jgli...@redhat.com
Signed-off-by: Sherry Cheung sche...@nvidia.com
Signed-off-by: Subhash Gutti sgu...@nvidia.com
Signed-off-by: Mark Hairgrove mhairgr...@nvidia.com
Signed-off-by: John Hubbard jhubb...@nvidia.com
Signed-off-by: Jatin Kumar jaku...@nvidia.com
cc: linux-r...@vger.kernel.org
---
 MAINTAINERS  |   7 +
 include/linux/hmm.h  | 164 +
 include/linux/mm.h   |  11 ++
 include/linux/mm_types.h |  14 ++
 kernel/fork.c|   2 +
 mm/Kconfig   |  15 ++
 mm/Makefile  |   1 +
 mm/hmm.c | 370 +++
 8 files changed, 584 insertions(+)
 create mode 100644 include/linux/hmm.h
 create mode 100644 mm/hmm.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 78ea7b6..2f2a2be 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4730,6 +4730,13 @@ F:   include/uapi/linux/if_hippi.h
 F: net/802/hippi.c
 F: drivers/net/hippi/
 
+HMM - Heterogeneous Memory Management
+M: Jérôme Glisse jgli...@redhat.com
+L: linux...@kvack.org
+S: Maintained
+F: mm/hmm.c
+F: include/linux/hmm.h
+
 HOST AP DRIVER
 M: Jouni Malinen j...@w1.fi
 L: hos...@shmoo.com (subscribers-only)
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
new file mode 100644
index 000..175a757
--- /dev/null
+++ b/include/linux/hmm.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright 2013 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Jérôme Glisse jgli...@redhat.com
+ */
+/* This is heterogeneous memory management (hmm). In a nutshell this provides
+ * an API to mirror a process address space on a device which has its own mmu
+ * using its own page table for the process. It supports everything except
+ * special vma.
+ *
+ * Mandatory hardware features :
+ *   - An mmu with pagetable.
+ *   - Read only flag per cpu page.
+ *   - Page fault ie hardware must stop and wait for kernel to service fault.
+ *
+ * Optional hardware features :
+ *   - Dirty bit per cpu page.
+ *   - Access bit per cpu page.
+ *
+ * The hmm code handles all the interfacing with the core kernel mm code and
+ * provides a simple API. It does support migrating system memory to device
+ * memory and handles migration back to system memory on cpu page fault.
+ *
+ * Migrated memory is considered as swapped from the cpu and core mm code
+ * point of view.
+ */
+#ifndef _HMM_H
+#define _HMM_H
+
+#ifdef CONFIG_HMM
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <linux/mm_types.h>
+#include <linux/mmu_notifier.h>
+#include <linux/workqueue.h>
+#include <linux/mman.h>
+
+
+struct hmm_device;
+struct hmm_mirror;
+struct hmm;
+
+
+/* hmm_device - Each device must register one and only one hmm_device.
+ *
+ * The hmm_device is the link between HMM and each device driver.
+ */
+
+/* struct hmm_device_operations - HMM device operation callback
+ */
+struct hmm_device_ops {
+   /* release() - mirror must stop using the address space.
+*
+* @mirror: The mirror that link process address space with the device.
+*
+* When this is called, the device driver must kill all device threads using
+* this mirror. Also, this callback is the last thing called by HMM and
+* HMM will not access the mirror struct after this call (ie no more
+* dereference of it so it is safe for the device driver to free it).
+* It is called either from :
+*   - mm dying (all process 

[PATCH 06/36] HMM: add HMM page table v2.

2015-05-21 Thread j . glisse
From: Jérôme Glisse jgli...@redhat.com

Heterogeneous memory management's main purpose is to mirror a process address
space. To do so it must maintain a secondary page table that is used by the
device driver to program the device or build a device specific page table.

Radix tree can not be used to create this secondary page table because HMM
needs more flags than RADIX_TREE_MAX_TAGS (while this could be increased we
believe HMM will require so many flags that the cost would become prohibitive
to other users of radix tree).

Moreover radix tree is built around long but for HMM we need to store dma
addresses and on some platforms sizeof(dma_addr_t) > sizeof(long). Thus radix
tree is unsuitable to fulfill HMM requirements, hence why we introduce this
code which allows to create a page table that can grow and shrink dynamically.

The design is very close to the CPU page table as it reuses some of the same
features, such as the spinlock embedded in struct page.
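
Illustrative use of the entry format helpers added below (assuming, as the
hunks suggest, that the generated bit helpers take a pointer to the entry):

	static unsigned long my_mirror_one_page(struct page *page, dma_addr_t *ptep)
	{
		dma_addr_t pte = hmm_pte_from_pfn(page_to_pfn(page));

		hmm_pte_set_write(&pte);	/* mirror gets write permission */
		*ptep = pte;
		return hmm_pte_test_valid_pfn(&pte) ? hmm_pte_pfn(pte) : 0;
	}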

Changed since v1:
  - Use PAGE_SHIFT as shift value to reserve low bits for private device
specific flags. This is to allow device drivers to use some of the
lower bits for their own device specific purpose.
  - Add a set of helpers for atomically clearing, setting and testing bits on
a dma_addr_t pointer. Atomicity being useful only for the dirty bit.
  - Differentiate between DMA mapped entries and non mapped entries (pfn).
  - Split page directory entry and page table entry helpers.

Signed-off-by: Jérôme Glisse jgli...@redhat.com
Signed-off-by: Sherry Cheung sche...@nvidia.com
Signed-off-by: Subhash Gutti sgu...@nvidia.com
Signed-off-by: Mark Hairgrove mhairgr...@nvidia.com
Signed-off-by: John Hubbard jhubb...@nvidia.com
Signed-off-by: Jatin Kumar jaku...@nvidia.com
---
 MAINTAINERS|   2 +
 include/linux/hmm_pt.h | 380 +++
 mm/Makefile|   2 +-
 mm/hmm_pt.c| 425 +
 4 files changed, 808 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/hmm_pt.h
 create mode 100644 mm/hmm_pt.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 2f2a2be..8cd0aa7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4736,6 +4736,8 @@ L:linux...@kvack.org
 S: Maintained
 F: mm/hmm.c
 F: include/linux/hmm.h
+F: mm/hmm_pt.c
+F: include/linux/hmm_pt.h
 
 HOST AP DRIVER
 M: Jouni Malinen j...@w1.fi
diff --git a/include/linux/hmm_pt.h b/include/linux/hmm_pt.h
new file mode 100644
index 000..330edb2
--- /dev/null
+++ b/include/linux/hmm_pt.h
@@ -0,0 +1,380 @@
+/*
+ * Copyright 2014 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Jérôme Glisse jgli...@redhat.com
+ */
+/*
+ * This provides a set of helpers for the HMM page table. See include/linux/hmm.h
+ * for a description of what HMM is.
+ *
+ * HMM page table relies on a locking mechanism similar to the CPU page table
+ * for page table updates. It uses the spinlock embedded inside the struct page
+ * to protect changes to a page table directory, which should minimize lock
+ * contention for concurrent updates.
+ *
+ * It does also provide a directory tree protection mechanism. Unlike CPU page
+ * table there is no mmap semaphore to protect directory tree from removal and
+ * this is done intentionally so that concurrent removal/insertion of directory
+ * inside the tree can happen.
+ *
+ * So anyone walking down the page table must protect the directories it
+ * traverses so they are not freed by some other thread. This is done by using
+ * a reference counter for each directory. Before traversing a directory a
+ * reference is taken and once traversal is done the reference is dropped.
+ *
+ * A directory entry dereference and refcount increment of sub-directory page
+ * must happen in a critical rcu section so that directory page removal can
+ * gracefully wait for all possible other threads that might have dereferenced
+ * the directory.
+ */
+#ifndef _HMM_PT_H
+#define _HMM_PT_H
+
+/*
+ * The HMM page table entry does not reflect any specific hardware. It is just
+ * a common entry format used by HMM internally and exposed to HMM users so
+ * they can extract information out of the HMM page table.
+ *
+ * Device drivers should only rely on the helpers and should not traverse the
+ * page table themselves.
+ */
+#define HMM_PT_MAX_LEVEL   6
+
+#define HMM_PDE_VALID_BIT  0
+#define HMM_PDE_VALID  (1 << HMM_PDE_VALID_BIT)
+#define HMM_PDE_PFN_MASK   (~((dma_addr_t)((1 << PAGE_SHIFT) - 1)))
+
+static inline dma_addr_t hmm_pde_from_pfn(dma_addr_t pfn)

[PATCH 01/36] mmu_notifier: add event information to address invalidation v7

2015-05-21 Thread j . glisse
From: Jérôme Glisse jgli...@redhat.com

The event information will be useful for new users of the mmu_notifier API.
The event argument differentiates between a vma disappearing, a page
being write protected or simply a page being unmapped. This allows new
users to take different paths for different events; for instance on unmap
the resources used to track a vma are still valid and should stay around,
while if the event is saying that a vma is being destroyed it means that any
resources used to track this vma can be freed.
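
Hedged sketch of a listener using the new argument (only the MMU_MIGRATE
and MMU_MUNMAP names appear in this changelog; the callback shape matches
the updated invalidate_range_start() signature, the branch bodies are left
as comments):

	static void my_mn_invalidate_range_start(struct mmu_notifier *mn,
						 struct mm_struct *mm,
						 unsigned long start,
						 unsigned long end,
						 enum mmu_event event)
	{
		if (event == MMU_MUNMAP) {
			/* the vma is going away: tracking state for
			 * [start, end) can be torn down for good */
			return;
		}
		/* MMU_MIGRATE and friends: pages change but the
		 * address range itself stays valid */
	}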

Changed since v1:
  - renamed action into event (updated commit message too).
  - simplified the event names and clarified their usage
also documenting what expectations the listener can have in
respect to each event.

Changed since v2:
  - Avoid crazy name.
  - Do not move code that do not need to move.

Changed since v3:
  - Separate huge page split from mlock/munlock and softdirty.

Changed since v4:
  - Rebase (no other changes).

Changed since v5:
  - Typo fix.
  - Changed zap_page_range from MMU_MUNMAP to MMU_MIGRATE to reflect the
fact that the address range is still valid, just the pages backing it
are no longer.

Changed since v6:
  - try_to_unmap_one() only invalidate when doing migration.
  - Differentiate fork from other case.

Signed-off-by: Jérôme Glisse jgli...@redhat.com
Reviewed-by: Rik van Riel r...@redhat.com
---
 drivers/gpu/drm/i915/i915_gem_userptr.c |   3 +-
 drivers/gpu/drm/radeon/radeon_mn.c  |   3 +-
 drivers/infiniband/core/umem_odp.c  |   9 ++-
 drivers/iommu/amd_iommu_v2.c|   3 +-
 drivers/misc/sgi-gru/grutlbpurge.c  |   9 ++-
 drivers/xen/gntdev.c|   9 ++-
 fs/proc/task_mmu.c  |   6 +-
 include/linux/mmu_notifier.h| 135 ++--
 kernel/events/uprobes.c |  10 ++-
 mm/huge_memory.c|  39 ++---
 mm/hugetlb.c|  23 +++---
 mm/ksm.c|  18 +++--
 mm/madvise.c|   4 +-
 mm/memory.c |  27 ---
 mm/migrate.c|   9 ++-
 mm/mmu_notifier.c   |  28 ---
 mm/mprotect.c   |   6 +-
 mm/mremap.c |   6 +-
 mm/rmap.c   |   4 +-
 virt/kvm/kvm_main.c |  12 ++-
 20 files changed, 261 insertions(+), 102 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c 
b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 4039ede..452e9b1 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -132,7 +132,8 @@ restart:
 static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier 
*_mn,
   struct mm_struct *mm,
   unsigned long start,
-  unsigned long end)
+  unsigned long end,
+  enum mmu_event event)
 {
struct i915_mmu_notifier *mn = container_of(_mn, struct 
i915_mmu_notifier, mn);
struct interval_tree_node *it = NULL;
diff --git a/drivers/gpu/drm/radeon/radeon_mn.c 
b/drivers/gpu/drm/radeon/radeon_mn.c
index eef006c..3a9615b 100644
--- a/drivers/gpu/drm/radeon/radeon_mn.c
+++ b/drivers/gpu/drm/radeon/radeon_mn.c
@@ -121,7 +121,8 @@ static void radeon_mn_release(struct mmu_notifier *mn,
 static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
 struct mm_struct *mm,
 unsigned long start,
-unsigned long end)
+unsigned long end,
+enum mmu_event event)
 {
struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn);
struct interval_tree_node *it;
diff --git a/drivers/infiniband/core/umem_odp.c 
b/drivers/infiniband/core/umem_odp.c
index 40becdb..6ed69fa 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -165,7 +165,8 @@ static int invalidate_page_trampoline(struct ib_umem *item, 
u64 start,
 
 static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn,
 struct mm_struct *mm,
-unsigned long address)
+unsigned long address,
+enum mmu_event event)
 {
struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
 
@@ -192,7 +193,8 @@ static int invalidate_range_start_trampoline(struct ib_umem 
*item, u64 start,
 static void ib_umem_notifier_invalidate_range_start(struct 

[PATCH 02/36] mmu_notifier: keep track of active invalidation ranges v3

2015-05-21 Thread j . glisse
From: Jérôme Glisse jgli...@redhat.com

The mmu_notifier_invalidate_range_start() and mmu_notifier_invalidate_range_end()
can be considered as forming an "atomic" section from the cpu page table update
point of view. Between these two functions the cpu page table content is
unreliable for the address range being invalidated.

Current users such as kvm need to know when they can trust the content of the cpu
page table. This becomes even more important to new users of the mmu_notifier
api (such as HMM or ODP).

This patch uses a structure defined at all call sites of invalidate_range_start()
that is added to a list for the duration of the invalidation. It adds two new
helpers to allow querying if a range is being invalidated or to wait for a range
to become valid.

For proper synchronization, users must block new range invalidation from inside
their invalidate_range_start() callback, before calling the helper functions.
Otherwise there is no guarantee that a new range invalidation will not be added
after the call to the helper function to query for existing ranges.

Changed since v1:
  - Fix a possible deadlock in mmu_notifier_range_wait_valid()

Changed since v2:
  - Add the range to invalid range list before calling ->range_start().
  - Del the range from invalid range list after calling ->range_end().
  - Remove useless list initialization.

Signed-off-by: Jérôme Glisse jgli...@redhat.com
Reviewed-by: Rik van Riel r...@redhat.com
Reviewed-by: Haggai Eran hagg...@mellanox.com
---
 drivers/gpu/drm/i915/i915_gem_userptr.c |  9 ++--
 drivers/gpu/drm/radeon/radeon_mn.c  | 14 +++---
 drivers/infiniband/core/umem_odp.c  | 16 +++
 drivers/misc/sgi-gru/grutlbpurge.c  | 15 +++
 drivers/xen/gntdev.c| 15 ---
 fs/proc/task_mmu.c  | 11 +++--
 include/linux/mmu_notifier.h| 55 ---
 kernel/events/uprobes.c | 13 +++---
 mm/huge_memory.c| 78 ++--
 mm/hugetlb.c| 55 ---
 mm/ksm.c| 28 +---
 mm/madvise.c| 20 -
 mm/memory.c | 72 +-
 mm/migrate.c| 36 +++
 mm/mmu_notifier.c   | 79 -
 mm/mprotect.c   | 18 
 mm/mremap.c | 14 +++---
 virt/kvm/kvm_main.c | 10 ++---
 18 files changed, 302 insertions(+), 256 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c 
b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 452e9b1..80fe72a 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -131,16 +131,15 @@ restart:
 
 static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier 
*_mn,
   struct mm_struct *mm,
-  unsigned long start,
-  unsigned long end,
-  enum mmu_event event)
+  const struct 
mmu_notifier_range *range)
 {
struct i915_mmu_notifier *mn = container_of(_mn, struct 
i915_mmu_notifier, mn);
struct interval_tree_node *it = NULL;
-   unsigned long next = start;
+   unsigned long next = range->start;
 unsigned long serial = 0;
+   /* interval ranges are inclusive, but invalidate range is exclusive */
+   unsigned long end = range->end - 1, start = range->start;
 
-   end--; /* interval ranges are inclusive, but invalidate range is 
exclusive */
while (next < end) {
struct drm_i915_gem_object *obj = NULL;
 
diff --git a/drivers/gpu/drm/radeon/radeon_mn.c 
b/drivers/gpu/drm/radeon/radeon_mn.c
index 3a9615b..24898bf 100644
--- a/drivers/gpu/drm/radeon/radeon_mn.c
+++ b/drivers/gpu/drm/radeon/radeon_mn.c
@@ -112,34 +112,30 @@ static void radeon_mn_release(struct mmu_notifier *mn,
  *
  * @mn: our notifier
  * @mn: the mm this callback is about
- * @start: start of updated range
- * @end: end of updated range
+ * @range: Address range information.
  *
  * We block for all BOs between start and end to be idle and
  * unmap them by move them into system domain again.
  */
 static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
 struct mm_struct *mm,
-unsigned long start,
-unsigned long end,
-enum mmu_event event)
+const struct mmu_notifier_range 
*range)
 {
struct radeon_mn *rmn = container_of(mn, struct radeon_mn, 

[PATCH 17/36] HMM: add new HMM page table flag (valid device memory).

2015-05-21 Thread j . glisse
From: Jérôme Glisse jgli...@redhat.com

For memory migrated to device we need a new type of memory entry.
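
Illustrative sketch (not from the patch) of encoding and decoding such an
entry with the helpers added below, assuming the generated test helper
takes a pointer to the entry:

	static dma_addr_t my_decode_dev_entry(dma_addr_t dev_addr)
	{
		dma_addr_t pte = hmm_pte_from_dev_addr(dev_addr);

		if (hmm_pte_test_valid_dev(&pte))
			return hmm_pte_dev_addr(pte);
		return (dma_addr_t)-1UL;	/* matches the helper's "invalid" value */
	}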

Signed-off-by: Jérôme Glisse jgli...@redhat.com
Signed-off-by: Sherry Cheung sche...@nvidia.com
Signed-off-by: Subhash Gutti sgu...@nvidia.com
Signed-off-by: Mark Hairgrove mhairgr...@nvidia.com
Signed-off-by: John Hubbard jhubb...@nvidia.com
Signed-off-by: Jatin Kumar jaku...@nvidia.com
---
 include/linux/hmm_pt.h | 24 +++-
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/include/linux/hmm_pt.h b/include/linux/hmm_pt.h
index 78a9073..26cfe5e 100644
--- a/include/linux/hmm_pt.h
+++ b/include/linux/hmm_pt.h
@@ -74,10 +74,11 @@ static inline unsigned long hmm_pde_pfn(dma_addr_t pde)
  * In the first case the device driver must ignore any pfn entry as they might
  * show as transient state while HMM is mapping the page.
  */
-#define HMM_PTE_VALID_DMA_BIT  0
-#define HMM_PTE_VALID_PFN_BIT  1
-#define HMM_PTE_WRITE_BIT  2
-#define HMM_PTE_DIRTY_BIT  3
+#define HMM_PTE_VALID_DEV_BIT  0
+#define HMM_PTE_VALID_DMA_BIT  1
+#define HMM_PTE_VALID_PFN_BIT  2
+#define HMM_PTE_WRITE_BIT  3
+#define HMM_PTE_DIRTY_BIT  4
 /*
 * Reserve some bits for device driver private flags. Note that these can only
  * be manipulated using the hmm_pte_*_bit() sets of helpers.
@@ -85,7 +86,7 @@ static inline unsigned long hmm_pde_pfn(dma_addr_t pde)
  * WARNING ONLY SET/CLEAR THOSE FLAG ON PTE ENTRY THAT HAVE THE VALID BIT SET
  * AS OTHERWISE ANY BIT SET BY THE DRIVER WILL BE OVERWRITTEN BY HMM.
  */
-#define HMM_PTE_HW_SHIFT   4
+#define HMM_PTE_HW_SHIFT   8
 
 #define HMM_PTE_PFN_MASK   (~((dma_addr_t)((1 << PAGE_SHIFT) - 1)))
 #define HMM_PTE_DMA_MASK   (~((dma_addr_t)((1 << PAGE_SHIFT) - 1)))
@@ -166,6 +167,7 @@ static inline bool hmm_pte_test_and_set_bit(dma_addr_t 
*ptep,
HMM_PTE_TEST_AND_CLEAR_BIT(name, bit)\
HMM_PTE_TEST_AND_SET_BIT(name, bit)
 
+HMM_PTE_BIT_HELPER(valid_dev, HMM_PTE_VALID_DEV_BIT)
 HMM_PTE_BIT_HELPER(valid_dma, HMM_PTE_VALID_DMA_BIT)
 HMM_PTE_BIT_HELPER(valid_pfn, HMM_PTE_VALID_PFN_BIT)
 HMM_PTE_BIT_HELPER(dirty, HMM_PTE_DIRTY_BIT)
@@ -176,11 +178,23 @@ static inline dma_addr_t hmm_pte_from_pfn(dma_addr_t pfn)
return (pfn << PAGE_SHIFT) | (1 << HMM_PTE_VALID_PFN_BIT);
 }
 
+static inline dma_addr_t hmm_pte_from_dev_addr(dma_addr_t dma_addr)
+{
+   return (dma_addr & HMM_PTE_DMA_MASK) | (1 << HMM_PTE_VALID_DEV_BIT);
+}
+
 static inline dma_addr_t hmm_pte_from_dma_addr(dma_addr_t dma_addr)
 {
return (dma_addr & HMM_PTE_DMA_MASK) | (1 << HMM_PTE_VALID_DMA_BIT);
 }
 
+static inline dma_addr_t hmm_pte_dev_addr(dma_addr_t pte)
+{
+   /* FIXME Use max dma addr instead of 0 ? */
+   return hmm_pte_test_valid_dev(pte) ? (pte & HMM_PTE_DMA_MASK) :
+ (dma_addr_t)-1UL;
+}
+
 static inline dma_addr_t hmm_pte_dma_addr(dma_addr_t pte)
 {
/* FIXME Use max dma addr instead of 0 ? */
-- 
1.9.3



[PATCH 12/36] HMM: add dirty range helper (to toggle dirty bit inside mirror page table).

2015-05-21 Thread j . glisse
From: Jérôme Glisse jgli...@redhat.com

The device driver must properly toggle the dirty bit inside the mirror page
table so dirtiness is properly accounted for when core mm code needs to know.
Provide a simple helper to toggle that bit for a range of addresses.
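
Hypothetical driver write-back path (struct my_driver and
my_dev_flush_writes() are assumed): the device wrote through its mapping of
[start, end), so mark the mirrored range dirty before core mm looks at it.

	static void my_driver_writeback(struct my_driver *drv,
					unsigned long start, unsigned long end)
	{
		my_dev_flush_writes(drv, start, end);	/* assumed driver hook */
		hmm_mirror_range_dirty(&drv->mirror, start, end);
	}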

Signed-off-by: Jérôme Glisse jgli...@redhat.com
---
 include/linux/hmm.h |  3 +++
 mm/hmm.c| 47 +++
 2 files changed, 50 insertions(+)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index ec05df8..186f497 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -253,6 +253,9 @@ int hmm_mirror_fault(struct hmm_mirror *mirror, struct 
hmm_event *event);
 void hmm_mirror_range_discard(struct hmm_mirror *mirror,
  unsigned long start,
  unsigned long end);
+void hmm_mirror_range_dirty(struct hmm_mirror *mirror,
+   unsigned long start,
+   unsigned long end);
 
 
 #endif /* CONFIG_HMM */
diff --git a/mm/hmm.c b/mm/hmm.c
index 4cab3f2..21fda9f 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -940,6 +940,53 @@ void hmm_mirror_range_discard(struct hmm_mirror *mirror,
 }
 EXPORT_SYMBOL(hmm_mirror_range_discard);
 
+/* hmm_mirror_range_dirty() - toggle dirty bit for a range of address.
+ *
+ * @mirror: The mirror struct.
+ * @start: Start address of the range to discard (inclusive).
+ * @end: End address of the range to discard (exclusive).
+ *
+ * Call when the device driver wants to toggle the dirty bit for a range of
+ * addresses. Useful when the device driver just wants to toggle the bit for a
+ * whole range without walking the mirror page table itself.
+ *
+ * Note this function does not directly dirty the page behind an address, but
+ * this will happen once the address is invalidated or discarded by the device
+ * driver or core mm code.
+ */
+void hmm_mirror_range_dirty(struct hmm_mirror *mirror,
+   unsigned long start,
+   unsigned long end)
+{
+   struct hmm_pt_iter iter;
+   unsigned long addr;
+
+   hmm_pt_iter_init(&iter);
+   for (addr = start; addr != end;) {
+   unsigned long cend, next;
+   dma_addr_t *hmm_pte;
+
+   hmm_pte = hmm_pt_iter_update(&iter, &mirror->pt, addr);
+   if (!hmm_pte) {
+   addr = hmm_pt_iter_next(&iter, &mirror->pt,
+   addr, end);
+   continue;
+   }
+   cend = hmm_pt_level_next(&mirror->pt, addr, end,
+mirror->pt.llevel - 1);
+   do {
+   next = hmm_pt_level_next(&mirror->pt, addr, cend,
+mirror->pt.llevel);
+   if (!hmm_pte_test_valid_pfn(hmm_pte) ||
+   !hmm_pte_test_write(hmm_pte))
+   continue;
+   hmm_pte_set_dirty(hmm_pte);
+   } while (addr = next, hmm_pte++, addr != cend);
+   }
+   hmm_pt_iter_fini(&iter, &mirror->pt);
+}
+EXPORT_SYMBOL(hmm_mirror_range_dirty);
+
 /* hmm_mirror_register() - register mirror against current process for a 
device.
  *
  * @mirror: The mirror struct being registered.
-- 
1.9.3



[PATCH 4/6] HMM: add HMM page table.

2015-01-05 Thread j . glisse
From: Jérôme Glisse 

Heterogeneous memory management's main purpose is to mirror a process address
space. To do so it must maintain a secondary page table that is used by the
device driver to program the device or build a device specific page table.

Radix tree can not be used to create this secondary page table because HMM
needs more flags than RADIX_TREE_MAX_TAGS (while this could be increased we
believe HMM will require so many flags that the cost would become prohibitive
to other users of radix tree).

Moreover radix tree is built around long but for HMM we need to store dma
addresses and on some platforms sizeof(dma_addr_t) > sizeof(long). Thus radix
tree is unsuitable to fulfill HMM requirements, hence why we introduce this
code which allows to create a page table that can grow and shrink dynamically.

The design is very close to the CPU page table as it reuses some of the same
features, such as the spinlock embedded in struct page.

Signed-off-by: Jérôme Glisse 
Signed-off-by: Sherry Cheung 
Signed-off-by: Subhash Gutti 
Signed-off-by: Mark Hairgrove 
Signed-off-by: John Hubbard 
Signed-off-by: Jatin Kumar 
---
 MAINTAINERS|   2 +
 include/linux/hmm_pt.h | 261 ++
 mm/Makefile|   2 +-
 mm/hmm_pt.c| 425 +
 4 files changed, 689 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/hmm_pt.h
 create mode 100644 mm/hmm_pt.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 3ec87c4..4090e86 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4539,6 +4539,8 @@ L:linux...@kvack.org
 S: Maintained
 F: mm/hmm.c
 F: include/linux/hmm.h
+F: mm/hmm_pt.c
+F: include/linux/hmm_pt.h
 
 HOST AP DRIVER
 M: Jouni Malinen 
diff --git a/include/linux/hmm_pt.h b/include/linux/hmm_pt.h
new file mode 100644
index 000..88fc519
--- /dev/null
+++ b/include/linux/hmm_pt.h
@@ -0,0 +1,261 @@
+/*
+ * Copyright 2014 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Jérôme Glisse 
+ */
+/*
+ * This provides a set of helpers for the HMM page table. See include/linux/hmm.h
+ * for a description of what HMM is.
+ *
+ * HMM page table relies on a locking mechanism similar to the CPU page table
+ * for page table updates. It uses the spinlock embedded inside the struct page
+ * to protect changes to a page table directory, which should minimize lock
+ * contention for concurrent updates.
+ *
+ * It does also provide a directory tree protection mechanism. Unlike CPU page
+ * table there is no mmap semaphore to protect directory tree from removal and
+ * this is done intentionally so that concurrent removal/insertion of directory
+ * inside the tree can happen.
+ *
+ * So anyone walking down the page table must protect the directories it
+ * traverses so they are not freed by some other thread. This is done by using
+ * a reference counter for each directory. Before traversing a directory a
+ * reference is taken and once traversal is done the reference is dropped.
+ *
+ * A directory entry dereference and refcount increment of sub-directory page
+ * must happen in a critical rcu section so that directory page removal can
+ * gracefully wait for all possible other threads that might have dereferenced
+ * the directory.
+ */
+#ifndef _HMM_PT_H
+#define _HMM_PT_H
+
+/*
+ * The HMM page table entry does not reflect any specific hardware. It is just
+ * a common entry format used by HMM internally and exposed to HMM users so
+ * they can extract information out of the HMM page table.
+ */
+#define HMM_PTE_VALID  (1 << 0)
+#define HMM_PTE_WRITE  (1 << 1)
+#define HMM_PTE_DIRTY  (1 << 2)
+#define HMM_PFN_SHIFT  4
+#define HMM_PFN_MASK   (~((dma_addr_t)((1 << HMM_PFN_SHIFT) - 1)))
+
+static inline dma_addr_t hmm_pte_from_pfn(dma_addr_t pfn)
+{
+   return (pfn << HMM_PFN_SHIFT) | HMM_PTE_VALID;
+}
+
+static inline unsigned long hmm_pte_pfn(dma_addr_t pte)
+{
+   return pte >> HMM_PFN_SHIFT;
+}
+
+#define HMM_PT_MAX_LEVEL   6
+
+/* struct hmm_pt - HMM page table structure.
+ *
+ * @mask: Array of address mask value of each level.
+ * @directory_mask: Mask for directory index (see below).
+ * @last: Last valid address (inclusive).
+ * @pgd: page global directory (top first level of the directory tree).
+ * @lock: Share lock if spinlock_t does not fit in struct page.
+ * @shift: Array of address shift value of each level.
+ * @llevel: Last level.
+ *
+ * The index into each directory for a given address and level is :
+ *   (address >> 

HMM (Heterogeneous Memory Management) v8

2015-01-05 Thread j . glisse
So a resend with corrections based on Haggai's comments. This patchset is just
the foundation onto which we want to build our feature set, the main
feature being migrating memory to device memory. The very first version of
this patchset already showcased a proof of concept of many of the features.

Below is the previous patchset cover letter pretty much unchanged, as the
background and motivation for it did not change.


What it is ?

In a nutshell HMM is a subsystem that provides an easy to use api to mirror a
process address space on a device with minimal hardware requirements (mainly
device page fault and read only page mapping). This does not rely on the ATS
and PASID PCIE extensions. It intends to supersede those extensions by allowing
system memory to be moved to device memory in a fashion transparent to core
kernel mm code (ie a cpu page fault on a page residing in device memory will
trigger migration back to system memory).


Why doing this ?

We want to be able to mirror a process address space so that compute apis such
as OpenCL or other similar apis can start using the exact same address space on
the GPU as on the CPU. This will greatly simplify usage of those apis. Moreover
we believe that we will see more and more specialized units that will
want to mirror the process address space using their own mmu.

The migration side is simply because GPU memory bandwidth is far beyond
system memory bandwidth and there is no sign that this gap is closing (quite the
opposite).


Current status and future features :

None of this core code changes core kernel mm code in any major way. This
is simple ground work with no impact on existing code paths. Features that
will be implemented on top of this are :
  1 - Transparently handle page mapping on behalf of device driver (DMA).
  2 - Improve DMA api to better match new usage pattern of HMM.
  3 - Migration of anonymous memory to device memory.
  4 - Locking memory to remote memory (CPU access trigger SIGBUS).
  5 - Access exclusion btw CPU and device for atomic operations.
  6 - Migration of file backed memory to device memory.


How future features will be implemented :
1 - Simply use existing DMA api to map page on behalf of a device.
2 - Introduce a new DMA api to match the new semantic of HMM. It is no longer
pages we map but address ranges, and managing which page is effectively
backing an address should be easy to update. I gave a presentation about
that during this LPC.
3 - Requires changes to the cpu page fault code path to handle migration back
to system memory on cpu access. An implementation of this was already sent
as part of v1. This will be low impact and only adds a new special swap
type handling to the existing fault code.
4 - Requires a new syscall as i can not see which current syscall would be
appropriate for this. My first feeling was to use mbind as it has the
right semantic (binding a range of addresses to a device) but mbind is
too numa centric.

Second one was madvise, but the semantic does not match: madvise allows the
kernel to ignore the advice while we do want to block cpu access for as long
as the range is bound to a device.

So i do not think any of the existing syscalls can be extended with new flags
but maybe i am wrong.
5 - Allowing to map a page as read only on the CPU while a device performs
some atomic operation on it (this is mainly to work around system buses
that do not support atomic memory access and sadly there is a large
base of hardware without that feature).

The easiest implementation would be using some page flags but there are none
left. So it must be some flag in the vma to know if there is a need to query
HMM for write protection.

6 - This is the trickiest one to implement and while i showed a proof of
concept with v1, i still have a lot of conflicting feelings about how
to achieve this.


As usual comments are more than welcome. Thanks in advance to anyone that
takes a look at this code.

Previous patchset posting :
  v1 http://lwn.net/Articles/597289/
  v2 https://lkml.org/lkml/2014/6/12/559 (cover letter did not make it to ml)
  v3 https://lkml.org/lkml/2014/6/13/633
  v4 https://lkml.org/lkml/2014/8/29/423
  v5 https://lkml.org/lkml/2014/11/3/759
  v6 http://lwn.net/Articles/619737/

Cheers,
Jérôme

To: "Andrew Morton" ,
Cc: ,
Cc: linux-mm ,
Cc: ,
Cc: "Linus Torvalds" ,
Cc: "Mel Gorman" ,
Cc: "H. Peter Anvin" ,
Cc: "Peter Zijlstra" ,
Cc: "Linda Wang" ,
Cc: "Kevin E Martin" ,
Cc: "Jerome Glisse" ,
Cc: "Andrea Arcangeli" ,
Cc: "Johannes Weiner" ,
Cc: "Larry Woodman" ,
Cc: "Rik van Riel" ,
Cc: "Dave Airlie" ,
Cc: "Jeff Law" ,
Cc: "Brendan Conoboy" ,
Cc: "Joe Donohue" ,
Cc: "Duncan Poole" ,
Cc: "Sherry Cheung" ,
Cc: "Subhash Gutti" ,
Cc: "John Hubbard" ,
Cc: "Mark Hairgrove" ,
Cc: "Lucien Dunning" ,
Cc: "Cameron Buschardt" ,
Cc: "Arvind Gopalakrishnan" ,
Cc: "Haggai Eran" ,
Cc: "Or Gerlitz" ,
Cc: "Sagi Grimberg" 
Cc: "Shachar Raindel" ,
Cc: "Liran Liss" ,
Cc: "Roland Dreier" ,
Cc: "Sander, Ben" ,
Cc: 

[PATCH 2/6] mmu_notifier: keep track of active invalidation ranges v3

2015-01-05 Thread j . glisse
From: Jérôme Glisse 

The mmu_notifier_invalidate_range_start() and mmu_notifier_invalidate_range_end()
can be considered as forming an "atomic" section from the cpu page table update
point of view. Between these two functions the cpu page table content is
unreliable for the address range being invalidated.

Current users such as kvm need to know when they can trust the content of the cpu
page table. This becomes even more important to new users of the mmu_notifier
api (such as HMM or ODP).

This patch uses a structure defined at all call sites of invalidate_range_start()
that is added to a list for the duration of the invalidation. It adds two new
helpers to allow querying if a range is being invalidated or to wait for a range
to become valid.

For proper synchronization, users must block new range invalidation from inside
their invalidate_range_start() callback, before calling the helper functions.
Otherwise there is no guarantee that a new range invalidation will not be added
after the call to the helper function to query for existing ranges.

Changed since v1:
  - Fix a possible deadlock in mmu_notifier_range_wait_valid()

Changed since v2:
  - Add the range to invalid range list before calling ->range_start().
  - Del the range from invalid range list after calling ->range_end().
  - Remove useless list initialization.

Signed-off-by: Jérôme Glisse 
Reviewed-by: Rik van Riel 
Reviewed-by: Haggai Eran 
---
 drivers/gpu/drm/i915/i915_gem_userptr.c |  9 ++--
 drivers/gpu/drm/radeon/radeon_mn.c  | 14 +++---
 drivers/infiniband/core/umem_odp.c  | 16 +++
 drivers/misc/sgi-gru/grutlbpurge.c  | 15 +++
 drivers/xen/gntdev.c| 15 ---
 fs/proc/task_mmu.c  | 12 +++--
 include/linux/mmu_notifier.h| 55 ---
 kernel/events/uprobes.c | 13 +++---
 mm/huge_memory.c| 78 ++--
 mm/hugetlb.c| 55 ---
 mm/ksm.c| 28 +---
 mm/madvise.c|  8 +++-
 mm/memory.c | 78 ++--
 mm/migrate.c| 36 +++
 mm/mmu_notifier.c   | 79 -
 mm/mprotect.c   | 18 
 mm/mremap.c | 14 +++---
 mm/rmap.c   | 15 +++
 virt/kvm/kvm_main.c | 10 ++---
 19 files changed, 309 insertions(+), 259 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c 
b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 20dbd26..a78eede 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -128,16 +128,15 @@ restart:
 
 static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier 
*_mn,
   struct mm_struct *mm,
-  unsigned long start,
-  unsigned long end,
-  enum mmu_event event)
+  const struct 
mmu_notifier_range *range)
 {
struct i915_mmu_notifier *mn = container_of(_mn, struct 
i915_mmu_notifier, mn);
struct interval_tree_node *it = NULL;
-   unsigned long next = start;
+   unsigned long next = range->start;
unsigned long serial = 0;
+   /* interval ranges are inclusive, but invalidate range is exclusive */
+   unsigned long end = range->end - 1, start = range->start;
 
-   end--; /* interval ranges are inclusive, but invalidate range is 
exclusive */
while (next < end) {
struct drm_i915_gem_object *obj = NULL;
 
diff --git a/drivers/gpu/drm/radeon/radeon_mn.c 
b/drivers/gpu/drm/radeon/radeon_mn.c
index daf53d3..63e6936 100644
--- a/drivers/gpu/drm/radeon/radeon_mn.c
+++ b/drivers/gpu/drm/radeon/radeon_mn.c
@@ -100,34 +100,30 @@ static void radeon_mn_release(struct mmu_notifier *mn,
  *
  * @mn: our notifier
  * @mn: the mm this callback is about
- * @start: start of updated range
- * @end: end of updated range
+ * @range: Address range information.
  *
  * We block for all BOs between start and end to be idle and
  * unmap them by move them into system domain again.
  */
 static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
 struct mm_struct *mm,
-unsigned long start,
-unsigned long end,
-enum mmu_event event)
+const struct mmu_notifier_range 
*range)
 {
struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn);

[PATCH 6/6] HMM: add device page fault support.

2015-01-05 Thread j . glisse
From: Jérôme Glisse 

This patch adds helpers for device page faults. The device page fault helper
fills the mirror page table using the CPU page table, with all of this
synchronized against any update to the CPU page table.
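
As a rough idea of the intended usage, a device driver fault handler might
wrap the new helper like the sketch below; the wrapper, its retry policy and
the my_gpu_fault() name are assumptions, only hmm_mirror_fault() and the
hmm_event fields come from this series:

static int my_gpu_fault(struct hmm_mirror *mirror,
			unsigned long addr, bool write)
{
	struct hmm_event event = {
		.start	= addr & PAGE_MASK,
		.end	= (addr & PAGE_MASK) + PAGE_SIZE,
		.etype	= write ? HMM_DEVICE_WFAULT : HMM_DEVICE_RFAULT,
	};
	int ret;

	do {
		/* Fill the mirror page table from the CPU page table,
		 * synchronized against concurrent invalidations. */
		ret = hmm_mirror_fault(mirror, &event);
	} while (ret == -EAGAIN);

	return ret;
}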

Signed-off-by: Jérôme Glisse 
Signed-off-by: Sherry Cheung 
Signed-off-by: Subhash Gutti 
Signed-off-by: Mark Hairgrove 
Signed-off-by: John Hubbard 
Signed-off-by: Jatin Kumar 
---
 include/linux/hmm.h |   1 +
 mm/hmm.c| 384 
 2 files changed, 385 insertions(+)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index dd34572..72e168b 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -259,6 +259,7 @@ struct hmm_mirror {
 
 int hmm_mirror_register(struct hmm_mirror *mirror, struct hmm_device *device);
 void hmm_mirror_unregister(struct hmm_mirror *mirror);
+int hmm_mirror_fault(struct hmm_mirror *mirror, struct hmm_event *event);
 
 
 #endif /* CONFIG_HMM */
diff --git a/mm/hmm.c b/mm/hmm.c
index 719e43c..409750f 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -55,6 +55,9 @@ static struct srcu_struct srcu;
  * @lock: Serialize the mirror list modifications.
  * @kref: Reference counter
  * @mmu_notifier: The mmu_notifier of this mm.
+ * @device_faults: List of all active device page faults.
+ * @ndevice_faults: Number of active device page faults.
+ * @wait_queue: Wait queue for event synchronization.
  *
  * For each process address space (mm_struct) there is one and only one hmm
  * struct. hmm functions will redispatch to each devices the change made to
@@ -67,6 +70,9 @@ struct hmm {
spinlock_t  lock;
struct kref kref;
struct mmu_notifier mmu_notifier;
+   struct list_headdevice_faults;
+   unsignedndevice_faults;
+   wait_queue_head_t   wait_queue;
 };
 
 static struct mmu_notifier_ops hmm_notifier_ops;
@@ -88,6 +94,11 @@ static void hmm_mirror_update_pt(struct hmm_mirror *mirror,
  * help dealing with all this.
  */
 
+static inline bool hmm_event_overlap(struct hmm_event *a, struct hmm_event *b)
+{
+   return !((a->end <= b->start) || (a->start >= b->end));
+}
+
 static inline int hmm_event_init(struct hmm_event *event,
 struct hmm *hmm,
 unsigned long start,
@@ -149,6 +160,9 @@ static int hmm_init(struct hmm *hmm)
hmm->vm_end = TASK_SIZE;
	kref_init(&hmm->kref);
	INIT_HLIST_HEAD(&hmm->mirrors);
+	INIT_LIST_HEAD(&hmm->device_faults);
+	hmm->ndevice_faults = 0;
+	init_waitqueue_head(&hmm->wait_queue);
	spin_lock_init(&hmm->lock);
 
/* register notifier */
@@ -205,6 +219,60 @@ static inline struct hmm *hmm_unref(struct hmm *hmm)
return NULL;
 }
 
+static int hmm_device_fault_start(struct hmm *hmm, struct hmm_event *event)
+{
+   int ret = 0;
+
+   mmu_notifier_range_wait_valid(hmm->mm, event->start, event->end);
+
+	spin_lock(&hmm->lock);
+	if (mmu_notifier_range_is_valid(hmm->mm, event->start, event->end)) {
+		list_add_tail(&event->list, &hmm->device_faults);
+		hmm->ndevice_faults++;
+		event->backoff = false;
+	} else
+		ret = -EAGAIN;
+	spin_unlock(&hmm->lock);
+
+	wake_up(&hmm->wait_queue);
+
+   return ret;
+}
+
+static void hmm_device_fault_end(struct hmm *hmm, struct hmm_event *event)
+{
+   hmm_event_wait(event);
+
+	spin_lock(&hmm->lock);
+	list_del_init(&event->list);
+	hmm->ndevice_faults--;
+	spin_unlock(&hmm->lock);
+
+	wake_up(&hmm->wait_queue);
+}
+
+static void hmm_wait_device_fault(struct hmm *hmm, struct hmm_event *ievent)
+{
+   struct hmm_event *fevent;
+   unsigned long wait_for = 0;
+
+again:
+	spin_lock(&hmm->lock);
+	list_for_each_entry(fevent, &hmm->device_faults, list) {
+		if (!hmm_event_overlap(fevent, ievent))
+			continue;
+		fevent->backoff = true;
+		wait_for = hmm->ndevice_faults;
+	}
+	spin_unlock(&hmm->lock);
+
+   if (wait_for > 0) {
+   wait_event(hmm->wait_queue, wait_for != hmm->ndevice_faults);
+   wait_for = 0;
+   goto again;
+   }
+}
+
 static void hmm_update(struct hmm *hmm, struct hmm_event *event)
 {
struct hmm_mirror *mirror;
@@ -214,6 +282,8 @@ static void hmm_update(struct hmm *hmm, struct hmm_event 
*event)
if (hmm->mm->hmm != hmm)
return;
 
+   hmm_wait_device_fault(hmm, event);
+
	id = srcu_read_lock(&srcu);
 
	hlist_for_each_entry_rcu(mirror, &hmm->mirrors, mlist)
@@ -226,6 +296,35 @@ static void hmm_update(struct hmm *hmm, struct hmm_event 
*event)
	hmm_mirror_update_pt(mirror, event);
 
	srcu_read_unlock(&srcu, id);
+
+	wake_up(&hmm->wait_queue);
+}
+
+static int hmm_mm_fault(struct hmm *hmm,
+   struct hmm_event *event,
+   struct vm_area_struct *vma,
+   unsigned 

[PATCH 5/6] HMM: add per mirror page table.

2015-01-05 Thread j . glisse
From: Jérôme Glisse 

This patch adds the per mirror page table. It also propagates CPU page table
updates to this per mirror page table using mmu_notifier callbacks. All updates
are contextualized with an HMM event structure that conveys all the information
the device driver needs to take proper action (update its own mmu to reflect
the changes and schedule the proper flushing).

Core HMM is responsible for updating the per mirror page table once the device
driver is done with its update. Most importantly, HMM will properly propagate
the HMM page table dirty bit to the underlying page.
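
For context, a driver's update() callback (documented in hmm_device_ops below)
could look roughly like the sketch that follows; the my_dev_*() helpers are
hypothetical, only the event types and the NULL / fence / ERR_PTR return
contract come from this patch:

static struct hmm_fence *my_update(struct hmm_mirror *mirror,
				   struct hmm_event *event)
{
	struct my_device *dev = my_device_from_mirror(mirror);

	switch (event->etype) {
	case HMM_MUNMAP:
	case HMM_MIGRATE:
		/* The range goes away: tear down the device mappings and
		 * return a fence so core HMM can wait for the TLB flush. */
		return my_dev_unmap_range(dev, event->start, event->end);
	case HMM_WRITE_PROTECT:
		return my_dev_write_protect(dev, event->start, event->end);
	default:
		/* Nothing to do for this device. */
		return NULL;
	}
}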

Signed-off-by: Jérôme Glisse 
Signed-off-by: Sherry Cheung 
Signed-off-by: Subhash Gutti 
Signed-off-by: Mark Hairgrove 
Signed-off-by: John Hubbard 
Signed-off-by: Jatin Kumar 
---
 include/linux/hmm.h | 136 +++
 mm/hmm.c| 263 
 2 files changed, 399 insertions(+)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 8eddc15..dd34572 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -46,12 +46,65 @@
 #include 
 #include 
 #include 
+#include 
 
 
 struct hmm_device;
 struct hmm_mirror;
+struct hmm_fence;
 struct hmm;
 
+/* hmm_fence - Device driver fence allowing to batch update and delay wait.
+ *
+ * @mirror: The HMM mirror this fence is associated with.
+ * @list: List of fence.
+ *
+ * Each time HMM callback into a device driver for update the device driver can
+ * return fence which core HMM will wait on. This allow HMM to batch update to
+ * several different device driver and then wait for each of them to complete.
+ *
+ * The hmm_fence structure is intended to be embedded inside a device driver
+ * specific fence structure.
+ */
+struct hmm_fence {
+   struct hmm_mirror   *mirror;
+   struct list_headlist;
+};
+
+
+/*
+ * hmm_event - each event is described by a type associated with a struct.
+ */
+enum hmm_etype {
+   HMM_NONE = 0,
+   HMM_ISDIRTY,
+   HMM_MIGRATE,
+   HMM_MUNMAP,
+   HMM_DEVICE_RFAULT,
+   HMM_DEVICE_WFAULT,
+   HMM_WRITE_PROTECT,
+};
+
+/* struct hmm_event - memory event information.
+ *
+ * @list: So HMM can keep track of all active events.
+ * @start: First address (inclusive).
+ * @end: Last address (exclusive).
+ * @fences: List of device fences associated with this event.
+ * @pte_mask: HMM pte update mask (bit(s) that are still valid).
+ * @etype: Event type (munmap, migrate, truncate, ...).
+ * @backoff: Only meaningful for device page fault.
+ */
+struct hmm_event {
+   struct list_headlist;
+   unsigned long   start;
+   unsigned long   end;
+   struct list_headfences;
+   dma_addr_t  pte_mask;
+   enum hmm_etype  etype;
+   boolbackoff;
+};
+
 
 /* hmm_device - Each device must register one and only one hmm_device.
  *
@@ -72,6 +125,87 @@ struct hmm_device_ops {
 * from the mirror page table.
 */
void (*release)(struct hmm_mirror *mirror);
+
+   /* fence_wait() - to wait on device driver fence.
+*
+* @fence: The device driver fence struct.
+* Returns: 0 on success,-EIO on error, -EAGAIN to wait again.
+*
+* Called when hmm want to wait for all operations associated with a
+* fence to complete (including device cache flush if the event mandate
+* it).
+*
+* Device driver must free fence and associated resources if it returns
+* something else thant -EAGAIN. On -EAGAIN the fence must not be free
+* as hmm will call back again.
+*
+* Return error if scheduled operation failed or if need to wait again.
+* -EIO Some input/output error with the device.
+* -EAGAIN The fence not yet signaled, hmm reschedule waiting thread.
+*
+* All other return value trigger warning and are transformed to -EIO.
+*/
+   int (*fence_wait)(struct hmm_fence *fence);
+
+   /* fence_ref() - take a reference fence structure.
+*
+* @fence: Fence structure hmm is referencing.
+*/
+   void (*fence_ref)(struct hmm_fence *fence);
+
+   /* fence_unref() - drop a reference fence structure.
+*
+* @fence: Fence structure hmm is dereferencing.
+*/
+   void (*fence_unref)(struct hmm_fence *fence);
+
+   /* update() - update device mmu following an event.
+*
+* @mirror: The mirror that link process address space with the device.
+* @event: The event that triggered the update.
+* Returns: Valid fence ptr or NULL on success otherwise ERR_PTR.
+*
+* Called to update device page table for a range of address.
+* The event type provide the nature of the update :
+*   - Range is no longer valid (munmap).
+*   - Range protection changes (mprotect, COW, ...).
+*   - Range is unmapped (swap, 

[PATCH 3/6] HMM: introduce heterogeneous memory management v2.

2015-01-05 Thread j . glisse
From: Jérôme Glisse 

This patch only introduces the core HMM functions for registering a new mirror
and stopping a mirror, as well as registering and unregistering a device.

The lifecycle of the HMM object is handled differently than that of mmu_notifier
because, unlike mmu_notifier, there can be concurrent calls from both mm code to
HMM code and/or from device driver code to HMM code. Moreover, the lifetime of
HMM can be uncorrelated from the lifetime of the process that is being mirrored.
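
A bare-bones registration sequence, as a sketch of how the API introduced here
is meant to be consumed (the my_* names are hypothetical and the mirror
registration prototype follows the declarations shown later in this series):

static void my_release(struct hmm_mirror *mirror)
{
	/* Stop all device access to the mirrored address space here. */
}

static const struct hmm_device_ops my_ops = {
	.release = my_release,
};

static struct hmm_device my_device = {
	.name	= "my-gpu",
	.ops	= &my_ops,
};

static int __init my_init(void)
{
	/* One hmm_device per Linux device, registered once. */
	return hmm_device_register(&my_device);
}

static int my_mirror_mm(struct hmm_mirror *mirror)
{
	/* One hmm_mirror per (device, process address space) pair. */
	return hmm_mirror_register(mirror, &my_device);
}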

Changed since v1:
  - Updated comment of hmm_device_register().

Signed-off-by: Jérôme Glisse 
Signed-off-by: Sherry Cheung 
Signed-off-by: Subhash Gutti 
Signed-off-by: Mark Hairgrove 
Signed-off-by: John Hubbard 
Signed-off-by: Jatin Kumar 
---
 MAINTAINERS  |   7 +
 include/linux/hmm.h  | 129 
 include/linux/mm.h   |  11 ++
 include/linux/mm_types.h |  14 ++
 kernel/fork.c|   2 +
 mm/Kconfig   |  15 ++
 mm/Makefile  |   1 +
 mm/hmm.c | 373 +++
 8 files changed, 552 insertions(+)
 create mode 100644 include/linux/hmm.h
 create mode 100644 mm/hmm.c

diff --git a/MAINTAINERS b/MAINTAINERS
index c03bc6c..3ec87c4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4533,6 +4533,13 @@ F:   include/uapi/linux/if_hippi.h
 F: net/802/hippi.c
 F: drivers/net/hippi/
 
+HMM - Heterogeneous Memory Management
+M: Jérôme Glisse 
+L: linux...@kvack.org
+S: Maintained
+F: mm/hmm.c
+F: include/linux/hmm.h
+
 HOST AP DRIVER
 M: Jouni Malinen 
 L: hos...@shmoo.com (subscribers-only)
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
new file mode 100644
index 000..8eddc15
--- /dev/null
+++ b/include/linux/hmm.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright 2013 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Jérôme Glisse 
+ */
+/* This is a heterogeneous memory management (hmm). In a nutshell this provide
+ * an API to mirror a process address on a device which has its own mmu using
+ * its own page table for the process. It supports everything except special
+ * vma.
+ *
+ * Mandatory hardware features :
+ *   - An mmu with pagetable.
+ *   - Read only flag per cpu page.
+ *   - Page fault ie hardware must stop and wait for kernel to service fault.
+ *
+ * Optional hardware features :
+ *   - Dirty bit per cpu page.
+ *   - Access bit per cpu page.
+ *
+ * The hmm code handle all the interfacing with the core kernel mm code and
+ * provide a simple API. It does support migrating system memory to device
+ * memory and handle migration back to system memory on cpu page fault.
+ *
+ * Migrated memory is considered as swaped from cpu and core mm code point of
+ * view.
+ */
+#ifndef _HMM_H
+#define _HMM_H
+
+#ifdef CONFIG_HMM
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
+struct hmm_device;
+struct hmm_mirror;
+struct hmm;
+
+
+/* hmm_device - Each device must register one and only one hmm_device.
+ *
+ * The hmm_device is the link btw HMM and each device driver.
+ */
+
+/* struct hmm_device_operations - HMM device operation callback
+ */
+struct hmm_device_ops {
+   /* release() - mirror must stop using the address space.
+*
+* @mirror: The mirror that link process address space with the device.
+*
+* This callback is call either on mm destruction or as result to a
+* call to hmm_mirror_release(). Device driver have to stop all hw
+* thread and all usage of the address space, it has to dirty all pages
+* that have been dirty by the device. But it must not clear any entry
+* from the mirror page table.
+*/
+   void (*release)(struct hmm_mirror *mirror);
+};
+
+/* struct hmm_device - per device HMM structure
+ *
+ * @name: Device name (uniquely identify the device on the system).
+ * @ops: The hmm operations callback.
+ * @mirrors: List of all active mirrors for the device.
+ * @mutex: Mutex protecting mirrors list.
+ *
+ * Each device that want to mirror an address space must register one of this
+ * struct (only once per linux device).
+ */
+struct hmm_device {
+   const char  *name;
+   const struct hmm_device_ops *ops;
+   struct list_headmirrors;
+   struct mutexmutex;
+};
+
+int hmm_device_register(struct hmm_device *device);
+int hmm_device_unregister(struct hmm_device *device);
+
+
+/* hmm_mirror - device 

[PATCH 1/6] mmu_notifier: add event information to address invalidation v6

2015-01-05 Thread j . glisse
From: Jérôme Glisse 

The event information will be useful for new users of the mmu_notifier API.
The event argument differentiates between a vma disappearing, a page being
write protected, or simply a page being unmapped. This allows new users to
take different paths for different events: for instance, on unmap the
resources used to track a vma are still valid and should stay around, while
if the event says that a vma is being destroyed it means that any resources
used to track this vma can be freed.
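
A sketch of how a listener might branch on the new argument; MMU_MUNMAP and
MMU_MIGRATE are taken from the changelog below, while struct my_mn and the
my_*() helpers are made up for the example:

static void my_invalidate_range_start(struct mmu_notifier *mn,
				      struct mm_struct *mm,
				      unsigned long start,
				      unsigned long end,
				      enum mmu_event event)
{
	struct my_mn *m = container_of(mn, struct my_mn, mn);

	switch (event) {
	case MMU_MUNMAP:
		/* The vma is going away: tracking structures can be freed. */
		my_free_range(m, start, end);
		break;
	default:
		/* The pages behind the range change but the range itself
		 * stays valid: just shoot down secondary TLB entries. */
		my_unmap_range(m, start, end);
		break;
	}
}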

Changed since v1:
  - renamed action into event (updated commit message too).
  - simplified the event names and clarified their usage,
also documenting what expectations the listener can have
with respect to each event.

Changed since v2:
  - Avoid crazy name.
  - Do not move code that do not need to move.

Changed since v3:
  - Separate huge page split from mlock/munlock and softdirty.

Changed since v4:
  - Rebase (no other changes).

Changed since v5:
  - Typo fix.
  - Changed zap_page_range from MMU_MUNMAP to MMU_MIGRATE to reflect the
fact that the address range is still valid, just the pages backing it
are no longer.

Signed-off-by: Jérôme Glisse 
Reviewed-by: Rik van Riel 
---
 drivers/gpu/drm/i915/i915_gem_userptr.c |   3 +-
 drivers/gpu/drm/radeon/radeon_mn.c  |   3 +-
 drivers/infiniband/core/umem_odp.c  |   9 ++-
 drivers/iommu/amd_iommu_v2.c|   3 +-
 drivers/misc/sgi-gru/grutlbpurge.c  |   9 ++-
 drivers/xen/gntdev.c|   9 ++-
 fs/proc/task_mmu.c  |   6 +-
 include/linux/mmu_notifier.h| 131 ++--
 kernel/events/uprobes.c |  10 ++-
 mm/filemap_xip.c|   2 +-
 mm/huge_memory.c|  39 ++
 mm/hugetlb.c|  23 +++---
 mm/ksm.c|  18 +++--
 mm/madvise.c|   4 +-
 mm/memory.c |  27 ---
 mm/migrate.c|   9 ++-
 mm/mmu_notifier.c   |  28 ---
 mm/mprotect.c   |   6 +-
 mm/mremap.c |   6 +-
 mm/rmap.c   |  24 --
 virt/kvm/kvm_main.c |  12 ++-
 21 files changed, 274 insertions(+), 107 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c 
b/drivers/gpu/drm/i915/i915_gem_userptr.c
index d182058..20dbd26 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -129,7 +129,8 @@ restart:
 static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier 
*_mn,
   struct mm_struct *mm,
   unsigned long start,
-  unsigned long end)
+  unsigned long end,
+  enum mmu_event event)
 {
struct i915_mmu_notifier *mn = container_of(_mn, struct 
i915_mmu_notifier, mn);
struct interval_tree_node *it = NULL;
diff --git a/drivers/gpu/drm/radeon/radeon_mn.c 
b/drivers/gpu/drm/radeon/radeon_mn.c
index a69bd44..daf53d3 100644
--- a/drivers/gpu/drm/radeon/radeon_mn.c
+++ b/drivers/gpu/drm/radeon/radeon_mn.c
@@ -109,7 +109,8 @@ static void radeon_mn_release(struct mmu_notifier *mn,
 static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
 struct mm_struct *mm,
 unsigned long start,
-unsigned long end)
+unsigned long end,
+enum mmu_event event)
 {
struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn);
struct interval_tree_node *it;
diff --git a/drivers/infiniband/core/umem_odp.c 
b/drivers/infiniband/core/umem_odp.c
index 6095872..bc36e8c 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -165,7 +165,8 @@ static int invalidate_page_trampoline(struct ib_umem *item, 
u64 start,
 
 static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn,
 struct mm_struct *mm,
-unsigned long address)
+unsigned long address,
+enum mmu_event event)
 {
struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
 
@@ -192,7 +193,8 @@ static int invalidate_range_start_trampoline(struct ib_umem 
*item, u64 start,
 static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
struct mm_struct *mm,
   

[PATCH 4/6] HMM: add HMM page table.

2015-01-05 Thread j . glisse
From: Jérôme Glisse jgli...@redhat.com

Heterogeneous memory management's main purpose is to mirror a process address
space. To do so it must maintain a secondary page table that is used by the
device driver to program the device or build a device specific page table.

A radix tree can not be used to create this secondary page table because HMM
needs more flags than RADIX_TREE_MAX_TAGS (while this can be increased, we
believe HMM will require so many flags that the cost would become prohibitive
for other users of the radix tree).

Moreover the radix tree is built around long, but for HMM we need to store dma
addresses and on some platforms sizeof(dma_addr_t) > sizeof(long). Thus the
radix tree is unsuitable to fulfill HMM's requirements, hence why we introduce
this code which allows creating a page table that can grow and shrink
dynamically.

The design is very close to the CPU page table as it reuses some of its
features, such as the spinlock embedded in struct page.
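
As an illustration of the generic entry format defined below, a driver
converting an HMM pte into its own hardware format might do something like the
following sketch; my_hw_encode() is hypothetical, while the helpers and the
HMM_PTE_* flags come from this patch:

static u64 my_hw_pte_from_hmm(dma_addr_t hmm_pte)
{
	if (!(hmm_pte & HMM_PTE_VALID))
		return 0;	/* not mapped on the device */

	/* Extract the pfn and the permission/dirty bits and re-encode
	 * them in the device specific page table format. */
	return my_hw_encode(hmm_pte_pfn(hmm_pte),
			    !!(hmm_pte & HMM_PTE_WRITE),
			    !!(hmm_pte & HMM_PTE_DIRTY));
}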

Signed-off-by: Jérôme Glisse jgli...@redhat.com
Signed-off-by: Sherry Cheung sche...@nvidia.com
Signed-off-by: Subhash Gutti sgu...@nvidia.com
Signed-off-by: Mark Hairgrove mhairgr...@nvidia.com
Signed-off-by: John Hubbard jhubb...@nvidia.com
Signed-off-by: Jatin Kumar jaku...@nvidia.com
---
 MAINTAINERS|   2 +
 include/linux/hmm_pt.h | 261 ++
 mm/Makefile|   2 +-
 mm/hmm_pt.c| 425 +
 4 files changed, 689 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/hmm_pt.h
 create mode 100644 mm/hmm_pt.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 3ec87c4..4090e86 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4539,6 +4539,8 @@ L:linux...@kvack.org
 S: Maintained
 F: mm/hmm.c
 F: include/linux/hmm.h
+F: mm/hmm_pt.c
+F: include/linux/hmm_pt.h
 
 HOST AP DRIVER
 M: Jouni Malinen j...@w1.fi
diff --git a/include/linux/hmm_pt.h b/include/linux/hmm_pt.h
new file mode 100644
index 000..88fc519
--- /dev/null
+++ b/include/linux/hmm_pt.h
@@ -0,0 +1,261 @@
+/*
+ * Copyright 2014 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Jérôme Glisse jgli...@redhat.com
+ */
+/*
+ * This provide a set of helpers for HMM page table. See include/linux/hmm.h
+ * for a description of what HMM is.
+ *
+ * HMM page table rely on a locking mecanism similar to CPU page table for page
+ * table update. It use the spinlock embedded inside the struct page to protect
+ * change to page table directory which should minimize lock contention for
+ * concurrent update.
+ *
+ * It does also provide a directory tree protection mechanism. Unlike CPU page
+ * table there is no mmap semaphore to protect directory tree from removal and
+ * this is done intentionaly so that concurrent removal/insertion of directory
+ * inside the tree can happen.
+ *
+ * So anyone walking down the page table must protect directory it traverses so
+ * they are not free by some other thread. This is done by using a reference
+ * counter for each directory. Before traversing a directory a reference is
+ * taken and once traversal is done the reference is drop.
+ *
+ * A directory entry dereference and refcount increment of sub-directory page
+ * must happen in a critical rcu section so that directory page removal can
+ * gracefully wait for all possible other threads that might have dereferenced
+ * the directory.
+ */
+#ifndef _HMM_PT_H
+#define _HMM_PT_H
+
+/*
+ * The HMM page table entry does not reflect any specific hardware. It is just
+ * a common entry format use by HMM internal and expose to HMM user so they can
+ * extract information out of HMM page table.
+ */
+#define HMM_PTE_VALID  (1 << 0)
+#define HMM_PTE_WRITE  (1 << 1)
+#define HMM_PTE_DIRTY  (1 << 2)
+#define HMM_PFN_SHIFT  4
+#define HMM_PFN_MASK   (~((dma_addr_t)((1 << HMM_PFN_SHIFT) - 1)))
+
+static inline dma_addr_t hmm_pte_from_pfn(dma_addr_t pfn)
+{
+	return (pfn << HMM_PFN_SHIFT) | HMM_PTE_VALID;
+}
+
+static inline unsigned long hmm_pte_pfn(dma_addr_t pte)
+{
+	return pte >> HMM_PFN_SHIFT;
+}
+
+#define HMM_PT_MAX_LEVEL   6
+
+/* struct hmm_pt - HMM page table structure.
+ *
+ * @mask: Array of address mask value of each level.
+ * @directory_mask: Mask for directory index (see below).
+ * @last: Last valid address (inclusive).
+ * @pgd: page global directory (top first level of the directory tree).
+ * @lock: Share lock if spinlock_t does not fit in struct page.
+ * @shift: Array of address shift 

HMM (Heterogeneous Memory Management) v8

2015-01-05 Thread j . glisse
So a resend with corrections based on Haggai's comments. This patchset is just
the ground foundation onto which we want to build our feature set, the main
feature being migrating memory to device memory. The very first version of
this patchset already showcased a proof of concept of many of the features.

Below is the previous patchset cover letter, pretty much unchanged, as the
background and motivation for it did not change.


What it is ?

In a nutshell HMM is a subsystem that provides an easy to use api to mirror a
process address space on a device with minimal hardware requirements (mainly
device page faults and read only page mapping). This does not rely on the ATS
and PASID PCIE extensions. It intends to supersede those extensions by allowing
system memory to be moved to device memory in a fashion that is transparent to
core kernel mm code (ie a cpu page fault on a page residing in device memory
will trigger migration back to system memory).


Why doing this ?

We want to be able to mirror a process address space so that compute apis such
as OpenCL or other similar apis can start using the exact same address space on
the GPU as on the CPU. This will greatly simplify usage of those apis. Moreover
we believe that we will see more and more specialized functional units that
will want to mirror a process address space using their own mmu.

The migration side is simply because GPU memory bandwidth is far beyond system
memory bandwidth and there is no sign that this gap is closing (quite the
opposite).


Current status and future features :

None of this core code changes core kernel mm code in any major way. This
is simple ground work with no impact on existing code paths. Features that
will be implemented on top of this are :
  1 - Transparently handle page mapping on behalf of device drivers (DMA).
  2 - Improve DMA api to better match new usage pattern of HMM.
  3 - Migration of anonymous memory to device memory.
  4 - Locking memory to remote memory (CPU access trigger SIGBUS).
  5 - Access exclusion btw CPU and device for atomic operations.
  6 - Migration of file backed memory to device memory.


How future features will be implemented :
1 - Simply use existing DMA api to map page on behalf of a device.
2 - Introduce new DMA api to match new semantic of HMM. It is no longer page
we map but address range and managing which page is effectively backing
an address should be easy to update. I gave a presentation about that
during this LPC.
3 - Requires change to cpu page fault code path to handle migration back to
system memory on cpu access. An implementation of this was already sent
as part of v1. This will be low impact and only add a new special swap
type handling to existing fault code.
4 - Requires a new syscall as I can not see which current syscall would be
appropriate for this. My first feeling was to use mbind as it has the
right semantics (binding a range of addresses to a device) but mbind is
too numa centric.

Second one was madvise, but semantic does not match, madvise does allow
kernel to ignore them while we do want to block cpu access for as long
as the range is bind to a device.

So i do not think any of existing syscall can be extended with new flags
but maybe i am wrong.
5 - Allowing to map a page as read only on the CPU while a device perform
some atomic operation on it (this is mainly to work around system bus
that do not support atomic memory access and sadly there is a large
base of hardware without that feature).

Easiest implementation would be using some page flags but there is none
left. So it must be some flags in vma to know if there is a need to query
HMM for write protection.

6 - This is the trickiest one to implement and while I showed a proof of
concept with v1, I still have a lot of conflicting feelings about how
to achieve this.


As usual comments are more than welcome. Thanks in advance to anyone who
takes a look at this code.

Previous patchset posting :
  v1 http://lwn.net/Articles/597289/
  v2 https://lkml.org/lkml/2014/6/12/559 (cover letter did not make it to ml)
  v3 https://lkml.org/lkml/2014/6/13/633
  v4 https://lkml.org/lkml/2014/8/29/423
  v5 https://lkml.org/lkml/2014/11/3/759
  v6 http://lwn.net/Articles/619737/

Cheers,
Jérôme

To: Andrew Morton a...@linux-foundation.org,
Cc: linux-kernel@vger.kernel.org,
Cc: linux-mm linux...@kvack.org,
Cc: linux-fsde...@vger.kernel.org,
Cc: Linus Torvalds torva...@linux-foundation.org,
Cc: Mel Gorman mgor...@suse.de,
Cc: H. Peter Anvin h...@zytor.com,
Cc: Peter Zijlstra pet...@infradead.org,
Cc: Linda Wang lw...@redhat.com,
Cc: Kevin E Martin k...@redhat.com,
Cc: Jerome Glisse jgli...@redhat.com,
Cc: Andrea Arcangeli aarca...@redhat.com,
Cc: Johannes Weiner jwei...@redhat.com,
Cc: Larry Woodman lwood...@redhat.com,
Cc: Rik van Riel r...@redhat.com,
Cc: Dave Airlie airl...@redhat.com,
Cc: Jeff Law l...@redhat.com,
Cc: Brendan Conoboy b...@redhat.com,
Cc: Joe Donohue jdono...@redhat.com,
Cc: 

[PATCH 2/7] mmu_notifier: keep track of active invalidation ranges v2

2014-12-22 Thread j . glisse
From: Jérôme Glisse 

The mmu_notifier_invalidate_range_start() and mmu_notifier_invalidate_range_end()
calls can be considered as forming an "atomic" section from the CPU page table
update point of view. Between these two functions the CPU page table content is
unreliable for the address range being invalidated.

Current users such as kvm need to know when they can trust the content of the
CPU page table. This becomes even more important for new users of the
mmu_notifier api (such as HMM or ODP).

This patch uses a structure, defined at every call site of
invalidate_range_start(), that is added to a list for the duration of the
invalidation. It adds two new helpers that allow querying whether a range is
being invalidated, or waiting for a range to become valid.

For proper synchronization, users must block new range invalidations from
inside their invalidate_range_start() callback before calling the helper
functions. Otherwise there is no guarantee that a new range invalidation will
not be added after the call to the helper function that queries for existing
ranges.

Changed since v1:
  - Fix a possible deadlock in mmu_notifier_range_wait_valid()

Signed-off-by: Jérôme Glisse 
Reviewed-by: Rik van Riel 
---
 drivers/gpu/drm/i915/i915_gem_userptr.c |  9 ++--
 drivers/gpu/drm/radeon/radeon_mn.c  | 14 ++
 drivers/infiniband/core/umem_odp.c  | 16 +++---
 drivers/misc/sgi-gru/grutlbpurge.c  | 15 +++---
 drivers/xen/gntdev.c| 15 +++---
 fs/proc/task_mmu.c  | 12 +++--
 include/linux/mmu_notifier.h| 60 +--
 kernel/events/uprobes.c | 13 +++--
 mm/huge_memory.c| 78 +
 mm/hugetlb.c| 55 +++--
 mm/ksm.c| 28 +--
 mm/madvise.c|  8 ++-
 mm/memory.c | 78 -
 mm/migrate.c| 36 +++---
 mm/mmu_notifier.c   | 87 -
 mm/mprotect.c   | 18 ---
 mm/mremap.c | 14 +++---
 mm/rmap.c   | 15 +++---
 virt/kvm/kvm_main.c | 10 ++--
 19 files changed, 322 insertions(+), 259 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c 
b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 20dbd26..a78eede 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -128,16 +128,15 @@ restart:
 
 static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier 
*_mn,
   struct mm_struct *mm,
-  unsigned long start,
-  unsigned long end,
-  enum mmu_event event)
+  const struct 
mmu_notifier_range *range)
 {
struct i915_mmu_notifier *mn = container_of(_mn, struct 
i915_mmu_notifier, mn);
struct interval_tree_node *it = NULL;
-   unsigned long next = start;
+   unsigned long next = range->start;
unsigned long serial = 0;
+   /* interval ranges are inclusive, but invalidate range is exclusive */
+   unsigned long end = range->end - 1, start = range->start;
 
-   end--; /* interval ranges are inclusive, but invalidate range is 
exclusive */
while (next < end) {
struct drm_i915_gem_object *obj = NULL;
 
diff --git a/drivers/gpu/drm/radeon/radeon_mn.c 
b/drivers/gpu/drm/radeon/radeon_mn.c
index daf53d3..63e6936 100644
--- a/drivers/gpu/drm/radeon/radeon_mn.c
+++ b/drivers/gpu/drm/radeon/radeon_mn.c
@@ -100,34 +100,30 @@ static void radeon_mn_release(struct mmu_notifier *mn,
  *
  * @mn: our notifier
  * @mn: the mm this callback is about
- * @start: start of updated range
- * @end: end of updated range
+ * @range: Address range information.
  *
  * We block for all BOs between start and end to be idle and
  * unmap them by move them into system domain again.
  */
 static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
 struct mm_struct *mm,
-unsigned long start,
-unsigned long end,
-enum mmu_event event)
+const struct mmu_notifier_range 
*range)
 {
struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn);
struct interval_tree_node *it;
-
/* notification is exclusive, but interval is inclusive */
-   end -= 1;
+   unsigned long end = range->end - 1;
 
	mutex_lock(&rmn->lock);
 
-	it = interval_tree_iter_first(&rmn->objects, start, 

[PATCH 5/7] HMM: add per mirror page table.

2014-12-22 Thread j . glisse
From: Jérôme Glisse 

This patch adds the per mirror page table. It also propagates CPU page table
updates to this per mirror page table using mmu_notifier callbacks. All updates
are contextualized with an HMM event structure that conveys all the information
the device driver needs to take proper action (update its own mmu to reflect
the changes and schedule the proper flushing).

Core HMM is responsible for updating the per mirror page table once the device
driver is done with its update. Most importantly, HMM will properly propagate
the HMM page table dirty bit to the underlying page.

Signed-off-by: Jérôme Glisse 
Signed-off-by: Sherry Cheung 
Signed-off-by: Subhash Gutti 
Signed-off-by: Mark Hairgrove 
Signed-off-by: John Hubbard 
Signed-off-by: Jatin Kumar 
---
 include/linux/hmm.h | 136 +++
 mm/hmm.c| 263 
 2 files changed, 399 insertions(+)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 8eddc15..dd34572 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -46,12 +46,65 @@
 #include 
 #include 
 #include 
+#include 
 
 
 struct hmm_device;
 struct hmm_mirror;
+struct hmm_fence;
 struct hmm;
 
+/* hmm_fence - Device driver fence allowing to batch update and delay wait.
+ *
+ * @mirror: The HMM mirror this fence is associated with.
+ * @list: List of fence.
+ *
+ * Each time HMM callback into a device driver for update the device driver can
+ * return fence which core HMM will wait on. This allow HMM to batch update to
+ * several different device driver and then wait for each of them to complete.
+ *
+ * The hmm_fence structure is intended to be embedded inside a device driver
+ * specific fence structure.
+ */
+struct hmm_fence {
+   struct hmm_mirror   *mirror;
+   struct list_headlist;
+};
+
+
+/*
+ * hmm_event - each event is described by a type associated with a struct.
+ */
+enum hmm_etype {
+   HMM_NONE = 0,
+   HMM_ISDIRTY,
+   HMM_MIGRATE,
+   HMM_MUNMAP,
+   HMM_DEVICE_RFAULT,
+   HMM_DEVICE_WFAULT,
+   HMM_WRITE_PROTECT,
+};
+
+/* struct hmm_event - memory event information.
+ *
+ * @list: So HMM can keep track of all active events.
+ * @start: First address (inclusive).
+ * @end: Last address (exclusive).
+ * @fences: List of device fences associated with this event.
+ * @pte_mask: HMM pte update mask (bit(s) that are still valid).
+ * @etype: Event type (munmap, migrate, truncate, ...).
+ * @backoff: Only meaningful for device page fault.
+ */
+struct hmm_event {
+   struct list_headlist;
+   unsigned long   start;
+   unsigned long   end;
+   struct list_headfences;
+   dma_addr_t  pte_mask;
+   enum hmm_etype  etype;
+   boolbackoff;
+};
+
 
 /* hmm_device - Each device must register one and only one hmm_device.
  *
@@ -72,6 +125,87 @@ struct hmm_device_ops {
 * from the mirror page table.
 */
void (*release)(struct hmm_mirror *mirror);
+
+   /* fence_wait() - to wait on device driver fence.
+*
+* @fence: The device driver fence struct.
+* Returns: 0 on success, -EIO on error, -EAGAIN to wait again.
+*
+* Called when HMM wants to wait for all operations associated with a
+* fence to complete (including a device cache flush if the event
+* mandates it).
+*
+* The device driver must free the fence and associated resources if it
+* returns something other than -EAGAIN. On -EAGAIN the fence must not
+* be freed as HMM will call back again.
+*
+* Return an error if the scheduled operation failed or if it needs to
+* wait again.
+* -EIO Some input/output error with the device.
+* -EAGAIN The fence is not yet signaled, HMM reschedules the waiting thread.
+*
+* All other return values trigger a warning and are transformed to -EIO.
+*/
+   int (*fence_wait)(struct hmm_fence *fence);
+
+   /* fence_ref() - take a reference fence structure.
+*
+* @fence: Fence structure hmm is referencing.
+*/
+   void (*fence_ref)(struct hmm_fence *fence);
+
+   /* fence_unref() - drop a reference fence structure.
+*
+* @fence: Fence structure hmm is dereferencing.
+*/
+   void (*fence_unref)(struct hmm_fence *fence);
+
+   /* update() - update device mmu following an event.
+*
+* @mirror: The mirror that link process address space with the device.
+* @event: The event that triggered the update.
+* Returns: Valid fence ptr or NULL on success otherwise ERR_PTR.
+*
+* Called to update device page table for a range of address.
+* The event type provide the nature of the update :
+*   - Range is no longer valid (munmap).
+*   - Range protection changes (mprotect, COW, ...).
+*   - Range is unmapped (swap, 

[PATCH 4/7] HMM: add HMM page table.

2014-12-22 Thread j . glisse
From: Jérôme Glisse 

Heterogeneous memory management's main purpose is to mirror a process address
space. To do so it must maintain a secondary page table that is used by the
device driver to program the device or to build a device specific page table.

The radix tree can not be used to create this secondary page table because HMM
needs more flags than RADIX_TREE_MAX_TAGS (while this could be increased, we
believe HMM will require so many flags that the cost would become prohibitive
to other users of the radix tree).

Moreover the radix tree is built around long, but for HMM we need to store dma
addresses and on some platforms sizeof(dma_addr_t) > sizeof(long). Thus the
radix tree is unsuitable to fulfill HMM's requirements, hence why we introduce
this code which allows creating a page table that can grow and shrink
dynamically.

The design is very close to the CPU page table as it reuses some of its
features, such as the spinlock embedded in struct page.
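
As a quick, editorial illustration (not part of the patch) of the entry format
introduced below, an entry packs the pfn above a handful of software flags and
the helpers encode/decode it:

static bool hmm_pte_format_example(unsigned long pfn)
{
	/* Valid entry pointing at pfn, also marked writable. */
	dma_addr_t pte = hmm_pte_from_pfn(pfn) | HMM_PTE_WRITE;

	if (!(pte & HMM_PTE_VALID))
		return false;

	/* hmm_pte_pfn() recovers the original pfn from the upper bits. */
	return hmm_pte_pfn(pte) == pfn && (pte & HMM_PTE_WRITE);
}
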

Signed-off-by: Jérôme Glisse 
Signed-off-by: Sherry Cheung 
Signed-off-by: Subhash Gutti 
Signed-off-by: Mark Hairgrove 
Signed-off-by: John Hubbard 
Signed-off-by: Jatin Kumar 
---
 MAINTAINERS|   2 +
 include/linux/hmm_pt.h | 261 ++
 mm/Makefile|   2 +-
 mm/hmm_pt.c| 425 +
 4 files changed, 689 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/hmm_pt.h
 create mode 100644 mm/hmm_pt.c

diff --git a/MAINTAINERS b/MAINTAINERS
index c5bb62e..02f9f29 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4539,6 +4539,8 @@ L:linux...@kvack.org
 S: Maintained
 F: mm/hmm.c
 F: include/linux/hmm.h
+F: mm/hmm_pt.c
+F: include/linux/hmm_pt.h
 
 HOST AP DRIVER
 M: Jouni Malinen 
diff --git a/include/linux/hmm_pt.h b/include/linux/hmm_pt.h
new file mode 100644
index 000..88fc519
--- /dev/null
+++ b/include/linux/hmm_pt.h
@@ -0,0 +1,261 @@
+/*
+ * Copyright 2014 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Jérôme Glisse 
+ */
+/*
+ * This provides a set of helpers for the HMM page table. See
+ * include/linux/hmm.h for a description of what HMM is.
+ *
+ * The HMM page table relies on a locking mechanism similar to the CPU page
+ * table for page table updates. It uses the spinlock embedded inside the
+ * struct page to protect changes to a page table directory, which should
+ * minimize lock contention for concurrent updates.
+ *
+ * It also provides a directory tree protection mechanism. Unlike the CPU page
+ * table there is no mmap semaphore to protect the directory tree from removal,
+ * and this is intentional so that concurrent removal/insertion of directories
+ * inside the tree can happen.
+ *
+ * So anyone walking down the page table must protect the directories it
+ * traverses so they are not freed by some other thread. This is done by using
+ * a reference counter for each directory. Before traversing a directory a
+ * reference is taken and once traversal is done the reference is dropped.
+ *
+ * A directory entry dereference and refcount increment of sub-directory page
+ * must happen in a critical rcu section so that directory page removal can
+ * gracefully wait for all possible other threads that might have dereferenced
+ * the directory.
+ */
+#ifndef _HMM_PT_H
+#define _HMM_PT_H
+
+/*
+ * The HMM page table entry does not reflect any specific hardware. It is just
+ * a common entry format use by HMM internal and expose to HMM user so they can
+ * extract information out of HMM page table.
+ */
+#define HMM_PTE_VALID  (1 << 0)
+#define HMM_PTE_WRITE  (1 << 1)
+#define HMM_PTE_DIRTY  (1 << 2)
+#define HMM_PFN_SHIFT  4
+#define HMM_PFN_MASK   (~((dma_addr_t)((1 << HMM_PFN_SHIFT) - 1)))
+
+static inline dma_addr_t hmm_pte_from_pfn(dma_addr_t pfn)
+{
+   return (pfn << HMM_PFN_SHIFT) | HMM_PTE_VALID;
+}
+
+static inline unsigned long hmm_pte_pfn(dma_addr_t pte)
+{
+   return pte >> HMM_PFN_SHIFT;
+}
+
+#define HMM_PT_MAX_LEVEL   6
+
+/* struct hmm_pt - HMM page table structure.
+ *
+ * @mask: Array of address mask value of each level.
+ * @directory_mask: Mask for directory index (see below).
+ * @last: Last valid address (inclusive).
+ * @pgd: page global directory (top first level of the directory tree).
+ * @lock: Share lock if spinlock_t does not fit in struct page.
+ * @shift: Array of address shift value of each level.
+ * @llevel: Last level.
+ *
+ * The index into each directory for a given address and level is :
+ *   (address >> 

[PATCH 6/7] HMM: add device page fault support.

2014-12-22 Thread j . glisse
From: Jérôme Glisse 

This patch adds helpers for device page faults. The device page fault helper
fills the mirror page table using the CPU page table, all of this synchronized
with any update to the CPU page table.
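
For illustration only (not part of the patch), a device driver fault handler
would typically translate a device fault into an hmm_event and hand it to
hmm_mirror_fault(). How much of the event the caller is expected to fill in is
an assumption of this sketch:

static int my_handle_device_fault(struct hmm_mirror *mirror,
				  unsigned long addr, bool write)
{
	struct hmm_event event = {
		.start = addr & PAGE_MASK,
		.end   = (addr & PAGE_MASK) + PAGE_SIZE,
		.etype = write ? HMM_DEVICE_WFAULT : HMM_DEVICE_RFAULT,
	};

	/*
	 * Fill the mirror page table from the CPU page table for the
	 * faulting range, synchronized with concurrent CPU updates.
	 */
	return hmm_mirror_fault(mirror, &event);
}
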

Signed-off-by: Jérôme Glisse 
Signed-off-by: Sherry Cheung 
Signed-off-by: Subhash Gutti 
Signed-off-by: Mark Hairgrove 
Signed-off-by: John Hubbard 
Signed-off-by: Jatin Kumar 
---
 include/linux/hmm.h |   1 +
 mm/hmm.c| 384 
 2 files changed, 385 insertions(+)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index dd34572..72e168b 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -259,6 +259,7 @@ struct hmm_mirror {
 
 int hmm_mirror_register(struct hmm_mirror *mirror, struct hmm_device *device);
 void hmm_mirror_unregister(struct hmm_mirror *mirror);
+int hmm_mirror_fault(struct hmm_mirror *mirror, struct hmm_event *event);
 
 
 #endif /* CONFIG_HMM */
diff --git a/mm/hmm.c b/mm/hmm.c
index 90ebe75..5fb7e19 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -55,6 +55,9 @@ static struct srcu_struct srcu;
  * @lock: Serialize the mirror list modifications.
  * @kref: Reference counter
  * @mmu_notifier: The mmu_notifier of this mm.
+ * @device_faults: List of all active device page faults.
+ * @ndevice_faults: Number of active device page faults.
+ * @wait_queue: Wait queue for event synchronization.
  *
  * For each process address space (mm_struct) there is one and only one hmm
  * struct. hmm functions will redispatch to each devices the change made to
@@ -67,6 +70,9 @@ struct hmm {
spinlock_t  lock;
struct kref kref;
struct mmu_notifier mmu_notifier;
+   struct list_headdevice_faults;
+   unsignedndevice_faults;
+   wait_queue_head_t   wait_queue;
 };
 
 static struct mmu_notifier_ops hmm_notifier_ops;
@@ -88,6 +94,11 @@ static void hmm_mirror_update_pt(struct hmm_mirror *mirror,
  * help dealing with all this.
  */
 
+static inline bool hmm_event_overlap(struct hmm_event *a, struct hmm_event *b)
+{
+   return !((a->end <= b->start) || (a->start >= b->end));
+}
+
 static inline int hmm_event_init(struct hmm_event *event,
 struct hmm *hmm,
 unsigned long start,
@@ -149,6 +160,9 @@ static int hmm_init(struct hmm *hmm)
hmm->vm_end = TASK_SIZE;
kref_init(&hmm->kref);
INIT_HLIST_HEAD(&hmm->mirrors);
+   INIT_LIST_HEAD(&hmm->device_faults);
+   hmm->ndevice_faults = 0;
+   init_waitqueue_head(&hmm->wait_queue);
spin_lock_init(&hmm->lock);
 
/* register notifier */
@@ -205,6 +219,60 @@ static inline struct hmm *hmm_unref(struct hmm *hmm)
return NULL;
 }
 
+static int hmm_device_fault_start(struct hmm *hmm, struct hmm_event *event)
+{
+   int ret = 0;
+
+   mmu_notifier_range_wait_valid(hmm->mm, event->start, event->end);
+
+   spin_lock(&hmm->lock);
+   if (mmu_notifier_range_is_valid(hmm->mm, event->start, event->end)) {
+   list_add_tail(&event->list, &hmm->device_faults);
+   hmm->ndevice_faults++;
+   event->backoff = false;
+   } else
+   ret = -EAGAIN;
+   spin_unlock(&hmm->lock);
+
+   wake_up(&hmm->wait_queue);
+
+   return ret;
+}
+
+static void hmm_device_fault_end(struct hmm *hmm, struct hmm_event *event)
+{
+   hmm_event_wait(event);
+
+   spin_lock(&hmm->lock);
+   list_del_init(&event->list);
+   hmm->ndevice_faults--;
+   spin_unlock(&hmm->lock);
+
+   wake_up(&hmm->wait_queue);
+}
+
+static void hmm_wait_device_fault(struct hmm *hmm, struct hmm_event *ievent)
+{
+   struct hmm_event *fevent;
+   unsigned long wait_for = 0;
+
+again:
+   spin_lock(&hmm->lock);
+   list_for_each_entry(fevent, &hmm->device_faults, list) {
+   if (!hmm_event_overlap(fevent, ievent))
+   continue;
+   fevent->backoff = true;
+   wait_for = hmm->ndevice_faults;
+   }
+   spin_unlock(&hmm->lock);
+
+   if (wait_for > 0) {
+   wait_event(hmm->wait_queue, wait_for != hmm->ndevice_faults);
+   wait_for = 0;
+   goto again;
+   }
+}
+
 static void hmm_update(struct hmm *hmm, struct hmm_event *event)
 {
struct hmm_mirror *mirror;
@@ -214,6 +282,8 @@ static void hmm_update(struct hmm *hmm, struct hmm_event 
*event)
if (hmm->mm->hmm != hmm)
return;
 
+   hmm_wait_device_fault(hmm, event);
+
id = srcu_read_lock(&srcu);
 
hlist_for_each_entry_rcu(mirror, &hmm->mirrors, mlist)
@@ -226,6 +296,35 @@ static void hmm_update(struct hmm *hmm, struct hmm_event 
*event)
hmm_mirror_update_pt(mirror, event);
 
srcu_read_unlock(&srcu, id);
+
+   wake_up(&hmm->wait_queue);
+}
+
+static int hmm_mm_fault(struct hmm *hmm,
+   struct hmm_event *event,
+   struct vm_area_struct *vma,
+   unsigned 

[PATCH 3/7] HMM: introduce heterogeneous memory management.

2014-12-22 Thread j . glisse
From: Jérôme Glisse 

This patch only introduces the core HMM functions for registering a new mirror
and stopping a mirror, as well as registering and unregistering a device.

The lifecycle of the HMM object is handled differently than that of the
mmu_notifier because, unlike mmu_notifier, there can be concurrent calls from
both mm code to HMM code and/or from device driver code to HMM code. Moreover
the lifetime of HMM can be uncorrelated from the lifetime of the process that
is being mirrored.
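
As a purely illustrative sketch (not part of the patch) of the registration
flow: a driver registers its hmm_device once, then one hmm_mirror per mirrored
address space. Embedding struct hmm_mirror in a driver object and the my_*
names are assumptions of this sketch:

static void my_release(struct hmm_mirror *mirror)
{
	/* Stop all device use of the address space, see ops->release() doc. */
}

static const struct hmm_device_ops my_hmm_ops = {
	.release = my_release,
};

static struct hmm_device my_hmm_device = {
	.name = "my-device",
	.ops  = &my_hmm_ops,
};

struct my_client {
	struct hmm_mirror mirror;	/* hypothetical per process driver object */
};

/* Once at driver load time. */
static int my_driver_init(void)
{
	return hmm_device_register(&my_hmm_device);
}

/* Once per process address space the device wants to mirror. */
static int my_bind_mm(struct my_client *client)
{
	return hmm_mirror_register(&client->mirror, &my_hmm_device);
}
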

Signed-off-by: Jérôme Glisse 
Signed-off-by: Sherry Cheung 
Signed-off-by: Subhash Gutti 
Signed-off-by: Mark Hairgrove 
Signed-off-by: John Hubbard 
Signed-off-by: Jatin Kumar 
---
 MAINTAINERS  |   7 +
 include/linux/hmm.h  | 129 
 include/linux/mm.h   |  11 ++
 include/linux/mm_types.h |  14 ++
 kernel/fork.c|   2 +
 mm/Kconfig   |  15 ++
 mm/Makefile  |   1 +
 mm/hmm.c | 374 +++
 8 files changed, 553 insertions(+)
 create mode 100644 include/linux/hmm.h
 create mode 100644 mm/hmm.c

diff --git a/MAINTAINERS b/MAINTAINERS
index a442d32..c5bb62e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4533,6 +4533,13 @@ F:   include/uapi/linux/if_hippi.h
 F: net/802/hippi.c
 F: drivers/net/hippi/
 
+HMM - Heterogeneous Memory Management
+M: Jérôme Glisse 
+L: linux...@kvack.org
+S: Maintained
+F: mm/hmm.c
+F: include/linux/hmm.h
+
 HOST AP DRIVER
 M: Jouni Malinen 
 L: hos...@shmoo.com (subscribers-only)
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
new file mode 100644
index 000..8eddc15
--- /dev/null
+++ b/include/linux/hmm.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright 2013 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Jérôme Glisse 
+ */
+/* This is heterogeneous memory management (hmm). In a nutshell it provides
+ * an API to mirror a process address space on a device which has its own mmu,
+ * using its own page table for the process. It supports everything except
+ * special vma.
+ *
+ * Mandatory hardware features :
+ *   - An mmu with pagetable.
+ *   - Read only flag per cpu page.
+ *   - Page fault ie hardware must stop and wait for kernel to service fault.
+ *
+ * Optional hardware features :
+ *   - Dirty bit per cpu page.
+ *   - Access bit per cpu page.
+ *
+ * The hmm code handles all the interfacing with the core kernel mm code and
+ * provides a simple API. It supports migrating system memory to device
+ * memory and handles migration back to system memory on cpu page fault.
+ *
+ * Migrated memory is considered as swapped from the cpu and core mm code
+ * point of view.
+ */
+#ifndef _HMM_H
+#define _HMM_H
+
+#ifdef CONFIG_HMM
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <linux/mm_types.h>
+#include <linux/mmu_notifier.h>
+#include <linux/workqueue.h>
+#include <linux/mman.h>
+
+
+struct hmm_device;
+struct hmm_mirror;
+struct hmm;
+
+
+/* hmm_device - Each device must register one and only one hmm_device.
+ *
+ * The hmm_device is the link btw HMM and each device driver.
+ */
+
+/* struct hmm_device_operations - HMM device operation callback
+ */
+struct hmm_device_ops {
+   /* release() - mirror must stop using the address space.
+*
+* @mirror: The mirror that link process address space with the device.
+*
+* This callback is called either on mm destruction or as a result of a
+* call to hmm_mirror_release(). The device driver has to stop all hw
+* threads and all usage of the address space, and it has to dirty all
+* pages that have been dirtied by the device. But it must not clear any
+* entry from the mirror page table.
+*/
+   void (*release)(struct hmm_mirror *mirror);
+};
+
+/* struct hmm_device - per device HMM structure
+ *
+ * @name: Device name (uniquely identify the device on the system).
+ * @ops: The hmm operations callback.
+ * @mirrors: List of all active mirrors for the device.
+ * @mutex: Mutex protecting mirrors list.
+ *
+ * Each device that want to mirror an address space must register one of this
+ * struct (only once per linux device).
+ */
+struct hmm_device {
+   const char  *name;
+   const struct hmm_device_ops *ops;
+   struct list_headmirrors;
+   struct mutexmutex;
+};
+
+int hmm_device_register(struct hmm_device *device);
+int hmm_device_unregister(struct hmm_device *device);
+
+
+/* hmm_mirror - device specific mirroring functions.
+ *
+ * Each device that mirror a process 

[PATCH 1/7] mmu_notifier: add event information to address invalidation v6

2014-12-22 Thread j . glisse
From: Jérôme Glisse 

The event information will be useful for new users of the mmu_notifier API.
The event argument differentiates between a vma disappearing, a page
being write protected or simply a page being unmapped. This allows new
users to take different paths for different events: for instance on unmap
the resources used to track a vma are still valid and should stay around,
while if the event says that a vma is being destroyed it means that any
resources used to track this vma can be freed.
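
For illustration (not part of the patch), a listener can now branch on the
event. Only MMU_MUNMAP and MMU_MIGRATE are taken from the changelog below, and
the my_* helpers are hypothetical:

static void my_invalidate_range_start(struct mmu_notifier *mn,
				      struct mm_struct *mm,
				      unsigned long start,
				      unsigned long end,
				      enum mmu_event event)
{
	if (event == MMU_MUNMAP) {
		/* The vma is going away: per vma tracking can be freed. */
		my_free_range_tracking(mn, start, end);
		return;
	}

	/*
	 * e.g. MMU_MIGRATE: the address range stays valid, only the pages
	 * backing it change, so keep the tracking and just invalidate.
	 */
	my_invalidate_device_mapping(mn, start, end);
}
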

Changed since v1:
  - renamed action into event (updated commit message too).
  - simplified the event names and clarified their usage
also documenting what expectations the listener can have with
respect to each event.

Changed since v2:
  - Avoid crazy name.
  - Do not move code that do not need to move.

Changed since v3:
  - Separate huge page split from mlock/munlock and softdirty.

Changed since v4:
  - Rebase (no other changes).

Changed since v5:
  - Typo fix.
  - Changed zap_page_range from MMU_MUNMAP to MMU_MIGRATE to reflect the
fact that the address range is still valid, just the pages backing it
are no longer.

Signed-off-by: Jérôme Glisse 
Reviewed-by: Rik van Riel 
---
 drivers/gpu/drm/i915/i915_gem_userptr.c |   3 +-
 drivers/gpu/drm/radeon/radeon_mn.c  |   3 +-
 drivers/infiniband/core/umem_odp.c  |   9 ++-
 drivers/iommu/amd_iommu_v2.c|   3 +-
 drivers/misc/sgi-gru/grutlbpurge.c  |   9 ++-
 drivers/xen/gntdev.c|   9 ++-
 fs/proc/task_mmu.c  |   6 +-
 include/linux/mmu_notifier.h| 131 ++--
 kernel/events/uprobes.c |  10 ++-
 mm/filemap_xip.c|   2 +-
 mm/huge_memory.c|  39 ++
 mm/hugetlb.c|  23 +++---
 mm/ksm.c|  18 +++--
 mm/madvise.c|   4 +-
 mm/memory.c |  27 ---
 mm/migrate.c|   9 ++-
 mm/mmu_notifier.c   |  28 ---
 mm/mprotect.c   |   6 +-
 mm/mremap.c |   6 +-
 mm/rmap.c   |  24 --
 virt/kvm/kvm_main.c |  12 ++-
 21 files changed, 274 insertions(+), 107 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c 
b/drivers/gpu/drm/i915/i915_gem_userptr.c
index d182058..20dbd26 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -129,7 +129,8 @@ restart:
 static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier 
*_mn,
   struct mm_struct *mm,
   unsigned long start,
-  unsigned long end)
+  unsigned long end,
+  enum mmu_event event)
 {
struct i915_mmu_notifier *mn = container_of(_mn, struct 
i915_mmu_notifier, mn);
struct interval_tree_node *it = NULL;
diff --git a/drivers/gpu/drm/radeon/radeon_mn.c 
b/drivers/gpu/drm/radeon/radeon_mn.c
index a69bd44..daf53d3 100644
--- a/drivers/gpu/drm/radeon/radeon_mn.c
+++ b/drivers/gpu/drm/radeon/radeon_mn.c
@@ -109,7 +109,8 @@ static void radeon_mn_release(struct mmu_notifier *mn,
 static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
 struct mm_struct *mm,
 unsigned long start,
-unsigned long end)
+unsigned long end,
+enum mmu_event event)
 {
struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn);
struct interval_tree_node *it;
diff --git a/drivers/infiniband/core/umem_odp.c 
b/drivers/infiniband/core/umem_odp.c
index 6095872..bc36e8c 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -165,7 +165,8 @@ static int invalidate_page_trampoline(struct ib_umem *item, 
u64 start,
 
 static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn,
 struct mm_struct *mm,
-unsigned long address)
+unsigned long address,
+enum mmu_event event)
 {
struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
 
@@ -192,7 +193,8 @@ static int invalidate_range_start_trampoline(struct ib_umem 
*item, u64 start,
 static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
struct mm_struct *mm,
   

HMM (Heterogeneous Memory Management) v7

2014-12-22 Thread j . glisse
So after PTO and before the end of year frenzy, here is an updated HMM
patchset. While not reusing Linus' page table design, I use something that is,
in my view at least, close to it. Also I avoid pretending that this will be
useful to others and move it to HMM specific code. There is a longer
justification on why we implement new page table code instead of reusing the
radix tree or another existing kernel structure as part of the commit message.

Everything else is pretty much the same, ie this patchset is just the ground
foundation on to which we want to build our features set. Main feature being
migrating memory to device memory. The very first version of this patchset
already show cased proof of concept of much of the features.

Below is the previous patchset cover letter, pretty much unchanged, as the
background and motivation for it did not change.


What it is ?

In a nutshell HMM is a subsystem that provides an easy to use API to mirror a
process address space on a device with minimal hardware requirements (mainly
device page faults and read only page mapping). This does not rely on the ATS
and PASID PCIE extensions. It intends to supersede those extensions by
allowing system memory to be moved to device memory in a fashion that is
transparent to core kernel mm code (ie a cpu page fault on a page residing in
device memory will trigger migration back to system memory).


Why doing this ?

We want to be able to mirror a process address space so that compute APIs such
as OpenCL or other similar APIs can start using the exact same address space
on the GPU as on the CPU. This will greatly simplify usage of those APIs.
Moreover we believe that we will see more and more specialized functional
units that will want to mirror a process address space using their own mmu.

The migration side is simply because GPU memory bandwidth is far beyond
system memory bandwidth and there is no sign that this gap is closing (quite
the opposite).


Current status and future features :

None of this core code changes core kernel mm code in any major way. This
is simple ground work with no impact on existing code paths. Features that
will be implemented on top of this are :
  1 - Transparently handle page mapping on behalf of device drivers (DMA).
  2 - Improve the DMA api to better match the new usage pattern of HMM.
  3 - Migration of anonymous memory to device memory.
  4 - Locking memory to remote memory (CPU access triggers SIGBUS).
  5 - Access exclusion between CPU and device for atomic operations.
  6 - Migration of file backed memory to device memory.


How future features will be implemented :
1 - Simply use the existing DMA api to map pages on behalf of a device.
2 - Introduce a new DMA api to match the new semantic of HMM. It is no longer
a page we map but an address range, and managing which page is effectively
backing an address should be easy to update. I gave a presentation about
that during this LPC.
3 - Requires changes to the cpu page fault code path to handle migration back
to system memory on cpu access. An implementation of this was already sent
as part of v1. This will be low impact and only adds a new special swap
type handling to the existing fault code.
4 - Requires a new syscall as I can not see which current syscall would be
appropriate for this. My first feeling was to use mbind as it has the
right semantic (binding a range of addresses to a device) but mbind is
too numa centric.

The second candidate was madvise, but the semantic does not match; madvise
allows the kernel to ignore the hint while we do want to block cpu access
for as long as the range is bound to a device.

So I do not think any existing syscall can be extended with new flags but
maybe I am wrong.
5 - Allow mapping a page as read only on the CPU while a device performs
some atomic operation on it (this is mainly to work around system buses
that do not support atomic memory access and sadly there is a large
base of hardware without that feature).

The easiest implementation would be to use some page flag but there is
none left. So it must be some flag in the vma to know if there is a need
to query HMM for write protection.

6 - This is the trickiest one to implement and while I showed a proof of
concept with v1, I still have a lot of conflicting feelings about how
to achieve this.


As usual comments are more than welcome. Thanks in advance to anyone who
takes a look at this code.

Previous patchset posting :
  v1 http://lwn.net/Articles/597289/
  v2 https://lkml.org/lkml/2014/6/12/559 (cover letter did not make it to ml)
  v3 https://lkml.org/lkml/2014/6/13/633
  v4 https://lkml.org/lkml/2014/8/29/423
  v5 https://lkml.org/lkml/2014/11/3/759
  v6 http://lwn.net/Articles/619737/

Cheers,
Jérôme

To: "Andrew Morton" ,
Cc: ,
Cc: linux-mm ,
Cc: ,
Cc: "Linus Torvalds" ,
Cc: "Mel Gorman" ,
Cc: "H. Peter Anvin" ,
Cc: "Peter Zijlstra" ,
Cc: "Linda Wang" ,
Cc: "Kevin E Martin" ,
Cc: "Jerome Glisse" ,
Cc: "Andrea Arcangeli" ,
Cc: "Johannes Weiner" ,
Cc: "Larry Woodman" ,
Cc: "Rik van Riel" ,

[PATCH 2/7] mmu_notifier: keep track of active invalidation ranges v2

2014-12-22 Thread j . glisse
From: Jérôme Glisse jgli...@redhat.com

The mmu_notifier_invalidate_range_start() and mmu_notifier_invalidate_range_end()
calls can be considered as forming an atomic section from the cpu page table
update point of view. Between these two functions the cpu page table content
is unreliable for the address range being invalidated.

Current users such as kvm need to know when they can trust the content of the
cpu page table. This becomes even more important for new users of the
mmu_notifier api (such as HMM or ODP).

This patch uses a structure, defined at every call site of
invalidate_range_start(), that is added to a list for the duration of the
invalidation. It adds two new helpers to allow querying whether a range is
being invalidated and waiting for a range to become valid.

For proper synchronization, users must block new range invalidations from
inside their invalidate_range_start() callback before calling the helper
functions. Otherwise there is no guarantee that a new range invalidation will
not be added after the call to the helper function that queries for existing
ranges.
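
As an illustrative sketch only (not part of the patch), modeled on how HMM
uses the two helpers later in this series: wait for the range to become valid
outside the lock, then re-check under the same lock that the caller's
invalidate_range_start() callback takes to block new invalidations. struct
my_mirror and its fields are hypothetical:

struct my_mirror {
	struct mm_struct *mm;
	spinlock_t lock;	/* also taken by our invalidate_range_start() */
	unsigned nfaults;	/* active faults our callback must wait for */
};

static int my_fault_start(struct my_mirror *m,
			  unsigned long start, unsigned long end)
{
	int ret = 0;

	/* Sleep until no invalidation covers [start, end). */
	mmu_notifier_range_wait_valid(m->mm, start, end);

	spin_lock(&m->lock);
	if (mmu_notifier_range_is_valid(m->mm, start, end))
		m->nfaults++;
	else
		ret = -EAGAIN;	/* a new invalidation raced in, retry */
	spin_unlock(&m->lock);

	return ret;
}
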

Changed since v1:
  - Fix a possible deadlock in mmu_notifier_range_wait_valid()

Signed-off-by: Jérôme Glisse jgli...@redhat.com
Reviewed-by: Rik van Riel r...@redhat.com
---
 drivers/gpu/drm/i915/i915_gem_userptr.c |  9 ++--
 drivers/gpu/drm/radeon/radeon_mn.c  | 14 ++
 drivers/infiniband/core/umem_odp.c  | 16 +++---
 drivers/misc/sgi-gru/grutlbpurge.c  | 15 +++---
 drivers/xen/gntdev.c| 15 +++---
 fs/proc/task_mmu.c  | 12 +++--
 include/linux/mmu_notifier.h| 60 +--
 kernel/events/uprobes.c | 13 +++--
 mm/huge_memory.c| 78 +
 mm/hugetlb.c| 55 +++--
 mm/ksm.c| 28 +--
 mm/madvise.c|  8 ++-
 mm/memory.c | 78 -
 mm/migrate.c| 36 +++---
 mm/mmu_notifier.c   | 87 -
 mm/mprotect.c   | 18 ---
 mm/mremap.c | 14 +++---
 mm/rmap.c   | 15 +++---
 virt/kvm/kvm_main.c | 10 ++--
 19 files changed, 322 insertions(+), 259 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c 
b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 20dbd26..a78eede 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -128,16 +128,15 @@ restart:
 
 static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier 
*_mn,
   struct mm_struct *mm,
-  unsigned long start,
-  unsigned long end,
-  enum mmu_event event)
+  const struct 
mmu_notifier_range *range)
 {
struct i915_mmu_notifier *mn = container_of(_mn, struct 
i915_mmu_notifier, mn);
struct interval_tree_node *it = NULL;
-   unsigned long next = start;
+   unsigned long next = range->start;
unsigned long serial = 0;
+   /* interval ranges are inclusive, but invalidate range is exclusive */
+   unsigned long end = range->end - 1, start = range->start;
 
-   end--; /* interval ranges are inclusive, but invalidate range is 
exclusive */
while (next < end) {
struct drm_i915_gem_object *obj = NULL;
 
diff --git a/drivers/gpu/drm/radeon/radeon_mn.c 
b/drivers/gpu/drm/radeon/radeon_mn.c
index daf53d3..63e6936 100644
--- a/drivers/gpu/drm/radeon/radeon_mn.c
+++ b/drivers/gpu/drm/radeon/radeon_mn.c
@@ -100,34 +100,30 @@ static void radeon_mn_release(struct mmu_notifier *mn,
  *
  * @mn: our notifier
  * @mn: the mm this callback is about
- * @start: start of updated range
- * @end: end of updated range
+ * @range: Address range information.
  *
  * We block for all BOs between start and end to be idle and
  * unmap them by move them into system domain again.
  */
 static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
 struct mm_struct *mm,
-unsigned long start,
-unsigned long end,
-enum mmu_event event)
+const struct mmu_notifier_range 
*range)
 {
struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn);
struct interval_tree_node *it;
-
/* notification is exclusive, but interval is inclusive */
-   end -= 1;
+   unsigned long end = range->end - 1;
 
	mutex_lock(&rmn->lock);
 
-   

[PATCH] scripts/package: binrpm-pkg do not build source tar file v4

2014-12-04 Thread j . glisse
From: Jérôme Glisse 

When doing make binrpm-pkg we only want to build the binary and header
packages, as the documentation of the binrpm-pkg target claims. Hence this
patch avoids building the source and devel packages. This makes the
binrpm-pkg target a lot faster and way more useful.

Changed since v3 :
  - Avoid creating /usr/src/kernels directory.
  - Avoid useless pre clean step of source and devel package.

Changed since v2 :
  - Avoid symlinks in /usr/src/kernel.
  - Update subject line and changelog.

Changed since v1 :
  - Avoid building the devel package too.

Signed-off-by: Jérôme Glisse 
Cc: Michal Marek 
---
 scripts/package/mkspec | 4 
 1 file changed, 4 insertions(+)

diff --git a/scripts/package/mkspec b/scripts/package/mkspec
index 1395760..d9ab94b 100755
--- a/scripts/package/mkspec
+++ b/scripts/package/mkspec
@@ -117,6 +117,7 @@ echo 'mv vmlinux.bz2 
$RPM_BUILD_ROOT'"/boot/vmlinux-$KERNELRELEASE.bz2"
 echo 'mv vmlinux.orig vmlinux'
 echo "%endif"
 
+if ! $PREBUILT; then
 echo 'rm -f $RPM_BUILD_ROOT'"/lib/modules/$KERNELRELEASE/{build,source}"
 echo "mkdir -p "'$RPM_BUILD_ROOT'"/usr/src/kernels/$KERNELRELEASE"
 echo "EXCLUDES=\"$RCS_TAR_IGNORE --exclude .tmp_versions --exclude=*vmlinux* 
--exclude=*.o --exclude=*.ko --exclude=*.cmd --exclude=Documentation 
--exclude=firmware --exclude .config.old --exclude .missing-syscalls.d\""
@@ -124,6 +125,7 @@ echo "tar "'$EXCLUDES'" -cf- . | (cd 
"'$RPM_BUILD_ROOT'"/usr/src/kernels/$KERNEL
 echo 'cd $RPM_BUILD_ROOT'"/lib/modules/$KERNELRELEASE"
 echo "ln -sf /usr/src/kernels/$KERNELRELEASE build"
 echo "ln -sf /usr/src/kernels/$KERNELRELEASE source"
+fi
 
 echo ""
 echo "%clean"
@@ -151,9 +153,11 @@ echo "%files headers"
 echo '%defattr (-, root, root)'
 echo "/usr/include"
 echo ""
+if ! $PREBUILT; then
 echo "%files devel"
 echo '%defattr (-, root, root)'
 echo "/usr/src/kernels/$KERNELRELEASE"
 echo "/lib/modules/$KERNELRELEASE/build"
 echo "/lib/modules/$KERNELRELEASE/source"
 echo ""
+fi
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] scripts/package: binrpm-pkg do not create source and devel package v3

2014-12-02 Thread j . glisse
From: Jérôme Glisse 

When doing make binrpm-pkg we only want to build the binary and header
packages, as the documentation of the binrpm-pkg target claims. Hence this
patch avoids building the source and devel packages. This makes the
binrpm-pkg target a lot faster and way more useful.

Changed since v2 :
  - Avoid symlinks in /usr/src/kernel.
  - Update subject line and changelog.

Changed since v1 :
  - Avoid building the devel package too.

Signed-off-by: Jérôme Glisse 
---
 scripts/package/mkspec | 4 
 1 file changed, 4 insertions(+)

diff --git a/scripts/package/mkspec b/scripts/package/mkspec
index 1395760..010653c 100755
--- a/scripts/package/mkspec
+++ b/scripts/package/mkspec
@@ -119,11 +119,13 @@ echo "%endif"
 
 echo 'rm -f $RPM_BUILD_ROOT'"/lib/modules/$KERNELRELEASE/{build,source}"
 echo "mkdir -p "'$RPM_BUILD_ROOT'"/usr/src/kernels/$KERNELRELEASE"
+if ! $PREBUILT; then
 echo "EXCLUDES=\"$RCS_TAR_IGNORE --exclude .tmp_versions --exclude=*vmlinux* 
--exclude=*.o --exclude=*.ko --exclude=*.cmd --exclude=Documentation 
--exclude=firmware --exclude .config.old --exclude .missing-syscalls.d\""
 echo "tar "'$EXCLUDES'" -cf- . | (cd 
"'$RPM_BUILD_ROOT'"/usr/src/kernels/$KERNELRELEASE;tar xvf -)"
 echo 'cd $RPM_BUILD_ROOT'"/lib/modules/$KERNELRELEASE"
 echo "ln -sf /usr/src/kernels/$KERNELRELEASE build"
 echo "ln -sf /usr/src/kernels/$KERNELRELEASE source"
+fi
 
 echo ""
 echo "%clean"
@@ -151,9 +153,11 @@ echo "%files headers"
 echo '%defattr (-, root, root)'
 echo "/usr/include"
 echo ""
+if ! $PREBUILT; then
 echo "%files devel"
 echo '%defattr (-, root, root)'
 echo "/usr/src/kernels/$KERNELRELEASE"
 echo "/lib/modules/$KERNELRELEASE/build"
 echo "/lib/modules/$KERNELRELEASE/source"
 echo ""
+fi
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] scripts/package: binrpm-pkg do not build source tar file v2

2014-12-01 Thread j . glisse
From: Jérôme Glisse 

When doing make binrpm-pkg we do not want to build the source tar
file. This patch avoids creating that gigantic tar file.

Changed since v1
  - Avoid building the devel package too.

Signed-off-by: Jérôme Glisse 
---
 scripts/package/mkspec | 4 
 1 file changed, 4 insertions(+)

diff --git a/scripts/package/mkspec b/scripts/package/mkspec
index 1395760..31b7b7f 100755
--- a/scripts/package/mkspec
+++ b/scripts/package/mkspec
@@ -119,8 +119,10 @@ echo "%endif"
 
 echo 'rm -f $RPM_BUILD_ROOT'"/lib/modules/$KERNELRELEASE/{build,source}"
 echo "mkdir -p "'$RPM_BUILD_ROOT'"/usr/src/kernels/$KERNELRELEASE"
+if ! $PREBUILT; then
 echo "EXCLUDES=\"$RCS_TAR_IGNORE --exclude .tmp_versions --exclude=*vmlinux* 
--exclude=*.o --exclude=*.ko --exclude=*.cmd --exclude=Documentation 
--exclude=firmware --exclude .config.old --exclude .missing-syscalls.d\""
 echo "tar "'$EXCLUDES'" -cf- . | (cd 
"'$RPM_BUILD_ROOT'"/usr/src/kernels/$KERNELRELEASE;tar xvf -)"
+fi
 echo 'cd $RPM_BUILD_ROOT'"/lib/modules/$KERNELRELEASE"
 echo "ln -sf /usr/src/kernels/$KERNELRELEASE build"
 echo "ln -sf /usr/src/kernels/$KERNELRELEASE source"
@@ -151,9 +153,11 @@ echo "%files headers"
 echo '%defattr (-, root, root)'
 echo "/usr/include"
 echo ""
+if ! $PREBUILT; then
 echo "%files devel"
 echo '%defattr (-, root, root)'
 echo "/usr/src/kernels/$KERNELRELEASE"
 echo "/lib/modules/$KERNELRELEASE/build"
 echo "/lib/modules/$KERNELRELEASE/source"
 echo ""
+fi
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3/5] lib: lockless generic and arch independent page table (gpt) v2.

2014-11-10 Thread j . glisse
From: Jérôme Glisse 

Page table is a common structure format, most notably used by the cpu mmu. The
arch-dependent page table code has strong ties to the architecture, which makes
it unsuitable for use by other, non arch specific code.

This patch implements a generic and arch independent page table. It is generic
in the sense that the entry size can be u64 or unsigned long (or u32 too on
32bit arch).

It is lockless in the sense that at any point in time you can have concurrent
threads updating the page table (removing or changing entries) and faulting in
the page table (adding new entries). This is achieved by enforcing each updater
and each faulter to take a range lock. There is no exclusion on range locks,
ie several threads can fault or update the same range concurrently, and it is
the responsibility of the user to synchronize updates to the page table entries
(pte); updates to the page table directories (pdp) are under gpt responsibility.

API usage pattern is :
  gpt_init()

  gpt_lock_update(lock_range)
  // User can update pte for instance by using atomic bit operation
  // allowing complete lockless update.
  gpt_unlock_update(lock_range)

  gpt_lock_fault(lock_range)
  // User can fault in pte but is responsible for preventing threads
  // from concurrently faulting the same pte and for properly accounting
  // the number of pte faulted in the pdp structure.
  gpt_unlock_fault(lock_range)
  // The newly faulted pte will only be visible to other updaters
  // once all concurrent faulters on the address unlock.

Details on how the lockless concurrent updaters and faulters work are provided
in the header file.
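
As a purely illustrative sketch of the updater side (gpt_lock_update(),
gpt_unlock_update() and gpt_walk() are named in this mail, but their exact
signatures, the struct gpt / struct gpt_lock types and MY_PTE_DIRTY_BIT are
all assumptions made for this example):

  /* Clear a software dirty bit over a range; pte updates stay lockless. */
  static void my_gpt_clear_dirty(struct gpt *gpt,
                                 unsigned long start, unsigned long end)
  {
          struct gpt_lock lock;                   /* assumed range lock type */
          unsigned long addr;

          gpt_lock_update(gpt, &lock, start, end);        /* assumed signature */
          for (addr = start; addr < end; addr += PAGE_SIZE) {
                  unsigned long *pte = gpt_walk(gpt, &lock, addr); /* assumed */

                  if (pte)
                          clear_bit(MY_PTE_DIRTY_BIT, pte);  /* atomic bit op */
          }
          gpt_unlock_update(gpt, &lock);
  }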

Changed since v1:
  - Switch to macro implementation instead of using arithmetic to accommodate
  the various sizes for table entries (uint64_t, unsigned long, ...).
  This is somewhat less flexible but right now there is no use for the extra
  flexibility v1 was offering.

Signed-off-by: Jérôme Glisse 
Acked-by: Rik van Riel 
---
 include/linux/gpt.h | 340 +++
 lib/Kconfig |   3 +
 lib/Makefile|   2 +
 lib/gpt.c   | 202 
 lib/gpt_generic.h   | 663 
 5 files changed, 1210 insertions(+)
 create mode 100644 include/linux/gpt.h
 create mode 100644 lib/gpt.c
 create mode 100644 lib/gpt_generic.h

diff --git a/include/linux/gpt.h b/include/linux/gpt.h
new file mode 100644
index 000..3c28634
--- /dev/null
+++ b/include/linux/gpt.h
@@ -0,0 +1,340 @@
+/*
+ * Copyright 2014 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Jérôme Glisse 
+ */
+/*
+ * High level overview
+ * ---
+ *
+ * This is a generic arch independant page table implementation with lockless
+ * (allmost lockless) access. The content of the page table ie the page table
+ * entry, are not protected by the gpt helper, it is up to the code using gpt
+ * to protect the page table entry from concurrent update with no restriction
+ * on the mechanism (can be atomic or can sleep).
+ *
+ * The gpt code only deals with protecting the page directory tree structure.
+ * Which is done in a lockless way. Concurrent threads can read and or write
+ * overlapping range of the gpt. There can also be concurrent insertion and
+ * removal of page directory (insertion or removal of page table level).
+ *
+ * While removal of page directory is completely lockless, insertion of new
+ * page directory still require a lock (to avoid double insertion). If the
+ * architecture have a spinlock in its page struct then several threads can
+ * concurrently insert new directory (level) as long as they are inserting into
+ * different page directory. Otherwise insertion will serialize using a common
+ * spinlock. Note that insertion in this context only refer to inserting page
+ * directory, it does not deal about page table entry insertion and again this
+ * is the responsability of gpt user to properly synchronize those.
+ *
+ *
+ * Each gpt access must be done under gpt lock protection by calling gpt_lock()
+ * with a lock structure. Once a range is "locked" with gpt_lock() all access
+ * can be done in lockless fashion, using either gpt_walk or gpt_iter helpers.
+ * Note however that only directory that are considered as established will be
+ * considered ie if a thread is concurently inserting a new directory in the
+ * locked range then this directory will be ignore by gpt_walk or gpt_iter.
+ *
+ * This restriction comes from the lockless design. Some thread can hold a gpt
+ * 

[PATCH 1/5] mmu_notifier: add event information to address invalidation v6

2014-11-10 Thread j . glisse
From: Jérôme Glisse 

The event information will be useful for new users of the mmu_notifier API.
The event argument differentiates between a vma disappearing, a page
being write protected or simply a page being unmapped. This allows new
users to take different paths for different events: for instance on unmap
the resources used to track a vma are still valid and should stay around,
while if the event says that a vma is being destroyed it means that any
resources used to track this vma can be freed.
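
A hedged sketch of how a listener might act on the event (the callback
signature matches the diffs below; MMU_MUNMAP and MMU_MIGRATE are the only
event values named in this mail, and the two my_*() helpers are hypothetical):

  static void my_invalidate_range_start(struct mmu_notifier *mn,
                                        struct mm_struct *mm,
                                        unsigned long start,
                                        unsigned long end,
                                        enum mmu_event event)
  {
          switch (event) {
          case MMU_MUNMAP:
                  /* The vma is going away: per-vma tracking can be freed. */
                  my_free_range_tracking(mn, start, end);
                  break;
          case MMU_MIGRATE:
          default:
                  /* The range stays valid, only the backing pages change:
                   * unmap on the device but keep the tracking around. */
                  my_unmap_device_range(mn, start, end);
                  break;
          }
  }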

Changed since v1:
  - renamed action into event (updated commit message too).
  - simplified the event names and clarified their intended usage,
also documenting what expectations the listener can have with
respect to each event.

Changed since v2:
  - Avoid crazy name.
  - Do not move code that does not need to move.

Changed since v3:
  - Separate huge page split from mlock/munlock and softdirty.

Changed since v4:
  - Rebase (no other changes).

Changed since v5:
  - Typo fix.
  - Changed zap_page_range from MMU_MUNMAP to MMU_MIGRATE to reflect the
fact that the address range is still valid, just the pages backing it
are no longer.

Signed-off-by: Jérôme Glisse 
Reviewed-by: Rik van Riel 
---
 drivers/gpu/drm/i915/i915_gem_userptr.c |   3 +-
 drivers/iommu/amd_iommu_v2.c|  11 ++-
 drivers/misc/sgi-gru/grutlbpurge.c  |   9 ++-
 drivers/xen/gntdev.c|   9 ++-
 fs/proc/task_mmu.c  |   6 +-
 include/linux/mmu_notifier.h| 131 ++--
 kernel/events/uprobes.c |  10 ++-
 mm/filemap_xip.c|   2 +-
 mm/huge_memory.c|  39 ++
 mm/hugetlb.c|  23 +++---
 mm/ksm.c|  18 +++--
 mm/memory.c |  27 ---
 mm/migrate.c|   9 ++-
 mm/mmu_notifier.c   |  28 ---
 mm/mprotect.c   |   5 +-
 mm/mremap.c |   6 +-
 mm/rmap.c   |  24 --
 virt/kvm/kvm_main.c |  12 ++-
 18 files changed, 269 insertions(+), 103 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c 
b/drivers/gpu/drm/i915/i915_gem_userptr.c
index d182058..20dbd26 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -129,7 +129,8 @@ restart:
 static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier 
*_mn,
   struct mm_struct *mm,
   unsigned long start,
-  unsigned long end)
+  unsigned long end,
+  enum mmu_event event)
 {
struct i915_mmu_notifier *mn = container_of(_mn, struct 
i915_mmu_notifier, mn);
struct interval_tree_node *it = NULL;
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
index 90d734b..57d2acf 100644
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -413,14 +413,17 @@ static int mn_clear_flush_young(struct mmu_notifier *mn,
 
 static void mn_invalidate_page(struct mmu_notifier *mn,
   struct mm_struct *mm,
-  unsigned long address)
+  unsigned long address,
+  enum mmu_event event)
 {
__mn_flush_page(mn, address);
 }
 
 static void mn_invalidate_range_start(struct mmu_notifier *mn,
  struct mm_struct *mm,
- unsigned long start, unsigned long end)
+ unsigned long start,
+ unsigned long end,
+ enum mmu_event event)
 {
struct pasid_state *pasid_state;
struct device_state *dev_state;
@@ -441,7 +444,9 @@ static void mn_invalidate_range_start(struct mmu_notifier 
*mn,
 
 static void mn_invalidate_range_end(struct mmu_notifier *mn,
struct mm_struct *mm,
-   unsigned long start, unsigned long end)
+   unsigned long start,
+   unsigned long end,
+   enum mmu_event event)
 {
struct pasid_state *pasid_state;
struct device_state *dev_state;
diff --git a/drivers/misc/sgi-gru/grutlbpurge.c 
b/drivers/misc/sgi-gru/grutlbpurge.c
index 2129274..e67fed1 100644
--- a/drivers/misc/sgi-gru/grutlbpurge.c
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -221,7 +221,8 @@ void gru_flush_all_tlb(struct gru_state *gru)
  */
 static void gru_invalidate_range_start(struct mmu_notifier *mn,
   

[PATCH 4/5] hmm: heterogeneous memory management v6

2014-11-10 Thread j . glisse
From: Jérôme Glisse 

Motivation:

Heterogeneous memory management is intended to allow a device to transparently
access a process address space without having to lock pages of the process or
take references on them. In other words, it means mirroring a process address
space while allowing the regular memory management events, such as page
reclamation or page migration, to happen seamlessly.

Recent years have seen a surge in the number of specialized devices that are
part of a computer platform (from desktop to phone). So far each of those
devices has operated on its own private address space that is not linked or
exposed to the process address space that is using them. This separation often
leads to multiple memory copies happening between the device owned memory and
the process memory. This of course is a waste of both cpu cycles and memory.

Over the last few years most of those devices have gained a full mmu allowing
them to support multiple page tables, page faults and other features that are
found inside a cpu mmu. There is now a strong incentive to start leveraging the
capabilities of such devices and to start sharing the process address space, to
avoid any unnecessary memory copy as well as to simplify the programming model
of those devices by sharing a unique and common address space with the process
that uses them.

The aim of heterogeneous memory management is to provide a common API that can
be used by any such device in order to mirror a process address space. The hmm
code provides a unique entry point and interfaces itself with the core mm code
of the linux kernel, avoiding duplicate implementations and shielding device
driver code from core mm code.

Moreover, hmm also intends to provide support for migrating memory to device
private memory, allowing the device to work on its own fast local memory. The
hmm code would be responsible for intercepting cpu page faults on a migrated
range and migrating it back to system memory, allowing the cpu to resume its
access to the memory.

Another feature hmm intends to provide is support for atomic operations from
the device even if the bus linking the device and the cpu does not have any
such capability. On such hardware an atomic operation requires the page to be
mapped only on the device or only on the cpu, but not both at the same time.

We expect graphics processing units and network interfaces to be among the
first users of such an api.

Hardware requirement:

Because hmm is intended to be used by device drivers there are minimum feature
requirements for the hardware mmu :
  - hardware has its own page table per process (can be shared btw != devices)
  - hardware mmu supports page faults and suspends execution until the page
fault is serviced by hmm code. The page fault must also trigger some form of
interrupt so that hmm code can be called by the device driver.
  - hardware must support at least read only mapping (otherwise it can not
access read only ranges of the process address space).
  - hardware access to system memory must be cache coherent with the cpu.

For better memory management it is highly recommended that the device also
support the following features :
  - hardware mmu sets the access bit in its page table on memory access (like
the cpu).
  - hardware page table can be updated from the cpu or through a fast path.
  - hardware provides advanced statistics over which ranges of memory it
accesses the most.
  - hardware differentiates atomic memory access from regular access, allowing
atomic operations to be supported even on platforms that do not have atomic
support on the bus linking the device with the cpu.

Implementation:

The hmm layer provides a simple API to the device driver. Each device driver
has to register an hmm device that holds pointers to all the callbacks the hmm
code will make to synchronize the device page table with the cpu page table of
a given process.

For each process it wants to mirror, the device driver must register a mirror
hmm structure that holds all the information specific to the process being
mirrored. Each hmm mirror uniquely links an hmm device with a process address
space (the mm struct).

This design allows several different device drivers to mirror the same process
concurrently. The hmm layer will dispatch appropriately to each device driver
the modifications that are happening to the process address space.
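
Purely as an illustration of that registration model (the structure layouts,
callback names and registration helpers below are assumptions drawn from this
description, not the actual hmm API of this patch):

  /* One hmm device per driver, holding the synchronization callbacks. */
  static const struct hmm_device_ops my_hmm_ops = {       /* assumed type */
          .update = my_update_device_page_table,          /* hypothetical */
  };

  static struct hmm_device my_hmm_device = {              /* assumed layout */
          .ops = &my_hmm_ops,
  };

  /* One hmm mirror per (device, process address space) pair. */
  static int my_mirror_current_process(struct my_context *ctx)
  {
          ctx->mirror.device = &my_hmm_device;            /* assumed field */
          return hmm_mirror_register(&ctx->mirror, current->mm); /* assumed */
  }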

The hmm layer relies on the mmu notifier api to monitor changes to the process
address space. Because updates to the device page table can have unbound
completion time, the hmm layer needs the capability to sleep during mmu
notifier callbacks.

This patch only implements the core of the hmm layer and does not support
features such as migration to device memory.

Changed since v1:
  - convert fence to refcounted object
  - change the api to provide pte value directly avoiding useless temporary
special hmm pfn value
  - cleanups & fixes ...

Changed since v2:
  - fixed checkpatch.pl warnings & errors
  - converted to a staging feature

Changed since v3:
  - Use mmput notifier chain instead of 

[PATCH 2/5] mmu_notifier: keep track of active invalidation ranges v2

2014-11-10 Thread j . glisse
From: Jérôme Glisse 

The mmu_notifier_invalidate_range_start() and
mmu_notifier_invalidate_range_end()
can be considered as forming an "atomic" section from the cpu page table update
point of view. Between these two functions the cpu page table content is
unreliable for the address range being invalidated.

Current users such as kvm need to know when they can trust the content of the
cpu page table. This becomes even more important for new users of the
mmu_notifier api (such as HMM or ODP).

This patch uses a structure, defined at all call sites of
invalidate_range_start(), that is added to a list for the duration of the
invalidation. It adds two new helpers to allow querying if a range is being
invalidated or to wait for a range to become valid.
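
A hedged sketch of the call-site pattern this describes (the start/end fields
follow the diffs below, while the event field and the wrapper arguments are
assumptions made only for illustration):

  /* Illustrative call site bracketing a cpu page table update. */
  static void my_zap_one_page(struct mm_struct *mm, unsigned long addr)
  {
          struct mmu_notifier_range range;

          range.start = addr & PAGE_MASK;
          range.end = range.start + PAGE_SIZE;
          range.event = MMU_MIGRATE;      /* event from patch 1/5; assumed field */

          mmu_notifier_invalidate_range_start(mm, &range);
          /* ... update the cpu page table for [range.start, range.end) ... */
          mmu_notifier_invalidate_range_end(mm, &range);
  }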

For proper synchronization, users must block new range invalidations from
inside their invalidate_range_start() callback before calling the helper
functions. Otherwise there is no guarantee that a new range invalidation will
not be added after the helper function has been called to query for existing
ranges.

Changed since v1:
  - Fix a possible deadlock in mmu_notifier_range_wait_valid()

Signed-off-by: Jérôme Glisse 
Reviewed-by: Rik van Riel 
---
 drivers/gpu/drm/i915/i915_gem_userptr.c | 13 +++--
 drivers/iommu/amd_iommu_v2.c|  8 +--
 drivers/misc/sgi-gru/grutlbpurge.c  | 15 +++---
 drivers/xen/gntdev.c| 15 +++---
 fs/proc/task_mmu.c  | 12 +++--
 include/linux/mmu_notifier.h| 60 --
 kernel/events/uprobes.c | 13 +++--
 mm/huge_memory.c| 78 +
 mm/hugetlb.c| 55 +++--
 mm/ksm.c| 28 +--
 mm/memory.c | 78 -
 mm/migrate.c| 36 +++---
 mm/mmu_notifier.c   | 88 -
 mm/mprotect.c   | 17 ---
 mm/mremap.c | 14 +++---
 mm/rmap.c   | 15 +++---
 virt/kvm/kvm_main.c | 10 ++--
 17 files changed, 310 insertions(+), 245 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c 
b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 20dbd26..10b0044 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -128,26 +128,25 @@ restart:
 
 static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier 
*_mn,
   struct mm_struct *mm,
-  unsigned long start,
-  unsigned long end,
-  enum mmu_event event)
+  const struct 
mmu_notifier_range *range)
 {
struct i915_mmu_notifier *mn = container_of(_mn, struct 
i915_mmu_notifier, mn);
struct interval_tree_node *it = NULL;
-   unsigned long next = start;
+   unsigned long next = range->start;
unsigned long serial = 0;
+   /* interval ranges are inclusive, but invalidate range is exclusive */
+   unsigned long end = range->end - 1;
 
-   end--; /* interval ranges are inclusive, but invalidate range is 
exclusive */
while (next < end) {
struct drm_i915_gem_object *obj = NULL;
 
spin_lock(&mn->lock);
if (mn->has_linear)
-   it = invalidate_range__linear(mn, mm, start, end);
+   it = invalidate_range__linear(mn, mm, range->start, 
end);
else if (serial == mn->serial)
it = interval_tree_iter_next(it, next, end);
else
-   it = interval_tree_iter_first(&mn->objects, start, end);
+   it = interval_tree_iter_first(&mn->objects, 
range->start, end);
if (it != NULL) {
obj = container_of(it, struct i915_mmu_object, it)->obj;
drm_gem_object_reference(&obj->base);
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
index 57d2acf..9b7f32d 100644
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -421,9 +421,7 @@ static void mn_invalidate_page(struct mmu_notifier *mn,
 
 static void mn_invalidate_range_start(struct mmu_notifier *mn,
  struct mm_struct *mm,
- unsigned long start,
- unsigned long end,
- enum mmu_event event)
+ const struct mmu_notifier_range *range)
 {
struct pasid_state *pasid_state;
struct device_state *dev_state;
@@ -444,9 +442,7 @@ static void 

HMM (heterogeneous memory management) v6

2014-11-10 Thread j . glisse
Andrew, so resending with review and ack from Rik and a couple of minor fixes
along the way. Is there anything blocking this from getting into the next
kernel? Again hardware is coming and there is still a long list of features
waiting on this core set of patches getting in. I reinclude part of my previous
email below.


What it is ?

In a nutshell HMM is a subsystem that provides an easy to use api to mirror a
process address space on a device with minimal hardware requirements (mainly
device page fault and read only page mapping). This does not rely on the ATS
and PASID PCIE extensions. It intends to supersede those extensions by allowing
system memory to be moved to device memory in a transparent fashion for core
kernel mm code (ie a cpu page fault on a page residing in device memory will
trigger migration back to system memory).


Why doing this ?

We want to be able to mirror a process address space so that compute apis such
as OpenCL or other similar apis can start using the exact same address space on
the GPU as on the CPU. This will greatly simplify usage of those apis. Moreover
we believe that we will see more and more specialized units that will want to
mirror a process address space using their own mmu.

The migration side is simply because GPU memory bandwidth is far beyond system
memory bandwidth and there is no sign that this gap is closing (quite the
opposite).


Current status and future features :

None of this core code changes core kernel mm code in any major way. This
is simple ground work with no impact on existing code paths. Features that
will be implemented on top of this are :
  1 - Transparently handle page mapping on behalf of device driver (DMA).
  2 - Improve DMA api to better match new usage pattern of HMM.
  3 - Migration of anonymous memory to device memory.
  4 - Locking memory to remote memory (CPU access triggers SIGBUS).
  5 - Access exclusion btw CPU and device for atomic operations.
  6 - Migration of file backed memory to device memory.


How future features will be implemented :
1 - Simply use the existing DMA api to map pages on behalf of a device.
2 - Introduce a new DMA api to match the new semantic of HMM. It is no longer a
page we map but an address range, and managing which page is effectively
backing an address should be easy to update. I gave a presentation about
that during this LPC.
3 - Requires changes to the cpu page fault code path to handle migration back
to system memory on cpu access. An implementation of this was already sent
as part of v1. This will be low impact and only adds a new special swap
type handling to the existing fault code.
4 - Requires a new syscall as I can not see which current syscall would be
appropriate for this. My first feeling was to use mbind as it has the
right semantic (binding a range of addresses to a device) but mbind is
too numa centric.

Second one was madvise, but the semantic does not match: madvise allows the
kernel to ignore the hint while we do want to block cpu access for as long
as the range is bound to a device.

So I do not think any existing syscall can be extended with new flags
but maybe I am wrong.
5 - Allowing to map a page as read only on the CPU while a device performs
some atomic operation on it (this is mainly to work around system buses
that do not support atomic memory access and sadly there is a large
base of hardware without that feature).

Easiest implementation would be using some page flags but there are none
left. So it must be some flag in the vma to know if there is a need to query
HMM for write protection.

6 - This is the trickiest one to implement and while I showed a proof of
concept with v1, I still have a lot of conflicting feelings about how
to achieve this.


As usual comments are more than welcome. Thanks in advance to anyone that
takes a look at this code.

Previous patchset posting :
  v1 http://lwn.net/Articles/597289/
  v2 https://lkml.org/lkml/2014/6/12/559 (cover letter did not make it to ml)
  v3 https://lkml.org/lkml/2014/6/13/633
  v4 https://lkml.org/lkml/2014/8/29/423
  v5 https://lkml.org/lkml/2014/11/3/759

Cheers,
Jérôme

To: "Andrew Morton" ,
Cc: ,
Cc: linux-mm ,
Cc: ,
Cc: "Linus Torvalds" ,
Cc: "Mel Gorman" ,
Cc: "H. Peter Anvin" ,
Cc: "Peter Zijlstra" ,
Cc: "Linda Wang" ,
Cc: "Kevin E Martin" ,
Cc: "Jerome Glisse" ,
Cc: "Andrea Arcangeli" ,
Cc: "Johannes Weiner" ,
Cc: "Larry Woodman" ,
Cc: "Rik van Riel" ,
Cc: "Dave Airlie" ,
Cc: "Jeff Law" ,
Cc: "Brendan Conoboy" ,
Cc: "Joe Donohue" ,
Cc: "Duncan Poole" ,
Cc: "Sherry Cheung" ,
Cc: "Subhash Gutti" ,
Cc: "John Hubbard" ,
Cc: "Mark Hairgrove" ,
Cc: "Lucien Dunning" ,
Cc: "Cameron Buschardt" ,
Cc: "Arvind Gopalakrishnan" ,
Cc: "Haggai Eran" ,
Cc: "Or Gerlitz" ,
Cc: "Sagi Grimberg" 
Cc: "Shachar Raindel" ,
Cc: "Liran Liss" ,
Cc: "Roland Dreier" ,
Cc: "Sander, Ben" ,
Cc: "Stoner, Greg" ,
Cc: "Bridgman, John" ,
Cc: "Mantor, Michael" ,
Cc: "Blinzer, Paul" ,
Cc: "Morichetti, Laurent" ,
Cc: "Deucher, Alexander" 

[PATCH 4/5] hmm: heterogeneous memory management v6

2014-11-03 Thread j . glisse
From: Jérôme Glisse 

Motivation:

Heterogeneous memory management is intended to allow a device to transparently
access a process address space without having to lock pages of the process or
take references on them. In other words, it mirrors a process address space
while allowing the regular memory management events, such as page reclamation
or page migration, to happen seamlessly.

Recent years have seen a surge in the number of specialized devices that are
part of a computer platform (from desktop to phone). So far each of those
devices has operated on its own private address space that is neither linked
nor exposed to the address space of the process using it. This separation often
leads to multiple memory copies between the device-owned memory and the
process memory, which is of course a waste of both cpu cycles and memory.

Over the last few years most of those devices have gained a full mmu, allowing
them to support multiple page tables, page faults and other features that are
found in a cpu mmu. There is now a strong incentive to start leveraging the
capabilities of such devices and to start sharing the process address space, to
avoid any unnecessary memory copy as well as to simplify the programming model
of those devices by sharing a single, common address space with the process
that uses them.

The aim of heterogeneous memory management is to provide a common API that
can be used by any such device in order to mirror a process address space. The
hmm code provides a single entry point and interfaces itself with the core mm
code of the linux kernel, avoiding duplicate implementations and shielding
device driver code from core mm code.

Moreover, hmm also intends to provide support for migrating memory to device
private memory, allowing the device to work on its own fast local memory. The
hmm code would be responsible for intercepting cpu page faults on a migrated
range and migrating it back to system memory, allowing the cpu to resume its
access to the memory.

Another feature hmm intends to provide is support for atomic operations from
the device even if the bus linking the device and the cpu does not have such
a capability. On such hardware an atomic operation requires the page to be
mapped only on the device or only on the cpu, but not on both at the same time.

We expect graphics processing units and network interfaces to be among the
first users of such an api.

Hardware requirement:

Because hmm is intended to be used by device drivers, there are minimum feature
requirements for the hardware mmu:
  - hardware has its own page table per process (it can be shared between
devices)
  - hardware mmu supports page faults and suspends execution until the page
fault is serviced by hmm code. The page fault must also trigger some form
of interrupt so that hmm code can be called by the device driver.
  - hardware must support at least read only mappings (otherwise it can not
access read only ranges of the process address space).
  - hardware access to system memory must be cache coherent with the cpu.

For better memory management it is highly recommended that the device also
support the following features:
  - hardware mmu sets the access bit in its page table on memory access (like
the cpu does).
  - hardware page table can be updated from the cpu or through a fast path.
  - hardware provides advanced statistics on which ranges of memory it
accesses the most.
  - hardware differentiates atomic memory accesses from regular accesses,
allowing atomic operations to be supported even on platforms that do not
have atomic support on the bus linking the device with the cpu.

Implementation:

The hmm layer provides a simple API to the device driver. Each device driver
has to register an hmm device that holds pointers to all the callbacks the hmm
code will invoke to synchronize the device page table with the cpu page table
of a given process.

For each process it wants to mirror, the device driver must register a mirror
hmm structure that holds all the information specific to the process being
mirrored. Each hmm mirror uniquely links an hmm device with a process address
space (the mm struct).

This design allows several different device drivers to mirror the same process
concurrently. The hmm layer dispatches to each device driver the modifications
that are happening to the process address space.

The hmm layer relies on the mmu notifier api to monitor changes to the process
address space. Because updates to a device page table can have unbounded
completion time, the hmm layer needs the ability to sleep during mmu notifier
callbacks.

This patch only implements the core of the hmm layer and does not support
features such as migration to device memory.
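
To make that flow concrete, here is a rough sketch of how a driver might wire
this up. This is only an illustration: the registration functions
(hmm_device_register(), hmm_mirror_register()), the ops structure, its update
callback and the field names are assumptions made for this sketch, not
necessarily the prototypes defined by the patch.

/* Hypothetical driver-side sketch; names and prototypes are illustrative. */
#include <linux/hmm.h>

struct example_ctx {
        struct hmm_mirror mirror;       /* one mirror per (device, mm) pair */
};

/* Assumed callback: cpu page table changed, update the device page table. */
static int example_update(struct hmm_mirror *mirror,
                          unsigned long start, unsigned long end)
{
        /* Invalidate or update the device ptes covering [start, end). */
        return 0;
}

static const struct hmm_device_ops example_ops = {
        .update = example_update,       /* assumed callback name */
};

static struct hmm_device example_device = {
        .ops = &example_ops,            /* assumed field name */
};

static int example_mirror_current(struct example_ctx *ctx)
{
        int ret;

        /* One hmm device per driver... */
        ret = hmm_device_register(&example_device);
        if (ret)
                return ret;

        /* ...and one hmm mirror per process address space being mirrored. */
        return hmm_mirror_register(&ctx->mirror, &example_device);
}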

Changed since v1:
  - convert fence to refcounted object
  - change the api to provide the pte value directly, avoiding a useless
temporary special hmm pfn value
  - cleanups & fixes ...

Changed since v2:
  - fixed checkpatch.pl warnings & errors
  - converted to a staging feature

Changed since v3:
  - Use mmput notifier chain instead of 

[PATCH 3/5] lib: lockless generic and arch independent page table (gpt) v2.

2014-11-03 Thread j . glisse
From: Jérôme Glisse 

A page table is a common structure format, most notably used by the cpu mmu.
The arch-dependent page table code has strong ties to the architecture, which
makes it unsuitable for use by other, non arch specific code.

This patch implements a generic and arch-independent page table. It is generic
in the sense that the entry size can be u64 or unsigned long (or u32 too on
32 bit arch).

It is lockless in the sense that at any point in time you can have concurrent
threads updating the page table (removing or changing entries) and faulting in
the page table (adding new entries). This is achieved by requiring each updater
and each faulter to take a range lock. There is no exclusion on range locks,
ie several threads can fault or update the same range concurrently, and it is
the responsibility of the user to synchronize updates to the page table entries
(pte); updates to the page table directory (pdp) are the gpt's responsibility.

API usage pattern is:
  gpt_init()

  gpt_lock_update(lock_range)
  // User can update ptes, for instance by using atomic bit operations,
  // allowing completely lockless updates.
  gpt_unlock_update(lock_range)

  gpt_lock_fault(lock_range)
  // User can fault in ptes but is responsible for preventing threads from
  // concurrently faulting in the same pte and for properly accounting the
  // number of ptes faulted in the pdp structure.
  gpt_unlock_fault(lock_range)
  // The newly faulted ptes become visible to other updaters only once all
  // concurrent faulters on the address unlock.

Details on how the lockless concurrent updaters and faulters work are provided
in the header file.
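
For illustration, below is a rough sketch of what an updater and a faulter
could look like against this API. The structure layout (a gpt_lock carrying the
start and end of the range) and the function signatures are assumptions made
for the sketch; the authoritative prototypes are the ones in
include/linux/gpt.h.

/* Hypothetical usage sketch of the gpt API; signatures are illustrative only. */
#include <linux/gpt.h>

static void example_update_range(struct gpt *gpt,
                                 unsigned long start, unsigned long end)
{
        struct gpt_lock lock;

        /* Assumed: the lock structure describes the locked range. */
        lock.start = start;
        lock.end = end;

        gpt_lock_update(gpt, &lock);
        /*
         * Walk the locked range and update ptes here, for instance with
         * atomic bit operations, so the update stays lockless.
         */
        gpt_unlock_update(gpt, &lock);
}

static void example_fault_one(struct gpt *gpt, unsigned long addr)
{
        struct gpt_lock lock;

        lock.start = addr;
        lock.end = addr;

        gpt_lock_fault(gpt, &lock);
        /*
         * Fault in the pte for addr. The caller must prevent other threads
         * from faulting in the same pte concurrently and must account the
         * new pte in the pdp structure.
         */
        gpt_unlock_fault(gpt, &lock);
        /* The new pte becomes visible to updaters once all faulters unlock. */
}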

Changed since v1:
  - Switch to a macro implementation instead of using arithmetic to accommodate
  the various sizes for table entries (uint64_t, unsigned long, ...).
  This is somewhat less flexible but right now there is no use for the extra
  flexibility v1 was offering.

Signed-off-by: Jérôme Glisse 
---
 include/linux/gpt.h | 340 +++
 lib/Kconfig |   3 +
 lib/Makefile|   2 +
 lib/gpt.c   | 202 
 lib/gpt_generic.h   | 663 
 5 files changed, 1210 insertions(+)
 create mode 100644 include/linux/gpt.h
 create mode 100644 lib/gpt.c
 create mode 100644 lib/gpt_generic.h

diff --git a/include/linux/gpt.h b/include/linux/gpt.h
new file mode 100644
index 000..3c28634
--- /dev/null
+++ b/include/linux/gpt.h
@@ -0,0 +1,340 @@
+/*
+ * Copyright 2014 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Jérôme Glisse 
+ */
+/*
+ * High level overview
+ * ---
+ *
+ * This is a generic, arch-independent page table implementation with (almost)
+ * lockless access. The content of the page table, ie the page table entries,
+ * is not protected by the gpt helpers; it is up to the code using gpt to
+ * protect the page table entries from concurrent update, with no restriction
+ * on the mechanism (it can be atomic or it can sleep).
+ *
+ * The gpt code only deals with protecting the page directory tree structure,
+ * which is done in a lockless way. Concurrent threads can read and/or write
+ * overlapping ranges of the gpt. There can also be concurrent insertion and
+ * removal of page directories (insertion or removal of a page table level).
+ *
+ * While removal of a page directory is completely lockless, insertion of a new
+ * page directory still requires a lock (to avoid double insertion). If the
+ * architecture has a spinlock in its page struct then several threads can
+ * concurrently insert new directories (levels) as long as they are inserting
+ * into different page directories. Otherwise insertion serializes on a common
+ * spinlock. Note that insertion in this context only refers to inserting page
+ * directories; it does not cover page table entry insertion, which again is
+ * the responsibility of the gpt user to properly synchronize.
+ *
+ *
+ * Each gpt access must be done under gpt lock protection by calling gpt_lock()
+ * with a lock structure. Once a range is "locked" with gpt_lock(), all access
+ * can be done in a lockless fashion, using either the gpt_walk or gpt_iter
+ * helpers. Note however that only directories that are considered established
+ * will be visited, ie if a thread is concurrently inserting a new directory in
+ * the locked range then this directory will be ignored by gpt_walk or gpt_iter.
+ *
+ * This restriction comes from the lockless design. Some threads can hold a gpt
+ * lock for a long time but 

[PATCH 1/5] mmu_notifier: add event information to address invalidation v5

2014-11-03 Thread j . glisse
From: Jérôme Glisse 

The event information will be useful for new users of the mmu_notifier API.
The event argument differentiates between a vma disappearing, a page being
write protected, or simply a page being unmapped. This allows new users to
take a different path for each event: for instance, on unmap the resources
used to track a vma are still valid and should stay around, while if the
event says that a vma is being destroyed, any resources used to track this
vma can be freed.
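
As an illustration of why the event matters to a listener, a driver mirroring
the address space in a secondary TLB could key its teardown policy on it. The
enum values and the example_* helpers below are placeholders invented for this
sketch; the actual event names are the ones defined by the patch in
include/linux/mmu_notifier.h.

/* Sketch of a listener using the new event argument; names are placeholders. */
static void example_invalidate_page(struct mmu_notifier *mn,
                                    struct mm_struct *mm,
                                    unsigned long address,
                                    enum mmu_event event)
{
        switch (event) {
        case MMU_WRITE_PROTECT:         /* placeholder event name */
                /* Page stays mapped read only: downgrade the device mapping. */
                example_write_protect(mn, address);
                break;
        case MMU_MUNMAP:                /* placeholder event name */
                /* The vma is going away: per-vma tracking can be freed. */
                example_free_vma_tracking(mn, address);
                break;
        default:
                /* Plain unmap: drop the mapping but keep the vma tracking. */
                example_unmap(mn, address);
                break;
        }
}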

Changed since v1:
  - renamed action into event (updated commit message too).
  - simplified the event names and clarified their intended usage,
also documenting what expectations the listener can have with
respect to each event.

Changed since v2:
  - Avoid crazy name.
  - Do not move code that does not need to move.

Changed since v3:
  - Separate huge page split from mlock/munlock and softdirty.

Changed since v4:
  - Rebase (no other changes).

Signed-off-by: Jérôme Glisse 
---
 drivers/gpu/drm/i915/i915_gem_userptr.c |   3 +-
 drivers/iommu/amd_iommu_v2.c|  11 ++-
 drivers/misc/sgi-gru/grutlbpurge.c  |   9 ++-
 drivers/xen/gntdev.c|   9 ++-
 fs/proc/task_mmu.c  |   6 +-
 include/linux/mmu_notifier.h| 131 ++--
 kernel/events/uprobes.c |  10 ++-
 mm/filemap_xip.c|   2 +-
 mm/huge_memory.c|  39 ++
 mm/hugetlb.c|  23 +++---
 mm/ksm.c|  18 +++--
 mm/memory.c |  27 ---
 mm/migrate.c|   9 ++-
 mm/mmu_notifier.c   |  28 ---
 mm/mprotect.c   |   5 +-
 mm/mremap.c |   6 +-
 mm/rmap.c   |  24 --
 virt/kvm/kvm_main.c |  12 ++-
 18 files changed, 269 insertions(+), 103 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c 
b/drivers/gpu/drm/i915/i915_gem_userptr.c
index d182058..20dbd26 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -129,7 +129,8 @@ restart:
 static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier 
*_mn,
   struct mm_struct *mm,
   unsigned long start,
-  unsigned long end)
+  unsigned long end,
+  enum mmu_event event)
 {
struct i915_mmu_notifier *mn = container_of(_mn, struct 
i915_mmu_notifier, mn);
struct interval_tree_node *it = NULL;
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
index 90d734b..57d2acf 100644
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -413,14 +413,17 @@ static int mn_clear_flush_young(struct mmu_notifier *mn,
 
 static void mn_invalidate_page(struct mmu_notifier *mn,
   struct mm_struct *mm,
-  unsigned long address)
+  unsigned long address,
+  enum mmu_event event)
 {
__mn_flush_page(mn, address);
 }
 
 static void mn_invalidate_range_start(struct mmu_notifier *mn,
  struct mm_struct *mm,
- unsigned long start, unsigned long end)
+ unsigned long start,
+ unsigned long end,
+ enum mmu_event event)
 {
struct pasid_state *pasid_state;
struct device_state *dev_state;
@@ -441,7 +444,9 @@ static void mn_invalidate_range_start(struct mmu_notifier 
*mn,
 
 static void mn_invalidate_range_end(struct mmu_notifier *mn,
struct mm_struct *mm,
-   unsigned long start, unsigned long end)
+   unsigned long start,
+   unsigned long end,
+   enum mmu_event event)
 {
struct pasid_state *pasid_state;
struct device_state *dev_state;
diff --git a/drivers/misc/sgi-gru/grutlbpurge.c 
b/drivers/misc/sgi-gru/grutlbpurge.c
index 2129274..e67fed1 100644
--- a/drivers/misc/sgi-gru/grutlbpurge.c
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -221,7 +221,8 @@ void gru_flush_all_tlb(struct gru_state *gru)
  */
 static void gru_invalidate_range_start(struct mmu_notifier *mn,
   struct mm_struct *mm,
-  unsigned long start, unsigned long end)
+  unsigned long start, unsigned long end,
+   

[PATCH 2/5] mmu_notifier: keep track of active invalidation ranges

2014-11-03 Thread j . glisse
From: Jérôme Glisse 

The mmu_notifier_invalidate_range_start() and mmu_notifier_invalidate_range_end()
calls can be considered as forming an "atomic" section from the point of view
of cpu page table updates. Between these two functions the cpu page table
content is unreliable for the address range being invalidated.

Current users such as kvm need to know when they can trust the content of the
cpu page table. This becomes even more important for new users of the
mmu_notifier api (such as HMM or ODP).

This patch uses a structure, defined at every call site of
invalidate_range_start(), that is added to a list for the duration of the
invalidation. It adds two new helpers that allow querying whether a range is
being invalidated, or waiting for a range to become valid.

For proper synchronization, users must block new range invalidations from
inside their invalidate_range_start() callback before calling the helper
functions. Otherwise there is no guarantee that a new range invalidation will
not be added after the call to the helper function that queries for existing
ranges.
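
For example, a driver servicing a device page fault could use the two helpers
roughly as below before filling its own page table. The helper names and
signatures here are assumptions made for the sketch (the real ones are those
added by the patch), as is the surrounding example_* code.

/* Sketch of a device fault path waiting out concurrent invalidations.
 * Helper names and signatures are assumptions for illustration.
 */
struct example_mirror {
        struct mm_struct *mm;
};

static int example_device_fault(struct example_mirror *mirror,
                                unsigned long start, unsigned long end)
{
        /*
         * New range invalidations must already be blocked from our own
         * invalidate_range_start() callback, otherwise a new range could
         * slip in right after the check below.
         */
        if (!mmu_notifier_range_is_valid(mirror->mm, start, end))
                mmu_notifier_range_wait_valid(mirror->mm, start, end);

        /*
         * The cpu page table for [start, end) can now be trusted and
         * copied into the device page table.
         */
        return example_fill_device_pagetable(mirror, start, end);
}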

Signed-off-by: Jérôme Glisse 
---
 drivers/gpu/drm/i915/i915_gem_userptr.c | 13 +++---
 drivers/iommu/amd_iommu_v2.c|  8 +---
 drivers/misc/sgi-gru/grutlbpurge.c  | 15 +++
 drivers/xen/gntdev.c| 15 ---
 fs/proc/task_mmu.c  | 12 +++--
 include/linux/mmu_notifier.h| 60 ++---
 kernel/events/uprobes.c | 13 +++---
 mm/huge_memory.c| 78 ++---
 mm/hugetlb.c| 55 +++
 mm/ksm.c| 28 +---
 mm/memory.c | 78 +++--
 mm/migrate.c| 36 +++
 mm/mmu_notifier.c   | 76 +++-
 mm/mprotect.c   | 17 ---
 mm/mremap.c | 14 +++---
 mm/rmap.c   | 15 +++
 virt/kvm/kvm_main.c | 10 ++---
 17 files changed, 298 insertions(+), 245 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c 
b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 20dbd26..10b0044 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -128,26 +128,25 @@ restart:
 
 static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier 
*_mn,
   struct mm_struct *mm,
-  unsigned long start,
-  unsigned long end,
-  enum mmu_event event)
+  const struct 
mmu_notifier_range *range)
 {
struct i915_mmu_notifier *mn = container_of(_mn, struct 
i915_mmu_notifier, mn);
struct interval_tree_node *it = NULL;
-   unsigned long next = start;
+   unsigned long next = range->start;
unsigned long serial = 0;
+   /* interval ranges are inclusive, but invalidate range is exclusive */
+   unsigned long end = range->end - 1;
 
-   end--; /* interval ranges are inclusive, but invalidate range is exclusive */
while (next < end) {
struct drm_i915_gem_object *obj = NULL;

spin_lock(&mn->lock);
if (mn->has_linear)
-   it = invalidate_range__linear(mn, mm, start, end);
+   it = invalidate_range__linear(mn, mm, range->start, end);
else if (serial == mn->serial)
it = interval_tree_iter_next(it, next, end);
else
-   it = interval_tree_iter_first(&mn->objects, start, end);
+   it = interval_tree_iter_first(&mn->objects, range->start, end);
if (it != NULL) {
obj = container_of(it, struct i915_mmu_object, it)->obj;
drm_gem_object_reference(&obj->base);
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
index 57d2acf..9b7f32d 100644
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -421,9 +421,7 @@ static void mn_invalidate_page(struct mmu_notifier *mn,
 
 static void mn_invalidate_range_start(struct mmu_notifier *mn,
  struct mm_struct *mm,
- unsigned long start,
- unsigned long end,
- enum mmu_event event)
+ const struct mmu_notifier_range *range)
 {
struct pasid_state *pasid_state;
struct device_state *dev_state;
@@ -444,9 +442,7 @@ static void mn_invalidate_range_start(struct mmu_notifier 
*mn,
 
 static void 

[PATCH 5/5] hmm/dummy: dummy driver to showcase the hmm api v3

2014-11-03 Thread j . glisse
From: Jérôme Glisse 

This is a dummy driver which fulfills two purposes :
  - showcase the hmm api and give a reference on how to use it.
  - provide an extensive user space api to stress test hmm.

This is a particularly dangerous module as it allows access to a
mirror of a process address space through its device file. Hence
it should not be enabled by default and only people actively
developing for hmm should use it.
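
As a purely hypothetical illustration of what a user space stress test could
look like, assuming the device node appears as /dev/hmm_dummy_device0 and that
a read through the file hits the mirrored address space at the same offset
(both of these are assumptions for the sketch, not details taken from the
patch):

/* Hypothetical user space sketch; device node name and read semantics are
 * assumptions, see above.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        char local = 42, mirrored = 0;
        int fd = open("/dev/hmm_dummy_device0", O_RDWR);

        if (fd < 0) {
                perror("open");
                return EXIT_FAILURE;
        }

        /* Read our own byte back through the mirror of this address space. */
        if (pread(fd, &mirrored, 1, (off_t)(uintptr_t)&local) != 1) {
                perror("pread");
                close(fd);
                return EXIT_FAILURE;
        }

        printf("local=%d mirrored=%d\n", local, mirrored);
        close(fd);
        return EXIT_SUCCESS;
}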

Changed since v1:
  - Fixed all checkpatch.pl issues (ignoring some over-80-character lines).

Changed since v2:
  - Rebased and adapted to latest changes.

Signed-off-by: Jérôme Glisse 
---
 drivers/char/Kconfig   |9 +
 drivers/char/Makefile  |1 +
 drivers/char/hmm_dummy.c   | 1151 
 include/uapi/linux/hmm_dummy.h |   30 ++
 4 files changed, 1191 insertions(+)
 create mode 100644 drivers/char/hmm_dummy.c
 create mode 100644 include/uapi/linux/hmm_dummy.h

diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index efefd12..7574e92 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -600,6 +600,15 @@ config TILE_SROM
  device appear much like a simple EEPROM, and knows
  how to partition a single ROM for multiple purposes.
 
+config HMM_DUMMY
+   tristate "hmm dummy driver to test hmm."
+   depends on HMM
+   default n
+   help
+ Say Y here if you want to build the hmm dummy driver that allows you
+ to test the hmm infrastructure by mapping a process address space
+ through the hmm dummy driver device file. When in doubt, say "N".
+
 source "drivers/char/xillybus/Kconfig"
 
 endmenu
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index d06cde26..eff0543 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -62,3 +62,4 @@ js-rtc-y = rtc.o
 
 obj-$(CONFIG_TILE_SROM)+= tile-srom.o
 obj-$(CONFIG_XILLYBUS) += xillybus/
+obj-$(CONFIG_HMM_DUMMY)+= hmm_dummy.o
diff --git a/drivers/char/hmm_dummy.c b/drivers/char/hmm_dummy.c
new file mode 100644
index 000..89a9112
--- /dev/null
+++ b/drivers/char/hmm_dummy.c
@@ -0,0 +1,1151 @@
+/*
+ * Copyright 2013 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Jérôme Glisse 
+ */
+/* This is a dummy driver made to exercise the HMM (heterogeneous memory
+ * management) API of the kernel. It allows a userspace program to map its
+ * whole address space through the hmm dummy driver file.
+ *
+ * In here, mirror addresses are addresses in the process address space that
+ * is being mirrored, while virtual addresses are the addresses in the current
+ * process that has the hmm dummy dev file mapped (addresses of the file
+ * mapping).
+ *
+ * You must be careful not to mix one with the other.
+ */
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/delay.h>
+#include <linux/hmm.h>
+
+#include <uapi/linux/hmm_dummy.h>
+
+#define HMM_DUMMY_DEVICE_NAME  "hmm_dummy_device"
+#define HMM_DUMMY_MAX_DEVICES  4
+
+struct hmm_dummy_device;
+
+struct hmm_dummy_mirror {
+   struct kref kref;
+   struct file *filp;
+   struct hmm_dummy_device *ddevice;
+   struct hmm_mirror   mirror;
+   unsignedminor;
+   pid_t   pid;
+   struct mm_struct*mm;
+   unsigned long   *pgdp;
+   struct mutexmutex;
+   boolstop;
+};
+
+struct hmm_dummy_device {
+   struct cdev cdev;
+   struct hmm_device   device;
+   dev_t   dev;
+   int major;
+   struct mutexmutex;
+   charname[32];
+   /* device file mapping tracking (keep track of all vma) */
+   struct hmm_dummy_mirror *dmirrors[HMM_DUMMY_MAX_DEVICES];
+   struct address_space*fmapping[HMM_DUMMY_MAX_DEVICES];
+};
+
+/* We only create 2 devices to show the inter-device rmem sharing/migration
+ * capabilities.
+ */
+static struct hmm_dummy_device ddevices[2];
+
+
+/* hmm_dummy_pt - dummy page table, the dummy device fakes its own page table.
+ *
+ * Helper functions to manage the dummy device page table.
+ */
+#define HMM_DUMMY_PTE_VALID(1UL << 0UL)
+#define HMM_DUMMY_PTE_READ (1UL << 1UL)
+#define HMM_DUMMY_PTE_WRITE(1UL << 2UL)
+#define HMM_DUMMY_PTE_DIRTY(1UL << 3UL)
+#define HMM_DUMMY_PFN_SHIFT(PAGE_SHIFT)
+

HMM (heterogeneous memory management) v5

2014-11-03 Thread j . glisse
Andrew, I have received no feedback since the last time I sent this patchset,
so I would really like to have it merged for the next kernel. While right now
there is no kernel driver that leverages this code, the hardware is coming and
we still have a long way to go before we have all the features needed. Right
now any further work is blocked on the merge of this core code.

(Note that patch 5, the dummy driver, is included as a reference and should not
be merged unless you want me to grow it into some testing infrastructure. I
only include it here so people can have a look at how HMM is supposed to be
used.)


What it is ?

In a nutshell HMM is a subsystem that provides an easy to use api to mirror a
process address space on a device, with minimal hardware requirements (mainly
device page faults and read only page mapping). It does not rely on the ATS
and PASID PCIE extensions. It intends to supersede those extensions by allowing
system memory to be moved to device memory in a fashion that is transparent to
core kernel mm code (ie a cpu page fault on a page residing in device memory
will trigger migration back to system memory).


Why doing this ?

We want to be able to mirror a process address space so that compute apis such
as OpenCL or other similar apis can start using the exact same address space on
the GPU as on the CPU. This will greatly simplify usage of those apis. Moreover
we believe that we will see more and more specialized functional units that
will want to mirror a process address space using their own mmu.

The migration side exists simply because GPU memory bandwidth is far beyond
system memory bandwidth, and there is no sign that this gap is closing (quite
the opposite).


Current status and future features :

None of this core code changes core kernel mm code in any major way. This
is simple ground work with no impact on existing code paths. Features that
will be implemented on top of this are :
  1 - Transparently handle page mapping on behalf of device drivers (DMA).
  2 - Improve the DMA api to better match the new usage pattern of HMM.
  3 - Migration of anonymous memory to device memory.
  4 - Locking memory to remote memory (CPU access triggers SIGBUS).
  5 - Access exclusion between CPU and device for atomic operations.
  6 - Migration of file backed memory to device memory.


How future features will be implemented :
1 - Simply use the existing DMA api to map pages on behalf of a device.
2 - Introduce a new DMA api to match the new semantics of HMM. It is no longer
pages we map but address ranges, and managing which page is effectively
backing an address should be easy to update. I gave a presentation about
that during this LPC.
3 - Requires changes to the cpu page fault code path to handle migration back
to system memory on cpu access. An implementation of this was already sent
as part of v1. This will be low impact and only adds a new special swap
type handling to the existing fault code.
4 - Requires a new syscall as I can not see which current syscall would be
appropriate for this. My first feeling was to use mbind as it has the
right semantics (binding a range of addresses to a device), but mbind is
too numa centric.

The second candidate was madvise, but the semantics do not match: madvise
allows the kernel to ignore the hints, while we do want to block cpu
access for as long as the range is bound to a device.

So I do not think any existing syscall can be extended with new flags,
but maybe I am wrong.
5 - Allow mapping a page as read only on the CPU while a device performs
some atomic operation on it (this is mainly to work around system buses
that do not support atomic memory access, and sadly there is a large
base of hardware without that feature).

The easiest implementation would be using some page flag but there is none
left. So it must be some flag in the vma to know if there is a need to
query HMM for write protection.

6 - This is the trickiest one to implement and while I showed a proof of
concept with v1, I still have a lot of conflicting feelings about how
to achieve this.


As usual comments are more than welcome. Thanks in advance to anyone who
takes a look at this code.

Previous patchset posting :
  v1 http://lwn.net/Articles/597289/
  v2 https://lkml.org/lkml/2014/6/12/559 (cover letter did not make it to ml)
  v3 https://lkml.org/lkml/2014/6/13/633
  v4 https://lkml.org/lkml/2014/8/29/423

Cheers,
Jérôme

To: "Andrew Morton" ,
Cc: ,
Cc: linux-mm ,
Cc: ,
Cc: "Linus Torvalds" ,
Cc: "Mel Gorman" ,
Cc: "H. Peter Anvin" ,
Cc: "Peter Zijlstra" ,
Cc: "Linda Wang" ,
Cc: "Kevin E Martin" ,
Cc: "Jerome Glisse" ,
Cc: "Andrea Arcangeli" ,
Cc: "Johannes Weiner" ,
Cc: "Larry Woodman" ,
Cc: "Rik van Riel" ,
Cc: "Dave Airlie" ,
Cc: "Jeff Law" ,
Cc: "Brendan Conoboy" ,
Cc: "Joe Donohue" ,
Cc: "Duncan Poole" ,
Cc: "Sherry Cheung" ,
Cc: "Subhash Gutti" ,
Cc: "John Hubbard" ,
Cc: "Mark Hairgrove" ,
Cc: "Lucien Dunning" ,
Cc: "Cameron Buschardt" ,
Cc: "Arvind Gopalakrishnan" ,
Cc: "Haggai Eran" ,
Cc: "Or Gerlitz" ,
Cc: 
