From: Philip Yang <philip.y...@amd.com> With xnack on, add a validate timestamp to each SVM range in order to handle GPU VM faults from multiple GPUs.
If a GPU retry fault needs to migrate the range to the best restore location, use the range's validate timestamp to record the system timestamp after the range is restored and the GPU page table is updated. Because multiple pages of the same range can each raise a retry fault, define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING as a time period long enough to cover pending retry faults that may still arrive after the page table update, and use it to skip duplicate retry faults on the same range. If the difference between the current system timestamp and the range's last validate timestamp is bigger than AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING, the retry fault is from another GPU, so continue with retry fault recovery. Signed-off-by: Philip Yang <philip.y...@amd.com> Signed-off-by: Felix Kuehling <felix.kuehl...@amd.com> --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 27 +++++++++++++++++++++++---- drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 2 ++ 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 8b57f5a471bd..65f20a72ddcb 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -34,6 +34,11 @@ #define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1 +/* Long enough to ensure no retry fault comes after svm range is restored and + * page table is updated. 
+ */ +#define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING 2000 + static void svm_range_evict_svm_bo_worker(struct work_struct *work); /** * svm_range_unlink - unlink svm_range from lists and interval tree @@ -122,6 +127,7 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, INIT_LIST_HEAD(&prange->remove_list); INIT_LIST_HEAD(&prange->svm_bo_list); atomic_set(&prange->invalid, 0); + prange->validate_timestamp = ktime_to_us(ktime_get()); mutex_init(&prange->mutex); spin_lock_init(&prange->svm_bo_lock); svm_range_set_default_attributes(&prange->preferred_loc, @@ -482,20 +488,28 @@ static int svm_range_validate_vram(struct svm_range *prange) static int svm_range_validate(struct mm_struct *mm, struct svm_range *prange) { + struct kfd_process *p; int r; pr_debug("svms 0x%p [0x%lx 0x%lx] actual loc 0x%x\n", prange->svms, prange->it_node.start, prange->it_node.last, prange->actual_loc); + p = container_of(prange->svms, struct kfd_process, svms); + if (!prange->actual_loc) r = svm_range_validate_ram(mm, prange); else r = svm_range_validate_vram(prange); - pr_debug("svms 0x%p [0x%lx 0x%lx] ret %d invalid %d\n", prange->svms, - prange->it_node.start, prange->it_node.last, - r, atomic_read(&prange->invalid)); + if (!r) { + if (p->xnack_enabled) + atomic_set(&prange->invalid, 0); + prange->validate_timestamp = ktime_to_us(ktime_get()); + } + + pr_debug("svms 0x%p [0x%lx 0x%lx] ret %d\n", prange->svms, + prange->it_node.start, prange->it_node.last, r); return r; } @@ -1766,6 +1780,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, struct amdgpu_vm *vm, struct svm_range_list *svms; struct svm_range *prange; struct kfd_process *p; + uint64_t timestamp; int32_t best_loc; int srcu_idx; int r = 0; @@ -1790,7 +1805,11 @@ svm_range_restore_pages(struct amdgpu_device *adev, struct amdgpu_vm *vm, goto out_srcu_unlock; } - if (!atomic_read(&prange->invalid)) { + mutex_lock(&prange->mutex); + timestamp = ktime_to_us(ktime_get()) - prange->validate_timestamp; + 
mutex_unlock(&prange->mutex); + /* skip duplicate vm fault on different pages of same range */ + if (timestamp < AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) { pr_debug("svms 0x%p [0x%lx %lx] already restored\n", svms, prange->it_node.start, prange->it_node.last); goto out_srcu_unlock; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h index 0685eb04b87c..466ec5537bbb 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h @@ -66,6 +66,7 @@ struct svm_range_bo { * @actual_loc: the actual location, 0 for CPU, or GPU id * @granularity:migration granularity, log2 num pages * @invalid: not 0 means cpu page table is invalidated + * @validate_timestamp: system timestamp when range is validated * @bitmap_access: index bitmap of GPUs which can access the range * @bitmap_aip: index bitmap of GPUs which can access the range in place * @@ -95,6 +96,7 @@ struct svm_range { uint32_t actual_loc; uint8_t granularity; atomic_t invalid; + uint64_t validate_timestamp; DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE); DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE); }; -- 2.29.2 _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx