[PATCH 19/27] drm/amdkfd: Fix a circular lock dependency

2019-04-28 Thread Kuehling, Felix
Fix a circular lock dependency exposed under userptr memory pressure.
The DQM lock is the only one taken inside the MMU notifier. We need
to make sure that no memory reclaim is done while holding this lock,
and that no other locks are taken under it that others may hold while
reclaiming.
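
A minimal sketch of the rule these hunks enforce (illustrative only;
register_example() is a made-up name, the real changes are in the diff
below):

    /* The MMU notifier runs in the reclaim path and takes the DQM lock,
     * so a lockdep cycle appears as soon as something that can trigger
     * or depend on reclaim runs while the DQM lock is held:
     *
     *   reclaim -> MMU notifier -> dqm_lock()
     *   dqm_lock() -> kfd_inc_compute_active() -> locks held by reclaimers
     *
     * The fix: only touch DQM state under the lock, and move the
     * reclaim-sensitive call after dqm_unlock().
     */
    static int register_example(struct device_queue_manager *dqm)
    {
            dqm_lock(dqm);
            dqm->processes_count++;   /* state protected by the DQM lock */
            dqm_unlock(dqm);

            /* Calls into amdgpu and may take locks that are also held
             * during reclaim, so it must not run under the DQM lock.
             */
            kfd_inc_compute_active(dqm->dev);
            return 0;
    }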

Signed-off-by: Felix Kuehling 
Reviewed-by: Philip Yang 
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 33 ---
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 1562590d837e..0bfdb141b6e7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -794,10 +794,14 @@ static int register_process(struct device_queue_manager *dqm,
retval = dqm->asic_ops.update_qpd(dqm, qpd);
 
dqm->processes_count++;
-   kfd_inc_compute_active(dqm->dev);
 
dqm_unlock(dqm);
 
+   /* Outside the DQM lock because under the DQM lock we can't do
+* reclaim or take other locks that others hold while reclaiming.
+*/
+   kfd_inc_compute_active(dqm->dev);
+
return retval;
 }
 
@@ -818,7 +822,6 @@ static int unregister_process(struct device_queue_manager *dqm,
list_del(&cur->list);
kfree(cur);
dqm->processes_count--;
-   kfd_dec_compute_active(dqm->dev);
goto out;
}
}
@@ -826,6 +829,13 @@ static int unregister_process(struct device_queue_manager *dqm,
retval = 1;
 out:
dqm_unlock(dqm);
+
+   /* Outside the DQM lock because under the DQM lock we can't do
+* reclaim or take other locks that others hold while reclaiming.
+*/
+   if (!retval)
+   kfd_dec_compute_active(dqm->dev);
+
return retval;
 }
 
@@ -1519,6 +1529,7 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm,
struct queue *q, *next;
struct device_process_node *cur, *next_dpn;
int retval = 0;
+   bool found = false;
 
dqm_lock(dqm);
 
@@ -1537,12 +1548,19 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm,
list_del(&cur->list);
kfree(cur);
dqm->processes_count--;
-   kfd_dec_compute_active(dqm->dev);
+   found = true;
break;
}
}
 
dqm_unlock(dqm);
+
+   /* Outside the DQM lock because under the DQM lock we can't do
+* reclaim or take other locks that others hold while reclaiming.
+*/
+   if (found)
+   kfd_dec_compute_active(dqm->dev);
+
return retval;
 }
 
@@ -1588,6 +1606,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
struct device_process_node *cur, *next_dpn;
enum kfd_unmap_queues_filter filter =
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES;
+   bool found = false;
 
retval = 0;
 
@@ -1624,7 +1643,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
list_del(&cur->list);
kfree(cur);
dqm->processes_count--;
-   kfd_dec_compute_active(dqm->dev);
+   found = true;
break;
}
}
@@ -1638,6 +1657,12 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 
dqm_unlock(dqm);
 
+   /* Outside the DQM lock because under the DQM lock we can't do
+* reclaim or take other locks that others hold while reclaiming.
+*/
+   if (found)
+   kfd_dec_compute_active(dqm->dev);
+
/* Lastly, free mqd resources.
 * Do uninit_mqd() after dqm_unlock to avoid circular locking.
 */
-- 
2.17.1


[PATCH 13/27] drm/amdkfd: Move sdma_queue_id calculation into allocate_sdma_queue()

2019-04-28 Thread Kuehling, Felix
From: Yong Zhao 

This avoids duplicated code.

Signed-off-by: Yong Zhao 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 29 +++
 1 file changed, 11 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index e2de246d681b..38c66b8ffd31 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -883,7 +883,7 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
 }
 
 static int allocate_sdma_queue(struct device_queue_manager *dqm,
-   unsigned int *sdma_id)
+   struct queue *q)
 {
int bit;
 
@@ -892,7 +892,14 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
 
bit = __ffs64(dqm->sdma_bitmap);
dqm->sdma_bitmap &= ~(1ULL << bit);
-   *sdma_id = bit;
+   q->sdma_id = bit;
+
+   q->properties.sdma_engine_id = q->sdma_id % get_num_sdma_engines(dqm);
+   q->properties.sdma_queue_id = q->sdma_id / get_num_sdma_engines(dqm);
+
+   pr_debug("SDMA id is:%d\n", q->sdma_id);
+   pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
+   pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id);
 
return 0;
 }
@@ -914,21 +921,14 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
 
mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA];
 
-   retval = allocate_sdma_queue(dqm, &q->sdma_id);
+   retval = allocate_sdma_queue(dqm, q);
if (retval)
return retval;
 
-   q->properties.sdma_queue_id = q->sdma_id / get_num_sdma_engines(dqm);
-   q->properties.sdma_engine_id = q->sdma_id % get_num_sdma_engines(dqm);
-
retval = allocate_doorbell(qpd, q);
if (retval)
goto out_deallocate_sdma_queue;
 
-   pr_debug("SDMA id is:%d\n", q->sdma_id);
-   pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id);
-   pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
-
dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
&q->gart_mqd_addr, &q->properties);
@@ -1129,16 +1129,9 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
}
 
if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
-   retval = allocate_sdma_queue(dqm, &q->sdma_id);
+   retval = allocate_sdma_queue(dqm, q);
if (retval)
goto out;
-   q->properties.sdma_queue_id =
-   q->sdma_id / get_num_sdma_engines(dqm);
-   q->properties.sdma_engine_id =
-   q->sdma_id % get_num_sdma_engines(dqm);
-   pr_debug("SDMA id is:%d\n", q->sdma_id);
-   pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id);
-   pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
}
 
retval = allocate_doorbell(qpd, q);
-- 
2.17.1


[PATCH 23/27] drm/amdkfd: Preserve ttmp[4:5] instead of ttmp[14:15]

2019-04-28 Thread Kuehling, Felix
From: Jay Cornwall 

ttmp[4:5] is initialized by the SPI with SPI_GDBG_TRAP_DATA* values.
These values are more useful to the debugger than ttmp[14:15], which
carries dispatch_scratch_base*. There are too few registers to
preserve both.

Signed-off-by: Jay Cornwall 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h| 466 +-
 .../drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm |  45 +-
 2 files changed, 253 insertions(+), 258 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h 
b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index eed845b4e9a7..e413d4a71fa3 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -274,12 +274,12 @@ static const uint32_t cwsr_trap_gfx8_hex[] = {
 
 
 static const uint32_t cwsr_trap_gfx9_hex[] = {
-   0xbf820001, 0xbf820161,
+   0xbf820001, 0xbf82015e,
0xb8f8f802, 0x89788678,
-   0xb8f1f803, 0x866eff71,
+   0xb8fbf803, 0x866eff7b,
0x0400, 0xbf85003b,
-   0x866eff71, 0x0800,
-   0xbf850003, 0x866eff71,
+   0x866eff7b, 0x0800,
+   0xbf850003, 0x866eff7b,
0x0100, 0xbf84000c,
0x866eff78, 0x2000,
0xbf840005, 0xbf8e0010,
@@ -292,13 +292,13 @@ static const uint32_t cwsr_trap_gfx9_hex[] = {
0x8977ff77, 0xfc00,
0x87776f77, 0x896eff6e,
0x001f8000, 0xb96ef807,
-   0xb8f0f812, 0xb8f1f813,
-   0x8ef08870, 0xc0071bb8,
+   0xb8faf812, 0xb8fbf813,
+   0x8efa887a, 0xc0071bbd,
0x, 0xbf8cc07f,
-   0xc0071c38, 0x0008,
+   0xc0071ebd, 0x0008,
0xbf8cc07f, 0x86ee6e6e,
0xbf840001, 0xbe801d6e,
-   0xb8f1f803, 0x8671ff71,
+   0xb8fbf803, 0x867bff7b,
0x01ff, 0xbf850002,
0x806c846c, 0x826d806d,
0x866dff6d, 0x,
@@ -308,258 +308,256 @@ static const uint32_t cwsr_trap_gfx9_hex[] = {
0x8f6e8378, 0xb96ee0c2,
0xbf82, 0xb9780002,
0xbe801f6c, 0x866dff6d,
-   0x, 0xbef00080,
-   0xb9700283, 0xb8f02407,
-   0x8e709b70, 0x876d706d,
-   0xb8f003c7, 0x8e709a70,
-   0x876d706d, 0xb8f0f807,
-   0x8670ff70, 0x7fff,
-   0xb970f807, 0xbeee007e,
+   0x, 0xbefa0080,
+   0xb97a0283, 0xb8fa2407,
+   0x8e7a9b7a, 0x876d7a6d,
+   0xb8fa03c7, 0x8e7a9a7a,
+   0x876d7a6d, 0xb8faf807,
+   0x867aff7a, 0x7fff,
+   0xb97af807, 0xbeee007e,
0xbeef007f, 0xbefe0180,
-   0xbf94, 0x87708478,
-   0xb970f802, 0xbf8e0002,
-   0xbf88fffe, 0xb8f02a05,
+   0xbf94, 0x877a8478,
+   0xb97af802, 0xbf8e0002,
+   0xbf88fffe, 0xb8fa2a05,
+   0x807a817a, 0x8e7a8a7a,
+   0xb8fb1605, 0x807b817b,
+   0x8e7b867b, 0x807a7b7a,
+   0x807a7e7a, 0x827b807f,
+   0x867bff7b, 0x,
+   0xc04b1c3d, 0x0050,
+   0xbf8cc07f, 0xc04b1d3d,
+   0x0060, 0xbf8cc07f,
+   0xc0431e7d, 0x0074,
+   0xbf8cc07f, 0xbef4007e,
+   0x8675ff7f, 0x,
+   0x8775ff75, 0x0004,
+   0xbef60080, 0xbef700ff,
+   0x00807fac, 0x867aff7f,
+   0x0800, 0x8f7a837a,
+   0x8a77, 0x867aff7f,
+   0x7000, 0x8f7a817a,
+   0x8a77, 0xbef1007c,
+   0xbef00080, 0xb8f02a05,
0x80708170, 0x8e708a70,
-   0xb8f11605, 0x80718171,
-   0x8e718671, 0x80707170,
-   0x80707e70, 0x8271807f,
-   0x8671ff71, 0x,
-   0xc0471cb8, 0x0040,
-   0xbf8cc07f, 0xc04b1d38,
-   0x0048, 0xbf8cc07f,
-   0xc0431e78, 0x0058,
-   0xbf8cc07f, 0xc0471eb8,
-   0x005c, 0xbf8cc07f,
-   0xbef4007e, 0x8675ff7f,
-   0x, 0x8775ff75,
-   0x0004, 0xbef60080,
-   0xbef700ff, 0x00807fac,
-   0x8670ff7f, 0x0800,
-   0x8f708370, 0x8077,
-   0x8670ff7f, 0x7000,
-   0x8f708170, 0x8077,
-   0xbefb007c, 0xbefa0080,
-   0xb8fa2a05, 0x807a817a,
-   0x8e7a8a7a, 0xb8f01605,
-   0x80708170, 0x8e708670,
-   0x807a707a, 0xbef60084,
-   0xbef600ff, 0x0100,
-   0xbefe007c, 0xbefc007a,
-   0xc0611efa, 0x007c,
-   0xbf8cc07f, 0x807a847a,
-   0xbefc007e, 0xbefe007c,
-   0xbefc007a, 0xc0611b3a,
+   0xb8fa1605, 0x807a817a,
+   0x8e7a867a, 0x80707a70,
+   0xbef60084, 0xbef600ff,
+   0x0100, 0xbefe007c,
+   0xbefc0070, 0xc0611c7a,
0x007c, 0xbf8cc07f,
-   0x807a847a, 0xbefc007e,
-   0xbefe007c, 0xbefc007a,
-   0xc0611b7a, 0x007c,
-   0xbf8cc07f, 0x807a847a,
+   0x80708470, 0xbefc007e,
+   0xbefe007c, 0xbefc0070,
+   0xc0611b3a, 0x007c,
+   0xbf8cc07f, 0x80708470,
0xbefc007e, 0xbefe007c,
-   0xbefc007a, 0xc0611bba,
+   0xbefc0070, 0xc0611b7a,
0x007c, 0xbf8cc07f,
-   0x807a847a, 0xbefc007e,
-   0xbefe007c, 0xbefc007a,
-   0xc0611bfa, 0x007c,

[PATCH 26/27] drm/amdgpu: Use heavy weight for tlb invalidation on xgmi configuration

2019-04-28 Thread Kuehling, Felix
From: shaoyunl 

There is a bug in the vml2 xgmi logic: mtype is always sent as NC on
the VMC-to-TC interface for a page walk, regardless of whether the
request goes to the local or a remote GPU. NC means non-coherent and
causes the VMC return data to be cached in the TCC (versus UC,
uncached, which would not cache the data). Since the page table
updates are done by SDMA/HDP, the TCC is never updated, so the GC
VML2 keeps hitting stale TCC entries, never sees the updated page
tables, and faults.
A heavy-weight TLB invalidation does a WB/INVAL of the L1/L2 GL data
caches, so the TCC will not be hit on the next request.
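
A minimal sketch of the resulting flush-type selection (illustrative;
kfd_pick_flush_type() is a made-up helper, the real logic lives in
invalidate_tlbs() in the diff below):

    static uint32_t kfd_pick_flush_type(struct amdgpu_device *adev)
    {
            /* Heavy-weight flush (WB/INVAL of the GL data caches) only
             * on XGMI-connected Vega20; everything else keeps the
             * legacy flush.
             */
            if (adev->gmc.xgmi.num_physical_nodes &&
                adev->asic_type == CHIP_VEGA20)
                    return 2;       /* heavy-weight */
            return 0;               /* legacy */
    }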

Signed-off-by: shaoyunl 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 53 +--
 1 file changed, 26 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index ef3d93b995b2..7ec97e903a1b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -726,29 +726,8 @@ static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd,
return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK;
 }
 
-static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid)
-{
-   struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
-
-   /* Use legacy mode tlb invalidation.
-*
-* Currently on Raven the code below is broken for anything but
-* legacy mode due to a MMHUB power gating problem. A workaround
-* is for MMHUB to wait until the condition PER_VMID_INVALIDATE_REQ
-* == PER_VMID_INVALIDATE_ACK instead of simply waiting for the ack
-* bit.
-*
-* TODO 1: agree on the right set of invalidation registers for
-* KFD use. Use the last one for now. Invalidate both GC and
-* MMHUB.
-*
-* TODO 2: support range-based invalidation, requires kfg2kgd
-* interface change
-*/
-   amdgpu_gmc_flush_gpu_tlb(adev, vmid, 0);
-}
-
-static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid)
+static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid,
+   uint32_t flush_type)
 {
signed long r;
uint32_t seq;
@@ -761,7 +740,7 @@ static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid)
PACKET3_INVALIDATE_TLBS_DST_SEL(1) |
PACKET3_INVALIDATE_TLBS_ALL_HUB(1) |
PACKET3_INVALIDATE_TLBS_PASID(pasid) |
-   PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(0)); /* legacy */
+   PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(flush_type));
amdgpu_fence_emit_polling(ring, &seq);
amdgpu_ring_commit(ring);
spin_unlock(&adev->gfx.kiq.ring_lock);
@@ -780,12 +759,16 @@ static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid)
struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
int vmid;
struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
+   uint32_t flush_type = 0;
 
if (adev->in_gpu_reset)
return -EIO;
+   if (adev->gmc.xgmi.num_physical_nodes &&
+   adev->asic_type == CHIP_VEGA20)
+   flush_type = 2;
 
if (ring->sched.ready)
-   return invalidate_tlbs_with_kiq(adev, pasid);
+   return invalidate_tlbs_with_kiq(adev, pasid, flush_type);
 
for (vmid = 0; vmid < 16; vmid++) {
if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid))
@@ -793,7 +776,8 @@ static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid)
if (get_atc_vmid_pasid_mapping_valid(kgd, vmid)) {
if (get_atc_vmid_pasid_mapping_pasid(kgd, vmid)
== pasid) {
-   write_vmid_invalidate_request(kgd, vmid);
+   amdgpu_gmc_flush_gpu_tlb(adev, vmid,
+flush_type);
break;
}
}
@@ -811,7 +795,22 @@ static int invalidate_tlbs_vmid(struct kgd_dev *kgd, uint16_t vmid)
return 0;
}
 
-   write_vmid_invalidate_request(kgd, vmid);
+   /* Use legacy mode tlb invalidation.
+*
+* Currently on Raven the code below is broken for anything but
+* legacy mode due to a MMHUB power gating problem. A workaround
+* is for MMHUB to wait until the condition PER_VMID_INVALIDATE_REQ
+* == PER_VMID_INVALIDATE_ACK instead of simply waiting for the ack
+* bit.
+*
+* TODO 1: agree on the right set of invalidation registers for
+* KFD use. Use the last one for now. Invalidate both GC and
+* MMHUB.
+*
+* TODO 2: support 

[PATCH 16/27] drm/amdkfd: Introduce XGMI SDMA queue type

2019-04-28 Thread Kuehling, Felix
From: Oak Zeng 

Existing QUEUE_TYPE_SDMA means PCIe-optimized SDMA queues.
Introduce a new QUEUE_TYPE_SDMA_XGMI, which is optimized
for non-PCIe transfers such as XGMI.
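
A small sketch of how the two SDMA pools are sized per device
(illustrative helpers; the fields match the kfd_device_info changes
below, where non-XGMI ASICs simply report num_xgmi_sdma_engines = 0):

    static unsigned int num_pcie_sdma_queues(const struct kfd_device_info *info)
    {
            return info->num_sdma_engines * info->num_sdma_queues_per_engine;
    }

    static unsigned int num_xgmi_sdma_queues(const struct kfd_device_info *info)
    {
            return info->num_xgmi_sdma_engines *
                   info->num_sdma_queues_per_engine;
    }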

Signed-off-by: Oak Zeng 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_device.c   |  15 +++
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 123 +-
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |   3 +
 .../gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c  |   2 +
 .../gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c  |   2 +
 .../gpu/drm/amd/amdkfd/kfd_packet_manager.c   |   3 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |   4 +-
 .../amd/amdkfd/kfd_process_queue_manager.c|  10 +-
 include/uapi/linux/kfd_ioctl.h|   7 +-
 10 files changed, 132 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index d795e5018270..c731126ada22 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -213,6 +213,8 @@ static int set_queue_properties_from_user(struct 
queue_properties *q_properties,
q_properties->type = KFD_QUEUE_TYPE_COMPUTE;
else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA)
q_properties->type = KFD_QUEUE_TYPE_SDMA;
+   else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_XGMI)
+   q_properties->type = KFD_QUEUE_TYPE_SDMA_XGMI;
else
return -ENOTSUPP;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 8202a5db3a35..1368b41cb92b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -54,6 +54,7 @@ static const struct kfd_device_info kaveri_device_info = {
.needs_iommu_device = true,
.needs_pci_atomics = false,
.num_sdma_engines = 2,
+   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 
@@ -71,6 +72,7 @@ static const struct kfd_device_info carrizo_device_info = {
.needs_iommu_device = true,
.needs_pci_atomics = false,
.num_sdma_engines = 2,
+   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 
@@ -87,6 +89,7 @@ static const struct kfd_device_info raven_device_info = {
.needs_iommu_device = true,
.needs_pci_atomics = true,
.num_sdma_engines = 1,
+   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 #endif
@@ -105,6 +108,7 @@ static const struct kfd_device_info hawaii_device_info = {
.needs_iommu_device = false,
.needs_pci_atomics = false,
.num_sdma_engines = 2,
+   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 
@@ -121,6 +125,7 @@ static const struct kfd_device_info tonga_device_info = {
.needs_iommu_device = false,
.needs_pci_atomics = true,
.num_sdma_engines = 2,
+   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 
@@ -137,6 +142,7 @@ static const struct kfd_device_info fiji_device_info = {
.needs_iommu_device = false,
.needs_pci_atomics = true,
.num_sdma_engines = 2,
+   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 
@@ -153,6 +159,7 @@ static const struct kfd_device_info fiji_vf_device_info = {
.needs_iommu_device = false,
.needs_pci_atomics = false,
.num_sdma_engines = 2,
+   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 
@@ -170,6 +177,7 @@ static const struct kfd_device_info polaris10_device_info = 
{
.needs_iommu_device = false,
.needs_pci_atomics = true,
.num_sdma_engines = 2,
+   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 
@@ -186,6 +194,7 @@ static const struct kfd_device_info 
polaris10_vf_device_info = {
.needs_iommu_device = false,
.needs_pci_atomics = false,
.num_sdma_engines = 2,
+   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 
@@ -202,6 +211,7 @@ static const struct kfd_device_info polaris11_device_info = 
{
.needs_iommu_device = false,
.needs_pci_atomics = true,
.num_sdma_engines = 2,
+   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 
@@ -218,6 +228,7 @@ static const struct kfd_device_info polaris12_device_info = 
{
.needs_iommu_device = false,
.needs_pci_atomics = true,
.num_sdma_engines = 2,
+   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
 };
 
@@ -234,6 +245,7 @@ static const struct kfd_device_info vega10_device_info = {
.needs_iommu_device = false,
.needs_pci_atomics = false,
.num_sdma_engines = 2,
+   .num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine 

[PATCH 10/27] drm/amdkfd: Allocate MQD trunk for HIQ and SDMA

2019-04-28 Thread Kuehling, Felix
From: Oak Zeng 

MEC FW for some new ASICs requires all SDMA MQDs to be in a contiguous
trunk of memory right after the HIQ MQD. Add a field in the device queue
manager to hold the HIQ/SDMA MQD memory object, and allocate the MQD
trunk at device queue manager initialization.
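
The intended trunk layout and its size, sketched for clarity
(hiq_sdma_trunk_size() is a made-up helper mirroring
allocate_hiq_sdma_mqd() in the diff below):

    /* | HIQ MQD | SDMA MQD 0 | SDMA MQD 1 | ... | SDMA MQD N-1 |
     * with N = num_sdma_engines * num_sdma_queues_per_engine
     */
    static uint32_t hiq_sdma_trunk_size(struct device_queue_manager *dqm)
    {
            struct kfd_dev *dev = dqm->dev;

            return dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size +
                   dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size *
                   dev->device_info->num_sdma_engines *
                   dev->device_info->num_sdma_queues_per_engine;
    }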

Signed-off-by: Oak Zeng 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 32 +++
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  1 +
 2 files changed, 33 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 063625c3646b..e2de246d681b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1620,6 +1620,25 @@ static int init_mqd_managers(struct device_queue_manager 
*dqm)
 
return -ENOMEM;
 }
+
+/* Allocate one hiq mqd (HWS) and all SDMA mqd in a continuous trunk*/
+static int allocate_hiq_sdma_mqd(struct device_queue_manager *dqm)
+{
+   int retval;
+   struct kfd_dev *dev = dqm->dev;
+   struct kfd_mem_obj *mem_obj = &dqm->hiq_sdma_mqd;
+   uint32_t size = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size *
+   dev->device_info->num_sdma_engines *
+   dev->device_info->num_sdma_queues_per_engine +
+   dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
+
+   retval = amdgpu_amdkfd_alloc_gtt_mem(dev->kgd, size,
+   &(mem_obj->gtt_mem), &(mem_obj->gpu_addr),
+   (void *)&(mem_obj->cpu_ptr), true);
+
+   return retval;
+}
+
 struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 {
struct device_queue_manager *dqm;
@@ -1729,6 +1748,11 @@ struct device_queue_manager 
*device_queue_manager_init(struct kfd_dev *dev)
if (init_mqd_managers(dqm))
goto out_free;
 
+   if (allocate_hiq_sdma_mqd(dqm)) {
+   pr_err("Failed to allocate hiq sdma mqd trunk buffer\n");
+   goto out_free;
+   }
+
if (!dqm->ops.initialize(dqm))
return dqm;
 
@@ -1737,9 +1761,17 @@ struct device_queue_manager 
*device_queue_manager_init(struct kfd_dev *dev)
return NULL;
 }
 
+void deallocate_hiq_sdma_mqd(struct kfd_dev *dev, struct kfd_mem_obj *mqd)
+{
+   WARN(!mqd, "No hiq sdma mqd trunk to free");
+
+   amdgpu_amdkfd_free_gtt_mem(dev->kgd, mqd->gtt_mem);
+}
+
 void device_queue_manager_uninit(struct device_queue_manager *dqm)
 {
dqm->ops.uninitialize(dqm);
+   deallocate_hiq_sdma_mqd(dqm->dev, &dqm->hiq_sdma_mqd);
kfree(dqm);
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index a5ef7a6650a5..3742fd340ec3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -197,6 +197,7 @@ struct device_queue_manager {
/* hw exception  */
boolis_hws_hang;
struct work_struct  hw_exception_work;
+   struct kfd_mem_obj  hiq_sdma_mqd;
 };
 
 void device_queue_manager_init_cik(
-- 
2.17.1


[PATCH 21/27] drm/amdkfd: Preserve wave state after instruction fetch MEM_VIOL

2019-04-28 Thread Kuehling, Felix
From: Jay Cornwall 

If instruction fetch fails the wave cannot be halted and returned to
the shader without raising MEM_VIOL again. Currently the wave is
terminated if this occurs, but this loses information about the cause
of the fault. The debugger would prefer the faulting wave state to be
context-saved.

Poll inside the trap handler until TRAPSTS.SAVECTX indicates context
save is ready. Exit the poll loop and complete the remainder of the
exception handler, then return to the shader. The next instruction
fetch will be from the trap handler and not the faulting PC. Context
save will then deschedule the wave and save its state.
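
In C-like pseudocode, the new behaviour of the halted-wave path is
roughly the following (the authoritative version is the GFX9 assembly
in the diff below; the C names are only symbolic):

    /* Old: instruction-fetch MEM_VIOL with STATUS.HALT set -> s_endpgm.
     * New: spin until a context-save request is pending, then run the
     * normal exception epilogue so the wave is saved, not terminated.
     */
    while (!(read_hwreg(HW_REG_TRAPSTS) & SQ_WAVE_TRAPSTS_SAVECTX_MASK))
            s_sleep(0x10);      /* brief wait between polls */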

Signed-off-by: Jay Cornwall 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h| 10 ++
 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 10 --
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h 
b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index ec9a9a99f808..097da0dd3b04 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -274,15 +274,17 @@ static const uint32_t cwsr_trap_gfx8_hex[] = {
 
 
 static const uint32_t cwsr_trap_gfx9_hex[] = {
-   0xbf820001, 0xbf82015d,
+   0xbf820001, 0xbf820161,
0xb8f8f802, 0x89788678,
0xb8f1f803, 0x866eff71,
-   0x0400, 0xbf850037,
+   0x0400, 0xbf85003b,
0x866eff71, 0x0800,
0xbf850003, 0x866eff71,
-   0x0100, 0xbf840008,
+   0x0100, 0xbf84000c,
0x866eff78, 0x2000,
-   0xbf840001, 0xbf81,
+   0xbf840005, 0xbf8e0010,
+   0xb8eef803, 0x866eff6e,
+   0x0400, 0xbf84fffb,
0x8778ff78, 0x2000,
0x80ec886c, 0x82ed806d,
0xb8eef807, 0x866fff6e,
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm 
b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
index 0bb9c577b3a2..6a010c9e55de 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
@@ -266,10 +266,16 @@ if (!EMU_RUN_HACK)
 
 L_HALT_WAVE:
// If STATUS.HALT is set then this fault must come from SQC instruction fetch.
-// We cannot prevent further faults so just terminate the wavefront.
+// We cannot prevent further faults. Spin wait until context saved.
 s_and_b32   ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK
 s_cbranch_scc0  L_NOT_ALREADY_HALTED
-s_endpgm
+
+L_WAIT_CTX_SAVE:
+s_sleep 0x10
+s_getreg_b32ttmp2, hwreg(HW_REG_TRAPSTS)
+s_and_b32   ttmp2, ttmp2, SQ_WAVE_TRAPSTS_SAVECTX_MASK
+s_cbranch_scc0  L_WAIT_CTX_SAVE
+
 L_NOT_ALREADY_HALTED:
 s_or_b32s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK
 
-- 
2.17.1


[PATCH 12/27] drm/amdkfd: Allocate hiq and sdma mqd from mqd trunk

2019-04-28 Thread Kuehling, Felix
From: Oak Zeng 

Instead of allocating HIQ and SDMA MQDs from the sub-allocator, allocate
them from the MQD trunk pool. This is done for all ASICs.
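
For reference, the byte offset of one SDMA MQD inside the trunk works
out as below (sdma_mqd_offset() is a made-up helper; allocate_sdma_mqd()
in the diff computes the same thing inline):

    static uint64_t sdma_mqd_offset(struct kfd_dev *dev,
                                    struct queue_properties *q)
    {
            uint64_t idx = q->sdma_engine_id *
                           dev->device_info->num_sdma_queues_per_engine +
                           q->sdma_queue_id;

            /* The HIQ MQD sits at the start of the trunk, SDMA MQDs follow. */
            return dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size +
                   idx * dev->dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size;
    }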

Signed-off-by: Oak Zeng 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c  | 49 +++
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h  |  7 +++
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c  | 20 +++-
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   | 22 +++--
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c   | 22 +++--
 5 files changed, 80 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
index eeb2b60a36b5..9307811bc427 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
@@ -23,6 +23,55 @@
 
 #include "kfd_mqd_manager.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_device_queue_manager.h"
+
+struct kfd_mem_obj *allocate_hiq_mqd(struct kfd_dev *dev)
+{
+   struct kfd_mem_obj *mqd_mem_obj = NULL;
+
+   mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);
+   if (!mqd_mem_obj)
+   return NULL;
+
+   mqd_mem_obj->gtt_mem = dev->dqm->hiq_sdma_mqd.gtt_mem;
+   mqd_mem_obj->gpu_addr = dev->dqm->hiq_sdma_mqd.gpu_addr;
+   mqd_mem_obj->cpu_ptr = dev->dqm->hiq_sdma_mqd.cpu_ptr;
+
+   return mqd_mem_obj;
+}
+
+struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_dev *dev,
+   struct queue_properties *q)
+{
+   struct kfd_mem_obj *mqd_mem_obj = NULL;
+   uint64_t offset;
+
+   mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);
+   if (!mqd_mem_obj)
+   return NULL;
+
+   offset = (q->sdma_engine_id *
+   dev->device_info->num_sdma_queues_per_engine +
+   q->sdma_queue_id) *
+   dev->dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size;
+
+   offset += dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
+
+   mqd_mem_obj->gtt_mem = (void *)((uint64_t)dev->dqm->hiq_sdma_mqd.gtt_mem
+   + offset);
+   mqd_mem_obj->gpu_addr = dev->dqm->hiq_sdma_mqd.gpu_addr + offset;
+   mqd_mem_obj->cpu_ptr = (uint32_t *)((uint64_t)
+   dev->dqm->hiq_sdma_mqd.cpu_ptr + offset);
+
+   return mqd_mem_obj;
+}
+
+void uninit_mqd_hiq_sdma(struct mqd_manager *mm, void *mqd,
+   struct kfd_mem_obj *mqd_mem_obj)
+{
+   WARN_ON(!mqd_mem_obj->gtt_mem);
+   kfree(mqd_mem_obj);
+}
 
 void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
const uint32_t *cu_mask, uint32_t cu_mask_count,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
index 009d232fb60b..56af256a191b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
@@ -102,6 +102,13 @@ struct mqd_manager {
uint32_t mqd_size;
 };
 
+struct kfd_mem_obj *allocate_hiq_mqd(struct kfd_dev *dev);
+
+struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_dev *dev,
+   struct queue_properties *q);
+void uninit_mqd_hiq_sdma(struct mqd_manager *mm, void *mqd,
+   struct kfd_mem_obj *mqd_mem_obj);
+
 void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
const uint32_t *cu_mask, uint32_t cu_mask_count,
uint32_t *se_mask);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
index a00402077e34..6e8509ec29d9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
@@ -71,6 +71,9 @@ static struct kfd_mem_obj *allocate_mqd(struct kfd_dev *kfd,
 {
struct kfd_mem_obj *mqd_mem_obj;
 
+   if (q->type == KFD_QUEUE_TYPE_HIQ)
+   return allocate_hiq_mqd(kfd);
+
if (kfd_gtt_sa_allocate(kfd, sizeof(struct cik_mqd),
&mqd_mem_obj))
return NULL;
@@ -148,12 +151,10 @@ static int init_mqd_sdma(struct mqd_manager *mm, void 
**mqd,
 {
int retval;
struct cik_sdma_rlc_registers *m;
+   struct kfd_dev *dev = mm->dev;
 
-   retval = kfd_gtt_sa_allocate(mm->dev,
-   sizeof(struct cik_sdma_rlc_registers),
-   mqd_mem_obj);
-
-   if (retval != 0)
+   *mqd_mem_obj = allocate_sdma_mqd(dev, q);
+   if (!*mqd_mem_obj)
return -ENOMEM;
 
m = (struct cik_sdma_rlc_registers *) (*mqd_mem_obj)->cpu_ptr;
@@ -175,11 +176,6 @@ static void uninit_mqd(struct mqd_manager *mm, void *mqd,
kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
 }
 
-static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd,
-   struct kfd_mem_obj *mqd_mem_obj)
-{
-   kfd_gtt_sa_free(mm->dev, 

[PATCH 03/27] drm/amdkfd: Differentiate b/t sdma_id and sdma_queue_id

2019-04-28 Thread Kuehling, Felix
From: Oak Zeng 

sdma_queue_id is the SDMA queue index within one SDMA engine.
sdma_id is the SDMA queue index across all SDMA engines. Use
the two names consistently.
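
The relationship between the two indices, as used elsewhere in the
driver (sdma_id_to_engine_queue() is a made-up helper for illustration):

    static void sdma_id_to_engine_queue(unsigned int sdma_id,
                                        unsigned int num_sdma_engines,
                                        unsigned int *sdma_engine_id,
                                        unsigned int *sdma_queue_id)
    {
            *sdma_engine_id = sdma_id % num_sdma_engines; /* which engine */
            *sdma_queue_id  = sdma_id / num_sdma_engines; /* slot on engine */
    }

    /* e.g. with 2 engines, sdma_id 5 maps to engine 1, queue 2. */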

Signed-off-by: Oak Zeng 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 937ed1a7050d..7e5ead042dc0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -922,7 +922,7 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
 }
 
 static int allocate_sdma_queue(struct device_queue_manager *dqm,
-   unsigned int *sdma_queue_id)
+   unsigned int *sdma_id)
 {
int bit;
 
@@ -931,17 +931,17 @@ static int allocate_sdma_queue(struct 
device_queue_manager *dqm,
 
bit = __ffs64(dqm->sdma_bitmap);
dqm->sdma_bitmap &= ~(1ULL << bit);
-   *sdma_queue_id = bit;
+   *sdma_id = bit;
 
return 0;
 }
 
 static void deallocate_sdma_queue(struct device_queue_manager *dqm,
-   unsigned int sdma_queue_id)
+   unsigned int sdma_id)
 {
-   if (sdma_queue_id >= get_num_sdma_queues(dqm))
+   if (sdma_id >= get_num_sdma_queues(dqm))
return;
-   dqm->sdma_bitmap |= (1ULL << sdma_queue_id);
+   dqm->sdma_bitmap |= (1ULL << sdma_id);
 }
 
 static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
-- 
2.17.1


[PATCH 01/27] drm/amdkfd: Use 64 bit sdma_bitmap

2019-04-28 Thread Kuehling, Felix
From: Oak Zeng 

Support a maximum of 64 SDMA queues.
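
The ULL suffix matters because of the usual C shift-width pitfall; a
minimal illustration:

    uint64_t bitmap = 0;

    bitmap |= 1 << 40;    /* wrong: shift done in 32-bit int, undefined
                           * behaviour for bit positions >= 32 */
    bitmap |= 1ULL << 40; /* correct: 64-bit shift, valid for bits 0..63 */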

Signed-off-by: Oak Zeng 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 10 +-
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 1d6b15788ebf..0b1044dea765 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -891,7 +891,7 @@ static int initialize_nocpsch(struct device_queue_manager 
*dqm)
}
 
dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1;
-   dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1;
+   dqm->sdma_bitmap = (1ULL << get_num_sdma_queues(dqm)) - 1;
 
return 0;
 }
@@ -929,8 +929,8 @@ static int allocate_sdma_queue(struct device_queue_manager 
*dqm,
if (dqm->sdma_bitmap == 0)
return -ENOMEM;
 
-   bit = ffs(dqm->sdma_bitmap) - 1;
-   dqm->sdma_bitmap &= ~(1 << bit);
+   bit = __ffs64(dqm->sdma_bitmap);
+   dqm->sdma_bitmap &= ~(1ULL << bit);
*sdma_queue_id = bit;
 
return 0;
@@ -941,7 +941,7 @@ static void deallocate_sdma_queue(struct 
device_queue_manager *dqm,
 {
if (sdma_queue_id >= get_num_sdma_queues(dqm))
return;
-   dqm->sdma_bitmap |= (1 << sdma_queue_id);
+   dqm->sdma_bitmap |= (1ULL << sdma_queue_id);
 }
 
 static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
@@ -1047,7 +1047,7 @@ static int initialize_cpsch(struct device_queue_manager 
*dqm)
dqm->queue_count = dqm->processes_count = 0;
dqm->sdma_queue_count = 0;
dqm->active_runlist = false;
-   dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1;
+   dqm->sdma_bitmap = (1ULL << get_num_sdma_queues(dqm)) - 1;
 
INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 70e38a2e23b9..2770f3ece89f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -188,7 +188,7 @@ struct device_queue_manager {
unsigned inttotal_queue_count;
unsigned intnext_pipe_to_allocate;
unsigned int*allocated_queues;
-   unsigned intsdma_bitmap;
+   uint64_tsdma_bitmap;
unsigned intvmid_bitmap;
uint64_tpipelines_addr;
struct kfd_mem_obj  *pipeline_mem;
-- 
2.17.1


[PATCH 07/27] drm/amdkfd: Introduce DIQ type mqd manager

2019-04-28 Thread Kuehling, Felix
From: Oak Zeng 

With the introduction of the new MQD allocation scheme for HIQ,
DIQ and HIQ use different MQD allocation schemes, so DIQ can no
longer reuse the HIQ MQD manager.

Signed-off-by: Oak Zeng 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c |  3 +++
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c  | 11 +++
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   | 11 +++
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c   | 11 +++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  1 +
 .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c|  1 -
 6 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index f1596881f20a..58bb3ad233a1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -58,6 +58,9 @@ static bool initialize(struct kernel_queue *kq, struct 
kfd_dev *dev,
kq->nop_packet = nop.u32all;
switch (type) {
case KFD_QUEUE_TYPE_DIQ:
+   kq->mqd_mgr = dev->dqm->ops.get_mqd_manager(dev->dqm,
+   KFD_MQD_TYPE_DIQ);
+   break;
case KFD_QUEUE_TYPE_HIQ:
kq->mqd_mgr = dev->dqm->ops.get_mqd_manager(dev->dqm,
KFD_MQD_TYPE_HIQ);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
index ae90a99909ef..e69bb4d3c3a9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
@@ -413,6 +413,17 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE 
type,
mqd->is_occupied = is_occupied;
 #if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd;
+#endif
+   break;
+   case KFD_MQD_TYPE_DIQ:
+   mqd->init_mqd = init_mqd_hiq;
+   mqd->uninit_mqd = uninit_mqd;
+   mqd->load_mqd = load_mqd;
+   mqd->update_mqd = update_mqd_hiq;
+   mqd->destroy_mqd = destroy_mqd;
+   mqd->is_occupied = is_occupied;
+#if defined(CONFIG_DEBUG_FS)
+   mqd->debugfs_show_mqd = debugfs_show_mqd;
 #endif
break;
case KFD_MQD_TYPE_SDMA:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
index 8fe74b821b32..273aad4f59c8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -475,6 +475,17 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE 
type,
mqd->is_occupied = is_occupied;
 #if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd;
+#endif
+   break;
+   case KFD_MQD_TYPE_DIQ:
+   mqd->init_mqd = init_mqd_hiq;
+   mqd->uninit_mqd = uninit_mqd;
+   mqd->load_mqd = load_mqd;
+   mqd->update_mqd = update_mqd_hiq;
+   mqd->destroy_mqd = destroy_mqd;
+   mqd->is_occupied = is_occupied;
+#if defined(CONFIG_DEBUG_FS)
+   mqd->debugfs_show_mqd = debugfs_show_mqd;
 #endif
break;
case KFD_MQD_TYPE_SDMA:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
index 6469b3456f00..67bd590a82fc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
@@ -472,6 +472,17 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE 
type,
mqd->is_occupied = is_occupied;
 #if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd;
+#endif
+   break;
+   case KFD_MQD_TYPE_DIQ:
+   mqd->init_mqd = init_mqd_hiq;
+   mqd->uninit_mqd = uninit_mqd;
+   mqd->load_mqd = load_mqd;
+   mqd->update_mqd = update_mqd_hiq;
+   mqd->destroy_mqd = destroy_mqd;
+   mqd->is_occupied = is_occupied;
+#if defined(CONFIG_DEBUG_FS)
+   mqd->debugfs_show_mqd = debugfs_show_mqd;
 #endif
break;
case KFD_MQD_TYPE_SDMA:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 3dd48da0e2d6..d1d60336172a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -472,6 +472,7 @@ enum KFD_MQD_TYPE {
KFD_MQD_TYPE_HIQ,   /* for hiq */
KFD_MQD_TYPE_CP,/* for cp queues and diq */
KFD_MQD_TYPE_SDMA,  /* for sdma queues */
+   KFD_MQD_TYPE_DIQ,   /* for diq */
KFD_MQD_TYPE_MAX
 };
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 

[PATCH 05/27] drm/amdkfd: Fix a potential memory leak

2019-04-28 Thread Kuehling, Felix
From: Oak Zeng 

Free mqd_mem_obj if GTT buffer allocation for the MQD+control stack fails.

Signed-off-by: Oak Zeng 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
index 9dbba609450e..8fe74b821b32 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -76,6 +76,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
struct v9_mqd *m;
struct kfd_dev *kfd = mm->dev;
 
+   *mqd_mem_obj = NULL;
/* From V9,  for CWSR, the control stack is located on the next page
 * boundary after the mqd, we will use the gtt allocation function
 * instead of sub-allocation function.
@@ -93,8 +94,10 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
} else
retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd),
mqd_mem_obj);
-   if (retval != 0)
+   if (retval) {
+   kfree(*mqd_mem_obj);
return -ENOMEM;
+   }
 
m = (struct v9_mqd *) (*mqd_mem_obj)->cpu_ptr;
addr = (*mqd_mem_obj)->gpu_addr;
-- 
2.17.1


[PATCH 02/27] drm/amdkfd: Add sdma allocation debug message

2019-04-28 Thread Kuehling, Felix
From: Oak Zeng 

Add debug messages during SDMA queue allocation.

Signed-off-by: Oak Zeng 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 0b1044dea765..937ed1a7050d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1177,6 +1177,9 @@ static int create_queue_cpsch(struct device_queue_manager 
*dqm, struct queue *q,
q->sdma_id / get_num_sdma_engines(dqm);
q->properties.sdma_engine_id =
q->sdma_id % get_num_sdma_engines(dqm);
+   pr_debug("SDMA id is:%d\n", q->sdma_id);
+   pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id);
+   pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
}
 
retval = allocate_doorbell(qpd, q);
-- 
2.17.1


[PATCH 27/27] drm/amdgpu: Fix GTT size calculation

2019-04-28 Thread Kuehling, Felix
From: Kent Russell 

GTT size is currently limited to the minimum of VRAM size or 3/4 of
system memory. This severely limits the quantity of system memory
that can be used by ROCm applications.

Increase GTT size to the maximum of VRAM size or system memory size.
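
A worked example of the old vs. new limit (numbers are illustrative;
AMDGPU_DEFAULT_GTT_SIZE_MB << 20 is 3 GB):

    /* 16 GB VRAM card in a host with 256 GB of system memory:
     *
     *   old: gtt_size = min(max(3 GB, 16 GB), 3/4 * 256 GB)
     *                 = min(16 GB, 192 GB) = 16 GB
     *   new: gtt_size = max3(3 GB, 16 GB, 256 GB) = 256 GB
     */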

Signed-off-by: Kent Russell 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index c14198737dcd..e9ecc3953673 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1740,11 +1740,10 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
struct sysinfo si;
 
si_meminfo(&si);
-   gtt_size = min(max((AMDGPU_DEFAULT_GTT_SIZE_MB << 20),
-  adev->gmc.mc_vram_size),
-  ((uint64_t)si.totalram * si.mem_unit * 3/4));
-   }
-   else
+   gtt_size = max3((AMDGPU_DEFAULT_GTT_SIZE_MB << 20),
+   adev->gmc.mc_vram_size,
+   ((uint64_t)si.totalram * si.mem_unit));
+   } else
gtt_size = (uint64_t)amdgpu_gtt_size << 20;
 
/* Initialize GTT memory pool */
-- 
2.17.1


[PATCH 25/27] drm/amdkfd: Add domain number into gpu_id

2019-04-28 Thread Kuehling, Felix
From: Amber Lin 

A multi-socket server can have multiple PCIe segments, so the BDF
(bus/device/function) alone is not enough to distinguish each GPU.
Take the domain number into account as well when generating the
gpu_id.
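
A sketch of the identity fields that feed gpu_id generation after this
change (see kfd_generate_gpu_id() in the diff below; only the domain
entry is new):

    /* buf[0] = devfn
     * buf[1] = subsystem_vendor | (subsystem_device << 16)
     * buf[2] = pci_domain_nr(bus)   <-- new: PCIe segment/domain
     * buf[3] = device id
     * buf[4] = bus number
     * buf[5] = lower_32_bits(local_mem_size)
     * ...
     */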

Signed-off-by: Amber Lin 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 64099a8494e1..2c06d6c16eab 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -1082,8 +1082,9 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu)
local_mem_info.local_mem_size_public;
 
buf[0] = gpu->pdev->devfn;
-   buf[1] = gpu->pdev->subsystem_vendor;
-   buf[2] = gpu->pdev->subsystem_device;
+   buf[1] = gpu->pdev->subsystem_vendor |
+   (gpu->pdev->subsystem_device << 16);
+   buf[2] = pci_domain_nr(gpu->pdev->bus);
buf[3] = gpu->pdev->device;
buf[4] = gpu->pdev->bus->number;
buf[5] = lower_32_bits(local_mem_size);
-- 
2.17.1


[PATCH 18/27] drm/amdkfd: Delete alloc_format field from map_queue struct

2019-04-28 Thread Kuehling, Felix
From: Oak Zeng 

The alloc format was never really supported by the MEC FW. The FW
always does one-per-pipe allocation.

Signed-off-by: Oak Zeng 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c | 2 --
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c | 2 --
 drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h  | 7 +--
 drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h  | 7 +--
 4 files changed, 2 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
index 604570bea6bd..3dd731c69b5d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
@@ -153,8 +153,6 @@ static int pm_map_queues_v9(struct packet_manager *pm, 
uint32_t *buffer,
 
packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES,
sizeof(struct pm4_mes_map_queues));
-   packet->bitfields2.alloc_format =
-   alloc_format__mes_map_queues__one_per_pipe_vi;
packet->bitfields2.num_queues = 1;
packet->bitfields2.queue_sel =
queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
index 3cdb19826927..2adaf40027eb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
@@ -190,8 +190,6 @@ static int pm_map_queues_vi(struct packet_manager *pm, 
uint32_t *buffer,
 
packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES,
sizeof(struct pm4_mes_map_queues));
-   packet->bitfields2.alloc_format =
-   alloc_format__mes_map_queues__one_per_pipe_vi;
packet->bitfields2.num_queues = 1;
packet->bitfields2.queue_sel =
queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
index f2bcf5c092ea..0661339071f0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
@@ -255,11 +255,6 @@ enum mes_map_queues_queue_type_enum {
 queue_type__mes_map_queues__low_latency_static_queue_vi = 3
 };
 
-enum mes_map_queues_alloc_format_enum {
-   alloc_format__mes_map_queues__one_per_pipe_vi = 0,
-alloc_format__mes_map_queues__all_on_one_pipe_vi = 1
-};
-
 enum mes_map_queues_engine_sel_enum {
engine_sel__mes_map_queues__compute_vi = 0,
engine_sel__mes_map_queues__sdma0_vi = 2,
@@ -279,7 +274,7 @@ struct pm4_mes_map_queues {
enum mes_map_queues_queue_sel_enum queue_sel:2;
uint32_t reserved2:15;
enum mes_map_queues_queue_type_enum queue_type:3;
-   enum mes_map_queues_alloc_format_enum alloc_format:2;
+   uint32_t reserved3:2;
enum mes_map_queues_engine_sel_enum engine_sel:3;
uint32_t num_queues:3;
} bitfields2;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h
index 7c8d9b357749..5466cfe1c3cc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h
@@ -216,11 +216,6 @@ enum mes_map_queues_queue_type_vi_enum {
 queue_type__mes_map_queues__low_latency_static_queue_vi = 3
 };
 
-enum mes_map_queues_alloc_format_vi_enum {
-   alloc_format__mes_map_queues__one_per_pipe_vi = 0,
-alloc_format__mes_map_queues__all_on_one_pipe_vi = 1
-};
-
 enum mes_map_queues_engine_sel_vi_enum {
engine_sel__mes_map_queues__compute_vi = 0,
engine_sel__mes_map_queues__sdma0_vi = 2,
@@ -240,7 +235,7 @@ struct pm4_mes_map_queues {
enum mes_map_queues_queue_sel_vi_enum queue_sel:2;
uint32_t reserved2:15;
enum mes_map_queues_queue_type_vi_enum queue_type:3;
-   enum mes_map_queues_alloc_format_vi_enum alloc_format:2;
+   uint32_t reserved3:2;
enum mes_map_queues_engine_sel_vi_enum engine_sel:3;
uint32_t num_queues:3;
} bitfields2;
-- 
2.17.1


[PATCH 11/27] drm/amdkfd: Move non-sdma mqd allocation out of init_mqd

2019-04-28 Thread Kuehling, Felix
From: Oak Zeng 

This is preparation work for introducing more MQD allocation
schemes.

Signed-off-by: Oak Zeng 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c  | 20 ++--
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   | 51 ---
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c   | 18 +--
 3 files changed, 64 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
index eec131b801b0..a00402077e34 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
@@ -66,6 +66,19 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
m->compute_static_thread_mgmt_se3);
 }
 
+static struct kfd_mem_obj *allocate_mqd(struct kfd_dev *kfd,
+   struct queue_properties *q)
+{
+   struct kfd_mem_obj *mqd_mem_obj;
+
+   if (kfd_gtt_sa_allocate(kfd, sizeof(struct cik_mqd),
+   &mqd_mem_obj))
+   return NULL;
+
+   return mqd_mem_obj;
+}
+
+
 static int init_mqd(struct mqd_manager *mm, void **mqd,
struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
struct queue_properties *q)
@@ -73,11 +86,10 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
uint64_t addr;
struct cik_mqd *m;
int retval;
+   struct kfd_dev *kfd = mm->dev;
 
-   retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct cik_mqd),
-   mqd_mem_obj);
-
-   if (retval != 0)
+   *mqd_mem_obj = allocate_mqd(kfd, q);
+   if (!*mqd_mem_obj)
return -ENOMEM;
 
m = (struct cik_mqd *) (*mqd_mem_obj)->cpu_ptr;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
index 15274a880ea2..8f8166189fd5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -67,38 +67,53 @@ static void update_cu_mask(struct mqd_manager *mm, void 
*mqd,
m->compute_static_thread_mgmt_se3);
 }
 
-static int init_mqd(struct mqd_manager *mm, void **mqd,
-   struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
-   struct queue_properties *q)
+static struct kfd_mem_obj *allocate_mqd(struct kfd_dev *kfd,
+   struct queue_properties *q)
 {
int retval;
-   uint64_t addr;
-   struct v9_mqd *m;
-   struct kfd_dev *kfd = mm->dev;
+   struct kfd_mem_obj *mqd_mem_obj = NULL;
 
-   *mqd_mem_obj = NULL;
/* From V9,  for CWSR, the control stack is located on the next page
 * boundary after the mqd, we will use the gtt allocation function
 * instead of sub-allocation function.
 */
if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) {
-   *mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);
-   if (!*mqd_mem_obj)
-   return -ENOMEM;
+   mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO);
+   if (!mqd_mem_obj)
+   return NULL;
retval = amdgpu_amdkfd_alloc_gtt_mem(kfd->kgd,
ALIGN(q->ctl_stack_size, PAGE_SIZE) +
ALIGN(sizeof(struct v9_mqd), PAGE_SIZE),
-   &((*mqd_mem_obj)->gtt_mem),
-   &((*mqd_mem_obj)->gpu_addr),
-   (void *)&((*mqd_mem_obj)->cpu_ptr), true);
-   } else
-   retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd),
-   mqd_mem_obj);
+   &(mqd_mem_obj->gtt_mem),
+   &(mqd_mem_obj->gpu_addr),
+   (void *)&(mqd_mem_obj->cpu_ptr), true);
+   } else {
+   retval = kfd_gtt_sa_allocate(kfd, sizeof(struct v9_mqd),
+   &mqd_mem_obj);
+   }
+
if (retval) {
-   kfree(*mqd_mem_obj);
-   return -ENOMEM;
+   kfree(mqd_mem_obj);
+   return NULL;
}
 
+   return mqd_mem_obj;
+
+}
+
+static int init_mqd(struct mqd_manager *mm, void **mqd,
+   struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+   struct queue_properties *q)
+{
+   int retval;
+   uint64_t addr;
+   struct v9_mqd *m;
+   struct kfd_dev *kfd = mm->dev;
+
+   *mqd_mem_obj = allocate_mqd(kfd, q);
+   if (!*mqd_mem_obj)
+   return -ENOMEM;
+
m = (struct v9_mqd *) (*mqd_mem_obj)->cpu_ptr;
addr = (*mqd_mem_obj)->gpu_addr;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
index ad9dc9a678f2..3296ffbde6ac 100644
--- 

[PATCH 00/27] KFD upstreaming

2019-04-28 Thread Kuehling, Felix
Assorted KFD changes that have been accumulating on amd-kfd-staging. New
features and fixes included:
* Support for VegaM
* Support for systems with multiple PCI domains
* New SDMA queue type that's optimized for XGMI links
* SDMA MQD allocation changes to support future ASICs with more SDMA queues
* Fix for compute profile switching at process termination
* Fix for a circular lock dependency in MMU notifiers
* Fix for TLB flushing bug with XGMI enabled
* Fix for artificial GTT system memory limitation
* Trap handler updates

Amber Lin (1):
  drm/amdkfd: Add domain number into gpu_id

Felix Kuehling (1):
  drm/amdkfd: Fix a circular lock dependency

Harish Kasiviswanathan (1):
  drm/amdkfd: Fix compute profile switching

Jay Cornwall (4):
  drm/amdkfd: Fix gfx8 MEM_VIOL exception handler
  drm/amdkfd: Preserve wave state after instruction fetch MEM_VIOL
  drm/amdkfd: Fix gfx9 XNACK state save/restore
  drm/amdkfd: Preserve ttmp[4:5] instead of ttmp[14:15]

Kent Russell (2):
  drm/amdkfd: Add VegaM support
  drm/amdgpu: Fix GTT size calculation

Oak Zeng (16):
  drm/amdkfd: Use 64 bit sdma_bitmap
  drm/amdkfd: Add sdma allocation debug message
  drm/amdkfd: Differentiate b/t sdma_id and sdma_queue_id
  drm/amdkfd: Shift sdma_engine_id and sdma_queue_id in mqd
  drm/amdkfd: Fix a potential memory leak
  drm/amdkfd: Introduce asic-specific mqd_manager_init function
  drm/amdkfd: Introduce DIQ type mqd manager
  drm/amdkfd: Init mqd managers in device queue manager init
  drm/amdkfd: Add mqd size in mqd manager struct
  drm/amdkfd: Allocate MQD trunk for HIQ and SDMA
  drm/amdkfd: Move non-sdma mqd allocation out of init_mqd
  drm/amdkfd: Allocate hiq and sdma mqd from mqd trunk
  drm/amdkfd: Fix sdma queue map issue
  drm/amdkfd: Introduce XGMI SDMA queue type
  drm/amdkfd: Expose sdma engine numbers to topology
  drm/amdkfd: Delete alloc_format field from map_queue struct

Yong Zhao (1):
  drm/amdkfd: Move sdma_queue_id calculation into allocate_sdma_queue()

shaoyunl (1):
  drm/amdgpu: Use heavy weight for tlb invalidation on xgmi
configuration

 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  53 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   |   9 +-
 .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h| 483 +-
 .../drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm |  13 -
 .../drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm |  63 +--
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_crat.c |   5 +
 drivers/gpu/drm/amd/amdkfd/kfd_device.c   |  51 ++
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 354 -
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  14 +-
 .../amd/amdkfd/kfd_device_queue_manager_cik.c |   2 +
 .../amd/amdkfd/kfd_device_queue_manager_v9.c  |   1 +
 .../amd/amdkfd/kfd_device_queue_manager_vi.c  |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c  |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c |   6 +-
 .../gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c  |   4 +-
 .../gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c  |   4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c  |  70 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h  |   8 +
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c  |  53 +-
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  85 +--
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c   |  53 +-
 .../gpu/drm/amd/amdkfd/kfd_packet_manager.c   |   4 +-
 .../gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h   |   7 +-
 .../gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h   |   7 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  14 +-
 .../amd/amdkfd/kfd_process_queue_manager.c|  14 +-
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c |  13 +-
 drivers/gpu/drm/amd/amdkfd/kfd_topology.h |   2 +
 drivers/gpu/drm/amd/include/cik_structs.h |   3 +-
 drivers/gpu/drm/amd/include/v9_structs.h  |   3 +-
 drivers/gpu/drm/amd/include/vi_structs.h  |   3 +-
 include/uapi/linux/kfd_ioctl.h|   7 +-
 33 files changed, 826 insertions(+), 587 deletions(-)

-- 
2.17.1


[PATCH 04/27] drm/amdkfd: Shift sdma_engine_id and sdma_queue_id in mqd

2019-04-28 Thread Kuehling, Felix
From: Oak Zeng 

The FW of some new ASICs requires the SDMA MQD size to be no more than
128 dwords. Repurpose the last two reserved fields of the SDMA MQD for
driver-internal use, so the total MQD size stays within 128 dwords.
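
A hedged sanity-check sketch of the constraint being preserved (not
part of the patch; the struct name is taken from the v9 header below):

    /* Hypothetical compile-time check that could sit inside an init
     * function: the FW-visible SDMA MQD must stay within 128 dwords
     * (512 bytes), which is why the driver-private ids reuse the old
     * reserved_126/127 slots instead of growing the structure.
     */
    BUILD_BUG_ON(sizeof(struct v9_sdma_mqd) > 128 * sizeof(uint32_t));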

Signed-off-by: Oak Zeng 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/include/cik_structs.h | 3 +--
 drivers/gpu/drm/amd/include/v9_structs.h  | 3 +--
 drivers/gpu/drm/amd/include/vi_structs.h  | 3 +--
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/include/cik_structs.h 
b/drivers/gpu/drm/amd/include/cik_structs.h
index 749eab94e335..699e658c3cec 100644
--- a/drivers/gpu/drm/amd/include/cik_structs.h
+++ b/drivers/gpu/drm/amd/include/cik_structs.h
@@ -282,8 +282,7 @@ struct cik_sdma_rlc_registers {
uint32_t reserved_123;
uint32_t reserved_124;
uint32_t reserved_125;
-   uint32_t reserved_126;
-   uint32_t reserved_127;
+   /* reserved_126,127: repurposed for driver-internal use */
uint32_t sdma_engine_id;
uint32_t sdma_queue_id;
 };
diff --git a/drivers/gpu/drm/amd/include/v9_structs.h 
b/drivers/gpu/drm/amd/include/v9_structs.h
index ceaf4932258d..8b383dbe1cda 100644
--- a/drivers/gpu/drm/amd/include/v9_structs.h
+++ b/drivers/gpu/drm/amd/include/v9_structs.h
@@ -151,8 +151,7 @@ struct v9_sdma_mqd {
uint32_t reserved_123;
uint32_t reserved_124;
uint32_t reserved_125;
-   uint32_t reserved_126;
-   uint32_t reserved_127;
+   /* reserved_126,127: repurposed for driver-internal use */
uint32_t sdma_engine_id;
uint32_t sdma_queue_id;
 };
diff --git a/drivers/gpu/drm/amd/include/vi_structs.h 
b/drivers/gpu/drm/amd/include/vi_structs.h
index 717fbae1d362..c17613287cd0 100644
--- a/drivers/gpu/drm/amd/include/vi_structs.h
+++ b/drivers/gpu/drm/amd/include/vi_structs.h
@@ -151,8 +151,7 @@ struct vi_sdma_mqd {
uint32_t reserved_123;
uint32_t reserved_124;
uint32_t reserved_125;
-   uint32_t reserved_126;
-   uint32_t reserved_127;
+   /* reserved_126,127: repurposed for driver-internal use */
uint32_t sdma_engine_id;
uint32_t sdma_queue_id;
 };
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 09/27] drm/amdkfd: Add mqd size in mqd manager struct

2019-04-28 Thread Kuehling, Felix
From: Oak Zeng 

Also initialize the MQD size during MQD manager initialization.

Signed-off-by: Oak Zeng 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h | 1 +
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 4 
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c  | 4 
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c  | 4 
 4 files changed, 13 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
index f8261313ae7b..009d232fb60b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
@@ -99,6 +99,7 @@ struct mqd_manager {
 
struct mutexmqd_mutex;
struct kfd_dev  *dev;
+   uint32_t mqd_size;
 };
 
 void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
index e69bb4d3c3a9..eec131b801b0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
@@ -400,6 +400,7 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE 
type,
mqd->update_mqd = update_mqd;
mqd->destroy_mqd = destroy_mqd;
mqd->is_occupied = is_occupied;
+   mqd->mqd_size = sizeof(struct cik_mqd);
 #if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd;
 #endif
@@ -411,6 +412,7 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE 
type,
mqd->update_mqd = update_mqd_hiq;
mqd->destroy_mqd = destroy_mqd;
mqd->is_occupied = is_occupied;
+   mqd->mqd_size = sizeof(struct cik_mqd);
 #if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd;
 #endif
@@ -422,6 +424,7 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE 
type,
mqd->update_mqd = update_mqd_hiq;
mqd->destroy_mqd = destroy_mqd;
mqd->is_occupied = is_occupied;
+   mqd->mqd_size = sizeof(struct cik_mqd);
 #if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd;
 #endif
@@ -433,6 +436,7 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE 
type,
mqd->update_mqd = update_mqd_sdma;
mqd->destroy_mqd = destroy_mqd_sdma;
mqd->is_occupied = is_occupied_sdma;
+   mqd->mqd_size = sizeof(struct cik_sdma_rlc_registers);
 #if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
index 273aad4f59c8..15274a880ea2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -462,6 +462,7 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE 
type,
mqd->destroy_mqd = destroy_mqd;
mqd->is_occupied = is_occupied;
mqd->get_wave_state = get_wave_state;
+   mqd->mqd_size = sizeof(struct v9_mqd);
 #if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd;
 #endif
@@ -473,6 +474,7 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE 
type,
mqd->update_mqd = update_mqd_hiq;
mqd->destroy_mqd = destroy_mqd;
mqd->is_occupied = is_occupied;
+   mqd->mqd_size = sizeof(struct v9_mqd);
 #if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd;
 #endif
@@ -484,6 +486,7 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE 
type,
mqd->update_mqd = update_mqd_hiq;
mqd->destroy_mqd = destroy_mqd;
mqd->is_occupied = is_occupied;
+   mqd->mqd_size = sizeof(struct v9_mqd);
 #if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd;
 #endif
@@ -495,6 +498,7 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE 
type,
mqd->update_mqd = update_mqd_sdma;
mqd->destroy_mqd = destroy_mqd_sdma;
mqd->is_occupied = is_occupied_sdma;
+   mqd->mqd_size = sizeof(struct v9_sdma_mqd);
 #if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
index 67bd590a82fc..ad9dc9a678f2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
@@ -459,6 +459,7 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE 
type,
mqd->destroy_mqd = destroy_mqd;
mqd->is_occupied = is_occupied;
mqd->get_wave_state = 

[PATCH 24/27] drm/amdkfd: Add VegaM support

2019-04-28 Thread Kuehling, Felix
From: Kent Russell 

Add the VegaM information to KFD

Signed-off-by: Kent Russell 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_crat.c |  5 +
 drivers/gpu/drm/amd/amdkfd/kfd_device.c   | 20 +++
 .../drm/amd/amdkfd/kfd_device_queue_manager.c |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c  |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c |  1 +
 .../gpu/drm/amd/amdkfd/kfd_packet_manager.c   |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c |  1 +
 7 files changed, 30 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index 1714900035d7..59f8ca4297db 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -134,6 +134,7 @@ static struct kfd_gpu_cache_info carrizo_cache_info[] = {
 #define polaris10_cache_info carrizo_cache_info
 #define polaris11_cache_info carrizo_cache_info
 #define polaris12_cache_info carrizo_cache_info
+#define vegam_cache_info carrizo_cache_info
 /* TODO - check & update Vega10 cache details */
 #define vega10_cache_info carrizo_cache_info
 #define raven_cache_info carrizo_cache_info
@@ -652,6 +653,10 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
pcache_info = polaris12_cache_info;
num_of_cache_types = ARRAY_SIZE(polaris12_cache_info);
break;
+   case CHIP_VEGAM:
+   pcache_info = vegam_cache_info;
+   num_of_cache_types = ARRAY_SIZE(vegam_cache_info);
+   break;
case CHIP_VEGA10:
case CHIP_VEGA12:
case CHIP_VEGA20:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 1368b41cb92b..a53dda9071b1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -232,6 +232,23 @@ static const struct kfd_device_info polaris12_device_info 
= {
.num_sdma_queues_per_engine = 2,
 };
 
+static const struct kfd_device_info vegam_device_info = {
+   .asic_family = CHIP_VEGAM,
+   .max_pasid_bits = 16,
+   .max_no_of_hqd  = 24,
+   .doorbell_size  = 4,
+   .ih_ring_entry_size = 4 * sizeof(uint32_t),
+   .event_interrupt_class = &event_interrupt_class_cik,
+   .num_of_watch_points = 4,
+   .mqd_size_aligned = MQD_SIZE_ALIGNED,
+   .supports_cwsr = true,
+   .needs_iommu_device = false,
+   .needs_pci_atomics = true,
+   .num_sdma_engines = 2,
+   .num_xgmi_sdma_engines = 0,
+   .num_sdma_queues_per_engine = 2,
+};
+
 static const struct kfd_device_info vega10_device_info = {
.asic_family = CHIP_VEGA10,
.max_pasid_bits = 16,
@@ -387,6 +404,9 @@ static const struct kfd_deviceid supported_devices[] = {
{ 0x6995, &polaris12_device_info }, /* Polaris12 */
{ 0x6997, &polaris12_device_info }, /* Polaris12 */
{ 0x699F, &polaris12_device_info }, /* Polaris12 */
+   { 0x694C, &vegam_device_info }, /* VegaM */
+   { 0x694E, &vegam_device_info }, /* VegaM */
+   { 0x694F, &vegam_device_info }, /* VegaM */
{ 0x6860, &vega10_device_info },/* Vega10 */
{ 0x6861, &vega10_device_info },/* Vega10 */
{ 0x6862, &vega10_device_info },/* Vega10 */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 0bfdb141b6e7..ece35c7a77b5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1811,6 +1811,7 @@ struct device_queue_manager 
*device_queue_manager_init(struct kfd_dev *dev)
case CHIP_POLARIS10:
case CHIP_POLARIS11:
case CHIP_POLARIS12:
+   case CHIP_VEGAM:
device_queue_manager_init_vi_tonga(>asic_ops);
break;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
index 213ea5454d11..dc7339825b5c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
@@ -398,6 +398,7 @@ int kfd_init_apertures(struct kfd_process *process)
case CHIP_POLARIS10:
case CHIP_POLARIS11:
case CHIP_POLARIS12:
+   case CHIP_VEGAM:
kfd_init_apertures_vi(pdd, id);
break;
case CHIP_VEGA10:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 7a737b50bed4..1cc03b3ddbb9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -315,6 +315,7 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
case CHIP_POLARIS10:
case CHIP_POLARIS11:
case CHIP_POLARIS12:
+   case CHIP_VEGAM:

[PATCH 06/27] drm/amdkfd: Introduce asic-specific mqd_manager_init function

2019-04-28 Thread Kuehling, Felix
From: Oak Zeng 

The global function mqd_manager_init just calls ASIC-specific functions
and is not necessary. Delete it and introduce an mqd_manager_init
interface in the DQM asic_ops for ASIC-specific MQD manager
initialization, and call that interface directly to initialize the MQD
manager.

Signed-off-by: Oak Zeng 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c |  2 +-
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  2 ++
 .../amd/amdkfd/kfd_device_queue_manager_cik.c |  2 ++
 .../amd/amdkfd/kfd_device_queue_manager_v9.c  |  1 +
 .../amd/amdkfd/kfd_device_queue_manager_vi.c  |  2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c  | 29 ---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  2 --
 7 files changed, 8 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 7e5ead042dc0..a5a8643c04fc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -583,7 +583,7 @@ static struct mqd_manager *get_mqd_manager(
 
mqd_mgr = dqm->mqd_mgrs[type];
if (!mqd_mgr) {
-   mqd_mgr = mqd_manager_init(type, dqm->dev);
+   mqd_mgr = dqm->asic_ops.mqd_manager_init(type, dqm->dev);
if (!mqd_mgr)
pr_err("mqd manager is NULL");
dqm->mqd_mgrs[type] = mqd_mgr;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 2770f3ece89f..a5d83ec1c6a8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -158,6 +158,8 @@ struct device_queue_manager_asic_ops {
void(*init_sdma_vm)(struct device_queue_manager *dqm,
struct queue *q,
struct qcm_process_device *qpd);
+   struct mqd_manager *(*mqd_manager_init)(enum KFD_MQD_TYPE type,
+struct kfd_dev *dev);
 };
 
 /**
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c
index aed4c21417bf..0d26506798cf 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c
@@ -48,6 +48,7 @@ void device_queue_manager_init_cik(
asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik;
asic_ops->update_qpd = update_qpd_cik;
asic_ops->init_sdma_vm = init_sdma_vm;
+   asic_ops->mqd_manager_init = mqd_manager_init_cik;
 }
 
 void device_queue_manager_init_cik_hawaii(
@@ -56,6 +57,7 @@ void device_queue_manager_init_cik_hawaii(
asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik;
asic_ops->update_qpd = update_qpd_cik_hawaii;
asic_ops->init_sdma_vm = init_sdma_vm_hawaii;
+   asic_ops->mqd_manager_init = mqd_manager_init_cik_hawaii;
 }
 
 static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
index 417515332c35..e9fe39382371 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
@@ -37,6 +37,7 @@ void device_queue_manager_init_v9(
 {
asic_ops->update_qpd = update_qpd_v9;
asic_ops->init_sdma_vm = init_sdma_vm_v9;
+   asic_ops->mqd_manager_init = mqd_manager_init_v9;
 }
 
 static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
index c3a5dcfe877a..3a7cb2f88366 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
@@ -54,6 +54,7 @@ void device_queue_manager_init_vi(
asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi;
asic_ops->update_qpd = update_qpd_vi;
asic_ops->init_sdma_vm = init_sdma_vm;
+   asic_ops->mqd_manager_init = mqd_manager_init_vi;
 }
 
 void device_queue_manager_init_vi_tonga(
@@ -62,6 +63,7 @@ void device_queue_manager_init_vi_tonga(
asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi_tonga;
asic_ops->update_qpd = update_qpd_vi_tonga;
asic_ops->init_sdma_vm = init_sdma_vm_tonga;
+   asic_ops->mqd_manager_init = mqd_manager_init_vi_tonga;
 }
 
 static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
index aed9b9b82213..eeb2b60a36b5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
+++ 

[PATCH 17/27] drm/amdkfd: Expose sdma engine numbers to topology

2019-04-28 Thread Kuehling, Felix
From: Oak Zeng 

Expose the number of available SDMA engines of both types (regular and XGMI) in the topology.
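
For illustration, user space can pick up the new properties from the KFD
topology sysfs node. A small sketch is shown below; it assumes the usual
KFD topology sysfs location and node index 0, and trims error handling:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Assumed path; pick the node index that matches your GPU. */
	FILE *f = fopen("/sys/class/kfd/kfd/topology/nodes/0/properties", "r");
	char name[64];
	unsigned long long val;

	if (!f)
		return 1;
	while (fscanf(f, "%63s %llu", name, &val) == 2) {
		if (!strcmp(name, "num_sdma_engines") ||
		    !strcmp(name, "num_sdma_xgmi_engines"))
			printf("%s = %llu\n", name, val);
	}
	fclose(f);
	return 0;
}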

Signed-off-by: Oak Zeng 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 7 +++
 drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 2 ++
 2 files changed, 9 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 2cb09e088dce..e536f4b6698f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -476,6 +476,10 @@ static ssize_t node_show(struct kobject *kobj, struct 
attribute *attr,
dev->node_props.drm_render_minor);
sysfs_show_64bit_prop(buffer, "hive_id",
dev->node_props.hive_id);
+   sysfs_show_32bit_prop(buffer, "num_sdma_engines",
+   dev->node_props.num_sdma_engines);
+   sysfs_show_32bit_prop(buffer, "num_sdma_xgmi_engines",
+   dev->node_props.num_sdma_xgmi_engines);
 
if (dev->gpu) {
log_max_watch_addr =
@@ -1282,6 +1286,9 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
gpu->shared_resources.drm_render_minor;
 
dev->node_props.hive_id = gpu->hive_id;
+   dev->node_props.num_sdma_engines = gpu->device_info->num_sdma_engines;
+   dev->node_props.num_sdma_xgmi_engines =
+   gpu->device_info->num_xgmi_sdma_engines;
 
kfd_fill_mem_clk_max_info(dev);
kfd_fill_iolink_non_crat_info(dev);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
index 84710cfd23c2..949e885dfb53 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
@@ -78,6 +78,8 @@ struct kfd_node_properties {
uint32_t max_engine_clk_fcompute;
uint32_t max_engine_clk_ccompute;
int32_t  drm_render_minor;
+   uint32_t num_sdma_engines;
+   uint32_t num_sdma_xgmi_engines;
uint16_t marketing_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE];
 };
 
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 08/27] drm/amdkfd: Init mqd managers in device queue manager init

2019-04-28 Thread Kuehling, Felix
From: Oak Zeng 

Previously MQD managers were initialized on demand. As there are only a
few types of MQD managers, the on-demand initialization doesn't save
much memory. Initialize them during device queue manager initialization
instead and delete the get_mqd_manager interface. This makes the code
better organized for future changes.
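
Roughly, the upfront initialization amounts to the following sketch
(illustrative only, relying on the per-ASIC mqd_manager_init callback
introduced earlier in this series; error handling omitted):

	/* i iterates over the KFD_MQD_TYPE enum values */
	for (i = 0; i < KFD_MQD_TYPE_MAX; i++) {
		dqm->mqd_mgrs[i] = dqm->asic_ops.mqd_manager_init(i, dqm->dev);
		if (!dqm->mqd_mgrs[i])
			goto out_free;	/* illustrative error path */
	}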

Signed-off-by: Oak Zeng 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 127 ++
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |   6 -
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c |   6 +-
 .../amd/amdkfd/kfd_process_queue_manager.c|   3 +-
 4 files changed, 47 insertions(+), 95 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index a5a8643c04fc..063625c3646b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -368,9 +368,7 @@ static int create_compute_queue_nocpsch(struct 
device_queue_manager *dqm,
struct mqd_manager *mqd_mgr;
int retval;
 
-   mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE);
-   if (!mqd_mgr)
-   return -ENOMEM;
+   mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_COMPUTE];
 
retval = allocate_hqd(dqm, q);
if (retval)
@@ -425,10 +423,8 @@ static int destroy_queue_nocpsch_locked(struct 
device_queue_manager *dqm,
int retval;
struct mqd_manager *mqd_mgr;
 
-   mqd_mgr = dqm->ops.get_mqd_manager(dqm,
-   get_mqd_type_from_queue_type(q->properties.type));
-   if (!mqd_mgr)
-   return -ENOMEM;
+   mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+   q->properties.type)];
 
if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) {
deallocate_hqd(dqm, q);
@@ -501,12 +497,8 @@ static int update_queue(struct device_queue_manager *dqm, 
struct queue *q)
retval = -ENODEV;
goto out_unlock;
}
-   mqd_mgr = dqm->ops.get_mqd_manager(dqm,
-   get_mqd_type_from_queue_type(q->properties.type));
-   if (!mqd_mgr) {
-   retval = -ENOMEM;
-   goto out_unlock;
-   }
+   mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+   q->properties.type)];
/*
 * Eviction state logic: we only mark active queues as evicted
 * to avoid the overhead of restoring inactive queues later
@@ -571,27 +563,6 @@ static int update_queue(struct device_queue_manager *dqm, 
struct queue *q)
return retval;
 }
 
-static struct mqd_manager *get_mqd_manager(
-   struct device_queue_manager *dqm, enum KFD_MQD_TYPE type)
-{
-   struct mqd_manager *mqd_mgr;
-
-   if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
-   return NULL;
-
-   pr_debug("mqd type %d\n", type);
-
-   mqd_mgr = dqm->mqd_mgrs[type];
-   if (!mqd_mgr) {
-   mqd_mgr = dqm->asic_ops.mqd_manager_init(type, dqm->dev);
-   if (!mqd_mgr)
-   pr_err("mqd manager is NULL");
-   dqm->mqd_mgrs[type] = mqd_mgr;
-   }
-
-   return mqd_mgr;
-}
-
 static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
 {
@@ -612,13 +583,8 @@ static int evict_process_queues_nocpsch(struct 
device_queue_manager *dqm,
list_for_each_entry(q, &qpd->queues_list, list) {
if (!q->properties.is_active)
continue;
-   mqd_mgr = dqm->ops.get_mqd_manager(dqm,
-   get_mqd_type_from_queue_type(q->properties.type));
-   if (!mqd_mgr) { /* should not be here */
-   pr_err("Cannot evict queue, mqd mgr is NULL\n");
-   retval = -ENOMEM;
-   goto out;
-   }
+   mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+   q->properties.type)];
q->properties.is_evicted = true;
q->properties.is_active = false;
retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
@@ -717,13 +683,8 @@ static int restore_process_queues_nocpsch(struct 
device_queue_manager *dqm,
list_for_each_entry(q, &qpd->queues_list, list) {
if (!q->properties.is_evicted)
continue;
-   mqd_mgr = dqm->ops.get_mqd_manager(dqm,
-   get_mqd_type_from_queue_type(q->properties.type));
-   if (!mqd_mgr) { /* should not be here */
-   pr_err("Cannot restore queue, mqd mgr is NULL\n");
-   retval = -ENOMEM;
-   goto out;
-   }
+   mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+  

[PATCH 22/27] drm/amdkfd: Fix gfx9 XNACK state save/restore

2019-04-28 Thread Kuehling, Felix
From: Jay Cornwall 

SQ_WAVE_IB_STS.RCNT grew from 4 bits to 5 in gfx9. Do not truncate
when saving in the high bits of TTMP1.
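
For reference, the PC_HI (TTMP1) save layout implied by the constants
changed below (derived from the masks in this patch, not from separate
documentation):

	/* before: RCNT in bits [31:28] (0xF0000000), FIRST_REPLAY in bit 27 (0x08000000)
	 * after:  RCNT in bits [31:27] (0xF8000000), FIRST_REPLAY in bit 26 (0x04000000)
	 * bits [15:0] continue to hold PC[47:32]
	 */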

Signed-off-by: Jay Cornwall 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h   | 12 ++--
 .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm|  8 
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h 
b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index 097da0dd3b04..eed845b4e9a7 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -310,8 +310,8 @@ static const uint32_t cwsr_trap_gfx9_hex[] = {
0xbe801f6c, 0x866dff6d,
0x, 0xbef00080,
0xb9700283, 0xb8f02407,
-   0x8e709c70, 0x876d706d,
-   0xb8f003c7, 0x8e709b70,
+   0x8e709b70, 0x876d706d,
+   0xb8f003c7, 0x8e709a70,
0x876d706d, 0xb8f0f807,
0x8670ff70, 0x7fff,
0xb970f807, 0xbeee007e,
@@ -549,11 +549,11 @@ static const uint32_t cwsr_trap_gfx9_hex[] = {
0x0048, 0xc0031e77,
0x0058, 0xc0071eb7,
0x005c, 0xbf8cc07f,
-   0x866fff6d, 0xf000,
-   0x8f6f9c6f, 0x8e6f906f,
+   0x866fff6d, 0xf800,
+   0x8f6f9b6f, 0x8e6f906f,
0xbeee0080, 0x876e6f6e,
-   0x866fff6d, 0x0800,
-   0x8f6f9b6f, 0x8e6f8f6f,
+   0x866fff6d, 0x0400,
+   0x8f6f9a6f, 0x8e6f8f6f,
0x876e6f6e, 0x866fff70,
0x0080, 0x8f6f976f,
0xb96ef807, 0x866dff6d,
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm 
b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
index 6a010c9e55de..e1ac34517642 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
@@ -150,10 +150,10 @@ var S_SAVE_SPI_INIT_MTYPE_SHIFT   =   28
 var S_SAVE_SPI_INIT_FIRST_WAVE_MASK=   0x0400  //bit[26]: 
FirstWaveInTG
 var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT   =   26
 
-var S_SAVE_PC_HI_RCNT_SHIFT=   28  //FIXME  check 
with Brian to ensure all fields other than PC[47:0] can be used
-var S_SAVE_PC_HI_RCNT_MASK =   0xF000  //FIXME
-var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT=   27  //FIXME
-var S_SAVE_PC_HI_FIRST_REPLAY_MASK =   0x0800  //FIXME
+var S_SAVE_PC_HI_RCNT_SHIFT=   27  //FIXME  check 
with Brian to ensure all fields other than PC[47:0] can be used
+var S_SAVE_PC_HI_RCNT_MASK =   0xF800  //FIXME
+var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT=   26  //FIXME
+var S_SAVE_PC_HI_FIRST_REPLAY_MASK =   0x0400  //FIXME
 
 var s_save_spi_init_lo =   exec_lo
 var s_save_spi_init_hi =   exec_hi
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 15/27] drm/amdkfd: Fix sdma queue map issue

2019-04-28 Thread Kuehling, Felix
From: Oak Zeng 

The previous code assumes there are two SDMA engines. This is not true
for all ASICs; e.g., Raven has only one SDMA engine. Fix the issue by
using the SDMA engine count from device_info.

Signed-off-by: Oak Zeng 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 21 +++
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index bac1f36d38a2..d41045d3fc3a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1212,12 +1212,17 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
return 0;
 }
 
-static int unmap_sdma_queues(struct device_queue_manager *dqm,
-   unsigned int sdma_engine)
+static int unmap_sdma_queues(struct device_queue_manager *dqm)
 {
-   return pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA,
-   KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false,
-   sdma_engine);
+   int i, retval = 0;
+
+   for (i = 0; i < dqm->dev->device_info->num_sdma_engines; i++) {
+   retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA,
+   KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false, i);
+   if (retval)
+   return retval;
+   }
+   return retval;
 }
 
 /* dqm->lock mutex has to be locked before calling this function */
@@ -1256,10 +1261,8 @@ static int unmap_queues_cpsch(struct 
device_queue_manager *dqm,
pr_debug("Before destroying queues, sdma queue count is : %u\n",
dqm->sdma_queue_count);
 
-   if (dqm->sdma_queue_count > 0) {
-   unmap_sdma_queues(dqm, 0);
-   unmap_sdma_queues(dqm, 1);
-   }
+   if (dqm->sdma_queue_count > 0)
+   unmap_sdma_queues(dqm);
 
retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_COMPUTE,
filter, filter_param, false, 0);
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 14/27] drm/amdkfd: Fix compute profile switching

2019-04-28 Thread Kuehling, Felix
From: Harish Kasiviswanathan 

Fix compute profile switching on process termination.

Add a dedicated reference counter to keep track of entries into and
exits from the compute profile. This enables switching compute profiles
for reasons other than process creation or termination.
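
Purely as an illustration of that point, any future code path could
bracket GPU work with the new refcounted helpers; the wrapper names
below are made up:

static void kfd_hypothetical_begin_work(struct kfd_dev *kfd)
{
	/* first active user switches the device to the compute profile */
	kfd_inc_compute_active(kfd);
}

static void kfd_hypothetical_end_work(struct kfd_dev *kfd)
{
	/* last user switches the device back to the idle profile */
	kfd_dec_compute_active(kfd);
}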

Signed-off-by: Harish Kasiviswanathan 
Signed-off-by: Eric Huang 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c  | 16 
 .../drm/amd/amdkfd/kfd_device_queue_manager.c| 11 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  7 +++
 3 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index c1e4d44d6137..8202a5db3a35 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -462,6 +462,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
kfd->pdev = pdev;
kfd->init_complete = false;
kfd->kfd2kgd = f2g;
+   atomic_set(&kfd->compute_profile, 0);
 
mutex_init(&kfd->doorbell_mutex);
memset(&kfd->doorbell_available_index, 0,
@@ -1036,6 +1037,21 @@ void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
atomic_inc(&kfd->sram_ecc_flag);
 }
 
+void kfd_inc_compute_active(struct kfd_dev *kfd)
+{
+   if (atomic_inc_return(&kfd->compute_profile) == 1)
+   amdgpu_amdkfd_set_compute_idle(kfd->kgd, false);
+}
+
+void kfd_dec_compute_active(struct kfd_dev *kfd)
+{
+   int count = atomic_dec_return(&kfd->compute_profile);
+
+   if (count == 0)
+   amdgpu_amdkfd_set_compute_idle(kfd->kgd, true);
+   WARN_ONCE(count < 0, "Compute profile ref. count error");
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 /* This function will send a package to HIQ to hang the HWS
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 38c66b8ffd31..bac1f36d38a2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -772,8 +772,8 @@ static int register_process(struct device_queue_manager 
*dqm,
 
retval = dqm->asic_ops.update_qpd(dqm, qpd);
 
-   if (dqm->processes_count++ == 0)
-   amdgpu_amdkfd_set_compute_idle(dqm->dev->kgd, false);
+   dqm->processes_count++;
+   kfd_inc_compute_active(dqm->dev);
 
dqm_unlock(dqm);
 
@@ -796,9 +796,8 @@ static int unregister_process(struct device_queue_manager 
*dqm,
if (qpd == cur->qpd) {
list_del(&cur->list);
kfree(cur);
-   if (--dqm->processes_count == 0)
-   amdgpu_amdkfd_set_compute_idle(
-   dqm->dev->kgd, true);
+   dqm->processes_count--;
+   kfd_dec_compute_active(dqm->dev);
goto out;
}
}
@@ -1479,6 +1478,7 @@ static int process_termination_nocpsch(struct 
device_queue_manager *dqm,
list_del(&cur->list);
kfree(cur);
dqm->processes_count--;
+   kfd_dec_compute_active(dqm->dev);
break;
}
}
@@ -1562,6 +1562,7 @@ static int process_termination_cpsch(struct 
device_queue_manager *dqm,
list_del(&cur->list);
kfree(cur);
dqm->processes_count--;
+   kfd_dec_compute_active(dqm->dev);
break;
}
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index d1d60336172a..87328c96b0f1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -279,6 +279,9 @@ struct kfd_dev {
 
/* SRAM ECC flag */
atomic_t sram_ecc_flag;
+
+   /* Compute Profile ref. count */
+   atomic_t compute_profile;
 };
 
 enum kfd_mempool {
@@ -977,6 +980,10 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, 
struct kfd_process *p);
 
 bool kfd_is_locked(void);
 
+/* Compute profile */
+void kfd_inc_compute_active(struct kfd_dev *dev);
+void kfd_dec_compute_active(struct kfd_dev *dev);
+
 /* Debugfs */
 #if defined(CONFIG_DEBUG_FS)
 
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 1/2] drm/amdgpu: Remap hdp coherency registers

2019-04-23 Thread Kuehling, Felix
One more nit-pick inline.

On 2019-04-23 4:59 p.m., Zeng, Oak wrote:
> Remap HDP_MEM_COHERENCY_FLUSH_CNTL and HDP_REG_COHERENCY_FLUSH_CNTL
> to an empty page in mmio space. We will later map this page to process
> space so application can flush hdp. This can't be done properly at
> those registers' original location because it will expose more than
> desired registers to process space.
>
> v2: Use explicit register hole location
> v3: Moved remapped hdp registers into adev struct
> v4: Use more generic name for remapped page
>  Expose register offset in kfd_ioctl.h
> v5: Move hdp register remap function to nbio ip function
> v6: Fixed operator precedence issue and other bugs
>
> Change-Id: Ia8d27c0c9a082711d16bbf55602bf5712a47b6d6
> Signed-off-by: Oak Zeng 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h|  7 +++
>   drivers/gpu/drm/amd/amdgpu/cik.c   |  1 +
>   drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c | 15 ---
>   drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 15 ---
>   drivers/gpu/drm/amd/amdgpu/si.c|  1 +
>   drivers/gpu/drm/amd/amdgpu/soc15.c | 11 +++
>   drivers/gpu/drm/amd/amdgpu/vi.c|  1 +
>   include/uapi/linux/kfd_ioctl.h |  7 +++
>   8 files changed, 52 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index bc96ec4..e16dcee 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -642,6 +642,11 @@ struct nbio_hdp_flush_reg {
>   u32 ref_and_mask_sdma1;
>   };
>   
> +struct amdgpu_mmio_remap {
> + u32 reg_offset;
> + resource_size_t bus_addr;
> +};
> +
>   struct amdgpu_nbio_funcs {
>   const struct nbio_hdp_flush_reg *hdp_flush_reg;
>   u32 (*get_hdp_flush_req_offset)(struct amdgpu_device *adev);
> @@ -669,6 +674,7 @@ struct amdgpu_nbio_funcs {
>   void (*ih_control)(struct amdgpu_device *adev);
>   void (*init_registers)(struct amdgpu_device *adev);
>   void (*detect_hw_virt)(struct amdgpu_device *adev);
> + void (*remap_hdp_registers)(struct amdgpu_device *adev);
>   };
>   
>   struct amdgpu_df_funcs {
> @@ -767,6 +773,7 @@ struct amdgpu_device {
>   void __iomem*rmmio;
>   /* protects concurrent MM_INDEX/DATA based register access */
>   spinlock_t mmio_idx_lock;
> + struct amdgpu_mmio_remap rmmio_remap;
>   /* protects concurrent SMC based register access */
>   spinlock_t smc_idx_lock;
>   amdgpu_rreg_t   smc_rreg;
> diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c 
> b/drivers/gpu/drm/amd/amdgpu/cik.c
> index 07c1f23..3f7ec6a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/cik.c
> +++ b/drivers/gpu/drm/amd/amdgpu/cik.c
> @@ -1827,6 +1827,7 @@ static int cik_common_early_init(void *handle)
>   {
>   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> + adev->rmmio_remap.bus_addr = ULLONG_MAX;

It would be easier to just not do this and define 0 as "no MMIO 
remapping". That way you don't have to change cik.c, si.c and vi.c and 
only need to worry about chips that actually do support it.

Then the condition in patch 2 would need to change as well:

> + offset = amdgpu_amdkfd_get_mmio_remap_phys_addr(dev->kgd);
> + if (!offset)
> + return -ENOMEM;
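
Under that convention, the KFD-facing helper can simply return the
remapped page's bus address and let 0 mean "not supported"; a rough
sketch, not the actual patch:

resource_size_t amdgpu_amdkfd_get_mmio_remap_phys_addr(struct kgd_dev *kgd)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;

	/* 0 doubles as "no MMIO remapping on this ASIC" */
	return adev->rmmio_remap.bus_addr;
}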

With that fixed, the series is Reviewed-by: Felix Kuehling 



>   adev->smc_rreg = _smc_rreg;
>   adev->smc_wreg = _smc_wreg;
>   adev->pcie_rreg = _pcie_rreg;
> diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c 
> b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c
> index 1cdb98a..73419fa 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c
> @@ -29,9 +29,18 @@
>   #include "nbio/nbio_7_0_sh_mask.h"
>   #include "nbio/nbio_7_0_smn.h"
>   #include "vega10_enum.h"
> +#include 
>   
>   #define smnNBIF_MGCG_CTRL_LCLK  0x1013a05c
>   
> +static void nbio_v7_0_remap_hdp_registers(struct amdgpu_device *adev)
> +{
> + WREG32_SOC15(NBIO, 0, mmREMAP_HDP_MEM_FLUSH_CNTL,
> + adev->rmmio_remap.reg_offset + 
> KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL);
> + WREG32_SOC15(NBIO, 0, mmREMAP_HDP_REG_FLUSH_CNTL,
> + adev->rmmio_remap.reg_offset + 
> KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL);
> +}
> +
>   static u32 nbio_v7_0_get_rev_id(struct amdgpu_device *adev)
>   {
>   u32 tmp = RREG32_SOC15(NBIO, 0, mmRCC_DEV0_EPF0_STRAP0);
> @@ -55,10 +64,9 @@ static void nbio_v7_0_hdp_flush(struct amdgpu_device *adev,
>   struct amdgpu_ring *ring)
>   {
>   if (!ring || !ring->funcs->emit_wreg)
> - WREG32_SOC15_NO_KIQ(NBIO, 0, mmHDP_MEM_COHERENCY_FLUSH_CNTL, 0);
> + WREG32_NO_KIQ((adev->rmmio_remap.reg_offset + 
> KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2, 0);
>   else
> - amdgpu_ring_emit_wreg(ring, SOC15_REG_OFFSET(
> - NBIO, 0, 

Re: [PATCH 1/2] drm/amdgpu: Implement get num of hops between two xgmi device

2019-04-23 Thread Kuehling, Felix
It seems to me that amdgpu_hive_info is a driver-internal structure, but 
the psp_xgmi_topology structures are an interface with the PSP that may 
change in future ASIC generations. So on second thought, adding the 
psp_xgmi_topology structures to the psp_xgmi_context (or 
amdgpu_hive_info) like that is probably a bad idea. The structures 
should probably be defined only in psp_v11_0.c and opaque for the rest 
of the driver.

Anyway, this is getting into a bigger cleanup that is not directly 
related to this change. We'll probably have to deal with this sooner or 
later, when a new PSP version changes the XGMI interfaces.

Either way, the series is Reviewed-by: Felix Kuehling 


On 2019-04-23 4:21 p.m., Liu, Shaoyun wrote:
> KFD needs to provide the info for the upper level to determine the data path
>
> Change-Id: Idc809e8f3381b9222dd7be96539522d440f3ee7d
> Signed-off-by: shaoyunl 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 15 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h| 26 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 23 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  3 ++-
>   5 files changed, 50 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index acf8ae0..8f8523a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -27,6 +27,7 @@
>   #include "amdgpu_gfx.h"
>   #include 
>   #include 
> +#include "amdgpu_xgmi.h"
>   
>   static const unsigned int compute_vmid_bitmap = 0xFF00;
>   
> @@ -481,6 +482,20 @@ uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd)
>   
>   return adev->gmc.xgmi.hive_id;
>   }
> +uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct 
> kgd_dev *src)
> +{
> + struct amdgpu_device *peer_adev = (struct amdgpu_device *)src;
> + struct amdgpu_device *adev = (struct amdgpu_device *)dst;
> + int ret = amdgpu_xgmi_get_hops_count(adev, peer_adev);
> +
> + if (ret < 0) {
> + DRM_ERROR("amdgpu: failed to get  xgmi hops count between node 
> %d and %d. ret = %d\n",
> + adev->gmc.xgmi.physical_node_id,
> + peer_adev->gmc.xgmi.physical_node_id, ret);
> + ret = 0;
> + }
> + return  (uint8_t)ret;
> +}
>   
>   int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type 
> engine,
>   uint32_t vmid, uint64_t gpu_addr,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index e6a5037..b0cb94d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -154,6 +154,7 @@ int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, 
> int dma_buf_fd,
> uint32_t *flags);
>   uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd);
>   uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd);
> +uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct 
> kgd_dev *src);
>   
>   #define read_user_wptr(mmptr, wptr, dst)\
>   ({  \
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> index cde113f..acbc18b5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> @@ -95,12 +95,26 @@ struct psp_funcs
>   int (*ras_cure_posion)(struct psp_context *psp, uint64_t *mode_ptr);
>   };
>   
> +#define AMDGPU_XGMI_MAX_CONNECTED_NODES  64
> +struct psp_xgmi_node_info {
> + uint64_t node_id;
> + uint8_t num_hops;
> + uint8_t is_sharing_enabled;
> + enum ta_xgmi_assigned_sdma_engine   sdma_engine;
> +};
> +
> +struct psp_xgmi_topology_info {
> + uint32_t num_nodes;
> + struct psp_xgmi_node_info   nodes[AMDGPU_XGMI_MAX_CONNECTED_NODES];
> +};
> +
>   struct psp_xgmi_context {
>   uint8_t initialized;
>   uint32_t session_id;
>   struct amdgpu_bo *xgmi_shared_bo;
>   uint64_t xgmi_shared_mc_addr;
>   void *xgmi_shared_buf;
> + struct psp_xgmi_topology_info   top_info;
>   };
>   
>   struct psp_ras_context {
> @@ -181,18 +195,6 @@ struct amdgpu_psp_funcs {
>   enum AMDGPU_UCODE_ID);
>   };
>   
> -#define AMDGPU_XGMI_MAX_CONNECTED_NODES  64
> -struct psp_xgmi_node_info {
> - uint64_t node_id;
> - uint8_t num_hops;
> - uint8_t  

Re: [PATCH 1/2] drm/amdgpu: Remap hdp coherency registers

2019-04-23 Thread Kuehling, Felix
See inline.

On 2019-04-23 3:23 p.m., Zeng, Oak wrote:
> Remap HDP_MEM_COHERENCY_FLUSH_CNTL and HDP_REG_COHERENCY_FLUSH_CNTL
> to an empty page in mmio space. We will later map this page to process
> space so application can flush hdp. This can't be done properly at
> those registers' original location because it will expose more than
> desired registers to process space.
>
> v2: Use explicit register hole location
> v3: Moved remapped hdp registers into adev struct
> v4: Use more generic name for remapped page
>  Expose register offset in kfd_ioctl.h
> v5: Move hdp register remap function to nbio ip function
>
> Change-Id: Ia8d27c0c9a082711d16bbf55602bf5712a47b6d6
> Signed-off-by: Oak Zeng 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h|  7 +++
>   drivers/gpu/drm/amd/amdgpu/cik.c   |  1 +
>   drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c | 15 ---
>   drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 15 ---
>   drivers/gpu/drm/amd/amdgpu/si.c|  1 +
>   drivers/gpu/drm/amd/amdgpu/soc15.c | 11 +++
>   drivers/gpu/drm/amd/amdgpu/vi.c|  1 +
>   include/uapi/linux/kfd_ioctl.h |  7 +++
>   8 files changed, 52 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index bc96ec4..e16dcee 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -642,6 +642,11 @@ struct nbio_hdp_flush_reg {
>   u32 ref_and_mask_sdma1;
>   };
>   
> +struct amdgpu_mmio_remap {
> + u32 reg_offset;
> + resource_size_t bus_addr;
> +};
> +
>   struct amdgpu_nbio_funcs {
>   const struct nbio_hdp_flush_reg *hdp_flush_reg;
>   u32 (*get_hdp_flush_req_offset)(struct amdgpu_device *adev);
> @@ -669,6 +674,7 @@ struct amdgpu_nbio_funcs {
>   void (*ih_control)(struct amdgpu_device *adev);
>   void (*init_registers)(struct amdgpu_device *adev);
>   void (*detect_hw_virt)(struct amdgpu_device *adev);
> + void (*remap_hdp_registers)(struct amdgpu_device *adev);
>   };
>   
>   struct amdgpu_df_funcs {
> @@ -767,6 +773,7 @@ struct amdgpu_device {
>   void __iomem*rmmio;
>   /* protects concurrent MM_INDEX/DATA based register access */
>   spinlock_t mmio_idx_lock;
> + struct amdgpu_mmio_remap rmmio_remap;
>   /* protects concurrent SMC based register access */
>   spinlock_t smc_idx_lock;
>   amdgpu_rreg_t   smc_rreg;
> diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c 
> b/drivers/gpu/drm/amd/amdgpu/cik.c
> index 07c1f23..3f7ec6a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/cik.c
> +++ b/drivers/gpu/drm/amd/amdgpu/cik.c
> @@ -1827,6 +1827,7 @@ static int cik_common_early_init(void *handle)
>   {
>   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> + adev->rmmio_remap.bus_addr = ULLONG_MAX;
>   adev->smc_rreg = _smc_rreg;
>   adev->smc_wreg = _smc_wreg;
>   adev->pcie_rreg = _pcie_rreg;
> diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c 
> b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c
> index 1cdb98a..83f1f75 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c
> @@ -29,9 +29,18 @@
>   #include "nbio/nbio_7_0_sh_mask.h"
>   #include "nbio/nbio_7_0_smn.h"
>   #include "vega10_enum.h"
> +#include 
>   
>   #define smnNBIF_MGCG_CTRL_LCLK  0x1013a05c
>   
> +static void nbio_v7_0_remap_hdp_registers(struct amdgpu_device *adev)
> +{
> + WREG32_SOC15(NBIO, 0, mmREMAP_HDP_MEM_FLUSH_CNTL,
> + adev->rmmio_remap.reg_offset << 2 + HDP_MEM_FLUSH_CNTL);

I don't think this does what you intend. I think + binds stronger than 
<<, so you should write this as

     (adev->rmmio_remap.reg_offset << 2) + HDP_MEM_FLUSH_CNTL


> + WREG32_SOC15(NBIO, 0, mmREMAP_HDP_REG_FLUSH_CNTL,
> + adev->rmmio_remap.reg_offset << 2 + HDP_REG_FLUSH_CNTL);

Same as above.


> +}
> +
>   static u32 nbio_v7_0_get_rev_id(struct amdgpu_device *adev)
>   {
>   u32 tmp = RREG32_SOC15(NBIO, 0, mmRCC_DEV0_EPF0_STRAP0);
> @@ -55,10 +64,9 @@ static void nbio_v7_0_hdp_flush(struct amdgpu_device *adev,
>   struct amdgpu_ring *ring)
>   {
>   if (!ring || !ring->funcs->emit_wreg)
> - WREG32_SOC15_NO_KIQ(NBIO, 0, mmHDP_MEM_COHERENCY_FLUSH_CNTL, 0);
> + WREG32_NO_KIQ(adev->rmmio_remap.reg_offset + 
> HDP_MEM_FLUSH_CNTL, 0);

Are you sure this is correct? As I understand it from the above, 
adev->rmmio_remap.reg_offset is in dwords, HDP_MEM_FLUSH_CNTL is in 
bytes. Something will need to be shifted.


>   else
> - amdgpu_ring_emit_wreg(ring, SOC15_REG_OFFSET(
> - NBIO, 0, mmHDP_MEM_COHERENCY_FLUSH_CNTL), 0);
> + amdgpu_ring_emit_wreg(ring, adev->rmmio_remap.reg_offset + 
> HDP_MEM_FLUSH_CNTL, 0);

Same as above.


>   }
>   
>   static u32 nbio_v7_0_get_memsize(struct amdgpu_device *adev)
> @@ 

Re: [PATCH 2/2] drm/amdkfd: Adjust weight to represent num_hops info when report xgmi iolink

2019-04-23 Thread Kuehling, Felix
On 2019-04-17 2:59 p.m., Liu, Shaoyun wrote:
> The upper-level runtime needs the XGMI hops info to determine the data path
>
> Change-Id: I969b419eab125157e223e9b03980ca229c1e6af4
> Signed-off-by: shaoyunl 
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 8 ++--
>   drivers/gpu/drm/amd/amdkfd/kfd_crat.h | 3 ++-
>   2 files changed, 8 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> index 2e7c449..d48c6c5 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> @@ -341,6 +341,7 @@ static int kfd_parse_subtype_cache(struct 
> crat_subtype_cache *cache,
>   return 0;
>   }
>   
> +
>   /* kfd_parse_subtype_iolink - parse iolink subtypes and attach it to correct
>* topology device present in the device_list
>*/
> @@ -372,7 +373,7 @@ static int kfd_parse_subtype_iolink(struct 
> crat_subtype_iolink *iolink,
>   if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS)
>   props->weight = 20;
>   else if (props->iolink_type == CRAT_IOLINK_TYPE_XGMI)
> - props->weight = 15;
> + props->weight = 15 * iolink->num_hops_xgmi;
>   else
>   props->weight = node_distance(id_from, id_to);
>   
> @@ -1092,6 +1093,7 @@ static int kfd_fill_gpu_direct_io_link_to_cpu(int 
> *avail_size,
>   
>   static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
>   struct kfd_dev *kdev,
> + struct kfd_dev *peer_kdev,
>   struct crat_subtype_iolink *sub_type_hdr,
>   uint32_t proximity_domain_from,
>   uint32_t proximity_domain_to)
> @@ -1110,6 +1112,8 @@ static int kfd_fill_gpu_xgmi_link_to_gpu(int 
> *avail_size,
>   sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
>   sub_type_hdr->proximity_domain_from = proximity_domain_from;
>   sub_type_hdr->proximity_domain_to = proximity_domain_to;
> + sub_type_hdr->num_hops_xgmi =
> + amdgpu_amdkfd_get_xgmi_hops_count(kdev->kgd, peer_kdev->kgd);
>   return 0;
>   }
>   
> @@ -1287,7 +1291,7 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
>   (char *)sub_type_hdr +
>   sizeof(struct crat_subtype_iolink));
>   ret = kfd_fill_gpu_xgmi_link_to_gpu(
> - &avail_size, kdev,
> + &avail_size, kdev, peer_dev->gpu,
>   (struct crat_subtype_iolink *)sub_type_hdr,
>   proximity_domain, nid);
>   if (ret < 0)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
> index 7c3f192..579835c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
> @@ -257,7 +257,7 @@ struct crat_subtype_ccompute {
>   #define CRAT_IOLINK_TYPE_MAX 255
>   #define CRAT_IOLINK_TYPE_MAX255
>   
> -#define CRAT_IOLINK_RESERVED_LENGTH  24
> +#define CRAT_IOLINK_RESERVED_LENGTH  20
>   
>   struct crat_subtype_iolink {
>   uint8_t type;
> @@ -274,6 +274,7 @@ struct crat_subtype_iolink {
>   uint32_t minimum_bandwidth_mbs;
>   uint32_t maximum_bandwidth_mbs;
>   uint32_t recommended_transfer_size;
> + uint32_t num_hops_xgmi;
>   uint8_t reserved2[CRAT_IOLINK_RESERVED_LENGTH];

It would be safer to add num_hops_xgmi at the end of the reserved space. 
uint8_t is probably enough for the hop count. I'd also not change the 
CRAT_IOLINK_RESERVED_LENGTH. Instead this would work and make it clearer 
that we're appropriating some reserved space:

     ...
     uint8_t    reserved2[CRAT_IOLINK_RESERVED_LENGTH - 1];
     uint8_t    num_hops_xgmi;
};


>   };
>   
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 1/2] drm/amdgpu: Implement get num of hops between two xgmi device

2019-04-23 Thread Kuehling, Felix
See inline.

On 2019-04-17 2:58 p.m., Liu, Shaoyun wrote:
> KFD needs to provide the info for the upper level to determine the data path
>
> Change-Id: Idc809e8f3381b9222dd7be96539522d440f3ee7d
> Signed-off-by: shaoyunl 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 15 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 21 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 ++
>   4 files changed, 39 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index acf8ae0..3fe9a38 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -27,6 +27,7 @@
>   #include "amdgpu_gfx.h"
>   #include 
>   #include 
> +#include "amdgpu_xgmi.h"
>   
>   static const unsigned int compute_vmid_bitmap = 0xFF00;
>   
> @@ -481,6 +482,20 @@ uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd)
>   
>   return adev->gmc.xgmi.hive_id;
>   }
> +uint32_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct 
> kgd_dev *src)
> +{
> + struct amdgpu_device *peer_adev = (struct amdgpu_device *)src;
> + struct amdgpu_device *adev = (struct amdgpu_device *)dst;
> + int ret = amdgpu_xgmi_get_hops_count(adev, peer_adev);
> +
> + if (ret < 0) {
> + DRM_ERROR("amdgpu: failed to get  xgmi hops count between node 
> %d and %d. ret = %d\n",
> + adev->gmc.xgmi.physical_node_id,
> + peer_adev->gmc.xgmi.physical_node_id, ret);
> + ret = 0;
> + }
> + return  (uint32_t)ret;
> +}
>   
>   int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type 
> engine,
>   uint32_t vmid, uint64_t gpu_addr,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index e6a5037..8cc8a5a1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -154,6 +154,7 @@ int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, 
> int dma_buf_fd,
> uint32_t *flags);
>   uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd);
>   uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd);
> +uint32_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct 
> kgd_dev *src);
>   
>   #define read_user_wptr(mmptr, wptr, dst)\
>   ({  \
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index a48c84c..eaebd47 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -248,6 +248,27 @@ int amdgpu_xgmi_update_topology(struct amdgpu_hive_info 
> *hive, struct amdgpu_dev
>   return ret;
>   }
>   
> +
> +int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
> + struct amdgpu_device *peer_adev)
> +{
> + struct psp_xgmi_topology_info *top;
> + int ret = 0;
> +
> + top = kzalloc(sizeof(struct psp_xgmi_topology_info), GFP_KERNEL);

Where does this get freed? Looks like you have a memory leak.

Also, instead of allocating a new copy and querying the info from the 
PSP many times, could the psp_xgmi_topology_info structure be made 
persistent somewhere in adev? I think it won't change any more after 
driver initialization.
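
For example, if the topology queried once at init time were cached (say
as a top_info member somewhere under adev->psp), the hop lookup could
become a simple table walk; a rough sketch, not the actual
implementation:

int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
			       struct amdgpu_device *peer_adev)
{
	/* assumes the topology was cached at init, e.g. in xgmi_context */
	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
	int i;

	for (i = 0; i < top->num_nodes; i++)
		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
			return top->nodes[i].num_hops;

	return -EINVAL;	/* peer not found in the cached topology */
}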

Regards,
   Felix


> + if (!top)
> + return -ENOMEM;
> + top->num_nodes = 1;
> + top->nodes[0].node_id = peer_adev->gmc.xgmi.node_id;
> + ret = psp_xgmi_get_topology_info(&adev->psp, 1, top);
> + if (ret) {
> + dev_err(adev->dev,
> + "XGMI: Failed to get topology info\n");
> + return ret;
> + }
> + return top->nodes[0].num_hops;
> +}
> +
>   int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>   {
>   struct psp_xgmi_topology_info *hive_topology;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> index 3e9c91e..8a945bf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> @@ -41,6 +41,8 @@ int amdgpu_xgmi_update_topology(struct amdgpu_hive_info 
> *hive, struct amdgpu_dev
>   int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
>   void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
>   int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate);
> +int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
> + struct amdgpu_device *peer_adev);
>   
>   static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
>   struct amdgpu_device *bo_adev)
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH] drm: increase drm mmap_range size to 1TB

2019-04-17 Thread Kuehling, Felix
Adding dri-devel

On 2019-04-17 6:15 p.m., Yang, Philip wrote:
> After patch "drm: Use the same mmap-range offset and size for GEM and
> TTM", applications failed to create BOs in system memory because the drm
> mmap_range size decreased to 64GB from the original 1TB. This is not big
> enough for applications. Increase the drm mmap_range size to 1TB.
>
> Change-Id: Id482af261f56f3289e8873f5985078da599a0998
> Signed-off-by: Philip Yang 
> ---
>   include/drm/drm_vma_manager.h | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/include/drm/drm_vma_manager.h b/include/drm/drm_vma_manager.h
> index f4f8ff1cdeec..76ac5e97a559 100644
> --- a/include/drm/drm_vma_manager.h
> +++ b/include/drm/drm_vma_manager.h
> @@ -35,7 +35,7 @@
>*/
>   #if BITS_PER_LONG == 64
>   #define DRM_FILE_PAGE_OFFSET_START ((0xUL >> PAGE_SHIFT) + 1)
> -#define DRM_FILE_PAGE_OFFSET_SIZE ((0xUL >> PAGE_SHIFT) * 16)
> +#define DRM_FILE_PAGE_OFFSET_SIZE ((0xUL >> PAGE_SHIFT) * 256)
>   #else
>   #define DRM_FILE_PAGE_OFFSET_START ((0xFFFUL >> PAGE_SHIFT) + 1)
>   #define DRM_FILE_PAGE_OFFSET_SIZE ((0xFFFUL >> PAGE_SHIFT) * 16)
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH] drm/amdkfd: Disable Packet Manager in non HWS mode except Hawaii

2019-04-17 Thread Kuehling, Felix
If you want to optimize driver init time, you should check what actually 
takes the most time. Randomly micro-optimizing things that may not even 
matter only increases complexity for no benefit.

Typically the things that make initialization slow are points where we 
synchronize with the GPU or random sleeps or delays to allow the 
hardware to complete something. A few lines of code that only run on the 
CPU will not make an appreciable difference.
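
For example, a quick way to confirm whether a given init step matters at
all is to time it directly; purely illustrative instrumentation, not
something to commit:

	ktime_t start = ktime_get();

	ret = pm_init(&dqm->packets, dqm);
	pr_info("pm_init took %lld us\n",
		(long long)ktime_to_us(ktime_sub(ktime_get(), start)));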

Regards,
   Felix

On 2019-04-17 5:17 p.m., Zhao, Yong wrote:
> I can fix that cosmetic line. I did not measure the difference, but
> driver initialization usually takes a longer time, that's why I am
> trying to reduce it. Also, it means one less thing to worry about during
> non HWS mode bringup, because we don't need to deal with HIQ any more.
> With that, what do you think now?
>
> Regards,
>
> Yong
>
> On 2019-04-17 5:06 p.m., Kuehling, Felix wrote:
>> On 2019-04-17 4:54 p.m., Zhao, Yong wrote:
>>> The packet manager is only needed for HWS mode, as well as Hawaii in non
>>> HWS mode. So only initialize it under those scenarios. This is useful
>>> especially for emulation environment when things are slow.
>> I never thought of packet manager initialization as something expensive.
>> Why does this matter? In emulation, the GPU is slow, but the CPU should
>> be OK. Packet manager initialization doesn't do any waiting for the GPU,
>> so I don't see how this would have any measurable impact.
>>
>> Anyway, see one cosmetic comment inline.
>>
>>
>>> Change-Id: Iedfa07c94241e3252463e1e5ea537543c2ccef03
>>> Signed-off-by: Yong Zhao 
>>> ---
>>> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c   | 13 +++--
>>> 1 file changed, 11 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> index 1d6b15788ebf..ec83914d9867 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> @@ -911,13 +911,22 @@ static void uninitialize(struct device_queue_manager 
>>> *dqm)
>>> 
>>> static int start_nocpsch(struct device_queue_manager *dqm)
>>> {
>>> +   int ret = 0;
>> checkpatch.pl would complain that there should be an empty line after
>> variable declarations.
>>
>>
>>> init_interrupts(dqm);
>>> -   return pm_init(&dqm->packets, dqm);
>>> +   /* Cache flushing on Hawaii in non HWS mode is done through packet
>>> +* manager (PM), so we need to initialize PM for Hawaii.
>>> +*/
>>> +   if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>> +   ret = pm_init(&dqm->packets, dqm);
>>> +
>>> +   return ret;
>>> }
>>> 
>>> static int stop_nocpsch(struct device_queue_manager *dqm)
>>> {
>>> -   pm_uninit(&dqm->packets);
>>> +   if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>> +   pm_uninit(>packets);
>>> +
>>> return 0;
>>> }
>>> 
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 1/2] drm/amdgpu: Remap hdp coherency registers

2019-04-17 Thread Kuehling, Felix
On 2019-04-17 10:20 a.m., Zeng, Oak wrote:
> Remap HDP_MEM_COHERENCY_FLUSH_CNTL and HDP_REG_COHERENCY_FLUSH_CNTL
> to an empty page in mmio space. We will later map this page to process
> space so application can flush hdp. This can't be done properly at
> those registers' original location because it will expose more than
> desired registers to process space.
>
> v2: Use explicit register hole location
> v3: Moved remapped hdp registers into adev struct
>
> Change-Id: Ia8d27c0c9a082711d16bbf55602bf5712a47b6d6
> Signed-off-by: Oak Zeng 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h|  8 
>   drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c |  5 ++---
>   drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c |  5 ++---
>   drivers/gpu/drm/amd/amdgpu/soc15.c | 23 +++
>   4 files changed, 35 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index bc96ec4..40c3ba6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -642,6 +642,13 @@ struct nbio_hdp_flush_reg {
>   u32 ref_and_mask_sdma1;
>   };
>   
> +struct remapped_hdp_reg {
> + u32 remapped_hdp_mem_flush_cntl_reg_offset;
> + u32 remapped_hdp_reg_flush_cntl_reg_offset;
I think the offsets or indexes of the remapped register inside the 
remapped MMIO page should not be variable. We need an agreed convention 
between user mode and kernel mode, which register is mapped where. This 
will be part of the driver ABI that must be maintained for backwards 
compatibility. This should probably be defined in 
include/uapi/linux/kfd_ioctl.h as an enum or #define.
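
For illustration, such a convention could look roughly like this (names and
offset values are hypothetical, sketch only):

    /* Fixed offsets of remapped registers inside the remapped MMIO page,
     * shared between kernel mode and user mode as part of the ABI.
     */
    enum kfd_mmio_remap_reg_offset {
        KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL = 0x0,
        KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL = 0x4,
    };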


> + resource_size_t remapped_hdp_mem_flush_cntl_physical_addr;
> + resource_size_t remapped_hdp_reg_flush_cntl_physical_addr;
> +};

The variable names are a bit verbose.

Alex suggested in patch 2 to use a more generic name for the buffer 
type. Maybe a more generic name makes sense here too for any future mmio 
remappings. Maybe struct amdgpu_mmio_remap.

I don't think we need both the offset and the physical address. I don't 
see that we need the physical address of each remapped register. Your 
patch 2 only needs the physical address of the first remapped register, 
for the start address of the remapped MMIO page. So instead of adding 
the physical address of each register just add one member phys_addr (or 
bus_addr) that is the bus address of the remapped MMIO page.
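
A minimal sketch of what that could look like (illustrative only, not the
final patch):

    /* One generic remap structure: fixed per-register offsets plus a single
     * bus address for the start of the remapped MMIO page.
     */
    struct amdgpu_mmio_remap {
        u32 hdp_mem_flush_cntl_offset;
        u32 hdp_reg_flush_cntl_offset;
        resource_size_t bus_addr;	/* start of the remapped MMIO page */
    };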


> +
>   struct amdgpu_nbio_funcs {
>   const struct nbio_hdp_flush_reg *hdp_flush_reg;
>   u32 (*get_hdp_flush_req_offset)(struct amdgpu_device *adev);
> @@ -939,6 +946,7 @@ struct amdgpu_device {
>   struct work_struct  xgmi_reset_work;
>   
>   boolin_baco_reset;
> + struct remapped_hdp_reg remapped_hdp_reg;

Move this to just after the other mmio members in struct amdgpu_device 
and give it a more generic name like adev->rmmio_remap.


>   };
>   
>   static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device 
> *bdev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c 
> b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c
> index 1cdb98a..d41e333 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c
> @@ -55,10 +55,9 @@ static void nbio_v7_0_hdp_flush(struct amdgpu_device *adev,
>   struct amdgpu_ring *ring)
>   {
>   if (!ring || !ring->funcs->emit_wreg)
> - WREG32_SOC15_NO_KIQ(NBIO, 0, mmHDP_MEM_COHERENCY_FLUSH_CNTL, 0);
> + 
> WREG32_NO_KIQ(adev->remapped_hdp_reg.remapped_hdp_mem_flush_cntl_reg_offset, 
> 0);
>   else
> - amdgpu_ring_emit_wreg(ring, SOC15_REG_OFFSET(
> - NBIO, 0, mmHDP_MEM_COHERENCY_FLUSH_CNTL), 0);
> + amdgpu_ring_emit_wreg(ring, 
> adev->remapped_hdp_reg.remapped_hdp_mem_flush_cntl_reg_offset, 0);
>   }
>   
>   static u32 nbio_v7_0_get_memsize(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c 
> b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> index c69d515..8f0a30e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
> @@ -53,10 +53,9 @@ static void nbio_v7_4_hdp_flush(struct amdgpu_device *adev,
>   struct amdgpu_ring *ring)
>   {
>   if (!ring || !ring->funcs->emit_wreg)
> - WREG32_SOC15_NO_KIQ(NBIO, 0, mmHDP_MEM_COHERENCY_FLUSH_CNTL, 0);
> + 
> WREG32_NO_KIQ(adev->remapped_hdp_reg.remapped_hdp_mem_flush_cntl_reg_offset, 
> 0);
>   else
> - amdgpu_ring_emit_wreg(ring, SOC15_REG_OFFSET(
> - NBIO, 0, mmHDP_MEM_COHERENCY_FLUSH_CNTL), 0);
> + amdgpu_ring_emit_wreg(ring, 
> adev->remapped_hdp_reg.remapped_hdp_mem_flush_cntl_reg_offset, 0);
>   }
>   
>   static u32 nbio_v7_4_get_memsize(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c 
> 

Re: [PATCH 2/2] drm/amdkfd: Expose HDP registers to user space

2019-04-17 Thread Kuehling, Felix
On 2019-04-17 12:20 p.m., Deucher, Alexander wrote:
>> -Original Message-
>> From: amd-gfx  On Behalf Of
>> Zeng, Oak
>> Sent: Wednesday, April 17, 2019 10:21 AM
>> To: amd-gfx@lists.freedesktop.org
>> Cc: Deucher, Alexander ; Kuehling, Felix
>> ; Zeng, Oak ; Keely, Sean
>> ; Koenig, Christian 
>> Subject: [PATCH 2/2] drm/amdkfd: Expose HDP registers to user space
>>
>> Introduce a new memory type (KFD_IOC_ALLOC_MEM_FLAGS_HDP_FLUSH)
> I'd suggest calling this something like ALLOC_MEM_FLAGS_MMIO_REMAP or 
> something like that in case we end up needing to remap other registers in the 
> future.

So that's assuming other registers would get remapped into the same 
page. Makes sense.
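
For illustration, the more generic flag might then look like this in
kfd_ioctl.h (the name follows Alex's suggestion; the bit value here is
hypothetical, sketch only):

    /* Sketch: generic flag for any MMIO page remapped for user space */
    #define KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP	(1 << 26)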

Regards,
   Felix


>
> Alex
>
>> and expose mmio page of HDP registers to user space through this new
>> memory type.
>>
>> v2: moved remapped hdp regs to adev struct
>>
>> Change-Id: If5ac13c46ea7fbd2194ddc8b2ece26ef4f76c330
>> Signed-off-by: Oak Zeng 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c   | 7 +++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h   | 1 +
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 7 ---
>>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 4 
>>   drivers/gpu/drm/amd/include/kgd_kfd_interface.h  | 1 +
>>   include/uapi/linux/kfd_ioctl.h   | 1 +
>>   6 files changed, 18 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> index acf8ae0..d953338b8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> @@ -482,6 +482,13 @@ uint64_t amdgpu_amdkfd_get_hive_id(struct
>> kgd_dev *kgd)
>>  return adev->gmc.xgmi.hive_id;
>>   }
>>
>> +uint64_t amdgpu_amdkfd_get_hdp_register_physical_addr(struct kgd_dev
>> +*kgd) {
>> +struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
>> +
>> +return
>> +adev-
>>> remapped_hdp_reg.remapped_hdp_mem_flush_cntl_physical_addr;
>> +}
>> +
>>   int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum
>> kgd_engine_type engine,
>>  uint32_t vmid, uint64_t gpu_addr,
>>  uint32_t *ib_cmd, uint32_t ib_len)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> index e6a5037..e778679 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> @@ -154,6 +154,7 @@ int amdgpu_amdkfd_get_dmabuf_info(struct
>> kgd_dev *kgd, int dma_buf_fd,
>>uint32_t *flags);
>>   uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd);  uint64_t
>> amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd);
>> +uint64_t amdgpu_amdkfd_get_hdp_register_physical_addr(struct kgd_dev
>> +*kgd);
>>
>>   #define read_user_wptr(mmptr, wptr, dst)   \
>>  ({  \
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index 047bba8..4394f61 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -1087,7 +1087,8 @@ int
>> amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
>>  if (!offset || !*offset)
>>  return -EINVAL;
>>  user_addr = *offset;
>> -} else if (flags & ALLOC_MEM_FLAGS_DOORBELL) {
>> +} else if (flags & (ALLOC_MEM_FLAGS_DOORBELL |
>> +ALLOC_MEM_FLAGS_HDP_FLUSH)) {
>>  domain = AMDGPU_GEM_DOMAIN_GTT;
>>  alloc_domain = AMDGPU_GEM_DOMAIN_CPU;
>>  bo_type = ttm_bo_type_sg;
>> @@ -1263,8 +1264,8 @@ int
>> amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
>>  /* Free the sync object */
>>  amdgpu_sync_free(>sync);
>>
>> -/* If the SG is not NULL, it's one we created for a doorbell
>> - * BO. We need to free it.
>> +/* If the SG is not NULL, it's one we created for a doorbell or hdp
>> + * flush BO. We need to free it.
>>   */
>>  if (mem->bo->tbo.sg) {
>>  sg_free_table(mem->bo->tbo.sg);
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> index 083bd81..973d2fe 100644
>> --- a/

Re: [PATCH] drm/amdkfd: Disable Packet Manager in non HWS mode except Hawaii

2019-04-17 Thread Kuehling, Felix
On 2019-04-17 4:54 p.m., Zhao, Yong wrote:
> The packet manager is only needed for HWS mode, as well as Hawaii in non
> HWS mode. So only initialize it under those scenarios. This is useful
> especially for emulation environment when things are slow.

I never thought of packet manager initialization as something expensive. 
Why does this matter? In emulation, the GPU is slow, but the CPU should 
be OK. Packet manager initialization doesn't do any waiting for the GPU, 
so I don't see how this would have any measurable impact.

Anyway, see one cosmetic comment inline.


>
> Change-Id: Iedfa07c94241e3252463e1e5ea537543c2ccef03
> Signed-off-by: Yong Zhao 
> ---
>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c   | 13 +++--
>   1 file changed, 11 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 1d6b15788ebf..ec83914d9867 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -911,13 +911,22 @@ static void uninitialize(struct device_queue_manager 
> *dqm)
>   
>   static int start_nocpsch(struct device_queue_manager *dqm)
>   {
> + int ret = 0;

checkpatch.pl would complain that there should be an empty line after 
variable declarations.
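
For illustration, the layout checkpatch.pl expects would be (same code as in
the patch, just with the blank line added after the declaration):

    static int start_nocpsch(struct device_queue_manager *dqm)
    {
    	int ret = 0;

    	/* blank line between declarations and code keeps checkpatch.pl happy */
    	init_interrupts(dqm);
    	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
    		ret = pm_init(&dqm->packets, dqm);

    	return ret;
    }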


>   init_interrupts(dqm);
> - return pm_init(&dqm->packets, dqm);
> + /* Cache flushing on Hawaii in non HWS mode is done through packet
> +  * manager (PM), so we need to initialize PM for Hawaii.
> +  */
> + if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
> + ret = pm_init(&dqm->packets, dqm);
> +
> + return ret;
>   }
>   
>   static int stop_nocpsch(struct device_queue_manager *dqm)
>   {
> - pm_uninit(&dqm->packets);
> + if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
> + pm_uninit(&dqm->packets);
> +
>   return 0;
>   }
>   

Re: [RFC] drm/amdkfd: Use logical cpu id for building vcrat

2019-04-16 Thread Kuehling, Felix
On 2019-04-16 2:44 a.m., Christian König wrote:
>>
>> It's not a high priority as I'm not aware of any applications that 
>> actually make use of the cache information.
>>
> Which raises the question why we have done this in the first place? 
> When nobody is using it could we just remove the interface?

The interface for cache information in the topology has existed since 
KFD was first introduced for APUs. On APUs the information is contained 
in the CRAT table. I see no reason not to report available information 
to user mode. If you wanted to remove it, you'd need to prove that 
nobody is using it. My statement was much weaker than that.

Furthermore, the cache information on CPUs is currently generated in 
user mode anyway. Changing the way we count CPU cores would break that 
existing user mode code, so we can't do that. That said, there is no 
kernel code to remove here. All I was saying was, that it's not a high 
priority to add the kernel code to populate CPU cache information in 
kernel mode.

Regards,
   Felix


>
> Regards,
> Christian.
>
> Am 16.04.19 um 05:24 schrieb Kuehling, Felix:
>>
>> On x86 we use the apicid to associate caches with CPU cores. See the 
>> Thunk code in libhsakmt/src/topology.c (static void 
>> find_cpu_cache_siblings()). If we used a different way to identify 
>> CPU cores, I think that would break. This code in the Thunk is 
>> x86-specific as it uses the CPUID instruction. We don't have 
>> equivalent code for ARM. So for ARM it doesn't really matter much, 
>> how you count your CPU cores in the CRAT table.
>>
>> I think eventually we want to get rid of that fragile CPUID code in 
>> the Thunk and get the cache information in kernel mode and report it 
>> to user mode through the KFD topology sysfs filesystem. Then we could 
>> also move away from using apicids as CPU IDs on x86.
>>
>> It's not a high priority as I'm not aware of any applications that 
>> actually make use of the cache information.
>>
>> Regards,
>>   Felix
>>
>> On 2019-04-15 22:39, Hillf Danton wrote:
>>>
>>> Hi folks
>>>
>>> In commit d1c234e2cd, arm64 is granted to build kfd. Currently, it 
>>> is physical
>>>
>>> cpu id that is used for building the x86_64 vcrat, but logical cpu 
>>> id is used
>>>
>>> instead for arm64, though the function name requires apicid. Can we 
>>> use the
>>>
>>> physical id for both arches if it really has an up-hand over the 
>>> logical one,
>>>
>>> as the following tiny diff represents?
>>>
>>> --- linux-5.1-rc4/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
>>> 2019-04-16 07:55:56.611685400 +0800
>>>
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 2019-04-16 
>>> 09:16:50.506126600 +0800
>>>
>>> @@ -1405,11 +1405,7 @@ static int kfd_cpumask_to_apic_id(const
>>>
>>> first_cpu_of_numa_node = cpumask_first(cpumask);
>>>
>>>    if (first_cpu_of_numa_node >= nr_cpu_ids)
>>>
>>>  return -1;
>>>
>>> -#ifdef CONFIG_X86_64
>>>
>>> - return cpu_data(first_cpu_of_numa_node).apicid;
>>>
>>> -#else
>>>
>>> - return first_cpu_of_numa_node;
>>>
>>> -#endif
>>>
>>> +    return cpu_physical_id(first_cpu_of_numa_node);
>>>
>>> }
>>>
>>> /* kfd_numa_node_to_apic_id - Returns the APIC ID of the first 
>>> logical processor
>>>
>>> --
>>>
>>> Or is logical cpu id enough to do the work, with some cosmetic 
>>> applied to the
>>>
>>> function names(not included in the following simple diff yet)?
>>>
>>> thanks
>>>
>>> Hillf
>>>
>>> --- linux-5.1-rc4/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
>>> 2019-04-16 07:55:56.611685400 +0800
>>>
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 2019-04-16 
>>> 09:18:24.546578400 +0800
>>>
>>> @@ -1405,11 +1405,7 @@ static int kfd_cpumask_to_apic_id(const
>>>
>>> first_cpu_of_numa_node = cpumask_first(cpumask);
>>>
>>>    if (first_cpu_of_numa_node >= nr_cpu_ids)
>>>
>>>  return -1;
>>>
>>> -#ifdef CONFIG_X86_64
>>>
>>> - return cpu_data(first_cpu_of_numa_node).apicid;
>>>
>>> -#else
>>>
>>>    return first_cpu_of_numa_node;
>>>
>>> -#endif
>>>
>>> }
>>>
>>> /* kfd_numa_node_to_apic_id - Returns the APIC ID of the first 
>>> logical processor
>>>
>>> --
>>>
>>

Re: [PATCH] drm/amdgpu: get_fw_version isn't ASIC specific

2019-04-16 Thread Kuehling, Felix
This is a nice cleanup.

With this change, kfd2kgd_calls.get_fw_version is no longer used. You 
should remove it from kgd_kfd_interface.h. Also move the enum 
kgd_engine_type to amdgpu_amdkfd.h at the same time.

With that fixed, this patch is Reviewed-by: Felix Kuehling 


On 2019-04-12 4:10 p.m., Lin, Amber wrote:
> The method of getting the firmware version is the same across ASICs, so remove
> the ASIC-specific copies and create one in amdgpu_amdkfd.c. This newly
> created get_fw_version simply reads fw_version from adev->gfx rather than
> parsing the ucode header.
>
> Signed-off-by: Amber Lin 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c| 37 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  2 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 61 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 61 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 54 
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c   |  4 +-
>   6 files changed, 41 insertions(+), 178 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index acf8ae0..aeead07 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -335,6 +335,43 @@ void amdgpu_amdkfd_free_gtt_mem(struct kgd_dev *kgd, 
> void *mem_obj)
>   amdgpu_bo_unref(&(bo));
>   }
>   
> +uint32_t amdgpu_amdkfd_get_fw_version(struct kgd_dev *kgd,
> +   enum kgd_engine_type type)
> +{
> + struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
> +
> + switch (type) {
> + case KGD_ENGINE_PFP:
> + return adev->gfx.pfp_fw_version;
> +
> + case KGD_ENGINE_ME:
> + return adev->gfx.me_fw_version;
> +
> + case KGD_ENGINE_CE:
> + return adev->gfx.ce_fw_version;
> +
> + case KGD_ENGINE_MEC1:
> + return adev->gfx.mec_fw_version;
> +
> + case KGD_ENGINE_MEC2:
> + return adev->gfx.mec2_fw_version;
> +
> + case KGD_ENGINE_RLC:
> + return adev->gfx.rlc_fw_version;
> +
> + case KGD_ENGINE_SDMA1:
> + return adev->sdma.instance[0].fw_version;
> +
> + case KGD_ENGINE_SDMA2:
> + return adev->sdma.instance[1].fw_version;
> +
> + default:
> + return 0;
> + }
> +
> + return 0;
> +}
> +
>   void amdgpu_amdkfd_get_local_mem_info(struct kgd_dev *kgd,
> struct kfd_local_mem_info *mem_info)
>   {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index e6a5037..5c8397f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -141,6 +141,8 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, 
> size_t size,
>   void **mem_obj, uint64_t *gpu_addr,
>   void **cpu_ptr, bool mqd_gfx9);
>   void amdgpu_amdkfd_free_gtt_mem(struct kgd_dev *kgd, void *mem_obj);
> +uint32_t amdgpu_amdkfd_get_fw_version(struct kgd_dev *kgd,
> +   enum kgd_engine_type type);
>   void amdgpu_amdkfd_get_local_mem_info(struct kgd_dev *kgd,
> struct kfd_local_mem_info *mem_info);
>   uint64_t amdgpu_amdkfd_get_gpu_clock_counter(struct kgd_dev *kgd);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
> index ff7fac7..fa09e11 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
> @@ -22,14 +22,12 @@
>   
>   #include 
>   #include 
> -#include 
>   #include 
>   #include 
>   #include "amdgpu.h"
>   #include "amdgpu_amdkfd.h"
>   #include "cikd.h"
>   #include "cik_sdma.h"
> -#include "amdgpu_ucode.h"
>   #include "gfx_v7_0.h"
>   #include "gca/gfx_7_2_d.h"
>   #include "gca/gfx_7_2_enum.h"
> @@ -139,7 +137,6 @@ static bool get_atc_vmid_pasid_mapping_valid(struct 
> kgd_dev *kgd, uint8_t vmid);
>   static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd,
>   uint8_t vmid);
>   
> -static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type 
> type);
>   static void set_scratch_backing_va(struct kgd_dev *kgd,
>   uint64_t va, uint32_t vmid);
>   static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t 
> vmid,
> @@ -191,7 +188,6 @@ static const struct kfd2kgd_calls kfd2kgd = {
>   .address_watch_get_offset = kgd_address_watch_get_offset,
>   .get_atc_vmid_pasid_mapping_pasid = get_atc_vmid_pasid_mapping_pasid,
>   .get_atc_vmid_pasid_mapping_valid = get_atc_vmid_pasid_mapping_valid,
> - .get_fw_version = get_fw_version,
>   .set_scratch_backing_va = 

Re: [RFC] drm/amdkfd: Use logical cpu id for building vcrat

2019-04-15 Thread Kuehling, Felix
On x86 we use the apicid to associate caches with CPU cores. See the Thunk code 
in libhsakmt/src/topology.c (static void find_cpu_cache_siblings()). If we used 
a different way to identify CPU cores, I think that would break. This code in 
the Thunk is x86-specific as it uses the CPUID instruction. We don't have 
equivalent code for ARM. So for ARM it doesn't really matter much, how you 
count your CPU cores in the CRAT table.
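
For reference, this is roughly what makes that Thunk code x86-only: the
initial APIC ID comes from CPUID leaf 1, EBX bits 31:24. A stand-alone
user-space sketch (not the actual Thunk code):

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
    	unsigned int eax, ebx, ecx, edx;

    	/* CPUID leaf 1: the initial APIC ID is reported in EBX[31:24] */
    	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
    		return 1;
    	printf("initial apicid: %u\n", ebx >> 24);
    	return 0;
    }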

I think eventually we want to get rid of that fragile CPUID code in the Thunk 
and get the cache information in kernel mode and report it to user mode through 
the KFD topology sysfs filesystem. Then we could also move away from using 
apicids as CPU IDs on x86.

It's not a high priority as I'm not aware of any applications that actually 
make use of the cache information.

Regards,
  Felix

On 2019-04-15 22:39, Hillf Danton wrote:
Hi folks

In commit d1c234e2cd, arm64 is granted to build kfd. Currently, it is physical
cpu id that is used for building the x86_64 vcrat, but logical cpu id is used
instead for arm64, though the function name requires apicid. Can we use the
physical id for both arches if it really has an upper hand over the logical one,
as the following tiny diff represents?

--- linux-5.1-rc4/drivers/gpu/drm/amd/amdkfd/kfd_topology.c2019-04-16 
07:55:56.611685400 +0800
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 2019-04-16 09:16:50.506126600 
+0800
@@ -1405,11 +1405,7 @@ static int kfd_cpumask_to_apic_id(const
   first_cpu_of_numa_node = cpumask_first(cpumask);
   if (first_cpu_of_numa_node >= nr_cpu_ids)
 return -1;
-#ifdef CONFIG_X86_64
- return cpu_data(first_cpu_of_numa_node).apicid;
-#else
- return first_cpu_of_numa_node;
-#endif
+return cpu_physical_id(first_cpu_of_numa_node);
}
/* kfd_numa_node_to_apic_id - Returns the APIC ID of the first logical processor
--


Or is logical cpu id enough to do the work, with some cosmetic applied to the
function names(not included in the following simple diff yet)?

thanks
Hillf


--- linux-5.1-rc4/drivers/gpu/drm/amd/amdkfd/kfd_topology.c2019-04-16 
07:55:56.611685400 +0800
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 2019-04-16 09:18:24.546578400 
+0800
@@ -1405,11 +1405,7 @@ static int kfd_cpumask_to_apic_id(const
   first_cpu_of_numa_node = cpumask_first(cpumask);
   if (first_cpu_of_numa_node >= nr_cpu_ids)
 return -1;
-#ifdef CONFIG_X86_64
- return cpu_data(first_cpu_of_numa_node).apicid;
-#else
   return first_cpu_of_numa_node;
-#endif
}
/* kfd_numa_node_to_apic_id - Returns the APIC ID of the first logical processor
--


RE: [PATCH] drm/amdgpu: support dpm level modification under virtualization

2019-04-10 Thread Kuehling, Felix
How does forcing DPM levels work in SRIOV? Can clocks switch fast enough to 
allow different VFs to have different clocks? If not, can one VF override the 
clocks used by another VF? In that case, wouldn't that violate the isolation 
between VFs?

Regards,
  Felix

-Original Message-
From: amd-gfx  On Behalf Of Yintian Tao
Sent: Tuesday, April 9, 2019 11:18
To: amd-gfx@lists.freedesktop.org
Cc: Tao, Yintian 
Subject: [PATCH] drm/amdgpu: support dpm level modification under virtualization

Under vega10 virtualization, the smu ip block will not be added.
Therefore, we need to add pp clk query and force dpm level functions to 
amdgpu_virt_ops to support the feature.

Change-Id: I713419c57b854082f6f739f1d32a055c7115e620
Signed-off-by: Yintian Tao 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c|  3 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c | 15 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c   | 33 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   | 11 +
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  | 78 ++
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h  |  6 +++
 7 files changed, 147 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3ff8899..bb0fd5a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2486,6 +2486,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(&adev->virt.vf_errors.lock);
hash_init(adev->mn_hash);
mutex_init(&adev->lock_reset);
+   mutex_init(&adev->virt.dpm_mutex);
 
amdgpu_device_check_arguments(adev);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 6190495..1353955 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -727,6 +727,9 @@ static int amdgpu_info_ioctl(struct drm_device *dev, void 
*data, struct drm_file
if (adev->pm.dpm_enabled) {
dev_info.max_engine_clock = amdgpu_dpm_get_sclk(adev, 
false) * 10;
dev_info.max_memory_clock = amdgpu_dpm_get_mclk(adev, 
false) * 10;
+   } else if (amdgpu_sriov_vf(adev)) {
+   dev_info.max_engine_clock = amdgpu_virt_get_sclk(adev, 
false) * 10;
+   dev_info.max_memory_clock = amdgpu_virt_get_mclk(adev, 
false) * 10;
} else {
dev_info.max_engine_clock = adev->clock.default_sclk * 
10;
dev_info.max_memory_clock = adev->clock.default_mclk * 
10; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
index 5540259..0162d1e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
@@ -380,6 +380,17 @@ static ssize_t 
amdgpu_set_dpm_forced_performance_level(struct device *dev,
goto fail;
}
 
+if (amdgpu_sriov_vf(adev)) {
+if (amdgim_is_hwperf(adev) &&
+adev->virt.ops->force_dpm_level) {
+mutex_lock(&adev->pm.mutex);
+adev->virt.ops->force_dpm_level(adev, level);
+mutex_unlock(&adev->pm.mutex);
+return count;
+} else
+return -EINVAL;
+}
+
if (current_level == level)
return count;
 
@@ -843,6 +854,10 @@ static ssize_t amdgpu_get_pp_dpm_sclk(struct device *dev,
struct drm_device *ddev = dev_get_drvdata(dev);
struct amdgpu_device *adev = ddev->dev_private;
 
+   if (amdgpu_sriov_vf(adev) && amdgim_is_hwperf(adev) &&
+   adev->virt.ops->get_pp_clk)
+   return adev->virt.ops->get_pp_clk(adev, PP_SCLK, buf);
+
if (is_support_sw_smu(adev))
return smu_print_clk_levels(&adev->smu, PP_SCLK, buf);
else if (adev->powerplay.pp_funcs->print_clock_levels)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 462a04e..ae4b2a1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -375,4 +375,37 @@ void amdgpu_virt_init_data_exchange(struct amdgpu_device 
*adev)
}
 }
 
+static uint32_t parse_clk(char *buf, bool min) {
+char *ptr = buf;
+uint32_t clk = 0;
+
+do {
+ptr = strchr(ptr, ':');
+if (!ptr)
+break;
+ptr+=2;
+clk = simple_strtoul(ptr, NULL, 10);
+} while (!min);
+
+return clk * 100;
+}
+
+uint32_t amdgpu_virt_get_sclk(struct amdgpu_device *adev, bool lowest) 
+{
+char buf[512] = {0};
+
+adev->virt.ops->get_pp_clk(adev, PP_SCLK, buf);
+
+return parse_clk(buf, lowest);
+}
+
+uint32_t amdgpu_virt_get_mclk(struct 

Re: [PATCH 1/8] drm/amdgpu: fix ATC handling for Ryzen

2019-04-03 Thread Kuehling, Felix
On 2019-04-03 1:24 p.m., Koenig, Christian wrote:
> Am 01.04.19 um 20:58 schrieb Kuehling, Felix:
>> On 2019-04-01 2:03 p.m., Christian König wrote:
>>> Am 01.04.19 um 19:59 schrieb Kuehling, Felix:
>>>> On 2019-04-01 7:23 a.m., Christian König wrote:
>>>>> Am 30.03.19 um 01:41 schrieb Kuehling, Felix:
>>>>>> Patches 1-3 are Reviewed-by: Felix Kuehling 
>>>>> Thanks.
>>>>>
>>>>>> About the direct mode, that removes a bunch of synchronization, so it
>>>>>> must make some assumptions about the state of the page tables. What
>>>>>> makes that safe?
>>>>> Direct mode is only supposed to be used during page fault handling.
>>>>>
>>>>> E.g. we know that the page tables are in the correct place in this
>>>>> situation because the hardware is hammering on a PTE and waiting for
>>>>> it to become valid.
>>>> A fence could also indicate a concurrent modification of the page table.
>>>> For example a PTB may be allocated and initialized concurrently, not in
>>>> direct mode. Would direct mode need to wait for a fence that indicates
>>>> completion of the PTB initialization? Or do we have some way to ensure
>>>> such concurrent allocation and initialization of a PTB cannot happen?
>>> Yeah, that is a very good question I haven't solved yet either.
>>>
>>> My currently best idea is to separate the address space, e.g. use the
>>> lower address space for on demand paging and the higher with classic
>>> pre-filled page tables for the MM and display engines.
>> That may work for graphics, but doesn't work for KFD. I need the ability
>> to mix pre-filled page tables with HMM in the same SVM address space.
> Even after thinking for multiple days about it I can't of hand find a
> way to make this work.
>
>> That's why I was thinking that all page table updates for a given VM
>> would need to use the same method.
> Well what exactly do you mean with that? Essentially there are two methods:
>
> 1. Pre-fill the page tables before accessing them with the hardware.
>
> 2. Fill on demand with page faults.
>
> I don't think we can mix those two methods together in the same address
> range.

That's what I was hoping to do. For example an application could use 
"old" BO-based memory management APIs that pre-fill page tables with 
"new" HMM-based memory management APIs that rely on page faults. Those 
may be different libraries written in different languages running in the 
same application. E.g. a GPU BLAS implementation that's optimized and 
uses old-style memory allocations linked to an OpenMP application that 
relies on HMM.

If that's not possible, I'd need to emulate all the old memory APIs on 
top of HMM. I was hoping to avoid that.

Even when page faults are enabled, we want to be able to pre-fault stuff 
to avoid the performance hit on the first access. Are you saying that 
won't be possible?

Regards,
   Felix


>
> E.g. we can say to use pre-fill for MM engines in the upper range and on
> demand filling in the lower range, but we can't mix them.
>
> Regards,
> Christian.
>
>> Regards,
>>  Felix
>>
>>> Christian.
>>>
>>>> Regards,
>>>>       Felix
>>>>
>>>>
>>>>> Christian.
>>>>>
>>>>>>      Is it safe to use direct-mode on a
>>>>>> per-page-table-update basis? Or do all page table updates have to go
>>>>>> through direct mode to avoid hazards? If yes, then maybe this
>>>>>> should be
>>>>>> a property of the VM rather than a parameter that gets passed to a
>>>>>> bunch
>>>>>> of function calls.
>>>>>>
>>>>>> Regards,
>>>>>>        Felix
>>>>>>
>>>>>> On 2019-03-29 6:45 a.m., Christian König wrote:
>>>>>>> Otherwise we don't correctly use translate further.
>>>>>>>
>>>>>>> Signed-off-by: Christian König 
>>>>>>> ---
>>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 13 -
>>>>>>>       1 file changed, 8 insertions(+), 5 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>>>> index 3d221f044183..059d9802e713 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amd

Re: [PATCH v13 14/20] drm/amdgpu, arm64: untag user pointers in amdgpu_ttm_tt_get_user_pages

2019-04-02 Thread Kuehling, Felix
On 2019-04-02 10:37 a.m., Andrey Konovalov wrote:
> On Mon, Mar 25, 2019 at 11:21 PM Kuehling, Felix  
> wrote:
>> On 2019-03-20 10:51 a.m., Andrey Konovalov wrote:
>>> This patch is a part of a series that extends arm64 kernel ABI to allow to
>>> pass tagged user pointers (with the top byte set to something else other
>>> than 0x00) as syscall arguments.
>>>
>>> amdgpu_ttm_tt_get_user_pages() uses provided user pointers for vma
>>> lookups, which can only by done with untagged pointers.
>>>
>>> Untag user pointers in this function.
>>>
>>> Signed-off-by: Andrey Konovalov 
>>> ---
>>>drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 5 +++--
>>>1 file changed, 3 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> index 73e71e61dc99..891b027fa33b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> @@ -751,10 +751,11 @@ int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, 
>>> struct page **pages)
>>> * check that we only use anonymous memory to prevent 
>>> problems
>>> * with writeback
>>> */
>>> - unsigned long end = gtt->userptr + ttm->num_pages * PAGE_SIZE;
>>> + unsigned long userptr = untagged_addr(gtt->userptr);
>>> + unsigned long end = userptr + ttm->num_pages * PAGE_SIZE;
>>>struct vm_area_struct *vma;
>>>
>>> - vma = find_vma(mm, gtt->userptr);
>>> + vma = find_vma(mm, userptr);
>>>if (!vma || vma->vm_file || vma->vm_end < end) {
>>>up_read(>mmap_sem);
>>>return -EPERM;
>> We'll need to be careful that we don't break your change when the
>> following commit gets applied through drm-next for Linux 5.2:
>>
>> https://cgit.freedesktop.org/~agd5f/linux/commit/?h=drm-next-5.2-wip=915d3eecfa23693bac9e54cdacf84fb4efdcc5c4
>>
>> Would it make sense to apply the untagging in amdgpu_ttm_tt_set_userptr
>> instead? That would avoid this conflict and I think it would clearly put
>> the untagging into the user mode code path where the tagged pointer
>> originates.
>>
>> In amdgpu_gem_userptr_ioctl and amdgpu_amdkfd_gpuvm.c (init_user_pages)
>> we also set up an MMU notifier with the (tagged) pointer from user mode.
>> That should probably also use the untagged address so that MMU notifiers
>> for the untagged address get correctly matched up with the right BO. I'd
>> move the untagging further up the call stack to cover that. For the GEM
>> case I think amdgpu_gem_userptr_ioctl would be the right place. For the
>> KFD case, I'd do this in amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu.
> Will do in v14, thanks a lot for looking at this!
>
> Is this applicable to the radeon driver (drivers/gpu/drm/radeon) as
> well? It seems to be using very similar structure.

I think so. Radeon doesn't have the KFD bits any more. But the GEM 
interface and MMU notifier are very similar.
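
For illustration, doing the untagging once, where the user pointer first
enters the driver, might look roughly like this (sketch only; placement and
the surrounding fields are assumptions, untagged_addr() is the helper
introduced by the series):

    /* Strip the arm64 top-byte tag once, so later find_vma() lookups and the
     * MMU notifier range registration all see the same untagged address.
     */
    int amdgpu_ttm_tt_set_userptr(struct ttm_tt *ttm, uint64_t addr,
    			      uint32_t flags)
    {
    	struct amdgpu_ttm_tt *gtt = (void *)ttm;

    	if (gtt == NULL)
    		return -EINVAL;

    	gtt->userptr = untagged_addr(addr);
    	gtt->userflags = flags;
    	return 0;
    }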

Regards,
   Felix



Re: [PATCH RFC tip/core/rcu 3/4] drivers/gpu/drm/amd: Dynamically allocate kfd_processes_srcu

2019-04-02 Thread Kuehling, Felix
On 2019-04-02 10:29 a.m., Paul E. McKenney wrote:
> Having DEFINE_SRCU() or DEFINE_STATIC_SRCU() in a loadable module
> requires that the size of the reserved region be increased, which is
> not something we really want to be doing.  This commit therefore removes
> the DEFINE_STATIC_SRCU() from drivers/gpu/drm/amd/amdkfd/kfd_process.c in
> favor of defining kfd_processes_srcu as a simple srcu_struct, initializing
> it in amdgpu_amdkfd_init(), and cleaning it up in amdgpu_amdkfd_fini().
>
> Reported-by: kbuild test robot 
> Signed-off-by: Paul E. McKenney 
> Tested-by: kbuild test robot 
> Cc: Oded Gabbay 
> Cc: Alex Deucher 
> Cc: "Christian König"  Cc: "David (ChunMing) Zhou" 
> Cc: David Airlie 
> Cc: Daniel Vetter 
> Cc: Tejun Heo 
> Cc: 
> Cc: 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 5 +
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c   | 2 +-
>   2 files changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index fe1d7368c1e6..eadb20dee867 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -28,6 +28,8 @@
>   #include 
>   #include 
>   
> +extern struct srcu_struct kfd_processes_srcu;
> +
>   static const unsigned int compute_vmid_bitmap = 0xFF00;
>   
>   /* Total memory size in system memory and all GPU VRAM. Used to
> @@ -40,6 +42,8 @@ int amdgpu_amdkfd_init(void)
>   struct sysinfo si;
>   int ret;
>   
> + ret = init_srcu_struct(&kfd_processes_srcu);
> + WARN_ON(ret);

kfd_processes_srcu only exists if kfd_process.c is compiled in. That 
depends on CONFIG_HSA_AMD. So this should at least move inside the #ifdef a 
few lines below.

However, it would be cleaner to move this initialization into kfd_init 
in kfd_module.c, or better yet, into kfd_process_create_wq in 
kfd_process.c. Then kfd_process_create_wq should be renamed to something 
more generic, such as kfd_process_init.


>   si_meminfo(&si);
>   amdgpu_amdkfd_total_mem_size = si.totalram - si.totalhigh;
>   amdgpu_amdkfd_total_mem_size *= si.mem_unit;
> @@ -57,6 +61,7 @@ int amdgpu_amdkfd_init(void)
>   void amdgpu_amdkfd_fini(void)
>   {
>   kgd2kfd_exit();
> + cleanup_srcu_struct(&kfd_processes_srcu);

Similarly, this would be cleaner in kfd_exit in kfd_module.c or 
kfd_process_destroy_wq in kfd_process.c, with that function similarly 
renamed to kfd_process_fini.

I'm attaching a revised patch. It's only compile tested.

Regards,
   Felix


>   }
>   
>   void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 4bdae78bab8e..98b694068b8a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -47,7 +47,7 @@ struct mm_struct;
>   DEFINE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE);
>   static DEFINE_MUTEX(kfd_processes_mutex);
>   
> -DEFINE_SRCU(kfd_processes_srcu);
> +struct srcu_struct kfd_processes_srcu;
>   
>   /* For process termination handling */
>   static struct workqueue_struct *kfd_process_wq;
From 5857a9aa63957a5755ff81ae5c46533bca408c12 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" 
Date: Tue, 2 Apr 2019 07:29:32 -0700
Subject: [PATCH 1/1] drivers/gpu/drm/amd: Dynamically allocate
 kfd_processes_srcu v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Having DEFINE_SRCU() or DEFINE_STATIC_SRCU() in a loadable module
requires that the size of the reserved region be increased, which is
not something we really want to be doing.  This commit therefore removes
the DEFINE_STATIC_SRCU() from drivers/gpu/drm/amd/amdkfd/kfd_process.c in
favor of defining kfd_processes_srcu as a simple srcu_struct, initializing
it in kfd_process_init(), and cleaning it up in kfd_process_fini().

v2 (Felix Kuehling): Move srcu init and cleanup into kfd_process.c

Reported-by: kbuild test robot 
Signed-off-by: Paul E. McKenney 
Signed-off-by: Felix Kuehling 
Tested-by (v1): kbuild test robot 
Cc: Oded Gabbay 
Cc: Alex Deucher 
Cc: "Christian König" 
Cc: "David (ChunMing) Zhou" 
Cc: David Airlie 
Cc: Daniel Vetter 
Cc: Tejun Heo 
Cc: 
Cc: 
---
 drivers/gpu/drm/amd/amdkfd/kfd_module.c  |  8 
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  4 ++--
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 19 ---
 3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
index 932007e..e8e2c15 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
@@ -52,15 +52,15 @@ static int kfd_init(void)
 	if (err < 0)
 		goto err_topology;
 
-	err = kfd_process_create_wq();
+	err = kfd_process_init();
 	if (err < 0)
-		goto err_create_wq;
+		goto err_process;
 
 	kfd_debugfs_init();
 
 	return 0;
 
-err_create_wq:

Re: [PATCH 1/8] drm/amdgpu: fix ATC handling for Ryzen

2019-04-01 Thread Kuehling, Felix
On 2019-04-01 2:03 p.m., Christian König wrote:
> Am 01.04.19 um 19:59 schrieb Kuehling, Felix:
>> On 2019-04-01 7:23 a.m., Christian König wrote:
>>> Am 30.03.19 um 01:41 schrieb Kuehling, Felix:
>>>> Patches 1-3 are Reviewed-by: Felix Kuehling 
>>> Thanks.
>>>
>>>> About the direct mode, that removes a bunch of synchronization, so it
>>>> must make some assumptions about the state of the page tables. What
>>>> makes that safe?
>>> Direct mode is only supposed to be used during page fault handling.
>>>
>>> E.g. we know that the page tables are in the correct place in this
>>> situation because the hardware is hammering on a PTE and waiting for
>>> it to become valid.
>> A fence could also indicate a concurrent modification of the page table.
>> For example a PTB may be allocated and initialized concurrently, not in
>> direct mode. Would direct mode need to wait for a fence that indicates
>> completion of the PTB initialization? Or do we have some way to ensure
>> such concurrent allocation and initialization of a PTB cannot happen?
>
> Yeah, that is a very good question I haven't solved yet either.
>
> My currently best idea is to separate the address space, e.g. use the 
> lower address space for on demand paging and the higher with classic 
> pre-filled page tables for the MM and display engines.

That may work for graphics, but doesn't work for KFD. I need the ability 
to mix pre-filled page tables with HMM in the same SVM address space. 
That's why I was thinking that all page table updates for a given VM 
would need to use the same method.

Regards,
   Felix

>
> Christian.
>
>>
>> Regards,
>>     Felix
>>
>>
>>> Christian.
>>>
>>>>    Is it safe to use direct-mode on a
>>>> per-page-table-update basis? Or do all page table updates have to go
>>>> through direct mode to avoid hazards? If yes, then maybe this 
>>>> should be
>>>> a property of the VM rather than a parameter that gets passed to a 
>>>> bunch
>>>> of function calls.
>>>>
>>>> Regards,
>>>>      Felix
>>>>
>>>> On 2019-03-29 6:45 a.m., Christian König wrote:
>>>>> Otherwise we don't correctly use translate further.
>>>>>
>>>>> Signed-off-by: Christian König 
>>>>> ---
>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 13 -
>>>>>     1 file changed, 8 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> index 3d221f044183..059d9802e713 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>>> @@ -767,14 +767,17 @@ static int amdgpu_vm_clear_bo(struct
>>>>> amdgpu_device *adev,
>>>>>        addr = 0;
>>>>>     if (ats_entries) {
>>>>> -    uint64_t ats_value;
>>>>> +    uint64_t value = 0, flags;
>>>>>     -    ats_value = AMDGPU_PTE_DEFAULT_ATC;
>>>>> -    if (level != AMDGPU_VM_PTB)
>>>>> -    ats_value |= AMDGPU_PDE_PTE;
>>>>> +    flags = AMDGPU_PTE_DEFAULT_ATC;
>>>>> +    if (level != AMDGPU_VM_PTB) {
>>>>> +    /* Handle leaf PDEs as PTEs */
>>>>> +    flags |= AMDGPU_PDE_PTE;
>>>>> +    amdgpu_gmc_get_vm_pde(adev, level, , );
>>>>> +    }
>>>>>        r = vm->update_funcs->update(, bo, addr, 0,
>>>>> ats_entries,
>>>>> - 0, ats_value);
>>>>> + value, flags);
>>>>>     if (r)
>>>>>     return r;

Re: [PATCH 1/8] drm/amdgpu: fix ATC handling for Ryzen

2019-04-01 Thread Kuehling, Felix
On 2019-04-01 7:23 a.m., Christian König wrote:
> Am 30.03.19 um 01:41 schrieb Kuehling, Felix:
>> Patches 1-3 are Reviewed-by: Felix Kuehling 
>
> Thanks.
>
>>
>> About the direct mode, that removes a bunch of synchronization, so it
>> must make some assumptions about the state of the page tables. What
>> makes that safe?
>
> Direct mode is only supposed to be used during page fault handling.
>
> E.g. we know that the page tables are in the correct place in this 
> situation because the hardware is hammering on a PTE and waiting for 
> it to become valid.

A fence could also indicate a concurrent modification of the page table. 
For example a PTB may be allocated and initialized concurrently, not in 
direct mode. Would direct mode need to wait for a fence that indicates 
completion of the PTB initialization? Or do we have some way to ensure 
such concurrent allocation and initialization of a PTB cannot happen?

Regards,
   Felix


>
> Christian.
>
>>   Is it safe to use direct-mode on a
>> per-page-table-update basis? Or do all page table updates have to go
>> through direct mode to avoid hazards? If yes, then maybe this should be
>> a property of the VM rather than a parameter that gets passed to a bunch
>> of function calls.
>>
>> Regards,
>>     Felix
>>
>> On 2019-03-29 6:45 a.m., Christian König wrote:
>>> Otherwise we don't correctly use translate further.
>>>
>>> Signed-off-by: Christian König 
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 13 -
>>>    1 file changed, 8 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> index 3d221f044183..059d9802e713 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> @@ -767,14 +767,17 @@ static int amdgpu_vm_clear_bo(struct 
>>> amdgpu_device *adev,
>>>       addr = 0;
>>>    if (ats_entries) {
>>> -    uint64_t ats_value;
>>> +    uint64_t value = 0, flags;
>>>    -    ats_value = AMDGPU_PTE_DEFAULT_ATC;
>>> -    if (level != AMDGPU_VM_PTB)
>>> -    ats_value |= AMDGPU_PDE_PTE;
>>> +    flags = AMDGPU_PTE_DEFAULT_ATC;
>>> +    if (level != AMDGPU_VM_PTB) {
>>> +    /* Handle leaf PDEs as PTEs */
>>> +    flags |= AMDGPU_PDE_PTE;
>>> +    amdgpu_gmc_get_vm_pde(adev, level, , );
>>> +    }
>>>       r = vm->update_funcs->update(, bo, addr, 0, 
>>> ats_entries,
>>> - 0, ats_value);
>>> + value, flags);
>>>    if (r)
>>>    return r;
>

Re: [PATCH 1/8] drm/amdgpu: fix ATC handling for Ryzen

2019-03-29 Thread Kuehling, Felix
Patches 1-3 are Reviewed-by: Felix Kuehling 

About the direct mode, that removes a bunch of synchronization, so it 
must make some assumptions about the state of the page tables. What 
makes that safe? Is it safe to use direct-mode on a 
per-page-table-update basis? Or do all page table updates have to go 
through direct mode to avoid hazards? If yes, then maybe this should be 
a property of the VM rather than a parameter that gets passed to a bunch 
of function calls.

Regards,
   Felix

On 2019-03-29 6:45 a.m., Christian König wrote:
> Otherwise we don't correctly use translate further.
>
> Signed-off-by: Christian König 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 13 -
>   1 file changed, 8 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 3d221f044183..059d9802e713 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -767,14 +767,17 @@ static int amdgpu_vm_clear_bo(struct amdgpu_device 
> *adev,
>   
>   addr = 0;
>   if (ats_entries) {
> - uint64_t ats_value;
> + uint64_t value = 0, flags;
>   
> - ats_value = AMDGPU_PTE_DEFAULT_ATC;
> - if (level != AMDGPU_VM_PTB)
> - ats_value |= AMDGPU_PDE_PTE;
> + flags = AMDGPU_PTE_DEFAULT_ATC;
> + if (level != AMDGPU_VM_PTB) {
> + /* Handle leaf PDEs as PTEs */
> + flags |= AMDGPU_PDE_PTE;
> + amdgpu_gmc_get_vm_pde(adev, level, &value, &flags);
> + }
>   
>   r = vm->update_funcs->update(&params, bo, addr, 0, ats_entries,
> -  0, ats_value);
> +  value, flags);
>   if (r)
>   return r;
>   

Re: [PATCH] drm/amdgpu: Add preferred_domain check when determine XGMI state

2019-03-28 Thread Kuehling, Felix
On 2019-03-28 4:38 p.m., Liu, Shaoyun wrote:
> Avoid an unnecessary XGMI high pstate trigger when mapping non-VRAM memory for
> a peer device
>
> Change-Id: I1881deff3da19f1f4b58d5765db03a590092a5b2
> Signed-off-by: shaoyunl 

This patch is Reviewed-by: Felix Kuehling 

Please also give Christian a chance to review this one before you submit.

Thanks,
   Felix


> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 11 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  |  3 ++-
>   2 files changed, 13 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index 9ee8d7a..7b84036 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -31,6 +31,7 @@
>   #include 
>   #include "amdgpu.h"
>   #include "amdgpu_display.h"
> +#include "amdgpu_xgmi.h"
>   
>   void amdgpu_gem_object_free(struct drm_gem_object *gobj)
>   {
> @@ -666,6 +667,7 @@ int amdgpu_gem_op_ioctl(struct drm_device *dev, void 
> *data,
>   struct amdgpu_device *adev = dev->dev_private;
>   struct drm_amdgpu_gem_op *args = data;
>   struct drm_gem_object *gobj;
> + struct amdgpu_vm_bo_base *base;
>   struct amdgpu_bo *robj;
>   int r;
>   
> @@ -704,6 +706,15 @@ int amdgpu_gem_op_ioctl(struct drm_device *dev, void 
> *data,
>   amdgpu_bo_unreserve(robj);
>   break;
>   }
> + for (base = robj->vm_bo; base; base = base->next)
> + if 
> (amdgpu_xgmi_same_hive(amdgpu_ttm_adev(robj->tbo.bdev),
> + 
> amdgpu_ttm_adev(base->vm->root.base.bo->tbo.bdev))) {
> + r = -EINVAL;
> + amdgpu_bo_unreserve(robj);
> + goto out;
> + }
> +
> +
>   robj->preferred_domains = args->value & (AMDGPU_GEM_DOMAIN_VRAM 
> |
>   AMDGPU_GEM_DOMAIN_GTT |
>   AMDGPU_GEM_DOMAIN_CPU);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 3d221f0..eb242a1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2025,7 +2025,8 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct 
> amdgpu_device *adev,
>   INIT_LIST_HEAD(&bo_va->valids);
>   INIT_LIST_HEAD(&bo_va->invalids);
>   
> - if (bo && amdgpu_xgmi_same_hive(adev, amdgpu_ttm_adev(bo->tbo.bdev))) {
> + if (bo && amdgpu_xgmi_same_hive(adev, amdgpu_ttm_adev(bo->tbo.bdev)) &&
> + (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM)) {
>   bo_va->is_xgmi = true;
>   mutex_lock(&adev->vm_manager.lock_pstate);
>   /* Power up XGMI if it can be potentially used */

Re: [PATCH] drm/amdgpu: Add preferred_domain check when determine XGMI state

2019-03-28 Thread Kuehling, Felix
On 2019-03-28 3:47 p.m., Liu, Shaoyun wrote:
> I think we only care about the context device (adev) and the real device
> this bo was allocated on (robj->tbo.bdev). The bo_va or base don't have
> the device pointer directly; they have a pointer to the bo, which should be
> the same as robj here. We can move the same_hive check out of the
> loop.

That doesn't make sense. The "same hive" check only makes sense if one 
of the devices is the one where the memory is physically located 
(robj->tbo.bdev), and the other one is where the memory is accessed 
from. That only makes sense inside the loop. The amdgpu_vm_bo_base 
should tell you the device that's mapping and potentially accessing the 
memory over XGMI. You could get it like this:

     mapping_adev = base->vm->root.base.bo->tbo.bdev;
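
For illustration, the check inside the loop might then look roughly like this
(sketch only, using the names from the patch):

    /* Compare the device that owns the BO's memory with each device that has
     * the BO mapped into a VM, instead of comparing adev with itself.
     */
    for (base = robj->vm_bo; base; base = base->next) {
    	struct amdgpu_device *mapping_adev =
    		amdgpu_ttm_adev(base->vm->root.base.bo->tbo.bdev);

    	if (amdgpu_xgmi_same_hive(amdgpu_ttm_adev(robj->tbo.bdev),
    				  mapping_adev)) {
    		r = -EINVAL;
    		amdgpu_bo_unreserve(robj);
    		goto out;
    	}
    }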

Regards,
   Felix


>
> Regards
>
> shaoyun.liu
>
>
> On 2019-03-28 3:18 p.m., Kuehling, Felix wrote:
>> On 2019-03-28 1:55 p.m., Liu, Shaoyun wrote:
>>> Avoid unnecessary XGMI hight pstate trigger when mapping none-vram memory 
>>> for peer device
>>>
>>> Change-Id: I1881deff3da19f1f4b58d5765db03a590092a5b2
>>> Signed-off-by: shaoyunl 
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 13 +
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  |  3 ++-
>>> 2 files changed, 15 insertions(+), 1 deletion(-)
>>> I think we only care about the context device (adev)  and  the real device 
>>> this bo been allocated ,
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> index 9ee8d7a..82dc2b6 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> @@ -31,6 +31,7 @@
>>> #include 
>>> #include "amdgpu.h"
>>> #include "amdgpu_display.h"
>>> +#include "amdgpu_xgmi.h"
>>> 
>>> void amdgpu_gem_object_free(struct drm_gem_object *gobj)
>>> {
>>> @@ -666,6 +667,8 @@ int amdgpu_gem_op_ioctl(struct drm_device *dev, void 
>>> *data,
>>> struct amdgpu_device *adev = dev->dev_private;
>>> struct drm_amdgpu_gem_op *args = data;
>>> struct drm_gem_object *gobj;
>>> +   struct amdgpu_vm_bo_base *base;
>>> +   struct amdgpu_bo_va *bo_va;
>>> struct amdgpu_bo *robj;
>>> int r;
>>> 
>>> @@ -704,6 +707,16 @@ int amdgpu_gem_op_ioctl(struct drm_device *dev, void 
>>> *data,
>>> amdgpu_bo_unreserve(robj);
>>> break;
>>> }
>>> +   for (base = robj->vm_bo; base; base = base->next) {
>>> +   bo_va = container_of(base, struct amdgpu_bo_va, base);
>>> +   if (bo_va &&
>>> +   
>>> amdgpu_xgmi_same_hive(adev,amdgpu_ttm_adev(robj->tbo.bdev))) {
>> adev and robj->tbo.bdev are the same in each loop iteration. Shouldn't
>> we get one of the devices from the bo_va or amdgpu_vm_bo_base?
>>
>> Regards,
>>  Felix
>>
>>
>>> +   r = -EINVAL;
>>> +   amdgpu_bo_unreserve(robj);
>>> +   goto out;
>>> +   }
>>> +   }
>>> +
>>> robj->preferred_domains = args->value & 
>>> (AMDGPU_GEM_DOMAIN_VRAM |
>>> 
>>> AMDGPU_GEM_DOMAIN_GTT |
>>> 
>>> AMDGPU_GEM_DOMAIN_CPU);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> index 3d221f0..eb242a1 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> @@ -2025,7 +2025,8 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct 
>>> amdgpu_device *adev,
>>> INIT_LIST_HEAD(&bo_va->valids);
>>> INIT_LIST_HEAD(&bo_va->invalids);
>>> 
>>> -   if (bo && amdgpu_xgmi_same_hive(adev, amdgpu_ttm_adev(bo->tbo.bdev))) {
>>> +   if (bo && amdgpu_xgmi_same_hive(adev, amdgpu_ttm_adev(bo->tbo.bdev)) &&
>>> +   (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM)) {
>>> bo_va->is_xgmi = true;
>>> mutex_lock(&adev->vm_manager.lock_pstate);
>>> /* Power up XGMI if it can be potentially used */

Re: [PATCH] drm/amdgpu: don't put the root PD into the relocated list

2019-03-28 Thread Kuehling, Felix
The change looks reasonable to me. Acked-by: Felix Kuehling 


I just don't understand why the root PD is special and handled 
differently from other PDs and PTs.

Regards,
   Felix

On 2019-03-27 6:39 a.m., Christian König wrote:
> Instead of skipping the root PD while processing the relocated list just never
> put it on the list in the first place.
>
> This avoids walking the list all together when the root PD is the only entry
> and so also avoids trying to submit a zero sized IB to the SDMA.
>
> Signed-off-by: Christian König 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 18 --
>   1 file changed, 8 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index af1a7020c3ab..5f615d63e2e3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -305,7 +305,7 @@ static void amdgpu_vm_bo_base_init(struct 
> amdgpu_vm_bo_base *base,
>   return;
>   
>   vm->bulk_moveable = false;
> - if (bo->tbo.type == ttm_bo_type_kernel)
> + if (bo->tbo.type == ttm_bo_type_kernel && bo->parent)
>   amdgpu_vm_bo_relocated(base);
>   else
>   amdgpu_vm_bo_idle(base);
> @@ -671,7 +671,10 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device 
> *adev, struct amdgpu_vm *vm,
>   if (r)
>   break;
>   }
> - amdgpu_vm_bo_relocated(bo_base);
> + if (bo->parent)
> + amdgpu_vm_bo_relocated(bo_base);
> + else
> + amdgpu_vm_bo_idle(bo_base);
>   }
>   }
>   
> @@ -1184,16 +1187,15 @@ uint64_t amdgpu_vm_map_gart(const dma_addr_t 
> *pages_addr, uint64_t addr)
>*
>* @param: parameters for the update
>* @vm: requested vm
> - * @parent: parent directory
>* @entry: entry to update
>*
>* Makes sure the requested entry in parent is up to date.
>*/
>   static int amdgpu_vm_update_pde(struct amdgpu_vm_update_params *params,
>   struct amdgpu_vm *vm,
> - struct amdgpu_vm_pt *parent,
>   struct amdgpu_vm_pt *entry)
>   {
> + struct amdgpu_vm_pt *parent = amdgpu_vm_pt_parent(entry);
>   struct amdgpu_bo *bo = parent->base.bo, *pbo;
>   uint64_t pde, pt, flags;
>   unsigned level;
> @@ -1255,17 +1257,13 @@ int amdgpu_vm_update_directories(struct amdgpu_device 
> *adev,
>   return r;
>   
>   while (!list_empty(&vm->relocated)) {
> - struct amdgpu_vm_pt *pt, *entry;
> + struct amdgpu_vm_pt *entry;
>   
>   entry = list_first_entry(&vm->relocated, struct amdgpu_vm_pt,
>base.vm_status);
>   amdgpu_vm_bo_idle(&entry->base);
>   
> - pt = amdgpu_vm_pt_parent(entry);
> - if (!pt)
> - continue;
> -
> - r = amdgpu_vm_update_pde(&params, vm, pt, entry);
> + r = amdgpu_vm_update_pde(&params, vm, entry);
>   if (r)
>   goto error;
>   }

Re: [PATCH] drm/amdgpu: Add preferred_domain check when determine XGMI state

2019-03-28 Thread Kuehling, Felix
On 2019-03-28 1:55 p.m., Liu, Shaoyun wrote:
> Avoid an unnecessary XGMI high pstate trigger when mapping non-VRAM memory for
> a peer device
>
> Change-Id: I1881deff3da19f1f4b58d5765db03a590092a5b2
> Signed-off-by: shaoyunl 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 13 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  |  3 ++-
>   2 files changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index 9ee8d7a..82dc2b6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -31,6 +31,7 @@
>   #include 
>   #include "amdgpu.h"
>   #include "amdgpu_display.h"
> +#include "amdgpu_xgmi.h"
>   
>   void amdgpu_gem_object_free(struct drm_gem_object *gobj)
>   {
> @@ -666,6 +667,8 @@ int amdgpu_gem_op_ioctl(struct drm_device *dev, void 
> *data,
>   struct amdgpu_device *adev = dev->dev_private;
>   struct drm_amdgpu_gem_op *args = data;
>   struct drm_gem_object *gobj;
> + struct amdgpu_vm_bo_base *base;
> + struct amdgpu_bo_va *bo_va;
>   struct amdgpu_bo *robj;
>   int r;
>   
> @@ -704,6 +707,16 @@ int amdgpu_gem_op_ioctl(struct drm_device *dev, void 
> *data,
>   amdgpu_bo_unreserve(robj);
>   break;
>   }
> + for (base = robj->vm_bo; base; base = base->next) {
> + bo_va = container_of(base, struct amdgpu_bo_va, base);
> + if (bo_va &&
> + 
> amdgpu_xgmi_same_hive(adev,amdgpu_ttm_adev(robj->tbo.bdev))) {

adev and robj->tbo.bdev are the same in each loop iteration. Shouldn't 
we get one of the devices from the bo_va or amdgpu_vm_bo_base?

Regards,
   Felix
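
[Illustration, not part of the original mail: one way to do what is suggested above is to take the device from each bo_va's VM, e.g. through the VM's root PD BO. The vm_adev variable and the assumption that the root PD is allocated on the VM's own device are added here for the sketch only.]

    for (base = robj->vm_bo; base; base = base->next) {
            struct amdgpu_device *vm_adev =
                    amdgpu_ttm_adev(base->vm->root.base.bo->tbo.bdev);

            /* compare the mapping GPU with the GPU backing the BO */
            if (amdgpu_xgmi_same_hive(vm_adev,
                                      amdgpu_ttm_adev(robj->tbo.bdev))) {
                    r = -EINVAL;
                    amdgpu_bo_unreserve(robj);
                    goto out;
            }
    }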


> + r = -EINVAL;
> + amdgpu_bo_unreserve(robj);
> + goto out;
> + }
> + }
> +
>   robj->preferred_domains = args->value & (AMDGPU_GEM_DOMAIN_VRAM 
> |
>   AMDGPU_GEM_DOMAIN_GTT |
>   AMDGPU_GEM_DOMAIN_CPU);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 3d221f0..eb242a1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2025,7 +2025,8 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct 
> amdgpu_device *adev,
>   INIT_LIST_HEAD(&bo_va->valids);
>   INIT_LIST_HEAD(&bo_va->invalids);
>   
> - if (bo && amdgpu_xgmi_same_hive(adev, amdgpu_ttm_adev(bo->tbo.bdev))) {
> + if (bo && amdgpu_xgmi_same_hive(adev, amdgpu_ttm_adev(bo->tbo.bdev)) &&
> + (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM)) {
>   bo_va->is_xgmi = true;
>   mutex_lock(&adev->vm_manager.lock_pstate);
>   /* Power up XGMI if it can be potentially used */

Re: [PATCH] drm/amdgpu: Add preferred_domain check when determine XGMI state

2019-03-27 Thread Kuehling, Felix
On 2019-03-26 4:35 p.m., Liu, Shaoyun wrote:
> Avoid unnecessary XGMI high pstate trigger when mapping non-VRAM memory for
> peer device
>
> Change-Id: I1881deff3da19f1f4b58d5765db03a590092a5b2
> Signed-off-by: shaoyunl 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 11 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  |  3 ++-
>   2 files changed, 13 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index a82c3b1..a0f56e4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -666,6 +666,8 @@ int amdgpu_gem_op_ioctl(struct drm_device *dev, void 
> *data,
>   struct amdgpu_device *adev = dev->dev_private;
>   struct drm_amdgpu_gem_op *args = data;
>   struct drm_gem_object *gobj;
> + struct amdgpu_vm_bo_base *base;
> + struct amdgpu_bo_va *bo_va;
>   struct amdgpu_bo *robj;
>   int r;
>   
> @@ -704,6 +706,15 @@ int amdgpu_gem_op_ioctl(struct drm_device *dev, void 
> *data,
>   amdgpu_bo_unreserve(robj);
>   break;
>   }
> + for (base = robj->vm_bo; base; base = base->next) {
> + bo_va = container_of(base, struct amdgpu_bo_va, base);
> + if (bo_va && bo_va->is_xgmi) {

The point here is to prevent transitions where a mapping moves from 
is_xgmi => !is_xgmi or from !is_xgmi => is_xgmi, because that would 
potentially require a XGMI pstate change.

This check catches the case of is_xgmi => !is_xgmi, but not the other 
way around. I think you should drop the is_xgmi condition here. Any BO 
that's shared between multiple GPUs should return -EINVAL here because 
moving it could result in an XGMI pstate change. It doesn't matter 
whether it's currently using XGMI or not.

Regards,
   Felix

> + r = -EINVAL;
> + amdgpu_bo_unreserve(robj);
> + goto out;
> + }
> + }
> +
>   robj->preferred_domains = args->value & (AMDGPU_GEM_DOMAIN_VRAM 
> |
>   AMDGPU_GEM_DOMAIN_GTT |
>   AMDGPU_GEM_DOMAIN_CPU);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 76eee7e..8ed23d2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2048,7 +2048,8 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct 
> amdgpu_device *adev,
>   INIT_LIST_HEAD(&bo_va->valids);
>   INIT_LIST_HEAD(&bo_va->invalids);
>   
> - if (bo && amdgpu_xgmi_same_hive(adev, amdgpu_ttm_adev(bo->tbo.bdev))) {
> + if (bo && amdgpu_xgmi_same_hive(adev, amdgpu_ttm_adev(bo->tbo.bdev)) &&
> + (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM)) {
>   bo_va->is_xgmi = true;
>   mutex_lock(&adev->vm_manager.lock_pstate);
>   /* Power up XGMI if it can be potentially used */

Re: [PATCH] drm/amdgpu: Add preferred_domain check when determine XGMI state

2019-03-26 Thread Kuehling, Felix
On 2019-03-26 2:54 p.m., Liu, Shaoyun wrote:
> Avoid unnecessary XGMI high pstate trigger when mapping non-VRAM memory for
> peer device
>
> Change-Id: I1881deff3da19f1f4b58d5765db03a590092a5b2
> Signed-off-by: shaoyunl 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 9 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 3 ++-
>   2 files changed, 11 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index a82c3b1..3c7ee71 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -664,8 +664,10 @@ int amdgpu_gem_op_ioctl(struct drm_device *dev, void 
> *data,
>   struct drm_file *filp)
>   {
>   struct amdgpu_device *adev = dev->dev_private;
> + struct amdgpu_fpriv *fpriv = filp->driver_priv;
>   struct drm_amdgpu_gem_op *args = data;
>   struct drm_gem_object *gobj;
> + struct amdgpu_bo_va *bo_va;
>   struct amdgpu_bo *robj;
>   int r;
>   
> @@ -704,6 +706,13 @@ int amdgpu_gem_op_ioctl(struct drm_device *dev, void 
> *data,
>   amdgpu_bo_unreserve(robj);
>   break;
>   }
> + bo_va = amdgpu_vm_bo_find(&fpriv->vm, robj);
> + if (bo_va && bo_va->is_xgmi) {
> + r = -EINVAL;
> + amdgpu_bo_unreserve(robj);
> + break;
> + }
> +

Hmm, from the other discussion, GEM doesn't really support P2P of VRAM 
BOs between GPUs right now. The only way this function can affect a BO 
that's P2P shared is, if the BO is allocated with GEM and then imported 
into KFD. In that case you'll need to take into account mappings of the 
imported BO in all the KFD VMs, not the VMs in the fpriv->vm.

In other words, you need to find all bo_vas of the BO in all VMs and for 
each one check, whether it has is_xgmi set.

Regards,
   Felix
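
[Illustration, not part of the original mail: walking the BO's vm_bo list covers the bo_vas in every VM the BO is mapped into, including the KFD VMs — essentially what the v2 of this patch quoted earlier in this archive does.]

    struct amdgpu_vm_bo_base *base;
    struct amdgpu_bo_va *bo_va;

    for (base = robj->vm_bo; base; base = base->next) {
            bo_va = container_of(base, struct amdgpu_bo_va, base);
            if (bo_va && bo_va->is_xgmi) {
                    r = -EINVAL;
                    amdgpu_bo_unreserve(robj);
                    goto out;
            }
    }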


>   robj->preferred_domains = args->value & (AMDGPU_GEM_DOMAIN_VRAM 
> |
>   AMDGPU_GEM_DOMAIN_GTT |
>   AMDGPU_GEM_DOMAIN_CPU);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 76eee7e..f08dda2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2048,7 +2048,8 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct 
> amdgpu_device *adev,
>   INIT_LIST_HEAD(&bo_va->valids);
>   INIT_LIST_HEAD(&bo_va->invalids);
>   
> - if (bo && amdgpu_xgmi_same_hive(adev, amdgpu_ttm_adev(bo->tbo.bdev))) {
> + if (bo && amdgpu_xgmi_same_hive(adev, amdgpu_ttm_adev(bo->tbo.bdev)) &&
> + (bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM)) {
>   bo_va->is_xgmi = true;
>   mutex_lock(&adev->vm_manager.lock_pstate);
>   /* Power up XGMI if it can be potentially used */

Re: [PATCH] drm/amdgpu: XGMI pstate switch initial support

2019-03-26 Thread Kuehling, Felix
On 2019-03-26 9:15 a.m., Liu, Shaoyun wrote:
> I think in real usage (e.g. TensorFlow), it's rarely the case that only
> system memory (no VRAM) will be mapped for peer access.

With that argument you could simplify your change and just power up XGMI 
as soon as a KFD process starts. Because that's effectively what happens 
with your change as it is now, if you don't check the memory type. Every 
KFD process maps the signal page (in system memory) to all GPUs. So you 
will always increment the xgmi_map_count even before the first VRAM BO 
is allocated, let alone mapped to multiple GPUs.


> Anyway, how about adding a preferred_domain check for XGMI? I think even if
> user space uses the ioctl to change the preferred_domain, bo_add should still
> be called before the real mapping.

amdgpu_gem_op_ioctl with AMDGPU_GEM_OP_SET_PLACEMENT doesn't call 
bo_add. You'd have to add something in amdgpu_gem_op_ioctl to 
re-evaluate all bo_vas of the BO when its placement changes, update the 
bo_va->is_xgmi flag, and if necessary the xgmi_map_counter.

Regards,
   Felix


>
> Regards
> Shaoyun.liu
> 
> *From:* amd-gfx  on behalf of 
> Kuehling, Felix 
> *Sent:* March 25, 2019 6:28:32 PM
> *To:* Liu, Shaoyun; amd-gfx@lists.freedesktop.org
> *Subject:* Re: [PATCH] drm/amdgpu: XGMI pstate switch initial support
> I don't see any check for the memory type. As far as I can tell you'll
> power up XGMI even for system memory mappings. See inline.
>
> On 2019-03-22 3:28 p.m., Liu, Shaoyun wrote:
> > Driver vote low to high pstate switch whenever there is an outstanding
> > XGMI mapping request. Driver vote high to low pstate when all the
> > outstanding XGMI mapping is terminated.
> >
> > Change-Id: I197501f853c47f844055c0e28c0ac00a1ff06607
> > Signed-off-by: shaoyunl 
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  2 ++
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 21 +
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  4 
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 16 +++-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   | 10 ++
> >   6 files changed, 56 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index ec9562d..c4c61e9 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -2018,6 +2018,10 @@ static void 
> amdgpu_device_ip_late_init_func_handler(struct work_struct *work)
> >    r = amdgpu_device_enable_mgpu_fan_boost();
> >    if (r)
> >    DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
> > +
> > + /*set to low pstate by default */
> > + amdgpu_xgmi_set_pstate(adev, 0);
> > +
> >   }
> >
> >   static void amdgpu_device_delay_enable_gfx_off(struct work_struct 
> *work)
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> > index 220a6a7..c430e82 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> > @@ -72,6 +72,8 @@ struct amdgpu_bo_va {
> >
> >    /* If the mappings are cleared or filled */
> >    bool    cleared;
> > +
> > + bool    is_xgmi;
> >   };
> >
> >   struct amdgpu_bo {
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > index 729da1c..a7247d5 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > @@ -34,6 +34,7 @@
> >   #include "amdgpu_trace.h"
> >   #include "amdgpu_amdkfd.h"
> >   #include "amdgpu_gmc.h"
> > +#include "amdgpu_xgmi.h"
> >
> >   /**
> >    * DOC: GPUVM
> > @@ -2072,6 +2073,15 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct 
> amdgpu_device *adev,
> >    INIT_LIST_HEAD(&bo_va->valids);
> >    INIT_LIST_HEAD(&bo_va->invalids);
> >
> > + if (bo && amdgpu_xgmi_same_hive(adev, 
> amdgpu_ttm_adev(bo->tbo.bdev))) {
> > + bo_va->is_xgmi = true;
>
> You're setting this to true even for system memory BOs that don't
> involve XGMI mappings. That means you'll power up XGMI unnecessarily in
> many cases because KFD processes always have system memory mappings that
> are mapped to all GPUs (e.g. the signal page)
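
[Illustration, not part of the original mail: the "Add preferred_domain check when determine XGMI state" patch reviewed earlier in this archive addresses this by only setting is_xgmi when the BO's preferred domain includes VRAM:]

    if (bo && amdgpu_xgmi_same_hive(adev, amdgpu_ttm_adev(bo->tbo.bdev)) &&
        (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM)) {
            bo_va->is_xgmi = true;
            mutex_lock(&adev->vm_manager.lock_pstate);
            /* Power up XGMI if it can be potentially used */
            if (++adev->vm_manager.xgmi_map_counter == 1)
                    amdgpu_xgmi_set_pstate(adev, 1);
            mutex_unlock(&adev->vm_manager.lock_pstate);
    }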

Re: [PATCH 1/2] drm/amdgpu: move VM table mapping into the backend as well

2019-03-25 Thread Kuehling, Felix
The series is Reviewed-by: Felix Kuehling 

On 2019-03-25 8:22 a.m., Christian König wrote:
> Clean that up further and also fix another case where the BO
> wasn't kmapped for CPU based updates.
>
> Signed-off-by: Christian König 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 31 -
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c  | 11 
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 20 +
>   4 files changed, 37 insertions(+), 27 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index af1a7020c3ab..c9c8309a4d3f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -660,17 +660,7 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device 
> *adev, struct amdgpu_vm *vm,
>   if (bo->tbo.type != ttm_bo_type_kernel) {
>   amdgpu_vm_bo_moved(bo_base);
>   } else {
> - if (vm->use_cpu_for_update)
> - r = amdgpu_bo_kmap(bo, NULL);
> - else
> - r = amdgpu_ttm_alloc_gart(&bo->tbo);
> - if (r)
> - break;
> - if (bo->shadow) {
> - r = amdgpu_ttm_alloc_gart(&bo->shadow->tbo);
> - if (r)
> - break;
> - }
> + vm->update_funcs->map_table(bo);
>   amdgpu_vm_bo_relocated(bo_base);
>   }
>   }
> @@ -752,22 +742,17 @@ static int amdgpu_vm_clear_bo(struct amdgpu_device 
> *adev,
>   if (r)
>   return r;
>   
> - r = amdgpu_ttm_alloc_gart(&bo->tbo);
> - if (r)
> - return r;
> -
>   if (bo->shadow) {
>   r = ttm_bo_validate(&bo->shadow->tbo, &bo->shadow->placement,
>   &ctx);
>   if (r)
>   return r;
> -
> - r = amdgpu_ttm_alloc_gart(&bo->shadow->tbo);
> - if (r)
> - return r;
> -
>   }
>   
> + r = vm->update_funcs->map_table(bo);
> + if (r)
> + return r;
> +
>   memset(&params, 0, sizeof(params));
>   params.adev = adev;
>   params.vm = vm;
> @@ -878,12 +863,6 @@ static int amdgpu_vm_alloc_pts(struct amdgpu_device 
> *adev,
>   if (r)
>   return r;
>   
> - if (vm->use_cpu_for_update) {
> - r = amdgpu_bo_kmap(pt, NULL);
> - if (r)
> - goto error_free_pt;
> - }
> -
>   /* Keep a reference to the root directory to avoid
>* freeing them up in the wrong order.
>*/
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 520122be798b..3ec875c0cc76 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -215,7 +215,7 @@ struct amdgpu_vm_update_params {
>   };
>   
>   struct amdgpu_vm_update_funcs {
> -
> + int (*map_table)(struct amdgpu_bo *bo);
>   int (*prepare)(struct amdgpu_vm_update_params *p, void * owner,
>  struct dma_fence *exclusive);
>   int (*update)(struct amdgpu_vm_update_params *p,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
> index 9d53982021de..5222d165abfc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
> @@ -24,6 +24,16 @@
>   #include "amdgpu_object.h"
>   #include "amdgpu_trace.h"
>   
> +/**
> + * amdgpu_vm_cpu_map_table - make sure new PDs/PTs are kmapped
> + *
> + * @table: newly allocated or validated PD/PT
> + */
> +static int amdgpu_vm_cpu_map_table(struct amdgpu_bo *table)
> +{
> + return amdgpu_bo_kmap(table, NULL);
> +}
> +
>   /**
>* amdgpu_vm_cpu_prepare - prepare page table update with the CPU
>*
> @@ -110,6 +120,7 @@ static int amdgpu_vm_cpu_commit(struct 
> amdgpu_vm_update_params *p,
>   }
>   
>   const struct amdgpu_vm_update_funcs amdgpu_vm_cpu_funcs = {
> + .map_table = amdgpu_vm_cpu_map_table,
>   .prepare = amdgpu_vm_cpu_prepare,
>   .update = amdgpu_vm_cpu_update,
>   .commit = amdgpu_vm_cpu_commit
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> index e4bacdb44c68..4bccd69fe30d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> @@ -28,6 +28,25 @@
>   #define AMDGPU_VM_SDMA_MIN_NUM_DW   256u
>   #define AMDGPU_VM_SDMA_MAX_NUM_DW   (16u * 1024u)
>   
> +/**
> + * amdgpu_vm_sdma_map_table - make sure new PDs/PTs are GTT mapped
> + *
> + * @table: newly allocated or validated PD/PT
> + */
> +static int amdgpu_vm_sdma_map_table(struct amdgpu_bo *table)
> +{
> + int 

Re: Kernel panic while “ modprobe amdkfd ; modprobe -r amdkfd ; 4.14.35 kernel

2019-03-25 Thread Kuehling, Felix
On 2019-03-22 12:58 p.m., John Donnelly wrote:
> Hello ,
>
> I am investigating an issue reported by a test group concerning this driver.
> Their test loads and unloads every kernel module included in the 4.14.35
> kernel release. You don't even need an AMD platform; it occurs on any Intel
> or KVM VM instance too.
>
> Kernel panic while “  modprobe amdkfd ;  modprobe -r amdkfd  “
>
> [  329.425334]  ? __slab_free+0x9b/0x2ba
> [  329.427836]  ? process_slab+0x3c1/0x45c
> [  329.430336]  dev_printk_emit+0x4e/0x65
> [  329.432829]  __dev_printk+0x46/0x8b
> [  329.435183]  _dev_info+0x6c/0x85
> [  329.437435]  ? kfree+0x141/0x182
> [  329.439646]  kfd_module_exit+0x37/0x39 [amdkfd]
> [  329.442258]  SyS_delete_module+0x1c3/0x26f
> [  329.444722]  ? entry_SYSCALL_64_after_hwframe+0xaa/0x0
> [  329.447479]  ? entry_SYSCALL_64_after_hwframe+0xa3/0x0
> [  329.450206]  ? entry_SYSCALL_64_after_hwframe+0x9c/0x0
> [  329.452912]  ? entry_SYSCALL_64_after_hwframe+0x95/0x0
> [  329.455586]  do_syscall_64+0x79/0x1ae
> [  329.457766]  entry_SYSCALL_64_after_hwframe+0x151/0x0
> [  329.460369] RIP: 0033:0x7f1757a1b457
> [  329.462502] RSP: 002b:7ffd62ce1f48 EFLAGS: 0206 ORIG_RAX:
>
>
>
> Sometimes  the unload works but the message logged is garbage:
>
> [root@jpd-vmbase02 ~]# modprobe -r amdkfd
> [  144.449981]  hn??蟟??xn??ן??kfd: Removed module

I think this was caused by using dev_info with a kfd_device that didn't 
exist any more. It was fixed by this commit:

commit c393e9b2d51540b74e18e555df14706098dbf2cc
Author: Randy Dunlap 
Date:   Mon Nov 13 18:08:48 2017 +0200

     drm/amdkfd: fix amdkfd use-after-free GP fault

     Fix GP fault caused by dev_info() reference to a struct device*
     after the device has been freed (use after free).
     kfd_chardev_exit() frees the device so 'kfd_device' should not
     be used after calling kfd_chardev_exit().

     Signed-off-by: Randy Dunlap 
     Signed-off-by: Oded Gabbay 


>
>
> Is  this something one of team members could have possibly corrected in an 
> upstream version ?

In current kernels, amdkfd is no longer a separate KO. It's part of 
amdgpu now. Also see above. This bug is probably not reproducible any more.

Regards,
   Felix
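
[Illustration, not part of the original mail: the exact upstream diff isn't quoted here, but the pattern of the fix is to stop passing the freed kfd_device to dev_info() at the end of kfd_module_exit(), for example:]

    kfd_chardev_exit();
    /* kfd_device has been freed by kfd_chardev_exit(); log without it */
    pr_info("amdkfd: Removed module\n");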


>
> #define KFD_DRIVER_DESC "Standalone HSA driver for AMD's GPUs"
> #define KFD_DRIVER_DATE "20150421"
> #define KFD_DRIVER_MAJOR0
> #define KFD_DRIVER_MINOR7
> #define KFD_DRIVER_PATCHLEVEL   2
>
>
> Any advise welcome.
>
>
> Thank you,
>
> John

Re: [PATCH] drm/amdgpu: XGMI pstate switch initial support

2019-03-25 Thread Kuehling, Felix
I don't see any check for the memory type. As far as I can tell you'll 
power up XGMI even for system memory mappings. See inline.

On 2019-03-22 3:28 p.m., Liu, Shaoyun wrote:
> Driver vote low to high pstate switch whenever there is an outstanding
> XGMI mapping request. Driver vote high to low pstate when all the
> outstanding XGMI mapping is terminated.
>
> Change-Id: I197501f853c47f844055c0e28c0ac00a1ff06607
> Signed-off-by: shaoyunl 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 
>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  2 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 21 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  4 
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 16 +++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   | 10 ++
>   6 files changed, 56 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index ec9562d..c4c61e9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2018,6 +2018,10 @@ static void 
> amdgpu_device_ip_late_init_func_handler(struct work_struct *work)
>   r = amdgpu_device_enable_mgpu_fan_boost();
>   if (r)
>   DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
> +
> + /*set to low pstate by default */
> + amdgpu_xgmi_set_pstate(adev, 0);
> +
>   }
>   
>   static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> index 220a6a7..c430e82 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> @@ -72,6 +72,8 @@ struct amdgpu_bo_va {
>   
>   /* If the mappings are cleared or filled */
>   bool cleared;
> +
> + bool is_xgmi;
>   };
>   
>   struct amdgpu_bo {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 729da1c..a7247d5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -34,6 +34,7 @@
>   #include "amdgpu_trace.h"
>   #include "amdgpu_amdkfd.h"
>   #include "amdgpu_gmc.h"
> +#include "amdgpu_xgmi.h"
>   
>   /**
>* DOC: GPUVM
> @@ -2072,6 +2073,15 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct 
> amdgpu_device *adev,
>   INIT_LIST_HEAD(&bo_va->valids);
>   INIT_LIST_HEAD(&bo_va->invalids);
>   
> + if (bo && amdgpu_xgmi_same_hive(adev, amdgpu_ttm_adev(bo->tbo.bdev))) {
> + bo_va->is_xgmi = true;

You're setting this to true even for system memory BOs that don't 
involve XGMI mappings. That means you'll power up XGMI unnecessarily in 
many cases because KFD processes always have system memory mappings that 
are mapped to all GPUs (e.g. the signal page).

Regards,
   Felix


> + mutex_lock(&adev->vm_manager.lock_pstate);
> + /* Power up XGMI if it can be potentially used */
> + if (++adev->vm_manager.xgmi_map_counter == 1)
> + amdgpu_xgmi_set_pstate(adev, 1);
> + mutex_unlock(&adev->vm_manager.lock_pstate);
> + }
> +
>   return bo_va;
>   }
>   
> @@ -2490,6 +2500,14 @@ void amdgpu_vm_bo_rmv(struct amdgpu_device *adev,
>   }
>   
>   dma_fence_put(bo_va->last_pt_update);
> +
> + if (bo && bo_va->is_xgmi) {
> + mutex_lock(&adev->vm_manager.lock_pstate);
> + if (--adev->vm_manager.xgmi_map_counter == 0)
> + amdgpu_xgmi_set_pstate(adev, 0);
> + mutex_unlock(&adev->vm_manager.lock_pstate);
> + }
> +
>   kfree(bo_va);
>   }
>   
> @@ -2997,6 +3015,9 @@ void amdgpu_vm_manager_init(struct amdgpu_device *adev)
>   
>   idr_init(&adev->vm_manager.pasid_idr);
>   spin_lock_init(&adev->vm_manager.pasid_lock);
> +
> + adev->vm_manager.xgmi_map_counter = 0;
> + mutex_init(&adev->vm_manager.lock_pstate);
>   }
>   
>   /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 520122b..f586b38 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -324,6 +324,10 @@ struct amdgpu_vm_manager {
>*/
>   struct idr  pasid_idr;
>   spinlock_t  pasid_lock;
> +
> + /* counter of mapped memory through xgmi */
> + uint32_t xgmi_map_counter;
> + struct mutex lock_pstate;
>   };
>   
>   #define amdgpu_vm_copy_pte(adev, ib, pe, src, count) 
> ((adev)->vm_manager.vm_pte_funcs->copy_pte((ib), (pe), (src), (count)))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index fcc4b05..3368347 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ 

Re: [PATCH v13 14/20] drm/amdgpu, arm64: untag user pointers in amdgpu_ttm_tt_get_user_pages

2019-03-25 Thread Kuehling, Felix
On 2019-03-20 10:51 a.m., Andrey Konovalov wrote:
> This patch is a part of a series that extends arm64 kernel ABI to allow to
> pass tagged user pointers (with the top byte set to something else other
> than 0x00) as syscall arguments.
>
> amdgpu_ttm_tt_get_user_pages() uses provided user pointers for vma
> lookups, which can only by done with untagged pointers.
>
> Untag user pointers in this function.
>
> Signed-off-by: Andrey Konovalov 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 5 +++--
>   1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index 73e71e61dc99..891b027fa33b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -751,10 +751,11 @@ int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, 
> struct page **pages)
>* check that we only use anonymous memory to prevent problems
>* with writeback
>*/
> - unsigned long end = gtt->userptr + ttm->num_pages * PAGE_SIZE;
> + unsigned long userptr = untagged_addr(gtt->userptr);
> + unsigned long end = userptr + ttm->num_pages * PAGE_SIZE;
>   struct vm_area_struct *vma;
>   
> - vma = find_vma(mm, gtt->userptr);
> + vma = find_vma(mm, userptr);
>   if (!vma || vma->vm_file || vma->vm_end < end) {
>   up_read(&mm->mmap_sem);
>   return -EPERM;

We'll need to be careful that we don't break your change when the 
following commit gets applied through drm-next for Linux 5.2:

https://cgit.freedesktop.org/~agd5f/linux/commit/?h=drm-next-5.2-wip=915d3eecfa23693bac9e54cdacf84fb4efdcc5c4

Would it make sense to apply the untagging in amdgpu_ttm_tt_set_userptr 
instead? That would avoid this conflict and I think it would clearly put 
the untagging into the user mode code path where the tagged pointer 
originates.

In amdgpu_gem_userptr_ioctl and amdgpu_amdkfd_gpuvm.c (init_user_pages) 
we also set up an MMU notifier with the (tagged) pointer from user mode. 
That should probably also use the untagged address so that MMU notifiers 
for the untagged address get correctly matched up with the right BO. I'd 
move the untagging further up the call stack to cover that. For the GEM 
case I think amdgpu_gem_userptr_ioctl would be the right place. For the 
KFD case, I'd do this in amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu.

Regards,
   Felix
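
[Illustration, not part of the original mail: a rough sketch of untagging in amdgpu_ttm_tt_set_userptr(); the signature and body are approximations of the then-current function, so treat the details as assumptions rather than the actual driver code.]

    int amdgpu_ttm_tt_set_userptr(struct ttm_tt *ttm, uint64_t addr,
                                  uint32_t flags)
    {
            struct amdgpu_ttm_tt *gtt = (void *)ttm;

            if (gtt == NULL)
                    return -EINVAL;

            /* Untag once, where the pointer enters the driver, so that vma
             * lookups and MMU notifier registration all see the same
             * untagged address.
             */
            gtt->userptr = untagged_addr(addr);
            gtt->userflags = flags;
            /* usertask setup etc. unchanged, omitted here */
            return 0;
    }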


Re: [PATCH 5/8] drm/amdgpu: new VM update backends

2019-03-25 Thread Kuehling, Felix
On 2019-03-25 7:38 a.m., Christian König wrote:
> Am 20.03.19 um 12:57 schrieb Kuehling, Felix:
>> As far as I can tell, the whole series is a small cleanup and big
>> refactor to enable CPU clearing of PTs without a lot of ugliness or code
>> duplication.
>
> It's a bit more than that. Key point is that I can now easily add a 
> parameter for direct submission during page fault handling :)

You mean for working around problems with CPU page table updates from 
page faults, to force all such updates through SDMA?

Regards,
   Felix


>
> Christian.
>
>> It looks good to me. I haven't reviewed all the moved SDMA
>> update code to make sure it all works correctly, but at least the
>> prepare and commit functions look sane to me.
>>
>> For this patch I have a suggestion (inline) to remove params->ib, which
>> seems redundant with params->job. That would also ripple through the
>> remaining patches.
>>
>> Other than that, the series is Reviewed-by: Felix Kuehling
>> 
>>
>> [+Kent], Look out for this patch series in an upcoming merge to
>> amd-kfd-staging. I don't think it'll cause conflicts, but has a risk of
>> regressions (like all big amdgpu_vm changes IME).
>>
>> Regards,
>>     Felix
>>
>> On 3/19/2019 8:44 AM, Christian König wrote:
>>> Separate out all functions for SDMA and CPU based page table
>>> updates into separate backends.
>>>
>>> This way we can keep most of the complexity of those from the
>>> core VM code.
>>>
>>> Signed-off-by: Christian König 
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/Makefile |   3 +-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  |   7 +-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  30 ++-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c  | 116 +
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 248 
>>> 
>>>    5 files changed, 401 insertions(+), 3 deletions(-)
>>>    create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
>>>    create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
>> [snip]
>>> +/**
>>> + * amdgpu_vm_sdma_prepare - prepare SDMA command submission
>>> + *
>>> + * @p: see amdgpu_vm_update_params definition
>>> + * @owner: owner we need to sync to
>>> + * @exclusive: exclusive move fence we need to sync to
>>> + *
>>> + * Returns:
>>> + * Negativ errno, 0 for success.
>>> + */
>>> +static int amdgpu_vm_sdma_prepare(struct amdgpu_vm_update_params *p,
>>> +  void *owner, struct dma_fence *exclusive)
>>> +{
>>> +    struct amdgpu_bo *root = p->vm->root.base.bo;
>>> +    unsigned int ndw = AMDGPU_VM_SDMA_MIN_NUM_DW;
>>> +    int r;
>>> +
>>> +    r = amdgpu_job_alloc_with_ib(p->adev, ndw * 4, &p->job);
>>> +    if (r)
>>> +    return r;
>>> +
>>> +    r = amdgpu_sync_fence(p->adev, &p->job->sync, exclusive, false);
>>> +    if (r)
>>> +    return r;
>>> +
>>> +    r = amdgpu_sync_resv(p->adev, &p->job->sync, root->tbo.resv,
>>> + owner, false);
>>> +    if (r)
>>> +    return r;
>>> +
>>> +    p->num_dw_left = ndw;
>>> +    p->ib = &p->job->ibs[0];
>> With p->job added, do we still need p->ib? We could just use
>> &p->job->ibs[0] directly, which should perform the same or be more
>> efficient since it's just a constant offset from p->job.
>>
>>
>>> +    return 0;
>>> +}
>>> +
>>> +/**
>>> + * amdgpu_vm_sdma_commit - commit SDMA command submission
>>> + *
>>> + * @p: see amdgpu_vm_update_params definition
>>> + * @fence: resulting fence
>>> + *
>>> + * Returns:
>>> + * Negativ errno, 0 for success.
>>> + */
>>> +static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p,
>>> + struct dma_fence **fence)
>>> +{
>>> +    struct amdgpu_bo *root = p->vm->root.base.bo;
>>> +    struct amdgpu_ring *ring;
>>> +    struct dma_fence *f;
>>> +    int r;
>>> +
>>> +    ring = container_of(p->vm->entity.rq->sched, struct 
>>> amdgpu_ring, sched);
>>> +
>>> +    WARN_ON(p->ib->length_dw == 0);
>>> +    amdgpu_ring_pad_ib(ring, p->ib);
>>> +    WARN_ON(p->ib->length_dw > p->num_dw_left);
>>> +  

Re: [PATCH 5/8] drm/amdgpu: new VM update backends

2019-03-20 Thread Kuehling, Felix
As far as I can tell, the whole series is a small cleanup and big 
refactor to enable CPU clearing of PTs without a lot of ugliness or code 
duplication. It looks good to me. I haven't reviewed all the moved SDMA 
update code to make sure it all works correctly, but at least the 
prepare and commit functions look sane to me.

For this patch I have a suggestion (inline) to remove params->ib, which 
seems redundant with params->job. That would also ripple through the 
remaining patches.

Other than that, the series is Reviewed-by: Felix Kuehling 


[+Kent], Look out for this patch series in an upcoming merge to 
amd-kfd-staging. I don't think it'll cause conflicts, but has a risk of 
regressions (like all big amdgpu_vm changes IME).

Regards,
   Felix

On 3/19/2019 8:44 AM, Christian König wrote:
> Separate out all functions for SDMA and CPU based page table
> updates into separate backends.
>
> This way we can keep most of the complexity of those from the
> core VM code.
>
> Signed-off-by: Christian König 
> ---
>   drivers/gpu/drm/amd/amdgpu/Makefile |   3 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  |   7 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  30 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c  | 116 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 248 
>   5 files changed, 401 insertions(+), 3 deletions(-)
>   create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
>   create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
[snip]
> +/**
> + * amdgpu_vm_sdma_prepare - prepare SDMA command submission
> + *
> + * @p: see amdgpu_vm_update_params definition
> + * @owner: owner we need to sync to
> + * @exclusive: exclusive move fence we need to sync to
> + *
> + * Returns:
> + * Negativ errno, 0 for success.
> + */
> +static int amdgpu_vm_sdma_prepare(struct amdgpu_vm_update_params *p,
> +   void *owner, struct dma_fence *exclusive)
> +{
> + struct amdgpu_bo *root = p->vm->root.base.bo;
> + unsigned int ndw = AMDGPU_VM_SDMA_MIN_NUM_DW;
> + int r;
> +
> + r = amdgpu_job_alloc_with_ib(p->adev, ndw * 4, &p->job);
> + if (r)
> + return r;
> +
> + r = amdgpu_sync_fence(p->adev, &p->job->sync, exclusive, false);
> + if (r)
> + return r;
> +
> + r = amdgpu_sync_resv(p->adev, &p->job->sync, root->tbo.resv,
> +  owner, false);
> + if (r)
> + return r;
> +
> + p->num_dw_left = ndw;
> + p->ib = &p->job->ibs[0];

With p->job added, do we still need p->ib? We could just use 
&p->job->ibs[0] directly, which should perform the same or be more 
efficient since it's just a constant offset from p->job.
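
[Illustration, not part of the original mail: with p->ib dropped, amdgpu_vm_sdma_commit() could take the IB straight from the job, reusing only calls visible in the quoted patch:]

    struct amdgpu_ib *ib = &p->job->ibs[0];

    WARN_ON(ib->length_dw == 0);
    amdgpu_ring_pad_ib(ring, ib);
    WARN_ON(ib->length_dw > p->num_dw_left);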


> + return 0;
> +}
> +
> +/**
> + * amdgpu_vm_sdma_commit - commit SDMA command submission
> + *
> + * @p: see amdgpu_vm_update_params definition
> + * @fence: resulting fence
> + *
> + * Returns:
> + * Negativ errno, 0 for success.
> + */
> +static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p,
> +  struct dma_fence **fence)
> +{
> + struct amdgpu_bo *root = p->vm->root.base.bo;
> + struct amdgpu_ring *ring;
> + struct dma_fence *f;
> + int r;
> +
> + ring = container_of(p->vm->entity.rq->sched, struct amdgpu_ring, sched);
> +
> + WARN_ON(p->ib->length_dw == 0);
> + amdgpu_ring_pad_ib(ring, p->ib);
> + WARN_ON(p->ib->length_dw > p->num_dw_left);
> + r = amdgpu_job_submit(p->job, &p->vm->entity,
> +   AMDGPU_FENCE_OWNER_VM, &f);
> + if (r)
> + goto error;
> +
> + amdgpu_bo_fence(root, f, true);
> + if (fence)
> + swap(*fence, f);
> + dma_fence_put(f);
> + return 0;
> +
> +error:
> + amdgpu_job_free(p->job);
> + return r;
> +}
> +
> +
> +/**
> + * amdgpu_vm_sdma_copy_ptes - copy the PTEs from mapping
> + *
> + * @p: see amdgpu_vm_update_params definition
> + * @bo: PD/PT to update
> + * @pe: addr of the page entry
> + * @count: number of page entries to copy
> + *
> + * Traces the parameters and calls the DMA function to copy the PTEs.
> + */
> +static void amdgpu_vm_sdma_copy_ptes(struct amdgpu_vm_update_params *p,
> +  struct amdgpu_bo *bo, uint64_t pe,
> +  unsigned count)
> +{
> + uint64_t src = p->ib->gpu_addr;
> +
> + src += p->num_dw_left * 4;
> +
> + pe += amdgpu_bo_gpu_offset(bo);
> + trace_amdgpu_vm_copy_ptes(pe, src, count);
> +
> + amdgpu_vm_copy_pte(p->adev, p->ib, pe, src, count);
> +}
> +
> +/**
> + * amdgpu_vm_sdma_set_ptes - helper to call the right asic function
> + *
> + * @p: see amdgpu_vm_update_params definition
> + * @bo: PD/PT to update
> + * @pe: addr of the page entry
> + * @addr: dst addr to write into pe
> + * @count: number of page entries to update
> + * @incr: increase next addr by incr bytes
> + * @flags: hw access flags
> + *
> + * Traces the parameters and calls the 

Re: [PATCH] drm/amdgpu: revert "XGMI pstate switch initial support"

2019-03-19 Thread Kuehling, Felix
On 3/19/2019 8:49 AM, Christian König wrote:
> Yeah, all that is perfectly fine.
>
> The problem is Shaoyun didn't put this into the mapping code, but 
> rather into the VM state machine. So this won't work at all (the 
> counter and increment/decrement unbalanced and multiple times).

We tried to consider all the possible ways that this could go wrong. 
Basically, every time a mapping is updated, we update the is_xgmi state 
and update the counter if it changed. Have you seen the counter become 
unbalanced?


>
> The correct place to add this is amdgpu_vm_bo_add/amdgpu_vm_bo_rmv.

I think we considered that. The problem is that a BO can be migrated 
between bo_add and bo_rmv. I found that even bo->preferred_domain can 
change with AMDGPU_GEM_OP_SET_PLACEMENT. So you can't reliably know 
whether to increment your counter, and your counter can become 
unbalanced if a migration or AMDGPU_GEM_OP_SET_PLACEMENT happens between 
bo_add and bo_rmv.

Therefore we're trying to check for XGMI mappings every time the mapping 
changes and keep track of the state in amdgpu_bo_va_mapping.


>
> Additional to that the approach with the counter doesn't work because 
> you don't have a lock protecting the hw update itself. E.g. while 
> powering down you can add a mapping which needs to power it up again 
> and so powering down and powering up race with each other.

That's a good point.

Regards,
   Felix


>
> Regards,
> Christian.
>
> Am 19.03.19 um 13:42 schrieb Kuehling, Felix:
>> We discussed a few different approaches before settling on this one.
>>
>> Maybe it needs some more background. XGMI links are quite power hungry.
>> Being able to power them down improves performance for power-limited
>> workloads that don't need XGMI. In machine learning, pretty much all
>> workloads are power limited on our GPUs, so this is not just a
>> theoretical thing. The problem is, how do you know whether you need
>> XGMI? You need to know whether there are P2P memory mappings involving
>> XGMI. So the natural place to put that is in the memory mapping code.
>>
>> If you do spot a race condition in the code, let's talk about how to 
>> fix it.
>>
>> Regards,
>>     Felix
>>
>> On 3/19/2019 8:07 AM, Christian König wrote:
>>> This reverts commit c9115f8904eef0f880d3b4f8306f553b1bb1c532.
>>>
>>> Adding this to the mapping is complete nonsense and the whole
>>> implementation looks racy. This patch wasn't thoughtfully reviewed
>>> and should be reverted for now.
>>>
>>> Signed-off-by: Christian König 
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h    |  3 ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  3 ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  1 -
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 29 
>>> +-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 15 ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 --
>>>    6 files changed, 1 insertion(+), 52 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index b5720c1610e1..1db192150532 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -931,9 +931,6 @@ struct amdgpu_device {
>>>    int asic_reset_res;
>>>    struct work_struct    xgmi_reset_work;
>>>    -    /* counter of mapped memory through xgmi */
>>> -    atomic_t    xgmi_map_counter;
>>> -
>>>    bool    in_baco_reset;
>>>    };
>>>    diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 964a4d3f1f43..206583707124 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -2018,9 +2018,6 @@ static void 
>>> amdgpu_device_ip_late_init_func_handler(struct work_struct *work)
>>>    r = amdgpu_device_enable_mgpu_fan_boost();
>>>    if (r)
>>>    DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
>>> -
>>> -    /*set to low pstate by default */
>>> -    amdgpu_xgmi_set_pstate(adev, 0);
>>>    }
>>>       static void amdgpu_device_delay_enable_gfx_off(struct 
>>> work_struct *work)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> index 6f176bbe4cf2..220a6a7b1bc1 100644
>>> --- a/drivers/

Re: [PATCH] drm/amdgpu: revert "XGMI pstate switch initial support"

2019-03-19 Thread Kuehling, Felix
We discussed a few different approaches before settling on this one.

Maybe it needs some more background. XGMI links are quite power hungry. 
Being able to power them down improves performance for power-limited 
workloads that don't need XGMI. In machine learning, pretty much all 
workloads are power limited on our GPUs, so this is not just a 
theoretical thing. The problem is, how do you know whether you need 
XGMI? You need to know whether there are P2P memory mappings involving 
XGMI. So the natural place to put that is in the memory mapping code.

If you do spot a race condition in the code, let's talk about how to fix it.

Regards,
   Felix

On 3/19/2019 8:07 AM, Christian König wrote:
> This reverts commit c9115f8904eef0f880d3b4f8306f553b1bb1c532.
>
> Adding this to the mapping is complete nonsense and the whole
> implementation looks racy. This patch wasn't thoughtfully reviewed
> and should be reverted for now.
>
> Signed-off-by: Christian König 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h|  3 ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  3 ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  1 -
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 29 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 15 ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 --
>   6 files changed, 1 insertion(+), 52 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index b5720c1610e1..1db192150532 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -931,9 +931,6 @@ struct amdgpu_device {
>   int asic_reset_res;
>   struct work_struct  xgmi_reset_work;
>   
> - /* counter of mapped memory through xgmi */
> - atomic_t xgmi_map_counter;
> -
>   bool in_baco_reset;
>   };
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 964a4d3f1f43..206583707124 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2018,9 +2018,6 @@ static void 
> amdgpu_device_ip_late_init_func_handler(struct work_struct *work)
>   r = amdgpu_device_enable_mgpu_fan_boost();
>   if (r)
>   DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
> -
> - /*set to low pstate by default */
> - amdgpu_xgmi_set_pstate(adev, 0);
>   }
>   
>   static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> index 6f176bbe4cf2..220a6a7b1bc1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> @@ -54,7 +54,6 @@ struct amdgpu_bo_va_mapping {
>   uint64_t __subtree_last;
>   uint64_t offset;
>   uint64_t flags;
> - bool is_xgmi;
>   };
>   
>   /* User space allocated BO in a VM */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index c5230a9fb7f6..c8f0e4ca05fb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -34,7 +34,6 @@
>   #include "amdgpu_trace.h"
>   #include "amdgpu_amdkfd.h"
>   #include "amdgpu_gmc.h"
> -#include "amdgpu_xgmi.h"
>   
>   /**
>* DOC: GPUVM
> @@ -2022,9 +2021,8 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev,
>   struct ttm_mem_reg *mem;
>   struct drm_mm_node *nodes;
>   struct dma_fence *exclusive, **last_update;
> - struct amdgpu_device *bo_adev = adev;
> - bool is_xgmi = false;
>   uint64_t flags;
> + struct amdgpu_device *bo_adev = adev;
>   int r;
>   
>   if (clear || !bo) {
> @@ -2046,10 +2044,6 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev,
>   if (bo) {
>   flags = amdgpu_ttm_tt_pte_flags(adev, bo->tbo.ttm, mem);
>   bo_adev = amdgpu_ttm_adev(bo->tbo.bdev);
> - if (adev != bo_adev &&
> - adev->gmc.xgmi.hive_id &&
> - adev->gmc.xgmi.hive_id == bo_adev->gmc.xgmi.hive_id)
> - is_xgmi = true;
>   } else {
>   flags = 0x0;
>   }
> @@ -2068,19 +2062,6 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev,
>   }
>   
>   list_for_each_entry(mapping, _va->invalids, list) {
> - if (mapping->is_xgmi != is_xgmi) {
> - if (is_xgmi) {
> - /* Adding an XGMI mapping to the PT */
> - if (atomic_inc_return(&adev->xgmi_map_counter) 
> == 1)
> - amdgpu_xgmi_set_pstate(adev, 1);
> - } else {
> - /* Removing an XGMI mapping from the PT */
> 

Re: [PATCH] drm/amdkfd: Fix unchecked return value

2019-03-18 Thread Kuehling, Felix
Alex already applied an equivalent patch by Colin King (attached for 
reference).

Regards,
   Felix

On 3/18/2019 2:05 PM, Gustavo A. R. Silva wrote:
> Assign return value of function amdgpu_bo_sync_wait() to variable ret
> for its further check.
>
> Addresses-Coverity-ID: 1443914 ("Logically dead code")
> Fixes: c60cd590cb7d ("drm/amdgpu: Replace ttm_bo_wait with 
> amdgpu_bo_sync_wait")
> Signed-off-by: Gustavo A. R. Silva 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 3 ++-
>   1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 1921dec3df7a..fb621abcb006 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -906,7 +906,8 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void 
> **process_info,
>   pr_err("validate_pt_pd_bos() failed\n");
>   goto validate_pd_fail;
>   }
> - amdgpu_bo_sync_wait(vm->root.base.bo, AMDGPU_FENCE_OWNER_KFD, false);
> + ret = amdgpu_bo_sync_wait(vm->root.base.bo, AMDGPU_FENCE_OWNER_KFD,
> +   false);
>   if (ret)
>   goto wait_pd_fail;
>   amdgpu_bo_fence(vm->root.base.bo,
--- Begin Message ---
From: Colin Ian King 

An earlier commit replaced ttm_bo_wait with amdgpu_bo_sync_wait and
removed the error return assignment to variable ret. Fix this by adding
the assignment back. Also break line to clean up checkpatch overly
long line warning.

Detected by CoverityScan, CID#1477327 ("Logically dead code")

Fixes: c60cd590cb7d ("drm/amdgpu: Replace ttm_bo_wait with amdgpu_bo_sync_wait")
Signed-off-by: Colin Ian King 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 1921dec3df7a..92993baac91a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -906,7 +906,8 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void 
**process_info,
pr_err("validate_pt_pd_bos() failed\n");
goto validate_pd_fail;
}
-   amdgpu_bo_sync_wait(vm->root.base.bo, AMDGPU_FENCE_OWNER_KFD, false);
+   ret = amdgpu_bo_sync_wait(vm->root.base.bo,
+ AMDGPU_FENCE_OWNER_KFD, false);
if (ret)
goto wait_pd_fail;
amdgpu_bo_fence(vm->root.base.bo,
-- 
2.20.1

--- End Message ---

Re: Slow memory access when using OpenCL without X11

2019-03-15 Thread Kuehling, Felix
Hi Lauri,

Thanks for your persistence. Seeing that this is reproducible on several boards 
with up-to-date BIOS is really helpful and gives me some confidence that it's 
more than a weird vendor or board-specific corner case and that we should be 
able to reproduce it. Yong is going to start looking into this problem.

Regards,
  Felix

On 3/14/2019 12:41 PM, Lauri Ehrenpreis wrote:
Yes it affects this a bit but it doesn't get the speed up to "normal" level. I 
got best results with "profile_peak" - then the memcpy speed on CPU is 1/3 of 
what it is without opencl initialization:

 echo "profile_peak" > 
/sys/class/drm/card0/device/power_dpm_force_performance_level
./cl_slow_test 1 5
got 1 platforms 1 devices
speed 3710.360352 avg 3710.360352 mbytes/s
speed 3713.660400 avg 3712.010254 mbytes/s
speed 3797.630859 avg 3740.550537 mbytes/s
speed 3708.004883 avg 3732.414062 mbytes/s
speed 3796.403076 avg 3745.211914 mbytes/s

Without calling clCreateContext:
./cl_slow_test 0 5
speed 7299.201660 avg 7299.201660 mbytes/s
speed 9298.841797 avg 8299.021484 mbytes/s
speed 9360.181641 avg 8652.742188 mbytes/s
speed 9004.759766 avg 8740.746094 mbytes/s
speed 9414.607422 avg 8875.518555 mbytes/s

--
Lauri

On Thu, Mar 14, 2019 at 5:46 PM Ernst Sjöstrand wrote:
Does
echo high > /sys/class/drm/card0/device/power_dpm_force_performance_level
or setting cpu scaling governor to performance affect it at all?

Regards
//Ernst

On Thu, 14 Mar 2019 at 14:31, Lauri Ehrenpreis wrote:
>
> I tried also with those 2 boards now:
> https://www.asrock.com/MB/AMD/Fatal1ty%20B450%20Gaming-ITXac/index.asp
> https://www.msi.com/Motherboard/B450I-GAMING-PLUS-AC
>
> Both are using latest BIOS, ubuntu 18.10, kernel 
> https://kernel.ubuntu.com/~kernel-ppa/mainline/v5.0.2/
>
> There are some differences in dmesg (asrock has some amdgpu assert in dmesg) 
> but otherwise results are exactly the same.
> In desktop env cl_slow_test works fast, over ssh terminal it doesn't. If i 
> move mouse then it starts working fast in terminal as well.
>
> So one can't use OpenCL without monitor and desktop env running and this 
> happens with 2 different chipsets (b350 & b450), latest bios from 3 different 
> vendors, latest kernel and latest rocm. This doesn't look like edge case with 
> unusual setup to me..
>
> Attached dmesg, dmidecode, and clinfo from both boards.
>
> --
> Lauri
>
> On Wed, Mar 13, 2019 at 10:15 PM Lauri Ehrenpreis wrote:
>>
>> For reproduction only the tiny cl_slow_test.cpp is needed which is attached 
>> to first e-mail.
>>
>> System information is following:
>> CPU: Ryzen5 2400G
>> Main board: Gigabyte AMD B450 AORUS mini itx: 
>> https://www.gigabyte.com/Motherboard/B450-I-AORUS-PRO-WIFI-rev-10#kf
>> BIOS: F5 8.47 MB 2019/01/25 (latest)
>> Kernel: https://kernel.ubuntu.com/~kernel-ppa/mainline/v5.0/  (amd64)
>> OS: Ubuntu 18.04 LTS
>> rocm-opencl-dev installation:
>> wget -qO - http://repo.radeon.com/rocm/apt/debian/rocm.gpg.key | sudo 
>> apt-key add -
>> echo 'deb [arch=amd64] http://repo.radeon.com/rocm/apt/debian/ xenial main' 
>> | sudo tee /etc/apt/sources.list.d/rocm.list
>> sudo apt install rocm-opencl-dev
>>
>> Also exactly the same issue happens with this board: 
>> https://www.gigabyte.com/Motherboard/GA-AB350-Gaming-3-rev-1x#kf
>>
>> I have MSI and Asrock mini itx boards ready as well, So far didn't get 
>> amdgpu & opencl working there but I'll try again tomorrow..
>>
>> --
>> Lauri
>>
>>
>> On Wed, Mar 13, 2019 at 8:51 PM Kuehling, Felix wrote:
>>>
>>> Hi Lauri,
>>>
>>> I still think the SMU is doing something funny, but rocm-smi isn't
>>> showing enough information to really see what's going on.
>>>
>>> On APUs the SMU firmware is embedded in the system BIOS. Unlike discrete
>>> GPUs, the SMU firmware is not loaded by the driver. You could try
>>> updating your system BIOS to the latest version available from your main
>>> board vendor and see if that makes a difference. It may include a newer
>>> version of the SMU firmware, potentially with a fix.
>>>
>>> If that doesn't help, we'd have to reproduce the problem in house to see
>>> what's happening, which may require the same main board and BIOS version
>>> you're using. We can ask our SMU firmware team if they've ever
>>> encountered your type of problem. But I don't want to give you too much
>>> hope. It's a tricky problem involving HW, fir

Re: Slow memory access when using OpenCL without X11

2019-03-13 Thread Kuehling, Felix
Hi Lauri,

I still think the SMU is doing something funny, but rocm-smi isn't 
showing enough information to really see what's going on.

On APUs the SMU firmware is embedded in the system BIOS. Unlike discrete 
GPUs, the SMU firmware is not loaded by the driver. You could try 
updating your system BIOS to the latest version available from your main 
board vendor and see if that makes a difference. It may include a newer 
version of the SMU firmware, potentially with a fix.

If that doesn't help, we'd have to reproduce the problem in house to see 
what's happening, which may require the same main board and BIOS version 
you're using. We can ask our SMU firmware team if they've ever 
encountered your type of problem. But I don't want to give you too much 
hope. It's a tricky problem involving HW, firmware and multiple driver 
components in a fairly unusual configuration.

Regards,
   Felix

On 2019-03-13 7:28 a.m., Lauri Ehrenpreis wrote:
> What I observe is that moving the mouse made the memory speed go up 
> and also it made mclk=1200Mhz in rocm-smi output.
> However if I force mclk to 1200Mhz myself then memory speed is still 
> slow.
>
> So rocm-smi output when memory speed went fast due to mouse movement:
> rocm-smi
>         ROCm System Management Interface 
> 
> 
> GPU   Temp   AvgPwr   SCLK    MCLK    PCLK      Fan     Perf    
> PwrCap   SCLK OD   MCLK OD GPU%
> GPU[0] : WARNING: Empty SysFS value: pclk
> GPU[0] : WARNING: Unable to read 
> /sys/class/drm/card0/device/gpu_busy_percent
> 0     44.0c  N/A      400Mhz  1200Mhz N/A       0%      manual  N/A    
>   0%        0%  N/A
> 
>                End of ROCm SMI Log            
>   
>
> And rocm-smi output when I forced memclk=1200MHz myself:
> rocm-smi --setmclk 2
> rocm-smi
>         ROCm System Management Interface 
> 
> 
> GPU   Temp   AvgPwr   SCLK    MCLK     PCLK   Fan   Perf    PwrCap   SCLK OD   MCLK OD   GPU%
> GPU[0] : WARNING: Empty SysFS value: pclk
> GPU[0] : WARNING: Unable to read /sys/class/drm/card0/device/gpu_busy_percent
> 0     39.0c  N/A      400Mhz  1200Mhz  N/A    0%    manual  N/A      0%        0%        N/A
> 
>                End of ROCm SMI Log            
>   
>
> So only difference is that temperature shows 44c when memory speed was 
> fast and 39c when it was slow. But mclk was 1200MHz and sclk was 
> 400MHz in both cases.
> Can it be that rocm-smi just has a bug in reporting and mclk was not 
> actually 1200MHz when I forced it with rocm-smi --setmclk 2 ?
> That would explain the different behaviour..
>
> If so then is there a programmatic way how to really guarantee the 
> high speed mclk? Basically I want do something similar in my program 
> what happens if I move
> the mouse in desktop env and this way guarantee the normal memory 
> speed each time the program starts.
>
> --
> Lauri
>
>
> On Tue, Mar 12, 2019 at 11:36 PM Deucher, Alexander wrote:
>
> Forcing the sclk and mclk high may impact the CPU frequency since
> they share TDP.
>
> Alex
> 
> *From:* amd-gfx  on behalf of Lauri Ehrenpreis
> *Sent:* Tuesday, March 12, 2019 5:31 PM
> *To:* Kuehling, Felix
> *Cc:* Tom St Denis; amd-gfx@lists.freedesktop.org
> *Subject:* Re: Slow memory access when using OpenCL without X11
> However it's not only related to mclk and sclk. I tried this:
> rocm-smi  --setsclk 2
> rocm-smi  --setmclk 3
> rocm-smi
>         ROCm System Management Interface
> 
> 
> 
> GPU   Temp   AvgPwr   SCLK    MCLK    PCLK   Fan   Perf   PwrCap   SCLK OD   MCLK OD   GPU%
> GPU[0] : WARNING: Empty SysFS value: pclk
> GPU[0] : WARNING: Unable to read /sys/class/drm/card0/device/gpu_busy_percent

Re: [PATCH 1/3] drm/amdgpu: re-enable retry faults

2019-03-13 Thread Kuehling, Felix
The series is Reviewed-by: Felix Kuehling 

On 2019-03-13 9:44 a.m., Christian König wrote:
> Now that we have re-reoute faults to the other IH
> ring we can enable retries again.
>
> Signed-off-by: Christian König 
> ---
>   drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c | 2 +-
>   drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c  | 2 +-
>   2 files changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
> index c10ed568ca6c..7bb5359d0bbd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
> @@ -236,7 +236,7 @@ static void gfxhub_v1_0_setup_vmid_config(struct 
> amdgpu_device *adev)
>   block_size);
>   /* Send no-retry XNACK on fault to suppress VM fault storm. */
>   tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
> - RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0);
> + RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1);
>   WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_CNTL, i, tmp);
>   WREG32_SOC15_OFFSET(GC, 0, 
> mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);
>   WREG32_SOC15_OFFSET(GC, 0, 
> mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);
> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c 
> b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
> index 2a039946a549..1741056e6af6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
> @@ -255,7 +255,7 @@ static void mmhub_v1_0_setup_vmid_config(struct 
> amdgpu_device *adev)
>   block_size);
>   /* Send no-retry XNACK on fault to suppress VM fault storm. */
>   tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
> - RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0);
> + RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1);
>   WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_CNTL, i, tmp);
>   WREG32_SOC15_OFFSET(MMHUB, 0, 
> mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);
>   WREG32_SOC15_OFFSET(MMHUB, 0, 
> mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 2/3] drm/amdgpu: support userptr cross VMAs case with HMM v3

2019-03-12 Thread Kuehling, Felix
This patch is Reviewed-by: Felix Kuehling 

Regards,
   Felix

On 3/12/2019 9:17 PM, Yang, Philip wrote:
> userptr may cross two VMAs if the forked child process (which does not call
> exec after fork) mallocs a buffer, frees it, and then mallocs a larger
> buffer; the kernel will create a new VMA adjacent to the old VMA that was
> cloned from the parent process, so some pages of the userptr are in the
> first VMA and the rest are in the second VMA.
>
> HMM expects range only have one VMA, loop over all VMAs in the address
> range, create multiple ranges to handle this case. See
> is_mergeable_anon_vma in mm/mmap.c for details.
>
> Change-Id: I0ca8c77e28deabccc139906f9ffee04b7e383314
> Signed-off-by: Philip Yang 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 126 +---
>   1 file changed, 91 insertions(+), 35 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index c1240bf243ba..c14198737dcd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -711,7 +711,8 @@ struct amdgpu_ttm_tt {
>   struct task_struct  *usertask;
>   uint32_tuserflags;
>   #if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
> - struct hmm_rangerange;
> + struct hmm_range*ranges;
> + int nr_ranges;
>   #endif
>   };
>   
> @@ -723,62 +724,108 @@ struct amdgpu_ttm_tt {
>* once afterwards to stop HMM tracking
>*/
>   #if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
> +
> +/* Support Userptr pages cross max 16 vmas */
> +#define MAX_NR_VMAS  (16)
> +
>   int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages)
>   {
>   struct amdgpu_ttm_tt *gtt = (void *)ttm;
>   struct mm_struct *mm = gtt->usertask->mm;
> - unsigned long end = gtt->userptr + ttm->num_pages * PAGE_SIZE;
> - struct hmm_range *range = >range;
> - int r = 0, i;
> + unsigned long start = gtt->userptr;
> + unsigned long end = start + ttm->num_pages * PAGE_SIZE;
> + struct hmm_range *ranges;
> + struct vm_area_struct *vma = NULL, *vmas[MAX_NR_VMAS];
> + uint64_t *pfns, f;
> + int r = 0, i, nr_pages;
>   
>   if (!mm) /* Happens during process shutdown */
>   return -ESRCH;
>   
> - amdgpu_hmm_init_range(range);
> -
>   down_read(>mmap_sem);
>   
> - range->vma = find_vma(mm, gtt->userptr);
> - if (!range_in_vma(range->vma, gtt->userptr, end))
> - r = -EFAULT;
> - else if ((gtt->userflags & AMDGPU_GEM_USERPTR_ANONONLY) &&
> - range->vma->vm_file)
> + /* user pages may cross multiple VMAs */
> + gtt->nr_ranges = 0;
> + do {
> + unsigned long vm_start;
> +
> + if (gtt->nr_ranges >= MAX_NR_VMAS) {
> + DRM_ERROR("Too many VMAs in userptr range\n");
> + r = -EFAULT;
> + goto out;
> + }
> +
> + vm_start = vma ? vma->vm_end : start;
> + vma = find_vma(mm, vm_start);
> + if (unlikely(!vma || vm_start < vma->vm_start)) {
> + r = -EFAULT;
> + goto out;
> + }
> + vmas[gtt->nr_ranges++] = vma;
> + } while (end > vma->vm_end);
> +
> + DRM_DEBUG_DRIVER("0x%lx nr_ranges %d pages 0x%lx\n",
> + start, gtt->nr_ranges, ttm->num_pages);
> +
> + if (unlikely((gtt->userflags & AMDGPU_GEM_USERPTR_ANONONLY) &&
> + vmas[0]->vm_file)) {
>   r = -EPERM;
> - if (r)
>   goto out;
> + }
>   
> - range->pfns = kvmalloc_array(ttm->num_pages, sizeof(uint64_t),
> -  GFP_KERNEL);
> - if (range->pfns == NULL) {
> + ranges = kvmalloc_array(gtt->nr_ranges, sizeof(*ranges), GFP_KERNEL);
> + if (unlikely(!ranges)) {
>   r = -ENOMEM;
>   goto out;
>   }
> - range->start = gtt->userptr;
> - range->end = end;
>   
> - range->pfns[0] = range->flags[HMM_PFN_VALID];
> - range->pfns[0] |= amdgpu_ttm_tt_is_readonly(ttm) ?
> - 0 : range->flags[HMM_PFN_WRITE];
> - for (i = 1; i < ttm->num_pages; i++)
> - range->pfns[i] = range->pfns[0];
> + pfns = kvmalloc_array(ttm->num_pages, sizeof(*pfns), GFP_KERNEL);
> + if (unlikely(!pfns)) {
> + r = -ENOMEM;
> + goto out_free_ranges;
> + }
> +
> + for (i = 0; i < gtt->nr_ranges; i++)
> + amdgpu_hmm_init_range([i]);
> +
> + f = ranges[0].flags[HMM_PFN_VALID];
> + f |= amdgpu_ttm_tt_is_readonly(ttm) ?
> + 0 : ranges[0].flags[HMM_PFN_WRITE];
> + memset64(pfns, f, ttm->num_pages);
> +
> + for (nr_pages = 0, i = 0; i < gtt->nr_ranges; i++) {
> + ranges[i].vma = vmas[i];
> + ranges[i].start = max(start, vmas[i]->vm_start);
> + ranges[i].end = min(end, 

Re: [PATCH 1/1] drm/amdgpu: Wait for newly allocated PTs to be idle

2019-03-12 Thread Kuehling, Felix
When we use SDMA, we don't wait on the CPU. The GPU scheduler waits for 
the fences on the root PD reservation before executing the SDMA IB. 
amdgpu_vm_bo_update_mapping gets those fences and builds the sync object 
for the scheduler after all the page tables have been allocated, so it 
should be no problem.

Regards,
   Felix

On 2019-03-12 6:13 p.m., Liu, Shaoyun wrote:
> Hi,
>
> I think even when we use SDMA to update PTEs we may still need to wait for
> the clear job to complete if we cannot guarantee that the clear and set-PTE
> jobs will use the exact same SDMA engine (do we use a dedicated SDMA engine
> for PTE updates, including clears?). But if we didn't use the same
> engine, that may explain why the test failed occasionally.
>
> Regards
>
> shaoyun.liu
>
>
>
> On 2019-03-12 5:20 p.m., Kuehling, Felix wrote:
>> When page tables are updated by the CPU, synchronize with the
>> allocation and initialization of newly allocated page tables.
>>
>> Signed-off-by: Felix Kuehling 
>> ---
>>drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 20 +---
>>1 file changed, 13 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index 8603c85..4303436 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -899,17 +899,17 @@ static void amdgpu_vm_bo_param(struct amdgpu_device 
>> *adev, struct amdgpu_vm *vm,
>>}
>>
>>/**
>> - * amdgpu_vm_alloc_pts - Allocate page tables.
>> + * amdgpu_vm_alloc_pts - Allocate a specific page table
>> *
>> * @adev: amdgpu_device pointer
>> * @vm: VM to allocate page tables for
>> - * @saddr: Start address which needs to be allocated
>> - * @size: Size from start address we need.
>> + * @cursor: Which page table to allocate
>> *
>> - * Make sure the page directories and page tables are allocated
>> + * Make sure a specific page table or directory is allocated.
>> *
>> * Returns:
>> - * 0 on success, errno otherwise.
>> + * 1 if page table needed to be allocated, 0 if page table was already
>> + * allocated, negative errno if an error occurred.
>> */
>>static int amdgpu_vm_alloc_pts(struct amdgpu_device *adev,
>> struct amdgpu_vm *vm,
>> @@ -956,7 +956,7 @@ static int amdgpu_vm_alloc_pts(struct amdgpu_device 
>> *adev,
>>  if (r)
>>  goto error_free_pt;
>>
>> -return 0;
>> +return 1;
>>
>>error_free_pt:
>>  amdgpu_bo_unref(>shadow);
>> @@ -1621,10 +1621,12 @@ static int amdgpu_vm_update_ptes(struct 
>> amdgpu_pte_update_params *params,
>>  unsigned shift, parent_shift, mask;
>>  uint64_t incr, entry_end, pe_start;
>>  struct amdgpu_bo *pt;
>> +bool need_to_sync;
>>
>>  r = amdgpu_vm_alloc_pts(params->adev, params->vm, );
>> -if (r)
>> +if (r < 0)
>>  return r;
>> +need_to_sync = (r && params->vm->use_cpu_for_update);
>>
>>  pt = cursor.entry->base.bo;
>>
>> @@ -1672,6 +1674,10 @@ static int amdgpu_vm_update_ptes(struct 
>> amdgpu_pte_update_params *params,
>>  entry_end += cursor.pfn & ~(entry_end - 1);
>>  entry_end = min(entry_end, end);
>>
>> +if (need_to_sync)
>> +r = amdgpu_bo_sync_wait(params->vm->root.base.bo,
>> +AMDGPU_FENCE_OWNER_VM, true);
>> +
>>  do {
>>  uint64_t upd_end = min(entry_end, frag_end);
>>  unsigned nptes = (upd_end - frag_start) >> shift;
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 3/6] drm/amdgpu: allocate VM PDs/PTs on demand

2019-03-12 Thread Kuehling, Felix
Never mind. I must have messed up my build. I can't reproduce the 
problem any more. The patch I sent out is still needed and valid. AFAICT 
it should be all that's needed to fix GPUVM for KFD.

I have not seen any faults with KFDCWSRTest.BasicTest on my system with 
Fiji or Vega10 with that patch applied.

Regards,
   Felix

On 2019-03-12 5:19 p.m., Felix Kuehling wrote:
> I'm also still seeing VM faults in the eviction test even with my fix, 
> and even with SDMA page table updates. There is still something else 
> going wrong. :/
>
> Thanks,
>   Felix
>
> On 2019-03-12 5:13 p.m., Yang, Philip wrote:
> A VM fault happens in about 1 of 10 runs of KFDCWSRTest.BasicTest for me. I am
> using SDMA for page table updates. I didn't try CPU page table updates.
>>
>> Philip
>>
>> On 2019-03-12 11:12 a.m., Russell, Kent wrote:
>>> Peculiar, I hit it immediately when I ran it. Can you try using
>>> --gtest_filter=KFDCWSRTest.BasicTest? That one hung every time for me.
>>>
>>>    Kent
>>>
>>>> -Original Message-
>>>> From: Christian König 
>>>> Sent: Tuesday, March 12, 2019 11:09 AM
>>>> To: Russell, Kent ; Koenig, Christian
>>>> ; Kuehling, Felix ;
>>>> amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH 3/6] drm/amdgpu: allocate VM PDs/PTs on demand
>>>>
>>>> Yeah, same problem here.
>>>>
>>>> I removed the libhsakmt package and installed it manually and now it
>>>> seems to work.
>>>>
>>>> Doing some testing now, but at least off hand I can't seem to
>>>> reproduce the VM fault on a Vega10.
>>>>
>>>> Christian.
>>>>
>>>> Am 12.03.19 um 16:01 schrieb Russell, Kent:
>>>>> Oh right, I remember that issue. I had that happen to me once, 
>>>>> where my
>>>> installed libhsakmt didn't match up with the latest source code, so 
>>>> I ended up
>>>> having to remove the libhsakmt package and pointing it to the folders
>>>> instead.
>>>>>     Kent
>>>>>
>>>>>> -Original Message-
>>>>>> From: Koenig, Christian
>>>>>> Sent: Tuesday, March 12, 2019 10:49 AM
>>>>>> To: Russell, Kent ; Kuehling, Felix
>>>>>> ; amd-gfx@lists.freedesktop.org
>>>>>> Subject: Re: [PATCH 3/6] drm/amdgpu: allocate VM PDs/PTs on demand
>>>>>>
>>>>>> Yeah, the problem is I do have the libhsakmt installed.
>>>>>>
>>>>>> Going to give it a try to specify the directory directly.
>>>>>>
>>>>>> Christian.
>>>>>>
>>>>>> Am 12.03.19 um 15:47 schrieb Russell, Kent:
>>>>>>> The README.txt file inside the tests/kfdtest folder has 
>>>>>>> instructions
>>>>>>> on how
>>>>>> to do it if you don't have the libhsakmt package installed on 
>>>>>> your system:
>>>>>>> export LIBHSAKMT_PATH=/*your local libhsakmt folder*/ With that, 
>>>>>>> the
>>>>>>> headers and libraries are searched under LIBHSAKMT_PATH/include and
>>>>>>> LIBHSAKMT_PATH/lib respectively.
>>>>>>>
>>>>>>> So if you try export LIBHSAKMT_PATH as the root ROCT folder (the 
>>>>>>> one
>>>>>> containing include, src, tests, etc), then that should cover it.
>>>>>>>  Kent
>>>>>>>
>>>>>>>
>>>>>>>> -Original Message-
>>>>>>>> From: Christian König 
>>>>>>>> Sent: Tuesday, March 12, 2019 9:13 AM
>>>>>>>> To: Russell, Kent ; Kuehling, Felix
>>>>>>>> ; Koenig, Christian
>>>>>>>> ; amd-gfx@lists.freedesktop.org
>>>>>>>> Subject: Re: [PATCH 3/6] drm/amdgpu: allocate VM PDs/PTs on
>>>> demand
>>>>>>>> Hi guys,
>>>>>>>>
>>>>>>>> so found a few minutes today to compile kfdtest.
>>>>>>>>
>>>>>>>> Problem is that during the compile I get a lots of this:
>>>>>>>>> CMakeFiles/kfdtest.dir/src/BaseQueue.cpp.o: In Funktion
>>>>>>>>> »BaseQueue::Create(unsigned int, unsigned int, unsigned long*)«:
>>>>>>

[PATCH 1/1] drm/amdgpu: Wait for newly allocated PTs to be idle

2019-03-12 Thread Kuehling, Felix
When page tables are updated by the CPU, synchronize with the
allocation and initialization of newly allocated page tables.

Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 20 +---
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 8603c85..4303436 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -899,17 +899,17 @@ static void amdgpu_vm_bo_param(struct amdgpu_device 
*adev, struct amdgpu_vm *vm,
 }
 
 /**
- * amdgpu_vm_alloc_pts - Allocate page tables.
+ * amdgpu_vm_alloc_pts - Allocate a specific page table
  *
  * @adev: amdgpu_device pointer
  * @vm: VM to allocate page tables for
- * @saddr: Start address which needs to be allocated
- * @size: Size from start address we need.
+ * @cursor: Which page table to allocate
  *
- * Make sure the page directories and page tables are allocated
+ * Make sure a specific page table or directory is allocated.
  *
  * Returns:
- * 0 on success, errno otherwise.
+ * 1 if page table needed to be allocated, 0 if page table was already
+ * allocated, negative errno if an error occurred.
  */
 static int amdgpu_vm_alloc_pts(struct amdgpu_device *adev,
   struct amdgpu_vm *vm,
@@ -956,7 +956,7 @@ static int amdgpu_vm_alloc_pts(struct amdgpu_device *adev,
if (r)
goto error_free_pt;
 
-   return 0;
+   return 1;
 
 error_free_pt:
amdgpu_bo_unref(>shadow);
@@ -1621,10 +1621,12 @@ static int amdgpu_vm_update_ptes(struct 
amdgpu_pte_update_params *params,
unsigned shift, parent_shift, mask;
uint64_t incr, entry_end, pe_start;
struct amdgpu_bo *pt;
+   bool need_to_sync;
 
r = amdgpu_vm_alloc_pts(params->adev, params->vm, );
-   if (r)
+   if (r < 0)
return r;
+   need_to_sync = (r && params->vm->use_cpu_for_update);
 
pt = cursor.entry->base.bo;
 
@@ -1672,6 +1674,10 @@ static int amdgpu_vm_update_ptes(struct 
amdgpu_pte_update_params *params,
entry_end += cursor.pfn & ~(entry_end - 1);
entry_end = min(entry_end, end);
 
+   if (need_to_sync)
+   r = amdgpu_bo_sync_wait(params->vm->root.base.bo,
+   AMDGPU_FENCE_OWNER_VM, true);
+
do {
uint64_t upd_end = min(entry_end, frag_end);
unsigned nptes = (upd_end - frag_start) >> shift;
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 3/6] drm/amdgpu: allocate VM PDs/PTs on demand

2019-03-12 Thread Kuehling, Felix
I'm also still seeing VM faults in the eviction test even with my fix, 
and even with SDMA page table updates. There is still something else 
going wrong. :/

Thanks,
   Felix

On 2019-03-12 5:13 p.m., Yang, Philip wrote:
> A VM fault happens in about 1 of 10 runs of KFDCWSRTest.BasicTest for me. I am
> using SDMA for page table updates. I didn't try CPU page table updates.
>
> Philip
>
> On 2019-03-12 11:12 a.m., Russell, Kent wrote:
>> Peculiar, I hit it immediately when I ran it. Can you try using
>> --gtest_filter=KFDCWSRTest.BasicTest? That one hung every time for me.
>>
>>Kent
>>
>>> -Original Message-
>>> From: Christian König 
>>> Sent: Tuesday, March 12, 2019 11:09 AM
>>> To: Russell, Kent ; Koenig, Christian
>>> ; Kuehling, Felix ;
>>> amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH 3/6] drm/amdgpu: allocate VM PDs/PTs on demand
>>>
>>> Yeah, same problem here.
>>>
>>> I removed the libhsakmt package and installed it manually and now it seems
>>> to work.
>>>
>>> Doing some testing now, but at least off hand I can't seem to reproduce the
>>> VM fault on a Vega10.
>>>
>>> Christian.
>>>
>>> Am 12.03.19 um 16:01 schrieb Russell, Kent:
>>>> Oh right, I remember that issue. I had that happen to me once, where my
>>> installed libhsakmt didn't match up with the latest source code, so I ended 
>>> up
>>> having to remove the libhsakmt package and pointing it to the folders
>>> instead.
>>>> Kent
>>>>
>>>>> -Original Message-
>>>>> From: Koenig, Christian
>>>>> Sent: Tuesday, March 12, 2019 10:49 AM
>>>>> To: Russell, Kent ; Kuehling, Felix
>>>>> ; amd-gfx@lists.freedesktop.org
>>>>> Subject: Re: [PATCH 3/6] drm/amdgpu: allocate VM PDs/PTs on demand
>>>>>
>>>>> Yeah, the problem is I do have the libhsakmt installed.
>>>>>
>>>>> Going to give it a try to specify the directory directly.
>>>>>
>>>>> Christian.
>>>>>
>>>>> Am 12.03.19 um 15:47 schrieb Russell, Kent:
>>>>>> The README.txt file inside the tests/kfdtest folder has instructions
>>>>>> on how
>>>>> to do it if you don't have the libhsakmt package installed on your system:
>>>>>> export LIBHSAKMT_PATH=/*your local libhsakmt folder*/ With that, the
>>>>>> headers and libraries are searched under LIBHSAKMT_PATH/include and
>>>>>> LIBHSAKMT_PATH/lib respectively.
>>>>>>
>>>>>> So if you try export LIBHSAKMT_PATH as the root ROCT folder (the one
>>>>> containing include, src, tests, etc), then that should cover it.
>>>>>>  Kent
>>>>>>
>>>>>>
>>>>>>> -Original Message-
>>>>>>> From: Christian König 
>>>>>>> Sent: Tuesday, March 12, 2019 9:13 AM
>>>>>>> To: Russell, Kent ; Kuehling, Felix
>>>>>>> ; Koenig, Christian
>>>>>>> ; amd-gfx@lists.freedesktop.org
>>>>>>> Subject: Re: [PATCH 3/6] drm/amdgpu: allocate VM PDs/PTs on
>>> demand
>>>>>>> Hi guys,
>>>>>>>
>>>>>>> so found a few minutes today to compile kfdtest.
>>>>>>>
>>>>>>> Problem is that during the compile I get a lots of this:
>>>>>>>> CMakeFiles/kfdtest.dir/src/BaseQueue.cpp.o: In Funktion
>>>>>>>> »BaseQueue::Create(unsigned int, unsigned int, unsigned long*)«:
>>>>>>>> /usr/src/ROCT-Thunk-Interface/tests/kfdtest/src/BaseQueue.cpp:57:
>>>>>>>> Warnung: undefinierter Verweis auf »hsaKmtCreateQueue«
>>>>>>> Any idea?
>>>>>>>
>>>>>>> Christian.
>>>>>>>
>>>>>>> Am 11.03.19 um 17:55 schrieb Christian König:
>>>>>>>> Hi guys,
>>>>>>>>
>>>>>>>> well it's most likely some missing handling in the KFD, so I'm
>>>>>>>> rather reluctant to revert the change immediately.
>>>>>>>>
>>>>>>>> Problem is that I don't have time right now to look into it
>>>>>>>> immediately. So Kent can you continue to take a look?
>>>>>>>>
>>>&g

Re: [PATCH 3/6] drm/amdgpu: allocate VM PDs/PTs on demand

2019-03-12 Thread Kuehling, Felix
The root cause is that we don't wait after calling amdgpu_vm_clear_bo in 
amdgpu_vm_alloc_pts.

Waiting for the page table BOs to be idle for CPU page table updates is 
done in amdgpu_vm_bo_update_mapping. That is now *before* the page 
tables are actually allocated and cleared in amdgpu_vm_update_ptes.

We'll need to move the waiting for page tables to be idle into 
amdgpu_vm_alloc_pts or amdgpu_vm_update_ptes.
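
A minimal sketch of the second option, waiting on the root PD right after a
page table actually had to be allocated, assuming amdgpu_bo_sync_wait() and an
alloc_pts() that returns a positive value for newly allocated tables (this is
roughly what the standalone [PATCH 1/1] earlier in this archive does):

	r = amdgpu_vm_alloc_pts(params->adev, params->vm, &cursor);
	if (r < 0)
		return r;

	/* Only CPU updates need to wait for the clear of newly allocated
	 * page tables; SDMA updates are synchronized by the scheduler.
	 */
	if (r > 0 && params->vm->use_cpu_for_update) {
		r = amdgpu_bo_sync_wait(params->vm->root.base.bo,
					AMDGPU_FENCE_OWNER_VM, true);
		if (r)
			return r;
	}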

Regards,
   Felix

On 2019-03-12 3:02 p.m., Felix Kuehling wrote:
> I find that it's related to CPU page table updates. If I force page 
> table updates with SDMA, I don't get the VM fault.
>
> Regards,
>   Felix
>
> On 2019-03-11 12:55 p.m., Christian König wrote:
>> Hi guys,
>>
>> well it's most likely some missing handling in the KFD, so I'm rather 
>> reluctant to revert the change immediately.
>>
>> Problem is that I don't have time right now to look into it 
>> immediately. So Kent can you continue to take a look?
>>
>> Sounds like it's crashing immediately, so it should be something obvious.
>>
>> Christian.
>>
>> Am 11.03.19 um 10:49 schrieb Russell, Kent:
>>>  From what I've been able to dig through, the VM Fault seems to 
>>> occur right after a doorbell mmap, but that's as far as I got. I can 
>>> try to revert it in today's merge and see how things go.
>>>
>>>   Kent
>>>
>>>> -Original Message-
>>>> From: Kuehling, Felix
>>>> Sent: Friday, March 08, 2019 11:16 PM
>>>> To: Koenig, Christian ; Russell, Kent
>>>> ; amd-gfx@lists.freedesktop.org
>>>> Subject: RE: [PATCH 3/6] drm/amdgpu: allocate VM PDs/PTs on demand
>>>>
>>>> My concerns were related to eviction fence handling. It would
>>>> manifest as
>>>> unnecessary eviction callbacks into KFD that aren't caused by real
>>>> evictions. I
>>>> addressed that with a previous patch series that removed the need to
>>>> remove eviction fences and add them back around page table updates in
>>>> amdgpu_amdkfd_gpuvm.c.
>>>>
>>>> I don't know what's going on here. I can probably take a look on 
>>>> Monday. I
>>>> haven't considered what changed with respect to PD updates.
>>>>
>>>> Kent, can we temporarily revert the offending change in 
>>>> amd-kfd-staging
>>>> just to unblock the merge?
>>>>
>>>> Christian, I think KFD is currently broken on amd-staging-drm-next. 
>>>> If we're
>>>> serious about supporting KFD upstream, you may also want to consider
>>>> reverting your change there for now. Also consider building the 
>>>> Thunk and
>>>> kfdtest so you can do quick smoke tests locally whenever you make
>>>> amdgpu_vm changes that can affect KFD.
>>>> https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface
>>>>
>>>> Regards,
>>>>    Felix
>>>>
>>>> -Original Message-
>>>> From: amd-gfx  On Behalf Of
>>>> Christian König
>>>> Sent: Friday, March 08, 2019 9:14 AM
>>>> To: Russell, Kent ; 
>>>> amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH 3/6] drm/amdgpu: allocate VM PDs/PTs on demand
>>>>
>>>> My best guess is that we forget somewhere to update the PDs. What
>>>> hardware is that on?
>>>>
>>>> Felix already mentioned that this could be problematic for the KFD.
>>>>
>>>> Maybe he has an idea,
>>>> Christian.
>>>>
>>>> Am 08.03.19 um 15:04 schrieb Russell, Kent:
>>>>> Hi Christian,
>>>>>
>>>>> This patch ended up causing a VM Fault in KFDTest. Reverting just 
>>>>> this
>>>> patch addressed the issue:
>>>>> [   82.703503] amdgpu :0c:00.0: GPU fault detected: 146 
>>>>> 0x480c for
>>>> process  pid 0 thread  pid 0
>>>>> [   82.703512] amdgpu :0c:00.0:
>>>> VM_CONTEXT1_PROTECTION_FAULT_ADDR   0x1000
>>>>> [   82.703516] amdgpu :0c:00.0:
>>>> VM_CONTEXT1_PROTECTION_FAULT_STATUS 0x1004800C
>>>>> [   82.703522] amdgpu :0c:00.0: VM fault (0x0c, vmid 8, pasid 
>>>>> 32769) at
>>>> page 4096, read from 'TC0' (0x54433000) (72)
>>>>> [   82.703585] Evicting PASID 32769 queues
>>>>>
>>>>> I am looking into it, but if you have any insight that would be 
>>>>> great in

Re: [PATCH 1/3] drm/amdkfd: support concurrent userptr update for HMM v2

2019-03-12 Thread Kuehling, Felix
On 2019-03-06 9:42 p.m., Yang, Philip wrote:
> Userptr restore may race with a concurrent userptr invalidation after
> hmm_vma_fault adds the range to the hmm->ranges list; we need to call
> hmm_vma_range_done to remove the range from the hmm->ranges list first,
> then reschedule the restore worker. Otherwise hmm_vma_fault will add the
> same range to the list again, causing a loop in the list because
> range->next points to range itself.
>
> Add function untrack_invalid_user_pages to reduce code duplication.
>
> Change-Id: I31407739dc10554f8e418c7a0e0415d3d95552f1
> Signed-off-by: Philip Yang 

This patch is Reviewed-by: Felix Kuehling 


> ---
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 25 ++-
>   1 file changed, 19 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index d2e315f42dad..60d53b0b497a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -1935,6 +1935,23 @@ static int update_invalid_user_pages(struct 
> amdkfd_process_info *process_info,
>   return 0;
>   }
>   
> +/* Remove invalid userptr BOs from hmm track list
> + *
> + * Stop HMM track the userptr update
> + */
> +static void untrack_invalid_user_pages(struct amdkfd_process_info 
> *process_info)
> +{
> + struct kgd_mem *mem, *tmp_mem;
> + struct amdgpu_bo *bo;
> +
> + list_for_each_entry_safe(mem, tmp_mem,
> +  _info->userptr_inval_list,
> +  validate_list.head) {
> + bo = mem->bo;
> + amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
> + }
> +}
> +
>   /* Validate invalid userptr BOs
>*
>* Validates BOs on the userptr_inval_list, and moves them back to the
> @@ -2052,12 +2069,6 @@ static int validate_invalid_user_pages(struct 
> amdkfd_process_info *process_info)
>   out_free:
>   kfree(pd_bo_list_entries);
>   out_no_mem:
> - list_for_each_entry_safe(mem, tmp_mem,
> -  _info->userptr_inval_list,
> -  validate_list.head) {
> - bo = mem->bo;
> - amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
> - }
>   
>   return ret;
>   }
> @@ -2122,7 +2133,9 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct 
> work_struct *work)
>* hanging. No point trying again.
>*/
>   }
> +
>   unlock_out:
> + untrack_invalid_user_pages(process_info);
>   mutex_unlock(_info->lock);
>   mmput(mm);
>   put_task_struct(usertask);
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 2/3] drm/amdgpu: support userptr cross VMAs case with HMM v2

2019-03-12 Thread Kuehling, Felix
See one comment inline. There are still some potential problems that 
you're not catching.

On 2019-03-06 9:42 p.m., Yang, Philip wrote:
> userptr may cross two VMAs if the forked child process (which does not call
> exec after fork) mallocs a buffer, frees it, and then mallocs a larger
> buffer; the kernel will create a new VMA adjacent to the old VMA that was
> cloned from the parent process, so some pages of the userptr are in the
> first VMA and the rest are in the second VMA.
>
> HMM expects range only have one VMA, loop over all VMAs in the address
> range, create multiple ranges to handle this case. See
> is_mergeable_anon_vma in mm/mmap.c for details.
>
> Change-Id: I0ca8c77e28deabccc139906f9ffee04b7e383314
> Signed-off-by: Philip Yang 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 123 +---
>   1 file changed, 88 insertions(+), 35 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index 7cc0ba24369d..802bec7ef917 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -711,7 +711,8 @@ struct amdgpu_ttm_tt {
>   struct task_struct  *usertask;
>   uint32_tuserflags;
>   #if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
> - struct hmm_rangerange;
> + struct hmm_range*ranges;
> + int nr_ranges;
>   #endif
>   };
>   
> @@ -723,62 +724,105 @@ struct amdgpu_ttm_tt {
>* once afterwards to stop HMM tracking
>*/
>   #if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
> +
> +/* Support Userptr pages cross max 16 vmas */
> +#define MAX_NR_VMAS  (16)
> +
>   int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages)
>   {
>   struct amdgpu_ttm_tt *gtt = (void *)ttm;
>   struct mm_struct *mm = gtt->usertask->mm;
> - unsigned long end = gtt->userptr + ttm->num_pages * PAGE_SIZE;
> - struct hmm_range *range = >range;
> - int r = 0, i;
> + unsigned long start = gtt->userptr;
> + unsigned long end = start + ttm->num_pages * PAGE_SIZE;
> + struct hmm_range *ranges;
> + struct vm_area_struct *vma = NULL, *vmas[MAX_NR_VMAS];
> + uint64_t *pfns, f;
> + int r = 0, i, nr_pages;
>   
>   if (!mm) /* Happens during process shutdown */
>   return -ESRCH;
>   
> - amdgpu_hmm_init_range(range);
> -
>   down_read(>mmap_sem);
>   
> - range->vma = find_vma(mm, gtt->userptr);
> - if (!range_in_vma(range->vma, gtt->userptr, end))
> - r = -EFAULT;
> - else if ((gtt->userflags & AMDGPU_GEM_USERPTR_ANONONLY) &&
> - range->vma->vm_file)
> + /* user pages may cross multiple VMAs */
> + gtt->nr_ranges = 0;
> + do {
> + if (gtt->nr_ranges >= MAX_NR_VMAS) {
> + DRM_ERROR("Too many VMAs in userptr range\n");
> + r = -EFAULT;
> + goto out;
> + }
> +
> + vma = find_vma(mm, vma ? vma->vm_end : start);

You need a check here that vma->vm_start <= the requested start address. 
Otherwise you can end up with gaps in your userptr mapping that don't 
have valid pages.
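
A minimal sketch of that check, reusing the loop variables from the quoted
code (start, vma, mm); the v3 of this patch earlier in this archive ends up
doing essentially this:

	unsigned long vm_start = vma ? vma->vm_end : start;

	vma = find_vma(mm, vm_start);
	/* Reject ranges with gaps: the VMA must cover the address we
	 * asked for, not merely be the next VMA above it.
	 */
	if (unlikely(!vma || vm_start < vma->vm_start)) {
		r = -EFAULT;
		goto out;
	}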

Regards,
   Felix


> + if (unlikely(!vma)) {
> + r = -EFAULT;
> + goto out;
> + }
> + vmas[gtt->nr_ranges++] = vma;
> + } while (end > vma->vm_end);
> +
> + DRM_DEBUG_DRIVER("0x%lx nr_ranges %d pages 0x%lx\n",
> + start, gtt->nr_ranges, ttm->num_pages);
> +
> + if (unlikely((gtt->userflags & AMDGPU_GEM_USERPTR_ANONONLY) &&
> + vmas[0]->vm_file)) {
>   r = -EPERM;
> - if (r)
>   goto out;
> + }
>   
> - range->pfns = kvmalloc_array(ttm->num_pages, sizeof(uint64_t),
> -  GFP_KERNEL);
> - if (range->pfns == NULL) {
> + ranges = kvmalloc_array(gtt->nr_ranges, sizeof(*ranges), GFP_KERNEL);
> + if (unlikely(!ranges)) {
>   r = -ENOMEM;
>   goto out;
>   }
> - range->start = gtt->userptr;
> - range->end = end;
>   
> - range->pfns[0] = range->flags[HMM_PFN_VALID];
> - range->pfns[0] |= amdgpu_ttm_tt_is_readonly(ttm) ?
> - 0 : range->flags[HMM_PFN_WRITE];
> - for (i = 1; i < ttm->num_pages; i++)
> - range->pfns[i] = range->pfns[0];
> + pfns = kvmalloc_array(ttm->num_pages, sizeof(*pfns), GFP_KERNEL);
> + if (unlikely(!pfns)) {
> + r = -ENOMEM;
> + goto out_free_ranges;
> + }
> +
> + for (i = 0; i < gtt->nr_ranges; i++)
> + amdgpu_hmm_init_range([i]);
> +
> + f = ranges[0].flags[HMM_PFN_VALID];
> + f |= amdgpu_ttm_tt_is_readonly(ttm) ?
> + 0 : ranges[0].flags[HMM_PFN_WRITE];
> + memset64(pfns, f, ttm->num_pages);
> +
> + for (nr_pages = 0, i = 0; i < gtt->nr_ranges; i++) {
> + ranges[i].vma = vmas[i];
> +   

Re: [PATCH 3/6] drm/amdgpu: allocate VM PDs/PTs on demand

2019-03-12 Thread Kuehling, Felix
I find that it's related to CPU page table updates. If I force page 
table updates with SDMA, I don't get the VM fault.

Regards,
   Felix

On 2019-03-11 12:55 p.m., Christian König wrote:
> Hi guys,
>
> well it's most likely some missing handling in the KFD, so I'm rather 
> reluctant to revert the change immediately.
>
> Problem is that I don't have time right now to look into it 
> immediately. So Kent can you continue to take a look?
>
> Sounds like it's crashing immediately, so it should be something obvious.
>
> Christian.
>
> Am 11.03.19 um 10:49 schrieb Russell, Kent:
>>  From what I've been able to dig through, the VM Fault seems to occur 
>> right after a doorbell mmap, but that's as far as I got. I can try to 
>> revert it in today's merge and see how things go.
>>
>>   Kent
>>
>>> -Original Message-
>>> From: Kuehling, Felix
>>> Sent: Friday, March 08, 2019 11:16 PM
>>> To: Koenig, Christian ; Russell, Kent
>>> ; amd-gfx@lists.freedesktop.org
>>> Subject: RE: [PATCH 3/6] drm/amdgpu: allocate VM PDs/PTs on demand
>>>
>>> My concerns were related to eviction fence handling. It would
>>> manifest as
>>> unnecessary eviction callbacks into KFD that aren't caused by real
>>> evictions. I
>>> addressed that with a previous patch series that removed the need to
>>> remove eviction fences and add them back around page table updates in
>>> amdgpu_amdkfd_gpuvm.c.
>>>
>>> I don't know what's going on here. I can probably take a look on 
>>> Monday. I
>>> haven't considered what changed with respect to PD updates.
>>>
>>> Kent, can we temporarily revert the offending change in amd-kfd-staging
>>> just to unblock the merge?
>>>
>>> Christian, I think KFD is currently broken on amd-staging-drm-next. 
>>> If we're
>>> serious about supporting KFD upstream, you may also want to consider
>>> reverting your change there for now. Also consider building the 
>>> Thunk and
>>> kfdtest so you can do quick smoke tests locally whenever you make
>>> amdgpu_vm changes that can affect KFD.
>>> https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface
>>>
>>> Regards,
>>>    Felix
>>>
>>> -Original Message-
>>> From: amd-gfx  On Behalf Of
>>> Christian König
>>> Sent: Friday, March 08, 2019 9:14 AM
>>> To: Russell, Kent ; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH 3/6] drm/amdgpu: allocate VM PDs/PTs on demand
>>>
>>> My best guess is that we forget somewhere to update the PDs. What
>>> hardware is that on?
>>>
>>> Felix already mentioned that this could be problematic for the KFD.
>>>
>>> Maybe he has an idea,
>>> Christian.
>>>
>>> Am 08.03.19 um 15:04 schrieb Russell, Kent:
>>>> Hi Christian,
>>>>
>>>> This patch ended up causing a VM Fault in KFDTest. Reverting just this
>>> patch addressed the issue:
>>>> [   82.703503] amdgpu :0c:00.0: GPU fault detected: 146 
>>>> 0x480c for
>>> process  pid 0 thread  pid 0
>>>> [   82.703512] amdgpu :0c:00.0:
>>> VM_CONTEXT1_PROTECTION_FAULT_ADDR   0x1000
>>>> [   82.703516] amdgpu :0c:00.0:
>>> VM_CONTEXT1_PROTECTION_FAULT_STATUS 0x1004800C
>>>> [   82.703522] amdgpu :0c:00.0: VM fault (0x0c, vmid 8, pasid 
>>>> 32769) at
>>> page 4096, read from 'TC0' (0x54433000) (72)
>>>> [   82.703585] Evicting PASID 32769 queues
>>>>
>>>> I am looking into it, but if you have any insight that would be 
>>>> great in
>>> helping to resolve it quickly.
>>>>    Kent
>>>>> -Original Message-
>>>>> From: amd-gfx  On Behalf Of
>>>>> Christian König
>>>>> Sent: Tuesday, February 26, 2019 7:47 AM
>>>>> To: amd-gfx@lists.freedesktop.org
>>>>> Subject: [PATCH 3/6] drm/amdgpu: allocate VM PDs/PTs on demand
>>>>>
>>>>> Let's start to allocate VM PDs/PTs on demand instead of
>>>>> pre-allocating them during mapping.
>>>>>
>>>>> Signed-off-by: Christian König 
>>>>> Reviewed-by: Felix Kuehling 
>>>>> ---
>>>>>    .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  10 +-
>>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c   |   9 --
>>>>>    drivers/gpu/drm/amd/amd

Re: Slow memory access when using OpenCL without X11

2019-03-12 Thread Kuehling, Felix
[adding the list back]

I'd suspect a problem related to memory clock. This is an APU where 
system memory is shared with the CPU, so if the SMU changes memory 
clocks that would affect CPU memory access performance. If the problem 
only occurs when OpenCL is running, then the compute power profile could 
have an effect here.

Lauri, can you monitor the clocks during your tests using rocm-smi?

Regards,
   Felix

On 2019-03-11 1:15 p.m., Tom St Denis wrote:
> Hi Lauri,
>
> I don't have ROCm installed locally (not on that team at AMD) but I 
> can rope in some of the KFD folk and see what they say :-).
>
> (in the meantime I should look into installing the ROCm stack on my 
> Ubuntu disk for experimentation...).
>
> Only other thing that comes to mind is some sort of stutter due to 
> power/clock gating (or gfx off/etc).  But that typically affects the 
> display/gpu side not the CPU side.
>
> Felix:  Any known issues with Raven and ROCm interacting over memory 
> bus performance?
>
> Tom
>
> On Mon, Mar 11, 2019 at 12:56 PM Lauri Ehrenpreis  > wrote:
>
> Hi!
>
> The 100x memory slowdown is hard to believe indeed. I attached the
> test program to my first e-mail; it depends only on the
> rocm-opencl-dev package. Would you mind compiling it and checking
> if it slows down memory for you as well?
>
> steps:
> 1) g++ cl_slow_test.cpp -o cl_slow_test -I
> /opt/rocm/opencl/include/ -L /opt/rocm/opencl/lib/x86_64/  -lOpenCL
> 2) logout from desktop env and disconnect hdmi/diplayport etc
> 3) log in over ssh
> 4) run the program ./cl_slow_test 1
>
> For me it reproduced even without step 2, but less
> reliably. Moving the mouse, for example, could make the memory speed
> fast again.
>
> --
> Lauri
>
>
>
> On Mon, Mar 11, 2019 at 6:33 PM Tom St Denis  > wrote:
>
> Hi Lauri,
>
> There's really no connection between the two other than they
> run in the same package.  I too run a 2400G (as my
> workstation) and I got the same ~6.6GB/sec transfer rate but
> without a CL app running ...  The only logical reason is your
> CL app is bottlenecking the APUs memory bus but you claim
> "simply opening a context is enough" so something else is
> going on.
>
> Your last reply though says "with it running in the
> background" so it's entirely possible the CPU isn't busy but
> the package memory controller (shared between both the CPU and
> GPU) is busy.  For instance running xonotic in a 1080p window
> on my 4K display reduced the memory test to 5.8GB/sec and
> that's hardly a heavy memory bound GPU app.
>
> The only other possible connection is the GPU is generating so
> much heat that it's throttling the package which is also
> unlikely if you have a proper HSF attached (I use the ones
> that came in the retail boxes).
>
> Cheers,
> Tom
>
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH 2/3] drm/amdgpu: free up the first paging queue

2019-03-12 Thread Kuehling, Felix
I think this would break Raven, which only has one SDMA engine.
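
A minimal sketch of one possible guard, assuming adev->sdma.num_instances
reflects the number of engines as in the quoted code below, so that instance 1
is only used for the paging queue when it actually exists:

	if (adev->sdma.has_page_queue && adev->sdma.num_instances > 1)
		adev->mman.buffer_funcs_ring = &adev->sdma.instance[1].page;
	else if (adev->sdma.has_page_queue)
		adev->mman.buffer_funcs_ring = &adev->sdma.instance[0].page;
	else
		adev->mman.buffer_funcs_ring = &adev->sdma.instance[0].ring;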

Regards,
  Felix

-Original Message-
From: amd-gfx  On Behalf Of Christian 
König
Sent: Tuesday, March 12, 2019 8:38 AM
To: amd-gfx@lists.freedesktop.org
Subject: [PATCH 2/3] drm/amdgpu: free up the first paging queue

We need the first paging queue to handle page faults.

Signed-off-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 20 
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 3ac5abe937f4..bed18e7bbc36 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -2266,7 +2266,7 @@ static void sdma_v4_0_set_buffer_funcs(struct 
amdgpu_device *adev)  {
adev->mman.buffer_funcs = _v4_0_buffer_funcs;
if (adev->sdma.has_page_queue)
-   adev->mman.buffer_funcs_ring = >sdma.instance[0].page;
+   adev->mman.buffer_funcs_ring = >sdma.instance[1].page;
else
adev->mman.buffer_funcs_ring = >sdma.instance[0].ring;  } 
@@ -2285,15 +2285,19 @@ static void sdma_v4_0_set_vm_pte_funcs(struct 
amdgpu_device *adev)
unsigned i;
 
adev->vm_manager.vm_pte_funcs = _v4_0_vm_pte_funcs;
-   for (i = 0; i < adev->sdma.num_instances; i++) {
-   if (adev->sdma.has_page_queue)
-   sched = >sdma.instance[i].page.sched;
-   else
-   sched = >sdma.instance[i].ring.sched;
-   adev->vm_manager.vm_pte_rqs[i] =
+   if (adev->sdma.has_page_queue) {
+   sched = >sdma.instance[1].page.sched;
+   adev->vm_manager.vm_pte_rqs[0] =
>sched_rq[DRM_SCHED_PRIORITY_KERNEL];
+   adev->vm_manager.vm_pte_num_rqs = 1;
+   } else {
+   for (i = 0; i < adev->sdma.num_instances; i++) {
+   sched = >sdma.instance[i].ring.sched;
+   adev->vm_manager.vm_pte_rqs[i] =
+   >sched_rq[DRM_SCHED_PRIORITY_KERNEL];
+   }
+   adev->vm_manager.vm_pte_num_rqs = adev->sdma.num_instances;
}
-   adev->vm_manager.vm_pte_num_rqs = adev->sdma.num_instances;
 }
 
 const struct amdgpu_ip_block_version sdma_v4_0_ip_block = {
--
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH 3/6] drm/amdgpu: allocate VM PDs/PTs on demand

2019-03-08 Thread Kuehling, Felix
My concerns were related to eviction fence handling. It would manifest as 
unnecessary eviction callbacks into KFD that aren't caused by real evictions. I 
addressed that with a previous patch series that removed the need to remove 
eviction fences and add them back around page table updates in 
amdgpu_amdkfd_gpuvm.c.

I don't know what's going on here. I can probably take a look on Monday. I 
haven't considered what changed with respect to PD updates.

Kent, can we temporarily revert the offending change in amd-kfd-staging just to 
unblock the merge?

Christian, I think KFD is currently broken on amd-staging-drm-next. If we're 
serious about supporting KFD upstream, you may also want to consider reverting 
your change there for now. Also consider building the Thunk and kfdtest so you 
can do quick smoke tests locally whenever you make amdgpu_vm changes that can 
affect KFD. https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface

Regards,
  Felix

-Original Message-
From: amd-gfx  On Behalf Of Christian 
König
Sent: Friday, March 08, 2019 9:14 AM
To: Russell, Kent ; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 3/6] drm/amdgpu: allocate VM PDs/PTs on demand

My best guess is that we forget somewhere to update the PDs. What hardware is 
that on?

Felix already mentioned that this could be problematic for the KFD.

Maybe he has an idea,
Christian.

Am 08.03.19 um 15:04 schrieb Russell, Kent:
> Hi Christian,
>
> This patch ended up causing a VM Fault in KFDTest. Reverting just this patch 
> addressed the issue:
> [   82.703503] amdgpu :0c:00.0: GPU fault detected: 146 0x480c for 
> process  pid 0 thread  pid 0
> [   82.703512] amdgpu :0c:00.0:   VM_CONTEXT1_PROTECTION_FAULT_ADDR   
> 0x1000
> [   82.703516] amdgpu :0c:00.0:   VM_CONTEXT1_PROTECTION_FAULT_STATUS 
> 0x1004800C
> [   82.703522] amdgpu :0c:00.0: VM fault (0x0c, vmid 8, pasid 32769) at 
> page 4096, read from 'TC0' (0x54433000) (72)
> [   82.703585] Evicting PASID 32769 queues
>
> I am looking into it, but if you have any insight that would be great in 
> helping to resolve it quickly.
>
>   Kent
>> -Original Message-
>> From: amd-gfx  On Behalf Of 
>> Christian König
>> Sent: Tuesday, February 26, 2019 7:47 AM
>> To: amd-gfx@lists.freedesktop.org
>> Subject: [PATCH 3/6] drm/amdgpu: allocate VM PDs/PTs on demand
>>
>> Let's start to allocate VM PDs/PTs on demand instead of 
>> pre-allocating them during mapping.
>>
>> Signed-off-by: Christian König 
>> Reviewed-by: Felix Kuehling 
>> ---
>>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  10 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c   |   9 --
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c   |  10 --
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c| 136 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h|   3 -
>>   5 files changed, 39 insertions(+), 129 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index 31e3953dcb6e..088e9b6b765b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -410,15 +410,7 @@ static int add_bo_to_vm(struct amdgpu_device 
>> *adev, struct kgd_mem *mem,
>>  if (p_bo_va_entry)
>>  *p_bo_va_entry = bo_va_entry;
>>
>> -/* Allocate new page tables if needed and validate
>> - * them.
>> - */
>> -ret = amdgpu_vm_alloc_pts(adev, vm, va, amdgpu_bo_size(bo));
>> -if (ret) {
>> -pr_err("Failed to allocate pts, err=%d\n", ret);
>> -goto err_alloc_pts;
>> -}
>> -
>> +/* Allocate validate page tables if needed */
>>  ret = vm_validate_pt_pd_bos(vm);
>>  if (ret) {
>>  pr_err("validate_pt_pd_bos() failed\n"); diff --git 
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>> index 7e22be7ca68a..54dd02a898b9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>> @@ -92,15 +92,6 @@ int amdgpu_map_static_csa(struct amdgpu_device 
>> *adev, struct amdgpu_vm *vm,
>>  return -ENOMEM;
>>  }
>>
>> -r = amdgpu_vm_alloc_pts(adev, (*bo_va)->base.vm, csa_addr,
>> -size);
>> -if (r) {
>> -DRM_ERROR("failed to allocate pts for static CSA, err=%d\n",
>> r);
>> -amdgpu_vm_bo_rmv(adev, *bo_va);
>> -ttm_eu_backoff_reservation(, );
>> -return r;
>> -}
>> -
>>  r = amdgpu_vm_bo_map(adev, *bo_va, csa_addr, 0, size,
>>   AMDGPU_PTE_READABLE |
>> AMDGPU_PTE_WRITEABLE |
>>   AMDGPU_PTE_EXECUTABLE);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> index 555285e329ed..fcaaac30e84b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c

Re: [PATCH 1/2] drm/amdgpu: use ring/hash for fault handling on GMC9 v2

2019-03-07 Thread Kuehling, Felix
Hmm, that's a clever (and elegant) little data structure. The series is 
Reviewed-by: Felix Kuehling 

Regards,
   Felix

On 3/7/2019 8:28 AM, Christian König wrote:
> Further testing showed that the idea with the chash doesn't work as expected.
> Especially we can't predict when we can remove the entries from the hash 
> again.
>
> So replace the chash with a ring buffer/hash mix where entries in the 
> container
> age automatically based on their timestamp.
>
> v2: use ring buffer / hash mix
>
> Signed-off-by: Christian König 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 49 
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 34 ++
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 60 ++---
>   3 files changed, 86 insertions(+), 57 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index 5a32a0d2ad31..579cadd16886 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -240,3 +240,52 @@ void amdgpu_gmc_agp_location(struct amdgpu_device *adev, 
> struct amdgpu_gmc *mc)
>   dev_info(adev->dev, "AGP: %lluM 0x%016llX - 0x%016llX\n",
>   mc->agp_size >> 20, mc->agp_start, mc->agp_end);
>   }
> +
> +/**
> + * amdgpu_gmc_filter_faults - filter VM faults
> + *
> + * @adev: amdgpu device structure
> + * @addr: address of the VM fault
> + * @pasid: PASID of the process causing the fault
> + * @timestamp: timestamp of the fault
> + *
> + * Returns:
> + * True if the fault was filtered and should not be processed further.
> + * False if the fault is a new one and needs to be handled.
> + */
> +bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
> +   uint16_t pasid, uint64_t timestamp)
> +{
> + struct amdgpu_gmc *gmc = >gmc;
> +
> + uint64_t stamp, key = addr << 4 | pasid;
> + struct amdgpu_gmc_fault *fault;
> + uint32_t hash;
> +
> + /* If we don't have space left in the ring buffer return immediately */
> + stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) -
> + AMDGPU_GMC_FAULT_TIMEOUT;
> + if (gmc->fault_ring[gmc->last_fault].timestamp >= stamp)
> + return true;
> +
> + /* Try to find the fault in the hash */
> + hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
> + fault = >fault_ring[gmc->fault_hash[hash].idx];
> + do {
> + if (fault->key == key)
> + return true;
> +
> + stamp = fault->timestamp;
> + fault = >fault_ring[fault->next];
> + } while (fault->timestamp < stamp);
> +
> + /* Add the fault to the ring */
> + fault = >fault_ring[gmc->last_fault];
> + fault->key = key;
> + fault->timestamp = timestamp;
> +
> + /* And update the hash */
> + fault->next = gmc->fault_hash[hash].idx;
> + gmc->fault_hash[hash].idx = gmc->last_fault++;
> + return false;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> index 6ce45664ff87..071145ac67b5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> @@ -43,8 +43,34 @@
>*/
>   #define AMDGPU_GMC_HOLE_MASK0xULL
>   
> +/*
> + * Ring size as power of two for the log of recent faults.
> + */
> +#define AMDGPU_GMC_FAULT_RING_ORDER  8
> +#define AMDGPU_GMC_FAULT_RING_SIZE   (1 << AMDGPU_GMC_FAULT_RING_ORDER)
> +
> +/*
> + * Hash size as power of two for the log of recent faults
> + */
> +#define AMDGPU_GMC_FAULT_HASH_ORDER  8
> +#define AMDGPU_GMC_FAULT_HASH_SIZE   (1 << AMDGPU_GMC_FAULT_HASH_ORDER)
> +
> +/*
> + * Number of IH timestamp ticks until a fault is considered handled
> + */
> +#define AMDGPU_GMC_FAULT_TIMEOUT 5000ULL
> +
>   struct firmware;
>   
> +/*
> + * GMC page fault information
> + */
> +struct amdgpu_gmc_fault {
> + uint64_ttimestamp;
> + uint64_tnext:AMDGPU_GMC_FAULT_RING_ORDER;
> + uint64_tkey:52;
> +};
> +
>   /*
>* VMHUB structures, functions & helpers
>*/
> @@ -141,6 +167,12 @@ struct amdgpu_gmc {
>   struct kfd_vm_fault_info *vm_fault_info;
>   atomic_tvm_fault_info_updated;
>   
> + struct amdgpu_gmc_fault fault_ring[AMDGPU_GMC_FAULT_RING_SIZE];
> + struct {
> + uint64_tidx:AMDGPU_GMC_FAULT_RING_ORDER;
> + } fault_hash[AMDGPU_GMC_FAULT_HASH_SIZE];
> + uint64_tlast_fault:AMDGPU_GMC_FAULT_RING_ORDER;
> +
>   const struct amdgpu_gmc_funcs   *gmc_funcs;
>   
>   struct amdgpu_xgmi xgmi;
> @@ -195,5 +227,7 @@ void amdgpu_gmc_gart_location(struct amdgpu_device *adev,
> struct amdgpu_gmc *mc);
>   void amdgpu_gmc_agp_location(struct amdgpu_device *adev,
>struct amdgpu_gmc *mc);
> +bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t 

Re: [PATCH 2/3] drm/amdgpu: support userptr cross VMAs case with HMM

2019-03-06 Thread Kuehling, Felix
Some comments inline ...

On 3/5/2019 1:09 PM, Yang, Philip wrote:
> userptr may cross two VMAs if the forked child process (which does not call
> exec after fork) mallocs a buffer, frees it, and then mallocs a larger
> buffer; the kernel will create a new VMA adjacent to the old VMA that was
> cloned from the parent process, so some pages of the userptr are in the
> first VMA and the rest are in the second VMA.
>
> HMM expects range only have one VMA, loop over all VMAs in the address
> range, create multiple ranges to handle this case. See
> is_mergeable_anon_vma in mm/mmap.c for details.
>
> Change-Id: I0ca8c77e28deabccc139906f9ffee04b7e383314
> Signed-off-by: Philip Yang 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 122 +---
>   1 file changed, 87 insertions(+), 35 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index cd0ccfbbcb84..173bf4db5994 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -711,7 +711,8 @@ struct amdgpu_ttm_tt {
>   struct task_struct  *usertask;
>   uint32_tuserflags;
>   #if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
> - struct hmm_rangerange;
> + struct hmm_range*ranges;
> + int nr_ranges;
>   #endif
>   };
>   
> @@ -723,62 +724,104 @@ struct amdgpu_ttm_tt {
>* once afterwards to stop HMM tracking
>*/
>   #if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
> +
> +/* Support Userptr pages cross max 16 vmas */
> +#define MAX_NR_VMAS  (16)
> +
>   int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages)
>   {
>   struct amdgpu_ttm_tt *gtt = (void *)ttm;
>   struct mm_struct *mm = gtt->usertask->mm;
> - unsigned long end = gtt->userptr + ttm->num_pages * PAGE_SIZE;
> - struct hmm_range *range = >range;
> - int r = 0, i;
> + unsigned long start = gtt->userptr;
> + unsigned long end = start + ttm->num_pages * PAGE_SIZE;
> + struct hmm_range *ranges;
> + struct vm_area_struct *vma = NULL, *vmas[MAX_NR_VMAS];
> + uint64_t *pfns, f;
> + int r = 0, i, nr_pages;
>   
>   if (!mm) /* Happens during process shutdown */
>   return -ESRCH;
>   
> - amdgpu_hmm_init_range(range);
> -
>   down_read(>mmap_sem);
>   
> - range->vma = find_vma(mm, gtt->userptr);
> - if (!range_in_vma(range->vma, gtt->userptr, end))
> - r = -EFAULT;
> - else if ((gtt->userflags & AMDGPU_GEM_USERPTR_ANONONLY) &&
> - range->vma->vm_file)
> + /* user pages may cross multiple VMAs */
> + gtt->nr_ranges = 0;
> + do {
> + vma = find_vma(mm, vma ? vma->vm_end : start);
> + if (unlikely(!vma)) {
> + r = -EFAULT;
> + goto out;
> + }
> + vmas[gtt->nr_ranges++] = vma;
> + if (gtt->nr_ranges >= MAX_NR_VMAS) {

This will lead to a failure when you have exactly 16 VMAs. If you move 
the check to the start of the loop, it will only trigger when you exceed 
the limit, not just after you reach it.
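
A minimal sketch of the reordered loop, checking the limit before find_vma()
so that exactly MAX_NR_VMAS VMAs still succeed and only one more triggers the
error (the v2 of this patch earlier in this archive adopts this order):

	gtt->nr_ranges = 0;
	do {
		if (gtt->nr_ranges >= MAX_NR_VMAS) {
			DRM_ERROR("Too many VMAs in userptr range\n");
			r = -EFAULT;
			goto out;
		}

		vma = find_vma(mm, vma ? vma->vm_end : start);
		if (unlikely(!vma)) {
			r = -EFAULT;
			goto out;
		}
		vmas[gtt->nr_ranges++] = vma;
	} while (end > vma->vm_end);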


> + DRM_ERROR("invalid userptr range\n");

The userptr range is not really invalid. It only exceeds some artificial 
limitation in this code. A message like "Too many VMAs in userptr range" 
would be more appropriate.


> + r = -EFAULT;
> + goto out;
> + }
> + } while (end > vma->vm_end);
> +
> + DRM_DEBUG_DRIVER("0x%lx nr_ranges %d pages 0x%lx\n",
> + start, gtt->nr_ranges, ttm->num_pages);
> +
> + if (unlikely((gtt->userflags & AMDGPU_GEM_USERPTR_ANONONLY) &&
> + vmas[0]->vm_file)) {
>   r = -EPERM;
> - if (r)
>   goto out;
> + }
>   
> - range->pfns = kvmalloc_array(ttm->num_pages, sizeof(uint64_t),
> -  GFP_KERNEL);
> - if (range->pfns == NULL) {
> + ranges = kvmalloc_array(gtt->nr_ranges, sizeof(*ranges), GFP_KERNEL);
> + if (unlikely(!ranges)) {
>   r = -ENOMEM;
>   goto out;
>   }
> - range->start = gtt->userptr;
> - range->end = end;
>   
> - range->pfns[0] = range->flags[HMM_PFN_VALID];
> - range->pfns[0] |= amdgpu_ttm_tt_is_readonly(ttm) ?
> - 0 : range->flags[HMM_PFN_WRITE];
> - for (i = 1; i < ttm->num_pages; i++)
> - range->pfns[i] = range->pfns[0];
> + pfns = kvmalloc_array(ttm->num_pages, sizeof(*pfns), GFP_KERNEL);
> + if (unlikely(!pfns)) {
> + r = -ENOMEM;
> + goto out_free_ranges;
> + }
> +
> + for (i = 0; i < gtt->nr_ranges; i++)
> + amdgpu_hmm_init_range([i]);
> +
> + f = ranges[0].flags[HMM_PFN_VALID];
> + f |= amdgpu_ttm_tt_is_readonly(ttm) ?
> + 0 : ranges[0].flags[HMM_PFN_WRITE];
> + memset64(pfns, f, ttm->num_pages);
> +
> + 

Re: [PATCH 1/3] drm/amdkfd: support concurrent userptr update for HMM

2019-03-06 Thread Kuehling, Felix
Hmm, I'm not sure. This change probably fixes this issue, but there may 
be other similar corner cases in other situations where the restore 
worker fails and needs to retry. The better place to call untrack in  
amdgpu_amdkfd_restore_userptr_worker would be at the very end. Anything 
that's left in the userptr_inval_list at that point needs to be untracked.
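
A minimal sketch of that, assuming the unlock_out label at the end of
amdgpu_amdkfd_restore_userptr_worker (the v2 of this patch earlier in this
archive moves the call exactly there):

 unlock_out:
	/* Untrack whatever is still on the userptr_inval_list, no matter
	 * which path brought us here; a later invalidation will re-add the
	 * BOs and reschedule this worker.
	 */
	untrack_invalid_user_pages(process_info);
	mutex_unlock(&process_info->lock);
	mmput(mm);
	put_task_struct(usertask);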

For now as a quick fix for an urgent bug, this change is Reviewed-by: 
Felix Kuehling . But please revisit this and 
check if there are similar corner cases as I explained above.

Regards,
   Felix

On 3/5/2019 1:09 PM, Yang, Philip wrote:
> Userptr restore may race with a concurrent userptr invalidation after
> hmm_vma_fault adds the range to the hmm->ranges list; we need to call
> hmm_vma_range_done to remove the range from the hmm->ranges list first,
> then reschedule the restore worker. Otherwise hmm_vma_fault will add the
> same range to the list again, causing a loop in the list because
> range->next points to range itself.
>
> Add function untrack_invalid_user_pages to reduce code duplication.
>
> Change-Id: I31407739dc10554f8e418c7a0e0415d3d95552f1
> Signed-off-by: Philip Yang 
> ---
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 28 ++-
>   1 file changed, 21 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 314c048fcac6..783d760ccfe3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -1731,6 +1731,23 @@ static int update_invalid_user_pages(struct 
> amdkfd_process_info *process_info,
>   return 0;
>   }
>   
> +/* Remove invalid userptr BOs from hmm track list
> + *
> + * Stop HMM track the userptr update
> + */
> +static void untrack_invalid_user_pages(struct amdkfd_process_info 
> *process_info)
> +{
> + struct kgd_mem *mem, *tmp_mem;
> + struct amdgpu_bo *bo;
> +
> + list_for_each_entry_safe(mem, tmp_mem,
> +  _info->userptr_inval_list,
> +  validate_list.head) {
> + bo = mem->bo;
> + amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
> + }
> +}
> +
>   /* Validate invalid userptr BOs
>*
>* Validates BOs on the userptr_inval_list, and moves them back to the
> @@ -1848,12 +1865,7 @@ static int validate_invalid_user_pages(struct 
> amdkfd_process_info *process_info)
>   out_free:
>   kfree(pd_bo_list_entries);
>   out_no_mem:
> - list_for_each_entry_safe(mem, tmp_mem,
> -  _info->userptr_inval_list,
> -  validate_list.head) {
> - bo = mem->bo;
> - amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
> - }
> + untrack_invalid_user_pages(process_info);
>   
>   return ret;
>   }
> @@ -1897,8 +1909,10 @@ static void 
> amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
>* and we can just restart the queues.
>*/
>   if (!list_empty(_info->userptr_inval_list)) {
> - if (atomic_read(_info->evicted_bos) != evicted_bos)
> + if (atomic_read(_info->evicted_bos) != evicted_bos) {
> + untrack_invalid_user_pages(process_info);
>   goto unlock_out; /* Concurrent eviction, try again */
> + }
>   
>   if (validate_invalid_user_pages(process_info))
>   goto unlock_out;
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH] drm/amdkfd: Add curly braces around idr_for_each_entry_continue loop

2019-03-05 Thread Kuehling, Felix
On 2019-03-05 6:20 a.m., Michel Dänzer wrote:
> From: Michel Dänzer 
>
> The compiler pointed out that one if block unintentionally wasn't part
> of the loop:
>
> In file included from ./include/linux/kernfs.h:14,
>   from ./include/linux/sysfs.h:16,
>   from ./include/linux/kobject.h:20,
>   from ./include/linux/device.h:16,
>   from ./include/linux/node.h:18,
>   from ./include/linux/memory.h:19,
>   from drivers/gpu/drm//amd/amdgpu/../amdkfd/kfd_events.c:30:
> drivers/gpu/drm//amd/amdgpu/../amdkfd/kfd_events.c: In function 
> ‘kfd_signal_reset_event’:
> ./include/linux/idr.h:212:2: warning: this ‘for’ clause does not guard... 
> [-Wmisleading-indentation]
>for ((entry) = idr_get_next((idr), &(id));   \
>^~~
> drivers/gpu/drm//amd/amdgpu/../amdkfd/kfd_events.c:1038:3: note: in expansion 
> of macro ‘idr_for_each_entry_continue’
> idr_for_each_entry_continue(>event_idr, ev, id)
> ^~~
> drivers/gpu/drm//amd/amdgpu/../amdkfd/kfd_events.c:1043:4: note: ...this 
> statement, but the latter is misleadingly indented as if it were guarded by 
> the ‘for’
>  if (ev->type == KFD_EVENT_TYPE_MEMORY &&
>  ^~
>
> Fixes: "drm/amdkfd: add RAS ECC event support"
> Signed-off-by: Michel Dänzer 

Reviewed-by: Felix Kuehling 


> ---
>
> This is one reason why I think it's better to always use curly braces
> around multiple lines, even if it happens to be a single statement.
>
>   drivers/gpu/drm/amd/amdkfd/kfd_events.c | 3 ++-
>   1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> index 97c984684973..6e1d41c5bf86 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> @@ -1035,7 +1035,7 @@ void kfd_signal_reset_event(struct kfd_dev *dev)
>   hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
>   mutex_lock(>event_mutex);
>   id = KFD_FIRST_NONSIGNAL_EVENT_ID;
> - idr_for_each_entry_continue(>event_idr, ev, id)
> + idr_for_each_entry_continue(>event_idr, ev, id) {
>   if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
>   ev->hw_exception_data = hw_exception_data;
>   set_event(ev);
> @@ -1045,6 +1045,7 @@ void kfd_signal_reset_event(struct kfd_dev *dev)
>   ev->memory_exception_data = 
> memory_exception_data;
>   set_event(ev);
>   }
> + }
>   mutex_unlock(>event_mutex);
>   }
>   srcu_read_unlock(_processes_srcu, idx);

Re: [PATCH 1/2] drm/amdgpu: rework shadow handling during PD clear v3

2019-03-04 Thread Kuehling, Felix
One not so obvious change here: The fence on the page table after 
clear_bo now waits for clearing both the page table and the shadow. That 
may make clearing of page tables appear a bit slower. On the other hand, 
if you're clearing a bunch of page tables at once, then the difference will 
be minimal because clearing the second page table will have to wait for 
clearing the first shadow either way.

If that is acceptable, then the series is Reviewed-by: Felix Kuehling 


Regards,
   Felix

On 2019-03-04 11:28 a.m., Christian König wrote:
> This way we only deal with the real BO in here.
>
> v2: use a do { ... } while loop instead
> v3: fix NULL pointer in v2
>
> Signed-off-by: Christian König 
> Acked-by: Huang Rui 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 67 +++---
>   1 file changed, 39 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 12d51d96491e..d9a0ac14c4ca 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -788,44 +788,61 @@ static int amdgpu_vm_clear_bo(struct amdgpu_device 
> *adev,
>   
>   r = ttm_bo_validate(>tbo, >placement, );
>   if (r)
> - goto error;
> + return r;
>   
>   r = amdgpu_ttm_alloc_gart(>tbo);
>   if (r)
>   return r;
>   
> + if (bo->shadow) {
> + r = ttm_bo_validate(>shadow->tbo, >shadow->placement,
> + );
> + if (r)
> + return r;
> +
> + r = amdgpu_ttm_alloc_gart(>shadow->tbo);
> + if (r)
> + return r;
> +
> + }
> +
>   r = amdgpu_job_alloc_with_ib(adev, 64, );
>   if (r)
> - goto error;
> + return r;
> +
> + do {
> + addr = amdgpu_bo_gpu_offset(bo);
> + if (ats_entries) {
> + uint64_t ats_value;
>   
> - addr = amdgpu_bo_gpu_offset(bo);
> - if (ats_entries) {
> - uint64_t ats_value;
> + ats_value = AMDGPU_PTE_DEFAULT_ATC;
> + if (level != AMDGPU_VM_PTB)
> + ats_value |= AMDGPU_PDE_PTE;
>   
> - ats_value = AMDGPU_PTE_DEFAULT_ATC;
> - if (level != AMDGPU_VM_PTB)
> - ats_value |= AMDGPU_PDE_PTE;
> + amdgpu_vm_set_pte_pde(adev, >ibs[0], addr, 0,
> +   ats_entries, 0, ats_value);
> + addr += ats_entries * 8;
> + }
>   
> - amdgpu_vm_set_pte_pde(adev, >ibs[0], addr, 0,
> -   ats_entries, 0, ats_value);
> - addr += ats_entries * 8;
> - }
> + if (entries) {
> + uint64_t value = 0;
>   
> - if (entries) {
> - uint64_t value = 0;
> + /* Workaround for fault priority problem on GMC9 */
> + if (level == AMDGPU_VM_PTB &&
> + adev->asic_type >= CHIP_VEGA10)
> + value = AMDGPU_PTE_EXECUTABLE;
>   
> - /* Workaround for fault priority problem on GMC9 */
> - if (level == AMDGPU_VM_PTB && adev->asic_type >= CHIP_VEGA10)
> - value = AMDGPU_PTE_EXECUTABLE;
> + amdgpu_vm_set_pte_pde(adev, >ibs[0], addr, 0,
> +   entries, 0, value);
> + }
>   
> - amdgpu_vm_set_pte_pde(adev, >ibs[0], addr, 0,
> -   entries, 0, value);
> - }
> + bo = bo->shadow;
> + } while (bo);
>   
>   amdgpu_ring_pad_ib(ring, >ibs[0]);
>   
>   WARN_ON(job->ibs[0].length_dw > 64);
> - r = amdgpu_sync_resv(adev, >sync, bo->tbo.resv,
> + r = amdgpu_sync_resv(adev, >sync, vm->root.base.bo->tbo.resv,
>AMDGPU_FENCE_OWNER_KFD, false);
>   if (r)
>   goto error_free;
> @@ -835,19 +852,13 @@ static int amdgpu_vm_clear_bo(struct amdgpu_device 
> *adev,
>   if (r)
>   goto error_free;
>   
> - amdgpu_bo_fence(bo, fence, true);
> + amdgpu_bo_fence(vm->root.base.bo, fence, true);
>   dma_fence_put(fence);
>   
> - if (bo->shadow)
> - return amdgpu_vm_clear_bo(adev, vm, bo->shadow,
> -   level, pte_support_ats);
> -
>   return 0;
>   
>   error_free:
>   amdgpu_job_free(job);
> -
> -error:
>   return r;
>   }
>   

RE: [PATCH] drm/amdgpu: handle userptr corner cases with HMM path

2019-03-01 Thread Kuehling, Felix
Since you're addressing two distinct bugs, please split this into two patches.

For the multiple VMAs, should we generalize that to handle any number of VMAs? 
It's not a typical case, but you could easily construct situations with 
mprotect where different parts of the same buffer have different VMAs and then 
register that as a single user pointer. Or you could use MAP_FIXED to map 
multiple files to adjacent virtual addresses.

There may be two ways to handle this:
1. If the userptr address range spans more than one VMA, fail
2. Loop over all the VMAs in the address range
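
A rough sketch of option 2, just to illustrate the idea (the helper name
and the per-VMA callback are made up; find_vma() and the vma fields are
the normal mm APIs, and the caller would hold mmap_sem for read):

static int userptr_for_each_vma(struct mm_struct *mm,
				unsigned long start, unsigned long end,
				int (*handler)(struct vm_area_struct *vma,
					       unsigned long start,
					       unsigned long end))
{
	unsigned long addr = start;

	while (addr < end) {
		struct vm_area_struct *vma = find_vma(mm, addr);
		unsigned long last;
		int r;

		/* Bail out if part of the range is not mapped at all */
		if (!vma || addr < vma->vm_start)
			return -EFAULT;

		last = min(end, vma->vm_end);
		r = handler(vma, addr, last);
		if (r)
			return r;

		addr = last;
	}

	return 0;
}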

Thanks,
  Felix

-Original Message-
From: amd-gfx  On Behalf Of Yang, Philip
Sent: Friday, March 01, 2019 12:30 PM
To: amd-gfx@lists.freedesktop.org
Cc: Yang, Philip 
Subject: [PATCH] drm/amdgpu: handle userptr corner cases with HMM path

Those corner cases are found by kfdtest.KFDIPCTest.

A userptr may cross two VMAs if a forked child process (that did not
call exec after fork) frees the malloc'ed buffer and then mallocs a
larger one: the kernel creates a new VMA adjacent to the old VMA that
was cloned from the parent process, so some pages of the userptr are in
the first VMA and the rest are in the second VMA. HMM expects a range
to cover only one VMA, so we have to use two ranges to handle this
case. See is_mergeable_anon_vma in mm/mmap.c for details.

KFD userptr restore may race with a concurrent userptr invalidation.
Before rescheduling the restore we need to call hmm_vma_range_done to
remove the range from the hmm->ranges list, otherwise hmm_vma_fault
will add the same range to the list again and create a loop in the
list because range->next points to the range itself.

Change-Id: I641ba7406c32bd8b7ae715f52bd896d53fe56801
Signed-off-by: Philip Yang 
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 28 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   | 73 +--
 2 files changed, 71 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index f8104760f1e6..179af9d3ab19 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1738,6 +1738,23 @@ static int update_invalid_user_pages(struct 
amdkfd_process_info *process_info,
return 0;
 }
 
+/* Untrack invalid userptr BOs
+ *
+ * Stop HMM track the userptr update
+ */
+static void untrack_invalid_user_pages(struct amdkfd_process_info 
*process_info)
+{
+   struct kgd_mem *mem, *tmp_mem;
+   struct amdgpu_bo *bo;
+
+   list_for_each_entry_safe(mem, tmp_mem,
+_info->userptr_inval_list,
+validate_list.head) {
+   bo = mem->bo;
+   amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
+   }
+}
+
 /* Validate invalid userptr BOs
  *
  * Validates BOs on the userptr_inval_list, and moves them back to the
@@ -1855,12 +1872,7 @@ static int validate_invalid_user_pages(struct 
amdkfd_process_info *process_info)
 out_free:
kfree(pd_bo_list_entries);
 out_no_mem:
-   list_for_each_entry_safe(mem, tmp_mem,
-_info->userptr_inval_list,
-validate_list.head) {
-   bo = mem->bo;
-   amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
-   }
+   untrack_invalid_user_pages(process_info);
 
return ret;
 }
@@ -1904,8 +1916,10 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct 
work_struct *work)
 * and we can just restart the queues.
 */
if (!list_empty(_info->userptr_inval_list)) {
-   if (atomic_read(_info->evicted_bos) != evicted_bos)
+   if (atomic_read(_info->evicted_bos) != evicted_bos) {
+   untrack_invalid_user_pages(process_info);
goto unlock_out; /* Concurrent eviction, try again */
+   }
 
if (validate_invalid_user_pages(process_info))
goto unlock_out;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index cd0ccfbbcb84..e5736225f513 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -711,7 +711,7 @@ struct amdgpu_ttm_tt {
struct task_struct  *usertask;
uint32_tuserflags;
 #if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
-   struct hmm_rangerange;
+   struct hmm_rangerange, range2;
 #endif
 };
 
@@ -727,58 +727,81 @@ int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, 
struct page **pages)
 {
struct amdgpu_ttm_tt *gtt = (void *)ttm;
struct mm_struct *mm = gtt->usertask->mm;
-   unsigned long end = gtt->userptr + ttm->num_pages * PAGE_SIZE;
+   unsigned long start = gtt->userptr;
+   unsigned long end = start + ttm->num_pages * PAGE_SIZE;
struct hmm_range *range = >range;
+   struct hmm_range *range2 = >range2;
+   struct 

Re: [PATCH] drm/amdgpu: Add sysfs files for returning VRAM/GTT info

2019-02-28 Thread Kuehling, Felix
On 2/28/2019 9:56 AM, Christian König wrote:
> Am 28.02.19 um 16:32 schrieb Russell, Kent:
>> Add 3 files that return:
>> The total amount of VRAM and the current total used VRAM
>> The total amount of VRAM and the current total used visible VRAM
>> The total GTT size and the current total of used GTT
>>
>> Each returns 2 integers, total and used, in bytes
>
> Well that is a good start, but unfortunately violates the rules for 
> sysfs. You need to return one value per file.

Is this rule written down anywhere? I see that space-separated lists of 
things are common. E.g. scaling_available_governors in the cpufreq 
directories.

In Documentation/admin-guide/sysfs-rules.rst I don't see any rule about 
a single value per file. Maybe that's because those rules are more about 
user-mode usage of sysfs than about kernel implementations.
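
That said, splitting this into one value per file would be easy enough.
Something like this, just as a sketch (the attribute name is made up for
illustration, based on Kent's patch below):

static ssize_t amdgpu_mem_info_gtt_used_show(struct device *dev,
					     struct device_attribute *attr,
					     char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;

	return snprintf(buf, PAGE_SIZE, "%llu\n",
			amdgpu_gtt_mgr_usage(&adev->mman.bdev.man[TTM_PL_TT]));
}

static DEVICE_ATTR(mem_info_gtt_used, S_IRUGO,
		   amdgpu_mem_info_gtt_used_show, NULL);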

Regards,
   Felix


>
> So you should create 6 files in total.
>
> Regards,
> Christian.
>
>>
>> Change-Id: I0bd702b166b4253887ef76fb1bba8b9aadc7e2c5
>> Signed-off-by: Kent Russell 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c  | 36 +++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 67 
>>   2 files changed, 103 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
>> index da7b1b92d9cf..adfa211c5152 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
>> @@ -36,6 +36,30 @@ struct amdgpu_gtt_node {
>>   struct ttm_buffer_object *tbo;
>>   };
>>   +/**
>> + * DOC: mem_info_gtt
>> + *
>> + * The amdgpu driver provides a sysfs API for reporting current GTT 
>> information
>> + * The file mem_info_gtt is used for this.
>> + * The file returns the total size of the GTT block and the current 
>> amount of
>> + * used GTT as 2 separate integers, in bytes
>> + */
>> +static ssize_t amdgpu_mem_info_gtt_show(struct device *dev,
>> +    struct device_attribute *attr, char *buf)
>> +{
>> +    struct drm_device *ddev = dev_get_drvdata(dev);
>> +    struct amdgpu_device *adev = ddev->dev_private;
>> +    uint64_t used_gtt, total_gtt;
>> +
>> +    used_gtt = amdgpu_gtt_mgr_usage(>mman.bdev.man[TTM_PL_TT]);
>> +    total_gtt = (adev->mman.bdev.man[TTM_PL_TT].size) * PAGE_SIZE;
>> +
>> +    return snprintf(buf, PAGE_SIZE, "%llu %llu\n",
>> +  total_gtt, used_gtt);
>> +}
>> +
>> +static DEVICE_ATTR(mem_info_gtt, S_IRUGO, amdgpu_mem_info_gtt_show, 
>> NULL);
>> +
>>   /**
>>    * amdgpu_gtt_mgr_init - init GTT manager and DRM MM
>>    *
>> @@ -50,6 +74,7 @@ static int amdgpu_gtt_mgr_init(struct 
>> ttm_mem_type_manager *man,
>>   struct amdgpu_device *adev = amdgpu_ttm_adev(man->bdev);
>>   struct amdgpu_gtt_mgr *mgr;
>>   uint64_t start, size;
>> +    int ret;
>>     mgr = kzalloc(sizeof(*mgr), GFP_KERNEL);
>>   if (!mgr)
>> @@ -61,6 +86,13 @@ static int amdgpu_gtt_mgr_init(struct 
>> ttm_mem_type_manager *man,
>>   spin_lock_init(>lock);
>>   atomic64_set(>available, p_size);
>>   man->priv = mgr;
>> +
>> +    ret = device_create_file(adev->dev, _attr_mem_info_gtt);
>> +    if (ret) {
>> +    DRM_ERROR("Failed to create device file mem_info_gtt\n");
>> +    return ret;
>> +    }
>> +
>>   return 0;
>>   }
>>   @@ -74,12 +106,16 @@ static int amdgpu_gtt_mgr_init(struct 
>> ttm_mem_type_manager *man,
>>    */
>>   static int amdgpu_gtt_mgr_fini(struct ttm_mem_type_manager *man)
>>   {
>> +    struct amdgpu_device *adev = amdgpu_ttm_adev(man->bdev);
>>   struct amdgpu_gtt_mgr *mgr = man->priv;
>>   spin_lock(>lock);
>>   drm_mm_takedown(>mm);
>>   spin_unlock(>lock);
>>   kfree(mgr);
>>   man->priv = NULL;
>> +
>> +    device_remove_file(adev->dev, _attr_mem_info_gtt);
>> +
>>   return 0;
>>   }
>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> index 3f9d5d00c9b3..d0bada997cba 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
>> @@ -32,6 +32,55 @@ struct amdgpu_vram_mgr {
>>   atomic64_t vis_usage;
>>   };
>>   +/**
>> + * DOC: mem_info_vram
>> + *
>> + * The amdgpu driver provides a sysfs API for reporting current VRAM 
>> information
>> + * The file mem_info_vram is used for this.
>> + * The file returns the total amount of VRAM and the current total 
>> amount of
>> + * used VRAM as 2 separate integers, in bytes
>> + */
>> +static ssize_t amdgpu_mem_info_vram_show(struct device *dev,
>> +    struct device_attribute *attr, char *buf)
>> +{
>> +    struct drm_device *ddev = dev_get_drvdata(dev);
>> +    struct amdgpu_device *adev = ddev->dev_private;
>> +    uint64_t used_vram, total_vram;
>> +
>> +    used_vram = 
>> amdgpu_vram_mgr_usage(>mman.bdev.man[TTM_PL_VRAM]);
>> +    total_vram = adev->gmc.real_vram_size;
>> +
>> +    return snprintf(buf, PAGE_SIZE, "%llu %llu\n", total_vram, 
>> 

Re: [PATCH 1/1] drm/ttm: Account for kernel allocations in kernel zone only

2019-02-25 Thread Kuehling, Felix
On 2/25/2019 2:58 PM, Thomas Hellstrom wrote:
> On Mon, 2019-02-25 at 14:20 +, Koenig, Christian wrote:
>> Am 23.02.19 um 00:19 schrieb Kuehling, Felix:
>>> Don't account for them in other zones such as dma32. The kernel
>>> page
>>> allocator has its own heuristics to avoid exhausting special zones
>>> for regular kernel allocations.
>>>
>>> Signed-off-by: Felix Kuehling 
>>> CC: thellst...@vmware.com
>>> CC: christian.koe...@amd.com
>> Reviewed-by: Christian König 
> Hmm,
>
> So actually with this patch we theoretically still can exhaust the
> DMA32 zone by first performing kernel allocations that DO spare a
> number of DMA32 pages  according to the kernel allocator heuristics,
> and then populate TTM buffers with DMA32 pages only.
>
> However, since vmwgfx bo's don't request DMA32 pages, we're OK with
> this, and it's really up to Christian to decide. So:
>
> Acked-by: Thomas Hellstrom 

Thanks. What's the best branch to submit this to? My patches were based 
on amd-staging-drm-next.

Regards,
   Felix

>
> Thanks,
> Thomas
>
>
>
>
>
>
>
>
>>> ---
>>>drivers/gpu/drm/ttm/ttm_memory.c | 6 +++---
>>>1 file changed, 3 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/ttm/ttm_memory.c
>>> b/drivers/gpu/drm/ttm/ttm_memory.c
>>> index f1567c3..90d1e24 100644
>>> --- a/drivers/gpu/drm/ttm/ttm_memory.c
>>> +++ b/drivers/gpu/drm/ttm/ttm_memory.c
>>> @@ -522,7 +522,7 @@ static void ttm_mem_global_free_zone(struct
>>> ttm_mem_global *glob,
>>>void ttm_mem_global_free(struct ttm_mem_global *glob,
>>>  uint64_t amount)
>>>{
>>> -   return ttm_mem_global_free_zone(glob, NULL, amount);
>>> +   return ttm_mem_global_free_zone(glob, glob->zone_kernel,
>>> amount);
>>>}
>>>EXPORT_SYMBOL(ttm_mem_global_free);
>>>
>>> @@ -621,10 +621,10 @@ int ttm_mem_global_alloc(struct
>>> ttm_mem_global *glob, uint64_t memory,
>>>{
>>> /**
>>>  * Normal allocations of kernel memory are registered in
>>> -* all zones.
>>> +* the kernel zone.
>>>  */
>>>
>>> -   return ttm_mem_global_alloc_zone(glob, NULL, memory, ctx);
>>> +   return ttm_mem_global_alloc_zone(glob, glob->zone_kernel,
>>> memory, ctx);
>>>}
>>>EXPORT_SYMBOL(ttm_mem_global_alloc);
>>>

[PATCH 1/1] drm/ttm: Account for kernel allocations in kernel zone only

2019-02-22 Thread Kuehling, Felix
Don't account for them in other zones such as dma32. The kernel page
allocator has its own heuristics to avoid exhausting special zones
for regular kernel allocations.

Signed-off-by: Felix Kuehling 
CC: thellst...@vmware.com
CC: christian.koe...@amd.com
---
 drivers/gpu/drm/ttm/ttm_memory.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_memory.c b/drivers/gpu/drm/ttm/ttm_memory.c
index f1567c3..90d1e24 100644
--- a/drivers/gpu/drm/ttm/ttm_memory.c
+++ b/drivers/gpu/drm/ttm/ttm_memory.c
@@ -522,7 +522,7 @@ static void ttm_mem_global_free_zone(struct ttm_mem_global 
*glob,
 void ttm_mem_global_free(struct ttm_mem_global *glob,
 uint64_t amount)
 {
-   return ttm_mem_global_free_zone(glob, NULL, amount);
+   return ttm_mem_global_free_zone(glob, glob->zone_kernel, amount);
 }
 EXPORT_SYMBOL(ttm_mem_global_free);
 
@@ -621,10 +621,10 @@ int ttm_mem_global_alloc(struct ttm_mem_global *glob, 
uint64_t memory,
 {
/**
 * Normal allocations of kernel memory are registered in
-* all zones.
+* the kernel zone.
 */
 
-   return ttm_mem_global_alloc_zone(glob, NULL, memory, ctx);
+   return ttm_mem_global_alloc_zone(glob, glob->zone_kernel, memory, ctx);
 }
 EXPORT_SYMBOL(ttm_mem_global_alloc);
 
-- 
2.7.4


Re: [PATCH 1/1] [RFC] drm/ttm: Don't init dma32_zone on 64-bit systems

2019-02-22 Thread Kuehling, Felix
On 2019-02-22 8:45 a.m., Thomas Hellstrom wrote:
> On Fri, 2019-02-22 at 07:10 +, Koenig, Christian wrote:
>> Am 21.02.19 um 22:02 schrieb Thomas Hellstrom:
>>> Hi,
>>>
>>> On Thu, 2019-02-21 at 20:24 +, Kuehling, Felix wrote:
>>>> On 2019-02-21 12:34 p.m., Thomas Hellstrom wrote:
>>>>> On Thu, 2019-02-21 at 16:57 +, Kuehling, Felix wrote:
>>>>>> On 2019-02-21 2:59 a.m., Koenig, Christian wrote:
>>>>>>> On x86 with HIGHMEM there is no dma32 zone. Why do we need
>>>>>>> one
>>>>>>> on
>>>>>>>>> x86_64? Can we make x86_64 more like HIGHMEM instead?
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Felix
>>>>>>>>>
>>>>>>>> IIRC with x86, the kernel zone is always smaller than any
>>>>>>>> dma32
>>>>>>>> zone,
>>>>>>>> so we'd always exhaust the kernel zone before dma32
>>>>>>>> anyway.
>>>>>>>>
>>>>>>>> Not sure why we have dma32 on x86 without highmem,
>>>>>>>> though.
>>>>>>>> sounds
>>>>>>>> superflous but harmless.
>>>>>>> Well DMA32 denotes memory which is accessible by devices
>>>>>>> who
>>>>>>> can
>>>>>>> only do
>>>>>>> 32bit addressing. And IIRC we can actually do DMA32 to
>>>>>>> highmem
>>>>>>> since
>>>>>>> something like 2.4.*.
>>>>>>>
>>>>>>> Because of this it is actually irrelevant if you have
>>>>>>> highmem
>>>>>>> or
>>>>>>> not,
>>>>>>> what matters for DMA32 is if you have an IOMMU or not.
>>>>>> Are you saying we should have a dma32_zone even on x86 with
>>>>>> HIGHMEM?
>>>>>>
>>>>>>
>>>>>>> So even on x86_64 you actually do need the DMA32 zone if
>>>>>>> you
>>>>>>> don't
>>>>>>> have
>>>>>>> an IOMMU which remaps all memory for devices which can't
>>>>>>> directly
>>>>>>> address it.
>>>>>> Why is DMA32 special in this way? For example AMD GFX8 GPUs
>>>>>> support
>>>>>> 40-bit DMA. But we don't have a special zone for that.
>>>>> If you're running on a non-IOMMU system with physical memory
>>>>> addresses
>>>>>> 40 bits, and tell the DMA subsystem that you need to restrict
>>>>>> to
>>>>>> 40
>>>>> bits, it will probably start using bounce buffers for streaming
>>>>> DMA
>>>>> (which won't work with most graphics drivers), or for
>>>>> dma_alloc_coherent(), it will probably use memory from the
>>>>> DMA32
>>>>> zone.
>>>> OK, then why is it not needed when CONFIG_HIGHMEM is defined?
>>>>
>>>> I found that there is a CONFIG_ZONE_DMA32 parameter. Maybe we
>>>> should
>>>> use
>>>> that to decide whether to account for the DMA32 zone in TTM. It
>>>> is
>>>> set
>>>> on x86_64 and a number of other 64-bit architectures.
>>>>
>>>>
>>>>>> How common is it to have devices that need DMA32 on an x86_64
>>>>>> system?
>>>>> IIRC All devices using dma_alloc_coherent() allocate DMA32
>>>>> memory
>>>>> unless they explicitly set the dma coherent mask to something
>>>>> larger.
>>>>> Like Christian says, if an IOMMU is present and enabled, the
>>>>> need
>>>>> for
>>>>> the DMA32 zone goes away. In theory at least.
>>>> Thanks. I read up a bit on DMA32 and memory zones in general. I
>>>> found
>>>> that there is a lowmem_reserve_ratio feature that protects
>>>> against
>>>> normal page allocations overflowing into lowmem zones. There is
>>>> some
>>>> documentation in Documentation/sysctl/vm.txt (search for
>>>> lowmem_reserve_ratio). The protected amount of memory in each
>>>> zone
>>>> can
>>>> be seen in /proc/zoneinfo.
>>>>
>>>> With that, can we conclude that we don't need to count
>>>> ttm_mem_global_alloc against the dma32 zone.
>>> Yes, it indeed looks like that.
>>> But then I would suggest removing the DMA32 zone entirely.
>> We still need it for the pages we allocate, but we should just stop
>> accounting all the housekeeping to it.
> Why is that? Can't we just account all pages in the kernel zone, and
> leave it up to the kernel to make sure there are still DMA32 pages
> left?

ttm_page_alloc and ttm_page_alloc_dma support allocating from DMA32 
explicitly (setting GFP_DMA32). Such allocations could exhaust DMA32 
memory, which TTM should prevent by limiting its DMA32 usage. This would 
still be counted against the dma32 zone by ttm_mem_global_alloc_page.
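
For reference, that path boils down to something like this in the page
allocators (paraphrased, not a literal quote):

	gfp_t gfp_flags = GFP_USER;

	/* BOs created with TTM_PAGE_FLAG_DMA32 get their backing pages
	 * from ZONE_DMA32, everything else can use highmem */
	if (flags & TTM_PAGE_FLAG_DMA32)
		gfp_flags |= GFP_DMA32;
	else
		gfp_flags |= GFP_HIGHUSER;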

I'll send out a new patch that counts general kernel allocations against 
the kernel zone only. I hope this would be acceptable.

Regards,
   Felix


>
> /Thomas
>

Re: [PATCH 1/1] [RFC] drm/ttm: Don't init dma32_zone on 64-bit systems

2019-02-21 Thread Kuehling, Felix

On 2019-02-21 12:34 p.m., Thomas Hellstrom wrote:
> On Thu, 2019-02-21 at 16:57 +0000, Kuehling, Felix wrote:
>> On 2019-02-21 2:59 a.m., Koenig, Christian wrote:
>>> On x86 with HIGHMEM there is no dma32 zone. Why do we need one on
>>>>> x86_64? Can we make x86_64 more like HIGHMEM instead?
>>>>>
>>>>> Regards,
>>>>>   Felix
>>>>>
>>>> IIRC with x86, the kernel zone is always smaller than any dma32
>>>> zone,
>>>> so we'd always exhaust the kernel zone before dma32 anyway.
>>>>
>>>> Not sure why we have dma32 on x86 without highmem, though. sounds
>>>> superflous but harmless.
>>> Well DMA32 denotes memory which is accessible by devices who can
>>> only do
>>> 32bit addressing. And IIRC we can actually do DMA32 to highmem
>>> since
>>> something like 2.4.*.
>>>
>>> Because of this it is actually irrelevant if you have highmem or
>>> not,
>>> what matters for DMA32 is if you have an IOMMU or not.
>> Are you saying we should have a dma32_zone even on x86 with HIGHMEM?
>>
>>
>>> So even on x86_64 you actually do need the DMA32 zone if you don't
>>> have
>>> an IOMMU which remaps all memory for devices which can't directly
>>> address it.
>> Why is DMA32 special in this way? For example AMD GFX8 GPUs support
>> 40-bit DMA. But we don't have a special zone for that.
> If you're running on a non-IOMMU system with physical memory addresses
>> 40 bits, and tell the DMA subsystem that you need to restrict to 40
> bits, it will probably start using bounce buffers for streaming DMA
> (which won't work with most graphics drivers), or for
> dma_alloc_coherent(), it will probably use memory from the DMA32 zone.

OK, then why is it not needed when CONFIG_HIGHMEM is defined?

I found that there is a CONFIG_ZONE_DMA32 parameter. Maybe we should use 
that to decide whether to account for the DMA32 zone in TTM. It is set 
on x86_64 and a number of other 64-bit architectures.
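
As a sketch, the zone setup in ttm_mem_global_init could then look
roughly like this (illustrative only; the existing helpers are guarded
by CONFIG_HIGHMEM today, so their definitions would need the same
treatment):

#ifdef CONFIG_HIGHMEM
	ret = ttm_mem_init_highmem_zone(glob, &si);
	if (unlikely(ret != 0))
		goto out_no_zone;
#elif defined(CONFIG_ZONE_DMA32)
	ret = ttm_mem_init_dma32_zone(glob, &si);
	if (unlikely(ret != 0))
		goto out_no_zone;
#endif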


>> How common is it to have devices that need DMA32 on an x86_64 system?
> IIRC All devices using dma_alloc_coherent() allocate DMA32 memory
> unless they explicitly set the dma coherent mask to something larger.
> Like Christian says, if an IOMMU is present and enabled, the need for
> the DMA32 zone goes away. In theory at least.

Thanks. I read up a bit on DMA32 and memory zones in general. I found 
that there is a lowmem_reserve_ratio feature that protects against 
normal page allocations overflowing into lowmem zones. There is some 
documentation in Documentation/sysctl/vm.txt (search for 
lowmem_reserve_ratio). The protected amount of memory in each zone can 
be seen in /proc/zoneinfo.

With that, can we conclude that we don't need to count 
ttm_mem_global_alloc against the dma32 zone.

Thanks,
   Felix


>
> /Thomas
>
>
>> Regards,
>> Felix
>>
>>
>>> Regards,
>>> Christian.
>>>
>>>> /Thomas
>>>>
>>>>

Re: [PATCH] drm/amdgpu: fix HMM config dependency issue

2019-02-21 Thread Kuehling, Felix
On 2019-02-21 12:48 p.m., Yang, Philip wrote:
> Only select HMM_MIRROR will get kernel config dependency warnings
> if CONFIG_HMM is missing in the config. Add depends on HMM will
> solve the issue.
>
> Add conditional compilation to fix compilation errors if HMM_MIRROR
> is not enabled as HMM config is not enabled.
>
> Change-Id: I1b44a0b5285bbef5e98bfb045d1d82c167af1cb8
> Signed-off-by: Philip Yang 

Reviewed-by: Felix Kuehling 

See one semi-related comment inline ...

> ---
>   drivers/gpu/drm/amd/amdgpu/Kconfig  |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |  6 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 12 
>   3 files changed, 19 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/Kconfig 
> b/drivers/gpu/drm/amd/amdgpu/Kconfig
> index 960a63355705..67553effb649 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Kconfig
> +++ b/drivers/gpu/drm/amd/amdgpu/Kconfig
> @@ -26,6 +26,7 @@ config DRM_AMDGPU_CIK
>   config DRM_AMDGPU_USERPTR
>   bool "Always enable userptr write support"
>   depends on DRM_AMDGPU
> + depends on ARCH_HAS_HMM
>   select HMM_MIRROR
>   help
> This option selects CONFIG_HMM and CONFIG_HMM_MIRROR if it
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index 1e675048f790..c1dbca14dce5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -712,7 +712,9 @@ struct amdgpu_ttm_tt {
>   uint64_tuserptr;
>   struct task_struct  *usertask;
>   uint32_tuserflags;
> +#if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
>   struct hmm_rangerange;
> +#endif
>   };
>   
>   /**
> @@ -722,6 +724,7 @@ struct amdgpu_ttm_tt {
>* Calling function must call amdgpu_ttm_tt_userptr_range_done() once and 
> only
>* once afterwards to stop HMM tracking
>*/
> +#if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
>   int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages)
>   {
>   struct amdgpu_ttm_tt *gtt = (void *)ttm;
> @@ -804,6 +807,7 @@ bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm)
>   
>   return r;
>   }
> +#endif
>   
>   /**
>* amdgpu_ttm_tt_set_user_pages - Copy pages in, putting old pages as 
> necessary.
> @@ -904,9 +908,11 @@ static void amdgpu_ttm_tt_unpin_userptr(struct ttm_tt 
> *ttm)
>   
>   sg_free_table(ttm->sg);
>   
> +#if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
>   if (gtt->range.pfns &&
>   ttm->pages[0] == hmm_pfn_to_page(>range, gtt->range.pfns[0]))
>   WARN_ONCE(1, "Missing get_user_page_done\n");
> +#endif
>   }
>   
>   int amdgpu_ttm_gart_bind(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> index 8988c87fff9d..c9d87271a4cb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> @@ -101,8 +101,20 @@ int amdgpu_mmap(struct file *filp, struct vm_area_struct 
> *vma);
>   int amdgpu_ttm_alloc_gart(struct ttm_buffer_object *bo);
>   int amdgpu_ttm_recover_gart(struct ttm_buffer_object *tbo);
>   
> +#if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
>   int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages);
>   bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm);
> +#else
> +static inline int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct 
> page **pages)
> +{
> + return -EPERM;
> +}
> +static inline bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm)
> +{
> + return false;
> +}
> +#endif
> +
>   void amdgpu_ttm_tt_set_user_pages(struct ttm_tt *ttm, struct page **pages);
>   void amdgpu_ttm_tt_mark_user_pages(struct ttm_tt *ttm);

mark_user_pages isn't used any more. This function could be removed.

Regards,
   Felix


>   int amdgpu_ttm_tt_set_userptr(struct ttm_tt *ttm, uint64_t addr,

Re: [PATCH 1/1] [RFC] drm/ttm: Don't init dma32_zone on 64-bit systems

2019-02-21 Thread Kuehling, Felix

On 2019-02-21 2:59 a.m., Koenig, Christian wrote:
> On x86 with HIGHMEM there is no dma32 zone. Why do we need one on
>>> x86_64? Can we make x86_64 more like HIGHMEM instead?
>>>
>>> Regards,
>>>  Felix
>>>
>> IIRC with x86, the kernel zone is always smaller than any dma32 zone,
>> so we'd always exhaust the kernel zone before dma32 anyway.
>>
>> Not sure why we have dma32 on x86 without highmem, though. sounds
>> superflous but harmless.
> Well DMA32 denotes memory which is accessible by devices who can only do
> 32bit addressing. And IIRC we can actually do DMA32 to highmem since
> something like 2.4.*.
>
> Because of this it is actually irrelevant if you have highmem or not,
> what matters for DMA32 is if you have an IOMMU or not.

Are you saying we should have a dma32_zone even on x86 with HIGHMEM?


>
> So even on x86_64 you actually do need the DMA32 zone if you don't have
> an IOMMU which remaps all memory for devices which can't directly
> address it.

Why is DMA32 special in this way? For example AMD GFX8 GPUs support 
40-bit DMA. But we don't have a special zone for that.

How common is it to have devices that need DMA32 on an x86_64 system?

Regards,
   Felix


>
> Regards,
> Christian.
>
>> /Thomas
>>
>>

Re: [PATCH] drm/amdgpu: select ARCH_HAS_HMM and ZONE_DEVICE option

2019-02-20 Thread Kuehling, Felix
On 2019-02-20 6:34 p.m., Jerome Glisse wrote:
> On Wed, Feb 20, 2019 at 10:39:49PM +0000, Kuehling, Felix wrote:
>> On 2019-02-20 5:12 p.m., Jerome Glisse wrote:
>>> On Wed, Feb 20, 2019 at 07:18:17PM +0000, Kuehling, Felix wrote:
>>>> [+Jerome]
>>>>
>>>> Why do we need ZONE_DEVICE? I didn't think this was needed for mirroring
>>>> CPU page tables to device page tables.
>>>>
>>>> ARCH_HAS_HMM depends on (X86_64 || PPC64). Do we have some alternative
>>>> for ARM support?
>>>>
>>>> Also, the name ARCH_HAS_HMM looks like it's meant to be selected by the
>>>> CPU architecture rather than any driver. Jerome, do you have any advice?
>>> This patch is wrong you need to depend on ARCH_HAS_HMM and
>> Who selects ARCH_HAS_HMM? Currently I don't see this selected anywhere.
>> So any config option that depends on it will be invisible in menuconfig.
>> Do we need ARCH_HAS_HMM somewhere in the arch/x86/Kconfig and
>> arch/powerpc/Kconfig?
>>
>> Also, ARCH_HAS_HMM does not currently support ARM. Does that mean we
>> can't have ARM support in AMDGPU if we start using HMM?
> ARCH_HAS_HMM is defined by architecture that support HMM. So par x86
> and PPC. It should not be hard to add it to ARM (i can not remember if
> ARM has DAX yet or not, if ARM does not have DAX then you need to add
> that first).

Not having ARM support is a bummer. I just enabled KFD on ARM a few 
weeks ago. Now depending on HMM makes KFD unusable on ARM. [+Mark FYI] I 
hope this is only a temporary setback.


>> Finally, ARCH_HAS_HMM has a bunch of dependencies. If they are not met,
>> I guess it can't be enabled. Should those be "select"s instead?
> No they should not be selected, people configuring their system need
> to have the freedom of doing so. All those option are selected in all
> the big distribution.
As far as I can tell, the arch/x86/Kconfig doesn't select ARCH_HAS_HMM. 
Its default is "y", so it should be enabled on anything that meets the 
dependencies. But ZONE_DEVICE was not enabled by default. I think that's 
what broke our kernel configs.

We'll fix our own kernel configs to enable ZONE_DEVICE and ARCH_HAS_HMM 
to get our internal builds to work again.

I suspect other users with their own kernel configs will stumble over 
this and wonder why KFD and userptr support are disabled in their builds.

Regards,
   Felix


>
>> config ARCH_HAS_HMM
>>   bool
>>   default y
>>   depends on (X86_64 || PPC64)
>>   depends on ZONE_DEVICE
>>   depends on MMU && 64BIT
>>   depends on MEMORY_HOTPLUG
>>   depends on MEMORY_HOTREMOVE
>>   depends on SPARSEMEM_VMEMMAP
>>
> Cheers,
> Jérôme

Re: [PATCH] drm/amdgpu: select ARCH_HAS_HMM and ZONE_DEVICE option

2019-02-20 Thread Kuehling, Felix
On 2019-02-20 5:12 p.m., Jerome Glisse wrote:
> On Wed, Feb 20, 2019 at 07:18:17PM +0000, Kuehling, Felix wrote:
>> [+Jerome]
>>
>> Why do we need ZONE_DEVICE? I didn't think this was needed for mirroring
>> CPU page tables to device page tables.
>>
>> ARCH_HAS_HMM depends on (X86_64 || PPC64). Do we have some alternative
>> for ARM support?
>>
>> Also, the name ARCH_HAS_HMM looks like it's meant to be selected by the
>> CPU architecture rather than any driver. Jerome, do you have any advice?
> This patch is wrong you need to depend on ARCH_HAS_HMM and

Who selects ARCH_HAS_HMM? Currently I don't see this selected anywhere. 
So any config option that depends on it will be invisible in menuconfig. 
Do we need ARCH_HAS_HMM somewhere in the arch/x86/Kconfig and 
arch/powerpc/Kconfig?

Also, ARCH_HAS_HMM does not currently support ARM. Does that mean we 
can't have ARM support in AMDGPU if we start using HMM?

Finally, ARCH_HAS_HMM has a bunch of dependencies. If they are not met, 
I guess it can't be enabled. Should those be "select"s instead?

config ARCH_HAS_HMM
 bool
 default y
 depends on (X86_64 || PPC64)
 depends on ZONE_DEVICE
 depends on MMU && 64BIT
 depends on MEMORY_HOTPLUG
 depends on MEMORY_HOTREMOVE
 depends on SPARSEMEM_VMEMMAP

Regards,
   Felix

> select HMM_MIRROR you do not need to select ZONE_DEVICE
>
> So it should look like:
>
> config DRM_AMDGPU_USERPTR
>   bool "Always enable userptr write support"
>   depends on DRM_AMDGPU
>   depends on ARCH_HAS_HMM
>   select HMM_MIRROR
>   help
> This option selects CONFIG_HMM and CONFIG_HMM_MIRROR if it
> isn't already selected to enabled full userptr support.
>
> I have not got around to work on amdgpu on that respect yet
> but it is on my todo list unless someone else beat me to it :)
>
> Cheers,
> Jérôme
>
>> Thanks,
>>     Felix
>>
>> On 2019-02-20 1:56 p.m., Yang, Philip wrote:
>>> Those options are needed to support HMM
>>>
>>> Change-Id: Ieb7bb3bcec07245d79a02793e6728228decc400a
>>> Signed-off-by: Philip Yang 
>>> ---
>>>drivers/gpu/drm/amd/amdgpu/Kconfig | 2 ++
>>>1 file changed, 2 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/Kconfig 
>>> b/drivers/gpu/drm/amd/amdgpu/Kconfig
>>> index 960a63355705..63f0542bc34b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/Kconfig
>>> +++ b/drivers/gpu/drm/amd/amdgpu/Kconfig
>>> @@ -26,7 +26,9 @@ config DRM_AMDGPU_CIK
>>>config DRM_AMDGPU_USERPTR
>>> bool "Always enable userptr write support"
>>> depends on DRM_AMDGPU
>>> +   select ARCH_HAS_HMM
>>> select HMM_MIRROR
>>> +   select ZONE_DEVICE
>>> help
>>>   This option selects CONFIG_HMM and CONFIG_HMM_MIRROR if it
>>>   isn't already selected to enabled full userptr support.

Re: [PATCH 1/1] [RFC] drm/ttm: Don't init dma32_zone on 64-bit systems

2019-02-20 Thread Kuehling, Felix

On 2019-02-20 1:41 a.m., Thomas Hellstrom wrote:
> On Tue, 2019-02-19 at 17:06 +0000, Kuehling, Felix wrote:
>> On 2019-02-18 3:39 p.m., Thomas Hellstrom wrote:
>>> On Mon, 2019-02-18 at 18:07 +0100, Christian König wrote:
>>>> Am 18.02.19 um 10:47 schrieb Thomas Hellstrom:
>>>>> On Mon, 2019-02-18 at 09:20 +, Koenig, Christian wrote:
>>>>>> Another good question is also why the heck the acc_size
>>>>>> counts
>>>>>> towards
>>>>>> the DMA32 zone?
>>>>> DMA32 TTM pages are accounted in the DMA32 zone. Other pages
>>>>> are
>>>>> not.
>>>> Yeah, I'm perfectly aware of this. But this is for the accounting
>>>> size!
>>>>
>>>> We have an accounting for the stuff needed additional to the
>>>> pages
>>>> backing the BO (e.g. the page and DMA addr array).
>>>>
>>>> And from the bug description it sounds like we use the DMA32 zone
>>>> for
>>>> this accounting which of course is completely nonsense.
>>> It's actually accounted in all available zones, since it would be
>>> pretty hard to determine exactly where that memory should be
>>> accounted.
>>> In particular if it's vmalloced. It might be DMA32, it might not.
>>> Given
>>> the objective of stopping malicious user-space from exhausting the
>>> DMA32 zone it was, at the time the code was written, a reasonable
>>> approximation. With ever increasing memory sizes, there might be
>>> better
>>> solutions?
>> As far as I can see, in TTM, ttm_mem_global_alloc is only used for
>> the
>> acc_size in ttm_bo_init_reserved. Other than that vmwgfx also seems
>> to
>> use it to account for a few things that are allocated with kmalloc.
>>
>> So would a better solution be to change ttm_mem_global_alloc to use
>> only
>> the kernel zone?
>>
> IMO we need to determine what functionality to keep and then the best
> solution. The current code does its job, but is obviously too
> restrictive. Both of the solutions you suggest open up for potential
> DOS attacks (DMA32 and kernel zones are not mutually exclusive. They
> overlap).
On x86 with HIGHMEM there is no dma32 zone. Why do we need one on 
x86_64? Can we make x86_64 more like HIGHMEM instead?

Regards,
   Felix


>
>
> /Thomas
>
>
>
>
>> Regards,
>> Felix
>>
>>
>>> /Thomas
>>>
>>>> Christian.
>>>>
>>>>> For small persistent allocations using ttm_mem_global_alloc(),
>>>>> they
>>>>> are
>>>>> accounted also in the DMA32 zone, which may cause over-
>>>>> accounting
>>>>> of
>>>>> that zone, but that's pretty unlikely to be a big problem..
>>>>>
>>>>> /Thomas
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>> In other words why should the internal bookkeeping pages be
>>>>>> allocated
>>>>>> in
>>>>>> the DMA32 zone?
>>>>>>
>>>>>> That doesn't sounds valid to me in any way,
>>>>>> Christian.
>>>>>>
>>>>>> Am 18.02.19 um 09:02 schrieb Thomas Hellstrom:
>>>>>>> Hmm,
>>>>>>>
>>>>>>> This zone was intended to stop TTM page allocations from
>>>>>>> exhausting
>>>>>>> the DMA32 zone. IIRC dma_alloc_coherent() uses DMA32 by
>>>>>>> default,
>>>>>>> which
>>>>>>> means if we drop this check, other devices may stop
>>>>>>> functioning
>>>>>>> unexpectedly?
>>>>>>>
>>>>>>> However, in the end I'd expect the kernel page allocation
>>>>>>> system
>>>>>>> to
>>>>>>> make sure there are some pages left in the DMA32 zone,
>>>>>>> otherwise
>>>>>>> random non-IO page allocations would also potentially
>>>>>>> exhaust
>>>>>>> the
>>>>>>> DMA32 zone without anybody caring, which means removing
>>>>>>> this
>>>>>>> zone
>>>>>>> wouldn't be any worse than whatever other subsystems may be
>>>>>>> doing
>>>>>>> already..

Re: [PATCH] drm/amdgpu: select ARCH_HAS_HMM and ZONE_DEVICE option

2019-02-20 Thread Kuehling, Felix
[+Jerome]

Why do we need ZONE_DEVICE? I didn't think this was needed for mirroring 
CPU page tables to device page tables.

ARCH_HAS_HMM depends on (X86_64 || PPC64). Do we have some alternative 
for ARM support?

Also, the name ARCH_HAS_HMM looks like it's meant to be selected by the 
CPU architecture rather than any driver. Jerome, do you have any advice?

Thanks,
   Felix

On 2019-02-20 1:56 p.m., Yang, Philip wrote:
> Those options are needed to support HMM
>
> Change-Id: Ieb7bb3bcec07245d79a02793e6728228decc400a
> Signed-off-by: Philip Yang 
> ---
>   drivers/gpu/drm/amd/amdgpu/Kconfig | 2 ++
>   1 file changed, 2 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/Kconfig 
> b/drivers/gpu/drm/amd/amdgpu/Kconfig
> index 960a63355705..63f0542bc34b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Kconfig
> +++ b/drivers/gpu/drm/amd/amdgpu/Kconfig
> @@ -26,7 +26,9 @@ config DRM_AMDGPU_CIK
>   config DRM_AMDGPU_USERPTR
>   bool "Always enable userptr write support"
>   depends on DRM_AMDGPU
> + select ARCH_HAS_HMM
>   select HMM_MIRROR
> + select ZONE_DEVICE
>   help
> This option selects CONFIG_HMM and CONFIG_HMM_MIRROR if it
> isn't already selected to enabled full userptr support.

Re: [PATCH] drm/amdgpu: disable userptr if swiotlb is active

2019-02-20 Thread Kuehling, Felix
I guess we'll need something similar for KFD? I don't think we've ever 
intentionally tested KFD with swiotlb. But I've seen some backtraces 
with swiotlb in them before. I wonder how badly broken it is ...
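
For KFD the equivalent would probably be a check like this in the
allocation path that accepts userptrs (just a sketch; the flag and the
exact place would need checking, field name as in your patch below):

	if ((flags & ALLOC_MEM_FLAGS_USERPTR) && adev->needs_swiotlb)
		return -ENXIO;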

Regards,
   Felix

On 2019-02-20 8:46 a.m., Christian König wrote:
> Otherwise we can't be sure that we won't end up with a bounce buffer.
>
> Signed-off-by: Christian König 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 4 
>   1 file changed, 4 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index d21dd2f369da..abc65633119b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -289,6 +289,10 @@ int amdgpu_gem_userptr_ioctl(struct drm_device *dev, 
> void *data,
>   if (offset_in_page(args->addr | args->size))
>   return -EINVAL;
>   
> + /* We can't do this when swiotlb is active */
> + if (adev->needs_swiotlb)
> + return -ENXIO;
> +
>   /* reject unknown flag values */
>   if (args->flags & ~(AMDGPU_GEM_USERPTR_READONLY |
>   AMDGPU_GEM_USERPTR_ANONONLY | AMDGPU_GEM_USERPTR_VALIDATE |

Re: [PATCH 1/7] drm/amdgpu: clear PDs/PTs only after initializing them

2019-02-19 Thread Kuehling, Felix
I commented on patches 2 and 3 in separate emails. The rest of the 
series is Reviewed-by: Felix Kuehling 


On 2019-02-19 8:40 a.m., Christian König wrote:
> Clear the VM PDs/PTs only after initializing all the structures.
>
> Signed-off-by: Christian König 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 11 ++-
>   1 file changed, 6 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 942b5ebc6dc2..12d51d96491e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -945,10 +945,6 @@ int amdgpu_vm_alloc_pts(struct amdgpu_device *adev,
>   if (r)
>   return r;
>   
> - r = amdgpu_vm_clear_bo(adev, vm, pt, cursor.level, ats);
> - if (r)
> - goto error_free_pt;
> -
>   if (vm->use_cpu_for_update) {
>   r = amdgpu_bo_kmap(pt, NULL);
>   if (r)
> @@ -961,6 +957,10 @@ int amdgpu_vm_alloc_pts(struct amdgpu_device *adev,
>   pt->parent = amdgpu_bo_ref(cursor.parent->base.bo);
>   
>   amdgpu_vm_bo_base_init(>base, vm, pt);
> +
> + r = amdgpu_vm_clear_bo(adev, vm, pt, cursor.level, ats);
> + if (r)
> + goto error_free_pt;
>   }
>   
>   return 0;
> @@ -3031,13 +3031,14 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
> amdgpu_vm *vm,
>   if (r)
>   goto error_unreserve;
>   
> + amdgpu_vm_bo_base_init(>root.base, vm, root);
> +
>   r = amdgpu_vm_clear_bo(adev, vm, root,
>  adev->vm_manager.root_level,
>  vm->pte_support_ats);
>   if (r)
>   goto error_unreserve;
>   
> - amdgpu_vm_bo_base_init(>root.base, vm, root);
>   amdgpu_bo_unreserve(vm->root.base.bo);
>   
>   if (pasid) {

Re: [PATCH 3/7] drm/amdgpu: let amdgpu_vm_clear_bo figure out ats status

2019-02-19 Thread Kuehling, Felix
On 2019-02-19 8:40 a.m., Christian König wrote:
> Instead of providing it from outside figure out the ats status in the
> function itself from the data structures.
>
> Signed-off-by: Christian König 

One suggestion inline. Other than that this patch is Reviewed-by: Felix 
Kuehling 

Regards,
   Felix


> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 52 ++
>   1 file changed, 29 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 3c7b98a758c9..48da4ac76837 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -747,8 +747,6 @@ bool amdgpu_vm_ready(struct amdgpu_vm *vm)
>* @adev: amdgpu_device pointer
>* @vm: VM to clear BO from
>* @bo: BO to clear
> - * @level: level this BO is at
> - * @pte_support_ats: indicate ATS support from PTE
>*
>* Root PD needs to be reserved when calling this.
>*
> @@ -756,10 +754,11 @@ bool amdgpu_vm_ready(struct amdgpu_vm *vm)
>* 0 on success, errno otherwise.
>*/
>   static int amdgpu_vm_clear_bo(struct amdgpu_device *adev,
> -   struct amdgpu_vm *vm, struct amdgpu_bo *bo,
> -   unsigned level, bool pte_support_ats)
> +   struct amdgpu_vm *vm,
> +   struct amdgpu_bo *bo)
>   {
>   struct ttm_operation_ctx ctx = { true, false };
> + unsigned level = adev->vm_manager.root_level;
>   struct dma_fence *fence = NULL;
>   unsigned entries, ats_entries;
>   struct amdgpu_ring *ring;
> @@ -768,17 +767,32 @@ static int amdgpu_vm_clear_bo(struct amdgpu_device 
> *adev,
>   int r;
>   
>   entries = amdgpu_bo_size(bo) / 8;
> + if (vm->pte_support_ats) {
> + ats_entries = amdgpu_vm_level_shift(adev, level);
> + ats_entries += AMDGPU_GPU_PAGE_SHIFT;
> + ats_entries = AMDGPU_GMC_HOLE_START >> ats_entries;
>   
> - if (pte_support_ats) {
> - if (level == adev->vm_manager.root_level) {
> - ats_entries = amdgpu_vm_level_shift(adev, level);
> - ats_entries += AMDGPU_GPU_PAGE_SHIFT;
> - ats_entries = AMDGPU_GMC_HOLE_START >> ats_entries;
> + if (!bo->parent) {
>   ats_entries = min(ats_entries, entries);
>   entries -= ats_entries;
>   } else {
> - ats_entries = entries;
> - entries = 0;
> + struct amdgpu_bo *ancestor = bo;
> + struct amdgpu_vm_pt *pt;
> +
> + ++level;
> + while (ancestor->parent->parent) {
> + ancestor = ancestor->parent;
> + ++level;
> + }

This could be simplified as

do {
	ancestor = ancestor->parent;
	++level;
} while (ancestor->parent);


> +
> + pt = container_of(ancestor->vm_bo, struct amdgpu_vm_pt,
> +   base);
> + if ((pt - vm->root.entries) >= ats_entries) {
> + ats_entries = 0;
> + } else {
> + ats_entries = entries;
> + entries = 0;
> + }
>   }
>   } else {
>   ats_entries = 0;
> @@ -911,7 +925,6 @@ int amdgpu_vm_alloc_pts(struct amdgpu_device *adev,
>   {
>   struct amdgpu_vm_pt_cursor cursor;
>   struct amdgpu_bo *pt;
> - bool ats = false;
>   uint64_t eaddr;
>   int r;
>   
> @@ -921,9 +934,6 @@ int amdgpu_vm_alloc_pts(struct amdgpu_device *adev,
>   
>   eaddr = saddr + size - 1;
>   
> - if (vm->pte_support_ats)
> - ats = saddr < AMDGPU_GMC_HOLE_START;
> -
>   saddr /= AMDGPU_GPU_PAGE_SIZE;
>   eaddr /= AMDGPU_GPU_PAGE_SIZE;
>   
> @@ -972,7 +982,7 @@ int amdgpu_vm_alloc_pts(struct amdgpu_device *adev,
>   
>   amdgpu_vm_bo_base_init(>base, vm, pt);
>   
> - r = amdgpu_vm_clear_bo(adev, vm, pt, cursor.level, ats);
> + r = amdgpu_vm_clear_bo(adev, vm, pt);
>   if (r)
>   goto error_free_pt;
>   }
> @@ -3047,9 +3057,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
> amdgpu_vm *vm,
>   
>   amdgpu_vm_bo_base_init(>root.base, vm, root);
>   
> - r = amdgpu_vm_clear_bo(adev, vm, root,
> -adev->vm_manager.root_level,
> -vm->pte_support_ats);
> + r = amdgpu_vm_clear_bo(adev, vm, root);
>   if (r)
>   goto error_unreserve;
>   
> @@ -3144,9 +3152,8 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, 
> struct amdgpu_vm *vm, uns
>* changing any other state, in case it fails.
>*/
> 

Re: [PATCH 2/7] drm/amdgpu: rework shadow handling during PD clear

2019-02-19 Thread Kuehling, Felix
Comments inline.

On 2019-02-19 8:40 a.m., Christian König wrote:
> This way we only deal with the real BO in here.
>
> Signed-off-by: Christian König 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 64 --
>   1 file changed, 39 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 12d51d96491e..3c7b98a758c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -788,38 +788,58 @@ static int amdgpu_vm_clear_bo(struct amdgpu_device 
> *adev,
>   
>   r = ttm_bo_validate(>tbo, >placement, );
>   if (r)
> - goto error;
> + return r;
>   
>   r = amdgpu_ttm_alloc_gart(>tbo);
>   if (r)
>   return r;
>   
> + if (bo->shadow) {
> + r = ttm_bo_validate(>shadow->tbo, >shadow->placement,
> + );
> + if (r)
> + return r;
> +
> + r = amdgpu_ttm_alloc_gart(>shadow->tbo);
> + if (r)
> + return r;
> +
> + }
> +
>   r = amdgpu_job_alloc_with_ib(adev, 64, );

I guess that's still big enough to fit 4 instead of 2 SDMA commands (10 
dwords each, i.e. 40 dwords, which still fits in the 64-dword IB).


>   if (r)
> - goto error;
> + return r;
>   
> - addr = amdgpu_bo_gpu_offset(bo);
> - if (ats_entries) {
> - uint64_t ats_value;
> + while (1) {
> + addr = amdgpu_bo_gpu_offset(bo);
> + if (ats_entries) {
> + uint64_t ats_value;
>   
> - ats_value = AMDGPU_PTE_DEFAULT_ATC;
> - if (level != AMDGPU_VM_PTB)
> - ats_value |= AMDGPU_PDE_PTE;
> + ats_value = AMDGPU_PTE_DEFAULT_ATC;
> + if (level != AMDGPU_VM_PTB)
> + ats_value |= AMDGPU_PDE_PTE;
>   
> - amdgpu_vm_set_pte_pde(adev, >ibs[0], addr, 0,
> -   ats_entries, 0, ats_value);
> - addr += ats_entries * 8;
> - }
> + amdgpu_vm_set_pte_pde(adev, >ibs[0], addr, 0,
> +   ats_entries, 0, ats_value);
> + addr += ats_entries * 8;
> + }
>   
> - if (entries) {
> - uint64_t value = 0;
> + if (entries) {
> + uint64_t value = 0;
>   
> - /* Workaround for fault priority problem on GMC9 */
> - if (level == AMDGPU_VM_PTB && adev->asic_type >= CHIP_VEGA10)
> - value = AMDGPU_PTE_EXECUTABLE;
> + /* Workaround for fault priority problem on GMC9 */
> + if (level == AMDGPU_VM_PTB &&
> + adev->asic_type >= CHIP_VEGA10)
> + value = AMDGPU_PTE_EXECUTABLE;
> +
> + amdgpu_vm_set_pte_pde(adev, >ibs[0], addr, 0,
> +   entries, 0, value);
> + }
>   
> - amdgpu_vm_set_pte_pde(adev, >ibs[0], addr, 0,
> -   entries, 0, value);
> + if (bo->shadow)
> + bo = bo->shadow;
> + else
> + break;
>   }

Instead of a while(1) endless loop, this could be written as a do ... 
while loop with a sane termination condition like this:

do {
	...
	bo = bo->shadow;
} while (bo);

Assuming that you don't need "bo" after this. You do below, but I think 
that's a mistake anyway. See the next comment.

>   
>   amdgpu_ring_pad_ib(ring, >ibs[0]);
> @@ -838,16 +858,10 @@ static int amdgpu_vm_clear_bo(struct amdgpu_device 
> *adev,
>   amdgpu_bo_fence(bo, fence, true);

Here you'll only fence the shadow BO.

Regards,
   Felix


>   dma_fence_put(fence);
>   
> - if (bo->shadow)
> - return amdgpu_vm_clear_bo(adev, vm, bo->shadow,
> -   level, pte_support_ats);
> -
>   return 0;
>   
>   error_free:
>   amdgpu_job_free(job);
> -
> -error:
>   return r;
>   }
>   

Re: [PATCH] drm/powerplay: print current clock level when dpm is disabled on vg20

2019-02-19 Thread Kuehling, Felix
On 2019-02-19 4:09 p.m., Liu, Shaoyun wrote:
> When DPM for the specific clock is disabled, the driver should still print out
> the current clock info for rocm-smi support on vega20
>
> Change-Id: I8669c77bf153caa2cd63a575802eb58747151239
> Signed-off-by: shaoyunl 
> ---
>   drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c | 56 
> +++---
>   1 file changed, 28 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c 
> b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c
> index aad79aff..c95e0f3 100644
> --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c
> +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c
> @@ -2641,9 +2641,8 @@ static int vega20_get_sclks(struct pp_hwmgr *hwmgr,
>   struct vega20_single_dpm_table *dpm_table = 
> &(data->dpm_table.gfx_table);
>   int i, count;
>   
> - PP_ASSERT_WITH_CODE(data->smu_features[GNLD_DPM_GFXCLK].enabled,
> - "[GetSclks]: gfxclk dpm not enabled!\n",
> - return -EPERM);
> + if (!data->smu_features[GNLD_DPM_GFXCLK].enabled)
> + return -1;

Please return a proper error code here.

Same for the other changes below.
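
E.g. something like this; whether -EOPNOTSUPP or -EINVAL fits better is
your call, this is just to illustrate:

	if (!data->smu_features[GNLD_DPM_GFXCLK].enabled)
		return -EOPNOTSUPP;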

Regards,
   Felix


>   
>   count = (dpm_table->count > MAX_NUM_CLOCKS) ? MAX_NUM_CLOCKS : 
> dpm_table->count;
>   clocks->num_levels = count;
> @@ -2670,9 +2669,8 @@ static int vega20_get_memclocks(struct pp_hwmgr *hwmgr,
>   struct vega20_single_dpm_table *dpm_table = 
> &(data->dpm_table.mem_table);
>   int i, count;
>   
> - PP_ASSERT_WITH_CODE(data->smu_features[GNLD_DPM_UCLK].enabled,
> - "[GetMclks]: uclk dpm not enabled!\n",
> - return -EPERM);
> + if (!data->smu_features[GNLD_DPM_UCLK].enabled)
> + return -1;
>   
>   count = (dpm_table->count > MAX_NUM_CLOCKS) ? MAX_NUM_CLOCKS : 
> dpm_table->count;
>   clocks->num_levels = data->mclk_latency_table.count = count;
> @@ -2696,9 +2694,8 @@ static int vega20_get_dcefclocks(struct pp_hwmgr *hwmgr,
>   struct vega20_single_dpm_table *dpm_table = 
> &(data->dpm_table.dcef_table);
>   int i, count;
>   
> - PP_ASSERT_WITH_CODE(data->smu_features[GNLD_DPM_DCEFCLK].enabled,
> - "[GetDcfclocks]: dcefclk dpm not enabled!\n",
> - return -EPERM);
> + if (!data->smu_features[GNLD_DPM_DCEFCLK].enabled)
> + return -1;
>   
>   count = (dpm_table->count > MAX_NUM_CLOCKS) ? MAX_NUM_CLOCKS : 
> dpm_table->count;
>   clocks->num_levels = count;
> @@ -2719,9 +2716,8 @@ static int vega20_get_socclocks(struct pp_hwmgr *hwmgr,
>   struct vega20_single_dpm_table *dpm_table = 
> &(data->dpm_table.soc_table);
>   int i, count;
>   
> - PP_ASSERT_WITH_CODE(data->smu_features[GNLD_DPM_SOCCLK].enabled,
> - "[GetSocclks]: socclk dpm not enabled!\n",
> - return -EPERM);
> + if (!data->smu_features[GNLD_DPM_SOCCLK].enabled)
> + return -1;
>   
>   count = (dpm_table->count > MAX_NUM_CLOCKS) ? MAX_NUM_CLOCKS : 
> dpm_table->count;
>   clocks->num_levels = count;
> @@ -3137,10 +3133,11 @@ static int vega20_print_clock_levels(struct pp_hwmgr 
> *hwmgr,
>   "Attempt to get current gfx clk Failed!",
>   return ret);
>   
> - ret = vega20_get_sclks(hwmgr, );
> - PP_ASSERT_WITH_CODE(!ret,
> - "Attempt to get gfx clk levels Failed!",
> - return ret);
> + if (vega20_get_sclks(hwmgr, )) {
> + size += sprintf(buf + size, "0: %uMhz * (DPM 
> disabled)\n",
> + now / 100);
> + break;
> + }
>   
>   for (i = 0; i < clocks.num_levels; i++)
>   size += sprintf(buf + size, "%d: %uMhz %s\n",
> @@ -3154,10 +3151,11 @@ static int vega20_print_clock_levels(struct pp_hwmgr 
> *hwmgr,
>   "Attempt to get current mclk freq Failed!",
>   return ret);
>   
> - ret = vega20_get_memclocks(hwmgr, );
> - PP_ASSERT_WITH_CODE(!ret,
> - "Attempt to get memory clk levels Failed!",
> - return ret);
> + if (vega20_get_memclocks(hwmgr, )) {
> + size += sprintf(buf + size, "0: %uMhz * (DPM 
> disabled)\n",
> + now / 100);
> + break;
> + }
>   
>   for (i = 0; i < clocks.num_levels; i++)
>   size += sprintf(buf + size, "%d: %uMhz %s\n",
> @@ -3171,10 +3169,11 @@ static int vega20_print_clock_levels(struct pp_hwmgr 
> *hwmgr,
>   "Attempt to get current socclk freq Failed!",
>   return ret);
>   
> - ret = vega20_get_socclocks(hwmgr, );
> - PP_ASSERT_WITH_CODE(!ret,
> -
