Re: [PATCH 10/10] drm/amdgpu: stop removing BOs from the LRU v3

2019-05-24 Thread Kuehling, Felix
On 2019-05-23 5:06 a.m., Christian König wrote:
> [CAUTION: External Email]
>
> Leaving BOs on the LRU is harmless. We always did this for VM page table
> and per VM BOs.
>
> The key point is that BOs which couldn't be reserved can't be evicted.
> So what happened is that an application used basically all of VRAM
> during CS and because of this X server couldn't pin a BO for scanout.
>
> Now we keep the BOs on the LRU and modify TTM to block for the CS to
> complete, which in turn allows the X server to pin its BO for scanout.


OK, let me rephrase that to make sure I understand it correctly. I think 
the point is that eviction candidates come from an LRU list, so leaving 
things on the LRU makes more BOs available for eviction and avoids OOM 
situations. To take advantage of that, patch 6 adds the ability to wait 
for reserved BOs when there is nothing easier to evict.

ROCm applications like to use lots of memory. So it probably makes sense 
for us to stop removing our BOs from the LRU as well while we 
mass-validate our BOs in amdgpu_amdkfd_gpuvm_restore_process_bos.

Regards,
   Felix


>
> Christian.
>
> Am 22.05.19 um 21:43 schrieb Kuehling, Felix:
>> Can you explain how this avoids OOM situations? When is it safe to leave
>> a reserved BO on the LRU list? Could we do the same thing in
>> amdgpu_amdkfd_gpuvm.c? And if we did, what would be the expected side
>> effects or consequences?
>>
>> Thanks,
>>     Felix
>>
>> On 2019-05-22 8:59 a.m., Christian König wrote:
>>> [CAUTION: External Email]
>>>
>>> This avoids OOM situations when we have lots of threads
>>> submitting at the same time.
>>>
>>> v3: apply this to the whole driver, not just CS
>>>
>>> Signed-off-by: Christian König 
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c    | 2 +-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c    | 4 ++--
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 2 +-
>>>    4 files changed, 5 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> index 20f2955d2a55..3e2da24cd17a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> @@ -648,7 +648,7 @@ static int amdgpu_cs_parser_bos(struct 
>>> amdgpu_cs_parser *p,
>>>   }
>>>
>>>   r = ttm_eu_reserve_buffers(>ticket, >validated, true,
>>> -  , true);
>>> +  , false);
>>>   if (unlikely(r != 0)) {
>>>   if (r != -ERESTARTSYS)
>>>   DRM_ERROR("ttm_eu_reserve_buffers 
>>> failed.\n");
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>>> index 06f83cac0d3a..f660628e6af9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
>>> @@ -79,7 +79,7 @@ int amdgpu_map_static_csa(struct amdgpu_device 
>>> *adev, struct amdgpu_vm *vm,
>>>   list_add(_tv.head, );
>>>   amdgpu_vm_get_pd_bo(vm, , );
>>>
>>> -   r = ttm_eu_reserve_buffers(, , true, NULL, true);
>>> +   r = ttm_eu_reserve_buffers(, , true, NULL, false);
>>>   if (r) {
>>>   DRM_ERROR("failed to reserve CSA,PD BOs: 
>>> err=%d\n", r);
>>>   return r;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> index d513a5ad03dd..ed25a4e14404 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> @@ -171,7 +171,7 @@ void amdgpu_gem_object_close(struct 
>>> drm_gem_object *obj,
>>>
>>>   amdgpu_vm_get_pd_bo(vm, , _pd);
>>>
>>> -   r = ttm_eu_reserve_buffers(, , false, 
>>> , true);
>>> +   r = ttm_eu_reserve_buffers(, , false, 
>>> , false);
>>>   if (r) {
>>>   dev_err(adev->dev, "leaking bo va because "
>>>   "we fail to reserve bo (%d)\n", r);
>>> @@ -608,7 +608,7 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, 
>>> void *data,
>>>
>>>   amdgpu_vm_get_pd_bo(>vm, , _pd);
>>>
>>> -   r = ttm_eu_reserve_buffers(, , true, 
>>> , true);
>>> +   r = ttm_eu_reserve_buffers(, , true, 
>>> , false);
>>>   if (r)
>>>   goto error_unref;
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> index c430e8259038..d60593cc436e 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> @@ -155,7 +155,7 @@ static inline int amdgpu_bo_reserve(struct 
>>> amdgpu_bo *bo, bool no_intr)
>>>   struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
>>>   int r;
>>>
>>> -   r = ttm_bo_reserve(>tbo, !no_intr, false, NULL);
>>> +   r = __ttm_bo_reserve(>tbo, !no_intr, false, NULL);
>>>    

Re: [PATCH 6/8] drm/amdkfd: New IOCTL to allocate queue GWS

2019-05-24 Thread Kuehling, Felix
On 2019-05-23 6:41 p.m., Zeng, Oak wrote:
> Add a new kfd ioctl to allocate queue GWS. Queue
> GWS is released on queue destroy.
>
> Change-Id: I60153c26a577992ad873e4292e759e5c3d5bbd15
> Signed-off-by: Oak Zeng 

Reviewed-by: Felix Kuehling 


> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 27 +++
>   include/uapi/linux/kfd_ioctl.h   | 20 +++-
>   2 files changed, 46 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 38ae53f..455a3db 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -1559,6 +1559,31 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file 
> *filep,
>   return err;
>   }
>   
> +static int kfd_ioctl_alloc_queue_gws(struct file *filep,
> + struct kfd_process *p, void *data)
> +{
> + int retval;
> + struct kfd_ioctl_alloc_queue_gws_args *args = data;
> + struct kfd_dev *dev = NULL;
> +
> + if (!hws_gws_support ||
> + dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS)
> + return -EINVAL;
> +
> + dev = kfd_device_by_id(args->gpu_id);
> + if (!dev) {
> + pr_debug("Could not find gpu id 0x%x\n", args->gpu_id);
> + return -EINVAL;
> + }
> +
> + mutex_lock(>mutex);
> + retval = pqm_set_gws(>pqm, args->queue_id, args->num_gws ? dev->gws 
> : NULL);
> + mutex_unlock(>mutex);
> +
> + args->first_gws = 0;
> + return retval;
> +}
> +
>   static int kfd_ioctl_get_dmabuf_info(struct file *filep,
>   struct kfd_process *p, void *data)
>   {
> @@ -1761,6 +1786,8 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = 
> {
>   AMDKFD_IOCTL_DEF(AMDKFD_IOC_IMPORT_DMABUF,
>   kfd_ioctl_import_dmabuf, 0),
>   
> + AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
> + kfd_ioctl_alloc_queue_gws, 0),
>   };
>   
>   #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls)
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index 20917c5..070d1bc 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -410,6 +410,21 @@ struct kfd_ioctl_unmap_memory_from_gpu_args {
>   __u32 n_success;/* to/from KFD */
>   };
>   
> +/* Allocate GWS for specific queue
> + *
> + * @gpu_id:  device identifier
> + * @queue_id:queue's id that GWS is allocated for
> + * @num_gws: how many GWS to allocate
> + * @first_gws:   index of the first GWS allocated.
> + *   only support contiguous GWS allocation
> + */
> +struct kfd_ioctl_alloc_queue_gws_args {
> + __u32 gpu_id;   /* to KFD */
> + __u32 queue_id; /* to KFD */
> + __u32 num_gws;  /* to KFD */
> + __u32 first_gws;/* from KFD */
> +};
> +
>   struct kfd_ioctl_get_dmabuf_info_args {
>   __u64 size; /* from KFD */
>   __u64 metadata_ptr; /* to KFD */
> @@ -529,7 +544,10 @@ enum kfd_mmio_remap {
>   #define AMDKFD_IOC_IMPORT_DMABUF\
>   AMDKFD_IOWR(0x1D, struct kfd_ioctl_import_dmabuf_args)
>   
> +#define AMDKFD_IOC_ALLOC_QUEUE_GWS   \
> + AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
> +
>   #define AMDKFD_COMMAND_START0x01
> -#define AMDKFD_COMMAND_END   0x1E
> +#define AMDKFD_COMMAND_END   0x1F
>   
>   #endif
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 5/8] drm/amdkfd: Add function to set queue gws

2019-05-24 Thread Kuehling, Felix
On 2019-05-23 6:41 p.m., Zeng, Oak wrote:
> Add functions in process queue manager to
> set/unset queue gws. Also set process's number
> of gws used. Currently only one queue in
> process can use and use all gws.
>
> Change-Id: I03e480c8692db3eabfc3a188cce8904d5d962ab7
> Signed-off-by: Oak Zeng 
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h  |  6 +++
>   .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 57 
> ++
>   2 files changed, 63 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 5969e37..40a5c67 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -454,6 +454,9 @@ struct queue_properties {
>*
>* @device: The kfd device that created this queue.
>*
> + * @gws: Pointing to gws kgd_mem if this is a gws control queue; NULL
> + * otherwise.
> + *
>* This structure represents user mode compute queues.
>* It contains all the necessary data to handle such queues.
>*
> @@ -475,6 +478,7 @@ struct queue {
>   
>   struct kfd_process  *process;
>   struct kfd_dev  *device;
> + void *gws;
>   };
>   
>   /*
> @@ -868,6 +872,8 @@ int pqm_update_queue(struct process_queue_manager *pqm, 
> unsigned int qid,
>   struct queue_properties *p);
>   int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid,
>   struct queue_properties *p);
> +int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,
> + void *gws);
>   struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm,
>   unsigned int qid);
>   int pqm_get_wave_state(struct process_queue_manager *pqm,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index e652e25..5764491 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -26,6 +26,7 @@
>   #include "kfd_device_queue_manager.h"
>   #include "kfd_priv.h"
>   #include "kfd_kernel_queue.h"
> +#include "amdgpu_amdkfd.h"
>   
>   static inline struct process_queue_node *get_queue_by_qid(
>   struct process_queue_manager *pqm, unsigned int qid)
> @@ -74,6 +75,55 @@ void kfd_process_dequeue_from_device(struct 
> kfd_process_device *pdd)
>   pdd->already_dequeued = true;
>   }
>   
> +int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,
> + void *gws)
> +{
> + struct kfd_dev *dev = NULL;
> + struct process_queue_node *pqn;
> + struct kfd_process_device *pdd;
> + struct kgd_mem *mem;
> + int ret;
> +
> + pqn = get_queue_by_qid(pqm, qid);
> + if (!pqn) {
> + pr_err("Queue id does not match any known queue\n");
> + return -EINVAL;
> + }
> +
> + if (pqn->q)
> + dev = pqn->q->device;
> + if (WARN_ON(!dev))
> + return -ENODEV;
> +
> + pdd = kfd_get_process_device_data(dev, pqm->process);
> + if (!pdd) {
> + pr_err("Process device data doesn't exist\n");
> + return -EINVAL;
> + }
> +
> + /* Only allow one queue per process can have GWS assigned */
> + if (gws && pdd->qpd.num_gws)
> + return -EINVAL;
> +
> + if (!gws && pdd->qpd.num_gws == 0)
> + return -EINVAL;
> +
> + if (gws)
> + ret = 
> amdgpu_amdkfd_add_gws_to_process(pdd->process->kgd_process_info,
> + gws, );
> + else
> + 
> amdgpu_amdkfd_remove_gws_from_process(pdd->process->kgd_process_info,
> + pqn->q->gws);
> + if (unlikely(ret))

ret may be uninitialized here. You probably get a compiler warning for 
that. Please check for compiler warnings and initialize ret at the start 
of the function.


> + return ret;
> +
> + pqn->q->gws = gws ? mem : NULL;

I think there is a chance that some compilers will complain here that 
mem can be used uninitialized. It may be safer to initialize mem to NULL 
at the start of the function, just in case. Then you can also just 
assign mem here without the conditional.

With these two issues fixed, this patch is Reviewed-by: Felix Kuehling 



> + pdd->qpd.num_gws = gws ? amdgpu_amdkfd_get_num_gws(dev->kgd) : 0;
> +
> + return pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm,
> + pqn->q);
> +}
> +
>   void kfd_process_dequeue_from_all_devices(struct kfd_process *p)
>   {
>   struct kfd_process_device *pdd;
> @@ -330,6 +380,13 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, 
> unsigned int qid)
>   if (retval != -ETIME)
>   goto err_destroy_queue;
>   }
> +
> + if 

Re: [PATCH] drm/amdgpu: add pmu counters

2019-05-24 Thread Kuehling, Felix

On 2019-05-24 3:12 p.m., Kim, Jonathan wrote:
> add pmu counters to monitor amdgpu device performance
>
> Change-Id: I8449f4ea824c411ee24a5b783ac066189b9de08e
> Signed-off-by: Jonathan Kim 
> ---
>   drivers/gpu/drm/amd/amdgpu/Makefile|   2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   5 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c| 370 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.h|  37 +++
>   4 files changed, 413 insertions(+), 1 deletion(-)
>   create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c
>   create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.h
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
> b/drivers/gpu/drm/amd/amdgpu/Makefile
> index 11a651ff7f0d..90d4c5d299dd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
> @@ -54,7 +54,7 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
>   amdgpu_gtt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o amdgpu_atomfirmware.o \
>   amdgpu_vf_error.o amdgpu_sched.o amdgpu_debugfs.o amdgpu_ids.o \
>   amdgpu_gmc.o amdgpu_xgmi.o amdgpu_csa.o amdgpu_ras.o amdgpu_vm_cpu.o \
> - amdgpu_vm_sdma.o
> + amdgpu_vm_sdma.o amdgpu_pmu.o
>   
>   # add asic specific block
>   amdgpu-$(CONFIG_DRM_AMDGPU_CIK)+= cik.o cik_ih.o kv_smc.o kv_dpm.o \
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 582f5635fcb2..51f479b357a1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -61,6 +61,7 @@
>   
>   #include "amdgpu_xgmi.h"
>   #include "amdgpu_ras.h"
> +#include "amdgpu_pmu.h"
>   
>   MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>   MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
> @@ -2748,6 +2749,10 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   goto failed;
>   }
>   
> + r = amdgpu_pmu_init(adev);
> + if (r)
> + dev_err(adev->dev, "amdgpu_pmu_init failed\n");
> +
>   /* must succeed. */
>   amdgpu_ras_resume(adev);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c
> new file mode 100644
> index ..b991e988d6fa
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c
> @@ -0,0 +1,370 @@
> +/*
> + * Copyright 2019 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + * Author: Jonathan Kim 
> + *
> + */
> +
> +#define pr_fmt(fmt)  "perf/amdgpu_pmu: " fmt
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include "amdgpu.h"
> +#include "amdgpu_pmu.h"
> +#include "df_v3_6.h"
> +
> +#define PMU_NAME_SIZE 32
> +
> +struct amdgpu_perf_status {
> + struct list_head list;
> + struct pmu pmu;
> + struct amdgpu_device *gpu;
> + int node_idx;
> + char name[PMU_NAME_SIZE];
> + u8 max_counters;
> + uint64_t cntr_assign_mask;
> + raw_spinlock_t lock;
> +};
> +
> +static LIST_HEAD(amdgpu_perf_status_list);
> +
> +
> +/*-
> + * sysfs format attributes
> + *-*/
> +
> +PMU_FORMAT_ATTR(df_event,"config:0-7");
> +PMU_FORMAT_ATTR(df_instance, "config:8-15");
> +PMU_FORMAT_ATTR(df_unitmask, "config:16-23");
> +
> +static struct attribute *amdgpu_pmu_format_attrs[] = {
> + _attr_df_event.attr,
> + _attr_df_instance.attr,
> + _attr_df_unitmask.attr,
> + NULL,
> +};
> +
> +
> +static struct attribute_group amdgpu_pmu_format_group = {
> + .name = "format",
> + .attrs = amdgpu_pmu_format_attrs,
> +};
> +
> +
> +/*-
> + * sysfs events attributes
> + *-*/
> +
> +
> +static struct attribute_group amdgpu_pmu_events_group = {
> + .name = "events",
> +};
> +
> +struct 

[PATCH] drm/amdgpu: add pmu counters

2019-05-24 Thread Kim, Jonathan
add pmu counters to monitor amdgpu device performance

Change-Id: I8449f4ea824c411ee24a5b783ac066189b9de08e
Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdgpu/Makefile|   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c| 370 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.h|  37 +++
 4 files changed, 413 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index 11a651ff7f0d..90d4c5d299dd 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -54,7 +54,7 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
amdgpu_gtt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o amdgpu_atomfirmware.o \
amdgpu_vf_error.o amdgpu_sched.o amdgpu_debugfs.o amdgpu_ids.o \
amdgpu_gmc.o amdgpu_xgmi.o amdgpu_csa.o amdgpu_ras.o amdgpu_vm_cpu.o \
-   amdgpu_vm_sdma.o
+   amdgpu_vm_sdma.o amdgpu_pmu.o
 
 # add asic specific block
 amdgpu-$(CONFIG_DRM_AMDGPU_CIK)+= cik.o cik_ih.o kv_smc.o kv_dpm.o \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 582f5635fcb2..51f479b357a1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -61,6 +61,7 @@
 
 #include "amdgpu_xgmi.h"
 #include "amdgpu_ras.h"
+#include "amdgpu_pmu.h"
 
 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
@@ -2748,6 +2749,10 @@ int amdgpu_device_init(struct amdgpu_device *adev,
goto failed;
}
 
+   r = amdgpu_pmu_init(adev);
+   if (r)
+   dev_err(adev->dev, "amdgpu_pmu_init failed\n");
+
/* must succeed. */
amdgpu_ras_resume(adev);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c
new file mode 100644
index ..b991e988d6fa
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c
@@ -0,0 +1,370 @@
+/*
+ * Copyright 2019 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Jonathan Kim 
+ *
+ */
+
+#define pr_fmt(fmt)"perf/amdgpu_pmu: " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include "amdgpu.h"
+#include "amdgpu_pmu.h"
+#include "df_v3_6.h"
+
+#define PMU_NAME_SIZE 32
+
+struct amdgpu_perf_status {
+   struct list_head list;
+   struct pmu pmu;
+   struct amdgpu_device *gpu;
+   int node_idx;
+   char name[PMU_NAME_SIZE];
+   u8 max_counters;
+   uint64_t cntr_assign_mask;
+   raw_spinlock_t lock;
+};
+
+static LIST_HEAD(amdgpu_perf_status_list);
+
+
+/*-
+ * sysfs format attributes
+ *-*/
+
+PMU_FORMAT_ATTR(df_event,  "config:0-7");
+PMU_FORMAT_ATTR(df_instance,   "config:8-15");
+PMU_FORMAT_ATTR(df_unitmask,   "config:16-23");
+
+static struct attribute *amdgpu_pmu_format_attrs[] = {
+   _attr_df_event.attr,
+   _attr_df_instance.attr,
+   _attr_df_unitmask.attr,
+   NULL,
+};
+
+
+static struct attribute_group amdgpu_pmu_format_group = {
+   .name = "format",
+   .attrs = amdgpu_pmu_format_attrs,
+};
+
+
+/*-
+ * sysfs events attributes
+ *-*/
+
+
+static struct attribute_group amdgpu_pmu_events_group = {
+   .name = "events",
+};
+
+struct AMDGPU_PMU_EVENT_DESC {
+   struct kobj_attribute attr;
+   const char *event;
+};
+
+static ssize_t _pmu_event_show(struct kobject *kobj,
+  struct kobj_attribute *attr, char *buf)
+{
+   struct AMDGPU_PMU_EVENT_DESC *event =
+   

Re: [PATCH 8/8] drm/amdkfd: Use kfd fd to mmap mmio

2019-05-24 Thread Kuehling, Felix
Hi Oak,

I'm not sure why this is part of the GWS patch series. It's unrelated to 
GWS. Anyway, see one comment inline.

On 2019-05-23 6:41 p.m., Zeng, Oak wrote:
> TTM doesn't support CPU mapping of sg type bo (under which
> mmio bo is created). Switch mmaping of mmio page to kfd
> device file.
>
> Change-Id: I1a1a24f2ac0662be3783d460c137731ade007b83
> Signed-off-by: Oak Zeng 
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 46 
> 
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  1 +
>   2 files changed, 47 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 455a3db..67d269b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -1309,6 +1309,15 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file 
> *filep,
>   args->handle = MAKE_HANDLE(args->gpu_id, idr_handle);
>   args->mmap_offset = offset;
>   
> + /* MMIO is mapped through kfd device
> +  * Generate a kfd mmap offset
> +  */
> + if (flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
> + args->mmap_offset = KFD_MMAP_TYPE_MMIO | 
> KFD_MMAP_GPU_ID(args->gpu_id);
> + args->mmap_offset <<= PAGE_SHIFT;
> + args->mmap_offset |= 
> amdgpu_amdkfd_get_mmio_remap_phys_addr(dev->kgd);

It seems the mmio_remap_phys_addr doesn't need to be part of the 
mmap_offset. It's not used for anything in kfd_mmap. And it 
unnecessarily exposes a physical address to user mode, which we usually 
tend to avoid.

With that fixes, the patch is Reviewed-by: Felix Kuehling 


Regards,
   Felix


> + }
> +
>   return 0;
>   
>   err_free:
> @@ -1880,6 +1889,39 @@ static long kfd_ioctl(struct file *filep, unsigned int 
> cmd, unsigned long arg)
>   return retcode;
>   }
>   
> +static int kfd_mmio_mmap(struct kfd_dev *dev, struct kfd_process *process,
> +   struct vm_area_struct *vma)
> +{
> + phys_addr_t address;
> + int ret;
> +
> + if (vma->vm_end - vma->vm_start != PAGE_SIZE)
> + return -EINVAL;
> +
> + address = amdgpu_amdkfd_get_mmio_remap_phys_addr(dev->kgd);
> +
> + vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE |
> + VM_DONTDUMP | VM_PFNMAP;
> +
> + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> +
> + pr_debug("Process %d mapping mmio page\n"
> +  " target user address == 0x%08llX\n"
> +  " physical address== 0x%08llX\n"
> +  " vm_flags== 0x%04lX\n"
> +  " size== 0x%04lX\n",
> +  process->pasid, (unsigned long long) vma->vm_start,
> +  address, vma->vm_flags, PAGE_SIZE);
> +
> + ret = io_remap_pfn_range(vma,
> + vma->vm_start,
> + address >> PAGE_SHIFT,
> + PAGE_SIZE,
> + vma->vm_page_prot);
> + return ret;
> +}
> +
> +
>   static int kfd_mmap(struct file *filp, struct vm_area_struct *vma)
>   {
>   struct kfd_process *process;
> @@ -1910,6 +1952,10 @@ static int kfd_mmap(struct file *filp, struct 
> vm_area_struct *vma)
>   if (!dev)
>   return -ENODEV;
>   return kfd_reserved_mem_mmap(dev, process, vma);
> + case KFD_MMAP_TYPE_MMIO:
> + if (!dev)
> + return -ENODEV;
> + return kfd_mmio_mmap(dev, process, vma);
>   }
>   
>   return -EFAULT;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 40a5c67..b61dc53 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -59,6 +59,7 @@
>   #define KFD_MMAP_TYPE_DOORBELL  (0x3ULL << KFD_MMAP_TYPE_SHIFT)
>   #define KFD_MMAP_TYPE_EVENTS(0x2ULL << KFD_MMAP_TYPE_SHIFT)
>   #define KFD_MMAP_TYPE_RESERVED_MEM  (0x1ULL << KFD_MMAP_TYPE_SHIFT)
> +#define KFD_MMAP_TYPE_MMIO   (0x0ULL << KFD_MMAP_TYPE_SHIFT)
>   
>   #define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT)
>   #define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH] drm/amdgpu: Need to set the baco cap before baco reset

2019-05-24 Thread Alex Deucher
On Thu, May 23, 2019 at 10:29 PM Deng, Emily  wrote:
>
>
>
> >-Original Message-
> >From: Alex Deucher 
> >Sent: Friday, May 24, 2019 12:09 AM
> >To: Deng, Emily 
> >Cc: amd-gfx list 
> >Subject: Re: [PATCH] drm/amdgpu: Need to set the baco cap before baco
> >reset
> >
> >[CAUTION: External Email]
> >
> >On Thu, May 23, 2019 at 6:22 AM Emily Deng  wrote:
> >>
> >> For passthrough, after rebooted the VM, driver will do a baco reset
> >> before doing other driver initialization during loading  driver. For
> >> doing the baco reset, it will first check the baco reset capability.
> >> So first need to set the cap from the vbios information or baco reset
> >> won't be enabled.
> >>
> >> Signed-off-by: Emily Deng 
> >> ---
> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  8 
> >>  drivers/gpu/drm/amd/amdgpu/soc15.c |  3 ++-
> >>  drivers/gpu/drm/amd/include/kgd_pp_interface.h |  1 +
> >>  drivers/gpu/drm/amd/powerplay/amd_powerplay.c  | 16
> >+++
> >>  drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c |  1 +
> >>  .../amd/powerplay/hwmgr/vega10_processpptables.c   | 24
> >++
> >>  .../amd/powerplay/hwmgr/vega10_processpptables.h   |  1 +
> >>  drivers/gpu/drm/amd/powerplay/inc/hwmgr.h  |  1 +
> >>  8 files changed, 54 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> >> index bdd1fe73..2dde672 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> >> @@ -2611,6 +2611,14 @@ int amdgpu_device_init(struct amdgpu_device
> >*adev,
> >>  *  E.g., driver was not cleanly unloaded previously, etc.
> >>  */
> >> if (!amdgpu_sriov_vf(adev) &&
> >> amdgpu_asic_need_reset_on_init(adev)) {
> >> +   if (amdgpu_passthrough(adev) && adev->powerplay.pp_funcs &&
> >adev->powerplay.pp_funcs->set_asic_baco_cap) {
> >> +   r = 
> >> adev->powerplay.pp_funcs->set_asic_baco_cap(adev-
> >>powerplay.pp_handle);
> >> +   if (r) {
> >> +   dev_err(adev->dev, "set baco capability 
> >> failed\n");
> >> +   goto failed;
> >> +   }
> >> +   }
> >> +
> >
> >I think it would be cleaner to add this to hwmgr_early_init() or something
> >called from early init for powerplay.
> I  also preferred to put it in the hwmgr_early_init, but as the function 
> set_asic_baco_cap  need to get the vbios info,  so need to put the 
> amdgpu_get_bios before early init. If so the code changes too big.

I think this change is all you need:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index bdd1fe73f14b..952f61e28d42 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2564,6 +2564,12 @@ int amdgpu_device_init(struct amdgpu_device *adev,

amdgpu_device_get_pcie_info(adev);

+   /* Read BIOS */
+   if (!amdgpu_get_bios(adev)) {
+   r = -EINVAL;
+   goto failed;
+   }
+
/* early init functions */
r = amdgpu_device_ip_early_init(adev);
if (r)
@@ -2591,12 +2597,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
goto fence_driver_init;
}

-   /* Read BIOS */
-   if (!amdgpu_get_bios(adev)) {
-   r = -EINVAL;
-   goto failed;
-   }
-
r = amdgpu_atombios_init(adev);
if (r) {
dev_err(adev->dev, "amdgpu_atombios_init failed\n");

I guess that could be a follow up change if you want.

Alex

> >
> >Alex
> >
> >> r = amdgpu_asic_reset(adev);
> >> if (r) {
> >> dev_err(adev->dev, "asic reset on init
> >> failed\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c
> >> b/drivers/gpu/drm/amd/amdgpu/soc15.c
> >> index 78bd4fc..d9fdd95 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
> >> @@ -764,7 +764,8 @@ static bool soc15_need_reset_on_init(struct
> >amdgpu_device *adev)
> >> /* Just return false for soc15 GPUs.  Reset does not seem to
> >>  * be necessary.
> >>  */
> >> -   return false;
> >> +   if (!amdgpu_passthrough(adev))
> >> +   return false;
> >>
> >> if (adev->flags & AMD_IS_APU)
> >> return false;
> >> diff --git a/drivers/gpu/drm/amd/include/kgd_pp_interface.h
> >> b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
> >> index 9f661bf..c6e2a51 100644
> >> --- a/drivers/gpu/drm/amd/include/kgd_pp_interface.h
> >> +++ b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
> >> @@ -296,6 +296,7 @@ struct amd_pm_funcs {
> >> int (*set_hard_min_fclk_by_freq)(void *handle, uint32_t clock);
> >> int 

Re: [PATCH] drm/amd/display: Don't load DMCU for Raven 1 (v2)

2019-05-24 Thread Alex Deucher
On Fri, May 24, 2019 at 12:32 PM Mike Lothian  wrote:
>
> I realise you don't want to enable this as it's breaking some people's
> systems, but could we add a new boot parameter to force it for working
> systems? Or check against a black list maybe?

We could probably add a whitelist.  I'm not sure what the best way to
id the working systems are though.

Alex

>
> On Fri, 24 May 2019 at 17:20, Alex Deucher  wrote:
> >
> > On Fri, May 24, 2019 at 12:09 PM Mike Lothian  wrote:
> > >
> > > Hi
> > >
> > > Curious to know what this means for folk that have newer Raven1 boards
> > > that didn't have issues loading the firmware
> >
> > You won't get ABM I think.  ABM is the automatic backlight management.
> >
> > Alex
> >
> > >
> > > Cheers
> > >
> > > Mike
> > >
> > > On Fri, 24 May 2019 at 16:34, Alex Deucher  wrote:
> > > >
> > > > From: Harry Wentland 
> > > >
> > > > [WHY]
> > > > Some early Raven boards had a bad SBIOS that doesn't play nicely with
> > > > the DMCU FW. We thought the issues were fixed by ignoring errors on DMCU
> > > > load but that doesn't seem to be the case. We've still seen reports of
> > > > users unable to boot their systems at all.
> > > >
> > > > [HOW]
> > > > Disable DMCU load on Raven 1. Only load it for Raven 2 and Picasso.
> > > >
> > > > v2: Fix ifdef (Alex)
> > > >
> > > > Signed-off-by: Harry Wentland 
> > > > Reviewed-by: Nicholas Kazlauskas 
> > > > Signed-off-by: Alex Deucher 
> > > > Cc: sta...@vger.kernel.org
> > > > ---
> > > >  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 12 ++--
> > > >  1 file changed, 10 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
> > > > b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > > > index 995f9df66142..bcb1a93c0b4c 100644
> > > > --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > > > +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > > > @@ -29,6 +29,7 @@
> > > >  #include "dm_services_types.h"
> > > >  #include "dc.h"
> > > >  #include "dc/inc/core_types.h"
> > > > +#include "dal_asic_id.h"
> > > >
> > > >  #include "vid.h"
> > > >  #include "amdgpu.h"
> > > > @@ -640,7 +641,7 @@ static void amdgpu_dm_fini(struct amdgpu_device 
> > > > *adev)
> > > >
> > > >  static int load_dmcu_fw(struct amdgpu_device *adev)
> > > >  {
> > > > -   const char *fw_name_dmcu;
> > > > +   const char *fw_name_dmcu = NULL;
> > > > int r;
> > > > const struct dmcu_firmware_header_v1_0 *hdr;
> > > >
> > > > @@ -663,7 +664,14 @@ static int load_dmcu_fw(struct amdgpu_device *adev)
> > > > case CHIP_VEGA20:
> > > > return 0;
> > > > case CHIP_RAVEN:
> > > > -   fw_name_dmcu = FIRMWARE_RAVEN_DMCU;
> > > > +#if defined(CONFIG_DRM_AMD_DC_DCN1_01)
> > > > +   if (ASICREV_IS_PICASSO(adev->external_rev_id))
> > > > +   fw_name_dmcu = FIRMWARE_RAVEN_DMCU;
> > > > +   else if (ASICREV_IS_RAVEN2(adev->external_rev_id))
> > > > +   fw_name_dmcu = FIRMWARE_RAVEN_DMCU;
> > > > +   else
> > > > +#endif
> > > > +   return 0;
> > > > break;
> > > > default:
> > > > DRM_ERROR("Unsupported ASIC type: 0x%X\n", 
> > > > adev->asic_type);
> > > > --
> > > > 2.20.1
> > > >
> > > > ___
> > > > amd-gfx mailing list
> > > > amd-gfx@lists.freedesktop.org
> > > > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH] drm/amd/display: Don't load DMCU for Raven 1 (v2)

2019-05-24 Thread Mike Lothian
I realise you don't want to enable this as it's breaking some people's
systems, but could we add a new boot parameter to force it for working
systems? Or check against a black list maybe?

On Fri, 24 May 2019 at 17:20, Alex Deucher  wrote:
>
> On Fri, May 24, 2019 at 12:09 PM Mike Lothian  wrote:
> >
> > Hi
> >
> > Curious to know what this means for folk that have newer Raven1 boards
> > that didn't have issues loading the firmware
>
> You won't get ABM I think.  ABM is the automatic backlight management.
>
> Alex
>
> >
> > Cheers
> >
> > Mike
> >
> > On Fri, 24 May 2019 at 16:34, Alex Deucher  wrote:
> > >
> > > From: Harry Wentland 
> > >
> > > [WHY]
> > > Some early Raven boards had a bad SBIOS that doesn't play nicely with
> > > the DMCU FW. We thought the issues were fixed by ignoring errors on DMCU
> > > load but that doesn't seem to be the case. We've still seen reports of
> > > users unable to boot their systems at all.
> > >
> > > [HOW]
> > > Disable DMCU load on Raven 1. Only load it for Raven 2 and Picasso.
> > >
> > > v2: Fix ifdef (Alex)
> > >
> > > Signed-off-by: Harry Wentland 
> > > Reviewed-by: Nicholas Kazlauskas 
> > > Signed-off-by: Alex Deucher 
> > > Cc: sta...@vger.kernel.org
> > > ---
> > >  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 12 ++--
> > >  1 file changed, 10 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
> > > b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > > index 995f9df66142..bcb1a93c0b4c 100644
> > > --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > > +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > > @@ -29,6 +29,7 @@
> > >  #include "dm_services_types.h"
> > >  #include "dc.h"
> > >  #include "dc/inc/core_types.h"
> > > +#include "dal_asic_id.h"
> > >
> > >  #include "vid.h"
> > >  #include "amdgpu.h"
> > > @@ -640,7 +641,7 @@ static void amdgpu_dm_fini(struct amdgpu_device *adev)
> > >
> > >  static int load_dmcu_fw(struct amdgpu_device *adev)
> > >  {
> > > -   const char *fw_name_dmcu;
> > > +   const char *fw_name_dmcu = NULL;
> > > int r;
> > > const struct dmcu_firmware_header_v1_0 *hdr;
> > >
> > > @@ -663,7 +664,14 @@ static int load_dmcu_fw(struct amdgpu_device *adev)
> > > case CHIP_VEGA20:
> > > return 0;
> > > case CHIP_RAVEN:
> > > -   fw_name_dmcu = FIRMWARE_RAVEN_DMCU;
> > > +#if defined(CONFIG_DRM_AMD_DC_DCN1_01)
> > > +   if (ASICREV_IS_PICASSO(adev->external_rev_id))
> > > +   fw_name_dmcu = FIRMWARE_RAVEN_DMCU;
> > > +   else if (ASICREV_IS_RAVEN2(adev->external_rev_id))
> > > +   fw_name_dmcu = FIRMWARE_RAVEN_DMCU;
> > > +   else
> > > +#endif
> > > +   return 0;
> > > break;
> > > default:
> > > DRM_ERROR("Unsupported ASIC type: 0x%X\n", 
> > > adev->asic_type);
> > > --
> > > 2.20.1
> > >
> > > ___
> > > amd-gfx mailing list
> > > amd-gfx@lists.freedesktop.org
> > > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH] drm/amd/display: Don't load DMCU for Raven 1 (v2)

2019-05-24 Thread Alex Deucher
On Fri, May 24, 2019 at 12:09 PM Mike Lothian  wrote:
>
> Hi
>
> Curious to know what this means for folk that have newer Raven1 boards
> that didn't have issues loading the firmware

You won't get ABM I think.  ABM is the automatic backlight management.

Alex

>
> Cheers
>
> Mike
>
> On Fri, 24 May 2019 at 16:34, Alex Deucher  wrote:
> >
> > From: Harry Wentland 
> >
> > [WHY]
> > Some early Raven boards had a bad SBIOS that doesn't play nicely with
> > the DMCU FW. We thought the issues were fixed by ignoring errors on DMCU
> > load but that doesn't seem to be the case. We've still seen reports of
> > users unable to boot their systems at all.
> >
> > [HOW]
> > Disable DMCU load on Raven 1. Only load it for Raven 2 and Picasso.
> >
> > v2: Fix ifdef (Alex)
> >
> > Signed-off-by: Harry Wentland 
> > Reviewed-by: Nicholas Kazlauskas 
> > Signed-off-by: Alex Deucher 
> > Cc: sta...@vger.kernel.org
> > ---
> >  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 12 ++--
> >  1 file changed, 10 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
> > b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > index 995f9df66142..bcb1a93c0b4c 100644
> > --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > @@ -29,6 +29,7 @@
> >  #include "dm_services_types.h"
> >  #include "dc.h"
> >  #include "dc/inc/core_types.h"
> > +#include "dal_asic_id.h"
> >
> >  #include "vid.h"
> >  #include "amdgpu.h"
> > @@ -640,7 +641,7 @@ static void amdgpu_dm_fini(struct amdgpu_device *adev)
> >
> >  static int load_dmcu_fw(struct amdgpu_device *adev)
> >  {
> > -   const char *fw_name_dmcu;
> > +   const char *fw_name_dmcu = NULL;
> > int r;
> > const struct dmcu_firmware_header_v1_0 *hdr;
> >
> > @@ -663,7 +664,14 @@ static int load_dmcu_fw(struct amdgpu_device *adev)
> > case CHIP_VEGA20:
> > return 0;
> > case CHIP_RAVEN:
> > -   fw_name_dmcu = FIRMWARE_RAVEN_DMCU;
> > +#if defined(CONFIG_DRM_AMD_DC_DCN1_01)
> > +   if (ASICREV_IS_PICASSO(adev->external_rev_id))
> > +   fw_name_dmcu = FIRMWARE_RAVEN_DMCU;
> > +   else if (ASICREV_IS_RAVEN2(adev->external_rev_id))
> > +   fw_name_dmcu = FIRMWARE_RAVEN_DMCU;
> > +   else
> > +#endif
> > +   return 0;
> > break;
> > default:
> > DRM_ERROR("Unsupported ASIC type: 0x%X\n", adev->asic_type);
> > --
> > 2.20.1
> >
> > ___
> > amd-gfx mailing list
> > amd-gfx@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH] drm/amd/display: Don't load DMCU for Raven 1 (v2)

2019-05-24 Thread Mike Lothian
Hi

Curious to know what this means for folk that have newer Raven1 boards
that didn't have issues loading the firmware

Cheers

Mike

On Fri, 24 May 2019 at 16:34, Alex Deucher  wrote:
>
> From: Harry Wentland 
>
> [WHY]
> Some early Raven boards had a bad SBIOS that doesn't play nicely with
> the DMCU FW. We thought the issues were fixed by ignoring errors on DMCU
> load but that doesn't seem to be the case. We've still seen reports of
> users unable to boot their systems at all.
>
> [HOW]
> Disable DMCU load on Raven 1. Only load it for Raven 2 and Picasso.
>
> v2: Fix ifdef (Alex)
>
> Signed-off-by: Harry Wentland 
> Reviewed-by: Nicholas Kazlauskas 
> Signed-off-by: Alex Deucher 
> Cc: sta...@vger.kernel.org
> ---
>  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 12 ++--
>  1 file changed, 10 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> index 995f9df66142..bcb1a93c0b4c 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> @@ -29,6 +29,7 @@
>  #include "dm_services_types.h"
>  #include "dc.h"
>  #include "dc/inc/core_types.h"
> +#include "dal_asic_id.h"
>
>  #include "vid.h"
>  #include "amdgpu.h"
> @@ -640,7 +641,7 @@ static void amdgpu_dm_fini(struct amdgpu_device *adev)
>
>  static int load_dmcu_fw(struct amdgpu_device *adev)
>  {
> -   const char *fw_name_dmcu;
> +   const char *fw_name_dmcu = NULL;
> int r;
> const struct dmcu_firmware_header_v1_0 *hdr;
>
> @@ -663,7 +664,14 @@ static int load_dmcu_fw(struct amdgpu_device *adev)
> case CHIP_VEGA20:
> return 0;
> case CHIP_RAVEN:
> -   fw_name_dmcu = FIRMWARE_RAVEN_DMCU;
> +#if defined(CONFIG_DRM_AMD_DC_DCN1_01)
> +   if (ASICREV_IS_PICASSO(adev->external_rev_id))
> +   fw_name_dmcu = FIRMWARE_RAVEN_DMCU;
> +   else if (ASICREV_IS_RAVEN2(adev->external_rev_id))
> +   fw_name_dmcu = FIRMWARE_RAVEN_DMCU;
> +   else
> +#endif
> +   return 0;
> break;
> default:
> DRM_ERROR("Unsupported ASIC type: 0x%X\n", adev->asic_type);
> --
> 2.20.1
>
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH v15 05/17] arms64: untag user pointers passed to memory syscalls

2019-05-24 Thread Andrew Murray
On Mon, May 06, 2019 at 06:30:51PM +0200, Andrey Konovalov wrote:
> This patch is a part of a series that extends arm64 kernel ABI to allow to
> pass tagged user pointers (with the top byte set to something else other
> than 0x00) as syscall arguments.
> 
> This patch allows tagged pointers to be passed to the following memory
> syscalls: brk, get_mempolicy, madvise, mbind, mincore, mlock, mlock2,
> mmap, mmap_pgoff, mprotect, mremap, msync, munlock, munmap,
> remap_file_pages, shmat and shmdt.
> 
> This is done by untagging pointers passed to these syscalls in the
> prologues of their handlers.
> 
> Signed-off-by: Andrey Konovalov 
> ---


> +SYSCALL_DEFINE2(arm64_mlock, unsigned long, start, size_t, len)
> +{
> + start = untagged_addr(start);
> + return ksys_mlock(start, len, VM_LOCKED);
> +}
> +
> +SYSCALL_DEFINE2(arm64_mlock2, unsigned long, start, size_t, len)
> +{
> + start = untagged_addr(start);
> + return ksys_mlock(start, len, VM_LOCKED);
> +}

I think this may be a copy/paste error...

Shouldn't mlock2 have a third 'flags' argument to distinguish is from mlock?

Thanks,

Andrew Murray

> +
> +SYSCALL_DEFINE2(arm64_munlock, unsigned long, start, size_t, len)
> +{
> + start = untagged_addr(start);
> + return ksys_munlock(start, len);
> +}
> +
> +SYSCALL_DEFINE3(arm64_mprotect, unsigned long, start, size_t, len,
> + unsigned long, prot)
> +{
> + start = untagged_addr(start);
> + return ksys_mprotect_pkey(start, len, prot, -1);
> +}
> +
> +SYSCALL_DEFINE3(arm64_msync, unsigned long, start, size_t, len, int, flags)
> +{
> + start = untagged_addr(start);
> + return ksys_msync(start, len, flags);
> +}
> +
> +SYSCALL_DEFINE3(arm64_mincore, unsigned long, start, size_t, len,
> + unsigned char __user *, vec)
> +{
> + start = untagged_addr(start);
> + return ksys_mincore(start, len, vec);
> +}
> +
> +SYSCALL_DEFINE5(arm64_remap_file_pages, unsigned long, start,
> + unsigned long, size, unsigned long, prot,
> + unsigned long, pgoff, unsigned long, flags)
> +{
> + start = untagged_addr(start);
> + return ksys_remap_file_pages(start, size, prot, pgoff, flags);
> +}
> +
> +SYSCALL_DEFINE3(arm64_shmat, int, shmid, char __user *, shmaddr, int, shmflg)
> +{
> + shmaddr = untagged_addr(shmaddr);
> + return ksys_shmat(shmid, shmaddr, shmflg);
> +}
> +
> +SYSCALL_DEFINE1(arm64_shmdt, char __user *, shmaddr)
> +{
> + shmaddr = untagged_addr(shmaddr);
> + return ksys_shmdt(shmaddr);
> +}
> +
>  /*
>   * Wrappers to pass the pt_regs argument.
>   */
>  #define sys_personality  sys_arm64_personality
> +#define sys_mmap_pgoff   sys_arm64_mmap_pgoff
> +#define sys_mremap   sys_arm64_mremap
> +#define sys_munmap   sys_arm64_munmap
> +#define sys_brk  sys_arm64_brk
> +#define sys_get_mempolicysys_arm64_get_mempolicy
> +#define sys_madvise  sys_arm64_madvise
> +#define sys_mbindsys_arm64_mbind
> +#define sys_mlocksys_arm64_mlock
> +#define sys_mlock2   sys_arm64_mlock2
> +#define sys_munlock  sys_arm64_munlock
> +#define sys_mprotect sys_arm64_mprotect
> +#define sys_msyncsys_arm64_msync
> +#define sys_mincore  sys_arm64_mincore
> +#define sys_remap_file_pages sys_arm64_remap_file_pages
> +#define sys_shmatsys_arm64_shmat
> +#define sys_shmdtsys_arm64_shmdt
>  
>  asmlinkage long sys_ni_syscall(const struct pt_regs *);
>  #define __arm64_sys_ni_syscall   sys_ni_syscall
> -- 
> 2.21.0.1020.gf2820cf01a-goog
> 
> 
> ___
> linux-arm-kernel mailing list
> linux-arm-ker...@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel


Re: [PATCH 1/2] drm/amd/doc: Add XGMI sysfs documentation

2019-05-24 Thread Alex Deucher
On Fri, May 24, 2019 at 9:23 AM StDenis, Tom  wrote:
>
> Signed-off-by: Tom St Denis 

Series is:
Reviewed-by: Alex Deucher 

> ---
>  Documentation/gpu/amdgpu.rst |  9 
>  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 28 
>  2 files changed, 37 insertions(+)
>
> diff --git a/Documentation/gpu/amdgpu.rst b/Documentation/gpu/amdgpu.rst
> index a740e491dfcc..cacfcfad2356 100644
> --- a/Documentation/gpu/amdgpu.rst
> +++ b/Documentation/gpu/amdgpu.rst
> @@ -70,6 +70,15 @@ Interrupt Handling
>  .. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> :internal:
>
> +AMDGPU XGMI Support
> +===
> +
> +.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +   :doc: AMDGPU XGMI Support
> +
> +.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +   :internal:
> +
>  GPU Power/Thermal Controls and Monitoring
>  =
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index e48e9394f1e4..d11eba09eadd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -40,6 +40,34 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info 
> *hive)
> return >device_list;
>  }
>
> +/**
> + * DOC: AMDGPU XGMI Support
> + *
> + * XGMI is a high speed interconnect that joins multiple GPU cards
> + * into a homogeneous memory space that is organized by a collective
> + * hive ID and individual node IDs, both of which are 64-bit numbers.
> + *
> + * The file xgmi_device_id contains the unique per GPU device ID and
> + * is stored in the /sys/class/drm/card${cardno}/device/ directory.
> + *
> + * Inside the device directory a sub-directory 'xgmi_hive_info' is
> + * created which contains the hive ID and the list of nodes.
> + *
> + * The hive ID is stored in:
> + *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id
> + *
> + * The node information is stored in numbered directories:
> + *   
> /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id
> + *
> + * Each device has their own xgmi_hive_info direction with a mirror
> + * set of node sub-directories.
> + *
> + * The XGMI memory space is built by contiguously adding the power of
> + * two padded VRAM space from each node to each other.
> + *
> + */
> +
> +
>  static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev,
> struct device_attribute *attr, char *buf)
>  {
> --
> 2.21.0
>
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH] drm/amd/display: Don't load DMCU for Raven 1 (v2)

2019-05-24 Thread Alex Deucher
From: Harry Wentland 

[WHY]
Some early Raven boards had a bad SBIOS that doesn't play nicely with
the DMCU FW. We thought the issues were fixed by ignoring errors on DMCU
load but that doesn't seem to be the case. We've still seen reports of
users unable to boot their systems at all.

[HOW]
Disable DMCU load on Raven 1. Only load it for Raven 2 and Picasso.

v2: Fix ifdef (Alex)

Signed-off-by: Harry Wentland 
Reviewed-by: Nicholas Kazlauskas 
Signed-off-by: Alex Deucher 
Cc: sta...@vger.kernel.org
---
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 995f9df66142..bcb1a93c0b4c 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -29,6 +29,7 @@
 #include "dm_services_types.h"
 #include "dc.h"
 #include "dc/inc/core_types.h"
+#include "dal_asic_id.h"
 
 #include "vid.h"
 #include "amdgpu.h"
@@ -640,7 +641,7 @@ static void amdgpu_dm_fini(struct amdgpu_device *adev)
 
 static int load_dmcu_fw(struct amdgpu_device *adev)
 {
-   const char *fw_name_dmcu;
+   const char *fw_name_dmcu = NULL;
int r;
const struct dmcu_firmware_header_v1_0 *hdr;
 
@@ -663,7 +664,14 @@ static int load_dmcu_fw(struct amdgpu_device *adev)
case CHIP_VEGA20:
return 0;
case CHIP_RAVEN:
-   fw_name_dmcu = FIRMWARE_RAVEN_DMCU;
+#if defined(CONFIG_DRM_AMD_DC_DCN1_01)
+   if (ASICREV_IS_PICASSO(adev->external_rev_id))
+   fw_name_dmcu = FIRMWARE_RAVEN_DMCU;
+   else if (ASICREV_IS_RAVEN2(adev->external_rev_id))
+   fw_name_dmcu = FIRMWARE_RAVEN_DMCU;
+   else
+#endif
+   return 0;
break;
default:
DRM_ERROR("Unsupported ASIC type: 0x%X\n", adev->asic_type);
-- 
2.20.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 1/2] drm/amd/doc: Add XGMI sysfs documentation

2019-05-24 Thread Abramov, Slava
Acked-by: Slava Abramov 


From: amd-gfx  on behalf of StDenis, Tom 

Sent: Friday, May 24, 2019 9:23:49 AM
To: amd-gfx@lists.freedesktop.org
Cc: StDenis, Tom
Subject: [PATCH 1/2] drm/amd/doc: Add XGMI sysfs documentation

[CAUTION: External Email]

Signed-off-by: Tom St Denis 
---
 Documentation/gpu/amdgpu.rst |  9 
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 28 
 2 files changed, 37 insertions(+)

diff --git a/Documentation/gpu/amdgpu.rst b/Documentation/gpu/amdgpu.rst
index a740e491dfcc..cacfcfad2356 100644
--- a/Documentation/gpu/amdgpu.rst
+++ b/Documentation/gpu/amdgpu.rst
@@ -70,6 +70,15 @@ Interrupt Handling
 .. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
:internal:

+AMDGPU XGMI Support
+===
+
+.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+   :doc: AMDGPU XGMI Support
+
+.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+   :internal:
+
 GPU Power/Thermal Controls and Monitoring
 =

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index e48e9394f1e4..d11eba09eadd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -40,6 +40,34 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info 
*hive)
return >device_list;
 }

+/**
+ * DOC: AMDGPU XGMI Support
+ *
+ * XGMI is a high speed interconnect that joins multiple GPU cards
+ * into a homogeneous memory space that is organized by a collective
+ * hive ID and individual node IDs, both of which are 64-bit numbers.
+ *
+ * The file xgmi_device_id contains the unique per GPU device ID and
+ * is stored in the /sys/class/drm/card${cardno}/device/ directory.
+ *
+ * Inside the device directory a sub-directory 'xgmi_hive_info' is
+ * created which contains the hive ID and the list of nodes.
+ *
+ * The hive ID is stored in:
+ *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id
+ *
+ * The node information is stored in numbered directories:
+ *   
/sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id
+ *
+ * Each device has their own xgmi_hive_info direction with a mirror
+ * set of node sub-directories.
+ *
+ * The XGMI memory space is built by contiguously adding the power of
+ * two padded VRAM space from each node to each other.
+ *
+ */
+
+
 static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev,
struct device_attribute *attr, char *buf)
 {
--
2.21.0

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 2/2] drm/amd/doc: Add RAS documentation to guide

2019-05-24 Thread Abramov, Slava
Acked-by: Slava Abramov 


From: amd-gfx  on behalf of StDenis, Tom 

Sent: Friday, May 24, 2019 9:23:50 AM
To: amd-gfx@lists.freedesktop.org
Cc: StDenis, Tom
Subject: [PATCH 2/2] drm/amd/doc: Add RAS documentation to guide

[CAUTION: External Email]

Signed-off-by: Tom St Denis 
---
 Documentation/gpu/amdgpu.rst| 11 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |  4 ++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/Documentation/gpu/amdgpu.rst b/Documentation/gpu/amdgpu.rst
index cacfcfad2356..86138798128f 100644
--- a/Documentation/gpu/amdgpu.rst
+++ b/Documentation/gpu/amdgpu.rst
@@ -79,6 +79,17 @@ AMDGPU XGMI Support
 .. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
:internal:

+AMDGPU RAS debugfs control interface
+
+
+.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+   :doc: AMDGPU RAS debugfs control interface
+
+
+.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+   :internal:
+
+
 GPU Power/Thermal Controls and Monitoring
 =

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index d5719b0fb82c..7c8a4aedf07c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -244,8 +244,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file 
*f,

return 0;
 }
-/*
- * DOC: ras debugfs control interface
+/**
+ * DOC: AMDGPU RAS debugfs control interface
  *
  * It accepts struct ras_debug_if who has two members.
  *
--
2.21.0

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH v15 00/17] arm64: untag user pointers passed to the kernel

2019-05-24 Thread Khalid Aziz
On 5/24/19 4:11 AM, Catalin Marinas wrote:
> On Thu, May 23, 2019 at 03:49:05PM -0600, Khalid Aziz wrote:
>> On 5/23/19 2:11 PM, Catalin Marinas wrote:
>>> On Thu, May 23, 2019 at 11:51:40AM -0600, Khalid Aziz wrote:
 On 5/21/19 6:04 PM, Kees Cook wrote:
> As an aside: I think Sparc ADI support in Linux actually side-stepped
> this[1] (i.e. chose "solution 1"): "All addresses passed to kernel must
> be non-ADI tagged addresses." (And sadly, "Kernel does not enable ADI
> for kernel code.") I think this was a mistake we should not repeat for
> arm64 (we do seem to be at least in agreement about this, I think).
>
> [1] https://lore.kernel.org/patchwork/patch/654481/

 That is a very early version of the sparc ADI patch. Support for tagged
 addresses in syscalls was added in later versions and is in the patch
 that is in the kernel.
>>>
>>> I tried to figure out but I'm not familiar with the sparc port. How did
>>> you solve the tagged address going into various syscall implementations
>>> in the kernel (e.g. sys_write)? Is the tag removed on kernel entry or it
>>> ends up deeper in the core code?
>>
>> Another spot I should point out in ADI patch - Tags are not stored in
>> VMAs and IOMMU does not support ADI tags on M7. ADI tags are stripped
>> before userspace addresses are passed to IOMMU in the following snippet
>> from the patch:
>>
>> diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c
>> index 5335ba3c850e..357b6047653a 100644
>> --- a/arch/sparc/mm/gup.c
>> +++ b/arch/sparc/mm/gup.c
>> @@ -201,6 +202,24 @@ int __get_user_pages_fast(unsigned long start, int
>> nr_pages
>> , int write,
>> pgd_t *pgdp;
>> int nr = 0;
>>
>> +#ifdef CONFIG_SPARC64
>> +   if (adi_capable()) {
>> +   long addr = start;
>> +
>> +   /* If userspace has passed a versioned address, kernel
>> +* will not find it in the VMAs since it does not store
>> +* the version tags in the list of VMAs. Storing version
>> +* tags in list of VMAs is impractical since they can be
>> +* changed any time from userspace without dropping into
>> +* kernel. Any address search in VMAs will be done with
>> +* non-versioned addresses. Ensure the ADI version bits
>> +* are dropped here by sign extending the last bit before
>> +* ADI bits. IOMMU does not implement version tags.
>> +*/
>> +   addr = (addr << (long)adi_nbits()) >> (long)adi_nbits();
>> +   start = addr;
>> +   }
>> +#endif
>> start &= PAGE_MASK;
>> addr = start;
>> len = (unsigned long) nr_pages << PAGE_SHIFT;
> 
> Thanks Khalid. I missed that sparc does not enable HAVE_GENERIC_GUP, so
> you fix this case here. If we add the generic untagged_addr() macro in
> the generic code, I think sparc can start making use of it rather than
> open-coding the shifts.

Hi Catalin,

Yes, that will be good. Right now addresses are untagged in sparc code
in only two spots but that will expand as we expand use of tags.
Scalabale solution is definitely better.

> 
> There are a few other other places where tags can leak and the core code
> would get confused (for example, madvise()). I presume your user space
> doesn't exercise them. On arm64 we plan to just allow the C library to
> tag any new memory allocation, so those core code paths would need to be
> covered.
> 
> And similarly, devices, IOMMU, any DMA would ignore tags.
> 

Right. You are doing lot more with tags than sparc code intended to do.
I had looked into implementing just malloc(), mmap() and possibly
shmat() in library that automatically tags pointers. Expanding tags to
any pointers in C library will require covering lot more paths in kernel.

--
Khalid




Re: [PATCH v15 00/17] arm64: untag user pointers passed to the kernel

2019-05-24 Thread Dave Martin
On Thu, May 23, 2019 at 05:57:09PM +0100, Catalin Marinas wrote:
> On Thu, May 23, 2019 at 11:42:57AM +0100, Dave P Martin wrote:
> > On Wed, May 22, 2019 at 09:20:52PM -0300, Jason Gunthorpe wrote:
> > > On Wed, May 22, 2019 at 02:49:28PM +0100, Dave Martin wrote:
> > > > If multiple people will care about this, perhaps we should try to
> > > > annotate types more explicitly in SYSCALL_DEFINEx() and ABI data
> > > > structures.
> > > > 
> > > > For example, we could have a couple of mutually exclusive modifiers
> > > > 
> > > > T __object *
> > > > T __vaddr * (or U __vaddr)
> > > > 
> > > > In the first case the pointer points to an object (in the C sense)
> > > > that the call may dereference but not use for any other purpose.
> > > 
> > > How would you use these two differently?
> > > 
> > > So far the kernel has worked that __user should tag any pointer that
> > > is from userspace and then you can't do anything with it until you
> > > transform it into a kernel something
> > 
> > Ultimately it would be good to disallow casting __object pointers execpt
> > to compatible __object pointer types, and to make get_user etc. demand
> > __object.
> > 
> > __vaddr pointers / addresses would be freely castable, but not to
> > __object and so would not be dereferenceable even indirectly.
> 
> I think it gets too complicated and there are ambiguous cases that we
> may not be able to distinguish. For example copy_from_user() may be used
> to copy a user data structure into the kernel, hence __object would
> work, while the same function may be used to copy opaque data to a file,
> so __vaddr may be a better option (unless I misunderstood your
> proposal).

Can you illustrate?  I'm not sure of your point here.

> We currently have T __user * and I think it's a good starting point. The
> prior attempt [1] was shut down because it was just hiding the cast
> using __force. We'd need to work through those cases again and rather
> start changing the function prototypes to avoid unnecessary casting in
> the callers (e.g. get_user_pages(void __user *) or come up with a new
> type) while changing the explicit casting to a macro where it needs to
> be obvious that we are converting a user pointer, potentially typed
> (tagged), to an untyped address range. We may need a user_ptr_to_ulong()
> macro or similar (it seems that we have a u64_to_user_ptr, wasn't aware
> of it).
> 
> It may actually not be far from what you suggested but I'd keep the
> current T __user * to denote possible dereference.

This may not have been clear, but __object and __vaddr would be
orthogonal to __user.  Since __object and __vaddr strictly constrain
what can be done with an lvalue, they could be cast on, but not be
cast off without __force.

Syscall arguments and pointer in ioctl structs etc. would typically
be annotated as __object __user * or __vaddr __user *.  Plain old
__user * would work as before, but would be more permissive and give
static analysers less information to go on.

Conversion or use or __object or __vaddr pointers would require specific
APIs in the kernel, so that we can be clear about the semantics.

Doing things this way would allow migration to annotation of most or all
ABI pointers with __object or __vaddr over time, but we wouldn't have to
do it all in one go.  Problem cases (which won't be the majority) could
continue to be plain __user.


This does not magically solve the challenges of MTE, but might provide
tools that are useful to help avoid bitrot and regressions over time.

I agree though that there might be a fair number of of cases that don't
conveniently fall under __object or __vaddr semantics.  It's hard to
know without trying it.

_Most_ syscall arguments seem to be fairly obviously one or another
though, and this approach has some possibility of scaling to ioctls
and other odd interfaces.

Cheers
---Dave


[PATCH 2/2] drm/amd/doc: Add RAS documentation to guide

2019-05-24 Thread StDenis, Tom
Signed-off-by: Tom St Denis 
---
 Documentation/gpu/amdgpu.rst| 11 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |  4 ++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/Documentation/gpu/amdgpu.rst b/Documentation/gpu/amdgpu.rst
index cacfcfad2356..86138798128f 100644
--- a/Documentation/gpu/amdgpu.rst
+++ b/Documentation/gpu/amdgpu.rst
@@ -79,6 +79,17 @@ AMDGPU XGMI Support
 .. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
:internal:
 
+AMDGPU RAS debugfs control interface
+
+
+.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+   :doc: AMDGPU RAS debugfs control interface
+
+
+.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+   :internal:
+
+
 GPU Power/Thermal Controls and Monitoring
 =
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index d5719b0fb82c..7c8a4aedf07c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -244,8 +244,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file 
*f,
 
return 0;
 }
-/*
- * DOC: ras debugfs control interface
+/**
+ * DOC: AMDGPU RAS debugfs control interface
  *
  * It accepts struct ras_debug_if who has two members.
  *
-- 
2.21.0

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 1/2] drm/amd/doc: Add XGMI sysfs documentation

2019-05-24 Thread StDenis, Tom
Signed-off-by: Tom St Denis 
---
 Documentation/gpu/amdgpu.rst |  9 
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 28 
 2 files changed, 37 insertions(+)

diff --git a/Documentation/gpu/amdgpu.rst b/Documentation/gpu/amdgpu.rst
index a740e491dfcc..cacfcfad2356 100644
--- a/Documentation/gpu/amdgpu.rst
+++ b/Documentation/gpu/amdgpu.rst
@@ -70,6 +70,15 @@ Interrupt Handling
 .. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
:internal:
 
+AMDGPU XGMI Support
+===
+
+.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+   :doc: AMDGPU XGMI Support
+
+.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+   :internal:
+
 GPU Power/Thermal Controls and Monitoring
 =
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index e48e9394f1e4..d11eba09eadd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -40,6 +40,34 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info 
*hive)
return >device_list;
 }
 
+/**
+ * DOC: AMDGPU XGMI Support
+ *
+ * XGMI is a high speed interconnect that joins multiple GPU cards
+ * into a homogeneous memory space that is organized by a collective
+ * hive ID and individual node IDs, both of which are 64-bit numbers.
+ *
+ * The file xgmi_device_id contains the unique per GPU device ID and
+ * is stored in the /sys/class/drm/card${cardno}/device/ directory.
+ *
+ * Inside the device directory a sub-directory 'xgmi_hive_info' is
+ * created which contains the hive ID and the list of nodes.
+ *
+ * The hive ID is stored in:
+ *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id
+ *
+ * The node information is stored in numbered directories:
+ *   
/sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id
+ *
+ * Each device has their own xgmi_hive_info direction with a mirror
+ * set of node sub-directories.
+ *
+ * The XGMI memory space is built by contiguously adding the power of
+ * two padded VRAM space from each node to each other.
+ *
+ */
+
+
 static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev,
struct device_attribute *attr, char *buf)
 {
-- 
2.21.0

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH v15 14/17] media/v4l2-core, arm64: untag user pointers in videobuf_dma_contig_user_get

2019-05-24 Thread Mauro Carvalho Chehab
Em Mon,  6 May 2019 18:31:00 +0200
Andrey Konovalov  escreveu:

> This patch is a part of a series that extends arm64 kernel ABI to allow to
> pass tagged user pointers (with the top byte set to something else other
> than 0x00) as syscall arguments.
> 
> videobuf_dma_contig_user_get() uses provided user pointers for vma
> lookups, which can only by done with untagged pointers.
> 
> Untag the pointers in this function.
> 
> Signed-off-by: Andrey Konovalov 

Acked-by: Mauro Carvalho Chehab 

> ---
>  drivers/media/v4l2-core/videobuf-dma-contig.c | 9 +
>  1 file changed, 5 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/media/v4l2-core/videobuf-dma-contig.c 
> b/drivers/media/v4l2-core/videobuf-dma-contig.c
> index e1bf50df4c70..8a1ddd146b17 100644
> --- a/drivers/media/v4l2-core/videobuf-dma-contig.c
> +++ b/drivers/media/v4l2-core/videobuf-dma-contig.c
> @@ -160,6 +160,7 @@ static void videobuf_dma_contig_user_put(struct 
> videobuf_dma_contig_memory *mem)
>  static int videobuf_dma_contig_user_get(struct videobuf_dma_contig_memory 
> *mem,
>   struct videobuf_buffer *vb)
>  {
> + unsigned long untagged_baddr = untagged_addr(vb->baddr);
>   struct mm_struct *mm = current->mm;
>   struct vm_area_struct *vma;
>   unsigned long prev_pfn, this_pfn;
> @@ -167,22 +168,22 @@ static int videobuf_dma_contig_user_get(struct 
> videobuf_dma_contig_memory *mem,
>   unsigned int offset;
>   int ret;
>  
> - offset = vb->baddr & ~PAGE_MASK;
> + offset = untagged_baddr & ~PAGE_MASK;
>   mem->size = PAGE_ALIGN(vb->size + offset);
>   ret = -EINVAL;
>  
>   down_read(>mmap_sem);
>  
> - vma = find_vma(mm, vb->baddr);
> + vma = find_vma(mm, untagged_baddr);
>   if (!vma)
>   goto out_up;
>  
> - if ((vb->baddr + mem->size) > vma->vm_end)
> + if ((untagged_baddr + mem->size) > vma->vm_end)
>   goto out_up;
>  
>   pages_done = 0;
>   prev_pfn = 0; /* kill warning */
> - user_address = vb->baddr;
> + user_address = untagged_baddr;
>  
>   while (pages_done < (mem->size >> PAGE_SHIFT)) {
>   ret = follow_pfn(vma, user_address, _pfn);



Thanks,
Mauro


Re: [PATCH v15 00/17] arm64: untag user pointers passed to the kernel

2019-05-24 Thread Catalin Marinas
On Thu, May 23, 2019 at 02:31:16PM -0700, Kees Cook wrote:
> On Thu, May 23, 2019 at 06:43:46PM +0100, Catalin Marinas wrote:
> > On Thu, May 23, 2019 at 09:38:19AM -0700, Kees Cook wrote:
> > > What about testing tools that intentionally insert high bits for syscalls
> > > and are _expecting_ them to fail? It seems the TBI series will break them?
> > > In that case, do we need to opt into TBI as well?
> > 
> > If there are such tools, then we may need a per-process control. It's
> > basically an ABI change.
> 
> syzkaller already attempts to randomly inject non-canonical and
> 0x addresses for user pointers in syscalls in an effort to
> find bugs like CVE-2017-5123 where waitid() via unchecked put_user() was
> able to write directly to kernel memory[1].
> 
> It seems that using TBI by default and not allowing a switch back to
> "normal" ABI without a reboot actually means that userspace cannot inject
> kernel pointers into syscalls any more, since they'll get universally
> stripped now. Is my understanding correct, here? i.e. exploiting
> CVE-2017-5123 would be impossible under TBI?

Unless the kernel is also using TBI (khwasan), in which case masking out
the top byte wouldn't help. Anyway, as per this discussion, we want the
tagged pointer to remain intact all the way to put_user(), so nothing
gets masked out. I don't think this would have helped with the waitid()
bug.

> If so, then I think we should commit to the TBI ABI and have a boot
> flag to disable it, but NOT have a process flag, as that would allow
> attackers to bypass the masking. The only flag should be "TBI or MTE".
> 
> If so, can I get top byte masking for other architectures too? Like,
> just to strip high bits off userspace addresses? ;)

But you didn't like my option 2 shim proposal which strips the tag on
kernel entry because it lowers the value of MTE ;).

> (Oh, in looking I see this is implemented with sign-extension... why
> not just a mask? So it'll either be valid userspace address or forced
> into the non-canonical range?)

The TTBR0/1 selection on memory accesses is done based on bit 63 if TBI
is disabled and bit 55 when enabled. Sign-extension allows us to use the
same macro for both user and kernel tagged pointers. With MTE tag 0
would be match-all for TTBR0 and 0xff for TTBR1 (so that we don't modify
the virtual address space of the kernel; I need to check the latest spec
to be sure). Note that the VA space for both user and kernel is limited
to 52-bit architecturally so, on access, bits 55..52 must be the same, 0
or 1, otherwise you get a fault.

Since the syzkaller tests would also need to set bits 55-52 (actually 48
for kernel addresses, we haven't merged the 52-bit kernel VA patches
yet) to hit a valid kernel address, I don't think ignoring the top byte
makes much difference to the expected failure scenario.

> > > Alright, the tl;dr appears to be:
> > > - you want more assurances that we can find __user stripping in the
> > >   kernel more easily. (But this seems like a parallel problem.)
> > 
> > Yes, and that we found all (most) cases now. The reason I don't see it
> > as a parallel problem is that, as maintainer, I promise an ABI to user
> > and I'd rather stick to it. I don't want, for example, ncurses to stop
> > working because of some ioctl() rejecting tagged pointers.
> 
> But this is what I don't understand: it would need to be ncurses _using
> TBI_, that would stop working (having started to work before, but then
> regress due to a newly added one-off bug). Regular ncurses will be fine
> because it's not using TBI. So The Golden Rule isn't violated,

Once we introduced TBI and the libc starts tagging heap allocations,
this becomes the new "regular" user space behaviour (i.e. using TBI). So
a new bug would break the golden rule. It could also be an old bug that
went unnoticed (i.e. you changed the graphics card and its driver gets
confused by tagged pointers coming from user-space).

> and by definition, it's a specific regression caused by some bug
> (since TBI would have had to have worked _before_ in the situation to
> be considered a regression now). Which describes the normal path for
> kernel development... add feature, find corner cases where it doesn't
> work, fix them, encounter new regressions, fix those, repeat forever.
> 
> > If it's just the occasional one-off bug I'm fine to deal with it. But
> > no-one convinced me yet that this is the case.
> 
> You believe there still to be some systemic cases that haven't been
> found yet? And even if so -- isn't it better to work on that
> incrementally?

I want some way to systematically identify potential issues (sparse?).
Since problems are most likely in drivers, I don't have all devices to
check and not all users have the knowledge to track down why something
failed.

I think we can do this incrementally as long the TBI ABI is not the
default. Even better if we made it per process.

> > As for the generic driver code (filesystems or other 

[PATCH] drm/amdgpu: Don't need to call csb_vram_unpin

2019-05-24 Thread Emily Deng
As it will destory clear_state_obj, and also will
unpin it in the gfx_v9_0_sw_fini, so don't need to
call csb_vram unpin in gfx_v9_0_hw_fini, or it will
have unpin warning.

Signed-off-by: Emily Deng 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 16 
 1 file changed, 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index c763733..231b9e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -1154,20 +1154,6 @@ static int gfx_v9_0_csb_vram_pin(struct amdgpu_device 
*adev)
return r;
 }
 
-static void gfx_v9_0_csb_vram_unpin(struct amdgpu_device *adev)
-{
-   int r;
-
-   if (!adev->gfx.rlc.clear_state_obj)
-   return;
-
-   r = amdgpu_bo_reserve(adev->gfx.rlc.clear_state_obj, true);
-   if (likely(r == 0)) {
-   amdgpu_bo_unpin(adev->gfx.rlc.clear_state_obj);
-   amdgpu_bo_unreserve(adev->gfx.rlc.clear_state_obj);
-   }
-}
-
 static void gfx_v9_0_mec_fini(struct amdgpu_device *adev)
 {
amdgpu_bo_free_kernel(>gfx.mec.hpd_eop_obj, NULL, NULL);
@@ -3385,8 +3371,6 @@ static int gfx_v9_0_hw_fini(void *handle)
gfx_v9_0_cp_enable(adev, false);
adev->gfx.rlc.funcs->stop(adev);
 
-   gfx_v9_0_csb_vram_unpin(adev);
-
return 0;
 }
 
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH] drm/amdgpu: fix unload driver fail

2019-05-24 Thread Christian König

Am 24.05.19 um 11:52 schrieb Emily Deng:

dc_destroy should be called amdgpu_cgs_destroy_device,
as it will use cgs context to read or write registers.

Signed-off-by: Emily Deng 


Acked-by: Christian König 


---
  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 7 ---
  1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index fd04217..f558c9c 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -616,6 +616,10 @@ static int amdgpu_dm_init(struct amdgpu_device *adev)
  static void amdgpu_dm_fini(struct amdgpu_device *adev)
  {
amdgpu_dm_destroy_drm_device(>dm);
+
+   /* DC Destroy TODO: Replace destroy DAL */
+   if (adev->dm.dc)
+   dc_destroy(>dm.dc);
/*
 * TODO: pageflip, vlank interrupt
 *
@@ -630,9 +634,6 @@ static void amdgpu_dm_fini(struct amdgpu_device *adev)
mod_freesync_destroy(adev->dm.freesync_module);
adev->dm.freesync_module = NULL;
}
-   /* DC Destroy TODO: Replace destroy DAL */
-   if (adev->dm.dc)
-   dc_destroy(>dm.dc);
  
  	mutex_destroy(>dm.dc_lock);
  


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH v15 00/17] arm64: untag user pointers passed to the kernel

2019-05-24 Thread Catalin Marinas
On Thu, May 23, 2019 at 03:49:05PM -0600, Khalid Aziz wrote:
> On 5/23/19 2:11 PM, Catalin Marinas wrote:
> > On Thu, May 23, 2019 at 11:51:40AM -0600, Khalid Aziz wrote:
> >> On 5/21/19 6:04 PM, Kees Cook wrote:
> >>> As an aside: I think Sparc ADI support in Linux actually side-stepped
> >>> this[1] (i.e. chose "solution 1"): "All addresses passed to kernel must
> >>> be non-ADI tagged addresses." (And sadly, "Kernel does not enable ADI
> >>> for kernel code.") I think this was a mistake we should not repeat for
> >>> arm64 (we do seem to be at least in agreement about this, I think).
> >>>
> >>> [1] https://lore.kernel.org/patchwork/patch/654481/
> >>
> >> That is a very early version of the sparc ADI patch. Support for tagged
> >> addresses in syscalls was added in later versions and is in the patch
> >> that is in the kernel.
> > 
> > I tried to figure out but I'm not familiar with the sparc port. How did
> > you solve the tagged address going into various syscall implementations
> > in the kernel (e.g. sys_write)? Is the tag removed on kernel entry or it
> > ends up deeper in the core code?
> 
> Another spot I should point out in ADI patch - Tags are not stored in
> VMAs and IOMMU does not support ADI tags on M7. ADI tags are stripped
> before userspace addresses are passed to IOMMU in the following snippet
> from the patch:
> 
> diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c
> index 5335ba3c850e..357b6047653a 100644
> --- a/arch/sparc/mm/gup.c
> +++ b/arch/sparc/mm/gup.c
> @@ -201,6 +202,24 @@ int __get_user_pages_fast(unsigned long start, int
> nr_pages
> , int write,
> pgd_t *pgdp;
> int nr = 0;
> 
> +#ifdef CONFIG_SPARC64
> +   if (adi_capable()) {
> +   long addr = start;
> +
> +   /* If userspace has passed a versioned address, kernel
> +* will not find it in the VMAs since it does not store
> +* the version tags in the list of VMAs. Storing version
> +* tags in list of VMAs is impractical since they can be
> +* changed any time from userspace without dropping into
> +* kernel. Any address search in VMAs will be done with
> +* non-versioned addresses. Ensure the ADI version bits
> +* are dropped here by sign extending the last bit before
> +* ADI bits. IOMMU does not implement version tags.
> +*/
> +   addr = (addr << (long)adi_nbits()) >> (long)adi_nbits();
> +   start = addr;
> +   }
> +#endif
> start &= PAGE_MASK;
> addr = start;
> len = (unsigned long) nr_pages << PAGE_SHIFT;

Thanks Khalid. I missed that sparc does not enable HAVE_GENERIC_GUP, so
you fix this case here. If we add the generic untagged_addr() macro in
the generic code, I think sparc can start making use of it rather than
open-coding the shifts.

There are a few other other places where tags can leak and the core code
would get confused (for example, madvise()). I presume your user space
doesn't exercise them. On arm64 we plan to just allow the C library to
tag any new memory allocation, so those core code paths would need to be
covered.

And similarly, devices, IOMMU, any DMA would ignore tags.

-- 
Catalin


[PATCH] drm/amdgpu: fix unload driver fail

2019-05-24 Thread Emily Deng
dc_destroy should be called amdgpu_cgs_destroy_device,
as it will use cgs context to read or write registers.

Signed-off-by: Emily Deng 
---
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index fd04217..f558c9c 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -616,6 +616,10 @@ static int amdgpu_dm_init(struct amdgpu_device *adev)
 static void amdgpu_dm_fini(struct amdgpu_device *adev)
 {
amdgpu_dm_destroy_drm_device(>dm);
+
+   /* DC Destroy TODO: Replace destroy DAL */
+   if (adev->dm.dc)
+   dc_destroy(>dm.dc);
/*
 * TODO: pageflip, vlank interrupt
 *
@@ -630,9 +634,6 @@ static void amdgpu_dm_fini(struct amdgpu_device *adev)
mod_freesync_destroy(adev->dm.freesync_module);
adev->dm.freesync_module = NULL;
}
-   /* DC Destroy TODO: Replace destroy DAL */
-   if (adev->dm.dc)
-   dc_destroy(>dm.dc);
 
mutex_destroy(>dm.dc_lock);
 
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10

2019-05-24 Thread Christian König
Yeah, that shouldn't be a problem. We just need to make sure we don't 
busy wait for the BOs to become available.


Christian.

Am 24.05.19 um 07:35 schrieb Liang, Prike:

Use Abaqus torturing the amdgpu driver more times will running into locking 
first busy BO deadlock .Then the caller will
return EAGAIN and eventually dm_plane_helper_prepare_fb popups out pinned 
failed message .For this case, the patch#7
  can we add EAGAIN as ERESTARTSYS which filter out the annoying error message .

Thanks,
Prike
-Original Message-
From: Christian König 
Sent: Thursday, May 23, 2019 7:04 PM
To: Zhou, David(ChunMing) ; Olsak, Marek ; Zhou, 
David(ChunMing) ; Liang, Prike ; 
dri-de...@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10

[CAUTION: External Email]

Am 23.05.19 um 12:24 schrieb zhoucm1:


On 2019年05月22日 20:59, Christian König wrote:

[CAUTION: External Email]

BOs on the LRU might be blocked during command submission and cause
OOM situations.

Avoid this by blocking for the first busy BO not locked by the same
ticket as the BO we are searching space for.

v10: completely start over with the patch since we didn't
   handled a whole bunch of corner cases.

Signed-off-by: Christian König 
---
   drivers/gpu/drm/ttm/ttm_bo.c | 77 ++--
   1 file changed, 66 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c
b/drivers/gpu/drm/ttm/ttm_bo.c index 4c6389d849ed..861facac33d4
100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -771,32 +771,72 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
* b. Otherwise, trylock it.
*/
   static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object
*bo,
-   struct ttm_operation_ctx *ctx, bool *locked)
+   struct ttm_operation_ctx *ctx, bool *locked,
bool *busy)
   {
  bool ret = false;

-   *locked = false;
  if (bo->resv == ctx->resv) {
  reservation_object_assert_held(bo->resv);
  if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
  || !list_empty(>ddestroy))
  ret = true;
+   *locked = false;
+   if (busy)
+   *busy = false;
  } else {
-   *locked = reservation_object_trylock(bo->resv);
-   ret = *locked;
+   ret = reservation_object_trylock(bo->resv);
+   *locked = ret;
+   if (busy)
+   *busy = !ret;
  }

  return ret;
   }

+/**
+ * ttm_mem_evict_wait_busy - wait for a busy BO to become available
+ *
+ * @busy_bo: BO which couldn't be locked with trylock
+ * @ctx: operation context
+ * @ticket: acquire ticket
+ *
+ * Try to lock a busy buffer object to avoid failing eviction.
+ */
+static int ttm_mem_evict_wait_busy(struct ttm_buffer_object *busy_bo,
+  struct ttm_operation_ctx *ctx,
+  struct ww_acquire_ctx *ticket) {
+   int r;
+
+   if (!busy_bo || !ticket)
+   return -EBUSY;
+
+   if (ctx->interruptible)
+   r =
+ reservation_object_lock_interruptible(busy_bo->resv,
+ ticket);
+   else
+   r = reservation_object_lock(busy_bo->resv, ticket);
+
+   /*
+* TODO: It would be better to keep the BO locked until
allocation is at
+* least tried one more time, but that would mean a much
larger rework
+* of TTM.
+*/
+   if (!r)
+   reservation_object_unlock(busy_bo->resv);
+
+   return r == -EDEADLK ? -EAGAIN : r; }
+
   static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 uint32_t mem_type,
 const struct ttm_place *place,
-  struct ttm_operation_ctx *ctx)
+  struct ttm_operation_ctx *ctx,
+  struct ww_acquire_ctx *ticket)
   {
+   struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
  struct ttm_bo_global *glob = bdev->glob;
  struct ttm_mem_type_manager *man = >man[mem_type];
-   struct ttm_buffer_object *bo = NULL;
  bool locked = false;
  unsigned i;
  int ret;
@@ -804,8 +844,15 @@ static int ttm_mem_evict_first(struct
ttm_bo_device *bdev,
  spin_lock(>lru_lock);
  for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
  list_for_each_entry(bo, >lru[i], lru) {
-   if (!ttm_bo_evict_swapout_allowable(bo, ctx,
))
+   bool busy;
+
+   if (!ttm_bo_evict_swapout_allowable(bo, ctx,
,
+ )) {
+   if (busy && !busy_bo &&
+   bo->resv->lock.ctx != ticket)
+   busy_bo = bo;
 

Re: [PATCH 8/8] drm/amdkfd: Use kfd fd to mmap mmio

2019-05-24 Thread Christian König

Am 24.05.19 um 00:41 schrieb Zeng, Oak:

TTM doesn't support CPU mapping of sg type bo (under which
mmio bo is created). Switch mmaping of mmio page to kfd
device file.

Change-Id: I1a1a24f2ac0662be3783d460c137731ade007b83
Signed-off-by: Oak Zeng 


Acked-by: Christian König 


---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 46 
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  1 +
  2 files changed, 47 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 455a3db..67d269b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1309,6 +1309,15 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file 
*filep,
args->handle = MAKE_HANDLE(args->gpu_id, idr_handle);
args->mmap_offset = offset;
  
+	/* MMIO is mapped through kfd device

+* Generate a kfd mmap offset
+*/
+   if (flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
+   args->mmap_offset = KFD_MMAP_TYPE_MMIO | 
KFD_MMAP_GPU_ID(args->gpu_id);
+   args->mmap_offset <<= PAGE_SHIFT;
+   args->mmap_offset |= 
amdgpu_amdkfd_get_mmio_remap_phys_addr(dev->kgd);
+   }
+
return 0;
  
  err_free:

@@ -1880,6 +1889,39 @@ static long kfd_ioctl(struct file *filep, unsigned int 
cmd, unsigned long arg)
return retcode;
  }
  
+static int kfd_mmio_mmap(struct kfd_dev *dev, struct kfd_process *process,

+ struct vm_area_struct *vma)
+{
+   phys_addr_t address;
+   int ret;
+
+   if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+   return -EINVAL;
+
+   address = amdgpu_amdkfd_get_mmio_remap_phys_addr(dev->kgd);
+
+   vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE |
+   VM_DONTDUMP | VM_PFNMAP;
+
+   vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+   pr_debug("Process %d mapping mmio page\n"
+" target user address == 0x%08llX\n"
+" physical address== 0x%08llX\n"
+" vm_flags== 0x%04lX\n"
+" size== 0x%04lX\n",
+process->pasid, (unsigned long long) vma->vm_start,
+address, vma->vm_flags, PAGE_SIZE);
+
+   ret = io_remap_pfn_range(vma,
+   vma->vm_start,
+   address >> PAGE_SHIFT,
+   PAGE_SIZE,
+   vma->vm_page_prot);
+   return ret;
+}
+
+
  static int kfd_mmap(struct file *filp, struct vm_area_struct *vma)
  {
struct kfd_process *process;
@@ -1910,6 +1952,10 @@ static int kfd_mmap(struct file *filp, struct 
vm_area_struct *vma)
if (!dev)
return -ENODEV;
return kfd_reserved_mem_mmap(dev, process, vma);
+   case KFD_MMAP_TYPE_MMIO:
+   if (!dev)
+   return -ENODEV;
+   return kfd_mmio_mmap(dev, process, vma);
}
  
  	return -EFAULT;

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 40a5c67..b61dc53 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -59,6 +59,7 @@
  #define KFD_MMAP_TYPE_DOORBELL(0x3ULL << KFD_MMAP_TYPE_SHIFT)
  #define KFD_MMAP_TYPE_EVENTS  (0x2ULL << KFD_MMAP_TYPE_SHIFT)
  #define KFD_MMAP_TYPE_RESERVED_MEM(0x1ULL << KFD_MMAP_TYPE_SHIFT)
+#define KFD_MMAP_TYPE_MMIO (0x0ULL << KFD_MMAP_TYPE_SHIFT)
  
  #define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT)

  #define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH] drm/amdgpu: Need to set the baco cap before baco reset

2019-05-24 Thread Deng, Emily
Ping ..

Best wishes
Emily Deng
>-Original Message-
>From: amd-gfx  On Behalf Of Deng,
>Emily
>Sent: Friday, May 24, 2019 10:29 AM
>To: Alex Deucher 
>Cc: amd-gfx list 
>Subject: RE: [PATCH] drm/amdgpu: Need to set the baco cap before baco
>reset
>
>[CAUTION: External Email]
>
>>-Original Message-
>>From: Alex Deucher 
>>Sent: Friday, May 24, 2019 12:09 AM
>>To: Deng, Emily 
>>Cc: amd-gfx list 
>>Subject: Re: [PATCH] drm/amdgpu: Need to set the baco cap before baco
>>reset
>>
>>[CAUTION: External Email]
>>
>>On Thu, May 23, 2019 at 6:22 AM Emily Deng  wrote:
>>>
>>> For passthrough, after rebooted the VM, driver will do a baco reset
>>> before doing other driver initialization during loading  driver. For
>>> doing the baco reset, it will first check the baco reset capability.
>>> So first need to set the cap from the vbios information or baco reset
>>> won't be enabled.
>>>
>>> Signed-off-by: Emily Deng 
>>> ---
>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  8 
>>>  drivers/gpu/drm/amd/amdgpu/soc15.c |  3 ++-
>>>  drivers/gpu/drm/amd/include/kgd_pp_interface.h |  1 +
>>>  drivers/gpu/drm/amd/powerplay/amd_powerplay.c  | 16
>>+++
>>>  drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c |  1 +
>>>  .../amd/powerplay/hwmgr/vega10_processpptables.c   | 24
>>++
>>>  .../amd/powerplay/hwmgr/vega10_processpptables.h   |  1 +
>>>  drivers/gpu/drm/amd/powerplay/inc/hwmgr.h  |  1 +
>>>  8 files changed, 54 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index bdd1fe73..2dde672 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -2611,6 +2611,14 @@ int amdgpu_device_init(struct amdgpu_device
>>*adev,
>>>  *  E.g., driver was not cleanly unloaded previously, etc.
>>>  */
>>> if (!amdgpu_sriov_vf(adev) &&
>>> amdgpu_asic_need_reset_on_init(adev)) {
>>> +   if (amdgpu_passthrough(adev) &&
>>> + adev->powerplay.pp_funcs &&
>>adev->powerplay.pp_funcs->set_asic_baco_cap) {
>>> +   r =
>>> + adev->powerplay.pp_funcs->set_asic_baco_cap(adev-
>>>powerplay.pp_handle);
>>> +   if (r) {
>>> +   dev_err(adev->dev, "set baco capability 
>>> failed\n");
>>> +   goto failed;
>>> +   }
>>> +   }
>>> +
>>
>>I think it would be cleaner to add this to hwmgr_early_init() or
>>something called from early init for powerplay.
>I  also preferred to put it in the hwmgr_early_init, but as the function
>set_asic_baco_cap  need to get the vbios info,  so need to put the
>amdgpu_get_bios before early init. If so the code changes too big.
>>
>>Alex
>>
>>> r = amdgpu_asic_reset(adev);
>>> if (r) {
>>> dev_err(adev->dev, "asic reset on init
>>> failed\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c
>>> b/drivers/gpu/drm/amd/amdgpu/soc15.c
>>> index 78bd4fc..d9fdd95 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
>>> @@ -764,7 +764,8 @@ static bool soc15_need_reset_on_init(struct
>>amdgpu_device *adev)
>>> /* Just return false for soc15 GPUs.  Reset does not seem to
>>>  * be necessary.
>>>  */
>>> -   return false;
>>> +   if (!amdgpu_passthrough(adev))
>>> +   return false;
>>>
>>> if (adev->flags & AMD_IS_APU)
>>> return false;
>>> diff --git a/drivers/gpu/drm/amd/include/kgd_pp_interface.h
>>> b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
>>> index 9f661bf..c6e2a51 100644
>>> --- a/drivers/gpu/drm/amd/include/kgd_pp_interface.h
>>> +++ b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
>>> @@ -296,6 +296,7 @@ struct amd_pm_funcs {
>>> int (*set_hard_min_fclk_by_freq)(void *handle, uint32_t clock);
>>> int (*set_min_deep_sleep_dcefclk)(void *handle, uint32_t clock);
>>> int (*get_asic_baco_capability)(void *handle, bool *cap);
>>> +   int (*set_asic_baco_cap)(void *handle);
>>> int (*get_asic_baco_state)(void *handle, int *state);
>>> int (*set_asic_baco_state)(void *handle, int state);
>>> int (*get_ppfeature_status)(void *handle, char *buf); diff
>>> --git a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
>>> b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
>>> index bea1587..9856760 100644
>>> --- a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
>>> +++ b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
>>> @@ -1404,6 +1404,21 @@ static int pp_set_active_display_count(void
>>*handle, uint32_t count)
>>> return ret;
>>>  }
>>>
>>> +static int pp_set_asic_baco_cap(void *handle) {
>>> +   struct pp_hwmgr *hwmgr = handle;
>>> +
>>> +   if (!hwmgr)
>>> +   return -EINVAL;
>>>