[Public]
>-----Original Message-----
>From: Pan, Ellen <[email protected]>
>Sent: Saturday, October 11, 2025 12:19 AM
>To: [email protected]
>Cc: Deucher, Alexander <[email protected]>; Koenig, Christian
><[email protected]>; Lazar, Lijo <[email protected]>; Chan, Hing
>Pong <[email protected]>; Pan, Ellen <[email protected]>
>Subject: [PATCH v3 3/6] drm/amdgpu: Introduce SRIOV critical regions v2
>during VF init
>
> 1. Introduced amdgpu_virt_init_critical_region during VF init.
> - VFs use init_data_header_offset and init_data_header_size_kb
> transmitted via PF2VF mailbox to locate the init data header in
> VRAM, read the critical regions' offsets/sizes from it, and save
> them to adev->virt.crit_regn and adev->virt.crit_regn_tbl[].
>
>Signed-off-by: Ellen Pan <[email protected]>
>---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 113
>++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 7 ++
> drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h | 31 ++++++
> 4 files changed, 155 insertions(+)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>index 929936c8d87c..351cfe03a1aa 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>@@ -2754,6 +2754,10 @@ static int amdgpu_device_ip_early_init(struct
>amdgpu_device *adev)
> r = amdgpu_virt_request_full_gpu(adev, true);
> if (r)
> return r;
>+
>+ r = amdgpu_virt_init_critical_region(adev);
>+ if (r)
>+ return r;
> }
>
> switch (adev->asic_type) {
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>index 3a6b0e1084d7..6eca5e8a7375 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>@@ -843,6 +843,119 @@ static void amdgpu_virt_init_ras(struct
>amdgpu_device *adev)
> adev->virt.ras.cper_rptr = 0;
> }
>
>+static uint8_t amdgpu_virt_crit_region_calc_checksum(uint8_t
>+*buf_start, uint8_t *buf_end) {
>+ uint32_t sum = 0;
>+
>+ if (buf_start >= buf_end)
>+ return 0;
>+
>+ for (; buf_start < buf_end; buf_start++)
>+ sum += buf_start[0];
>+
>+ return 0xffffffff - sum;
>+}
>+
>+int amdgpu_virt_init_critical_region(struct amdgpu_device *adev) {
>+ struct amd_sriov_msg_init_data_header *init_data_hdr = NULL;
>+ uint32_t init_hdr_offset = adev->virt.init_data_header.offset;
>+ uint32_t init_hdr_size = adev->virt.init_data_header.size_kb << 10;
>+ uint64_t pos = 0;
[lijo]
The 'pos' variable is not required. The read always starts at init_hdr_offset, so that can be passed directly.
>+ uint64_t vram_size;
>+ int r = 0;
>+ uint8_t checksum = 0;
>+
>+ /* Skip below init if critical region version != v2 */
>+ if (adev->virt.req_init_data_ver != GPU_CRIT_REGION_V2)
>+ return 0;
>+
>+ if (init_hdr_offset < 0) {
>+ dev_err(adev->dev, "Invalid init header offset\n");
>+ return -EINVAL;
>+ }
>+
>+ vram_size = RREG32(mmRCC_CONFIG_MEMSIZE);
>+ if (!vram_size || vram_size == U32_MAX)
>+ return -EINVAL;
>+ vram_size <<= 20;
>+
>+ if ((init_hdr_offset + init_hdr_size) > vram_size) {
>+ dev_err(adev->dev, "init_data_header exceeds VRAM size,
>exiting\n");
>+ return -EINVAL;
>+ }
>+
>+ /* Allocate for init_data_hdr */
>+ init_data_hdr = kzalloc(sizeof(struct
>amd_sriov_msg_init_data_header), GFP_KERNEL);
>+ if (!init_data_hdr)
>+ return -ENOMEM;
>+
>+ pos = (uint64_t)init_hdr_offset;
>+ amdgpu_device_vram_access(adev, pos, (uint32_t *)init_data_hdr,
>+ sizeof(struct
>amd_sriov_msg_init_data_header), false);
>+
>+ switch (init_data_hdr->version) {
>+ case GPU_CRIT_REGION_V2:
[lijo]
There is already a version check at the beginning of this function.
>+ if (strncmp(init_data_hdr->signature, "INDA", 4) != 0) {
[lijo]
Suggest keeping this signature as a #define
>+ dev_err(adev->dev, "Invalid init data signature:
>%.4s\n",
>+ init_data_hdr->signature);
>+ r = -EINVAL;
>+ goto out;
>+ }
>+
>+ checksum = amdgpu_virt_crit_region_calc_checksum(
>+ (uint8_t *)&init_data_hdr->initdata_offset,
>+ (uint8_t *)init_data_hdr +
>+ sizeof(struct
>amd_sriov_msg_init_data_header));
>+ if (checksum != init_data_hdr->checksum) {
>+ dev_err(adev->dev, "Found unmatching checksum
>from calculation 0x%x and init_data 0x%x\n",
>+ checksum, init_data_hdr->checksum);
>+ r = -EINVAL;
>+ goto out;
>+ }
>+
>+ /* Initialize critical region offsets */
>+ adev->virt.crit_regn.offset = init_data_hdr->initdata_offset;
>+ adev-
>>virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].offset =
>+ init_data_hdr->ip_discovery_offset;
>+ adev-
>>virt.crit_regn_tbl[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID].offset =
>+ init_data_hdr->vbios_img_offset;
>+ adev-
>>virt.crit_regn_tbl[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID].offset =
>+ init_data_hdr->ras_tele_info_offset;
>+ adev-
>>virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].offset =
>+ init_data_hdr->dataexchange_offset;
>+ adev-
>>virt.crit_regn_tbl[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID].offset =
>+ init_data_hdr->bad_page_info_offset;
>+
>+ /* Initialize critical region sizes */
>+ adev->virt.crit_regn.size_kb = init_data_hdr-
>>initdata_size_in_kb;
>+ adev-
>>virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].size_kb =
>+ init_data_hdr->ip_discovery_size_in_kb;
>+ adev-
>>virt.crit_regn_tbl[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID].size_kb =
>+ init_data_hdr->vbios_img_size_in_kb;
>+ adev-
>>virt.crit_regn_tbl[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID].size_kb =
>+ init_data_hdr->ras_tele_info_size_in_kb;
>+ adev-
>>virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].size_kb =
>+ init_data_hdr->dataexchange_size_in_kb;
>+ adev-
>>virt.crit_regn_tbl[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID].size_kb =
>+ init_data_hdr->bad_page_size_in_kb;
>+
>+ adev->virt.is_dynamic_crit_regn_enabled = true;
>+ break;
>+ default:
>+ dev_err(adev->dev, "Invalid init header version: %u\n",
>+ init_data_hdr->version);
>+ r = -EINVAL;
>+ goto out;
>+ }
>+
>+out:
>+ kfree(init_data_hdr);
>+ init_data_hdr = NULL;
>+
>+ return r;
>+}
>+
> void amdgpu_virt_init(struct amdgpu_device *adev) {
> bool is_sriov = false;
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>index 36247a160aa6..f46edc03f57f 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>@@ -52,6 +52,8 @@
> /* tonga/fiji use this offset */
> #define mmBIF_IOV_FUNC_IDENTIFIER 0x1503
>
>+#define mmRCC_CONFIG_MEMSIZE 0xde3
[lijo]
Alex already commented on this. Keeping this define here will pollute every
file that includes amdgpu_virt.h and can conflict with the real definition.
Thanks,
Lijo
>+
> #define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 2
>
> enum amdgpu_sriov_vf_mode {
>@@ -296,6 +298,9 @@ struct amdgpu_virt {
>
> /* dynamic(v2) critical regions */
> struct amdgpu_virt_region init_data_header;
>+ struct amdgpu_virt_region crit_regn;
>+ struct amdgpu_virt_region
>crit_regn_tbl[AMD_SRIOV_MSG_MAX_TABLE_ID];
>+ bool is_dynamic_crit_regn_enabled;
>
> /* vf2pf message */
> struct delayed_work vf2pf_work;
>@@ -432,6 +437,8 @@ void amdgpu_virt_exchange_data(struct
>amdgpu_device *adev); void amdgpu_virt_fini_data_exchange(struct
>amdgpu_device *adev); void amdgpu_virt_init(struct amdgpu_device *adev);
>
>+int amdgpu_virt_init_critical_region(struct amdgpu_device *adev);
>+
> bool amdgpu_virt_can_access_debugfs(struct amdgpu_device *adev); int
>amdgpu_virt_enable_access_debugfs(struct amdgpu_device *adev); void
>amdgpu_virt_disable_access_debugfs(struct amdgpu_device *adev); diff --git
>a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
>b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
>index b53caab5b706..d15c256f9abd 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
>@@ -70,6 +70,37 @@ enum amd_sriov_crit_region_version {
> GPU_CRIT_REGION_V2 = 2,
> };
>
>+/* v2 layout offset enum (in order of allocation) */ enum
>+amd_sriov_msg_table_id_enum {
>+ AMD_SRIOV_MSG_IPD_TABLE_ID = 0,
>+ AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID,
>+ AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID,
>+ AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID,
>+ AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID,
>+ AMD_SRIOV_MSG_INITD_H_TABLE_ID,
>+ AMD_SRIOV_MSG_MAX_TABLE_ID,
>+};
>+
>+struct amd_sriov_msg_init_data_header {
>+ char signature[4]; /* "INDA" */
>+ uint32_t version;
>+ uint32_t checksum;
>+ uint32_t initdata_offset; /* 0 */
>+ uint32_t initdata_size_in_kb; /* 5MB */
>+ uint32_t valid_tables;
>+ uint32_t vbios_img_offset;
>+ uint32_t vbios_img_size_in_kb;
>+ uint32_t dataexchange_offset;
>+ uint32_t dataexchange_size_in_kb;
>+ uint32_t ras_tele_info_offset;
>+ uint32_t ras_tele_info_size_in_kb;
>+ uint32_t ip_discovery_offset;
>+ uint32_t ip_discovery_size_in_kb;
>+ uint32_t bad_page_info_offset;
>+ uint32_t bad_page_size_in_kb;
>+ uint32_t reserved[8];
>+};
>+
> /*
> * PF2VF history log:
> * v1 defined in amdgim
>--
>2.34.1