On 10/16/2025 3:18 AM, Ellen Pan wrote:
     1. Introduced amdgpu_virt_init_critical_region during VF init.
      - VFs use init_data_header_offset and init_data_header_size_kb
             transmitted via PF2VF mailbox to locate the init data header
             in VRAM, read the critical regions' offsets/sizes from it, and
             save them to adev->virt.crit_regn and adev->virt.crit_regn_tbl.

Signed-off-by: Ellen Pan <[email protected]>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |   4 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c    | 165 ++++++++++++++++++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h    |  11 ++
  drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h |  31 ++++
  4 files changed, 211 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index a99185ed0642..3ffb9bb1ec0b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2782,6 +2782,10 @@ static int amdgpu_device_ip_early_init(struct 
amdgpu_device *adev)
                r = amdgpu_virt_request_full_gpu(adev, true);
                if (r)
                        return r;
+
+               r = amdgpu_virt_init_critical_region(adev);
+               if (r)
+                       return r;
        }
switch (adev->asic_type) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 56573fb27f63..805ecc69a8b5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -44,6 +44,18 @@
                vf2pf_info->ucode_info[ucode].version = ver; \
        } while (0)
+#define mmRCC_CONFIG_MEMSIZE 0xde3
+
+const char *amdgpu_virt_dynamic_crit_table_name[] = {
+       "IP DISCOVERY",
+       "VBIOS IMG",
+       "RAS TELEMETRY",
+       "DATA EXCHANGE",
+       "BAD PAGE INFO",
+       "INIT HEADER",
+       "LAST",
+};
+
  bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev)
  {
        /* By now all MMIO pages except mailbox are blocked */
@@ -843,6 +855,159 @@ static void amdgpu_virt_init_ras(struct amdgpu_device 
*adev)
        adev->virt.ras.cper_rptr = 0;
  }
+static uint8_t amdgpu_virt_crit_region_calc_checksum(uint8_t *buf_start, uint8_t *buf_end)
+{
+       uint32_t sum = 0;
+
+       if (buf_start >= buf_end)
+               return 0;
+
+       for (; buf_start < buf_end; buf_start++)
+               sum += buf_start[0];
+
+       return 0xffffffff - sum;
+}
+
+int amdgpu_virt_init_critical_region(struct amdgpu_device *adev)
+{
+       struct amd_sriov_msg_init_data_header *init_data_hdr = NULL;
+       uint32_t init_hdr_offset = adev->virt.init_data_header.offset;
+       uint32_t init_hdr_size = adev->virt.init_data_header.size_kb << 10;
+       uint64_t vram_size;
+       int r = 0;
+       uint8_t checksum = 0;
+
+       /* Skip below init if critical region version != v2 */
+       if (adev->virt.req_init_data_ver != GPU_CRIT_REGION_V2)
+               return 0;
+
+       if (init_hdr_offset < 0) {
+               dev_err(adev->dev, "Invalid init header offset\n");
+               return -EINVAL;
+       }
+
+       vram_size = RREG32(mmRCC_CONFIG_MEMSIZE);
+       if (!vram_size || vram_size == U32_MAX)
+               return -EINVAL;
+       vram_size <<= 20;
+
+       if ((init_hdr_offset + init_hdr_size) > vram_size) {
+               dev_err(adev->dev, "init_data_header exceeds VRAM size, 
exiting\n");
+               return -EINVAL;
+       }
+
+       /* Allocate for init_data_hdr */
+       init_data_hdr = kzalloc(sizeof(struct amd_sriov_msg_init_data_header), 
GFP_KERNEL);
+       if (!init_data_hdr)
+               return -ENOMEM;
+
+       amdgpu_device_vram_access(adev, (uint64_t)init_hdr_offset, (uint32_t 
*)init_data_hdr,
+                                       sizeof(struct 
amd_sriov_msg_init_data_header), false);
+
+       /* Table validation */
+       if (strncmp(init_data_hdr->signature,
+                               AMDGPU_SRIOV_CRIT_DATA_SIGNATURE,
+                               AMDGPU_SRIOV_CRIT_DATA_SIG_LEN) != 0) {
+               dev_err(adev->dev, "Invalid init data signature: %.4s\n",
+                       init_data_hdr->signature);
+               r = -EINVAL;
+               goto out;
+       }
+
+       checksum = amdgpu_virt_crit_region_calc_checksum(
+                       (uint8_t *)&init_data_hdr->initdata_offset,
+                       (uint8_t *)init_data_hdr +
+                       sizeof(struct amd_sriov_msg_init_data_header));
+       if (checksum != init_data_hdr->checksum) {
+               dev_err(adev->dev, "Found unmatching checksum from calculation 0x%x 
and init_data 0x%x\n",
+                               checksum, init_data_hdr->checksum);
+               r = -EINVAL;
+               goto out;
+       }
+
+       memset(&adev->virt.crit_regn, 0, sizeof(adev->virt.crit_regn));
+       memset(adev->virt.crit_regn_tbl, 0, sizeof(adev->virt.crit_regn_tbl));
+
+       adev->virt.crit_regn.offset = init_data_hdr->initdata_offset;
+       adev->virt.crit_regn.size_kb = init_data_hdr->initdata_size_in_kb;
+
+       /* Validation and initialization for each table entry */
+       if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, 
AMD_SRIOV_MSG_IPD_TABLE_ID)) {
+               if (init_data_hdr->ip_discovery_size_in_kb > 
DISCOVERY_TMR_SIZE) {
+                       dev_err(adev->dev, "Invalid IP discovery size: 0x%x\n",
+                                       init_data_hdr->ip_discovery_size_in_kb);
+                       r = -EINVAL;
+                       goto out;
+               }
+
+               adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].offset =
+                       init_data_hdr->ip_discovery_offset;
+               adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].size_kb =
+                       init_data_hdr->ip_discovery_size_in_kb;
+       } else {
+               dev_err(adev->dev, "Missing dynamic %s info from critical region 
v2.\n",
+                       
amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_IPD_TABLE_ID]);
+               r = -EINVAL;
+               goto out;
+       }
+
+       if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, 
AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID)) {
+               
adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID].offset =
+                       init_data_hdr->vbios_img_offset;
+               
adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID].size_kb =
+                       init_data_hdr->vbios_img_size_in_kb;
+       } else {
+               dev_err(adev->dev, "Missing dynamic %s info from critical region 
v2.\n",
+                       
amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID]);
+               r = -EINVAL;
+               goto out;
+       }
+
+       if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, 
AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID)) {
+               
adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID].offset =
+                       init_data_hdr->ras_tele_info_offset;
+               
adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID].size_kb =
+                       init_data_hdr->ras_tele_info_size_in_kb;
+       } else {
+               dev_err(adev->dev, "Missing dynamic %s info from critical region 
v2.\n",
+                       
amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID]);
+               r = -EINVAL;
+               goto out;
+       }
+
+       if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, 
AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID)) {
+               
adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].offset =
+                       init_data_hdr->dataexchange_offset;
+               
adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].size_kb =
+                       init_data_hdr->dataexchange_size_in_kb;
+       } else {
+               dev_err(adev->dev, "Missing dynamic %s info from critical region 
v2.\n",
+                       
amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID]);
+               r = -EINVAL;
+               goto out;
+       }
+
+       if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, 
AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID)) {
+               
adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID].offset =
+                       init_data_hdr->bad_page_info_offset;
+               
adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID].size_kb =
+                       init_data_hdr->bad_page_size_in_kb;
+       } else {
+               dev_err(adev->dev, "Missing dynamic %s info from critical region 
v2.\n",
+                       
amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID]);
+               r = -EINVAL;
+               goto out;

Could you confirm whether this is really an error condition? For example, I could see this scheme being followed on SoCs which don't support RAS, in which case the RAS telemetry/bad page table sections may not make sense. The same could apply to the other tables (though most of the others don't look optional), so please confirm which ones are mandatory and which are optional.

Thanks,
Lijo

+       }
+
+       adev->virt.is_dynamic_crit_regn_enabled = true;
+
+out:
+       kfree(init_data_hdr);
+       init_data_hdr = NULL;
+
+       return r;
+}
+
  void amdgpu_virt_init(struct amdgpu_device *adev)
  {
        bool is_sriov = false;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 36247a160aa6..8d03a8620de9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -54,6 +54,12 @@
#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 2 +/* Signature used to validate the SR-IOV dynamic critical region init data header ("INDA") */
+#define AMDGPU_SRIOV_CRIT_DATA_SIGNATURE "INDA"
+#define AMDGPU_SRIOV_CRIT_DATA_SIG_LEN   4
+
+#define IS_SRIOV_CRIT_REGN_ENTRY_VALID(hdr, id) ((hdr)->valid_tables & (1 << 
(id)))
+
  enum amdgpu_sriov_vf_mode {
        SRIOV_VF_MODE_BARE_METAL = 0,
        SRIOV_VF_MODE_ONE_VF,
@@ -296,6 +302,9 @@ struct amdgpu_virt {
/* dynamic(v2) critical regions */
        struct amdgpu_virt_region init_data_header;
+       struct amdgpu_virt_region crit_regn;
+       struct amdgpu_virt_region crit_regn_tbl[AMD_SRIOV_MSG_MAX_TABLE_ID];
+       bool is_dynamic_crit_regn_enabled;
/* vf2pf message */
        struct delayed_work vf2pf_work;
@@ -432,6 +441,8 @@ void amdgpu_virt_exchange_data(struct amdgpu_device *adev);
  void amdgpu_virt_fini_data_exchange(struct amdgpu_device *adev);
  void amdgpu_virt_init(struct amdgpu_device *adev);
+int amdgpu_virt_init_critical_region(struct amdgpu_device *adev);
+
  bool amdgpu_virt_can_access_debugfs(struct amdgpu_device *adev);
  int amdgpu_virt_enable_access_debugfs(struct amdgpu_device *adev);
  void amdgpu_virt_disable_access_debugfs(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h 
b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
index 9228fd2c6dfd..1cee083fb6bd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
@@ -71,6 +71,37 @@ enum amd_sriov_crit_region_version {
        GPU_CRIT_REGION_V2 = 2,
  };
+/* v2 layout offset enum (in order of allocation) */
+enum amd_sriov_msg_table_id_enum {
+       AMD_SRIOV_MSG_IPD_TABLE_ID = 0,
+       AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID,
+       AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID,
+       AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID,
+       AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID,
+       AMD_SRIOV_MSG_INITD_H_TABLE_ID,
+       AMD_SRIOV_MSG_MAX_TABLE_ID,
+};
+
+struct amd_sriov_msg_init_data_header {
+       char     signature[4];  /* "INDA"  */
+       uint32_t version;
+       uint32_t checksum;
+       uint32_t initdata_offset; /* 0 */
+       uint32_t initdata_size_in_kb; /* 5MB */
+       uint32_t valid_tables;
+       uint32_t vbios_img_offset;
+       uint32_t vbios_img_size_in_kb;
+       uint32_t dataexchange_offset;
+       uint32_t dataexchange_size_in_kb;
+       uint32_t ras_tele_info_offset;
+       uint32_t ras_tele_info_size_in_kb;
+       uint32_t ip_discovery_offset;
+       uint32_t ip_discovery_size_in_kb;
+       uint32_t bad_page_info_offset;
+       uint32_t bad_page_size_in_kb;
+       uint32_t reserved[8];
+};
+
  /*
   * PF2VF history log:
   * v1 defined in amdgim

Reply via email to