This patch introduces a baseline implementation of a draft proposal of the RISC-V IOMMU specification, as discussed in the RISC-V Forum [1] [2].
The implementation follows a draft version of the specification published at [3] including all updates available on 2022/03/10. This patch covers baseline features proposed in the specification: - Two stage address translation, with Sv32, Sv39, Sv48, Sv57 modes. - Multilevel device directory tree. - Cache management command interface. - Fault reporting interface. References: [1] https://lists.riscv.org/g/tech-privileged/message/875 [2] https://lists.riscv.org/g/tech-iommu/message/3 [3] https://docs.google.com/document/d/1ytBZ6eDk1pAeBlZjDvm6_qqJbKQ0fMYKedyx0uoAGB0/view Signed-off-by: Tomasz Jeznach <tjezn...@rivosinc.com> --- hw/riscv/Kconfig | 3 + hw/riscv/meson.build | 1 + hw/riscv/rivos_iommu.c | 1350 ++++++++++++++++++++++++++++++++ hw/riscv/trace-events | 7 + hw/riscv/trace.h | 2 + include/hw/pci/pci_ids.h | 1 + include/hw/riscv/rivos_iommu.h | 80 ++ meson.build | 1 + 8 files changed, 1445 insertions(+) create mode 100644 hw/riscv/rivos_iommu.c create mode 100644 hw/riscv/trace-events create mode 100644 hw/riscv/trace.h create mode 100644 include/hw/riscv/rivos_iommu.h diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig index 91bb9d21c4..c6cbd7b42c 100644 --- a/hw/riscv/Kconfig +++ b/hw/riscv/Kconfig @@ -4,6 +4,9 @@ config RISCV_NUMA config IBEX bool +config RIVOS_IOMMU + bool + config MICROCHIP_PFSOC bool select CADENCE_SDHCI diff --git a/hw/riscv/meson.build b/hw/riscv/meson.build index ab6cae57ea..a2aeb5fab4 100644 --- a/hw/riscv/meson.build +++ b/hw/riscv/meson.build @@ -9,5 +9,6 @@ riscv_ss.add(when: 'CONFIG_SIFIVE_E', if_true: files('sifive_e.c')) riscv_ss.add(when: 'CONFIG_SIFIVE_U', if_true: files('sifive_u.c')) riscv_ss.add(when: 'CONFIG_SPIKE', if_true: files('spike.c')) riscv_ss.add(when: 'CONFIG_MICROCHIP_PFSOC', if_true: files('microchip_pfsoc.c')) +riscv_ss.add(when: 'CONFIG_RIVOS_IOMMU', if_true: files('rivos_iommu.c')) hw_arch += {'riscv': riscv_ss} diff --git a/hw/riscv/rivos_iommu.c b/hw/riscv/rivos_iommu.c new file mode 100644 index 
0000000000..f043a6864a --- /dev/null +++ b/hw/riscv/rivos_iommu.c @@ -0,0 +1,1350 @@ +/* + * QEMU emulation of an RISC-V RIVOS-IOMMU + * + * Copyright (C) 2022 Rivos Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "qom/object.h" +#include "hw/pci/msi.h" +#include "hw/pci/msix.h" +#include "hw/qdev-properties.h" +#include "hw/riscv/riscv_hart.h" +#include "hw/riscv/rivos_iommu.h" +#include "migration/vmstate.h" +#include "qapi/error.h" +#include "qemu/error-report.h" + +#include "trace.h" + + +/* Based on Rivos RISC-V IOMMU Specification, Mar 10, 2022 */ + +/* Rivos I/O programming interface registers */ +#define RIO_REG_CAP 0x0000 /* Supported capabilities */ +#define RIO_REG_DDTP 0x0010 /* Device Directory Table Pointer */ +#define RIO_REG_CQ_BASE 0x0018 /* Command queue base/head/tail */ +#define RIO_REG_CQ_HEAD 0x0020 +#define RIO_REG_CQ_TAIL 0x0024 +#define RIO_REG_FQ_BASE 0x0028 /* Fault queue base/head/tail */ +#define RIO_REG_FQ_HEAD 0x0030 +#define RIO_REG_FQ_TAIL 0x0034 +#define RIO_REG_PQ_BASE 0x0038 /* Page request queue base/head/tail */ +#define RIO_REG_PQ_HEAD 0x0040 +#define RIO_REG_PQ_TAIL 0x0044 +#define RIO_REG_CQ_CONTROL 0x0048 /* Command queue control */ +#define RIO_REG_FQ_CONTROL 0x004C /* Fault queue control */ +#define RIO_REG_PQ_CONTROL 0x0050 /* Page request queue control */ +#define RIO_REG_IPSR 0x0054 /* Interrupt pending status */ 
+#define RIO_REG_IOCNTOVF 0x0058 +#define RIO_REG_IOCNTINH 0x005C +#define RIO_REG_IOHPMCYCLES 0x0060 +#define RIO_REG_IOHPMCTR_BASE 0x0068 +#define RIO_REG_IOHPMEVT_BASE 0x0160 +#define RIO_REG_IOCNTSEC 0x0258 +#define RIO_REG_IVEC 0x02F8 /* Interrupt cause to vector mapping */ +#define RIO_REG_MSI_ADDR_BASE 0x0300 /* MSI address for vector #0 */ +#define RIO_REG_MSI_DATA_BASE 0x0308 /* MSI data for vector #0 */ +#define RIO_REG_MSI_CTRL_BASE 0x030C /* MSI control for vector #0 */ +#define RIO_REG_MSI_PBA_BASE 0x0400 /* MSI Pending Bit Array */ + +/* Capabilities supported by the IOMMU, RIO_REG_CAP */ +#define RIO_CAP_REVISION_MASK 0x00FF +#define RIO_CAP_STAGE_ONE (1ULL << 8) +#define RIO_CAP_STAGE_TWO (1ULL << 9) +#define RIO_CAP_MSI (1ULL << 10) +#define RIO_CAP_MRIF (1ULL << 11) +#define RIO_CAP_ATS (1ULL << 12) +#define RIO_CAP_AMO (1ULL << 13) + +/* Device directory table pointer */ +#define RIO_DDTP_BUSY (1ULL << 59) + +#define RIO_DDTP_MASK_PPN 0x00000FFFFFFFFFFFULL +#define RIO_DDTP_MASK_MODE 0xF000000000000000ULL +#define RIO_DDTE_MASK_PPN 0x00FFFFFFFFFFF000ULL + +/* Device directory mode values, within RIO_DDTP_MASK_MODE */ +#define RIO_DDTP_MODE_OFF 0 +#define RIO_DDTP_MODE_BARE 1 +#define RIO_DDTP_MODE_3LVL 2 +#define RIO_DDTP_MODE_2LVL 3 +#define RIO_DDTP_MODE_1LVL 4 +#define RIO_DDTP_MODE_MAX RIO_DDTP_MODE_1LVL + +/* Command queue base register */ +#define RIO_CQ_MASK_LOG2SZ 0x000000000000001FULL +#define RIO_CQ_MASK_PPN 0x0001FFFFFFFFFFE0ULL + +/* Command queue control and status register */ +#define RIO_CQ_ENABLE (1 << 0) +#define RIO_CQ_IRQ_ENABLE (1 << 1) +#define RIO_CQ_FAULT (1 << 8) +#define RIO_CQ_TIMEOUT (1 << 9) +#define RIO_CQ_ERROR (1 << 10) +#define RIO_CQ_ACTIVE (1 << 16) +#define RIO_CQ_BUSY (1 << 17) + +/* Fault queue base register */ +#define RIO_FQ_MASK_LOG2SZ 0x000000000000001FULL +#define RIO_FQ_MASK_PPN 0x0001FFFFFFFFFFE0ULL + +/* Fault queue control and status register */ +#define RIO_FQ_ENABLE (1 << 0) +#define 
RIO_FQ_IRQ_ENABLE (1 << 1) +#define RIO_FQ_FAULT (1 << 8) +#define RIO_FQ_FULL (1 << 9) +#define RIO_FQ_ACTIVE (1 << 16) +#define RIO_FQ_BUSY (1 << 17) + +/* Page request queue base register */ +#define RIO_PQ_MASK_LOG2SZ 0x000000000000001FULL +#define RIO_PQ_MASK_PPN 0x0001FFFFFFFFFFE0ULL + +/* Page request queue control and status register */ +#define RIO_PQ_ENABLE (1 << 0) +#define RIO_PQ_IRQ_ENABLE (1 << 1) +#define RIO_PQ_FAULT (1 << 8) +#define RIO_PQ_FULL (1 << 9) +#define RIO_PQ_ACTIVE (1 << 16) +#define RIO_PQ_BUSY (1 << 17) + +/* Interrupt Sources, used for IPSR and IVEC indexing. */ +#define RIO_INT_CQ 0 +#define RIO_INT_FQ 1 +#define RIO_INT_PM 2 +#define RIO_INT_PQ 3 +#define RIO_INT_COUNT 4 + +/* Device Context */ +typedef struct RivosIOMMUDeviceContext { + uint64_t tc; /* Translation Control */ + uint64_t gatp; /* IO Hypervisor Guest Address Translation */ + uint64_t satp; /* IO SATP or IO vSATP or PDTP */ + uint64_t pscid; /* Process soft-context ID */ + uint64_t msiptp; /* MSI Page Table Pointer (extended context) */ + uint64_t msi_addr_mask; + uint64_t msi_addr_pattern; + uint64_t _reserved; +} RivosIOMMUDeviceContext; + +#define RIO_DCTC_VALID (1ULL << 0) +#define RIO_DCTC_EN_ATS (1ULL << 1) +#define RIO_DCTC_EN_PRI (1ULL << 2) +#define RIO_DCTC_T2GPA (1ULL << 3) +#define RIO_DCTC_DIS_TRANS_FAULT (1ULL << 4) +#define RIO_DCTC_PDTV (1ULL << 5) + +/* Shared MODE:ASID:PPN masks for GATP, SATP */ +#define RIO_ATP_MASK_PPN SATP64_PPN +#define RIO_ATP_MASK_GSCID SATP64_ASID +#define RIO_ATP_MASK_MODE SATP64_MODE + +#define RIO_ATP_MODE_SV32 VM_1_10_SV32 +#define RIO_ATP_MODE_SV39 VM_1_10_SV39 +#define RIO_ATP_MODE_SV48 VM_1_10_SV48 +#define RIO_ATP_MODE_SV57 VM_1_10_SV57 +#define RIO_ATP_MODE_BARE VM_1_10_MBARE + +/* satp.mode when tc.RIO_DCTC_PDTV is set */ +#define RIO_PDTP_MODE_BARE 0 +#define RIO_PDTP_MODE_PD20 1 +#define RIO_PDTP_MODE_PD17 2 +#define RIO_PDTP_MODE_PD8 3 + +#define RIO_DCMSI_VALID 1 +#define RIO_DCMSI_MASK_PPN 0x0FFFFFFFFFFFFFFEULL 
+#define RIO_DCMSI_MASK_MODE 0xF000000000000000ULL + +#define RIO_DCMSI_MODE_BARE 0 +#define RIO_DCMSI_MODE_FLAT 1 + +/* I/O Management Unit Command format */ +typedef struct RivosIOMMUCommand { + uint64_t request; + uint64_t address; +} RivosIOMMUCommand; + +/* RivosIOMMUCommand.request opcode and function mask */ +#define RIO_CMD_MASK_FUN_OP 0x00000000000003FFULL + +/* opcode == IOTINVAL.* */ +#define RIO_CMD_IOTINVAL_VMA 0x001 +#define RIO_CMD_IOTINVAL_GVMA 0x081 +#define RIO_CMD_IOTINVAL_MSI 0x101 + +#define RIO_IOTINVAL_PSCID_VALID 0x0000000000000400ULL +#define RIO_IOTINVAL_ADDR_VALID 0x0000000000000800ULL +#define RIO_IOTINVAL_GSCID_VALID 0x0000000000001000ULL +#define RIO_IOTINVAL_ADDR_NAPOT 0x0000000000002000ULL +#define RIO_IOTINVAL_MASK_PSCID 0x0000000FFFFF0000ULL +#define RIO_IOTINVAL_MASK_GSCID 0x00FFFF0000000000ULL + +/* opcode == IODIR.* */ +#define RIO_CMD_IODIR_INV_DDT 0x002 +#define RIO_CMD_IODIR_PRE_DDT 0x082 +#define RIO_CMD_IODIR_INV_PDT 0x102 +#define RIO_CMD_IODIR_PRE_PDT 0x182 + +#define RIO_IODIR_DID_VALID 0x0000000000000400ULL +#define RIO_IODIR_MASK_PID 0x0000000FFFFF0000ULL +#define RIO_IODIR_MASK_DID 0xFFFFFF0000000000ULL + +/* opcode == IOFENCE.* */ +#define RIO_CMD_IOFENCE_C 0x003 + +#define RIO_IOFENCE_PR 0x0000000000000400ULL +#define RIO_IOFENCE_PW 0x0000000000000800ULL +#define RIO_IOFENCE_AV 0x0000000000001000ULL +#define RIO_IOFENCE_MASK_DATA 0xFFFFFFFF00000000ULL + +/* opcode == ATS */ +#define RIO_CMD_ATS_INVAL 0x004 +#define RIO_CMD_ATS_PRGR 0x084 + +/* Fault Queue element */ +typedef struct RivosIOMMUEvent { + uint64_t reason; + uint64_t _rsrvd; + uint64_t iova; + uint64_t phys; +} RivosIOMMUEvent; + +/* Event reason */ +#define RIO_EVENT_MASK_DID 0x0000000000FFFFFFULL +#define RIO_EVENT_MASK_PID 0x00000FFFFF000000ULL +#define RIO_EVENT_PV 0x0000100000000000ULL +#define RIO_EVENT_PRIV 0x0000200000000000ULL +#define RIO_EVENT_MASK_TTYP 0x000FC00000000000ULL +#define RIO_EVENT_MASK_CAUSE 0xFFF0000000000000ULL + +#define 
RIO_TTYP_NONE 0 /* Fault not caused by an inbound trx */ +#define RIO_TTYP_URX 1 /* Untranslated read for execute trx */ +#define RIO_TTYP_URD 2 /* Untranslated read transaction */ +#define RIO_TTYP_UWR 3 /* Untranslated write/AMO transaction */ +#define RIO_TTYP_TRX 4 /* Translated read for execute trx */ +#define RIO_TTYP_TRD 5 /* Translated read transaction */ +#define RIO_TTYP_TWR 6 /* Translated write/AMO transaction */ +#define RIO_TTYP_ATS 7 /* PCIe ATS Translation Request */ +#define RIO_TTYP_MRQ 8 /* Message Request */ + +#define RIO_ERRC_I_ALIGN 0 /* Instruction address misaligned */ +#define RIO_ERRC_I_FAULT 1 /* Instruction access fault */ +#define RIO_ERRC_RD_ALIGN 4 /* Read address misaligned */ +#define RIO_ERRC_RD_FAULT 5 /* Read access fault */ +#define RIO_ERRC_WR_ALIGN 6 /* Write/AMO address misaligned */ +#define RIO_ERRC_WR_FAULT 7 /* Write/AMO access fault */ +#define RIO_ERRC_PGFAULT_I 12 /* Instruction page fault */ +#define RIO_ERRC_PGFAULT_RD 13 /* Read page fault */ +#define RIO_ERRC_PGFAULT_WR 15 /* Write/AMO page fault */ +#define RIO_ERRC_GPGFAULT_I 20 /* Instruction guest page fault */ +#define RIO_ERRC_GPGFAULT_RD 21 /* Read guest-page fault */ +#define RIO_ERRC_GPGFAULT_WR 23 /* Write/AMO guest-page fault */ +#define RIO_ERRC_DMA_DISABLED 256 /* Inbound transactions disallowed */ +#define RIO_ERRC_DDT_FAULT 257 /* DDT entry load access fault */ +#define RIO_ERRC_DDT_INVALID 258 /* DDT entry not valid */ +#define RIO_ERRC_DDT_UNSUPPORTED 259 /* DDT entry misconfigured */ +#define RIO_ERRC_REQ_INVALID 260 /* Transaction type disallowed */ +#define RIO_ERRC_PDT_FAULT 261 /* PDT entry load access fault. */ +#define RIO_ERRC_PDT_INVALID 262 /* PDT entry not valid */ +#define RIO_ERRC_PDT_UNSUPPORTED 263 /* PDT entry misconfigured */ +#define RIO_ERRC_MSI_FAULT 264 /* MSI PTE load access fault */ +#define RIO_ERRC_MSI_INVALID 265 /* MSI PTE not valid */ +#define RIO_ERRC_MRIF_FAULT 266 /* MRIF access fault */ + + +/* + * Rivos Inc. 
I/O Management Unit PCIe Device Emulation + */ + +#ifndef PCI_VENDOR_ID_RIVOS +#define PCI_VENDOR_ID_RIVOS 0x1efd +#endif + +#ifndef PCI_DEVICE_ID_RIVOS_IOMMU +#define PCI_DEVICE_ID_RIVOS_IOMMU 0x8001 +#endif + +/* Programming interface revision */ +#define RIO_CAP_REVISION 0x0002 + +#define RIO_REG_MMIO_SIZE 0x0300 + +#define RIO_ERR_NONE 0 +#define RIO_ERR_ANY 1 + +#define RIO_ERR(cause) \ + (RIO_ERR_ANY | (((cause) & 0x0fff) << 16)) + +#define RIO_ERR_IO(cause, ttyp) \ + (RIO_ERR_ANY | (((cause) & 0x0fff) << 16) | (((ttyp) & 0x3f) << 8)) + +#define RIO_ERR_CAUSE(err) (((err) >> 16) & 0xfff) +#define RIO_ERR_TTYP(err) (((err) >> 8) & 0x3f) + + +/* IO virtual address space wrapper for attached PCI devices */ +struct RivosIOMMUSpace { + IOMMUMemoryRegion mr; + AddressSpace as; + RivosIOMMUState *iommu; + RivosIOMMUDeviceContext dc; + bool dc_valid; + uint32_t devid; + QLIST_ENTRY(RivosIOMMUSpace) list; +}; + + +static uint32_t rivos_iommu_reg_mod(RivosIOMMUState *s, + unsigned idx, uint32_t set, uint32_t clr) +{ + uint32_t val; + qemu_mutex_lock(&s->core_lock); + val = ldl_le_p(&s->regs_rw[idx]); + stl_le_p(&s->regs_rw[idx], set | (val & ~clr)); + qemu_mutex_unlock(&s->core_lock); + return val; +} + +static unsigned rivos_iommu_irq_vector(RivosIOMMUState *s, int source) +{ + const uint32_t ivec = ldl_le_p(&s->regs_rw[RIO_REG_IVEC]); + return (ivec >> (source * 4)) & 0x0F; +} + +static void rivos_iommu_irq_use(RivosIOMMUState *s, int source) +{ + msix_vector_use(&(s->pci), rivos_iommu_irq_vector(s, source)); +} + +static void rivos_iommu_irq_unuse(RivosIOMMUState *s, int source) +{ + msix_vector_unuse(&(s->pci), rivos_iommu_irq_vector(s, source)); +} + +static void rivos_iommu_irq_assert(RivosIOMMUState *s, int source) +{ + uint32_t ipsr = rivos_iommu_reg_mod(s, RIO_REG_IPSR, (1 << source), 0); + + if (!(ipsr & (1 << source)) && msix_enabled(&(s->pci))) { + const unsigned vector = rivos_iommu_irq_vector(s, source); + msix_notify(&(s->pci), vector); + } +} + +static 
void rivos_iommu_fault_iova(RivosIOMMUSpace *as, int err, hwaddr iova, + hwaddr gpa) +{ + RivosIOMMUState *s = as->iommu; + RivosIOMMUEvent ev; + MemTxResult res; + MemTxAttrs ma = MEMTXATTRS_UNSPECIFIED; + uint32_t head = ldl_le_p(&s->regs_rw[RIO_REG_FQ_HEAD]) & s->fq_mask; + uint32_t next = (s->fq_tail + 1) & s->fq_mask; + uint32_t ctrl = ldl_le_p(&s->regs_rw[RIO_REG_FQ_CONTROL]); + uint32_t ctrl_err = 0; + + ev.reason = as->devid; + ev.reason = set_field(ev.reason, RIO_EVENT_MASK_CAUSE, RIO_ERR_CAUSE(err)); + ev.reason = set_field(ev.reason, RIO_EVENT_MASK_TTYP, RIO_ERR_TTYP(err)); + ev.iova = iova; + ev.phys = gpa; + + trace_rivos_iommu_flt(PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), + PCI_FUNC(as->devid), RIO_ERR_CAUSE(err), iova); + + if (!(ctrl & RIO_FQ_ACTIVE) || !!(ctrl & (RIO_FQ_FULL | RIO_FQ_FAULT))) { + return; + } + + if (head == next) { + ctrl_err = RIO_FQ_FULL; + } else { + dma_addr_t addr = s->fq_base + s->fq_tail * sizeof(RivosIOMMUEvent); + res = dma_memory_write(&address_space_memory, addr, &ev, sizeof(ev), + ma); + if (res != MEMTX_OK) { + ctrl_err = RIO_FQ_FAULT; + } else { + s->fq_tail = next; + } + } + + stl_le_p(&s->regs_rw[RIO_REG_FQ_TAIL], s->fq_tail); + + if (ctrl_err) { + rivos_iommu_reg_mod(s, RIO_REG_CQ_CONTROL, ctrl_err, 0); + } + + if (ctrl & RIO_FQ_IRQ_ENABLE) { + rivos_iommu_irq_assert(s, RIO_INT_FQ); + } +} + +static void rivos_iommu_fault(RivosIOMMUSpace *as, int cause) +{ + rivos_iommu_fault_iova(as, cause, 0, 0); +} + + +/* Risc-V IOMMU Page Table walker. + * + * Note: Code is based on get_physical_address() from target/riscv/cpu_helper.c + * Both implementation can be merged into single helper function in future. + * Keeping them separate for now, as error reporting and flow specifics are + * sufficiently different for separate implementation. + * + * Returns RIO_ERR_ with fault code. 
+ */ +static int rivos_iommu_fetch_pa(RivosIOMMUSpace *as, + hwaddr addr, hwaddr *physical, uint64_t gatp, uint64_t satp, + bool first_stage, IOMMUAccessFlags access) +{ + MemTxResult res; + MemTxAttrs ma = MEMTXATTRS_UNSPECIFIED; + hwaddr base; + int i, levels, ptidxbits, ptshift, ptesize, mode, widened; + uint64_t atp = first_stage ? satp : gatp; + + base = (hwaddr) get_field(atp, RIO_ATP_MASK_PPN) << PGSHIFT; + mode = get_field(atp, RIO_ATP_MASK_MODE); + + switch (mode) { + case RIO_ATP_MODE_SV32: + levels = 2; + ptidxbits = 10; + ptesize = 4; + break; + case RIO_ATP_MODE_SV39: + levels = 3; + ptidxbits = 9; + ptesize = 8; + break; + case RIO_ATP_MODE_SV48: + levels = 4; + ptidxbits = 9; + ptesize = 8; + break; + case RIO_ATP_MODE_SV57: + levels = 5; + ptidxbits = 9; + ptesize = 8; + break; + case RIO_ATP_MODE_BARE: + if (first_stage) { + return rivos_iommu_fetch_pa(as, addr, physical, + gatp, satp, false, access); + } + *physical = addr; + return RIO_ERR_NONE; + default: + return RIO_ERR(RIO_ERRC_DDT_UNSUPPORTED); + } + + widened = first_stage ? 
0 : 2; + ptshift = (levels - 1) * ptidxbits; + + /* zero extended address range check */ + int va_bits = PGSHIFT + levels * ptidxbits + widened; + uint64_t va_mask = (1ULL << va_bits) - 1; + if ((addr & va_mask) != addr) { + return RIO_ERR(RIO_ERRC_DMA_DISABLED); + } + + for (i = 0; i < levels; i++, ptshift -= ptidxbits) { + target_ulong pte; + hwaddr pte_addr; + target_ulong idx; + + idx = (addr >> (PGSHIFT + ptshift)) & ((1 << (ptidxbits + widened))-1); + pte_addr = base + idx * ptesize; + widened = 0; + + if (ptesize == 4) { + pte = address_space_ldl(&address_space_memory, pte_addr, ma, &res); + } else { + pte = address_space_ldq(&address_space_memory, pte_addr, ma, &res); + } + + if (res != MEMTX_OK) { + return RIO_ERR(RIO_ERRC_PDT_FAULT); + } + + hwaddr ppn = pte >> PTE_PPN_SHIFT; + + if (!(pte & PTE_V)) { + /* Invalid PTE */ + return RIO_ERR(RIO_ERRC_PDT_INVALID); + } else if (!(pte & (PTE_R | PTE_W | PTE_X))) { + /* Inner PTE, continue walking */ + base = ppn << PGSHIFT; + } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) { + /* Reserved leaf PTE flags: PTE_W */ + return RIO_ERR(RIO_ERRC_PDT_INVALID); + } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) { + /* Reserved leaf PTE flags: PTE_W + PTE_X */ + return RIO_ERR(RIO_ERRC_PDT_INVALID); + } else if (ppn & ((1ULL << ptshift) - 1)) { + /* Misaligned PPN */ + return RIO_ERR(RIO_ERRC_PDT_INVALID); + } else if ((access & IOMMU_RO) && !(pte & PTE_R)) { + /* Read access check failed */ + return first_stage ? RIO_ERR(RIO_ERRC_GPGFAULT_RD) + : RIO_ERR(RIO_ERRC_PGFAULT_RD); + } else if ((access & IOMMU_WO) && !(pte & PTE_W)) { + /* Write access check failed */ + return first_stage ? RIO_ERR(RIO_ERRC_GPGFAULT_WR) + : RIO_ERR(RIO_ERRC_PGFAULT_WR); + } else { + /* Leaf PTE, update base to translated address. */ + target_ulong vpn = addr >> PGSHIFT; + base = ((ppn | (vpn & ((1L << ptshift) - 1))) << PGSHIFT) | + (addr & ~TARGET_PAGE_MASK); + } + + /* Do the second stage translation if enabled. 
*/ + if (first_stage) { + hwaddr spa; + + int ret = rivos_iommu_fetch_pa(as, base, &spa, + gatp, satp, false, access); + + /* Report back GPA causing second stage translation fault. */ + if (ret) { + *physical = base; + return ret; + } + + base = spa; + } + + if (pte & (PTE_R | PTE_W | PTE_X)) { + /* Leaf PTE, return translated address */ + *physical = base; + return RIO_ERR_NONE; + } + } + return RIO_ERR(RIO_ERRC_PDT_INVALID); +} + +/* Risc-V IOMMU Device Directory Tree walker. + * + * Returns RIO_ERR_ with fault code. + */ +static int rivos_iommu_fetch_dc(RivosIOMMUState *iommu, uint32_t devid, + RivosIOMMUDeviceContext *dc) +{ + MemTxResult res; + MemTxAttrs ma = MEMTXATTRS_UNSPECIFIED; + hwaddr addr; + const bool dcbase = !iommu->enable_msi; + const size_t dcsize = sizeof(*dc) >> dcbase; + unsigned int depth = RIO_DDTP_MODE_1LVL - iommu->ddt_mode; + + if (depth > 2) { + return RIO_ERR(RIO_ERRC_DDT_UNSUPPORTED); + } + + /* Check supported device id range. */ + if (devid >= (1 << (depth * 9 + 6 + (dcbase && depth != 2)))) { + return RIO_ERR(RIO_ERRC_DDT_INVALID); + } + + for (addr = iommu->ddt_base; depth-- > 0; ) { + const int split = depth * 9 + 6 + dcbase; + addr |= ((devid >> split) << 3) & ~TARGET_PAGE_MASK; + uint64_t dde = address_space_ldq(&address_space_memory, addr, ma, &res); + if (res != MEMTX_OK) { + return RIO_ERR(RIO_ERRC_DDT_FAULT); + } + if (!(dde & RIO_DCTC_VALID)) { + return RIO_ERR(RIO_ERRC_DDT_INVALID); + } + addr = dde & RIO_DDTE_MASK_PPN; + } + + /* index into device context entry page */ + addr |= (devid * dcsize) & ~TARGET_PAGE_MASK; + + memset(dc, 0, sizeof(*dc)); + res = dma_memory_read(&address_space_memory, addr, dc, dcsize, ma); + + if (res != MEMTX_OK) { + return RIO_ERR(RIO_ERRC_DDT_FAULT); + } + + if (!(dc->tc & RIO_DCTC_VALID)) { + return RIO_ERR(RIO_ERRC_DDT_INVALID); + } + + return RIO_ERR_NONE; +} + +static void rivos_iommu_translate_tlb(RivosIOMMUSpace *as, + IOMMUAccessFlags flag, IOMMUTLBEntry *tlb) +{ + RivosIOMMUState 
*iommu = as->iommu; + + if (!as->dc_valid) { + /* Fetch device context if not cached. */ + int ret = rivos_iommu_fetch_dc(iommu, as->devid, &as->dc); + if (ret != RIO_ERR_NONE) { + rivos_iommu_fault(as, ret); + return; + } else { + as->dc_valid = true; + } + } + + /* MSI window */ + if (!(((tlb->iova >> PGSHIFT) ^ as->dc.msi_addr_pattern) & + ~as->dc.msi_addr_mask)) { + if (flag != IOMMU_WO) { + /* only writes are allowed. */ + rivos_iommu_fault_iova(as, RIO_ERR(RIO_ERRC_MRIF_FAULT), + tlb->iova, 0); + return; + } + if (tlb->iova & ~TARGET_PAGE_MASK) { + /* unaligned access. */ + rivos_iommu_fault_iova(as, RIO_ERR(RIO_ERRC_MRIF_FAULT), + tlb->iova, 0); + return; + } + if (!(as->dc.msiptp & RIO_DCMSI_VALID)) { + /* MSI remapping not enabled */ + rivos_iommu_fault(as, RIO_ERR(RIO_ERRC_DDT_INVALID)); + return; + } + int mode = get_field(as->dc.msiptp, RIO_DCMSI_MASK_MODE); + switch (mode) { + case RIO_DCMSI_MODE_BARE: + tlb->translated_addr = tlb->iova; + tlb->addr_mask = ((1ULL << PGSHIFT) - 1); + tlb->perm = flag; + break; + + case RIO_DCMSI_MODE_FLAT: + /* TODO: not implemented, follow AIA section 9.5 */ + rivos_iommu_fault(as, RIO_ERR(RIO_ERRC_DDT_UNSUPPORTED)); + return; + + default: + rivos_iommu_fault(as, RIO_ERR(RIO_ERRC_DDT_UNSUPPORTED)); + return; + } + + return; + } + + /* Lookup SATP */ + if (as->dc.tc & RIO_DCTC_PDTV) { + /* Process directory tree is not supported yet. */ + rivos_iommu_fault(as, RIO_ERR(RIO_ERRC_PDT_UNSUPPORTED)); + return; + } + + /* Lookup IOATC */ + /* TODO: merge in IOATC PoC */ + + /* Memory access */ + hwaddr physical; + int err = rivos_iommu_fetch_pa(as, tlb->iova, &physical, + as->dc.gatp, as->dc.satp, + iommu->enable_stage_one, flag); + if (err == RIO_ERR_NONE) { + tlb->translated_addr = physical; + tlb->addr_mask = ((1ULL << PGSHIFT) - 1); + tlb->perm = flag; + } else if (!(as->dc.tc & RIO_DCTC_DIS_TRANS_FAULT)) { + const int fault = RIO_ERR_IO(RIO_ERR_CAUSE(err), + flag == IOMMU_WO ? 
RIO_TTYP_UWR : RIO_TTYP_URD); + rivos_iommu_fault_iova(as, fault, tlb->iova, physical); + } + + return; +} + +static const char *IOMMU_FLAG_STR[] = { + "NA", + "RO", + "WR", + "RW", +}; + +/* Called from RCU critical section */ +static IOMMUTLBEntry rivos_iommu_translate(IOMMUMemoryRegion *iommu_mr, + hwaddr addr, IOMMUAccessFlags flag, int iommu_idx) +{ + RivosIOMMUSpace *as = container_of(iommu_mr, RivosIOMMUSpace, mr); + const uint32_t ddt_mode = as->iommu->ddt_mode; + IOMMUTLBEntry tlb = { + .iova = addr, + .target_as = &address_space_memory, + .perm = IOMMU_NONE, + }; + + switch (ddt_mode) { + case RIO_DDTP_MODE_OFF: + /* All translations disabled, power-on state. */ + rivos_iommu_fault_iova(as, RIO_ERR(RIO_ERRC_DMA_DISABLED), + tlb.iova, 0); + break; + + case RIO_DDTP_MODE_BARE: + /* Global passthrough mode enabled for all devices. */ + tlb.translated_addr = tlb.iova; + tlb.addr_mask = ~0ULL; + tlb.perm = flag; + break; + + case RIO_DDTP_MODE_3LVL: + case RIO_DDTP_MODE_2LVL: + case RIO_DDTP_MODE_1LVL: + /* Translate using device directory information. */ + rivos_iommu_translate_tlb(as, flag, &tlb); + break; + + default: + /* Invalid device directory tree mode, should never happen. 
*/ + rivos_iommu_fault(as, RIO_ERR(RIO_ERRC_DDT_UNSUPPORTED)); + break; + } + + trace_rivos_iommu_dma(PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), + PCI_FUNC(as->devid), IOMMU_FLAG_STR[tlb.perm & IOMMU_RW], + tlb.iova, tlb.translated_addr); + + return tlb; +} + +static void rivos_iommu_iodir_inval_ddt(RivosIOMMUState *s, bool all, + uint32_t devid) +{ + RivosIOMMUSpace *as; + + qemu_mutex_lock(&s->core_lock); + QLIST_FOREACH(as, &s->spaces, list) { + if (all || (as->devid == devid)) { + as->dc_valid = false; + } + } + qemu_mutex_unlock(&s->core_lock); +} + +static void rivos_iommu_iofence(RivosIOMMUState *s, bool av, uint64_t addr, + uint32_t data) +{ + MemTxResult res; + MemTxAttrs ma = MEMTXATTRS_UNSPECIFIED; + + if (av) { + res = dma_memory_write(&address_space_memory, addr, &data, sizeof(data), + ma); + if (res != MEMTX_OK) { + rivos_iommu_reg_mod(s, RIO_REG_CQ_CONTROL, RIO_CQ_FAULT, 0); + } + } +} + +static int rivos_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu_mr, + IOMMUNotifierFlag old, IOMMUNotifierFlag new, Error **errp) +{ + if (new & IOMMU_NOTIFIER_DEVIOTLB_UNMAP) { + error_setg(errp, "rivos-iommu does not support dev-iotlb"); + return -EINVAL; + } + + return 0; +} + +static void rivos_iommu_process_cq_tail(RivosIOMMUState *s) +{ + RivosIOMMUCommand cmd; + MemTxResult res; + dma_addr_t addr; + MemTxAttrs ma = MEMTXATTRS_UNSPECIFIED; + uint32_t tail; + uint32_t ctrl = ldl_le_p(&s->regs_rw[RIO_REG_CQ_CONTROL]); + uint32_t bdf = pci_get_bdf(&s->pci); + uint32_t err = 0; + + /* Fetch latest tail position and clear busy marker */ + s->cq_tail_db = false; + tail = s->cq_mask & ldl_le_p(&s->regs_rw[RIO_REG_CQ_TAIL]); + + /* Check for pending error or queue processing disabled */ + if (!(ctrl & RIO_CQ_ACTIVE) || !!(ctrl & (RIO_CQ_ERROR | RIO_CQ_FAULT))) + { + return; + } + + while (tail != s->cq_head) { + addr = s->cq_base + s->cq_head * sizeof(cmd); + res = dma_memory_read(&address_space_memory, addr, &cmd, sizeof(cmd), + ma); + + if (res != MEMTX_OK) { 
+ err = RIO_CQ_FAULT; + break; + } + + trace_rivos_iommu_cmd(PCI_BUS_NUM(bdf), PCI_SLOT(bdf), + PCI_FUNC(bdf), cmd.request, cmd.address); + + int fun_op = get_field(cmd.request, RIO_CMD_MASK_FUN_OP); + + switch(fun_op) { + case RIO_CMD_IOFENCE_C: + rivos_iommu_iofence(s, !!(cmd.request & RIO_IOFENCE_AV), + cmd.address, + get_field(cmd.request, RIO_IOFENCE_MASK_DATA)); + break; + + case RIO_CMD_IOTINVAL_GVMA: + /* IOTLB not implemented */ + break; + + case RIO_CMD_IOTINVAL_MSI: + /* IOTLB not implemented */ + break; + + case RIO_CMD_IOTINVAL_VMA: + /* IOTLB not implemented */ + break; + + case RIO_CMD_IODIR_INV_DDT: + rivos_iommu_iodir_inval_ddt(s, + !(cmd.request & RIO_IODIR_DID_VALID), + get_field(cmd.request, RIO_IODIR_MASK_DID)); + break; + + case RIO_CMD_IODIR_INV_PDT: + /* PDT invalidate not implemented. */ + break; + + case RIO_CMD_IODIR_PRE_DDT: + /* DDT pre-fetching not implemented. */ + break; + + case RIO_CMD_IODIR_PRE_PDT: + /* PDT pre-fetching not implemented. */ + break; + + default: + err = RIO_CQ_ERROR; + break; + } + + /* Invalid instruction, keep cq_head at failed instruction index. 
*/ + if (err) { + break; + } + + s->cq_head = (s->cq_head + 1) & s->cq_mask; + } + + stl_le_p(&s->regs_rw[RIO_REG_CQ_HEAD], s->cq_head); + + if (err) { + rivos_iommu_reg_mod(s, RIO_REG_CQ_CONTROL, err, 0); + } + + if (ctrl & RIO_CQ_IRQ_ENABLE) { + rivos_iommu_irq_assert(s, RIO_INT_CQ); + } +} + +static void rivos_iommu_process_ddtp(RivosIOMMUState *s) +{ + uint64_t base = ldq_le_p(&s->regs_rw[RIO_REG_DDTP]) & ~RIO_DDTP_BUSY; + uint32_t mode = get_field(base, RIO_DDTP_MASK_MODE); + bool ok; + + /* Allowed DDTP.MODE transitions: + * {OFF, BARE} -> {OFF, BARE, 1LVL, 2LVL, 3LVL} + * {1LVL, 2LVL, 3LVL} -> {OFF, BARE} + */ + + if (s->ddt_mode == mode) { + ok = true; + } else if (s->ddt_mode == RIO_DDTP_MODE_OFF || + s->ddt_mode == RIO_DDTP_MODE_BARE) { + ok = mode == RIO_DDTP_MODE_1LVL || + mode == RIO_DDTP_MODE_2LVL || + mode == RIO_DDTP_MODE_3LVL; + } else { + ok = mode == RIO_DDTP_MODE_OFF || + mode == RIO_DDTP_MODE_BARE; + } + + if (ok) { + s->ddt_base = get_field(base, RIO_DDTP_MASK_PPN) << PGSHIFT; + s->ddt_mode = mode; + } else { + /* report back last valid mode and device directory table pointer. 
*/
+        base = s->ddt_base >> PGSHIFT;
+        base = set_field(base, RIO_DDTP_MASK_MODE, s->ddt_mode);
+    }
+
+    stq_le_p(&s->regs_rw[RIO_REG_DDTP], base);
+}
+
+static void rivos_iommu_process_cq_control(RivosIOMMUState *s)
+{
+    uint64_t base;
+    uint32_t ctrl_set = ldl_le_p(&s->regs_rw[RIO_REG_CQ_CONTROL]);
+    uint32_t ctrl_clr;
+    /* Use CQ_* flags for the CQ register; FQ_* happened to share values
+     * but mixing them is a latent bug if the layouts ever diverge. */
+    bool enable = !!(ctrl_set & RIO_CQ_ENABLE);
+    bool active = !!(ctrl_set & RIO_CQ_ACTIVE);
+
+    if (enable && !active) {
+        base = ldq_le_p(&s->regs_rw[RIO_REG_CQ_BASE]);
+        s->cq_mask = (2ULL << get_field(base, RIO_CQ_MASK_LOG2SZ)) - 1;
+        s->cq_base = get_field(base, RIO_CQ_MASK_PPN) << PGSHIFT;
+        s->cq_head = 0;
+        rivos_iommu_irq_use(s, RIO_INT_CQ);
+        stl_le_p(&s->regs_ro[RIO_REG_CQ_TAIL], ~s->cq_mask);
+        stl_le_p(&s->regs_rw[RIO_REG_CQ_HEAD], s->cq_head);
+        stl_le_p(&s->regs_rw[RIO_REG_CQ_TAIL], s->cq_head);
+        ctrl_set = RIO_CQ_ACTIVE;
+        ctrl_clr = RIO_CQ_BUSY | RIO_CQ_FAULT | RIO_CQ_ERROR | RIO_CQ_TIMEOUT;
+    } else if (!enable && active) {
+        rivos_iommu_irq_unuse(s, RIO_INT_CQ);
+        stl_le_p(&s->regs_ro[RIO_REG_CQ_TAIL], ~0);
+        ctrl_set = 0;
+        ctrl_clr = RIO_CQ_BUSY | RIO_CQ_ACTIVE;
+    } else {
+        ctrl_set = 0;
+        ctrl_clr = RIO_CQ_BUSY;
+    }
+
+    rivos_iommu_reg_mod(s, RIO_REG_CQ_CONTROL, ctrl_set, ctrl_clr);
+}
+
+static void rivos_iommu_process_fq_control(RivosIOMMUState *s)
+{
+    uint64_t base;
+    uint32_t ctrl_set = ldl_le_p(&s->regs_rw[RIO_REG_FQ_CONTROL]);
+    uint32_t ctrl_clr;
+    bool enable = !!(ctrl_set & RIO_FQ_ENABLE);
+    bool active = !!(ctrl_set & RIO_FQ_ACTIVE);
+
+    if (enable && !active) {
+        base = ldq_le_p(&s->regs_rw[RIO_REG_FQ_BASE]);
+        s->fq_mask = (2ULL << get_field(base, RIO_FQ_MASK_LOG2SZ)) - 1;
+        s->fq_base = get_field(base, RIO_FQ_MASK_PPN) << PGSHIFT;
+        s->fq_tail = 0;
+        rivos_iommu_irq_use(s, RIO_INT_FQ);
+        stl_le_p(&s->regs_rw[RIO_REG_FQ_HEAD], s->fq_tail);
+        stl_le_p(&s->regs_rw[RIO_REG_FQ_TAIL], s->fq_tail);
+        stl_le_p(&s->regs_ro[RIO_REG_FQ_HEAD], ~s->fq_mask);
+        ctrl_set = RIO_FQ_ACTIVE;
+        ctrl_clr = RIO_FQ_BUSY | RIO_FQ_FAULT |
RIO_FQ_FULL; + } else if (!enable && active) { + rivos_iommu_irq_unuse(s, RIO_INT_FQ); + stl_le_p(&s->regs_ro[RIO_REG_FQ_HEAD], ~0); + ctrl_set = 0; + ctrl_clr = RIO_FQ_BUSY | RIO_FQ_ACTIVE; + } else { + ctrl_set = 0; + ctrl_clr = RIO_FQ_BUSY; + } + + rivos_iommu_reg_mod(s, RIO_REG_FQ_CONTROL, ctrl_set, ctrl_clr); +} + +static void rivos_iommu_process_pq_control(RivosIOMMUState *s) +{ + uint64_t base; + uint32_t ctrl_set = ldl_le_p(&s->regs_rw[RIO_REG_PQ_CONTROL]); + uint32_t ctrl_clr; + bool enable = !!(ctrl_set & RIO_PQ_ENABLE); + bool active = !!(ctrl_set & RIO_PQ_ACTIVE); + + if (enable && !active) { + base = ldq_le_p(&s->regs_rw[RIO_REG_PQ_BASE]); + s->pq_mask = (2ULL << get_field(base, RIO_PQ_MASK_LOG2SZ)) - 1; + s->pq_base = get_field(base, RIO_PQ_MASK_PPN) << PGSHIFT; + s->pq_tail = 0; + rivos_iommu_irq_use(s, RIO_INT_PQ); + stl_le_p(&s->regs_rw[RIO_REG_PQ_HEAD], s->pq_tail); + stl_le_p(&s->regs_rw[RIO_REG_PQ_TAIL], s->pq_tail); + stl_le_p(&s->regs_ro[RIO_REG_PQ_HEAD], ~s->pq_mask); + ctrl_set = RIO_PQ_ACTIVE; + ctrl_clr = RIO_PQ_BUSY | RIO_PQ_FAULT | RIO_PQ_FULL; + } else if (!enable && active) { + rivos_iommu_irq_unuse(s, RIO_INT_PQ); + stl_le_p(&s->regs_ro[RIO_REG_PQ_HEAD], ~0); + ctrl_set = 0; + ctrl_clr = RIO_PQ_BUSY | RIO_PQ_ACTIVE; + } else { + ctrl_set = 0; + ctrl_clr = RIO_PQ_BUSY; + } + + rivos_iommu_reg_mod(s, RIO_REG_PQ_CONTROL, ctrl_set, ctrl_clr); +} + +static void *rivos_iommu_core_proc(void* arg) +{ + RivosIOMMUState *s = arg; + + qemu_mutex_lock(&s->core_lock); + while (!s->core_stop) { + if (s->cq_tail_db) { + qemu_mutex_unlock(&s->core_lock); + rivos_iommu_process_cq_tail(s); + } else if (ldl_le_p(&s->regs_rw[RIO_REG_CQ_CONTROL]) & RIO_CQ_BUSY) { + qemu_mutex_unlock(&s->core_lock); + rivos_iommu_process_cq_control(s); + } else if (ldl_le_p(&s->regs_rw[RIO_REG_FQ_CONTROL]) & RIO_FQ_BUSY) { + qemu_mutex_unlock(&s->core_lock); + rivos_iommu_process_fq_control(s); + } else if (ldl_le_p(&s->regs_rw[RIO_REG_PQ_CONTROL]) & RIO_PQ_BUSY) { + 
qemu_mutex_unlock(&s->core_lock); + rivos_iommu_process_pq_control(s); + } else if (ldq_le_p(&s->regs_rw[RIO_REG_DDTP]) & RIO_DDTP_BUSY) { + qemu_mutex_unlock(&s->core_lock); + rivos_iommu_process_ddtp(s); + } else { + qemu_cond_wait(&s->core_cond, &s->core_lock); + continue; + } + qemu_mutex_lock(&s->core_lock); + } + qemu_mutex_unlock(&s->core_lock); + + return NULL; +} + +static void rivos_iommu_mmio_write(void *opaque, hwaddr addr, uint64_t val, + unsigned size) +{ + RivosIOMMUState *s = opaque; + uint64_t busy = 0; + bool wakeup = true; + + if (addr + size > sizeof(s->regs_rw)) { + /* unsupported MMIO access location */ + return; + } + + /* actionable MMIO write. */ + switch (addr) { + case RIO_REG_DDTP: + busy = RIO_DDTP_BUSY; + break; + + /* upper half DDTP update */ + case RIO_REG_DDTP + 4: + busy = RIO_DDTP_BUSY >> 32; + break; + + case RIO_REG_CQ_TAIL: + s->cq_tail_db = true; + break; + + case RIO_REG_CQ_CONTROL: + busy = RIO_CQ_BUSY; + break; + + case RIO_REG_FQ_CONTROL: + busy = RIO_FQ_BUSY; + break; + + case RIO_REG_PQ_CONTROL: + busy = RIO_PQ_BUSY; + break; + + default: + wakeup = false; + break; + } + + qemu_mutex_lock(&s->core_lock); + if (size == 1) { + uint8_t ro = s->regs_ro[addr]; + uint8_t wc = s->regs_wc[addr]; + uint8_t rw = s->regs_rw[addr]; + s->regs_rw[addr] = ((rw & ro) | (val & ~ro)) & ~(val & wc); + } else if (size == 2) { + uint16_t ro = lduw_le_p(&s->regs_ro[addr]); + uint16_t wc = lduw_le_p(&s->regs_wc[addr]); + uint16_t rw = lduw_le_p(&s->regs_rw[addr]); + stw_le_p(&s->regs_rw[addr], ((rw & ro) | (val & ~ro)) & ~(val & wc)); + } else if (size == 4) { + uint32_t ro = ldl_le_p(&s->regs_ro[addr]); + uint32_t wc = ldl_le_p(&s->regs_wc[addr]); + uint32_t rw = ldl_le_p(&s->regs_rw[addr]) | busy; + stl_le_p(&s->regs_rw[addr], ((rw & ro) | (val & ~ro)) & ~(val & wc)); + } else if (size == 8) { + uint64_t ro = ldq_le_p(&s->regs_ro[addr]); + uint64_t wc = ldq_le_p(&s->regs_wc[addr]); + uint64_t rw = ldq_le_p(&s->regs_rw[addr]) | busy; + 
stq_le_p(&s->regs_rw[addr], ((rw & ro) | (val & ~ro)) & ~(val & wc)); + } + + /* wakeup core processing thread */ + if (wakeup) { + qemu_cond_signal(&s->core_cond); + } + qemu_mutex_unlock(&s->core_lock); +} + +static uint64_t rivos_iommu_mmio_read(void *opaque, hwaddr addr, unsigned size) +{ + RivosIOMMUState *s = opaque; + uint64_t val = -1; + + if (addr + size > sizeof(s->regs_rw)) { + return (uint64_t)-1; + } else if (size == 1) { + val = (uint64_t) s->regs_rw[addr]; + } else if (size == 2) { + val = lduw_le_p(&s->regs_rw[addr]); + } else if (size == 4) { + val = ldl_le_p(&s->regs_rw[addr]); + } else if (size == 8) { + val = ldq_le_p(&s->regs_rw[addr]); + } + + return val; +} + +static const MemoryRegionOps rivos_iommu_mmio_ops = { + .read = rivos_iommu_mmio_read, + .write = rivos_iommu_mmio_write, + .endianness = DEVICE_LITTLE_ENDIAN, + .impl = { + .min_access_size = 1, + .max_access_size = 8, + .unaligned = false, + }, + .valid = { + .min_access_size = 1, + .max_access_size = 8, + } +}; + +static AddressSpace *rivos_iommu_dma_as(PCIBus *bus, void *opaque, int devfn) +{ + RivosIOMMUState *s = opaque; + RivosIOMMUSpace *as; + char name[64]; + uint32_t devid = PCI_BUILD_BDF(pci_bus_num(bus), devfn); + uint32_t iommu_devid = pci_get_bdf(&s->pci); + + if (iommu_devid == devid) { + /* No translation for IOMMU device itself. 
*/ + return &address_space_memory; + } + + qemu_mutex_lock(&s->core_lock); + QLIST_FOREACH(as, &s->spaces, list) { + if (as->devid == devid) + break; + } + qemu_mutex_unlock(&s->core_lock); + + if (as == NULL) { + as = g_malloc0(sizeof(RivosIOMMUSpace)); + + as->iommu = s; + as->devid = devid; + + snprintf(name, sizeof(name), "rivos-iommu-%04x:%02x.%d-iova", + PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), PCI_FUNC(as->devid)); + + memory_region_init_iommu(&as->mr, sizeof(as->mr), + TYPE_RIVOS_IOMMU_MEMORY_REGION, + OBJECT(as), name, UINT64_MAX); + + address_space_init(&as->as, MEMORY_REGION(&as->mr), + TYPE_RIVOS_IOMMU_PCI); + + qemu_mutex_lock(&s->core_lock); + QLIST_INSERT_HEAD(&s->spaces, as, list); + qemu_mutex_unlock(&s->core_lock); + + trace_rivos_iommu_new(PCI_BUS_NUM(iommu_devid), PCI_SLOT(iommu_devid), + PCI_FUNC(iommu_devid), PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), + PCI_FUNC(as->devid)); + } + + return &as->as; +} + +static void rivos_iommu_reg_reset(RivosIOMMUState *s) +{ + const uint64_t cap = (s->version & RIO_CAP_REVISION_MASK) | + (s->enable_stage_one * RIO_CAP_STAGE_ONE) | + (s->enable_stage_two * RIO_CAP_STAGE_TWO) | + (s->enable_msi * RIO_CAP_MSI); + + /* Mark all registers read-only */ + memset(s->regs_ro, 0xff, sizeof(s->regs_ro)); + + /* Set power-on register state */ + stq_le_p(&s->regs_rw[RIO_REG_CAP], cap); + stq_le_p(&s->regs_ro[RIO_REG_DDTP], + ~(RIO_DDTP_MASK_PPN | RIO_DDTP_MASK_MODE)); + stq_le_p(&s->regs_ro[RIO_REG_CQ_BASE], + ~(RIO_CQ_MASK_LOG2SZ | RIO_CQ_MASK_PPN)); + stq_le_p(&s->regs_ro[RIO_REG_FQ_BASE], + ~(RIO_FQ_MASK_LOG2SZ | RIO_FQ_MASK_PPN)); + stq_le_p(&s->regs_ro[RIO_REG_PQ_BASE], + ~(RIO_PQ_MASK_LOG2SZ | RIO_PQ_MASK_PPN)); + stl_le_p(&s->regs_wc[RIO_REG_CQ_CONTROL], + RIO_CQ_FAULT | RIO_CQ_TIMEOUT | RIO_CQ_ERROR); + stl_le_p(&s->regs_ro[RIO_REG_CQ_CONTROL], RIO_CQ_ACTIVE | RIO_CQ_BUSY); + stl_le_p(&s->regs_wc[RIO_REG_FQ_CONTROL], RIO_FQ_FAULT | RIO_FQ_FULL); + stl_le_p(&s->regs_ro[RIO_REG_FQ_CONTROL], RIO_FQ_ACTIVE | 
RIO_FQ_BUSY); + stl_le_p(&s->regs_wc[RIO_REG_PQ_CONTROL], RIO_PQ_FAULT | RIO_PQ_FULL); + stl_le_p(&s->regs_ro[RIO_REG_PQ_CONTROL], RIO_PQ_ACTIVE | RIO_PQ_BUSY); + stl_le_p(&s->regs_wc[RIO_REG_IPSR], ~0); +} + +static void rivos_iommu_realize(PCIDevice *dev, Error **errp) +{ + DeviceState *d = DEVICE(dev); + RivosIOMMUState *s = RIVOS_IOMMU_PCI(d); + const uint64_t bar_size = + pow2ceil(QEMU_ALIGN_UP(sizeof(s->regs_rw), TARGET_PAGE_SIZE)); + Error *err = NULL; + + QLIST_INIT(&s->spaces); + qemu_cond_init(&s->core_cond); + qemu_mutex_init(&s->core_lock); + rivos_iommu_reg_reset(s); + + qemu_thread_create(&s->core_proc, "rivos-iommu-core", + rivos_iommu_core_proc, s, QEMU_THREAD_JOINABLE); + + memory_region_init(&s->bar0, OBJECT(s), + "rivos-iommu-bar0", bar_size); + memory_region_init_io(&s->mmio, OBJECT(s), &rivos_iommu_mmio_ops, s, + "rivos-iommu", sizeof(s->regs_rw)); + memory_region_add_subregion(&s->bar0, 0, &s->mmio); + + pcie_endpoint_cap_init(dev, 0x80); + + pci_register_bar(dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY | + PCI_BASE_ADDRESS_MEM_TYPE_64, &s->bar0); + + int ret = msix_init(dev, RIO_INT_COUNT, + &s->bar0, 0, RIO_REG_MSI_ADDR_BASE, + &s->bar0, 0, RIO_REG_MSI_PBA_BASE, 0, &err); + + if (ret == -ENOTSUP) { + /* MSI-x is not supported by the platform. + * Driver should use timer/polling based notification handlers. + */ + warn_report_err(err); + } else if (ret < 0) { + error_propagate(errp, err); + return; + } + + /* TODO: find root port bus ranges and use for FDT/ACPI generation. 
*/ + PCIBus *bus = pci_device_root_bus(dev); + if (!bus) { + error_setg(errp, "can't find PCIe root port for %02x:%02x.%x", + pci_bus_num(pci_get_bus(dev)), PCI_SLOT(dev->devfn), + PCI_FUNC(dev->devfn)); + return; + } + + pci_setup_iommu(bus, rivos_iommu_dma_as, s); +} + +static void rivos_iommu_exit(PCIDevice *dev) +{ + DeviceState *d = DEVICE(dev); + RivosIOMMUState *s = RIVOS_IOMMU_PCI(d); + + qemu_mutex_lock(&s->core_lock); + s->core_stop = true; + qemu_cond_signal(&s->core_cond); + qemu_mutex_unlock(&s->core_lock); + qemu_thread_join(&s->core_proc); + qemu_cond_destroy(&s->core_cond); + qemu_mutex_destroy(&s->core_lock); +} + +static const VMStateDescription rivos_iommu_vmstate = { + .name = "rivos-iommu", + .unmigratable = 1 +}; + +static Property rivos_iommu_properties[] = { + DEFINE_PROP_UINT32("version", RivosIOMMUState, version, RIO_CAP_REVISION), + DEFINE_PROP_BOOL("msi", RivosIOMMUState, enable_msi, true), + DEFINE_PROP_BOOL("stage-one", RivosIOMMUState, enable_stage_one, true), + DEFINE_PROP_BOOL("stage-two", RivosIOMMUState, enable_stage_two, true), + DEFINE_PROP_END_OF_LIST(), +}; + +static void rivos_iommu_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); + + device_class_set_props(dc, rivos_iommu_properties); + k->realize = rivos_iommu_realize; + k->exit = rivos_iommu_exit; + k->vendor_id = PCI_VENDOR_ID_RIVOS; + k->device_id = PCI_DEVICE_ID_RIVOS_IOMMU; + k->revision = 0; + k->class_id = PCI_CLASS_SYSTEM_IOMMU; + dc->desc = "RIVOS-IOMMU (RIO) DMA Remapping device"; + dc->vmsd = &rivos_iommu_vmstate; + dc->hotpluggable = false; + dc->user_creatable = true; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); +} + +static const TypeInfo rivos_iommu_pci = { + .name = TYPE_RIVOS_IOMMU_PCI, + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(RivosIOMMUState), + .class_init = rivos_iommu_class_init, + .interfaces = (InterfaceInfo[]) { + { INTERFACE_PCIE_DEVICE }, + { }, + 
}, +}; + +static void rivos_iommu_memory_region_class_init(ObjectClass *klass, void *data) +{ + IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass); + + imrc->translate = rivos_iommu_translate; + imrc->notify_flag_changed = rivos_iommu_notify_flag_changed; +} + +static const TypeInfo rivos_iommu_memory_region_info = { + .parent = TYPE_IOMMU_MEMORY_REGION, + .name = TYPE_RIVOS_IOMMU_MEMORY_REGION, + .class_init = rivos_iommu_memory_region_class_init, +}; + +static void rivos_iommu_register_types(void) +{ + type_register_static(&rivos_iommu_pci); + type_register_static(&rivos_iommu_memory_region_info); +} + +type_init(rivos_iommu_register_types); diff --git a/hw/riscv/trace-events b/hw/riscv/trace-events new file mode 100644 index 0000000000..c3618764ed --- /dev/null +++ b/hw/riscv/trace-events @@ -0,0 +1,7 @@ +# See documentation at docs/devel/tracing.rst + +# rivos-iommu.c +rivos_iommu_new(int bus, int slot, int func, int dbus, int dslot, int dfunc) "NEW %04x:%02x.%d attached %04x:%02x.%d" +rivos_iommu_flt(int bus, int slot, int func, int cause, uint64_t iova) "FLT %04x:%02x.%d cause: %d iova: 0x%"PRIx64 +rivos_iommu_dma(int bus, int slot, int func, const char *dir, uint64_t iova, uint64_t phys) "TLB q%04x:%02x.%d %s 0x%"PRIx64" -> 0x%"PRIx64 +rivos_iommu_cmd(int bus, int slot, int func, uint64_t l, uint64_t u) "CMD %04x:%02x.%d 0x%"PRIx64" 0x%"PRIx64 diff --git a/hw/riscv/trace.h b/hw/riscv/trace.h new file mode 100644 index 0000000000..b88504b750 --- /dev/null +++ b/hw/riscv/trace.h @@ -0,0 +1,2 @@ +#include "trace/trace-hw_riscv.h" + diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h index 11abe22d46..73dad2aced 100644 --- a/include/hw/pci/pci_ids.h +++ b/include/hw/pci/pci_ids.h @@ -88,6 +88,7 @@ #define PCI_CLASS_SYSTEM_RTC 0x0803 #define PCI_CLASS_SYSTEM_PCI_HOTPLUG 0x0804 #define PCI_CLASS_SYSTEM_SDHCI 0x0805 +#define PCI_CLASS_SYSTEM_IOMMU 0x0806 #define PCI_CLASS_SYSTEM_OTHER 0x0880 #define PCI_BASE_CLASS_INPUT 0x09 diff --git 
a/include/hw/riscv/rivos_iommu.h b/include/hw/riscv/rivos_iommu.h new file mode 100644 index 0000000000..097086d83e --- /dev/null +++ b/include/hw/riscv/rivos_iommu.h @@ -0,0 +1,80 @@ +/* + * QEMU emulation of a RISC-V RIVOS-IOMMU + * + * Copyright (C) 2022 Rivos Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef HW_RIVOS_IOMMU_H +#define HW_RIVOS_IOMMU_H + +#include "hw/sysbus.h" +#include "hw/pci/pci.h" +#include "hw/pci/pci_bus.h" +#include "qom/object.h" + +#define TYPE_RIVOS_IOMMU_PCI "rivos-iommu" +OBJECT_DECLARE_SIMPLE_TYPE(RivosIOMMUState, RIVOS_IOMMU_PCI) + +#define TYPE_RIVOS_IOMMU_MEMORY_REGION "rivos-iommu-memory-region" + +typedef struct RivosIOMMUState RivosIOMMUState; +typedef struct RivosIOMMUSpace RivosIOMMUSpace; + +#define RIVOS_IOMMU_REGS_SIZE 0x300 /* control registers space */ + +/* + * IO virtual address space remapping device state. 
+ */ +struct RivosIOMMUState { + PCIDevice pci; /* Parent PCI device */ + + MemoryRegion bar0; + MemoryRegion mmio; + uint8_t regs_rw[RIVOS_IOMMU_REGS_SIZE]; /* MMIO register state */ + uint8_t regs_wc[RIVOS_IOMMU_REGS_SIZE]; /* MMIO write-1-to-clear */ + uint8_t regs_ro[RIVOS_IOMMU_REGS_SIZE]; /* MMIO read/only mask */ + + /* IOMMU Properties */ + uint32_t version; + bool enable_msi; /* Enable MSI translation */ + bool enable_stage_one; /* Enable IOVA->GPA translation */ + bool enable_stage_two; /* Enable GPA->SPA translation */ + + QemuCond core_cond; + QemuMutex core_lock; + QemuThread core_proc; + bool core_stop; + + hwaddr ddt_base; + uint32_t ddt_mode; + int ddt_depth; + + hwaddr cq_base; + uint32_t cq_mask; + uint32_t cq_head; + bool cq_tail_db; + + hwaddr fq_base; + uint32_t fq_mask; + uint32_t fq_tail; + + hwaddr pq_base; + uint32_t pq_mask; + uint32_t pq_tail; + + QLIST_HEAD(, RivosIOMMUSpace) spaces; +}; + +#endif diff --git a/meson.build b/meson.build index bae62efc9c..62d2a56326 100644 --- a/meson.build +++ b/meson.build @@ -2688,6 +2688,7 @@ if have_system 'hw/ppc', 'hw/rdma', 'hw/rdma/vmw', + 'hw/riscv', 'hw/rtc', 'hw/s390x', 'hw/scsi', -- 2.25.1