On Mon, 13 Oct 2025 15:31:06 +0100
Alireza Sanaee via <[email protected]> wrote:

> The CXL address to device decoding logic is complex because of the
> need to correctly decode fine-grained interleave. The current
> implementation prevents use with KVM where executed instructions may
> reside in that memory and gives very slow performance even in TCG.
> 
> In many real cases non-interleaved memory configurations are useful,
> and for those we can use a more conventional memory region alias
> allowing similar performance to other memory in the system.
> 
> Whether this fast path is applicable can be established once the full
> set of HDM decoders has been committed (in whatever order the guest
> decides to commit them). As such, a check is performed on each commit /
> uncommit of an HDM decoder to establish if the alias should be added or
> removed.
> 
> Co-developed-by: Jonathan Cameron <[email protected]>
> Signed-off-by: Jonathan Cameron <[email protected]>
> Signed-off-by: Alireza Sanaee <[email protected]>
> ---
> Thanks to Jonathan Cameron for feedback and help with this patch.
>  hw/cxl/cxl-component-utils.c |   9 ++
>  hw/cxl/cxl-host.c            | 268 ++++++++++++++++++++++++++++++++++-
>  hw/mem/cxl_type3.c           |   4 +
>  include/hw/cxl/cxl.h         |   1 +
>  include/hw/cxl/cxl_device.h  |  10 ++
>  5 files changed, 289 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/cxl/cxl-component-utils.c
> b/hw/cxl/cxl-component-utils.c index 473895948b..f1ecd6ed22 100644
> --- a/hw/cxl/cxl-component-utils.c
> +++ b/hw/cxl/cxl-component-utils.c
> @@ -116,6 +116,15 @@ static void dumb_hdm_handler(CXLComponentState
> *cxl_cstate, hwaddr offset, value = FIELD_DP32(value,
> CXL_HDM_DECODER0_CTRL, COMMITTED, 0); }
>      stl_le_p((uint8_t *)cache_mem + offset, value);
> +
> +    if (should_commit) {
> +        cfmws_update_non_interleaved(true);
> +    }
> +
> +    if (should_uncommit) {
> +        cfmws_update_non_interleaved(false);
> +    }
> +
>  }
>  
>  static void cxl_cache_mem_write_reg(void *opaque, hwaddr offset,
> uint64_t value, diff --git a/hw/cxl/cxl-host.c b/hw/cxl/cxl-host.c
> index 0d891c651d..3a563af3bc 100644
> --- a/hw/cxl/cxl-host.c
> +++ b/hw/cxl/cxl-host.c
> @@ -12,6 +12,7 @@
>  #include "qapi/error.h"
>  #include "system/qtest.h"
>  #include "hw/boards.h"
> +#include "qemu/log.h"
>  
>  #include "qapi/qapi-visit-machine.h"
>  #include "hw/cxl/cxl.h"
> @@ -104,7 +105,7 @@ void cxl_fmws_link_targets(Error **errp)
>  }
>  
>  static bool cxl_hdm_find_target(uint32_t *cache_mem, hwaddr addr,
> -                                uint8_t *target)
> +                                uint8_t *target, bool *interleaved)
>  {
>      int hdm_inc = R_CXL_HDM_DECODER1_BASE_LO -
> R_CXL_HDM_DECODER0_BASE_LO; unsigned int hdm_count;
> @@ -138,6 +139,11 @@ static bool cxl_hdm_find_target(uint32_t
> *cache_mem, hwaddr addr, found = true;
>          ig_enc = FIELD_EX32(ctrl, CXL_HDM_DECODER0_CTRL, IG);
>          iw_enc = FIELD_EX32(ctrl, CXL_HDM_DECODER0_CTRL, IW);
> +
> +        if (interleaved) {
> +            *interleaved = iw_enc != 0;
> +        }
> +
>          target_idx = (addr / cxl_decode_ig(ig_enc)) % (1 << iw_enc);
>  
>          if (target_idx < 4) {
> @@ -190,7 +196,7 @@ static PCIDevice
> *cxl_cfmws_find_device(CXLFixedWindow *fw, hwaddr addr) 
>          cache_mem = hb_cstate->crb.cache_mem_registers;
>  
> -        target_found = cxl_hdm_find_target(cache_mem, addr, &target);
> +        target_found = cxl_hdm_find_target(cache_mem, addr, &target,
> NULL); if (!target_found) {
>              return NULL;
>          }
> @@ -226,7 +232,7 @@ static PCIDevice
> *cxl_cfmws_find_device(CXLFixedWindow *fw, hwaddr addr) 
>      cache_mem = usp_cstate->crb.cache_mem_registers;
>  
> -    target_found = cxl_hdm_find_target(cache_mem, addr, &target);
> +    target_found = cxl_hdm_find_target(cache_mem, addr, &target,
> NULL); if (!target_found) {
>          return NULL;
>      }
> @@ -248,6 +254,262 @@ static PCIDevice
> *cxl_cfmws_find_device(CXLFixedWindow *fw, hwaddr addr) return d;
>  }
>  
> +static bool cfmws_is_not_interleaved(CXLFixedWindow *fw, hwaddr addr)
> +{
> +    PCIDevice *rp, *d;
> +    PCIHostState *hb;
> +    CXLComponentState *hb_cstate, *usp_cstate;
> +    CXLUpstreamPort *usp;
> +    uint32_t *cache_mem;
> +    bool target_found, interleaved;
> +    uint8_t target;
> +
> +    addr = fw->base;
> +    if (fw->num_targets > 1) {
> +        return false;
> +    }
> +
> +    hb = PCI_HOST_BRIDGE(fw->target_hbs[0]->cxl_host_bridge);
> +    if (!hb || !hb->bus || !pci_bus_is_cxl(hb->bus)) {
> +        return false;
> +    }
> +    if (cxl_get_hb_passthrough(hb)) {
> +        rp = pcie_find_port_first(hb->bus);
> +        if (!rp) {
> +            return false;
> +        }
> +    } else {
> +        hb_cstate = cxl_get_hb_cstate(hb);
> +        if (!hb_cstate) {
> +            return false;
> +        }
> +        cache_mem = hb_cstate->crb.cache_mem_registers;
> +
> +        target_found = cxl_hdm_find_target(cache_mem, addr, &target,
> +                                           &interleaved);
> +        if (!target_found) {
> +            return false;
> +        }
> +        if (interleaved) {
> +            return false;
> +        }
> +
> +        rp = pcie_find_port_by_pn(hb->bus, target);
> +        if (!rp) {
> +            return false;
> +        }
> +    }
> +    d = pci_bridge_get_sec_bus(PCI_BRIDGE(rp))->devices[0];
> +    if (!d) {
> +        return false;
> +    }
> +
> +    if (object_dynamic_cast(OBJECT(d), TYPE_CXL_TYPE3)) {
> +        return true;
> +    }
> +    /*
> +     * Could also be a switch.  Note only one level of switching is
> +     * currently supported.
> +     */
> +    if (!object_dynamic_cast(OBJECT(d), TYPE_CXL_USP)) {
> +        return false;
> +    }
> +    usp = CXL_USP(d);
> +
> +    usp_cstate = cxl_usp_to_cstate(usp);
> +    if (!usp_cstate) {
> +        return false;
> +    }
> +
> +    cache_mem = usp_cstate->crb.cache_mem_registers;
> +
> +    target_found = cxl_hdm_find_target(cache_mem, addr, &target,
> +                                       &interleaved);
> +    if (!target_found) {
> +        return false;
> +    }
> +    if (interleaved) {
> +        return false;
> +    }
> +
> +    d = pcie_find_port_by_pn(&PCI_BRIDGE(d)->sec_bus, target);
> +    if (!d) {
> +        return false;
> +    }
> +
> +    d = pci_bridge_get_sec_bus(PCI_BRIDGE(d))->devices[0];
> +    if (!d) {
> +        return false;
> +    }
> +
> +    if (!object_dynamic_cast(OBJECT(d), TYPE_CXL_TYPE3)) {
> +        return false;
> +    }
> +
> +    return true;
> +}
> +
> +static int cxl_fmws_direct_passthrough(Object *obj, void *opaque)
> +{
> +    struct cxl_direct_pt_state *state = opaque;
> +    struct CXLFixedWindow *fw;
> +    CXLType3Dev *ct3d = state->ct3d;
> +
> +    if (!object_dynamic_cast(obj, TYPE_CXL_FMW)) {
> +        return 0;
> +    }
> +
> +    fw = CXL_FMW(obj);
> +
> +    if (!cfmws_is_not_interleaved(fw, state->decoder_base)) {
> +        return 0;
> +    }
> +
> +    if (state->commit) {
> +        MemoryRegion *mr = NULL;
> +        uint64_t vmr_size = 0, pmr_size = 0;
> +        uint64_t offset = 0;
> +
> +        if (ct3d->hostvmem) {
> +            MemoryRegion *vmr =
> host_memory_backend_get_memory(ct3d->hostvmem);
> +            vmr_size = memory_region_size(vmr);
> +            if (state->dpa_base < vmr_size) {
> +                mr = vmr;
> +                offset = state->dpa_base;
> +            }
> +        }
> +        if (!mr && ct3d->hostpmem) {
> +            MemoryRegion *pmr =
> host_memory_backend_get_memory(ct3d->hostpmem);
> +            pmr_size = memory_region_size(pmr);
> +            if (state->dpa_base - vmr_size < pmr_size) {
> +                mr = pmr;
> +                offset = state->dpa_base - vmr_size;
> +            }
> +        }
> +
> +        if (!mr) {
> +            return 0;
> +        }
> +
> +        if
> (memory_region_is_mapped(&ct3d->direct_mr[state->hdm_decoder_idx])) {
> +            return 0;
> +        }
> +
> +
> memory_region_init_alias(&ct3d->direct_mr[state->hdm_decoder_idx],
> +                                 OBJECT(ct3d), "direct-mapping", mr,
> offset,
> +                                 state->decoder_size);
> +        memory_region_add_subregion(&fw->mr,
> +                                    state->decoder_base - fw->base,
> +
> &ct3d->direct_mr[state->hdm_decoder_idx]);
> +    } else {
> +        if
> (memory_region_is_mapped(&ct3d->direct_mr[state->hdm_decoder_idx])) {
> +            memory_region_del_subregion(&fw->mr,
> +                &ct3d->direct_mr[state->hdm_decoder_idx]);
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static int update_non_interleaved(Object *obj, void *opaque)
> +{
> +    CXLType3Dev *ct3d;
> +    uint32_t *cache_mem;
> +    unsigned int hdm_count, i;
> +    uint32_t cap;
> +    int hdm_inc = R_CXL_HDM_DECODER1_BASE_LO -
> R_CXL_HDM_DECODER0_BASE_LO;
> +    uint64_t dpa_base = 0;
> +    bool commit = *(bool *) opaque;
> +
> +    if (!object_dynamic_cast(obj, TYPE_CXL_TYPE3)) {
> +        return 0;
> +    }
> +
> +    ct3d = CXL_TYPE3(obj);
> +    cache_mem = ct3d->cxl_cstate.crb.cache_mem_registers;
> +    /*
> +     * Walk the decoders and find any committed with iw set to 0
> +     * (non interleaved).
> +     */
> +    cap = ldl_le_p(cache_mem + R_CXL_HDM_DECODER_CAPABILITY);
> +    hdm_count = cxl_decoder_count_dec(FIELD_EX32(cap,
> +
> CXL_HDM_DECODER_CAPABILITY,
> +                                                 DECODER_COUNT));
> +
> +    /* Now for each committed HDM decoder */
> +    for (i = 0; i < hdm_count; i++) {
> +        uint64_t decoder_base, decoder_size, skip;
> +        uint32_t hdm_ctrl, low, high;
> +        int iw, committed;
> +
> +        hdm_ctrl = ldl_le_p(cache_mem + R_CXL_HDM_DECODER0_CTRL + i
> * hdm_inc);
> +        committed = FIELD_EX32(hdm_ctrl, CXL_HDM_DECODER0_CTRL,
> COMMITTED);
> +        if (commit) {
> +            if (!committed) {
> +                return 0;
> +            }
> +        } else {
> +            if (committed) {
> +                return 0;
> +            }
> +        }
> +
> +        /*
> +         * Even if this decoder is interleaved, we need to keep track of
> +         * the DPA as the next HDM decoder may not be interleaved.
> +         */
> +        low = ldl_le_p(cache_mem + R_CXL_HDM_DECODER0_DPA_SKIP_LO +
> +                       i * hdm_inc);
> +        high = ldl_le_p(cache_mem + R_CXL_HDM_DECODER0_DPA_SKIP_HI +
> +                        i * hdm_inc);
> +        skip = ((uint64_t)high << 32) | (low & 0xf0000000);
> +        dpa_base += skip;
> +
> +        low = ldl_le_p(cache_mem + R_CXL_HDM_DECODER0_SIZE_LO + i *
> hdm_inc);
> +        high = ldl_le_p(cache_mem + R_CXL_HDM_DECODER0_SIZE_HI + i *
> hdm_inc);
> +        decoder_size = ((uint64_t)high << 32) | (low & 0xf0000000);
> +        iw = FIELD_EX32(hdm_ctrl, CXL_HDM_DECODER0_CTRL, IW);
> +        /* Get the HPA of the base */
> +        low = ldl_le_p(cache_mem + R_CXL_HDM_DECODER0_BASE_LO + i *
> hdm_inc);
> +        high = ldl_le_p(cache_mem + R_CXL_HDM_DECODER0_BASE_HI + i *
> hdm_inc);
> +        decoder_base = ((uint64_t)high << 32) | (low & 0xf0000000);
> +
> +#ifdef DEBUG
> +        qemu_log("non interleaved decoder %lx %lx %lx %d\n",
> decoder_base,
> +                 decoder_size, dpa_base, commit);
> +#endif
> +
> +        /* Is it non interleaved? - need to check full path later */
> +        if (iw == 0) {
> +            struct cxl_direct_pt_state state = {
> +                .ct3d = ct3d,
> +                .decoder_base = decoder_base,
> +                .decoder_size = decoder_size,
> +                .dpa_base = dpa_base,
> +                .hdm_decoder_idx = i,
> +                .commit = commit,
> +            };
> +            object_child_foreach_recursive(object_get_root(),
> +
> cxl_fmws_direct_passthrough, &state);
> +        }
> +        dpa_base += decoder_size / cxl_interleave_ways_dec(iw,
> &error_fatal); +
> +    }
> +    return 0;
> +}
> +
> +bool cfmws_update_non_interleaved(bool commit)
> +{
> +    /*
> +     * Walk endpoints to find committed decoders, then check if they are
> +     * not interleaved (but the full path is set up).
> +     */
> +    object_child_foreach_recursive(object_get_root(),
> +                                   update_non_interleaved, &commit);
> +
> +    return false;
> +}
> +
>  static MemTxResult cxl_read_cfmws(void *opaque, hwaddr addr,
> uint64_t *data, unsigned size, MemTxAttrs attrs)
>  {
> diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
> index be609ff9d0..8cdb3bff7e 100644
> --- a/hw/mem/cxl_type3.c
> +++ b/hw/mem/cxl_type3.c
> @@ -427,6 +427,8 @@ static void hdm_decoder_commit(CXLType3Dev *ct3d,
> int which) ctrl = FIELD_DP32(ctrl, CXL_HDM_DECODER0_CTRL, COMMITTED,
> 1); 
>      stl_le_p(cache_mem + R_CXL_HDM_DECODER0_CTRL + which * hdm_inc,
> ctrl); +
> +    cfmws_update_non_interleaved(true);
>  }
>  
>  static void hdm_decoder_uncommit(CXLType3Dev *ct3d, int which)
> @@ -442,6 +444,8 @@ static void hdm_decoder_uncommit(CXLType3Dev
> *ct3d, int which) ctrl = FIELD_DP32(ctrl, CXL_HDM_DECODER0_CTRL,
> COMMITTED, 0); 
>      stl_le_p(cache_mem + R_CXL_HDM_DECODER0_CTRL + which * hdm_inc,
> ctrl); +
> +    cfmws_update_non_interleaved(false);
>  }
>  
>  static int ct3d_qmp_uncor_err_to_cxl(CxlUncorErrorType qmp_err)
> diff --git a/include/hw/cxl/cxl.h b/include/hw/cxl/cxl.h
> index 998f495a98..d5d2a9efb5 100644
> --- a/include/hw/cxl/cxl.h
> +++ b/include/hw/cxl/cxl.h
> @@ -71,4 +71,5 @@ CXLComponentState
> *cxl_usp_to_cstate(CXLUpstreamPort *usp); typedef struct
> CXLDownstreamPort CXLDownstreamPort;
> DECLARE_INSTANCE_CHECKER(CXLDownstreamPort, CXL_DSP, TYPE_CXL_DSP) 
> +bool cfmws_update_non_interleaved(bool);
>  #endif
> diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h
> index 89411c8093..1d199d035e 100644
> --- a/include/hw/cxl/cxl_device.h
> +++ b/include/hw/cxl/cxl_device.h
> @@ -584,6 +584,7 @@ struct CXLType3Dev {
>      uint64_t sn;
>  
>      /* State */
> +    MemoryRegion direct_mr[CXL_HDM_DECODER_COUNT];
>      AddressSpace hostvmem_as;
>      AddressSpace hostpmem_as;
>      CXLComponentState cxl_cstate;
> @@ -671,6 +672,15 @@ struct CSWMBCCIDev {
>      CXLCCI *cci;
>  };
>  
> +struct cxl_direct_pt_state {
> +    CXLType3Dev *ct3d;
> +    hwaddr decoder_base;
> +    hwaddr decoder_size;
> +    hwaddr dpa_base;
> +    unsigned int hdm_decoder_idx;
> +    bool commit;
> +};
> +
>  #define TYPE_CXL_SWITCH_MAILBOX_CCI "cxl-switch-mailbox-cci"
>  OBJECT_DECLARE_TYPE(CSWMBCCIDev, CSWMBCCIClass,
> CXL_SWITCH_MAILBOX_CCI) 

Added anisa, linux-cxl, and linux-arm to the CC. 

Reply via email to