Ashish Anand <[email protected]> writes:

> Currently, QEMU implements the 'Wait For Event' (WFE) instruction as a
> simple yield. This causes high host CPU usage because guest
> RTOS idle loops effectively become busy-wait loops.
>
> To improve efficiency, this patch transitions WFE to use the architectural
> 'Halt' state (EXCP_HLT) for M-profile CPUs. This allows the host thread
> to sleep when the guest is idle.
>
> To support this transition, we implement the full architectural behavior
> required for WFE, specifically the 'Event Register', 'SEVONPEND' logic,
> and 'R_BPBR' exception handling requirements defined in the ARM
> Architecture Reference Manual.
>
> This patch enables resource-efficient idle emulation for Cortex-M.
>
> Signed-off-by: Ashish Anand <[email protected]>
> ---
> Changes in v2:
> 1. Added R_BPBR handling as required by the Armv8-M architecture.
> 2. Unified pending-state updates to detect interrupt transitions for 
> SEVONPEND.
> 3. Moved the Event Register to the top-level ARM CPU state, since it is
>    architecturally shared between A-profile and M-profile CPUs.
> 4. Updated migration handling to be profile-agnostic so the state can be
>    reused if A-profile WFE support is implemented in the future. 
> 5. Added CONFIG_USER_ONLY handling for WFE/SEV and tightened profile checks.
> 6. Addressed review comments and fixes raised during v1 discussion. 
>
> Link to v1 - 
> https://lore.kernel.org/qemu-devel/[email protected]/
>
>
<snip>
>  
> +    /*
> +     * The event register is shared by all ARM profiles (A/R/M),
> +     * so it is stored in the top-level CPU state.
> +     * WFE/SEV handling is currently implemented only for M-profile.
> +     */
> +    uint32_t event_register;
> +

Looking at the Arm ARM for armv8m it says:

  The Event register is a single-bit register for each PE in the system.
  Software cannot read, and cannot write to, the Event register directly.

So I wonder if this could simply be a bool - and in that case some of
the logic simplifies.

>      /* Fields up to this point are cleared by a CPU reset */
>      struct {} end_reset_fields;
>  
> diff --git a/target/arm/machine.c b/target/arm/machine.c
> index 0befdb0b28..814a74745f 100644
> --- a/target/arm/machine.c
> +++ b/target/arm/machine.c
> @@ -508,6 +508,24 @@ static const VMStateDescription vmstate_m_mve = {
>      },
>  };
>  
> +static bool event_needed(void *opaque)
> +{
> +    ARMCPU *cpu = opaque;
> +    /* Only save the state if the event register is set (non-zero) */
> +    return cpu->env.event_register != 0;
> +}
> +
> +static const VMStateDescription vmstate_event = {
> +    .name = "cpu/event",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = event_needed,
> +    .fields = (const VMStateField[]) {
> +        VMSTATE_UINT32(env.event_register, ARMCPU),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
>  static const VMStateDescription vmstate_m = {
>      .name = "cpu/m",
>      .version_id = 4,
> @@ -1210,6 +1228,7 @@ const VMStateDescription vmstate_arm_cpu = {
>          &vmstate_wfxt_timer,
>          &vmstate_syndrome64,
>          &vmstate_pstate64,
> +        &vmstate_event,
>          NULL
>      }
>  };
> diff --git a/target/arm/tcg/helper.h b/target/arm/tcg/helper.h
> index 4636d1bc03..5a10a9fba3 100644
> --- a/target/arm/tcg/helper.h
> +++ b/target/arm/tcg/helper.h
> @@ -60,6 +60,7 @@ DEF_HELPER_1(yield, void, env)
>  DEF_HELPER_1(pre_hvc, void, env)
>  DEF_HELPER_2(pre_smc, void, env, i32)
>  DEF_HELPER_1(vesb, void, env)
> +DEF_HELPER_1(sev, void, env)
>  
>  DEF_HELPER_3(cpsr_write, void, env, i32, i32)
>  DEF_HELPER_2(cpsr_write_eret, void, env, i32)
> diff --git a/target/arm/tcg/m_helper.c b/target/arm/tcg/m_helper.c
> index 3fb24c7790..2a2224c996 100644
> --- a/target/arm/tcg/m_helper.c
> +++ b/target/arm/tcg/m_helper.c
> @@ -962,7 +962,9 @@ static void v7m_exception_taken(ARMCPU *cpu, uint32_t lr, 
> bool dotailchain,
>       * Now we've done everything that might cause a derived exception
>       * we can go ahead and activate whichever exception we're going to
>       * take (which might now be the derived exception).
> +     * Exception entry sets the event register (ARM ARM R_BPBR)
>       */
> +    env->event_register = 1;
>      armv7m_nvic_acknowledge_irq(env->nvic);
>  
>      /* Switch to target security state -- must do this before writing SPSEL 
> */
> @@ -1906,6 +1908,9 @@ static void do_v7m_exception_exit(ARMCPU *cpu)
>      /* Otherwise, we have a successful exception exit. */
>      arm_clear_exclusive(env);
>      arm_rebuild_hflags(env);
> +
> +    /* Exception return sets the event register (ARM ARM R_BPBR) */
> +    env->event_register = 1;
>      qemu_log_mask(CPU_LOG_INT, "...successful exception return\n");
>  }
>  
> diff --git a/target/arm/tcg/op_helper.c b/target/arm/tcg/op_helper.c
> index 4fbd219555..9f385dae6d 100644
> --- a/target/arm/tcg/op_helper.c
> +++ b/target/arm/tcg/op_helper.c
> @@ -469,16 +469,58 @@ void HELPER(wfit)(CPUARMState *env, uint64_t timeout)
>  #endif
>  }
>  
> +void HELPER(sev)(CPUARMState *env)
> +{
> +    CPUState *cs = env_cpu(env);
> +    CPU_FOREACH(cs) {
> +        ARMCPU *target_cpu = ARM_CPU(cs);
> +        if (arm_feature(&target_cpu->env, ARM_FEATURE_M)) {
> +            target_cpu->env.event_register = 1;
> +        }
> +        if (!qemu_cpu_is_self(cs)) {
> +            qemu_cpu_kick(cs);
> +        }
> +    }
> +}
> +
>  void HELPER(wfe)(CPUARMState *env)
>  {
> -    /* This is a hint instruction that is semantically different
> -     * from YIELD even though we currently implement it identically.
> -     * Don't actually halt the CPU, just yield back to top
> -     * level loop. This is not going into a "low power state"
> -     * (ie halting until some event occurs), so we never take
> -     * a configurable trap to a different exception level.
> +#ifdef CONFIG_USER_ONLY
> +    /*
> +     * WFE in the user-mode emulator is a NOP. Real-world user-mode code
> +     * shouldn't execute WFE, but if it does, we make it a NOP rather than
> +     * aborting when we try to raise EXCP_HLT.
> +     */
> +    return;
> +#else
> +    /*
> +     * WFE (Wait For Event) is a hint instruction.
> +     * For Cortex-M (M-profile), we implement the strict architectural 
> behavior:
> +     * 1. Check the Event Register (set by SEV or SEVONPEND).
> +     * 2. If set, clear it and continue (consume the event).
>       */
> -    HELPER(yield)(env);
> +    if (arm_feature(env, ARM_FEATURE_M)) {
> +        CPUState *cs = env_cpu(env);
> +
> +        if (env->event_register) {
> +            env->event_register = 0;
> +            return;
> +        }
> +
> +        cs->exception_index = EXCP_HLT;
> +        cs->halted = 1;
> +        cpu_loop_exit(cs);
> +    } else {
> +        /*
> +         * For A-profile and others, we rely on the existing "yield" 
> behavior.
> +         * Don't actually halt the CPU, just yield back to top
> +         * level loop. This is not going into a "low power state"
> +         * (ie halting until some event occurs), so we never take
> +         * a configurable trap to a different exception level
> +         */
> +        HELPER(yield)(env);
> +    }
> +#endif
>  }
>  
>  void HELPER(yield)(CPUARMState *env)
> diff --git a/target/arm/tcg/t16.decode b/target/arm/tcg/t16.decode
> index 646c74929d..778fbf1627 100644
> --- a/target/arm/tcg/t16.decode
> +++ b/target/arm/tcg/t16.decode
> @@ -228,8 +228,9 @@ REVSH           1011 1010 11 ... ...            @rdm
>      WFE         1011 1111 0010 0000
>      WFI         1011 1111 0011 0000
>  
> -    # TODO: Implement SEV, SEVL; may help SMP performance.
> -    # SEV       1011 1111 0100 0000
> +    # M-profile SEV is implemented.
> +    # TODO: Implement SEV for other profiles, and SEVL for all profiles; may 
> help SMP performance.
> +    SEV         1011 1111 0100 0000
>      # SEVL      1011 1111 0101 0000
>  
>      # The canonical nop has the second nibble as 0000, but the whole of the
> diff --git a/target/arm/tcg/t32.decode b/target/arm/tcg/t32.decode
> index d327178829..49b8d0037e 100644
> --- a/target/arm/tcg/t32.decode
> +++ b/target/arm/tcg/t32.decode
> @@ -369,8 +369,9 @@ CLZ              1111 1010 1011 ---- 1111 .... 1000 ....  
>     @rdm
>          WFE      1111 0011 1010 1111 1000 0000 0000 0010
>          WFI      1111 0011 1010 1111 1000 0000 0000 0011
>  
> -        # TODO: Implement SEV, SEVL; may help SMP performance.
> -        # SEV    1111 0011 1010 1111 1000 0000 0000 0100
> +        # M-profile SEV is implemented.
> +        # TODO: Implement SEV for other profiles, and SEVL for all profiles; 
> may help SMP performance.
> +        SEV      1111 0011 1010 1111 1000 0000 0000 0100
>          # SEVL   1111 0011 1010 1111 1000 0000 0000 0101
>  
>          ESB      1111 0011 1010 1111 1000 0000 0001 0000
> diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c
> index 63735d9789..c90b0106f7 100644
> --- a/target/arm/tcg/translate.c
> +++ b/target/arm/tcg/translate.c
> @@ -3241,14 +3241,30 @@ static bool trans_YIELD(DisasContext *s, arg_YIELD *a)
>      return true;
>  }
>  
> +static bool trans_SEV(DisasContext *s, arg_SEV *a)
> +{
> +    /*
> +     * Currently SEV is a NOP for non-M-profile and in user-mode emulation.
> +     * For system-mode M-profile, it sets the event register.
> +     */
> +#ifndef CONFIG_USER_ONLY
> +    if (arm_dc_feature(s, ARM_FEATURE_M)) {
> +        gen_helper_sev(tcg_env);
> +    }
> +#endif
> +    return true;
> +}
> +
>  static bool trans_WFE(DisasContext *s, arg_WFE *a)
>  {
>      /*
>       * When running single-threaded TCG code, use the helper to ensure that
> -     * the next round-robin scheduled vCPU gets a crack.  In MTTCG mode we
> -     * just skip this instruction.  Currently the SEV/SEVL instructions,
> -     * which are *one* of many ways to wake the CPU from WFE, are not
> -     * implemented so we can't sleep like WFI does.
> +     * the next round-robin scheduled vCPU gets a crack.
> +     *
> +     * For Cortex-M, we implement the architectural WFE behavior (sleeping
> +     * until an event occurs or the Event Register is set).
> +     * For other profiles, we currently treat this as a NOP or yield,
> +     * to preserve existing performance characteristics.
>       */
>      if (!(tb_cflags(s->base.tb) & CF_PARALLEL)) {
>          gen_update_pc(s, curr_insn_len(s));
> @@ -6807,6 +6823,11 @@ static void arm_tr_tb_stop(DisasContextBase *dcbase, 
> CPUState *cpu)
>              break;
>          case DISAS_WFE:
>              gen_helper_wfe(tcg_env);
> +            /*
> +             * The helper can return if the event register is set, so we
> +             * must go back to the main loop to check for events.
> +             */
> +            tcg_gen_exit_tb(NULL, 0);
>              break;
>          case DISAS_YIELD:
>              gen_helper_yield(tcg_env);

-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro

Reply via email to