On June 30, 2015 9:15:22 PM GMT+08:00, Christoffer Dall 
<christoffer.d...@linaro.org> wrote:
>On Mon, Jun 22, 2015 at 06:41:32PM +0800, Zhichao Huang wrote:
>> Implement switching of the debug registers. While the number
>> of registers is massive, CPUs usually don't implement them all
>> (A15 has 6 breakpoints and 4 watchpoints, which gives us a total
>> of 22 registers "only").
>> 
>> Notice that, for ARMv7, if the CONFIG_HAVE_HW_BREAKPOINT is set in
>> the guest, debug is always actively in use (ARM_DSCR_MDBGEN set).
>> 
>> We have to do the save/restore dance in this case, because the host
>> and the guest might use their respective debug registers at any moment.
>
>this sounds expensive, and I suggested an alternative approach in the
>previous patch.  In any case, measuring the impact of this on hardware
>would be a great idea...
>
>> 
>> If the CONFIG_HAVE_HW_BREAKPOINT is not set, and if no one flagged
>> the debug registers as dirty, we only save/resotre DBGDSCR.
>
>restore
>
>> 
>> Signed-off-by: Zhichao Huang <zhichao.hu...@linaro.org>
>> ---
>>  arch/arm/kvm/interrupts.S      |  16 +++
>>  arch/arm/kvm/interrupts_head.S | 249 ++++++++++++++++++++++++++++++++++++++++-
>>  2 files changed, 263 insertions(+), 2 deletions(-)
>> 
>> diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
>> index 79caf79..d626275 100644
>> --- a/arch/arm/kvm/interrupts.S
>> +++ b/arch/arm/kvm/interrupts.S
>> @@ -116,6 +116,12 @@ ENTRY(__kvm_vcpu_run)
>>      read_cp15_state store_to_vcpu = 0
>>      write_cp15_state read_from_vcpu = 1
>>  
>> +    @ Store hardware CP14 state and load guest state
>> +    compute_debug_state 1f
>> +    bl __save_host_debug_regs
>> +    bl __restore_guest_debug_regs
>> +
>> +1:
>>      @ If the host kernel has not been configured with VFPv3 support,
>>      @ then it is safer if we deny guests from using it as well.
>>  #ifdef CONFIG_VFPv3
>> @@ -201,6 +207,16 @@ after_vfp_restore:
>>      mrc     p15, 0, r2, c0, c0, 5
>>      mcr     p15, 4, r2, c0, c0, 5
>>  
>> +    @ Store guest CP14 state and restore host state
>> +    skip_debug_state 1f
>> +    bl __save_guest_debug_regs
>> +    bl __restore_host_debug_regs
>> +    /* Clear the dirty flag for the next run, as all the state has
>> +     * already been saved. Note that we nuke the whole 32bit word.
>> +     * If we ever add more flags, we'll have to be more careful...
>> +     */
>> +    clear_debug_dirty_bit
>> +1:
>>      @ Store guest CP15 state and restore host state
>>      read_cp15_state store_to_vcpu = 1
>>      write_cp15_state read_from_vcpu = 0
>> diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S
>> index 5662c39..ed406be 100644
>> --- a/arch/arm/kvm/interrupts_head.S
>> +++ b/arch/arm/kvm/interrupts_head.S
>> @@ -7,6 +7,7 @@
>>  #define VCPU_USR_SP         (VCPU_USR_REG(13))
>>  #define VCPU_USR_LR         (VCPU_USR_REG(14))
>>  #define CP15_OFFSET(_cp15_reg_idx) (VCPU_CP15 + (_cp15_reg_idx * 4))
>> +#define CP14_OFFSET(_cp14_reg_idx) (VCPU_CP14 + ((_cp14_reg_idx) * 4))
>>  
>>  /*
>>   * Many of these macros need to access the VCPU structure, which is always
>> @@ -168,8 +169,7 @@ vcpu     .req    r0              @ vcpu pointer always in r0
>>   * Clobbers *all* registers.
>>   */
>>  .macro restore_guest_regs
>> -    /* reset DBGDSCR to disable debug mode */
>> -    mov     r2, #0
>> +    ldr     r2, [vcpu, #CP14_OFFSET(cp14_DBGDSCRext)]
>>      mcr     p14, 0, r2, c0, c2, 2
>>  
>>      restore_guest_regs_mode svc, #VCPU_SVC_REGS
>> @@ -250,6 +250,10 @@ vcpu    .req    r0              @ vcpu pointer always in r0
>>      save_guest_regs_mode abt, #VCPU_ABT_REGS
>>      save_guest_regs_mode und, #VCPU_UND_REGS
>>      save_guest_regs_mode irq, #VCPU_IRQ_REGS
>> +
>> +    /* DBGDSCR reg */
>> +    mrc     p14, 0, r2, c0, c1, 0
>> +    str     r2, [vcpu, #CP14_OFFSET(cp14_DBGDSCRext)]
>>  .endm
>>  
>>  /* Reads cp15 registers from hardware and stores them in memory
>> @@ -449,6 +453,231 @@ vcpu   .req    r0              @ vcpu pointer always in r0
>>      str     r5, [vcpu, #VCPU_DEBUG_FLAGS]
>>  .endm
>>  
>> +/* Assume r11/r12 in used, clobbers r2-r10 */
>> +.macro cp14_read_and_push Op2 skip_num
>> +    cmp     \skip_num, #8
>> +    // if (skip_num >= 8) then skip c8-c15 directly
>> +    bge     1f
>> +    adr     r2, 9998f
>> +    add     r2, r2, \skip_num, lsl #2
>> +    bx      r2
>> +1:
>> +    adr     r2, 9999f
>> +    sub     r3, \skip_num, #8
>> +    add     r2, r2, r3, lsl #2
>> +    bx      r2
>> +9998:
>> +    mrc     p14, 0, r10, c0, c15, \Op2
>> +    mrc     p14, 0, r9, c0, c14, \Op2
>> +    mrc     p14, 0, r8, c0, c13, \Op2
>> +    mrc     p14, 0, r7, c0, c12, \Op2
>> +    mrc     p14, 0, r6, c0, c11, \Op2
>> +    mrc     p14, 0, r5, c0, c10, \Op2
>> +    mrc     p14, 0, r4, c0, c9, \Op2
>> +    mrc     p14, 0, r3, c0, c8, \Op2
>> +    push    {r3-r10}
>
>you probably don't want to do more stores to memory than required

Yeah, there is no need to push some of the registers, but I can't find a
better way to optimize it. Are there any precedents I can refer to?

Imagine that there are only 2 hardware breakpoints available, BCR0/BCR1;
then we can code like this:

Save:
    jump to 1
    save BCR2 to r5
1:
    save BCR1 to r4
    save BCR0 to r3
    push {r3-r5}

Restore:
    pop {r3-r5}
    jump to 1
    restore r5 to BCR2
1:
    restore r4 to BCR1
    restore r3 to BCR0

But if we want to push only the registers we actually need, we have to
code like this:

Save:
    jump to 1
    save BCR2 to r5
    push r5
1:
    save BCR1 to r4
    push r4
    save BCR0 to r3
    push r3

Restore:
    jump to 1
    pop r5
    restore r5 to BCR2
1:
    pop r4
    restore r4 to BCR1
    pop r3
    restore r3 to BCR0

Then we might encounter a mistake on restoring: when we pop into r4, we
actually get the value that was pushed from r3.
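
To make the problem concrete, here is a minimal GNU as sketch of that second
variant (labels and comments are only for illustration; it assumes just
BCR0/BCR1 are implemented, so BCR2 is skipped):

    @ Save: each value is pushed on its own, BCR2 skipped
        b       1f
        mrc     p14, 0, r5, c0, c2, 5   @ read DBGBCR2
        push    {r5}
    1:
        mrc     p14, 0, r4, c0, c1, 5   @ read DBGBCR1
        push    {r4}
        mrc     p14, 0, r3, c0, c0, 5   @ read DBGBCR0
        push    {r3}                    @ stack top is now the BCR0 value

    @ Restore: the pops come off in the wrong order
        b       1f
        pop     {r5}
        mcr     p14, 0, r5, c0, c2, 5   @ write DBGBCR2
    1:
        pop     {r4}                    @ actually gets the BCR0 value...
        mcr     p14, 0, r4, c0, c1, 5   @ ...and writes it into DBGBCR1
        pop     {r3}                    @ gets the BCR1 value
        mcr     p14, 0, r3, c0, c0, 5   @ ...and writes it into DBGBCR0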

>
>> +9999:
>> +    mrc     p14, 0, r10, c0, c7, \Op2
>> +    mrc     p14, 0, r9, c0, c6, \Op2
>> +    mrc     p14, 0, r8, c0, c5, \Op2
>> +    mrc     p14, 0, r7, c0, c4, \Op2
>> +    mrc     p14, 0, r6, c0, c3, \Op2
>> +    mrc     p14, 0, r5, c0, c2, \Op2
>> +    mrc     p14, 0, r4, c0, c1, \Op2
>> +    mrc     p14, 0, r3, c0, c0, \Op2
>> +    push    {r3-r10}
>
>same
>
>> +.endm
>> +
>> +/* Assume r11/r12 in used, clobbers r2-r10 */
>> +.macro cp14_pop_and_write Op2 skip_num
>> +    cmp     \skip_num, #8
>> +    // if (skip_num >= 8) then skip c8-c15 directly
>> +    bge     1f
>> +    adr     r2, 9998f
>> +    add     r2, r2, \skip_num, lsl #2
>> +    pop     {r3-r10}
>
>you probably don't want to do more loads from memory than required
>
>> +    bx      r2
>> +1:
>> +    adr     r2, 9999f
>> +    sub     r3, \skip_num, #8
>> +    add     r2, r2, r3, lsl #2
>> +    pop     {r3-r10}
>
>same
>
>> +    bx      r2
>> +
>> +9998:
>> +    mcr     p14, 0, r10, c0, c15, \Op2
>> +    mcr     p14, 0, r9, c0, c14, \Op2
>> +    mcr     p14, 0, r8, c0, c13, \Op2
>> +    mcr     p14, 0, r7, c0, c12, \Op2
>> +    mcr     p14, 0, r6, c0, c11, \Op2
>> +    mcr     p14, 0, r5, c0, c10, \Op2
>> +    mcr     p14, 0, r4, c0, c9, \Op2
>> +    mcr     p14, 0, r3, c0, c8, \Op2
>> +
>> +    pop     {r3-r10}
>> +9999:
>> +    mcr     p14, 0, r10, c0, c7, \Op2
>> +    mcr     p14, 0, r9, c0, c6, \Op2
>> +    mcr     p14, 0, r8, c0, c5, \Op2
>> +    mcr     p14, 0, r7, c0, c4, \Op2
>> +    mcr     p14, 0, r6, c0, c3, \Op2
>> +    mcr     p14, 0, r5, c0, c2, \Op2
>> +    mcr     p14, 0, r4, c0, c1, \Op2
>> +    mcr     p14, 0, r3, c0, c0, \Op2
>> +.endm
>> +
>> +/* Assume r11/r12 in used, clobbers r2-r3 */
>> +.macro cp14_read_and_str Op2 cp14_reg0 skip_num
>> +    adr     r3, 1f
>> +    add     r3, r3, \skip_num, lsl #3
>> +    bx      r3
>> +1:
>> +    mrc     p14, 0, r2, c0, c15, \Op2
>> +    str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+15)]
>> +    mrc     p14, 0, r2, c0, c14, \Op2
>> +    str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+14)]
>> +    mrc     p14, 0, r2, c0, c13, \Op2
>> +    str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+13)]
>> +    mrc     p14, 0, r2, c0, c12, \Op2
>> +    str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+12)]
>> +    mrc     p14, 0, r2, c0, c11, \Op2
>> +    str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+11)]
>> +    mrc     p14, 0, r2, c0, c10, \Op2
>> +    str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+10)]
>> +    mrc     p14, 0, r2, c0, c9, \Op2
>> +    str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+9)]
>> +    mrc     p14, 0, r2, c0, c8, \Op2
>> +    str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+8)]
>> +    mrc     p14, 0, r2, c0, c7, \Op2
>> +    str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+7)]
>> +    mrc     p14, 0, r2, c0, c6, \Op2
>> +    str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+6)]
>> +    mrc     p14, 0, r2, c0, c5, \Op2
>> +    str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+5)]
>> +    mrc     p14, 0, r2, c0, c4, \Op2
>> +    str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+4)]
>> +    mrc     p14, 0, r2, c0, c3, \Op2
>> +    str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+3)]
>> +    mrc     p14, 0, r2, c0, c2, \Op2
>> +    str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+2)]
>> +    mrc     p14, 0, r2, c0, c1, \Op2
>> +    str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+1)]
>> +    mrc     p14, 0, r2, c0, c0, \Op2
>> +    str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0)]
>> +.endm
>> +
>> +/* Assume r11/r12 in used, clobbers r2-r3 */
>> +.macro cp14_ldr_and_write Op2 cp14_reg0 skip_num
>> +    adr     r3, 1f
>> +    add     r3, r3, \skip_num, lsl #3
>> +    bx      r3
>> +1:
>> +    ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+15)]
>> +    mcr     p14, 0, r2, c0, c15, \Op2
>> +    ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+14)]
>> +    mcr     p14, 0, r2, c0, c14, \Op2
>> +    ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+13)]
>> +    mcr     p14, 0, r2, c0, c13, \Op2
>> +    ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+12)]
>> +    mcr     p14, 0, r2, c0, c12, \Op2
>> +    ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+11)]
>> +    mcr     p14, 0, r2, c0, c11, \Op2
>> +    ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+10)]
>> +    mcr     p14, 0, r2, c0, c10, \Op2
>> +    ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+9)]
>> +    mcr     p14, 0, r2, c0, c9, \Op2
>> +    ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+8)]
>> +    mcr     p14, 0, r2, c0, c8, \Op2
>> +    ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+7)]
>> +    mcr     p14, 0, r2, c0, c7, \Op2
>> +    ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+6)]
>> +    mcr     p14, 0, r2, c0, c6, \Op2
>> +    ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+5)]
>> +    mcr     p14, 0, r2, c0, c5, \Op2
>> +    ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+4)]
>> +    mcr     p14, 0, r2, c0, c4, \Op2
>> +    ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+3)]
>> +    mcr     p14, 0, r2, c0, c3, \Op2
>> +    ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+2)]
>> +    mcr     p14, 0, r2, c0, c2, \Op2
>> +    ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+1)]
>> +    mcr     p14, 0, r2, c0, c1, \Op2
>> +    ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0)]
>> +    mcr     p14, 0, r2, c0, c0, \Op2
>> +.endm
>
>can you not find some way of unifying cp14_pop_and_write with
>cp14_ldr_and_write and cp14_read_and_push with cp14_read_and_str ?
>
>Probably having two separate structs for the VFP state on the vcpu struct
>for both the guest and the host state is one possible way of doing so.
>

OK, I will do it.
Would you like me to rename struct vfp_hard_struct and add the host cp14
state in there, or add a new host_cp14_state struct to kvm_vcpu_arch?
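
For instance, a rough sketch of that direction (the VCPU_HOST_CP14 offset and
the single-register macro below are hypothetical, just to illustrate selecting
the guest or host copy by base offset) could be:

    /* Assumes a host cp14 array in kvm_vcpu_arch with the same layout as
     * the guest one, exposed to assembly as a hypothetical VCPU_HOST_CP14.
     */
    #define HOST_CP14_OFFSET(_cp14_reg_idx) (VCPU_HOST_CP14 + ((_cp14_reg_idx) * 4))

    /* Save one cp14 debug register into whichever array \offset points at */
    .macro cp14_save_one Op2 CRm offset
            mrc     p14, 0, r2, c0, \CRm, \Op2
            str     r2, [vcpu, #\offset]
    .endm

            /* guest DBGBCR0 goes into the existing cp14 array ... */
            cp14_save_one 5, c0, CP14_OFFSET(cp14_DBGBCR0)
            /* ... host DBGBCR0 into the hypothetical host copy */
            cp14_save_one 5, c0, HOST_CP14_OFFSET(cp14_DBGBCR0)

That way __save_guest_debug_regs and __save_host_debug_regs would differ only
in the base offset they pass down, and the push/pop variants could go away.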

>> +
>> +/* Get extract number of BRPs and WRPs. Saved in r11/r12 */
>> +.macro read_hw_dbg_num
>> +    mrc     p14, 0, r2, c0, c0, 0
>> +    ubfx    r11, r2, #24, #4
>> +    add     r11, r11, #1            // Extract BRPs
>> +    ubfx    r12, r2, #28, #4
>> +    add     r12, r12, #1            // Extract WRPs
>> +    mov     r2, #16
>> +    sub     r11, r2, r11            // How many BPs to skip
>> +    sub     r12, r2, r12            // How many WPs to skip
>> +.endm
>> +
>> +/* Reads cp14 registers from hardware.
>
>You have a lot of multi-line comments in these patches which don't start
>with a separate '/*' line, as dictated by the Linux kernel coding style.
>So far, I've ignored this, but please fix all these throughout the
>series when you respin.
>
>> + * Writes cp14 registers in-order to the stack.
>> + *
>> + * Assumes vcpu pointer in vcpu reg
>> + *
>> + * Clobbers r2-r12
>> + */
>> +.macro save_host_debug_regs
>> +    read_hw_dbg_num
>> +    cp14_read_and_push #4, r11      @ DBGBVR
>> +    cp14_read_and_push #5, r11      @ DBGBCR
>> +    cp14_read_and_push #6, r12      @ DBGWVR
>> +    cp14_read_and_push #7, r12      @ DBGWCR
>> +.endm
>> +
>> +/* Reads cp14 registers from hardware.
>> + * Writes cp14 registers in-order to the VCPU struct pointed to by vcpup.
>> + *
>> + * Assumes vcpu pointer in vcpu reg
>> + *
>> + * Clobbers r2-r12
>> + */
>> +.macro save_guest_debug_regs
>> +    read_hw_dbg_num
>> +    cp14_read_and_str #4, cp14_DBGBVR0, r11
>
>why do you need the hash before the op2 field?

Sorry, I can't quite understand what you mean.

>
>> +    cp14_read_and_str #5, cp14_DBGBCR0, r11
>> +    cp14_read_and_str #6, cp14_DBGWVR0, r12
>> +    cp14_read_and_str #7, cp14_DBGWCR0, r12
>> +.endm
>> +
>> +/* Reads cp14 registers in-order from the stack.
>> + * Writes cp14 registers to hardware.
>> + *
>> + * Assumes vcpu pointer in vcpu reg
>> + *
>> + * Clobbers r2-r12
>> + */
>> +.macro restore_host_debug_regs
>> +    read_hw_dbg_num
>> +    cp14_pop_and_write #4, r11      @ DBGBVR
>> +    cp14_pop_and_write #5, r11      @ DBGBCR
>> +    cp14_pop_and_write #6, r12      @ DBGWVR
>> +    cp14_pop_and_write #7, r12      @ DBGWCR
>> +.endm
>> +
>> +/* Reads cp14 registers in-order from the VCPU struct pointed to by vcpup
>> + * Writes cp14 registers to hardware.
>> + *
>> + * Assumes vcpu pointer in vcpu reg
>> + *
>> + * Clobbers r2-r12
>> + */
>> +.macro restore_guest_debug_regs
>> +    read_hw_dbg_num
>> +    cp14_ldr_and_write #4, cp14_DBGBVR0, r11
>> +    cp14_ldr_and_write #5, cp14_DBGBCR0, r11
>> +    cp14_ldr_and_write #6, cp14_DBGWVR0, r12
>> +    cp14_ldr_and_write #7, cp14_DBGWCR0, r12
>> +.endm
>> +
>>  /*
>>   * Save the VGIC CPU state into memory
>>   *
>> @@ -684,3 +913,19 @@ ARM_BE8(rev     r6, r6  )
>>  .macro load_vcpu
>>      mrc     p15, 4, vcpu, c13, c0, 2        @ HTPIDR
>>  .endm
>> +
>> +__save_host_debug_regs:
>> +    save_host_debug_regs
>> +    bx      lr
>> +
>> +__save_guest_debug_regs:
>> +    save_guest_debug_regs
>> +    bx      lr
>> +
>> +__restore_host_debug_regs:
>> +    restore_host_debug_regs
>> +    bx      lr
>> +
>> +__restore_guest_debug_regs:
>> +    restore_guest_debug_regs
>> +    bx      lr
>> -- 
>> 1.7.12.4
>> 
>
>Thanks,
>-Christoffer

-- 
Zhichao Huang
_______________________________________________
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
