On 09/11/2015 13:43, Paolo Bonzini wrote:
> 
> 
> On 09/11/2015 12:54, Huaitong Han wrote:
>> Protection keys define a new 4-bit protection key field (PKEY) in bits
>> 62:59 of leaf entries of the page tables. The PKEY is an index into the
>> PKRU register (16 domains); each domain has 2 bits (a write-disable bit
>> and an access-disable bit).
>>
>> The static logic is handled in update_permission_bitmask; the dynamic
>> logic needs to read the pkey from the page table entries, fetch the
>> PKRU value, and derive the correct result.
>>
>> Signed-off-by: Huaitong Han <huaitong....@intel.com>
>>
>> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
>> index e4202e4..bbb5555 100644
>> --- a/arch/x86/kvm/mmu.h
>> +++ b/arch/x86/kvm/mmu.h
>> @@ -3,6 +3,7 @@
>>  
>>  #include <linux/kvm_host.h>
>>  #include "kvm_cache_regs.h"
>> +#include "x86.h"
>>  
>>  #define PT64_PT_BITS 9
>>  #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
>> @@ -24,6 +25,11 @@
>>  #define PT_PAGE_SIZE_MASK (1ULL << PT_PAGE_SIZE_SHIFT)
>>  #define PT_PAT_MASK (1ULL << 7)
>>  #define PT_GLOBAL_MASK (1ULL << 8)
>> +
>> +#define PT64_PKEY_BIT0 (1ULL << _PAGE_BIT_PKEY_BIT0)
>> +#define PT64_PKEY_BIT1 (1ULL << _PAGE_BIT_PKEY_BIT1)
>> +#define PT64_PKEY_BIT2 (1ULL << _PAGE_BIT_PKEY_BIT2)
>> +#define PT64_PKEY_BIT3 (1ULL << _PAGE_BIT_PKEY_BIT3)
>>  #define PT64_NX_SHIFT 63
>>  #define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
>>  
>> @@ -45,6 +51,15 @@
>>  #define PT_PAGE_TABLE_LEVEL 1
>>  #define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1)
>>  
>> +#define PKEYS_BIT0_VALUE (1ULL << 0)
>> +#define PKEYS_BIT1_VALUE (1ULL << 1)
>> +#define PKEYS_BIT2_VALUE (1ULL << 2)
>> +#define PKEYS_BIT3_VALUE (1ULL << 3)
>> +
>> +#define PKRU_READ   0
>> +#define PKRU_WRITE  1
>> +#define PKRU_ATTRS  2
>> +
>>  static inline u64 rsvd_bits(int s, int e)
>>  {
>>      return ((1ULL << (e - s + 1)) - 1) << s;
>> @@ -145,10 +160,44 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu)
>>   * fault with the given access (in ACC_* format)?
>>   */
>>  static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>> -                                unsigned pte_access, unsigned pfec)
>> +            unsigned pte_access, unsigned pte_pkeys, unsigned pfec)
>>  {
>> -    int cpl = kvm_x86_ops->get_cpl(vcpu);
>> -    unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
>> +    unsigned long smap, rflags;
>> +    u32 pkru;
>> +    int cpl, index;
>> +    bool wf, uf, pk, pkru_ad, pkru_wd;
>> +
>> +    cpl = kvm_x86_ops->get_cpl(vcpu);
>> +    rflags = kvm_x86_ops->get_rflags(vcpu);
>> +
>> +    pkru = read_pkru();
>> +
>> +    /*
>> +     * PKRU is a 32-bit register: 16 domains with 2 attribute bits each.
>> +     * The pkey is an index into a domain, so pkey * PKRU_ATTRS + PKRU_READ
>> +     * (or PKRU_WRITE) is the bit offset of that domain's attribute.
>> +     */
>> +    pkru_ad = (pkru >> (pte_pkeys * PKRU_ATTRS + PKRU_READ)) & 1;
>> +    pkru_wd = (pkru >> (pte_pkeys * PKRU_ATTRS + PKRU_WRITE)) & 1;
>> +
>> +    wf = pfec & PFERR_WRITE_MASK;
>> +    uf = pfec & PFERR_USER_MASK;
>> +
>> +    /*
>> +     * PKeys 2nd and 6th conditions:
>> +     * 2. EFER.LMA = 1
>> +     * 6. PKRU.AD = 1,
>> +     *    or the access is a data write and PKRU.WD = 1 and
>> +     *    either CR0.WP = 1 or it is a user mode access
>> +     */
>> +    pk = is_long_mode(vcpu) && (pkru_ad ||
>> +                    (pkru_wd && wf && (is_write_protection(vcpu) || uf)));

A little more optimized:

        pkru_bits = (pkru >> (pte_pkeys * PKRU_ATTRS)) & 3;

        /*
         * Ignore PKRU.WD if not relevant to this access (a read,
         * or a supervisor mode access if CR0.WP=0).
         */
        if (!wf || (!uf && !is_write_protection(vcpu)))
                pkru_bits &= ~(1 << PKRU_WRITE);

... and then just check pkru_bits != 0.
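
With that, the final pk computation collapses to something like this
(a sketch, keeping the EFER.LMA check from the patch's condition 2):

        pk = is_long_mode(vcpu) && pkru_bits != 0;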

>> +    /*
>> +     * The correct value of the PK bit in pfec is the AND of its
>> +     * current value and pk.
>> +     */
>> +    pfec &= (pk << PFERR_PK_BIT) + ~PFERR_PK_MASK;
> 
> PK is only applicable to guest page tables, but if you do not support
> PKRU without EPT (patch 9), none of this is necessary, is it?

Doh. :(  Sorry, this is of course needed for the emulation case.

I think you should optimize this for the common case where pkru is zero,
hence pk will always be zero.  So something like

        pkru = is_long_mode(vcpu) ? read_pkru() : 0;
        if (unlikely(pkru) && (pfec & PFERR_PK_MASK)) {
                /* ... from above ... */

                /* Flip PFERR_PK_MASK if pkru_bits is non-zero */
                pfec ^= -pkru_bits & PFERR_PK_MASK;
        }
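
For illustration, folding the two suggestions together would give roughly
this (a sketch only, not tested; wf, uf and the PKRU_* constants are the
ones from the patch above):

        pkru = is_long_mode(vcpu) ? read_pkru() : 0;
        if (unlikely(pkru) && (pfec & PFERR_PK_MASK)) {
                /* Two attribute bits (AD, WD) for this pte's pkey. */
                u32 pkru_bits = (pkru >> (pte_pkeys * PKRU_ATTRS)) & 3;

                /*
                 * Ignore PKRU.WD if not relevant to this access (a read,
                 * or a supervisor mode access if CR0.WP=0).
                 */
                if (!wf || (!uf && !is_write_protection(vcpu)))
                        pkru_bits &= ~(1 << PKRU_WRITE);

                /* Flip PFERR_PK_MASK if pkru_bits is non-zero. */
                pfec ^= -pkru_bits & PFERR_PK_MASK;
        }

Since pkru_bits is at most 3, -pkru_bits has bit PFERR_PK_BIT set exactly
when pkru_bits is non-zero, so the XOR touches the PK bit only in that
case.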

Paolo

>>      /*
>>      * If CPL < 3, SMAP prevention is disabled if EFLAGS.AC = 1.
>> @@ -163,8 +212,8 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>>       * but it will be one in index if SMAP checks are being overridden.
>>       * It is important to keep this branchless.
>>       */
>> -    unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC);
>> -    int index = (pfec >> 1) +
>> +    smap = (cpl - 3) & (rflags & X86_EFLAGS_AC);
>> +    index = (pfec >> 1) +
>>                  (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1));
>>  
>>      WARN_ON(pfec & PFERR_RSVD_MASK);
>> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
>> index 736e6ab..99563bc 100644
>> --- a/arch/x86/kvm/paging_tmpl.h
>> +++ b/arch/x86/kvm/paging_tmpl.h
>> @@ -253,6 +253,17 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
>>      }
>>      return 0;
>>  }
>> +static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
>> +{
>> +    unsigned pkeys = 0;
>> +#if PTTYPE == 64
>> +    pkeys = ((gpte & PT64_PKEY_BIT0) ? PKEYS_BIT0_VALUE : 0) |
>> +            ((gpte & PT64_PKEY_BIT1) ? PKEYS_BIT1_VALUE : 0) |
>> +            ((gpte & PT64_PKEY_BIT2) ? PKEYS_BIT2_VALUE : 0) |
>> +            ((gpte & PT64_PKEY_BIT3) ? PKEYS_BIT3_VALUE : 0);
> 
> This is just pkeys = (gpte >> _PAGE_BIT_PKEY_BIT0) & 15.
> 
> Paolo
> 
>> +#endif
>> +    return pkeys;
>> +}
>>  
>>  /*
>>   * Fetch a guest pte for a guest virtual address
>> @@ -265,12 +276,13 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
>>      pt_element_t pte;
>>      pt_element_t __user *uninitialized_var(ptep_user);
>>      gfn_t table_gfn;
>> -    unsigned index, pt_access, pte_access, accessed_dirty;
>> +    unsigned index, pt_access, pte_access, accessed_dirty, pte_pkeys;
>>      gpa_t pte_gpa;
>>      int offset;
>>      const int write_fault = access & PFERR_WRITE_MASK;
>>      const int user_fault  = access & PFERR_USER_MASK;
>>      const int fetch_fault = access & PFERR_FETCH_MASK;
>> +    const int pk_fault = access & PFERR_PK_MASK;
>>      u16 errcode = 0;
>>      gpa_t real_gpa;
>>      gfn_t gfn;
>> @@ -356,7 +368,9 @@ retry_walk:
>>              walker->ptes[walker->level - 1] = pte;
>>      } while (!is_last_gpte(mmu, walker->level, pte));
>>  
>> -    if (unlikely(permission_fault(vcpu, mmu, pte_access, access))) {
>> +    pte_pkeys = FNAME(gpte_pkeys)(vcpu, pte);
>> +    if (unlikely(permission_fault(vcpu, mmu, pte_access, pte_pkeys,
>> +                                    access))) {
>>              errcode |= PFERR_PRESENT_MASK;
>>              goto error;
>>      }
>> @@ -399,7 +413,7 @@ retry_walk:
>>      return 1;
>>  
>>  error:
>> -    errcode |= write_fault | user_fault;
>> +    errcode |= write_fault | user_fault | pk_fault;
>>      if (fetch_fault && (mmu->nx ||
>>                          kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
>>              errcode |= PFERR_FETCH_MASK;
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 5181834..7a84b83 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -4107,7 +4107,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
>>  
>>      if (vcpu_match_mmio_gva(vcpu, gva)
>>          && !permission_fault(vcpu, vcpu->arch.walk_mmu,
>> -                             vcpu->arch.access, access)) {
>> +                             vcpu->arch.access, 0, access)) {
>>              *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
>>                                      (gva & (PAGE_SIZE - 1));
>>              trace_vcpu_match_mmio(gva, *gpa, write, false);
>>