Paolo Bonzini <pbonz...@redhat.com> writes:

> We do not need a memory copying loop anymore in insn_fetch; we
> can use a byte-aligned pointer to access instruction fields directly

Nice approach!

> from the fetch_cache.  This eliminates 40-80 cycles (corresponding to
> a 5-7% improvement in performance) from each instruction.
>
> Signed-off-by: Paolo Bonzini <pbonz...@redhat.com>
> ---
>  arch/x86/kvm/emulate.c | 47 ++++++++++++++++++++++-------------------------
>  1 file changed, 22 insertions(+), 25 deletions(-)
>
> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
> index 886f9a88010f..245a2d0bfe68 100644
> --- a/arch/x86/kvm/emulate.c
> +++ b/arch/x86/kvm/emulate.c
> @@ -706,7 +706,7 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
>   * Prefetch the remaining bytes of the instruction without crossing page
>   * boundary if they are not in fetch_cache yet.
>   */
> -static int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size)
> +static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size)
>  {
>       struct fetch_cache *fc = &ctxt->fetch;
>       int rc;
> @@ -738,42 +738,39 @@ static int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size)
>       return X86EMUL_CONTINUE;
>  }
>  
> -static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
> -                      void *__dest, unsigned size)
> +static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt,
> +                                            unsigned size)
>  {
> -     int rc;
> -     struct fetch_cache *fc = &ctxt->fetch;
> -     u8 *dest = __dest;
> -     u8 *src = &fc->data[ctxt->_eip - fc->start];
> -
>       /* We have to be careful about overflow! */
> -     if (unlikely(ctxt->_eip > fc->end - size)) {
> -             rc = do_insn_fetch_bytes(ctxt, size);
> -             if (rc != X86EMUL_CONTINUE)
> -                     goto done;
> -     }
> -
> -     while (size--) {
> -             *dest++ = *src++;
> -             ctxt->_eip++;
> -     }
> -     return X86EMUL_CONTINUE;
> +     if (unlikely(ctxt->_eip > ctxt->fetch.end - size))
> +             return __do_insn_fetch_bytes(ctxt, size);
> +     else
> +             return X86EMUL_CONTINUE;
>  }
>  
>  /* Fetch next part of the instruction being emulated. */
>  #define insn_fetch(_type, _ctxt)                                     \
> -({   unsigned long _x;                                               \
> -     rc = do_insn_fetch(_ctxt, &_x, sizeof(_type));                  \
> +({   _type _x;                                                       \
> +     struct fetch_cache *_fc;                                        \
> +                                                                     \
> +     rc = do_insn_fetch_bytes(_ctxt, sizeof(_type));                 \
>       if (rc != X86EMUL_CONTINUE)                                     \
>               goto done;                                              \
> -     (_type)_x;                                                      \
> +     _fc = &ctxt->fetch;                                             \
> +     _x = *(_type __aligned(1) *) &_fc->data[ctxt->_eip - _fc->start]; \
For my own understanding, how does the __aligned(1) help here?  Wouldn't
that result in unaligned accesses that actually hurt performance?
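
To make the question concrete, here is the userspace sketch I used to check
my own reading of the cast (nothing below is from emulate.c; u32_unaligned,
load_via_cast and load_via_memcpy are names I made up).  My understanding is
that x86 handles misaligned scalar loads in hardware, so at -O2 both variants
typically compile to the same single mov, and the attribute's only job is to
stop the compiler from assuming natural alignment:

/*
 * Userspace sketch, not from emulate.c: two ways to load a 32-bit value
 * from an arbitrary byte offset, which is what insn_fetch() now does
 * against fetch_cache.data.  All names here are made up.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Mirrors "_type __aligned(1)": a u32 the compiler must not assume aligned. */
typedef uint32_t __attribute__((aligned(1))) u32_unaligned;

static uint32_t load_via_cast(const uint8_t *p)
{
	return *(const u32_unaligned *)p;
}

static uint32_t load_via_memcpy(const uint8_t *p)
{
	uint32_t v;

	/* Portable form; compilers typically fold a fixed 4-byte memcpy into one load. */
	memcpy(&v, p, sizeof(v));
	return v;
}

int main(void)
{
	uint8_t buf[8] = { 0x0f, 0x78, 0x56, 0x34, 0x12, 0, 0, 0 };

	/* Read a 4-byte field starting at the misaligned offset 1. */
	printf("cast:   0x%08x\n", load_via_cast(buf + 1));
	printf("memcpy: 0x%08x\n", load_via_memcpy(buf + 1));
	return 0;
}

If that reading is right, the cast costs nothing extra on x86 and my
performance worry is moot, but it would be nice to have that spelled out in
the commit message.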

> +     ctxt->_eip += sizeof(_type);                                    \
> +     _x;                                                             \
>  })
>  
>  #define insn_fetch_arr(_arr, _size, _ctxt)                           \
> -({   rc = do_insn_fetch(_ctxt, _arr, (_size));                       \
> +({                                                                   \
> +     struct fetch_cache *_fc;                                        \
> +     rc = do_insn_fetch_bytes(_ctxt, _size);                         \
>       if (rc != X86EMUL_CONTINUE)                                     \
>               goto done;                                              \
> +     _fc = &ctxt->fetch;                                             \
> +     memcpy(_arr, &_fc->data[ctxt->_eip - _fc->start], _size);       \
> +     ctxt->_eip += (_size);                                          \
>  })
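
As a sanity check on the new macro shape, here is a stripped-down standalone
model of the fetch path that builds with gcc (again, none of the names below
come from emulate.c, and I replaced the "goto done" error handling with an rc
check so it compiles as a plain program).  It mimics the single bounds check
on the fast path, the statement-expression macro that yields a typed value,
and the _eip advance:

/*
 * Toy model of the patched fetch path; not kernel code.  The load uses
 * memcpy purely to keep the toy strictly portable -- the __aligned(1) cast
 * variant is in the earlier sketch.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define X86EMUL_CONTINUE	0
#define X86EMUL_UNHANDLEABLE	1

struct toy_ctxt {
	uint64_t eip;			/* stands in for ctxt->_eip */
	struct {
		uint8_t  data[15];	/* stands in for fetch_cache.data */
		uint64_t start, end;	/* guest addresses covered by the cache */
	} fetch;
};

/* Slow path: the real __do_insn_fetch_bytes() re-reads guest memory. */
static int toy_fetch_more(struct toy_ctxt *ctxt, unsigned int size)
{
	(void)ctxt;
	(void)size;
	return X86EMUL_UNHANDLEABLE;	/* the toy never refills the cache */
}

/* Fast path mirrors the new do_insn_fetch_bytes(): one bounds check. */
static inline int toy_fetch_bytes(struct toy_ctxt *ctxt, unsigned int size)
{
	if (ctxt->eip > ctxt->fetch.end - size)
		return toy_fetch_more(ctxt, size);
	return X86EMUL_CONTINUE;
}

/* Statement-expression macro in the same shape as insn_fetch(). */
#define toy_fetch(_type, _ctxt, _rc)					\
({	_type _x = 0;							\
	_rc = toy_fetch_bytes(_ctxt, sizeof(_type));			\
	if (_rc == X86EMUL_CONTINUE) {					\
		size_t _off = (_ctxt)->eip - (_ctxt)->fetch.start;	\
		memcpy(&_x, &(_ctxt)->fetch.data[_off], sizeof(_type));	\
		(_ctxt)->eip += sizeof(_type);				\
	}								\
	_x;								\
})

int main(void)
{
	/* Pretend we already prefetched "mov eax, 0x12345678" (b8 78 56 34 12). */
	struct toy_ctxt ctxt = {
		.eip = 0x1000,
		.fetch = {
			.data  = { 0xb8, 0x78, 0x56, 0x34, 0x12 },
			.start = 0x1000,
			.end   = 0x1005,
		},
	};
	int rc;

	uint8_t opcode = toy_fetch(uint8_t, &ctxt, rc);
	uint32_t imm = toy_fetch(uint32_t, &ctxt, rc);

	printf("opcode %#x imm %#x rc %d\n", (unsigned int)opcode,
	       (unsigned int)imm, rc);
	return 0;
}

Built with gcc -O2 this prints opcode 0xb8, imm 0x12345678, rc 0, which is
what I would expect the real insn_fetch() to produce for those cached bytes.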
>  
>  /*
> @@ -4282,7 +4279,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
>       if (insn_len > 0)
>               memcpy(ctxt->fetch.data, insn, insn_len);
>       else {
> -             rc = do_insn_fetch_bytes(ctxt, 1);
> +             rc = __do_insn_fetch_bytes(ctxt, 1);
>               if (rc != X86EMUL_CONTINUE)
>                       return rc;
>       }