Re: [PATCH v2 0/2] KVM: x86: expose direct stores instructions into VM.

2018-11-30 Thread Eric Northup
On Mon, Nov 5, 2018 at 10:01 PM Liu Jingqi  wrote:
>
> Direct store instructions MOVDIRI and MOVDIR64B will be available in
> Tremont and other future x86 processors,
> and need to be exposed to the guest VM.

It seems like KVM's emulator should be able to complete these
instructions against emulated MMIO before the CPUID bits are exposed to
guests in any default or supported configuration.

It'll be much simpler for usermode to implement that property if the
KVM-reported supported CPUID table doesn't get updated before the KVM
emulator does.
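
As a rough illustration of why that ordering matters to usermode, here is a
minimal sketch of how a VMM can gate a guest CPUID bit on what KVM reports
as supported.  KVM_GET_SUPPORTED_CPUID is the standard KVM system ioctl; the
MOVDIRI bit position, CPUID.(EAX=7,ECX=0):ECX[27], is taken from the SDM,
and error handling is trimmed:

#include <linux/kvm.h>
#include <stdlib.h>
#include <sys/ioctl.h>

/* Returns nonzero if KVM reports MOVDIRI (leaf 7, subleaf 0, ECX bit 27)
 * as supported.  A VMM that only enables bits reported here automatically
 * waits for KVM's emulator support once KVM delays advertising the bit. */
static int kvm_reports_movdiri(int kvm_fd)
{
	struct kvm_cpuid2 *c;
	int i, ret = 0;

	c = calloc(1, sizeof(*c) + 128 * sizeof(struct kvm_cpuid_entry2));
	if (!c)
		return 0;
	c->nent = 128;
	if (ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, c) == 0) {
		for (i = 0; i < c->nent; i++)
			if (c->entries[i].function == 7 && c->entries[i].index == 0)
				ret = !!(c->entries[i].ecx & (1u << 27));
	}
	free(c);
	return ret;
}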

-Eric


Re: [PATCH 4/4] kvm, vmx: remove manually coded vmx instructions

2018-10-24 Thread Eric Northup
On Wed, Oct 24, 2018 at 1:30 AM Julian Stecklina  wrote:
>
> So far the VMX code relied on manually assembled VMX instructions. This
> was apparently done to ensure compatibility with old binutils. VMX
> instructions were introduced with binutils 2.19 and the kernel currently
> requires binutils 2.20.
>
> Remove the manually assembled versions and replace them with the proper
> inline assembly. This improves code generation (and source code
> readability).
>
> According to the bloat-o-meter this change removes ~1300 bytes from the
> text segment.

This loses the exception handling from __ex* ->
kvm_handle_fault_on_reboot.

If deliberate, this should be called out in the changelog.  Has the race
which commit 4ecac3fd fixed been mitigated some other way?
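
For reference, a simplified from-memory sketch of what the
__ex()/____kvm_handle_fault_on_reboot() wrapping emits around each VMX
instruction; label numbers and exact macro layout are illustrative, not
copied from the tree this patch targets:

/* Simplified sketch: "insn" gets an exception-table entry whose fixup
 * checks kvm_rebooting instead of letting a VMX instruction that faults
 * (e.g. after an emergency VMXOFF on another CPU) oops the kernel. */
#define ____kvm_handle_fault_on_reboot(insn)			\
	"666: " insn "\n\t"					\
	"668:\n\t"						\
	".pushsection .fixup, \"ax\"\n"				\
	"667: cmpb $0, kvm_rebooting\n\t"			\
	"jne 668b\n\t"						\
	__ASM_SIZE(push) " $666b\n\t"				\
	"call kvm_spurious_fault\n\t"				\
	".popsection\n\t"					\
	_ASM_EXTABLE(666b, 667b)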



>
> Signed-off-by: Julian Stecklina 
> Reviewed-by: Jan H. Schönherr 
> Reviewed-by: Konrad Jan Miller 
> Reviewed-by: Razvan-Alin Ghitulete 
> ---
>  arch/x86/include/asm/virtext.h |  2 +-
>  arch/x86/include/asm/vmx.h | 13 -
>  arch/x86/kvm/vmx.c | 39 ++-
>  3 files changed, 19 insertions(+), 35 deletions(-)
>
> diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
> index 0116b2e..c5395b3 100644
> --- a/arch/x86/include/asm/virtext.h
> +++ b/arch/x86/include/asm/virtext.h
> @@ -40,7 +40,7 @@ static inline int cpu_has_vmx(void)
>   */
>  static inline void cpu_vmxoff(void)
>  {
> -   asm volatile (ASM_VMX_VMXOFF : : : "cc");
> +   asm volatile ("vmxoff" : : : "cc");
> cr4_clear_bits(X86_CR4_VMXE);
>  }
>
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index 9527ba5..ade0f15 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -503,19 +503,6 @@ enum vmcs_field {
>
>  #define VMX_EPT_IDENTITY_PAGETABLE_ADDR0xfffbc000ul
>
> -
> -#define ASM_VMX_VMCLEAR_RAX   ".byte 0x66, 0x0f, 0xc7, 0x30"
> -#define ASM_VMX_VMLAUNCH  ".byte 0x0f, 0x01, 0xc2"
> -#define ASM_VMX_VMRESUME  ".byte 0x0f, 0x01, 0xc3"
> -#define ASM_VMX_VMPTRLD_RAX   ".byte 0x0f, 0xc7, 0x30"
> -#define ASM_VMX_VMREAD_RDX_RAX".byte 0x0f, 0x78, 0xd0"
> -#define ASM_VMX_VMWRITE_RAX_RDX   ".byte 0x0f, 0x79, 0xd0"
> -#define ASM_VMX_VMWRITE_RSP_RDX   ".byte 0x0f, 0x79, 0xd4"
> -#define ASM_VMX_VMXOFF".byte 0x0f, 0x01, 0xc4"
> -#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
> -#define ASM_VMX_INVEPT   ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
> -#define ASM_VMX_INVVPID  ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
> -
>  struct vmx_msr_entry {
> u32 index;
> u32 reserved;
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 82cfb909..bbbdccb 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -2077,7 +2077,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 
> msr)
> return -1;
>  }
>
> -static inline void __invvpid(int ext, u16 vpid, gva_t gva)
> +static inline void __invvpid(long ext, u16 vpid, gva_t gva)
>  {
> struct {
> u64 vpid : 16;
> @@ -2086,21 +2086,21 @@ static inline void __invvpid(int ext, u16 vpid, gva_t 
> gva)
> } operand = { vpid, 0, gva };
> bool error;
>
> -   asm volatile (__ex(ASM_VMX_INVVPID) CC_SET(na)
> - : CC_OUT(na) (error) : "a"(&operand), "c"(ext)
> +   asm volatile ("invvpid %1, %2" CC_SET(na)
> + : CC_OUT(na) (error) : "m"(operand), "r"(ext)
>   : "memory");
> BUG_ON(error);
>  }
>
> -static inline void __invept(int ext, u64 eptp, gpa_t gpa)
> +static inline void __invept(long ext, u64 eptp, gpa_t gpa)
>  {
> struct {
> u64 eptp, gpa;
> } operand = {eptp, gpa};
> bool error;
>
> -   asm volatile (__ex(ASM_VMX_INVEPT) CC_SET(na)
> - : CC_OUT(na) (error) : "a" (&operand), "c" (ext)
> +   asm volatile ("invept %1, %2" CC_SET(na)
> + : CC_OUT(na) (error) : "m" (operand), "r" (ext)
>   : "memory");
> BUG_ON(error);
>  }
> @@ -2120,8 +2120,8 @@ static void vmcs_clear(struct vmcs *vmcs)
> u64 phys_addr = __pa(vmcs);
> bool error;
>
> -   asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) CC_SET(na)
> - : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr)
> +   asm volatile ("vmclear %1" CC_SET(na)
> + : CC_OUT(na) (error) : "m"(phys_addr)
>   : "memory");
> if (unlikely(error))
> printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
> @@ -2145,8 +2145,8 @@ static void vmcs_load(struct vmcs *vmcs)
> if (static_branch_unlikely(&enable_evmcs))
> return evmcs_load(phys_addr);
>
> -   asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) CC_SET(na)
> - : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr)
> +   asm volatile ("vmptrld %1" CC_

Re: [PATCH 00/12] KVM: x86: add support for VMX TSC scaling

2015-09-28 Thread Eric Northup
On Sun, Sep 27, 2015 at 10:37 PM, Haozhong Zhang
 wrote:
> This patchset adds support for VMX TSC scaling feature which is
> available on Intel Skylake CPU. The specification of VMX TSC scaling
> can be found at
> http://www.intel.com/content/www/us/en/processors/timestamp-counter-scaling-virtualization-white-paper.html
>
> VMX TSC scaling allows the guest TSC, which is read by guest rdtsc(p)
> instructions, to increase at a rate that is customized by the hypervisor
> and can be different from the host TSC rate. Basically, VMX TSC
> scaling adds a 64-bit field called the TSC multiplier to the VMCS so
> that, if VMX TSC scaling is enabled, the TSC read by guest rdtsc(p)
> instructions is calculated by the following formula:
>
>   guest EDX:EAX = ((Host TSC * TSC multiplier) >> 48) + VMX TSC Offset
>
> where, Host TSC = Host MSR_IA32_TSC + Host MSR_IA32_TSC_ADJUST.
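
For illustration only (not code from the patchset), the computation above
in C, using a 128-bit intermediate so the 64x64 multiply cannot overflow;
the function name is hypothetical and the 48 fractional bits follow the
VMX TSC-multiplier format described above:

#include <linux/types.h>

static inline u64 scaled_guest_tsc(u64 host_tsc, u64 tsc_multiplier,
				   u64 tsc_offset)
{
	/* 64-bit builds with gcc provide unsigned __int128. */
	return (u64)(((unsigned __int128)host_tsc * tsc_multiplier) >> 48)
		+ tsc_offset;
}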
>
> This patchset, when cooperating with another QEMU patchset (sent in
> another email "target-i386: save/restore vcpu's TSC rate during
> migration"), allows guest programs observe a consistent TSC rate even
> though they are migrated among machines with different host TSC rates.
>
> VMX TSC scaling shares some common logic with SVM TSC scaling, which
> is already supported by KVM. Patches 1 ~ 8 move that common logic from
> the SVM code to the common code. On top of them, patches 9 ~ 12 add
> VMX-specific support for VMX TSC scaling.

reviewed-by: Eric Northup 

>
> Haozhong Zhang (12):
>   KVM: x86: Collect information for setting TSC scaling ratio
>   KVM: x86: Add a common TSC scaling ratio field in kvm_vcpu_arch
>   KVM: x86: Add a common TSC scaling function
>   KVM: x86: Replace call-back set_tsc_khz() with a common function
>   KVM: x86: Replace call-back compute_tsc_offset() with a common function
>   KVM: x86: Move TSC scaling logic out of call-back adjust_tsc_offset()
>   KVM: x86: Move TSC scaling logic out of call-back read_l1_tsc()
>   KVM: x86: Use the correct vcpu's TSC rate to compute time scale
>   KVM: VMX: Enable and initialize VMX TSC scaling
>   KVM: VMX: Setup TSC scaling ratio when a vcpu is loaded
>   KVM: VMX: Use a scaled host TSC for guest readings of MSR_IA32_TSC
>   KVM: VMX: Dump TSC multiplier in dump_vmcs()
>
>  arch/x86/include/asm/kvm_host.h |  24 +++
>  arch/x86/include/asm/vmx.h  |   4 +-
>  arch/x86/kvm/lapic.c|   5 +-
>  arch/x86/kvm/svm.c  | 113 +++--
>  arch/x86/kvm/vmx.c  |  60 
>  arch/x86/kvm/x86.c  | 154 
> +---
>  include/linux/kvm_host.h|  21 +-
>  7 files changed, 221 insertions(+), 160 deletions(-)
>
> --
> 2.4.8
>


Re: [PATCH 01/12] KVM: x86: Collect information for setting TSC scaling ratio

2015-09-28 Thread Eric Northup
On Sun, Sep 27, 2015 at 10:38 PM, Haozhong Zhang
 wrote:
>
> The number of bits of the fractional part of the 64-bit TSC scaling
> ratio in VMX and SVM is different. This patch makes the architecture
> code collect the number of fractional bits and other related
> information into variables that can be accessed in the common code.
>
> Signed-off-by: Haozhong Zhang 
> ---
>  arch/x86/include/asm/kvm_host.h | 8 
>  arch/x86/kvm/svm.c  | 5 +
>  arch/x86/kvm/x86.c  | 8 
>  3 files changed, 21 insertions(+)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 2beee03..5b9b86e 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -965,6 +965,14 @@ extern bool kvm_has_tsc_control;
>  extern u32  kvm_min_guest_tsc_khz;
>  /* maximum supported tsc_khz for guests */
>  extern u32  kvm_max_guest_tsc_khz;
> +/* number of bits of the fractional part of the TSC scaling ratio */
> +extern u8   kvm_tsc_scaling_ratio_frac_bits;
> +/* reserved bits of TSC scaling ratio (SBZ) */
> +extern u64  kvm_tsc_scaling_ratio_rsvd;
> +/* default TSC scaling ratio (= 1.0) */
> +extern u64  kvm_default_tsc_scaling_ratio;
> +/* maximum allowed value of TSC scaling ratio */
> +extern u64  kvm_max_tsc_scaling_ratio;

Do we need all 3 of kvm_max_guest_tsc_khz, kvm_max_tsc_scaling_ratio,
and kvm_tsc_scaling_ratio_rsvd?  Only SVM has reserved bits - and they
are used just for complaining if the high bits are set, which can
already be expressed by kvm_max_tsc_scaling_ratio.

kvm_max_tsc_scaling_ratio seems to be write-only.

>
>  enum emulation_result {
> EMULATE_DONE, /* no further processing */
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 94b7d15..eff7db7 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -963,7 +963,12 @@ static __init int svm_hardware_setup(void)
> max = min(0x7fffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX));
>
> kvm_max_guest_tsc_khz = max;
> +
> +   kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
> +   kvm_tsc_scaling_ratio_frac_bits = 32;
> +   kvm_tsc_scaling_ratio_rsvd = TSC_RATIO_RSVD;
> }
> +   kvm_default_tsc_scaling_ratio = TSC_RATIO_DEFAULT;
>
> if (nested) {
> printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 991466b..f888225 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -106,6 +106,14 @@ bool kvm_has_tsc_control;
>  EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
>  u32  kvm_max_guest_tsc_khz;
>  EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
> +u8   kvm_tsc_scaling_ratio_frac_bits;
> +EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
> +u64  kvm_tsc_scaling_ratio_rsvd;
> +EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_rsvd);
> +u64  kvm_default_tsc_scaling_ratio;
> +EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
> +u64  kvm_max_tsc_scaling_ratio;
> +EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
>
>  /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold 
> */
>  static u32 tsc_tolerance_ppm = 250;
> --
> 2.4.8
>


Re: [PATCH] vhost: support upto 509 memory regions

2015-02-17 Thread Eric Northup
On Tue, Feb 17, 2015 at 4:32 AM, Michael S. Tsirkin  wrote:
> On Tue, Feb 17, 2015 at 11:59:48AM +0100, Paolo Bonzini wrote:
>>
>>
>> On 17/02/2015 10:02, Michael S. Tsirkin wrote:
>> > > Increasing VHOST_MEMORY_MAX_NREGIONS from 65 to 509
>> > > to match KVM_USER_MEM_SLOTS fixes issue for vhost-net.
>> > >
>> > > Signed-off-by: Igor Mammedov 
>> >
> This scares me a bit: each region is 32 bytes, so we are talking about
> a 16K allocation that userspace can trigger.
>>
>> What's bad with a 16K allocation?
>
> It fails when memory is fragmented.
>
>> > How does kvm handle this issue?
>>
>> It doesn't.
>>
>> Paolo
>
> I'm guessing kvm doesn't do memory scans on the data path,
> vhost does.
>
> qemu is just doing things that the kernel didn't expect it to need.
>
> Instead, I suggest reducing number of GPA<->HVA mappings:
>
> you have GPA 1,5,7
> map them at HVA 11,15,17
> then you can have 1 slot: 1->11
>
> To avoid libc reusing the memory holes, reserve them with MAP_NORESERVE
> or something like this.

This works beautifully when host virtual address bits are more
plentiful than guest physical address bits.  Not all architectures
have that property, though.
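
A rough userspace sketch of the reservation trick quoted above, assuming a
VMM that lays guest RAM out 1:1 inside one big reserved window; the helper
names, sizes and layout are hypothetical:

#include <stddef.h>
#include <sys/mman.h>

/* Reserve one large HVA window up front so libc cannot reuse the holes;
 * PROT_NONE + MAP_NORESERVE means no memory is committed yet. */
static void *reserve_guest_window(size_t bytes)
{
	return mmap(NULL, bytes, PROT_NONE,
		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
}

/* Back one guest-RAM chunk at its GPA offset inside the window, so
 * GPA -> HVA stays a single linear mapping (one memslot). */
static void *map_ram_chunk(void *window, size_t gpa_off, size_t len)
{
	return mmap((char *)window + gpa_off, len, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
}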

> We can discuss smarter lookup algorithms but I'd rather
> userspace didn't do things that we then have to
> work around in kernel.
>
>
> --
> MST


Re: [PATCH RFC] virtio_pci: fix virtio spec compliance on restore

2014-09-23 Thread Eric Northup
On Tue, Sep 23, 2014 at 3:32 AM, Michael S. Tsirkin  wrote:
> On restore, virtio pci does the following:
> + set features
> + init vqs etc - device can be used at this point!
> + set ACKNOWLEDGE,DRIVER and DRIVER_OK status bits
>
> This is in violation of the virtio spec, which
> requires the following order:
> - ACKNOWLEDGE
> - DRIVER
> - init vqs
> - DRIVER_OK
>
> Cc: sta...@vger.kernel.org
> Cc: Amit Shah 
> Signed-off-by: Michael S. Tsirkin 
> ---
>
> Lightly tested.
> Will repost as non-RFC once testing is done, sending
> out now for early flames/comments.
>
> Thanks!
>
>  drivers/virtio/virtio_pci.c | 36 
>  1 file changed, 32 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
> index 58f7e45..58cbf6e 100644
> --- a/drivers/virtio/virtio_pci.c
> +++ b/drivers/virtio/virtio_pci.c
> @@ -785,6 +785,7 @@ static int virtio_pci_restore(struct device *dev)
> struct pci_dev *pci_dev = to_pci_dev(dev);
> struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev);
> struct virtio_driver *drv;
> +   unsigned status = 0;
> int ret;
>
> drv = container_of(vp_dev->vdev.dev.driver,
> @@ -795,14 +796,41 @@ static int virtio_pci_restore(struct device *dev)
> return ret;
>
> pci_set_master(pci_dev);
> +   /* We always start by resetting the device, in case a previous
> +* driver messed it up. */
> +   vp_reset(&vp_dev->vdev);

This should happen before enabling PCI bus mastering.

> +
> +   /* Acknowledge that we've seen the device. */
> +   status |= VIRTIO_CONFIG_S_ACKNOWLEDGE;
> +   vp_set_status(&vp_dev->vdev, status);
> +
> +   /* Maybe driver failed before freeze.
> +* Restore the failed status, for debugging. */
> +   status |= vp_dev->saved_status & VIRTIO_CONFIG_S_FAILED;
> +   vp_set_status(&vp_dev->vdev, status);
> +
> +   if (!drv)
> +   return 0;
> +
> +   /* We have a driver! */
> +   status |= VIRTIO_CONFIG_S_DRIVER;
> +   vp_set_status(&vp_dev->vdev, status);
> +
> vp_finalize_features(&vp_dev->vdev);
>
> -   if (drv && drv->restore)
> -   ret = drv->restore(&vp_dev->vdev);
> +   if (!drv->restore)
> +   return 0;
> +
> +   ret = drv->restore(&vp_dev->vdev);
> +   if (ret) {
> +   status |= VIRTIO_CONFIG_S_FAILED;
> +   vp_set_status(&vp_dev->vdev, status);
> +   return ret;
> +   }
>
> /* Finally, tell the device we're all set */
> -   if (!ret)
> -   vp_set_status(&vp_dev->vdev, vp_dev->saved_status);
> +   status |= VIRTIO_CONFIG_S_DRIVER_OK;
> +   vp_set_status(&vp_dev->vdev, status);
>
> return ret;
>  }
> --
> MST


Re: [PATCH] KVM: x86 emulator: emulate MOVNTDQ

2014-07-11 Thread Eric Northup
On Fri, Jul 11, 2014 at 10:56 AM, Alex Williamson
 wrote:
> Windows 8.1 guest with NVIDIA driver and GPU fails to boot with an
> emulation failure.  The KVM spew suggests the fault is due to the lack of
> movntdq emulation (courtesy of Paolo):
>
> Code=02 00 00 b8 08 00 00 00 f3 0f 6f 44 0a f0 f3 0f 6f 4c 0a e0 <66> 0f e7 
> 41 f0 66 0f e7 49 e0 48 83 e9 40 f3 0f 6f 44 0a 10 f3 0f 6f 0c 0a 66 0f e7 41 
> 10
>
> $ as -o a.out
> .section .text
> .byte 0x66, 0x0f, 0xe7, 0x41, 0xf0
> .byte 0x66, 0x0f, 0xe7, 0x49, 0xe0
> $ objdump -d a.out
> 0:  66 0f e7 41 f0  movntdq %xmm0,-0x10(%rcx)
> 5:  66 0f e7 49 e0  movntdq %xmm1,-0x20(%rcx)
>
> Add the necessary emulation.
>
> Signed-off-by: Alex Williamson 
> Cc: Paolo Bonzini 
> ---
>
> Hope I got all the flags correct from copying similar MOV ops, but it
> allows the guest to boot, so I suspect it's ok.
>
>  arch/x86/kvm/emulate.c |7 ++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
> index e4e833d..ae39f08 100644
> --- a/arch/x86/kvm/emulate.c
> +++ b/arch/x86/kvm/emulate.c
> @@ -3681,6 +3681,10 @@ static const struct gprefix pfx_0f_28_0f_29 = {
> I(Aligned, em_mov), I(Aligned, em_mov), N, N,
>  };
>
> +static const struct gprefix pfx_0f_e7 = {
> +   N, I(Sse, em_mov), N, N,

I think you need the 'Aligned' flag in here - from my reading of the
manual, this instruction will #GP if the memory operand isn't aligned.
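
For concreteness, the table entry with the suggested flag added would look
roughly like this (untested; the flag spelling follows the existing
pfx_0f_28_0f_29 entry in emulate.c):

static const struct gprefix pfx_0f_e7 = {
	N, I(Sse | Aligned, em_mov), N, N,
};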

> +};
> +
>  static const struct escape escape_d9 = { {
> N, N, N, N, N, N, N, I(DstMem, em_fnstcw),
>  }, {
> @@ -3951,7 +3955,8 @@ static const struct opcode twobyte_table[256] = {
> /* 0xD0 - 0xDF */
> N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
> /* 0xE0 - 0xEF */
> -   N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
> +   N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_e7),
> +   N, N, N, N, N, N, N, N,
> /* 0xF0 - 0xFF */
> N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
>  };
>


Re: [PATCH 3/3] KVM: x86: correct mwait and monitor emulation

2014-06-18 Thread Eric Northup
Quoting Gabriel's post http://www.spinics.net/lists/kvm/msg103792.html :

[...]

> E.g., OS X 10.5 *does* check CPUID, and panics if it doesn't find it.
> It needs the MONITOR cpuid flag to be on, *and* the actual
> instructions to work.




On Wed, Jun 18, 2014 at 11:23 AM, Nadav Amit  wrote:
> On 6/18/14, 8:59 PM, Eric Northup wrote:
>>
>> On Wed, Jun 18, 2014 at 7:19 AM, Nadav Amit 
>> wrote:
>>>
>>> mwait and monitor are currently handled as nop. Considering this
>>> behavior, they
>>> should still be handled correctly, i.e., check execution conditions and
>>> generate
>>> exceptions when required. mwait and monitor may also be executed in
>>> real-mode
>>> and are not handled in that case.  This patch performs the emulation of
>>> monitor-mwait according to Intel SDM (other than checking whether
>>> interrupt can
>>> be used as a break event).
>>>
>>> Signed-off-by: Nadav Amit 
>>> ---
>>>   arch/x86/kvm/emulate.c | 41 +++--
>>>   arch/x86/kvm/svm.c | 22 ++
>>>   arch/x86/kvm/vmx.c | 27 +++
>>>   3 files changed, 52 insertions(+), 38 deletions(-)
>>>
>>> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
>>> index ef7a5a0..424b58d 100644
>>> --- a/arch/x86/kvm/emulate.c
>>> +++ b/arch/x86/kvm/emulate.c
>>> @@ -3344,6 +3344,43 @@ static int em_bswap(struct x86_emulate_ctxt *ctxt)
>>>  return X86EMUL_CONTINUE;
>>>   }
>>>
>>> +static int em_monitor(struct x86_emulate_ctxt *ctxt)
>>> +{
>>> +   int rc;
>>> +   struct segmented_address addr;
>>> +   u64 rcx = reg_read(ctxt, VCPU_REGS_RCX);
>>> +   u64 rax = reg_read(ctxt, VCPU_REGS_RAX);
>>> +   u8 byte;
>>
>>
>> I'd request:
>>
>> u32 ebx, ecx, edx, eax = 1;
>> ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
>> if (!(ecx & FFL(MWAIT)))
>>  return emulate_ud(ctxt);
>>
>> and also in em_mwait.
>>
>
> I had a similar implementation in a previous version, which also checked on
> mwait whether "interrupt as break event" matches the ECX value. However, I was
> under the impression that it was decided that MWAIT will always be emulated
> as NOP to avoid misbehaving VMs that ignore CPUID (see the discussion at
> http://www.spinics.net/lists/kvm/msg102766.html ).
>
> Nadav


Re: [PATCH 3/3] KVM: x86: correct mwait and monitor emulation

2014-06-18 Thread Eric Northup
On Wed, Jun 18, 2014 at 7:19 AM, Nadav Amit  wrote:
> mwait and monitor are currently handled as nop. Considering this behavior, 
> they
> should still be handled correctly, i.e., check execution conditions and 
> generate
> exceptions when required. mwait and monitor may also be executed in real-mode
> and are not handled in that case.  This patch performs the emulation of
> monitor-mwait according to Intel SDM (other than checking whether interrupt 
> can
> be used as a break event).
>
> Signed-off-by: Nadav Amit 
> ---
>  arch/x86/kvm/emulate.c | 41 +++--
>  arch/x86/kvm/svm.c | 22 ++
>  arch/x86/kvm/vmx.c | 27 +++
>  3 files changed, 52 insertions(+), 38 deletions(-)
>
> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
> index ef7a5a0..424b58d 100644
> --- a/arch/x86/kvm/emulate.c
> +++ b/arch/x86/kvm/emulate.c
> @@ -3344,6 +3344,43 @@ static int em_bswap(struct x86_emulate_ctxt *ctxt)
> return X86EMUL_CONTINUE;
>  }
>
> +static int em_monitor(struct x86_emulate_ctxt *ctxt)
> +{
> +   int rc;
> +   struct segmented_address addr;
> +   u64 rcx = reg_read(ctxt, VCPU_REGS_RCX);
> +   u64 rax = reg_read(ctxt, VCPU_REGS_RAX);
> +   u8 byte;

I'd request:

u32 ebx, ecx, edx, eax = 1;
ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
if (!(ecx & FFL(MWAIT)))
return emulate_ud(ctxt);

and also in em_mwait.

> +
> +   if (ctxt->mode != X86EMUL_MODE_PROT64)
> +   rcx = (u32)rcx;
> +   if (rcx != 0)
> +   return emulate_gp(ctxt, 0);
> +
> +   addr.seg = seg_override(ctxt);
> +   addr.ea = ctxt->ad_bytes == 8 ? rax : (u32)rax;
> +   rc = segmented_read(ctxt, addr, &byte, 1);
> +   if (rc != X86EMUL_CONTINUE)
> +   return rc;
> +
> +   printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as 
> NOP!\n");
> +   return X86EMUL_CONTINUE;
> +}
> +
> +static int em_mwait(struct x86_emulate_ctxt *ctxt)
> +{
> +   u64 rcx = reg_read(ctxt, VCPU_REGS_RCX);
> +
> +   if (ctxt->mode != X86EMUL_MODE_PROT64)
> +   rcx = (u32)rcx;
> +   if ((rcx & ~1UL) != 0)
> +   return emulate_gp(ctxt, 0);
> +
> +   /* Accepting interrupt as break event regardless to cpuid */
> +   printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
> +   return X86EMUL_CONTINUE;
> +}
> +
>  static bool valid_cr(int nr)
>  {
> switch (nr) {
> @@ -3557,8 +3594,8 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
> F2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
>
>  static const struct opcode group7_rm1[] = {
> -   DI(SrcNone | Priv, monitor),
> -   DI(SrcNone | Priv, mwait),
> +   II(SrcNone | Priv | NoBigReal | UDOnPriv, em_monitor, monitor),
> +   II(SrcNone | Priv | NoBigReal | UDOnPriv, em_mwait, mwait),
> N, N, N, N, N, N,
>  };
>
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index ec8366c..a524e04 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -3274,24 +3274,6 @@ static int pause_interception(struct vcpu_svm *svm)
> return 1;
>  }
>
> -static int nop_interception(struct vcpu_svm *svm)
> -{
> -   skip_emulated_instruction(&(svm->vcpu));
> -   return 1;
> -}
> -
> -static int monitor_interception(struct vcpu_svm *svm)
> -{
> -   printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as 
> NOP!\n");
> -   return nop_interception(svm);
> -}
> -
> -static int mwait_interception(struct vcpu_svm *svm)
> -{
> -   printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
> -   return nop_interception(svm);
> -}
> -
>  static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
> [SVM_EXIT_READ_CR0] = cr_interception,
> [SVM_EXIT_READ_CR3] = cr_interception,
> @@ -3349,8 +3331,8 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm 
> *svm) = {
> [SVM_EXIT_CLGI] = clgi_interception,
> [SVM_EXIT_SKINIT]   = skinit_interception,
> [SVM_EXIT_WBINVD]   = emulate_on_interception,
> -   [SVM_EXIT_MONITOR]  = monitor_interception,
> -   [SVM_EXIT_MWAIT]= mwait_interception,
> +   [SVM_EXIT_MONITOR]  = emulate_on_interception,
> +   [SVM_EXIT_MWAIT]= emulate_on_interception,
> [SVM_EXIT_XSETBV]   = xsetbv_interception,
> [SVM_EXIT_NPF]  = pf_interception,
>  };
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 801332e..7023e71 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -5672,22 +5672,17 @@ static int handle_pause(struct kvm_vcpu *vcpu)
> return 1;
>  }
>
> -static int handle_nop(struct kvm_vcpu *vcpu)
> +static 

Re: [PATCH 4/4] kvm: Implement PEBS virtualization

2014-06-02 Thread Eric Northup
On Thu, May 29, 2014 at 6:12 PM, Andi Kleen  wrote:
> From: Andi Kleen 
>
> PEBS (Precise Event Based Sampling) profiling is very powerful,
> allowing improved sampling precision and much additional information,
> like address or TSX abort profiling. cycles:p and :pp uses PEBS.
>
> This patch enables PEBS profiling in KVM guests.
>
> PEBS writes profiling records to a virtual address in memory. Since
> the guest controls the virtual address space, the PEBS record
> is delivered directly to the guest buffer. We set up the PEBS state
> so that it works correctly. The CPU cannot handle any kind of fault
> during these guest writes.
>
> To avoid any problems with guest pages being swapped by the host we
> pin the pages when the PEBS buffer is set up, by intercepting
> that MSR.
>
> Typically profilers only set up a single page, so pinning that is not
> a big problem. The pinning is limited to 17 pages currently (64K+1)
>
> In theory the guest can change its own page tables after the PEBS
> setup. The host has no way to track that with EPT. But if a guest
> would do that it could only crash itself. It's not expected
> that normal profilers do that.
>
> The patch also adds the basic glue to enable the PEBS CPUIDs
> and other PEBS MSRs, and ask perf to enable PEBS as needed.
>
> Due to various limitations it currently only works on Silvermont
> based systems.
>
> This patch doesn't implement the extended MSRs some CPUs support.
> For example latency profiling on SLM will not work at this point.
>
> Timing:
>
> The emulation is somewhat more expensive than a real PMU. This
> may trigger the expensive PMI detection in the guest.
> Usually this can be disabled with
> echo 0 > /proc/sys/kernel/perf_cpu_time_max_percent
>
> Migration:
>
> In theory it should be possible (as long as we migrate to
> a host with the same PEBS event and the same PEBS format), but I'm not
> sure the basic KVM PMU code supports it correctly: no code to
> save/restore state, unless I'm missing something. Once the PMU
> code grows proper migration support it should be straight forward
> to handle the PEBS state too.
>
> Signed-off-by: Andi Kleen 
> ---
>  arch/x86/include/asm/kvm_host.h   |   6 ++
>  arch/x86/include/uapi/asm/msr-index.h |   4 +
>  arch/x86/kvm/cpuid.c  |  10 +-
>  arch/x86/kvm/pmu.c| 184 
> --
>  arch/x86/kvm/vmx.c|   6 ++
>  5 files changed, 196 insertions(+), 14 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 7de069af..d87cb66 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -319,6 +319,8 @@ struct kvm_pmc {
> struct kvm_vcpu *vcpu;
>  };
>
> +#define MAX_PINNED_PAGES 17 /* 64k buffer + ds */
> +
>  struct kvm_pmu {
> unsigned nr_arch_gp_counters;
> unsigned nr_arch_fixed_counters;
> @@ -335,6 +337,10 @@ struct kvm_pmu {
> struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
> struct irq_work irq_work;
> u64 reprogram_pmi;
> +   u64 pebs_enable;
> +   u64 ds_area;
> +   struct page *pinned_pages[MAX_PINNED_PAGES];
> +   unsigned num_pinned_pages;
>  };
>
>  enum {
> diff --git a/arch/x86/include/uapi/asm/msr-index.h 
> b/arch/x86/include/uapi/asm/msr-index.h
> index fcf2b3a..409a582 100644
> --- a/arch/x86/include/uapi/asm/msr-index.h
> +++ b/arch/x86/include/uapi/asm/msr-index.h
> @@ -72,6 +72,10 @@
>  #define MSR_IA32_PEBS_ENABLE   0x03f1
>  #define MSR_IA32_DS_AREA   0x0600
>  #define MSR_IA32_PERF_CAPABILITIES 0x0345
> +#define PERF_CAP_PEBS_TRAP (1U << 6)
> +#define PERF_CAP_ARCH_REG  (1U << 7)
> +#define PERF_CAP_PEBS_FORMAT   (0xf << 8)
> +
>  #define MSR_PEBS_LD_LAT_THRESHOLD  0x03f6
>
>  #define MSR_MTRRfix64K_0   0x0250
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index f47a104..c8cc76b 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -260,6 +260,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
> *entry, u32 function,
> unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
> unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 
> 0;
> unsigned f_mpx = kvm_x86_ops->mpx_supported() ? F(MPX) : 0;
> +   bool pebs = perf_pebs_virtualization();
> +   unsigned f_ds = pebs ? F(DS) : 0;
> +   unsigned f_pdcm = pebs ? F(PDCM) : 0;
> +   unsigned f_dtes64 = pebs ? F(DTES64) : 0;
>
> /* cpuid 1.edx */
> const u32 kvm_supported_word0_x86_features =
> @@ -268,7 +272,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
> *entry, u32 function,
> F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
> F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
> F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) |
> -   

Re: [PATCH V12 3/5] kvm : Fold pv_unhalt flag into GET_MP_STATE ioctl to aid migration

2013-08-06 Thread Eric Northup
On Tue, Aug 6, 2013 at 11:23 AM, Raghavendra K T
 wrote:
> kvm : Fold pv_unhalt flag into GET_MP_STATE ioctl to aid migration
>
> From: Raghavendra K T 
>
> During migration, any vcpu that got kicked but did not become runnable
> (still in halted state) should be runnable after migration.

If this is about migration correctness, could it get folded into the
previous patch 2/5, so that there's not a broken commit which could
hurt bisection?

>
> Signed-off-by: Raghavendra K T 
> Acked-by: Gleb Natapov 
> Acked-by: Ingo Molnar 
> ---
>  arch/x86/kvm/x86.c |7 ++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index dae4575..1e73dab 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -6284,7 +6284,12 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu 
> *vcpu,
> struct kvm_mp_state *mp_state)
>  {
> kvm_apic_accept_events(vcpu);
> -   mp_state->mp_state = vcpu->arch.mp_state;
> +   if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
> +   vcpu->arch.pv.pv_unhalted)
> +   mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
> +   else
> +   mp_state->mp_state = vcpu->arch.mp_state;
> +
> return 0;
>  }
>
>


Re: [PATCH 6/6] x86: kaslr: relocate base offset at boot

2013-04-15 Thread Eric Northup
With the relocation information, we can pick the virtual address to load
at independently of the physical load address.

On Mon, Apr 15, 2013 at 2:41 PM, Kees Cook  wrote:
> On Mon, Apr 15, 2013 at 2:25 PM, H. Peter Anvin  wrote:
>> On 04/15/2013 02:06 PM, Eric Northup wrote:
>>> On Sat, Apr 13, 2013 at 8:06 PM, H. Peter Anvin  wrote:
>>>> On 04/13/2013 05:37 PM, Yinghai Lu wrote:
>>>>>
>>>>> so decompress code position is changed?
>>>>>
>>>>> You may push out bss and other data area of run-time kernel of limit
>>>>> that boot loader
>>>>> chose according to setup_header.init_size.
>>>>> aka that make those area overlap with ram hole or other area like
>>>>> boot command line or initrd
>>>>>
>>>>
>>>> Is there a strong reason to randomize the physical address on 64 bits
>>>> (and if so, shouldn't we do it right?)
>>>
>>> The reason to randomize the physical address is because of the kernel
>>> direct mapping range -- a predictable-to-attackers physical address
>>> implies a predictable-to-attackers virtual address.
>>>
>>> It had seemed to me like changing the virtual base of the direct
>>> mapping would be much more involved than physically relocating the
>>> kernel, but better suggestions would be most welcome :-)
>>>
>>
>> You seem to be missing something here...
>>
>> There are *two* mappings in 64-bit mode.  Physically, if you're going to
>> randomize you might as well randomize over the entire range... except
>> not too far down (on either 32 or 64 bit mode)... in particular, you
>> don't want to drop below 16 MiB if you can avoid it.
>>
>> On 64 bits, there is no reason the virtual address has to be randomized
>> the same way.
>
> Aren't we bound by the negative 2GB addressing due to -mcmodel=kernel?
>
> -Kees
>
> --
> Kees Cook
> Chrome OS Security


Re: [PATCH 6/6] x86: kaslr: relocate base offset at boot

2013-04-15 Thread Eric Northup
On Sat, Apr 13, 2013 at 8:06 PM, H. Peter Anvin  wrote:
> On 04/13/2013 05:37 PM, Yinghai Lu wrote:
>>
>> so decompress code position is changed?
>>
>> You may push out bss and other data area of run-time kernel of limit
>> that boot loader
>> chose according to setup_header.init_size.
>> aka that make those area overlap with ram hole or other area like
>> boot command line or initrd
>>
>
> Is there a strong reason to randomize the physical address on 64 bits
> (and if so, shouldn't we do it right?)

The reason to randomize the physical address is because of the kernel
direct mapping range -- a predictable-to-attackers physical address
implies a predictable-to-attackers virtual address.

It had seemed to me like changing the virtual base of the direct
mapping would be much more involved than physically relocating the
kernel, but better suggestions would be most welcome :-)


Re: [PATCH] x86: make IDT read-only

2013-04-10 Thread Eric Northup
On Wed, Apr 10, 2013 at 3:40 AM, Eric W. Biederman
 wrote:
> Ingo Molnar  writes:
>
>> * Eric W. Biederman  wrote:
>>
>>> "H. Peter Anvin"  writes:
>>>
>>> > On 04/08/2013 03:43 PM, Kees Cook wrote:
>>> >> This makes the IDT unconditionally read-only. This primarily removes
>>> >> the IDT from being a target for arbitrary memory write attacks. It has
>>> >> an added benefit of also not leaking (via the "sidt" instruction) the
>>> >> kernel base offset, if it has been relocated.
>>> >>
>>> >> Signed-off-by: Kees Cook 
>>> >> Cc: Eric Northup 
>>> >
>>> > Also, tglx: does this interfere with your per-cpu IDT efforts?
>>>
>>> Given that we don't change any IDT entries why would anyone want a
>>> per-cpu IDT?  The cache lines should easily be shared across all
>>> processors.
>>
>> That's true iff they are cached.
>>
>> If not then it's a remote DRAM access cache miss for all CPUs except the 
>> node that
>> holds that memory.
>>
>>> Or are there some giant NUMA machines that trigger cache misses when 
>>> accessing
>>> the IDT and the penalty for pulling the cache line across the NUMA fabric is
>>> prohibitive?
>>
>> IDT accesses for pure userspace execution are pretty rare. So we are not just
>> talking about huge NUMA machines here but about ordinary NUMA machines 
>> taking a
>> remote cache miss hit for the first IRQ or other IDT-accessing operation 
>> they do
>> after some cache-intense user-space processing.
>>
>> It's a small effect, but it exists and improving it would be
>> legitimate.
>
> If the effect is measurable I agree it is a legitimate optimization.  At
> one point there was a suggestion to make the code in the IDT vectors
> differ based on which interrupt was registered.  While that can also
> reduce cache misses, it can get hairy very quickly, and of course that
> would require read-write IDTs.

read-write IDT or GDT are fine: map them twice, once read+write, once
read-only.  Point the GDTR and IDTR at the read-only alias.
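
A minimal sketch of that double-mapping idea for the IDT, assuming a spare
fixmap slot named FIX_RO_IDT (that slot is an assumption, not something in
the tree); the writable idt_table stays where it is and only the descriptor
base changes:

#include <asm/desc.h>
#include <asm/fixmap.h>
#include <asm/pgtable_types.h>

static void __init map_idt_readonly(void)
{
	/* Map the existing idt_table a second time, read-only, and point
	 * IDTR at that alias; kernel writes keep going through the
	 * original writable mapping. */
	__set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
	idt_descr.address = fix_to_virt(FIX_RO_IDT);
	load_idt(&idt_descr);
}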

>
> My only practical concern with duplicating the IDT tables per cpu is that
> (a) there are generic idt handlers that remain unduplicated, reducing the
> benefit, and (b) this is essentially the same optimization as making the
> entire kernel text per cpu, which, last time it was examined, was not an
> optimization worth making.  So I wonder if just a subset of the
> optimization is worth making.
>
> Eric


Re: [kernel-hardening] Re: [PATCH] x86: make IDT read-only

2013-04-09 Thread Eric Northup
On Tue, Apr 9, 2013 at 11:46 AM, Kees Cook  wrote:
> On Tue, Apr 9, 2013 at 11:39 AM, H. Peter Anvin  wrote:
>> On 04/09/2013 11:31 AM, Kees Cook wrote:
> ...
> 0x880001e0-0x88001fe0 480M RW PSE GLB 
> NX pmd
>

 That is the 1:1 memory map area...
>>>
>>> Meaning what?
>>>
>>> -Kees
>>>
>>
>> That's the area in which we just map 1:1 to memory.  Anything allocated
>> with e.g. kmalloc() ends up with those addresses.
>
> Ah-ha! Yes, I see now when comparing the debug/kernel_page_tables
> reports. It's just the High Kernel Mapping that we care about.
> Addresses outside that range are less of a leak. Excellent, then GDT
> may not be a problem. Whew.

The GDT is a problem if the address returned by 'sgdt' is
kernel-writable - it doesn't necessarily reveal the random offset, but
I'm pretty sure that writing to the GDT could cause privilege
escalation.

>
> Does the v2 IDT patch look okay, BTW?
>
> -Kees
>
> --
> Kees Cook
> Chrome OS Security


Re: [PATCH 3/3] x86: kernel base offset ASLR

2013-04-04 Thread Eric Northup
On Thu, Apr 4, 2013 at 2:01 PM, H. Peter Anvin  wrote:
> What system monitoring?  Most systems don't have much...

The security of an unmonitored system is going to be much lower than
that of a well-monitored system.  That's true independent of whether kASLR
is deployed.

>
> Kees Cook  wrote:
>
>>On Thu, Apr 4, 2013 at 1:58 PM, H. Peter Anvin  wrote:
>>> It seems to me that you are assuming that the attacker is targeting a
>>specific system, but a bot might as well target 256 different systems
>>and see what sticks...
>>
>>Certainly, but system monitoring will show 255 crashed machines, which
>>is a huge blip on any radar. :)
>>
>>-Kees
>>
>>>
>>> Kees Cook  wrote:
>>>
On Thu, Apr 4, 2013 at 1:12 PM, H. Peter Anvin  wrote:
> On 04/04/2013 01:07 PM, Kees Cook wrote:
>> However, the benefits of
>> this feature in certain environments exceed the perceived
weaknesses[2].
>
> Could you clarify?

I would summarize the discussion of KASLR weaknesses into two
general observations:
1- it depends on address location secrecy and leaks are common/easy.
2- it has low entropy so attack success rates may be high.

For "1", as Julien mentions, remote attacks and attacks from a
significantly contained process (via seccomp-bpf) minimizes the leak
exposure. For local attacks, cache timing attacks and other things
also exist, but the ASLR can be improved to defend against that too.
So, KASLR is useful on systems that are virtualization hosts,
providing remote services, or running locally confined processes.

For "2", I think that the comparison to userspace ASLR entropy isn't
as direct. For userspace, most systems don't tend to have any kind of
watchdog on segfaulting processes, so a remote attacker could just
keep trying an attack until they got lucky, in which case low entropy
is a serious problem. In the case of KASLR, a single attack failure
means the system goes down, which makes mounting an attack much more
difficult. I think 8 bits is fine to start with, and I think starting
with a base offset ASLR is a good first step. We can improve things in
the future.

-Kees

--
Kees Cook
Chrome OS Security
>>>
>>> --
>>> Sent from my mobile phone. Please excuse brevity and lack of
>>formatting.
>>
>>
>>
>>--
>>Kees Cook
>>Chrome OS Security
>
> --
> Sent from my mobile phone. Please excuse brevity and lack of formatting.


Re: [PATCH 3/3] x86: kernel base offset ASLR

2013-04-04 Thread Eric Northup
On Thu, Apr 4, 2013 at 1:58 PM, H. Peter Anvin  wrote:
> It seems to me that you are assuming that the attacker is targeting a 
> specific system, but a bot might as well target 256 different systems and see 
> what sticks...

The alarm signal from the ones that don't stick is, in my opinion, the
primary benefit from this work -- it makes certain classes of attack
much less economical.  A crash dump from a panic'd machine may include
enough information to diagnose the exploited vulnerability - and once
diagnosed and fixed, knowledge about the vulnerability is much less
valuable.

>
> Kees Cook  wrote:
>
>>On Thu, Apr 4, 2013 at 1:12 PM, H. Peter Anvin  wrote:
>>> On 04/04/2013 01:07 PM, Kees Cook wrote:
 However, the benefits of
 this feature in certain environments exceed the perceived
>>weaknesses[2].
>>>
>>> Could you clarify?
>>
>>I would summarize the discussion of KASLR weaknesses into two
>>general observations:
>>1- it depends on address location secrecy and leaks are common/easy.
>>2- it has low entropy so attack success rates may be high.
>>
>>For "1", as Julien mentions, remote attacks and attacks from a
>>significantly contained process (via seccomp-bpf) minimizes the leak
>>exposure. For local attacks, cache timing attacks and other things
>>also exist, but the ASLR can be improved to defend against that too.
>>So, KASLR is useful on systems that are virtualization hosts,
>>providing remote services, or running locally confined processes.
>>
>>For "2", I think that the comparison to userspace ASLR entropy isn't
>>as direct. For userspace, most systems don't tend to have any kind of
>>watchdog on segfaulting processes, so a remote attacker could just
>>keep trying an attack until they got lucky, in which case low entropy
>>is a serious problem. In the case of KASLR, a single attack failure
>>means the system goes down, which makes mounting an attack much more
>>difficult. I think 8 bits is fine to start with, and I think starting
>>with a base offset ASLR is a good first step. We can improve things in
>>the future.
>>
>>-Kees
>>
>>--
>>Kees Cook
>>Chrome OS Security
>
> --
> Sent from my mobile phone. Please excuse brevity and lack of formatting.


Re: [PATCH 3/3] x86: kernel base offset ASLR

2013-04-04 Thread Eric Northup
On Thu, Apr 4, 2013 at 1:21 PM, H. Peter Anvin  wrote:
> I have to admit to being somewhat skeptical toward KASLR with only 8
> bits of randomness.

I agree that 8 bits is pretty low and more would be better.  However,
even 8 bits provides a < 1% chance that any particular guess will be
correct.  Combined with kernel crash monitoring, this amount of ASLR
means that brute-force attacks can't occur undetectably, even if they
can eventually be successful.  Having a signal that indicates an
attack-in-progress is a pretty big leg up over not having one.  Of course,
infoleaks would render this whole discussion moot, but I'm replying to
the "only 8 bits" part here.

> There are at least two potential ways of
> dramatically increasing the available randomness:
>
> 1. actually compose the kernel of multiple independently relocatable
>pieces (maybe chunk it on 2M boundaries or something.)

Without increasing the entropy bits, does this actually increase the #
of tries necessary for an attacker to guess correctly?  It
dramatically increases the number of possible configurations of kernel
address space, but for any given piece there are only 256 possible
locations.

> 2. compile the kernel as one of the memory models which can be executed
>anywhere in the 64-bit address space.  The cost of this would have
>to be quantified, of course.

I attempted to do this, but was limited by my knowledge of the
toolchain.  I would welcome help or suggestions!


Re: [PATCH 5/5] KVM: MMU: fast invalid all mmio sptes

2013-03-18 Thread Eric Northup
On Fri, Mar 15, 2013 at 8:29 AM, Xiao Guangrong
 wrote:
> This patch tries to introduce a very simple and scalable way to invalidate
> all mmio sptes - it does not need to walk any shadow pages or hold the
> mmu-lock.
>
> KVM maintains a global mmio invalid generation-number which is stored in
> kvm->arch.mmio_invalid_gen, and every mmio spte stores the current global
> generation-number in its available bits when it is created.
>
> When KVM needs to zap all mmio sptes, it simply increases the global
> generation-number. When a guest does an mmio access, KVM intercepts the
> MMIO #PF, then walks the shadow page table and gets the mmio spte. If the
> generation-number on the spte does not equal the global generation-number,
> it goes to the normal #PF handler to update the mmio spte.
>
> Since 19 bits are used to store the generation-number in the mmio spte, the
> generation-number can wrap around after 33554432 times. That is large enough
> for nearly all cases, but to make the code more robust we zap all
> shadow pages when the number wraps around.
>
> Signed-off-by: Xiao Guangrong 
> ---
>  arch/x86/include/asm/kvm_host.h |2 +
>  arch/x86/kvm/mmu.c  |   61 
> +--
>  arch/x86/kvm/mmutrace.h |   17 +++
>  arch/x86/kvm/paging_tmpl.h  |7 +++-
>  arch/x86/kvm/vmx.c  |4 ++
>  arch/x86/kvm/x86.c  |6 +--
>  6 files changed, 82 insertions(+), 15 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index ef7f4a5..572398e 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -529,6 +529,7 @@ struct kvm_arch {
> unsigned int n_requested_mmu_pages;
> unsigned int n_max_mmu_pages;
> unsigned int indirect_shadow_pages;
> +   unsigned int mmio_invalid_gen;

Could this get initialized to something close to the wrap-around
value, so that the wrap-around case gets more real-world coverage?

> struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
> /*
>  * Hash table of struct kvm_mmu_page.
> @@ -765,6 +766,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, 
> int slot);
>  void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
>  struct kvm_memory_slot *slot,
>  gfn_t gfn_offset, unsigned long mask);
> +void kvm_mmu_invalid_mmio_spte(struct kvm *kvm);
>  void kvm_mmu_zap_all(struct kvm *kvm);
>  unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
>  void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int 
> kvm_nr_mmu_pages);
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 13626f4..7093a92 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -234,12 +234,13 @@ static unsigned int get_mmio_spte_generation(u64 spte)
>  static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
>unsigned access)
>  {
> -   u64 mask = generation_mmio_spte_mask(0);
> +   unsigned int gen = ACCESS_ONCE(kvm->arch.mmio_invalid_gen);
> +   u64 mask = generation_mmio_spte_mask(gen);
>
> access &= ACC_WRITE_MASK | ACC_USER_MASK;
> mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT;
>
> -   trace_mark_mmio_spte(sptep, gfn, access, 0);
> +   trace_mark_mmio_spte(sptep, gfn, access, gen);
> mmu_spte_set(sptep, mask);
>  }
>
> @@ -269,6 +270,34 @@ static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, 
> gfn_t gfn,
> return false;
>  }
>
> +static bool check_mmio_spte(struct kvm *kvm, u64 spte)
> +{
> +   return get_mmio_spte_generation(spte) ==
> +   ACCESS_ONCE(kvm->arch.mmio_invalid_gen);
> +}
> +
> +/*
> + * The caller should protect concurrent access on
> + * kvm->arch.mmio_invalid_gen. Currently, it is used by
> + * kvm_arch_commit_memory_region and protected by kvm->slots_lock.
> + */
> +void kvm_mmu_invalid_mmio_spte(struct kvm *kvm)
> +{
> +   /* Ensure update memslot has been completed. */
> +   smp_mb();
> +
> +trace_kvm_mmu_invalid_mmio_spte(kvm);
> +
> +   /*
> +* The very rare case: if the generation-number is round,
> +* zap all shadow pages.
> +*/
> +   if (unlikely(kvm->arch.mmio_invalid_gen++ == MAX_GEN)) {
> +   kvm->arch.mmio_invalid_gen = 0;
> +   return kvm_mmu_zap_all(kvm);
> +   }
> +}
> +
>  static inline u64 rsvd_bits(int s, int e)
>  {
> return ((1ULL << (e - s + 1)) - 1) << s;
> @@ -3183,9 +3212,12 @@ static u64 walk_shadow_page_get_mmio_spte(struct 
> kvm_vcpu *vcpu, u64 addr)
>  }
>
>  /*
> - * If it is a real mmio page fault, return 1 and emulat the instruction
> - * directly, return 0 to let CPU fault again on the address, -1 is
> - * returned if bug is detected.
> + * Return value:
> + * 2: invalid spte is detected then let the real page fault path
> + *update the mmio spte.
> + * 1: it is a real mmio page fault, 

[PATCH] virtio_scsi: fix memory leak on full queue condition.

2012-11-08 Thread Eric Northup
virtscsi_queuecommand was leaking memory when the virtio queue was full.

Tested: Guest operates correctly even with very small queue sizes, validated
we're not leaking kmalloc-192 sized allocations anymore.

Signed-off-by: Eric Northup 
---
 drivers/scsi/virtio_scsi.c |2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 595af1a..dd8dc27 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -469,6 +469,8 @@ static int virtscsi_queuecommand(struct Scsi_Host *sh, 
struct scsi_cmnd *sc)
  sizeof cmd->req.cmd, sizeof cmd->resp.cmd,
  GFP_ATOMIC) >= 0)
ret = 0;
+   else
+   mempool_free(cmd, virtscsi_cmd_pool);
 
 out:
return ret;
-- 
1.7.7.3
