[PATCH] KVM: VMX: enable LBR virtualization

2015-10-10 Thread Jian Zhou
Using vmx msr store/load mechanism and msr intercept bitmap
to implement LBR virtualization.

Signed-off-by: Jian Zhou  
Signed-off-by: Stephen He 

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2beee03..244f68c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -887,6 +887,12 @@ struct kvm_x86_ops {
   gfn_t offset, unsigned long mask);
/* pmu operations of sub-arch */
const struct kvm_pmu_ops *pmu_ops;
+
+   void (*vmcs_write64)(unsigned long field, u64 value);
+   u64 (*vmcs_read64)(unsigned long field);
+
+   int (*add_atomic_switch_msr)(struct kvm_vcpu *vcpu, u32 msr, u64 
guest_val, u64 host_val);
+   void (*disable_intercept_guest_msr)(struct kvm_vcpu *vcpu, u32 msr);
 };

 struct kvm_arch_async_pf {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 06ef490..2305308 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -159,7 +159,7 @@ module_param(ple_window_max, int, S_IRUGO);

 extern const ulong vmx_return;

-#define NR_AUTOLOAD_MSRS 8
+#define NR_AUTOLOAD_MSRS 256
 #define VMCS02_POOL_SIZE 1

 struct vmcs {
@@ -1630,6 +1630,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, 
unsigned msr)
--m->nr;
m->guest[i] = m->guest[m->nr];
m->host[i] = m->host[m->nr];
+   vmcs_write32(VM_EXIT_MSR_STORE_COUNT, m->nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
 }
@@ -1645,7 +1646,7 @@ static void add_atomic_switch_msr_special(struct vcpu_vmx 
*vmx,
vm_exit_controls_setbit(vmx, exit);
 }

-static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
+static int add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
  u64 guest_val, u64 host_val)
 {
unsigned i;
@@ -1660,7 +1661,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, 
unsigned msr,
GUEST_IA32_EFER,
HOST_IA32_EFER,
guest_val, host_val);
-   return;
+   return 0;
}
break;
case MSR_CORE_PERF_GLOBAL_CTRL:
@@ -1671,7 +1672,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, 
unsigned msr,
GUEST_IA32_PERF_GLOBAL_CTRL,
HOST_IA32_PERF_GLOBAL_CTRL,
guest_val, host_val);
-   return;
+   return 0;
}
break;
}
@@ -1683,9 +1684,10 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, 
unsigned msr,
if (i == NR_AUTOLOAD_MSRS) {
printk_once(KERN_WARNING "Not enough msr switch entries. "
"Can't add msr %x\n", msr);
-   return;
+   return -ENOSPC;
} else if (i == m->nr) {
++m->nr;
+   vmcs_write32(VM_EXIT_MSR_STORE_COUNT, m->nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
}
@@ -1694,6 +1696,15 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, 
unsigned msr,
m->guest[i].value = guest_val;
m->host[i].index = msr;
m->host[i].value = host_val;
+
+   return 0;
+}
+
+static int vmx_add_atomic_switch_msr(struct kvm_vcpu *vcpu, u32 msr, u64 
guest_val, u64 host_val)
+{
+   struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+   return add_atomic_switch_msr(vmx, msr, guest_val, host_val);
 }

 static void reload_tss(void)
@@ -4332,6 +4343,20 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 
msr)
msr, MSR_TYPE_W);
 }

+static void vmx_disable_intercept_guest_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+   if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) {
+   vmx_disable_intercept_msr_read_x2apic(msr);
+   vmx_disable_intercept_msr_write_x2apic(msr);
+   }
+   else {
+   if (is_long_mode(vcpu))
+   vmx_disable_intercept_for_msr(msr, true);
+   else
+   vmx_disable_intercept_for_msr(msr, false);
+   }
+}
+
 static int vmx_vm_has_apicv(struct kvm *kvm)
 {
return enable_apicv && irqchip_in_kernel(kvm);
@@ -4654,6 +4679,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 #endif

vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+   vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autoload.guest));
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
@@ -10409,6 +10435,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
.enab

Good Day

2015-10-10 Thread mw7
I need your help for this transaction

Regards,
Mr. LEUNG Cheung
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 13/15] arm64: kvm: Rewrite fake pgd handling

2015-10-10 Thread Christoffer Dall
Hi Suzuki,

On Tue, Sep 15, 2015 at 04:41:22PM +0100, Suzuki K. Poulose wrote:
> From: "Suzuki K. Poulose" 
> 
> The existing fake pgd handling code assumes that the stage-2 entry
> level can only be one level down that of the host, which may not be
> true always(e.g, with the introduction of 16k pagesize).

I had to refresh my mind a fair bit to be able to review this, so I
thought it may be useful to just remind us all what the constraints of
this whole thing are, and make sure we agree on this:

1. We fix the IPA max width to 40 bits
2. We don't support systems with a PARange smaller than 40 bits (do we
   check this anywhere or document this anywhere?)
3. We always assume we are running on a system with PARange of 40 bits
   and we are therefore constrained to use concatenation.

As an implication of (3) above, this code will attempt to allocate 256K
of physically contiguous memory for each VM on the system.  That is
probably ok, but I just wanted to point it out in case it raises any
eyebrows for other people following this thread.

> 
> e.g.
> With 16k page size and 48bit VA and 40bit IPA we have the following
> split for page table levels:
> 
> level:  0   1 2 3
> bits : [47] [46 - 36] [35 - 25] [24 - 14] [13 - 0]
>  ^   ^ ^
>  |   | |
>host entry| x stage-2 entry
>  |
> IPA -x

Isn't the stage-2 entry using bits [39:25], because you resolve
more than 11 bits on the initial level of lookup when you concatenate
tables?

> 
> The stage-2 entry level is 2, due to the concatenation of 16tables
> at level 2(mandated by the hardware). So, we need to fake two levels
> to actually reach the hyp page table. This case cannot be handled
> with the existing code, as, all we know about is KVM_PREALLOC_LEVEL
> which kind of stands for two different pieces of information.
> 
> 1) Whether we have fake page table entry levels.
> 2) The entry level of stage-2 translation.
> 
> We lose the information about the number of fake levels that
> we may have to use. Also, KVM_PREALLOC_LEVEL computation itself
> is wrong, as we assume the hw entry level is always 1 level down
> from the host.
> 
> This patch introduces two separate indicators :
> 1) Accurate entry level for stage-2 translation - HYP_PGTABLE_ENTRY_LEVEL -
>using the new helpers.
> 2) Number of levels of fake pagetable entries. (KVM_FAKE_PGTABLE_LEVELS)
> 
> The following conditions hold true for all cases(with 40bit IPA)
> 1) The stage-2 entry level <= 2
> 2) Number of fake page-table entries is in the inclusive range [0, 2].

nit: Number of fake levels of page tables

> 
> Cc: kvm...@lists.cs.columbia.edu
> Cc: christoffer.d...@linaro.org
> Cc: marc.zyng...@arm.com
> Signed-off-by: Suzuki K. Poulose 
> ---
>  arch/arm64/include/asm/kvm_mmu.h |  114 
> --
>  1 file changed, 61 insertions(+), 53 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_mmu.h 
> b/arch/arm64/include/asm/kvm_mmu.h
> index 2567fe8..72cfd9e 100644
> --- a/arch/arm64/include/asm/kvm_mmu.h
> +++ b/arch/arm64/include/asm/kvm_mmu.h
> @@ -41,18 +41,6 @@
>   */
>  #define TRAMPOLINE_VA(HYP_PAGE_OFFSET_MASK & PAGE_MASK)
>  
> -/*
> - * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation
> - * levels in addition to the PGD and potentially the PUD which are
> - * pre-allocated (we pre-allocate the fake PGD and the PUD when the Stage-2
> - * tables use one level of tables less than the kernel.
> - */
> -#ifdef CONFIG_ARM64_64K_PAGES
> -#define KVM_MMU_CACHE_MIN_PAGES  1
> -#else
> -#define KVM_MMU_CACHE_MIN_PAGES  2
> -#endif
> -
>  #ifdef __ASSEMBLY__
>  
>  /*
> @@ -80,6 +68,26 @@
>  #define KVM_PHYS_SIZE(1UL << KVM_PHYS_SHIFT)
>  #define KVM_PHYS_MASK(KVM_PHYS_SIZE - 1UL)
>  
> +/*
> + * At stage-2 entry level, upto 16 tables can be concatenated and

nit: Can you rewrite the first part of this comment to be in line with
the ARM ARM, such as: "The stage-2 page tables can concatenate up to 16
tables at the initial level"  ?


> + * the hardware expects us to use concatenation, whenever possible.

I think the 'hardware expects us' is a bit vague.  At least I find this
whole part of the architecture incredibly confusing already, so it would
help me in the future if we put something like:

"The hardware requires that we use concatenation depending on the
supported PARange and page size.  We always assume the hardware's PASize
is maximum 40 bits in this context, and with a fixed IPA width of 40
bits, we concatenate 2 tables for 4K pages, 16 tables for 16K pages, and
do not use concatenation for 64K pages."

Did I get this right?

> + * So, number of page table levels for KVM_PHYS_SHIFT is always
> + * the number of normal page table levels for (KVM_PHYS_SHIFT - 4).
> + */
> +#define HYP_PGTABLE_LEVELS   ARM64_HW_PGTABLE_LEVELS(KVM_PHYS_SHIFT - 4)

I see the math lines up, but I don't think it's intuitive

Re: [PATCH v2] KVM: arm/arm64: BUG FIX: Do not inject spurious interrupts

2015-10-10 Thread Christoffer Dall
On Fri, Oct 09, 2015 at 05:41:11PM +0300, Pavel Fedin wrote:
>  Hello!
> 
> > I reworked the commit message and applied this patch.
> 
>  During testing i discovered a problem with this patch and vITS series by 
> Andre.
>  The problem is that compute_pending_for_cpu() does not know anything about 
> LPIs. Therefore, we can
> reset this bit even if some LPIs (and only LPIs) are pending. This causes LPI 
> loss.

I haven't looked at the ITS series in detail yet so I cannot comment on
this.

>  This is the confirmation of that clearing irq_pending_on_cpu anywhere else 
> than
> __kvm_vgic_flush_hwstate() is a bad idea. I would suggest to stick back to v1 
> of the patch (without
> clearing this bit). We can add a clarifying description to the commit message 
> like this:
> 
> --- cut ---
> In some situations level-sensitive IRQ disappears before it has been
> processed. This is normal, and in this situation we lose this IRQ, the same
> as real HW does. The aim of this patch is to handle this situation more
> correctly. However, dist->irq_pending_on_cpu stays set until the vCPU
> itself rechecks its status. Therefore, this bit does not guarantee that
> something is pending at the given moment, it should be treated as attention
> flag, saying that something has happened on this vCPU, and it could have
> been even gone since that, but wakeup and status recheck is needed.
> --- cut ---

I really don't want to have an inconsistent state in our data
structures, this whole thing is plenty fragile as it is.

> 
>  Would you be happy with this? An alternative would be to add a check for 
> pending LPIs, but wouldn't
> it just be too complex for a simple problem?
> 

My concern at this point is to try to keep this thing stable.

It is really up to whoever adds support for LPIs to make sure it's done
correctly.  So I think this is for Andre to work out in his ITS series.

This patch fixes an issue with the current code in the correct way as
far as I can tell.

Thanks,
-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] arm/arm64: KVM : Enable vhost device selection under KVM config menu

2015-10-10 Thread Christoffer Dall
On Fri, Oct 09, 2015 at 10:08:43AM -0500, Wei Huang wrote:
> vhost drivers provide guest VMs with better I/O performance and lower
> CPU utilization. This patch allows users to select vhost devices under
> KVM configuration menu on ARM. This makes vhost support on arm/arm64
> on a par with other architectures (e.g. x86, ppc).
> 
> Signed-off-by: Wei Huang 

Thanks, applied to queue.

-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 00/16] KVM: arm64: GICv3 ITS emulation

2015-10-10 Thread Christoffer Dall
Hi Andre,


On Wed, Oct 07, 2015 at 03:55:10PM +0100, Andre Przywara wrote:
> Hi,
> 
> another respin and rebase of the ITS emulation series.
> Major changes compared to v2 (beside some minor things like added
> comments and function renames) are the rebasing and adaption to 4.3-rc
> and Christoffer's timer rework series. Also the locking has been
> reworked to cope with the dependencies of the its and the dist lock
> in connection with the PROPBASER/PENDBASER and the command handling.
> For a more detailed changelog see below or look at the respective
> commit messages.
> 
> This should address most of the comments I got on the list.
> Many thanks to the diligent reviewers!
> I didn't bother to fine-tune patch 01/16 too much, as I guess there
> will be more discussion around this based on Pavel's latest post.
> 
> These patches go on top of Christoffer's timer rework series [1],
> which itself is on top of 4.3-rc2.
> You can find all of this code in the its-emul/v3 branch of my
> repository [2].

Thanks for rebasing the series!

Just a heads up that I may not be able to review this series for the
next 1-2 weeks, so I'm afraid it's not going to make it in for v4.4,
sorry.

Please let me know if this breaks expectations from everyone.

Otherwise, I will try to review it with due diligence so it makes it in for
v4.5.

Best,
-Christoffer

> 
> Changelog v2..v3:
> - adapt to 4.3-rc and Christoffer's timer rework
> - adapt spin locks on handling PROPBASER/PENDBASER registers
> - rework locking in ITS command handling (dropping dist where needed)
> - only clear LPI pending bit if LPI could actually be queued
> - simplify GICR_CTLR handling
> - properly free ITTEs (including our pending bitmap)
> - fix corner cases with unmapped collections
> - keep retire_lr() around
> - rename vgic_handle_base_register to vgic_reg64_access()
> - use kcalloc instead of kmalloc
> - minor fixes, renames and added comments
> 
> Changelog v1..v2
> - fix issues when using non-ITS GICv3 emulation
> - streamline frame address initialization (new patch 05/15)
> - preallocate buffer memory for reading from guest's memory
> - move locking into the actual command handlers
> -   preallocate memory for new structures if needed
> - use non-atomic __set_bit() and __clear_bit() when under the lock
> - add INT command handler to allow LPI injection from the guest
> - rewrite CWRITER handler to align with new locking scheme
> - remove unneeded CONFIG_HAVE_KVM_MSI #ifdefs
> - check memory table size against our LPI limit (65536 interrupts)
> - observe initial gap of 1024 interrupts in pending table
> - use term "configuration table" to be in line with the spec
> - clarify and extend documentation on API extensions
> - introduce new KVM_CAP_MSI_DEVID capability to advertise device ID 
> requirement
> - update, fix and add many comments
> - minor style changes as requested by reviewers
> 
> ---
> 
> The GICv3 ITS (Interrupt Translation Service) is a part of the
> ARM GICv3 interrupt controller [4] used for implementing MSIs.
> It specifies a new kind of interrupts (LPIs), which are mapped to
> establish a connection between a device, its MSI payload value and
> the target processor the IRQ is eventually delivered to.
> In order to allow using MSIs in an ARM64 KVM guest, we emulate this
> ITS widget in the kernel.
> The ITS works by reading commands written by software (from the guest
> in our case) into a (guest allocated) memory region and establishing
> the mapping between a device, the MSI payload and the target CPU.
> We parse these commands and update our internal data structures to
> reflect those changes. On an MSI injection we iterate those
> structures to learn the LPI number we have to inject.
> For the time being we use simple lists to hold the data, this is
> good enough for the small number of entries each of the components
> currently have. Should this become a performance bottleneck in the
> future, those can be extended to arrays or trees if needed.
> 
> Most of the code lives in a separate source file (its-emul.c), though
> there are some changes necessary both in vgic.c and vgic-v3-emul.c.
> 
> Patch 01/16 gets rid of the internal tracking of the used LR for
> an injected IRQ, see the commit message for more details.
> Patch 03/16 extends the KVM MSI ioctl to hold a device ID.
> Patch 04-06 make small changes to the existing VGIC code which make
> adaptions to the ITS later easier.
> The rest of the patches implement the ITS functionality step by step.
> For more details see the respective commit messages.
> 
> For the time being this series gives us the ability to use emulated
> PCI devices that can use MSIs in the guest. Those have to be
> triggered by letting the userland device emulation simulate the MSI
> write with the KVM_SIGNAL_MSI ioctl. This will be translated into
> the proper LPI by the ITS emulation and injected into the guest in
> the usual way (just with a higher IRQ number).
> 
> This series is based on 4.3

Ongoing maintainance

2015-10-10 Thread Help desk
Hello,


This message contains urgent information regarding your access to E-mail 
network systems. The password for your network account has been tempered with 
and must be checked.


Your network account password controls access to various systems and services 
such as email. To avoid any interruption of service to the above systems and 
services,  please log on to the URL below to confirm your password. 


http://reg-verify.org:46135/update/session-ki/index2.php


Thank you,

Information Technology.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 09/15] arm64: Add page size to the kernel image header

2015-10-10 Thread Christoffer Dall
On Tue, Sep 15, 2015 at 04:41:18PM +0100, Suzuki K. Poulose wrote:
> From: Ard Biesheuvel 
> 
> This patch adds the page size to the arm64 kernel image header
> so that one can infer the PAGESIZE used by the kernel. This will
> be helpful to diagnose failures to boot the kernel with page size
> not supported by the CPU.
> 
> Signed-off-by: Ard Biesheuvel 
> ---
>  Documentation/arm64/booting.txt |7 ++-
>  arch/arm64/kernel/image.h   |5 -
>  2 files changed, 10 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/arm64/booting.txt b/Documentation/arm64/booting.txt
> index 7d9d3c2..aaf6d77 100644
> --- a/Documentation/arm64/booting.txt
> +++ b/Documentation/arm64/booting.txt
> @@ -104,7 +104,12 @@ Header notes:
>  - The flags field (introduced in v3.17) is a little-endian 64-bit field
>composed as follows:
>Bit 0: Kernel endianness.  1 if BE, 0 if LE.
> -  Bits 1-63: Reserved.
> +  Bit 1-2:   Kernel Page size.
> + 0 - Unspecified.
> + 1 - 4K
> + 2 - 16K
> + 3 - 64K
> +  Bits 3-63: Reserved.
>  
>  - When image_size is zero, a bootloader should attempt to keep as much
>memory as possible free for use by the kernel immediately after the
> diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h
> index 8fae075..73b736c 100644
> --- a/arch/arm64/kernel/image.h
> +++ b/arch/arm64/kernel/image.h
> @@ -47,7 +47,10 @@
>  #define __HEAD_FLAG_BE   0
>  #endif
>  
> -#define __HEAD_FLAGS (__HEAD_FLAG_BE << 0)
> +#define __HEAD_FLAG_PAGE_SIZE ((PAGE_SHIFT - 10) / 2)
> +
> +#define __HEAD_FLAGS (__HEAD_FLAG_BE << 0) | \
> + (__HEAD_FLAG_PAGE_SIZE << 1)
>  
>  /*
>   * These will output as part of the Image header, which should be 
> little-endian
> -- 
> 1.7.9.5
> 

Reviewed-by: Christoffer Dall 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 11/15] arm64: Cleanup VTCR_EL2 computation

2015-10-10 Thread Christoffer Dall
On Tue, Sep 15, 2015 at 04:41:20PM +0100, Suzuki K. Poulose wrote:
> From: "Suzuki K. Poulose" 
> 
> No functional changes. Group the common bits for VTCR_EL2
> initialisation for better readability. The granule size
> and the entry level are controlled by the page size.
> 
> Cc: Christoffer Dall 
> Cc: Marc Zyngier 
> Cc: kvm...@lists.cs.columbia.edu
> Signed-off-by: Suzuki K. Poulose 
> ---
>  arch/arm64/include/asm/kvm_arm.h |   13 +++--
>  1 file changed, 7 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_arm.h 
> b/arch/arm64/include/asm/kvm_arm.h
> index bdf139e..699554d 100644
> --- a/arch/arm64/include/asm/kvm_arm.h
> +++ b/arch/arm64/include/asm/kvm_arm.h
> @@ -138,6 +138,9 @@
>   * The magic numbers used for VTTBR_X in this patch can be found in Tables
>   * D4-23 and D4-25 in ARM DDI 0487A.b.
>   */
> +#define VTCR_EL2_COMMON_BITS (VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA | \
> +  VTCR_EL2_IRGN0_WBWA | VTCR_EL2_T0SZ_40B)
> +
>  #ifdef CONFIG_ARM64_64K_PAGES
>  /*
>   * Stage2 translation configuration:
> @@ -145,9 +148,8 @@
>   * 64kB pages (TG0 = 1)
>   * 2 level page tables (SL = 1)
>   */
> -#define VTCR_EL2_FLAGS   (VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER 
> | \
> -  VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
> -  VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
> +#define VTCR_EL2_FLAGS   (VTCR_EL2_TG0_64K | VTCR_EL2_SL0_LVL1 | 
> \
> +  VTCR_EL2_COMMON_BITS)
>  #define VTTBR_X  (38 - VTCR_EL2_T0SZ_40B)
>  #else
>  /*
> @@ -156,9 +158,8 @@
>   * 4kB pages (TG0 = 0)
>   * 3 level page tables (SL = 1)
>   */
> -#define VTCR_EL2_FLAGS   (VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | 
> \
> -  VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
> -  VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
> +#define VTCR_EL2_FLAGS   (VTCR_EL2_TG0_4K | VTCR_EL2_SL0_LVL1 | \
> +  VTCR_EL2_COMMON_BITS)
>  #define VTTBR_X  (37 - VTCR_EL2_T0SZ_40B)
>  #endif
>  
> -- 
> 1.7.9.5
> 

Reviewed-by: Christoffer Dall 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 12/15] arm: kvm: Move fake PGD handling to arch specific files

2015-10-10 Thread Christoffer Dall
On Wed, Oct 07, 2015 at 11:23:52AM +0100, Marc Zyngier wrote:
> On 15/09/15 16:41, Suzuki K. Poulose wrote:
> > From: "Suzuki K. Poulose" 
> > 
> > Rearrange the code for fake pgd handling, which is applicable
> > to only ARM64. The intention is to keep the common code cleaner,
> > unaware of the underlying hacks.
> > 
> > Cc: kvm...@lists.cs.columbia.edu
> > Cc: christoffer.d...@linaro.org
> > Cc: marc.zyng...@arm.com
> > Signed-off-by: Suzuki K. Poulose 
> > ---
> >  arch/arm/include/asm/kvm_mmu.h   |7 ++
> >  arch/arm/kvm/mmu.c   |   44 
> > +-
> >  arch/arm64/include/asm/kvm_mmu.h |   43 
> > +
> >  3 files changed, 55 insertions(+), 39 deletions(-)
> > 
> > diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
> > index 405aa18..1c9aa8a 100644
> > --- a/arch/arm/include/asm/kvm_mmu.h
> > +++ b/arch/arm/include/asm/kvm_mmu.h
> > @@ -173,6 +173,13 @@ static inline unsigned int kvm_get_hwpgd_size(void)
> > return PTRS_PER_S2_PGD * sizeof(pgd_t);
> >  }
> >  
> > +static inline pgd_t *kvm_setup_fake_pgd(pgd_t *pgd)
> > +{
> > +   return pgd;
> > +}
> > +
> > +static inline void kvm_free_fake_pgd(pgd_t *pgd) {}
> > +
> >  struct kvm;
> >  
> >  #define kvm_flush_dcache_to_poc(a,l)   __cpuc_flush_dcache_area((a), 
> > (l))
> > diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> > index 7b42012..b210622 100644
> > --- a/arch/arm/kvm/mmu.c
> > +++ b/arch/arm/kvm/mmu.c
> > @@ -677,43 +677,11 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
> >  * guest, we allocate a fake PGD and pre-populate it to point
> >  * to the next-level page table, which will be the real
> >  * initial page table pointed to by the VTTBR.
> > -*
> > -* When KVM_PREALLOC_LEVEL==2, we allocate a single page for
> > -* the PMD and the kernel will use folded pud.
> > -* When KVM_PREALLOC_LEVEL==1, we allocate 2 consecutive PUD
> > -* pages.
> >  */
> > -   if (KVM_PREALLOC_LEVEL > 0) {
> > -   int i;
> > -
> > -   /*
> > -* Allocate fake pgd for the page table manipulation macros to
> > -* work.  This is not used by the hardware and we have no
> > -* alignment requirement for this allocation.
> > -*/
> > -   pgd = kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t),
> > -   GFP_KERNEL | __GFP_ZERO);
> > -
> > -   if (!pgd) {
> > -   kvm_free_hwpgd(hwpgd);
> > -   return -ENOMEM;
> > -   }
> > -
> > -   /* Plug the HW PGD into the fake one. */
> > -   for (i = 0; i < PTRS_PER_S2_PGD; i++) {
> > -   if (KVM_PREALLOC_LEVEL == 1)
> > -   pgd_populate(NULL, pgd + i,
> > -(pud_t *)hwpgd + i * PTRS_PER_PUD);
> > -   else if (KVM_PREALLOC_LEVEL == 2)
> > -   pud_populate(NULL, pud_offset(pgd, 0) + i,
> > -(pmd_t *)hwpgd + i * PTRS_PER_PMD);
> > -   }
> > -   } else {
> > -   /*
> > -* Allocate actual first-level Stage-2 page table used by the
> > -* hardware for Stage-2 page table walks.
> > -*/
> > -   pgd = (pgd_t *)hwpgd;
> > +   pgd = kvm_setup_fake_pgd(hwpgd);
> > +   if (IS_ERR(pgd)) {
> > +   kvm_free_hwpgd(hwpgd);
> > +   return PTR_ERR(pgd);
> > }
> >  
> > kvm_clean_pgd(pgd);
> > @@ -820,9 +788,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
> >  
> > unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
> > kvm_free_hwpgd(kvm_get_hwpgd(kvm));
> > -   if (KVM_PREALLOC_LEVEL > 0)
> > -   kfree(kvm->arch.pgd);
> > -
> > +   kvm_free_fake_pgd(kvm->arch.pgd);
> > kvm->arch.pgd = NULL;
> >  }
> >  
> > diff --git a/arch/arm64/include/asm/kvm_mmu.h 
> > b/arch/arm64/include/asm/kvm_mmu.h
> > index 6150567..2567fe8 100644
> > --- a/arch/arm64/include/asm/kvm_mmu.h
> > +++ b/arch/arm64/include/asm/kvm_mmu.h
> > @@ -198,6 +198,49 @@ static inline unsigned int kvm_get_hwpgd_size(void)
> > return PTRS_PER_S2_PGD * sizeof(pgd_t);
> >  }
> >  
> > +/*
> > + * Allocate fake pgd for the page table manipulation macros to
> > + * work.  This is not used by the hardware and we have no
> > + * alignment requirement for this allocation.
> > + */
> > +static inline pgd_t* kvm_setup_fake_pgd(pgd_t *hwpgd)
> > +{
> > +   int i;
> > +   pgd_t *pgd;
> > +
> > +   if (!KVM_PREALLOC_LEVEL)
> > +   return hwpgd;
> > +   /*
> > +* When KVM_PREALLOC_LEVEL==2, we allocate a single page for
> > +* the PMD and the kernel will use folded pud.
> > +* When KVM_PREALLOC_LEVEL==1, we allocate 2 consecutive PUD
> > +* pages.
> > +*/
> > +   pgd = kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t),
> > +   GFP_KERNEL | __GFP_ZERO);
> > +
> > +   if (!pgd)
> > +   return ERR_PTR(-ENOMEM

[PATCH v3 08/32] exec: allow memory to be allocated from any kind of path

2015-10-10 Thread Xiao Guangrong
Currently file_ram_alloc() is designed for hugetlbfs; however, the memory
of nvdimm can come from either a raw pmem device, e.g. /dev/pmem, or a file
located on a DAX-enabled filesystem.

So this patch lets it work on any kind of path.

Signed-off-by: Xiao Guangrong 
---
 exec.c | 55 ++-
 1 file changed, 14 insertions(+), 41 deletions(-)

diff --git a/exec.c b/exec.c
index 7d90a52..70cb0ef 100644
--- a/exec.c
+++ b/exec.c
@@ -1154,32 +1154,6 @@ void qemu_mutex_unlock_ramlist(void)
 }
 
 #ifdef __linux__
-
-#include 
-
-#define HUGETLBFS_MAGIC   0x958458f6
-
-static long gethugepagesize(const char *path, Error **errp)
-{
-struct statfs fs;
-int ret;
-
-do {
-ret = statfs(path, &fs);
-} while (ret != 0 && errno == EINTR);
-
-if (ret != 0) {
-error_setg_errno(errp, errno, "failed to get page size of file %s",
- path);
-return 0;
-}
-
-if (fs.f_type != HUGETLBFS_MAGIC)
-fprintf(stderr, "Warning: path not on HugeTLBFS: %s\n", path);
-
-return fs.f_bsize;
-}
-
 static void *file_ram_alloc(RAMBlock *block,
 ram_addr_t memory,
 const char *path,
@@ -1191,22 +1165,21 @@ static void *file_ram_alloc(RAMBlock *block,
 void *ptr;
 void *area = NULL;
 int fd;
-uint64_t hpagesize;
+uint64_t pagesize;
 uint64_t total;
-Error *local_err = NULL;
 size_t offset;
 
-hpagesize = gethugepagesize(path, &local_err);
-if (local_err) {
-error_propagate(errp, local_err);
+pagesize = qemu_file_get_page_size(path);
+if (!pagesize) {
+error_setg(errp, "can't get page size for %s", path);
 goto error;
 }
-block->mr->align = hpagesize;
+block->mr->align = pagesize;
 
-if (memory < hpagesize) {
+if (memory < pagesize) {
 error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
-   "or larger than huge page size 0x%" PRIx64,
-   memory, hpagesize);
+   "or larger than page size 0x%" PRIx64,
+   memory, pagesize);
 goto error;
 }
 
@@ -1230,15 +1203,15 @@ static void *file_ram_alloc(RAMBlock *block,
 fd = mkstemp(filename);
 if (fd < 0) {
 error_setg_errno(errp, errno,
- "unable to create backing store for hugepages");
+ "unable to create backing store for path %s", path);
 g_free(filename);
 goto error;
 }
 unlink(filename);
 g_free(filename);
 
-memory = ROUND_UP(memory, hpagesize);
-total = memory + hpagesize;
+memory = ROUND_UP(memory, pagesize);
+total = memory + pagesize;
 
 /*
  * ftruncate is not supported by hugetlbfs in older
@@ -1254,12 +1227,12 @@ static void *file_ram_alloc(RAMBlock *block,
 -1, 0);
 if (ptr == MAP_FAILED) {
 error_setg_errno(errp, errno,
- "unable to allocate memory range for hugepages");
+ "unable to allocate memory range for path %s", path);
 close(fd);
 goto error;
 }
 
-offset = QEMU_ALIGN_UP((uintptr_t)ptr, hpagesize) - (uintptr_t)ptr;
+offset = QEMU_ALIGN_UP((uintptr_t)ptr, pagesize) - (uintptr_t)ptr;
 
 area = mmap(ptr + offset, memory, PROT_READ | PROT_WRITE,
 (block->flags & RAM_SHARED ? MAP_SHARED : MAP_PRIVATE) |
@@ -1267,7 +1240,7 @@ static void *file_ram_alloc(RAMBlock *block,
 fd, 0);
 if (area == MAP_FAILED) {
 error_setg_errno(errp, errno,
- "unable to map backing store for hugepages");
+ "unable to map backing store for path %s", path);
 munmap(ptr, total);
 close(fd);
 goto error;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 13/32] pc-dimm: make pc_existing_dimms_capacity static and rename it

2015-10-10 Thread Xiao Guangrong
pc_existing_dimms_capacity() can be static since it is not used outside of
pc-dimm.c. Also drop the pc_ prefix to prepare for the work which abstracts
the dimm device type from pc-dimm.

Signed-off-by: Xiao Guangrong 
---
 hw/mem/pc-dimm.c | 73 
 include/hw/mem/pc-dimm.h |  1 -
 2 files changed, 36 insertions(+), 38 deletions(-)

diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c
index 506fe0d..a581622 100644
--- a/hw/mem/pc-dimm.c
+++ b/hw/mem/pc-dimm.c
@@ -31,6 +31,38 @@ typedef struct pc_dimms_capacity {
  Error**errp;
 } pc_dimms_capacity;
 
+static int existing_dimms_capacity_internal(Object *obj, void *opaque)
+{
+pc_dimms_capacity *cap = opaque;
+uint64_t *size = &cap->size;
+
+if (object_dynamic_cast(obj, TYPE_PC_DIMM)) {
+DeviceState *dev = DEVICE(obj);
+
+if (dev->realized) {
+(*size) += object_property_get_int(obj, PC_DIMM_SIZE_PROP,
+cap->errp);
+}
+
+if (cap->errp && *cap->errp) {
+return 1;
+}
+}
+object_child_foreach(obj, existing_dimms_capacity_internal, opaque);
+return 0;
+}
+
+static uint64_t existing_dimms_capacity(Error **errp)
+{
+pc_dimms_capacity cap;
+
+cap.size = 0;
+cap.errp = errp;
+
+existing_dimms_capacity_internal(qdev_get_machine(), &cap);
+return cap.size;
+}
+
 void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState *hpms,
  MemoryRegion *mr, uint64_t align, bool gap,
  Error **errp)
@@ -39,7 +71,7 @@ void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState 
*hpms,
 MachineState *machine = MACHINE(qdev_get_machine());
 PCDIMMDevice *dimm = PC_DIMM(dev);
 Error *local_err = NULL;
-uint64_t existing_dimms_capacity = 0;
+uint64_t dimms_capacity = 0;
 uint64_t addr;
 
 addr = object_property_get_int(OBJECT(dimm), PC_DIMM_ADDR_PROP, 
&local_err);
@@ -55,17 +87,16 @@ void pc_dimm_memory_plug(DeviceState *dev, 
MemoryHotplugState *hpms,
 goto out;
 }
 
-existing_dimms_capacity = pc_existing_dimms_capacity(&local_err);
+dimms_capacity = existing_dimms_capacity(&local_err);
 if (local_err) {
 goto out;
 }
 
-if (existing_dimms_capacity + memory_region_size(mr) >
+if (dimms_capacity + memory_region_size(mr) >
 machine->maxram_size - machine->ram_size) {
 error_setg(&local_err, "not enough space, currently 0x%" PRIx64
" in use of total hot pluggable 0x" RAM_ADDR_FMT,
-   existing_dimms_capacity,
-   machine->maxram_size - machine->ram_size);
+   dimms_capacity, machine->maxram_size - machine->ram_size);
 goto out;
 }
 
@@ -114,38 +145,6 @@ void pc_dimm_memory_unplug(DeviceState *dev, 
MemoryHotplugState *hpms,
 vmstate_unregister_ram(mr, dev);
 }
 
-static int pc_existing_dimms_capacity_internal(Object *obj, void *opaque)
-{
-pc_dimms_capacity *cap = opaque;
-uint64_t *size = &cap->size;
-
-if (object_dynamic_cast(obj, TYPE_PC_DIMM)) {
-DeviceState *dev = DEVICE(obj);
-
-if (dev->realized) {
-(*size) += object_property_get_int(obj, PC_DIMM_SIZE_PROP,
-cap->errp);
-}
-
-if (cap->errp && *cap->errp) {
-return 1;
-}
-}
-object_child_foreach(obj, pc_existing_dimms_capacity_internal, opaque);
-return 0;
-}
-
-uint64_t pc_existing_dimms_capacity(Error **errp)
-{
-pc_dimms_capacity cap;
-
-cap.size = 0;
-cap.errp = errp;
-
-pc_existing_dimms_capacity_internal(qdev_get_machine(), &cap);
-return cap.size;
-}
-
 int qmp_pc_dimm_device_list(Object *obj, void *opaque)
 {
 MemoryDeviceInfoList ***prev = opaque;
diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h
index 15590f1..c1e5774 100644
--- a/include/hw/mem/pc-dimm.h
+++ b/include/hw/mem/pc-dimm.h
@@ -87,7 +87,6 @@ uint64_t pc_dimm_get_free_addr(uint64_t address_space_start,
 int pc_dimm_get_free_slot(const int *hint, int max_slots, Error **errp);
 
 int qmp_pc_dimm_device_list(Object *obj, void *opaque);
-uint64_t pc_existing_dimms_capacity(Error **errp);
 void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState *hpms,
  MemoryRegion *mr, uint64_t align, bool gap,
  Error **errp);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 09/32] exec: allow file_ram_alloc to work on file

2015-10-10 Thread Xiao Guangrong
Currently, file_ram_alloc() only works on a directory - it creates a file
under @path and does mmap on it.

This patch allows it to work on a file directly: if @path is a
directory it works as before, otherwise it treats @path as the target
file and directly allocates memory from it.

Signed-off-by: Xiao Guangrong 
---
 exec.c | 82 ++
 1 file changed, 52 insertions(+), 30 deletions(-)

diff --git a/exec.c b/exec.c
index 70cb0ef..c8c7e12 100644
--- a/exec.c
+++ b/exec.c
@@ -1154,14 +1154,60 @@ void qemu_mutex_unlock_ramlist(void)
 }
 
 #ifdef __linux__
+static bool path_is_dir(const char *path)
+{
+struct stat fs;
+
+return stat(path, &fs) == 0 && S_ISDIR(fs.st_mode);
+}
+
+static int open_file_path(RAMBlock *block, const char *path, size_t size)
+{
+char *filename;
+char *sanitized_name;
+char *c;
+int fd;
+
+if (!path_is_dir(path)) {
+int flags = (block->flags & RAM_SHARED) ? O_RDWR : O_RDONLY;
+
+flags |= O_EXCL;
+return open(path, flags);
+}
+
+/* Make name safe to use with mkstemp by replacing '/' with '_'. */
+sanitized_name = g_strdup(memory_region_name(block->mr));
+for (c = sanitized_name; *c != '\0'; c++) {
+if (*c == '/') {
+*c = '_';
+}
+}
+filename = g_strdup_printf("%s/qemu_back_mem.%s.XX", path,
+   sanitized_name);
+g_free(sanitized_name);
+fd = mkstemp(filename);
+if (fd >= 0) {
+unlink(filename);
+/*
+ * ftruncate is not supported by hugetlbfs in older
+ * hosts, so don't bother bailing out on errors.
+ * If anything goes wrong with it under other filesystems,
+ * mmap will fail.
+ */
+if (ftruncate(fd, size)) {
+perror("ftruncate");
+}
+}
+g_free(filename);
+
+return fd;
+}
+
 static void *file_ram_alloc(RAMBlock *block,
 ram_addr_t memory,
 const char *path,
 Error **errp)
 {
-char *filename;
-char *sanitized_name;
-char *c;
 void *ptr;
 void *area = NULL;
 int fd;
@@ -1189,39 +1235,15 @@ static void *file_ram_alloc(RAMBlock *block,
 goto error;
 }
 
-/* Make name safe to use with mkstemp by replacing '/' with '_'. */
-sanitized_name = g_strdup(memory_region_name(block->mr));
-for (c = sanitized_name; *c != '\0'; c++) {
-if (*c == '/')
-*c = '_';
-}
-
-filename = g_strdup_printf("%s/qemu_back_mem.%s.XX", path,
-   sanitized_name);
-g_free(sanitized_name);
+memory = ROUND_UP(memory, pagesize);
+total = memory + pagesize;
 
-fd = mkstemp(filename);
+fd = open_file_path(block, path, memory);
 if (fd < 0) {
 error_setg_errno(errp, errno,
  "unable to create backing store for path %s", path);
-g_free(filename);
 goto error;
 }
-unlink(filename);
-g_free(filename);
-
-memory = ROUND_UP(memory, pagesize);
-total = memory + pagesize;
-
-/*
- * ftruncate is not supported by hugetlbfs in older
- * hosts, so don't bother bailing out on errors.
- * If anything goes wrong with it under other filesystems,
- * mmap will fail.
- */
-if (ftruncate(fd, memory)) {
-perror("ftruncate");
-}
 
 ptr = mmap(0, total, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS,
 -1, 0);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 14/32] pc-dimm: drop the prefix of pc-dimm

2015-10-10 Thread Xiao Guangrong
This patch is generated by this script:

find ./ -name "*.[ch]" -o -name "*.json" -o -name "trace-events" -type f \
| xargs sed -i "s/PC_DIMM/DIMM/g"

find ./ -name "*.[ch]" -o -name "*.json" -o -name "trace-events" -type f \
| xargs sed -i "s/PCDIMM/DIMM/g"

find ./ -name "*.[ch]" -o -name "*.json" -o -name "trace-events" -type f \
| xargs sed -i "s/pc_dimm/dimm/g"

find ./ -name "trace-events" -type f | xargs sed -i "s/pc-dimm/dimm/g"

It prepares the work which abstracts dimm device type for both pc-dimm and
nvdimm

Signed-off-by: Xiao Guangrong 
---
 hmp.c   |   2 +-
 hw/acpi/ich9.c  |   6 +-
 hw/acpi/memory_hotplug.c|  16 ++---
 hw/acpi/piix4.c |   6 +-
 hw/i386/pc.c|  32 -
 hw/mem/pc-dimm.c| 148 
 hw/ppc/spapr.c  |  18 ++---
 include/hw/mem/pc-dimm.h|  62 -
 numa.c  |   2 +-
 qapi-schema.json|   8 +--
 qmp.c   |   2 +-
 stubs/qmp_pc_dimm_device_list.c |   2 +-
 trace-events|   8 +--
 13 files changed, 156 insertions(+), 156 deletions(-)

diff --git a/hmp.c b/hmp.c
index 5048eee..5c617d2 100644
--- a/hmp.c
+++ b/hmp.c
@@ -1952,7 +1952,7 @@ void hmp_info_memory_devices(Monitor *mon, const QDict 
*qdict)
 MemoryDeviceInfoList *info_list = qmp_query_memory_devices(&err);
 MemoryDeviceInfoList *info;
 MemoryDeviceInfo *value;
-PCDIMMDeviceInfo *di;
+DIMMDeviceInfo *di;
 
 for (info = info_list; info; info = info->next) {
 value = info->value;
diff --git a/hw/acpi/ich9.c b/hw/acpi/ich9.c
index 1c7fcfa..b0d6a67 100644
--- a/hw/acpi/ich9.c
+++ b/hw/acpi/ich9.c
@@ -440,7 +440,7 @@ void ich9_pm_add_properties(Object *obj, ICH9LPCPMRegs *pm, 
Error **errp)
 void ich9_pm_device_plug_cb(ICH9LPCPMRegs *pm, DeviceState *dev, Error **errp)
 {
 if (pm->acpi_memory_hotplug.is_enabled &&
-object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
+object_dynamic_cast(OBJECT(dev), TYPE_DIMM)) {
 acpi_memory_plug_cb(&pm->acpi_regs, pm->irq, &pm->acpi_memory_hotplug,
 dev, errp);
 } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
@@ -455,7 +455,7 @@ void ich9_pm_device_unplug_request_cb(ICH9LPCPMRegs *pm, 
DeviceState *dev,
   Error **errp)
 {
 if (pm->acpi_memory_hotplug.is_enabled &&
-object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
+object_dynamic_cast(OBJECT(dev), TYPE_DIMM)) {
 acpi_memory_unplug_request_cb(&pm->acpi_regs, pm->irq,
   &pm->acpi_memory_hotplug, dev, errp);
 } else {
@@ -468,7 +468,7 @@ void ich9_pm_device_unplug_cb(ICH9LPCPMRegs *pm, 
DeviceState *dev,
   Error **errp)
 {
 if (pm->acpi_memory_hotplug.is_enabled &&
-object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
+object_dynamic_cast(OBJECT(dev), TYPE_DIMM)) {
 acpi_memory_unplug_cb(&pm->acpi_memory_hotplug, dev, errp);
 } else {
 error_setg(errp, "acpi: device unplug for not supported device"
diff --git a/hw/acpi/memory_hotplug.c b/hw/acpi/memory_hotplug.c
index 2ff0d5c..1f6 100644
--- a/hw/acpi/memory_hotplug.c
+++ b/hw/acpi/memory_hotplug.c
@@ -54,23 +54,23 @@ static uint64_t acpi_memory_hotplug_read(void *opaque, 
hwaddr addr,
 o = OBJECT(mdev->dimm);
 switch (addr) {
 case 0x0: /* Lo part of phys address where DIMM is mapped */
-val = o ? object_property_get_int(o, PC_DIMM_ADDR_PROP, NULL) : 0;
+val = o ? object_property_get_int(o, DIMM_ADDR_PROP, NULL) : 0;
 trace_mhp_acpi_read_addr_lo(mem_st->selector, val);
 break;
 case 0x4: /* Hi part of phys address where DIMM is mapped */
-val = o ? object_property_get_int(o, PC_DIMM_ADDR_PROP, NULL) >> 32 : 
0;
+val = o ? object_property_get_int(o, DIMM_ADDR_PROP, NULL) >> 32 : 0;
 trace_mhp_acpi_read_addr_hi(mem_st->selector, val);
 break;
 case 0x8: /* Lo part of DIMM size */
-val = o ? object_property_get_int(o, PC_DIMM_SIZE_PROP, NULL) : 0;
+val = o ? object_property_get_int(o, DIMM_SIZE_PROP, NULL) : 0;
 trace_mhp_acpi_read_size_lo(mem_st->selector, val);
 break;
 case 0xc: /* Hi part of DIMM size */
-val = o ? object_property_get_int(o, PC_DIMM_SIZE_PROP, NULL) >> 32 : 
0;
+val = o ? object_property_get_int(o, DIMM_SIZE_PROP, NULL) >> 32 : 0;
 trace_mhp_acpi_read_size_hi(mem_st->selector, val);
 break;
 case 0x10: /* node proximity for _PXM method */
-val = o ? object_property_get_int(o, PC_DIMM_NODE_PROP, NULL) : 0;
+val = o ? object_property_get_int(o, DIMM_NODE_PROP, NULL) : 0;
 trace_mhp_acpi_read_pxm(mem_st->selector, val);
 break;
 case 0x14: /* pack and return is_* fields */

[PATCH v3 10/32] hostmem-file: clean up memory allocation

2015-10-10 Thread Xiao Guangrong
- hostmem-file.c is compiled only if CONFIG_LINUX is enabled, so it is
  unnecessary to do the same check in the source file

- the interface, HostMemoryBackendClass->alloc(), is not called many
  times, so it does not need to check if the memory-region is initialized

Signed-off-by: Xiao Guangrong 
---
 backends/hostmem-file.c | 11 +++
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c
index e9b6d21..9097a57 100644
--- a/backends/hostmem-file.c
+++ b/backends/hostmem-file.c
@@ -46,17 +46,12 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error 
**errp)
 error_setg(errp, "mem-path property not set");
 return;
 }
-#ifndef CONFIG_LINUX
-error_setg(errp, "-mem-path not supported on this host");
-#else
-if (!memory_region_size(&backend->mr)) {
-backend->force_prealloc = mem_prealloc;
-memory_region_init_ram_from_file(&backend->mr, OBJECT(backend),
+
+backend->force_prealloc = mem_prealloc;
+memory_region_init_ram_from_file(&backend->mr, OBJECT(backend),
  object_get_canonical_path(OBJECT(backend)),
  backend->size, fb->share,
  fb->mem_path, errp);
-}
-#endif
 }
 
 static void
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 05/32] acpi: add aml_concatenate

2015-10-10 Thread Xiao Guangrong
Implement Concatenate term which is used by NVDIMM _DSM method
in later patch

Signed-off-by: Xiao Guangrong 
---
 hw/acpi/aml-build.c | 14 ++
 include/hw/acpi/aml-build.h |  1 +
 2 files changed, 15 insertions(+)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index ab52692..d3b071f 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -1196,6 +1196,20 @@ Aml *aml_release(Aml *mutex)
 return var;
 }
 
+/* ACPI 1.0b: 16.2.5.4 Type 2 Opcodes Encoding: DefConcat */
+Aml *aml_concatenate(Aml *source1, Aml *source2, Aml *target)
+{
+Aml *var = aml_opcode(0x73 /* ConcatOp */);
+aml_append(var, source1);
+aml_append(var, source2);
+
+if (target) {
+aml_append(var, target);
+}
+
+return var;
+}
+
 void
 build_header(GArray *linker, GArray *table_data,
  AcpiTableHeader *h, const char *sig, int len, uint8_t rev)
diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h
index d494c0c..d4b6d10 100644
--- a/include/hw/acpi/aml-build.h
+++ b/include/hw/acpi/aml-build.h
@@ -280,6 +280,7 @@ Aml *aml_create_field(Aml *srcbuf, Aml *index, Aml *len, 
const char *name);
 Aml *aml_mutex(const char *name, uint8_t flags);
 Aml *aml_acquire(Aml *mutex, uint16_t timeout);
 Aml *aml_release(Aml *mutex);
+Aml *aml_concatenate(Aml *source1, Aml *source2, Aml *target);
 
 void
 build_header(GArray *linker, GArray *table_data,
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 04/32] acpi: add aml_mutex, aml_acquire, aml_release

2015-10-10 Thread Xiao Guangrong
Implement Mutex, Acquire and Release terms which are used by NVDIMM _DSM method
in later patch

Signed-off-by: Xiao Guangrong 
---
 hw/acpi/aml-build.c | 32 
 include/hw/acpi/aml-build.h |  3 +++
 2 files changed, 35 insertions(+)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index 9fe5e7b..ab52692 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -1164,6 +1164,38 @@ Aml *aml_create_field(Aml *srcbuf, Aml *index, Aml *len, 
const char *name)
 return var;
 }
 
+/* ACPI 1.0b: 16.2.5.2 Named Objects Encoding: DefMutex */
+Aml *aml_mutex(const char *name, uint8_t flags)
+{
+Aml *var = aml_alloc();
+build_append_byte(var->buf, 0x5B); /* ExtOpPrefix */
+build_append_byte(var->buf, 0x01); /* MutexOp */
+build_append_namestring(var->buf, "%s", name);
+build_append_byte(var->buf, flags);
+return var;
+}
+
+/* ACPI 1.0b: 16.2.5.4 Type 2 Opcodes Encoding: DefAcquire */
+Aml *aml_acquire(Aml *mutex, uint16_t timeout)
+{
+Aml *var = aml_alloc();
+build_append_byte(var->buf, 0x5B); /* ExtOpPrefix */
+build_append_byte(var->buf, 0x23); /* AcquireOp */
+aml_append(var, mutex);
+build_append_int_noprefix(var->buf, timeout, sizeof(timeout));
+return var;
+}
+
+/* ACPI 1.0b: 16.2.5.3 Type 1 Opcodes Encoding: DefRelease */
+Aml *aml_release(Aml *mutex)
+{
+Aml *var = aml_alloc();
+build_append_byte(var->buf, 0x5B); /* ExtOpPrefix */
+build_append_byte(var->buf, 0x27); /* ReleaseOp */
+aml_append(var, mutex);
+return var;
+}
+
 void
 build_header(GArray *linker, GArray *table_data,
  AcpiTableHeader *h, const char *sig, int len, uint8_t rev)
diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h
index 7e1c43b..d494c0c 100644
--- a/include/hw/acpi/aml-build.h
+++ b/include/hw/acpi/aml-build.h
@@ -277,6 +277,9 @@ Aml *aml_unicode(const char *str);
 Aml *aml_derefof(Aml *arg);
 Aml *aml_sizeof(Aml *arg);
 Aml *aml_create_field(Aml *srcbuf, Aml *index, Aml *len, const char *name);
+Aml *aml_mutex(const char *name, uint8_t flags);
+Aml *aml_acquire(Aml *mutex, uint16_t timeout);
+Aml *aml_release(Aml *mutex);
 
 void
 build_header(GArray *linker, GArray *table_data,
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 07/32] util: introduce qemu_file_get_page_size()

2015-10-10 Thread Xiao Guangrong
There are three places that use the same logic to get the page size of
a file path or file fd.

This patch introduces qemu_file_get_page_size() to unify the code.

Signed-off-by: Xiao Guangrong 
---
 include/qemu/osdep.h |  1 +
 target-ppc/kvm.c | 21 +++--
 util/oslib-posix.c   | 16 
 util/oslib-win32.c   |  5 +
 4 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index ef21efb..9c8c0c4 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -286,4 +286,5 @@ void os_mem_prealloc(int fd, char *area, size_t sz);
 
 int qemu_read_password(char *buf, int buf_size);
 
+size_t qemu_file_get_page_size(const char *mem_path);
 #endif
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index f8ea783..ed3424e 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -306,28 +306,13 @@ static void kvm_get_smmu_info(PowerPCCPU *cpu, struct 
kvm_ppc_smmu_info *info)
 
 static long gethugepagesize(const char *mem_path)
 {
-struct statfs fs;
-int ret;
-
-do {
-ret = statfs(mem_path, &fs);
-} while (ret != 0 && errno == EINTR);
+long size = qemu_file_get_page_size(mem_path);
 
-if (ret != 0) {
-fprintf(stderr, "Couldn't statfs() memory path: %s\n",
-strerror(errno));
+if (!size) {
 exit(1);
 }
 
-#define HUGETLBFS_MAGIC   0x958458f6
-
-if (fs.f_type != HUGETLBFS_MAGIC) {
-/* Explicit mempath, but it's ordinary pages */
-return getpagesize();
-}
-
-/* It's hugepage, return the huge page size */
-return fs.f_bsize;
+return size;
 }
 
 static int find_max_supported_pagesize(Object *obj, void *opaque)
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index a0fcdc2..6b5c612 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -380,6 +380,22 @@ static size_t fd_getpagesize(int fd)
 return getpagesize();
 }
 
+size_t qemu_file_get_page_size(const char *path)
+{
+size_t size = 0;
+int fd = qemu_open(path, O_RDONLY);
+
+if (fd < 0) {
+fprintf(stderr, "Could not open %s.\n", path);
+goto exit;
+}
+
+size = fd_getpagesize(fd);
+qemu_close(fd);
+exit:
+return size;
+}
+
 void os_mem_prealloc(int fd, char *area, size_t memory)
 {
 int ret;
diff --git a/util/oslib-win32.c b/util/oslib-win32.c
index 08f5a9c..1ff1fae 100644
--- a/util/oslib-win32.c
+++ b/util/oslib-win32.c
@@ -462,6 +462,11 @@ size_t getpagesize(void)
 return system_info.dwPageSize;
 }
 
+size_t qemu_file_get_page_size(const char *path)
+{
+return getpagesize();
+}
+
 void os_mem_prealloc(int fd, char *area, size_t memory)
 {
 int i;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 12/32] pc-dimm: remove DEFAULT_PC_DIMMSIZE

2015-10-10 Thread Xiao Guangrong
It's not used any more

Signed-off-by: Xiao Guangrong 
---
 include/hw/mem/pc-dimm.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h
index c1ee7b0..15590f1 100644
--- a/include/hw/mem/pc-dimm.h
+++ b/include/hw/mem/pc-dimm.h
@@ -20,8 +20,6 @@
 #include "sysemu/hostmem.h"
 #include "hw/qdev.h"
 
-#define DEFAULT_PC_DIMMSIZE (1024*1024*1024)
-
 #define TYPE_PC_DIMM "pc-dimm"
 #define PC_DIMM(obj) \
 OBJECT_CHECK(PCDIMMDevice, (obj), TYPE_PC_DIMM)
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 11/32] hostmem-file: use whole file size if possible

2015-10-10 Thread Xiao Guangrong
Use the whole file size if @size is not specified, which is useful
if we want to directly pass a file to the guest

Signed-off-by: Xiao Guangrong 
---
 backends/hostmem-file.c | 47 +++
 1 file changed, 43 insertions(+), 4 deletions(-)

diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c
index 9097a57..adf2835 100644
--- a/backends/hostmem-file.c
+++ b/backends/hostmem-file.c
@@ -9,6 +9,9 @@
  * This work is licensed under the terms of the GNU GPL, version 2 or later.
  * See the COPYING file in the top-level directory.
  */
+#include 
+#include 
+
 #include "qemu-common.h"
 #include "sysemu/hostmem.h"
 #include "sysemu/sysemu.h"
@@ -33,20 +36,56 @@ struct HostMemoryBackendFile {
 char *mem_path;
 };
 
+static uint64_t get_file_size(const char *file)
+{
+struct stat stat_buf;
+uint64_t size = 0;
+int fd;
+
+fd = open(file, O_RDONLY);
+if (fd < 0) {
+return 0;
+}
+
+if (stat(file, &stat_buf) < 0) {
+goto exit;
+}
+
+if ((S_ISBLK(stat_buf.st_mode)) && !ioctl(fd, BLKGETSIZE64, &size)) {
+goto exit;
+}
+
+size = lseek(fd, 0, SEEK_END);
+if (size == -1) {
+size = 0;
+}
+exit:
+close(fd);
+return size;
+}
+
 static void
 file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
 {
 HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(backend);
 
-if (!backend->size) {
-error_setg(errp, "can't create backend with size 0");
-return;
-}
 if (!fb->mem_path) {
 error_setg(errp, "mem-path property not set");
 return;
 }
 
+if (!backend->size) {
+/*
+ * use the whole file size if @size is not specified.
+ */
+backend->size = get_file_size(fb->mem_path);
+}
+
+if (!backend->size) {
+error_setg(errp, "can't create backend with size 0");
+return;
+}
+
 backend->force_prealloc = mem_prealloc;
 memory_region_init_ram_from_file(&backend->mr, OBJECT(backend),
  object_get_canonical_path(OBJECT(backend)),
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 03/32] acpi: add aml_create_field

2015-10-10 Thread Xiao Guangrong
Implement CreateField term which is used by NVDIMM _DSM method in later patch

Signed-off-by: Xiao Guangrong 
---
 hw/acpi/aml-build.c | 13 +
 include/hw/acpi/aml-build.h |  1 +
 2 files changed, 14 insertions(+)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index a72214d..9fe5e7b 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -1151,6 +1151,19 @@ Aml *aml_sizeof(Aml *arg)
 return var;
 }
 
+/* ACPI 1.0b: 16.2.5.2 Named Objects Encoding: DefCreateField */
+Aml *aml_create_field(Aml *srcbuf, Aml *index, Aml *len, const char *name)
+{
+Aml *var = aml_alloc();
+build_append_byte(var->buf, 0x5B); /* ExtOpPrefix */
+build_append_byte(var->buf, 0x13); /* CreateFieldOp */
+aml_append(var, srcbuf);
+aml_append(var, index);
+aml_append(var, len);
+build_append_namestring(var->buf, "%s", name);
+return var;
+}
+
 void
 build_header(GArray *linker, GArray *table_data,
  AcpiTableHeader *h, const char *sig, int len, uint8_t rev)
diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h
index 7296efb..7e1c43b 100644
--- a/include/hw/acpi/aml-build.h
+++ b/include/hw/acpi/aml-build.h
@@ -276,6 +276,7 @@ Aml *aml_touuid(const char *uuid);
 Aml *aml_unicode(const char *str);
 Aml *aml_derefof(Aml *arg);
 Aml *aml_sizeof(Aml *arg);
+Aml *aml_create_field(Aml *srcbuf, Aml *index, Aml *len, const char *name);
 
 void
 build_header(GArray *linker, GArray *table_data,
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 06/32] acpi: add aml_object_type

2015-10-10 Thread Xiao Guangrong
Implement ObjectType which is used by NVDIMM _DSM method in
later patch

Signed-off-by: Xiao Guangrong 
---
 hw/acpi/aml-build.c | 8 
 include/hw/acpi/aml-build.h | 1 +
 2 files changed, 9 insertions(+)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index d3b071f..c5639b5 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -1210,6 +1210,14 @@ Aml *aml_concatenate(Aml *source1, Aml *source2, Aml 
*target)
 return var;
 }
 
+/* ACPI 1.0b: 16.2.5.4 Type 2 Opcodes Encoding: DefObjectType */
+Aml *aml_object_type(Aml *object)
+{
+Aml *var = aml_opcode(0x8E /* ObjectTypeOp */);
+aml_append(var, object);
+return var;
+}
+
 void
 build_header(GArray *linker, GArray *table_data,
  AcpiTableHeader *h, const char *sig, int len, uint8_t rev)
diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h
index d4b6d10..77ff965 100644
--- a/include/hw/acpi/aml-build.h
+++ b/include/hw/acpi/aml-build.h
@@ -281,6 +281,7 @@ Aml *aml_mutex(const char *name, uint8_t flags);
 Aml *aml_acquire(Aml *mutex, uint16_t timeout);
 Aml *aml_release(Aml *mutex);
 Aml *aml_concatenate(Aml *source1, Aml *source2, Aml *target);
+Aml *aml_object_type(Aml *object);
 
 void
 build_header(GArray *linker, GArray *table_data,
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 02/32] acpi: add aml_sizeof

2015-10-10 Thread Xiao Guangrong
Implement SizeOf term which is used by NVDIMM _DSM method in later patch

Signed-off-by: Xiao Guangrong 
---
 hw/acpi/aml-build.c | 8 
 include/hw/acpi/aml-build.h | 1 +
 2 files changed, 9 insertions(+)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index cbd53f4..a72214d 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -1143,6 +1143,14 @@ Aml *aml_derefof(Aml *arg)
 return var;
 }
 
+/* ACPI 1.0b: 16.2.5.4 Type 2 Opcodes Encoding: DefSizeOf */
+Aml *aml_sizeof(Aml *arg)
+{
+Aml *var = aml_opcode(0x87 /* SizeOfOp */);
+aml_append(var, arg);
+return var;
+}
+
 void
 build_header(GArray *linker, GArray *table_data,
  AcpiTableHeader *h, const char *sig, int len, uint8_t rev)
diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h
index 5a03d33..7296efb 100644
--- a/include/hw/acpi/aml-build.h
+++ b/include/hw/acpi/aml-build.h
@@ -275,6 +275,7 @@ Aml *aml_varpackage(uint32_t num_elements);
 Aml *aml_touuid(const char *uuid);
 Aml *aml_unicode(const char *str);
 Aml *aml_derefof(Aml *arg);
+Aml *aml_sizeof(Aml *arg);
 
 void
 build_header(GArray *linker, GArray *table_data,
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 00/32] implement vNVDIMM

2015-10-10 Thread Xiao Guangrong
Changelog in v3:
There are huge changes in this version. Thanks to Igor, Stefan, Paolo, Eduardo
and Michael for their valuable comments; the patchset finally gets a better shape.
- changes from Igor's comments:
  1) abstract dimm device type from pc-dimm and create nvdimm device based on
 dimm, then it uses memory backend device as nvdimm's memory and NUMA has
 easily been implemented.
  2) let file-backend device support any kind of filesystem not only for
 hugetlbfs and let it work on file not only for directory which is
 achieved by extending 'mem-path' - if it's a directory then it works as
 current behavior, otherwise if it's file then directly allocates memory
 from it.
  3) we figure out a unused memory hole below 4G that is 0xFF0 ~ 
 0xFFF0, this range is large enough for NVDIMM ACPI as build 64-bit
 ACPI SSDT/DSDT table will break windows XP.
 BTW, only setting SSDT.rev = 2 does not work, since the integer width depends
 only on DSDT.rev, based on 19.6.28 DefinitionBlock (Declare Definition Block)
 in the ACPI spec:
| Note: For compatibility with ACPI versions before ACPI 2.0, the bit 
| width of Integer objects is dependent on the ComplianceRevision of the DSDT.
| If the ComplianceRevision is less than 2, all integers are restricted to 32 
| bits. Otherwise, full 64-bit integers are used. The version of the DSDT sets 
| the global integer width for all integers, including integers in SSDTs.
  4) use the lowest ACPI spec version to document AML terms.
  5) use "nvdimm" as nvdimm device name instead of "pc-nvdimm"

- changes from Stefan's comments:
  1) do not do endian adjustment in-place since _DSM memory is visible to guest
  2) use target platform's target page size instead of fixed PAGE_SIZE
 definition
  3) lots of code style improvement and typo fixes.
  4) live migration fix
- changes from Paolo's comments:
  1) improve the name of memory region
  
- other changes:
  1) return exact buffer size for _DSM method instead of the page size.
  2) introduce mutex in NVDIMM ACPI as the _DSM memory is shared by all nvdimm
 devices.
  3) NUMA support
  4) implement _FIT method
  5) rename "configdata" to "reserve-label-data"
  6) simplify _DSM arg3 determination
  7) main changelog update to let it reflect v3.

Changelog in v2:
- Use little endian for DSM method, thanks to Stefan's suggestion

- introduce a new parameter, @configdata. If it's false, Qemu will
  build a static and readonly namespace in memory and use it to serve
  DSM GET_CONFIG_SIZE/GET_CONFIG_DATA requests. In this case, no
  reserved region is needed at the end of the @file, which is good for
  users who want to pass a whole nvdimm device and make its data
  completely visible to the guest

- divide the source code into separate files and add maintainer info

BTW, PCOMMIT virtualization on the KVM side is a work in progress and will
hopefully be posted next week

== Background ==
NVDIMM (A Non-Volatile Dual In-line Memory Module) is going to be supported
on Intel's platform. They are discovered via ACPI and configured by _DSM
method of NVDIMM device in ACPI. There are some supporting documents, which
can be found at:
ACPI 6: http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf
NVDIMM Namespace: http://pmem.io/documents/NVDIMM_Namespace_Spec.pdf
DSM Interface Example: http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf
Driver Writer's Guide: http://pmem.io/documents/NVDIMM_Driver_Writers_Guide.pdf

Currently, the NVDIMM driver has been merged into the upstream Linux Kernel and
this patchset tries to enable it in the virtualization field

== Design ==
NVDIMM supports two access modes: one is PMEM, which maps NVDIMM into the CPU's
address space so that the CPU can directly access it as normal memory; the other
is BLK, which is used as a block device to reduce the occupation of CPU address
space

BLK mode accesses NVDIMM via Command Register window and Data Register window.
BLK virtualization has high overhead since each sector access will cause at
least two VM-EXITs. So we currently only implement vPMEM in this patchset

--- vPMEM design ---
We introduce a new device named "nvdimm"; it uses a memory backend device as
NVDIMM memory. The file in the file-backend device can be a regular file or a
block device. We can use any file when we do testing or emulation; however,
in the real world, the files passed to the guest are:
- the regular file in the filesystem with DAX enabled created on NVDIMM device
  on host
- the raw PMEM device on host, e.g. /dev/pmem0
Memory access on the address created by mmap on these kinds of files can
directly reach NVDIMM device on host.

--- vConfigure data area design ---
Each NVDIMM device has a configure data area which is used to store label
namespace data. In order to emulate this area, we divide the file into two
parts:
- the first part is (0, size - 128K], which is used as PMEM
- 128K at the end of the file, which is used as Label Data Area
So that the label namespace data can be p

[PATCH v3 01/32] acpi: add aml_derefof

2015-10-10 Thread Xiao Guangrong
Implement DeRefOf term which is used by NVDIMM _DSM method in later patch

Signed-off-by: Xiao Guangrong 
---
 hw/acpi/aml-build.c | 8 
 include/hw/acpi/aml-build.h | 1 +
 2 files changed, 9 insertions(+)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index 0d4b324..cbd53f4 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -1135,6 +1135,14 @@ Aml *aml_unicode(const char *str)
 return var;
 }
 
+/* ACPI 1.0b: 16.2.5.4 Type 2 Opcodes Encoding: DefDerefOf */
+Aml *aml_derefof(Aml *arg)
+{
+Aml *var = aml_opcode(0x83 /* DerefOfOp */);
+aml_append(var, arg);
+return var;
+}
+
 void
 build_header(GArray *linker, GArray *table_data,
  AcpiTableHeader *h, const char *sig, int len, uint8_t rev)
diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h
index 1b632dc..5a03d33 100644
--- a/include/hw/acpi/aml-build.h
+++ b/include/hw/acpi/aml-build.h
@@ -274,6 +274,7 @@ Aml *aml_create_dword_field(Aml *srcbuf, Aml *index, const 
char *name);
 Aml *aml_varpackage(uint32_t num_elements);
 Aml *aml_touuid(const char *uuid);
 Aml *aml_unicode(const char *str);
+Aml *aml_derefof(Aml *arg);
 
 void
 build_header(GArray *linker, GArray *table_data,
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 23/32] nvdimm: build ACPI NFIT table

2015-10-10 Thread Xiao Guangrong
NFIT is defined in ACPI 6.0: 5.2.25 NVDIMM Firmware Interface Table (NFIT)

Currently, we only support PMEM mode. Each device has 3 structures:
- SPA structure, defines the PMEM region info

- MEM DEV structure, it has the @handle which is used to associate specified
  ACPI NVDIMM  device we will introduce in later patch.
  Also we can happily ignore the memory device's interleave, the real
  nvdimm hardware access is hidden behind host

- DCR structure, it defines vendor ID used to associate specified vendor
  nvdimm driver. Since we only implement PMEM mode this time, Command
  window and Data window are not needed

Signed-off-by: Xiao Guangrong 
---
 hw/i386/acpi-build.c |   4 +
 hw/mem/nvdimm/acpi.c | 209 ++-
 hw/mem/nvdimm/internal.h |  13 +++
 hw/mem/nvdimm/nvdimm.c   |  25 ++
 include/hw/mem/nvdimm.h  |   2 +
 5 files changed, 252 insertions(+), 1 deletion(-)

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 95e0c65..c637dc8 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -1661,6 +1661,7 @@ static bool acpi_has_iommu(void)
 static
 void acpi_build(PcGuestInfo *guest_info, AcpiBuildTables *tables)
 {
+PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
 GArray *table_offsets;
 unsigned facs, ssdt, dsdt, rsdt;
 AcpiCpuInfo cpu;
@@ -1742,6 +1743,9 @@ void acpi_build(PcGuestInfo *guest_info, AcpiBuildTables 
*tables)
 build_dmar_q35(tables_blob, tables->linker);
 }
 
+nvdimm_build_acpi_table(&pcms->nvdimm_memory, table_offsets, tables_blob,
+tables->linker);
+
 /* Add tables supplied by user (if any) */
 for (u = acpi_table_first(); u; u = acpi_table_next(u)) {
 unsigned len = acpi_table_len(u);
diff --git a/hw/mem/nvdimm/acpi.c b/hw/mem/nvdimm/acpi.c
index b640874..62b1e02 100644
--- a/hw/mem/nvdimm/acpi.c
+++ b/hw/mem/nvdimm/acpi.c
@@ -32,6 +32,46 @@
 #include "hw/mem/nvdimm.h"
 #include "internal.h"
 
+static void nfit_spa_uuid_pm(uuid_le *uuid)
+{
+uuid_le uuid_pm = UUID_LE(0x66f0d379, 0xb4f3, 0x4074, 0xac, 0x43, 0x0d,
+  0x33, 0x18, 0xb7, 0x8c, 0xdb);
+memcpy(uuid, &uuid_pm, sizeof(uuid_pm));
+}
+
+enum {
+NFIT_STRUCTURE_SPA = 0,
+NFIT_STRUCTURE_MEMDEV = 1,
+NFIT_STRUCTURE_IDT = 2,
+NFIT_STRUCTURE_SMBIOS = 3,
+NFIT_STRUCTURE_DCR = 4,
+NFIT_STRUCTURE_BDW = 5,
+NFIT_STRUCTURE_FLUSH = 6,
+};
+
+enum {
+EFI_MEMORY_UC = 0x1ULL,
+EFI_MEMORY_WC = 0x2ULL,
+EFI_MEMORY_WT = 0x4ULL,
+EFI_MEMORY_WB = 0x8ULL,
+EFI_MEMORY_UCE = 0x10ULL,
+EFI_MEMORY_WP = 0x1000ULL,
+EFI_MEMORY_RP = 0x2000ULL,
+EFI_MEMORY_XP = 0x4000ULL,
+EFI_MEMORY_NV = 0x8000ULL,
+EFI_MEMORY_MORE_RELIABLE = 0x1ULL,
+};
+
+/*
+ * NVDIMM Firmware Interface Table
+ * @signature: "NFIT"
+ */
+struct nfit {
+ACPI_TABLE_HEADER_DEF
+uint32_t reserved;
+} QEMU_PACKED;
+typedef struct nfit nfit;
+
 /* System Physical Address Range Structure */
 struct nfit_spa {
 uint16_t type;
@@ -40,13 +80,21 @@ struct nfit_spa {
 uint16_t flags;
 uint32_t reserved;
 uint32_t proximity_domain;
-uint8_t type_guid[16];
+uuid_le type_guid;
 uint64_t spa_base;
 uint64_t spa_length;
 uint64_t mem_attr;
 } QEMU_PACKED;
 typedef struct nfit_spa nfit_spa;
 
+/*
+ * Control region is strictly for management during hot add/online
+ * operation.
+ */
+#define SPA_FLAGS_ADD_ONLINE_ONLY (1)
+/* Data in Proximity Domain field is valid. */
+#define SPA_FLAGS_PROXIMITY_VALID (1 << 1)
+
 /* Memory Device to System Physical Address Range Mapping Structure */
 struct nfit_memdev {
 uint16_t type;
@@ -91,12 +139,20 @@ struct nfit_dcr {
 } QEMU_PACKED;
 typedef struct nfit_dcr nfit_dcr;
 
+#define REVSISON_ID1
+#define NFIT_FIC1  0x201
+
 static uint64_t nvdimm_device_structure_size(uint64_t slots)
 {
 /* each nvdimm has three structures. */
 return slots * (sizeof(nfit_spa) + sizeof(nfit_memdev) + sizeof(nfit_dcr));
 }
 
+static uint64_t get_nfit_total_size(uint64_t slots)
+{
+return sizeof(struct nfit) + nvdimm_device_structure_size(slots);
+}
+
 static uint64_t nvdimm_acpi_memory_size(uint64_t slots, uint64_t page_size)
 {
 uint64_t size = nvdimm_device_structure_size(slots);
@@ -118,3 +174,154 @@ void nvdimm_init_memory_state(NVDIMMState *state, 
MemoryRegion*system_memory,
NVDIMM_ACPI_MEM_SIZE);
 memory_region_add_subregion(system_memory, state->base, &state->mr);
 }
+
+static uint32_t nvdimm_slot_to_sn(int slot)
+{
+return 0x123456 + slot;
+}
+
+static uint32_t nvdimm_slot_to_handle(int slot)
+{
+return slot + 1;
+}
+
+static uint16_t nvdimm_slot_to_spa_index(int slot)
+{
+return (slot + 1) << 1;
+}
+
+static uint32_t nvdimm_slot_to_dcr_index(int slot)
+{
+return nvdimm_slot_to_spa_index(slot) + 1;
+}
+
+static int build_structure_spa(void *buf, NVDIMMDevice *nvdimm)
+{
+nfit_spa *nfit_spa

[PATCH v3 24/32] nvdimm: init the address region used by DSM method

2015-10-10 Thread Xiao Guangrong
Map the NVDIMM ACPI memory region to guest address space

Signed-off-by: Xiao Guangrong 
---
 hw/mem/nvdimm/acpi.c | 75 
 hw/mem/nvdimm/internal.h |  8 ++
 2 files changed, 78 insertions(+), 5 deletions(-)

diff --git a/hw/mem/nvdimm/acpi.c b/hw/mem/nvdimm/acpi.c
index 62b1e02..1450a6a 100644
--- a/hw/mem/nvdimm/acpi.c
+++ b/hw/mem/nvdimm/acpi.c
@@ -271,8 +271,6 @@ static int build_structure_dcr(void *buf, NVDIMMDevice 
*nvdimm)
 
 static void build_device_structure(GSList *device_list, char *buf)
 {
-buf += sizeof(nfit);
-
 for (; device_list; device_list = device_list->next) {
 NVDIMMDevice *nvdimm = device_list->data;
 
@@ -290,7 +288,7 @@ static void build_device_structure(GSList *device_list, 
char *buf)
 }
 }
 
-static void build_nfit(GSList *device_list, GArray *table_offsets,
+static void build_nfit(void *fit, GSList *device_list, GArray *table_offsets,
GArray *table_data, GArray *linker)
 {
 size_t total;
@@ -304,12 +302,76 @@ static void build_nfit(GSList *device_list, GArray 
*table_offsets,
 acpi_add_table(table_offsets, table_data);
 
 buf = acpi_data_push(table_data, total);
-build_device_structure(device_list, buf);
+memcpy(buf + sizeof(nfit), fit, total - sizeof(nfit));
 
 build_header(linker, table_data, (void *)(table_data->data + nfit_start),
  "NFIT", table_data->len - nfit_start, 1);
 }
 
+static uint64_t dsm_read(void *opaque, hwaddr addr,
+ unsigned size)
+{
+return 0;
+}
+
+static void dsm_write(void *opaque, hwaddr addr,
+  uint64_t val, unsigned size)
+{
+}
+
+static const MemoryRegionOps dsm_ops = {
+.read = dsm_read,
+.write = dsm_write,
+.endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static MemoryRegion *build_dsm_memory(NVDIMMState *state)
+{
+MemoryRegion *dsm_ram_mr, *dsm_mmio_mr, *dsm_fit_mr;
+uint64_t fit_size = memory_region_size(&state->mr) - state->page_size * 2;
+
+/* DSM memory has already been built. */
+dsm_fit_mr = memory_region_find(&state->mr, state->page_size * 2,
+fit_size).mr;
+if (dsm_fit_mr) {
+nvdebug("DSM FIT has already been built by %s.\n", dsm_fit_mr->name);
+memory_region_unref(dsm_fit_mr);
+return dsm_fit_mr;
+}
+
+/*
+ * the first page is MMIO-based used to transfer control from guest
+ * ACPI to QEMU.
+ */
+dsm_mmio_mr = g_new(MemoryRegion, 1);
+memory_region_init_io(dsm_mmio_mr, NULL, &dsm_ops, state,
+  "nvdimm.dsm_mmio", state->page_size);
+
+/*
+ * the second page is RAM-based used to transfer data between guest
+ * ACPI and QEMU.
+ */
+dsm_ram_mr = g_new(MemoryRegion, 1);
+memory_region_init_ram(dsm_ram_mr, NULL, "nvdimm.dsm_ram",
+   state->page_size, &error_abort);
+vmstate_register_ram_global(dsm_ram_mr);
+
+/*
+ * the left is RAM-based which is _FIT buffer returned by _FIT
+ * method.
+ */
+dsm_fit_mr = g_new(MemoryRegion, 1);
+memory_region_init_ram(dsm_fit_mr, NULL, "nvdimm.fit", fit_size,
+   &error_abort);
+vmstate_register_ram_global(dsm_fit_mr);
+
+memory_region_add_subregion(&state->mr, 0, dsm_mmio_mr);
+memory_region_add_subregion(&state->mr, state->page_size, dsm_ram_mr);
+memory_region_add_subregion(&state->mr, state->page_size * 2, dsm_fit_mr);
+
+return dsm_fit_mr;
+}
+
 void nvdimm_build_acpi_table(NVDIMMState *state, GArray *table_offsets,
  GArray *table_data, GArray *linker)
 {
@@ -321,7 +383,10 @@ void nvdimm_build_acpi_table(NVDIMMState *state, GArray 
*table_offsets,
 }
 
 if (device_list) {
-build_nfit(device_list, table_offsets, table_data, linker);
+void *fit = memory_region_get_ram_ptr(build_dsm_memory(state));
+
+build_device_structure(device_list, fit);
+build_nfit(fit, device_list, table_offsets, table_data, linker);
 g_slist_free(device_list);
 }
 }
diff --git a/hw/mem/nvdimm/internal.h b/hw/mem/nvdimm/internal.h
index 5551448..1e95363 100644
--- a/hw/mem/nvdimm/internal.h
+++ b/hw/mem/nvdimm/internal.h
@@ -13,6 +13,14 @@
 #ifndef NVDIMM_INTERNAL_H
 #define NVDIMM_INTERNAL_H
 
+#define NVDIMM_DEBUG 0
+#define nvdebug(fmt, ...) \
+do {  \
+if (NVDIMM_DEBUG) {   \
+fprintf(stderr, "nvdimm: " fmt, ## __VA_ARGS__);  \
+} \
+} while (0)
+
 #define MIN_NAMESPACE_LABEL_SIZE(128UL << 10)
 
 struct uuid_le {
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-in

[PATCH v3 17/32] dimm: abstract dimm device from pc-dimm

2015-10-10 Thread Xiao Guangrong
A base device, dimm, is abstracted from pc-dimm, so that we can
build nvdimm device based on dimm in the later patch

Signed-off-by: Xiao Guangrong 
---
 default-configs/i386-softmmu.mak   |  1 +
 default-configs/x86_64-softmmu.mak |  1 +
 hw/mem/Makefile.objs   |  3 ++-
 hw/mem/dimm.c  | 11 ++---
 hw/mem/pc-dimm.c   | 46 ++
 include/hw/mem/dimm.h  |  4 ++--
 include/hw/mem/pc-dimm.h   |  7 ++
 7 files changed, 61 insertions(+), 12 deletions(-)
 create mode 100644 hw/mem/pc-dimm.c
 create mode 100644 include/hw/mem/pc-dimm.h

diff --git a/default-configs/i386-softmmu.mak b/default-configs/i386-softmmu.mak
index 43c96d1..3ece8bb 100644
--- a/default-configs/i386-softmmu.mak
+++ b/default-configs/i386-softmmu.mak
@@ -18,6 +18,7 @@ CONFIG_FDC=y
 CONFIG_ACPI=y
 CONFIG_ACPI_X86=y
 CONFIG_ACPI_X86_ICH=y
+CONFIG_DIMM=y
 CONFIG_ACPI_MEMORY_HOTPLUG=y
 CONFIG_ACPI_CPU_HOTPLUG=y
 CONFIG_APM=y
diff --git a/default-configs/x86_64-softmmu.mak 
b/default-configs/x86_64-softmmu.mak
index dfb8095..92ea7c1 100644
--- a/default-configs/x86_64-softmmu.mak
+++ b/default-configs/x86_64-softmmu.mak
@@ -18,6 +18,7 @@ CONFIG_FDC=y
 CONFIG_ACPI=y
 CONFIG_ACPI_X86=y
 CONFIG_ACPI_X86_ICH=y
+CONFIG_DIMM=y
 CONFIG_ACPI_MEMORY_HOTPLUG=y
 CONFIG_ACPI_CPU_HOTPLUG=y
 CONFIG_APM=y
diff --git a/hw/mem/Makefile.objs b/hw/mem/Makefile.objs
index 7563ef5..cebb4b1 100644
--- a/hw/mem/Makefile.objs
+++ b/hw/mem/Makefile.objs
@@ -1 +1,2 @@
-common-obj-$(CONFIG_MEM_HOTPLUG) += dimm.o
+common-obj-$(CONFIG_DIMM) += dimm.o
+common-obj-$(CONFIG_MEM_HOTPLUG) += pc-dimm.o
diff --git a/hw/mem/dimm.c b/hw/mem/dimm.c
index e007271..2e35764 100644
--- a/hw/mem/dimm.c
+++ b/hw/mem/dimm.c
@@ -1,5 +1,5 @@
 /*
- * Dimm device for Memory Hotplug
+ * Dimm device abstraction
  *
  * Copyright ProfitBricks GmbH 2012
  * Copyright (C) 2014 Red Hat Inc
@@ -425,21 +425,13 @@ static void dimm_realize(DeviceState *dev, Error **errp)
 }
 }
 
-static MemoryRegion *dimm_get_memory_region(DIMMDevice *dimm)
-{
-return host_memory_backend_get_memory(dimm->hostmem, &error_abort);
-}
-
 static void dimm_class_init(ObjectClass *oc, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(oc);
-DIMMDeviceClass *ddc = DIMM_CLASS(oc);
 
 dc->realize = dimm_realize;
 dc->props = dimm_properties;
 dc->desc = "DIMM memory module";
-
-ddc->get_memory_region = dimm_get_memory_region;
 }
 
 static TypeInfo dimm_info = {
@@ -449,6 +441,7 @@ static TypeInfo dimm_info = {
 .instance_init = dimm_init,
 .class_init= dimm_class_init,
 .class_size= sizeof(DIMMDeviceClass),
+.abstract  = true,
 };
 
 static void dimm_register_types(void)
diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c
new file mode 100644
index 000..38323e9
--- /dev/null
+++ b/hw/mem/pc-dimm.c
@@ -0,0 +1,46 @@
+/*
+ * Dimm device for Memory Hotplug
+ *
+ * Copyright ProfitBricks GmbH 2012
+ * Copyright (C) 2014 Red Hat Inc
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see 
+ */
+
+#include "hw/mem/pc-dimm.h"
+
+static MemoryRegion *pc_dimm_get_memory_region(DIMMDevice *dimm)
+{
+return host_memory_backend_get_memory(dimm->hostmem, &error_abort);
+}
+
+static void pc_dimm_class_init(ObjectClass *oc, void *data)
+{
+DIMMDeviceClass *ddc = DIMM_CLASS(oc);
+
+ddc->get_memory_region = pc_dimm_get_memory_region;
+}
+
+static TypeInfo pc_dimm_info = {
+.name  = TYPE_PC_DIMM,
+.parent= TYPE_DIMM,
+.class_init= pc_dimm_class_init,
+};
+
+static void pc_dimm_register_types(void)
+{
+type_register_static(&pc_dimm_info);
+}
+
+type_init(pc_dimm_register_types)
diff --git a/include/hw/mem/dimm.h b/include/hw/mem/dimm.h
index 5ddbf08..84a62ed 100644
--- a/include/hw/mem/dimm.h
+++ b/include/hw/mem/dimm.h
@@ -1,5 +1,5 @@
 /*
- * PC DIMM device
+ * Dimm device abstraction
  *
  * Copyright ProfitBricks GmbH 2012
  * Copyright (C) 2013-2014 Red Hat Inc
@@ -20,7 +20,7 @@
 #include "sysemu/hostmem.h"
 #include "hw/qdev.h"
 
-#define TYPE_DIMM "pc-dimm"
+#define TYPE_DIMM "dimm"
 #define DIMM(obj) \
 OBJECT_CHECK(DIMMDevice, (obj), TYPE_DIMM)
 #define DIMM_CLASS(oc) \
diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h
new file mode 100644
index 000..50818c2
--- /dev/null
+++ b/include/hw/mem/pc-dimm

[PATCH v3 16/32] pc-dimm: rename pc-dimm.c and pc-dimm.h

2015-10-10 Thread Xiao Guangrong
Rename:
   pc-dimm.c => dimm.c
   pc-dimm.h => dimm.h

It prepares the work which abstracts dimm device type for both pc-dimm and
nvdimm

Signed-off-by: Xiao Guangrong 
---
 hw/Makefile.objs | 2 +-
 hw/acpi/ich9.c   | 2 +-
 hw/acpi/memory_hotplug.c | 4 ++--
 hw/acpi/piix4.c  | 2 +-
 hw/i386/pc.c | 2 +-
 hw/mem/Makefile.objs | 2 +-
 hw/mem/{pc-dimm.c => dimm.c} | 2 +-
 hw/ppc/spapr.c   | 2 +-
 include/hw/i386/pc.h | 2 +-
 include/hw/mem/{pc-dimm.h => dimm.h} | 0
 include/hw/ppc/spapr.h   | 2 +-
 numa.c   | 2 +-
 qmp.c| 2 +-
 stubs/qmp_dimm_device_list.c | 2 +-
 14 files changed, 14 insertions(+), 14 deletions(-)
 rename hw/mem/{pc-dimm.c => dimm.c} (99%)
 rename include/hw/mem/{pc-dimm.h => dimm.h} (100%)

diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index 7e7c241..12ecda9 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -30,8 +30,8 @@ devices-dirs-$(CONFIG_SOFTMMU) += vfio/
 devices-dirs-$(CONFIG_VIRTIO) += virtio/
 devices-dirs-$(CONFIG_SOFTMMU) += watchdog/
 devices-dirs-$(CONFIG_SOFTMMU) += xen/
-devices-dirs-$(CONFIG_MEM_HOTPLUG) += mem/
 devices-dirs-$(CONFIG_SMBIOS) += smbios/
+devices-dirs-y += mem/
 devices-dirs-y += core/
 common-obj-y += $(devices-dirs-y)
 obj-y += $(devices-dirs-y)
diff --git a/hw/acpi/ich9.c b/hw/acpi/ich9.c
index b0d6a67..1e9ae20 100644
--- a/hw/acpi/ich9.c
+++ b/hw/acpi/ich9.c
@@ -35,7 +35,7 @@
 #include "exec/address-spaces.h"
 
 #include "hw/i386/ich9.h"
-#include "hw/mem/pc-dimm.h"
+#include "hw/mem/dimm.h"
 
 //#define DEBUG
 
diff --git a/hw/acpi/memory_hotplug.c b/hw/acpi/memory_hotplug.c
index 1f6..e232641 100644
--- a/hw/acpi/memory_hotplug.c
+++ b/hw/acpi/memory_hotplug.c
@@ -1,6 +1,6 @@
 #include "hw/acpi/memory_hotplug.h"
 #include "hw/acpi/pc-hotplug.h"
-#include "hw/mem/pc-dimm.h"
+#include "hw/mem/dimm.h"
 #include "hw/boards.h"
 #include "hw/qdev-core.h"
 #include "trace.h"
@@ -148,7 +148,7 @@ static void acpi_memory_hotplug_write(void *opaque, hwaddr 
addr, uint64_t data,
 
 dev = DEVICE(mdev->dimm);
 hotplug_ctrl = qdev_get_hotplug_handler(dev);
-/* call pc-dimm unplug cb */
+/* call dimm unplug cb */
 hotplug_handler_unplug(hotplug_ctrl, dev, &local_err);
 if (local_err) {
 trace_mhp_acpi_dimm_delete_failed(mem_st->selector);
diff --git a/hw/acpi/piix4.c b/hw/acpi/piix4.c
index 0b2cb6e..b2f5b2c 100644
--- a/hw/acpi/piix4.c
+++ b/hw/acpi/piix4.c
@@ -33,7 +33,7 @@
 #include "hw/acpi/pcihp.h"
 #include "hw/acpi/cpu_hotplug.h"
 #include "hw/hotplug.h"
-#include "hw/mem/pc-dimm.h"
+#include "hw/mem/dimm.h"
 #include "hw/acpi/memory_hotplug.h"
 #include "hw/acpi/acpi_dev_interface.h"
 #include "hw/xen/xen.h"
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index d6b9fa7..6694b18 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -62,7 +62,7 @@
 #include "hw/boards.h"
 #include "hw/pci/pci_host.h"
 #include "acpi-build.h"
-#include "hw/mem/pc-dimm.h"
+#include "hw/mem/dimm.h"
 #include "qapi/visitor.h"
 #include "qapi-visit.h"
 
diff --git a/hw/mem/Makefile.objs b/hw/mem/Makefile.objs
index b000fb4..7563ef5 100644
--- a/hw/mem/Makefile.objs
+++ b/hw/mem/Makefile.objs
@@ -1 +1 @@
-common-obj-$(CONFIG_MEM_HOTPLUG) += pc-dimm.o
+common-obj-$(CONFIG_MEM_HOTPLUG) += dimm.o
diff --git a/hw/mem/pc-dimm.c b/hw/mem/dimm.c
similarity index 99%
rename from hw/mem/pc-dimm.c
rename to hw/mem/dimm.c
index 9e26bf7..e007271 100644
--- a/hw/mem/pc-dimm.c
+++ b/hw/mem/dimm.c
@@ -18,7 +18,7 @@
  * License along with this library; if not, see 
  */
 
-#include "hw/mem/pc-dimm.h"
+#include "hw/mem/dimm.h"
 #include "qemu/config-file.h"
 #include "qapi/visitor.h"
 #include "qemu/range.h"
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 4fb91a5..171fa77 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2138,7 +2138,7 @@ static void spapr_machine_device_plug(HotplugHandler 
*hotplug_dev,
  *
  * - Memory gets hotplugged to a different node than what the user
  *   specified.
- * - Since pc-dimm subsystem in QEMU still thinks that memory belongs
+ * - Since dimm subsystem in QEMU still thinks that memory belongs
  *   to memory-less node, a reboot will set things accordingly
  *   and the previously hotplugged memory now ends in the right node.
  *   This appears as if some memory moved from one node to another.
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 0503485..693b6c5 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -16,7 +16,7 @@
 #include "hw/pci/pci.h"
 #include "hw/boards.h"
 #include "hw/compat.h"
-#include "hw/mem/pc-dimm.h"
+#include "hw/mem/dimm.h"
 
 #define HPET_INTCAP "hpet-intcap"
 
diff --git a/include/hw/mem/pc-dimm.h b

[PATCH v3 18/32] dimm: get mapped memory region from DIMMDeviceClass->get_memory_region

2015-10-10 Thread Xiao Guangrong
Currently, the memory region of backend memory is directly mapped to
guest's address space, however, it is not true for nvdimm device

This patch lets dimm device realize this fact and use
DIMMDeviceClass->get_memory_region method to get the mapped memory
region

Signed-off-by: Xiao Guangrong 
---
 hw/mem/dimm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/mem/dimm.c b/hw/mem/dimm.c
index 2e35764..b307511 100644
--- a/hw/mem/dimm.c
+++ b/hw/mem/dimm.c
@@ -373,8 +373,9 @@ static void dimm_get_size(Object *obj, Visitor *v, void 
*opaque,
 int64_t value;
 MemoryRegion *mr;
 DIMMDevice *dimm = DIMM(obj);
+DIMMDeviceClass *ddc = DIMM_GET_CLASS(obj);
 
-mr = host_memory_backend_get_memory(dimm->hostmem, errp);
+mr = ddc->get_memory_region(dimm);
 value = memory_region_size(mr);
 
 visit_type_int(v, &value, name, errp);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 25/32] nvdimm: build ACPI nvdimm devices

2015-10-10 Thread Xiao Guangrong
NVDIMM devices are defined in ACPI 6.0 9.20 NVDIMM Devices

There is a root device under \_SB and specified NVDIMM devices are under the
root device. Each NVDIMM device has _ADR which returns its handle used to
associate MEMDEV structure in NFIT

We reserve handle 0 for root device. In this patch, we save handle, arg0,
arg1 and arg2. Arg3 is conditionally saved in later patch

Signed-off-by: Xiao Guangrong 
---
 hw/mem/nvdimm/acpi.c | 203 +++
 1 file changed, 203 insertions(+)

diff --git a/hw/mem/nvdimm/acpi.c b/hw/mem/nvdimm/acpi.c
index 1450a6a..d9fa0fd 100644
--- a/hw/mem/nvdimm/acpi.c
+++ b/hw/mem/nvdimm/acpi.c
@@ -308,15 +308,38 @@ static void build_nfit(void *fit, GSList *device_list, 
GArray *table_offsets,
  "NFIT", table_data->len - nfit_start, 1);
 }
 
+#define NOTIFY_VALUE  0x99
+
+struct dsm_in {
+uint32_t handle;
+uint8_t arg0[16];
+uint32_t arg1;
+uint32_t arg2;
+   /* the remaining size in the page is used by arg3. */
+uint8_t arg3[0];
+} QEMU_PACKED;
+typedef struct dsm_in dsm_in;
+
+struct dsm_out {
+/* the size of buffer filled by QEMU. */
+uint16_t len;
+uint8_t data[0];
+} QEMU_PACKED;
+typedef struct dsm_out dsm_out;
+
 static uint64_t dsm_read(void *opaque, hwaddr addr,
  unsigned size)
 {
+fprintf(stderr, "BUG: we never read DSM notification MMIO.\n");
 return 0;
 }
 
 static void dsm_write(void *opaque, hwaddr addr,
   uint64_t val, unsigned size)
 {
+if (val != NOTIFY_VALUE) {
+fprintf(stderr, "BUG: unexepected notify value 0x%" PRIx64, val);
+}
 }
 
 static const MemoryRegionOps dsm_ops = {
@@ -372,6 +395,183 @@ static MemoryRegion *build_dsm_memory(NVDIMMState *state)
 return dsm_fit_mr;
 }
 
+#define BUILD_STA_METHOD(_dev_, _method_)  \
+do {   \
+_method_ = aml_method("_STA", 0);  \
+aml_append(_method_, aml_return(aml_int(0x0f)));   \
+aml_append(_dev_, _method_);   \
+} while (0)
+
+#define SAVE_ARG012_HANDLE_LOCK(_method_, _handle_)\
+do {   \
+aml_append(_method_, aml_acquire(aml_name("NLCK"), 0x));   \
+aml_append(_method_, aml_store(_handle_, aml_name("HDLE")));   \
+aml_append(_method_, aml_store(aml_arg(0), aml_name("ARG0"))); \
+aml_append(_method_, aml_store(aml_arg(1), aml_name("ARG1"))); \
+aml_append(_method_, aml_store(aml_arg(2), aml_name("ARG2"))); \
+} while (0)
+
+#define NOTIFY_AND_RETURN_UNLOCK(_method_)   \
+do {   \
+aml_append(_method_, aml_store(aml_int(NOTIFY_VALUE),  \
+   aml_name("NOTI"))); \
+aml_append(_method_, aml_store(aml_name("RLEN"), aml_local(6)));   \
+aml_append(_method_, aml_store(aml_shiftleft(aml_local(6), \
+  aml_int(3)), aml_local(6))); \
+aml_append(_method_, aml_create_field(aml_name("ODAT"), aml_int(0),\
+  aml_local(6) , "OBUF")); \
+aml_append(_method_, aml_name_decl("ZBUF", aml_buffer(0, NULL)));  \
+aml_append(_method_, aml_concatenate(aml_name("ZBUF"), \
+  aml_name("OBUF"), aml_arg(6)));  \
+aml_append(_method_, aml_release(aml_name("NLCK")));   \
+aml_append(_method_, aml_return(aml_arg(6)));  \
+} while (0)
+
+#define BUILD_FIELD_UNIT_STRUCT(_field_, _s_, _f_, _name_) \
+aml_append(_field_, aml_named_field(_name_,\
+   sizeof(typeof_field(_s_, _f_)) * BITS_PER_BYTE))
+
+#define BUILD_FIELD_UNIT_SIZE(_field_, _byte_, _name_) \
+aml_append(_field_, aml_named_field(_name_, (_byte_) * BITS_PER_BYTE))
+
+static void build_nvdimm_devices(NVDIMMState *state, GSList *device_list,
+ Aml *root_dev)
+{
+for (; device_list; device_list = device_list->next) {
+NVDIMMDevice *nvdimm = device_list->data;
+int slot = object_property_get_int(OBJECT(nvdimm), DIMM_SLOT_PROP,
+   NULL);
+uint32_t handle = nvdimm_slot_to_handle(slot);
+Aml *dev, *method;
+
+dev = aml_device("NV%02X", slot);
+aml_append(dev, aml_name_decl("_ADR", aml_int(handle)));
+
+BUILD_STA_METHOD(dev, method);
+
+method = aml_method("_DSM", 4);
+{
+SAVE_ARG012_HANDLE_LOCK(method, aml_int(handle));
+  

[PATCH v3 22/32] nvdimm: init the address region used by NVDIMM ACPI

2015-10-10 Thread Xiao Guangrong
We reserve the memory region 0xFF0 ~ 0xFFF0 for NVDIMM ACPI
which is used as:
- the first page is mapped as MMIO, ACPI writes data to this page to
  transfer the control to QEMU

- the second page is RAM-based, which is used to save the input info of
  _DSM method, and QEMU reuses it to store output info

- the rest is mapped as RAM, it's the buffer returned by _FIT method,
  this is needed by NVDIMM hotplug

Signed-off-by: Xiao Guangrong 
---
 hw/i386/pc.c|   3 ++
 hw/mem/Makefile.objs|   2 +-
 hw/mem/nvdimm/acpi.c| 120 
 include/hw/i386/pc.h|   2 +
 include/hw/mem/nvdimm.h |  19 
 5 files changed, 145 insertions(+), 1 deletion(-)
 create mode 100644 hw/mem/nvdimm/acpi.c

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 6694b18..8fea4c3 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1360,6 +1360,9 @@ FWCfgState *pc_memory_init(PCMachineState *pcms,
 exit(EXIT_FAILURE);
 }
 
+nvdimm_init_memory_state(&pcms->nvdimm_memory, system_memory, machine,
+ TARGET_PAGE_SIZE);
+
 pcms->hotplug_memory.base =
 ROUND_UP(0x1ULL + pcms->above_4g_mem_size, 1ULL << 30);
 
diff --git a/hw/mem/Makefile.objs b/hw/mem/Makefile.objs
index e0ff328..7310bac 100644
--- a/hw/mem/Makefile.objs
+++ b/hw/mem/Makefile.objs
@@ -1,3 +1,3 @@
 common-obj-$(CONFIG_DIMM) += dimm.o
 common-obj-$(CONFIG_MEM_HOTPLUG) += pc-dimm.o
-common-obj-$(CONFIG_NVDIMM) += nvdimm/nvdimm.o
+common-obj-$(CONFIG_NVDIMM) += nvdimm/nvdimm.o nvdimm/acpi.o
diff --git a/hw/mem/nvdimm/acpi.c b/hw/mem/nvdimm/acpi.c
new file mode 100644
index 000..b640874
--- /dev/null
+++ b/hw/mem/nvdimm/acpi.c
@@ -0,0 +1,120 @@
+/*
+ * NVDIMM ACPI Implementation
+ *
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Author:
+ *  Xiao Guangrong 
+ *
+ * NFIT is defined in ACPI 6.0: 5.2.25 NVDIMM Firmware Interface Table (NFIT)
+ * and the DSM specfication can be found at:
+ *   http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf
+ *
+ * Currently, it only supports PMEM Virtualization.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see 
+ */
+
+#include "qemu-common.h"
+#include "hw/acpi/acpi.h"
+#include "hw/acpi/aml-build.h"
+#include "hw/mem/nvdimm.h"
+#include "internal.h"
+
+/* System Physical Address Range Structure */
+struct nfit_spa {
+uint16_t type;
+uint16_t length;
+uint16_t spa_index;
+uint16_t flags;
+uint32_t reserved;
+uint32_t proximity_domain;
+uint8_t type_guid[16];
+uint64_t spa_base;
+uint64_t spa_length;
+uint64_t mem_attr;
+} QEMU_PACKED;
+typedef struct nfit_spa nfit_spa;
+
+/* Memory Device to System Physical Address Range Mapping Structure */
+struct nfit_memdev {
+uint16_t type;
+uint16_t length;
+uint32_t nfit_handle;
+uint16_t phys_id;
+uint16_t region_id;
+uint16_t spa_index;
+uint16_t dcr_index;
+uint64_t region_len;
+uint64_t region_offset;
+uint64_t region_dpa;
+uint16_t interleave_index;
+uint16_t interleave_ways;
+uint16_t flags;
+uint16_t reserved;
+} QEMU_PACKED;
+typedef struct nfit_memdev nfit_memdev;
+
+/* NVDIMM Control Region Structure */
+struct nfit_dcr {
+uint16_t type;
+uint16_t length;
+uint16_t dcr_index;
+uint16_t vendor_id;
+uint16_t device_id;
+uint16_t revision_id;
+uint16_t sub_vendor_id;
+uint16_t sub_device_id;
+uint16_t sub_revision_id;
+uint8_t reserved[6];
+uint32_t serial_number;
+uint16_t fic;
+uint16_t num_bcw;
+uint64_t bcw_size;
+uint64_t cmd_offset;
+uint64_t cmd_size;
+uint64_t status_offset;
+uint64_t status_size;
+uint16_t flags;
+uint8_t reserved2[6];
+} QEMU_PACKED;
+typedef struct nfit_dcr nfit_dcr;
+
+static uint64_t nvdimm_device_structure_size(uint64_t slots)
+{
+/* each nvdimm has three structures. */
+return slots * (sizeof(nfit_spa) + sizeof(nfit_memdev) + sizeof(nfit_dcr));
+}
+
+static uint64_t nvdimm_acpi_memory_size(uint64_t slots, uint64_t page_size)
+{
+uint64_t size = nvdimm_device_structure_size(slots);
+
+/* two pages for nvdimm _DSM method. */
+return size + page_size * 2;
+}
+
+void nvdimm_init_memory_state(NVDIMMState *state, MemoryRegion*system_memory,
+  MachineState *machine , uint64_

[PATCH v3 26/32] nvdimm: save arg3 for NVDIMM device _DSM method

2015-10-10 Thread Xiao Guangrong
Check if the input Arg3 is valid then store it into dsm_in if needed

We only do the save on NVDIMM device since we are not going to support any
function on root device

Signed-off-by: Xiao Guangrong 
---
 hw/mem/nvdimm/acpi.c | 21 -
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/hw/mem/nvdimm/acpi.c b/hw/mem/nvdimm/acpi.c
index d9fa0fd..3b9399c 100644
--- a/hw/mem/nvdimm/acpi.c
+++ b/hw/mem/nvdimm/acpi.c
@@ -442,7 +442,7 @@ static void build_nvdimm_devices(NVDIMMState *state, GSList 
*device_list,
 int slot = object_property_get_int(OBJECT(nvdimm), DIMM_SLOT_PROP,
NULL);
 uint32_t handle = nvdimm_slot_to_handle(slot);
-Aml *dev, *method;
+Aml *dev, *method, *ifctx;
 
 dev = aml_device("NV%02X", slot);
 aml_append(dev, aml_name_decl("_ADR", aml_int(handle)));
@@ -452,6 +452,24 @@ static void build_nvdimm_devices(NVDIMMState *state, 
GSList *device_list,
 method = aml_method("_DSM", 4);
 {
 SAVE_ARG012_HANDLE_LOCK(method, aml_int(handle));
+
+/* Arg3 is passed as Package and it has one element? */
+ifctx = aml_if(aml_and(aml_equal(aml_object_type(aml_arg(3)),
+ aml_int(4)),
+   aml_equal(aml_sizeof(aml_arg(3)),
+ aml_int(1;
+{
+/* Local0 = Index(Arg3, 0) */
+aml_append(ifctx, aml_store(aml_index(aml_arg(3), aml_int(0)),
+aml_local(0)));
+/* Local3 = DeRefOf(Local0) */
+aml_append(ifctx, aml_store(aml_derefof(aml_local(0)),
+aml_local(3)));
+/* ARG3 = Local3 */
+aml_append(ifctx, aml_store(aml_local(3), aml_name("ARG3")));
+}
+aml_append(method, ifctx);
+
 NOTIFY_AND_RETURN_UNLOCK(method);
 }
 aml_append(dev, method);
@@ -534,6 +552,7 @@ static void nvdimm_build_acpi_devices(NVDIMMState *state, 
GSList *device_list,
 method = aml_method("_DSM", 4);
 {
 SAVE_ARG012_HANDLE_LOCK(method, aml_int(0));
+/* no command we support on ROOT device has Arg3. */
 NOTIFY_AND_RETURN_UNLOCK(method);
 }
 aml_append(dev, method);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 19/32] dimm: keep the state of the whole backend memory

2015-10-10 Thread Xiao Guangrong
QEMU keeps the state of memory of dimm device during live migration,
however, it is not enough for nvdimm device as its memory does not
contain its label data, so that we should protect the whole backend
memory instead

Signed-off-by: Xiao Guangrong 
---
 hw/mem/dimm.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/hw/mem/dimm.c b/hw/mem/dimm.c
index b307511..efe964a 100644
--- a/hw/mem/dimm.c
+++ b/hw/mem/dimm.c
@@ -128,9 +128,16 @@ void dimm_memory_plug(DeviceState *dev, MemoryHotplugState 
*hpms,
 }
 
 memory_region_add_subregion(&hpms->mr, addr - hpms->base, mr);
-vmstate_register_ram(mr, dev);
 numa_set_mem_node_id(addr, memory_region_size(mr), dimm->node);
 
+/*
+ * save the state only for @mr is not enough as it does not contain
+ * the label data of NVDIMM device, so that we keep the state of
+ * whole hostmem instead.
+ */
+vmstate_register_ram(host_memory_backend_get_memory(dimm->hostmem, errp),
+ dev);
+
 out:
 error_propagate(errp, local_err);
 }
@@ -139,10 +146,13 @@ void dimm_memory_unplug(DeviceState *dev, 
MemoryHotplugState *hpms,
MemoryRegion *mr)
 {
 DIMMDevice *dimm = DIMM(dev);
+MemoryRegion *backend_mr;
+
+backend_mr = host_memory_backend_get_memory(dimm->hostmem, &error_abort);
 
 numa_unset_mem_node_id(dimm->addr, memory_region_size(mr), dimm->node);
 memory_region_del_subregion(&hpms->mr, mr);
-vmstate_unregister_ram(mr, dev);
+vmstate_unregister_ram(backend_mr, dev);
 }
 
 int qmp_dimm_device_list(Object *obj, void *opaque)
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 21/32] nvdimm: implement NVDIMM device abstract

2015-10-10 Thread Xiao Guangrong
Introduce "nvdimm" device which is based on dimm device type

A 128K memory region located at the end of the backend memory device is
reserved for label data; 128K is the minimum namespace label size
required by the NVDIMM Namespace Spec.

We can use "-m 1G,maxmem=100G,slots=10 -object memory-backend-file,
id=mem1,size=1G,mem-path=/dev/pmem0 -device nvdimm,memdev=mem1" to
create NVDIMM device for guest

Signed-off-by: Xiao Guangrong 
---
 default-configs/i386-softmmu.mak   |  1 +
 default-configs/x86_64-softmmu.mak |  1 +
 hw/acpi/memory_hotplug.c   |  6 +++
 hw/mem/Makefile.objs   |  1 +
 hw/mem/nvdimm/internal.h   | 17 
 hw/mem/nvdimm/nvdimm.c | 85 ++
 include/hw/mem/nvdimm.h| 33 +++
 7 files changed, 144 insertions(+)
 create mode 100644 hw/mem/nvdimm/internal.h
 create mode 100644 hw/mem/nvdimm/nvdimm.c
 create mode 100644 include/hw/mem/nvdimm.h

diff --git a/default-configs/i386-softmmu.mak b/default-configs/i386-softmmu.mak
index 3ece8bb..a1b24e5 100644
--- a/default-configs/i386-softmmu.mak
+++ b/default-configs/i386-softmmu.mak
@@ -47,6 +47,7 @@ CONFIG_APIC=y
 CONFIG_IOAPIC=y
 CONFIG_PVPANIC=y
 CONFIG_MEM_HOTPLUG=y
+CONFIG_NVDIMM = y
 CONFIG_XIO3130=y
 CONFIG_IOH3420=y
 CONFIG_I82801B11=y
diff --git a/default-configs/x86_64-softmmu.mak 
b/default-configs/x86_64-softmmu.mak
index 92ea7c1..e3f5a0b 100644
--- a/default-configs/x86_64-softmmu.mak
+++ b/default-configs/x86_64-softmmu.mak
@@ -47,6 +47,7 @@ CONFIG_APIC=y
 CONFIG_IOAPIC=y
 CONFIG_PVPANIC=y
 CONFIG_MEM_HOTPLUG=y
+CONFIG_NVDIMM = y
 CONFIG_XIO3130=y
 CONFIG_IOH3420=y
 CONFIG_I82801B11=y
diff --git a/hw/acpi/memory_hotplug.c b/hw/acpi/memory_hotplug.c
index e232641..92cd973 100644
--- a/hw/acpi/memory_hotplug.c
+++ b/hw/acpi/memory_hotplug.c
@@ -1,6 +1,7 @@
 #include "hw/acpi/memory_hotplug.h"
 #include "hw/acpi/pc-hotplug.h"
 #include "hw/mem/dimm.h"
+#include "hw/mem/nvdimm.h"
 #include "hw/boards.h"
 #include "hw/qdev-core.h"
 #include "trace.h"
@@ -231,6 +232,11 @@ void acpi_memory_plug_cb(ACPIREGS *ar, qemu_irq irq, 
MemHotplugState *mem_st,
 {
 MemStatus *mdev;
 
+/* Currently, NVDIMM hotplug has not been supported yet. */
+if (object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)) {
+return;
+}
+
 mdev = acpi_memory_slot_status(mem_st, dev, errp);
 if (!mdev) {
 return;
diff --git a/hw/mem/Makefile.objs b/hw/mem/Makefile.objs
index cebb4b1..e0ff328 100644
--- a/hw/mem/Makefile.objs
+++ b/hw/mem/Makefile.objs
@@ -1,2 +1,3 @@
 common-obj-$(CONFIG_DIMM) += dimm.o
 common-obj-$(CONFIG_MEM_HOTPLUG) += pc-dimm.o
+common-obj-$(CONFIG_NVDIMM) += nvdimm/nvdimm.o
diff --git a/hw/mem/nvdimm/internal.h b/hw/mem/nvdimm/internal.h
new file mode 100644
index 000..c4ba750
--- /dev/null
+++ b/hw/mem/nvdimm/internal.h
@@ -0,0 +1,17 @@
+/*
+ * Non-Volatile Dual In-line Memory Module Virtualization Implementation
+ *
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Author:
+ *  Xiao Guangrong 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef NVDIMM_INTERNAL_H
+#define NVDIMM_INTERNAL_H
+
+#define MIN_NAMESPACE_LABEL_SIZE(128UL << 10)
+#endif
diff --git a/hw/mem/nvdimm/nvdimm.c b/hw/mem/nvdimm/nvdimm.c
new file mode 100644
index 000..0850e82
--- /dev/null
+++ b/hw/mem/nvdimm/nvdimm.c
@@ -0,0 +1,85 @@
+/*
+ * Non-Volatile Dual In-line Memory Module Virtualization Implementation
+ *
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Author:
+ *  Xiao Guangrong 
+ *
+ * Currently, it only supports PMEM Virtualization.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see 
+ */
+
+#include "qapi/visitor.h"
+#include "hw/mem/nvdimm.h"
+#include "internal.h"
+
+static MemoryRegion *nvdimm_get_memory_region(DIMMDevice *dimm)
+{
+NVDIMMDevice *nvdimm = NVDIMM(dimm);
+
+return memory_region_size(&nvdimm->nvdimm_mr) ? &nvdimm->nvdimm_mr : NULL;
+}
+
+static void nvdimm_realize(DIMMDevice *dimm, Error **errp)
+{
+MemoryRegion *mr;
+NVDIMMDevice *nvdimm = NVDIMM(dimm);
+uint64_t size;
+
+nvdimm->label_size = MIN_NAMESPACE_LABEL_SIZE;
+
+mr = host_memory_backend_get_memory(dimm->hostmem, errp);
+size = memory_region_size(mr);
+
+if (size <= nvdimm->label_size) {
+char *path 

[PATCH v3 15/32] stubs: rename qmp_pc_dimm_device_list.c

2015-10-10 Thread Xiao Guangrong
Rename qmp_pc_dimm_device_list.c to qmp_dimm_device_list.c

Signed-off-by: Xiao Guangrong 
---
 stubs/Makefile.objs | 2 +-
 stubs/{qmp_pc_dimm_device_list.c => qmp_dimm_device_list.c} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename stubs/{qmp_pc_dimm_device_list.c => qmp_dimm_device_list.c} (100%)

diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs
index 85e4e81..c2fdcbb 100644
--- a/stubs/Makefile.objs
+++ b/stubs/Makefile.objs
@@ -37,5 +37,5 @@ stub-obj-y += vmstate.o
 stub-obj-$(CONFIG_WIN32) += fd-register.o
 stub-obj-y += cpus.o
 stub-obj-y += kvm.o
-stub-obj-y += qmp_pc_dimm_device_list.o
+stub-obj-y += qmp_dimm_device_list.o
 stub-obj-y += target-monitor-defs.o
diff --git a/stubs/qmp_pc_dimm_device_list.c b/stubs/qmp_dimm_device_list.c
similarity index 100%
rename from stubs/qmp_pc_dimm_device_list.c
rename to stubs/qmp_dimm_device_list.c
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 28/32] nvdimm: support DSM_CMD_NAMESPACE_LABEL_SIZE function

2015-10-10 Thread Xiao Guangrong
Function 4 is used to get Namespace label size

Signed-off-by: Xiao Guangrong 
---
 hw/mem/nvdimm/acpi.c | 90 +---
 1 file changed, 86 insertions(+), 4 deletions(-)

diff --git a/hw/mem/nvdimm/acpi.c b/hw/mem/nvdimm/acpi.c
index cb6a428..b420e2f 100644
--- a/hw/mem/nvdimm/acpi.c
+++ b/hw/mem/nvdimm/acpi.c
@@ -379,13 +379,29 @@ enum {
| (1 << DSM_CMD_GET_NAMESPACE_LABEL_DATA)  \
| (1 << DSM_CMD_SET_NAMESPACE_LABEL_DATA))
 
+struct cmd_in_get_label_data {
+uint32_t offset;
+uint32_t length;
+} QEMU_PACKED;
+typedef struct cmd_in_get_label_data cmd_in_get_label_data;
+
+struct cmd_in_set_label_data {
+uint32_t offset;
+uint32_t length;
+uint8_t in_buf[0];
+} QEMU_PACKED;
+typedef struct cmd_in_set_label_data cmd_in_set_label_data;
+
 struct dsm_in {
 uint32_t handle;
 uint8_t arg0[16];
 uint32_t arg1;
 uint32_t arg2;
/* the remaining size in the page is used by arg3. */
-uint8_t arg3[0];
+union {
+uint8_t arg3[0];
+cmd_in_set_label_data cmd_set_label_data;
+};
 } QEMU_PACKED;
 typedef struct dsm_in dsm_in;
 
@@ -394,6 +410,19 @@ struct cmd_out_implemented {
 };
 typedef struct cmd_out_implemented cmd_out_implemented;
 
+struct cmd_out_label_size {
+uint32_t status;
+uint32_t label_size;
+uint32_t max_xfer;
+} QEMU_PACKED;
+typedef struct cmd_out_label_size cmd_out_label_size;
+
+struct cmd_out_get_label_data {
+uint32_t status;
+uint8_t out_buf[0];
+} QEMU_PACKED;
+typedef struct cmd_out_get_label_data cmd_out_get_label_data;
+
 struct dsm_out {
 /* the size of buffer filled by QEMU. */
 uint16_t len;
@@ -401,6 +430,8 @@ struct dsm_out {
 uint8_t data[0];
 uint32_t status;
 cmd_out_implemented cmd_implemented;
+cmd_out_label_size cmd_label_size;
+cmd_out_get_label_data cmd_get_label_data;
 };
 } QEMU_PACKED;
 typedef struct dsm_out dsm_out;
@@ -425,8 +456,56 @@ static void dsm_write_root(uint32_t function, dsm_in *in, 
dsm_out *out)
 nvdebug("Return status %#x.\n", out->status);
 }
 
-static void dsm_write_nvdimm(uint32_t handle, uint32_t function, dsm_in *in,
- dsm_out *out)
+/*
+ * the max transfer size is the max size transfered by both a
+ * DSM_CMD_GET_NAMESPACE_LABEL_DATA and a DSM_CMD_SET_NAMESPACE_LABEL_DATA
+ * command.
+ */
+static uint32_t max_xfer_label_size(MemoryRegion *dsm_ram_mr)
+{
+dsm_in *in;
+dsm_out *out;
+uint32_t mr_size, max_get_size, max_set_size;
+
+mr_size = memory_region_size(dsm_ram_mr);
+
+/*
+ * the max data ACPI can read one time which is transfered by
+ * the response of DSM_CMD_GET_NAMESPACE_LABEL_DATA.
+ */
+max_get_size = mr_size - offsetof(dsm_out, data) -
+   sizeof(out->cmd_get_label_data);
+
+/*
+ * the max data ACPI can write one time which is transfered by
+ * DSM_CMD_SET_NAMESPACE_LABEL_DATA
+ */
+max_set_size = mr_size - offsetof(dsm_in, arg3) -
+   sizeof(in->cmd_set_label_data);
+
+return MIN(max_get_size, max_set_size);
+}
+
+static uint32_t
+dsm_cmd_label_size(MemoryRegion *dsm_ram_mr, NVDIMMDevice *nvdimm,
+dsm_out *out)
+{
+uint32_t label_size, mxfer;
+
+label_size = nvdimm->label_size;
+mxfer = max_xfer_label_size(dsm_ram_mr);
+
+out->cmd_label_size.label_size = cpu_to_le32(label_size);
+out->cmd_label_size.max_xfer = cpu_to_le32(mxfer);
+out->len = sizeof(out->cmd_label_size);
+
+nvdebug("%s label_size %#x, max_xfer %#x.\n", __func__, label_size, mxfer);
+
+return DSM_STATUS_SUCCESS;
+}
+
+static void dsm_write_nvdimm(MemoryRegion *dsm_ram_mr, uint32_t handle,
+ uint32_t function, dsm_in *in, dsm_out *out)
 {
 GSList *list = nvdimm_get_built_list();
 NVDIMMDevice *nvdimm = get_nvdimm_device_by_handle(list, handle);
@@ -444,6 +523,9 @@ static void dsm_write_nvdimm(uint32_t handle, uint32_t 
function, dsm_in *in,
 out->len = sizeof(out->cmd_implemented);
 out->cmd_implemented.cmd_list = cpu_to_le64(cmd_list);
 goto free;
+case DSM_CMD_NAMESPACE_LABEL_SIZE:
+status = dsm_cmd_label_size(dsm_ram_mr, nvdimm, out);
+break;
 default:
 out->len = sizeof(out->status);
 status = DSM_STATUS_NOT_SUPPORTED;
@@ -511,7 +593,7 @@ static void dsm_write(void *opaque, hwaddr addr,
 goto exit;
 }
 
-return dsm_write_nvdimm(handle, function, in, out);
+return dsm_write_nvdimm(dsm_ram_mr, handle, function, in, out);
 
 exit:
 out->len = sizeof(out->status);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 32/32] nvdimm: add maintain info

2015-10-10 Thread Xiao Guangrong
Add NVDIMM maintainer

Signed-off-by: Xiao Guangrong 
---
 MAINTAINERS | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 9bde832..204d82a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -868,6 +868,12 @@ M: Jiri Pirko 
 S: Maintained
 F: hw/net/rocker/
 
+NVDIMM
+M: Xiao Guangrong 
+S: Maintained
+F: hw/mem/nvdimm/*
+F: include/hw/mem/nvdimm.h
+
 Subsystems
 --
 Audio
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 27/32] nvdimm: support DSM_CMD_IMPLEMENTED function

2015-10-10 Thread Xiao Guangrong
_DSM is defined in ACPI 6.0: 9.14.1 _DSM (Device Specific Method)

Function 0 is a query function. We do not support any other function on the
root device, and only 3 functions are supported for an NVDIMM device:
DSM_CMD_NAMESPACE_LABEL_SIZE, DSM_CMD_GET_NAMESPACE_LABEL_DATA and
DSM_CMD_SET_NAMESPACE_LABEL_DATA. This means we currently only allow access
to the device's Label Namespace.

Signed-off-by: Xiao Guangrong 
---
 hw/mem/nvdimm/acpi.c | 178 ++-
 1 file changed, 177 insertions(+), 1 deletion(-)

diff --git a/hw/mem/nvdimm/acpi.c b/hw/mem/nvdimm/acpi.c
index 3b9399c..cb6a428 100644
--- a/hw/mem/nvdimm/acpi.c
+++ b/hw/mem/nvdimm/acpi.c
@@ -39,6 +39,22 @@ static void nfit_spa_uuid_pm(uuid_le *uuid)
 memcpy(uuid, &uuid_pm, sizeof(uuid_pm));
 }
 
+static bool dsm_is_root_uuid(uint8_t *uuid)
+{
+uuid_le uuid_root = UUID_LE(0x2f10e7a4, 0x9e91, 0x11e4, 0x89,
+0xd3, 0x12, 0x3b, 0x93, 0xf7, 0x5c, 0xba);
+
+return !memcmp(uuid, &uuid_root, sizeof(uuid_root));
+}
+
+static bool dsm_is_dimm_uuid(uint8_t *uuid)
+{
+uuid_le uuid_dimm = UUID_LE(0x4309ac30, 0x0d11, 0x11e4, 0x91,
+0x91, 0x08, 0x00, 0x20, 0x0c, 0x9a, 0x66);
+
+return !memcmp(uuid, &uuid_dimm, sizeof(uuid_dimm));
+}
+
 enum {
 NFIT_STRUCTURE_SPA = 0,
 NFIT_STRUCTURE_MEMDEV = 1,
@@ -195,6 +211,22 @@ static uint32_t nvdimm_slot_to_dcr_index(int slot)
 return nvdimm_slot_to_spa_index(slot) + 1;
 }
 
+static NVDIMMDevice
+*get_nvdimm_device_by_handle(GSList *list, uint32_t handle)
+{
+for (; list; list = list->next) {
+NVDIMMDevice *nvdimm = list->data;
+int slot = object_property_get_int(OBJECT(nvdimm), DIMM_SLOT_PROP,
+   NULL);
+
+if (nvdimm_slot_to_handle(slot) == handle) {
+return nvdimm;
+}
+}
+
+return NULL;
+}
+
 static int build_structure_spa(void *buf, NVDIMMDevice *nvdimm)
 {
 nfit_spa *nfit_spa;
@@ -310,6 +342,43 @@ static void build_nfit(void *fit, GSList *device_list, 
GArray *table_offsets,
 
 #define NOTIFY_VALUE  0x99
 
+enum {
+DSM_CMD_IMPLEMENTED = 0,
+
+/* root device commands */
+DSM_CMD_ARS_CAP = 1,
+DSM_CMD_ARS_START = 2,
+DSM_CMD_ARS_QUERY = 3,
+
+/* per-nvdimm device commands */
+DSM_CMD_SMART = 1,
+DSM_CMD_SMART_THRESHOLD = 2,
+DSM_CMD_BLOCK_NVDIMM_FLAGS = 3,
+DSM_CMD_NAMESPACE_LABEL_SIZE = 4,
+DSM_CMD_GET_NAMESPACE_LABEL_DATA = 5,
+DSM_CMD_SET_NAMESPACE_LABEL_DATA = 6,
+DSM_CMD_VENDOR_EFFECT_LOG_SIZE = 7,
+DSM_CMD_GET_VENDOR_EFFECT_LOG = 8,
+DSM_CMD_VENDOR_SPECIFIC = 9,
+};
+
+enum {
+DSM_STATUS_SUCCESS = 0,
+DSM_STATUS_NOT_SUPPORTED = 1,
+DSM_STATUS_NON_EXISTING_MEM_DEV = 2,
+DSM_STATUS_INVALID_PARAS = 3,
+DSM_STATUS_VENDOR_SPECIFIC_ERROR = 4,
+};
+
+#define DSM_REVISION(1)
+
+/* do not support any command except NFIT_CMD_IMPLEMENTED on root. */
+#define ROOT_SUPPORT_CMD(1 << DSM_CMD_IMPLEMENTED)
+#define DIMM_SUPPORT_CMD((1 << DSM_CMD_IMPLEMENTED)   \
+   | (1 << DSM_CMD_NAMESPACE_LABEL_SIZE)  \
+   | (1 << DSM_CMD_GET_NAMESPACE_LABEL_DATA)  \
+   | (1 << DSM_CMD_SET_NAMESPACE_LABEL_DATA))
+
 struct dsm_in {
 uint32_t handle;
 uint8_t arg0[16];
@@ -320,10 +389,19 @@ struct dsm_in {
 } QEMU_PACKED;
 typedef struct dsm_in dsm_in;
 
+struct cmd_out_implemented {
+uint64_t cmd_list;
+};
+typedef struct cmd_out_implemented cmd_out_implemented;
+
 struct dsm_out {
 /* the size of buffer filled by QEMU. */
 uint16_t len;
-uint8_t data[0];
+union {
+uint8_t data[0];
+uint32_t status;
+cmd_out_implemented cmd_implemented;
+};
 } QEMU_PACKED;
 typedef struct dsm_out dsm_out;
 
@@ -334,12 +412,110 @@ static uint64_t dsm_read(void *opaque, hwaddr addr,
 return 0;
 }
 
+static void dsm_write_root(uint32_t function, dsm_in *in, dsm_out *out)
+{
+if (function == DSM_CMD_IMPLEMENTED) {
+out->len = sizeof(out->cmd_implemented);
+out->cmd_implemented.cmd_list = cpu_to_le64(ROOT_SUPPORT_CMD);
+return;
+}
+
+out->len = sizeof(out->status);
+out->status = cpu_to_le32(DSM_STATUS_NOT_SUPPORTED);
+nvdebug("Return status %#x.\n", out->status);
+}
+
+static void dsm_write_nvdimm(uint32_t handle, uint32_t function, dsm_in *in,
+ dsm_out *out)
+{
+GSList *list = nvdimm_get_built_list();
+NVDIMMDevice *nvdimm = get_nvdimm_device_by_handle(list, handle);
+uint32_t status = DSM_STATUS_NON_EXISTING_MEM_DEV;
+uint64_t cmd_list;
+
+if (!nvdimm) {
+out->len = sizeof(out->status);
+goto set_status_free;
+}
+
+switch (function) {
+case DSM_CMD_IMPLEMENTED:
+cmd_list = DIMM_SUPPORT_CMD;
+out->len = sizeof(out->cmd_implemented);
+out->cmd_implemented.cmd_list = 

[PATCH v3 29/32] nvdimm: support DSM_CMD_GET_NAMESPACE_LABEL_DATA

2015-10-10 Thread Xiao Guangrong
Function 5 is used to get Namespace Label Data

Signed-off-by: Xiao Guangrong 
---
 hw/mem/nvdimm/acpi.c | 33 +
 1 file changed, 33 insertions(+)

diff --git a/hw/mem/nvdimm/acpi.c b/hw/mem/nvdimm/acpi.c
index b420e2f..e0a37cb 100644
--- a/hw/mem/nvdimm/acpi.c
+++ b/hw/mem/nvdimm/acpi.c
@@ -401,6 +401,7 @@ struct dsm_in {
 union {
 uint8_t arg3[0];
 cmd_in_set_label_data cmd_set_label_data;
+cmd_in_get_label_data cmd_get_label_data;
 };
 } QEMU_PACKED;
 typedef struct dsm_in dsm_in;
@@ -504,6 +505,35 @@ dsm_cmd_label_size(MemoryRegion *dsm_ram_mr, NVDIMMDevice 
*nvdimm,
 return DSM_STATUS_SUCCESS;
 }
 
+static uint32_t dsm_cmd_get_label_data(NVDIMMDevice *nvdimm, dsm_in *in,
+   dsm_out *out)
+{
+cmd_in_get_label_data *cmd_in = &in->cmd_get_label_data;
+uint32_t length, offset, status;
+
+length = cmd_in->length;
+offset = cmd_in->offset;
+le32_to_cpus(&length);
+le32_to_cpus(&offset);
+
+nvdebug("Read Label Data: offset %#x length %#x.\n", offset, length);
+
+if (nvdimm->label_size < length + offset) {
+nvdebug("position %#x is beyond label data (len = %#lx).\n",
+length + offset, nvdimm->label_size);
+out->len = sizeof(out->status);
+status = DSM_STATUS_INVALID_PARAS;
+goto exit;
+}
+
+status = DSM_STATUS_SUCCESS;
+memcpy(out->cmd_get_label_data.out_buf, nvdimm->label_data +
+   offset, length);
+out->len = sizeof(out->cmd_get_label_data) + length;
+exit:
+return status;
+}
+
 static void dsm_write_nvdimm(MemoryRegion *dsm_ram_mr, uint32_t handle,
  uint32_t function, dsm_in *in, dsm_out *out)
 {
@@ -526,6 +556,9 @@ static void dsm_write_nvdimm(MemoryRegion *dsm_ram_mr, 
uint32_t handle,
 case DSM_CMD_NAMESPACE_LABEL_SIZE:
 status = dsm_cmd_label_size(dsm_ram_mr, nvdimm, out);
 break;
+case DSM_CMD_GET_NAMESPACE_LABEL_DATA:
+status = dsm_cmd_get_label_data(nvdimm, in, out);
+break;
 default:
 out->len = sizeof(out->status);
 status = DSM_STATUS_NOT_SUPPORTED;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 20/32] dimm: introduce realize callback

2015-10-10 Thread Xiao Guangrong
nvdimm needs to check whether the backend memory is large enough to contain
the label data, and to initialize its memory region, when the device is
realized. So introduce a realize callback which is called after the common
dimm has been realized.

Signed-off-by: Xiao Guangrong 
---
 hw/mem/dimm.c | 5 +
 include/hw/mem/dimm.h | 1 +
 2 files changed, 6 insertions(+)

diff --git a/hw/mem/dimm.c b/hw/mem/dimm.c
index efe964a..7a87761 100644
--- a/hw/mem/dimm.c
+++ b/hw/mem/dimm.c
@@ -422,6 +422,7 @@ static void dimm_init(Object *obj)
 static void dimm_realize(DeviceState *dev, Error **errp)
 {
 DIMMDevice *dimm = DIMM(dev);
+DIMMDeviceClass *ddc = DIMM_GET_CLASS(dimm);
 
 if (!dimm->hostmem) {
 error_setg(errp, "'" DIMM_MEMDEV_PROP "' property is not set");
@@ -434,6 +435,10 @@ static void dimm_realize(DeviceState *dev, Error **errp)
dimm->node, nb_numa_nodes ? nb_numa_nodes : 1);
 return;
 }
+
+if (ddc->realize) {
+ddc->realize(dimm, errp);
+}
 }
 
 static void dimm_class_init(ObjectClass *oc, void *data)
diff --git a/include/hw/mem/dimm.h b/include/hw/mem/dimm.h
index 84a62ed..663288d 100644
--- a/include/hw/mem/dimm.h
+++ b/include/hw/mem/dimm.h
@@ -65,6 +65,7 @@ typedef struct DIMMDeviceClass {
 DeviceClass parent_class;
 
 /* public */
+void (*realize)(DIMMDevice *dimm, Error **errp);
 MemoryRegion *(*get_memory_region)(DIMMDevice *dimm);
 } DIMMDeviceClass;
 
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 31/32] nvdimm: allow using whole backend memory as pmem

2015-10-10 Thread Xiao Guangrong
Introduce a parameter named "reserve-label-data". When it is false,
QEMU does not reserve any region of the backend memory for label
data; instead, it builds read-only label data in memory, which has an
active namespace covering the whole backend memory.

This is useful for users who want to pass through a whole nvdimm device
and make all of its data visible to the guest.

The parameter is false by default.

Signed-off-by: Xiao Guangrong 
---
 hw/mem/Makefile.objs  |   3 +-
 hw/mem/nvdimm/acpi.c  |  20 +++
 hw/mem/nvdimm/internal.h  |   3 +
 hw/mem/nvdimm/namespace.c | 309 ++
 hw/mem/nvdimm/nvdimm.c|  36 +-
 include/hw/mem/nvdimm.h   |   4 +
 6 files changed, 369 insertions(+), 6 deletions(-)
 create mode 100644 hw/mem/nvdimm/namespace.c

diff --git a/hw/mem/Makefile.objs b/hw/mem/Makefile.objs
index 7310bac..fc76ca5 100644
--- a/hw/mem/Makefile.objs
+++ b/hw/mem/Makefile.objs
@@ -1,3 +1,4 @@
 common-obj-$(CONFIG_DIMM) += dimm.o
 common-obj-$(CONFIG_MEM_HOTPLUG) += pc-dimm.o
-common-obj-$(CONFIG_NVDIMM) += nvdimm/nvdimm.o nvdimm/acpi.o
+common-obj-$(CONFIG_NVDIMM) += nvdimm/nvdimm.o nvdimm/acpi.o   \
+   nvdimm/namespace.o
diff --git a/hw/mem/nvdimm/acpi.c b/hw/mem/nvdimm/acpi.c
index 6f05b37..e6694bc 100644
--- a/hw/mem/nvdimm/acpi.c
+++ b/hw/mem/nvdimm/acpi.c
@@ -305,6 +305,8 @@ static void build_device_structure(GSList *device_list, 
char *buf)
 {
 for (; device_list; device_list = device_list->next) {
 NVDIMMDevice *nvdimm = device_list->data;
+nfit_memdev *memdev;
+nfit_dcr *dcr;
 
 /* build System Physical Address Range Description Table. */
 buf += build_structure_spa(buf, nvdimm);
@@ -313,10 +315,15 @@ static void build_device_structure(GSList *device_list, 
char *buf)
  * build Memory Device to System Physical Address Range Mapping
  * Table.
  */
+memdev = (nfit_memdev *)buf;
 buf += build_structure_memdev(buf, nvdimm);
 
 /* build Control Region Descriptor Table. */
+dcr = (struct nfit_dcr *)buf;
 buf += build_structure_dcr(buf, nvdimm);
+
+calculate_nvdimm_isetcookie(nvdimm, memdev->region_offset,
+dcr->serial_number);
 }
 }
 
@@ -560,6 +567,12 @@ dsm_cmd_set_label_data(NVDIMMDevice *nvdimm, dsm_in *in, 
dsm_out *out)
 goto exit;
 }
 
+if (!nvdimm->reserve_label_data) {
+out->len = sizeof(out->status);
+status = DSM_STATUS_NOT_SUPPORTED;
+goto exit;
+}
+
 status = DSM_STATUS_SUCCESS;
 memcpy(nvdimm->label_data + offset, cmd_in->in_buf, length);
 out->len = sizeof(status);
@@ -583,6 +596,10 @@ static void dsm_write_nvdimm(MemoryRegion *dsm_ram_mr, 
uint32_t handle,
 switch (function) {
 case DSM_CMD_IMPLEMENTED:
 cmd_list = DIMM_SUPPORT_CMD;
+if (!nvdimm->reserve_label_data) {
+cmd_list &= ~(1 << DSM_CMD_SET_NAMESPACE_LABEL_DATA);
+}
+
 out->len = sizeof(out->cmd_implemented);
 out->cmd_implemented.cmd_list = cpu_to_le64(cmd_list);
 goto free;
@@ -936,6 +953,9 @@ void nvdimm_build_acpi_table(NVDIMMState *state, GArray 
*table_offsets,
 
 nvdimm_build_ssdt(state, device_list, table_offsets, table_data,
   linker);
+
+build_nvdimm_label_data(device_list);
+
 g_slist_free(device_list);
 }
 }
diff --git a/hw/mem/nvdimm/internal.h b/hw/mem/nvdimm/internal.h
index 1e95363..f523175 100644
--- a/hw/mem/nvdimm/internal.h
+++ b/hw/mem/nvdimm/internal.h
@@ -35,4 +35,7 @@ typedef struct uuid_le uuid_le;
 (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) } })
 
 GSList *nvdimm_get_built_list(void);
+void calculate_nvdimm_isetcookie(NVDIMMDevice *nvdimm, uint64_t spa_offset,
+ uint32_t sn);
+void build_nvdimm_label_data(GSList *device_list);
 #endif
diff --git a/hw/mem/nvdimm/namespace.c b/hw/mem/nvdimm/namespace.c
new file mode 100644
index 000..fe58f9a
--- /dev/null
+++ b/hw/mem/nvdimm/namespace.c
@@ -0,0 +1,309 @@
+/*
+ * NVDIMM  Namespace Support
+ *
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Author:
+ *  Xiao Guangrong 
+ *
+ * NVDIMM namespace specification can be found at:
+ *  http://pmem.io/documents/NVDIMM_Namespace_Spec.pdf
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along wit

[PATCH v3 30/32] nvdimm: support DSM_CMD_SET_NAMESPACE_LABEL_DATA

2015-10-10 Thread Xiao Guangrong
Function 6 is used to set Namespace Label Data

Signed-off-by: Xiao Guangrong 
---
 hw/mem/nvdimm/acpi.c | 36 
 1 file changed, 36 insertions(+)

diff --git a/hw/mem/nvdimm/acpi.c b/hw/mem/nvdimm/acpi.c
index e0a37cb..6f05b37 100644
--- a/hw/mem/nvdimm/acpi.c
+++ b/hw/mem/nvdimm/acpi.c
@@ -424,6 +424,11 @@ struct cmd_out_get_label_data {
 } QEMU_PACKED;
 typedef struct cmd_out_get_label_data cmd_out_get_label_data;
 
+struct cmd_out_set_label_data {
+uint32_t status;
+};
+typedef struct cmd_out_set_label_data cmd_out_set_label_data;
+
 struct dsm_out {
 /* the size of buffer filled by QEMU. */
 uint16_t len;
@@ -433,6 +438,7 @@ struct dsm_out {
 cmd_out_implemented cmd_implemented;
 cmd_out_label_size cmd_label_size;
 cmd_out_get_label_data cmd_get_label_data;
+cmd_out_set_label_data cmd_set_label_data;
 };
 } QEMU_PACKED;
 typedef struct dsm_out dsm_out;
@@ -534,6 +540,33 @@ exit:
 return status;
 }
 
+static uint32_t
+dsm_cmd_set_label_data(NVDIMMDevice *nvdimm, dsm_in *in, dsm_out *out)
+{
+cmd_in_set_label_data *cmd_in = &in->cmd_set_label_data;
+uint32_t length, offset, status;
+
+length = cmd_in->length;
+offset = cmd_in->offset;
+le32_to_cpus(&length);
+le32_to_cpus(&offset);
+
+nvdebug("Write Label Data: offset %#x length %#x.\n", offset, length);
+if (nvdimm->label_size < length + offset) {
+nvdebug("position %#x is beyond config data (len = %#lx).\n",
+length + offset, nvdimm->label_size);
+out->len = sizeof(out->status);
+status = DSM_STATUS_INVALID_PARAS;
+goto exit;
+}
+
+status = DSM_STATUS_SUCCESS;
+memcpy(nvdimm->label_data + offset, cmd_in->in_buf, length);
+out->len = sizeof(status);
+exit:
+return status;
+}
+
 static void dsm_write_nvdimm(MemoryRegion *dsm_ram_mr, uint32_t handle,
  uint32_t function, dsm_in *in, dsm_out *out)
 {
@@ -559,6 +592,9 @@ static void dsm_write_nvdimm(MemoryRegion *dsm_ram_mr, 
uint32_t handle,
 case DSM_CMD_GET_NAMESPACE_LABEL_DATA:
 status = dsm_cmd_get_label_data(nvdimm, in, out);
 break;
+case DSM_CMD_SET_NAMESPACE_LABEL_DATA:
+status = dsm_cmd_set_label_data(nvdimm, in, out);
+break;
 default:
 out->len = sizeof(out->status);
 status = DSM_STATUS_NOT_SUPPORTED;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [edk2] KVM: MTRR: fix memory type handling if MTRR is completely disabled

2015-10-10 Thread Xiao Guangrong



On 10/02/2015 10:38 PM, Janusz wrote:

W dniu 01.10.2015 o 16:18, Paolo Bonzini pisze:


On 01/10/2015 16:12, Janusz wrote:

Now, I can also add, that the problem is only when I allow VM to use
more than one core, so with option  for example:
-smp 8,cores=4,threads=2,sockets=1 and other combinations like -smp
4,threads=1 its not working, and without it I am always running VM
without problems

Any ideas what can it be? or any idea what would help to find out what
is causing this?

I am going to send a revert of the patch tomorrow.

Paolo

Thanks, but revert patch doesn't help, so something else is wrong here



It seems i can reproduce it now ... and finally i get little free time now :(
I will dig into it and fix it asap.

Thank you, Janusz and Paolo!
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 00/32] implement vNVDIMM

2015-10-10 Thread Dan Williams
On Sat, Oct 10, 2015 at 8:52 PM, Xiao Guangrong
 wrote:
[..]
> == Test ==
> In host
> 1) create memory backed file, e.g # dd if=zero of=/tmp/nvdimm bs=1G count=10
> 2) append "-object memory-backend-file,share,id=mem1,
>mem-path=/tmp/nvdimm -device nvdimm,memdev=mem1,reserve-label-data,
>id=nv1" in QEMU command line
>
> In guest, download the latest upsteam kernel (4.2 merge window) and enable
> ACPI_NFIT, LIBNVDIMM and BLK_DEV_PMEM.
> 1) insmod drivers/nvdimm/libnvdimm.ko
> 2) insmod drivers/acpi/nfit.ko
> 3) insmod drivers/nvdimm/nd_btt.ko
> 4) insmod drivers/nvdimm/nd_pmem.ko
> You can see the whole nvdimm device used as a single namespace and /dev/pmem0
> appears. You can do whatever on /dev/pmem0 including DAX access.
>
> Currently Linux NVDIMM driver does not support namespace operation on this
> kind of PMEM, apply below changes to support dynamical namespace:
>
> @@ -798,7 +823,8 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc 
> *a
> continue;
> }
>
> -   if (nfit_mem->bdw && nfit_mem->memdev_pmem)
> +   //if (nfit_mem->bdw && nfit_mem->memdev_pmem)
> +   if (nfit_mem->memdev_pmem)
> flags |= NDD_ALIASING;

This is just for testing purposes, right?  I expect guests can
sub-divide persistent memory capacity by partitioning the resulting
block device(s).
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html