Re: [PATCH v18 3/3] vfio/nvgrace-gpu: Add vfio pci variant module for grace hopper

2024-02-19 Thread Zhi Wang
t; same [3]. As previously mentioned, resmem is mapped
> pgprot_writecombine(), that sets the Qemu VMA page properties
> (pgprot) as NORMAL_NC. Using the proposed changes in [3] and [4], KVM
> marks the region with MemAttr[2:0]=0b101 in S2.
> 
> If the device memory properties are not present, the driver registers
> the vfio-pci-core function pointers. Since there are no ACPI memory
> properties generated for the VM, the variant driver inside the VM
> will only use the vfio-pci-core ops and hence try to map the BARs as
> non cached. This is not a problem as the CPUs have FWB enabled which
> blocks the VM mapping's ability to override the cacheability set by
> the host mapping.
> 
> This goes along with a qemu series [6] that provides the necessary
> implementation of the Grace Hopper Superchip firmware specification so
> that the guest operating system can see the correct ACPI modeling for
> the coherent GPU device. Verified with the CUDA workload in the VM.
> 
> [1] https://www.nvidia.com/en-in/technologies/multi-instance-gpu/
> [2] section D8.5.5 of
> https://developer.arm.com/documentation/ddi0487/latest/ [3]
> https://lore.kernel.org/all/20240211174705.31992-1-ank...@nvidia.com/
> [4]
> https://lore.kernel.org/all/20230907181459.18145-2-ank...@nvidia.com/
> [5] https://github.com/NVIDIA/open-gpu-kernel-modules [6]
> https://lore.kernel.org/all/20231203060245.31593-1-ank...@nvidia.com/
> 
> Signed-off-by: Aniket Agashe 
> Signed-off-by: Ankit Agrawal 
> ---
>  MAINTAINERS   |  16 +-
>  drivers/vfio/pci/Kconfig  |   2 +
>  drivers/vfio/pci/Makefile |   2 +
>  drivers/vfio/pci/nvgrace-gpu/Kconfig  |  10 +
>  drivers/vfio/pci/nvgrace-gpu/Makefile |   3 +
>  drivers/vfio/pci/nvgrace-gpu/main.c   | 888
> ++ 6 files changed, 916 insertions(+), 5
> deletions(-) create mode 100644 drivers/vfio/pci/nvgrace-gpu/Kconfig
>  create mode 100644 drivers/vfio/pci/nvgrace-gpu/Makefile
>  create mode 100644 drivers/vfio/pci/nvgrace-gpu/main.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 73d898383e51..7fc7a14c1a20 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -23075,12 +23075,11 @@ L:  k...@vger.kernel.org
>  S:   Maintained
>  F:   drivers/vfio/pci/mlx5/
>  
> -VFIO VIRTIO PCI DRIVER
> -M:   Yishai Hadas 
> +VFIO NVIDIA GRACE GPU DRIVER
> +M:   Ankit Agrawal 
>  L:   k...@vger.kernel.org
> -L:   virtualizat...@lists.linux-foundation.org
> -S:   Maintained
> -F:   drivers/vfio/pci/virtio
> +S:   Supported
> +F:   drivers/vfio/pci/nvgrace-gpu/
>  
>  VFIO PCI DEVICE SPECIFIC DRIVERS
>  R:   Jason Gunthorpe 
> @@ -23105,6 +23104,13 @@ L:   k...@vger.kernel.org
>  S:   Maintained
>  F:   drivers/vfio/platform/
>  
> +VFIO VIRTIO PCI DRIVER
> +M:   Yishai Hadas 
> +L:   k...@vger.kernel.org
> +L:   virtualizat...@lists.linux-foundation.org
> +S:   Maintained
> +F:   drivers/vfio/pci/virtio
> +
>  VGA_SWITCHEROO
>  R:   Lukas Wunner 
>  S:   Maintained
> diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
> index 18c397df566d..15821a2d77d2 100644
> --- a/drivers/vfio/pci/Kconfig
> +++ b/drivers/vfio/pci/Kconfig
> @@ -67,4 +67,6 @@ source "drivers/vfio/pci/pds/Kconfig"
>  
>  source "drivers/vfio/pci/virtio/Kconfig"
>  
> +source "drivers/vfio/pci/nvgrace-gpu/Kconfig"
> +
>  endmenu
> diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
> index 046139a4eca5..ce7a61f1d912 100644
> --- a/drivers/vfio/pci/Makefile
> +++ b/drivers/vfio/pci/Makefile
> @@ -15,3 +15,5 @@ obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/
>  obj-$(CONFIG_PDS_VFIO_PCI) += pds/
>  
>  obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio/
> +
> +obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu/
> diff --git a/drivers/vfio/pci/nvgrace-gpu/Kconfig
> b/drivers/vfio/pci/nvgrace-gpu/Kconfig new file mode 100644
> index ..a7f624b37e41
> --- /dev/null
> +++ b/drivers/vfio/pci/nvgrace-gpu/Kconfig
> @@ -0,0 +1,10 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +config NVGRACE_GPU_VFIO_PCI
> + tristate "VFIO support for the GPU in the NVIDIA Grace
> Hopper Superchip"
> + depends on ARM64 || (COMPILE_TEST && 64BIT)
> + select VFIO_PCI_CORE
> + help
> +   VFIO support for the GPU in the NVIDIA Grace Hopper
> Superchip is
> +   required to assign the GPU device to userspace using
> KVM/qemu/etc. +
> +   If you don't know what to do here, say N.
> diff --git a/drivers/vfio/pci/nvgrace-gpu/Makefile
> b/drivers/vfio/pci/nvgrace-gpu/Makefile new file mode 100644
> index ..3ca8c187897a
> --- /dev/null
> +++ b/drivers/vfio/pci/nvgrace-gpu

Re: [PATCH v17 3/3] vfio/nvgrace-gpu: Add vfio pci variant module for grace hopper

2024-02-07 Thread Zhi Wang
On Tue, 6 Feb 2024 04:31:23 +0530
 wrote:

> From: Ankit Agrawal 
> 
> NVIDIA's upcoming Grace Hopper Superchip provides a PCI-like device
> for the on-chip GPU that is the logical OS representation of the
> internal proprietary chip-to-chip cache coherent interconnect.
> 
> The device is peculiar compared to a real PCI device in that whilst
> there is a real 64b PCI BAR1 (comprising region 2 & region 3) on the
> device, it is not used to access device memory once the faster
> chip-to-chip interconnect is initialized (occurs at the time of host
> system boot). The device memory is accessed instead using the
> chip-to-chip interconnect that is exposed as a contiguous physically
> addressable region on the host. This device memory aperture can be
> obtained from host ACPI table using device_property_read_u64(),
> according to the FW specification. Since the device memory is cache
> coherent with the CPU, it can be mmapped into the user VMA with a
> cacheable mapping using remap_pfn_range() and used like a regular
> RAM. The device memory is not added to the host kernel, but mapped
> directly as this reduces memory wastage due to struct pages.
> 
> There is also a requirement of a reserved 1G uncached region (termed
> as resmem) to support the Multi-Instance GPU (MIG) feature [1]. This
> is to work around a HW defect. Based on [2], the requisite properties
> (uncached, unaligned access) can be achieved through a VM mapping (S1)
> of NORMAL_NC and host (S2) mapping with MemAttr[2:0]=0b101. To provide
> a different non-cached property to the reserved 1G region, it needs to
> be carved out from the device memory and mapped as a separate region
> in Qemu VMA with pgprot_writecombine(). pgprot_writecombine() sets the
> Qemu VMA page properties (pgprot) as NORMAL_NC.
> 
> Provide a VFIO PCI variant driver that adapts the unique device memory
> representation into a more standard PCI representation facing
> userspace.
> 
> The variant driver exposes these two regions - the non-cached reserved
> (resmem) and the cached rest of the device memory (termed as usemem)
> as separate VFIO 64b BAR regions. This is divergent from the baremetal
> approach, where the device memory is exposed as a device memory
> region. The decision for a different approach was taken in view of
> the fact that it would necessitate additional code in Qemu to discover
> and insert those regions in the VM IPA, along with the additional VM
> ACPI DSDT changes to communicate the device memory region IPA to the
> VM workloads. Moreover, this behavior would have to be added to a
> variety of emulators (beyond top of tree Qemu) out there desiring
> grace hopper support.
> 
> Since the device implements 64-bit BAR0, the VFIO PCI variant driver
> maps the uncached carved out region to the next available PCI BAR
> (i.e. comprising of region 2 and 3). The cached device memory
> aperture is assigned BAR region 4 and 5. Qemu will then naturally
> generate a PCI device in the VM with the uncached aperture reported
> as BAR2 region, the cacheable as BAR4. The variant driver provides
> emulation for these fake BARs' PCI config space offset registers.
> 
> The hardware ensures that the system does not crash when the memory
> is accessed with the memory enable turned off. It synthesizes ~0 reads
> and dropped writes on such access. So there is no need to support the
> disablement/enablement of BAR through PCI_COMMAND config space
> register.
> 
> The memory layout on the host looks like the following:
>devmem (memlength)
> |--|
> |-cached|--NC--|
> |   |
> usemem.phys/memphys resmem.phys
> 
> PCI BARs need to be aligned to the power-of-2, but the actual memory
> on the device may not. A read or write access to the physical address
> from the last device PFN up to the next power-of-2 aligned physical
> address results in reading ~0 and dropped writes. Note that the GPU
> device driver [6] is capable of knowing the exact device memory size
> through separate means. The device memory size is primarily kept in
> the system ACPI tables for use by the VFIO PCI variant module.
> 
> Note that the usemem memory is added by the VM Nvidia device driver
> [5] to the VM kernel as memblocks. Hence make the usable memory size
> memblock aligned.
> 
> Currently there is no provision in KVM for a S2 mapping with
> MemAttr[2:0]=0b101, but there is an ongoing effort to provide the
> same [3]. As previously mentioned, resmem is mapped
> pgprot_writecombine(), that sets the Qemu VMA page properties
> (pgprot) as NORMAL_NC. Using the proposed changes in [4] and [3], KVM
> marks the region with MemAttr[2:0]=0b101 in S2.
> 
> If the bare metal properties are not present, the driver registers the
> vfio-pci-core function pointers.
> 
> This goes along with a qemu series [6] that provides the necessary
> implementation of the Grace 

Re: [PATCH] drm/gvt : fix error message typo

2018-10-05 Thread Zhi Wang

Hi:

Thanks for the patches. :) Several places need to be improved:

1. The prefix of the patch should be: "drm/i915/gvt:".
2. You can just cc the maintainers of gvt since the gvt is a sub-module 
of i915.
3. Better refine the title of the patch and also the commit message. An 
informative title and commit message would be more friendly to people 
who read this later, even if it's only a small typo. :)


Again. Thanks for the patch. :) I will fix that for you and commit it, 
so you don't need to send it again.


Thanks,
Zhi.

On 10/04/18 12:04, Peng Hao wrote:


From: Peng Hao 

gvt_err message typo.

Signed-off-by: Peng Hao 
---
  drivers/gpu/drm/i915/gvt/gvt.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gvt/gvt.c b/drivers/gpu/drm/i915/gvt/gvt.c
index 46c8b72..31d6809 100644
--- a/drivers/gpu/drm/i915/gvt/gvt.c
+++ b/drivers/gpu/drm/i915/gvt/gvt.c
@@ -438,7 +438,7 @@ int intel_gvt_init_device(struct drm_i915_private *dev_priv)
  
  	ret = intel_gvt_debugfs_init(gvt);

if (ret)
-   gvt_err("debugfs registeration failed, go on.\n");
+   gvt_err("debugfs registration failed, go on.\n");
  
  	gvt_dbg_core("gvt device initialization is done\n");

dev_priv->gvt = gvt;



Re: [PATCH] drm/gvt : fix error message typo

2018-10-05 Thread Zhi Wang

Hi:

Thanks for the patches. :) Several places need to be improved:

1. The prefix of the patch should be: "drm/i915/gvt:".
2. You can just cc the maintainers of gvt since the gvt is a sub-module 
of i915.
3. Better refine the title of the patch and also the commit message. An 
informative title and commit message would be more friendly to people 
who read this later, even if it's only a small typo. :)


Again. Thanks for the patch. :) I will fix that for you and commit it, 
so you don't need to send it again.


Thanks,
Zhi.

On 10/04/18 12:04, Peng Hao wrote:


From: Peng Hao 

gvt_err message typo.

Signed-off-by: Peng Hao 
---
  drivers/gpu/drm/i915/gvt/gvt.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gvt/gvt.c b/drivers/gpu/drm/i915/gvt/gvt.c
index 46c8b72..31d6809 100644
--- a/drivers/gpu/drm/i915/gvt/gvt.c
+++ b/drivers/gpu/drm/i915/gvt/gvt.c
@@ -438,7 +438,7 @@ int intel_gvt_init_device(struct drm_i915_private *dev_priv)
  
  	ret = intel_gvt_debugfs_init(gvt);

if (ret)
-   gvt_err("debugfs registeration failed, go on.\n");
+   gvt_err("debugfs registration failed, go on.\n");
  
  	gvt_dbg_core("gvt device initialization is done\n");

dev_priv->gvt = gvt;



Re: [PATCH RFC 0/4] Per-task PTI activation

2018-01-09 Thread Zhi Wang
Is it possible to put the per-task PTI control interface into cgroup or 
other interfaces?  Enabling/disabling per-task PTI should be a decision 
from the system administrator, not the application itself.


On 2018/1/9 18:02, Willy Tarreau wrote:

Hi Eric,

On Tue, Jan 09, 2018 at 09:31:27AM -0600, Eric W. Biederman wrote:

The dangerous scenario is someone exploiting a buffer overflow, or
otherwise getting a network facing application to misbehave, and then
using these new attacks to assist in gaining privilege escalation.

For most use cases sure. But for *some* use cases, if they can control
of the application, you've already lost everything you had. Private keys,
clear text traffic, etc. We're precisely talking about such applications
where the userspace is as much important as the kernel, and where there's
hardly anything left to lose once the application is cracked. However, a
significant performance drop on the application definitely is a problem,
first making it weaker when facing attacks, or even failing to deal with
traffic peaks.


Googling seems to indicate that there is about one issue a year found in
haproxy.  So this is not an unrealistic concern for the case you
mention.

I agree. But in practice, we had two exploitable bugs, one in 2002
(overflow in the logs), and one in 2014 requiring a purposely written
config which makes no practical sense at all. Most other vulnerabilities
involve freezes, occasionally crashes, though that's even more rare.
And even with the two above, you just have one chance to try to exploit
it, if you get your pointer wrong, it dies and you have to wait for the
admin to restart it. In practice, seeing the process die is the worst
nightmare of admins as the service simply stops. I'm not saying we don't
want to defend them, we even chroot to an empty directory and drop
privileges to mitigate such a risk. But when the intruder is in the
process it's really too late.


So unless I am seeing things wrong this is a patchset designed to drop
your defenses on the most vulnerable applications.

In fact it can be seen very differently. By making it possible for exposed
but critical applications to share some risks with the rest of the system,
we also ensure they remain strong for their initial purpose and against
the most common types of attacks. And quite frankly we're not weakening
much given the risks already involved by the process itself.

What I'm describing represents a small category of processes in only
certain environments. Some database servers will have the same issue.
Imagine a Redis server for example, which normally is very fast and
easily saturates whatever network around it. Some DNS providers may
have the same problem when dealing with hundreds of thousands to
millions of UDP packets each second (not counting attacks).

All such services are critical in themselves, but the fact that we accept
to let them share the risks with the system doesn't mean they should be
running without the protections from the occasional operations guy just
allowed to connect there to verify if logs are full and to retrieve stats.


Disabling protection on the most vulnerable applications is not behavior
I would encourage.

I'm not encouraging this behaviour either but right now the only option
for performance critical applications (even if they are vulnerable) is
to make the whole system vulnerable.


It seems better than disabling protection system
wide but only slightly.   I definitely don't think this is something we
want applications disabling themselves.

In fact that's what I liked with the wrapper approach, except that it
had the downside of being harder to manage in terms of administration
and we'd risk to see it used everywhere by default. The arch_prctl()
approach ensures that only applications where this is relevant can do
it. In the case of haproxy, I can trivially add a config option like
"disable-page-isolation" to let the admin enable it on purpose.

But I suspect there might be some performance critical applications that
cannot be patched, and that's where the wrapper could still provide some
value.


Certainly this is something that should look at no-new-privs and if
no-new-privs is set not allow disabling this protection.

I don't know what is "no-new-privs" and couldn't find info on it
unfortunately. Do you have a link please ?

Thanks!
Willy




Re: [PATCH RFC 0/4] Per-task PTI activation

2018-01-09 Thread Zhi Wang
Is it possible to put the per-task PTI control interface into cgroup or 
other interfaces?  Enabling/disabling per-task PTI should be a decision 
from the system administrator, not the application itself.


On 2018/1/9 18:02, Willy Tarreau wrote:

Hi Eric,

On Tue, Jan 09, 2018 at 09:31:27AM -0600, Eric W. Biederman wrote:

The dangerous scenario is someone exploiting a buffer overflow, or
otherwise getting a network facing application to misbehave, and then
using these new attacks to assist in gaining privilege escalation.

For most use cases sure. But for *some* use cases, if they can control
of the application, you've already lost everything you had. Private keys,
clear text traffic, etc. We're precisely talking about such applications
where the userspace is as much important as the kernel, and where there's
hardly anything left to lose once the application is cracked. However, a
significant performance drop on the application definitely is a problem,
first making it weaker when facing attacks, or even failing to deal with
traffic peaks.


Googling seems to indicate that there is about one issue a year found in
haproxy.  So this is not an unrealistic concern for the case you
mention.

I agree. But in practice, we had two exploitable bugs, one in 2002
(overflow in the logs), and one in 2014 requiring a purposely written
config which makes no practical sense at all. Most other vulnerabilities
involve freezes, occasionally crashes, though that's even more rare.
And even with the two above, you just have one chance to try to exploit
it, if you get your pointer wrong, it dies and you have to wait for the
admin to restart it. In practice, seeing the process die is the worst
nightmare of admins as the service simply stops. I'm not saying we don't
want to defend them, we even chroot to an empty directory and drop
privileges to mitigate such a risk. But when the intruder is in the
process it's really too late.


So unless I am seeing things wrong this is a patchset designed to drop
your defenses on the most vulnerable applications.

In fact it can be seen very differently. By making it possible for exposed
but critical applications to share some risks with the rest of the system,
we also ensure they remain strong for their initial purpose and against
the most common types of attacks. And quite frankly we're not weakening
much given the risks already involved by the process itself.

What I'm describing represents a small category of processes in only
certain environments. Some database servers will have the same issue.
Imagine a Redis server for example, which normally is very fast and
easily saturates whatever network around it. Some DNS providers may
have the same problem when dealing with hundreds of thousands to
millions of UDP packets each second (not counting attacks).

All such services are critical in themselves, but the fact that we accept
to let them share the risks with the system doesn't mean they should be
running without the protections from the occasional operations guy just
allowed to connect there to verify if logs are full and to retrieve stats.


Disabling protection on the most vulnerable applications is not behavior
I would encourage.

I'm not encouraging this behaviour either but right now the only option
for performance critical applications (even if they are vulnerable) is
to make the whole system vulnerable.


It seems better than disabling protection system
wide but only slightly.   I definitely don't think this is something we
want applications disabling themselves.

In fact that's what I liked with the wrapper approach, except that it
had the downside of being harder to manage in terms of administration
and we'd risk to see it used everywhere by default. The arch_prctl()
approach ensures that only applications where this is relevant can do
it. In the case of haproxy, I can trivially add a config option like
"disable-page-isolation" to let the admin enable it on purpose.

But I suspect there might be some performance critical applications that
cannot be patched, and that's where the wrapper could still provide some
value.


Certainly this is something that should look at no-new-privs and if
no-new-privs is set not allow disabling this protection.

I don't know what is "no-new-privs" and couldn't find info on it
unfortunately. Do you have a link please ?

Thanks!
Willy




Re: [PATCH] drm/i915/gvt: use ARRAY_SIZE

2017-10-23 Thread Zhi Wang

Thanks, applied!

On 10/16/17 10:32, Jérémy Lefaure wrote:

Using the ARRAY_SIZE macro improves the readability of the code. Also,
it's useless to use a variable to store this constant calculated at
compile time.

Found with Coccinelle with the following semantic patch:
@r depends on (org || report)@
type T;
T[] E;
position p;
@@
(
  (sizeof(E)@p /sizeof(*E))
|
  (sizeof(E)@p /sizeof(E[...]))
|
  (sizeof(E)@p /sizeof(T))
)

Signed-off-by: Jérémy Lefaure 
---
This patch was part of a bigger patch [1].

[1]: https://patchwork.kernel.org/patch/9979843/

  drivers/gpu/drm/i915/gvt/vgpu.c | 9 -
  1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c
index 02c61a1ad56a..b32c1c889ea8 100644
--- a/drivers/gpu/drm/i915/gvt/vgpu.c
+++ b/drivers/gpu/drm/i915/gvt/vgpu.c
@@ -31,6 +31,7 @@
   *
   */
  
+#include 

  #include "i915_drv.h"
  #include "gvt.h"
  #include "i915_pvinfo.h"
@@ -98,7 +99,6 @@ static struct {
   */
  int intel_gvt_init_vgpu_types(struct intel_gvt *gvt)
  {
-   unsigned int num_types;
unsigned int i, low_avail, high_avail;
unsigned int min_low;
  
@@ -116,15 +116,14 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt)

 */
low_avail = gvt_aperture_sz(gvt) - HOST_LOW_GM_SIZE;
high_avail = gvt_hidden_sz(gvt) - HOST_HIGH_GM_SIZE;
-   num_types = sizeof(vgpu_types) / sizeof(vgpu_types[0]);
  
-	gvt->types = kzalloc(num_types * sizeof(struct intel_vgpu_type),

-GFP_KERNEL);
+   gvt->types = kzalloc(ARRAY_SIZE(vgpu_types) *
+sizeof(struct intel_vgpu_type), GFP_KERNEL);
if (!gvt->types)
return -ENOMEM;
  
  	min_low = MB_TO_BYTES(32);

-   for (i = 0; i < num_types; ++i) {
+   for (i = 0; i < ARRAY_SIZE(vgpu_types); ++i) {
if (low_avail / vgpu_types[i].low_mm == 0)
break;
  



Re: [PATCH] drm/i915/gvt: use ARRAY_SIZE

2017-10-23 Thread Zhi Wang

Thanks, applied!

On 10/16/17 10:32, Jérémy Lefaure wrote:

Using the ARRAY_SIZE macro improves the readability of the code. Also,
it's useless to use a variable to store this constant calculated at
compile time.

Found with Coccinelle with the following semantic patch:
@r depends on (org || report)@
type T;
T[] E;
position p;
@@
(
  (sizeof(E)@p /sizeof(*E))
|
  (sizeof(E)@p /sizeof(E[...]))
|
  (sizeof(E)@p /sizeof(T))
)

Signed-off-by: Jérémy Lefaure 
---
This patch was part of a bigger patch [1].

[1]: https://patchwork.kernel.org/patch/9979843/

  drivers/gpu/drm/i915/gvt/vgpu.c | 9 -
  1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c
index 02c61a1ad56a..b32c1c889ea8 100644
--- a/drivers/gpu/drm/i915/gvt/vgpu.c
+++ b/drivers/gpu/drm/i915/gvt/vgpu.c
@@ -31,6 +31,7 @@
   *
   */
  
+#include 

  #include "i915_drv.h"
  #include "gvt.h"
  #include "i915_pvinfo.h"
@@ -98,7 +99,6 @@ static struct {
   */
  int intel_gvt_init_vgpu_types(struct intel_gvt *gvt)
  {
-   unsigned int num_types;
unsigned int i, low_avail, high_avail;
unsigned int min_low;
  
@@ -116,15 +116,14 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt)

 */
low_avail = gvt_aperture_sz(gvt) - HOST_LOW_GM_SIZE;
high_avail = gvt_hidden_sz(gvt) - HOST_HIGH_GM_SIZE;
-   num_types = sizeof(vgpu_types) / sizeof(vgpu_types[0]);
  
-	gvt->types = kzalloc(num_types * sizeof(struct intel_vgpu_type),

-GFP_KERNEL);
+   gvt->types = kzalloc(ARRAY_SIZE(vgpu_types) *
+sizeof(struct intel_vgpu_type), GFP_KERNEL);
if (!gvt->types)
return -ENOMEM;
  
  	min_low = MB_TO_BYTES(32);

-   for (i = 0; i < num_types; ++i) {
+   for (i = 0; i < ARRAY_SIZE(vgpu_types); ++i) {
if (low_avail / vgpu_types[i].low_mm == 0)
break;
  



Re: [PATCH] drm/i915/gvt: Clean up dead code in cmd_parser

2017-10-23 Thread Zhi Wang

Thanks, applied! :)

On 10/16/17 06:32, Christos Gkekas wrote:

Delete variables 'gma_bottom' that are set but never used.

Signed-off-by: Christos Gkekas 
---
  drivers/gpu/drm/i915/gvt/cmd_parser.c | 6 ++
  1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/cmd_parser.c 
b/drivers/gpu/drm/i915/gvt/cmd_parser.c
index 2c0ccbb..d75ce70 100644
--- a/drivers/gpu/drm/i915/gvt/cmd_parser.c
+++ b/drivers/gpu/drm/i915/gvt/cmd_parser.c
@@ -2511,7 +2511,7 @@ static int command_scan(struct parser_exec_state *s,
  
  static int scan_workload(struct intel_vgpu_workload *workload)

  {
-   unsigned long gma_head, gma_tail, gma_bottom;
+   unsigned long gma_head, gma_tail;
struct parser_exec_state s;
int ret = 0;
  
@@ -2521,7 +2521,6 @@ static int scan_workload(struct intel_vgpu_workload *workload)
  
  	gma_head = workload->rb_start + workload->rb_head;

gma_tail = workload->rb_start + workload->rb_tail;
-   gma_bottom = workload->rb_start +  _RING_CTL_BUF_SIZE(workload->rb_ctl);
  
  	s.buf_type = RING_BUFFER_INSTRUCTION;

s.buf_addr_type = GTT_BUFFER;
@@ -2557,7 +2556,7 @@ static int scan_workload(struct intel_vgpu_workload 
*workload)
  static int scan_wa_ctx(struct intel_shadow_wa_ctx *wa_ctx)
  {
  
-	unsigned long gma_head, gma_tail, gma_bottom, ring_size, ring_tail;

+   unsigned long gma_head, gma_tail, ring_size, ring_tail;
struct parser_exec_state s;
int ret = 0;
struct intel_vgpu_workload *workload = container_of(wa_ctx,
@@ -2573,7 +2572,6 @@ static int scan_wa_ctx(struct intel_shadow_wa_ctx *wa_ctx)
PAGE_SIZE);
gma_head = wa_ctx->indirect_ctx.guest_gma;
gma_tail = wa_ctx->indirect_ctx.guest_gma + ring_tail;
-   gma_bottom = wa_ctx->indirect_ctx.guest_gma + ring_size;
  
  	s.buf_type = RING_BUFFER_INSTRUCTION;

s.buf_addr_type = GTT_BUFFER;



Re: [PATCH] drm/i915/gvt: Clean up dead code in cmd_parser

2017-10-23 Thread Zhi Wang

Thanks, applied! :)

On 10/16/17 06:32, Christos Gkekas wrote:

Delete variables 'gma_bottom' that are set but never used.

Signed-off-by: Christos Gkekas 
---
  drivers/gpu/drm/i915/gvt/cmd_parser.c | 6 ++
  1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/cmd_parser.c 
b/drivers/gpu/drm/i915/gvt/cmd_parser.c
index 2c0ccbb..d75ce70 100644
--- a/drivers/gpu/drm/i915/gvt/cmd_parser.c
+++ b/drivers/gpu/drm/i915/gvt/cmd_parser.c
@@ -2511,7 +2511,7 @@ static int command_scan(struct parser_exec_state *s,
  
  static int scan_workload(struct intel_vgpu_workload *workload)

  {
-   unsigned long gma_head, gma_tail, gma_bottom;
+   unsigned long gma_head, gma_tail;
struct parser_exec_state s;
int ret = 0;
  
@@ -2521,7 +2521,6 @@ static int scan_workload(struct intel_vgpu_workload *workload)
  
  	gma_head = workload->rb_start + workload->rb_head;

gma_tail = workload->rb_start + workload->rb_tail;
-   gma_bottom = workload->rb_start +  _RING_CTL_BUF_SIZE(workload->rb_ctl);
  
  	s.buf_type = RING_BUFFER_INSTRUCTION;

s.buf_addr_type = GTT_BUFFER;
@@ -2557,7 +2556,7 @@ static int scan_workload(struct intel_vgpu_workload 
*workload)
  static int scan_wa_ctx(struct intel_shadow_wa_ctx *wa_ctx)
  {
  
-	unsigned long gma_head, gma_tail, gma_bottom, ring_size, ring_tail;

+   unsigned long gma_head, gma_tail, ring_size, ring_tail;
struct parser_exec_state s;
int ret = 0;
struct intel_vgpu_workload *workload = container_of(wa_ctx,
@@ -2573,7 +2572,6 @@ static int scan_wa_ctx(struct intel_shadow_wa_ctx *wa_ctx)
PAGE_SIZE);
gma_head = wa_ctx->indirect_ctx.guest_gma;
gma_tail = wa_ctx->indirect_ctx.guest_gma + ring_tail;
-   gma_bottom = wa_ctx->indirect_ctx.guest_gma + ring_size;
  
  	s.buf_type = RING_BUFFER_INSTRUCTION;

s.buf_addr_type = GTT_BUFFER;



Re: [kra...@redhat.com: Re: [Intel-gfx] [PATCH v9 5/7] vfio: Define vfio based dma-buf operations]

2017-06-23 Thread Zhi Wang

Hi Gred and Alex:
Thanks for the reply! It would be better that kernel only provides 
functions instead of maintaining states from my point of view. If there 
is any existing async notification channel in vfio device fd? like 
reporting device events from vfio to QEMU? If so, It would be nice if we 
can extend that channel and notify the userspace application that guest 
framebuffer changed via that channel. So userspace application even 
doesn't need to call the ioctl periodically. :) If VM runs a 
single-buffered application.


Thanks,
Zhi.

On Fri, 2017-06-23 at 15:49 +0800, Zhi Wang wrote:

Hi:
  Thanks for the discussions! If the userspace application has
already maintained a LRU list, it looks like we don't need
generation
anymore,

generation isn't required, things are working just fine without that.
It is just a small optimization, userspace can skip the LRU lookup
altogether if the generation didn't change.

But of course that only pays off if the kernel doesn't have to put much
effort into maintaining the generation id.  Something simple like
increasing it each time the guest writes a register which affects
plane_info.

If you think it doesn't make sense from the driver point of view we can
drop the generation.

cheers,
   Gerd


Re: [kra...@redhat.com: Re: [Intel-gfx] [PATCH v9 5/7] vfio: Define vfio based dma-buf operations]

2017-06-23 Thread Zhi Wang

Hi Gerd and Alex:
Thanks for the reply! It would be better that kernel only provides 
functions instead of maintaining states from my point of view. If there 
is any existing async notification channel in vfio device fd? like 
reporting device events from vfio to QEMU? If so, It would be nice if we 
can extend that channel and notify the userspace application that guest 
framebuffer changed via that channel. So userspace application even 
doesn't need to call the ioctl periodically. :) If VM runs a 
single-buffered application.


Thanks,
Zhi.

On Fri, 2017-06-23 at 15:49 +0800, Zhi Wang wrote:

Hi:
  Thanks for the discussions! If the userspace application has
already maintained a LRU list, it looks like we don't need
generation
anymore,

generation isn't required, things are working just fine without that.
It is just a small optimization, userspace can skip the LRU lookup
altogether if the generation didn't change.

But of course that only pays off if the kernel doesn't have to put much
effort into maintaining the generation id.  Something simple like
increasing it each time the guest writes a register which affects
plane_info.

If you think it doesn't make sense from the driver point of view we can
drop the generation.

cheers,
   Gerd


Re: [Intel-gfx] [PATCH v9 5/7] vfio: Define vfio based dma-buf operations

2017-06-23 Thread Zhi Wang

Hi:
Thanks for the discussions! If the userspace application has 
already maintained a LRU list, it looks like we don't need generation 
anymore, as userspace application will lookup the guest framebuffer from 
the LRU list by "offset". No matter how, it would know if this is a new 
guest framebuffer or not. If it's a new guest framebuffer, a new dmabuf 
fd should be generated. If it's an old framebuffer, it can just show 
that framebuffer.


Thanks,
Zhi.

On 06/23/17 15:26, Gerd Hoffmann wrote:

   Hi,


Is this only going to support accelerated driver output, not basic
VGA
modes for BIOS interaction?

Right now there is no vgabios or uefi support for the vgpu.

But even with that in place there still is the problem that the display
device initialization happens before the guest runs and therefore there
isn't an plane yet ...


Right now the experimental intel patches throw errors in case no
plane
exists (yet).  Maybe we should have a "bool is_enabled" field in
the
plane_info struct, so drivers can use that to signal whenever the
guest
has programmed a valid video mode or not (likewise for the cursor,
which doesn't exist with fbcon, only when running xorg).  With that
in
place using the QUERY_PLANE ioctl also for probing looks
reasonable.

Sure, or -ENOTTY for ioctl not implemented vs -EINVAL for no
available
plane, but then that might not help the user know how a plane would
be
available if it were available.

So maybe a "enum plane_state" (instead of "bool is_enabled")?  So we
can clearly disturgish ENABLED, DISABLED, NOT_SUPPORTED cases?


Yes, I'd leave that to userspace.  So, when the generation changes
userspace knows the guest changed the plane.  It could be a
configuration the guest has used before (and where userspace could
have
a cached dma-buf handle for), or it could be something new.

But userspace also doesn't know that a dmabuf generation will ever be
visited again,

kernel wouldn't know either, only the guest knows ...


so they're bound to have some stale descriptors.  Are
we thinking userspace would have some LRU list of dmabufs so that
they
don't collect too many?  Each uses some resources,  do we just rely
on
open file handles to set an upper limit?

Yep, this is exactly what my qemu patches are doing, keep a LRU list.
  

What happens to
existing dmabuf fds when the generation updates, do they stop
refreshing?

Depends on what the guest is doing ;)

The dma-buf is just a host-side handle for the piece of video
memory
where the guest stored the framebuffer.

So the resources the user is holding if they don't release their
dmabuf
are potentially non-trivial.

Not really.  Host IGD has a certain amount of memory, some of it is
assigned to the guest, guest stores the framebuffer there, the dma-buf
is a host handle (drm object, usable for rendering ops) for the guest
framebuffer.  So it doesn't use much ressources.  Some memory is needed
for management structs, but not for the actual data as this in the
video memory dedicated to the guest.


Ok, if we want support multiple regions.  Do we?  Using the offset
we
can place multiple planes in a single region.  And I'm not sure
nvidia
plans to use multiple planes in the first place ...

I don't want to take a driver ioctl interface as a throw away, one
time
use exercise.  If we can think of such questions now, let's define
how
they work.  A device could have multiple graphics regions with
multiple
planes within each region.

I'd suggest to settle for one of these two.  Either one region and
multiple planes inside (using offset) or one region per plane.  I'd
prefer the former.  When going for the latter then yes we have to
specify the region.  I'd name the field region_id then to make clear
what it is.

What would be the use case for multiple planes?

cursor support?  We already have plane_type for that.

multihead support?  We'll need (at minimum) a head_id field for that
(for both dma-buf and region)

pageflipping support?  Nothing needed, query_plane will simply return
the currently visible plane.  Region only needs to be big enough to fit
the framebuffer twice.  Then the driver can flip between two buffers,
point to the one qemu should display using "offset".


Do we also want to exclude that device
needs to be strictly region or dmabuf?  Maybe it does both.

Very unlikely IMHO.


Or maybe
it supports dmabuf-ng (ie. whatever comes next).

Possibly happens some day, but who knows what interfaces we'll need to
support that ...


vfio_device_query {
 u32 argsz;
 u32 flags;
 enum query_type;  /* or use flags for that */

We don't have an infinite number of ioctls

The limited ioctl number space is a good reason indeed.
Ok, lets take this route then.

cheers,
   Gerd

___
intel-gvt-dev mailing list
intel-gvt-...@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gvt-dev




Re: [Intel-gfx] [PATCH v9 5/7] vfio: Define vfio based dma-buf operations

2017-06-23 Thread Zhi Wang

Hi:
Thanks for the discussions! If the userspace application has 
already maintained a LRU list, it looks like we don't need generation 
anymore, as userspace application will lookup the guest framebuffer from 
the LRU list by "offset". No matter how, it would know if this is a new 
guest framebuffer or not. If it's a new guest framebuffer, a new dmabuf 
fd should be generated. If it's an old framebuffer, it can just show 
that framebuffer.


Thanks,
Zhi.

On 06/23/17 15:26, Gerd Hoffmann wrote:

   Hi,


Is this only going to support accelerated driver output, not basic
VGA
modes for BIOS interaction?

Right now there is no vgabios or uefi support for the vgpu.

But even with that in place there still is the problem that the display
device initialization happens before the guest runs and therefore there
isn't an plane yet ...


Right now the experimental intel patches throw errors in case no
plane
exists (yet).  Maybe we should have a "bool is_enabled" field in
the
plane_info struct, so drivers can use that to signal whenever the
guest
has programmed a valid video mode or not (likewise for the cursor,
which doesn't exist with fbcon, only when running xorg).  With that
in
place using the QUERY_PLANE ioctl also for probing looks
reasonable.

Sure, or -ENOTTY for ioctl not implemented vs -EINVAL for no
available
plane, but then that might not help the user know how a plane would
be
available if it were available.

So maybe an "enum plane_state" (instead of "bool is_enabled")?  So we
can clearly distinguish the ENABLED, DISABLED, NOT_SUPPORTED cases?


Yes, I'd leave that to userspace.  So, when the generation changes
userspace knows the guest changed the plane.  It could be a
configuration the guest has used before (and where userspace could
have
a cached dma-buf handle for), or it could be something new.

But userspace also doesn't know that a dmabuf generation will ever be
visited again,

kernel wouldn't know either, only the guest knows ...


so they're bound to have some stale descriptors.  Are
we thinking userspace would have some LRU list of dmabufs so that
they
don't collect too many?  Each uses some resources,  do we just rely
on
open file handles to set an upper limit?

Yep, this is exactly what my qemu patches are doing, keep a LRU list.
  

What happens to
existing dmabuf fds when the generation updates, do they stop
refreshing?

Depends on what the guest is doing ;)

The dma-buf is just a host-side handle for the piece of video
memory
where the guest stored the framebuffer.

So the resources the user is holding if they don't release their
dmabuf
are potentially non-trivial.

Not really.  Host IGD has a certain amount of memory, some of it is
assigned to the guest, guest stores the framebuffer there, the dma-buf
is a host handle (drm object, usable for rendering ops) for the guest
framebuffer.  So it doesn't use many resources.  Some memory is needed
for management structs, but not for the actual data, as this is in the
video memory dedicated to the guest.


Ok, if we want support multiple regions.  Do we?  Using the offset
we
can place multiple planes in a single region.  And I'm not sure
nvidia
plans to use multiple planes in the first place ...

I don't want to take a driver ioctl interface as a throw away, one
time
use exercise.  If we can think of such questions now, let's define
how
they work.  A device could have multiple graphics regions with
multiple
planes within each region.

I'd suggest to settle for one of these two.  Either one region and
multiple planes inside (using offset) or one region per plane.  I'd
prefer the former.  When going for the latter then yes we have to
specify the region.  I'd name the field region_id then to make clear
what it is.

What would be the use case for multiple planes?

cursor support?  We already have plane_type for that.

multihead support?  We'll need (at minimum) a head_id field for that
(for both dma-buf and region)

pageflipping support?  Nothing needed, query_plane will simply return
the currently visible plane.  Region only needs to be big enough to fit
the framebuffer twice.  Then the driver can flip between two buffers,
point to the one qemu should display using "offset".


Do we also want to exclude that device
needs to be strictly region or dmabuf?  Maybe it does both.

Very unlikely IMHO.


Or maybe
it supports dmabuf-ng (ie. whatever comes next).

Possibly happens some day, but who knows what interfaces we'll need to
support that ...


vfio_device_query {
 u32 argsz;
 u32 flags;
 enum query_type;  /* or use flags for that */

We don't have an infinite number of ioctls

The limited ioctl number space is a good reason indeed.
Ok, lets take this route then.

cheers,
   Gerd

___
intel-gvt-dev mailing list
intel-gvt-...@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gvt-dev




Re: [PATCH] mm: mempool: Factor out mempool_refill()

2015-12-15 Thread Zhi Wang

Hi Johannes:
Thanks for the reply. At the end of mempool_resize(), it will 
call mempool_refill() to do the rest of the work. So this is not one 
of the "no-caller" cases. If you insist this is a "no-caller" case, 
perhaps I should change it to a "static" function without exposing a new 
interface?


Personally I think mempool_refill() should be one of the typical 
interfaces in an implementation of a mempool. Currently the mempool will 
not grow only if pool->min_nr > new_min_nr.


So when user wants to refill the mempool immediately, not resize a 
mempool, in the current implementation, it has to do 2x 
mempool_resize(). First one is mempool_resize(pool->min_nr - 1), second 
one is mempool_resize(new_min_nr). So the refill action would truly 
happen. This is ugly and not convenient.


On 12/16/15 05:26, Johannes Weiner wrote:

On Mon, Dec 14, 2015 at 11:09:43AM +, Wang, Zhi A wrote:

This patch factors out mempool_refill() from mempool_resize(). It's reasonable
that the mempool user wants to refill the pool immediately when it has a chance,
e.g. inside a sleepable context, so that next time in the IRQ context the pool
would have many more available elements to allocate.

After the refactor, mempool_refill() can also execute concurrently with
mempool_resize()/mempool_alloc()/mempool_free() or another mempool_refill().

Signed-off-by: Zhi Wang 


Who is going to call that function? Adding a new interface usually
comes with a user, or as part of a series that adds users.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mm: mempool: Factor out mempool_refill()

2015-12-15 Thread Zhi Wang

Hi Johannes:
Thanks for the reply. At the end of mempool_resize(), it will 
call mempool_refill() to do the rest of the work. So this is not one 
of the "no-caller" cases. If you insist this is a "no-caller" case, 
perhaps I should change it to a "static" function without exposing a new 
interface?


Personally I think mempool_refill() should be one of the typical 
interfaces in an implementation of a mempool. Currently the mempool will 
not grow only if pool->min_nr > new_min_nr.


So when user wants to refill the mempool immediately, not resize a 
mempool, in the current implementation, it has to do 2x 
mempool_resize(). First one is mempool_resize(pool->min_nr - 1), second 
one is mempool_resize(new_min_nr). So the refill action would truly 
happen. This is ugly and not convenient.


On 12/16/15 05:26, Johannes Weiner wrote:

On Mon, Dec 14, 2015 at 11:09:43AM +, Wang, Zhi A wrote:

This patch factors out mempool_refill() from mempool_resize(). It's reasonable
that the mempool user wants to refill the pool immediately when it has a chance,
e.g. inside a sleepable context, so that next time in the IRQ context the pool
would have many more available elements to allocate.

After the refactor, mempool_refill() can also execute concurrently with
mempool_resize()/mempool_alloc()/mempool_free() or another mempool_refill().

Signed-off-by: Zhi Wang <zhi.a.w...@intel.com>


Who is going to call that function? Adding a new interface usually
comes with a user, or as part of a series that adds users.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] mm: mempool: Factor out mempool_refill()

2015-12-12 Thread Zhi Wang
This patch factors out mempool_refill() from mempool_resize(). It's reasonable
that the mempool user wants to refill the pool immediately when it has a chance,
e.g. inside a sleepable context, so that next time in the IRQ context the pool
would have many more available elements to allocate.

After the refactor, mempool_refill() can also execute concurrently with
mempool_resize()/mempool_alloc()/mempool_free() or another mempool_refill().

Signed-off-by: Zhi Wang 
---
 include/linux/mempool.h |  1 +
 mm/mempool.c| 61 -
 2 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 69b6951..71f7460 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -30,6 +30,7 @@ extern mempool_t *mempool_create_node(int min_nr, 
mempool_alloc_t *alloc_fn,
gfp_t gfp_mask, int nid);
 
 extern int mempool_resize(mempool_t *pool, int new_min_nr);
+extern void mempool_refill(mempool_t *pool);
 extern void mempool_destroy(mempool_t *pool);
 extern void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask);
 extern void mempool_free(void *element, mempool_t *pool);
diff --git a/mm/mempool.c b/mm/mempool.c
index 004d42b..139c477 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -223,6 +223,47 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t 
*alloc_fn,
 EXPORT_SYMBOL(mempool_create_node);
 
 /**
+ * mempool_refill - refill an existing memory pool immediately
+ * @pool:   pointer to the memory pool which was allocated via
+ *  mempool_create().
+ *
+ * This function tries to refill the pool with new elements
+ * immediately. Similar with mempool_resize(), it cannot be
+ * guaranteed that the pool will be fully filled immediately.
+ *
+ * Note, the caller must guarantee that no mempool_destroy is called
+ * while this function is running. mempool_alloc() & mempool_free()
+ * might be called (eg. from IRQ contexts) while this function executes.
+ */
+void mempool_refill(mempool_t *pool)
+{
+   void *element;
+   unsigned long flags;
+
+   spin_lock_irqsave(>lock, flags);
+   if (pool->curr_nr >= pool->min_nr) {
+   spin_unlock_irqrestore(>lock, flags);
+   return;
+   }
+
+   while (pool->curr_nr < pool->min_nr) {
+   spin_unlock_irqrestore(>lock, flags);
+   element = pool->alloc(GFP_KERNEL, pool->pool_data);
+   if (!element)
+   return;
+   spin_lock_irqsave(>lock, flags);
+   if (pool->curr_nr < pool->min_nr) {
+   add_element(pool, element);
+   } else {
+   spin_unlock_irqrestore(>lock, flags);
+   pool->free(element, pool->pool_data);   /* Raced */
+   return;
+   }
+   }
+}
+EXPORT_SYMBOL(mempool_refill);
+
+/**
  * mempool_resize - resize an existing memory pool
  * @pool:   pointer to the memory pool which was allocated via
  *  mempool_create().
@@ -256,7 +297,8 @@ int mempool_resize(mempool_t *pool, int new_min_nr)
spin_lock_irqsave(>lock, flags);
}
pool->min_nr = new_min_nr;
-   goto out_unlock;
+   spin_unlock_irqrestore(>lock, flags);
+   goto out;
}
spin_unlock_irqrestore(>lock, flags);
 
@@ -279,22 +321,9 @@ int mempool_resize(mempool_t *pool, int new_min_nr)
pool->elements = new_elements;
pool->min_nr = new_min_nr;
 
-   while (pool->curr_nr < pool->min_nr) {
-   spin_unlock_irqrestore(>lock, flags);
-   element = pool->alloc(GFP_KERNEL, pool->pool_data);
-   if (!element)
-   goto out;
-   spin_lock_irqsave(>lock, flags);
-   if (pool->curr_nr < pool->min_nr) {
-   add_element(pool, element);
-   } else {
-   spin_unlock_irqrestore(>lock, flags);
-   pool->free(element, pool->pool_data);   /* Raced */
-   goto out;
-   }
-   }
-out_unlock:
spin_unlock_irqrestore(>lock, flags);
+
+   mempool_refill(pool);
 out:
return 0;
 }
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] mm: mempool: Factor out mempool_refill()

2015-12-12 Thread Zhi Wang
This patch factors out mempool_refill() from mempool_resize(). It's reasonable
that the mempool user wants to refill the pool immediately when it has a chance,
e.g. inside a sleepable context, so that next time in the IRQ context the pool
would have many more available elements to allocate.

After the refactor, mempool_refill() can also execute concurrently with
mempool_resize()/mempool_alloc()/mempool_free() or another mempool_refill().

Signed-off-by: Zhi Wang <zhi.a.w...@intel.com>
---
 include/linux/mempool.h |  1 +
 mm/mempool.c| 61 -
 2 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 69b6951..71f7460 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -30,6 +30,7 @@ extern mempool_t *mempool_create_node(int min_nr, 
mempool_alloc_t *alloc_fn,
gfp_t gfp_mask, int nid);
 
 extern int mempool_resize(mempool_t *pool, int new_min_nr);
+extern void mempool_refill(mempool_t *pool);
 extern void mempool_destroy(mempool_t *pool);
 extern void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask);
 extern void mempool_free(void *element, mempool_t *pool);
diff --git a/mm/mempool.c b/mm/mempool.c
index 004d42b..139c477 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -223,6 +223,47 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t 
*alloc_fn,
 EXPORT_SYMBOL(mempool_create_node);
 
 /**
+ * mempool_refill - refill an existing memory pool immediately
+ * @pool:   pointer to the memory pool which was allocated via
+ *  mempool_create().
+ *
+ * This function tries to refill the pool with new elements
+ * immediately. Similar with mempool_resize(), it cannot be
+ * guaranteed that the pool will be fully filled immediately.
+ *
+ * Note, the caller must guarantee that no mempool_destroy is called
+ * while this function is running. mempool_alloc() & mempool_free()
+ * might be called (eg. from IRQ contexts) while this function executes.
+ */
+void mempool_refill(mempool_t *pool)
+{
+   void *element;
+   unsigned long flags;
+
+   spin_lock_irqsave(>lock, flags);
+   if (pool->curr_nr >= pool->min_nr) {
+   spin_unlock_irqrestore(>lock, flags);
+   return;
+   }
+
+   while (pool->curr_nr < pool->min_nr) {
+   spin_unlock_irqrestore(>lock, flags);
+   element = pool->alloc(GFP_KERNEL, pool->pool_data);
+   if (!element)
+   return;
+   spin_lock_irqsave(>lock, flags);
+   if (pool->curr_nr < pool->min_nr) {
+   add_element(pool, element);
+   } else {
+   spin_unlock_irqrestore(>lock, flags);
+   pool->free(element, pool->pool_data);   /* Raced */
+   return;
+   }
+   }
+}
+EXPORT_SYMBOL(mempool_refill);
+
+/**
  * mempool_resize - resize an existing memory pool
  * @pool:   pointer to the memory pool which was allocated via
  *  mempool_create().
@@ -256,7 +297,8 @@ int mempool_resize(mempool_t *pool, int new_min_nr)
spin_lock_irqsave(>lock, flags);
}
pool->min_nr = new_min_nr;
-   goto out_unlock;
+   spin_unlock_irqrestore(>lock, flags);
+   goto out;
}
spin_unlock_irqrestore(>lock, flags);
 
@@ -279,22 +321,9 @@ int mempool_resize(mempool_t *pool, int new_min_nr)
pool->elements = new_elements;
pool->min_nr = new_min_nr;
 
-   while (pool->curr_nr < pool->min_nr) {
-   spin_unlock_irqrestore(>lock, flags);
-   element = pool->alloc(GFP_KERNEL, pool->pool_data);
-   if (!element)
-   goto out;
-   spin_lock_irqsave(>lock, flags);
-   if (pool->curr_nr < pool->min_nr) {
-   add_element(pool, element);
-   } else {
-   spin_unlock_irqrestore(>lock, flags);
-   pool->free(element, pool->pool_data);   /* Raced */
-   goto out;
-   }
-   }
-out_unlock:
spin_unlock_irqrestore(>lock, flags);
+
+   mempool_refill(pool);
 out:
return 0;
 }
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] mm: mempool: Factor out mempool_refill()

2015-12-11 Thread Zhi Wang
This patch factors out mempool_refill() from mempool_resize(). It's reasonable
that the mempool user wants to refill the pool immediately when it has a chance,
e.g. inside a sleepable context, so that next time in the IRQ context the pool
would have many more available elements to allocate.

After the refactor, mempool_refill() can also execute concurrently with
mempool_resize()/mempool_alloc()/mempool_free() or another mempool_refill().

Signed-off-by: Zhi Wang 
---
 include/linux/mempool.h |  1 +
 mm/mempool.c| 61 -
 2 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 69b6951..71f7460 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -30,6 +30,7 @@ extern mempool_t *mempool_create_node(int min_nr, 
mempool_alloc_t *alloc_fn,
gfp_t gfp_mask, int nid);
 
 extern int mempool_resize(mempool_t *pool, int new_min_nr);
+extern void mempool_refill(mempool_t *pool);
 extern void mempool_destroy(mempool_t *pool);
 extern void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask);
 extern void mempool_free(void *element, mempool_t *pool);
diff --git a/mm/mempool.c b/mm/mempool.c
index 004d42b..139c477 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -223,6 +223,47 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t 
*alloc_fn,
 EXPORT_SYMBOL(mempool_create_node);
 
 /**
+ * mempool_refill - refill an existing memory pool immediately
+ * @pool:   pointer to the memory pool which was allocated via
+ *  mempool_create().
+ *
+ * This function tries to refill the pool with new elements
+ * immediately. Similar with mempool_resize(), it cannot be
+ * guaranteed that the pool will be fully filled immediately.
+ *
+ * Note, the caller must guarantee that no mempool_destroy is called
+ * while this function is running. mempool_alloc() & mempool_free()
+ * might be called (eg. from IRQ contexts) while this function executes.
+ */
+void mempool_refill(mempool_t *pool)
+{
+   void *element;
+   unsigned long flags;
+
+   spin_lock_irqsave(>lock, flags);
+   if (pool->curr_nr >= pool->min_nr) {
+   spin_unlock_irqrestore(>lock, flags);
+   return;
+   }
+
+   while (pool->curr_nr < pool->min_nr) {
+   spin_unlock_irqrestore(>lock, flags);
+   element = pool->alloc(GFP_KERNEL, pool->pool_data);
+   if (!element)
+   return;
+   spin_lock_irqsave(>lock, flags);
+   if (pool->curr_nr < pool->min_nr) {
+   add_element(pool, element);
+   } else {
+   spin_unlock_irqrestore(>lock, flags);
+   pool->free(element, pool->pool_data);   /* Raced */
+   return;
+   }
+   }
+}
+EXPORT_SYMBOL(mempool_refill);
+
+/**
  * mempool_resize - resize an existing memory pool
  * @pool:   pointer to the memory pool which was allocated via
  *  mempool_create().
@@ -256,7 +297,8 @@ int mempool_resize(mempool_t *pool, int new_min_nr)
spin_lock_irqsave(>lock, flags);
}
pool->min_nr = new_min_nr;
-   goto out_unlock;
+   spin_unlock_irqrestore(>lock, flags);
+   goto out;
}
spin_unlock_irqrestore(>lock, flags);
 
@@ -279,22 +321,9 @@ int mempool_resize(mempool_t *pool, int new_min_nr)
pool->elements = new_elements;
pool->min_nr = new_min_nr;
 
-   while (pool->curr_nr < pool->min_nr) {
-   spin_unlock_irqrestore(>lock, flags);
-   element = pool->alloc(GFP_KERNEL, pool->pool_data);
-   if (!element)
-   goto out;
-   spin_lock_irqsave(>lock, flags);
-   if (pool->curr_nr < pool->min_nr) {
-   add_element(pool, element);
-   } else {
-   spin_unlock_irqrestore(>lock, flags);
-   pool->free(element, pool->pool_data);   /* Raced */
-   goto out;
-   }
-   }
-out_unlock:
spin_unlock_irqrestore(>lock, flags);
+
+   mempool_refill(pool);
 out:
return 0;
 }
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] mm: mempool: Factor out mempool_refill()

2015-12-11 Thread Zhi Wang
This patch factors out mempool_refill() from mempool_resize(). It's reasonable
that the mempool user wants to refill the pool immediately when it has a chance,
e.g. inside a sleepable context, so that next time in the IRQ context the pool
would have many more available elements to allocate.

After the refactor, mempool_refill() can also execute concurrently with
mempool_resize()/mempool_alloc()/mempool_free() or another mempool_refill().

Signed-off-by: Zhi Wang <zhi.a.w...@intel.com>
---
 include/linux/mempool.h |  1 +
 mm/mempool.c| 61 -
 2 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 69b6951..71f7460 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -30,6 +30,7 @@ extern mempool_t *mempool_create_node(int min_nr, 
mempool_alloc_t *alloc_fn,
gfp_t gfp_mask, int nid);
 
 extern int mempool_resize(mempool_t *pool, int new_min_nr);
+extern void mempool_refill(mempool_t *pool);
 extern void mempool_destroy(mempool_t *pool);
 extern void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask);
 extern void mempool_free(void *element, mempool_t *pool);
diff --git a/mm/mempool.c b/mm/mempool.c
index 004d42b..139c477 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -223,6 +223,47 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t 
*alloc_fn,
 EXPORT_SYMBOL(mempool_create_node);
 
 /**
+ * mempool_refill - refill an existing memory pool immediately
+ * @pool:   pointer to the memory pool which was allocated via
+ *  mempool_create().
+ *
+ * This function tries to refill the pool with new elements
+ * immediately. Similar with mempool_resize(), it cannot be
+ * guaranteed that the pool will be fully filled immediately.
+ *
+ * Note, the caller must guarantee that no mempool_destroy is called
+ * while this function is running. mempool_alloc() & mempool_free()
+ * might be called (eg. from IRQ contexts) while this function executes.
+ */
+void mempool_refill(mempool_t *pool)
+{
+   void *element;
+   unsigned long flags;
+
+   spin_lock_irqsave(>lock, flags);
+   if (pool->curr_nr >= pool->min_nr) {
+   spin_unlock_irqrestore(>lock, flags);
+   return;
+   }
+
+   while (pool->curr_nr < pool->min_nr) {
+   spin_unlock_irqrestore(>lock, flags);
+   element = pool->alloc(GFP_KERNEL, pool->pool_data);
+   if (!element)
+   return;
+   spin_lock_irqsave(>lock, flags);
+   if (pool->curr_nr < pool->min_nr) {
+   add_element(pool, element);
+   } else {
+   spin_unlock_irqrestore(>lock, flags);
+   pool->free(element, pool->pool_data);   /* Raced */
+   return;
+   }
+   }
+}
+EXPORT_SYMBOL(mempool_refill);
+
+/**
  * mempool_resize - resize an existing memory pool
  * @pool:   pointer to the memory pool which was allocated via
  *  mempool_create().
@@ -256,7 +297,8 @@ int mempool_resize(mempool_t *pool, int new_min_nr)
spin_lock_irqsave(>lock, flags);
}
pool->min_nr = new_min_nr;
-   goto out_unlock;
+   spin_unlock_irqrestore(>lock, flags);
+   goto out;
}
spin_unlock_irqrestore(>lock, flags);
 
@@ -279,22 +321,9 @@ int mempool_resize(mempool_t *pool, int new_min_nr)
pool->elements = new_elements;
pool->min_nr = new_min_nr;
 
-   while (pool->curr_nr < pool->min_nr) {
-   spin_unlock_irqrestore(>lock, flags);
-   element = pool->alloc(GFP_KERNEL, pool->pool_data);
-   if (!element)
-   goto out;
-   spin_lock_irqsave(>lock, flags);
-   if (pool->curr_nr < pool->min_nr) {
-   add_element(pool, element);
-   } else {
-   spin_unlock_irqrestore(>lock, flags);
-   pool->free(element, pool->pool_data);   /* Raced */
-   goto out;
-   }
-   }
-out_unlock:
spin_unlock_irqrestore(>lock, flags);
+
+   mempool_refill(pool);
 out:
return 0;
 }
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] mm: mempool: Factor out mempool_refill()

2015-12-10 Thread Zhi Wang
This patch factors out mempool_refill() from mempool_resize(). It's reasonable
that the mempool user wants to refill the pool immediately when it has a chance,
e.g. inside a sleepable context, so that next time in the IRQ context the pool
would have many more available elements to allocate.

After the refactor, mempool_refill() can also execute concurrently with
mempool_resize()/mempool_alloc()/mempool_free() or another mempool_refill().

Signed-off-by: Zhi Wang 
---
 include/linux/mempool.h |  1 +
 mm/mempool.c| 61 -
 2 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 69b6951..71f7460 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -30,6 +30,7 @@ extern mempool_t *mempool_create_node(int min_nr, 
mempool_alloc_t *alloc_fn,
gfp_t gfp_mask, int nid);
 
 extern int mempool_resize(mempool_t *pool, int new_min_nr);
+extern void mempool_refill(mempool_t *pool);
 extern void mempool_destroy(mempool_t *pool);
 extern void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask);
 extern void mempool_free(void *element, mempool_t *pool);
diff --git a/mm/mempool.c b/mm/mempool.c
index 004d42b..139c477 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -223,6 +223,47 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t 
*alloc_fn,
 EXPORT_SYMBOL(mempool_create_node);
 
 /**
+ * mempool_refill - refill an existing memory pool immediately
+ * @pool:   pointer to the memory pool which was allocated via
+ *  mempool_create().
+ *
+ * This function tries to refill the pool with new elements
+ * immediately. Similar with mempool_resize(), it cannot be
+ * guaranteed that the pool will be fully filled immediately.
+ *
+ * Note, the caller must guarantee that no mempool_destroy is called
+ * while this function is running. mempool_alloc() & mempool_free()
+ * might be called (eg. from IRQ contexts) while this function executes.
+ */
+void mempool_refill(mempool_t *pool)
+{
+   void *element;
+   unsigned long flags;
+
+   spin_lock_irqsave(>lock, flags);
+   if (pool->curr_nr >= pool->min_nr) {
+   spin_unlock_irqrestore(>lock, flags);
+   return;
+   }
+
+   while (pool->curr_nr < pool->min_nr) {
+   spin_unlock_irqrestore(>lock, flags);
+   element = pool->alloc(GFP_KERNEL, pool->pool_data);
+   if (!element)
+   return;
+   spin_lock_irqsave(>lock, flags);
+   if (pool->curr_nr < pool->min_nr) {
+   add_element(pool, element);
+   } else {
+   spin_unlock_irqrestore(>lock, flags);
+   pool->free(element, pool->pool_data);   /* Raced */
+   return;
+   }
+   }
+}
+EXPORT_SYMBOL(mempool_refill);
+
+/**
  * mempool_resize - resize an existing memory pool
  * @pool:   pointer to the memory pool which was allocated via
  *  mempool_create().
@@ -256,7 +297,8 @@ int mempool_resize(mempool_t *pool, int new_min_nr)
spin_lock_irqsave(>lock, flags);
}
pool->min_nr = new_min_nr;
-   goto out_unlock;
+   spin_unlock_irqrestore(>lock, flags);
+   goto out;
}
spin_unlock_irqrestore(>lock, flags);
 
@@ -279,22 +321,9 @@ int mempool_resize(mempool_t *pool, int new_min_nr)
pool->elements = new_elements;
pool->min_nr = new_min_nr;
 
-   while (pool->curr_nr < pool->min_nr) {
-   spin_unlock_irqrestore(>lock, flags);
-   element = pool->alloc(GFP_KERNEL, pool->pool_data);
-   if (!element)
-   goto out;
-   spin_lock_irqsave(>lock, flags);
-   if (pool->curr_nr < pool->min_nr) {
-   add_element(pool, element);
-   } else {
-   spin_unlock_irqrestore(>lock, flags);
-   pool->free(element, pool->pool_data);   /* Raced */
-   goto out;
-   }
-   }
-out_unlock:
spin_unlock_irqrestore(>lock, flags);
+
+   mempool_refill(pool);
 out:
return 0;
 }
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] mm: mempool: Factor out mempool_refill()

2015-12-10 Thread Zhi Wang
This patch factors out mempool_refill() from mempool_resize(). It's reasonable
that the mempool user wants to refill the pool immediately when it has a chance,
e.g. inside a sleepable context, so that next time in the IRQ context the pool
would have many more available elements to allocate.

After the refactor, mempool_refill() can also execute concurrently with
mempool_resize()/mempool_alloc()/mempool_free() or another mempool_refill().

Signed-off-by: Zhi Wang <zhi.a.w...@intel.com>
---
 include/linux/mempool.h |  1 +
 mm/mempool.c| 61 -
 2 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 69b6951..71f7460 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -30,6 +30,7 @@ extern mempool_t *mempool_create_node(int min_nr, 
mempool_alloc_t *alloc_fn,
gfp_t gfp_mask, int nid);
 
 extern int mempool_resize(mempool_t *pool, int new_min_nr);
+extern void mempool_refill(mempool_t *pool);
 extern void mempool_destroy(mempool_t *pool);
 extern void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask);
 extern void mempool_free(void *element, mempool_t *pool);
diff --git a/mm/mempool.c b/mm/mempool.c
index 004d42b..139c477 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -223,6 +223,47 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t 
*alloc_fn,
 EXPORT_SYMBOL(mempool_create_node);
 
 /**
+ * mempool_refill - refill an existing memory pool immediately
+ * @pool:   pointer to the memory pool which was allocated via
+ *  mempool_create().
+ *
+ * This function tries to refill the pool with new elements
+ * immediately. Similar with mempool_resize(), it cannot be
+ * guaranteed that the pool will be fully filled immediately.
+ *
+ * Note, the caller must guarantee that no mempool_destroy is called
+ * while this function is running. mempool_alloc() & mempool_free()
+ * might be called (eg. from IRQ contexts) while this function executes.
+ */
+void mempool_refill(mempool_t *pool)
+{
+   void *element;
+   unsigned long flags;
+
+   spin_lock_irqsave(>lock, flags);
+   if (pool->curr_nr >= pool->min_nr) {
+   spin_unlock_irqrestore(>lock, flags);
+   return;
+   }
+
+   while (pool->curr_nr < pool->min_nr) {
+   spin_unlock_irqrestore(>lock, flags);
+   element = pool->alloc(GFP_KERNEL, pool->pool_data);
+   if (!element)
+   return;
+   spin_lock_irqsave(>lock, flags);
+   if (pool->curr_nr < pool->min_nr) {
+   add_element(pool, element);
+   } else {
+   spin_unlock_irqrestore(>lock, flags);
+   pool->free(element, pool->pool_data);   /* Raced */
+   return;
+   }
+   }
+}
+EXPORT_SYMBOL(mempool_refill);
+
+/**
  * mempool_resize - resize an existing memory pool
  * @pool:   pointer to the memory pool which was allocated via
  *  mempool_create().
@@ -256,7 +297,8 @@ int mempool_resize(mempool_t *pool, int new_min_nr)
	spin_lock_irqsave(&pool->lock, flags);
	}
	pool->min_nr = new_min_nr;
-	goto out_unlock;
+	spin_unlock_irqrestore(&pool->lock, flags);
+	goto out;
	}
	spin_unlock_irqrestore(&pool->lock, flags);
 
@@ -279,22 +321,9 @@ int mempool_resize(mempool_t *pool, int new_min_nr)
pool->elements = new_elements;
pool->min_nr = new_min_nr;
 
-   while (pool->curr_nr < pool->min_nr) {
-	spin_unlock_irqrestore(&pool->lock, flags);
-	element = pool->alloc(GFP_KERNEL, pool->pool_data);
-	if (!element)
-	goto out;
-	spin_lock_irqsave(&pool->lock, flags);
-	if (pool->curr_nr < pool->min_nr) {
-	add_element(pool, element);
-	} else {
-	spin_unlock_irqrestore(&pool->lock, flags);
-	pool->free(element, pool->pool_data);   /* Raced */
-	goto out;
-	}
-	}
-out_unlock:
	spin_unlock_irqrestore(&pool->lock, flags);
+
+   mempool_refill(pool);
 out:
return 0;
 }
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/