[PATCH v2 2/3] drm/tests: Use KUNIT_DEFINE_ACTION_WRAPPER()
In order to pass functions to kunit_add_action(), they need to be of the
kunit_action_t type. While casting the function pointer can work, it
will break control-flow integrity. drm_kunit_helpers already defines
wrappers, but we now have a macro which does this automatically. Using
this greatly reduces the boilerplate needed.

Acked-by: Maxime Ripard
Signed-off-by: David Gow
---
No changes since v1:
https://lore.kernel.org/linux-kselftest/20231110200830.1832556-2-david...@google.com/
---
 drivers/gpu/drm/tests/drm_kunit_helpers.c | 30 +++++------------------
 1 file changed, 9 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/tests/drm_kunit_helpers.c b/drivers/gpu/drm/tests/drm_kunit_helpers.c
index bccb33b900f3..c251e6b34de0 100644
--- a/drivers/gpu/drm/tests/drm_kunit_helpers.c
+++ b/drivers/gpu/drm/tests/drm_kunit_helpers.c
@@ -27,27 +27,15 @@ static struct platform_driver fake_platform_driver = {
 	},
 };
 
-static void kunit_action_platform_driver_unregister(void *ptr)
-{
-	struct platform_driver *drv = ptr;
-
-	platform_driver_unregister(drv);
-
-}
-
-static void kunit_action_platform_device_put(void *ptr)
-{
-	struct platform_device *pdev = ptr;
-
-	platform_device_put(pdev);
-}
-
-static void kunit_action_platform_device_del(void *ptr)
-{
-	struct platform_device *pdev = ptr;
-
-	platform_device_del(pdev);
-}
+KUNIT_DEFINE_ACTION_WRAPPER(kunit_action_platform_driver_unregister,
+			    platform_driver_unregister,
+			    struct platform_driver *);
+KUNIT_DEFINE_ACTION_WRAPPER(kunit_action_platform_device_put,
+			    platform_device_put,
+			    struct platform_device *);
+KUNIT_DEFINE_ACTION_WRAPPER(kunit_action_platform_device_del,
+			    platform_device_del,
+			    struct platform_device *);
 
 /**
  * drm_kunit_helper_alloc_device - Allocate a mock device for a KUnit test
-- 
2.43.0.rc1.413.gea7ed67945-goog
[PATCH v2 3/3] drm/vc4: tests: Use KUNIT_DEFINE_ACTION_WRAPPER
In order to pass functions to kunit_add_action(), they need to be of the
kunit_action_t type. While casting the function pointer can work, it
will break control-flow integrity. vc4_mock already defines such a
wrapper for drm_dev_unregister(), but it involves less boilerplate to
use the new macro, so replace the manual implementation.

Signed-off-by: David Gow
---
No changes since v1:
https://lore.kernel.org/linux-kselftest/20231110200830.1832556-3-david...@google.com/
---
 drivers/gpu/drm/vc4/tests/vc4_mock.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/vc4/tests/vc4_mock.c b/drivers/gpu/drm/vc4/tests/vc4_mock.c
index 63ca46f4cb35..becb3dbaa548 100644
--- a/drivers/gpu/drm/vc4/tests/vc4_mock.c
+++ b/drivers/gpu/drm/vc4/tests/vc4_mock.c
@@ -153,12 +153,9 @@ static int __build_mock(struct kunit *test, struct drm_device *drm,
 	return 0;
 }
 
-static void kunit_action_drm_dev_unregister(void *ptr)
-{
-	struct drm_device *drm = ptr;
-
-	drm_dev_unregister(drm);
-}
+KUNIT_DEFINE_ACTION_WRAPPER(kunit_action_drm_dev_unregister,
+			    drm_dev_unregister,
+			    struct drm_device *);
 
 static struct vc4_dev *__mock_device(struct kunit *test, bool is_vc5)
 {
-- 
2.43.0.rc1.413.gea7ed67945-goog
[PATCH v2 1/3] kunit: Add a macro to wrap a deferred action function
KUnit's deferred action API accepts a void(*)(void *) function pointer
which is called when the test is exited. However, we very frequently
want to use existing functions which accept a single pointer, but which
may not be of type void*. While this is probably dodgy enough to be on
the wrong side of the C standard, it's been often used for similar
callbacks, and gcc's -Wcast-function-type seems to ignore cases where
the only difference is the type of the argument, assuming it's
compatible (i.e., they're both pointers to data).

However, clang 16 has introduced -Wcast-function-type-strict, which no
longer permits any deviation in function pointer type. This seems to be
because it'd break CFI, which validates the type of function calls.

This rather ruins our attempts to cast functions to defer them, and
leaves us with a few options. The one we've chosen is to implement a
macro which will generate a wrapper function which accepts a void*, and
casts the argument to the appropriate type.

For example, if you were trying to wrap:

	void foo_close(struct foo *handle);

you could use:

	KUNIT_DEFINE_ACTION_WRAPPER(kunit_action_foo_close,
				    foo_close,
				    struct foo *);

This would create a new kunit_action_foo_close() function, of type
kunit_action_t, which could be passed into kunit_add_action() and
similar functions.

In addition to defining this macro, update KUnit and its tests to use it.

Link: https://github.com/ClangBuiltLinux/linux/issues/1750
Reviewed-by: Nathan Chancellor
Tested-by: Nathan Chancellor
Acked-by: Daniel Vetter
Reviewed-by: Maxime Ripard
Signed-off-by: David Gow
---
Thanks everyone for testing v1 of this: this update only changes documentation.
Changes since v1:
https://lore.kernel.org/linux-kselftest/20231110200830.1832556-1-david...@google.com/
- Update the usage.rst documentation (Thanks, Nathan)
- Add a better doc comment for KUNIT_DEFINE_ACTION_WRAPPER()
---
 Documentation/dev-tools/kunit/usage.rst | 10 +++---
 include/kunit/resource.h                | 21 +++++++++++++++++
 lib/kunit/kunit-test.c                  |  5 +++++
 lib/kunit/test.c                        |  6 ++----
 4 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/Documentation/dev-tools/kunit/usage.rst b/Documentation/dev-tools/kunit/usage.rst
index c27e1646ecd9..9db12e91668e 100644
--- a/Documentation/dev-tools/kunit/usage.rst
+++ b/Documentation/dev-tools/kunit/usage.rst
@@ -651,12 +651,16 @@ For example:
 	}
 
 Note that, for functions like device_unregister which only accept a single
-pointer-sized argument, it's possible to directly cast that function to
-a ``kunit_action_t`` rather than writing a wrapper function, for example:
+pointer-sized argument, it's possible to automatically generate a wrapper
+with the ``KUNIT_DEFINE_ACTION_WRAPPER()`` macro, for example:
 
 .. code-block:: C
 
-	kunit_add_action(test, (kunit_action_t *)&device_unregister, &dev);
+	KUNIT_DEFINE_ACTION_WRAPPER(device_unregister_wrapper,
+				    device_unregister,
+				    struct device *);
+	kunit_add_action(test, &device_unregister_wrapper, &dev);
+
+You should do this in preference to manually casting to the ``kunit_action_t`` type,
+as casting function pointers will break Control Flow Integrity (CFI).
 
 ``kunit_add_action`` can fail if, for example, the system is out of memory.
 You can use ``kunit_add_action_or_reset`` instead which runs the action

diff --git a/include/kunit/resource.h b/include/kunit/resource.h
index c7383e90f5c9..4ad69a2642a5 100644
--- a/include/kunit/resource.h
+++ b/include/kunit/resource.h
@@ -390,6 +390,27 @@ void kunit_remove_resource(struct kunit *test, struct kunit_resource *res);
 
 /* A 'deferred action' function to be used with kunit_add_action.
 */
 typedef void (kunit_action_t)(void *);
 
+/**
+ * KUNIT_DEFINE_ACTION_WRAPPER() - Wrap a function for use as a deferred action.
+ *
+ * @wrapper: The name of the new wrapper function to define.
+ * @orig: The original function to wrap.
+ * @arg_type: The type of the argument accepted by @orig.
+ *
+ * Defines a wrapper for a function which accepts a single, pointer-sized
+ * argument. This wrapper can then be passed to kunit_add_action() and
+ * similar. This should be used in preference to casting a function
+ * directly to kunit_action_t, as casting function pointers will break
+ * control flow integrity (CFI), leading to crashes.
+ */
+#define KUNIT_DEFINE_ACTION_WRAPPER(wrapper, orig, arg_type)	\
+	static void wrapper(void *in)				\
+	{							\
+		arg_type arg = (arg_type)in;			\
+		orig(arg);					\
+	}
+
+
 /**
  * kunit_add_action() - Call a function when the test ends.
  * @test: Test case to associate the action with.

diff --git a/lib/kunit/kunit-test.c b/lib/kunit/kunit-test.c
index 99d2a3a528e1..3e9c5192d095 100644
---
Re: [PATCH v6 2/6] iommufd: Add IOMMU_HWPT_INVALIDATE
On 2023/11/28 03:53, Nicolin Chen wrote:
On Fri, Nov 24, 2023 at 02:36:29AM +0000, Tian, Kevin wrote:

+ * @out_driver_error_code: Report a driver specific error code upon failure.
+ *                         It's optional, driver has a choice to fill it or
+ *                         not.

Being optional, how does the user tell whether the code is filled or not?

Well, naming it "error_code" indicates zero means no error while non-zero
means something? An error return from this ioctl could also tell the user
space to look up for this driver error code, if it ever cares.

probably over-thinking but I'm not sure whether zero is guaranteed to mean
no error in all implementations...

Well, you are right. Usually HW conveniently raises a flag in a register
to indicate something wrong, yet it is probably unsafe to say it definitely.

this reminds me one open. What about an implementation having a hierarchical
error code layout, e.g. one main error register with each bit representing an
error category, then multiple error code registers, each for one error
category? In this case probably a single out_driver_error_code cannot carry
that raw information.

Hmm, good point.

Instead the iommu driver may need to define a customized error code
convention in the uapi header which is converted from the raw error
information.

From this angle should we simply say that the error code definition must be
included in the uapi header? If raw error information can be carried by this
field then this hw can simply say that the error code format is the same as
the hw spec defines. With that explicit information then the viommu can
easily tell whether the error code is filled or not based on its own
convention.

That'd be to put this error_code field into the driver uAPI structure, right?

looks to be. Then it would be convenient to reserve a code for the case of
no error (either no error happened or just not used)

I also thought about making this out_driver_error_code per HW. Yet, an error
can be either per array or per entry/request.
The array-related error should be reported in the array structure that is a
core uAPI, vs. the per-HW entry structure. Though we could still report an
array error in the entry structure at the first entry (or indexed by
"array->entry_num")?

per-entry error code seems to be a completion code, i.e. each entry in the
array can have a corresponding code (0 for success, others for failure).
do you already have such a need?

-- 
Regards,
Yi Liu
RE: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
>-----Original Message-----
>From: Liu, Yi L
>Sent: Tuesday, November 28, 2023 11:12 AM
>Subject: Re: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE
>ioctl
>
>On 2023/11/27 15:28, Duan, Zhenzhong wrote:
>>
>>
>>> -----Original Message-----
>>> From: Liu, Yi L
>>> Sent: Monday, November 27, 2023 2:39 PM
>>> Subject: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE
>>> ioctl
>>>
>>> This reports the PASID capability data to userspace via VFIO_DEVICE_FEATURE,
>>> hence userspace could probe PASID capability by it. This is a bit different
>>> with other capabilities which are reported to userspace when the user reads
>>> the device's PCI configuration space. There are two reasons for this.
>>>
>>> - First, Qemu by default exposes all available PCI capabilities in vfio-pci
>>>   config space to the guest as read-only, so adding PASID capability in the
>>>   vfio-pci config space will make it exposed to the guest automatically while
>>>   an old Qemu doesn't really support it.
>>>
>>> - Second, PASID capability does not exist on VFs (instead shares the cap of
>>>   the PF). Creating a virtual PASID capability in vfio-pci config space needs
>>>   to find a hole to place it, but doing so may require device specific
>>>   knowledge to avoid potential conflict with device specific registers like
>>>   hidden bits in VF config space. It's simpler by moving this burden to the
>>>   VMM instead of maintaining a quirk system in the kernel.
>>>
>>> Suggested-by: Alex Williamson
>>> Signed-off-by: Yi Liu
>>> ---
>>>  drivers/vfio/pci/vfio_pci_core.c | 47 ++++++++++++++++++++
>>>  include/uapi/linux/vfio.h        | 13 +++++
>>>  2 files changed, 60 insertions(+)
>>>
>>> diff --git a/drivers/vfio/pci/vfio_pci_core.c
>>> b/drivers/vfio/pci/vfio_pci_core.c
>>> index 1929103ee59a..8038aa45500e 100644
>>> --- a/drivers/vfio/pci/vfio_pci_core.c
>>> +++ b/drivers/vfio/pci/vfio_pci_core.c
>>> @@ -1495,6 +1495,51 @@ static int vfio_pci_core_feature_token(struct
>>> vfio_device *device, u32 flags,
>>>  	return 0;
>>>  }
>>>
>>> +static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32 flags,
>>> +				       struct vfio_device_feature_pasid __user *arg,
>>> +				       size_t argsz)
>>> +{
>>> +	struct vfio_pci_core_device *vdev =
>>> +		container_of(device, struct vfio_pci_core_device, vdev);
>>> +	struct vfio_device_feature_pasid pasid = { 0 };
>>> +	struct pci_dev *pdev = vdev->pdev;
>>> +	u32 capabilities = 0;
>>> +	int ret;
>>> +
>>> +	/* We do not support SET of the PASID capability */
>>> +	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
>>> +				 sizeof(pasid));
>>> +	if (ret != 1)
>>> +		return ret;
>>> +
>>> +	/*
>>> +	 * Needs to go to the PF if the device is a VF, as a VF shares its
>>> +	 * PF's PASID capability.
>>> +	 */
>>> +	if (pdev->is_virtfn)
>>> +		pdev = pci_physfn(pdev);
>>> +
>>> +	if (!pdev->pasid_enabled)
>>> +		goto out;
>>
>> Does a PF bound to VFIO have pasid enabled by default?
>
>Today, host iommu driver (at least intel iommu driver) enables it in the
>time of device probe and seems not changed afterward. So yes, VFIO should
>see it if pasid is enabled.
>
>> Isn't it the guest kernel's responsibility to enable the pasid cap of an
>> assigned PF?
>
>guest kernel should not have the capability to change host's pasid
>configuration. It can only write to its own vconfig emulated by
>hypervisor.

Understood, thanks Yi.

BRs.
Zhenzhong
Re: [PATCH ipsec-next v1 6/7] bpf: selftests: test_tunnel: Disable CO-RE relocations
On 11/27/23 7:01 PM, Daniel Xu wrote:
On Mon, Nov 27, 2023 at 02:45:11PM -0600, Daniel Xu wrote:
On Sun, Nov 26, 2023 at 09:53:04PM -0800, Yonghong Song wrote:
On 11/27/23 12:44 AM, Yonghong Song wrote:
On 11/26/23 8:52 PM, Eduard Zingerman wrote:
On Sun, 2023-11-26 at 18:04 -0600, Daniel Xu wrote:
[...]

Tbh I'm not sure. This test passes with preserve_static_offset
because it suppresses preserve_access_index. In general clang
translates bitfield access to a set of IR statements like:

  C:
    struct foo {
      unsigned _;
      unsigned a:1;
      ...
    };
    ... foo->a ...

  IR:
    %a = getelementptr inbounds %struct.foo, ptr %0, i32 0, i32 1
    %bf.load = load i8, ptr %a, align 4
    %bf.clear = and i8 %bf.load, 1
    %bf.cast = zext i8 %bf.clear to i32

With preserve_static_offset the getelementptr+load are replaced by a
single statement which is preserved as-is till code generation, thus
load with align 4 is preserved.

On the other hand, I'm not sure that clang guarantees that loads or
stores used for bitfield access would be always aligned according to
verifier expectations.

I think we should check if there are some clang knobs that prevent
generation of unaligned memory access. I'll take a look.

Is there a reason to prefer fixing in compiler? I'm not opposed to it,
but the downside to compiler fix is it takes years to propagate and
sprinkles ifdefs into the code.

Would it be possible to have an analogue of BPF_CORE_READ_BITFIELD()?

Well, the contraption below passes verification, tunnel selftest
appears to work. I might have messed up some shifts in the macro, though.

I didn't test it. But from high level it should work.

Still, if clang would pick unlucky BYTE_{OFFSET,SIZE} for a particular
field, access might be unaligned.

clang should pick a sensible BYTE_SIZE/BYTE_OFFSET to meet
alignment requirement. This is also required for BPF_CORE_READ_BITFIELD.
---

diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
index 3065a716544d..41cd913ac7ff 100644
--- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
@@ -9,6 +9,7 @@
 #include "vmlinux.h"
 #include
 #include
+#include
 #include "bpf_kfuncs.h"
 #include "bpf_tracing_net.h"
@@ -144,6 +145,38 @@ int ip6gretap_get_tunnel(struct __sk_buff *skb)
 	return TC_ACT_OK;
 }
+#define BPF_CORE_WRITE_BITFIELD(s, field, new_val) ({ \
+	void *p = (void *)s + __CORE_RELO(s, field, BYTE_OFFSET); \
+	unsigned byte_size = __CORE_RELO(s, field, BYTE_SIZE); \
+	unsigned lshift = __CORE_RELO(s, field, LSHIFT_U64); \
+	unsigned rshift = __CORE_RELO(s, field, RSHIFT_U64); \
+	unsigned bit_size = (rshift - lshift); \
+	unsigned long long nval, val, hi, lo; \
+	\
+	asm volatile("" : "=r"(p) : "0"(p)); \

Use asm volatile("" : "+r"(p)) ?

+	\
+	switch (byte_size) { \
+	case 1: val = *(unsigned char *)p; break; \
+	case 2: val = *(unsigned short *)p; break; \
+	case 4: val = *(unsigned int *)p; break; \
+	case 8: val = *(unsigned long long *)p; break; \
+	} \
+	hi = val >> (bit_size + rshift); \
+	hi <<= bit_size + rshift; \
+	lo = val << (bit_size + lshift); \
+	lo >>= bit_size + lshift; \
+	nval = new_val; \
+	nval <<= lshift; \
+	nval >>= rshift; \
+	val = hi | nval | lo; \
+	switch (byte_size) { \
+	case 1: *(unsigned char *)p = val; break; \
+	case 2: *(unsigned short *)p = val; break; \
+	case 4: *(unsigned int *)p = val; break; \
+	case 8: *(unsigned long long *)p = val; break; \
+	} \
+})

I think this should be put in libbpf public header files, but not sure
where to put it. bpf_core_read.h, although it is a core write?

But on the other hand, this is a uapi struct bitfield write; strictly
speaking, CORE write is really unnecessary here. It would be great if
we can relieve users from dealing with such unnecessary CORE writes.
In that sense, for this particular case, I would prefer rewriting the code
by using byte-level stores... or preserve_static_offset to clearly mean to
undo bitfield CORE ...

Ok, I will do byte-level rewrite for next revision.

[...]

This patch seems to work: https://pastes.dxuuu.xyz/0glrf9 . But I don't
think it's very pretty. Also I'm seeing on the internet that people are
saying the exact layout of bitfields is compiler dependent. Any reference
for this?
Re: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl
On 2023/11/27 15:28, Duan, Zhenzhong wrote:

-----Original Message-----
From: Liu, Yi L
Sent: Monday, November 27, 2023 2:39 PM
Subject: [PATCH 3/3] vfio: Report PASID capability via VFIO_DEVICE_FEATURE ioctl

This reports the PASID capability data to userspace via VFIO_DEVICE_FEATURE,
hence userspace could probe PASID capability by it. This is a bit different
from other capabilities, which are reported to userspace when the user reads
the device's PCI configuration space. There are two reasons for this.

- First, Qemu by default exposes all available PCI capabilities in vfio-pci
  config space to the guest as read-only, so adding PASID capability in the
  vfio-pci config space will make it exposed to the guest automatically while
  an old Qemu doesn't really support it.

- Second, PASID capability does not exist on VFs (instead shares the cap of
  the PF). Creating a virtual PASID capability in vfio-pci config space needs
  to find a hole to place it, but doing so may require device specific
  knowledge to avoid potential conflict with device specific registers like
  hidden bits in VF config space. It's simpler by moving this burden to the
  VMM instead of maintaining a quirk system in the kernel.
Suggested-by: Alex Williamson
Signed-off-by: Yi Liu
---
 drivers/vfio/pci/vfio_pci_core.c | 47 ++++++++++++++++++++
 include/uapi/linux/vfio.h        | 13 +++++
 2 files changed, 60 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 1929103ee59a..8038aa45500e 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1495,6 +1495,51 @@ static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags,
 	return 0;
 }
 
+static int vfio_pci_core_feature_pasid(struct vfio_device *device, u32 flags,
+				       struct vfio_device_feature_pasid __user *arg,
+				       size_t argsz)
+{
+	struct vfio_pci_core_device *vdev =
+		container_of(device, struct vfio_pci_core_device, vdev);
+	struct vfio_device_feature_pasid pasid = { 0 };
+	struct pci_dev *pdev = vdev->pdev;
+	u32 capabilities = 0;
+	int ret;
+
+	/* We do not support SET of the PASID capability */
+	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
+				 sizeof(pasid));
+	if (ret != 1)
+		return ret;
+
+	/*
+	 * Needs to go to the PF if the device is a VF, as a VF shares its
+	 * PF's PASID capability.
+	 */
+	if (pdev->is_virtfn)
+		pdev = pci_physfn(pdev);
+
+	if (!pdev->pasid_enabled)
+		goto out;

Does a PF bound to VFIO have pasid enabled by default?

Today, host iommu driver (at least intel iommu driver) enables it in the
time of device probe and seems not changed afterward. So yes, VFIO should
see it if pasid is enabled.

Isn't it the guest kernel's responsibility to enable the pasid cap of an
assigned PF?

guest kernel should not have the capability to change host's pasid
configuration. It can only write to its own vconfig emulated by hypervisor.
Thanks
Zhenzhong

+
+#ifdef CONFIG_PCI_PASID
+	pci_read_config_dword(pdev, pdev->pasid_cap + PCI_PASID_CAP,
+			      &capabilities);
+#endif
+
+	if (capabilities & PCI_PASID_CAP_EXEC)
+		pasid.capabilities |= VFIO_DEVICE_PASID_CAP_EXEC;
+	if (capabilities & PCI_PASID_CAP_PRIV)
+		pasid.capabilities |= VFIO_DEVICE_PASID_CAP_PRIV;
+
+	pasid.width = (capabilities >> 8) & 0x1f;
+
+out:
+	if (copy_to_user(arg, &pasid, sizeof(pasid)))
+		return -EFAULT;
+	return 0;
+}
+
 int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
 				void __user *arg, size_t argsz)
 {
@@ -1508,6 +1553,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
 		return vfio_pci_core_pm_exit(device, flags, arg, argsz);
 	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
 		return vfio_pci_core_feature_token(device, flags, arg, argsz);
+	case VFIO_DEVICE_FEATURE_PASID:
+		return vfio_pci_core_feature_pasid(device, flags, arg, argsz);
 	default:
 		return -ENOTTY;
 	}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 495193629029..8326faf8622b 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1512,6 +1512,19 @@ struct vfio_device_feature_bus_master {
 };
 #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
 
+/**
+ * Upon VFIO_DEVICE_FEATURE_GET, return the PASID capability for the device.
+ * Zero width means no support for PASID.
+ */
+struct vfio_device_feature_pasid {
+	__u16 capabilities;
+#define VFIO_DEVICE_PASID_CAP_EXEC	(1 << 0)
+#define VFIO_DEVICE_PASID_CAP_PRIV	(1 << 1)
+	__u8 width;
+	__u8 __reserved;
+};
+#define
Re: [PATCH 2/3] vfio: Add VFIO_DEVICE_PASID_[AT|DE]TACH_IOMMUFD_PT
On 2023/11/27 14:50, Duan, Zhenzhong wrote:

-----Original Message-----
From: Liu, Yi L
Sent: Monday, November 27, 2023 2:39 PM
Subject: [PATCH 2/3] vfio: Add VFIO_DEVICE_PASID_[AT|DE]TACH_IOMMUFD_PT

This adds ioctls for the userspace to attach/detach a given pasid of a
vfio device to/from an IOAS/HWPT.

Signed-off-by: Yi Liu
---
 drivers/vfio/device_cdev.c | 45 +++++++++++++++++
 drivers/vfio/vfio.h        |  4 +++
 drivers/vfio/vfio_main.c   |  8 ++++++
 include/uapi/linux/vfio.h  | 55 ++++++++++++++++++++++
 4 files changed, 112 insertions(+)

diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
index e75da0a70d1f..c2ac7ed44537 100644
--- a/drivers/vfio/device_cdev.c
+++ b/drivers/vfio/device_cdev.c
@@ -210,6 +210,51 @@ int vfio_df_ioctl_detach_pt(struct vfio_device_file *df,
 	return 0;
 }
 
+int vfio_df_ioctl_pasid_attach_pt(struct vfio_device_file *df,
+				  struct vfio_device_pasid_attach_iommufd_pt __user *arg)
+{
+	struct vfio_device *device = df->device;
+	struct vfio_device_pasid_attach_iommufd_pt attach;
+	unsigned long minsz;
+	int ret;
+
+	minsz = offsetofend(struct vfio_device_pasid_attach_iommufd_pt, pt_id);
+
+	if (copy_from_user(&attach, arg, minsz))
+		return -EFAULT;
+
+	if (attach.argsz < minsz || attach.flags)
+		return -EINVAL;
+
+	mutex_lock(&device->dev_set->lock);
+	ret = device->ops->pasid_attach_ioas(device, attach.pasid, &attach.pt_id);
+	mutex_unlock(&device->dev_set->lock);
+
+	return ret;
+}
+
+int vfio_df_ioctl_pasid_detach_pt(struct vfio_device_file *df,
+				  struct vfio_device_pasid_detach_iommufd_pt __user *arg)
+{
+	struct vfio_device *device = df->device;
+	struct vfio_device_pasid_detach_iommufd_pt detach;
+	unsigned long minsz;
+
+	minsz = offsetofend(struct vfio_device_pasid_detach_iommufd_pt, flags);

Pasid isn't copied, should use pasid here?

good catch! will fix it.

-- 
Regards,
Yi Liu
Re: [PATCH net-next 0/5] selftests: tc-testing: updates and cleanups for tdc
Hello:

This series was applied to netdev/net-next.git (main)
by Jakub Kicinski:

On Fri, 24 Nov 2023 12:42:43 -0300 you wrote:
> Address the recommendations from the previous series and cleanup some
> leftovers.
>
> Pedro Tammela (5):
>   selftests: tc-testing: remove buildebpf plugin
>   selftests: tc-testing: remove unnecessary time.sleep
>   selftests: tc-testing: prefix iproute2 functions with "ipr2"
>   selftests: tc-testing: cleanup on Ctrl-C
>   selftests: tc-testing: remove unused import
>
> [...]

Here is the summary with links:
  - [net-next,1/5] selftests: tc-testing: remove buildebpf plugin
    https://git.kernel.org/netdev/net-next/c/a79d8ba734bd
  - [net-next,2/5] selftests: tc-testing: remove unnecessary time.sleep
    https://git.kernel.org/netdev/net-next/c/8059e68b9928
  - [net-next,3/5] selftests: tc-testing: prefix iproute2 functions with "ipr2"
    https://git.kernel.org/netdev/net-next/c/56e16bc69bb7
  - [net-next,4/5] selftests: tc-testing: cleanup on Ctrl-C
    https://git.kernel.org/netdev/net-next/c/501679f5d4a4
  - [net-next,5/5] selftests: tc-testing: remove unused import
    https://git.kernel.org/netdev/net-next/c/ed346fccfc40

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
Re: [PATCH net 0/4] selftests/net: fix a few small compiler warnings
Hello:

This series was applied to netdev/net.git (main)
by Jakub Kicinski:

On Fri, 24 Nov 2023 12:15:18 -0500 you wrote:
> From: Willem de Bruijn
>
> Observed a clang warning when backporting cmsg_sender.
> Ran the same build against all the .c files under selftests/net.
>
> This is clang-14 with -Wall
> Which is what tools/testing/selftests/net/Makefile also enables.
>
> [...]

Here is the summary with links:
  - [net,1/4] selftests/net: ipsec: fix constant out of range
    https://git.kernel.org/netdev/net/c/088559815477
  - [net,2/4] selftests/net: fix a char signedness issue
    https://git.kernel.org/netdev/net/c/7b29828c5af6
  - [net,3/4] selftests/net: unix: fix unused variable compiler warning
    https://git.kernel.org/netdev/net/c/59fef379d453
  - [net,4/4] selftests/net: mptcp: fix uninitialized variable warnings
    https://git.kernel.org/netdev/net/c/00a4f8fd9c75

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
Re: [PATCH net-next 1/5] selftests: tc-testing: remove buildebpf plugin
On Fri, 24 Nov 2023 12:42:44 -0300 Pedro Tammela wrote:
> diff --git a/tools/testing/selftests/tc-testing/Makefile
> b/tools/testing/selftests/tc-testing/Makefile
> index b1fa2e177e2f..e8b3dde4fa16 100644
> --- a/tools/testing/selftests/tc-testing/Makefile
> +++ b/tools/testing/selftests/tc-testing/Makefile
> @@ -1,31 +1,6 @@
>  # SPDX-License-Identifier: GPL-2.0
> -include ../../../scripts/Makefile.include
> 
> -top_srcdir = $(abspath ../../../..)
> -APIDIR := $(top_scrdir)/include/uapi
> -TEST_GEN_FILES = action.o
> +TEST_PROGS += ./tdc.sh

nit: could you try to remove the ./ prefix, as a follow up?
I think it's not necessary and it confuses one of patchwork checks.
Re: [RFC PATCH 0/5] RISC-V: Add dynamic TSO support
On Mon, Nov 27, 2023 at 12:16:43PM +0100, Peter Zijlstra wrote:
> On Fri, Nov 24, 2023 at 09:51:53PM -0500, Guo Ren wrote:
> > On Fri, Nov 24, 2023 at 11:15:19AM +0100, Peter Zijlstra wrote:
> > > On Fri, Nov 24, 2023 at 08:21:37AM +0100, Christoph Muellner wrote:
> > > > From: Christoph Müllner
> > > >
> > > > The upcoming RISC-V Ssdtso specification introduces a bit in the senvcfg
> > > > CSR to switch the memory consistency model at run-time from RVWMO to TSO
> > > > (and back). The active consistency model can therefore be switched on a
> > > > per-hart base and managed by the kernel on a per-process/thread base.
> > >
> > > You guys, computers are hartless, nobody told ya?
> > >
> > > > This patch implements basic Ssdtso support and adds a prctl API on top
> > > > so that user-space processes can switch to a stronger memory consistency
> > > > model (than the kernel was written for) at run-time.
> > > >
> > > > I am not sure if other architectures support switching the memory
> > > > consistency model at run-time, but designing the prctl API in an
> > > > arch-independent way allows reusing it in the future.
> > >
> > > IIRC some Sparc chips could do this, but I don't think anybody ever
> > > exposed this to userspace (or used it much).
> > >
> > > IA64 had planned to do this, except they messed it up and did it the
> > > wrong way around (strong first and then relax it later), which lead to
> > > the discovery that all existing software broke (d'uh).
> > >
> > > I think ARM64 approached this problem by adding the
> > > load-acquire/store-release instructions and for TSO based code,
> > > translate into those (eg. x86 -> arm64 transpilers).
> >
> > Keeping global TSO order is easier and faster than mixing
> > acquire/release and regular load/store. That means when ssdtso is
> > enabled, the transpiler's load-acquire/store-release becomes regular
> > load/store. Some micro-arch hardwares could speed up the performance.
>
> Why is it faster?
> Because the release+acquire thing becomes RCsc instead
> of RCtso? Surely that can be fixed with a weaker store-release variant
> or something?

The "ld.acq + st.rel" could only be close to the ideal RCtso because
maintaining "ld.acq + st.rel + ld + st" is more complex in an LSU than
"ld + st" with global TSO. So that is why we want a global TSO flag: to
simplify the micro-arch implementation, especially for some small
processors in a big-little system.

> The problem I have with all of this is that you need to context switch
> this state and that you need to deal with exceptions, which must be
> written for the weak model but then end up running in the tso model --
> possibly slower than desired.

The s-mode TSO is useless for the riscv Linux kernel and this patch only
uses u-mode TSO. So the exception handler and the whole kernel always run
in WMO.

Two years ago, we worried about stuff like io_uring, which means io_uring
userspace is in TSO, but the kernel side is in WMO. But it still seems like
no problem because every side has a different implementation, but they all
ensure their order. So there should be no problem between TSO & WMO
io_uring communication.

The only things we need to prevent are:
 1. Do not let the WMO code run in TSO mode, which is inefficient. (you
    mentioned)
 2. Do not let the TSO code run in WMO mode, which is incorrect.

> If OTOH you only have a single model, everything becomes so much
> simpler. You just need to be able to express exactly what you want.

Ssdtso does no harm to the current WMO; it's just a tradeoff for the
micro-arch implementation. You could still use "ld + st" or
"ld.acq + st.rel", but they are the same in the global TSO state.
Re: [PATCH ipsec-next v1 6/7] bpf: selftests: test_tunnel: Disable CO-RE relocations
On Mon, Nov 27, 2023 at 02:45:11PM -0600, Daniel Xu wrote:
> On Sun, Nov 26, 2023 at 09:53:04PM -0800, Yonghong Song wrote:
> > 
> > On 11/27/23 12:44 AM, Yonghong Song wrote:
> > > 
> > > On 11/26/23 8:52 PM, Eduard Zingerman wrote:
> > > > On Sun, 2023-11-26 at 18:04 -0600, Daniel Xu wrote:
> > > > [...]
> > > > > > Tbh I'm not sure. This test passes with preserve_static_offset
> > > > > > because it suppresses preserve_access_index. In general clang
> > > > > > translates bitfield access to a set of IR statements like:
> > > > > >
> > > > > >   C:
> > > > > >     struct foo {
> > > > > >       unsigned _;
> > > > > >       unsigned a:1;
> > > > > >       ...
> > > > > >     };
> > > > > >     ... foo->a ...
> > > > > >
> > > > > >   IR:
> > > > > >     %a = getelementptr inbounds %struct.foo, ptr %0, i32 0, i32 1
> > > > > >     %bf.load = load i8, ptr %a, align 4
> > > > > >     %bf.clear = and i8 %bf.load, 1
> > > > > >     %bf.cast = zext i8 %bf.clear to i32
> > > > > >
> > > > > > With preserve_static_offset the getelementptr+load are replaced by a
> > > > > > single statement which is preserved as-is till code generation,
> > > > > > thus load with align 4 is preserved.
> > > > > >
> > > > > > On the other hand, I'm not sure that clang guarantees that load or
> > > > > > stores used for bitfield access would be always aligned according to
> > > > > > verifier expectations.
> > > > > >
> > > > > > I think we should check if there are some clang knobs that prevent
> > > > > > generation of unaligned memory access. I'll take a look.
> > > > > Is there a reason to prefer fixing in compiler? I'm not opposed to it,
> > > > > but the downside to compiler fix is it takes years to propagate and
> > > > > sprinkles ifdefs into the code.
> > > > >
> > > > > Would it be possible to have an analogue of BPF_CORE_READ_BITFIELD()?
> > > > Well, the contraption below passes verification, tunnel selftest
> > > > appears to work. I might have messed up some shifts in the macro,
> > > > though.
> > > 
> > > I didn't test it.
But from high level it should work. > > > > > > > > > > > Still, if clang would peek unlucky BYTE_{OFFSET,SIZE} for a particular > > > > field access might be unaligned. > > > > > > clang should pick a sensible BYTE_SIZE/BYTE_OFFSET to meet > > > alignment requirement. This is also required for BPF_CORE_READ_BITFIELD. > > > > > > > > > > > --- > > > > > > > > diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c > > > > b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c > > > > index 3065a716544d..41cd913ac7ff 100644 > > > > --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c > > > > +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c > > > > @@ -9,6 +9,7 @@ > > > > #include "vmlinux.h" > > > > #include > > > > #include > > > > +#include > > > > #include "bpf_kfuncs.h" > > > > #include "bpf_tracing_net.h" > > > > @@ -144,6 +145,38 @@ int ip6gretap_get_tunnel(struct __sk_buff *skb) > > > > return TC_ACT_OK; > > > > } > > > > +#define BPF_CORE_WRITE_BITFIELD(s, field, new_val) ({ \ > > > > + void *p = (void *)s + __CORE_RELO(s, field, BYTE_OFFSET); \ > > > > + unsigned byte_size = __CORE_RELO(s, field, BYTE_SIZE); \ > > > > + unsigned lshift = __CORE_RELO(s, field, LSHIFT_U64); \ > > > > + unsigned rshift = __CORE_RELO(s, field, RSHIFT_U64); \ > > > > + unsigned bit_size = (rshift - lshift); \ > > > > + unsigned long long nval, val, hi, lo; \ > > > > + \ > > > > + asm volatile("" : "=r"(p) : "0"(p)); \ > > > > > > Use asm volatile("" : "+r"(p)) ? 
> > > > > > > + \ > > > > + switch (byte_size) { \ > > > > + case 1: val = *(unsigned char *)p; break; \ > > > > + case 2: val = *(unsigned short *)p; break; \ > > > > + case 4: val = *(unsigned int *)p; break; \ > > > > + case 8: val = *(unsigned long long *)p; break; \ > > > > + } \ > > > > + hi = val >> (bit_size + rshift); \ > > > > + hi <<= bit_size + rshift; \ > > > > + lo = val << (bit_size + lshift); \ > > > > + lo >>= bit_size + lshift; \ > > > > + nval = new_val; \ > > > > + nval <<= lshift; \ > > > > + nval >>= rshift; \ > > > > + val = hi | nval | lo; \ > > > > + switch (byte_size) { \ > > > > + case 1: *(unsigned char *)p = val; break; \ > > > > + case 2: *(unsigned short *)p = val; break; \ > > > > + case 4: *(unsigned int *)p = val; break; \ > > > > + case 8: *(unsigned long long *)p = val; break; \ > > > > + }
[PATCH v7 5/6] selftests: cgroup: update per-memcg zswap writeback selftest
From: Domenico Cerasuolo The memcg-zswap self test is updated to adjust to the behavior change implemented by commit 87730b165089 ("zswap: make shrinking memcg-aware"), where zswap performs writeback for specific memcg. Signed-off-by: Domenico Cerasuolo Signed-off-by: Nhat Pham --- tools/testing/selftests/cgroup/test_zswap.c | 74 ++--- 1 file changed, 50 insertions(+), 24 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index c99d2adaca3f..47fdaa146443 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -50,9 +50,9 @@ static int get_zswap_stored_pages(size_t *value) return read_int("/sys/kernel/debug/zswap/stored_pages", value); } -static int get_zswap_written_back_pages(size_t *value) +static int get_cg_wb_count(const char *cg) { - return read_int("/sys/kernel/debug/zswap/written_back_pages", value); + return cg_read_key_long(cg, "memory.stat", "zswp_wb"); } static long get_zswpout(const char *cgroup) @@ -73,6 +73,24 @@ static int allocate_bytes(const char *cgroup, void *arg) return 0; } +static char *setup_test_group_1M(const char *root, const char *name) +{ + char *group_name = cg_name(root, name); + + if (!group_name) + return NULL; + if (cg_create(group_name)) + goto fail; + if (cg_write(group_name, "memory.max", "1M")) { + cg_destroy(group_name); + goto fail; + } + return group_name; +fail: + free(group_name); + return NULL; +} + /* * Sanity test to check that pages are written into zswap. */ @@ -117,43 +135,51 @@ static int test_zswap_usage(const char *root) /* * When trying to store a memcg page in zswap, if the memcg hits its memory - * limit in zswap, writeback should not be triggered. - * - * This was fixed with commit 0bdf0efa180a("zswap: do not shrink if cgroup may - * not zswap"). Needs to be revised when a per memcg writeback mechanism is - * implemented. 
+ * limit in zswap, writeback should affect only the zswapped pages of that + * memcg. */ static int test_no_invasive_cgroup_shrink(const char *root) { - size_t written_back_before, written_back_after; int ret = KSFT_FAIL; - char *test_group; + size_t control_allocation_size = MB(10); + char *control_allocation, *wb_group = NULL, *control_group = NULL; /* Set up */ - test_group = cg_name(root, "no_shrink_test"); - if (!test_group) - goto out; - if (cg_create(test_group)) + wb_group = setup_test_group_1M(root, "per_memcg_wb_test1"); + if (!wb_group) + return KSFT_FAIL; + if (cg_write(wb_group, "memory.zswap.max", "10K")) goto out; - if (cg_write(test_group, "memory.max", "1M")) + control_group = setup_test_group_1M(root, "per_memcg_wb_test2"); + if (!control_group) goto out; - if (cg_write(test_group, "memory.zswap.max", "10K")) + + /* Push some test_group2 memory into zswap */ + if (cg_enter_current(control_group)) goto out; - if (get_zswap_written_back_pages(&written_back_before)) + control_allocation = malloc(control_allocation_size); + for (int i = 0; i < control_allocation_size; i += 4095) + control_allocation[i] = 'a'; + if (cg_read_key_long(control_group, "memory.stat", "zswapped") < 1) goto out; - /* Allocate 10x memory.max to push memory into zswap */ - if (cg_run(test_group, allocate_bytes, (void *)MB(10))) + /* Allocate 10x memory.max to push wb_group memory into zswap and trigger wb */ + if (cg_run(wb_group, allocate_bytes, (void *)MB(10))) goto out; - /* Verify that no writeback happened because of the memcg allocation */ - if (get_zswap_written_back_pages(&written_back_after)) - goto out; - if (written_back_after == written_back_before) + /* Verify that only zswapped memory from wb_group has been written back */ + if (get_cg_wb_count(wb_group) > 0 && get_cg_wb_count(control_group) == 0) ret = KSFT_PASS; out: - cg_destroy(test_group); - free(test_group); + cg_enter_current(root); + if (control_group) { + cg_destroy(control_group); + free(control_group); + } + 
cg_destroy(wb_group); + free(wb_group); + if (control_allocation) + free(control_allocation); return ret; } -- 2.34.1
[PATCH v7 6/6] zswap: shrinks zswap pool based on memory pressure
Currently, we only shrink the zswap pool when the user-defined limit is hit. This means that if we set the limit too high, cold data that are unlikely to be used again will reside in the pool, wasting precious memory. It is hard to predict how much zswap space will be needed ahead of time, as this depends on the workload (specifically, on factors such as memory access patterns and compressibility of the memory pages).

This patch implements a memcg- and NUMA-aware shrinker for zswap that is initiated when there is memory pressure. The shrinker does not have any parameter that must be tuned by the user, and can be opted in or out on a per-memcg basis. Furthermore, to make it more robust for many workloads and prevent overshrinking (i.e. evicting warm pages that might be refaulted into memory), we build in the following heuristics:

* Estimate the number of warm pages residing in zswap, and attempt to protect this region of the zswap LRU.
* Scale the number of freeable objects by an estimate of the memory saving factor. The better zswap compresses the data, the fewer pages we will evict to swap (as we will otherwise incur IO for relatively small memory saving).
* During reclaim, if the shrinker encounters a page that is also being brought into memory, the shrinker will cautiously terminate its shrinking action, as this is a sign that it is touching the warmer region of the zswap LRU.

As a proof of concept, we ran the following synthetic benchmark: build the linux kernel in a memory-limited cgroup, and allocate some cold data in tmpfs to see if the shrinker could write them out and improve the overall performance. Depending on the amount of cold data generated, we observe a 14% to 35% reduction in kernel CPU time used in the kernel builds. 
Signed-off-by: Nhat Pham --- Documentation/admin-guide/mm/zswap.rst | 7 + include/linux/mmzone.h | 2 + include/linux/zswap.h | 25 +++- mm/mmzone.c| 1 + mm/swap_state.c| 2 + mm/zswap.c | 177 - 6 files changed, 208 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst index 45b98390e938..522ae22ccb84 100644 --- a/Documentation/admin-guide/mm/zswap.rst +++ b/Documentation/admin-guide/mm/zswap.rst @@ -153,6 +153,13 @@ attribute, e. g.:: Setting this parameter to 100 will disable the hysteresis. +When there is a sizable amount of cold memory residing in the zswap pool, it +can be advantageous to proactively write these cold pages to swap and reclaim +the memory for other use cases. By default, the zswap shrinker is disabled. +User can enable it as follows: + + echo Y > /sys/module/zswap/parameters/shrinker_enabled + A debugfs interface is provided for various statistic about pool size, number of pages stored, same-value filled pages and various counters for the reasons pages are rejected. diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7b1816450bfc..b23bc5390240 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -22,6 +22,7 @@ #include #include #include +#include #include /* Free memory management - zoned buddy allocator. */ @@ -641,6 +642,7 @@ struct lruvec { #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif + struct zswap_lruvec_state zswap_lruvec_state; }; /* Isolate for asynchronous migration */ diff --git a/include/linux/zswap.h b/include/linux/zswap.h index e571e393669b..03253a5965bb 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -5,20 +5,40 @@ #include #include +struct lruvec; + extern u64 zswap_pool_total_size; extern atomic_t zswap_stored_pages; #ifdef CONFIG_ZSWAP +struct zswap_lruvec_state { + /* +* Number of pages in zswap that should be protected from the shrinker. 
+* This number is an estimate of the following counts: +* +* a) Recent page faults. +* b) Recent insertion to the zswap LRU. This includes new zswap stores, +* as well as recent zswap LRU rotations. +* +* These pages are likely to be warm, and might incur IO if they are written +* to swap. +*/ + atomic_long_t nr_zswap_protected; +}; + bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); void zswap_swapon(int type); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); - +void zswap_lruvec_state_init(struct lruvec *lruvec); +void zswap_lruvec_swapin(struct page *page); #else +struct zswap_lruvec_state {}; + static inline bool zswap_store(struct folio *folio) { return false; @@ -33,7 +53,8 @@ static inline void zswap_invalidate(int type, pgoff_t offset) {} static inline void zswap_swapon(int type) {} static inline void
[PATCH v7 2/6] memcontrol: add a new function to traverse online-only memcg hierarchy
The new zswap writeback scheme requires an online-only memcg hierarchy traversal. Add this functionality via the new mem_cgroup_iter_online() function - the old mem_cgroup_iter() is a special case of this new function. Suggested-by: Andrew Morton Signed-off-by: Nhat Pham --- include/linux/memcontrol.h | 13 + mm/memcontrol.c| 29 + 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7bdcf3020d7a..5bfe41840e80 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -833,6 +833,10 @@ static inline void mem_cgroup_put(struct mem_cgroup *memcg) struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, struct mem_cgroup *, struct mem_cgroup_reclaim_cookie *); +struct mem_cgroup *mem_cgroup_iter_online(struct mem_cgroup *root, + struct mem_cgroup *prev, + struct mem_cgroup_reclaim_cookie *reclaim, + bool online); void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*)(struct task_struct *, void *), void *arg); @@ -1386,6 +1390,15 @@ mem_cgroup_iter(struct mem_cgroup *root, return NULL; } +static inline struct mem_cgroup * +mem_cgroup_iter_online(struct mem_cgroup *root, + struct mem_cgroup *prev, + struct mem_cgroup_reclaim_cookie *reclaim, + bool online) +{ + return NULL; +} + static inline void mem_cgroup_iter_break(struct mem_cgroup *root, struct mem_cgroup *prev) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 470821d1ba1a..b1fb96233fa1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -,10 +,11 @@ struct mem_cgroup *get_mem_cgroup_from_current(void) } /** - * mem_cgroup_iter - iterate over memory cgroup hierarchy + * mem_cgroup_iter_online - iterate over memory cgroup hierarchy * @root: hierarchy root * @prev: previously returned memcg, NULL on first invocation * @reclaim: cookie for shared reclaim walks, NULL for full walks + * @online: whether to skip offline memcgs * * Returns references to 
children of the hierarchy below @root, or * @root itself, or %NULL after a full round-trip. @@ -1123,13 +1124,16 @@ struct mem_cgroup *get_mem_cgroup_from_current(void) * invocations for reference counting, or use mem_cgroup_iter_break() * to cancel a hierarchy walk before the round-trip is complete. * + * Caller can skip offline memcgs by passing true for @online. + * * Reclaimers can specify a node in @reclaim to divide up the memcgs * in the hierarchy among all concurrent reclaimers operating on the * same node. */ -struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, +struct mem_cgroup *mem_cgroup_iter_online(struct mem_cgroup *root, struct mem_cgroup *prev, - struct mem_cgroup_reclaim_cookie *reclaim) + struct mem_cgroup_reclaim_cookie *reclaim, + bool online) { struct mem_cgroup_reclaim_iter *iter; struct cgroup_subsys_state *css = NULL; @@ -1199,7 +1203,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, * is provided by the caller, so we know it's alive * and kicking, and don't take an extra reference. */ - if (css == &root->css || css_tryget(css)) { + if (css == &root->css || (!online && css_tryget(css)) || + css_tryget_online(css)) { memcg = mem_cgroup_from_css(css); break; } @@ -1228,6 +1233,22 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, return memcg; } +/** + * mem_cgroup_iter - iterate over memory cgroup hierarchy + * @root: hierarchy root + * @prev: previously returned memcg, NULL on first invocation + * @reclaim: cookie for shared reclaim walks, NULL for full walks + * + * Perform an iteration on the memory cgroup hierarchy without skipping + * offline memcgs. + */ +struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, + struct mem_cgroup *prev, + struct mem_cgroup_reclaim_cookie *reclaim) +{ + return mem_cgroup_iter_online(root, prev, reclaim, false); +} + /** * mem_cgroup_iter_break - abort a hierarchy walk prematurely * @root: hierarchy root -- 2.34.1
[PATCH v7 3/6] zswap: make shrinking memcg-aware
From: Domenico Cerasuolo Currently, we only have a single global LRU for zswap. This makes it impossible to perform worload-specific shrinking - an memcg cannot determine which pages in the pool it owns, and often ends up writing pages from other memcgs. This issue has been previously observed in practice and mitigated by simply disabling memcg-initiated shrinking: https://lore.kernel.org/all/20230530232435.3097106-1-npha...@gmail.com/T/#u This patch fully resolves the issue by replacing the global zswap LRU with memcg- and NUMA-specific LRUs, and modify the reclaim logic: a) When a store attempt hits an memcg limit, it now triggers a synchronous reclaim attempt that, if successful, allows the new hotter page to be accepted by zswap. b) If the store attempt instead hits the global zswap limit, it will trigger an asynchronous reclaim attempt, in which an memcg is selected for reclaim in a round-robin-like fashion. Signed-off-by: Domenico Cerasuolo Co-developed-by: Nhat Pham Signed-off-by: Nhat Pham --- include/linux/memcontrol.h | 5 + include/linux/zswap.h | 2 + mm/memcontrol.c| 2 + mm/swap.h | 3 +- mm/swap_state.c| 24 +++- mm/zswap.c | 248 + 6 files changed, 223 insertions(+), 61 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5bfe41840e80..a568f70a2677 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1191,6 +1191,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) return NULL; } +static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) +{ + return NULL; +} + static inline bool folio_memcg_kmem(struct folio *folio) { return false; diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 2a60ce39cfde..e571e393669b 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -15,6 +15,7 @@ bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); void zswap_swapon(int type); void zswap_swapoff(int type); 
+void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); #else @@ -31,6 +32,7 @@ static inline bool zswap_load(struct folio *folio) static inline void zswap_invalidate(int type, pgoff_t offset) {} static inline void zswap_swapon(int type) {} static inline void zswap_swapoff(int type) {} +static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b1fb96233fa1..8c0f3f971179 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5635,6 +5635,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) page_counter_set_min(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0); + zswap_memcg_offline_cleanup(memcg); + memcg_offline_kmem(memcg); reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); diff --git a/mm/swap.h b/mm/swap.h index 73c332ee4d91..c0dc73e10e91 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -51,7 +51,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct swap_iocb **plug); struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, -bool *new_page_allocated); +bool *new_page_allocated, +bool skip_if_exists); struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, struct mempolicy *mpol, pgoff_t ilx); struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, diff --git a/mm/swap_state.c b/mm/swap_state.c index 85d9e5806a6a..6c84236382f3 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -412,7 +412,8 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, -bool *new_page_allocated) +bool *new_page_allocated, +bool skip_if_exists) { struct swap_info_struct *si; struct folio *folio; @@ -470,6 +471,17 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, if (err != -EEXIST) goto fail_put_swap; + /* +* Protect against a recursive call 
to __read_swap_cache_async() +* on the same entry waiting forever here because SWAP_HAS_CACHE +* is set but the folio is not the swap cache yet. This can +* happen today if mem_cgroup_swapin_charge_folio() below +* triggers reclaim through zswap, which
[PATCH v7 0/6] workload-specific and memory pressure-driven zswap writeback
Changelog:
v7:
* Added the mem_cgroup_iter_online() function to the API for the new behavior (suggested by Andrew Morton) (patch 2)
* Fixed a missing list_lru_del -> list_lru_del_obj (patch 1)
v6:
* Rebase on top of latest mm-unstable.
* Fix/improve the in-code documentation of the new list_lru manipulation functions (patch 1)
v5:
* Replace reference getting with an rcu_read_lock() section for zswap lru modifications (suggested by Yosry)
* Add a new prep patch that allows mem_cgroup_iter() to return online cgroups.
* Add a callback that updates pool->next_shrink when the cgroup is offlined (suggested by Yosry Ahmed, Johannes Weiner)
v4:
* Rename list_lru_add to list_lru_add_obj and __list_lru_add to list_lru_add (patch 1) (suggested by Johannes Weiner and Yosry Ahmed)
* Some cleanups on the memcg aware LRU patch (patch 2) (suggested by Yosry Ahmed)
* Use event interface for the new per-cgroup writeback counters. (patch 3) (suggested by Yosry Ahmed)
* Abstract zswap's lruvec states and handling into zswap_lruvec_state (patch 5) (suggested by Yosry Ahmed)
v3:
* Add a patch to export per-cgroup zswap writeback counters
* Add a patch to update zswap's kselftest
* Separate the new list_lru functions into its own prep patch
* Do not start from the top of the hierarchy when encountering a memcg that is not online for the global limit zswap writeback (patch 2) (suggested by Yosry Ahmed)
* Do not remove the swap entry from list_lru in __read_swapcache_async() (patch 2) (suggested by Yosry Ahmed)
* Removed a redundant zswap pool getting (patch 2) (reported by Ryan Roberts)
* Use atomic for the nr_zswap_protected (instead of lruvec's lock) (patch 5) (suggested by Yosry Ahmed)
* Remove the per-cgroup zswap shrinker knob (patch 5) (suggested by Yosry Ahmed)
v2:
* Fix loongarch compiler errors
* Use pool stats instead of memcg stats when !CONFIG_MEMCG_KMEM

There are currently several issues with zswap writeback:

1. There is only a single global LRU for zswap, making it impossible to perform workload-specific shrinking - a memcg under memory pressure cannot determine which pages in the pool it owns, and often ends up writing pages from other memcgs. This issue has been previously observed in practice and mitigated by simply disabling memcg-initiated shrinking: https://lore.kernel.org/all/20230530232435.3097106-1-npha...@gmail.com/T/#u But this solution leaves a lot to be desired, as we still do not have an avenue for a memcg to free up its own memory locked up in the zswap pool.

2. We only shrink the zswap pool when the user-defined limit is hit. This means that if we set the limit too high, cold data that are unlikely to be used again will reside in the pool, wasting precious memory. It is hard to predict how much zswap space will be needed ahead of time, as this depends on the workload (specifically, on factors such as memory access patterns and compressibility of the memory pages).

This patch series solves these issues by separating the global zswap LRU into per-memcg and per-NUMA LRUs, and performs workload-specific (i.e. memcg- and NUMA-aware) zswap writeback under memory pressure. The new shrinker does not have any parameter that must be tuned by the user, and can be opted in or out on a per-memcg basis.

As a proof of concept, we ran the following synthetic benchmark: build the linux kernel in a memory-limited cgroup, and allocate some cold data in tmpfs to see if the shrinker could write them out and improve the overall performance. Depending on the amount of cold data generated, we observe a 14% to 35% reduction in kernel CPU time used in the kernel builds. 
Domenico Cerasuolo (3): zswap: make shrinking memcg-aware mm: memcg: add per-memcg zswap writeback stat selftests: cgroup: update per-memcg zswap writeback selftest Nhat Pham (3): list_lru: allows explicit memcg and NUMA node selection memcontrol: add a new function to traverse online-only memcg hierarchy zswap: shrinks zswap pool based on memory pressure Documentation/admin-guide/mm/zswap.rst | 7 + drivers/android/binder_alloc.c | 7 +- fs/dcache.c | 8 +- fs/gfs2/quota.c | 6 +- fs/inode.c | 4 +- fs/nfs/nfs42xattr.c | 8 +- fs/nfsd/filecache.c | 4 +- fs/xfs/xfs_buf.c| 6 +- fs/xfs/xfs_dquot.c | 2 +- fs/xfs/xfs_qm.c | 2 +- include/linux/list_lru.h| 54 ++- include/linux/memcontrol.h | 18 + include/linux/mmzone.h | 2 + include/linux/vm_event_item.h | 1 + include/linux/zswap.h | 27 +-
[RFC PATCH v2 1/2] kselftest: Add test to verify probe of devices from discoverable busses
Add a new test to verify that a list of expected devices on a given platform have been successfully probed by a driver. Add a new test to verify that all expected devices from discoverable busses (ie USB, PCI) have been successfully instantiated and probed by a driver. The per-platform list of expected devices is selected from the ones under the boards/ directory based on compatible. Signed-off-by: Nícolas F. R. A. Prado --- (no changes since v1) tools/testing/selftests/Makefile | 1 + tools/testing/selftests/devices/.gitignore| 1 + tools/testing/selftests/devices/Makefile | 8 + .../devices/test_discoverable_devices.sh | 160 ++ 4 files changed, 170 insertions(+) create mode 100644 tools/testing/selftests/devices/.gitignore create mode 100644 tools/testing/selftests/devices/Makefile create mode 100755 tools/testing/selftests/devices/test_discoverable_devices.sh diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 3b2061d1c1a5..7f5088006c3c 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -13,6 +13,7 @@ TARGETS += core TARGETS += cpufreq TARGETS += cpu-hotplug TARGETS += damon +TARGETS += devices TARGETS += dmabuf-heaps TARGETS += drivers/dma-buf TARGETS += drivers/s390x/uvdevice diff --git a/tools/testing/selftests/devices/.gitignore b/tools/testing/selftests/devices/.gitignore new file mode 100644 index ..e3c5c04d1b19 --- /dev/null +++ b/tools/testing/selftests/devices/.gitignore @@ -0,0 +1 @@ +ktap_helpers.sh diff --git a/tools/testing/selftests/devices/Makefile b/tools/testing/selftests/devices/Makefile new file mode 100644 index ..ff2fdc8fc5e2 --- /dev/null +++ b/tools/testing/selftests/devices/Makefile @@ -0,0 +1,8 @@ +TEST_PROGS := test_discoverable_devices.sh +TEST_GEN_FILES := ktap_helpers.sh +TEST_FILES := boards + +include ../lib.mk + +$(OUTPUT)/ktap_helpers.sh: + cp ../dt/ktap_helpers.sh $@ diff --git a/tools/testing/selftests/devices/test_discoverable_devices.sh 
b/tools/testing/selftests/devices/test_discoverable_devices.sh new file mode 100755 index ..82bad5ba7081 --- /dev/null +++ b/tools/testing/selftests/devices/test_discoverable_devices.sh @@ -0,0 +1,160 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (c) 2023 Collabora Ltd +# +# This script tests for presence and driver binding of devices from discoverable +# busses (ie USB, PCI). +# +# The per-platform list of devices to be tested is stored inside the boards/ +# directory and chosen based on compatible. Each line of the file follows the +# following format: +# +# usb|pci test_name number_of_matches field=value [ field=value ... ] +# +# The available match fields vary by bus. The field-value match pairs for a +# device can be retrieved from the device's modalias attribute in sysfs. +# + +DIR="$(dirname $(readlink -f "$0"))" + +source "${DIR}"/ktap_helpers.sh + +KSFT_FAIL=1 +KSFT_SKIP=4 + +retval=0 + +match() +{ + FILE="$1" + ID="$2" + + [ ! -f "$FILE" ] && return 1 + [ "$ID" = "" ] && return 0 + grep -q "$ID" "$FILE" || return 1 + return 0 +} + +usb() +{ + name="$1" + count="$2" + shift 2 + + for arg in $@; do + [[ "$arg" =~ ^v= ]] && v="${arg#v=}" + [[ "$arg" =~ ^p= ]] && p="${arg#p=}" + [[ "$arg" =~ ^d= ]] && d="${arg#d=}" + [[ "$arg" =~ ^dc= ]] && dc="${arg#dc=}" + [[ "$arg" =~ ^dsc= ]] && dsc="${arg#dsc=}" + [[ "$arg" =~ ^dp= ]] && dp="${arg#dp=}" + [[ "$arg" =~ ^ic= ]] && ic="${arg#ic=}" + [[ "$arg" =~ ^isc= ]] && isc="${arg#isc=}" + [[ "$arg" =~ ^ip= ]] && ip="${arg#ip=}" + [[ "$arg" =~ ^in= ]] && in="${arg#in=}" + done + + cur_count=0 + + for dev in $(find /sys/bus/usb/devices -maxdepth 1); do + match "$dev"/idVendor "$v" || continue + match "$dev"/idProduct "$p" || continue + match "$dev"/bcdDevice "$d" || continue + match "$dev"/bDeviceClass "$dc" || continue + match "$dev"/bDeviceSubClass "$dsc" || continue + match "$dev"/bDeviceProtocol "$dp" || continue + + # Matched device. 
Now search through interfaces + for intf in $(find "$dev"/ -maxdepth 1 -type d); do + match "$intf"/bInterfaceClass "$ic" || continue + match "$intf"/bInterfaceSubClass "$isc" || continue + match "$intf"/bInterfaceProtocol "$ip" || continue + match "$intf"/bInterfaceNumber "$in" || continue + + # Matched interface. Add to count if it was probed by driver. + [ -d "$intf"/driver ] && cur_count=$((cur_count+1)) + done + done + + if [ "$cur_count" -eq "$count" ]; then +
[RFC PATCH v2 0/2] Add test to verify probe of devices from discoverable busses
Hi, v1 [1] was discussed during Plumbers [2], where a lot of feedback was given. I hope to justify the changes in v2 and address the feedback here. One feedback from Shuah was that keeping per-platform files with the USB/PCI devices to test as part of the kselftest tree wasn't maintainable. One proposed alternative was to generate a list of probed devices on a known-good kernel and use that as a reference. However you need someone to look at that generated reference to be able to say it is a good one, and you need to save it to ensure it will be reproducible later anyway, so that wouldn't actually solve the problem. It is a matter of hand-crafting vs generating the test definitions, but they will need to be vouched by someone and stored somewhere in both cases. So for this v2, in patch 2 I just have a sample test definition, and the per-platform test definitions would be added to a separate repository. The other feedback received was that the BIOS might reconfigure the PCI topology (at least on x86), meaning that relying on a sequence of device and function numbers (eg 1d.0/02.0/0.0) as a stable description of a device on the platform is not possible. I couldn't verify whether this is really the case (if you have any more insight into this, please let me know), but with that in mind, here in v2 I have taken a different approach. Here I'm using the device's properties which are used for driver matching (the same that show on modalias) to identify a device in a stable way. This approach has some drawbacks compared to the one on v1. For one it doesn't uniquely identify a device, so if there are multiple of the same device on a platform they have to be checked as a group. Also the test definition isn't as human-readable. I'm adding in CC the people I recognized at the Plumbers session that were interested in this work. Feel free to add anyone missing. 
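For illustration, a board file under the match-fields scheme might contain entries like the following, with one device (or group of identical devices) per line in the `usb|pci test_name number_of_matches field=value ...` format described in patch 1. The test names and ID values here are hypothetical, not taken from a real board:

```
# bus  test_name  number_of_matches  field=value ...
usb camera 1 v=0bda p=5634
usb bluetooth 1 ic=e0 isc=01 ip=01
pci wifi 1 v=14c3 d=7961
```

The field-value pairs for an existing device can be read off its modalias attribute in sysfs, which is what makes this encoding stable across BIOS-driven topology changes.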
Thanks, Nícolas [1] https://lore.kernel.org/all/20231024211818.365844-1-nfrapr...@collabora.com [2] https://www.youtube.com/watch?v=oE73eVSyFXQ=9377s Original cover letter: This is part of an effort to improve detection of regressions impacting device probe on all platforms. The recently merged DT kselftest [3] detects probe issues for all devices described statically in the DT. That leaves out devices discovered at run-time from discoverable busses. This is where this test comes in. All of the devices that are connected through discoverable busses (ie USB and PCI), and which are internal and therefore always present, can be described in a per-platform file so they can be checked for. The test will check that the device has been instantiated and bound to a driver. Patch 1 introduces the test. Patch 2 adds the test definitions for the google,spherion machine (Acer Chromebook 514) as an example. This is the sample output from the test running on Spherion: TAP version 13 Using board file: boards/google,spherion 1..3 ok 1 usb.camera ok 2 usb.bluetooth ok 3 pci.wifi Totals: pass:3 fail:0 xfail:0 xpass:0 skip:0 error:0 [3] https://lore.kernel.org/all/20230828211424.2964562-1-nfrapr...@collabora.com/ Changes in v2: - Changed approach of encoding stable device reference in test file from HW topology to device match fields (the ones from modalias) - Better documented test format Nícolas F. R. A. 
Prado (2): kselftest: Add test to verify probe of devices from discoverable busses kselftest: devices: Add sample board file for google,spherion tools/testing/selftests/Makefile | 1 + tools/testing/selftests/devices/.gitignore| 1 + tools/testing/selftests/devices/Makefile | 8 + .../selftests/devices/boards/google,spherion | 12 ++ .../devices/test_discoverable_devices.sh | 160 ++ 5 files changed, 182 insertions(+) create mode 100644 tools/testing/selftests/devices/.gitignore create mode 100644 tools/testing/selftests/devices/Makefile create mode 100644 tools/testing/selftests/devices/boards/google,spherion create mode 100755 tools/testing/selftests/devices/test_discoverable_devices.sh -- 2.42.1
[PATCH v6 1/6] list_lru: allows explicit memcg and NUMA node selection (fix)
The original patch missed a list_lru_del() -> list_lru_del_obj() conversion. This patch fixes that. Signed-off-by: Nhat Pham --- drivers/android/binder_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index e80669d4e037..f69d30c9f50f 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -234,7 +234,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, if (page->page_ptr) { trace_binder_alloc_lru_start(alloc, index); - on_lru = list_lru_del(&binder_alloc_lru, &page->lru); + on_lru = list_lru_del_obj(&binder_alloc_lru, &page->lru); WARN_ON(!on_lru); trace_binder_alloc_lru_end(alloc, index); -- 2.34.1
Re: [PATCH v6 2/6] memcontrol: allows mem_cgroup_iter() to check for onlineness
On Mon, Nov 27, 2023 at 1:43 PM Andrew Morton wrote: > > On Mon, 27 Nov 2023 11:36:59 -0800 Nhat Pham wrote: > > > The new zswap writeback scheme requires an online-only memcg hierarchy > > traversal. Add a new parameter to mem_cgroup_iter() to check for > > onlineness before returning. > > I get a few build errors, perhaps because of patch timing issues... Ah I thought I got all of them. Must have somehow missed it. > > mm/shrinker_debug.c: In function 'shrinker_debugfs_count_show': > mm/shrinker_debug.c:64:17: error: too few arguments to function > 'mem_cgroup_iter' >64 | memcg = mem_cgroup_iter(NULL, NULL, NULL); > | ^~~ > In file included from mm/shrinker_debug.c:7: > ./include/linux/memcontrol.h:833:20: note: declared here > 833 | struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, > |^~~ > mm/shrinker_debug.c:89:27: error: too few arguments to function > 'mem_cgroup_iter' >89 | } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != > NULL); > | ^~~ > ./include/linux/memcontrol.h:833:20: note: declared here > 833 | struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, > |^~~ > mm/damon/sysfs-schemes.c: In function 'damon_sysfs_memcg_path_to_id': > mm/damon/sysfs-schemes.c:1594:22: error: too few arguments to function > 'mem_cgroup_iter' > 1594 | for (memcg = mem_cgroup_iter(NULL, NULL, NULL); memcg; > | ^~~ > In file included from ./include/linux/damon.h:11, > from mm/damon/sysfs-common.h:8, > from mm/damon/sysfs-schemes.c:10: > ./include/linux/memcontrol.h:833:20: note: declared here > 833 | struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, > |^~~ > mm/damon/sysfs-schemes.c:1595:33: error: too few arguments to function > 'mem_cgroup_iter' > 1595 | memcg = mem_cgroup_iter(NULL, memcg, NULL)) { > | ^~~ > ./include/linux/memcontrol.h:833:20: note: declared here > 833 | struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, > |^~~ > > > --- a/include/linux/memcontrol.h > > +++ b/include/linux/memcontrol.h > > @@ -832,7 +832,7 @@ static inline void 
mem_cgroup_put(struct mem_cgroup > > *memcg) > > > > struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, > > struct mem_cgroup *, > > -struct mem_cgroup_reclaim_cookie *); > > +struct mem_cgroup_reclaim_cookie *, bool > > online); > > How many callsites do we expect to utilize the new `online' argument? > Few, I suspect. > > How about we fix the above and simplify the patch by adding a new > mem_cgroup_iter_online() and make mem_cgroup_iter() a one-line wrapper > which calls that and adds the online=false argument? But yes, this is a much smarter idea. Should have thought about it initially :) > > I also saw this, didn't investigate. > > drivers/android/binder_alloc.c: In function 'binder_update_page_range': > drivers/android/binder_alloc.c:237:34: error: too few arguments to function > 'list_lru_del' > 237 | on_lru = list_lru_del(&binder_alloc_lru, &page->lru); Oh yeah I missed this too - it's due to the API change introduced by the previous patch. The old "list_lru_del" is now "list_lru_del_obj". Let me double check everything again and send out the fixes. My apologies. >
Re: [PATCH v1 2/3] KVM: selftests: aarch64: Move the pmu helper function into lib/
Hi Shaoqin, On Wed, Nov 22, 2023 at 10:39 PM Shaoqin Huang wrote: > > Move those pmu helper function into lib/, thus it can be used by other > pmu test. > > Signed-off-by: Shaoqin Huang > --- > .../kvm/aarch64/vpmu_counter_access.c | 118 - > .../selftests/kvm/include/aarch64/vpmu.h | 119 ++ > 2 files changed, 119 insertions(+), 118 deletions(-) > > diff --git a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c > b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c > index 17305408a334..62d6315790ab 100644 > --- a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c > +++ b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c > @@ -20,12 +20,6 @@ > #include > #include > > -/* The max number of the PMU event counters (excluding the cycle counter) */ > -#define ARMV8_PMU_MAX_GENERAL_COUNTERS (ARMV8_PMU_MAX_COUNTERS - 1) > - > -/* The cycle counter bit position that's common among the PMU registers */ > -#define ARMV8_PMU_CYCLE_IDX31 > - > static struct vpmu_vm *vpmu_vm; > > struct pmreg_sets { > @@ -35,118 +29,6 @@ struct pmreg_sets { > > #define PMREG_SET(set, clr) {.set_reg_id = set, .clr_reg_id = clr} > > -static uint64_t get_pmcr_n(uint64_t pmcr) > -{ > - return (pmcr >> ARMV8_PMU_PMCR_N_SHIFT) & ARMV8_PMU_PMCR_N_MASK; > -} > - > -static void set_pmcr_n(uint64_t *pmcr, uint64_t pmcr_n) > -{ > - *pmcr = *pmcr & ~(ARMV8_PMU_PMCR_N_MASK << ARMV8_PMU_PMCR_N_SHIFT); > - *pmcr |= (pmcr_n << ARMV8_PMU_PMCR_N_SHIFT); > -} > - > -static uint64_t get_counters_mask(uint64_t n) > -{ > - uint64_t mask = BIT(ARMV8_PMU_CYCLE_IDX); > - > - if (n) > - mask |= GENMASK(n - 1, 0); > - return mask; > -} > - > -/* Read PMEVTCNTR_EL0 through PMXEVCNTR_EL0 */ > -static inline unsigned long read_sel_evcntr(int sel) > -{ > - write_sysreg(sel, pmselr_el0); > - isb(); > - return read_sysreg(pmxevcntr_el0); > -} > - > -/* Write PMEVTCNTR_EL0 through PMXEVCNTR_EL0 */ > -static inline void write_sel_evcntr(int sel, unsigned long val) > -{ > - write_sysreg(sel, 
pmselr_el0); > - isb(); > - write_sysreg(val, pmxevcntr_el0); > - isb(); > -} > - > -/* Read PMEVTYPER_EL0 through PMXEVTYPER_EL0 */ > -static inline unsigned long read_sel_evtyper(int sel) > -{ > - write_sysreg(sel, pmselr_el0); > - isb(); > - return read_sysreg(pmxevtyper_el0); > -} > - > -/* Write PMEVTYPER_EL0 through PMXEVTYPER_EL0 */ > -static inline void write_sel_evtyper(int sel, unsigned long val) > -{ > - write_sysreg(sel, pmselr_el0); > - isb(); > - write_sysreg(val, pmxevtyper_el0); > - isb(); > -} > - > -static inline void enable_counter(int idx) > -{ > - uint64_t v = read_sysreg(pmcntenset_el0); > - > - write_sysreg(BIT(idx) | v, pmcntenset_el0); > - isb(); > -} > - > -static inline void disable_counter(int idx) > -{ > - uint64_t v = read_sysreg(pmcntenset_el0); > - > - write_sysreg(BIT(idx) | v, pmcntenclr_el0); > - isb(); > -} > - > -static void pmu_disable_reset(void) > -{ > - uint64_t pmcr = read_sysreg(pmcr_el0); > - > - /* Reset all counters, disabling them */ > - pmcr &= ~ARMV8_PMU_PMCR_E; > - write_sysreg(pmcr | ARMV8_PMU_PMCR_P, pmcr_el0); > - isb(); > -} > - > -#define RETURN_READ_PMEVCNTRN(n) \ > - return read_sysreg(pmevcntr##n##_el0) > -static unsigned long read_pmevcntrn(int n) > -{ > - PMEVN_SWITCH(n, RETURN_READ_PMEVCNTRN); > - return 0; > -} > - > -#define WRITE_PMEVCNTRN(n) \ > - write_sysreg(val, pmevcntr##n##_el0) > -static void write_pmevcntrn(int n, unsigned long val) > -{ > - PMEVN_SWITCH(n, WRITE_PMEVCNTRN); > - isb(); > -} > - > -#define READ_PMEVTYPERN(n) \ > - return read_sysreg(pmevtyper##n##_el0) > -static unsigned long read_pmevtypern(int n) > -{ > - PMEVN_SWITCH(n, READ_PMEVTYPERN); > - return 0; > -} > - > -#define WRITE_PMEVTYPERN(n) \ > - write_sysreg(val, pmevtyper##n##_el0) > -static void write_pmevtypern(int n, unsigned long val) > -{ > - PMEVN_SWITCH(n, WRITE_PMEVTYPERN); > - isb(); > -} > - > /* > * The pmc_accessor structure has pointers to PMEV{CNTR,TYPER}_EL0 > * accessors that test cases will use. 
Each of the accessors will > diff --git a/tools/testing/selftests/kvm/include/aarch64/vpmu.h > b/tools/testing/selftests/kvm/include/aarch64/vpmu.h > index 0a56183644ee..e0cc1ca1c4b7 100644 > --- a/tools/testing/selftests/kvm/include/aarch64/vpmu.h > +++ b/tools/testing/selftests/kvm/include/aarch64/vpmu.h > @@ -1,10 +1,17 @@ > /* SPDX-License-Identifier: GPL-2.0 */ > > #include > +#include > > #define GICD_BASE_GPA 0x800ULL > #define GICR_BASE_GPA 0x80AULL > > +/* The max number of the PMU event counters (excluding the cycle counter) */ > +#define ARMV8_PMU_MAX_GENERAL_COUNTERS (ARMV8_PMU_MAX_COUNTERS - 1) > + > +/* The cycle counter bit position
Re: [PATCH v6 2/6] memcontrol: allows mem_cgroup_iter() to check for onlineness
On Mon, 27 Nov 2023 11:36:59 -0800 Nhat Pham wrote: > The new zswap writeback scheme requires an online-only memcg hierarchy > traversal. Add a new parameter to mem_cgroup_iter() to check for > onlineness before returning. I get a few build errors, perhaps because of patch timing issues... mm/shrinker_debug.c: In function 'shrinker_debugfs_count_show': mm/shrinker_debug.c:64:17: error: too few arguments to function 'mem_cgroup_iter' 64 | memcg = mem_cgroup_iter(NULL, NULL, NULL); | ^~~ In file included from mm/shrinker_debug.c:7: ./include/linux/memcontrol.h:833:20: note: declared here 833 | struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, |^~~ mm/shrinker_debug.c:89:27: error: too few arguments to function 'mem_cgroup_iter' 89 | } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); | ^~~ ./include/linux/memcontrol.h:833:20: note: declared here 833 | struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, |^~~ mm/damon/sysfs-schemes.c: In function 'damon_sysfs_memcg_path_to_id': mm/damon/sysfs-schemes.c:1594:22: error: too few arguments to function 'mem_cgroup_iter' 1594 | for (memcg = mem_cgroup_iter(NULL, NULL, NULL); memcg; | ^~~ In file included from ./include/linux/damon.h:11, from mm/damon/sysfs-common.h:8, from mm/damon/sysfs-schemes.c:10: ./include/linux/memcontrol.h:833:20: note: declared here 833 | struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, |^~~ mm/damon/sysfs-schemes.c:1595:33: error: too few arguments to function 'mem_cgroup_iter' 1595 | memcg = mem_cgroup_iter(NULL, memcg, NULL)) { | ^~~ ./include/linux/memcontrol.h:833:20: note: declared here 833 | struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, |^~~ > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -832,7 +832,7 @@ static inline void mem_cgroup_put(struct mem_cgroup > *memcg) > > struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, > struct mem_cgroup *, > -struct mem_cgroup_reclaim_cookie *); > +struct mem_cgroup_reclaim_cookie *, 
bool > online); How many callsites do we expect to utilize the new `online' argument? Few, I suspect. How about we fix the above and simplify the patch by adding a new mem_cgroup_iter_online() and make mem_cgroup_iter() a one-line wrapper which calls that and adds the online=false argument? I also saw this, didn't investigate. drivers/android/binder_alloc.c: In function 'binder_update_page_range': drivers/android/binder_alloc.c:237:34: error: too few arguments to function 'list_lru_del' 237 | on_lru = list_lru_del(&binder_alloc_lru, &page->lru);
Re: [PATCH v6 6/6] zswap: shrinks zswap pool based on memory pressure
On Mon, 27 Nov 2023 11:37:03 -0800 Nhat Pham wrote: > Currently, we only shrink the zswap pool when the user-defined limit is > hit. This means that if we set the limit too high, cold data that are > unlikely to be used again will reside in the pool, wasting precious > memory. It is hard to predict how much zswap space will be needed ahead > of time, as this depends on the workload (specifically, on factors such > as memory access patterns and compressibility of the memory pages). > > This patch implements a memcg- and NUMA-aware shrinker for zswap, that > is initiated when there is memory pressure. The shrinker does not > have any parameter that must be tuned by the user, and can be opted in > or out on a per-memcg basis. > > Furthermore, to make it more robust for many workloads and prevent > overshrinking (i.e evicting warm pages that might be refaulted into > memory), we build in the following heuristics: > > * Estimate the number of warm pages residing in zswap, and attempt to > protect this region of the zswap LRU. > * Scale the number of freeable objects by an estimate of the memory > saving factor. The better zswap compresses the data, the fewer pages > we will evict to swap (as we will otherwise incur IO for relatively > small memory saving). > * During reclaim, if the shrinker encounters a page that is also being > brought into memory, the shrinker will cautiously terminate its > shrinking action, as this is a sign that it is touching the warmer > region of the zswap LRU. > > As a proof of concept, we ran the following synthetic benchmark: > build the linux kernel in a memory-limited cgroup, and allocate some > cold data in tmpfs to see if the shrinker could write them out and > improved the overall performance. Depending on the amount of cold data > generated, we observe from 14% to 35% reduction in kernel CPU time used > in the kernel builds. > > ... 
> > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -22,6 +22,7 @@ > #include > #include > #include > +#include > #include > > /* Free memory management - zoned buddy allocator. */ > @@ -641,6 +642,7 @@ struct lruvec { > #ifdef CONFIG_MEMCG > struct pglist_data *pgdat; > #endif > + struct zswap_lruvec_state zswap_lruvec_state; Normally we'd put this in #ifdef CONFIG_ZSWAP. > --- a/include/linux/zswap.h > +++ b/include/linux/zswap.h > @@ -5,20 +5,40 @@ > #include > #include > > +struct lruvec; > + > extern u64 zswap_pool_total_size; > extern atomic_t zswap_stored_pages; > > #ifdef CONFIG_ZSWAP > > +struct zswap_lruvec_state { > + /* > + * Number of pages in zswap that should be protected from the shrinker. > + * This number is an estimate of the following counts: > + * > + * a) Recent page faults. > + * b) Recent insertion to the zswap LRU. This includes new zswap stores, > + *as well as recent zswap LRU rotations. > + * > + * These pages are likely to be warm, and might incur IO if the are > written > + * to swap. > + */ > + atomic_long_t nr_zswap_protected; > +}; > + > bool zswap_store(struct folio *folio); > bool zswap_load(struct folio *folio); > void zswap_invalidate(int type, pgoff_t offset); > void zswap_swapon(int type); > void zswap_swapoff(int type); > void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); > - > +void zswap_lruvec_state_init(struct lruvec *lruvec); > +void zswap_lruvec_swapin(struct page *page); > #else > > +struct zswap_lruvec_state {}; But instead you made it an empty struct in this case. That's a bit funky, but I guess OK. It does send a careful reader of struct lruvec over to look at the zswap_lruvec_state definition to understand what's going on. 
> static inline bool zswap_store(struct folio *folio) > { > return false; > @@ -33,7 +53,8 @@ static inline void zswap_invalidate(int type, pgoff_t > offset) {} > static inline void zswap_swapon(int type) {} > static inline void zswap_swapoff(int type) {} > static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} > - > +static inline void zswap_lruvec_init(struct lruvec *lruvec) {} > +static inline void zswap_lruvec_swapin(struct page *page) {} Needed this build fix: --- a/include/linux/zswap.h~zswap-shrinks-zswap-pool-based-on-memory-pressure-fix +++ a/include/linux/zswap.h @@ -54,6 +54,7 @@ static inline void zswap_swapon(int type static inline void zswap_swapoff(int type) {} static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} static inline void zswap_lruvec_init(struct lruvec *lruvec) {} +static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {} static inline void zswap_lruvec_swapin(struct page *page) {} #endif _
Re: [PATCH ipsec-next v1 6/7] bpf: selftests: test_tunnel: Disable CO-RE relocations
On Sun, Nov 26, 2023 at 09:53:04PM -0800, Yonghong Song wrote: > > On 11/27/23 12:44 AM, Yonghong Song wrote: > > > > On 11/26/23 8:52 PM, Eduard Zingerman wrote: > > > On Sun, 2023-11-26 at 18:04 -0600, Daniel Xu wrote: > > > [...] > > > > > Tbh I'm not sure. This test passes with preserve_static_offset > > > > > because it suppresses preserve_access_index. In general clang > > > > > translates bitfield access to a set of IR statements like: > > > > > > > > > > C: > > > > > struct foo { > > > > > unsigned _; > > > > > unsigned a:1; > > > > > ... > > > > > }; > > > > > ... foo->a ... > > > > > > > > > > IR: > > > > > %a = getelementptr inbounds %struct.foo, ptr %0, i32 0, i32 1 > > > > > %bf.load = load i8, ptr %a, align 4 > > > > > %bf.clear = and i8 %bf.load, 1 > > > > > %bf.cast = zext i8 %bf.clear to i32 > > > > > > > > > > With preserve_static_offset the getelementptr+load are replaced by a > > > > > single statement which is preserved as-is till code generation, > > > > > thus load with align 4 is preserved. > > > > > > > > > > On the other hand, I'm not sure that clang guarantees that load or > > > > > stores used for bitfield access would be always aligned according to > > > > > verifier expectations. > > > > > > > > > > I think we should check if there are some clang knobs that prevent > > > > > generation of unaligned memory access. I'll take a look. > > > > Is there a reason to prefer fixing in compiler? I'm not opposed to it, > > > > but the downside to compiler fix is it takes years to propagate and > > > > sprinkles ifdefs into the code. > > > > > > > > Would it be possible to have an analogue of BPF_CORE_READ_BITFIELD()? > > > Well, the contraption below passes verification, tunnel selftest > > > appears to work. I might have messed up some shifts in the macro, > > > though. > > > > I didn't test it. But from high level it should work. 
> > > > > > > > Still, if clang would peek unlucky BYTE_{OFFSET,SIZE} for a particular > > > field access might be unaligned. > > > > clang should pick a sensible BYTE_SIZE/BYTE_OFFSET to meet > > alignment requirement. This is also required for BPF_CORE_READ_BITFIELD. > > > > > > > > --- > > > > > > diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c > > > b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c > > > index 3065a716544d..41cd913ac7ff 100644 > > > --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c > > > +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c > > > @@ -9,6 +9,7 @@ > > > #include "vmlinux.h" > > > #include > > > #include > > > +#include > > > #include "bpf_kfuncs.h" > > > #include "bpf_tracing_net.h" > > > @@ -144,6 +145,38 @@ int ip6gretap_get_tunnel(struct __sk_buff *skb) > > > return TC_ACT_OK; > > > } > > > +#define BPF_CORE_WRITE_BITFIELD(s, field, new_val) ({ \ > > > + void *p = (void *)s + __CORE_RELO(s, field, BYTE_OFFSET); \ > > > + unsigned byte_size = __CORE_RELO(s, field, BYTE_SIZE); \ > > > + unsigned lshift = __CORE_RELO(s, field, LSHIFT_U64); \ > > > + unsigned rshift = __CORE_RELO(s, field, RSHIFT_U64); \ > > > + unsigned bit_size = (rshift - lshift); \ > > > + unsigned long long nval, val, hi, lo; \ > > > + \ > > > + asm volatile("" : "=r"(p) : "0"(p)); \ > > > > Use asm volatile("" : "+r"(p)) ? 
> > > > > + \ > > > + switch (byte_size) { \ > > > + case 1: val = *(unsigned char *)p; break; \ > > > + case 2: val = *(unsigned short *)p; break; \ > > > + case 4: val = *(unsigned int *)p; break; \ > > > + case 8: val = *(unsigned long long *)p; break; \ > > > + } \ > > > + hi = val >> (bit_size + rshift); \ > > > + hi <<= bit_size + rshift; \ > > > + lo = val << (bit_size + lshift); \ > > > + lo >>= bit_size + lshift; \ > > > + nval = new_val; \ > > > + nval <<= lshift; \ > > > + nval >>= rshift; \ > > > + val = hi | nval | lo; \ > > > + switch (byte_size) { \ > > > + case 1: *(unsigned char *)p = val; break; \ > > > + case 2: *(unsigned short *)p = val; break; \ > > > + case 4: *(unsigned int *)p = val; break; \ > > > + case 8: *(unsigned long long *)p = val; break; \ > > > + } \ > > > +}) > > > > I think this should be put in libbpf public header files but not sure > > where to put it. bpf_core_read.h although it is core write? > > > > But on the other hand, this is a uapi struct bitfield write, > >
Re: [PATCH v6 2/6] iommufd: Add IOMMU_HWPT_INVALIDATE
On Fri, Nov 24, 2023 at 02:36:29AM +, Tian, Kevin wrote: > > > > > > >> + * @out_driver_error_code: Report a driver speicifc error code > > upon > > > > > > failure. > > > > > > >> + * It's optional, driver has a choice > > > > > > >> to fill it or > > > > > > >> + * not. > > > > > > > > > > > > > > Being optional how does the user tell whether the code is filled > > > > > > > or > > not? > > > > > > > > Well, naming it "error_code" indicates zero means no error while > > > > non-zero means something? An error return from this ioctl could > > > > also tell the user space to look up for this driver error code, > > > > if it ever cares. > > > > > > probably over-thinking but I'm not sure whether zero is guaranteed to > > > mean no error in all implementations... > > > > Well, you are right. Usually HW conveniently raises a flag in a > > register to indicate something wrong, yet it is probably unsafe > > to say it definitely. > > > > this reminds me one open. What about an implementation having > a hierarchical error code layout e.g. one main error register with > each bit representing an error category then multiple error code > registers each for one error category? In this case probably > a single out_driver_error_code cannot carry that raw information. Hmm, good point. > Instead the iommu driver may need to define a customized error > code convention in uapi header which is converted from the > raw error information. > > From this angle should we simply say that the error code definition > must be included in the uapi header? If raw error information can > be carried by this field then this hw can simply say that the error > code format is same as the hw spec defines. > > With that explicit information then the viommu can easily tell > whether error code is filled or not based on its own convention. That'd be to put this error_code field into the driver uAPI structure right? I also thought about making this out_driver_error_code per HW. 
Yet, an error can be either per array or per entry/request. The array-related error should be reported in the array structure, which is a core uAPI, vs. the per-HW entry structure. Though we could still report an array error in the entry structure at the first entry (or indexed by "array->entry_num")? Thanks Nic
[PATCH] cgroup/cpuset: Expose cpuset.cpus.isolated
The root-only cpuset.cpus.isolated control file shows the current set of isolated CPUs in isolated partitions. This control file is currently exposed only with the cgroup_debug boot command line option which also adds the ".__DEBUG__." prefix. This is actually a useful control file if users want to find out which CPUs are currently in an isolated state by the cpuset controller. Remove CFTYPE_DEBUG flag for this control file and make it available by default without any prefix. The test_cpuset_prs.sh test script and the cgroup-v2.rst documentation file are also updated accordingly. Minor code change is also made in test_cpuset_prs.sh to avoid false test failure when running on debug kernel. Signed-off-by: Waiman Long --- Documentation/admin-guide/cgroup-v2.rst | 7 kernel/cgroup/cpuset.c| 2 +- .../selftests/cgroup/test_cpuset_prs.sh | 32 +++ 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index cf5651a11df8..30f6ff2eba47 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2316,6 +2316,13 @@ Cpuset Interface Files treated to have an implicit value of "cpuset.cpus" in the formation of local partition. + cpuset.cpus.isolated + A read-only and root cgroup only multiple values file. + + This file shows the set of all isolated CPUs used in existing + isolated partitions. It will be empty if no isolated partition + is created. + cpuset.cpus.partition A read-write single value file which exists on non-root cpuset-enabled cgroups. 
This flag is owned by the parent cgroup diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 1bad4007ff4b..2a16df86c55c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3974,7 +3974,7 @@ static struct cftype dfl_files[] = { .name = "cpus.isolated", .seq_show = cpuset_common_seq_show, .private = FILE_ISOLATED_CPULIST, - .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG, + .flags = CFTYPE_ONLY_ON_ROOT, }, { } /* terminate */ diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index 7b7c4c2b6d85..b5eb1be2248c 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -508,7 +508,7 @@ dump_states() XECPUS=$DIR/cpuset.cpus.exclusive.effective PRS=$DIR/cpuset.cpus.partition PCPUS=$DIR/.__DEBUG__.cpuset.cpus.subpartitions - ISCPUS=$DIR/.__DEBUG__.cpuset.cpus.isolated + ISCPUS=$DIR/cpuset.cpus.isolated [[ -e $CPUS ]] && echo "$CPUS: $(cat $CPUS)" [[ -e $XCPUS ]] && echo "$XCPUS: $(cat $XCPUS)" [[ -e $ECPUS ]] && echo "$ECPUS: $(cat $ECPUS)" @@ -593,17 +593,17 @@ check_cgroup_states() # # Get isolated (including offline) CPUs by looking at -# /sys/kernel/debug/sched/domains and *cpuset.cpus.isolated control file, +# /sys/kernel/debug/sched/domains and cpuset.cpus.isolated control file, # if available, and compare that with the expected value. # # Note that isolated CPUs from the sched/domains context include offline # CPUs as well as CPUs in non-isolated 1-CPU partition. Those CPUs may -# not be included in the *cpuset.cpus.isolated control file which contains +# not be included in the cpuset.cpus.isolated control file which contains # only CPUs in isolated partitions. 
# # $1 - expected isolated cpu list(s) <isolcpus1>{,<isolcpus2>} # <isolcpus1> - expected sched/domains value -# <isolcpus2> - *cpuset.cpus.isolated value = <isolcpus1> if not defined +# <isolcpus2> - cpuset.cpus.isolated value = <isolcpus1> if not defined # check_isolcpus() { @@ -611,7 +611,7 @@ check_isolcpus() ISOLCPUS= LASTISOLCPU= SCHED_DOMAINS=/sys/kernel/debug/sched/domains - ISCPUS=${CGROUP2}/.__DEBUG__.cpuset.cpus.isolated + ISCPUS=${CGROUP2}/cpuset.cpus.isolated if [[ $EXPECT_VAL = . ]] then EXPECT_VAL= @@ -692,14 +692,18 @@ test_fail() null_isolcpus_check() { [[ $VERBOSE -gt 0 ]] || return 0 - pause 0.02 - check_isolcpus "." - if [[ $? -ne 0 ]] - then - echo "Unexpected isolated CPUs: $ISOLCPUS" - dump_states - exit 1 - fi + # Retry a few times before printing error + RETRY=0 + while [[ $RETRY -lt 5 ]] + do + pause 0.01 + check_isolcpus "." + [[ $? -eq 0 ]] && return 0 + ((RETRY++)) + done + echo "Unexpected isolated CPUs: $ISOLCPUS" + dump_states + exit 1 } # @@ -776,7 +780,7 @@ run_state_test() # NEWLIST=$(cat cpuset.cpus.effective)
[PATCH v6 6/6] zswap: shrinks zswap pool based on memory pressure
Currently, we only shrink the zswap pool when the user-defined limit is hit. This means that if we set the limit too high, cold data that are unlikely to be used again will reside in the pool, wasting precious memory. It is hard to predict how much zswap space will be needed ahead of time, as this depends on the workload (specifically, on factors such as memory access patterns and compressibility of the memory pages). This patch implements a memcg- and NUMA-aware shrinker for zswap, that is initiated when there is memory pressure. The shrinker does not have any parameter that must be tuned by the user, and can be opted in or out on a per-memcg basis. Furthermore, to make it more robust for many workloads and prevent overshrinking (i.e evicting warm pages that might be refaulted into memory), we build in the following heuristics: * Estimate the number of warm pages residing in zswap, and attempt to protect this region of the zswap LRU. * Scale the number of freeable objects by an estimate of the memory saving factor. The better zswap compresses the data, the fewer pages we will evict to swap (as we will otherwise incur IO for relatively small memory saving). * During reclaim, if the shrinker encounters a page that is also being brought into memory, the shrinker will cautiously terminate its shrinking action, as this is a sign that it is touching the warmer region of the zswap LRU. As a proof of concept, we ran the following synthetic benchmark: build the linux kernel in a memory-limited cgroup, and allocate some cold data in tmpfs to see if the shrinker could write them out and improved the overall performance. Depending on the amount of cold data generated, we observe from 14% to 35% reduction in kernel CPU time used in the kernel builds. 
Signed-off-by: Nhat Pham --- Documentation/admin-guide/mm/zswap.rst | 7 + include/linux/mmzone.h | 2 + include/linux/zswap.h | 25 +++- mm/mmzone.c| 1 + mm/swap_state.c| 2 + mm/zswap.c | 177 - 6 files changed, 208 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst index 45b98390e938..522ae22ccb84 100644 --- a/Documentation/admin-guide/mm/zswap.rst +++ b/Documentation/admin-guide/mm/zswap.rst @@ -153,6 +153,13 @@ attribute, e. g.:: Setting this parameter to 100 will disable the hysteresis. +When there is a sizable amount of cold memory residing in the zswap pool, it +can be advantageous to proactively write these cold pages to swap and reclaim +the memory for other use cases. By default, the zswap shrinker is disabled. +User can enable it as follows: + + echo Y > /sys/module/zswap/parameters/shrinker_enabled + A debugfs interface is provided for various statistic about pool size, number of pages stored, same-value filled pages and various counters for the reasons pages are rejected. diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7b1816450bfc..b23bc5390240 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -22,6 +22,7 @@ #include #include #include +#include #include /* Free memory management - zoned buddy allocator. */ @@ -641,6 +642,7 @@ struct lruvec { #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif + struct zswap_lruvec_state zswap_lruvec_state; }; /* Isolate for asynchronous migration */ diff --git a/include/linux/zswap.h b/include/linux/zswap.h index e571e393669b..cbd373ba88d2 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -5,20 +5,40 @@ #include #include +struct lruvec; + extern u64 zswap_pool_total_size; extern atomic_t zswap_stored_pages; #ifdef CONFIG_ZSWAP +struct zswap_lruvec_state { + /* +* Number of pages in zswap that should be protected from the shrinker. 
+* This number is an estimate of the following counts: +* +* a) Recent page faults. +* b) Recent insertion to the zswap LRU. This includes new zswap stores, +*as well as recent zswap LRU rotations. +* +* These pages are likely to be warm, and might incur IO if they are written +* to swap. +*/ + atomic_long_t nr_zswap_protected; +}; + bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); void zswap_swapon(int type); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); - +void zswap_lruvec_state_init(struct lruvec *lruvec); +void zswap_lruvec_swapin(struct page *page); #else +struct zswap_lruvec_state {}; + static inline bool zswap_store(struct folio *folio) { return false; @@ -33,7 +53,8 @@ static inline void zswap_invalidate(int type, pgoff_t offset) {} static inline void zswap_swapon(int type) {} static inline void
[PATCH v6 1/6] list_lru: allows explicit memcg and NUMA node selection
The interface of list_lru is based on the assumption that the list node and the data it represents are allocated on the correct node/memcg. While this assumption is valid for existing slab object LRUs such as dentries and inodes, it is undocumented, and rather inflexible for certain potential list_lru users (such as the upcoming zswap shrinker and the THP shrinker). It has caused us a lot of issues during our development. This patch changes the list_lru interface so that the caller must explicitly specify the NUMA node and memcg when adding and removing objects. The old list_lru_add() and list_lru_del() are renamed to list_lru_add_obj() and list_lru_del_obj(), respectively. It also extends the list_lru API with a new function, list_lru_putback, which undoes a previous list_lru_isolate call. Unlike list_lru_add, it does not increment the LRU node count (as list_lru_isolate does not decrement the node count). list_lru_putback also allows for explicit memcg and NUMA node selection. Suggested-by: Johannes Weiner Signed-off-by: Nhat Pham --- drivers/android/binder_alloc.c | 5 ++-- fs/dcache.c| 8 +++-- fs/gfs2/quota.c| 6 ++-- fs/inode.c | 4 +-- fs/nfs/nfs42xattr.c| 8 ++--- fs/nfsd/filecache.c| 4 +-- fs/xfs/xfs_buf.c | 6 ++-- fs/xfs/xfs_dquot.c | 2 +- fs/xfs/xfs_qm.c| 2 +- include/linux/list_lru.h | 54 -- mm/list_lru.c | 48 +- mm/workingset.c| 4 +-- 12 files changed, 116 insertions(+), 35 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 138f6d43d13b..e80669d4e037 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -285,7 +285,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, trace_binder_free_lru_start(alloc, index); - ret = list_lru_add(&binder_alloc_lru, &page->lru); + ret = list_lru_add_obj(&binder_alloc_lru, &page->lru); WARN_ON(!ret); trace_binder_free_lru_end(alloc, index); @@ -848,7 +848,7 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) if 
 (!alloc->pages[i].page_ptr)
 			continue;
 
-		on_lru = list_lru_del(&binder_alloc_lru,
+		on_lru = list_lru_del_obj(&binder_alloc_lru,
 				      &alloc->pages[i].lru);
 		page_addr = alloc->buffer + i * PAGE_SIZE;
 		binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC,
@@ -1287,4 +1287,3 @@ int binder_alloc_copy_from_buffer(struct binder_alloc *alloc,
 	return binder_alloc_do_buffer_copy(alloc, false, buffer, buffer_offset,
 					   dest, bytes);
 }
-
diff --git a/fs/dcache.c b/fs/dcache.c
index c82ae731df9a..2ba37643b9c5 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -428,7 +428,8 @@ static void d_lru_add(struct dentry *dentry)
 	this_cpu_inc(nr_dentry_unused);
 	if (d_is_negative(dentry))
 		this_cpu_inc(nr_dentry_negative);
-	WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
+	WARN_ON_ONCE(!list_lru_add_obj(
+			&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
 }
 
 static void d_lru_del(struct dentry *dentry)
@@ -438,7 +439,8 @@ static void d_lru_del(struct dentry *dentry)
 	this_cpu_dec(nr_dentry_unused);
 	if (d_is_negative(dentry))
 		this_cpu_dec(nr_dentry_negative);
-	WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
+	WARN_ON_ONCE(!list_lru_del_obj(
+			&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
 }
 
 static void d_shrink_del(struct dentry *dentry)
@@ -1240,7 +1242,7 @@ static enum lru_status dentry_lru_isolate(struct list_head *item,
 	 *
 	 * This is guaranteed by the fact that all LRU management
 	 * functions are intermediated by the LRU API calls like
-	 * list_lru_add and list_lru_del. List movement in this file
+	 * list_lru_add_obj and list_lru_del_obj. List movement in this file
 	 * only ever occur through this functions or through callbacks
 	 * like this one, that are called from the LRU API.
 	 *
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 95dae7838b4e..b57f8c7b35be 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -271,7 +271,7 @@ static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash,
 		if (qd->qd_sbd != sdp)
 			continue;
 		if (lockref_get_not_dead(&qd->qd_lockref)) {
-			list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
+			list_lru_del_obj(&gfs2_qd_lru, &qd->qd_lru);
 			return qd;
 		}
[PATCH v6 0/6] workload-specific and memory pressure-driven zswap writeback
Changelog:
v6:
* Rebase on top of latest mm-unstable.
* Fix/improve the in-code documentation of the new list_lru manipulation functions (patch 1)
v5:
* Replace reference getting with an rcu_read_lock() section for zswap lru modifications (suggested by Yosry)
* Add a new prep patch that allows mem_cgroup_iter() to return online cgroup.
* Add a callback that updates pool->next_shrink when the cgroup is offlined (suggested by Yosry Ahmed, Johannes Weiner)
v4:
* Rename list_lru_add to list_lru_add_obj and __list_lru_add to list_lru_add (patch 1) (suggested by Johannes Weiner and Yosry Ahmed)
* Some cleanups on the memcg aware LRU patch (patch 2) (suggested by Yosry Ahmed)
* Use event interface for the new per-cgroup writeback counters. (patch 3) (suggested by Yosry Ahmed)
* Abstract zswap's lruvec states and handling into zswap_lruvec_state (patch 5) (suggested by Yosry Ahmed)
v3:
* Add a patch to export per-cgroup zswap writeback counters
* Add a patch to update zswap's kselftest
* Separate the new list_lru functions into its own prep patch
* Do not start from the top of the hierarchy when encountering a memcg that is not online for the global limit zswap writeback (patch 2) (suggested by Yosry Ahmed)
* Do not remove the swap entry from list_lru in __read_swapcache_async() (patch 2) (suggested by Yosry Ahmed)
* Removed a redundant zswap pool getting (patch 2) (reported by Ryan Roberts)
* Use atomic for the nr_zswap_protected (instead of lruvec's lock) (patch 5) (suggested by Yosry Ahmed)
* Remove the per-cgroup zswap shrinker knob (patch 5) (suggested by Yosry Ahmed)
v2:
* Fix loongarch compiler errors
* Use pool stats instead of memcg stats when !CONFIG_MEMCG_KMEM

There are currently several issues with zswap writeback:

1. There is only a single global LRU for zswap, making it impossible to perform workload-specific shrinking - a memcg under memory pressure cannot determine which pages in the pool it owns, and often ends up writing pages from other memcgs.
This issue has been previously observed in practice and mitigated by simply disabling memcg-initiated shrinking:

https://lore.kernel.org/all/20230530232435.3097106-1-npha...@gmail.com/T/#u

But this solution leaves a lot to be desired, as we still do not have an avenue for a memcg to free up its own memory locked up in the zswap pool.

2. We only shrink the zswap pool when the user-defined limit is hit. This means that if we set the limit too high, cold data that are unlikely to be used again will reside in the pool, wasting precious memory. It is hard to predict how much zswap space will be needed ahead of time, as this depends on the workload (specifically, on factors such as memory access patterns and compressibility of the memory pages).

This patch series solves these issues by separating the global zswap LRU into per-memcg and per-NUMA LRUs, and performs workload-specific (i.e. memcg- and NUMA-aware) zswap writeback under memory pressure. The new shrinker does not have any parameter that must be tuned by the user, and can be opted in or out on a per-memcg basis.

As a proof of concept, we ran the following synthetic benchmark: build the Linux kernel in a memory-limited cgroup, and allocate some cold data in tmpfs to see if the shrinker could write them out and improve the overall performance. Depending on the amount of cold data generated, we observe from 14% to 35% reduction in kernel CPU time used in the kernel builds.
Domenico Cerasuolo (3):
  zswap: make shrinking memcg-aware
  mm: memcg: add per-memcg zswap writeback stat
  selftests: cgroup: update per-memcg zswap writeback selftest

Nhat Pham (3):
  list_lru: allows explicit memcg and NUMA node selection
  memcontrol: allows mem_cgroup_iter() to check for onlineness
  zswap: shrinks zswap pool based on memory pressure

 Documentation/admin-guide/mm/zswap.rst |   7 +
 drivers/android/binder_alloc.c         |   5 +-
 fs/dcache.c                            |   8 +-
 fs/gfs2/quota.c                        |   6 +-
 fs/inode.c                             |   4 +-
 fs/nfs/nfs42xattr.c                    |   8 +-
 fs/nfsd/filecache.c                    |   4 +-
 fs/xfs/xfs_buf.c                       |   6 +-
 fs/xfs/xfs_dquot.c                     |   2 +-
 fs/xfs/xfs_qm.c                        |   2 +-
 include/linux/list_lru.h               |  54 ++-
 include/linux/memcontrol.h             |   9 +-
 include/linux/mmzone.h                 |   2 +
 include/linux/vm_event_item.h          |   1 +
 include/linux/zswap.h                  |  27 +-
 mm/list_lru.c                          |  48 ++-
 mm/memcontrol.c                        |  20 +-
 mm/mmzone.c                            |   1 +
 mm/shrinker.c
[PATCH v6 4/6] mm: memcg: add per-memcg zswap writeback stat
From: Domenico Cerasuolo

Since zswap now writes back pages from memcg-specific LRUs, we now need a new stat to show writebacks count for each memcg.

Suggested-by: Nhat Pham
Signed-off-by: Domenico Cerasuolo
Signed-off-by: Nhat Pham
---
 include/linux/vm_event_item.h | 1 +
 mm/memcontrol.c               | 1 +
 mm/vmstat.c                   | 1 +
 mm/zswap.c                    | 3 +++
 4 files changed, 6 insertions(+)

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index d1b847502f09..f4569ad98edf 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -142,6 +142,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #ifdef CONFIG_ZSWAP
 	ZSWPIN,
 	ZSWPOUT,
+	ZSWP_WB,
 #endif
 #ifdef CONFIG_X86
 	DIRECT_MAP_LEVEL2_SPLIT,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0399aec8c0e3..a82f9c695c6c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -703,6 +703,7 @@ static const unsigned int memcg_vm_event_stat[] = {
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
 	ZSWPIN,
 	ZSWPOUT,
+	ZSWP_WB,
 #endif
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	THP_FAULT_ALLOC,
diff --git a/mm/vmstat.c b/mm/vmstat.c
index afa5a38fcc9c..2249f85e4a87 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1401,6 +1401,7 @@ const char * const vmstat_text[] = {
 #ifdef CONFIG_ZSWAP
 	"zswpin",
 	"zswpout",
+	"zswp_wb",
 #endif
 #ifdef CONFIG_X86
 	"direct_map_level2_splits",
diff --git a/mm/zswap.c b/mm/zswap.c
index e441cbcab9a9..9f5142524d48 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -754,6 +754,9 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
 	}
 	zswap_written_back_pages++;
 
+	if (entry->objcg)
+		count_objcg_event(entry->objcg, ZSWP_WB);
+
 	/*
 	 * Writeback started successfully, the page now belongs to the
 	 * swapcache. Drop the entry from zswap - unless invalidate already
-- 
2.34.1
[PATCH v6 3/6] zswap: make shrinking memcg-aware
From: Domenico Cerasuolo

Currently, we only have a single global LRU for zswap. This makes it impossible to perform workload-specific shrinking - a memcg cannot determine which pages in the pool it owns, and often ends up writing pages from other memcgs. This issue has been previously observed in practice and mitigated by simply disabling memcg-initiated shrinking:

https://lore.kernel.org/all/20230530232435.3097106-1-npha...@gmail.com/T/#u

This patch fully resolves the issue by replacing the global zswap LRU with memcg- and NUMA-specific LRUs, and modifies the reclaim logic:

a) When a store attempt hits a memcg limit, it now triggers a synchronous reclaim attempt that, if successful, allows the new hotter page to be accepted by zswap.
b) If the store attempt instead hits the global zswap limit, it will trigger an asynchronous reclaim attempt, in which a memcg is selected for reclaim in a round-robin-like fashion.

Signed-off-by: Domenico Cerasuolo
Co-developed-by: Nhat Pham
Signed-off-by: Nhat Pham
---
 include/linux/memcontrol.h |   5 +
 include/linux/zswap.h      |   2 +
 mm/memcontrol.c            |   2 +
 mm/swap.h                  |   3 +-
 mm/swap_state.c            |  24 +++-
 mm/zswap.c                 | 248 +
 6 files changed, 223 insertions(+), 61 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 86adce081a08..83590fd0d6d1 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1187,6 +1187,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
 	return NULL;
 }
 
+static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
+{
+	return NULL;
+}
+
 static inline bool folio_memcg_kmem(struct folio *folio)
 {
 	return false;
diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 2a60ce39cfde..e571e393669b 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -15,6 +15,7 @@ bool zswap_load(struct folio *folio);
 void zswap_invalidate(int type, pgoff_t offset);
 void zswap_swapon(int type);
 void zswap_swapoff(int type);
+void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg);
 
 #else
 
@@ -31,6 +32,7 @@ static inline bool zswap_load(struct folio *folio)
 static inline void zswap_invalidate(int type, pgoff_t offset) {}
 static inline void zswap_swapon(int type) {}
 static inline void zswap_swapoff(int type) {}
+static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {}
 
 #endif
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a1f051adaa15..0399aec8c0e3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5617,6 +5617,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	page_counter_set_min(&memcg->memory, 0);
 	page_counter_set_low(&memcg->memory, 0);
 
+	zswap_memcg_offline_cleanup(memcg);
+
 	memcg_offline_kmem(memcg);
 	reparent_shrinker_deferred(memcg);
 	wb_memcg_offline(memcg);
diff --git a/mm/swap.h b/mm/swap.h
index 73c332ee4d91..c0dc73e10e91 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -51,7 +51,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 				   struct swap_iocb **plug);
 struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 				     struct mempolicy *mpol, pgoff_t ilx,
-				     bool *new_page_allocated);
+				     bool *new_page_allocated,
+				     bool skip_if_exists);
 struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
 				    struct mempolicy *mpol, pgoff_t ilx);
 struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 85d9e5806a6a..6c84236382f3 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -412,7 +412,8 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
 
 struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 				     struct mempolicy *mpol, pgoff_t ilx,
-				     bool *new_page_allocated)
+				     bool *new_page_allocated,
+				     bool skip_if_exists)
 {
 	struct swap_info_struct *si;
 	struct folio *folio;
@@ -470,6 +471,17 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		if (err != -EEXIST)
 			goto fail_put_swap;
 
+		/*
+		 * Protect against a recursive call
 to __read_swap_cache_async()
+		 * on the same entry waiting forever here because SWAP_HAS_CACHE
+		 * is set but the folio is not the swap cache yet. This can
+		 * happen today if mem_cgroup_swapin_charge_folio() below
+		 * triggers reclaim through zswap, which
Re: [PATCH v3 06/25] KVM: arm64: Save/restore POE registers
On Fri, 24 Nov 2023 16:34:51 +, Joey Gouly wrote: > > Define the new system registers that POE introduces and context switch them. I would really like to see a discussion on the respective lifetimes of these two registers (see below). > > Signed-off-by: Joey Gouly > Cc: Marc Zyngier > Cc: Oliver Upton > Cc: Catalin Marinas > Cc: Will Deacon > --- > arch/arm64/include/asm/kvm_arm.h | 4 ++-- > arch/arm64/include/asm/kvm_host.h | 4 > arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 10 ++ > arch/arm64/kvm/sys_regs.c | 2 ++ > 4 files changed, 18 insertions(+), 2 deletions(-) > > diff --git a/arch/arm64/include/asm/kvm_arm.h > b/arch/arm64/include/asm/kvm_arm.h > index b85f46a73e21..597470e0b87b 100644 > --- a/arch/arm64/include/asm/kvm_arm.h > +++ b/arch/arm64/include/asm/kvm_arm.h > @@ -346,14 +346,14 @@ > */ > #define __HFGRTR_EL2_RES0(GENMASK(63, 56) | GENMASK(53, 51)) > #define __HFGRTR_EL2_MASKGENMASK(49, 0) > -#define __HFGRTR_EL2_nMASK (GENMASK(58, 57) | GENMASK(55, 54) | BIT(50)) > +#define __HFGRTR_EL2_nMASK (GENMASK(60, 57) | GENMASK(55, 54) | BIT(50)) > > #define __HFGWTR_EL2_RES0(GENMASK(63, 56) | GENMASK(53, 51) |\ >BIT(46) | BIT(42) | BIT(40) | BIT(28) | \ >GENMASK(26, 25) | BIT(21) | BIT(18) | \ >GENMASK(15, 14) | GENMASK(10, 9) | BIT(2)) > #define __HFGWTR_EL2_MASKGENMASK(49, 0) > -#define __HFGWTR_EL2_nMASK (GENMASK(58, 57) | GENMASK(55, 54) | BIT(50)) > +#define __HFGWTR_EL2_nMASK (GENMASK(60, 57) | GENMASK(55, 54) | BIT(50)) > > #define __HFGITR_EL2_RES0GENMASK(63, 57) > #define __HFGITR_EL2_MASKGENMASK(54, 0) > diff --git a/arch/arm64/include/asm/kvm_host.h > b/arch/arm64/include/asm/kvm_host.h > index 824f29f04916..fa9ebd8fce40 100644 > --- a/arch/arm64/include/asm/kvm_host.h > +++ b/arch/arm64/include/asm/kvm_host.h > @@ -401,6 +401,10 @@ enum vcpu_sysreg { > PIR_EL1, /* Permission Indirection Register 1 (EL1) */ > PIRE0_EL1, /* Permission Indirection Register 0 (EL1) */ > > + /* Permission Overlay Extension registers */ > + POR_EL1,/* Permission 
Overlay Register 1 (EL1) */ > + POR_EL0,/* Permission Overlay Register 0 (EL0) */ > + > /* 32bit specific registers. */ > DACR32_EL2, /* Domain Access Control Register */ > IFSR32_EL2, /* Instruction Fault Status Register */ > diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h > b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h > index bb6b571ec627..22f07ee43e7e 100644 > --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h > +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h > @@ -19,6 +19,9 @@ > static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt) > { > ctxt_sys_reg(ctxt, MDSCR_EL1) = read_sysreg(mdscr_el1); > + > + if (system_supports_poe()) > + ctxt_sys_reg(ctxt, POR_EL0) = read_sysreg_s(SYS_POR_EL0); So this is saved as eagerly as it gets. Why? If it only affects EL0, it can be saved/restored in a much lazier way. > } > > static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt) > @@ -59,6 +62,8 @@ static inline void __sysreg_save_el1_state(struct > kvm_cpu_context *ctxt) > ctxt_sys_reg(ctxt, PIR_EL1) = read_sysreg_el1(SYS_PIR); > ctxt_sys_reg(ctxt, PIRE0_EL1) = read_sysreg_el1(SYS_PIRE0); And the fact that you only touch PIRE0_EL1 here seems to be a good indication that the above can be relaxed. > } > + if (system_supports_poe()) nit: missing new line before the if(). > + ctxt_sys_reg(ctxt, POR_EL1) = read_sysreg_el1(SYS_POR); > ctxt_sys_reg(ctxt, PAR_EL1) = read_sysreg_par(); > ctxt_sys_reg(ctxt, TPIDR_EL1) = read_sysreg(tpidr_el1); > > @@ -89,6 +94,9 @@ static inline void __sysreg_save_el2_return_state(struct > kvm_cpu_context *ctxt) > static inline void __sysreg_restore_common_state(struct kvm_cpu_context > *ctxt) > { > write_sysreg(ctxt_sys_reg(ctxt, MDSCR_EL1), mdscr_el1); > + > + if (system_supports_poe()) > + write_sysreg_s(ctxt_sys_reg(ctxt, POR_EL0), SYS_POR_EL0); Same thing here about the eager restore. 
> } > > static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) > @@ -135,6 +143,8 @@ static inline void __sysreg_restore_el1_state(struct > kvm_cpu_context *ctxt) > write_sysreg_el1(ctxt_sys_reg(ctxt, PIR_EL1), SYS_PIR); > write_sysreg_el1(ctxt_sys_reg(ctxt, PIRE0_EL1), SYS_PIRE0); > } > + if (system_supports_poe()) new line. > + write_sysreg_el1(ctxt_sys_reg(ctxt, POR_EL1), SYS_POR); > write_sysreg(ctxt_sys_reg(ctxt, PAR_EL1), par_el1); > write_sysreg(ctxt_sys_reg(ctxt, TPIDR_EL1), tpidr_el1); > > diff --git a/arch/arm64/kvm/sys_regs.c
Re: [PATCH net 4/4] selftests/net: mptcp: fix uninitialized variable warnings
On 27/11/2023 16:46, Willem de Bruijn wrote: > Matthieu Baerts wrote: >> Hi Willem, >> >> (+ cc MPTCP list) >> >> On 24/11/2023 18:15, Willem de Bruijn wrote: >>> From: Willem de Bruijn >>> >>> Same init_rng() in both tests. The function reads /dev/urandom to >>> initialize srand(). In case of failure, it falls back onto the >>> entropy in the uninitialized variable. Not sure if this is on purpose. >>> But failure reading urandom should be rare, so just fail hard. While >>> at it, convert to getrandom(). Which man 4 random suggests is simpler >>> and more robust. >>> >>> mptcp_inq.c:525:6: >>> mptcp_connect.c:1131:6: >>> >>> error: variable 'foo' is used uninitialized >>> whenever 'if' condition is false >>> [-Werror,-Wsometimes-uninitialized] >> >> Thank you for the patch! >> >> It looks good to me: >> >> Reviewed-by: Matthieu Baerts >> >>> Fixes: 048d19d444be ("mptcp: add basic kselftest for mptcp") >>> Fixes: b51880568f20 ("selftests: mptcp: add inq test case") >>> Cc: Florian Westphal >>> Signed-off-by: Willem de Bruijn >>> >>> >>> >>> When input is randomized because this is expected to meaningfully >>> explore edge cases, should we also add >>> 1. logging the random seed to stdout and >>> 2. adding a command line argument to replay from a specific seed >>> I can do this in net-next, if authors find it useful in this case. >> >> I think we should have done that from the beginning, otherwise we cannot >> easily reproduce these edge cases. To be honest, I don't think this >> technique helped to find bugs, and it was probably used here as a good >> habit to increase the coverage. But on the other hand, we might not >> realise some inputs are randomised and can cause instabilities in the >> tests because we don't print anything about that. >> >> So I would say that the minimal thing to do is to log the random seed. 
>> But it might not be that easy to do, for example 'mptcp_connect' is used >> a lot of time by the .sh scripts: printing this seed number each time >> 'mptcp_connect' is started will "flood" the logs. Maybe we should only >> print that at the end, in case of errors: e.g. in xerror() and >> die_perror() for example, but I see 'exit(1)' is directly used in other >> places... >> >> That's more code to change, but if it is still OK for you to do that, >> please also note that you will need to log this to stderr: mptcp_connect >> prints what has been received from the other peer to stdout. >> >> Because it is more than just adding a 'printf()', I just created a >> ticket in our bug tracker, so anybody can look at that and check all the >> details about that: >> >> https://github.com/multipath-tcp/mptcp_net-next/issues/462 > > Thanks for the detailed feedback, Matthieu! > > Another option to avoid flooding the logs might be to choose a pseudo > random number in the script and pass the explicit value mptcp_connect. Good idea! If I understood correctly, from the .c file, we can check if an env var is set (e.g. `MPTCP_RND_SEED`) and use it. If not, we generate a random one like before. The .sh scripts should generate a random number if the env var is not already set. In any case, this seed should be printed by the scripts. > I haven't looked closely, but for transport layer tests it is likely > that the payload is entirely ignored. Bar perhaps checksum coverage. > If it does not increase code coverage, randomization can also just be > turned off. Here the randomisation is used to change the length of the data that are exchanged or to do some actions in different orders. I think it still makes sense to have randomisation. But in case of issues around that, it might not be clear what the userspace was exactly doing. That's what can be improved later in net-next. Cheers, Matt
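The script-side half of this idea could be sketched roughly as below. Note that `MPTCP_RND_SEED` is an illustrative variable name assumed for this sketch, not an existing knob in the selftests:

```shell
#!/bin/bash
# Sketch: the .sh wrapper picks (or inherits) a single seed, exports it,
# and prints it once to stderr so a failing run can be replayed, instead
# of every mptcp_connect invocation flooding the logs.
# MPTCP_RND_SEED is an assumed name, not an existing interface.

if [ -z "${MPTCP_RND_SEED}" ]; then
	MPTCP_RND_SEED=$RANDOM
fi
export MPTCP_RND_SEED
echo "random seed: ${MPTCP_RND_SEED}" >&2

# The C helper would then do the equivalent of:
#	env = getenv("MPTCP_RND_SEED");
#	srand(env ? strtoul(env, NULL, 0) : random_seed());
```

Printing to stderr keeps stdout free for the peer data, as noted above.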
Re: [PATCH net-next 0/5] selftests: tc-testing: updates and cleanups for tdc
On Fri, Nov 24, 2023 at 10:43 AM Pedro Tammela wrote: > > Address the recommendations from the previous series and cleanup some > leftovers. > > Pedro Tammela (5): > selftests: tc-testing: remove buildebpf plugin > selftests: tc-testing: remove unnecessary time.sleep > selftests: tc-testing: prefix iproute2 functions with "ipr2" > selftests: tc-testing: cleanup on Ctrl-C > selftests: tc-testing: remove unused import > > tools/testing/selftests/tc-testing/Makefile | 29 +--- > tools/testing/selftests/tc-testing/README | 2 - > .../testing/selftests/tc-testing/action-ebpf | Bin 0 -> 856 bytes > .../tc-testing/plugin-lib/buildebpfPlugin.py | 67 -- > .../tc-testing/plugin-lib/nsPlugin.py | 20 +++--- > .../tc-testing/tc-tests/actions/bpf.json | 14 ++-- > .../tc-testing/tc-tests/filters/bpf.json | 10 ++- > tools/testing/selftests/tc-testing/tdc.py | 11 ++- > tools/testing/selftests/tc-testing/tdc.sh | 2 +- > 9 files changed, 25 insertions(+), 130 deletions(-) > create mode 100644 tools/testing/selftests/tc-testing/action-ebpf > delete mode 100644 > tools/testing/selftests/tc-testing/plugin-lib/buildebpfPlugin.py For the patch series: Acked-by: Jamal Hadi Salim cheers, jamal > -- > 2.40.1 >
Re: [PATCH net 4/4] selftests/net: mptcp: fix uninitialized variable warnings
Matthieu Baerts wrote: > Hi Willem, > > (+ cc MPTCP list) > > On 24/11/2023 18:15, Willem de Bruijn wrote: > > From: Willem de Bruijn > > > > Same init_rng() in both tests. The function reads /dev/urandom to > > initialize srand(). In case of failure, it falls back onto the > > entropy in the uninitialized variable. Not sure if this is on purpose. > > But failure reading urandom should be rare, so just fail hard. While > > at it, convert to getrandom(). Which man 4 random suggests is simpler > > and more robust. > > > > mptcp_inq.c:525:6: > > mptcp_connect.c:1131:6: > > > > error: variable 'foo' is used uninitialized > > whenever 'if' condition is false > > [-Werror,-Wsometimes-uninitialized] > > Thank you for the patch! > > It looks good to me: > > Reviewed-by: Matthieu Baerts > > > Fixes: 048d19d444be ("mptcp: add basic kselftest for mptcp") > > Fixes: b51880568f20 ("selftests: mptcp: add inq test case") > > Cc: Florian Westphal > > Signed-off-by: Willem de Bruijn > > > > > > > > When input is randomized because this is expected to meaningfully > > explore edge cases, should we also add > > 1. logging the random seed to stdout and > > 2. adding a command line argument to replay from a specific seed > > I can do this in net-next, if authors find it useful in this case. > > I think we should have done that from the beginning, otherwise we cannot > easily reproduce these edge cases. To be honest, I don't think this > technique helped to find bugs, and it was probably used here as a good > habit to increase the coverage. But on the other hand, we might not > realise some inputs are randomised and can cause instabilities in the > tests because we don't print anything about that. > > So I would say that the minimal thing to do is to log the random seed. > But it might not be that easy to do, for example 'mptcp_connect' is used > a lot of time by the .sh scripts: printing this seed number each time > 'mptcp_connect' is started will "flood" the logs. 
Maybe we should only > print that at the end, in case of errors: e.g. in xerror() and > die_perror() for example, but I see 'exit(1)' is directly used in other > places... > > That's more code to change, but if it is still OK for you to do that, > please also note that you will need to log this to stderr: mptcp_connect > prints what has been received from the other peer to stdout. > > Because it is more than just adding a 'printf()', I just created a > ticket in our bug tracker, so anybody can look at that and check all the > details about that: > > https://github.com/multipath-tcp/mptcp_net-next/issues/462 Thanks for the detailed feedback, Matthieu! Another option to avoid flooding the logs might be to choose a pseudo random number in the script and pass the explicit value mptcp_connect. I haven't looked closely, but for transport layer tests it is likely that the payload is entirely ignored. Bar perhaps checksum coverage. If it does not increase code coverage, randomization can also just be turned off. > > --- > > tools/testing/selftests/net/mptcp/mptcp_connect.c | 11 --- > > tools/testing/selftests/net/mptcp/mptcp_inq.c | 11 --- > > 2 files changed, 8 insertions(+), 14 deletions(-) > > > > diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c > > b/tools/testing/selftests/net/mptcp/mptcp_connect.c > > index c7f9ebeebc2c5..d2043ec3bf6d6 100644 > > --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c > > +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c > > (...) > > > @@ -1125,15 +1126,11 @@ int main_loop_s(int listensock) > > > > static void init_rng(void) > > { > > - int fd = open("/dev/urandom", O_RDONLY); > > unsigned int foo; > > > > - if (fd > 0) { > > I just realised that here, we could have fd == 0 which is a valid value. 
> I don't think we would have that when executing the selftests, but > that's another reason to change this :) > > > - int ret = read(fd, &foo, sizeof(foo)); > > - > > - if (ret < 0) > > - srand(fd + foo); > > - close(fd); > > + if (getrandom(&foo, sizeof(foo), 0) == -1) { > > + perror("getrandom"); > > + exit(1); > > } > > > > srand(foo); > > Cheers, > Matt
Re: lsm_list_modules_test.c:104:22: error: 'LSM_ID_IMA' undeclared (first use in this function); did you mean 'LSM_ID_YAMA'?
On Mon, Nov 27, 2023 at 5:43 AM Naresh Kamboju wrote: > > Following build errors noticed while building selftests lsm tests for x86 > with gcc-13 toolchain on Linux next-20231127 tag. > > Build log: > -- > selftest/lsm/lsm_list_modules_test > lsm_list_modules_test.c: In function 'correct_lsm_list_modules': > lsm_list_modules_test.c:104:22: error: 'LSM_ID_IMA' undeclared (first > use in this function); did you mean 'LSM_ID_YAMA'? > 104 | case LSM_ID_IMA: > | ^~ > | LSM_ID_YAMA > lsm_list_modules_test.c:104:22: note: each undeclared identifier is > reported only once for each function it appears in > > Reported-by: Linux Kernel Functional Testing A fix was posted last week and should be merged this week, lore link below: https://lore.kernel.org/linux-security-module/20231122160742.109270-2-p...@paul-moore.com/ -- paul-moore.com
Re: [RFC PATCH 2/5] RISC-V: Expose Ssdtso via hwprobe API
On Mon, Nov 27, 2023 at 3:32 PM Samuel Holland wrote: > > Hi Christoph, > > On 2023-11-24 1:21 AM, Christoph Muellner wrote: > > From: Christoph Müllner > > > > This patch adds Ssdtso to the list of extensions which > > are announced to user-space using te hwprobe API. > > > > Signed-off-by: Christoph Müllner > > --- > > Documentation/arch/riscv/hwprobe.rst | 3 +++ > > arch/riscv/include/uapi/asm/hwprobe.h | 1 + > > arch/riscv/kernel/sys_riscv.c | 1 + > > 3 files changed, 5 insertions(+) > > > > diff --git a/Documentation/arch/riscv/hwprobe.rst > > b/Documentation/arch/riscv/hwprobe.rst > > index 7b2384de471f..8de3349e0ca2 100644 > > --- a/Documentation/arch/riscv/hwprobe.rst > > +++ b/Documentation/arch/riscv/hwprobe.rst > > @@ -80,6 +80,9 @@ The following keys are defined: > >* :c:macro:`RISCV_HWPROBE_EXT_ZICBOZ`: The Zicboz extension is > > supported, as > > ratified in commit 3dd606f ("Create cmobase-v1.0.pdf") of > > riscv-CMOs. > > > > + * :c:macro:`RISCV_HWPROBE_EXT_ZICBOZ`: The Ssdtso extension is > > supported, as > > Should be RISCV_HWPROBE_EXT_SSDTSO. Thanks for reporting! I've fixed this now as well in the github branch: https://github.com/cmuellner/linux/tree/ssdtso BR Christoph > > Regards, > Samuel > > > + in version v1.0-draft2 of the corresponding extension. > > + > > * :c:macro:`RISCV_HWPROBE_KEY_CPUPERF_0`: A bitmask that contains > > performance > >information about the selected set of processors. 
> > > > diff --git a/arch/riscv/include/uapi/asm/hwprobe.h > > b/arch/riscv/include/uapi/asm/hwprobe.h > > index b659ffcfcdb4..ed450c64e6b2 100644 > > --- a/arch/riscv/include/uapi/asm/hwprobe.h > > +++ b/arch/riscv/include/uapi/asm/hwprobe.h > > @@ -30,6 +30,7 @@ struct riscv_hwprobe { > > #define RISCV_HWPROBE_EXT_ZBB (1 << 4) > > #define RISCV_HWPROBE_EXT_ZBS (1 << 5) > > #define RISCV_HWPROBE_EXT_ZICBOZ(1 << 6) > > +#define RISCV_HWPROBE_EXT_SSDTSO(1 << 7) > > #define RISCV_HWPROBE_KEY_CPUPERF_0 5 > > #define RISCV_HWPROBE_MISALIGNED_UNKNOWN(0 << 0) > > #define RISCV_HWPROBE_MISALIGNED_EMULATED (1 << 0) > > diff --git a/arch/riscv/kernel/sys_riscv.c b/arch/riscv/kernel/sys_riscv.c > > index c712037dbe10..c654f43b9699 100644 > > --- a/arch/riscv/kernel/sys_riscv.c > > +++ b/arch/riscv/kernel/sys_riscv.c > > @@ -162,6 +162,7 @@ static void hwprobe_isa_ext0(struct riscv_hwprobe *pair, > > EXT_KEY(ZBB); > > EXT_KEY(ZBS); > > EXT_KEY(ZICBOZ); > > + EXT_KEY(SSDTSO); > > #undef EXT_KEY > > } > > >
Re: [PATCH net 1/4] selftests/net: ipsec: fix constant out of range
On 11/24/23 17:15, Willem de Bruijn wrote: > From: Willem de Bruijn > > Fix a small compiler warning. > > nr_process must be a signed long: it is assigned a signed long by > strtol() and is compared against LONG_MIN and LONG_MAX. > > ipsec.c:2280:65: > error: result of comparison of constant -9223372036854775808 > with expression of type 'unsigned int' is always false > [-Werror,-Wtautological-constant-out-of-range-compare] > > if ((errno == ERANGE && (nr_process == LONG_MAX || nr_process == LONG_MIN)) > > Fixes: bc2652b7ae1e ("selftest/net/xfrm: Add test for ipsec tunnel") > Cc: Dmitry Safonov <0x7f454...@gmail.com> > Signed-off-by: Willem de Bruijn LGTM, thanks! Reviewed-by: Dmitry Safonov <0x7f454...@gmail.com> > --- > tools/testing/selftests/net/ipsec.c | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/tools/testing/selftests/net/ipsec.c > b/tools/testing/selftests/net/ipsec.c > index 9a8229abfa026..be4a30a0d02ae 100644 > --- a/tools/testing/selftests/net/ipsec.c > +++ b/tools/testing/selftests/net/ipsec.c > @@ -2263,7 +2263,7 @@ static int check_results(void) > > int main(int argc, char **argv) > { > - unsigned int nr_process = 1; > + long nr_process = 1; > int route_sock = -1, ret = KSFT_SKIP; > int test_desc_fd[2]; > uint32_t route_seq; > @@ -2284,7 +2284,7 @@ int main(int argc, char **argv) > exit_usage(argv); > } > > - if (nr_process > MAX_PROCESSES || !nr_process) { > + if (nr_process > MAX_PROCESSES || nr_process < 1) { > printk("nr_process should be between [1; %u]", > MAX_PROCESSES); > exit_usage(argv); -- Dmitry
Re: [RFC PATCH 2/5] RISC-V: Expose Ssdtso via hwprobe API
Hi Christoph, On 2023-11-24 1:21 AM, Christoph Muellner wrote: > From: Christoph Müllner > > This patch adds Ssdtso to the list of extensions which > are announced to user-space using te hwprobe API. > > Signed-off-by: Christoph Müllner > --- > Documentation/arch/riscv/hwprobe.rst | 3 +++ > arch/riscv/include/uapi/asm/hwprobe.h | 1 + > arch/riscv/kernel/sys_riscv.c | 1 + > 3 files changed, 5 insertions(+) > > diff --git a/Documentation/arch/riscv/hwprobe.rst > b/Documentation/arch/riscv/hwprobe.rst > index 7b2384de471f..8de3349e0ca2 100644 > --- a/Documentation/arch/riscv/hwprobe.rst > +++ b/Documentation/arch/riscv/hwprobe.rst > @@ -80,6 +80,9 @@ The following keys are defined: >* :c:macro:`RISCV_HWPROBE_EXT_ZICBOZ`: The Zicboz extension is supported, > as > ratified in commit 3dd606f ("Create cmobase-v1.0.pdf") of riscv-CMOs. > > + * :c:macro:`RISCV_HWPROBE_EXT_ZICBOZ`: The Ssdtso extension is supported, > as Should be RISCV_HWPROBE_EXT_SSDTSO. Regards, Samuel > + in version v1.0-draft2 of the corresponding extension. > + > * :c:macro:`RISCV_HWPROBE_KEY_CPUPERF_0`: A bitmask that contains performance >information about the selected set of processors. 
> > diff --git a/arch/riscv/include/uapi/asm/hwprobe.h > b/arch/riscv/include/uapi/asm/hwprobe.h > index b659ffcfcdb4..ed450c64e6b2 100644 > --- a/arch/riscv/include/uapi/asm/hwprobe.h > +++ b/arch/riscv/include/uapi/asm/hwprobe.h > @@ -30,6 +30,7 @@ struct riscv_hwprobe { > #define RISCV_HWPROBE_EXT_ZBB (1 << 4) > #define RISCV_HWPROBE_EXT_ZBS (1 << 5) > #define RISCV_HWPROBE_EXT_ZICBOZ(1 << 6) > +#define RISCV_HWPROBE_EXT_SSDTSO(1 << 7) > #define RISCV_HWPROBE_KEY_CPUPERF_0 5 > #define RISCV_HWPROBE_MISALIGNED_UNKNOWN(0 << 0) > #define RISCV_HWPROBE_MISALIGNED_EMULATED (1 << 0) > diff --git a/arch/riscv/kernel/sys_riscv.c b/arch/riscv/kernel/sys_riscv.c > index c712037dbe10..c654f43b9699 100644 > --- a/arch/riscv/kernel/sys_riscv.c > +++ b/arch/riscv/kernel/sys_riscv.c > @@ -162,6 +162,7 @@ static void hwprobe_isa_ext0(struct riscv_hwprobe *pair, > EXT_KEY(ZBB); > EXT_KEY(ZBS); > EXT_KEY(ZICBOZ); > + EXT_KEY(SSDTSO); > #undef EXT_KEY > } >
Re: [PATCH net-next 01/38] selftests/net: add lib.sh
Hangbin Liu writes: > On Fri, Nov 24, 2023 at 03:35:51PM +0100, Petr Machata wrote: >> >> Hangbin Liu writes: >> >> > + fi >> > + done >> > + >> > + [ $errexit -eq 1 ] && set -e >> > + return 0 >> > +} >> > + >> > +# By default, remove all netns before EXIT. >> > +cleanup_all_ns() >> > +{ >> > + cleanup_ns $NS_LIST >> > +} >> > +trap cleanup_all_ns EXIT >> >> Hmm, OK, this is a showstopper for inclusion from forwarding/lib.sh, >> because basically all users of forwarding/lib.sh use the EXIT trap. >> >> I wonder if we need something like these push_cleanup / on_exit helpers: >> >> https://github.com/pmachata/stuff/blob/master/ptp-test/lib.sh#L15 > > When I added this, I just want to make sure the netns are cleaned up if the > client script forgot. I think the client script trap function should > cover this one, no? So the motivation makes sense. But in general, invoking cleanup from the same abstraction layer that acquired the resource, makes the code easier to analyze. And in particular here that we are talking about traps, which are a global resource, and one that the client might well want to use for their own management. The client should be using the trap instead of the framework. The framework might expose APIs to allow clients to register cleanups etc., which the framework itself is then free to use of course, for resources that it itself has acquired. But even with these APIs in place I think it would be better if the client that acquires a resource also schedules its release. (Though it's not as clear-cut in that case.) >> >> But I don't want to force this on your already large patchset :) > > Yes, Paolo also told me that this is too large. I will break it to > 2 path set or merge some small patches together for next version. > >> So just ignore the bit about including from forwarding/lib.sh. > >> Actually I take this back. The cleanup should be invoked from where the >> init was called. 
I don't think the library should be auto-invoking it, >> the client scripts should. Whether through a trap or otherwise. > > OK, also makes sense. I will remove this trap. > > Thanks for all your comments. > Hangbin
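The push_cleanup/on_exit idea referenced in this thread — letting framework and client stack cleanups on one shared EXIT trap instead of clobbering each other's — can be sketched as follows (helper names are illustrative, not from the proposed lib.sh):

```shell
#!/bin/bash
# Sketch: both the framework and the client script append cleanups to
# one list; a single EXIT trap runs them all, LIFO, so whoever acquired
# a resource last releases it first.

CLEANUPS=()

on_exit()
{
	CLEANUPS+=("$*")
}

run_cleanups()
{
	# Run in reverse order of registration.
	local i
	for ((i = ${#CLEANUPS[@]} - 1; i >= 0; i--)); do
		eval "${CLEANUPS[$i]}"
	done
}
trap run_cleanups EXIT

# The framework registers a cleanup for a resource it acquired...
on_exit "echo 'cleanup: netns'"
# ...and the client stacks its own without touching the trap.
on_exit "echo 'cleanup: tmpdir'"
```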
Re: [PATCH net-next 01/38] selftests/net: add lib.sh
Hangbin Liu writes: > On Fri, Nov 24, 2023 at 03:05:18PM +0100, Petr Machata wrote: >> >> Hangbin Liu writes: >> >> > +# Helpers >> > +busywait() >> > +{ >> > + local timeout=$1; shift >> > + >> > + local start_time="$(date -u +%s%3N)" >> > + while true >> > + do >> > + local out >> > + out=$($@) >> > + local ret=$? >> > + if ((!ret)); then >> > + echo -n "$out" >> > + return 0 >> > + fi >> > + >> > + local current_time="$(date -u +%s%3N)" >> > + if ((current_time - start_time > timeout)); then >> > + echo -n "$out" >> > + return 1 >> > + fi >> > + done >> > +} >> >> This is lifted from forwarding/lib.sh, right? Would it make sense to > > Yes. > >> just source this new file from forwarding/lib.sh instead of copying > > Do you mean let net/forwarding/lib.sh source net.lib, and let other net > tests source the net/forwarding/lib.sh? > > Or move the busywait() function from net/forwarding/lib.sh to net.lib. > Then let net/forwarding/lib.sh source net.lib? This. >> stuff around? I imagine there will eventually be more commonality, and >> when that pops up, we can just shuffle the forwarding code to >> net/lib.sh. > > Yes, make sense. > > Thanks > Hangbin
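For readers unfamiliar with the helper being discussed, here is the quoted busywait() in a minimal self-contained form — it polls a condition command until it succeeds or a millisecond timeout expires (the file_exists condition is just an example, not from the selftests):

```shell
#!/bin/bash
# busywait <timeout_ms> <cmd...>: re-run cmd until it exits 0 or the
# timeout elapses; echo its last output and return its success/failure.
busywait()
{
	local timeout=$1; shift

	local start_time="$(date -u +%s%3N)"
	while true; do
		local out
		out=$($@)
		local ret=$?
		if ((!ret)); then
			echo -n "$out"
			return 0
		fi

		local current_time="$(date -u +%s%3N)"
		if ((current_time - start_time > timeout)); then
			echo -n "$out"
			return 1
		fi
	done
}

# Example condition: succeeds once a marker file exists.
file_exists()
{
	test -f "$1"
}

marker=$(mktemp -u)
(sleep 0.2; touch "$marker") &
busywait 2000 file_exists "$marker" && echo "came up"
rm -f "$marker"
```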
Re: [RFC PATCH 0/5] RISC-V: Add dynamic TSO support
On Mon, Nov 27, 2023 at 11:37 AM Conor Dooley wrote: > > Hi, > > On Fri, Nov 24, 2023 at 08:21:37AM +0100, Christoph Muellner wrote: > > From: Christoph Müllner > > > > The upcoming RISC-V Ssdtso specification introduces a bit in the senvcfg > > CSR to switch the memory consistency model at run-time from RVWMO to TSO > > (and back). The active consistency model can therefore be switched on a > > per-hart base and managed by the kernel on a per-process/thread base. > > > > This patch implements basic Ssdtso support and adds a prctl API on top > > so that user-space processes can switch to a stronger memory consistency > > model (than the kernel was written for) at run-time. > > > > I am not sure if other architectures support switching the memory > > consistency model at run-time, but designing the prctl API in an > > arch-independent way allows reusing it in the future. > > > > The patchset also comes with a short documentation of the prctl API. > > > > This series is based on the second draft of the Ssdtso specification > > which was published recently on an RVI list: > > https://lists.riscv.org/g/tech-arch-review/message/183 > > Note, that the Ssdtso specification is in development state > > (i.e., not frozen or even ratified) which is also the reason > > why I marked the series as RFC. > > > > One aspect that is not covered in this patchset is virtualization. > > It is planned to add virtualization support in a later version. > > Hints/suggestions on how to implement this part are very much > > appreciated. 
> > > > Christoph Müllner (5): > > I know this is an RFC, but it could probably do with a bit more compile > testing, as: > > > RISC-V: Add basic Ssdtso support > > This patch doesn't build for rv64 allmodconfig > > > RISC-V: Expose Ssdtso via hwprobe API > > This one seems to build fine > > > uapi: prctl: Add new prctl call to set/get the memory consistency > > model > > RISC-V: Implement prctl call to set/get the memory consistency model > > RISC-V: selftests: Add DTSO tests > > These don't build for: > rv32 defconfig > rv64 allmodconfig > rv64 nommu Thanks for reporting this. You are absolutely right. In my defense, this patchset was compile-tested and got some limited run-time testing in QEMU. But after that, I wrote the documentation, which triggered a renaming of several function/macro names, and these changes did not see adequate testing. I am sorry for that. I've already fixed the patches (addressing the issues you have reported, plus other small issues). To not distract the ongoing discussion, I will not send an updated patchset right now. In case you are interested, you can find the latest changes (rebased on upstream/master) here: https://github.com/cmuellner/linux/tree/ssdtso I've also extended my local compile-test script to include all mentioned configs. In case you want to play a bit with these changes, you can also have a look at the QEMU patchset, which also got support for the prctl (which is not part of the published mailpatch): https://github.com/cmuellner/qemu/tree/ssdtso With these changes, you can run the kernel self-test binary in user-mode emulation. BR Christoph
Re: [PATCH net-next v2] net: ctnetlink: support filtering by zone
Hi, On Mon, Nov 27, 2023 at 11:49:16AM +, Felix Huettner wrote: > conntrack zones are heavily used by tools like openvswitch to run > multiple virtual "routers" on a single machine. In this context each > conntrack zone matches to a single router, thereby preventing > overlapping IPs from becoming issues. > In these systems it is common to operate on all conntrack entries of a > given zone, e.g. to delete them when a router is deleted. Previously this > required these tools to dump the full conntrack table and filter out the > relevant entries in userspace potentially causing performance issues. > > To do this we reuse the existing CTA_ZONE attribute. This was previous > parsed but not used during dump and flush requests. Now if CTA_ZONE is > set we filter these operations based on the provided zone. > However this means that users that previously passed CTA_ZONE will > experience a difference in functionality. > > Alternatively CTA_FILTER could have been used for the same > functionality. However it is not yet supported during flush requests and > is only available when using AF_INET or AF_INET6. You mean, AF_UNSPEC cannot be specified in CTA_FILTER? Please, extend libnetfilter_conntrack to support for this feature, there is a filter API that can be used for this purpose. Thanks.
Re: [PATCH net 4/4] selftests/net: mptcp: fix uninitialized variable warnings
Hi Willem, (+ cc MPTCP list) On 24/11/2023 18:15, Willem de Bruijn wrote: > From: Willem de Bruijn > > Same init_rng() in both tests. The function reads /dev/urandom to > initialize srand(). In case of failure, it falls back onto the > entropy in the uninitialized variable. Not sure if this is on purpose. > But failure reading urandom should be rare, so just fail hard. While > at it, convert to getrandom(). Which man 4 random suggests is simpler > and more robust. > > mptcp_inq.c:525:6: > mptcp_connect.c:1131:6: > > error: variable 'foo' is used uninitialized > whenever 'if' condition is false > [-Werror,-Wsometimes-uninitialized] Thank you for the patch! It looks good to me: Reviewed-by: Matthieu Baerts > Fixes: 048d19d444be ("mptcp: add basic kselftest for mptcp") > Fixes: b51880568f20 ("selftests: mptcp: add inq test case") > Cc: Florian Westphal > Signed-off-by: Willem de Bruijn > > > > When input is randomized because this is expected to meaningfully > explore edge cases, should we also add > 1. logging the random seed to stdout and > 2. adding a command line argument to replay from a specific seed > I can do this in net-next, if authors find it useful in this case. I think we should have done that from the beginning, otherwise we cannot easily reproduce these edge cases. To be honest, I don't think this technique helped to find bugs, and it was probably used here as a good habit to increase the coverage. But on the other hand, we might not realise some inputs are randomised and can cause instabilities in the tests because we don't print anything about that. So I would say that the minimal thing to do is to log the random seed. But it might not be that easy to do, for example 'mptcp_connect' is used a lot of time by the .sh scripts: printing this seed number each time 'mptcp_connect' is started will "flood" the logs. Maybe we should only print that at the end, in case of errors: e.g. 
in xerror() and die_perror() for example, but I see 'exit(1)' is directly used in other places... That's more code to change, but if it is still OK for you to do that, please also note that you will need to log this to stderr: mptcp_connect prints what has been received from the other peer to stdout. Because it is more than just adding a 'printf()', I just created a ticket in our bug tracker, so anybody can look at that and check all the details about that: https://github.com/multipath-tcp/mptcp_net-next/issues/462 > --- > tools/testing/selftests/net/mptcp/mptcp_connect.c | 11 --- > tools/testing/selftests/net/mptcp/mptcp_inq.c | 11 --- > 2 files changed, 8 insertions(+), 14 deletions(-) > > diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c > b/tools/testing/selftests/net/mptcp/mptcp_connect.c > index c7f9ebeebc2c5..d2043ec3bf6d6 100644 > --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c > +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c (...) > @@ -1125,15 +1126,11 @@ int main_loop_s(int listensock) > > static void init_rng(void) > { > - int fd = open("/dev/urandom", O_RDONLY); > unsigned int foo; > > - if (fd > 0) { I just realised that here, we could have fd == 0 which is a valid value. I don't think we would have that when executing the selftests, but that's another reason to change this :) > - int ret = read(fd, , sizeof(foo)); > - > - if (ret < 0) > - srand(fd + foo); > - close(fd); > + if (getrandom(, sizeof(foo), 0) == -1) { > + perror("getrandom"); > + exit(1); > } > > srand(foo); Cheers, Matt
Re: [RFC PATCH 0/5] RISC-V: Add dynamic TSO support
On Fri, Nov 24, 2023 at 12:54:30PM +0100, Peter Zijlstra wrote: > On Fri, Nov 24, 2023 at 12:04:09PM +0100, Jonas Oberhauser wrote: > > > I think ARM64 approached this problem by adding the > > > load-acquire/store-release instructions and for TSO based code, > > > translate into those (eg. x86 -> arm64 transpilers). > > > > Although those instructions have a bit more ordering constraints. > > > > I have heard rumors that the apple chips also have a register that can be > > set at runtime. > > Oh, I thought they made do with the load-acquire/store-release thingies. > But to be fair, I haven't been paying *that* much attention to the apple > stuff. > > I did read about how they fudged some of the x86 flags thing. I don't know what others may have built specifically, but architecturally on arm64 we expect people to express ordering requirements through instructions. ARMv8.0 has load-acquire and store-release, ARMv8.3 added RCpc forms of load-acquire as part of FEAT_LRCPC, and ARMv8.4 added a number of instructions as part of FEAT_LRCPC2. For a number of reasons we avoid IMPLEMENTATION DEFINED controls for things like this. Thanks Mark.
Re: [RFC PATCH 0/5] RISC-V: Add dynamic TSO support
On Fri, Nov 24, 2023 at 09:51:53PM -0500, Guo Ren wrote: > On Fri, Nov 24, 2023 at 11:15:19AM +0100, Peter Zijlstra wrote: > > On Fri, Nov 24, 2023 at 08:21:37AM +0100, Christoph Muellner wrote: > > > From: Christoph Müllner > > > > > > The upcoming RISC-V Ssdtso specification introduces a bit in the senvcfg > > > CSR to switch the memory consistency model at run-time from RVWMO to TSO > > > (and back). The active consistency model can therefore be switched on a > > > per-hart base and managed by the kernel on a per-process/thread base. > > > > You guys, computers are hartless, nobody told ya? > > > > > This patch implements basic Ssdtso support and adds a prctl API on top > > > so that user-space processes can switch to a stronger memory consistency > > > model (than the kernel was written for) at run-time. > > > > > > I am not sure if other architectures support switching the memory > > > consistency model at run-time, but designing the prctl API in an > > > arch-independent way allows reusing it in the future. > > > > IIRC some Sparc chips could do this, but I don't think anybody ever > > exposed this to userspace (or used it much). > > > > IA64 had planned to do this, except they messed it up and did it the > > wrong way around (strong first and then relax it later), which lead to > > the discovery that all existing software broke (d'uh). > > > > I think ARM64 approached this problem by adding the > > load-acquire/store-release instructions and for TSO based code, > > translate into those (eg. x86 -> arm64 transpilers). > Keeping global TSO order is easier and faster than mixing > acquire/release and regular load/store. That means when ssdtso is > enabled, the transpiler's load-acquire/store-release becomes regular > load/store. Some micro-arch hardwares could speed up the performance. Why is it faster? Because the release+acquire thing becomes RcSC instead of RcTSO? Surely that can be fixed with a weaker store-release variant ot something? 
The problem I have with all of this is that you need to context switch this state and that you need to deal with exceptions, which must be written for the weak model but then end up running in the tso model -- possibly slower than desired. If OTOH you only have a single model, everything becomes so much simpler. You just need to be able to express exactly what you want.
lsm_list_modules_test.c:104:22: error: 'LSM_ID_IMA' undeclared (first use in this function); did you mean 'LSM_ID_YAMA'?
Following build errors noticed while building selftests lsm tests for x86 with gcc-13 toolchain on Linux next-20231127 tag. Build log: -- selftest/lsm/lsm_list_modules_test lsm_list_modules_test.c: In function 'correct_lsm_list_modules': lsm_list_modules_test.c:104:22: error: 'LSM_ID_IMA' undeclared (first use in this function); did you mean 'LSM_ID_YAMA'? 104 | case LSM_ID_IMA: | ^~ | LSM_ID_YAMA lsm_list_modules_test.c:104:22: note: each undeclared identifier is reported only once for each function it appears in Reported-by: Linux Kernel Functional Testing Steps to reproduce: - tuxmake --runtime podman --target-arch x86_64 --toolchain gcc-13 \ --kconfig https://storage.tuxsuite.com/public/linaro/lkft/builds/2Yk9XptRIQra77bvzZHcgyzkH7w/config \ debugkernel cpupower headers kernel kselftest modules Links: - https://qa-reports.linaro.org/lkft/linux-next-master/build/next-20231127/testrun/21324802/suite/build/test/gcc-13-lkftconfig-kselftest/log - https://qa-reports.linaro.org/lkft/linux-next-master/build/next-20231127/testrun/21327065/suite/build/test/gcc-13-lkftconfig-kselftest/history/ - https://qa-reports.linaro.org/lkft/linux-next-master/build/next-20231127/testrun/21324802/suite/build/test/gcc-13-lkftconfig-kselftest/details/ -- Linaro LKFT https://lkft.linaro.org
Re: [RFC PATCH 0/5] RISC-V: Add dynamic TSO support
Hi, On Fri, Nov 24, 2023 at 08:21:37AM +0100, Christoph Muellner wrote: > From: Christoph Müllner > > The upcoming RISC-V Ssdtso specification introduces a bit in the senvcfg > CSR to switch the memory consistency model at run-time from RVWMO to TSO > (and back). The active consistency model can therefore be switched on a > per-hart base and managed by the kernel on a per-process/thread base. > > This patch implements basic Ssdtso support and adds a prctl API on top > so that user-space processes can switch to a stronger memory consistency > model (than the kernel was written for) at run-time. > > I am not sure if other architectures support switching the memory > consistency model at run-time, but designing the prctl API in an > arch-independent way allows reusing it in the future. > > The patchset also comes with a short documentation of the prctl API. > > This series is based on the second draft of the Ssdtso specification > which was published recently on an RVI list: > https://lists.riscv.org/g/tech-arch-review/message/183 > Note, that the Ssdtso specification is in development state > (i.e., not frozen or even ratified) which is also the reason > why I marked the series as RFC. > > One aspect that is not covered in this patchset is virtualization. > It is planned to add virtualization support in a later version. > Hints/suggestions on how to implement this part are very much > appreciated. > > Christoph Müllner (5): I know this is an RFC, but it could probably do with a bit more compile testing, as: > RISC-V: Add basic Ssdtso support This patch doesn't build for rv64 allmodconfig > RISC-V: Expose Ssdtso via hwprobe API This one seems to build fine > uapi: prctl: Add new prctl call to set/get the memory consistency > model > RISC-V: Implement prctl call to set/get the memory consistency model > RISC-V: selftests: Add DTSO tests These don't build for: rv32 defconfig rv64 allmodconfig rv64 nommu Cheers, Conor.
Re: [PATCH v1 3/3] KVM: selftests: aarch64: Introduce pmu_event_filter_test
Hi Shaoqin, On 11/23/23 07:37, Shaoqin Huang wrote: > Introduce pmu_event_filter_test for arm64 platforms. The test configures > PMUv3 for a vCPU, and sets different pmu event filter for the vCPU, and filters > check if the guest can use those events which user allow and can't use > those events which use deny. > > This test refactor the create_vpmu_vm() and make it a wrapper for > __create_vpmu_vm(), which can let we do some extra init before which can let we do -> which allows some extra init code. > KVM_ARM_VCPU_PMU_V3_INIT. > > This test choose the branches_retired and the instructions_retired > event, and let guest use the two events in pmu. And check if the result Are you sure those events are supported? > is expected. > > Signed-off-by: Shaoqin Huang > --- > tools/testing/selftests/kvm/Makefile | 1 + > .../kvm/aarch64/pmu_event_filter_test.c | 227 ++ > .../selftests/kvm/include/aarch64/vpmu.h | 4 + > .../testing/selftests/kvm/lib/aarch64/vpmu.c | 14 +- > 4 files changed, 244 insertions(+), 2 deletions(-) > create mode 100644 > tools/testing/selftests/kvm/aarch64/pmu_event_filter_test.c > > diff --git a/tools/testing/selftests/kvm/Makefile > b/tools/testing/selftests/kvm/Makefile > index b60852c222ac..5f126e1a1dbf 100644 > --- a/tools/testing/selftests/kvm/Makefile > +++ b/tools/testing/selftests/kvm/Makefile > @@ -155,6 +155,7 @@ TEST_GEN_PROGS_aarch64 += aarch64/arch_timer > TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions > TEST_GEN_PROGS_aarch64 += aarch64/hypercalls > TEST_GEN_PROGS_aarch64 += aarch64/page_fault_test > +TEST_GEN_PROGS_aarch64 += aarch64/pmu_event_filter_test > TEST_GEN_PROGS_aarch64 += aarch64/psci_test > TEST_GEN_PROGS_aarch64 += aarch64/set_id_regs > TEST_GEN_PROGS_aarch64 += aarch64/smccc_filter > diff --git a/tools/testing/selftests/kvm/aarch64/pmu_event_filter_test.c > b/tools/testing/selftests/kvm/aarch64/pmu_event_filter_test.c > new file mode 100644 > index ..a876f5c2033b > --- /dev/null > +++ 
b/tools/testing/selftests/kvm/aarch64/pmu_event_filter_test.c > @@ -0,0 +1,227 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * pmu_event_filter_test - Test user limit pmu event for guest. > + * > + * Copyright (c) 2023 Red Hat, Inc. > + * > + * This test checks if the guest only see the limited pmu event that > userspace sees > + * sets, if the gust can use those events which user allow, and if the guest s/gust/guest > + * can't use those events which user deny. > + * It also checks set invalid filter return the expected error. it also checks that setting invalid filter ranges ... > + * This test runs only when KVM_CAP_ARM_PMU_V3 is supported on the host. > + */ > +#include > +#include > +#include > +#include > +#include > +#include > + > +struct { > + uint64_t branches_retired; > + uint64_t instructions_retired; > +} pmc_results; > + > +static struct vpmu_vm *vpmu_vm; > + > +#define FILTER_NR 10 > + > +struct test_desc { > + const char *name; > + void (*check_result)(void); > + struct kvm_pmu_event_filter filter[FILTER_NR]; > +}; > + > +#define __DEFINE_FILTER(base, num, act) \ > + ((struct kvm_pmu_event_filter) {\ > + .base_event = base, \ > + .nevents= num, \ > + .action = act, \ > + }) > + > +#define DEFINE_FILTER(base, act) __DEFINE_FILTER(base, 1, act) > + > +#define EMPTY_FILTER { 0 } > + > +#define SW_INCR 0x0 > +#define INST_RETIRED 0x8 > +#define BR_RETIERD 0x21 looks like a typo > + > +#define NUM_BRANCHES 10 > + > +static void run_and_measure_loop(void) > +{ > + asm volatile( > + " mov x10, %[loop]\n" > + "1: sub x10, x10, #1\n" > + " cmp x10, #0x0\n" > + " b.gt1b\n" > + : > + : [loop] "r" (NUM_BRANCHES) > + : "x10", "cc"); > +} > + > +static void guest_code(void) > +{ > + uint64_t pmcr = read_sysreg(pmcr_el0); > + > + pmu_disable_reset(); > + > + write_pmevtypern(0, BR_RETIERD); > + write_pmevtypern(1, INST_RETIRED); > + enable_counter(0); > + enable_counter(1); > + write_sysreg(pmcr | ARMV8_PMU_PMCR_E, pmcr_el0); > + > + run_and_measure_loop(); 
> + > + write_sysreg(pmcr, pmcr_el0); > + > + pmc_results.branches_retired = read_sysreg(pmevcntr0_el0); > + pmc_results.instructions_retired = read_sysreg(pmevcntr1_el0); > + > + GUEST_DONE(); another direct way to see if the guest can use those filters is to read the PMCEIDx that indicates whether an event is supported. > +} > + > +static void pmu_event_filter_init(struct vpmu_vm *vm, void *arg) > +{ > + struct kvm_device_attr attr = { > + .group = KVM_ARM_VCPU_PMU_V3_CTRL, > + .attr = KVM_ARM_VCPU_PMU_V3_FILTER, > + }; > + struct kvm_pmu_event_filter *filter = (struct